中間テストが終わったので久々にSSE最適化を…
SSEに対応していないCPU(Pentium II以前)やOS(Windows 95以前)では、SSE命令の実行時に不正命令例外が発生して異常終了するので注意する必要があります。
SSEが使えるかどうかに応じて関数ポインタで処理を切り替えるのが得策かもしれません。
// Computes the 4x4 single-precision matrix product Dest = Src1 * Src2 with SSE.
//
// Each matrix is accessed as 16 consecutive floats starting at the pointer,
// four floats (16 bytes) per row.
// NOTE(review): this assumes MATRIX starts with a row-major float[4][4] --
// confirm against the MATRIX declaration.
//
// Output row i is built as
//   Dest.row[i] = Src1[i][0] * Src2.row[0] + Src1[i][1] * Src2.row[1]
//               + Src1[i][2] * Src2.row[2] + Src1[i][3] * Src2.row[3]
//
// Parameters:
//   Dest  receives the product; may be the same object as Src1 or Src2
//         (all of Src2 is loaded into registers before the first store, and
//         every Src1 row is read before the matching Dest row is written).
//         Partially overlapping matrices are not supported.
//   Src1  left-hand operand.
//   Src2  right-hand operand.
//
// Returns 0 on success, -1 if any of the three pointers is NULL.
//
// The caller must make sure the CPU and OS support SSE before calling this;
// no runtime detection is done here. Unaligned loads/stores (movups) are
// used, so the matrices need no particular alignment.
int CreateMultiplyMatrix_SSE(MATRIX *Dest,const MATRIX *Src1,const MATRIX *Src2)
{
    // All three pointers are dereferenced below, so reject any NULL up front.
    if(Dest == NULL || Src1 == NULL || Src2 == NULL)return -1;

    __asm
    {
        // eax = Src1, edx = Src2, ecx = Dest.
        // Only volatile registers are used, so the compiler does not have to
        // save/restore a callee-saved register (ebx) around this block.
        mov     eax, Src1
        mov     edx, Src2
        mov     ecx, Dest

        // Keep the four rows of Src2 resident in xmm4..xmm7.
        movups  xmm4, [edx + 0]
        movups  xmm5, [edx + 16]
        movups  xmm6, [edx + 32]
        movups  xmm7, [edx + 48]

        // ---- Dest row 0 ----
        movss   xmm0, [eax + 0]
        movss   xmm1, [eax + 4]
        movss   xmm2, [eax + 8]
        movss   xmm3, [eax + 12]
        shufps  xmm0, xmm0, 0x00        // broadcast the scalar to all 4 lanes
        shufps  xmm1, xmm1, 0x00
        shufps  xmm2, xmm2, 0x00
        shufps  xmm3, xmm3, 0x00
        mulps   xmm0, xmm4
        mulps   xmm1, xmm5
        mulps   xmm2, xmm6
        mulps   xmm3, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        addps   xmm0, xmm3
        movups  [ecx + 0], xmm0

        // ---- Dest row 1 ----
        movss   xmm0, [eax + 16]
        movss   xmm1, [eax + 20]
        movss   xmm2, [eax + 24]
        movss   xmm3, [eax + 28]
        shufps  xmm0, xmm0, 0x00
        shufps  xmm1, xmm1, 0x00
        shufps  xmm2, xmm2, 0x00
        shufps  xmm3, xmm3, 0x00
        mulps   xmm0, xmm4
        mulps   xmm1, xmm5
        mulps   xmm2, xmm6
        mulps   xmm3, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        addps   xmm0, xmm3
        movups  [ecx + 16], xmm0

        // ---- Dest row 2 ----
        movss   xmm0, [eax + 32]
        movss   xmm1, [eax + 36]
        movss   xmm2, [eax + 40]
        movss   xmm3, [eax + 44]
        shufps  xmm0, xmm0, 0x00
        shufps  xmm1, xmm1, 0x00
        shufps  xmm2, xmm2, 0x00
        shufps  xmm3, xmm3, 0x00
        mulps   xmm0, xmm4
        mulps   xmm1, xmm5
        mulps   xmm2, xmm6
        mulps   xmm3, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        addps   xmm0, xmm3
        movups  [ecx + 32], xmm0

        // ---- Dest row 3 ----
        movss   xmm0, [eax + 48]
        movss   xmm1, [eax + 52]
        movss   xmm2, [eax + 56]
        movss   xmm3, [eax + 60]
        shufps  xmm0, xmm0, 0x00
        shufps  xmm1, xmm1, 0x00
        shufps  xmm2, xmm2, 0x00
        shufps  xmm3, xmm3, 0x00
        mulps   xmm0, xmm4
        mulps   xmm1, xmm5
        mulps   xmm2, xmm6
        mulps   xmm3, xmm7
        addps   xmm0, xmm1
        addps   xmm0, xmm2
        addps   xmm0, xmm3
        movups  [ecx + 48], xmm0
    }
    return 0;
}
以下の環境で、DXライブラリに用意されている関数の約3.3倍高速でした。
CPU:Core2Duo@2.5GHz
MEM:4GB