Optimizing for SSE: Code Samples

来源:百度文库 编辑:神马文学网 时间:2024/04/29 00:14:19
#include "Matrix4f.h"
#include "Vector4f.h"

// MatrixMultiply1 -- a naive C++ matrix-vector multiplication function.
// It's correct, but that's about the only thing impressive about it.
//
// Computes m * vin, treating each m.elts[i] as one row of the matrix, and
// returns the result by value.
//
// Performance: ~90 cycles/vector
Vector4f MatrixMultiply1(Matrix4f &m, Vector4f &vin)
{
    float v0 = m.elts[0][0]*vin[0] + m.elts[0][1]*vin[1] +
               m.elts[0][2]*vin[2] + m.elts[0][3]*vin[3];
    float v1 = m.elts[1][0]*vin[0] + m.elts[1][1]*vin[1] +
               m.elts[1][2]*vin[2] + m.elts[1][3]*vin[3];
    float v2 = m.elts[2][0]*vin[0] + m.elts[2][1]*vin[1] +
               m.elts[2][2]*vin[2] + m.elts[2][3]*vin[3];
    float v3 = m.elts[3][0]*vin[0] + m.elts[3][1]*vin[1] +
               m.elts[3][2]*vin[2] + m.elts[3][3]*vin[3];
    return Vector4f(v0,v1,v2,v3);
}

// MatrixMultiply2 -- a faster version of MatrixMultiply1, still in C++.
//
// Writes m * (*vin) into *vout instead of returning a temporary.
// vin and vout may point to the same vector: all four input components are
// read before any output component is written.
//
// Performance: 70 cycles/vector
void MatrixMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout)
{
    float *in = vin->Ref();
    float *out = vout->Ref();

    // Snapshot the input first.  Without this, writing out[0] would clobber
    // in[0] when the caller transforms a vector in place (vin == vout).
    const float x = in[0];
    const float y = in[1];
    const float z = in[2];
    const float w = in[3];

    out[0] = m.elts[0][0]*x + m.elts[0][1]*y +
             m.elts[0][2]*z + m.elts[0][3]*w;
    out[1] = m.elts[1][0]*x + m.elts[1][1]*y +
             m.elts[1][2]*z + m.elts[1][3]*w;
    out[2] = m.elts[2][0]*x + m.elts[2][1]*y +
             m.elts[2][2]*z + m.elts[2][3]*w;
    out[3] = m.elts[3][0]*x + m.elts[3][1]*y +
             m.elts[3][2]*z + m.elts[3][3]*w;
}

// MatrixMultiply3 -- a C++/ASM version of MatrixMultiply2, which takes
// advantage of Intel's SSE instructions.
This version requires that// M be in column-major order.//// Performance: 57 cycles/vectorvoid MatrixMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout){// Get a pointer to the elements of mfloat *row0 = m.Ref();__asm {mov         esi, vinmov         edi, vout// load columns of matrix into xmm4-7mov         edx, row0movups   xmm4, [edx]movups   xmm5, [edx+0x10]movups   xmm6, [edx+0x20]movups   xmm7, [edx+0x30]// load v into xmm0.movups   xmm0, [esi]// we'll store the final result in xmm2; initialize it// to zeroxorps      xmm2, xmm2// broadcast x into xmm1, multiply it by the first// column of the matrix (xmm4), and add it to the totalmovups   xmm1, xmm0shufps   xmm1, xmm1, 0x00mulps      xmm1, xmm4addps      xmm2, xmm1// repeat the process for y, z and wmovups   xmm1, xmm0shufps   xmm1, xmm1, 0x55mulps      xmm1, xmm5addps      xmm2, xmm1movups   xmm1, xmm0shufps   xmm1, xmm1, 0xAAmulps      xmm1, xmm6addps      xmm2, xmm1movups   xmm1, xmm0shufps   xmm1, xmm1, 0xFFmulps      xmm1, xmm7addps      xmm2, xmm1// write the results to voutmovups   [edi], xmm2}}// BatchMultiply1 -- A modification to MatrixMultiply2 in which we// multiply several input vectors (vin) by the same matrix (m), storing the// results in 'vout'.  A total of 'len' vectors are processed.  
This// prevents us from having to re-load 'm' every time through the loop.// This also allows us to embed the tranpose operation into the function// body, so we can continue to store our matrices in row-major order,// if we wish.//// Performance: 32 cycles/vectorvoid BatchMultiply1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len){// transpose the matrix into the xmm4-7m.TransposeIntoXMM();static const int vecSize = sizeof(Vector4f);__asm {mov         esi, vinmov         edi, voutmov         ecx, lenBM1_START:// load the next input vector into xmm0, and advance the input// pointermovups   xmm0, [esi]add         esi, vecSize// we'll store the final result in xmm2; initialize it// to zeroxorps      xmm2, xmm2// broadcast x into xmm1, multiply it by the first// column of the matrix (xmm4), and add it to the totalmovups   xmm1, xmm0shufps   xmm1, xmm1, 0x00mulps      xmm1, xmm4addps      xmm2, xmm1// repeat the process for y, z and wmovups   xmm1, xmm0shufps   xmm1, xmm1, 0x55mulps      xmm1, xmm5addps      xmm2, xmm1movups   xmm1, xmm0shufps   xmm1, xmm1, 0xAAmulps      xmm1, xmm6addps      xmm2, xmm1movups   xmm1, xmm0shufps   xmm1, xmm1, 0xFFmulps      xmm1, xmm7addps      xmm2, xmm1// write the results to vout, and advance the output pointermovups   [edi], xmm2add         edi, vecSizedec         ecxjnz         BM1_START}}// BatchMultiply2 -- A simple modification to BatchMultiply1: we now use// aligned moves (movaps) instead of unaligned moves (movups).  This is much// faster, but requires that Matrix4f and Vector4f objects are aligned// on 16-byte boundaries.  
We use the __declspec(align(16)) specifier in// the Matrix4f and Vector4f class definitions to accomplish this.//// Performance: 28 cycles/vectorvoid BatchMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len){// transpose the matrix into the xmm4-7m.TransposeIntoXMM();static const int vecSize = sizeof(Vector4f);__asm {mov         esi, vinmov         edi, voutmov         ecx, lenBM2_START:// load the next input vector into xmm0, and advance the input// pointermovaps   xmm0, [esi]add         esi, vecSize// we'll store the final result in xmm2; initialize it// to zeroxorps      xmm2, xmm2// broadcast x into xmm1, multiply it by the first// column of the matrix (xmm4), and add it to the totalmovaps   xmm1, xmm0shufps   xmm1, xmm1, 0x00mulps      xmm1, xmm4addps      xmm2, xmm1// repeat the process for y, z and wmovaps   xmm1, xmm0shufps   xmm1, xmm1, 0x55mulps      xmm1, xmm5addps      xmm2, xmm1movaps   xmm1, xmm0shufps   xmm1, xmm1, 0xAAmulps      xmm1, xmm6addps      xmm2, xmm1movaps   xmm1, xmm0shufps   xmm1, xmm1, 0xFFmulps      xmm1, xmm7addps      xmm2, xmm1// write the results to vout, advance the output pointer,// and loopmovaps   [edi], xmm2add         edi, vecSizedec         ecxjnz         BM2_START}}// BatchMultiply3 -- A modification to BatchMultiply2 which makes better// use of instruction pairing.//// Performance: 22 cycles/vectorvoid BatchMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len){// transpose the matrix into the xmm4-7m.TransposeIntoXMM();static const int vecSize = sizeof(Vector4f);__asm {mov         esi, vinmov         edi, voutmov         ecx, lenBM3_START:// load the next input vector into xmm0, and advance the input// and output pointersmovaps   xmm0, [esi]add         edi, vecSize// broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving// x in xmm0).movaps   xmm1, xmm0add         esi, vecSizemovaps   xmm2, xmm0movaps   xmm3, xmm0shufps   xmm0, xmm0, 0x00shufps   xmm1, xmm1, 0x55shufps   xmm2, xmm2, 0xAAshufps   
xmm3, xmm3, 0xFF// multiply xmm0-3 by the appropriate columns of the matrixmulps      xmm0, xmm4mulps      xmm1, xmm5mulps      xmm2, xmm6mulps      xmm3, xmm7// sum the results into xmm1addps      xmm1, xmm0addps      xmm2, xmm3addps      xmm1, xmm2// write the results to vout, and loopmovaps   [edi-0x10], xmm1dec         ecxjnz         BM3_START}}// BatchMultiply4 -- A modification to BatchMultiply3 which uses // SSE prefetching instructions to improve performance with large// input sets.//// Performance: 21 cycles/vectorvoid BatchMultiply4(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len){// transpose the matrix into the xmm4-7m.TransposeIntoXMM();static const int vecSize = sizeof(Vector4f);__asm {mov         esi, vinmov         edi, voutmov         ecx, lenBM4_START:// load the next input vector into xmm0, and advance the input// pointer.  Prefetch upcoming vectors into the cachemovaps   xmm0, [esi]prefetchnta   [esi+0x30]// broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving// x in xmm0).movaps   xmm1, xmm0add         esi, vecSizemovaps   xmm2, xmm0add         edi, vecSizemovaps   xmm3, xmm0prefetchnta [edi+0x30]shufps   xmm0, xmm0, 0x00shufps   xmm1, xmm1, 0x55shufps   xmm2, xmm2, 0xAAshufps   xmm3, xmm3, 0xFF// multiply xmm0-3 by the appropriate columns of the matrix// (hiding a pointer increment between the multiplies)mulps      xmm0, xmm4mulps      xmm1, xmm5mulps      xmm2, xmm6mulps      xmm3, xmm7// sum the results into xmm1addps      xmm1, xmm0addps      xmm2, xmm3addps      xmm1, xmm2// write the results to vout, and loopmovaps   [edi-0x10], xmm1dec         ecxjnz         BM4_START}}// BatchMultiply5 -- A modified version of BatchMultiply4 which loads// vector components individually from memory, thereby allowing us// to work on TWO VECTORS SIMULTANEOUSLY!//// Performance: 20 cycles/vectorvoid BatchMultiply5(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len){// initializations in C++ landMatrix4f mt(m, Matrix4f::TRANSPOSE); // work from 
a float *row0 = mt.Ref();static const int vecSize = 2 * sizeof(Vector4f);// if there are an odd number of vectors, process the first one// separately and advance the pointersif (len & 0x1) {MatrixMultiply3(mt, vin, vout);++vin;++vout;}len >>= 1; // we process two vectors at a time__asm {mov         esi, vinmov         edi, voutmov         ecx, len// load columns of matrix into xmm4-7mov         edx, row0movaps   xmm4, [edx]movaps   xmm5, [edx+0x10]movaps   xmm6, [edx+0x20]movaps   xmm7, [edx+0x30]BM5_START:// process xmovss      xmm1, [esi+0x00]movss      xmm3, [esi+0x10]shufps   xmm1, xmm1, 0x00prefetchnta   [esi+0x30]shufps   xmm3, xmm3, 0x00mulps      xmm1, xmm4prefetchnta [edi+0x30]mulps      xmm3, xmm4// process ymovss      xmm0, [esi+0x04]movss      xmm2, [esi+0x14]shufps   xmm0, xmm0, 0x00shufps   xmm2, xmm2, 0x00mulps      xmm0, xmm5mulps      xmm2, xmm5addps      xmm1, xmm0addps      xmm3, xmm2// process zmovss      xmm0, [esi+0x08]movss      xmm2, [esi+0x18]shufps   xmm0, xmm0, 0x00shufps   xmm2, xmm2, 0x00mulps      xmm0, xmm6mulps      xmm2, xmm6addps      xmm1, xmm0addps      xmm3, xmm2// process w (hiding some pointer increments between the// multiplies)movss      xmm0, [esi+0x0C]movss      xmm2, [esi+0x1C]shufps   xmm0, xmm0, 0x00shufps   xmm2, xmm2, 0x00mulps      xmm0, xmm7add         esi, vecSizemulps      xmm2, xmm7add         edi, vecSizeaddps      xmm1, xmm0addps      xmm3, xmm2// write output vectors to memory, and loopmovaps   [edi-0x20], xmm1movaps   [edi-0x10], xmm3dec         ecxjnz         BM5_START}}// BatchTransform1 -- A modified version of BatchMultiply4 which makes// an additional assumption about the vectors in vin: if each vector's// 4th element (the homogenous coordinate w) is assumed to be 1.0 (as is// the case for 3D vertices), we can eliminate a move, a shuffle and a// multiply instruction.//// Performance: 17 cycles/vectorvoid BatchTransform1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len){// initializations in C++ 
landMatrix4f mt(m, Matrix4f::TRANSPOSE); // work from a float *row0 = mt.Ref();static const int vecSize = 2 * sizeof(Vector4f);// if there are an odd number of vectors, process the first one// separately and advance the pointersif (len & 0x1) {MatrixMultiply3(mt, vin, vout);++vin;++vout;}len >>= 1; // we process two vectors at a time__asm {mov      esi, vinmov      edi, voutmov      ecx, len// load columns of matrix into xmm4-7mov      edx, row0movaps   xmm4, [edx]movaps   xmm5, [edx+0x10]movaps   xmm6, [edx+0x20]movaps   xmm7, [edx+0x30]BT2_START:// process x (hiding the prefetches in the delays)movss      xmm1, [esi+0x00]movss      xmm3, [esi+0x10]shufps   xmm1, xmm1, 0x00prefetchnta [edi+0x30]shufps   xmm3, xmm3, 0x00mulps      xmm1, xmm4prefetchnta   [esi+0x30]mulps      xmm3, xmm4// process ymovss      xmm0, [esi+0x04]movss      xmm2, [esi+0x14]shufps   xmm0, xmm0, 0x00shufps   xmm2, xmm2, 0x00mulps      xmm0, xmm5mulps      xmm2, xmm5addps      xmm1, xmm0addps      xmm3, xmm2// process z (hiding some pointer arithmetic between// the multiplies)movss      xmm0, [esi+0x08]movss      xmm2, [esi+0x18]shufps   xmm0, xmm0, 0x00shufps   xmm2, xmm2, 0x00mulps      xmm0, xmm6add         esi, vecSizemulps      xmm2, xmm6add         edi, vecSizeaddps      xmm1, xmm0addps      xmm3, xmm2// process waddps      xmm1, xmm7addps      xmm3, xmm7// write output vectors to memory and loopmovaps   [edi-0x20], xmm1movaps   [edi-0x10], xmm3dec         ecxjnz         BT2_START}}