Optimizing for SSE: Code Samples
Source: Baidu Wenku — Editor: Shenma Literature Net — Date: 2024/04/29 00:14:19
#include "Matrix4f.h"
#include "Vector4f.h"

// MatrixMultiply1 -- a naive C++ matrix-vector multiplication function.
// It's correct, but that's about the only thing impressive about it.
//
// Performance: ~90 cycles/vector
Vector4f MatrixMultiply1(Matrix4f &m, Vector4f &vin)
{
    float v0 = m.elts[0][0]*vin[0] + m.elts[0][1]*vin[1] +
               m.elts[0][2]*vin[2] + m.elts[0][3]*vin[3];
    float v1 = m.elts[1][0]*vin[0] + m.elts[1][1]*vin[1] +
               m.elts[1][2]*vin[2] + m.elts[1][3]*vin[3];
    float v2 = m.elts[2][0]*vin[0] + m.elts[2][1]*vin[1] +
               m.elts[2][2]*vin[2] + m.elts[2][3]*vin[3];
    float v3 = m.elts[3][0]*vin[0] + m.elts[3][1]*vin[1] +
               m.elts[3][2]*vin[2] + m.elts[3][3]*vin[3];
    return Vector4f(v0, v1, v2, v3);
}

// MatrixMultiply2 -- a faster version of MatrixMultiply1, still in C++.
//
// Performance: 70 cycles/vector
void MatrixMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout)
{
    float *in  = vin->Ref();
    float *out = vout->Ref();
    out[0] = m.elts[0][0]*in[0] + m.elts[0][1]*in[1] +
             m.elts[0][2]*in[2] + m.elts[0][3]*in[3];
    out[1] = m.elts[1][0]*in[0] + m.elts[1][1]*in[1] +
             m.elts[1][2]*in[2] + m.elts[1][3]*in[3];
    out[2] = m.elts[2][0]*in[0] + m.elts[2][1]*in[1] +
             m.elts[2][2]*in[2] + m.elts[2][3]*in[3];
    out[3] = m.elts[3][0]*in[0] + m.elts[3][1]*in[1] +
             m.elts[3][2]*in[2] + m.elts[3][3]*in[3];
}

// MatrixMultiply3 -- a C++/ASM version of MatrixMultiply2, which takes
// advantage of Intel's SSE instructions. This version requires that
// M be in column-major order.
//
// Performance: 57 cycles/vector
void MatrixMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout)
{
    // Get a pointer to the elements of m
    float *row0 = m.Ref();

    __asm {
        mov esi, vin
        mov edi, vout

        // load columns of matrix into xmm4-7
        mov edx, row0
        movups xmm4, [edx]
        movups xmm5, [edx+0x10]
        movups xmm6, [edx+0x20]
        movups xmm7, [edx+0x30]

        // load v into xmm0.
        movups xmm0, [esi]

        // we'll store the final result in xmm2; initialize it
        // to zero
        xorps xmm2, xmm2

        // broadcast x into xmm1, multiply it by the first
        // column of the matrix (xmm4), and add it to the total
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x00
        mulps xmm1, xmm4
        addps xmm2, xmm1

        // repeat the process for y, z and w
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x55
        mulps xmm1, xmm5
        addps xmm2, xmm1

        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xAA
        mulps xmm1, xmm6
        addps xmm2, xmm1

        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xFF
        mulps xmm1, xmm7
        addps xmm2, xmm1

        // write the results to vout
        movups [edi], xmm2
    }
}

// BatchMultiply1 -- A modification to MatrixMultiply2 in which we
// multiply several input vectors (vin) by the same matrix (m), storing the
// results in 'vout'. A total of 'len' vectors are processed. This
// prevents us from having to re-load 'm' every time through the loop.
// This also allows us to embed the transpose operation into the function
// body, so we can continue to store our matrices in row-major order,
// if we wish.
//
// Performance: 32 cycles/vector
void BatchMultiply1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // transpose the matrix into the xmm4-7
    m.TransposeIntoXMM();

    static const int vecSize = sizeof(Vector4f);

    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len

BM1_START:
        // load the next input vector into xmm0, and advance the input
        // pointer
        movups xmm0, [esi]
        add esi, vecSize

        // we'll store the final result in xmm2; initialize it
        // to zero
        xorps xmm2, xmm2

        // broadcast x into xmm1, multiply it by the first
        // column of the matrix (xmm4), and add it to the total
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x00
        mulps xmm1, xmm4
        addps xmm2, xmm1

        // repeat the process for y, z and w
        movups xmm1, xmm0
        shufps xmm1, xmm1, 0x55
        mulps xmm1, xmm5
        addps xmm2, xmm1

        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xAA
        mulps xmm1, xmm6
        addps xmm2, xmm1

        movups xmm1, xmm0
        shufps xmm1, xmm1, 0xFF
        mulps xmm1, xmm7
        addps xmm2, xmm1

        // write the results to vout, and advance the output pointer
        movups [edi], xmm2
        add edi, vecSize
        dec ecx
        jnz BM1_START
    }
}

// BatchMultiply2 -- A simple modification to BatchMultiply1: we now use
// aligned moves (movaps) instead of unaligned moves (movups). This is much
// faster, but requires that Matrix4f and Vector4f objects are aligned
// on 16-byte boundaries. We use the __declspec(align(16)) specifier in
// the Matrix4f and Vector4f class definitions to accomplish this.
//
// Performance: 28 cycles/vector
void BatchMultiply2(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // transpose the matrix into the xmm4-7
    m.TransposeIntoXMM();

    static const int vecSize = sizeof(Vector4f);

    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len

BM2_START:
        // load the next input vector into xmm0, and advance the input
        // pointer
        movaps xmm0, [esi]
        add esi, vecSize

        // we'll store the final result in xmm2; initialize it
        // to zero
        xorps xmm2, xmm2

        // broadcast x into xmm1, multiply it by the first
        // column of the matrix (xmm4), and add it to the total
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0x00
        mulps xmm1, xmm4
        addps xmm2, xmm1

        // repeat the process for y, z and w
        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0x55
        mulps xmm1, xmm5
        addps xmm2, xmm1

        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0xAA
        mulps xmm1, xmm6
        addps xmm2, xmm1

        movaps xmm1, xmm0
        shufps xmm1, xmm1, 0xFF
        mulps xmm1, xmm7
        addps xmm2, xmm1

        // write the results to vout, advance the output pointer,
        // and loop
        movaps [edi], xmm2
        add edi, vecSize
        dec ecx
        jnz BM2_START
    }
}

// BatchMultiply3 -- A modification to BatchMultiply2 which makes better
// use of instruction pairing.
//
// Performance: 22 cycles/vector
void BatchMultiply3(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // transpose the matrix into the xmm4-7
    m.TransposeIntoXMM();

    static const int vecSize = sizeof(Vector4f);

    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len

BM3_START:
        // load the next input vector into xmm0, and advance the input
        // and output pointers
        movaps xmm0, [esi]
        add edi, vecSize

        // broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving
        // x in xmm0).
        movaps xmm1, xmm0
        add esi, vecSize
        movaps xmm2, xmm0
        movaps xmm3, xmm0
        shufps xmm0, xmm0, 0x00
        shufps xmm1, xmm1, 0x55
        shufps xmm2, xmm2, 0xAA
        shufps xmm3, xmm3, 0xFF

        // multiply xmm0-3 by the appropriate columns of the matrix
        mulps xmm0, xmm4
        mulps xmm1, xmm5
        mulps xmm2, xmm6
        mulps xmm3, xmm7

        // sum the results into xmm1
        addps xmm1, xmm0
        addps xmm2, xmm3
        addps xmm1, xmm2

        // write the results to vout, and loop
        movaps [edi-0x10], xmm1
        dec ecx
        jnz BM3_START
    }
}

// BatchMultiply4 -- A modification to BatchMultiply3 which uses
// SSE prefetching instructions to improve performance with large
// input sets.
//
// Performance: 21 cycles/vector
void BatchMultiply4(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // transpose the matrix into the xmm4-7
    m.TransposeIntoXMM();

    static const int vecSize = sizeof(Vector4f);

    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len

BM4_START:
        // load the next input vector into xmm0, and advance the input
        // pointer. Prefetch upcoming vectors into the cache
        movaps xmm0, [esi]
        prefetchnta [esi+0x30]

        // broadcast y into xmm1, z into xmm2, and w into xmm3 (leaving
        // x in xmm0).
        movaps xmm1, xmm0
        add esi, vecSize
        movaps xmm2, xmm0
        add edi, vecSize
        movaps xmm3, xmm0
        prefetchnta [edi+0x30]
        shufps xmm0, xmm0, 0x00
        shufps xmm1, xmm1, 0x55
        shufps xmm2, xmm2, 0xAA
        shufps xmm3, xmm3, 0xFF

        // multiply xmm0-3 by the appropriate columns of the matrix
        // (hiding a pointer increment between the multiplies)
        mulps xmm0, xmm4
        mulps xmm1, xmm5
        mulps xmm2, xmm6
        mulps xmm3, xmm7

        // sum the results into xmm1
        addps xmm1, xmm0
        addps xmm2, xmm3
        addps xmm1, xmm2

        // write the results to vout, and loop
        movaps [edi-0x10], xmm1
        dec ecx
        jnz BM4_START
    }
}

// BatchMultiply5 -- A modified version of BatchMultiply4 which loads
// vector components individually from memory, thereby allowing us
// to work on TWO VECTORS SIMULTANEOUSLY!
//
// Performance: 20 cycles/vector
void BatchMultiply5(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // initializations in C++ land
    Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
    float *row0 = mt.Ref();
    static const int vecSize = 2 * sizeof(Vector4f);

    // if there are an odd number of vectors, process the first one
    // separately and advance the pointers
    if (len & 0x1) {
        MatrixMultiply3(mt, vin, vout);
        ++vin;
        ++vout;
    }
    len >>= 1; // we process two vectors at a time

    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len

        // load columns of matrix into xmm4-7
        mov edx, row0
        movaps xmm4, [edx]
        movaps xmm5, [edx+0x10]
        movaps xmm6, [edx+0x20]
        movaps xmm7, [edx+0x30]

BM5_START:
        // process x
        movss xmm1, [esi+0x00]
        movss xmm3, [esi+0x10]
        shufps xmm1, xmm1, 0x00
        prefetchnta [esi+0x30]
        shufps xmm3, xmm3, 0x00
        mulps xmm1, xmm4
        prefetchnta [edi+0x30]
        mulps xmm3, xmm4

        // process y
        movss xmm0, [esi+0x04]
        movss xmm2, [esi+0x14]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm5
        mulps xmm2, xmm5
        addps xmm1, xmm0
        addps xmm3, xmm2

        // process z
        movss xmm0, [esi+0x08]
        movss xmm2, [esi+0x18]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm6
        mulps xmm2, xmm6
        addps xmm1, xmm0
        addps xmm3, xmm2

        // process w (hiding some pointer increments between the
        // multiplies)
        movss xmm0, [esi+0x0C]
        movss xmm2, [esi+0x1C]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm7
        add esi, vecSize
        mulps xmm2, xmm7
        add edi, vecSize
        addps xmm1, xmm0
        addps xmm3, xmm2

        // write output vectors to memory, and loop
        movaps [edi-0x20], xmm1
        movaps [edi-0x10], xmm3
        dec ecx
        jnz BM5_START
    }
}

// BatchTransform1 -- A modified version of BatchMultiply4 which makes
// an additional assumption about the vectors in vin: if each vector's
// 4th element (the homogenous coordinate w) is assumed to be 1.0 (as is
// the case for 3D vertices), we can eliminate a move, a shuffle and a
// multiply instruction.
//
// Performance: 17 cycles/vector
void BatchTransform1(Matrix4f &m, Vector4f *vin, Vector4f *vout, int len)
{
    // initializations in C++ land
    Matrix4f mt(m, Matrix4f::TRANSPOSE); // work from a transposed copy of m
    float *row0 = mt.Ref();
    static const int vecSize = 2 * sizeof(Vector4f);

    // if there are an odd number of vectors, process the first one
    // separately and advance the pointers
    if (len & 0x1) {
        MatrixMultiply3(mt, vin, vout);
        ++vin;
        ++vout;
    }
    len >>= 1; // we process two vectors at a time

    __asm {
        mov esi, vin
        mov edi, vout
        mov ecx, len

        // load columns of matrix into xmm4-7
        mov edx, row0
        movaps xmm4, [edx]
        movaps xmm5, [edx+0x10]
        movaps xmm6, [edx+0x20]
        movaps xmm7, [edx+0x30]

BT2_START:
        // process x (hiding the prefetches in the delays)
        movss xmm1, [esi+0x00]
        movss xmm3, [esi+0x10]
        shufps xmm1, xmm1, 0x00
        prefetchnta [edi+0x30]
        shufps xmm3, xmm3, 0x00
        mulps xmm1, xmm4
        prefetchnta [esi+0x30]
        mulps xmm3, xmm4

        // process y
        movss xmm0, [esi+0x04]
        movss xmm2, [esi+0x14]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm5
        mulps xmm2, xmm5
        addps xmm1, xmm0
        addps xmm3, xmm2

        // process z (hiding some pointer arithmetic between
        // the multiplies)
        movss xmm0, [esi+0x08]
        movss xmm2, [esi+0x18]
        shufps xmm0, xmm0, 0x00
        shufps xmm2, xmm2, 0x00
        mulps xmm0, xmm6
        add esi, vecSize
        mulps xmm2, xmm6
        add edi, vecSize
        addps xmm1, xmm0
        addps xmm3, xmm2

        // process w: since w == 1.0, just add the fourth column directly
        addps xmm1, xmm7
        addps xmm3, xmm7

        // write output vectors to memory and loop
        movaps [edi-0x20], xmm1
        movaps [edi-0x10], xmm3
        dec ecx
        jnz BT2_START
    }
}
Optimizing for SSE: Code Samples
40 Tips for optimizing your php Code (php代码优化提示)
internet for reed solomon code
CSS For Beginners - The Code Project - HTML /...
Optimizing the Limited Resources for More Efficiency in Teacher Development Programs in China
Emergent Properties » Optimizing MySQL and Apache for Low Memory Usage, Part 1
Aspect Oriented Programming for Benchmarking (转与 The Code Project )
Guestbook for ASP.NET - The Code Project - AS...
List of tools for static code analysis - Wiki...
Build dynamic native c code for android ? Left or Right
free online templates, samples, examples, articles, resources and tools for business training and organizational development - free downloads
Improve Your Understanding of .NET Internals by Building a Debugger for Managed Code
Java BluePrints: Guidelines, patterns, and code for end-to-end applications
A library for evaluating Excel-style formulas (转与 Code Project)
Domain-Specific Modeling for Full Code Generation(转 www.metacase.com)
Hardware - SSE Performance Programming
MMX及SSE优化
SSE指令简明参考-
A HOWTO on Optimizing PHP
Technology: SIMD / MMX / SSE / SSE2 /
ls code
SQL code
SQL code
Business Process Management Samples & Tutorials - Version 6.1