- /*
- ===========================================================================
- Doom 3 GPL Source Code
- Copyright (C) 1999-2011 id Software LLC, a ZeniMax Media company.
- This file is part of the Doom 3 GPL Source Code ("Doom 3 Source Code").
- Doom 3 Source Code is free software: you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation, either version 3 of the License, or
- (at your option) any later version.
- Doom 3 Source Code is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with Doom 3 Source Code. If not, see <http://www.gnu.org/licenses/>.
- In addition, the Doom 3 Source Code is also subject to certain additional terms. You should have received a copy of these additional terms immediately following the terms and conditions of the GNU General Public License which accompanied the Doom 3 Source Code. If not, please request a copy in writing from id Software at the address below.
- If you have questions concerning this license or the applicable additional terms, you may contact in writing id Software LLC, c/o ZeniMax Media Inc., Suite 120, Rockville, Maryland 20850 USA.
- ===========================================================================
- */
- #include "../precompiled.h"
- #pragma hdrstop
- #include "Simd_Generic.h"
- #include "Simd_MMX.h"
- #include "Simd_SSE.h"
- #include "Simd_SSE2.h"
- //===============================================================
- //
- // SSE2 implementation of idSIMDProcessor
- //
- //===============================================================
- #if defined(MACOS_X) && defined(__i386__)
- #include <xmmintrin.h>
- #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
- #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
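- // The shuffle immediates pick source lanes two bits at a time; R_SHUFFLEPS takes
- // its arguments in x, y, z, w order, so e.g. R_SHUFFLEPS( 0, 0, 0, 0 ) == 0,
- // which broadcasts lane 0 -- that is how `constant` is splatted across xmm1 below.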
- /*
- ============
- idSIMD_SSE2::GetName
- ============
- */
- const char * idSIMD_SSE2::GetName( void ) const {
- return "MMX & SSE & SSE2";
- }
- /*
- ============
- idSIMD_SSE2::CmpLT
- dst[i] |= ( src0[i] < constant ) << bitNum;
- ============
- */
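- /*
- In scalar form this is effectively what the generic implementation and the
- pre/post loops at the end of this function compute:
-
- 	for ( int i = 0; i < count; i++ ) {
- 		dst[i] |= ( src0[i] < constant ) << bitNum;
- 	}
-
- The SSE2 body below evaluates four compares per iteration and ORs four result
- bits into four consecutive dst bytes at a time.
- */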
- void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
- int i, cnt, pre, post;
- float *aligned;
- __m128 xmm0, xmm1;
- __m128i xmm0i;
- int cnt_l;
- char *src0_p;
- char *constant_p;
- char *dst_p;
- int mask_l;
- int dst_l;
-
- /* if the float array is not aligned on a 4 byte boundary */
- if ( ((int) src0) & 3 ) {
- /* unaligned memory access */
- pre = 0;
- cnt = count >> 2;
- post = count - (cnt<<2);
- /*
- __asm mov edx, cnt
- __asm test edx, edx
- __asm je doneCmp
- */
- cnt_l = cnt;
- if(cnt_l != 0) {
- /*
- __asm push ebx
- __asm neg edx
- __asm mov esi, src0
- __asm prefetchnta [esi+64]
- __asm movss xmm1, constant
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- __asm mov edi, dst
- __asm mov cl, bitNum
- */
- cnt_l = -cnt_l;
- src0_p = (char *) src0;
- _mm_prefetch(src0_p+64, _MM_HINT_NTA);
- constant_p = (char *) &constant;
- xmm1 = _mm_load_ss((float *)constant_p);
- xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
- dst_p = (char *)dst;
- /*
- __asm loopNA:
- */
- do {
- /*
- __asm movups xmm0, [esi]
- __asm prefetchnta [esi+128]
- __asm cmpltps xmm0, xmm1
- __asm movmskps eax, xmm0
- __asm mov ah, al
- __asm shr ah, 1
- __asm mov bx, ax
- __asm shl ebx, 14
- __asm mov bx, ax
- __asm and ebx, 0x01010101
- __asm shl ebx, cl
- __asm or ebx, dword ptr [edi]
- __asm mov dword ptr [edi], ebx
- __asm add esi, 16
- __asm add edi, 4
- __asm inc edx
- __asm jl loopNA
- __asm pop ebx
- */
- xmm0 = _mm_loadu_ps((float *) src0_p);
- _mm_prefetch(src0_p+128, _MM_HINT_NTA);
- xmm0 = _mm_cmplt_ps(xmm0, xmm1);
- // Simplify using SSE2
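- // cmpltps left each 32 bit lane as all ones or all zeros; two saturating packs
- // (32->16, then 16->8) collapse that to one 0xFF/0x00 byte per float in the low
- // dword, replacing the movmskps + bit-spreading sequence of the SSE-only path
- // (see the commented assembly above).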
- xmm0i = (__m128i) xmm0;
- xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
- xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
- mask_l = _mm_cvtsi128_si32(xmm0i);
- // End
- mask_l = mask_l & 0x01010101;
- mask_l = mask_l << bitNum;
- dst_l = *((int *) dst_p);
- mask_l = mask_l | dst_l;
- *((int *) dst_p) = mask_l;
- src0_p = src0_p + 16;
- dst_p = dst_p + 4;
- cnt_l = cnt_l + 1;
- } while (cnt_l < 0);
- }
- }
- else {
- /* aligned memory access */
- aligned = (float *) ((((int) src0) + 15) & ~15);
- if ( (int)aligned > ((int)src0) + count * 4 ) {	// compare against the end of the float array in bytes
- pre = count;
- post = 0;
- }
- else {
- pre = aligned - src0;
- cnt = (count - pre) >> 2;
- post = count - pre - (cnt<<2);
- /*
- __asm mov edx, cnt
- __asm test edx, edx
- __asm je doneCmp
- */
- cnt_l = cnt;
- if(cnt_l != 0) {
- /*
- __asm push ebx
- __asm neg edx
- __asm mov esi, aligned
- __asm prefetchnta [esi+64]
- __asm movss xmm1, constant
- __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )
- __asm mov edi, dst
- __asm add edi, pre
- __asm mov cl, bitNum
- */
- cnt_l = -cnt_l;
- src0_p = (char *) aligned;
- _mm_prefetch(src0_p+64, _MM_HINT_NTA);
- constant_p = (char *) &constant;
- xmm1 = _mm_load_ss((float *)constant_p);
- xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
- dst_p = (char *)dst;
- dst_p = dst_p + pre;
- /*
- __asm loopA:
- */
- do {
- /*
- __asm movaps xmm0, [esi]
- __asm prefetchnta [esi+128]
- __asm cmpltps xmm0, xmm1
- __asm movmskps eax, xmm0
- __asm mov ah, al
- __asm shr ah, 1
- __asm mov bx, ax
- __asm shl ebx, 14
- __asm mov bx, ax
- __asm and ebx, 0x01010101
- __asm shl ebx, cl
- __asm or ebx, dword ptr [edi]
- __asm mov dword ptr [edi], ebx
- __asm add esi, 16
- __asm add edi, 4
- __asm inc edx
- __asm jl loopA
- __asm pop ebx
- */
- xmm0 = _mm_load_ps((float *) src0_p);
- _mm_prefetch(src0_p+128, _MM_HINT_NTA);
- xmm0 = _mm_cmplt_ps(xmm0, xmm1);
- // Simplify using SSE2
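- // same pack trick as in the unaligned loop above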
- xmm0i = (__m128i) xmm0;
- xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
- xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
- mask_l = _mm_cvtsi128_si32(xmm0i);
- // End
- mask_l = mask_l & 0x01010101;
- mask_l = mask_l << bitNum;
- dst_l = *((int *) dst_p);
- mask_l = mask_l | dst_l;
- *((int *) dst_p) = mask_l;
- src0_p = src0_p + 16;
- dst_p = dst_p + 4;
- cnt_l = cnt_l + 1;
- } while (cnt_l < 0);
- }
- }
- }
- /*
- doneCmp:
- */
- float c = constant;
- for ( i = 0; i < pre; i++ ) {
- dst[i] |= ( src0[i] < c ) << bitNum;
- }
- for ( i = count - post; i < count; i++ ) {
- dst[i] |= ( src0[i] < c ) << bitNum;
- }
- }
- #elif defined(_WIN32)
- #include <xmmintrin.h>
- #define SHUFFLEPS( x, y, z, w ) (( (x) & 3 ) << 6 | ( (y) & 3 ) << 4 | ( (z) & 3 ) << 2 | ( (w) & 3 ))
- #define R_SHUFFLEPS( x, y, z, w ) (( (w) & 3 ) << 6 | ( (z) & 3 ) << 4 | ( (y) & 3 ) << 2 | ( (x) & 3 ))
- #define SHUFFLEPD( x, y ) (( (x) & 1 ) << 1 | ( (y) & 1 ))
- #define R_SHUFFLEPD( x, y ) (( (y) & 1 ) << 1 | ( (x) & 1 ))
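- // SHUFFLEPD/R_SHUFFLEPD build the two bit shufpd immediate the same way; e.g.
- // R_SHUFFLEPD( 1, 0 ) == 1 swaps the two doubles and R_SHUFFLEPD( 1, 1 ) == 3
- // broadcasts the upper double, which is how the code below reduces a packed
- // double accumulator down to a single sum for addsd.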
- #define ALIGN4_INIT1( X, INIT ) ALIGN16( static X[4] ) = { INIT, INIT, INIT, INIT }
- #define ALIGN4_INIT4( X, I0, I1, I2, I3 ) ALIGN16( static X[4] ) = { I0, I1, I2, I3 }
- #define ALIGN8_INIT1( X, INIT ) ALIGN16( static X[8] ) = { INIT, INIT, INIT, INIT, INIT, INIT, INIT, INIT }
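- // The ALIGN*_INIT* macros emit 16 byte aligned static constant tables (four
- // dwords/floats or eight words) so SSE code can reference them with aligned
- // packed loads.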
- ALIGN8_INIT1( unsigned short SIMD_W_zero, 0 );
- ALIGN8_INIT1( unsigned short SIMD_W_maxShort, 1<<15 );
- ALIGN4_INIT4( unsigned long SIMD_SP_singleSignBitMask, (unsigned long) ( 1 << 31 ), 0, 0, 0 );
- ALIGN4_INIT1( unsigned long SIMD_SP_signBitMask, (unsigned long) ( 1 << 31 ) );
- ALIGN4_INIT1( unsigned long SIMD_SP_absMask, (unsigned long) ~( 1 << 31 ) );
- ALIGN4_INIT1( unsigned long SIMD_SP_infinityMask, (unsigned long) ~( 1 << 23 ) );
- ALIGN4_INIT1( float SIMD_SP_zero, 0.0f );
- ALIGN4_INIT1( float SIMD_SP_one, 1.0f );
- ALIGN4_INIT1( float SIMD_SP_two, 2.0f );
- ALIGN4_INIT1( float SIMD_SP_three, 3.0f );
- ALIGN4_INIT1( float SIMD_SP_four, 4.0f );
- ALIGN4_INIT1( float SIMD_SP_maxShort, (1<<15) );
- ALIGN4_INIT1( float SIMD_SP_tiny, 1e-10f );
- ALIGN4_INIT1( float SIMD_SP_PI, idMath::PI );
- ALIGN4_INIT1( float SIMD_SP_halfPI, idMath::HALF_PI );
- ALIGN4_INIT1( float SIMD_SP_twoPI, idMath::TWO_PI );
- ALIGN4_INIT1( float SIMD_SP_oneOverTwoPI, 1.0f / idMath::TWO_PI );
- ALIGN4_INIT1( float SIMD_SP_infinity, idMath::INFINITY );
- /*
- ============
- idSIMD_SSE2::GetName
- ============
- */
- const char * idSIMD_SSE2::GetName( void ) const {
- return "MMX & SSE & SSE2";
- }
- #if 0 // the SSE2 code is ungodly slow
- /*
- ============
- idSIMD_SSE2::MatX_LowerTriangularSolve
- solves x in Lx = b for the n * n sub-matrix of L
- if skip > 0 the first skip elements of x are assumed to be valid already
- L has to be a lower triangular matrix with (implicit) ones on the diagonal
- x == b is allowed
- ============
- */
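- /*
- In scalar form the routine computes, for each row i starting at skip (the
- diagonal entries of L are implicitly 1, so there is no divide):
-
- 	x[i] = b[i] - L[i][0] * x[0] - L[i][1] * x[1] - ... - L[i][i-1] * x[i-1]
-
- which is roughly:
-
- 	for ( int i = skip; i < n; i++ ) {
- 		float s = b[i];
- 		for ( int j = 0; j < i; j++ ) {
- 			s -= L[i][j] * x[j];
- 		}
- 		x[i] = s;
- 	}
-
- The assembly below widens the operands to double precision with cvtps2pd and
- unrolls the dot product in blocks of 16, 8 and 4 elements.
- */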
- void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolve( const idMatX &L, float *x, const float *b, const int n, int skip ) {
- int nc;
- const float *lptr;
- if ( skip >= n ) {
- return;
- }
- lptr = L[skip];
- nc = L.GetNumColumns();
- // unrolled cases for n < 8
- if ( n < 8 ) {
- #define NSKIP( n, s ) ((n<<3)|(s&7))
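- // NSKIP packs n (upper bits) and skip (lower 3 bits) into one switch key; the
- // cases below intentionally fall through so a non-zero skip just resumes the
- // substitution at row skip.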
- switch( NSKIP( n, skip ) ) {
- case NSKIP( 1, 0 ): x[0] = b[0];
- return;
- case NSKIP( 2, 0 ): x[0] = b[0];
- case NSKIP( 2, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- return;
- case NSKIP( 3, 0 ): x[0] = b[0];
- case NSKIP( 3, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 3, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- return;
- case NSKIP( 4, 0 ): x[0] = b[0];
- case NSKIP( 4, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 4, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 4, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- return;
- case NSKIP( 5, 0 ): x[0] = b[0];
- case NSKIP( 5, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 5, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 5, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 5, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- return;
- case NSKIP( 6, 0 ): x[0] = b[0];
- case NSKIP( 6, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 6, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 6, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 6, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- case NSKIP( 6, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
- return;
- case NSKIP( 7, 0 ): x[0] = b[0];
- case NSKIP( 7, 1 ): x[1] = b[1] - lptr[1*nc+0] * x[0];
- case NSKIP( 7, 2 ): x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case NSKIP( 7, 3 ): x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- case NSKIP( 7, 4 ): x[4] = b[4] - lptr[4*nc+0] * x[0] - lptr[4*nc+1] * x[1] - lptr[4*nc+2] * x[2] - lptr[4*nc+3] * x[3];
- case NSKIP( 7, 5 ): x[5] = b[5] - lptr[5*nc+0] * x[0] - lptr[5*nc+1] * x[1] - lptr[5*nc+2] * x[2] - lptr[5*nc+3] * x[3] - lptr[5*nc+4] * x[4];
- case NSKIP( 7, 6 ): x[6] = b[6] - lptr[6*nc+0] * x[0] - lptr[6*nc+1] * x[1] - lptr[6*nc+2] * x[2] - lptr[6*nc+3] * x[3] - lptr[6*nc+4] * x[4] - lptr[6*nc+5] * x[5];
- return;
- }
- return;
- }
- // process first 4 rows
- switch( skip ) {
- case 0: x[0] = b[0];
- case 1: x[1] = b[1] - lptr[1*nc+0] * x[0];
- case 2: x[2] = b[2] - lptr[2*nc+0] * x[0] - lptr[2*nc+1] * x[1];
- case 3: x[3] = b[3] - lptr[3*nc+0] * x[0] - lptr[3*nc+1] * x[1] - lptr[3*nc+2] * x[2];
- skip = 4;
- }
- lptr = L[skip];
- __asm {
- push ebx
- mov eax, skip // eax = i
- shl eax, 2 // eax = i*4
- mov edx, n // edx = n
- shl edx, 2 // edx = n*4
- mov esi, x // esi = x
- mov edi, lptr // edi = lptr
- add esi, eax
- add edi, eax
- mov ebx, b // ebx = b
- // aligned
- looprow:
- mov ecx, eax
- neg ecx
- cvtps2pd xmm0, [esi+ecx]
- cvtps2pd xmm2, [edi+ecx]
- mulpd xmm0, xmm2
- cvtps2pd xmm1, [esi+ecx+8]
- cvtps2pd xmm3, [edi+ecx+8]
- mulpd xmm1, xmm3
- add ecx, 20*4
- jg donedot16
- dot16:
- cvtps2pd xmm2, [esi+ecx-(16*4)]
- cvtps2pd xmm3, [edi+ecx-(16*4)]
- cvtps2pd xmm4, [esi+ecx-(14*4)]
- mulpd xmm2, xmm3
- cvtps2pd xmm5, [edi+ecx-(14*4)]
- addpd xmm0, xmm2
- cvtps2pd xmm2, [esi+ecx-(12*4)]
- mulpd xmm4, xmm5
- cvtps2pd xmm3, [edi+ecx-(12*4)]
- addpd xmm1, xmm4
- cvtps2pd xmm4, [esi+ecx-(10*4)]
- mulpd xmm2, xmm3
- cvtps2pd xmm5, [edi+ecx-(10*4)]
- addpd xmm0, xmm2
- cvtps2pd xmm2, [esi+ecx-(8*4)]
- mulpd xmm4, xmm5
- cvtps2pd xmm3, [edi+ecx-(8*4)]
- addpd xmm1, xmm4
- cvtps2pd xmm4, [esi+ecx-(6*4)]
- mulpd xmm2, xmm3
- cvtps2pd xmm5, [edi+ecx-(6*4)]
- addpd xmm0, xmm2
- cvtps2pd xmm2, [esi+ecx-(4*4)]
- mulpd xmm4, xmm5
- cvtps2pd xmm3, [edi+ecx-(4*4)]
- addpd xmm1, xmm4
- cvtps2pd xmm4, [esi+ecx-(2*4)]
- mulpd xmm2, xmm3
- cvtps2pd xmm5, [edi+ecx-(2*4)]
- addpd xmm0, xmm2
- add ecx, 16*4
- mulpd xmm4, xmm5
- addpd xmm1, xmm4
- jle dot16
- donedot16:
- sub ecx, 8*4
- jg donedot8
- dot8:
- cvtps2pd xmm2, [esi+ecx-(8*4)]
- cvtps2pd xmm3, [edi+ecx-(8*4)]
- cvtps2pd xmm7, [esi+ecx-(6*4)]
- mulpd xmm2, xmm3
- cvtps2pd xmm5, [edi+ecx-(6*4)]
- addpd xmm0, xmm2
- cvtps2pd xmm6, [esi+ecx-(4*4)]
- mulpd xmm7, xmm5
- cvtps2pd xmm3, [edi+ecx-(4*4)]
- addpd xmm1, xmm7
- cvtps2pd xmm4, [esi+ecx-(2*4)]
- mulpd xmm6, xmm3
- cvtps2pd xmm7, [edi+ecx-(2*4)]
- addpd xmm0, xmm6
- add ecx, 8*4
- mulpd xmm4, xmm7
- addpd xmm1, xmm4
- donedot8:
- sub ecx, 4*4
- jg donedot4
- dot4:
- cvtps2pd xmm2, [esi+ecx-(4*4)]
- cvtps2pd xmm3, [edi+ecx-(4*4)]
- cvtps2pd xmm4, [esi+ecx-(2*4)]
- mulpd xmm2, xmm3
- cvtps2pd xmm5, [edi+ecx-(2*4)]
- addpd xmm0, xmm2
- add ecx, 4*4
- mulpd xmm4, xmm5
- addpd xmm1, xmm4
- donedot4:
- addpd xmm0, xmm1
- movaps xmm1, xmm0
- shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 0 )
- addsd xmm0, xmm1
- sub ecx, 4*4
- jz dot0
- add ecx, 4
- jz dot1
- add ecx, 4
- jz dot2
- //dot3:
- cvtss2sd xmm1, [esi-(3*4)]
- cvtss2sd xmm2, [edi-(3*4)]
- mulsd xmm1, xmm2
- addsd xmm0, xmm1
- dot2:
- cvtss2sd xmm3, [esi-(2*4)]
- cvtss2sd xmm4, [edi-(2*4)]
- mulsd xmm3, xmm4
- addsd xmm0, xmm3
- dot1:
- cvtss2sd xmm5, [esi-(1*4)]
- cvtss2sd xmm6, [edi-(1*4)]
- mulsd xmm5, xmm6
- addsd xmm0, xmm5
- dot0:
- cvtss2sd xmm1, [ebx+eax]
- subsd xmm1, xmm0
- cvtsd2ss xmm0, xmm1
- movss [esi], xmm0
- add eax, 4
- cmp eax, edx
- jge done
- add esi, 4
- mov ecx, nc
- shl ecx, 2
- add edi, ecx
- add edi, 4
- jmp looprow
- // done
- done:
- pop ebx
- }
- }
- /*
- ============
- idSIMD_SSE2::MatX_LowerTriangularSolveTranspose
- solves x in L'x = b for the n * n sub-matrix of L
- L has to be a lower triangular matrix with (implicit) ones on the diagonal
- x == b is allowed
- ============
- */
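- /*
- This is the transposed (backward) substitution; in scalar form, again with an
- implicit unit diagonal:
-
- 	x[i] = b[i] - L[i+1][i] * x[i+1] - L[i+2][i] * x[i+2] - ... - L[n-1][i] * x[n-1]
-
- which is roughly:
-
- 	for ( int i = n - 1; i >= 0; i-- ) {
- 		float s = b[i];
- 		for ( int j = i + 1; j < n; j++ ) {
- 			s -= L[j][i] * x[j];
- 		}
- 		x[i] = s;
- 	}
-
- The unrolled cases below follow this pattern; the assembly processes four rows
- at a time in double precision, walking the columns of L from the bottom up.
- */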
- void VPCALL idSIMD_SSE2::MatX_LowerTriangularSolveTranspose( const idMatX &L, float *x, const float *b, const int n ) {
- int nc;
- const float *lptr;
- lptr = L.ToFloatPtr();
- nc = L.GetNumColumns();
- // unrolled cases for n < 8
- if ( n < 8 ) {
- switch( n ) {
- case 0:
- return;
- case 1:
- x[0] = b[0];
- return;
- case 2:
- x[1] = b[1];
- x[0] = b[0] - lptr[1*nc+0] * x[1];
- return;
- case 3:
- x[2] = b[2];
- x[1] = b[1] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 4:
- x[3] = b[3];
- x[2] = b[2] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 5:
- x[4] = b[4];
- x[3] = b[3] - lptr[4*nc+3] * x[4];
- x[2] = b[2] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 6:
- x[5] = b[5];
- x[4] = b[4] - lptr[5*nc+4] * x[5];
- x[3] = b[3] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
- x[2] = b[2] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- case 7:
- x[6] = b[6];
- x[5] = b[5] - lptr[6*nc+5] * x[6];
- x[4] = b[4] - lptr[6*nc+4] * x[6] - lptr[5*nc+4] * x[5];
- x[3] = b[3] - lptr[6*nc+3] * x[6] - lptr[5*nc+3] * x[5] - lptr[4*nc+3] * x[4];
- x[2] = b[2] - lptr[6*nc+2] * x[6] - lptr[5*nc+2] * x[5] - lptr[4*nc+2] * x[4] - lptr[3*nc+2] * x[3];
- x[1] = b[1] - lptr[6*nc+1] * x[6] - lptr[5*nc+1] * x[5] - lptr[4*nc+1] * x[4] - lptr[3*nc+1] * x[3] - lptr[2*nc+1] * x[2];
- x[0] = b[0] - lptr[6*nc+0] * x[6] - lptr[5*nc+0] * x[5] - lptr[4*nc+0] * x[4] - lptr[3*nc+0] * x[3] - lptr[2*nc+0] * x[2] - lptr[1*nc+0] * x[1];
- return;
- }
- return;
- }
- int i, j, m;
- float *xptr;
- double s0;
- // if the number of columns is not a multiple of 2 we're screwed for alignment.
- // however, if the number of columns is a multiple of 2 but the number of to be
- // processed rows is not a multiple of 2 we can still run 8 byte aligned
- m = n;
- if ( m & 1 ) {
- m--;
- x[m] = b[m];
- lptr = L[m] + m - 4;
- xptr = x + m;
- __asm {
- push ebx
- mov eax, m // eax = i
- mov esi, xptr // esi = xptr
- mov edi, lptr // edi = lptr
- mov ebx, b // ebx = b
- mov edx, nc // edx = nc*sizeof(float)
- shl edx, 2
- process4rows_1:
- cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
- cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-2], b[i-1]
- xor ecx, ecx
- sub eax, m
- neg eax
- jz done4x4_1
- process4x4_1: // process 4x4 blocks
- cvtps2pd xmm3, [edi]
- cvtps2pd xmm4, [edi+8]
- add edi, edx
- cvtss2sd xmm5, [esi+4*ecx+0]
- shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
- mulpd xmm3, xmm5
- cvtps2pd xmm1, [edi]
- mulpd xmm4, xmm5
- cvtps2pd xmm6, [edi+8]
- subpd xmm0, xmm3
- subpd xmm2, xmm4
- add edi, edx
- cvtss2sd xmm7, [esi+4*ecx+4]
- shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
- mulpd xmm1, xmm7
- cvtps2pd xmm3, [edi]
- mulpd xmm6, xmm7
- cvtps2pd xmm4, [edi+8]
- subpd xmm0, xmm1
- subpd xmm2, xmm6
- add edi, edx
- cvtss2sd xmm5, [esi+4*ecx+8]
- shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
- mulpd xmm3, xmm5
- cvtps2pd xmm1, [edi]
- mulpd xmm4, xmm5
- cvtps2pd xmm6, [edi+8]
- subpd xmm0, xmm3
- subpd xmm2, xmm4
- add edi, edx
- cvtss2sd xmm7, [esi+4*ecx+12]
- shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
- mulpd xmm1, xmm7
- add ecx, 4
- mulpd xmm6, xmm7
- cmp ecx, eax
- subpd xmm0, xmm1
- subpd xmm2, xmm6
- jl process4x4_1
- done4x4_1: // process left over of the 4 rows
- cvtps2pd xmm3, [edi]
- cvtps2pd xmm4, [edi+8]
- cvtss2sd xmm5, [esi+4*ecx]
- shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
- mulpd xmm3, xmm5
- mulpd xmm4, xmm5
- subpd xmm0, xmm3
- subpd xmm2, xmm4
- imul ecx, edx
- sub edi, ecx
- neg eax
- add eax, m
- sub eax, 4
- movapd xmm1, xmm0
- shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
- movapd xmm3, xmm2
- shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
- sub edi, edx
- cvtsd2ss xmm7, xmm3
- movss [esi-4], xmm7 // xptr[-1] = s3
- movsd xmm4, xmm3
- movsd xmm5, xmm3
- cvtss2sd xmm7, [edi+8]
- mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
- cvtss2sd xmm7, [edi+4]
- mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
- cvtss2sd xmm7, [edi]
- mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
- subsd xmm2, xmm3
- cvtsd2ss xmm7, xmm2
- movss [esi-8], xmm7 // xptr[-2] = s2
- movsd xmm6, xmm2
- sub edi, edx
- subsd xmm0, xmm5
- subsd xmm1, xmm4
- cvtss2sd xmm7, [edi+4]
- mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
- cvtss2sd xmm7, [edi]
- mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
- subsd xmm1, xmm2
- cvtsd2ss xmm7, xmm1
- movss [esi-12], xmm7 // xptr[-3] = s1
- subsd xmm0, xmm6
- sub edi, edx
- cmp eax, 4
- cvtss2sd xmm7, [edi]
- mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
- subsd xmm0, xmm1
- cvtsd2ss xmm7, xmm0
- movss [esi-16], xmm7 // xptr[-4] = s0
- jl done4rows_1
- sub edi, edx
- sub edi, 16
- sub esi, 16
- jmp process4rows_1
- done4rows_1:
- pop ebx
- }
- }
- else {
- lptr = L.ToFloatPtr() + m * L.GetNumColumns() + m - 4;
- xptr = x + m;
- __asm {
- push ebx
- mov eax, m // eax = i
- mov esi, xptr // esi = xptr
- mov edi, lptr // edi = lptr
- mov ebx, b // ebx = b
- mov edx, nc // edx = nc*sizeof(float)
- shl edx, 2
- process4rows:
- cvtps2pd xmm0, [ebx+eax*4-16] // load b[i-4], b[i-3]
- cvtps2pd xmm2, [ebx+eax*4-8] // load b[i-2], b[i-1]
- sub eax, m
- jz done4x4
- neg eax
- xor ecx, ecx
- process4x4: // process 4x4 blocks
- cvtps2pd xmm3, [edi]
- cvtps2pd xmm4, [edi+8]
- add edi, edx
- cvtss2sd xmm5, [esi+4*ecx+0]
- shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
- mulpd xmm3, xmm5
- cvtps2pd xmm1, [edi]
- mulpd xmm4, xmm5
- cvtps2pd xmm6, [edi+8]
- subpd xmm0, xmm3
- subpd xmm2, xmm4
- add edi, edx
- cvtss2sd xmm7, [esi+4*ecx+4]
- shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
- mulpd xmm1, xmm7
- cvtps2pd xmm3, [edi]
- mulpd xmm6, xmm7
- cvtps2pd xmm4, [edi+8]
- subpd xmm0, xmm1
- subpd xmm2, xmm6
- add edi, edx
- cvtss2sd xmm5, [esi+4*ecx+8]
- shufpd xmm5, xmm5, R_SHUFFLEPD( 0, 0 )
- mulpd xmm3, xmm5
- cvtps2pd xmm1, [edi]
- mulpd xmm4, xmm5
- cvtps2pd xmm6, [edi+8]
- subpd xmm0, xmm3
- subpd xmm2, xmm4
- add edi, edx
- cvtss2sd xmm7, [esi+4*ecx+12]
- shufpd xmm7, xmm7, R_SHUFFLEPD( 0, 0 )
- mulpd xmm1, xmm7
- add ecx, 4
- mulpd xmm6, xmm7
- cmp ecx, eax
- subpd xmm0, xmm1
- subpd xmm2, xmm6
- jl process4x4
- imul ecx, edx
- sub edi, ecx
- neg eax
- done4x4: // process left over of the 4 rows
- add eax, m
- sub eax, 4
- movapd xmm1, xmm0
- shufpd xmm1, xmm1, R_SHUFFLEPD( 1, 1 )
- movapd xmm3, xmm2
- shufpd xmm3, xmm3, R_SHUFFLEPD( 1, 1 )
- sub edi, edx
- cvtsd2ss xmm7, xmm3
- movss [esi-4], xmm7 // xptr[-1] = s3
- movsd xmm4, xmm3
- movsd xmm5, xmm3
- cvtss2sd xmm7, [edi+8]
- mulsd xmm3, xmm7 // lptr[-1*nc+2] * s3
- cvtss2sd xmm7, [edi+4]
- mulsd xmm4, xmm7 // lptr[-1*nc+1] * s3
- cvtss2sd xmm7, [edi]
- mulsd xmm5, xmm7 // lptr[-1*nc+0] * s3
- subsd xmm2, xmm3
- cvtsd2ss xmm7, xmm2
- movss [esi-8], xmm7 // xptr[-2] = s2
- movsd xmm6, xmm2
- sub edi, edx
- subsd xmm0, xmm5
- subsd xmm1, xmm4
- cvtss2sd xmm7, [edi+4]
- mulsd xmm2, xmm7 // lptr[-2*nc+1] * s2
- cvtss2sd xmm7, [edi]
- mulsd xmm6, xmm7 // lptr[-2*nc+0] * s2
- subsd xmm1, xmm2
- cvtsd2ss xmm7, xmm1
- movss [esi-12], xmm7 // xptr[-3] = s1
- subsd xmm0, xmm6
- sub edi, edx
- cmp eax, 4
- cvtss2sd xmm7, [edi]
- mulsd xmm1, xmm7 // lptr[-3*nc+0] * s1
- subsd xmm0, xmm1
- cvtsd2ss xmm7, xmm0
- movss [esi-16], xmm7 // xptr[-4] = s0
- jl done4rows
- sub edi, edx
- sub edi, 16
- sub esi, 16
- jmp process4rows
- done4rows:
- pop ebx
- }
- }
- // process left over rows
- for ( i = (m&3)-1; i >= 0; i-- ) {
- s0 = b[i];
- lptr = L[i+1] + i;
- for ( j = i + 1; j < m; j++ ) {
- s0 -= lptr[0] * x[j];
- lptr += nc;
- }
- x[i] = s0;
- }
- }
- #endif
- /*
- ============
- idSIMD_SSE2::MixedSoundToSamples
- ============
- */
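- /*
- Converts the float mix buffer to signed 16 bit samples. Per element this is
- approximately:
-
- 	int s = (int) mixBuffer[i];	// cvtps2dq rounds per MXCSR (round to nearest by default)
- 	samples[i] = s > 32767 ? 32767 : ( s < -32768 ? -32768 : s );	// packssdw saturates
-
- The assembly converts 16 floats per iteration and relies on packssdw for the
- clamping.
- */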
- void VPCALL idSIMD_SSE2::MixedSoundToSamples( short *samples, const float *mixBuffer, const int numSamples ) {
- assert( ( numSamples % MIXBUFFER_SAMPLES ) == 0 );
- __asm {
- mov eax, numSamples
- mov edi, mixBuffer
- mov esi, samples
- shl eax, 2
- add edi, eax
- neg eax
- loop16:
- movaps xmm0, [edi+eax+0*16]
- movaps xmm1, [edi+eax+1*16]
- movaps xmm2, [edi+eax+2*16]
- movaps xmm3, [edi+eax+3*16]
- add esi, 4*4*2
- cvtps2dq xmm4, xmm0
- cvtps2dq xmm5, xmm1
- cvtps2dq xmm6, xmm2
- cvtps2dq xmm7, xmm3
- prefetchnta [edi+eax+128]
- packssdw xmm4, xmm5
- packssdw xmm6, xmm7
- add eax, 4*16
- movlps [esi-4*4*2], xmm4 // FIXME: should not use movlps/movhps to move integer data
- movhps [esi-3*4*2], xmm4
- movlps [esi-2*4*2], xmm6
- movhps [esi-1*4*2], xmm6
- jl loop16
- }
- }
- #endif /* _WIN32 */