27 #if defined(_XBOX) // SIMD intrinsics 28 #include <ppcintrinsics.h> 30 #include <emmintrin.h> 36 #if !defined(DSPASSERT) 38 #define DSPASSERT(exp) if (!(exp)) { OutputDebugStringA("XDSP ASSERT: " #exp ", {" __FUNCTION__ "}\n"); __debugbreak(); } 40 #define DSPASSERT(exp) __assume(exp) 45 #if !defined(ISPOWEROF2) 46 #define ISPOWEROF2(n) ( ((n)&((n)-1)) == 0 && (n) != 0 ) 53 #pragma warning(disable: 4328 4640) // disable "indirection alignment of formal parameter", "construction of local static object is not thread-safe" compile warnings 74 rResult = _mm_sub_ps(vr1r2, vi1i2);
75 iResult = _mm_add_ps(vr1i2, vr2i1);
84 r1 = _mm_sub_ps(vr1r2, vi1i2);
85 i1 = _mm_add_ps(vr1i2, vr2i1);
120 const static XVECTOR vDFT4SignBits1 = { 0.0f, -0.0f, 0.0f, -0.0f };
121 const static XVECTOR vDFT4SignBits2 = { 0.0f, 0.0f, -0.0f, -0.0f };
122 const static XVECTOR vDFT4SignBits3 = { 0.0f, -0.0f, -0.0f, 0.0f };
126 XVECTOR rTemp = _mm_add_ps( _mm_shuffle_ps(
r1,
r1, _MM_SHUFFLE(1, 1, 0, 0)),
127 _mm_xor_ps(_mm_shuffle_ps(
r1,
r1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) );
128 XVECTOR iTemp = _mm_add_ps( _mm_shuffle_ps(
i1,
i1, _MM_SHUFFLE(1, 1, 0, 0)),
129 _mm_xor_ps(_mm_shuffle_ps(
i1,
i1, _MM_SHUFFLE(3, 3, 2, 2)), vDFT4SignBits1) );
132 XVECTOR rZrWiZiW = _mm_shuffle_ps(rTemp, iTemp, _MM_SHUFFLE(3, 2, 3, 2));
133 XVECTOR rZiWrZiW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(3, 0, 3, 0));
134 XVECTOR iZrWiZrW = _mm_shuffle_ps(rZrWiZiW, rZrWiZiW, _MM_SHUFFLE(1, 2, 1, 2));
135 r1 = _mm_add_ps( _mm_shuffle_ps(rTemp, rTemp, _MM_SHUFFLE(1, 0, 1, 0)),
136 _mm_xor_ps(rZiWrZiW, vDFT4SignBits2) );
137 i1 = _mm_add_ps( _mm_shuffle_ps(iTemp, iTemp, _MM_SHUFFLE(1, 0, 1, 0)),
138 _mm_xor_ps(iZrWiZrW, vDFT4SignBits3) );
178 XVECTOR rTemp0, rTemp1, rTemp2, rTemp3, rTemp4, rTemp5, rTemp6, rTemp7;
179 XVECTOR iTemp0, iTemp1, iTemp2, iTemp3, iTemp4, iTemp5, iTemp6, iTemp7;
183 rTemp0 = _mm_add_ps(r0,
r2); iTemp0 = _mm_add_ps(i0,
i2);
184 rTemp2 = _mm_add_ps(
r1,
r3); iTemp2 = _mm_add_ps(
i1,
i3);
185 rTemp1 = _mm_sub_ps(r0,
r2); iTemp1 = _mm_sub_ps(i0,
i2);
186 rTemp3 = _mm_sub_ps(
r1,
r3); iTemp3 = _mm_sub_ps(
i1,
i3);
187 rTemp4 = _mm_add_ps(rTemp0, rTemp2); iTemp4 = _mm_add_ps(iTemp0, iTemp2);
188 rTemp5 = _mm_add_ps(rTemp1, iTemp3); iTemp5 = _mm_sub_ps(iTemp1, rTemp3);
189 rTemp6 = _mm_sub_ps(rTemp0, rTemp2); iTemp6 = _mm_sub_ps(iTemp0, iTemp2);
190 rTemp7 = _mm_sub_ps(rTemp1, iTemp3); iTemp7 = _mm_add_ps(iTemp1, rTemp3);
194 vmulComplex(rTemp5, iTemp5, pUnityTableReal[uStride], pUnityTableImaginary[uStride]);
195 vmulComplex(rTemp6, iTemp6, pUnityTableReal[uStride*2], pUnityTableImaginary[uStride*2]);
196 vmulComplex(rTemp7, iTemp7, pUnityTableReal[uStride*3], pUnityTableImaginary[uStride*3]);
205 r0 = rTemp4; i0 = iTemp4;
206 r1 = rTemp5;
i1 = iTemp5;
207 r2 = rTemp6;
i2 = iTemp6;
208 r3 = rTemp7;
i3 = iTemp7;
225 __forceinline
void FFT4 (__inout_ecount(uCount)
XVECTOR* __restrict pReal, __inout_ecount(uCount)
XVECTOR* __restrict pImaginary,
const UINT32 uCount=1)
233 for (
UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
252 __forceinline
void FFT8 (__inout_ecount(uCount*2)
XVECTOR* __restrict pReal, __inout_ecount(uCount*2)
XVECTOR* __restrict pImaginary,
const UINT32 uCount=1)
260 static XVECTOR wr1 = { 1.0f, 0.70710677f, 0.0f, -0.70710677f };
261 static XVECTOR wi1 = { 0.0f, -0.70710677f, -1.0f, -0.70710677f };
262 static XVECTOR wr2 = { -1.0f, -0.70710677f, 0.0f, 0.70710677f };
263 static XVECTOR wi2 = { 0.0f, 0.70710677f, 1.0f, 0.70710677f };
266 for (
UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
267 XVECTOR* __restrict pR = pReal + uIndex*2;
268 XVECTOR* __restrict pI = pImaginary + uIndex*2;
270 XVECTOR oddsR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(3, 1, 3, 1));
271 XVECTOR evensR = _mm_shuffle_ps(pR[0], pR[1], _MM_SHUFFLE(2, 0, 2, 0));
272 XVECTOR oddsI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(3, 1, 3, 1));
273 XVECTOR evensI = _mm_shuffle_ps(pI[0], pI[1], _MM_SHUFFLE(2, 0, 2, 0));
279 pR[0] = _mm_add_ps(evensR,
r);
280 pI[0] = _mm_add_ps(evensI, i);
283 pR[1] = _mm_add_ps(evensR,
r);
284 pI[1] = _mm_add_ps(evensI, i);
302 __forceinline
void FFT16 (__inout_ecount(uCount*4)
XVECTOR* __restrict pReal, __inout_ecount(uCount*4)
XVECTOR* __restrict pImaginary,
const UINT32 uCount=1)
310 XVECTOR aUnityTableReal[4] = { 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 0.92387950f, 0.70710677f, 0.38268343f, 1.0f, 0.70710677f, -4.3711388e-008
f, -0.70710677f, 1.0f, 0.38268343f, -0.70710677f, -0.92387950f };
311 XVECTOR aUnityTableImaginary[4] = { -0.0f, -0.0f, -0.0f, -0.0f, -0.0f, -0.38268343f, -0.70710677f, -0.92387950f, -0.0f, -0.70710677f, -1.0f, -0.70710677f, -0.0f, -0.92387950f, -0.70710677f, 0.38268343f };
314 for (
UINT32 uIndex=0; uIndex<uCount; ++uIndex) {
319 pImaginary[uIndex*4],
320 pImaginary[uIndex*4 + 1],
321 pImaginary[uIndex*4 + 2],
322 pImaginary[uIndex*4 + 3],
324 aUnityTableImaginary,
348 inline void FFT (__inout_ecount((uLength*uCount)/4)
XVECTOR* __restrict pReal, __inout_ecount((uLength*uCount)/4)
XVECTOR* __restrict pImaginary,
__in_ecount(uLength*uCount)
const XVECTOR* __restrict pUnityTable,
const UINT32 uLength,
const UINT32 uCount=1)
360 const XVECTOR* __restrict pUnityTableReal = pUnityTable;
361 const XVECTOR* __restrict pUnityTableImaginary = pUnityTable + (uLength>>2);
362 const UINT32 uTotal = uCount * uLength;
363 const UINT32 uTotal_vectors = uTotal >> 2;
364 const UINT32 uStage_vectors = uLength >> 2;
365 const UINT32 uStage_vectors_mask = uStage_vectors - 1;
366 const UINT32 uStride = uLength >> 4;
367 const UINT32 uStrideMask = uStride - 1;
368 const UINT32 uStride2 = uStride * 2;
369 const UINT32 uStride3 = uStride * 3;
370 const UINT32 uStrideInvMask = ~uStrideMask;
373 for (
UINT32 uIndex=0; uIndex<(uTotal_vectors>>2); ++uIndex) {
374 const UINT32 n = ((uIndex & uStrideInvMask) << 2) + (uIndex & uStrideMask);
380 pImaginary[
n + uStride],
381 pImaginary[
n + uStride2],
382 pImaginary[
n + uStride3],
383 pUnityTableReal + (
n & uStage_vectors_mask),
384 pUnityTableImaginary + (
n & uStage_vectors_mask),
389 if (uLength > 16*4) {
390 FFT(pReal, pImaginary, pUnityTable+(uLength>>1), uLength>>2, uCount*4);
391 }
else if (uLength == 16*4) {
392 FFT16(pReal, pImaginary, uCount*4);
393 }
else if (uLength == 8*4) {
394 FFT8(pReal, pImaginary, uCount*4);
395 }
else if (uLength == 4*4) {
396 FFT4(pReal, pImaginary, uCount*4);
424 FLOAT32* __restrict pfUnityTable = (
FLOAT32* __restrict)pUnityTable;
429 FLOAT32 flStep = 6.283185307f / uLength;
434 for (
UINT32 i=0; i<4; ++i) {
436 UINT32 uIndex = (i*uLength) +
j;
438 pfUnityTable[uIndex + uLength*4] = -sinf(
FLOAT32(i)*
FLOAT32(
j)*flStep);
441 pfUnityTable += uLength*8;
442 }
while (uLength > 16);
468 const FLOAT32* __restrict pfInput = (
const FLOAT32* __restrict)pInput;
472 if ((uLog2Length & 0x1) == 0) {
474 for (
UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
476 n = ( (
n & 0xcccccccc) >> 2 ) | ( (
n & 0x33333333) << 2 );
477 n = ( (
n & 0xf0f0f0f0) >> 4 ) | ( (
n & 0x0f0f0f0f) << 4 );
478 n = ( (
n & 0xff00ff00) >> 8 ) | ( (
n & 0x00ff00ff) << 8 );
479 n = ( (
n & 0xffff0000) >> 16 ) | ( (
n & 0x0000ffff) << 16 );
480 n >>= (32 - uLog2Length);
481 pfOutput[
n] = pfInput[uIndex];
485 for (
UINT32 uIndex=0; uIndex<uLength; ++uIndex) {
487 n = ( (
n & 0xcccccccc) >> 2 ) | ( (
n & 0x33333333) << 2 );
488 n = ( (
n & 0xf0f0f0f0) >> 4 ) | ( (
n & 0x0f0f0f0f) << 4 );
489 n = ( (
n & 0xff00ff00) >> 8 ) | ( (
n & 0x00ff00ff) << 8 );
490 n = ( (
n & 0xffff0000) >> 16 ) | ( (
n & 0x0000ffff) << 16 );
491 n >>= (32 - (uLog2Length-3));
492 n |= ((uIndex & 0x7) << (uLog2Length - 3));
493 pfOutput[
n] = pfInput[uIndex];
520 FLOAT32 flOneOverLength = 1.0f / uLength;
524 XVECTOR vOneOverLength = _mm_set_ps1(flOneOverLength);
526 for (
UINT32 uIndex=0; uIndex<(uLength>>2); ++uIndex) {
527 XVECTOR vReal = _mm_mul_ps(pInputReal[uIndex], vOneOverLength);
528 XVECTOR vImaginary = _mm_mul_ps(pInputImaginary[uIndex], vOneOverLength);
529 XVECTOR vRR = _mm_mul_ps(vReal, vReal);
530 XVECTOR vII = _mm_mul_ps(vImaginary, vImaginary);
531 XVECTOR vRRplusII = _mm_add_ps(vRR, vII);
532 XVECTOR vTotal = _mm_sqrt_ps(vRRplusII);
533 pOutput[uIndex] = _mm_add_ps(vTotal, vTotal);
567 const FLOAT32* __restrict pfInput = (
const FLOAT32* __restrict)pInput;
570 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
571 for (
UINT32 uFrame=0; uFrame<uFrameCount; ++uFrame) {
572 pfOutput[uChannel * uFrameCount + uFrame] = pfInput[uFrame * uChannelCount + uChannel];
603 const FLOAT32* __restrict pfInput = (
const FLOAT32* __restrict)pInput;
606 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
607 for (
UINT32 uFrame=0; uFrame<uFrameCount; ++uFrame) {
608 pfOutput[uFrame * uChannelCount + uChannel] = pfInput[uChannel * uFrameCount + uFrame];
642 DSPASSERT(uChannelCount > 0 && uChannelCount <= 6);
643 DSPASSERT(uLog2Length >= 2 && uLog2Length <= 9);
650 if (uChannelCount > 1) {
653 CopyMemory(vRealTemp, pReal, (uLength>>2)*
sizeof(
XVECTOR));
655 for (
UINT32 u=0; u<uChannelCount*(uLength>>2); u++) {
656 vImaginaryTemp[u] = _mm_setzero_ps();
660 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
661 FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
663 }
else if (uLength == 16) {
664 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
665 FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
667 }
else if (uLength == 8) {
668 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
669 FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
671 }
else if (uLength == 4) {
672 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
673 FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
677 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
678 FFTUnswizzle(&pReal[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
679 FFTUnswizzle(&pImaginary[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], uLog2Length);
707 DSPASSERT(uChannelCount > 0 && uChannelCount <= 6);
708 DSPASSERT(uLog2Length >= 2 && uLog2Length <= 9);
715 const XVECTOR vRnp = _mm_set_ps1(1.0
f/uLength);
716 const XVECTOR vRnm = _mm_set_ps1(-1.0
f/uLength);
717 for (
UINT32 u=0; u<uChannelCount*(uLength>>2); u++) {
718 vRealTemp[u] = _mm_mul_ps(pReal[u], vRnp);
719 vImaginaryTemp[u] = _mm_mul_ps(pImaginary[u], vRnm);
723 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
724 FFT(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)], pUnityTable, uLength);
726 }
else if (uLength == 16) {
727 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
728 FFT16(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
730 }
else if (uLength == 8) {
731 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
732 FFT8(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
734 }
else if (uLength == 4) {
735 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
736 FFT4(&vRealTemp[uChannel*(uLength>>2)], &vImaginaryTemp[uChannel*(uLength>>2)]);
740 for (
UINT32 uChannel=0; uChannel<uChannelCount; ++uChannel) {
741 FFTUnswizzle(&vImaginaryTemp[uChannel*(uLength>>2)], &vRealTemp[uChannel*(uLength>>2)], uLog2Length);
743 if (uChannelCount > 1) {
744 Interleave(pReal, vImaginaryTemp, uChannelCount, uLength);
746 CopyMemory(pReal, vImaginaryTemp, (uLength>>2)*
sizeof(
XVECTOR));
__in_ecount(4) CONST FLOAT *pF
__forceinline void FFT4(__inout_ecount(uCount) XVECTOR *__restrict pReal, __inout_ecount(uCount) XVECTOR *__restrict pImaginary, const UINT32 uCount=1)
Definition: xdsp.h:225
void Interleave(__out_ecount((uChannelCount *uFrameCount)/4) XVECTOR *__restrict pOutput, __in_ecount((uChannelCount *uFrameCount)/4) const XVECTOR *__restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
Definition: xdsp.h:595
GLdouble GLdouble GLdouble r
Definition: glext.h:6406
#define FALSE
Definition: stb_vorbis.h:232
__out_ecount(4) FLOAT *WINAPI D3DXSHMultiply2(__out_ecount(4) FLOAT *pOut
GLfloat f
Definition: glext.h:8207
bf_uint8_t r1
Definition: connect_ps4.c:74
bf_uint8_t r3
Definition: connect_ps4.c:80
void IFFTDeinterleaved(__inout_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pReal, __out_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pImaginary, __in_ecount(1<< uLog2Length) const XVECTOR *__restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
Definition: xdsp.h:699
bf_uint8_t r2
Definition: connect_ps4.c:76
void FFT(__inout_ecount((uLength *uCount)/4) XVECTOR *__restrict pReal, __inout_ecount((uLength *uCount)/4) XVECTOR *__restrict pImaginary, __in_ecount(uLength *uCount) const XVECTOR *__restrict pUnityTable, const UINT32 uLength, const UINT32 uCount=1)
Definition: xdsp.h:348
GLint i1
Definition: nx_glsym.h:308
__forceinline void FFT8(__inout_ecount(uCount *2) XVECTOR *__restrict pReal, __inout_ecount(uCount *2) XVECTOR *__restrict pImaginary, const UINT32 uCount=1)
Definition: xdsp.h:252
#define NULL
Pointer to 0.
Definition: gctypes.h:65
#define UINT_PTR
Definition: Common.h:66
#define ISPOWEROF2(n)
Definition: xdsp.h:46
__forceinline void ButterflyDIT4_4(__inout XVECTORREF r0, __inout XVECTORREF r1, __inout XVECTORREF r2, __inout XVECTORREF r3, __inout XVECTORREF i0, __inout XVECTORREF i1, __inout XVECTORREF i2, __inout XVECTORREF i3, __in_ecount(uStride *4) const XVECTOR *__restrict pUnityTableReal, __in_ecount(uStride *4) const XVECTOR *__restrict pUnityTableImaginary, const UINT32 uStride, const BOOL fLast)
Definition: xdsp.h:160
void FFTInterleaved(__inout_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pReal, __out_ecount((1<< uLog2Length *uChannelCount)/4) XVECTOR *__restrict pImaginary, __in_ecount(1<< uLog2Length) const XVECTOR *__restrict pUnityTable, const UINT32 uChannelCount, const UINT32 uLog2Length)
Definition: xdsp.h:634
GLint GLint i2
Definition: nx_glsym.h:308
void FFTPolar(__out_ecount(uLength/4) XVECTOR *__restrict pOutput, __in_ecount(uLength/4) const XVECTOR *__restrict pInputReal, __in_ecount(uLength/4) const XVECTOR *__restrict pInputImaginary, const UINT32 uLength)
Definition: xdsp.h:512
float FLOAT32
Definition: xapobase.h:60
__forceinline void ButterflyDIT4_1(__inout XVECTORREF r1, __inout XVECTORREF i1)
Definition: xdsp.h:117
__m128 XVECTOR
Definition: xdsp.h:60
XVECTOR & XVECTORREF
Definition: xdsp.h:61
void Deinterleave(__out_ecount((uChannelCount *uFrameCount)/4) XVECTOR *__restrict pOutput, __in_ecount((uChannelCount *uFrameCount)/4) const XVECTOR *__restrict pInput, const UINT32 uChannelCount, const UINT32 uFrameCount)
Definition: xdsp.h:559
unsigned int BOOL
Definition: gctypes.h:51
uint32_t UINT32
Definition: coretypes.h:10
GLint j
Definition: nx_glsym.h:307
#define DSPASSERT(exp)
Definition: xdsp.h:40
const XVECTOR & XVECTORREFC
Definition: xdsp.h:62
void FFTInitializeUnityTable(__out_ecount(uLength) XVECTOR *__restrict pUnityTable, UINT32 uLength)
Definition: xdsp.h:418
__forceinline void vmulComplex(__out XVECTORREF rResult, __out XVECTORREF iResult, __in XVECTORREFC r1, __in XVECTORREFC i1, __in XVECTORREFC r2, __in XVECTORREFC i2)
Definition: xdsp.h:67
#define TRUE
Definition: stb_vorbis.h:231
__forceinline void FFT16(__inout_ecount(uCount *4) XVECTOR *__restrict pReal, __inout_ecount(uCount *4) XVECTOR *__restrict pImaginary, const UINT32 uCount=1)
Definition: xdsp.h:302
float4 i3
Definition: foo.h:3
GLdouble n
Definition: glext.h:8396
void FFTUnswizzle(__out_ecount((1<< uLog2Length)/4) XVECTOR *__restrict pOutput, __in_ecount((1<< uLog2Length)/4) const XVECTOR *__restrict pInput, const UINT32 uLog2Length)
Definition: xdsp.h:461