123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379 |
- /*
- * Handy add/subtract functions to operate on chunks of shorts.
- * Feel free to add customizations for additional architectures
- *
- */
- /*
- * See http://www.asterisk.org for more information about
- * the Asterisk project. Please do not directly contact
- * any of the maintainers of this project for assistance;
- * the project provides a web site, mailing lists and IRC
- * channels for your use.
- *
- * This program is free software, distributed under the terms of
- * the GNU General Public License Version 2 as published by the
- * Free Software Foundation. See the LICENSE file included with
- * this program for more details.
- */
- #ifndef _DAHDI_ARITH_H
- #define _DAHDI_ARITH_H
- #ifdef CONFIG_DAHDI_MMX
- #ifdef DAHDI_CHUNKSIZE
- static inline void __ACSS(volatile short *dst, const short *src)
- {
- __asm__ __volatile__ (
- "movq 0(%0), %%mm0;\n"
- "movq 0(%1), %%mm1;\n"
- "movq 8(%0), %%mm2;\n"
- "movq 8(%1), %%mm3;\n"
- "paddsw %%mm1, %%mm0;\n"
- "paddsw %%mm3, %%mm2;\n"
- "movq %%mm0, 0(%0);\n"
- "movq %%mm2, 8(%0);\n"
- : "=r" (dst)
- : "r" (src), "0" (dst)
- : "memory"
- #ifdef CLOBBERMMX
- , "%mm0", "%mm1", "%mm2", "%mm3"
- #endif
- );
- }
- static inline void __SCSS(volatile short *dst, const short *src)
- {
- __asm__ __volatile__ (
- "movq 0(%0), %%mm0;\n"
- "movq 0(%1), %%mm1;\n"
- "movq 8(%0), %%mm2;\n"
- "movq 8(%1), %%mm3;\n"
- "psubsw %%mm1, %%mm0;\n"
- "psubsw %%mm3, %%mm2;\n"
- "movq %%mm0, 0(%0);\n"
- "movq %%mm2, 8(%0);\n"
- : "=r" (dst)
- : "r" (src), "0" (dst)
- : "memory"
- #ifdef CLOBBERMMX
- , "%mm0", "%mm1", "%mm2", "%mm3"
- #endif
- );
- }
- #if (DAHDI_CHUNKSIZE == 8)
- #define ACSS(a,b) __ACSS(a,b)
- #define SCSS(a,b) __SCSS(a,b)
- #elif (DAHDI_CHUNKSIZE > 8)
- static inline void ACSS(volatile short *dst, const short *src)
- {
- int x;
- for (x=0;x<DAHDI_CHUNKSIZE;x+=8)
- __ACSS(dst + x, src + x);
- }
- static inline void SCSS(volatile short *dst, const short *src)
- {
- int x;
- for (x=0;x<DAHDI_CHUNKSIZE;x+=8)
- __SCSS(dst + x, src + x);
- }
- #else
- #error No MMX for DAHDI_CHUNKSIZE < 8
- #endif
- #endif
- static inline int CONVOLVE(const int *coeffs, const short *hist, int len)
- {
- int sum;
- /* Divide length by 16 */
- len >>= 4;
-
- /* Clear our accumulator, mm4 */
-
- /*
-
- For every set of eight...
-
- Load 16 coefficients into four registers...
- Shift each word right 16 to make them shorts...
- Pack the resulting shorts into two registers...
- With the coefficients now in mm0 and mm2, load the
- history into mm1 and mm3...
- Multiply/add mm1 into mm0, and mm3 into mm2...
- Add mm2 into mm0 (without saturation, alas). Now we have two half-results.
- Accumulate in mm4 (again, without saturation, alas)
- */
- __asm__ (
- "pxor %%mm4, %%mm4;\n"
- "mov %1, %%edi;\n"
- "mov %2, %%esi;\n"
- "mov %3, %%ecx;\n"
- "1:"
- "movq 0(%%edi), %%mm0;\n"
- "movq 8(%%edi), %%mm1;\n"
- "movq 16(%%edi), %%mm2;\n"
- "movq 24(%%edi), %%mm3;\n"
- /* can't use 4/5 since 4 is the accumulator for us */
- "movq 32(%%edi), %%mm6;\n"
- "movq 40(%%edi), %%mm7;\n"
- "psrad $16, %%mm0;\n"
- "psrad $16, %%mm1;\n"
- "psrad $16, %%mm2;\n"
- "psrad $16, %%mm3;\n"
- "psrad $16, %%mm6;\n"
- "psrad $16, %%mm7;\n"
- "packssdw %%mm1, %%mm0;\n"
- "packssdw %%mm3, %%mm2;\n"
- "packssdw %%mm7, %%mm6;\n"
- "movq 0(%%esi), %%mm1;\n"
- "movq 8(%%esi), %%mm3;\n"
- "movq 16(%%esi), %%mm7;\n"
- "pmaddwd %%mm1, %%mm0;\n"
- "pmaddwd %%mm3, %%mm2;\n"
- "pmaddwd %%mm7, %%mm6;\n"
- "paddd %%mm6, %%mm4;\n"
- "paddd %%mm2, %%mm4;\n"
- "paddd %%mm0, %%mm4;\n"
- /* Come back and do for the last few bytes */
- "movq 48(%%edi), %%mm6;\n"
- "movq 56(%%edi), %%mm7;\n"
- "psrad $16, %%mm6;\n"
- "psrad $16, %%mm7;\n"
- "packssdw %%mm7, %%mm6;\n"
- "movq 24(%%esi), %%mm7;\n"
- "pmaddwd %%mm7, %%mm6;\n"
- "paddd %%mm6, %%mm4;\n"
- "add $64, %%edi;\n"
- "add $32, %%esi;\n"
- "dec %%ecx;\n"
- "jnz 1b;\n"
- "movq %%mm4, %%mm0;\n"
- "psrlq $32, %%mm0;\n"
- "paddd %%mm0, %%mm4;\n"
- "movd %%mm4, %0;\n"
- : "=r" (sum)
- : "r" (coeffs), "r" (hist), "r" (len)
- : "%ecx", "%edi", "%esi"
- );
-
- return sum;
- }
- static inline void UPDATE(volatile int *taps, const short *history, const int nsuppr, const int ntaps)
- {
- int i;
- int correction;
- for (i=0;i<ntaps;i++) {
- correction = history[i] * nsuppr;
- taps[i] += correction;
- }
- }
- static inline void UPDATE2(volatile int *taps, volatile short *taps_short, const short *history, const int nsuppr, const int ntaps)
- {
- int i;
- int correction;
- #if 0
- ntaps >>= 4;
- /* First, load up taps, */
- __asm__ (
- "pxor %%mm4, %%mm4;\n"
- "mov %0, %%edi;\n"
- "mov %1, %%esi;\n"
- "mov %3, %%ecx;\n"
- "1:"
- "jnz 1b;\n"
- "movq %%mm4, %%mm0;\n"
- "psrlq $32, %%mm0;\n"
- "paddd %%mm0, %%mm4;\n"
- "movd %%mm4, %0;\n"
- : "=r" (taps), "=r" (taps_short)
- : "r" (history), "r" (nsuppr), "r" (ntaps), "0" (taps)
- : "%ecx", "%edi", "%esi"
- );
- #endif
- #if 1
- for (i=0;i<ntaps;i++) {
- correction = history[i] * nsuppr;
- taps[i] += correction;
- taps_short[i] = taps[i] >> 16;
- }
- #endif
- }
- static inline int CONVOLVE2(const short *coeffs, const short *hist, int len)
- {
- int sum;
- /* Divide length by 16 */
- len >>= 4;
-
- /* Clear our accumulator, mm4 */
-
- /*
-
- For every set of eight...
- Load in eight coefficients and eight historic samples, multliply add and
- accumulate the result
- */
- __asm__ (
- "pxor %%mm4, %%mm4;\n"
- "mov %1, %%edi;\n"
- "mov %2, %%esi;\n"
- "mov %3, %%ecx;\n"
- "1:"
- "movq 0(%%edi), %%mm0;\n"
- "movq 8(%%edi), %%mm2;\n"
- "movq 0(%%esi), %%mm1;\n"
- "movq 8(%%esi), %%mm3;\n"
- "pmaddwd %%mm1, %%mm0;\n"
- "pmaddwd %%mm3, %%mm2;\n"
- "paddd %%mm2, %%mm4;\n"
- "paddd %%mm0, %%mm4;\n"
- "movq 16(%%edi), %%mm0;\n"
- "movq 24(%%edi), %%mm2;\n"
- "movq 16(%%esi), %%mm1;\n"
- "movq 24(%%esi), %%mm3;\n"
- "pmaddwd %%mm1, %%mm0;\n"
- "pmaddwd %%mm3, %%mm2;\n"
- "paddd %%mm2, %%mm4;\n"
- "paddd %%mm0, %%mm4;\n"
- "add $32, %%edi;\n"
- "add $32, %%esi;\n"
- "dec %%ecx;\n"
- "jnz 1b;\n"
- "movq %%mm4, %%mm0;\n"
- "psrlq $32, %%mm0;\n"
- "paddd %%mm0, %%mm4;\n"
- "movd %%mm4, %0;\n"
- : "=r" (sum)
- : "r" (coeffs), "r" (hist), "r" (len)
- : "%ecx", "%edi", "%esi"
- );
-
- return sum;
- }
- static inline short MAX16(const short *y, int len, int *pos)
- {
- int k;
- short max = 0;
- int bestpos = 0;
- for (k=0;k<len;k++) {
- if (max < y[k]) {
- bestpos = k;
- max = y[k];
- }
- }
- *pos = (len - 1 - bestpos);
- return max;
- }
- #else
- #ifdef DAHDI_CHUNKSIZE
- static inline void ACSS(short *dst, short *src)
- {
- int x;
- /* Add src to dst with saturation, storing in dst */
- #ifdef BFIN
- for (x = 0; x < DAHDI_CHUNKSIZE; x++)
- dst[x] = __builtin_bfin_add_fr1x16(dst[x], src[x]);
- #else
- int sum;
- for (x = 0; x < DAHDI_CHUNKSIZE; x++) {
- sum = dst[x] + src[x];
- if (sum > 32767)
- sum = 32767;
- else if (sum < -32768)
- sum = -32768;
- dst[x] = sum;
- }
- #endif
- }
- static inline void SCSS(short *dst, short *src)
- {
- int x;
- /* Subtract src from dst with saturation, storing in dst */
- #ifdef BFIN
- for (x = 0; x < DAHDI_CHUNKSIZE; x++)
- dst[x] = __builtin_bfin_sub_fr1x16(dst[x], src[x]);
- #else
- int sum;
- for (x = 0; x < DAHDI_CHUNKSIZE; x++) {
- sum = dst[x] - src[x];
- if (sum > 32767)
- sum = 32767;
- else if (sum < -32768)
- sum = -32768;
- dst[x] = sum;
- }
- #endif
- }
- #endif /* DAHDI_CHUNKSIZE */
- static inline int CONVOLVE(const int *coeffs, const short *hist, int len)
- {
- int x;
- int sum = 0;
- for (x=0;x<len;x++)
- sum += (coeffs[x] >> 16) * hist[x];
- return sum;
- }
- static inline int CONVOLVE2(const short *coeffs, const short *hist, int len)
- {
- int x;
- int sum = 0;
- for (x=0;x<len;x++)
- sum += coeffs[x] * hist[x];
- return sum;
- }
- static inline void UPDATE(int *taps, const short *history, const int nsuppr, const int ntaps)
- {
- int i;
- int correction;
- for (i=0;i<ntaps;i++) {
- correction = history[i] * nsuppr;
- taps[i] += correction;
- }
- }
- static inline void UPDATE2(int *taps, short *taps_short, const short *history, const int nsuppr, const int ntaps)
- {
- int i;
- int correction;
- for (i=0;i<ntaps;i++) {
- correction = history[i] * nsuppr;
- taps[i] += correction;
- taps_short[i] = taps[i] >> 16;
- }
- }
- static inline short MAX16(const short *y, int len, int *pos)
- {
- int k;
- short max = 0;
- int bestpos = 0;
- for (k=0;k<len;k++) {
- if (max < y[k]) {
- bestpos = k;
- max = y[k];
- }
- }
- *pos = (len - 1 - bestpos);
- return max;
- }
- #endif /* MMX */
- #endif /* _DAHDI_ARITH_H */
|