| Improve resampler performance: |
| |
| (1) Add neon optimization for ARM platform. |
| (2) Add an option to use the full sinc table. The rationale is given in |
| http://lists.xiph.org/pipermail/speex-dev/2011-September/008243.html |
| |
| The code change is from Android repository. |
| |
| The configure.ac change is from the patch series in |
| http://lists.xiph.org/pipermail/speex-dev/2011-September/008242.html |
| |
| --- |
| configure.ac | 35 +++++++++ |
| libspeex/arch.h | 1 + |
| libspeex/fixed_generic.h | 4 + |
| libspeex/resample.c | 14 +++- |
| libspeex/resample_neon.h | 188 +++++++++++++++++++++++++++++++++++++++++++++++ |
| 5 files changed, 240 insertions(+), 2 deletions(-) |
| create mode 100644 libspeex/resample_neon.h |
| |
| diff --git a/configure.ac b/configure.ac |
| index 7f49f8a..29abca1 100644 |
| --- a/configure.ac |
| +++ b/configure.ac |
| @@ -89,6 +89,23 @@ has_sse=no |
| ) |
| AC_MSG_RESULT($has_sse) |
| |
| +AC_MSG_CHECKING(for NEON in current arch/CFLAGS) |
| +AC_LINK_IFELSE([ |
| +AC_LANG_PROGRAM([[ |
| +#include <arm_neon.h> |
| +int32x4_t testfunc(int16_t *a, int16_t *b) { |
| + return vmull_s16(vld1_s16(a), vld1_s16(b)); |
| +} |
| +]])], |
| +[ |
| +has_neon=yes |
| +], |
| +[ |
| +has_neon=no |
| +] |
| +) |
| +AC_MSG_RESULT($has_neon) |
| + |
| SAVE_CFLAGS="$CFLAGS" |
| CFLAGS="$CFLAGS -fvisibility=hidden" |
| AC_MSG_CHECKING(for ELF visibility) |
| @@ -151,6 +168,15 @@ has_sse=no |
| fi |
| ]) |
| |
| +AC_ARG_ENABLE(neon, [ --enable-neon Enable NEON support], [ |
| +if test "x$enableval" != xno; then |
| +has_neon=yes |
| +CFLAGS="$CFLAGS -O3 -march=armv7-a -mfpu=neon" |
| +else |
| +has_neon=no |
| +fi |
| +]) |
| + |
| |
| FFT=smallft |
| |
| @@ -168,6 +194,10 @@ if test "$has_sse" = yes; then |
| AC_DEFINE([_USE_SSE], , [Enable SSE support]) |
| fi |
| |
| +if test "$has_neon" = yes; then |
| + AC_DEFINE([_USE_NEON], , [Enable NEON support]) |
| +fi |
| + |
| AC_ARG_ENABLE(float-api, [ --disable-float-api Disable the floating-point API], |
| [if test "$enableval" = no; then |
| AC_DEFINE([DISABLE_FLOAT_API], , [Disable all parts of the API that are using floats]) |
| @@ -199,6 +229,11 @@ AC_ARG_ENABLE(fixed-point-debug, [ --enable-fixed-point-debug Debug fixed-poin |
| AC_DEFINE([FIXED_DEBUG], , [Debug fixed-point implementation]) |
| fi]) |
| |
| +AC_ARG_ENABLE(resample-full-sinc-table, [ --enable-resample-full-sinc-table Resample full SINC table (no interpolation)], |
| +[if test "$enableval" = yes; then |
| + AC_DEFINE([RESAMPLE_FORCE_FULL_SINC_TABLE], , [Resample with full SINC table (no interpolation)]) |
| +fi]) |
| + |
| AC_ARG_ENABLE(ti-c55x, [ --enable-ti-c55x Enable support for TI C55X DSP], |
| [if test "$enableval" = yes; then |
| has_char16=yes; |
| diff --git a/libspeex/arch.h b/libspeex/arch.h |
| index d38c36c..e911422 100644 |
| --- a/libspeex/arch.h |
| +++ b/libspeex/arch.h |
| @@ -171,6 +171,7 @@ typedef float spx_word32_t; |
| #define VSHR32(a,shift) (a) |
| #define SATURATE16(x,a) (x) |
| #define SATURATE32(x,a) (x) |
| +#define SATURATE32PSHR(x,shift,a) (x) |
| |
| #define PSHR(a,shift) (a) |
| #define SHR(a,shift) (a) |
| diff --git a/libspeex/fixed_generic.h b/libspeex/fixed_generic.h |
| index 3fb096e..0e012e9 100644 |
| --- a/libspeex/fixed_generic.h |
| +++ b/libspeex/fixed_generic.h |
| @@ -52,6 +52,10 @@ |
| #define SATURATE16(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x))) |
| #define SATURATE32(x,a) (((x)>(a) ? (a) : (x)<-(a) ? -(a) : (x))) |
| |
| +#define SATURATE32PSHR(x,shift,a) (((x)>=(SHL32(a,shift))) ? (a) : \ |
| + (x)<=-(SHL32(a,shift)) ? -(a) : \ |
| + (PSHR32(x, shift))) |
| + |
| #define SHR(a,shift) ((a) >> (shift)) |
| #define SHL(a,shift) ((spx_word32_t)(a) << (shift)) |
| #define PSHR(a,shift) (SHR((a)+((EXTEND32(1)<<((shift))>>1)),shift)) |
| diff --git a/libspeex/resample.c b/libspeex/resample.c |
| index 78e1518..69e5032 100644 |
| --- a/libspeex/resample.c |
| +++ b/libspeex/resample.c |
| @@ -99,6 +99,10 @@ static void speex_free (void *ptr) {free(ptr);} |
| #include "resample_sse.h" |
| #endif |
| |
| +#ifdef _USE_NEON |
| +#include "resample_neon.h" |
| +#endif |
| + |
| /* Numer of elements to allocate on the stack */ |
| #ifdef VAR_ARRAYS |
| #define FIXED_STACK_ALLOC 8192 |
| @@ -354,11 +358,12 @@ static int resampler_basic_direct_single(SpeexResamplerState *st, spx_uint32_t c |
| accum[3] += sinc[j+3]*iptr[j+3]; |
| } |
| sum = accum[0] + accum[1] + accum[2] + accum[3]; |
| + sum = SATURATE32PSHR(sum, 15, 32767); |
| #else |
| sum = inner_product_single(sinc, iptr, N); |
| #endif |
| |
| - out[out_stride * out_sample++] = PSHR32(sum, 15); |
| + out[out_stride * out_sample++] = sum; |
| last_sample += int_advance; |
| samp_frac_num += frac_advance; |
| if (samp_frac_num >= den_rate) |
| @@ -464,12 +469,13 @@ static int resampler_basic_interpolate_single(SpeexResamplerState *st, spx_uint3 |
| |
| cubic_coef(frac, interp); |
| sum = MULT16_32_Q15(interp[0],accum[0]) + MULT16_32_Q15(interp[1],accum[1]) + MULT16_32_Q15(interp[2],accum[2]) + MULT16_32_Q15(interp[3],accum[3]); |
| + sum = SATURATE32PSHR(sum, 15, 32767); |
| #else |
| cubic_coef(frac, interp); |
| sum = interpolate_product_single(iptr, st->sinc_table + st->oversample + 4 - offset - 2, N, st->oversample, interp); |
| #endif |
| |
| - out[out_stride * out_sample++] = PSHR32(sum,15); |
| + out[out_stride * out_sample++] = sum; |
| last_sample += int_advance; |
| samp_frac_num += frac_advance; |
| if (samp_frac_num >= den_rate) |
| @@ -579,7 +585,11 @@ static void update_filter(SpeexResamplerState *st) |
| } |
| |
| /* Choose the resampling type that requires the least amount of memory */ |
| +#ifdef RESAMPLE_FORCE_FULL_SINC_TABLE |
| + if (1) |
| +#else |
| if (st->den_rate <= st->oversample) |
| +#endif |
| { |
| spx_uint32_t i; |
| if (!st->sinc_table) |
| diff --git a/libspeex/resample_neon.h b/libspeex/resample_neon.h |
| new file mode 100644 |
| index 0000000..b651d5d |
| --- /dev/null |
| +++ b/libspeex/resample_neon.h |
| @@ -0,0 +1,188 @@ |
| +/* Copyright (C) 2007-2008 Jean-Marc Valin |
| + * Copyright (C) 2008 Thorvald Natvig |
| + * Copyright (C) 2011 Jyri Sarha, Texas Instruments |
| + */ |
| +/** |
| + @file resample_neon.h |
| + @brief Resampler functions (NEON version) |
| +*/ |
| +/* |
| + Redistribution and use in source and binary forms, with or without |
| + modification, are permitted provided that the following conditions |
| + are met: |
| + |
| + - Redistributions of source code must retain the above copyright |
| + notice, this list of conditions and the following disclaimer. |
| + |
| + - Redistributions in binary form must reproduce the above copyright |
| + notice, this list of conditions and the following disclaimer in the |
| + documentation and/or other materials provided with the distribution. |
| + |
| + - Neither the name of the Xiph.org Foundation nor the names of its |
| + contributors may be used to endorse or promote products derived from |
| + this software without specific prior written permission. |
| + |
| + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| +*/ |
| + |
| +#include <arm_neon.h> |
| + |
| +#ifdef FIXED_POINT |
| +static inline int32_t saturate_32bit_to_16bit(int32_t a) { |
| + int32_t ret; |
| + asm volatile ("vmov.s32 d24[0], %[a]\n" |
| + "vqmovn.s32 d24, q12\n" |
| + "vmov.s16 %[ret], d24[0]\n" |
| + : [ret] "=&r" (ret) |
| + : [a] "r" (a) |
| + : "q12", "d24", "d25" ); |
| + return ret; |
| +} |
| +#undef WORD2INT |
| +#define WORD2INT(x) (saturate_32bit_to_16bit(x)) |
| + |
| +#define OVERRIDE_INNER_PRODUCT_SINGLE |
| +static inline int32_t inner_product_single(const int16_t *a, const int16_t *b, unsigned int len) |
| +{ |
| + int32_t ret; |
| + uint32_t remainder = len % 16; |
| + len = len - remainder; |
| + |
| + asm volatile (" cmp %[len], #0\n" |
| + " bne 1f\n" |
| + " vld1.16 {d16}, [%[a]]!\n" |
| + " vld1.16 {d20}, [%[b]]!\n" |
| + " subs %[remainder], %[remainder], #4\n" |
| + " vmull.s16 q0, d16, d20\n" |
| + " beq 5f\n" |
| + " b 4f\n" |
| + "1:" |
| + " vld1.16 {d16, d17, d18, d19}, [%[a]]!\n" |
| + " vld1.16 {d20, d21, d22, d23}, [%[b]]!\n" |
| + " subs %[len], %[len], #16\n" |
| + " vmull.s16 q0, d16, d20\n" |
| + " vmlal.s16 q0, d17, d21\n" |
| + " vmlal.s16 q0, d18, d22\n" |
| + " vmlal.s16 q0, d19, d23\n" |
| + " beq 3f\n" |
| + "2:" |
| + " vld1.16 {d16, d17, d18, d19}, [%[a]]!\n" |
| + " vld1.16 {d20, d21, d22, d23}, [%[b]]!\n" |
| + " subs %[len], %[len], #16\n" |
| + " vmlal.s16 q0, d16, d20\n" |
| + " vmlal.s16 q0, d17, d21\n" |
| + " vmlal.s16 q0, d18, d22\n" |
| + " vmlal.s16 q0, d19, d23\n" |
| + " bne 2b\n" |
| + "3:" |
| + " cmp %[remainder], #0\n" |
| + " beq 5f\n" |
| + "4:" |
| + " vld1.16 {d16}, [%[a]]!\n" |
| + " vld1.16 {d20}, [%[b]]!\n" |
| + " subs %[remainder], %[remainder], #4\n" |
| + " vmlal.s16 q0, d16, d20\n" |
| + " bne 4b\n" |
| + "5:" |
| + " vaddl.s32 q0, d0, d1\n" |
| + " vadd.s64 d0, d0, d1\n" |
| + " vqmovn.s64 d0, q0\n" |
| + " vqrshrn.s32 d0, q0, #15\n" |
| + " vmov.s16 %[ret], d0[0]\n" |
| + : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b), |
| + [len] "+r" (len), [remainder] "+r" (remainder) |
| + : |
| + : "cc", "q0", |
| + "d16", "d17", "d18", "d19", |
| + "d20", "d21", "d22", "d23"); |
| + return ret; |
| +} |
| + |
| +#elif defined(FLOATING_POINT) |
| + |
| +static inline int32_t saturate_float_to_16bit(float a) { |
| + int32_t ret; |
| + asm ("vmov.f32 d24[0], %[a]\n" |
| + "vcvt.s32.f32 d24, d24, #15\n" |
| + "vqrshrn.s32 d24, q12, #15\n" |
| + "vmov.s16 %[ret], d24[0]\n" |
| + : [ret] "=&r" (ret) |
| + : [a] "r" (a) |
| + : "q12", "d24", "d25" ); |
| + return ret; |
| +} |
| +#undef WORD2INT |
| +#define WORD2INT(x) (saturate_float_to_16bit(x)) |
| + |
| +#define OVERRIDE_INNER_PRODUCT_SINGLE |
| +static inline float inner_product_single(const float *a, const float *b, unsigned int len) |
| +{ |
| + float ret; |
| + uint32_t remainder = len % 16; |
| + len = len - remainder; |
| + |
| + asm volatile (" cmp %[len], #0\n" |
| + " bne 1f\n" |
| + " vld1.32 {q4}, [%[a]]!\n" |
| + " vld1.32 {q8}, [%[b]]!\n" |
| + " subs %[remainder], %[remainder], #4\n" |
| + " vmul.f32 q0, q4, q8\n" |
| + " beq 5f\n" |
| + " b 4f\n" |
| + "1:" |
| + " vld1.32 {q4, q5}, [%[a]]!\n" |
| + " vld1.32 {q8, q9}, [%[b]]!\n" |
| + " vld1.32 {q6, q7}, [%[a]]!\n" |
| + " vld1.32 {q10, q11}, [%[b]]!\n" |
| + " subs %[len], %[len], #16\n" |
| + " vmul.f32 q0, q4, q8\n" |
| + " vmul.f32 q1, q5, q9\n" |
| + " vmul.f32 q2, q6, q10\n" |
| + " vmul.f32 q3, q7, q11\n" |
| + " beq 3f\n" |
| + "2:" |
| + " vld1.32 {q4, q5}, [%[a]]!\n" |
| + " vld1.32 {q8, q9}, [%[b]]!\n" |
| + " vld1.32 {q6, q7}, [%[a]]!\n" |
| + " vld1.32 {q10, q11}, [%[b]]!\n" |
| + " subs %[len], %[len], #16\n" |
| + " vmla.f32 q0, q4, q8\n" |
| + " vmla.f32 q1, q5, q9\n" |
| + " vmla.f32 q2, q6, q10\n" |
| + " vmla.f32 q3, q7, q11\n" |
| + " bne 2b\n" |
| + "3:" |
| + " vadd.f32 q4, q0, q1\n" |
| + " vadd.f32 q5, q2, q3\n" |
| + " vadd.f32 q0, q4, q5\n" |
| + " cmp %[remainder], #0\n" |
| + " beq 5f\n" |
| + "4:" |
| + " vld1.32 {q6}, [%[a]]!\n" |
| + " vld1.32 {q10}, [%[b]]!\n" |
| + " subs %[remainder], %[remainder], #4\n" |
| + " vmla.f32 q0, q6, q10\n" |
| + " bne 4b\n" |
| + "5:" |
| + " vadd.f32 d0, d0, d1\n" |
| + " vpadd.f32 d0, d0, d0\n" |
| + " vmov.f32 %[ret], d0[0]\n" |
| + : [ret] "=&r" (ret), [a] "+r" (a), [b] "+r" (b), |
| + [len] "+l" (len), [remainder] "+l" (remainder) |
| + : |
| + : "cc", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", |
| + "q9", "q10", "q11"); |
| + return ret; |
| +} |
| + |
| +#endif |
| -- |
| 1.8.4 |
| |