blob: 93a0d07f01b4e2676810511ad7052b447470aa12 [file] [log] [blame]
Subject: [PATCH] Fix SSE optimized inner product function.
The original implementation assumes the filter length is a multiple of 8,
but it is only a multiple of 4.
---
libspeex/resample_sse.h | 46 ++++++++++++++++++++++++++++++++++------------
1 file changed, 34 insertions(+), 12 deletions(-)
diff --git a/libspeex/resample_sse.h b/libspeex/resample_sse.h
index 4bd35a2..5d8a3f3 100644
--- a/libspeex/resample_sse.h
+++ b/libspeex/resample_sse.h
@@ -39,18 +39,40 @@
#define OVERRIDE_INNER_PRODUCT_SINGLE
static inline float inner_product_single(const float *a, const float *b, unsigned int len)
{
- int i;
- float ret;
- __m128 sum = _mm_setzero_ps();
- for (i=0;i<len;i+=8)
- {
- sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
- sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
- }
- sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
- sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
- _mm_store_ss(&ret, sum);
- return ret;
+ float sum = 0; /* dot product of a and b; stays 0 when len < 4 because the asm below is skipped */
+ unsigned int chunk = len / 4; /* number of complete 4-float groups; a len % 4 tail is never processed */
+ if (chunk) { /* the loop tests its counter only after the first pass, so it must not run with 0 groups */
+ __asm__ __volatile__ (
+ "xorps %%xmm0, %%xmm0 \n" /* xmm0 = 0: four running partial sums, one per lane */
+ "1: \n" /* loop head, one 4-float group per pass */
+ "lddqu (%[a]), %%xmm1 \n" /* xmm1 = next 4 floats of a, unaligned load. NOTE(review): lddqu is an SSE3 instruction, not SSE; confirm movups is not needed here */
+ "lddqu (%[b]), %%xmm2 \n" /* xmm2 = next 4 floats of b, unaligned load */
+ "mulps %%xmm1, %%xmm2 \n" /* xmm2 = lane-wise products a*b */
+ "addps %%xmm2, %%xmm0 \n" /* accumulate the products into the partial sums */
+ "add $16, %[a] \n" /* advance a by 16 bytes (4 floats) */
+ "add $16, %[b] \n" /* advance b by 16 bytes (4 floats) */
+ "sub $1, %[chunk] \n" /* one group done; sets ZF when the counter reaches 0 */
+ "jnz 1b \n" /* repeat while groups remain */
+ "movhlps %%xmm0, %%xmm1 \n" /* low two lanes of xmm1 = high two lanes of xmm0 */
+ "addps %%xmm1, %%xmm0 \n" /* lanes 0 and 1 now hold s0+s2 and s1+s3 */
+ "movaps %%xmm0, %%xmm1 \n" /* keep a copy so lane 0 survives the shuffle */
+ "shufps $1, %%xmm0, %%xmm0 \n" /* move lane 1 into lane 0 of xmm0 */
+ "addss %%xmm1, %%xmm0 \n" /* lane 0 = (s1+s3) + (s0+s2): the full horizontal sum */
+ "movss %%xmm0, %[sum] \n" /* store the scalar result into sum */
+ : /* output */
+ "=r"(chunk), /* operand 0: loop counter, decremented by the asm */
+ "=r"(a), /* operand 1: pointer into a, advanced by the asm */
+ "=r"(b), /* operand 2: pointer into b, advanced by the asm */
+ [sum]"=m"(sum)
+ : /* input */
+ [chunk]"0"(chunk), /* "0": shares its register with output operand 0 */
+ [a]"1"(a), /* "1": shares its register with output operand 1 */
+ [b]"2"(b) /* "2": shares its register with output operand 2 */
+ : /* clobber */
+ "xmm0", "xmm1", "xmm2", "memory", "cc" /* scratch registers, memory accessed through a/b/sum, and flags changed by add/sub */
+ );
+ }
+ return sum;
}
#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
--
1.7.12.4