media-libs/speex/files/speex-1.2_rc1-fix-sse-inner-product.patch - third_party/overlays/chromiumos-overlay - Git at Google

 Subject: [PATCH] Fix SSE optimized inner product function.

 The original implementation assumes the filter length is a multiple of 8,
 but it is only a multiple of 4.
 ---
  libspeex/resample_sse.h | 46 ++++++++++++++++++++++++++++++++++------------
  1 file changed, 34 insertions(+), 12 deletions(-)

 diff --git a/libspeex/resample_sse.h b/libspeex/resample_sse.h
 index 4bd35a2..5d8a3f3 100644
 --- a/libspeex/resample_sse.h
 +++ b/libspeex/resample_sse.h
 @@ -39,18 +39,40 @@
  #define OVERRIDE_INNER_PRODUCT_SINGLE
  static inline float inner_product_single(const float *a, const float *b, unsigned int len)
  {
 -   int i;
 -   float ret;
 -   __m128 sum = _mm_setzero_ps();
 -   for (i=0;i<len;i+=8)
 -   {
 -      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
 -      sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
 -   }
 -   sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
 -   sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
 -   _mm_store_ss(&ret, sum);
 -   return ret;
 +  float sum = 0;
 +  unsigned int chunk = len / 4;
 +  if (chunk) {
 +    __asm__ __volatile__ (
 +        "xorps %%xmm0, %%xmm0                       \n"
 +        "1:                                         \n"
 +        "lddqu (%[a]), %%xmm1                       \n"
 +        "lddqu (%[b]), %%xmm2                       \n"
 +        "mulps %%xmm1, %%xmm2                       \n"
 +        "addps %%xmm2, %%xmm0                       \n"
 +        "add $16, %[a]                              \n"
 +        "add $16, %[b]                              \n"
 +        "sub $1, %[chunk]                           \n"
 +        "jnz 1b                                     \n"
 +        "movhlps %%xmm0, %%xmm1                     \n"
 +        "addps %%xmm1, %%xmm0                       \n"
 +        "movaps %%xmm0, %%xmm1                      \n"
 +        "shufps $1, %%xmm0, %%xmm0                  \n"
 +        "addss %%xmm1, %%xmm0                       \n"
 +        "movss %%xmm0, %[sum]                       \n"
 +       : /* output */
 +          "=r"(chunk),
 +          "=r"(a),
 +          "=r"(b),
 +          [sum]"=m"(sum)
 +        : /* input */
 +          [chunk]"0"(chunk),
 +          [a]"1"(a),
 +          [b]"2"(b)
 +        : /* clobber */
 +          "xmm0", "xmm1", "xmm2", "memory", "cc"
 +        );
 +  }
 +  return sum;
  }

  #define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
 --
 1.7.12.4
	Subject: [PATCH] Fix SSE optimized inner product function.

	The original implementation assumes the filter length is a multiple of 8,
	but it is only a multiple of 4.
	---
	libspeex/resample_sse.h | 46 ++++++++++++++++++++++++++++++++++------------
	1 file changed, 34 insertions(+), 12 deletions(-)

	diff --git a/libspeex/resample_sse.h b/libspeex/resample_sse.h
	index 4bd35a2..5d8a3f3 100644
	--- a/libspeex/resample_sse.h
	+++ b/libspeex/resample_sse.h
	@@ -39,18 +39,40 @@
	#define OVERRIDE_INNER_PRODUCT_SINGLE
	static inline float inner_product_single(const float *a, const float *b, unsigned int len)
	{
	- int i;
	- float ret;
	- __m128 sum = _mm_setzero_ps();
	- for (i=0;i<len;i+=8)
	- {
	- sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i), _mm_loadu_ps(b+i)));
	- sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(a+i+4), _mm_loadu_ps(b+i+4)));
	- }
	- sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
	- sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));
	- _mm_store_ss(&ret, sum);
	- return ret;
	+ float sum = 0;
	+ unsigned int chunk = len / 4;
	+ if (chunk) {
	+ __asm__ __volatile__ (
	+ "xorps %%xmm0, %%xmm0 \n"
	+ "1: \n"
	+ "lddqu (%[a]), %%xmm1 \n"
	+ "lddqu (%[b]), %%xmm2 \n"
	+ "mulps %%xmm1, %%xmm2 \n"
	+ "addps %%xmm2, %%xmm0 \n"
	+ "add $16, %[a] \n"
	+ "add $16, %[b] \n"
	+ "sub $1, %[chunk] \n"
	+ "jnz 1b \n"
	+ "movhlps %%xmm0, %%xmm1 \n"
	+ "addps %%xmm1, %%xmm0 \n"
	+ "movaps %%xmm0, %%xmm1 \n"
	+ "shufps $1, %%xmm0, %%xmm0 \n"
	+ "addss %%xmm1, %%xmm0 \n"
	+ "movss %%xmm0, %[sum] \n"
	+ : /* output */
	+ "=r"(chunk),
	+ "=r"(a),
	+ "=r"(b),
	+ [sum]"=m"(sum)
	+ : /* input */
	+ [chunk]"0"(chunk),
	+ [a]"1"(a),
	+ [b]"2"(b)
	+ : /* clobber */
	+ "xmm0", "xmm1", "xmm2", "memory", "cc"
	+ );
	+ }
	+ return sum;
	}

	#define OVERRIDE_INTERPOLATE_PRODUCT_SINGLE
	--
	1.7.12.4