From e6a2dbc6a980bf5889cb1340a270ee5ae329d028 Mon Sep 17 00:00:00 2001
From: Leonardo Sandoval <leonardo.sandoval.gonzalez@linux.intel.com>
Date: Thu, 3 May 2018 11:09:30 -0500
Subject: [PATCH] x86-64: Optimize strcmp/wcscmp and strncmp/wcsncmp with AVX2
Optimize x86-64 strcmp/wcscmp and strncmp/wcsncmp with AVX2. The new
implementations use vector comparison as much as possible. Peak
speedups observed on a Skylake machine: 9x, 3x, 2.5x and 5.5x for
strcmp, strncmp, wcscmp and wcsncmp, respectively. The longer the
comparison, the more the AVX2 functions benefit, except for strcmp,
whose peak is observed at length == 32 bytes. Select the AVX2
strcmp/wcscmp on AVX2 machines where vzeroupper is preferred and AVX
unaligned loads are fast.
NB: The code uses TZCNT instead of BSF since TZCNT produces the same
result as BSF for non-zero input. TZCNT is faster than BSF and is
executed as BSF if the machine doesn't support TZCNT.
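As a reader's aid (not part of the patch itself), the core 32-byte
step can be modeled with AVX2 intrinsics in C. The helper below is an
illustrative sketch under that assumption; its name (cmp32) and shape
are not from the patch, which keeps everything in registers and
branches to per-vector return paths instead:

    #include <immintrin.h>
    #include <stdint.h>

    /* One 32-byte strcmp step: eq is 0xff where the inputs match, so
       min (a, eq) is zero exactly where the bytes differ or where s1
       holds the null terminator.  A movemask plus tzcnt (modeled here
       by __builtin_ctz) then finds the first such position.  */
    static int
    cmp32 (const unsigned char *s1, const unsigned char *s2, int *done)
    {
      __m256i a  = _mm256_loadu_si256 ((const __m256i *) s1);
      __m256i b  = _mm256_loadu_si256 ((const __m256i *) s2);
      __m256i eq = _mm256_cmpeq_epi8 (a, b);
      __m256i z  = _mm256_cmpeq_epi8 (_mm256_min_epu8 (a, eq),
                                      _mm256_setzero_si256 ());
      uint32_t m = _mm256_movemask_epi8 (z);
      if (m == 0)
        {
          *done = 0;            /* 32 equal, non-null bytes: go on.  */
          return 0;
        }
      *done = 1;
      unsigned int i = __builtin_ctz (m);
      return (int) s1[i] - (int) s2[i];
    }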
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strcmp-avx2, strncmp-avx2, wcscmp-avx2, wcscmp-sse2, wcsncmp-avx2 and
wcsncmp-sse2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strcmp_avx2,
__strncmp_avx2, __wcscmp_avx2, __wcsncmp_avx2, __wcscmp_sse2
and __wcsncmp_sse2.
* sysdeps/x86_64/multiarch/strcmp.c (OPTIMIZE (avx2)): New.
(IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if
AVX unaligned load is fast and vzeroupper is preferred.
* sysdeps/x86_64/multiarch/strncmp.c: Likewise.
* sysdeps/x86_64/multiarch/strcmp-avx2.S: New file.
* sysdeps/x86_64/multiarch/strncmp-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcscmp-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcscmp-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcscmp.c: Likewise.
* sysdeps/x86_64/multiarch/wcsncmp-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcsncmp-sse2.c: Likewise.
* sysdeps/x86_64/multiarch/wcsncmp.c: Likewise.
* sysdeps/x86_64/wcscmp.S (__wcscmp): Add alias only if __wcscmp
is undefined.
Conflicts:
ChangeLog
---
ChangeLog | 24 +
sysdeps/x86_64/multiarch/Makefile | 6 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 +
sysdeps/x86_64/multiarch/strcmp-avx2.S | 847 +++++++++++++++++++++
sysdeps/x86_64/multiarch/strcmp.c | 6 +
sysdeps/x86_64/multiarch/strncmp-avx2.S | 3 +
sysdeps/x86_64/multiarch/strncmp.c | 6 +
sysdeps/x86_64/multiarch/wcscmp-avx2.S | 4 +
sysdeps/x86_64/multiarch/wcscmp-sse2.S | 23 +
sysdeps/x86_64/multiarch/wcscmp.c | 37 +
sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 5 +
sysdeps/x86_64/multiarch/wcsncmp-sse2.c | 20 +
sysdeps/x86_64/multiarch/wcsncmp.c | 31 +
sysdeps/x86_64/wcscmp.S | 2 +
14 files changed, 1032 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/wcscmp-sse2.S
create mode 100644 sysdeps/x86_64/multiarch/wcscmp.c
create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-sse2.c
create mode 100644 sysdeps/x86_64/multiarch/wcsncmp.c
diff --git a/ChangeLog b/ChangeLog
index 7fcb7d5981..4c34ebd4cc 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,27 @@
+2018-06-01 Leonardo Sandoval <leonardo.sandoval.gonzalez@intel.com>
+
+ * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
+ strcmp-avx2, strncmp-avx2, wcscmp-avx2, wcscmp-sse2, wcsncmp-avx2 and
+ wcsncmp-sse2.
+ * sysdeps/x86_64/multiarch/ifunc-impl-list.c
+ (__libc_ifunc_impl_list): Add tests for __strcmp_avx2,
+ __strncmp_avx2, __wcscmp_avx2, __wcsncmp_avx2, __wcscmp_sse2
+ and __wcsncmp_sse2.
+ * sysdeps/x86_64/multiarch/strcmp.c (OPTIMIZE (avx2)): New.
+ (IFUNC_SELECTOR): Return OPTIMIZE (avx2) on AVX2 machines if
+ AVX unaligned load is fast and vzeroupper is preferred.
+ * sysdeps/x86_64/multiarch/strncmp.c: Likewise.
+ * sysdeps/x86_64/multiarch/strcmp-avx2.S: New file.
+ * sysdeps/x86_64/multiarch/strncmp-avx2.S: Likewise.
+ * sysdeps/x86_64/multiarch/wcscmp-avx2.S: Likewise.
+ * sysdeps/x86_64/multiarch/wcscmp-sse2.S: Likewise.
+ * sysdeps/x86_64/multiarch/wcscmp.c: Likewise.
+ * sysdeps/x86_64/multiarch/wcsncmp-avx2.S: Likewise.
+ * sysdeps/x86_64/multiarch/wcsncmp-sse2.c: Likewise.
+ * sysdeps/x86_64/multiarch/wcsncmp.c: Likewise.
+ * sysdeps/x86_64/wcscmp.S (__wcscmp): Add alias only if __wcscmp
+ is undefined.
+
2018-07-29 H.J. Lu <hongjiu.lu@intel.com>
[BZ #23459]
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 68257c4017..bb5e970735 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -6,8 +6,8 @@ ifeq ($(subdir),string)
sysdep_routines += strncat-c stpncpy-c strncpy-c \
strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \
- strcmp-sse4_2 \
- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 \
+ strcmp-sse4_2 strcmp-avx2 \
+ strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
memrchr-sse2 memrchr-avx2 \
memcmp-sse2 \
@@ -51,6 +51,8 @@ ifeq ($(subdir),wcsmbs)
sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemcmp-avx2-movbe \
wmemchr-sse2 wmemchr-avx2 \
+ wcscmp-sse2 wcscmp-avx2 \
+ wcsncmp-sse2 wcsncmp-avx2 \
wcscpy-ssse3 wcscpy-c \
wcschr-sse2 wcschr-avx2 \
wcsrchr-sse2 wcsrchr-avx2 \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7afd674b81..9aaaef7251 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -268,6 +268,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strcmp.c. */
IFUNC_IMPL (i, name, strcmp,
+ IFUNC_IMPL_ADD (array, i, strcmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strcmp_avx2)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
__strcmp_sse42)
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSSE3),
@@ -364,6 +367,20 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wcsrchr_avx2)
IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcscmp.c. */
+ IFUNC_IMPL (i, name, wcscmp,
+ IFUNC_IMPL_ADD (array, i, wcscmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcscmp_avx2)
+ IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
+
+ /* Support sysdeps/x86_64/multiarch/wcsncmp.c. */
+ IFUNC_IMPL (i, name, wcsncmp,
+ IFUNC_IMPL_ADD (array, i, wcsncmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcsncmp_avx2)
+ IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.c. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
@@ -536,6 +553,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strncmp.c. */
IFUNC_IMPL (i, name, strncmp,
+ IFUNC_IMPL_ADD (array, i, strncmp,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strncmp_avx2)
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSE4_2),
__strncmp_sse42)
IFUNC_IMPL_ADD (array, i, strncmp, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
new file mode 100644
index 0000000000..e8397f3b05
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -0,0 +1,847 @@
+/* strcmp/wcscmp/strncmp/wcsncmp optimized with AVX2.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCMP
+# define STRCMP __strcmp_avx2
+# endif
+
+# define PAGE_SIZE 4096
+
+/* VEC_SIZE = Number of bytes in a ymm register */
+# define VEC_SIZE 32
+
+/* Shift for dividing by (VEC_SIZE * 4). */
+# define DIVIDE_BY_VEC_4_SHIFT 7
+# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
+# endif
+
+# ifdef USE_AS_WCSCMP
+/* Compare packed dwords. */
+# define VPCMPEQ vpcmpeqd
+/* Compare packed dwords and store minimum. */
+# define VPMINU vpminud
+/* 1 dword char == 4 bytes. */
+# define SIZE_OF_CHAR 4
+# else
+/* Compare packed bytes. */
+# define VPCMPEQ vpcmpeqb
+/* Compare packed bytes and store minimum. */
+# define VPMINU vpminub
+/* 1 byte char == 1 byte. */
+# define SIZE_OF_CHAR 1
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+/* Warning!
+ wcscmp/wcsncmp have to use SIGNED comparison for elements.
+ strcmp/strncmp have to use UNSIGNED comparison for elements.
+*/
+
+/* The main idea of the string comparison (byte or dword) using AVX2
+ consists of comparing (VPCMPEQ) two ymm vectors, holding either
+ packed bytes or dwords depending on USE_AS_WCSCMP. In order to
+ check for the null char, the algorithm keeps the matched
+ bytes/dwords, requiring two more AVX2 instructions (VPMINU and
+ VPCMPEQ). In general, the cost of comparing VEC_SIZE bytes
+ (32 bytes) is two VPCMPEQ and one VPMINU instructions, together
+ with vmovdqu and testl instructions. The main loop (away from the
+ page boundary) compares 4 vectors at a time, effectively comparing
+ 4 x VEC_SIZE bytes (128 bytes) on each iteration.
+
+ The strncmp/wcsncmp logic (enabled by defining USE_AS_STRNCMP) is
+ the same as strcmp, except that a maximum offset is tracked. If
+ the maximum offset is reached before a difference is found, zero is
+ returned. */
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCMP)
+# ifdef USE_AS_STRNCMP
+ /* Check for the simple cases of offset 0 or 1. */
+ cmp $1, %rdx
+ je L(char0)
+ jb L(zero)
+# ifdef USE_AS_WCSCMP
+ /* Convert units: from wide to byte char. */
+ shl $2, %rdx
+# endif
+ /* Register %r11 tracks the maximum offset. */
+ movq %rdx, %r11
+# endif
+ movl %edi, %eax
+ xorl %edx, %edx
+ /* Make %ymm7 all zeros in this function. */
+ vpxor %ymm7, %ymm7, %ymm7
+ orl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+ jg L(cross_page)
+ /* Start comparing 4 vectors. */
+ vmovdqu (%rdi), %ymm1
+ VPCMPEQ (%rsi), %ymm1, %ymm0
+ VPMINU %ymm1, %ymm0, %ymm0
+ VPCMPEQ %ymm7, %ymm0, %ymm0
+ vpmovmskb %ymm0, %ecx
+ testl %ecx, %ecx
+ je L(next_3_vectors)
+ tzcntl %ecx, %edx
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx) is after the maximum
+ offset (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ je L(return)
+L(wcscmp_return):
+ setl %al
+ negl %eax
+ orl $1, %eax
+L(return):
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(return_vec_size):
+ tzcntl %ecx, %edx
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
+ the maximum offset (%r11). */
+ addq $VEC_SIZE, %rdx
+ cmpq %r11, %rdx
+ jae L(zero)
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rdx), %ecx
+ cmpl VEC_SIZE(%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl VEC_SIZE(%rdi, %rdx), %eax
+ movzbl VEC_SIZE(%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(return_2_vec_size):
+ tzcntl %ecx, %edx
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
+ after the maximum offset (%r11). */
+ addq $(VEC_SIZE * 2), %rdx
+ cmpq %r11, %rdx
+ jae L(zero)
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
+ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(return_3_vec_size):
+ tzcntl %ecx, %edx
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
+ after the maximum offset (%r11). */
+ addq $(VEC_SIZE * 3), %rdx
+ cmpq %r11, %rdx
+ jae L(zero)
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
+ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(next_3_vectors):
+ vmovdqu VEC_SIZE(%rdi), %ymm6
+ VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3
+ VPMINU %ymm6, %ymm3, %ymm3
+ VPCMPEQ %ymm7, %ymm3, %ymm3
+ vpmovmskb %ymm3, %ecx
+ testl %ecx, %ecx
+ jne L(return_vec_size)
+ vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5
+ vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4
+ vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0
+ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2
+ VPMINU %ymm5, %ymm2, %ymm2
+ VPCMPEQ %ymm4, %ymm0, %ymm0
+ VPCMPEQ %ymm7, %ymm2, %ymm2
+ vpmovmskb %ymm2, %ecx
+ testl %ecx, %ecx
+ jne L(return_2_vec_size)
+ VPMINU %ymm4, %ymm0, %ymm0
+ VPCMPEQ %ymm7, %ymm0, %ymm0
+ vpmovmskb %ymm0, %ecx
+ testl %ecx, %ecx
+ jne L(return_3_vec_size)
+L(main_loop_header):
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
+ movl $PAGE_SIZE, %ecx
+ /* Align load via RAX. */
+ andq $-(VEC_SIZE * 4), %rdx
+ subq %rdi, %rdx
+ leaq (%rdi, %rdx), %rax
+# ifdef USE_AS_STRNCMP
+ /* Starting from this point, the maximum offset, or simply the
+ 'offset', DECREASES by the same amount as the base pointers are
+ moved forward. Return 0 when:
+ 1) On match: offset <= the matched vector index.
+ 2) On mismatch: offset is before the mismatched index.
+ */
+ subq %rdx, %r11
+ jbe L(zero)
+# endif
+ addq %rsi, %rdx
+ movq %rdx, %rsi
+ andl $(PAGE_SIZE - 1), %esi
+ /* Number of bytes before page crossing. */
+ subq %rsi, %rcx
+ /* Number of VEC_SIZE * 4 blocks before page crossing. */
+ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
+ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
+ movl %ecx, %esi
+ jmp L(loop_start)
+
+ .p2align 4
+L(loop):
+# ifdef USE_AS_STRNCMP
+ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
+ the maximum offset (%r11) by the same amount. */
+ subq $(VEC_SIZE * 4), %r11
+ jbe L(zero)
+# endif
+ addq $(VEC_SIZE * 4), %rax
+ addq $(VEC_SIZE * 4), %rdx
+L(loop_start):
+ testl %esi, %esi
+ leal -1(%esi), %esi
+ je L(loop_cross_page)
+L(back_to_loop):
+ /* Main loop, comparing 4 vectors at a time. */
+ vmovdqa (%rax), %ymm0
+ vmovdqa VEC_SIZE(%rax), %ymm3
+ VPCMPEQ (%rdx), %ymm0, %ymm4
+ VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1
+ VPMINU %ymm0, %ymm4, %ymm4
+ VPMINU %ymm3, %ymm1, %ymm1
+ vmovdqa (VEC_SIZE * 2)(%rax), %ymm2
+ VPMINU %ymm1, %ymm4, %ymm0
+ vmovdqa (VEC_SIZE * 3)(%rax), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6
+ VPMINU %ymm2, %ymm5, %ymm5
+ VPMINU %ymm3, %ymm6, %ymm6
+ VPMINU %ymm5, %ymm0, %ymm0
+ VPMINU %ymm6, %ymm0, %ymm0
+ VPCMPEQ %ymm7, %ymm0, %ymm0
+
+ /* Test each mask (32 bits) individually because for VEC_SIZE
+ == 32 it is not possible to OR the four masks and keep all bits
+ in a 64-bit integer register, unlike SSE2 strcmp where ORing
+ is possible. */
+ vpmovmskb %ymm0, %ecx
+ testl %ecx, %ecx
+ je L(loop)
+ VPCMPEQ %ymm7, %ymm4, %ymm0
+ vpmovmskb %ymm0, %edi
+ testl %edi, %edi
+ je L(test_vec)
+ tzcntl %edi, %ecx
+# ifdef USE_AS_STRNCMP
+ cmpq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(test_vec):
+# ifdef USE_AS_STRNCMP
+ /* The first vector matched. Return 0 if the maximum offset
+ (%r11) <= VEC_SIZE. */
+ cmpq $VEC_SIZE, %r11
+ jbe L(zero)
+# endif
+ VPCMPEQ %ymm7, %ymm1, %ymm1
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ je L(test_2_vec)
+ tzcntl %ecx, %edi
+# ifdef USE_AS_STRNCMP
+ addq $VEC_SIZE, %rdi
+ cmpq %rdi, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rdi), %ecx
+ cmpl (%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rdi), %eax
+ movzbl (%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl VEC_SIZE(%rsi, %rdi), %ecx
+ cmpl VEC_SIZE(%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl VEC_SIZE(%rax, %rdi), %eax
+ movzbl VEC_SIZE(%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(test_2_vec):
+# ifdef USE_AS_STRNCMP
+ /* The first 2 vectors matched. Return 0 if the maximum offset
+ (%r11) <= 2 * VEC_SIZE. */
+ cmpq $(VEC_SIZE * 2), %r11
+ jbe L(zero)
+# endif
+ VPCMPEQ %ymm7, %ymm5, %ymm5
+ vpmovmskb %ymm5, %ecx
+ testl %ecx, %ecx
+ je L(test_3_vec)
+ tzcntl %ecx, %edi
+# ifdef USE_AS_STRNCMP
+ addq $(VEC_SIZE * 2), %rdi
+ cmpq %rdi, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rdi), %ecx
+ cmpl (%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rdi), %eax
+ movzbl (%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
+ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
+ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(test_3_vec):
+# ifdef USE_AS_STRNCMP
+ /* The first 3 vectors matched. Return 0 if the maximum offset
+ (%r11) <= 3 * VEC_SIZE. */
+ cmpq $(VEC_SIZE * 3), %r11
+ jbe L(zero)
+# endif
+ VPCMPEQ %ymm7, %ymm6, %ymm6
+ vpmovmskb %ymm6, %esi
+ tzcntl %esi, %ecx
+# ifdef USE_AS_STRNCMP
+ addq $(VEC_SIZE * 3), %rcx
+ cmpq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %esi
+ cmpl (%rdx, %rcx), %esi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
+ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(loop_cross_page):
+ xorl %r10d, %r10d
+ movq %rdx, %rcx
+ /* Align load via RDX. We load the extra ECX bytes which should
+ be ignored. */
+ andl $((VEC_SIZE * 4) - 1), %ecx
+ /* R10 is -RCX. */
+ subq %rcx, %r10
+
+ /* This works only if VEC_SIZE * 2 == 64. */
+# if (VEC_SIZE * 2) != 64
+# error (VEC_SIZE * 2) != 64
+# endif
+
+ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
+ cmpl $(VEC_SIZE * 2), %ecx
+ jge L(loop_cross_page_2_vec)
+
+ vmovdqu (%rax, %r10), %ymm2
+ vmovdqu VEC_SIZE(%rax, %r10), %ymm3
+ VPCMPEQ (%rdx, %r10), %ymm2, %ymm0
+ VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1
+ VPMINU %ymm2, %ymm0, %ymm0
+ VPMINU %ymm3, %ymm1, %ymm1
+ VPCMPEQ %ymm7, %ymm0, %ymm0
+ VPCMPEQ %ymm7, %ymm1, %ymm1
+
+ vpmovmskb %ymm0, %edi
+ vpmovmskb %ymm1, %esi
+
+ salq $32, %rsi
+ xorq %rsi, %rdi
+
+ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
+ shrq %cl, %rdi
+
+ testq %rdi, %rdi
+ je L(loop_cross_page_2_vec)
+ tzcntq %rdi, %rcx
+# ifdef USE_AS_STRNCMP
+ cmpq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(loop_cross_page_2_vec):
+ /* The first VEC_SIZE * 2 bytes match or are ignored. */
+ vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2
+ vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5
+ VPMINU %ymm2, %ymm5, %ymm5
+ VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6
+ VPCMPEQ %ymm7, %ymm5, %ymm5
+ VPMINU %ymm3, %ymm6, %ymm6
+ VPCMPEQ %ymm7, %ymm6, %ymm6
+
+ vpmovmskb %ymm5, %edi
+ vpmovmskb %ymm6, %esi
+
+ salq $32, %rsi
+ xorq %rsi, %rdi
+
+ xorl %r8d, %r8d
+ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
+ subl $(VEC_SIZE * 2), %ecx
+ jle 1f
+ /* Skip ECX bytes. */
+ shrq %cl, %rdi
+ /* R8 has number of bytes skipped. */
+ movl %ecx, %r8d
+1:
+ /* Before jumping back to the loop, set ESI to the number of
+ VEC_SIZE * 4 blocks before page crossing. */
+ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
+
+ testq %rdi, %rdi
+ je L(back_to_loop)
+ tzcntq %rdi, %rcx
+ addq %r10, %rcx
+ /* Adjust for number of bytes skipped. */
+ addq %r8, %rcx
+# ifdef USE_AS_STRNCMP
+ addq $(VEC_SIZE * 2), %rcx
+ subq %rcx, %r11
+ jbe L(zero)
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (%rsi, %rcx), %edi
+ cmpl (%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (%rax, %rcx), %eax
+ movzbl (%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# else
+# ifdef USE_AS_WCSCMP
+ movq %rax, %rsi
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
+ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
+ jne L(wcscmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
+ subl %edx, %eax
+# endif
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(cross_page_loop):
+ /* Check one byte/dword at a time. */
+# ifdef USE_AS_WCSCMP
+ cmpl %ecx, %eax
+# else
+ subl %ecx, %eax
+# endif
+ jne L(different)
+ addl $SIZE_OF_CHAR, %edx
+ cmpl $(VEC_SIZE * 4), %edx
+ je L(main_loop_header)
+# ifdef USE_AS_STRNCMP
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rdx), %eax
+ movl (%rsi, %rdx), %ecx
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %ecx
+# endif
+ /* Check null char. */
+ testl %eax, %eax
+ jne L(cross_page_loop)
+ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
+ comparisons. */
+ subl %ecx, %eax
+# ifndef USE_AS_WCSCMP
+L(different):
+# endif
+ VZEROUPPER
+ ret
+
+# ifdef USE_AS_WCSCMP
+ .p2align 4
+L(different):
+ /* Use movl to avoid modifying EFLAGS. */
+ movl $0, %eax
+ setl %al
+ negl %eax
+ orl $1, %eax
+ VZEROUPPER
+ ret
+# endif
+
+# ifdef USE_AS_STRNCMP
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(char0):
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi), %ecx
+ cmpl (%rsi), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
+ subl %ecx, %eax
+# endif
+ VZEROUPPER
+ ret
+# endif
+
+ .p2align 4
+L(last_vector):
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+# ifdef USE_AS_STRNCMP
+ subq %rdx, %r11
+# endif
+ tzcntl %ecx, %edx
+# ifdef USE_AS_STRNCMP
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ xorl %eax, %eax
+ movl (%rdi, %rdx), %ecx
+ cmpl (%rsi, %rdx), %ecx
+ jne L(wcscmp_return)
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %edx
+ subl %edx, %eax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Comparing in the page boundary region requires special
+ treatment: it must be done one vector at a time, starting with
+ the wider ymm vector if possible, and with xmm otherwise. If
+ even fetching 16 bytes (xmm) would cross the boundary, the
+ comparison falls back to one byte/dword at a time.
+ */
+ .p2align 4
+L(cross_page):
+ /* Try one ymm vector at a time. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_1_vector)
+L(loop_1_vector):
+ vmovdqu (%rdi, %rdx), %ymm1
+ VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0
+ VPMINU %ymm1, %ymm0, %ymm0
+ VPCMPEQ %ymm7, %ymm0, %ymm0
+ vpmovmskb %ymm0, %ecx
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $VEC_SIZE, %edx
+
+ addl $VEC_SIZE, %eax
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jle L(loop_1_vector)
+L(cross_page_1_vector):
+ /* Less than 32 bytes to check, try one xmm vector. */
+ cmpl $(PAGE_SIZE - 16), %eax
+ jg L(cross_page_1_xmm)
+ vmovdqu (%rdi, %rdx), %xmm1
+ VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0
+ VPMINU %xmm1, %xmm0, %xmm0
+ VPCMPEQ %xmm7, %xmm0, %xmm0
+ vpmovmskb %xmm0, %ecx
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $16, %edx
+# ifndef USE_AS_WCSCMP
+ addl $16, %eax
+# endif
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+
+L(cross_page_1_xmm):
+# ifndef USE_AS_WCSCMP
+ /* Less than 16 bytes to check, try an 8-byte vector. NB: No need
+ for wcscmp nor wcsncmp since a wide char is 4 bytes. */
+ cmpl $(PAGE_SIZE - 8), %eax
+ jg L(cross_page_8bytes)
+ vmovq (%rdi, %rdx), %xmm1
+ vmovq (%rsi, %rdx), %xmm0
+ VPCMPEQ %xmm0, %xmm1, %xmm0
+ VPMINU %xmm1, %xmm0, %xmm0
+ VPCMPEQ %xmm7, %xmm0, %xmm0
+ vpmovmskb %xmm0, %ecx
+ /* Only last 8 bits are valid. */
+ andl $0xff, %ecx
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $8, %edx
+ addl $8, %eax
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+
+L(cross_page_8bytes):
+ /* Less than 8 bytes to check, try a 4-byte vector. */
+ cmpl $(PAGE_SIZE - 4), %eax
+ jg L(cross_page_4bytes)
+ vmovd (%rdi, %rdx), %xmm1
+ vmovd (%rsi, %rdx), %xmm0
+ VPCMPEQ %xmm0, %xmm1, %xmm0
+ VPMINU %xmm1, %xmm0, %xmm0
+ VPCMPEQ %xmm7, %xmm0, %xmm0
+ vpmovmskb %xmm0, %ecx
+ /* Only last 4 bits are valid. */
+ andl $0xf, %ecx
+ testl %ecx, %ecx
+ jne L(last_vector)
+
+ addl $4, %edx
+# ifdef USE_AS_STRNCMP
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
+ (%r11). */
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+
+L(cross_page_4bytes):
+# endif
+ /* Less than 4 bytes to check, try one byte/dword at a time. */
+# ifdef USE_AS_STRNCMP
+ cmpq %r11, %rdx
+ jae L(zero)
+# endif
+# ifdef USE_AS_WCSCMP
+ movl (%rdi, %rdx), %eax
+ movl (%rsi, %rdx), %ecx
+# else
+ movzbl (%rdi, %rdx), %eax
+ movzbl (%rsi, %rdx), %ecx
+# endif
+ testl %eax, %eax
+ jne L(cross_page_loop)
+ subl %ecx, %eax
+ VZEROUPPER
+ ret
+END (STRCMP)
+#endif
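One detail of the file above that is easy to miss: the entry sequence
ORs the two pointers' page offsets together before comparing against
PAGE_SIZE - (VEC_SIZE * 4). This is safe because (a | b) is always
>= max (a, b). A hedged C sketch of that guard follows; the helper
name may_cross_page is an assumption, not a symbol from the patch:

    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Conservative page-cross test: OR-ing the offsets may overstate
       either one, so this can take the slow path needlessly, but it
       never misses a case where reading VEC_SIZE * 4 bytes from
       either string could cross into the next page.  */
    static int
    may_cross_page (const char *s1, const char *s2)
    {
      unsigned int off = ((uintptr_t) s1 | (uintptr_t) s2)
                         & (PAGE_SIZE - 1);
      return off > PAGE_SIZE - VEC_SIZE * 4;
    }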
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 0335f96b09..b903e418df 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -29,12 +29,18 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
new file mode 100644
index 0000000000..1678bcc235
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -0,0 +1,3 @@
+#define STRCMP __strncmp_avx2
+#define USE_AS_STRNCMP 1
+#include "strcmp-avx2.S"
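For reference, the semantics this wrapper adds via USE_AS_STRNCMP can
be stated as a plain scalar loop. This is only a behavioral model of
the rule the %r11 bookkeeping in strcmp-avx2.S implements, not code
from the patch:

    #include <stddef.h>

    /* strncmp rule: stop at the first difference, the first null, or
       the length limit, whichever comes first; reaching the limit
       with no difference means the strings compare equal.  */
    static int
    strncmp_ref (const unsigned char *s1, const unsigned char *s2,
                 size_t n)
    {
      for (size_t i = 0; i < n; i++)
        {
          if (s1[i] != s2[i])
            return (int) s1[i] - (int) s2[i];
          if (s1[i] == '\0')
            return 0;
        }
      return 0;
    }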
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index 0d2d657a36..02b6d0b6f5 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -29,12 +29,18 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
if (CPU_FEATURES_CPU_P (cpu_features, SSE4_2)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2.S b/sysdeps/x86_64/multiarch/wcscmp-avx2.S
new file mode 100644
index 0000000000..e5da4da689
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-avx2.S
@@ -0,0 +1,4 @@
+#define STRCMP __wcscmp_avx2
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2.S"
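The wide-char variants must compare a differing element as SIGNED, and
strcmp-avx2.S produces the -1/+1 result branchlessly with
setl/negl/orl. The same trick in C, with a hypothetical helper name;
it is only reached for a != b:

    #include <stdint.h>

    /* -(a < b) is 0 or -1; OR-ing in 1 maps 0 to +1 and leaves -1
       unchanged, mirroring setl %al; negl %eax; orl $1, %eax.  */
    static int
    wcscmp_result (int32_t a, int32_t b)
    {
      return -(a < b) | 1;
    }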
diff --git a/sysdeps/x86_64/multiarch/wcscmp-sse2.S b/sysdeps/x86_64/multiarch/wcscmp-sse2.S
new file mode 100644
index 0000000000..b129d1c073
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp-sse2.S
@@ -0,0 +1,23 @@
+/* wcscmp optimized with SSE2.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __wcscmp __wcscmp_sse2
+#endif
+
+#include "../wcscmp.S"
diff --git a/sysdeps/x86_64/multiarch/wcscmp.c b/sysdeps/x86_64/multiarch/wcscmp.c
new file mode 100644
index 0000000000..74d92cf0f9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcscmp.c
@@ -0,0 +1,37 @@
+/* Multiple versions of wcscmp.
+ Copyright (C) 2017-2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wcscmp __redirect_wcscmp
+# define __wcscmp __redirect___wcscmp
+# include <wchar.h>
+# undef wcscmp
+# undef __wcscmp
+
+# define SYMBOL_NAME wcscmp
+# include "ifunc-avx2.h"
+
+libc_ifunc_redirected (__redirect_wcscmp, __wcscmp, IFUNC_SELECTOR ());
+weak_alias (__wcscmp, wcscmp)
+
+# ifdef SHARED
+__hidden_ver1 (__wcscmp, __GI___wcscmp, __redirect_wcscmp)
+ __attribute__ ((visibility ("hidden")));
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
new file mode 100644
index 0000000000..4fa1de4d3f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
@@ -0,0 +1,5 @@
+#define STRCMP __wcsncmp_avx2
+#define USE_AS_STRNCMP 1
+#define USE_AS_WCSCMP 1
+
+#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-sse2.c b/sysdeps/x86_64/multiarch/wcsncmp-sse2.c
new file mode 100644
index 0000000000..2bc7b4f693
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp-sse2.c
@@ -0,0 +1,20 @@
+/* wcsncmp optimized with SSE2.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WCSNCMP __wcsncmp_sse2
+#include <wcsmbs/wcsncmp.c>
diff --git a/sysdeps/x86_64/multiarch/wcsncmp.c b/sysdeps/x86_64/multiarch/wcsncmp.c
new file mode 100644
index 0000000000..90e9a352d9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsncmp.c
@@ -0,0 +1,31 @@
+/* Multiple versions of wcsncmp.
+ Copyright (C) 2018 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wcsncmp __redirect_wcsncmp
+# define __wcsncmp __redirect___wcsncmp
+# include <wchar.h>
+# undef wcsncmp
+# undef __wcsncmp
+
+# define SYMBOL_NAME wcsncmp
+# include "ifunc-avx2.h"
+
+libc_ifunc_redirected (__redirect_wcsncmp, wcsncmp, IFUNC_SELECTOR ());
+#endif
diff --git a/sysdeps/x86_64/wcscmp.S b/sysdeps/x86_64/wcscmp.S
index 1b9f81f54c..0d506c8b5c 100644
--- a/sysdeps/x86_64/wcscmp.S
+++ b/sysdeps/x86_64/wcscmp.S
@@ -946,5 +946,7 @@ L(equal):
ret
END (__wcscmp)
+#ifndef __wcscmp
libc_hidden_def (__wcscmp)
weak_alias (__wcscmp, wcscmp)
+#endif
--
2.19.1