Skip to content

Commit

Permalink
arm64: lib: Implement optimized string length routines
Browse files Browse the repository at this point in the history
This patch, based on Linaro's Cortex Strings library, adds
assembly-optimized strlen() and strnlen() functions.

Signed-off-by: Zhichang Yuan <zhichang.yuan@linaro.org>
Signed-off-by: Deepak Saxena <dsaxena@linaro.org>
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
  • Loading branch information
zhichang.yuan authored and Catalin Marinas committed May 23, 2014
1 parent 192c4d9 commit 0a42cb0
Show file tree
Hide file tree
Showing 5 changed files with 307 additions and 1 deletion.
6 changes: 6 additions & 0 deletions arch/arm64/include/asm/string.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ extern int strcmp(const char *, const char *);
#define __HAVE_ARCH_STRNCMP
extern int strncmp(const char *, const char *, __kernel_size_t);

#define __HAVE_ARCH_STRLEN
extern __kernel_size_t strlen(const char *);

#define __HAVE_ARCH_STRNLEN
extern __kernel_size_t strnlen(const char *, __kernel_size_t);

#define __HAVE_ARCH_MEMCPY
extern void *memcpy(void *, const void *, __kernel_size_t);

Expand Down
2 changes: 2 additions & 0 deletions arch/arm64/kernel/arm64ksyms.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ EXPORT_SYMBOL(strchr);
EXPORT_SYMBOL(strrchr);
EXPORT_SYMBOL(strcmp);
EXPORT_SYMBOL(strncmp);
EXPORT_SYMBOL(strlen);
EXPORT_SYMBOL(strnlen);
EXPORT_SYMBOL(memset);
EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(memmove);
Expand Down
3 changes: 2 additions & 1 deletion arch/arm64/lib/Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
memcmp.o strcmp.o strncmp.o strchr.o strrchr.o
memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
strchr.o strrchr.o
126 changes: 126 additions & 0 deletions arch/arm64/lib/strlen.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright (C) 2013 ARM Ltd.
* Copyright (C) 2013 Linaro.
*
* This code is based on glibc cortex strings work originally authored by Linaro
* and re-licensed under GPLv2 for the Linux kernel. The original code can
* be found @
*
* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
* files/head:/src/aarch64/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
* strlen - calculate the length of a NUL-terminated string
*
* The string is examined 16 bytes at a time, starting from the 16-byte
* aligned address at or below the string pointer, so bytes outside the
* string but inside the same aligned 16-byte blocks are also loaded.
*
* Parameters:
* x0 - const string pointer
* Returns:
* x0 - length of the string in bytes, excluding the terminating NUL
*
* x1-x12 are used as temporaries and the condition flags are modified.
*/

/* Arguments and results. */
/* srcin and len share x0: the result overwrites the incoming pointer. */
srcin .req x0
len .req x0

/* Locals and temporaries. */
/* src: load cursor; data1/data2: the two Dwords of the current 16 bytes. */
/* has_nul1/has_nul2: per-Dword NUL syndromes (non-zero iff a byte is zero). */
src .req x1
data1 .req x2
data2 .req x3
data2a .req x4
has_nul1 .req x5
has_nul2 .req x6
tmp1 .req x7
tmp2 .req x8
tmp3 .req x9
tmp4 .req x10
zeroones .req x11
pos .req x12

/*
* Byte patterns replicated across a 64-bit word. REP8_80 is defined but
* not referenced by the code below.
*/
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

ENTRY(strlen)
mov zeroones, #REP8_01
bic src, srcin, #15 /* src = srcin rounded down to a 16-byte boundary. */
ands tmp1, srcin, #15 /* tmp1 = offset of srcin inside that 16-byte block. */
b.ne .Lmisaligned /* Non-zero offset: mask the leading bytes first. */
/*
* NUL detection works on the principle that (X - 1) & (~X) & 0x80
* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
* can be done in parallel across the entire word.
*/
/*
* The inner loop deals with two Dwords at a time. This has a
* slightly higher start-up cost, but we should win quite quickly,
* especially on cores with a high number of issue slots per
* cycle, as we get much better parallelism out of the operations.
*/
.Lloop:
ldp data1, data2, [src], #16 /* Load 16 bytes, then advance src by 16. */
.Lrealigned:
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2 /* Syndrome for data1. */
bics has_nul2, tmp3, tmp4 /* Syndrome for data2; Z set iff it is zero. */
/*
* If has_nul2 == 0 (eq), compare has_nul1 against 0; otherwise force
* the flags to "ne". The loop therefore continues only while both
* syndromes are zero.
*/
ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */
b.eq .Lloop

/*
* src has already been advanced past the 16 bytes just examined, so
* len starts 16 (NUL in data2: 8 after the adjustment below) bytes
* beyond the start of the Dword that holds the NUL.
*/
sub len, src, srcin
cbz has_nul1, .Lnul_in_data2
/* NUL is in the first Dword: step back 8 more bytes and use its syndrome. */
CPU_BE( mov data2, data1 ) /*prepare data to re-calculate the syndrome*/
sub len, len, #8
mov has_nul2, has_nul1
.Lnul_in_data2:
/*
* For big-endian, carry propagation (if the final byte in the
* string is 0x01) means we cannot use has_nul directly. The
* easiest way to get the correct byte is to byte-swap the data
* and calculate the syndrome a second time.
*/
CPU_BE( rev data2, data2 )
CPU_BE( sub tmp1, data2, zeroones )
CPU_BE( orr tmp2, data2, #REP8_7f )
CPU_BE( bic has_nul2, tmp1, tmp2 )

sub len, len, #8 /* len = offset of the Dword containing the NUL. */
/*
* Byte-reverse the syndrome so that the marker for the lowest-addressed
* zero byte is the most significant one; clz then yields 8 times the
* index of that byte within the Dword.
*/
rev has_nul2, has_nul2
clz pos, has_nul2
add len, len, pos, lsr #3 /* Bits to bytes. */
ret

.Lmisaligned:
/*
* tmp1 holds the offset (1..15) of srcin within the aligned block.
* The flags set by this cmp are consumed by the csinv/csel at the end
* of this path; none of the instructions in between modify them.
*/
cmp tmp1, #8
neg tmp1, tmp1
ldp data1, data2, [src], #16
lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
mov tmp2, #~0
/* tmp2 becomes a mask of 0xff bytes covering the bytes before srcin. */
/* Big-endian. Early bytes are at MSB. */
CPU_BE( lsl tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */
/* Little-endian. Early bytes are at LSB. */
CPU_LE( lsr tmp2, tmp2, tmp1 ) /* Shift (tmp1 & 63). */

/* Force the bytes that precede the string to 0xff so they never look like a NUL. */
orr data1, data1, tmp2
orr data2a, data2, tmp2
/* Offset <= 8: masked data1, data2 untouched. */
/* Offset > 8: data1 is entirely before the string (all ones), data2 is masked. */
csinv data1, data1, xzr, le
csel data2, data2, data2a, le
b .Lrealigned
ENDPROC(strlen)
171 changes: 171 additions & 0 deletions arch/arm64/lib/strnlen.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,171 @@
/*
* Copyright (C) 2013 ARM Ltd.
* Copyright (C) 2013 Linaro.
*
* This code is based on glibc cortex strings work originally authored by Linaro
* and re-licensed under GPLv2 for the Linux kernel. The original code can
* be found @
*
* http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
* files/head:/src/aarch64/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

#include <linux/linkage.h>
#include <asm/assembler.h>

/*
* strnlen - determine the length of a string, examining at most a given
* number of bytes
*
* The string is examined 16 bytes at a time, starting from the 16-byte
* aligned address at or below the string pointer, so bytes outside the
* requested range but inside the same aligned 16-byte blocks are also
* loaded. A limit of zero returns immediately without any load.
*
* Parameters:
* x0 - const string pointer
* x1 - maximal string length
* Returns:
* x0 - the string length, or the limit if no NUL is found within it
*
* x2-x14 are used as temporaries and the condition flags are modified.
*/

/* Arguments and results. */
/* srcin and len share x0: the result overwrites the incoming pointer. */
srcin .req x0
len .req x0
limit .req x1

/* Locals and temporaries. */
/* src: load cursor; data1/data2: the two Dwords of the current 16 bytes. */
/* has_nul1/has_nul2: per-Dword NUL syndromes (non-zero iff a byte is zero). */
/* limit_wd: number of 16-byte blocks still allowed, minus one. */
src .req x2
data1 .req x3
data2 .req x4
data2a .req x5
has_nul1 .req x6
has_nul2 .req x7
tmp1 .req x8
tmp2 .req x9
tmp3 .req x10
tmp4 .req x11
zeroones .req x12
pos .req x13
limit_wd .req x14

/*
* Byte patterns replicated across a 64-bit word. REP8_80 is defined but
* not referenced by the code below.
*/
#define REP8_01 0x0101010101010101
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080

ENTRY(strnlen)
cbz limit, .Lhit_limit /* limit == 0: return 0 without touching memory. */
mov zeroones, #REP8_01
bic src, srcin, #15 /* src = srcin rounded down to a 16-byte boundary. */
ands tmp1, srcin, #15 /* tmp1 = offset of srcin inside that 16-byte block. */
b.ne .Lmisaligned
/* Calculate the number of full and partial words -1. */
sub limit_wd, limit, #1 /* Limit != 0, so no underflow. */
lsr limit_wd, limit_wd, #4 /* Convert to Qwords. */

/*
* NUL detection works on the principle that (X - 1) & (~X) & 0x80
* (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
* can be done in parallel across the entire word.
*/
/*
* The inner loop deals with two Dwords at a time. This has a
* slightly higher start-up cost, but we should win quite quickly,
* especially on cores with a high number of issue slots per
* cycle, as we get much better parallelism out of the operations.
*/
.Lloop:
ldp data1, data2, [src], #16 /* Load 16 bytes, then advance src by 16. */
.Lrealigned:
sub tmp1, data1, zeroones
orr tmp2, data1, #REP8_7f
sub tmp3, data2, zeroones
orr tmp4, data2, #REP8_7f
bic has_nul1, tmp1, tmp2 /* Syndrome for data1. */
bic has_nul2, tmp3, tmp4 /* Syndrome for data2. */
subs limit_wd, limit_wd, #1 /* Count this Qword; goes negative on the last one. */
orr tmp1, has_nul1, has_nul2 /* Non-zero iff either Dword holds a NUL. */
/*
* If limit_wd is still non-negative (pl), compare tmp1 against 0;
* otherwise force the flags to "ne". The loop therefore continues only
* while Qwords remain and no NUL has been seen.
*/
ccmp tmp1, #0, #0, pl /* NZCV = 0000 */
b.eq .Lloop

cbz tmp1, .Lhit_limit /* No null in final Qword. */

/*
* We know there's a null in the final Qword. The easiest thing
* to do now is work out the length of the string and return
* MIN (len, limit).
*/
sub len, src, srcin /* src is already 16 bytes past the examined block. */
cbz has_nul1, .Lnul_in_data2
/* NUL is in the first Dword: step back 8 more bytes and use its syndrome. */
CPU_BE( mov data2, data1 ) /* Prepare data to re-calculate the syndrome. */

sub len, len, #8
mov has_nul2, has_nul1
.Lnul_in_data2:
/*
* For big-endian, carry propagation (if the final byte in the
* string is 0x01) means we cannot use has_nul directly. The
* easiest way to get the correct byte is to byte-swap the data
* and calculate the syndrome a second time.
*/
CPU_BE( rev data2, data2 )
CPU_BE( sub tmp1, data2, zeroones )
CPU_BE( orr tmp2, data2, #REP8_7f )
CPU_BE( bic has_nul2, tmp1, tmp2 )

sub len, len, #8 /* len = offset of the Dword containing the NUL. */
/*
* Byte-reverse the syndrome so that the marker for the lowest-addressed
* zero byte is the most significant one; clz then yields 8 times the
* index of that byte within the Dword.
*/
rev has_nul2, has_nul2
clz pos, has_nul2
add len, len, pos, lsr #3 /* Bits to bytes. */
cmp len, limit /* Unsigned comparison (ls below). */
csel len, len, limit, ls /* Return the lower value. */
ret

.Lmisaligned:
/*
* Deal with a partial first word.
* We're doing two things in parallel here;
* 1) Calculate the number of words (but avoiding overflow if
* limit is near ULONG_MAX) - to do this we need to work out
* limit + tmp1 - 1 as a 65-bit value before shifting it;
* 2) Load and mask the initial data words - we force the bytes
* before the ones we are interested in to 0xff - this ensures
* early bytes will not hit any zero detection.
*/
ldp data1, data2, [src], #16

/*
* Split limit - 1 into its Qword count and its low four bits, add the
* alignment offset (tmp1, 1..15) to the low bits only, and fold the
* resulting carry (tmp3 >> 4) back into the Qword count.
*/
sub limit_wd, limit, #1
and tmp3, limit_wd, #15
lsr limit_wd, limit_wd, #4

add tmp3, tmp3, tmp1
add limit_wd, limit_wd, tmp3, lsr #4

neg tmp4, tmp1
lsl tmp4, tmp4, #3 /* Bytes beyond alignment -> bits. */

mov tmp2, #~0
/* tmp2 becomes a mask of 0xff bytes covering the bytes before srcin. */
/* Big-endian. Early bytes are at MSB. */
CPU_BE( lsl tmp2, tmp2, tmp4 ) /* Shift (tmp4 & 63). */
/* Little-endian. Early bytes are at LSB. */
CPU_LE( lsr tmp2, tmp2, tmp4 ) /* Shift (tmp4 & 63). */

/* Flags from this cmp feed the csinv/csel below; the orr's leave them intact. */
cmp tmp1, #8

orr data1, data1, tmp2
orr data2a, data2, tmp2

/* Offset <= 8: masked data1, data2 untouched. */
/* Offset > 8: data1 is entirely before the string (all ones), data2 is masked. */
csinv data1, data1, xzr, le
csel data2, data2, data2a, le
b .Lrealigned

.Lhit_limit:
/* No NUL within the first limit bytes (or limit == 0): return limit. */
mov len, limit
ret
ENDPROC(strnlen)

0 comments on commit 0a42cb0

Please sign in to comment.