Skip to content

Commit

Permalink
PowerPC: strcpy/stpcpy optimization for PPC64/POWER7
Browse files Browse the repository at this point in the history
This patch intends to unify both strcpy and stpcpy implementationsi
for PPC64 and PPC64/POWER7. The idead default powerpc64 implementation
is to provide both doubleword and word aligned memory access.

For PPC64/POWER7 is also provide doubleword and word memory access,
remove the branch hints, use the cmpb instruction for compare
doubleword/words, and add an optimization for inputs of same alignment.
  • Loading branch information
Adhemerval Zanella committed Oct 25, 2013
1 parent 151659f commit 69f13db
Show file tree
Hide file tree
Showing 5 changed files with 418 additions and 134 deletions.
11 changes: 11 additions & 0 deletions ChangeLog
Original file line number Diff line number Diff line change
@@ -1,3 +1,14 @@
2013-10-04 Adhemerval Zanella <azanella@linux.vnet.ibm.com>

* sysdeps/powerpc/powerpc64/strcpy.S (strcpy): Add word load/store
to provide a boost for large inputs with word alignment.
* sysdeps/powerpc/powerpc64/stpcpy.S (__stpcpy): Rewrite
implementation based on optimized PPC64 strcpy.
* sysdeps/powerpc/powerpc64/power7/strcpy.S: New file: optimized
strcpy for PPC64/POWER7 based on both doubleword and word load/store.
* sysdeps/powerpc/powerpc64/power7/stpcpy.S: New file: optimized
stpcpy for PPC64/POWER7 based on PPC64/POWER7 strcpy.

2013-10-25 Ondřej Bílka <neleai@seznam.cz>

[BZ 2801]
Expand Down
24 changes: 24 additions & 0 deletions sysdeps/powerpc/powerpc64/power7/stpcpy.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
/* Optimized stpcpy implementation for PowerPC64/POWER7.
Copyright (C) 2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */

#define USE_AS_STPCPY
#include <sysdeps/powerpc/powerpc64/power7/strcpy.S>

weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)
libc_hidden_builtin_def (stpcpy)
274 changes: 274 additions & 0 deletions sysdeps/powerpc/powerpc64/power7/strcpy.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,274 @@
/* Optimized strcpy/stpcpy implementation for PowerPC64/POWER7.
Copyright (C) 2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */

#include <sysdep.h>

/* Implements the function
char * [r3] strcpy (char *dest [r3], const char *src [r4])
or
char * [r3] strcpy (char *dest [r3], const char *src [r4])
if USE_AS_STPCPY is defined. It tries to use aligned memory accesses
when possible using the following algorithm:
if (((((uintptr_t)dst & 0x7UL) == 0) && ((uintptr_t)src & 0x7UL) == 0))
goto aligned_doubleword_copy;
if (((((uintptr_t)dst & 0x3UL) == 0) && ((uintptr_t)src & 0x3UL) == 0))
goto aligned_word_copy;
if (((uintptr_t)dst & 0x7UL) == ((uintptr_t)src & 0x7UL))
goto same_alignment;
goto unaligned;
The aligned comparison are made using cmpb instructions. */

#ifdef USE_AS_STPCPY
# define FUNC_NAME __stpcpy
#else
# define FUNC_NAME strcpy
#endif

.machine power7
EALIGN (FUNC_NAME, 4, 0)
CALL_MCOUNT 2

#define rTMP r0
#ifdef USE_AS_STPCPY
#define rRTN r3 /* pointer to previous word/doubleword in dest */
#else
#define rRTN r12 /* pointer to previous word/doubleword in dest */
#endif
#define rSRC r4 /* pointer to previous word/doubleword in src */
#define rMASK r5 /* mask 0xffffffff | 0xffffffffffffffff */
#define rWORD r6 /* current word from src */
#define rALT r7 /* alternate word from src */
#define rRTNAL r8 /* alignment of return pointer */
#define rSRCAL r9 /* alignment of source pointer */
#define rALCNT r10 /* bytes to read to reach 8 bytes alignment */
#define rSUBAL r11 /* doubleword minus unaligned displacement */

#ifndef USE_AS_STPCPY
/* Save the dst pointer to use as return value. */
mr rRTN, r3
#endif
or rTMP, rSRC, rRTN
clrldi. rTMP, rTMP, 61
bne L(check_word_alignment)
b L(aligned_doubleword_copy)

L(same_alignment):
/* Src and dst with same alignment: align both to doubleword. */
mr rALCNT, rRTN
lbz rWORD, 0(rSRC)
subfic rSUBAL, rRTNAL, 8
addi rRTN, rRTN, 1
addi rSRC, rSRC, 1
cmpdi cr7, rWORD, 0
stb rWORD, 0(rALCNT)
beq cr7, L(s2)

add rALCNT, rALCNT, rSUBAL
subf rALCNT, rRTN, rALCNT
addi rALCNT, rALCNT, 1
mtctr rALCNT
b L(s1)

.align 4
L(s0):
addi rSRC, rSRC, 1
lbz rWORD, -1(rSRC)
cmpdi cr7, rWORD, 0
stb rWORD, -1(rALCNT)
beqlr cr7
mr rRTN, rALCNT
L(s1):
addi rALCNT, rRTN,1
bdnz L(s0)
b L(aligned_doubleword_copy)
.align 4
L(s2):
mr rRTN, rALCNT
blr

/* For doubleword aligned memory, operate using doubleword load and stores. */
.align 4
L(aligned_doubleword_copy):
li rMASK, 0
addi rRTN, rRTN, -8
ld rWORD, 0(rSRC)
b L(g2)

.align 4
L(g0): ldu rALT, 8(rSRC)
stdu rWORD, 8(rRTN)
cmpb rTMP, rALT, rMASK
cmpdi rTMP, 0
bne L(g1)
ldu rWORD, 8(rSRC)
stdu rALT, 8(rRTN)
L(g2): cmpb rTMP, rWORD, rMASK
cmpdi rTMP, 0 /* If rTMP is 0, no null's have been found. */
beq L(g0)

mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
L(g1):
#ifdef __LITTLE_ENDIAN__
extrdi. rTMP, rALT, 8, 56
stbu rALT, 8(rRTN)
beqlr-
extrdi. rTMP, rALT, 8, 48
stbu rTMP, 1(rRTN)
beqlr-
extrdi. rTMP, rALT, 8, 40
stbu rTMP, 1(rRTN)
beqlr-
extrdi. rTMP, rALT, 8, 32
stbu rTMP, 1(rRTN)
beqlr-
extrdi. rTMP, rALT, 8, 24
stbu rTMP, 1(rRTN)
beqlr-
extrdi. rTMP, rALT, 8, 16
stbu rTMP, 1(rRTN)
beqlr-
extrdi. rTMP, rALT, 8, 8
stbu rTMP, 1(rRTN)
beqlr-
extrdi rTMP, rALT, 8, 0
stbu rTMP, 1(rRTN)
#else
extrdi. rTMP, rALT, 8, 0
stbu rTMP, 8(rRTN)
beqlr
extrdi. rTMP, rALT, 8, 8
stbu rTMP, 1(rRTN)
beqlr
extrdi. rTMP, rALT, 8, 16
stbu rTMP, 1(rRTN)
beqlr
extrdi. rTMP, rALT, 8, 24
stbu rTMP, 1(rRTN)
beqlr
extrdi. rTMP, rALT, 8, 32
stbu rTMP, 1(rRTN)
beqlr
extrdi. rTMP, rALT, 8, 40
stbu rTMP, 1(rRTN)
beqlr
extrdi. rTMP, rALT, 8, 48
stbu rTMP, 1(rRTN)
beqlr
stbu rALT, 1(rRTN)
#endif
blr

L(check_word_alignment):
clrldi. rTMP, rTMP, 62
beq L(aligned_word_copy)
rldicl rRTNAL, rRTN, 0, 61
rldicl rSRCAL, rSRC, 0, 61
cmpld cr7, rSRCAL, rRTNAL
beq cr7, L(same_alignment)
b L(unaligned)

/* For word aligned memory, operate using word load and stores. */
.align 4
L(aligned_word_copy):
li rMASK, 0
addi rRTN, rRTN, -4
lwz rWORD, 0(rSRC)
b L(g5)

.align 4
L(g3): lwzu rALT, 4(rSRC)
stwu rWORD, 4(rRTN)
cmpb rTMP, rALT, rMASK
cmpwi rTMP, 0
bne L(g4)
lwzu rWORD, 4(rSRC)
stwu rALT, 4(rRTN)
L(g5): cmpb rTMP, rWORD, rMASK
cmpwi rTMP, 0 /* If rTMP is 0, no null in word. */
beq L(g3)

mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
L(g4):
#ifdef __LITTLE_ENDIAN__
rlwinm. rTMP, rALT, 0, 24, 31
stbu rALT, 4(rRTN)
beqlr-
rlwinm. rTMP, rALT, 24, 24, 31
stbu rTMP, 1(rRTN)
beqlr-
rlwinm. rTMP, rALT, 16, 24, 31
stbu rTMP, 1(rRTN)
beqlr-
rlwinm rTMP, rALT, 8, 24, 31
stbu rTMP, 1(rRTN)
#else
rlwinm. rTMP, rALT, 8, 24, 31
stbu rTMP, 4(rRTN)
beqlr
rlwinm. rTMP, rALT, 16, 24, 31
stbu rTMP, 1(rRTN)
beqlr
rlwinm. rTMP, rALT, 24, 24, 31
stbu rTMP, 1(rRTN)
beqlr
stbu rALT, 1(rRTN)
#endif
blr

/* Oh well. In this case, we just do a byte-by-byte copy. */
.align 4
L(unaligned):
lbz rWORD, 0(rSRC)
addi rRTN, rRTN, -1
cmpdi rWORD, 0
beq L(u2)

.align 5
L(u0): lbzu rALT, 1(rSRC)
stbu rWORD, 1(rRTN)
cmpdi rALT, 0
beq L(u1)
lbzu rWORD, 1(rSRC)
stbu rALT, 1(rRTN)
cmpdi rWORD, 0
beq L(u2)
lbzu rALT, 1(rSRC)
stbu rWORD, 1(rRTN)
cmpdi rALT, 0
beq L(u1)
lbzu rWORD, 1(rSRC)
stbu rALT, 1(rRTN)
cmpdi rWORD, 0
bne L(u0)
L(u2): stbu rWORD, 1(rRTN)
blr
L(u1): stbu rALT, 1(rRTN)
blr
END (FUNC_NAME)

#ifndef USE_AS_STPCPY
libc_hidden_builtin_def (strcpy)
#endif
Loading

0 comments on commit 69f13db

Please sign in to comment.