Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 100289 | Differences between
and this patch

Collapse All | Expand All

(-)sysdeps/x86_64/strlen.S (-137 / +403 lines)
Lines 1-139 Link Here
1
/* strlen(str) -- determine the length of the string STR.
1
# $Header: /K8_Projects/Glibc/amd64strlen.S 3     10/06/03 11:00 Emenezes $
2
   Copyright (C) 2002, 2003 Free Software Foundation, Inc.
2
3
   Based on i486 version contributed by Ulrich Drepper <drepper@redhat.com>.
3
# (c) 2002 Advanced Micro Devices, Inc.
4
   This file is part of the GNU C Library.
4
# YOUR USE OF THIS CODE IS SUBJECT TO THE TERMS
5
5
# AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC
6
   The GNU C Library is free software; you can redistribute it and/or
6
# LICENSE FOUND IN THE "README" FILE THAT IS
7
   modify it under the terms of the GNU Lesser General Public
7
# INCLUDED WITH THIS FILE
8
   License as published by the Free Software Foundation; either
8
9
   version 2.1 of the License, or (at your option) any later version.
9
#include "sysdep.h"
10
10
#include <rtld-global-offsets.h>
11
   The GNU C Library is distributed in the hope that it will be useful,
11
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
12
#ifdef PIC
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13
	.globl _rtld_local_ro
14
   Lesser General Public License for more details.
14
        .hidden _rtld_local_ro
15
15
        .set    _rtld_local_ro,_rtld_global_ro
16
   You should have received a copy of the GNU Lesser General Public
16
#endif
17
   License along with the GNU C Library; if not, write to the Free
17
        .text
18
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
18
19
   02111-1307 USA.  */
19
ENTRY   (strlen)                # (const char *s)
20
20
21
#include <sysdep.h>
21
        mov     %rdi, %rsi
22
#include "asm-syntax.h"
22
        neg     %rdi
23
#include "bp-sym.h"
23
24
#include "bp-asm.h"
24
L(strlenaligntry):
25
25
        mov     %rsi , %r8
26
26
        and     $7, %r8d
27
	.text
27
	jz	L(strlenalignafter)
28
ENTRY (strlen)
28
29
	movq %rdi, %rcx		/* Duplicate source pointer. */
29
L(strlenalign):                            # 8-byte align
30
	andl $7, %ecx		/* mask alignment bits */
30
        sub     $8, %r8
31
	movq %rdi, %rax		/* duplicate destination.  */
31
32
	jz 1f			/* aligned => start loop */
32
        .p2align 4
33
33
34
	neg %ecx		/* We need to align to 8 bytes.  */
34
L(strlenalignloop):
35
	addl $8,%ecx
35
        cmpb    $0, (%rsi)
36
	/* Search the first bytes directly.  */
36
        je      L(exit)
37
0:	cmpb $0x0,(%rax)	/* is byte NUL? */
37
38
	je 2f			/* yes => return */
38
        inc     %rsi
39
	incq %rax		/* increment pointer */
39
        inc     %r8
40
	decl %ecx
40
        jnz     L(strlenalignloop)
41
	jnz 0b
41
42
42
        .p2align 4
43
1:	movq $0xfefefefefefefeff,%r8 /* Save magic.  */
43
44
44
L(strlenalignafter):
45
	.p2align 4		/* Align loop.  */
45
46
4:	/* Main Loop is unrolled 4 times.  */
46
L(strlen56try):
47
	/* First unroll.  */
47
48
	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
48
L(strlen56):                               # 56-byte
49
	addq $8,%rax		/* adjust pointer for next word */
49
        mov     (%rsi), %rax
50
	movq %r8, %rdx		/* magic value */
50
        mov     $0xfefefefefefefeff, %rcx
51
	addq %rcx, %rdx		/* add the magic value to the word.  We get
51
52
				   carry bits reported for each byte which
52
L(strlen56loop):
53
				   is *not* 0 */
53
        mov     %rcx, %r8
54
	jnc 3f			/* highest byte is NUL => return pointer */
54
        add     %rax, %r8
55
	xorq %rcx, %rdx		/* (word+magic)^word */
55
        jnc     L(strlentail)
56
	orq %r8, %rdx		/* set all non-carry bits */
56
57
	incq %rdx		/* add 1: if one carry bit was *not* set
57
        xor     %rax, %r8
58
				   the addition will not result in 0.  */
58
        or      %rcx, %r8
59
	jnz 3f			/* found NUL => return pointer */
59
        inc     %r8
60
60
        jnz     L(strlentail)
61
	/* Second unroll.  */
61
62
	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
62
        mov     8 (%rsi), %rax
63
	addq $8,%rax		/* adjust pointer for next word */
63
        lea     8 (%rsi), %rsi
64
	movq %r8, %rdx		/* magic value */
64
65
	addq %rcx, %rdx		/* add the magic value to the word.  We get
65
        mov     %rcx, %r8
66
				   carry bits reported for each byte which
66
        add     %rax, %r8
67
				   is *not* 0 */
67
        jnc     L(strlentail)
68
	jnc 3f			/* highest byte is NUL => return pointer */
68
69
	xorq %rcx, %rdx		/* (word+magic)^word */
69
        xor     %rax, %r8
70
	orq %r8, %rdx		/* set all non-carry bits */
70
        or      %rcx, %r8
71
	incq %rdx		/* add 1: if one carry bit was *not* set
71
        inc     %r8
72
				   the addition will not result in 0.  */
72
        jnz     L(strlentail)
73
	jnz 3f			/* found NUL => return pointer */
73
74
74
        mov     8 (%rsi), %rax
75
	/* Third unroll.  */
75
        lea     8 (%rsi), %rsi
76
	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
76
77
	addq $8,%rax		/* adjust pointer for next word */
77
        mov     %rcx, %r8
78
	movq %r8, %rdx		/* magic value */
78
        add     %rax, %r8
79
	addq %rcx, %rdx		/* add the magic value to the word.  We get
79
        jnc     L(strlentail)
80
				   carry bits reported for each byte which
80
81
				   is *not* 0 */
81
        xor     %rax, %r8
82
	jnc 3f			/* highest byte is NUL => return pointer */
82
        or      %rcx, %r8
83
	xorq %rcx, %rdx		/* (word+magic)^word */
83
        inc     %r8
84
	orq %r8, %rdx		/* set all non-carry bits */
84
        jnz     L(strlentail)
85
	incq %rdx		/* add 1: if one carry bit was *not* set
85
86
				   the addition will not result in 0.  */
86
        mov     8 (%rsi), %rax
87
	jnz 3f			/* found NUL => return pointer */
87
        lea     8 (%rsi), %rsi
88
88
89
	/* Fourth unroll.  */
89
        mov     %rcx, %r8
90
	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
90
        add     %rax, %r8
91
	addq $8,%rax		/* adjust pointer for next word */
91
        jnc     L(strlentail)
92
	movq %r8, %rdx		/* magic value */
92
93
	addq %rcx, %rdx		/* add the magic value to the word.  We get
93
        xor     %rax, %r8
94
				   carry bits reported for each byte which
94
        or      %rcx, %r8
95
				   is *not* 0 */
95
        inc     %r8
96
	jnc 3f			/* highest byte is NUL => return pointer */
96
        jnz     L(strlentail)
97
	xorq %rcx, %rdx		/* (word+magic)^word */
97
98
	orq %r8, %rdx		/* set all non-carry bits */
98
        mov     8 (%rsi), %rax
99
	incq %rdx		/* add 1: if one carry bit was *not* set
99
        lea     8 (%rsi), %rsi
100
				   the addition will not result in 0.  */
100
101
	jz 4b			/* no NUL found => continue loop */
101
        mov     %rcx, %r8
102
102
        add     %rax, %r8
103
	.p2align 4		/* Align, it's a jump target.  */
103
        jnc     L(strlentail)
104
3:	subq $8,%rax		/* correct pointer increment.  */
104
105
105
        xor     %rax, %r8
106
	testb %cl, %cl		/* is first byte NUL? */
106
        or      %rcx, %r8
107
	jz 2f			/* yes => return */
107
        inc     %r8
108
	incq %rax		/* increment pointer */
108
        jnz     L(strlentail)
109
109
110
	testb %ch, %ch		/* is second byte NUL? */
110
        mov     8 (%rsi), %rax
111
	jz 2f			/* yes => return */
111
        lea     8 (%rsi), %rsi
112
	incq %rax		/* increment pointer */
112
113
113
        mov     %rcx, %r8
114
	testl $0x00ff0000, %ecx /* is third byte NUL? */
114
        add     %rax, %r8
115
	jz 2f			/* yes => return pointer */
115
        jnc     L(strlentail)
116
	incq %rax		/* increment pointer */
116
117
117
        xor     %rax, %r8
118
	testl $0xff000000, %ecx /* is fourth byte NUL? */
118
        or      %rcx, %r8
119
	jz 2f			/* yes => return pointer */
119
        inc     %r8
120
	incq %rax		/* increment pointer */
120
        jnz     L(strlentail)
121
121
122
	shrq $32, %rcx		/* look at other half.  */
122
        mov     8 (%rsi), %rax
123
123
        lea     8 (%rsi), %rsi
124
	testb %cl, %cl		/* is first byte NUL? */
124
125
	jz 2f			/* yes => return */
125
        mov     %rcx, %r8
126
	incq %rax		/* increment pointer */
126
        add     %rax, %r8
127
127
        jnc     L(strlentail)
128
	testb %ch, %ch		/* is second byte NUL? */
128
129
	jz 2f			/* yes => return */
129
        xor     %rax, %r8
130
	incq %rax		/* increment pointer */
130
        or      %rcx, %r8
131
131
        inc     %r8
132
	testl $0xff0000, %ecx	/* is third byte NUL? */
132
        jnz     L(strlentail)
133
	jz 2f			/* yes => return pointer */
133
134
	incq %rax		/* increment pointer */
134
        mov     8 (%rsi), %rax
135
2:
135
        lea     8 (%rsi), %rsi
136
	subq %rdi, %rax		/* compute difference to string start */
136
137
	ret
137
L(strlen56after):
138
139
L(strlen32):                               # 32-byte
/* Unrolled word-at-a-time scan bounded by a counter taken from the
   dynamic linker globals.
   Register roles, as set up by the code that falls through to here:
     rsi = address of the 8-byte word currently held in rax
     rax = that word
     rcx = probe constant 0xfefefefefefefeff (see the disabled mov below)
     rdi = negated start pointer (only consumed at the exit label)
     r9  = loop budget, loaded below
     r8, rdx = scratch
   Every probe that suspects a NUL leaves through strlentail with
   rsi/rax still describing the suspect word.  */
140
#       mov     $0xfefefefefefefeff, %rcx
141
#       mov     (%rsi), %rax
142
143
#ifdef PIC
144
        mov     _rtld_local_ro@GOTPCREL(%rip), %r8      # r8 = address of _rtld_local_ro, fetched from the GOT
145
	mov	RTLD_GLOBAL_DL_CACHE1SIZE(%r8), %r9	# r9 = field at that offset; presumably dl_cache1size - confirm in rtld-global-offsets.sym
146
#else
147
        mov     _dl_cache1size, %r9             # static build: read the variable directly
148
#endif
149
	
150
        .p2align 4
151
152
L(strlen32loop):
153
        mov     %rcx, %r8               # step 1 of 8: r8 = probe constant
154
        add     %rax, %r8               # r8 = word + constant; CF records the carry out
155
        sbb     %rdx, %rdx              # rdx = -CF (all ones if the add carried, else 0)
156
157
        xor     %rax, %r8               # r8 = (word + constant) ^ word
158
        or      %rcx, %r8               # force the bits that are set in the constant
159
        sub     %rdx, %r8               # r8 += CF; replaces the jnc / inc pair used in the 56-byte section
160
        jnz     L(strlentail)           # non-zero: hand this word to the byte-wise tail scan
161
162
        mov     8 (%rsi), %rax          # fetch the next word
163
        add     $8, %rsi                # rsi follows the word now in rax
164
165
        mov     %rcx, %r8               # step 2 of 8 (same probe)
166
        add     %rax, %r8
167
        sbb     %rdx, %rdx
168
169
        xor     %rax, %r8
170
        or      %rcx, %r8
171
        sub     %rdx, %r8
172
        jnz     L(strlentail)
173
174
        mov     8 (%rsi), %rax
175
        add     $8, %rsi
176
177
        mov     %rcx, %r8               # step 3 of 8
178
        add     %rax, %r8
179
        sbb     %rdx, %rdx
180
181
        xor     %rax, %r8
182
        or      %rcx, %r8
183
        sub     %rdx, %r8
184
        jnz     L(strlentail)
185
186
        mov     8 (%rsi), %rax
187
        add     $8, %rsi
188
189
        mov     %rcx, %r8               # step 4 of 8
190
        add     %rax, %r8
191
        sbb     %rdx, %rdx
192
193
        xor     %rax, %r8
194
        or      %rcx, %r8
195
        sub     %rdx, %r8
196
        jnz     L(strlentail)
197
198
        mov     8 (%rsi), %rax
199
        add     $8, %rsi
200
201
        mov     %rcx, %r8               # step 5 of 8
202
        add     %rax, %r8
203
        sbb     %rdx, %rdx
204
205
        xor     %rax, %r8
206
        or      %rcx, %r8
207
        sub     %rdx, %r8
208
        jnz     L(strlentail)
209
210
        mov     8 (%rsi), %rax
211
        add     $8, %rsi
212
213
        mov     %rcx, %r8               # step 6 of 8
214
        add     %rax, %r8
215
        sbb     %rdx, %rdx
216
217
        xor     %rax, %r8
218
        or      %rcx, %r8
219
        sub     %rdx, %r8
220
        jnz     L(strlentail)
221
222
        mov     8 (%rsi), %rax
223
        add     $8, %rsi
224
225
        mov     %rcx, %r8               # step 7 of 8
226
        add     %rax, %r8
227
        sbb     %rdx, %rdx
228
229
        xor     %rax, %r8
230
        or      %rcx, %r8
231
        sub     %rdx, %r8
232
        jnz     L(strlentail)
233
234
        mov     8 (%rsi), %rax
235
        add     $8, %rsi
236
237
        mov     %rcx, %r8               # step 8 of 8
238
        add     %rax, %r8
239
        sbb     %rdx, %rdx
240
241
        xor     %rax, %r8
242
        or      %rcx, %r8
243
        sub     %rdx, %r8
244
        jnz     L(strlentail)
245
246
        sub     $32, %r9                # charge the budget; the flags from this sub drive the jbe below
247
248
        mov     8 (%rsi), %rax          # mov leaves the flags untouched
249
        lea     8 (%rsi), %rsi          # lea instead of add so the flags from the sub survive
250
251
        jbe     L(strlen32loop)         # NOTE(review): taken only when r9 was <= 32 (unsigned) before the sub, so a large r9 leaves after one pass; each pass scans 8 words (64 bytes) yet charges 32 - confirm the intended condition and step
252
253
L(strlen32after):
254
255
L(strlenpretry):
/* Unbounded scan with software prefetch: eight word probes per pass,
   then one prefetch 512 bytes ahead of rsi.  The loop has no counter;
   it is left only through the jnz branches into strlentail.
   Register roles are the same as in the preceding section:
     rsi = address of the word held in rax, rcx = probe constant,
     r8 and rdx = scratch.  */
256
257
L(strlenpre):                              # 64-byte prefetch
258
#       mov     $0xfefefefefefefeff, %rcx
259
#       mov     (%rsi), %rax
260
261
        .p2align 4
262
263
L(strlenpreloop):
264
        mov     %rcx, %r8               # step 1 of 8: r8 = probe constant
265
        add     %rax, %r8               # r8 = word + constant; CF records the carry out
266
        sbb     %rdx, %rdx              # rdx = -CF
267
268
        xor     %rax, %r8               # r8 = (word + constant) ^ word
269
        or      %rcx, %r8               # force the bits that are set in the constant
270
        sub     %rdx, %r8               # r8 += CF
271
        jnz     L(strlentail)           # non-zero: locate the NUL byte-wise in the tail scan
272
273
        mov     8 (%rsi), %rax          # fetch the next word
274
        add     $8, %rsi                # rsi follows the word now in rax
275
276
        mov     %rcx, %r8               # step 2 of 8 (same probe)
277
        add     %rax, %r8
278
        sbb     %rdx, %rdx
279
280
        xor     %rax, %r8
281
        or      %rcx, %r8
282
        sub     %rdx, %r8
283
        jnz     L(strlentail)
284
285
        mov     8 (%rsi), %rax
286
        add     $8, %rsi
287
288
        mov     %rcx, %r8               # step 3 of 8
289
        add     %rax, %r8
290
        sbb     %rdx, %rdx
291
292
        xor     %rax, %r8
293
        or      %rcx, %r8
294
        sub     %rdx, %r8
295
        jnz     L(strlentail)
296
297
        mov     8 (%rsi), %rax
298
        add     $8, %rsi
299
300
        mov     %rcx, %r8               # step 4 of 8
301
        add     %rax, %r8
302
        sbb     %rdx, %rdx
303
304
        xor     %rax, %r8
305
        or      %rcx, %r8
306
        sub     %rdx, %r8
307
        jnz     L(strlentail)
308
309
        mov     8 (%rsi), %rax
310
        add     $8, %rsi
311
312
        mov     %rcx, %r8               # step 5 of 8
313
        add     %rax, %r8
314
        sbb     %rdx, %rdx
315
316
        xor     %rax, %r8
317
        or      %rcx, %r8
318
        sub     %rdx, %r8
319
        jnz     L(strlentail)
320
321
        mov     8 (%rsi), %rax
322
        add     $8, %rsi
323
324
        mov     %rcx, %r8               # step 6 of 8
325
        add     %rax, %r8
326
        sbb     %rdx, %rdx
327
328
        xor     %rax, %r8
329
        or      %rcx, %r8
330
        sub     %rdx, %r8
331
        jnz     L(strlentail)
332
333
        mov     8 (%rsi), %rax
334
        add     $8, %rsi
335
336
        mov     %rcx, %r8               # step 7 of 8
337
        add     %rax, %r8
338
        sbb     %rdx, %rdx
339
340
        xor     %rax, %r8
341
        or      %rcx, %r8
342
        sub     %rdx, %r8
343
        jnz     L(strlentail)
344
345
        mov     8 (%rsi), %rax
346
        add     $8, %rsi
347
348
        mov     %rcx, %r8               # step 8 of 8
349
        add     %rax, %r8
350
        sbb     %rdx, %rdx
351
352
        xor     %rax, %r8
353
        or      %rcx, %r8
354
        sub     %rdx, %r8
355
        jnz     L(strlentail)
356
357
        prefetcht0 512 (%rsi)           # hint the line 512 bytes past the current word
358
359
        mov     8 (%rsi), %rax          # fetch the word for the next pass
360
        add     $8, %rsi
361
362
        jmp     L(strlenpreloop)        # unconditional: the only exits are the jnz branches above
363
364
        .p2align 4
365
366
L(strlenpreafter):
367
368
L(strlentailtry):
/* Byte-wise scan of one suspect word, followed by the common return.
   On entry rax holds the word and rsi its address.  Bytes are tested
   from the least significant upwards, four per pass; rsi advances by
   one for every non-NUL byte, then rax is shifted right by 32 and the
   same four tests run on the upper half.  */
369
370
L(strlentail):                             # 4-byte tail
371
372
L(strlentailloop):
373
        test    %al, %al                # byte 0 of the current half is NUL?
374
        jz      L(exit)                 # yes: rsi already points at it
375
376
        inc     %rsi                    # step past a non-NUL byte
377
378
        test    %ah, %ah                # byte 1
379
        jz      L(exit)
380
381
        inc     %rsi
382
383
        test    $0x00ff0000, %eax       # byte 2
384
        jz      L(exit)
385
386
        inc     %rsi
387
388
        test    $0xff000000, %eax       # byte 3
389
        jz      L(exit)
390
391
        inc     %rsi
392
393
        shr     $32, %rax               # bring the upper four bytes down and zero-fill above them
394
        jmp     L(strlentailloop)       # after two shifts rax is 0, so the first test then exits
395
396
L(strlentailafter):
397
398
        .p2align 4
399
400
L(exit):
401
        lea     (%rdi, %rsi), %rax      # rax = rsi + rdi; rdi was negated at entry, so this is rsi - start = length
402
        ret
403
138
END (strlen)
404
END (strlen)
139
libc_hidden_builtin_def (strlen)
405
libc_hidden_builtin_def (strlen)
(-)sysdeps/x86_64/dl-machine.h (+34 lines)
Lines 208-213 dl_platform_init (void) Link Here
208
  if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
208
  if (GLRO(dl_platform) != NULL && *GLRO(dl_platform) == '\0')
209
    /* Avoid an empty string which would disturb us.  */
209
    /* Avoid an empty string which would disturb us.  */
210
    GLRO(dl_platform) = NULL;
210
    GLRO(dl_platform) = NULL;
211
212
  /* Probe the L1 and L2 cache sizes through the extended CPUID leaves
     and publish them (and their halves) in the read-only rtld globals.
     t1/t2 stay 0 when the CPU does not report leaf 0x80000006, in which
     case the globals are left untouched.  */
  long int t1, t2;
213
  t1 = 0;
214
  t2 = 0;
215
  
216
  /* The asm skips both stores when the highest extended leaf is below
     0x80000006, so the operands must be read-write ("+r"): with a
     write-only "=r" operand the compiler may discard the zero
     initialisation above and the skipped path would leave t1/t2
     undefined.  The shifts turn the size field (in KB per the vendor
     CPUID documentation) into bytes: ECX[31:24] of leaf 0x80000005 for
     L1, ECX[31:16] of leaf 0x80000006 for L2.  cpuid overwrites
     eax/ebx/ecx/edx, hence the clobber list.  */
  asm (
217
       "mov	$0x80000000, %%eax		# get highest level of support\n\t"
218
       "cpuid\n\t"
219
       "cmp     $0x80000006, %%eax	# check for support of cache info\n\t"
220
       "jb      1f\n\t"
221
       "mov     $0x80000005, %%eax	# get L1 info\n\t"
222
       "cpuid\n\t"
223
       "shr     $24, %%ecx\n\t"
224
       "shl     $10, %%ecx\n\t"
225
       "mov     %%rcx, %0\n\t"
226
       "mov     $0x80000006, %%eax    	# get L2 info\n\t"
227
       "cpuid\n\t"
228
       "shr     $16, %%ecx\n\t"
229
       "shl     $10, %%ecx\n\t"
230
       "mov     %%rcx, %1\n\t"
231
       "1:\n\t"
232
       :"+r" (t1), "+r" (t2) :: "%rbx", "%rax", "%rcx", "%rdx"
233
       );
234
235
  /* Only overwrite the defaults with values the CPU actually reported.  */
  if (t1)
236
    {
237
      GLRO(dl_cache1size) = t1;
238
      GLRO(dl_cache1sizehalf) = t1 / 2;
239
    }
240
  if (t2)
241
    {
242
      GLRO(dl_cache2size) = t2;
243
      GLRO(dl_cache2sizehalf) = t2 / 2;
244
    }
211
}
245
}
212
246
213
static inline Elf64_Addr
247
static inline Elf64_Addr
(-)sysdeps/x86_64/Makefile (+3 lines)
Lines 4-9 long-double-fcts = yes Link Here
4
ifeq ($(subdir),csu)
4
ifeq ($(subdir),csu)
5
sysdep_routines += hp-timing
5
sysdep_routines += hp-timing
6
elide-routines.os += hp-timing
6
elide-routines.os += hp-timing
7
8
# get offset to rtld_global._dl_*
9
gen-as-const-headers += rtld-global-offsets.sym
7
endif
10
endif
8
11
9
ifeq ($(subdir),gmon)
12
ifeq ($(subdir),gmon)
(-)sysdeps/x86_64/strcpy.S (-146 / +820 lines)
Lines 1-159 Link Here
1
/* strcpy/stpcpy implementation for x86-64.
1
# $Header: /K8_Projects/Glibc/amd64strcpy.S 7     2/12/04 19:06 Emenezes $
2
   Copyright (C) 2002 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Andreas Jaeger <aj@suse.de>, 2002.
5
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, write to the Free
18
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19
   02111-1307 USA.  */
20
21
#include <sysdep.h>
22
#include "asm-syntax.h"
23
#include "bp-sym.h"
24
#include "bp-asm.h"
25
2
26
#ifndef USE_AS_STPCPY
3
# (c) 2002 Advanced Micro Devices, Inc.
4
# YOUR USE OF THIS CODE IS SUBJECT TO THE TERMS
5
# AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC
6
# LICENSE FOUND IN THE "README" FILE THAT IS
7
# INCLUDED WITH THIS FILE
8
9
#include "sysdep.h"
10
#include <rtld-global-offsets.h>
11
12
	/* XXX:	strncpy is broken, just use this for strcpy for now.  */
13
#ifdef PIC
14
	.globl _rtld_local_ro
15
        .hidden _rtld_local_ro
16
        .set    _rtld_local_ro,_rtld_global_ro
17
#endif
18
#ifndef STRCPY
27
# define STRCPY strcpy
19
# define STRCPY strcpy
28
#endif
20
#endif
21
#define LABEL(s) L(strcpy##s)
22
23
        .text
24
25
ENTRY   (STRCPY)                        # (char *, const char *)
26
27
#ifdef USE_AS_STRNCPY			// (char *, const char *, size_t)
28
	test	%rdx, %rdx		# (char *, const char *, size_t)
29
	mov	%rdx, %r11
30
	jz	LABEL(exitn)		# early exit
31
#endif
32
33
        xor     %edx, %edx
34
35
LABEL(aligntry):
36
        mov     %rsi, %r8		# align by source
37
        and     $7, %r8
38
	jz	LABEL(alignafter)
39
40
LABEL(align):				# 8-byte align
41
        sub     $8, %r8
29
42
30
	.text
31
ENTRY (BP_SYM (STRCPY))
32
	movq %rsi, %rcx		/* Source register. */
33
	andl $7, %ecx		/* mask alignment bits */
34
	movq %rdi, %rdx		/* Duplicate destination pointer.  */
35
36
	jz 5f			/* aligned => start loop */
37
38
	neg %ecx		/* We need to align to 8 bytes.  */
39
	addl $8,%ecx
40
	/* Search the first bytes directly.  */
41
0:
42
	movb	(%rsi), %al	/* Fetch a byte */
43
	testb	%al, %al	/* Is it NUL? */
44
	movb	%al, (%rdx)	/* Store it */
45
	jz	4f		/* If it was NUL, done! */
46
	incq	%rsi
47
	incq	%rdx
48
	decl	%ecx
49
	jnz	0b
50
51
5:
52
	movq $0xfefefefefefefeff,%r8
53
54
	/* Now the source is aligned.  Unfortunately we cannot force
55
	   to have both source and destination aligned, so ignore the
56
	   alignment of the destination.  */
57
	.p2align 4
43
	.p2align 4
58
1:
59
	/* 1st unroll.  */
60
	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
61
	addq	$8, %rsi	/* Adjust pointer for next word.  */
62
	movq	%rax, %r9	/* Save a copy for NUL finding.  */
63
	addq	%r8, %r9	/* add the magic value to the word.  We get
64
				   carry bits reported for each byte which
65
				   is *not* 0 */
66
	jnc	3f		/* highest byte is NUL => return pointer */
67
	xorq	%rax, %r9	/* (word+magic)^word */
68
	orq	%r8, %r9	/* set all non-carry bits */
69
	incq	%r9		/* add 1: if one carry bit was *not* set
70
				   the addition will not result in 0.  */
71
72
	jnz	3f		/* found NUL => return pointer */
73
74
	movq	%rax, (%rdx)	/* Write value to destination.  */
75
	addq	$8, %rdx	/* Adjust pointer.  */
76
77
	/* 2nd unroll.  */
78
	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
79
	addq	$8, %rsi	/* Adjust pointer for next word.  */
80
	movq	%rax, %r9	/* Save a copy for NUL finding.  */
81
	addq	%r8, %r9	/* add the magic value to the word.  We get
82
				   carry bits reported for each byte which
83
				   is *not* 0 */
84
	jnc	3f		/* highest byte is NUL => return pointer */
85
	xorq	%rax, %r9	/* (word+magic)^word */
86
	orq	%r8, %r9	/* set all non-carry bits */
87
	incq	%r9		/* add 1: if one carry bit was *not* set
88
				   the addition will not result in 0.  */
89
90
	jnz	3f		/* found NUL => return pointer */
91
92
	movq	%rax, (%rdx)	/* Write value to destination.  */
93
	addq	$8, %rdx	/* Adjust pointer.  */
94
95
	/* 3rd unroll.  */
96
	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
97
	addq	$8, %rsi	/* Adjust pointer for next word.  */
98
	movq	%rax, %r9	/* Save a copy for NUL finding.  */
99
	addq	%r8, %r9	/* add the magic value to the word.  We get
100
				   carry bits reported for each byte which
101
				   is *not* 0 */
102
	jnc	3f		/* highest byte is NUL => return pointer */
103
	xorq	%rax, %r9	/* (word+magic)^word */
104
	orq	%r8, %r9	/* set all non-carry bits */
105
	incq	%r9		/* add 1: if one carry bit was *not* set
106
				   the addition will not result in 0.  */
107
108
	jnz	3f		/* found NUL => return pointer */
109
110
	movq	%rax, (%rdx)	/* Write value to destination.  */
111
	addq	$8, %rdx	/* Adjust pointer.  */
112
113
	/* 4th unroll.  */
114
	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
115
	addq	$8, %rsi	/* Adjust pointer for next word.  */
116
	movq	%rax, %r9	/* Save a copy for NUL finding.  */
117
	addq	%r8, %r9	/* add the magic value to the word.  We get
118
				   carry bits reported for each byte which
119
				   is *not* 0 */
120
	jnc	3f		/* highest byte is NUL => return pointer */
121
	xorq	%rax, %r9	/* (word+magic)^word */
122
	orq	%r8, %r9	/* set all non-carry bits */
123
	incq	%r9		/* add 1: if one carry bit was *not* set
124
				   the addition will not result in 0.  */
125
126
	jnz	3f		/* found NUL => return pointer */
127
128
	movq	%rax, (%rdx)	/* Write value to destination.  */
129
	addq	$8, %rdx	/* Adjust pointer.  */
130
	jmp	1b		/* Next iteration.  */
131
44
132
	/* Do the last few bytes. %rax contains the value to write.
45
LABEL(alignloop):
133
	   The loop is unrolled twice.  */
46
#ifdef USE_AS_STRNCPY
47
	dec	%r11
48
	jl	LABEL(exitn)
49
#endif
50
51
        mov     (%rsi, %rdx), %al       # check if same character
52
        test    %al, %al                # check if character a NUL
53
        mov     %al, (%rdi, %rdx)
54
        jz      LABEL(exit)
55
56
        inc     %edx
57
        inc     %r8
58
        jnz     LABEL(alignloop)
59
134
	.p2align 4
60
	.p2align 4
61
62
LABEL(alignafter):
63
64
LABEL(8try):
65
        mov     $0xfefefefefefefeff, %rcx
66
67
LABEL(8):                               # 8-byte
68
        mov     (%rsi, %rdx), %rax
69
70
LABEL(8loop):
71
#ifdef USE_AS_STRNCPY
72
	sub	$8, %r11
73
	jl	LABEL(tail)
74
#endif
75
76
        mov     %rcx, %r8
77
        add     %rax, %r8
78
        sbb     %r10, %r10
79
80
        xor     %rax, %r8
81
        or      %rcx, %r8
82
        sub     %r10, %r8
83
        jnz     LABEL(tail)
84
85
        mov     %rax, (%rdi, %rdx)
86
        mov     8 (%rsi, %rdx), %rax
87
        add     $8, %edx
88
89
#ifdef USE_AS_STRNCPY
90
	sub	$8, %r11
91
	jl	LABEL(tail)
92
#endif
93
94
        mov     %rcx, %r8
95
        add     %rax, %r8
96
        sbb     %r10, %r10
97
98
        xor     %rax, %r8
99
        or      %rcx, %r8
100
        sub     %r10, %r8
101
        jnz     LABEL(tail)
102
103
        mov     %rax, (%rdi, %rdx)
104
        mov     8 (%rsi, %rdx), %rax
105
        add     $8, %edx
106
107
#ifdef USE_AS_STRNCPY
108
	sub	$8, %r11
109
	jl	LABEL(tail)
110
#endif
111
112
        mov     %rcx, %r8
113
        add     %rax, %r8
114
        sbb     %r10, %r10
115
116
        xor     %rax, %r8
117
        or      %rcx, %r8
118
        sub     %r10, %r8
119
        jnz     LABEL(tail)
120
121
        mov     %rax, (%rdi, %rdx)
122
        mov     8 (%rsi, %rdx), %rax
123
        add     $8, %edx
124
125
#ifdef USE_AS_STRNCPY
126
	sub	$8, %r11
127
	jl	LABEL(tail)
128
#endif
129
130
        mov     %rcx, %r8
131
        add     %rax, %r8
132
        sbb     %r10, %r10
133
134
        xor     %rax, %r8
135
        or      %rcx, %r8
136
        sub     %r10, %r8
137
        jnz     LABEL(tail)
138
139
        mov     %rax, (%rdi, %rdx)
140
        mov     8 (%rsi, %rdx), %rax
141
        add     $8, %edx
142
143
#ifdef USE_AS_STRNCPY
144
	sub	$8, %r11
145
	jl	LABEL(tail)
146
#endif
147
148
        mov     %rcx, %r8
149
        add     %rax, %r8
150
        sbb     %r10, %r10
151
152
        xor     %rax, %r8
153
        or      %rcx, %r8
154
        sub     %r10, %r8
155
        jnz     LABEL(tail)
156
157
        mov     %rax, (%rdi, %rdx)
158
        mov     8 (%rsi, %rdx), %rax
159
        add     $8, %edx
160
161
#ifdef USE_AS_STRNCPY
162
	sub	$8, %r11
163
	jl	LABEL(tail)
164
#endif
165
166
        mov     %rcx, %r8
167
        add     %rax, %r8
168
        sbb     %r10, %r10
169
170
        xor     %rax, %r8
171
        or      %rcx, %r8
172
        sub     %r10, %r8
173
        jnz     LABEL(tail)
174
175
        mov     %rax, (%rdi, %rdx)
176
        mov     8 (%rsi, %rdx), %rax
177
        add     $8, %edx
178
179
#ifdef USE_AS_STRNCPY
180
	sub	$8, %r11
181
	jl	LABEL(tail)
182
#endif
183
184
        mov     %rcx, %r8
185
        add     %rax, %r8
186
        sbb     %r10, %r10
187
188
        xor     %rax, %r8
189
        or      %rcx, %r8
190
        sub     %r10, %r8
191
        jnz     LABEL(tail)
192
193
        mov     %rax, (%rdi, %rdx)
194
        mov     8 (%rsi, %rdx), %rax
195
        add     $8, %edx
196
197
#ifdef USE_AS_STRNCPY
198
	sub	$8, %r11
199
	jl	LABEL(tail)
200
#endif
201
202
        mov     %rcx, %r8
203
        add     %rax, %r8
204
        sbb     %r10, %r10
205
206
        xor     %rax, %r8
207
        or      %rcx, %r8
208
        sub     %r10, %r8
209
        jnz     LABEL(tail)
210
211
        mov     %rax, (%rdi, %rdx)
212
        mov     8 (%rsi, %rdx), %rax
213
        add     $8, %edx
214
215
LABEL(8after):
216
217
LABEL(64try):
218
#ifdef PIC
219
        mov     _rtld_local_ro@GOTPCREL(%rip), %r8
220
	mov	RTLD_GLOBAL_DL_CACHE1SIZEHALF(%r8), %r9
221
#else
222
        mov     _dl_cache1sizehalf, %r9
223
#endif
224
225
226
LABEL(64):				# 64-byte
227
228
        .p2align 4
229
230
LABEL(64loop):
231
#ifdef USE_AS_STRNCPY
232
	sub	$8, %r11
233
	jl	LABEL(tail)
234
#endif
235
236
        mov     %rcx, %r8
237
        add     %rax, %r8
238
        sbb     %r10, %r10
239
240
        xor     %rax, %r8
241
        or      %rcx, %r8
242
        sub     %r10, %r8
243
        jnz     LABEL(tail)
244
245
        mov     %rax, (%rdi, %rdx)
246
        mov     8 (%rsi, %rdx), %rax
247
        add     $8, %edx
248
249
#ifdef USE_AS_STRNCPY
250
	sub	$8, %r11
251
	jl	LABEL(tail)
252
#endif
253
254
        mov     %rcx, %r8
255
        add     %rax, %r8
256
        sbb     %r10, %r10
257
258
        xor     %rax, %r8
259
        or      %rcx, %r8
260
        sub     %r10, %r8
261
        jnz     LABEL(tail)
262
263
        mov     %rax, (%rdi, %rdx)
264
        mov     8 (%rsi, %rdx), %rax
265
        add     $8, %edx
266
267
#ifdef USE_AS_STRNCPY
268
	sub	$8, %r11
269
	jl	LABEL(tail)
270
#endif
271
272
        mov     %rcx, %r8
273
        add     %rax, %r8
274
        sbb     %r10, %r10
275
276
        xor     %rax, %r8
277
        or      %rcx, %r8
278
        sub     %r10, %r8
279
        jnz     LABEL(tail)
280
281
        mov     %rax, (%rdi, %rdx)
282
        mov     8 (%rsi, %rdx), %rax
283
        add     $8, %edx
284
285
#ifdef USE_AS_STRNCPY
286
	sub	$8, %r11
287
	jl	LABEL(tail)
288
#endif
289
290
        mov     %rcx, %r8
291
        add     %rax, %r8
292
        sbb     %r10, %r10
293
294
        xor     %rax, %r8
295
        or      %rcx, %r8
296
        sub     %r10, %r8
297
        jnz     LABEL(tail)
298
299
        mov     %rax, (%rdi, %rdx)
300
        mov     8 (%rsi, %rdx), %rax
301
        add     $8, %edx
302
303
#ifdef USE_AS_STRNCPY
304
	sub	$8, %r11
305
	jl	LABEL(tail)
306
#endif
307
308
        mov     %rcx, %r8
309
        add     %rax, %r8
310
        sbb     %r10, %r10
311
312
        xor     %rax, %r8
313
        or      %rcx, %r8
314
        sub     %r10, %r8
315
        jnz     LABEL(tail)
316
317
        mov     %rax, (%rdi, %rdx)
318
        mov     8 (%rsi, %rdx), %rax
319
        add     $8, %edx
320
321
#ifdef USE_AS_STRNCPY
322
	sub	$8, %r11
323
	jl	LABEL(tail)
324
#endif
325
326
        mov     %rcx, %r8
327
        add     %rax, %r8
328
        sbb     %r10, %r10
329
330
        xor     %rax, %r8
331
        or      %rcx, %r8
332
        sub     %r10, %r8
333
        jnz     LABEL(tail)
334
335
        mov     %rax, (%rdi, %rdx)
336
        mov     8 (%rsi, %rdx), %rax
337
        add     $8, %edx
338
339
#ifdef USE_AS_STRNCPY
340
	sub	$8, %r11
341
	jl	LABEL(tail)
342
#endif
343
344
        mov     %rcx, %r8
345
        add     %rax, %r8
346
        sbb     %r10, %r10
347
348
        xor     %rax, %r8
349
        or      %rcx, %r8
350
        sub     %r10, %r8
351
        jnz     LABEL(tail)
352
353
        mov     %rax, (%rdi, %rdx)
354
        mov     8 (%rsi, %rdx), %rax
355
        add     $8, %edx
356
357
#ifdef USE_AS_STRNCPY
358
	sub	$8, %r11
359
	jl	LABEL(tail)
360
#endif
361
362
        mov     %rcx, %r8
363
        add     %rax, %r8
364
        sbb     %r10, %r10
365
366
        xor     %rax, %r8
367
        or      %rcx, %r8
368
        sub     %r10, %r8
369
        jnz     LABEL(tail)
370
371
        cmp     %r9, %rdx
372
373
        mov     %rax, (%rdi, %rdx)
374
        mov     8 (%rsi, %rdx), %rax
375
        lea     8 (%rdx), %rdx
376
377
        jbe     LABEL(64loop)
378
379
LABEL(64after):
380
381
/* Prefetching copy phase.
   %r9 is loaded with the cache2sizehalf value (through _rtld_local_ro
   when PIC, directly from _dl_cache2sizehalf otherwise); the loop
   repeats while the byte offset %rdx is below %r9.
   Each pass handles eight 8-byte words.  %rax always holds the source
   word that has been loaded but not yet stored; %rdi/%rsi are the
   destination/source bases indexed by %rdx.
   Per word: the add/sbb/xor/or/sub sequence combines %rax with %rcx and
   branches to tail when the result is non-zero.
   NOTE(review): %rcx presumably holds the zero-byte detection constant
   set up before this chunk -- confirm against the function prologue.
   Under USE_AS_STRNCPY, %r11 is reduced by 8 before each word and
   control goes to tail once it turns negative (%r11 looks like the
   remaining byte count -- confirm).  */
LABEL(pretry):
382
#ifdef PIC
383
	mov     _rtld_local_ro@GOTPCREL(%rip), %r8
384
	mov	RTLD_GLOBAL_DL_CACHE2SIZEHALF(%r8), %r9
385
#else
386
        mov     _dl_cache2sizehalf, %r9
387
#endif
388
389
LABEL(pre):                              # 64-byte prefetch
390
391
        .p2align 4
392
393
LABEL(preloop):
394
#ifdef USE_AS_STRNCPY
395
	sub	$8, %r11
396
	jl	LABEL(tail)
397
#endif
398
399
        mov     %rcx, %r8
400
        add     %rax, %r8               # r8 = rcx + rax, carry out in CF
401
        sbb     %r10, %r10              # r10 = 0 or -1 depending on CF
402
403
        xor     %rax, %r8
404
        or      %rcx, %r8
405
        sub     %r10, %r8
406
        jnz     LABEL(tail)             # non-zero result: finish bytewise
407
408
        mov     %rax, (%rdi, %rdx)      # store the checked word
409
        mov     8 (%rsi, %rdx), %rax    # load the next source word
410
        add     $8, %edx                # 32-bit add: also clears bits 63:32 of %rdx
411
412
#ifdef USE_AS_STRNCPY
413
	sub	$8, %r11
414
	jl	LABEL(tail)
415
#endif
416
417
        mov     %rcx, %r8
418
        add     %rax, %r8
419
        sbb     %r10, %r10
420
421
        xor     %rax, %r8
422
        or      %rcx, %r8
423
        sub     %r10, %r8
424
        jnz     LABEL(tail)
425
426
        mov     %rax, (%rdi, %rdx)
427
        mov     8 (%rsi, %rdx), %rax
428
        add     $8, %edx
429
430
#ifdef USE_AS_STRNCPY
431
	sub	$8, %r11
432
	jl	LABEL(tail)
433
#endif
434
435
        mov     %rcx, %r8
436
        add     %rax, %r8
437
        sbb     %r10, %r10
438
439
        xor     %rax, %r8
440
        or      %rcx, %r8
441
        sub     %r10, %r8
442
        jnz     LABEL(tail)
443
444
        mov     %rax, (%rdi, %rdx)
445
        mov     8 (%rsi, %rdx), %rax
446
        add     $8, %edx
447
448
#ifdef USE_AS_STRNCPY
449
	sub	$8, %r11
450
	jl	LABEL(tail)
451
#endif
452
453
        mov     %rcx, %r8
454
        add     %rax, %r8
455
        sbb     %r10, %r10
456
457
        xor     %rax, %r8
458
        or      %rcx, %r8
459
        sub     %r10, %r8
460
        jnz     LABEL(tail)
461
462
        mov     %rax, (%rdi, %rdx)
463
        mov     8 (%rsi, %rdx), %rax
464
        add     $8, %edx
465
466
#ifdef USE_AS_STRNCPY
467
	sub	$8, %r11
468
	jl	LABEL(tail)
469
#endif
470
471
        mov     %rcx, %r8
472
        add     %rax, %r8
473
        sbb     %r10, %r10
474
475
        xor     %rax, %r8
476
        or      %rcx, %r8
477
        sub     %r10, %r8
478
        jnz     LABEL(tail)
479
480
        mov     %rax, (%rdi, %rdx)
481
        mov     8 (%rsi, %rdx), %rax
482
        add     $8, %edx
483
484
#ifdef USE_AS_STRNCPY
485
	sub	$8, %r11
486
	jl	LABEL(tail)
487
#endif
488
489
        mov     %rcx, %r8
490
        add     %rax, %r8
491
        sbb     %r10, %r10
492
493
        xor     %rax, %r8
494
        or      %rcx, %r8
495
        sub     %r10, %r8
496
        jnz     LABEL(tail)
497
498
        mov     %rax, (%rdi, %rdx)
499
        mov     8 (%rsi, %rdx), %rax
500
        add     $8, %edx
501
502
#ifdef USE_AS_STRNCPY
503
	sub	$8, %r11
504
	jl	LABEL(tail)
505
#endif
506
507
        mov     %rcx, %r8
508
        add     %rax, %r8
509
        sbb     %r10, %r10
510
511
        xor     %rax, %r8
512
        or      %rcx, %r8
513
        sub     %r10, %r8
514
        jnz     LABEL(tail)
515
516
        mov     %rax, (%rdi, %rdx)
517
        mov     8 (%rsi, %rdx), %rax
518
        add     $8, %edx
519
520
#ifdef USE_AS_STRNCPY
521
	sub	$8, %r11
522
	jl	LABEL(tail)
523
#endif
524
525
        mov     %rcx, %r8
526
        add     %rax, %r8
527
        sbb     %r10, %r10
528
529
        xor     %rax, %r8
530
        or      %rcx, %r8
531
        sub     %r10, %r8
532
        jnz     LABEL(tail)
533
534
        cmp     %r9, %rdx               # flags are consumed by the jb below
535
536
        mov     %rax, (%rdi, %rdx)
537
        prefetcht0 512 + 8 (%rdi, %rdx) # destination, 512+8 bytes past the current offset
538
        mov     8 (%rsi, %rdx), %rax
539
        prefetcht0 512 + 8 (%rsi, %rdx) # source, same distance
540
        lea     8 (%rdx), %rdx          # advance offset without touching the cmp flags
541
542
        jb	LABEL(preloop)          # repeat while the pre-increment offset was below %r9
543
544
        .p2align 4
545
546
LABEL(preafter):
547
548
/* Non-temporal copy phase.
   Same eight-words-per-pass structure as the preceding loops, but each
   checked word is written with movnti and the source is prefetched with
   prefetchnta 768+8 bytes past the current offset.  The loop has no
   size limit of its own: it ends only when the per-word test yields a
   non-zero result or, under USE_AS_STRNCPY, when %r11 turns negative
   after subtracting 8.
   %rax = source word loaded but not yet stored, %rdx = byte offset,
   %rdi/%rsi = destination/source bases.
   NOTE(review): %rcx presumably holds the zero-byte detection constant
   set up before this chunk -- confirm.
   Every exit goes to NTtail, whose sfence runs before control falls
   through into tail, so the weakly-ordered movnti stores are fenced on
   the count-exhausted path as well as on the NUL-found path.  */
LABEL(NTtry):
549
	sfence
550
551
LABEL(NT):				# 64-byte NT
552
553
        .p2align 4
554
555
LABEL(NTloop):
556
#ifdef USE_AS_STRNCPY
557
	sub	$8, %r11
558
	jl	LABEL(NTtail)		# count exhausted: fence, then bytewise tail
559
#endif
560
561
        mov     %rcx, %r8
562
        add     %rax, %r8               # r8 = rcx + rax, carry out in CF
563
        sbb     %r10, %r10              # r10 = 0 or -1 depending on CF
564
565
        xor     %rax, %r8
566
        or      %rcx, %r8
567
        sub     %r10, %r8
568
        jnz     LABEL(NTtail)           # non-zero result: fence, then bytewise tail
569
570
        movnti  %rax, (%rdi, %rdx)      # non-temporal store of the checked word
571
        mov     8 (%rsi, %rdx), %rax    # load the next source word
572
        add     $8, %rdx
573
574
#ifdef USE_AS_STRNCPY
575
	sub	$8, %r11
576
	jl	LABEL(NTtail)
577
#endif
578
579
        mov     %rcx, %r8
580
        add     %rax, %r8
581
        sbb     %r10, %r10
582
583
        xor     %rax, %r8
584
        or      %rcx, %r8
585
        sub     %r10, %r8
586
        jnz     LABEL(NTtail)
587
588
        movnti  %rax, (%rdi, %rdx)
589
        mov     8 (%rsi, %rdx), %rax
590
        add     $8, %rdx
591
592
#ifdef USE_AS_STRNCPY
593
	sub	$8, %r11
594
	jl	LABEL(NTtail)
595
#endif
596
597
        mov     %rcx, %r8
598
        add     %rax, %r8
599
        sbb     %r10, %r10
600
601
        xor     %rax, %r8
602
        or      %rcx, %r8
603
        sub     %r10, %r8
604
        jnz     LABEL(NTtail)
605
606
        movnti  %rax, (%rdi, %rdx)
607
        mov     8 (%rsi, %rdx), %rax
608
        add     $8, %rdx
609
610
#ifdef USE_AS_STRNCPY
611
	sub	$8, %r11
612
	jl	LABEL(NTtail)
613
#endif
614
615
        mov     %rcx, %r8
616
        add     %rax, %r8
617
        sbb     %r10, %r10
618
619
        xor     %rax, %r8
620
        or      %rcx, %r8
621
        sub     %r10, %r8
622
        jnz     LABEL(NTtail)
623
624
        movnti  %rax, (%rdi, %rdx)
625
        mov     8 (%rsi, %rdx), %rax
626
        add     $8, %rdx
627
628
#ifdef USE_AS_STRNCPY
629
	sub	$8, %r11
630
	jl	LABEL(NTtail)
631
#endif
632
633
        mov     %rcx, %r8
634
        add     %rax, %r8
635
        sbb     %r10, %r10
636
637
        xor     %rax, %r8
638
        or      %rcx, %r8
639
        sub     %r10, %r8
640
        jnz     LABEL(NTtail)
641
642
        movnti  %rax, (%rdi, %rdx)
643
        mov     8 (%rsi, %rdx), %rax
644
        add     $8, %rdx
645
646
#ifdef USE_AS_STRNCPY
647
	sub	$8, %r11
648
	jl	LABEL(NTtail)
649
#endif
650
651
        mov     %rcx, %r8
652
        add     %rax, %r8
653
        sbb     %r10, %r10
654
655
        xor     %rax, %r8
656
        or      %rcx, %r8
657
        sub     %r10, %r8
658
        jnz     LABEL(NTtail)
659
660
        movnti  %rax, (%rdi, %rdx)
661
        mov     8 (%rsi, %rdx), %rax
662
        add     $8, %rdx
663
664
#ifdef USE_AS_STRNCPY
665
	sub	$8, %r11
666
	jl	LABEL(NTtail)
667
#endif
668
669
        mov     %rcx, %r8
670
        add     %rax, %r8
671
        sbb     %r10, %r10
672
673
        xor     %rax, %r8
674
        or      %rcx, %r8
675
        sub     %r10, %r8
676
        jnz     LABEL(NTtail)
677
678
        movnti  %rax, (%rdi, %rdx)
679
        mov     8 (%rsi, %rdx), %rax
680
        add     $8, %rdx
681
682
#ifdef USE_AS_STRNCPY
683
	sub	$8, %r11
684
	jl	LABEL(NTtail)
685
#endif
686
687
        mov     %rcx, %r8
688
        add     %rax, %r8
689
        sbb     %r10, %r10
690
691
        xor     %rax, %r8
692
        or      %rcx, %r8
693
        sub     %r10, %r8
694
        jnz     LABEL(NTtail)
695
696
        movnti  %rax, (%rdi, %rdx)
697
	mov     8 (%rsi, %rdx), %rax
698
	prefetchnta 768 + 8 (%rsi, %rdx)	# source, 768+8 bytes past the current offset
699
        add     $8, %rdx
700
701
        jmp     LABEL(NTloop)
702
703
        .p2align 4
704
705
LABEL(NTtail):
706
	sfence
707
708
        .p2align 4
709
710
LABEL(NTafter):
711
712
/* Byte-wise finish.
   %rax holds the last 8-byte source word that was loaded but not yet
   stored.  The loop stores its bytes one at a time at (%rdi,%rdx),
   taking %al then %ah and shifting %rax right by 16 after each pair.
   Each byte is stored before the branch on it, so a zero byte is
   written to the destination and then control goes to exit.
   Under USE_AS_STRNCPY, %r11 first gets 8 added back (the word loops
   visible in this file subtract 8 before branching here), is then
   decremented before every byte, and control goes to exitn as soon as
   it turns negative.  */
LABEL(tailtry):
713
714
LABEL(tail):                             # 1-byte tail
715
#ifdef USE_AS_STRNCPY
716
	add	$8, %r11
717
#endif
718
719
        .p2align 4
720
721
LABEL(tailloop):
722
#ifdef USE_AS_STRNCPY
723
	dec	%r11
724
	jl	LABEL(exitn)
725
#endif
726
727
        test    %al, %al                # flags survive the mov below
728
        mov     %al, (%rdi, %rdx)       # store byte 0 of the pair
729
        jz      LABEL(exit)
730
731
	inc     %rdx
732
733
#ifdef USE_AS_STRNCPY
734
	dec	%r11
735
	jl	LABEL(exitn)
736
737
	mov	%ah, %al		# NOTE(review): why %ah is copied into %al here is not evident from this chunk
738
#endif
739
740
        test    %ah, %ah
741
        mov     %ah, (%rdi, %rdx)       # store byte 1 of the pair
742
        jz      LABEL(exit)
743
744
        inc     %rdx
745
746
#ifdef USE_AS_STRNCPY
747
	dec	%r11
748
	jl	LABEL(exitn)
749
#endif
750
751
        shr     $16, %rax               # bring the next two bytes into %al/%ah
752
753
        test    %al, %al
754
        mov     %al, (%rdi, %rdx)
755
        jz      LABEL(exit)
756
757
        inc     %rdx
758
759
#ifdef USE_AS_STRNCPY
760
	dec	%r11
761
	jl	LABEL(exitn)
762
763
	mov	%ah, %al
764
#endif
765
766
        test    %ah, %ah
767
        mov     %ah, (%rdi, %rdx)
768
        jz      LABEL(exit)
769
770
        shr     $16, %rax
771
        inc     %rdx
772
773
        jmp     LABEL(tailloop)
774
775
        .p2align 4
776
777
LABEL(tailafter):
778
779
/* Reached from the byte-wise tail after a zero byte has been stored at
   (%rdi,%rdx).  Under USE_AS_STRNCPY the code below zero-fills the
   next %r11 bytes of the destination, starting one byte past that
   position: first %r11 / 8 quadwords with rep stosq, then (after the
   local label 1) the remaining %r11 & 7 bytes.
   %r8 receives the value kept for the return: %rdi + %rdx for the
   USE_AS_STPCPY variant, plain %rdi otherwise.  */
LABEL(exit):
780
#ifdef USE_AS_STRNCPY
781
	test	%r11, %r11		# ZF is consumed by the jz 2f below; mov and lea leave flags alone
782
	mov	%r11, %rcx
783
784
#ifdef USE_AS_STPCPY
785
        lea     (%rdi, %rdx), %r8
786
#else
787
        mov     %rdi, %r8
788
#endif
789
790
	jz	2f			# %r11 == 0: nothing to pad
791
792
	xor	%eax, %eax		# bzero () would do too, but usually there are only a handful of bytes left
793
	shr	$3, %rcx		# number of whole quadwords; ZF set when there are none
794
        lea     1 (%rdi, %rdx), %rdi    # first byte to pad; lea keeps the flags from shr
795
	jz	1f
796
797
	rep	stosq
798
799
1:
800
	mov	%r11d, %ecx
801
	and	$7, %ecx		# leftover byte count after the quadword fill
802
	jz	2f
803
804
        .p2align 4,, 3
805
135
3:
806
3:
136
	/* Note that stpcpy needs to return with the value of the NUL
807
	dec	%ecx
137
	   byte.  */
808
	mov	%al, (%rdi, %rcx)
138
	movb	%al, (%rdx)	/* 1st byte.  */
809
	jnz	3b
139
	testb	%al, %al	/* Is it NUL.  */
810
140
	jz	4f		/* yes, finish.  */
811
        .p2align 4,, 3
141
	incq	%rdx		/* Increment destination.  */
812
142
	movb	%ah, (%rdx)	/* 2nd byte.  */
813
2:
143
	testb	%ah, %ah	/* Is it NUL?.  */
814
	mov	%r8, %rax
144
	jz	4f		/* yes, finish.  */
815
        ret
145
	incq	%rdx		/* Increment destination.  */
816
	
146
	shrq	$16, %rax	/* Shift...  */
817
#endif
147
	jmp	3b		/* and look at next two bytes in %rax.  */
818
819
        .p2align 4
148
820
149
4:
821
LABEL(exitn):
150
#ifdef USE_AS_STPCPY
822
#ifdef USE_AS_STPCPY
151
	movq	%rdx, %rax	/* Destination is return value.  */
823
        lea     (%rdi, %rdx), %rax
152
#else
824
#else
153
	movq	%rdi, %rax	/* Source is return value.  */
825
        mov     %rdi, %rax
154
#endif
826
#endif
155
	retq
827
156
END (BP_SYM (STRCPY))
828
        ret
157
#ifndef USE_AS_STPCPY
829
158
libc_hidden_builtin_def (strcpy)
830
END (STRCPY)
831
#if !defined USE_AS_STPCPY && !defined USE_AS_STRNCPY
832
libc_hidden_builtin_def (STRCPY)
159
#endif
833
#endif
(-)sysdeps/x86_64/memset.S (-109 / +291 lines)
Lines 1-138 Link Here
1
/* memset/bzero -- set memory area to CH/0
1
# (c) 2002 Advanced Micro Devices, Inc.
2
   Optimized version for x86-64.
2
# YOUR USE OF THIS CODE IS SUBJECT TO THE TERMS
3
   Copyright (C) 2002, 2003, 2004 Free Software Foundation, Inc.
3
# AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC
4
   This file is part of the GNU C Library.
4
# LICENSE FOUND IN THE "README" FILE THAT IS
5
   Contributed by Andreas Jaeger <aj@suse.de>.
5
# INCLUDED WITH THIS FILE
6
6
7
   The GNU C Library is free software; you can redistribute it and/or
8
   modify it under the terms of the GNU Lesser General Public
9
   License as published by the Free Software Foundation; either
10
   version 2.1 of the License, or (at your option) any later version.
11
12
   The GNU C Library is distributed in the hope that it will be useful,
13
   but WITHOUT ANY WARRANTY; without even the implied warranty of
14
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
   Lesser General Public License for more details.
16
17
   You should have received a copy of the GNU Lesser General Public
18
   License along with the GNU C Library; if not, write to the Free
19
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20
   02111-1307 USA.  */
21
7
22
#include <sysdep.h>
8
#include <sysdep.h>
23
#include "asm-syntax.h"
9
#include "asm-syntax.h"
24
#include "bp-sym.h"
10
#include "bp-sym.h"
25
#include "bp-asm.h"
11
#include "bp-asm.h"
12
#include <rtld-global-offsets.h>
26
13
27
/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */
14
/* BEWARE: `#ifdef memset' means that memset is redefined as `bzero' */
28
#define BZERO_P (defined memset)
15
#define BZERO_P (defined memset)
29
16
30
/* This is somehow experimental and could made dependend on the cache
17
#ifdef PIC
31
   size.  */
18
	.globl _rtld_local_ro
32
#define LARGE $120000
19
        .hidden _rtld_local_ro
20
        .set    _rtld_local_ro,_rtld_global_ro
21
#endif
22
23
#define LABEL(s) L(memset##s)
33
24
34
        .text
25
        .text
35
#if !BZERO_P && defined PIC && !defined NOT_IN_libc
26
#if !BZERO_P && defined PIC && !defined NOT_IN_libc
36
ENTRY (__memset_chk)
27
ENTRY (__memset_chk)
37
	cmpq	%rdx, %rcx
28
       cmpq    %rdx, %rcx
38
	jb	HIDDEN_JUMPTARGET (__chk_fail)
29
       jb      HIDDEN_JUMPTARGET (__chk_fail)
39
END (__memset_chk)
30
END (__memset_chk)
40
#endif
31
#endif
41
ENTRY (memset)
32
33
ENTRY (memset)                	# (void *, const void*, size_t)
34
42
#if BZERO_P
35
#if BZERO_P
43
	mov	%rsi,%rdx	/* Adjust parameter.  */
36
        mov     %rsi, %rdx		# memset doubles as bzero
44
	xorq	%rsi,%rsi	/* Fill with 0s.  */
37
        xor     %esi, %esi
38
#else
39
	mov	$0x0101010101010101, %rcx # memset is itself
40
        movzx   %sil, %rsi
41
        imul    %rcx, %rsi		# replicate 8 times
45
#endif
42
#endif
46
	cmp	$0x7,%rdx	/* Check for small length.  */
47
	mov	%rdi,%rcx	/* Save ptr as return value.  */
48
	jbe	7f
49
43
50
#if BZERO_P
44
LABEL(try1):
51
	mov	%rsi,%r8	/* Just copy 0.  */
45
        cmp     $64, %rdx
46
        mov     %rdi, %rax		# return memory block address (even for bzero ())
47
        jae	LABEL(1after)
48
49
LABEL(1):                                # 1-byte
50
        test    $1, %dl
51
        jz      LABEL(1a)
52
53
        mov     %sil, (%rdi)
54
        inc	%rdi
55
56
LABEL(1a):
57
        test    $2, %dl
58
        jz      LABEL(1b)
59
60
        mov     %si, (%rdi)
61
        add	$2, %rdi
62
63
LABEL(1b):
64
        test    $4, %dl
65
        jz      LABEL(1c)
66
67
        mov     %esi, (%rdi)
68
	add	$4, %rdi
69
70
LABEL(1c):
71
        test    $8, %dl
72
        jz      LABEL(1d)
73
74
        mov     %rsi, (%rdi)
75
	add	$8, %rdi
76
77
LABEL(1d):
78
        test    $16, %dl
79
        jz      LABEL(1e)
80
81
        mov     %rsi,   (%rdi)
82
        mov     %rsi, 8 (%rdi)
83
	add	$16, %rdi
84
85
LABEL(1e):
86
87
        test    $32, %dl
88
        jz      LABEL(1f)
89
90
        mov     %rsi,    (%rdi)
91
        mov     %rsi,  8 (%rdi)
92
        mov     %rsi, 16 (%rdi)
93
        mov     %rsi, 24 (%rdi)
94
#	add	$32, %rdi
95
96
LABEL(1f):
97
98
LABEL(exit):
99
        rep
100
        ret
101
102
        .p2align 4
103
104
LABEL(1after):
105
106
LABEL(32try):
107
        cmp     $256, %rdx
108
        ja     LABEL(32after)
109
110
LABEL(32):                               # 32-byte
111
        mov     %edx, %ecx
112
        shr     $5, %ecx
113
        jz      LABEL(32skip)
114
115
        .p2align 4
116
117
LABEL(32loop):
118
        dec     %ecx
119
120
        mov     %rsi,    (%rdi)
121
        mov     %rsi,  8 (%rdi)
122
        mov     %rsi, 16 (%rdi)
123
        mov     %rsi, 24 (%rdi)
124
125
        lea     32 (%rdi), %rdi
126
127
        jz      LABEL(32skip)
128
129
        dec     %ecx
130
131
        mov     %rsi,    (%rdi)
132
        mov     %rsi,  8 (%rdi)
133
        mov     %rsi, 16 (%rdi)
134
        mov     %rsi, 24 (%rdi)
135
136
        lea     32 (%rdi), %rdi
137
138
        jnz     LABEL(32loop)
139
140
        .p2align 4
141
142
LABEL(32skip):
143
        and     $31, %edx
144
        jnz     LABEL(1)
145
146
        rep
147
        ret
148
149
        .p2align 4
150
151
LABEL(32after):
152
153
#ifdef PIC
154
	mov     _rtld_local_ro@GOTPCREL(%rip), %r8
155
	mov     RTLD_GLOBAL_DL_CACHE1SIZEHALF(%r8), %r9
52
#else
156
#else
53
	/* Populate 8 bit data to full 64-bit.  */
157
	mov     _dl_cache1sizehalf, %r9
54
	movabs	$0x0101010101010101,%r8
55
	movzbl	%sil,%eax
56
	imul	%rax,%r8
57
#endif
158
#endif
58
	test	$0x7,%edi	/* Check for alignment.  */
159
	prefetcht0 (%r9)		# improves test further ahead
59
	je	2f
60
160
61
	.p2align 4
161
LABEL(aligntry):
62
1:	/* Align ptr to 8 byte.  */
162
        mov     %edi, %ecx              # align by destination
63
	mov	%sil,(%rcx)
64
	dec	%rdx
65
	inc	%rcx
66
	test	$0x7,%ecx
67
	jne	1b
68
69
2:	/* Check for really large regions.  */
70
	mov	%rdx,%rax
71
	shr	$0x6,%rax
72
	je	4f
73
	cmp	LARGE, %rdx
74
	jae	11f
75
163
76
	.p2align 4
164
        and     $7, %ecx                # skip if already aligned
77
3:	/* Copy 64 bytes.  */
165
        jz      LABEL(alignafter)
78
	mov	%r8,(%rcx)
166
79
	mov	%r8,0x8(%rcx)
167
LABEL(align):                            # align
80
	mov	%r8,0x10(%rcx)
168
        lea     -8 (%rcx, %rdx), %rdx
81
	mov	%r8,0x18(%rcx)
169
        sub     $8, %ecx
82
	mov	%r8,0x20(%rcx)
170
83
	mov	%r8,0x28(%rcx)
171
        .p2align 4
84
	mov	%r8,0x30(%rcx)
172
85
	mov	%r8,0x38(%rcx)
173
LABEL(alignloop):
86
	add	$0x40,%rcx
174
        inc     %ecx
87
	dec	%rax
175
88
	jne	3b
176
        mov     %sil, (%rdi)
89
177
        lea     1 (%rdi), %rdi
90
4:	/* Copy final bytes.  */
178
91
	and	$0x3f,%edx
179
        jnz     LABEL(alignloop)
92
	mov	%rdx,%rax
180
93
	shr	$0x3,%rax
181
        .p2align 4
94
	je	6f
182
95
183
LABEL(alignafter):
96
5:	/* First in chunks of 8 bytes.  */
184
#ifdef PIC
97
	mov	%r8,(%rcx)
185
        mov     _rtld_local_ro@GOTPCREL(%rip), %r9
98
	add	$0x8,%rcx
186
	mov	RTLD_GLOBAL_DL_CACHE2SIZEHALF(%r9), %r8
99
	dec	%rax
187
	# For MP System half cache size is better, for UP full cache size
100
	jne	5b
188
	# is better -> use half cache size only
101
6:
102
	and	$0x7,%edx
103
7:
104
	test	%rdx,%rdx
105
	je	9f
106
8:	/* And finally as bytes (up to 7).  */
107
	mov	%sil,(%rcx)
108
	inc	%rcx
109
	dec	%rdx
110
	jne	8b
111
9:
112
#if BZERO_P
113
	nop
114
#else
189
#else
115
	/* Load result (only if used as memset).  */
190
        mov     _dl_cache2sizehalf, %r8
116
	mov	%rdi,%rax	/* start address of destination is result */
117
#endif
191
#endif
118
	retq
192
        cmp     %rdx, %r8
193
        cmova   %rdx, %r8
194
195
LABEL(fasttry):
196
	cmp	$2048, %rdx		# this is slow for some block sizes
197
	jb	LABEL(64)
198
199
LABEL(fast):				# microcode
200
	mov	%r8, %rcx
201
	and	$-8, %r8
202
	shr	$3, %rcx
203
204
	xchg	%rax, %rsi
205
206
	rep
207
	stosq
208
209
	xchg	%rax, %rsi
210
211
LABEL(fastskip):
212
	sub	%r8, %rdx
213
	ja	LABEL(64after)
214
215
	and	$7, %edx
216
	jnz	LABEL(1)
217
218
	rep
219
	ret
119
220
120
	.p2align 4
221
	.p2align 4
121
11:	/* Copy 64 bytes without polluting the cache.  */
222
122
	/* We could use	movntdq    %xmm0,(%rcx) here to further
223
LABEL(fastafter):
123
	   speed up for large cases but let's not use XMM registers.  */
224
124
	movnti	%r8,(%rcx)
225
LABEL(64try):
125
	movnti  %r8,0x8(%rcx)
226
126
	movnti  %r8,0x10(%rcx)
227
LABEL(64):                               # 64-byte
127
	movnti  %r8,0x18(%rcx)
228
        mov     %r8, %rcx
128
	movnti  %r8,0x20(%rcx)
229
        and     $-64, %r8
129
	movnti  %r8,0x28(%rcx)
230
        shr     $6, %rcx
130
	movnti  %r8,0x30(%rcx)
231
131
	movnti  %r8,0x38(%rcx)
232
        dec     %rcx                    # this iteration starts the prefetcher sooner
132
	add	$0x40,%rcx
233
133
	dec	%rax
234
        mov     %rsi,    (%rdi)
134
	jne	11b
235
        mov     %rsi,  8 (%rdi)
135
	jmp	4b
236
        mov     %rsi, 16 (%rdi)
237
        mov     %rsi, 24 (%rdi)
238
        mov     %rsi, 32 (%rdi)
239
        mov     %rsi, 40 (%rdi)
240
        mov     %rsi, 48 (%rdi)
241
        mov     %rsi, 56 (%rdi)
242
243
        lea     64 (%rdi), %rdi
244
245
        .p2align 4
246
247
LABEL(64loop):
248
        dec     %rcx
249
250
        mov     %rsi,    (%rdi)
251
        mov     %rsi,  8 (%rdi)
252
        mov     %rsi, 16 (%rdi)
253
        mov     %rsi, 24 (%rdi)
254
        mov     %rsi, 32 (%rdi)
255
        mov     %rsi, 40 (%rdi)
256
        mov     %rsi, 48 (%rdi)
257
        mov     %rsi, 56 (%rdi)
258
259
        lea     64 (%rdi), %rdi
260
261
        jnz     LABEL(64loop)
262
263
LABEL(64skip):
264
        sub     %r8, %rdx
265
        ja      LABEL(64after)
266
267
	and     $63, %edx
268
	jnz     LABEL(32)
269
270
        rep
271
        ret
272
273
        .p2align 4
274
275
LABEL(64after):
276
277
LABEL(NTtry):
278
279
LABEL(NT):                               # 128-byte
280
        mov     %rdx, %rcx
281
        shr     $7, %rcx
282
        jz      LABEL(NTskip)
283
284
        .p2align 4
285
286
LABEL(NTloop):                  # on an MP system it would be better to prefetchnta 320 (%rdi) and 384 (%rdi) here, but not so on an 1P system
287
        dec     %rcx
288
289
        movnti  %rsi,     (%rdi)
290
        movnti  %rsi,   8 (%rdi)
291
        movnti  %rsi,  16 (%rdi)
292
        movnti  %rsi,  24 (%rdi)
293
        movnti  %rsi,  32 (%rdi)
294
        movnti  %rsi,  40 (%rdi)
295
        movnti  %rsi,  48 (%rdi)
296
        movnti  %rsi,  56 (%rdi)
297
        movnti  %rsi,  64 (%rdi)
298
        movnti  %rsi,  72 (%rdi)
299
        movnti  %rsi,  80 (%rdi)
300
        movnti  %rsi,  88 (%rdi)
301
        movnti  %rsi,  96 (%rdi)
302
        movnti  %rsi, 104 (%rdi)
303
        movnti  %rsi, 112 (%rdi)
304
        movnti  %rsi, 120 (%rdi)
305
306
        lea     128 (%rdi), %rdi
307
308
        jnz     LABEL(NTloop)
309
310
        sfence
311
312
LABEL(NTskip):
313
        and     $127, %edx
314
        jnz     LABEL(32)
315
316
        rep
317
        ret
136
318
137
END (memset)
319
END (memset)
138
#if !BZERO_P
320
#if !BZERO_P
(-)sysdeps/x86_64/memcpy.S (-67 / +486 lines)
Lines 1-101 Link Here
1
/* Highly optimized version for x86-64.
1
# (c) 2002 Advanced Micro Devices, Inc.
2
   Copyright (C) 1997, 2000, 2002, 2003, 2004 Free Software Foundation, Inc.
2
# YOUR USE OF THIS CODE IS SUBJECT TO THE TERMS
3
   This file is part of the GNU C Library.
3
# AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC
4
   Based on i586 version contributed by Ulrich Drepper <drepper@cygnus.com>, 1997.
4
# LICENSE FOUND IN THE "README" FILE THAT IS
5
5
# INCLUDED WITH THIS FILE
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, write to the Free
18
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19
   02111-1307 USA.  */
20
6
21
#include <sysdep.h>
7
#include <sysdep.h>
22
#include "asm-syntax.h"
8
#include "asm-syntax.h"
23
#include "bp-sym.h"
9
#include "bp-sym.h"
24
#include "bp-asm.h"
10
#include "bp-asm.h"
11
#include <rtld-global-offsets.h>
25
12
26
/* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy',
13
/* BEWARE: `#ifdef memcpy' means that memcpy is redefined as `mempcpy',
27
   and the return value is the byte after the last one copied in
14
   and the return value is the byte after the last one copied in
28
   the destination. */
15
   the destination. */
29
#define MEMPCPY_P (defined memcpy)
16
#if defined memcpy
17
# define MEMPCPY_P 1
18
#else
19
# define MEMPCPY_P 0
20
#endif
21
22
23
#ifdef PIC
24
	.globl _rtld_local_ro
25
        .hidden _rtld_local_ro
26
        .set    _rtld_local_ro,_rtld_global_ro
27
#endif
28
	
29
#define LABEL(s) L(memcpy##s)
30
30
31
        .text
31
        .text
32
#if defined PIC && !defined NOT_IN_libc
32
#if defined PIC && !defined NOT_IN_libc && !defined USE_AS_BCOPY
33
ENTRY (__memcpy_chk)
33
ENTRY (__memcpy_chk)
34
	cmpq	%rdx, %rcx
34
       cmpq    %rdx, %rcx
35
	jb	HIDDEN_JUMPTARGET (__chk_fail)
35
       jb      HIDDEN_JUMPTARGET (__chk_fail)
36
END (__memcpy_chk)
36
END (__memcpy_chk)
37
#endif
37
#endif
38
ENTRY (BP_SYM (memcpy))
38
39
	/* Cutoff for the big loop is a size of 32 bytes since otherwise
39
ENTRY   (memcpy)                        # (void *, const void*, size_t)
40
	   the loop will never be entered.  */
40
41
	cmpq	$32, %rdx
41
LABEL(1try):
42
	movq	%rdx, %rcx
42
        cmp     $8, %rdx
43
#if !MEMPCPY_P
43
#if defined (USE_AS_MEMPCPY)
44
	movq	%rdi, %r10	/* Save value. */
44
        lea     (%rdi, %rdx), %rax
45
#elif defined (USE_AS_BCOPY)
46
	mov	%rsi, %rax
47
	mov	%rdi, %rsi
48
	mov	%rax, %rdi
49
#else
50
        mov     %rdi, %rax
45
#endif
51
#endif
52
        jae     LABEL(1after)
53
54
LABEL(1):				# 1-byte
55
        test    $4, %dl
56
        jz      LABEL(1a)
57
58
        mov     (%rsi), %ecx
59
        mov     %ecx, (%rdi)
60
61
        add	$4, %rsi
62
        add	$4, %rdi
63
64
LABEL(1a):
65
        test    $2, %dl
66
        jz      LABEL(1b)
67
68
        mov     (%rsi), %cx
69
        mov     %cx, (%rdi)
70
71
        add	$2, %rsi
72
        add	$2, %rdi
73
74
LABEL(1b):
75
        test    $1, %dl
76
        jz      LABEL(exit)
77
78
        mov     (%rsi), %cl
79
        mov     %cl, (%rdi)
80
81
LABEL(exit):
82
        rep
83
        ret
84
85
        .p2align 4
86
87
LABEL(1after):
88
        push    %rax
46
89
47
	/* We need this in any case.  */
90
LABEL(8try):
48
	cld
91
        cmp     $32, %rdx
92
        jae     LABEL(8after)
49
93
50
	jbe	1f
94
LABEL(8):                        # 8-byte
95
        mov     %edx, %ecx
96
        shr     $3, %ecx
97
        jz      LABEL(8skip)
51
98
52
	/* Align destination.  */
99
        .p2align 4
53
	movq	%rdi, %rax
54
	negq	%rax
55
	andq	$7, %rax
56
	subq	%rax, %rcx
57
	xchgq	%rax, %rcx
58
100
59
	rep; movsb
101
LABEL(8loop):
102
        dec     %ecx
60
103
61
	movq	%rax, %rcx
104
        mov     (%rsi), %rax
62
	subq	$32, %rcx
105
        mov     %rax, (%rdi)
63
	js	2f
64
106
65
	.p2align 4
107
        lea     8 (%rsi), %rsi
66
3:
108
        lea     8 (%rdi), %rdi
67
109
68
	/* Now correct the loop counter.  Please note that in the following
110
        jnz     LABEL(8loop)
69
	   code the flags are not changed anymore.  */
70
	subq	$32, %rcx
71
111
72
	movq	(%rsi), %rax
112
LABEL(8skip):
73
	movq	8(%rsi), %rdx
113
        and     $7, %edx
74
	movq	16(%rsi), %r8
114
        pop     %rax
75
	movq	24(%rsi), %r9
115
        jnz     LABEL(1)
76
	movq	%rax, (%rdi)
77
	movq	%rdx, 8(%rdi)
78
	movq	%r8, 16(%rdi)
79
	movq	%r9, 24(%rdi)
80
116
81
	leaq	32(%rsi), %rsi
117
        rep
82
	leaq	32(%rdi), %rdi
118
        ret
83
119
84
	jns	3b
120
        .p2align 4
85
121
86
	/* Correct extra loop counter modification.  */
122
LABEL(8after):
87
2:	addq	$32, %rcx
88
1:	rep; movsb
89
123
90
#if MEMPCPY_P
124
LABEL(32try):
91
	movq	%rdi, %rax		/* Set return value.  */
125
	mov	$512, %r8d		# size for unaligned data
126
	mov	$4096, %r9d		# size for aligned data
127
	test	$7, %esi		# check if either source..
128
	cmovz	%r9, %r8
129
	test	$7, %edi		# .. or destination is aligned
130
	cmovz	%r9, %r8
131
132
        cmp     %r8, %rdx
133
        ja	LABEL(32after)
134
135
LABEL(32):				# 32-byte
136
        mov     %edx, %ecx
137
        shr     $5, %ecx
138
        jz      LABEL(32skip)
139
140
        .p2align 4
141
142
LABEL(32loop):
143
        dec     %ecx
144
145
        mov        (%rsi), %rax
146
        mov      8 (%rsi), %r8
147
        mov     16 (%rsi), %r9
148
        mov     24 (%rsi), %r10
149
150
        mov     %rax,    (%rdi)
151
        mov      %r8,  8 (%rdi)
152
        mov      %r9, 16 (%rdi)
153
        mov     %r10, 24 (%rdi)
154
155
        lea     32 (%rsi), %rsi
156
        lea     32 (%rdi), %rdi
157
158
        jz      LABEL(32skip)
159
160
        dec     %ecx
161
162
        mov        (%rsi), %rax
163
        mov      8 (%rsi), %r8
164
        mov     16 (%rsi), %r9
165
        mov     24 (%rsi), %r10
166
167
        mov     %rax,    (%rdi)
168
        mov      %r8,  8 (%rdi)
169
        mov      %r9, 16 (%rdi)
170
        mov     %r10, 24 (%rdi)
171
172
        lea     32 (%rsi), %rsi
173
        lea     32 (%rdi), %rdi
174
175
        jnz     LABEL(32loop)
176
177
        .p2align 4
178
179
LABEL(32skip):
180
        and     $31, %edx
181
        jnz     LABEL(8)
182
183
        pop     %rax
184
        ret
185
186
        .p2align 4
187
188
LABEL(32after):
189
#ifdef PIC
190
	mov     _rtld_local_ro@GOTPCREL(%rip), %r8
191
	mov     RTLD_GLOBAL_DL_CACHE1SIZE(%r8), %r9
92
#else
192
#else
93
	movq	%r10, %rax		/* Set return value.  */
193
	mov     _dl_cache1size, %r9
94
	
95
#endif
194
#endif
195
	prefetcht0 (%r9)		# improves test further ahead
196
197
LABEL(aligntry):
198
        mov     %edi, %r8d      	# align by destination
199
200
        and	$7, %r8d
201
        jz      LABEL(alignafter)  	# not unaligned
202
203
LABEL(align):                      	# align
204
        lea     -8 (%r8, %rdx), %rdx
205
        sub     $8, %r8d
206
207
        .p2align 4
208
209
LABEL(alignloop):
210
        inc     %r8d
211
212
        mov     (%rsi), %al
213
        mov     %al, (%rdi)
214
215
        lea     1 (%rsi), %rsi
216
        lea     1 (%rdi), %rdi
217
218
        jnz     LABEL(alignloop)
219
220
        .p2align 4
221
222
LABEL(alignafter):
223
224
LABEL(fasttry):
225
#ifdef PIC
226
	mov     _rtld_local_ro@GOTPCREL(%rip), %r9
227
	mov     RTLD_GLOBAL_DL_CACHE1SIZEHALF(%r9), %r11
228
#else
229
	mov     _dl_cache1sizehalf, %r11
230
#endif
231
        cmp     %rdx, %r11
232
        cmova   %rdx, %r11
233
234
LABEL(fast):
235
	mov	%r11, %rcx
236
	and	$-8, %r11
237
	shr	$3, %rcx
238
	jz	LABEL(fastskip)
239
240
	rep				# good ol' MOVS
241
	movsq
242
243
LABEL(fastskip):
244
	sub	%r11, %rdx
245
	test	$-8, %rdx
246
	jnz	LABEL(64after)
247
248
	and	$7, %edx
249
	pop	%rax
250
	jnz	LABEL(1)
251
252
	rep
96
	ret
253
	ret
97
254
98
END (BP_SYM (memcpy))
255
        .p2align 4
256
257
LABEL(64):                               # 64-byte
258
        mov     %r11, %rcx
259
        and     $-64, %r11
260
        shr     $6, %rcx
261
        jz      LABEL(64skip)
262
263
        .p2align 4
264
265
LABEL(64loop):
266
        dec     %ecx
267
268
        mov        (%rsi), %rax
269
        mov      8 (%rsi), %r8
270
        mov     16 (%rsi), %r9
271
        mov     24 (%rsi), %r10
272
273
        mov     %rax,    (%rdi)
274
        mov      %r8,  8 (%rdi)
275
        mov      %r9, 16 (%rdi)
276
        mov     %r10, 24 (%rdi)
277
278
        mov     32 (%rsi), %rax
279
        mov     40 (%rsi), %r8
280
        mov     48 (%rsi), %r9
281
        mov     56 (%rsi), %r10
282
283
        mov     %rax, 32 (%rdi)
284
        mov      %r8, 40 (%rdi)
285
        mov      %r9, 48 (%rdi)
286
        mov     %r10, 56 (%rdi)
287
288
        lea     64 (%rsi), %rsi
289
        lea     64 (%rdi), %rdi
290
291
        jz      LABEL(64skip)
292
293
        dec     %ecx
294
295
        mov        (%rsi), %rax
296
        mov      8 (%rsi), %r8
297
        mov     16 (%rsi), %r9
298
        mov     24 (%rsi), %r10
299
300
        mov     %rax,    (%rdi)
301
        mov      %r8,  8 (%rdi)
302
        mov      %r9, 16 (%rdi)
303
        mov     %r10, 24 (%rdi)
304
305
        mov     32 (%rsi), %rax
306
        mov     40 (%rsi), %r8
307
        mov     48 (%rsi), %r9
308
        mov     56 (%rsi), %r10
309
310
        mov     %rax, 32 (%rdi)
311
        mov      %r8, 40 (%rdi)
312
        mov      %r9, 48 (%rdi)
313
        mov     %r10, 56 (%rdi)
314
315
        lea     64 (%rsi), %rsi
316
        lea     64 (%rdi), %rdi
317
318
        jnz     LABEL(64loop)
319
320
        .p2align 4
321
322
LABEL(64skip):
323
        sub     %r11, %rdx
324
        test    $-64, %rdx
325
        jnz     LABEL(64after)
326
327
        and     $63, %edx
328
        jnz     LABEL(32)
329
330
        pop     %rax
331
        ret
332
333
        .p2align 4
334
335
LABEL(64after):
336
337
LABEL(pretry):
338
#ifdef PIC
339
	mov     _rtld_local_ro@GOTPCREL(%rip), %r11
340
	mov     RTLD_GLOBAL_DL_CACHE2SIZEHALF(%r11), %r8
341
#else
342
	mov     _dl_cache2sizehalf, %r8
343
#endif
344
        cmp     %rdx, %r8
345
        cmova   %rdx, %r8
346
347
LABEL(pre):                              # 64-byte prefetching
348
        mov     %r8, %rcx
349
        and     $-64, %r8
350
        shr     $6, %rcx
351
        jz      LABEL(preskip)
352
353
	push	%r15
354
        push    %r14
355
        push    %r13
356
        push    %r12
357
        push    %rbx
358
359
	mov	$896, %r15		# 1P look-ahead (MP improves with 640)
360
361
        .p2align 4
362
363
LABEL(preloop):
364
        dec     %rcx
365
366
        mov        (%rsi), %rax
367
        mov      8 (%rsi), %rbx
368
        mov     16 (%rsi), %r9
369
        mov     24 (%rsi), %r10
370
        mov     32 (%rsi), %r11
371
        mov     40 (%rsi), %r12
372
        mov     48 (%rsi), %r13
373
        mov     56 (%rsi), %r14
374
375
        prefetcht0    (%rsi, %r15)
376
        prefetcht0 64 (%rsi, %r15)
377
378
        mov     %rax,    (%rdi)
379
        mov     %rbx,  8 (%rdi)
380
        mov      %r9, 16 (%rdi)
381
        mov     %r10, 24 (%rdi)
382
        mov     %r11, 32 (%rdi)
383
        mov     %r12, 40 (%rdi)
384
        mov     %r13, 48 (%rdi)
385
        mov     %r14, 56 (%rdi)
386
387
        lea     64 (%rsi), %rsi
388
        lea     64 (%rdi), %rdi
389
390
        jz      LABEL(preskipa)
391
392
        dec     %rcx
393
394
        mov        (%rsi), %rax
395
        mov      8 (%rsi), %rbx
396
        mov     16 (%rsi), %r9
397
        mov     24 (%rsi), %r10
398
        mov     32 (%rsi), %r11
399
        mov     40 (%rsi), %r12
400
        mov     48 (%rsi), %r13
401
        mov     56 (%rsi), %r14
402
403
        mov     %rax,    (%rdi)
404
        mov     %rbx,  8 (%rdi)
405
        mov      %r9, 16 (%rdi)
406
        mov     %r10, 24 (%rdi)
407
        mov     %r11, 32 (%rdi)
408
        mov     %r12, 40 (%rdi)
409
        mov     %r13, 48 (%rdi)
410
        mov     %r14, 56 (%rdi)
411
412
        prefetcht0 -64 (%rdi, %r15)
413
        prefetcht0     (%rdi, %r15)
414
415
        lea     64 (%rsi), %rsi
416
        lea     64 (%rdi), %rdi
417
418
        jnz     LABEL(preloop)
419
420
LABEL(preskipa):
421
        pop     %rbx
422
        pop     %r12
423
        pop     %r13
424
        pop     %r14
425
        pop	%r15
426
427
#       .p2align 4
428
429
LABEL(preskip):
430
        sub     %r8, %rdx
431
        test    $-64, %rdx
432
        jnz     LABEL(preafter)
433
434
        and     $63, %edx
435
        jnz     LABEL(32)
436
437
        pop     %rax
438
        ret
439
440
        .p2align 4
441
442
LABEL(preafter):
443
444
LABEL(NTtry):
445
446
LABEL(NT):                               # NT 64-byte
447
        mov     %rdx, %rcx
448
        shr     $7, %rcx
449
        jz      LABEL(NTskip)
450
451
        push    %r14
452
        push    %r13
453
        push    %r12
454
455
       .p2align 4
456
457
LABEL(NTloop):
458
        prefetchnta 768 (%rsi)		# prefetching NT here is not so good on B0 and C0 MP systems
459
        prefetchnta 832 (%rsi)
460
461
        dec     %rcx
462
463
        mov        (%rsi), %rax
464
        mov      8 (%rsi), %r8
465
        mov     16 (%rsi), %r9
466
        mov     24 (%rsi), %r10
467
        mov     32 (%rsi), %r11
468
        mov     40 (%rsi), %r12
469
        mov     48 (%rsi), %r13
470
        mov     56 (%rsi), %r14
471
472
        movnti  %rax,    (%rdi)
473
        movnti   %r8,  8 (%rdi)
474
        movnti   %r9, 16 (%rdi)
475
        movnti  %r10, 24 (%rdi)
476
        movnti  %r11, 32 (%rdi)
477
        movnti  %r12, 40 (%rdi)
478
        movnti  %r13, 48 (%rdi)
479
        movnti  %r14, 56 (%rdi)
480
481
        mov      64 (%rsi), %rax
482
        mov      72 (%rsi), %r8
483
        mov      80 (%rsi), %r9
484
        mov      88 (%rsi), %r10
485
        mov      96 (%rsi), %r11
486
        mov     104 (%rsi), %r12
487
        mov     112 (%rsi), %r13
488
        mov     120 (%rsi), %r14
489
490
        movnti  %rax,  64 (%rdi)
491
        movnti   %r8,  72 (%rdi)
492
        movnti   %r9,  80 (%rdi)
493
        movnti  %r10,  88 (%rdi)
494
        movnti  %r11,  96 (%rdi)
495
        movnti  %r12, 104 (%rdi)
496
        movnti  %r13, 112 (%rdi)
497
        movnti  %r14, 120 (%rdi)
498
499
        lea     128 (%rsi), %rsi
500
        lea     128 (%rdi), %rdi
501
502
        jnz     LABEL(NTloop)
503
504
        sfence
505
506
        pop     %r12
507
        pop     %r13
508
        pop     %r14
509
510
LABEL(NTskip):
511
        and     $127, %edx
512
        jnz     LABEL(32)
513
514
        pop     %rax
515
        ret
516
517
END (memcpy)
99
#if !MEMPCPY_P
518
#if !MEMPCPY_P
100
libc_hidden_builtin_def (memcpy)
519
libc_hidden_builtin_def (memcpy)
101
#endif
520
#endif
(-)sysdeps/x86_64/mempcpy.S (+1 lines)
Lines 1-3 Link Here
1
#define USE_AS_MEMPCPY
1
#define memcpy __mempcpy
2
#define memcpy __mempcpy
2
#include <sysdeps/x86_64/memcpy.S>
3
#include <sysdeps/x86_64/memcpy.S>
3
4
(-)sysdeps/x86_64/strcmp.S (-42 / +484 lines)
Lines 1-45 Link Here
1
/* Highly optimized version for x86-64.
1
# $Header: /K8_Projects/Glibc/amd64strcmp.S 10    2/10/04 11:48 Emenezes $
2
   Copyright (C) 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
2
3
   This file is part of the GNU C Library.
3
# (c) 2002 Advanced Micro Devices, Inc.
4
   Based on i686 version contributed by Ulrich Drepper
4
# YOUR USE OF THIS CODE IS SUBJECT TO THE TERMS
5
   <drepper@cygnus.com>, 1999.
5
# AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC
6
6
# LICENSE FOUND IN THE "README" FILE THAT IS
7
   The GNU C Library is free software; you can redistribute it and/or
7
# INCLUDED WITH THIS FILE
8
   modify it under the terms of the GNU Lesser General Public
8
9
   License as published by the Free Software Foundation; either
9
#include "sysdep.h"
10
   version 2.1 of the License, or (at your option) any later version.
10
11
11
#define LABEL(s) L##s
12
   The GNU C Library is distributed in the hope that it will be useful,
13
   but WITHOUT ANY WARRANTY; without even the implied warranty of
14
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
   Lesser General Public License for more details.
16
17
   You should have received a copy of the GNU Lesser General Public
18
   License along with the GNU C Library; if not, write to the Free
19
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20
   02111-1307 USA.  */
21
22
#include <sysdep.h>
23
#include "asm-syntax.h"
24
#include "bp-sym.h"
25
#include "bp-asm.h"
26
12
27
        .text
13
        .text
28
ENTRY (BP_SYM (strcmp))
14
29
L(oop):	movb	(%rdi), %al
15
ENTRY   (strcmp)			# (const char *, const char *)
30
	cmpb	(%rsi), %al
16
31
	jne	L(neq)
17
        xor     %ecx, %ecx
32
	incq	%rdi
18
33
	incq	%rsi
19
#ifdef USE_AS_STRNCMP			// (const char *, const char *, size_t)
34
	testb	%al, %al
20
        mov	%r14, -8 (%rsp)
35
	jnz	L(oop)
21
	mov	%rdx, %r14
36
22
37
	xorq	%rax, %rax
23
	test	%rdx, %rdx
38
	ret
24
	mov	%edx, %eax
39
25
	jz	.LABEL(exitz)
40
L(neq):	movl	$1, %eax
26
#endif
41
	movl	$-1, %ecx
27
42
	cmovbl	%ecx, %eax
28
.LABEL(aligntry):
43
	ret
29
        mov     %rsi, %r8		# align by "source"
44
END (BP_SYM (strcmp))
30
        and     $8 - 1, %r8		# between 0 and 8 characters compared
31
	jz	.LABEL(alignafter)
32
33
.LABEL(align):
34
        sub     $8, %r8
35
36
        .p2align 4
37
38
.LABEL(alignloop):
39
        mov     (%rsi, %rcx), %al
40
        mov	(%rdi, %rcx), %dl
41
42
#ifdef USE_AS_STRNCMP
43
	dec	%r14
44
	jl	.LABEL(exitafter)
45
#endif
46
47
        cmp     %dl, %al		# check if same character
48
        jne     .LABEL(exitafter)
49
        test    %al, %al		# check if character a NUL
50
        jz      .LABEL(exitafter)
51
52
        inc     %ecx
53
54
        inc     %r8
55
        jnz     .LABEL(alignloop)
56
57
        .p2align 4
58
59
.LABEL(alignafter):
60
61
        mov	%r15, -32 (%rsp)
62
        mov	%rbp, -24 (%rsp)
63
        mov	%rbx, -16 (%rsp)
64
65
.LABEL(pagealigntry):			# page align by "destination"
66
	mov	$4096, %r15d		# page size is 4096
67
        lea	(%rdi, %rcx), %ebp
68
        and     $4095, %ebp		# page mask
69
        sub	%r15d, %ebp
70
71
.LABEL(64):                              # 64-byte
72
	mov     $0xfefefefefefefeff, %rbx # magic number
73
74
        .p2align 4
75
76
.LABEL(64loop):
77
	add	$64, %ebp		# check if "destination" crosses a page unevenly
78
	jle	.LABEL(64gobble)
79
80
        sub	%r15d, %ebp
81
        lea	64 (%rcx), %r8
82
83
        .p2align 4
84
85
.LABEL(64nibble):
86
        mov     (%rsi, %rcx), %al
87
        mov	(%rdi, %rcx), %dl
88
89
#ifdef USE_AS_STRNCMP
90
	dec	%r14
91
	jl	.exit
92
#endif
93
94
        cmp     %dl, %al		# check if same character
95
        jne     .exit
96
        test    %al, %al		# check if character a NUL
97
        jz      .exit
98
99
        inc	%ecx
100
101
        cmp	%ecx, %r8d
102
        ja	.LABEL(64nibble)
103
104
        .p2align 4
105
106
.LABEL(64gobble):
107
        mov     (%rsi, %rcx), %rax
108
        mov     (%rdi, %rcx), %rdx
109
110
#ifdef USE_AS_STRNCMP
111
	sub	$8, %r14
112
	jl	.LABEL(tail)
113
#endif
114
115
        mov     %rbx, %r8
116
        add     %rax, %r8
117
        sbb     %r10, %r10
118
119
        mov     %rbx, %r9
120
        add     %rdx, %r9
121
        sbb     %r11, %r11
122
123
        xor     %rax, %r8
124
        or      %rbx, %r8
125
        sub     %r10, %r8
126
        jnz     .LABEL(tail)
127
128
        xor     %rdx, %r9
129
        or      %rbx, %r9
130
        sub     %r11, %r9
131
        jnz     .LABEL(tail)
132
133
        cmp     %rdx, %rax
134
        jne     .LABEL(tail)
135
136
        mov     8 (%rsi, %rcx), %rax
137
        mov     8 (%rdi, %rcx), %rdx
138
        add     $8, %ecx
139
140
#ifdef USE_AS_STRNCMP
141
	sub	$8, %r14
142
	jl	.LABEL(tail)
143
#endif
144
145
        mov     %rbx, %r8
146
        add     %rax, %r8
147
        sbb     %r10, %r10
148
149
        mov     %rbx, %r9
150
        add     %rdx, %r9
151
        sbb     %r11, %r11
152
153
        xor     %rax, %r8
154
        or      %rbx, %r8
155
        sub     %r10, %r8
156
        jnz     .LABEL(tail)
157
158
        xor     %rdx, %r9
159
        or      %rbx, %r9
160
        sub     %r11, %r9
161
        jnz     .LABEL(tail)
162
163
        cmp     %rdx, %rax
164
        jne     .LABEL(tail)
165
166
        mov     8 (%rsi, %rcx), %rax
167
        mov     8 (%rdi, %rcx), %rdx
168
        add     $8, %ecx
169
170
#ifdef USE_AS_STRNCMP
171
	sub	$8, %r14
172
	jl	.LABEL(tail)
173
#endif
174
175
        mov     %rbx, %r8
176
        add     %rax, %r8
177
        sbb     %r10, %r10
178
179
        mov     %rbx, %r9
180
        add     %rdx, %r9
181
        sbb     %r11, %r11
182
183
        xor     %rax, %r8
184
        or      %rbx, %r8
185
        sub     %r10, %r8
186
        jnz     .LABEL(tail)
187
188
        xor     %rdx, %r9
189
        or      %rbx, %r9
190
        sub     %r11, %r9
191
        jnz     .LABEL(tail)
192
193
        cmp     %rdx, %rax
194
        jne     .LABEL(tail)
195
196
        mov     8 (%rsi, %rcx), %rax
197
        mov     8 (%rdi, %rcx), %rdx
198
        add     $8, %ecx
199
200
#ifdef USE_AS_STRNCMP
201
	sub	$8, %r14
202
	jl	.LABEL(tail)
203
#endif
204
205
        mov     %rbx, %r8
206
        add     %rax, %r8
207
        sbb     %r10, %r10
208
209
        mov     %rbx, %r9
210
        add     %rdx, %r9
211
        sbb     %r11, %r11
212
213
        xor     %rax, %r8
214
        or      %rbx, %r8
215
        sub     %r10, %r8
216
        jnz     .LABEL(tail)
217
218
        xor     %rdx, %r9
219
        or      %rbx, %r9
220
        sub     %r11, %r9
221
        jnz     .LABEL(tail)
222
223
        cmp     %rdx, %rax
224
        jne     .LABEL(tail)
225
226
        mov     8 (%rsi, %rcx), %rax
227
        mov     8 (%rdi, %rcx), %rdx
228
        add     $8, %ecx
229
230
#ifdef USE_AS_STRNCMP
231
	sub	$8, %r14
232
	jl	.LABEL(tail)
233
#endif
234
235
        mov     %rbx, %r8
236
        add     %rax, %r8
237
        sbb     %r10, %r10
238
239
        mov     %rbx, %r9
240
        add     %rdx, %r9
241
        sbb     %r11, %r11
242
243
        xor     %rax, %r8
244
        or      %rbx, %r8
245
        sub     %r10, %r8
246
        jnz     .LABEL(tail)
247
248
        xor     %rdx, %r9
249
        or      %rbx, %r9
250
        sub     %r11, %r9
251
        jnz     .LABEL(tail)
252
253
        cmp     %rdx, %rax
254
        jne     .LABEL(tail)
255
256
        mov     8 (%rsi, %rcx), %rax
257
        mov     8 (%rdi, %rcx), %rdx
258
        add     $8, %ecx
259
260
#ifdef USE_AS_STRNCMP
261
	sub	$8, %r14
262
	jl	.LABEL(tail)
263
#endif
264
265
        mov     %rbx, %r8
266
        add     %rax, %r8
267
        sbb     %r10, %r10
268
269
        mov     %rbx, %r9
270
        add     %rdx, %r9
271
        sbb     %r11, %r11
272
273
        xor     %rax, %r8
274
        or      %rbx, %r8
275
        sub     %r10, %r8
276
        jnz     .LABEL(tail)
277
278
        xor     %rdx, %r9
279
        or      %rbx, %r9
280
        sub     %r11, %r9
281
        jnz     .LABEL(tail)
282
283
        cmp     %rdx, %rax
284
        jne     .LABEL(tail)
285
286
        mov     8 (%rsi, %rcx), %rax
287
        mov     8 (%rdi, %rcx), %rdx
288
        add     $8, %ecx
289
290
#ifdef USE_AS_STRNCMP
291
	sub	$8, %r14
292
	jl	.LABEL(tail)
293
#endif
294
295
        mov     %rbx, %r8
296
        add     %rax, %r8
297
        sbb     %r10, %r10
298
299
        mov     %rbx, %r9
300
        add     %rdx, %r9
301
        sbb     %r11, %r11
302
303
        xor     %rax, %r8
304
        or      %rbx, %r8
305
        sub     %r10, %r8
306
        jnz     .LABEL(tail)
307
308
        xor     %rdx, %r9
309
        or      %rbx, %r9
310
        sub     %r11, %r9
311
        jnz     .LABEL(tail)
312
313
        cmp     %rdx, %rax
314
        jne     .LABEL(tail)
315
316
        mov     8 (%rsi, %rcx), %rax
317
        mov     8 (%rdi, %rcx), %rdx
318
        add     $8, %ecx
319
320
#ifdef USE_AS_STRNCMP
321
	sub	$8, %r14
322
	jl	.LABEL(tail)
323
#endif
324
325
        mov     %rbx, %r8
326
        add     %rax, %r8
327
        sbb     %r10, %r10
328
329
        mov     %rbx, %r9
330
        add     %rdx, %r9
331
        sbb     %r11, %r11
332
333
        xor     %rax, %r8
334
        or      %rbx, %r8
335
        sub     %r10, %r8
336
        jnz     .LABEL(tail)
337
338
        xor     %rdx, %r9
339
        or      %rbx, %r9
340
        sub     %r11, %r9
341
        jnz     .LABEL(tail)
342
343
        cmp     %rdx, %rax
344
        jne     .LABEL(tail)
345
346
        add	$8, %ecx
347
348
        jmp	.LABEL(64loop)
349
350
.LABEL(64after):
351
352
.LABEL(tailtry):
353
#	mov     (%rsi, %rcx), %rax
354
#	mov     (%rdi, %rcx), %rdx
355
#	add     $8, %rcx
356
357
.LABEL(tail):				# byte tail
358
#ifdef USE_AS_STRNCMP
359
	add	$7, %r14
360
#endif
361
362
        cmp     %dl, %al		# check if same character
363
        jne     .exit
364
        test    %al, %al		# check if character a NUL
365
        jz      .exit
366
367
        shr	$8, %rax
368
        shr	$8, %rdx
369
370
#ifdef USE_AS_STRNCMP
371
	dec	%r14
372
	jl	.exit
373
#endif
374
375
        cmp     %dl, %al
376
        jne     .exit
377
        test    %al, %al
378
        jz      .exit
379
380
        shr	$8, %rax
381
        shr	$8, %rdx
382
383
#ifdef USE_AS_STRNCMP
384
	dec	%r14
385
	jl	.exit
386
#endif
387
388
        cmp     %dl, %al
389
        jne     .exit
390
        test    %al, %al
391
        jz      .exit
392
393
        shr	$8, %rax
394
        shr	$8, %rdx
395
396
#ifdef USE_AS_STRNCMP
397
	dec	%r14
398
	jl	.exit
399
#endif
400
401
        cmp     %dl, %al
402
        jne     .exit
403
        test    %al, %al
404
        jz      .exit
405
406
        shr	$8, %rax
407
        shr	$8, %rdx
408
409
#ifdef USE_AS_STRNCMP
410
	dec	%r14
411
	jl	.exit
412
#endif
413
414
        cmp     %dl, %al
415
        jne     .exit
416
        test    %al, %al
417
        jz      .exit
418
419
        shr	$8, %eax
420
        shr	$8, %edx
421
422
#ifdef USE_AS_STRNCMP
423
	dec	%r14
424
	jl	.exit
425
#endif
426
427
        cmp     %dl, %al
428
        jne     .exit
429
        test    %al, %al
430
        jz      .exit
431
432
        shr	$8, %eax
433
        shr	$8, %edx
434
435
#ifdef USE_AS_STRNCMP
436
	dec	%r14
437
	jl	.exit
438
#endif
439
440
        cmp     %dl, %al
441
        jne     .exit
442
        test    %al, %al
443
        jz      .exit
444
445
        shr	$8, %eax
446
        shr	$8, %edx
447
448
#ifdef USE_AS_STRNCMP
449
	dec	%r14
450
	jl	.exit
451
#endif
452
453
        cmp     %dl, %al
454
        jne     .exit
455
#	test    %al, %al
456
#	jz      .exit
457
458
        .p2align 4,, 15
459
460
.LABEL(tailafter):
461
462
.exit:
463
	mov	-32 (%rsp), %r15
464
	mov	-24 (%rsp), %rbp
465
        mov	-16 (%rsp), %rbx
466
467
        .p2align 4,, 3
468
469
.LABEL(exitafter):
470
#ifdef USE_AS_STRNCMP
471
	test	%r14, %r14
472
	cmovl	%edx, %eax
473
#endif
474
475
	movzx	%al, %eax
476
	movzx	%dl, %edx
477
	sub	%eax, %edx
478
	xchg	%edx, %eax
479
480
#ifdef USE_AS_STRNCMP
481
.LABEL(exitz):
482
	mov	-8 (%rsp), %r14
483
#endif
484
        ret
485
486
END     (strcmp)
45
libc_hidden_builtin_def (strcmp)
487
libc_hidden_builtin_def (strcmp)
(-)sysdeps/unix/sysv/linux/x86_64/dl-procinfo.c (-1 / +1 lines)
Lines 1-5 Link Here
1
#ifdef IS_IN_ldconfig
1
#ifdef IS_IN_ldconfig
2
#include <sysdeps/i386/dl-procinfo.c>
2
#include <sysdeps/i386/dl-procinfo.c>
3
#else
3
#else
4
#include <sysdeps/generic/dl-procinfo.c>
4
#include <sysdeps/x86_64/dl-procinfo.c>
5
#endif
5
#endif
(-)sysdeps/x86_64/dl-procinfo.c (+108 lines)
Line 0 Link Here
1
/* Data for x86-64 version of processor capability information.
2
   Copyright (C) 2004 Free Software Foundation, Inc.
3
   This file is part of the GNU C Library.
4
   Contributed by Andreas Jaeger <aj@suse.de>, 2004.
5
6
   The GNU C Library is free software; you can redistribute it and/or
7
   modify it under the terms of the GNU Lesser General Public
8
   License as published by the Free Software Foundation; either
9
   version 2.1 of the License, or (at your option) any later version.
10
11
   The GNU C Library is distributed in the hope that it will be useful,
12
   but WITHOUT ANY WARRANTY; without even the implied warranty of
13
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
   Lesser General Public License for more details.
15
16
   You should have received a copy of the GNU Lesser General Public
17
   License along with the GNU C Library; if not, write to the Free
18
   Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
19
   02111-1307 USA.  */
20
21
/* This information must be kept in sync with the _DL_HWCAP_COUNT and
22
   _DL_PLATFORM_COUNT definitions in procinfo.h.
23
24
   If anything should be added here check whether the size of each string
25
   is still ok with the given array size.
26
27
   All the #ifdefs in the definitions are quite irritating but
28
   necessary if we want to avoid duplicating the information.  There
29
   are three different modes:
30
31
   - PROCINFO_DECL is defined.  This means we are only interested in
32
     declarations.
33
34
   - PROCINFO_DECL is not defined:
35
36
     + if SHARED is defined the file is included in an array
37
       initializer.  The .element = { ... } syntax is needed.
38
39
     + if SHARED is not defined a normal array initialization is
40
       needed.
41
  */
42
43
#ifndef PROCINFO_CLASS
44
#define PROCINFO_CLASS
45
#endif
46
47
  /* _dl_cache1size: size of L1 cache  */
48
#if !defined PROCINFO_DECL && defined SHARED
49
  ._dl_cache1size
50
#else
51
PROCINFO_CLASS long int _dl_cache1size
52
#endif
53
#ifndef PROCINFO_DECL
54
= 1024 * 64
55
#endif
56
#if !defined SHARED || defined PROCINFO_DECL
57
;
58
#else
59
,
60
#endif
61
62
  /* _dl_cache1sizehalf: 1/2 size of L1 cache  */
63
#if !defined PROCINFO_DECL && defined SHARED
64
  ._dl_cache1sizehalf
65
#else
66
PROCINFO_CLASS long int _dl_cache1sizehalf
67
#endif
68
#ifndef PROCINFO_DECL
69
= 1024 * 64 / 2
70
#endif
71
#if !defined SHARED || defined PROCINFO_DECL
72
;
73
#else
74
,
75
#endif
76
77
  /* _dl_cache2size: size of L2 cache  */
78
#if !defined PROCINFO_DECL && defined SHARED
79
  ._dl_cache2size
80
#else
81
PROCINFO_CLASS long int _dl_cache2size
82
#endif
83
#ifndef PROCINFO_DECL
84
= 1024 * 1024
85
#endif
86
#if !defined SHARED || defined PROCINFO_DECL
87
;
88
#else
89
,
90
#endif
91
92
  /* _dl_cache2sizehalf: 1/2 size of L2 cache  */
93
#if !defined PROCINFO_DECL && defined SHARED
94
  ._dl_cache2sizehalf
95
#else
96
PROCINFO_CLASS long int _dl_cache2sizehalf
97
#endif
98
#ifndef PROCINFO_DECL
99
= 1024 * 1024 / 2
100
#endif
101
#if !defined SHARED || defined PROCINFO_DECL
102
;
103
#else
104
,
105
#endif
106
107
#undef PROCINFO_DECL
108
#undef PROCINFO_CLASS
(-)sysdeps/x86_64/elf/rtld-global-offsets.sym (+10 lines)
Line 0 Link Here
1
#define SHARED 1
2
3
#include <ldsodefs.h>
4
5
#define rtdl_global_offsetof(mem) offsetof (struct rtld_global_ro, mem)
6
7
RTLD_GLOBAL_DL_CACHE1SIZE	rtdl_global_offsetof (_dl_cache1size)
8
RTLD_GLOBAL_DL_CACHE1SIZEHALF	rtdl_global_offsetof (_dl_cache1sizehalf)
9
RTLD_GLOBAL_DL_CACHE2SIZE	rtdl_global_offsetof (_dl_cache2size)
10
RTLD_GLOBAL_DL_CACHE2SIZEHALF	rtdl_global_offsetof (_dl_cache2sizehalf)
(-)sysdeps/x86_64/memcmp.S (+442 lines)
Line 0 Link Here
1
# $Header: /K8_Projects/Glibc/amd64memcmp.S 4     10/06/03 10:57 Emenezes $
2
3
# (c) 2002 Advanced Micro Devices, Inc.
4
# YOUR USE OF THIS CODE IS SUBJECT TO THE TERMS
5
# AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC
6
# LICENSE FOUND IN THE "README" FILE THAT IS
7
# INCLUDED WITH THIS FILE
8
9
#include "sysdep.h"
10
#include <rtld-global-offsets.h>
11
12
#ifdef PIC
13
	.globl _rtld_local_ro
14
        .hidden _rtld_local_ro
15
        .set    _rtld_local_ro,_rtld_global_ro
16
#endif
17
18
        .text
19
20
ENTRY   (memcmp)                        # (const void *, const void*, size_t)
21
22
L(memcmptry1):
23
        cmp     $8, %rdx
24
        jae     L(memcmp1after)
25
26
L(memcmp1):                                # 1-byte
27
        test    %rdx, %rdx
28
        mov     $0, %eax
29
        jz      L(memcmpexit)
30
31
L(memcmp1loop):
32
        movzbl  (%rdi), %eax
33
        movzbl  (%rsi), %ecx
34
        sub     %ecx, %eax
35
        jnz     L(memcmpexit)
36
37
        dec     %rdx
38
39
        lea     1 (%rdi), %rdi
40
        lea     1 (%rsi), %rsi
41
42
        jnz     L(memcmp1loop)
43
44
L(memcmpexit):
45
        rep
46
        ret
47
48
        .p2align 4
49
50
L(memcmp1after):
51
52
L(memcmp8try):
53
        cmp     $32, %rdx
54
        jae     L(memcmp8after)
55
56
L(memcmp8):                        # 8-byte
57
        mov     %edx, %ecx
58
        shr     $3, %ecx
59
        jz      L(memcmp1)
60
61
        .p2align 4
62
63
L(memcmp8loop):
64
        mov     (%rsi), %rax
65
        cmp     (%rdi), %rax
66
        jne     L(memcmp1)
67
68
        sub     $8, %rdx
69
        dec     %ecx
70
71
        lea     8 (%rsi), %rsi
72
        lea     8 (%rdi), %rdi
73
74
        jnz     L(memcmp8loop)
75
76
L(memcmp8skip):
77
        and     $7, %edx
78
        jnz     L(memcmp1)
79
80
        xor     %eax, %eax
81
        ret
82
83
        .p2align 4
84
85
L(memcmp8after):
86
87
L(memcmp32try):
88
        cmp     $2048, %rdx
89
        ja      L(memcmp32after)
90
91
L(memcmp32):                               # 32-byte
92
        mov     %edx, %ecx
93
        shr     $5, %ecx
94
        jz      L(memcmp8)
95
96
        .p2align 4
97
98
L(memcmp32loop):
99
        mov        (%rsi), %rax
100
        mov      8 (%rsi),  %r8
101
        mov     16 (%rsi),  %r9
102
        mov     24 (%rsi), %r10
103
        sub        (%rdi), %rax
104
        sub      8 (%rdi),  %r8
105
        sub     16 (%rdi),  %r9
106
        sub     24 (%rdi), %r10
107
108
        or      %rax,  %r8
109
        or       %r9, %r10
110
        or       %r8, %r10
111
        jnz     L(memcmp8)
112
113
        sub     $32, %rdx
114
        dec     %ecx
115
116
        lea     32 (%rsi), %rsi
117
        lea     32 (%rdi), %rdi
118
119
        jnz     L(memcmp32loop)
120
121
L(memcmp32skip):
122
        and     $31, %edx
123
        jnz     L(memcmp8)
124
125
        xor     %eax, %eax
126
        ret
127
128
        .p2align 4
129
130
L(memcmp32after):
131
132
#ifdef PIC
133
        mov     _rtld_local_ro@GOTPCREL(%rip), %r8
134
	mov	RTLD_GLOBAL_DL_CACHE1SIZEHALF(%r8), %r9
135
#else
136
        mov     _dl_cache1sizehalf, %r9
137
#endif
138
	prefetcht0 (%r9)
139
140
141
.alignsrctry:
142
        mov     %esi, %r8d      # align by source
143
144
        and     $7, %r8d
145
        jz      .alignsrcafter  # not unaligned
146
147
.alignsrc:                      # align
148
        lea     -8 (%r8, %rdx), %rdx
149
        sub     $8, %r8d
150
151
#       .p2align 4
152
153
.alignsrcloop:
154
        movzbl  (%rdi), %eax
155
        movzbl  (%rsi), %ecx
156
        sub     %ecx, %eax
157
        jnz     L(memcmpexit)
158
159
        inc     %r8d
160
161
        lea     1 (%rdi), %rdi
162
        lea     1 (%rsi), %rsi
163
164
        jnz     .alignsrcloop
165
166
        .p2align 4
167
168
.alignsrcafter:
169
170
171
L(memcmp64try):
172
#ifdef PIC
173
        mov     _rtld_local_ro@GOTPCREL(%rip), %r8
174
	mov	RTLD_GLOBAL_DL_CACHE1SIZEHALF(%r8), %rcx
175
#else
176
        mov     _dl_cache1sizehalf, %rcx
177
#endif
178
        cmp	%rdx, %rcx
179
        cmova   %rdx, %rcx
180
181
L(memcmp64):                               # 64-byte
182
        shr     $6, %rcx
183
        jz      L(memcmp32)
184
185
        .p2align 4
186
187
L(memcmp64loop):
188
        mov        (%rsi), %rax
189
        mov      8 (%rsi),  %r8
190
        sub        (%rdi), %rax
191
        sub      8 (%rdi),  %r8
192
        or      %r8,  %rax
193
194
        mov     16 (%rsi),  %r9
195
        mov     24 (%rsi), %r10
196
        sub     16 (%rdi),  %r9
197
        sub     24 (%rdi), %r10
198
        or      %r10, %r9
199
200
        or      %r9,  %rax
201
        jnz     L(memcmp32)
202
203
        mov     32 (%rsi), %rax
204
        mov     40 (%rsi),  %r8
205
        sub     32 (%rdi), %rax
206
        sub     40 (%rdi),  %r8
207
        or      %r8,  %rax
208
209
        mov     48 (%rsi),  %r9
210
        mov     56 (%rsi), %r10
211
        sub     48 (%rdi),  %r9
212
        sub     56 (%rdi), %r10
213
        or      %r10, %r9
214
215
        or      %r9,  %rax
216
        jnz    	L(memcmp32)
217
218
        lea     64 (%rsi), %rsi
219
        lea     64 (%rdi), %rdi
220
221
        sub     $64, %rdx
222
        dec     %rcx
223
        jnz     L(memcmp64loop)
224
225
#       .p2align 4
226
227
L(memcmp64skip):
228
        cmp     $2048, %rdx
229
        ja     L(memcmp64after)
230
231
        test    %edx, %edx
232
        jnz     L(memcmp32)
233
234
        xor     %eax, %eax
235
        ret
236
237
        .p2align 4
238
239
L(memcmp64after):
240
241
L(memcmppretry):
242
243
L(memcmppre):                              # 64-byte prefetching
244
#ifdef PIC
245
        mov     _rtld_local_ro@GOTPCREL(%rip), %r8
246
	mov	RTLD_GLOBAL_DL_CACHE2SIZEHALF(%r8), %rcx
247
#else
248
        mov     _dl_cache2sizehalf, %rcx
249
#endif
250
        cmp	%rdx, %rcx
251
        cmova   %rdx, %rcx
252
253
        shr     $6, %rcx
254
        jz      L(memcmppreskip)
255
256
        prefetcht0 512 (%rsi)
257
        prefetcht0 512 (%rdi)
258
259
        mov        (%rsi), %rax
260
        mov      8 (%rsi), %r9
261
        mov     16 (%rsi), %r10
262
        mov     24 (%rsi), %r11
263
        sub        (%rdi), %rax
264
        sub      8 (%rdi), %r9
265
        sub     16 (%rdi), %r10
266
        sub     24 (%rdi), %r11
267
268
        or       %r9, %rax
269
        or      %r11, %r10
270
        or      %r10, %rax
271
        jnz     L(memcmp32)
272
273
        mov     32 (%rsi), %rax
274
        mov     40 (%rsi), %r9
275
        mov     48 (%rsi), %r10
276
        mov     56 (%rsi), %r11
277
        sub     32 (%rdi), %rax
278
        sub     40 (%rdi), %r9
279
        sub     48 (%rdi), %r10
280
        sub     56 (%rdi), %r11
281
282
        or       %r9, %rax
283
        or      %r11, %r10
284
        or      %r10, %rax
285
        jnz     L(memcmp32)
286
287
        lea     64 (%rsi), %rsi
288
        lea     64 (%rdi), %rdi
289
290
        sub     $64, %rdx
291
        dec     %rcx
292
293
        .p2align 4
294
295
L(memcmppreloop):
296
        prefetcht0 512 (%rsi)
297
        prefetcht0 512 (%rdi)
298
299
        mov        (%rsi), %rax
300
        mov      8 (%rsi), %r9
301
        mov     16 (%rsi), %r10
302
        mov     24 (%rsi), %r11
303
        sub        (%rdi), %rax
304
        sub      8 (%rdi), %r9
305
        sub     16 (%rdi), %r10
306
        sub     24 (%rdi), %r11
307
308
        or       %r9, %rax
309
        or      %r11, %r10
310
        or      %r10, %rax
311
        jnz     L(memcmp32)
312
313
        mov     32 (%rsi), %rax
314
        mov     40 (%rsi), %r9
315
        mov     48 (%rsi), %r10
316
        mov     56 (%rsi), %r11
317
        sub     32 (%rdi), %rax
318
        sub     40 (%rdi), %r9
319
        sub     48 (%rdi), %r10
320
        sub     56 (%rdi), %r11
321
322
        or       %r9, %rax
323
        or      %r11, %r10
324
        or      %r10, %rax
325
        jnz     L(memcmp32)
326
327
        lea     64 (%rsi), %rsi
328
        lea     64 (%rdi), %rdi
329
330
        sub     $64, %rdx
331
        dec     %rcx
332
        jnz     L(memcmppreloop)
333
334
#       .p2align 4
335
336
L(memcmppreskip):
337
        cmp     $2048, %rdx
338
        ja      L(memcmppreafter)
339
340
        test    %edx, %edx
341
        jnz     L(memcmp32)
342
343
        xor     %eax, %eax
344
        ret
345
346
        .p2align 4
347
348
L(memcmppreafter):
349
350
L(memcmp128try):
351
352
L(memcmp128):                              # 128-byte
353
        mov     %rdx, %rcx
354
        shr     $7, %rcx
355
        jz      L(memcmp128skip)
356
357
        .p2align 4
358
359
L(memcmp128loop):
360
        prefetcht0 512 (%rsi)
361
        prefetcht0 512 (%rdi)
362
363
        mov        (%rsi), %rax
364
        mov      8 (%rsi), %r8
365
        sub        (%rdi), %rax
366
        sub      8 (%rdi), %r8
367
        mov     16 (%rsi), %r9
368
        mov     24 (%rsi), %r10
369
        sub     16 (%rdi), %r9
370
        sub     24 (%rdi), %r10
371
372
        or       %r8, %rax
373
        or       %r9, %r10
374
        or      %r10, %rax
375
376
        mov     32 (%rsi), %r8
377
        mov     40 (%rsi), %r9
378
        sub     32 (%rdi), %r8
379
        sub     40 (%rdi), %r9
380
        mov     48 (%rsi), %r10
381
        mov     56 (%rsi), %r11
382
        sub     48 (%rdi), %r10
383
        sub     56 (%rdi), %r11
384
385
        or       %r9, %r8
386
        or      %r11, %r10
387
        or      %r10, %r8
388
389
        or      %r8, %rax
390
        jnz     L(memcmp32)
391
392
        prefetcht0 576 (%rsi)
393
        prefetcht0 576 (%rdi)
394
395
        mov      64 (%rsi), %rax
396
        mov      72 (%rsi), %r8
397
        sub      64 (%rdi), %rax
398
        sub      72 (%rdi), %r8
399
        mov      80 (%rsi), %r9
400
        mov      88 (%rsi), %r10
401
        sub      80 (%rdi), %r9
402
        sub      88 (%rdi), %r10
403
404
        or       %r8, %rax
405
        or       %r9, %r10
406
        or      %r10, %rax
407
408
        mov      96 (%rsi), %r8
409
        mov     104 (%rsi), %r9
410
        sub      96 (%rdi), %r8
411
        sub     104 (%rdi), %r9
412
        mov     112 (%rsi), %r10
413
        mov     120 (%rsi), %r11
414
        sub     112 (%rdi), %r10
415
        sub     120 (%rdi), %r11
416
417
        or       %r9, %r8
418
        or      %r11, %r10
419
        or      %r10, %r8
420
421
        or      %r8, %rax
422
        jnz     L(memcmp32)
423
424
        sub     $128, %rdx
425
        dec     %rcx
426
427
        lea     128 (%rsi), %rsi
428
        lea     128 (%rdi), %rdi
429
430
        jnz     L(memcmp128loop)
431
432
L(memcmp128skip):
433
        and     $127, %edx
434
        jnz     L(memcmp32)
435
436
        xor     %eax, %eax
437
        ret
438
439
END     (memcmp)
440
441
#undef bcmp
442
weak_alias (memcmp, bcmp)
(-)sysdeps/x86_64/strncmp.S (+15 lines)
Line 0 Link Here
1
# $Header: /K8_Projects/Glibc/amd64strncpy.S 1     8/29/03 16:37 Emenezes $
2
3
# (c) 2002 Advanced Micro Devices, Inc.
4
# YOUR USE OF THIS CODE IS SUBJECT TO THE TERMS
5
# AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC
6
# LICENSE FOUND IN THE "README" FILE THAT IS
7
# INCLUDED WITH THIS FILE
8
9
#define USE_AS_STRNCMP
10
#define strcmp strncmp
11
12
#include "strcmp.S"
13
14
weak_alias (strncmp, __strncmp)
15
libc_hidden_builtin_def (strncmp)

Return to bug 100289