Line 0
Link Here
|
|
|
1 |
/* |
2 |
(C) 2002 Advanced Micro Devices, Inc. |
3 |
** YOUR USE OF THIS LIBRARY IS SUBJECT TO THE TERMS |
4 |
AND CONDITIONS OF THE GNU LESSER GENERAL PUBLIC |
5 |
LICENSE FOUND IN THE "README" FILE THAT IS INCLUDED WITH |
6 |
THIS LIBRARY** |
7 |
*/ |
8 |
|
9 |
#ifndef LIBM_INLINES_AMD_H_INCLUDED |
10 |
#define LIBM_INLINES_AMD_H_INCLUDED 1 |
11 |
|
12 |
#include "libm_util_amd.h" |
13 |
|
14 |
#ifdef WIN32 |
15 |
#define inline __inline |
16 |
#endif |
17 |
|
18 |
/* Set defines for inline functions calling other inlines */ |
19 |
#if defined(USE_VAL_WITH_FLAGS) || defined(USE_VALF_WITH_FLAGS) || \ |
20 |
defined(USE_ZERO_WITH_FLAGS) || defined(USE_ZEROF_WITH_FLAGS) || \ |
21 |
defined(USE_NAN_WITH_FLAGS) || defined(USE_NANF_WITH_FLAGS) || \ |
22 |
defined(USE_INFINITY_WITH_FLAGS) || defined(USE_INFINITYF_WITH_FLAGS) || \ |
23 |
defined(USE_SQRT_AMD_INLINE) || defined(USE_SQRTF_AMD_INLINE) |
24 |
#undef USE_RAISE_FPSW_FLAGS |
25 |
#define USE_RAISE_FPSW_FLAGS 1 |
26 |
#endif |
27 |
|
28 |
#if defined(USE_SPLITDOUBLE) |
29 |
/* Splits double x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. |
30 |
Assumes that x is not zero, denormal, infinity or NaN, but these conditions |
31 |
are not checked */ |
32 |
static inline void splitDouble(double x, int *e, double *m) |
33 |
{ |
34 |
unsigned long ux, uy; |
35 |
GET_BITS_DP64(x, ux); |
36 |
uy = ux; |
37 |
ux &= EXPBITS_DP64; |
38 |
ux >>= EXPSHIFTBITS_DP64; |
39 |
*e = (int)ux - EXPBIAS_DP64 + 1; |
40 |
uy = (uy & (SIGNBIT_DP64 | MANTBITS_DP64)) | HALFEXPBITS_DP64; |
41 |
PUT_BITS_DP64(uy, x); |
42 |
*m = x; |
43 |
} |
44 |
#endif /* USE_SPLITDOUBLE */ |
45 |
|
46 |
|
47 |
#if defined(USE_SPLITDOUBLE_2) |
48 |
/* Splits double x into exponent e and mantissa m, where 1.0 <= abs(m) < 4.0. |
49 |
Assumes that x is not zero, denormal, infinity or NaN, but these conditions |
50 |
are not checked. Also assumes EXPBIAS_DP is odd. With this |
51 |
assumption, e will be even on exit. */ |
52 |
static inline void splitDouble_2(double x, int *e, double *m) |
53 |
{ |
54 |
unsigned long ux, vx; |
55 |
GET_BITS_DP64(x, ux); |
56 |
vx = ux; |
57 |
ux &= EXPBITS_DP64; |
58 |
ux >>= EXPSHIFTBITS_DP64; |
59 |
if (ux & 1) |
60 |
{ |
61 |
/* The exponent is odd */ |
62 |
vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | ONEEXPBITS_DP64; |
63 |
PUT_BITS_DP64(vx, x); |
64 |
*m = x; |
65 |
*e = ux - EXPBIAS_DP64; |
66 |
} |
67 |
else |
68 |
{ |
69 |
/* The exponent is even */ |
70 |
vx = (vx & (SIGNBIT_DP64 | MANTBITS_DP64)) | TWOEXPBITS_DP64; |
71 |
PUT_BITS_DP64(vx, x); |
72 |
*m = x; |
73 |
*e = ux - EXPBIAS_DP64 - 1; |
74 |
} |
75 |
} |
76 |
#endif /* USE_SPLITDOUBLE_2 */ |
77 |
|
78 |
|
79 |
#if defined(USE_SPLITFLOAT) |
80 |
/* Splits float x into exponent e and mantissa m, where 0.5 <= abs(m) < 1.0. |
81 |
Assumes that x is not zero, denormal, infinity or NaN, but these conditions |
82 |
are not checked */ |
83 |
static inline void splitFloat(float x, int *e, float *m) |
84 |
{ |
85 |
unsigned int ux, uy; |
86 |
GET_BITS_SP32(x, ux); |
87 |
uy = ux; |
88 |
ux &= EXPBITS_SP32; |
89 |
ux >>= EXPSHIFTBITS_SP32; |
90 |
*e = (int)ux - EXPBIAS_SP32 + 1; |
91 |
uy = (uy & (SIGNBIT_SP32 | MANTBITS_SP32)) | HALFEXPBITS_SP32; |
92 |
PUT_BITS_SP32(uy, x); |
93 |
*m = x; |
94 |
} |
95 |
#endif /* USE_SPLITFLOAT */ |
96 |
|
97 |
|
98 |
#if defined(USE_SCALEDOUBLE_1) |
99 |
/* Scales the double x by 2.0**n. |
100 |
Assumes EMIN <= n <= EMAX, though this condition is not checked. */ |
101 |
static inline double scaleDouble_1(double x, int n) |
102 |
{ |
103 |
double t; |
104 |
/* Construct the number t = 2.0**n */ |
105 |
PUT_BITS_DP64(((long)n + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t); |
106 |
return x*t; |
107 |
} |
108 |
#endif /* USE_SCALEDOUBLE_1 */ |
109 |
|
110 |
|
111 |
#if defined(USE_SCALEDOUBLE_2) |
112 |
/* Scales the double x by 2.0**n. |
113 |
Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ |
114 |
static inline double scaleDouble_2(double x, int n) |
115 |
{ |
116 |
double t1, t2; |
117 |
int n1, n2; |
118 |
n1 = n / 2; |
119 |
n2 = n - n1; |
120 |
/* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ |
121 |
PUT_BITS_DP64(((long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); |
122 |
PUT_BITS_DP64(((long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); |
123 |
return (x*t1)*t2; |
124 |
} |
125 |
#endif /* USE_SCALEDOUBLE_2 */ |
126 |
|
127 |
|
128 |
#if defined(USE_SCALEDOUBLE_3) |
129 |
/* Scales the double x by 2.0**n. |
130 |
Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ |
131 |
static inline double scaleDouble_3(double x, int n) |
132 |
{ |
133 |
double t1, t2, t3; |
134 |
int n1, n2, n3; |
135 |
n1 = n / 3; |
136 |
n2 = (n - n1) / 2; |
137 |
n3 = n - n1 - n2; |
138 |
/* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ |
139 |
PUT_BITS_DP64(((long)n1 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t1); |
140 |
PUT_BITS_DP64(((long)n2 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t2); |
141 |
PUT_BITS_DP64(((long)n3 + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, t3); |
142 |
return ((x*t1)*t2)*t3; |
143 |
} |
144 |
#endif /* USE_SCALEDOUBLE_3 */ |
145 |
|
146 |
|
147 |
#if defined(USE_SCALEFLOAT_1) |
148 |
/* Scales the float x by 2.0**n. |
149 |
Assumes EMIN <= n <= EMAX, though this condition is not checked. */ |
150 |
static inline double scaleFloat_1(float x, int n) |
151 |
{ |
152 |
float t; |
153 |
/* Construct the number t = 2.0**n */ |
154 |
PUT_BITS_SP32((n + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t); |
155 |
return x*t; |
156 |
} |
157 |
#endif /* USE_SCALEFLOAT_1 */ |
158 |
|
159 |
|
160 |
#if defined(USE_SCALEFLOAT_2) |
161 |
/* Scales the float x by 2.0**n. |
162 |
Assumes 2*EMIN <= n <= 2*EMAX, though this condition is not checked. */ |
163 |
static inline float scaleFloat_2(float x, int n) |
164 |
{ |
165 |
float t1, t2; |
166 |
int n1, n2; |
167 |
n1 = n / 2; |
168 |
n2 = n - n1; |
169 |
/* Construct the numbers t1 = 2.0**n1 and t2 = 2.0**n2 */ |
170 |
PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); |
171 |
PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); |
172 |
return (x*t1)*t2; |
173 |
} |
174 |
#endif /* USE_SCALEFLOAT_2 */ |
175 |
|
176 |
|
177 |
#if defined(USE_SCALEFLOAT_3) |
178 |
/* Scales the float x by 2.0**n. |
179 |
Assumes 3*EMIN <= n <= 3*EMAX, though this condition is not checked. */ |
180 |
static inline double scaleFloat_3(float x, int n) |
181 |
{ |
182 |
float t1, t2, t3; |
183 |
int n1, n2, n3; |
184 |
n1 = n / 3; |
185 |
n2 = (n - n1) / 2; |
186 |
n3 = n - n1 - n2; |
187 |
/* Construct the numbers t1 = 2.0**n1, t2 = 2.0**n2 and t3 = 2.0**n3 */ |
188 |
PUT_BITS_SP32((n1 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t1); |
189 |
PUT_BITS_SP32((n2 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t2); |
190 |
PUT_BITS_SP32((n3 + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, t3); |
191 |
return ((x*t1)*t2)*t3; |
192 |
} |
193 |
#endif /* USE_SCALEFLOAT_3 */ |
194 |
|
195 |
#if defined(USE_SETPRECISIONDOUBLE) |
196 |
unsigned int setPrecisionDouble(void) |
197 |
{ |
198 |
unsigned int cw, cwold = 0; |
199 |
#if defined(WIN32) |
200 |
__asm fstcw cwold; |
201 |
cw = cwold & (~0x00000300); /* These two bits control rounding precision */ |
202 |
cw |= AMD_F_DOUBLE; |
203 |
__asm fldcw cw; |
204 |
#elif defined(linux) |
205 |
/* There is no precision control on Hammer */ |
206 |
#else |
207 |
/* Do nowt */ |
208 |
#endif |
209 |
return cwold; |
210 |
} |
211 |
#endif /* USE_SETPRECISIONDOUBLE */ |
212 |
|
213 |
#if defined(USE_RESTOREPRECISION) |
214 |
void restorePrecision(unsigned int cwold) |
215 |
{ |
216 |
#if defined(WIN32) |
217 |
__asm fldcw cwold; |
218 |
#elif defined(linux) |
219 |
/* There is no precision control on Hammer */ |
220 |
#else |
221 |
/* Do nowt */ |
222 |
#endif |
223 |
return; |
224 |
} |
225 |
#endif /* USE_RESTOREPRECISION */ |
226 |
|
227 |
|
228 |
#if defined(USE_CLEAR_FPSW_FLAGS) |
229 |
/* Clears floating-point status flags. The argument should be |
230 |
the bitwise or of the flags to be cleared, from the |
231 |
list above, e.g. |
232 |
clear_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID); |
233 |
*/ |
234 |
static inline void clear_fpsw_flags(int flags) |
235 |
{ |
236 |
#if defined(WIN32) |
237 |
fpenv_type fenv; |
238 |
/* Get the current floating-point environment */ |
239 |
__asm fnstenv fenv; |
240 |
fenv.status_word &= (~flags); |
241 |
/* Put the floating-point environment back */ |
242 |
__asm fldenv fenv; |
243 |
#elif defined(linux) |
244 |
unsigned int cw; |
245 |
/* Get the current floating-point control/status word */ |
246 |
asm volatile ("STMXCSR %0" : "=m" (cw)); |
247 |
cw &= (~flags); |
248 |
asm volatile ("LDMXCSR %0" : : "m" (cw)); |
249 |
#else |
250 |
#error Unknown machine |
251 |
#endif |
252 |
} |
253 |
#endif /* USE_CLEAR_FPSW_FLAGS */ |
254 |
|
255 |
|
256 |
#if defined(USE_RAISE_FPSW_FLAGS) |
257 |
/* Raises floating-point status flags. The argument should be |
258 |
the bitwise or of the flags to be raised, from the |
259 |
list above, e.g. |
260 |
raise_fpsw_flags(AMD_F_INEXACT | AMD_F_INVALID); |
261 |
*/ |
262 |
static inline void raise_fpsw_flags(int flags) |
263 |
{ |
264 |
#if defined(WIN32) |
265 |
fpenv_type fenv; |
266 |
/* Get the current floating-point environment */ |
267 |
__asm fnstenv fenv; |
268 |
fenv.status_word |= flags; |
269 |
/* Put the floating-point environment back */ |
270 |
__asm fldenv fenv; |
271 |
#elif defined(linux) |
272 |
unsigned int cw; |
273 |
/* Get the current floating-point control/status word */ |
274 |
asm volatile ("STMXCSR %0" : "=m" (cw)); |
275 |
cw |= flags; |
276 |
asm volatile ("LDMXCSR %0" : : "m" (cw)); |
277 |
#else |
278 |
#error Unknown machine |
279 |
#endif |
280 |
} |
281 |
#endif /* USE_RAISE_FPSW_FLAGS */ |
282 |
|
283 |
|
284 |
#if defined(USE_GET_FPSW_INLINE) |
285 |
/* Return the current floating-point status word */ |
286 |
static inline unsigned int get_fpsw_inline(void) |
287 |
{ |
288 |
#if defined(WIN32) |
289 |
unsigned short sw; |
290 |
__asm fstsw sw; |
291 |
return (unsigned int)sw; |
292 |
#elif defined(linux) |
293 |
unsigned int sw; |
294 |
asm volatile ("STMXCSR %0" : "=m" (sw)); |
295 |
return sw; |
296 |
#else |
297 |
#error Unknown machine |
298 |
#endif |
299 |
} |
300 |
#endif /* USE_GET_FPSW_INLINE */ |
301 |
|
302 |
#if defined(USE_SET_FPSW_INLINE) |
303 |
/* Set the floating-point status word */ |
304 |
static inline void set_fpsw_inline(unsigned int sw) |
305 |
{ |
306 |
#if defined(WIN32) |
307 |
fpenv_type fenv; |
308 |
/* Get the current floating-point environment */ |
309 |
__asm fnstenv fenv; |
310 |
/* Set the status word to sw */ |
311 |
fenv.status_word = (unsigned short)sw; |
312 |
/* Put the floating-point environment back */ |
313 |
__asm fldenv fenv; |
314 |
#elif defined(linux) |
315 |
/* Set the current floating-point control/status word */ |
316 |
asm volatile ("LDMXCSR %0" : : "m" (sw)); |
317 |
#else |
318 |
#error Unknown machine |
319 |
#endif |
320 |
} |
321 |
#endif /* USE_SET_FPSW_INLINE */ |
322 |
|
323 |
#if defined(USE_CLEAR_FPSW_INLINE) |
324 |
/* Clear all exceptions from the floating-point status word */ |
325 |
static inline void clear_fpsw_inline(void) |
326 |
{ |
327 |
#if defined(WIN32) |
328 |
fpenv_type fenv; |
329 |
/* Get the current floating-point environment */ |
330 |
__asm fnstenv fenv; |
331 |
/* Set the status word to 0 */ |
332 |
fenv.status_word = 0; |
333 |
/* Put the floating-point environment back */ |
334 |
__asm fldenv fenv; |
335 |
#elif defined(linux) |
336 |
unsigned int cw; |
337 |
/* Get the current floating-point control/status word */ |
338 |
asm volatile ("STMXCSR %0" : "=m" (cw)); |
339 |
cw &= ~(AMD_F_INEXACT | AMD_F_UNDERFLOW | AMD_F_OVERFLOW | |
340 |
AMD_F_DIVBYZERO | AMD_F_INVALID); |
341 |
asm volatile ("LDMXCSR %0" : : "m" (cw)); |
342 |
#else |
343 |
#error Unknown machine |
344 |
#endif |
345 |
} |
346 |
#endif /* USE_CLEAR_FPSW_INLINE */ |
347 |
|
348 |
|
349 |
#if defined(USE_VAL_WITH_FLAGS) |
350 |
/* Returns a double value after raising the given flags, |
351 |
e.g. val_with_flags(AMD_F_INEXACT); |
352 |
*/ |
353 |
static inline double val_with_flags(double val, int flags) |
354 |
{ |
355 |
raise_fpsw_flags(flags); |
356 |
return val; |
357 |
} |
358 |
#endif /* USE_VAL_WITH_FLAGS */ |
359 |
|
360 |
#if defined(USE_VALF_WITH_FLAGS) |
361 |
/* Returns a float value after raising the given flags, |
362 |
e.g. valf_with_flags(AMD_F_INEXACT); |
363 |
*/ |
364 |
static inline float valf_with_flags(float val, int flags) |
365 |
{ |
366 |
raise_fpsw_flags(flags); |
367 |
return val; |
368 |
} |
369 |
#endif /* USE_VALF_WITH_FLAGS */ |
370 |
|
371 |
|
372 |
#if defined(USE_ZERO_WITH_FLAGS) |
373 |
/* Returns a double +zero after raising the given flags, |
374 |
e.g. zero_with_flags(AMD_F_INEXACT | AMD_F_INVALID); |
375 |
*/ |
376 |
static inline double zero_with_flags(int flags) |
377 |
{ |
378 |
raise_fpsw_flags(flags); |
379 |
return 0.0; |
380 |
} |
381 |
#endif /* USE_ZERO_WITH_FLAGS */ |
382 |
|
383 |
|
384 |
#if defined(USE_ZEROF_WITH_FLAGS) |
385 |
/* Returns a float +zero after raising the given flags, |
386 |
e.g. zerof_with_flags(AMD_F_INEXACT | AMD_F_INVALID); |
387 |
*/ |
388 |
static inline float zerof_with_flags(int flags) |
389 |
{ |
390 |
raise_fpsw_flags(flags); |
391 |
return 0.0F; |
392 |
} |
393 |
#endif /* USE_ZEROF_WITH_FLAGS */ |
394 |
|
395 |
|
396 |
#if defined(USE_NAN_WITH_FLAGS) |
397 |
/* Returns a double quiet +nan after raising the given flags, |
398 |
e.g. nan_with_flags(AMD_F_INVALID); |
399 |
*/ |
400 |
static inline double nan_with_flags(int flags) |
401 |
{ |
402 |
double z; |
403 |
raise_fpsw_flags(flags); |
404 |
PUT_BITS_DP64(0x7ff8000000000000, z); |
405 |
return z; |
406 |
} |
407 |
#endif /* USE_NAN_WITH_FLAGS */ |
408 |
|
409 |
#if defined(USE_NANF_WITH_FLAGS) |
410 |
/* Returns a float quiet +nan after raising the given flags, |
411 |
e.g. nanf_with_flags(AMD_F_INVALID); |
412 |
*/ |
413 |
static inline float nanf_with_flags(int flags) |
414 |
{ |
415 |
float z; |
416 |
raise_fpsw_flags(flags); |
417 |
PUT_BITS_SP32(0x7fc00000, z); |
418 |
return z; |
419 |
} |
420 |
#endif /* USE_NANF_WITH_FLAGS */ |
421 |
|
422 |
|
423 |
#ifdef USE_INFINITY_WITH_FLAGS |
424 |
/* Returns a positive double infinity after raising the given flags, |
425 |
e.g. infinity_with_flags(AMD_F_OVERFLOW); |
426 |
*/ |
427 |
static inline double infinity_with_flags(int flags) |
428 |
{ |
429 |
double z; |
430 |
raise_fpsw_flags(flags); |
431 |
PUT_BITS_DP64((unsigned long)(BIASEDEMAX_DP64 + 1) << EXPSHIFTBITS_DP64, z); |
432 |
return z; |
433 |
} |
434 |
#endif /* USE_INFINITY_WITH_FLAGS */ |
435 |
|
436 |
#ifdef USE_INFINITYF_WITH_FLAGS |
437 |
/* Returns a positive float infinity after raising the given flags, |
438 |
e.g. infinityf_with_flags(AMD_F_OVERFLOW); |
439 |
*/ |
440 |
static inline float infinityf_with_flags(int flags) |
441 |
{ |
442 |
float z; |
443 |
raise_fpsw_flags(flags); |
444 |
PUT_BITS_SP32((BIASEDEMAX_SP32 + 1) << EXPSHIFTBITS_SP32, z); |
445 |
return z; |
446 |
} |
447 |
#endif /* USE_INFINITYF_WITH_FLAGS */ |
448 |
|
449 |
|
450 |
#if defined(USE_SPLITEXP) |
451 |
/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). |
452 |
Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments |
453 |
abs(x) > large/(ln(base)) (where large is the largest representable |
454 |
floating point number) should be handled separately instead of calling |
455 |
this function. This function is called by exp_amd, exp2_amd, exp10_amd, |
456 |
cosh_amd and sinh_amd. */ |
457 |
static inline void splitexp(double x, double logbase, |
458 |
double thirtytwo_by_logbaseof2, |
459 |
double logbaseof2_by_32_lead, |
460 |
double logbaseof2_by_32_trail, |
461 |
int *m, double *z1, double *z2) |
462 |
{ |
463 |
double q, r, r1, r2, f1, f2; |
464 |
int n, j; |
465 |
|
466 |
/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain |
467 |
leading and trailing parts respectively of precomputed |
468 |
values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. |
469 |
two_to_jby32_lead_table contains the first 25 bits of precision, |
470 |
and two_to_jby32_trail_table contains a further 53 bits precision. */ |
471 |
|
472 |
static const double two_to_jby32_lead_table[32] = { |
473 |
1.00000000000000000000e+00, /* 0x3ff0000000000000 */ |
474 |
1.02189713716506958008e+00, /* 0x3ff059b0d0000000 */ |
475 |
1.04427373409271240234e+00, /* 0x3ff0b55860000000 */ |
476 |
1.06714040040969848633e+00, /* 0x3ff11301d0000000 */ |
477 |
1.09050768613815307617e+00, /* 0x3ff172b830000000 */ |
478 |
1.11438673734664916992e+00, /* 0x3ff1d48730000000 */ |
479 |
1.13878858089447021484e+00, /* 0x3ff2387a60000000 */ |
480 |
1.16372483968734741211e+00, /* 0x3ff29e9df0000000 */ |
481 |
1.18920707702636718750e+00, /* 0x3ff306fe00000000 */ |
482 |
1.21524733304977416992e+00, /* 0x3ff371a730000000 */ |
483 |
1.24185776710510253906e+00, /* 0x3ff3dea640000000 */ |
484 |
1.26905095577239990234e+00, /* 0x3ff44e0860000000 */ |
485 |
1.29683953523635864258e+00, /* 0x3ff4bfdad0000000 */ |
486 |
1.32523661851882934570e+00, /* 0x3ff5342b50000000 */ |
487 |
1.35425549745559692383e+00, /* 0x3ff5ab07d0000000 */ |
488 |
1.38390988111495971680e+00, /* 0x3ff6247eb0000000 */ |
489 |
1.41421353816986083984e+00, /* 0x3ff6a09e60000000 */ |
490 |
1.44518077373504638672e+00, /* 0x3ff71f75e0000000 */ |
491 |
1.47682613134384155273e+00, /* 0x3ff7a11470000000 */ |
492 |
1.50916439294815063477e+00, /* 0x3ff8258990000000 */ |
493 |
1.54221081733703613281e+00, /* 0x3ff8ace540000000 */ |
494 |
1.57598084211349487305e+00, /* 0x3ff93737b0000000 */ |
495 |
1.61049032211303710938e+00, /* 0x3ff9c49180000000 */ |
496 |
1.64575546979904174805e+00, /* 0x3ffa5503b0000000 */ |
497 |
1.68179279565811157227e+00, /* 0x3ffae89f90000000 */ |
498 |
1.71861928701400756836e+00, /* 0x3ffb7f76f0000000 */ |
499 |
1.75625211000442504883e+00, /* 0x3ffc199bd0000000 */ |
500 |
1.79470902681350708008e+00, /* 0x3ffcb720d0000000 */ |
501 |
1.83400803804397583008e+00, /* 0x3ffd5818d0000000 */ |
502 |
1.87416762113571166992e+00, /* 0x3ffdfc9730000000 */ |
503 |
1.91520655155181884766e+00, /* 0x3ffea4afa0000000 */ |
504 |
1.95714408159255981445e+00}; /* 0x3fff507650000000 */ |
505 |
|
506 |
static const double two_to_jby32_trail_table[32] = { |
507 |
0.00000000000000000000e+00, /* 0x0000000000000000 */ |
508 |
1.14890470981563546737e-08, /* 0x3e48ac2ba1d73e2a */ |
509 |
4.83347014379782142328e-08, /* 0x3e69f3121ec53172 */ |
510 |
2.67125131841396124714e-10, /* 0x3df25b50a4ebbf1b */ |
511 |
4.65271045830351350190e-08, /* 0x3e68faa2f5b9bef9 */ |
512 |
5.24924336638693782574e-09, /* 0x3e368b9aa7805b80 */ |
513 |
5.38622214388600821910e-08, /* 0x3e6ceac470cd83f6 */ |
514 |
1.90902301017041969782e-08, /* 0x3e547f7b84b09745 */ |
515 |
3.79763538792174980894e-08, /* 0x3e64636e2a5bd1ab */ |
516 |
2.69306947081946450986e-08, /* 0x3e5ceaa72a9c5154 */ |
517 |
4.49683815095311756138e-08, /* 0x3e682468446b6824 */ |
518 |
1.41933332021066904914e-09, /* 0x3e18624b40c4dbd0 */ |
519 |
1.94146510233556266402e-08, /* 0x3e54d8a89c750e5e */ |
520 |
2.46409119489264118569e-08, /* 0x3e5a753e077c2a0f */ |
521 |
4.94812958044698886494e-08, /* 0x3e6a90a852b19260 */ |
522 |
8.48872238075784476136e-10, /* 0x3e0d2ac258f87d03 */ |
523 |
2.42032342089579394887e-08, /* 0x3e59fcef32422cbf */ |
524 |
3.32420002333182569170e-08, /* 0x3e61d8bee7ba46e2 */ |
525 |
1.45956577586525322754e-08, /* 0x3e4f580c36bea881 */ |
526 |
3.46452721050003920866e-08, /* 0x3e62999c25159f11 */ |
527 |
8.07090469079979051284e-09, /* 0x3e415506dadd3e2a */ |
528 |
2.99439161340839520436e-09, /* 0x3e29b8bc9e8a0388 */ |
529 |
9.83621719880452147153e-09, /* 0x3e451f8480e3e236 */ |
530 |
8.35492309647188080486e-09, /* 0x3e41f12ae45a1224 */ |
531 |
3.48493175137966283582e-08, /* 0x3e62b5a75abd0e6a */ |
532 |
1.11084703472699692902e-08, /* 0x3e47daf237553d84 */ |
533 |
5.03688744342840346564e-08, /* 0x3e6b0aa538444196 */ |
534 |
4.81896001063495806249e-08, /* 0x3e69df20d22a0798 */ |
535 |
4.83653666334089557746e-08, /* 0x3e69f7490e4bb40b */ |
536 |
1.29745882314081237628e-08, /* 0x3e4bdcdaf5cb4656 */ |
537 |
9.84532844621636118964e-09, /* 0x3e452486cc2c7b9d */ |
538 |
4.25828404545651943883e-08}; /* 0x3e66dc8a80ce9f09 */ |
539 |
|
540 |
/* |
541 |
Step 1. Reduce the argument. |
542 |
|
543 |
To perform argument reduction, we find the integer n such that |
544 |
x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. |
545 |
n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and |
546 |
remainder by x - n*logbaseof2/32. The calculation of n is |
547 |
straightforward whereas the computation of x - n*logbaseof2/32 |
548 |
must be carried out carefully. |
549 |
logbaseof2/32 is so represented in two pieces that |
550 |
(1) logbaseof2/32 is known to extra precision, (2) the product |
551 |
of n and the leading piece is a model number and is hence |
552 |
calculated without error, and (3) the subtraction of the value |
553 |
obtained in (2) from x is a model number and is hence again |
554 |
obtained without error. |
555 |
*/ |
556 |
|
557 |
r = x * thirtytwo_by_logbaseof2; |
558 |
/* Set n = nearest integer to r */ |
559 |
/* This is faster on Hammer */ |
560 |
if (r > 0) |
561 |
n = (int)(r + 0.5); |
562 |
else |
563 |
n = (int)(r - 0.5); |
564 |
|
565 |
r1 = x - n * logbaseof2_by_32_lead; |
566 |
r2 = - n * logbaseof2_by_32_trail; |
567 |
|
568 |
/* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ |
569 |
/* j = n % 32; |
570 |
if (j < 0) j += 32; */ |
571 |
j = n & 0x0000001f; |
572 |
|
573 |
f1 = two_to_jby32_lead_table[j]; |
574 |
f2 = two_to_jby32_trail_table[j]; |
575 |
|
576 |
*m = (n - j) / 32; |
577 |
|
578 |
/* Step 2. The following is the core approximation. We approximate |
579 |
exp(r1+r2)-1 by a polynomial. */ |
580 |
|
581 |
r1 *= logbase; r2 *= logbase; |
582 |
|
583 |
r = r1 + r2; |
584 |
q = r1 + (r2 + |
585 |
r*r*( 5.00000000000000008883e-01 + |
586 |
r*( 1.66666666665260878863e-01 + |
587 |
r*( 4.16666666662260795726e-02 + |
588 |
r*( 8.33336798434219616221e-03 + |
589 |
r*( 1.38889490863777199667e-03 )))))); |
590 |
|
591 |
/* Step 3. Function value reconstruction. |
592 |
We now reconstruct the exponential of the input argument |
593 |
so that exp(x) = 2**m * (z1 + z2). |
594 |
The order of the computation below must be strictly observed. */ |
595 |
|
596 |
*z1 = f1; |
597 |
*z2 = f2 + ((f1 + f2) * q); |
598 |
} |
599 |
#endif /* USE_SPLITEXP */ |
600 |
|
601 |
|
602 |
#if defined(USE_SPLITEXPF) |
603 |
/* Compute the values m, z1, and z2 such that base**x = 2**m * (z1 + z2). |
604 |
Small arguments abs(x) < 1/(16*ln(base)) and extreme arguments |
605 |
abs(x) > large/(ln(base)) (where large is the largest representable |
606 |
floating point number) should be handled separately instead of calling |
607 |
this function. This function is called by exp_amd, exp2_amd, exp10_amd, |
608 |
cosh_amd and sinh_amd. */ |
609 |
static inline void splitexpf(float x, float logbase, |
610 |
float thirtytwo_by_logbaseof2, |
611 |
float logbaseof2_by_32_lead, |
612 |
float logbaseof2_by_32_trail, |
613 |
int *m, float *z1, float *z2) |
614 |
{ |
615 |
float q, r, r1, r2, f1, f2; |
616 |
int n, j; |
617 |
|
618 |
/* Arrays two_to_jby32_lead_table and two_to_jby32_trail_table contain |
619 |
leading and trailing parts respectively of precomputed |
620 |
values of pow(2.0,j/32.0), for j = 0, 1, ..., 31. |
621 |
two_to_jby32_lead_table contains the first 10 bits of precision, |
622 |
and two_to_jby32_trail_table contains a further 24 bits precision. */ |
623 |
|
624 |
static const float two_to_jby32_lead_table[32] = { |
625 |
1.0000000000E+00F, /* 0x3F800000 */ |
626 |
1.0214843750E+00F, /* 0x3F82C000 */ |
627 |
1.0429687500E+00F, /* 0x3F858000 */ |
628 |
1.0664062500E+00F, /* 0x3F888000 */ |
629 |
1.0898437500E+00F, /* 0x3F8B8000 */ |
630 |
1.1132812500E+00F, /* 0x3F8E8000 */ |
631 |
1.1386718750E+00F, /* 0x3F91C000 */ |
632 |
1.1621093750E+00F, /* 0x3F94C000 */ |
633 |
1.1875000000E+00F, /* 0x3F980000 */ |
634 |
1.2148437500E+00F, /* 0x3F9B8000 */ |
635 |
1.2402343750E+00F, /* 0x3F9EC000 */ |
636 |
1.2675781250E+00F, /* 0x3FA24000 */ |
637 |
1.2949218750E+00F, /* 0x3FA5C000 */ |
638 |
1.3242187500E+00F, /* 0x3FA98000 */ |
639 |
1.3535156250E+00F, /* 0x3FAD4000 */ |
640 |
1.3828125000E+00F, /* 0x3FB10000 */ |
641 |
1.4140625000E+00F, /* 0x3FB50000 */ |
642 |
1.4433593750E+00F, /* 0x3FB8C000 */ |
643 |
1.4765625000E+00F, /* 0x3FBD0000 */ |
644 |
1.5078125000E+00F, /* 0x3FC10000 */ |
645 |
1.5410156250E+00F, /* 0x3FC54000 */ |
646 |
1.5742187500E+00F, /* 0x3FC98000 */ |
647 |
1.6093750000E+00F, /* 0x3FCE0000 */ |
648 |
1.6445312500E+00F, /* 0x3FD28000 */ |
649 |
1.6816406250E+00F, /* 0x3FD74000 */ |
650 |
1.7167968750E+00F, /* 0x3FDBC000 */ |
651 |
1.7558593750E+00F, /* 0x3FE0C000 */ |
652 |
1.7929687500E+00F, /* 0x3FE58000 */ |
653 |
1.8339843750E+00F, /* 0x3FEAC000 */ |
654 |
1.8730468750E+00F, /* 0x3FEFC000 */ |
655 |
1.9140625000E+00F, /* 0x3FF50000 */ |
656 |
1.9570312500E+00F}; /* 0x3FFA8000 */ |
657 |
|
658 |
static const float two_to_jby32_trail_table[32] = { |
659 |
0.0000000000E+00F, /* 0x00000000 */ |
660 |
4.1277357377E-04F, /* 0x39D86988 */ |
661 |
1.3050324051E-03F, /* 0x3AAB0D9F */ |
662 |
7.3415064253E-04F, /* 0x3A407404 */ |
663 |
6.6398258787E-04F, /* 0x3A2E0F1E */ |
664 |
1.1054925853E-03F, /* 0x3A90E62D */ |
665 |
1.1675967835E-04F, /* 0x38F4DCE0 */ |
666 |
1.6154836630E-03F, /* 0x3AD3BEA3 */ |
667 |
1.7071149778E-03F, /* 0x3ADFC146 */ |
668 |
4.0360994171E-04F, /* 0x39D39B9C */ |
669 |
1.6234370414E-03F, /* 0x3AD4C982 */ |
670 |
1.4728321694E-03F, /* 0x3AC10C0C */ |
671 |
1.9176795613E-03F, /* 0x3AFB5AA6 */ |
672 |
1.0178930825E-03F, /* 0x3A856AD3 */ |
673 |
7.3992193211E-04F, /* 0x3A41F752 */ |
674 |
1.0973819299E-03F, /* 0x3A8FD607 */ |
675 |
1.5106226783E-04F, /* 0x391E6678 */ |
676 |
1.8214319134E-03F, /* 0x3AEEBD1D */ |
677 |
2.6364589576E-04F, /* 0x398A39F4 */ |
678 |
1.3519275235E-03F, /* 0x3AB13329 */ |
679 |
1.1952003697E-03F, /* 0x3A9CA845 */ |
680 |
1.7620950239E-03F, /* 0x3AE6F619 */ |
681 |
1.1153318919E-03F, /* 0x3A923054 */ |
682 |
1.2242280645E-03F, /* 0x3AA07647 */ |
683 |
1.5220546629E-04F, /* 0x391F9958 */ |
684 |
1.8224230735E-03F, /* 0x3AEEDE5F */ |
685 |
3.9278529584E-04F, /* 0x39CDEEC0 */ |
686 |
1.7403248930E-03F, /* 0x3AE41B9D */ |
687 |
2.3711356334E-05F, /* 0x37C6E7C0 */ |
688 |
1.1207590578E-03F, /* 0x3A92E66F */ |
689 |
1.1440613307E-03F, /* 0x3A95F454 */ |
690 |
1.1287408415E-04F}; /* 0x38ECB6D0 */ |
691 |
|
692 |
/* |
693 |
Step 1. Reduce the argument. |
694 |
|
695 |
To perform argument reduction, we find the integer n such that |
696 |
x = n * logbaseof2/32 + remainder, |remainder| <= logbaseof2/64. |
697 |
n is defined by round-to-nearest-integer( x*32/logbaseof2 ) and |
698 |
remainder by x - n*logbaseof2/32. The calculation of n is |
699 |
straightforward whereas the computation of x - n*logbaseof2/32 |
700 |
must be carried out carefully. |
701 |
logbaseof2/32 is so represented in two pieces that |
702 |
(1) logbaseof2/32 is known to extra precision, (2) the product |
703 |
of n and the leading piece is a model number and is hence |
704 |
calculated without error, and (3) the subtraction of the value |
705 |
obtained in (2) from x is a model number and is hence again |
706 |
obtained without error. |
707 |
*/ |
708 |
|
709 |
r = x * thirtytwo_by_logbaseof2; |
710 |
/* Set n = nearest integer to r */ |
711 |
/* This is faster on Hammer */ |
712 |
if (r > 0) |
713 |
n = (int)(r + 0.5F); |
714 |
else |
715 |
n = (int)(r - 0.5F); |
716 |
|
717 |
r1 = x - n * logbaseof2_by_32_lead; |
718 |
r2 = - n * logbaseof2_by_32_trail; |
719 |
|
720 |
/* Set j = n mod 32: 5 mod 32 = 5, -5 mod 32 = 27, etc. */ |
721 |
/* j = n % 32; |
722 |
if (j < 0) j += 32; */ |
723 |
j = n & 0x0000001f; |
724 |
|
725 |
f1 = two_to_jby32_lead_table[j]; |
726 |
f2 = two_to_jby32_trail_table[j]; |
727 |
|
728 |
*m = (n - j) / 32; |
729 |
|
730 |
/* Step 2. The following is the core approximation. We approximate |
731 |
exp(r1+r2)-1 by a polynomial. */ |
732 |
|
733 |
r1 *= logbase; r2 *= logbase; |
734 |
|
735 |
r = r1 + r2; |
736 |
q = r1 + (r2 + |
737 |
r*r*( 5.00000000000000008883e-01F + |
738 |
r*( 1.66666666665260878863e-01F ))); |
739 |
|
740 |
/* Step 3. Function value reconstruction. |
741 |
We now reconstruct the exponential of the input argument |
742 |
so that exp(x) = 2**m * (z1 + z2). |
743 |
The order of the computation below must be strictly observed. */ |
744 |
|
745 |
*z1 = f1; |
746 |
*z2 = f2 + ((f1 + f2) * q); |
747 |
} |
748 |
#endif /* SPLITEXPF */ |
749 |
|
750 |
|
751 |
#if defined(USE_SCALEUPDOUBLE1024) |
752 |
/* Scales up a double (normal or denormal) whose bit pattern is given |
753 |
as ux by 2**1024. There are no checks that the input number is |
754 |
scalable by that amount. */ |
755 |
static inline void scaleUpDouble1024(unsigned long ux, unsigned long *ur) |
756 |
{ |
757 |
unsigned long uy; |
758 |
double y; |
759 |
|
760 |
if ((ux & EXPBITS_DP64) == 0) |
761 |
{ |
762 |
/* ux is denormalised */ |
763 |
PUT_BITS_DP64(ux | 0x4010000000000000, y); |
764 |
if (ux & SIGNBIT_DP64) |
765 |
y += 4.0; |
766 |
else |
767 |
y -= 4.0; |
768 |
GET_BITS_DP64(y, uy); |
769 |
} |
770 |
else |
771 |
/* ux is normal */ |
772 |
uy = ux + 0x4000000000000000; |
773 |
|
774 |
*ur = uy; |
775 |
return; |
776 |
} |
777 |
|
778 |
#endif /* SCALEUPDOUBLE1024 */ |
779 |
|
780 |
|
781 |
#if defined(USE_SCALEDOWNDOUBLE) |
782 |
/* Scales down a double whose bit pattern is given as ux by 2**k. |
783 |
There are no checks that the input number is scalable by that amount. */ |
784 |
static inline void scaleDownDouble(unsigned long ux, int k, |
785 |
unsigned long *ur) |
786 |
{ |
787 |
unsigned long uy, uk, ax, xsign; |
788 |
int n, shift; |
789 |
xsign = ux & SIGNBIT_DP64; |
790 |
ax = ux & ~SIGNBIT_DP64; |
791 |
n = ((ax & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - k; |
792 |
if (n > 0) |
793 |
{ |
794 |
uk = (unsigned long)n << EXPSHIFTBITS_DP64; |
795 |
uy = (ax & ~EXPBITS_DP64) | uk; |
796 |
} |
797 |
else |
798 |
{ |
799 |
uy = (ax & ~EXPBITS_DP64) | 0x0010000000000000; |
800 |
shift = (1 - n); |
801 |
if (shift > MANTLENGTH_DP64 + 1) |
802 |
/* Sigh. Shifting works mod 64 so be careful not to shift too much */ |
803 |
uy = 0; |
804 |
else |
805 |
{ |
806 |
/* Make sure we round the result */ |
807 |
uy >>= shift - 1; |
808 |
uy = (uy >> 1) + (uy & 1); |
809 |
} |
810 |
} |
811 |
*ur = uy | xsign; |
812 |
} |
813 |
|
814 |
#endif /* SCALEDOWNDOUBLE */ |
815 |
|
816 |
|
817 |
#if defined(USE_SCALEUPFLOAT128) |
818 |
/* Scales up a float (normal or denormal) whose bit pattern is given |
819 |
as ux by 2**128. There are no checks that the input number is |
820 |
scalable by that amount. */ |
821 |
static inline void scaleUpFloat128(unsigned int ux, unsigned int *ur) |
822 |
{ |
823 |
unsigned int uy; |
824 |
float y; |
825 |
|
826 |
if ((ux & EXPBITS_SP32) == 0) |
827 |
{ |
828 |
/* ux is denormalised */ |
829 |
PUT_BITS_SP32(ux | 0x40800000, y); |
830 |
/* Compensate for the implicit bit just added */ |
831 |
if (ux & SIGNBIT_SP32) |
832 |
y += 4.0F; |
833 |
else |
834 |
y -= 4.0F; |
835 |
GET_BITS_SP32(y, uy); |
836 |
} |
837 |
else |
838 |
/* ux is normal */ |
839 |
uy = ux + 0x40000000; |
840 |
*ur = uy; |
841 |
} |
842 |
#endif /* SCALEUPFLOAT128 */ |
843 |
|
844 |
|
845 |
#if defined(USE_SCALEDOWNFLOAT) |
846 |
/* Scales down a float whose bit pattern is given as ux by 2**k. |
847 |
There are no checks that the input number is scalable by that amount. */ |
848 |
static inline void scaleDownFloat(unsigned int ux, int k, |
849 |
unsigned int *ur) |
850 |
{ |
851 |
unsigned int uy, uk, ax, xsign; |
852 |
int n, shift; |
853 |
|
854 |
xsign = ux & SIGNBIT_SP32; |
855 |
ax = ux & ~SIGNBIT_SP32; |
856 |
n = ((ax & EXPBITS_SP32) >> EXPSHIFTBITS_SP32) - k; |
857 |
if (n > 0) |
858 |
{ |
859 |
uk = (unsigned int)n << EXPSHIFTBITS_SP32; |
860 |
uy = (ax & ~EXPBITS_SP32) | uk; |
861 |
} |
862 |
else |
863 |
{ |
864 |
uy = (ax & ~EXPBITS_SP32) | 0x00800000; |
865 |
shift = (1 - n); |
866 |
if (shift > MANTLENGTH_SP32 + 1) |
867 |
/* Sigh. Shifting works mod 32 so be careful not to shift too much */ |
868 |
uy = 0; |
869 |
else |
870 |
{ |
871 |
/* Make sure we round the result */ |
872 |
uy >>= shift - 1; |
873 |
uy = (uy >> 1) + (uy & 1); |
874 |
} |
875 |
} |
876 |
*ur = uy | xsign; |
877 |
} |
878 |
#endif /* SCALEDOWNFLOAT */ |
879 |
|
880 |
|
881 |
#if defined(USE_SQRT_AMD_INLINE) |
882 |
static inline double sqrt_amd_inline(double x) |
883 |
{ |
884 |
/* |
885 |
Computes the square root of x. |
886 |
|
887 |
The calculation is carried out in three steps. |
888 |
|
889 |
Step 1. Reduction. |
890 |
The input argument is scaled to the interval [1, 4) by |
891 |
computing |
892 |
x = 2^e * y, where y in [1,4). |
893 |
Furthermore y is decomposed as y = c + t where |
894 |
c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. |
895 |
|
896 |
Step 2. Approximation. |
897 |
An approximation q = sqrt(1 + (t/c)) - 1 is obtained |
898 |
from a basic series expansion using precomputed values |
899 |
stored in rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl. |
900 |
|
901 |
Step 3. Reconstruction. |
902 |
The value of sqrt(x) is reconstructed via |
903 |
sqrt(x) = 2^(e/2) * sqrt(y) |
904 |
= 2^(e/2) * sqrt(c) * sqrt(y/c) |
905 |
= 2^(e/2) * sqrt(c) * sqrt(1 + t/c) |
906 |
= 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] |
907 |
*/ |
908 |
|
909 |
unsigned long ux, ax, u; |
910 |
double r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; |
911 |
int e, denorm = 0, index; |
912 |
|
913 |
/* Arrays rt_jby32_lead_table_dbl and rt_jby32_trail_table_dbl contain |
914 |
leading and trailing parts respectively of precomputed |
915 |
values of sqrt(j/32), for j = 32, 33, ..., 128. |
916 |
rt_jby32_lead_table_dbl contains the first 21 bits of precision, |
917 |
and rt_jby32_trail_table_dbl contains a further 53 bits precision. */ |
918 |
|
919 |
static const double rt_jby32_lead_table_dbl[97] = { |
920 |
1.00000000000000000000e+00, /* 0x3ff0000000000000 */ |
921 |
1.01550388336181640625e+00, /* 0x3ff03f8100000000 */ |
922 |
1.03077602386474609375e+00, /* 0x3ff07e0f00000000 */ |
923 |
1.04582500457763671875e+00, /* 0x3ff0bbb300000000 */ |
924 |
1.06065940856933593750e+00, /* 0x3ff0f87600000000 */ |
925 |
1.07528972625732421875e+00, /* 0x3ff1346300000000 */ |
926 |
1.08972454071044921875e+00, /* 0x3ff16f8300000000 */ |
927 |
1.10396957397460937500e+00, /* 0x3ff1a9dc00000000 */ |
928 |
1.11803340911865234375e+00, /* 0x3ff1e37700000000 */ |
929 |
1.13192272186279296875e+00, /* 0x3ff21c5b00000000 */ |
930 |
1.14564323425292968750e+00, /* 0x3ff2548e00000000 */ |
931 |
1.15920162200927734375e+00, /* 0x3ff28c1700000000 */ |
932 |
1.17260360717773437500e+00, /* 0x3ff2c2fc00000000 */ |
933 |
1.18585395812988281250e+00, /* 0x3ff2f94200000000 */ |
934 |
1.19895744323730468750e+00, /* 0x3ff32eee00000000 */ |
935 |
1.21191978454589843750e+00, /* 0x3ff3640600000000 */ |
936 |
1.22474479675292968750e+00, /* 0x3ff3988e00000000 */ |
937 |
1.23743629455566406250e+00, /* 0x3ff3cc8a00000000 */ |
938 |
1.25000000000000000000e+00, /* 0x3ff4000000000000 */ |
939 |
1.26243782043457031250e+00, /* 0x3ff432f200000000 */ |
940 |
1.27475452423095703125e+00, /* 0x3ff4656500000000 */ |
941 |
1.28695297241210937500e+00, /* 0x3ff4975c00000000 */ |
942 |
1.29903793334960937500e+00, /* 0x3ff4c8dc00000000 */ |
943 |
1.31101036071777343750e+00, /* 0x3ff4f9e600000000 */ |
944 |
1.32287502288818359375e+00, /* 0x3ff52a7f00000000 */ |
945 |
1.33463478088378906250e+00, /* 0x3ff55aaa00000000 */ |
946 |
1.34629058837890625000e+00, /* 0x3ff58a6800000000 */ |
947 |
1.35784721374511718750e+00, /* 0x3ff5b9be00000000 */ |
948 |
1.36930561065673828125e+00, /* 0x3ff5e8ad00000000 */ |
949 |
1.38066959381103515625e+00, /* 0x3ff6173900000000 */ |
950 |
1.39194107055664062500e+00, /* 0x3ff6456400000000 */ |
951 |
1.40312099456787109375e+00, /* 0x3ff6732f00000000 */ |
952 |
1.41421318054199218750e+00, /* 0x3ff6a09e00000000 */ |
953 |
1.42521858215332031250e+00, /* 0x3ff6cdb200000000 */ |
954 |
1.43614006042480468750e+00, /* 0x3ff6fa6e00000000 */ |
955 |
1.44697952270507812500e+00, /* 0x3ff726d400000000 */ |
956 |
1.45773792266845703125e+00, /* 0x3ff752e500000000 */ |
957 |
1.46841716766357421875e+00, /* 0x3ff77ea300000000 */ |
958 |
1.47901916503906250000e+00, /* 0x3ff7aa1000000000 */ |
959 |
1.48954677581787109375e+00, /* 0x3ff7d52f00000000 */ |
960 |
1.50000000000000000000e+00, /* 0x3ff8000000000000 */ |
961 |
1.51038074493408203125e+00, /* 0x3ff82a8500000000 */ |
962 |
1.52068996429443359375e+00, /* 0x3ff854bf00000000 */ |
963 |
1.53093051910400390625e+00, /* 0x3ff87eb100000000 */ |
964 |
1.54110336303710937500e+00, /* 0x3ff8a85c00000000 */ |
965 |
1.55120849609375000000e+00, /* 0x3ff8d1c000000000 */ |
966 |
1.56124877929687500000e+00, /* 0x3ff8fae000000000 */ |
967 |
1.57122516632080078125e+00, /* 0x3ff923bd00000000 */ |
968 |
1.58113861083984375000e+00, /* 0x3ff94c5800000000 */ |
969 |
1.59099006652832031250e+00, /* 0x3ff974b200000000 */ |
970 |
1.60078048706054687500e+00, /* 0x3ff99ccc00000000 */ |
971 |
1.61051177978515625000e+00, /* 0x3ff9c4a800000000 */ |
972 |
1.62018489837646484375e+00, /* 0x3ff9ec4700000000 */ |
973 |
1.62979984283447265625e+00, /* 0x3ffa13a900000000 */ |
974 |
1.63935947418212890625e+00, /* 0x3ffa3ad100000000 */ |
975 |
1.64886283874511718750e+00, /* 0x3ffa61be00000000 */ |
976 |
1.65831184387207031250e+00, /* 0x3ffa887200000000 */ |
977 |
1.66770744323730468750e+00, /* 0x3ffaaeee00000000 */ |
978 |
1.67705059051513671875e+00, /* 0x3ffad53300000000 */ |
979 |
1.68634128570556640625e+00, /* 0x3ffafb4100000000 */ |
980 |
1.69558238983154296875e+00, /* 0x3ffb211b00000000 */ |
981 |
1.70477199554443359375e+00, /* 0x3ffb46bf00000000 */ |
982 |
1.71391296386718750000e+00, /* 0x3ffb6c3000000000 */ |
983 |
1.72300529479980468750e+00, /* 0x3ffb916e00000000 */ |
984 |
1.73204994201660156250e+00, /* 0x3ffbb67a00000000 */ |
985 |
1.74104785919189453125e+00, /* 0x3ffbdb5500000000 */ |
986 |
1.75000000000000000000e+00, /* 0x3ffc000000000000 */ |
987 |
1.75890541076660156250e+00, /* 0x3ffc247a00000000 */ |
988 |
1.76776695251464843750e+00, /* 0x3ffc48c600000000 */ |
989 |
1.77658367156982421875e+00, /* 0x3ffc6ce300000000 */ |
990 |
1.78535652160644531250e+00, /* 0x3ffc90d200000000 */ |
991 |
1.79408740997314453125e+00, /* 0x3ffcb49500000000 */ |
992 |
1.80277538299560546875e+00, /* 0x3ffcd82b00000000 */ |
993 |
1.81142139434814453125e+00, /* 0x3ffcfb9500000000 */ |
994 |
1.82002735137939453125e+00, /* 0x3ffd1ed500000000 */ |
995 |
1.82859230041503906250e+00, /* 0x3ffd41ea00000000 */ |
996 |
1.83711719512939453125e+00, /* 0x3ffd64d500000000 */ |
997 |
1.84560203552246093750e+00, /* 0x3ffd879600000000 */ |
998 |
1.85404872894287109375e+00, /* 0x3ffdaa2f00000000 */ |
999 |
1.86245727539062500000e+00, /* 0x3ffdcca000000000 */ |
1000 |
1.87082862854003906250e+00, /* 0x3ffdeeea00000000 */ |
1001 |
1.87916183471679687500e+00, /* 0x3ffe110c00000000 */ |
1002 |
1.88745784759521484375e+00, /* 0x3ffe330700000000 */ |
1003 |
1.89571857452392578125e+00, /* 0x3ffe54dd00000000 */ |
1004 |
1.90394306182861328125e+00, /* 0x3ffe768d00000000 */ |
1005 |
1.91213226318359375000e+00, /* 0x3ffe981800000000 */ |
1006 |
1.92028617858886718750e+00, /* 0x3ffeb97e00000000 */ |
1007 |
1.92840576171875000000e+00, /* 0x3ffedac000000000 */ |
1008 |
1.93649101257324218750e+00, /* 0x3ffefbde00000000 */ |
1009 |
1.94454288482666015625e+00, /* 0x3fff1cd900000000 */ |
1010 |
1.95256233215332031250e+00, /* 0x3fff3db200000000 */ |
1011 |
1.96054744720458984375e+00, /* 0x3fff5e6700000000 */ |
1012 |
1.96850109100341796875e+00, /* 0x3fff7efb00000000 */ |
1013 |
1.97642326354980468750e+00, /* 0x3fff9f6e00000000 */ |
1014 |
1.98431301116943359375e+00, /* 0x3fffbfbf00000000 */ |
1015 |
1.99217128753662109375e+00, /* 0x3fffdfef00000000 */ |
1016 |
2.00000000000000000000e+00}; /* 0x4000000000000000 */ |
1017 |
|
1018 |
static const double rt_jby32_trail_table_dbl[97] = { |
1019 |
0.00000000000000000000e+00, /* 0x0000000000000000 */ |
1020 |
9.17217678638807524014e-07, /* 0x3eaec6d70177881c */ |
1021 |
3.82539669043705364790e-07, /* 0x3e99abfb41bd6b24 */ |
1022 |
2.85899577162227138140e-08, /* 0x3e5eb2bf6bab55a2 */ |
1023 |
7.63210485349101216659e-07, /* 0x3ea99bed9b2d8d0c */ |
1024 |
9.32123004127716212874e-07, /* 0x3eaf46e029c1b296 */ |
1025 |
1.95174719169309219157e-07, /* 0x3e8a3226fc42f30c */ |
1026 |
5.34316371481845492427e-07, /* 0x3ea1edbe20701d73 */ |
1027 |
5.79631242504454563052e-07, /* 0x3ea372fe94f82be7 */ |
1028 |
4.20404384109571705948e-07, /* 0x3e9c367e08e7bb06 */ |
1029 |
6.89486030314147010716e-07, /* 0x3ea722a3d0a66608 */ |
1030 |
6.89927685625314560328e-07, /* 0x3ea7266f067ca1d6 */ |
1031 |
3.32778123013641425828e-07, /* 0x3e965515a9b34850 */ |
1032 |
1.64433259436999584387e-07, /* 0x3e8611e23ef6c1bd */ |
1033 |
4.37590875197899335723e-07, /* 0x3e9d5dc1059ed8e7 */ |
1034 |
1.79808183816018617413e-07, /* 0x3e88222982d0e4f4 */ |
1035 |
7.46386593615986477624e-08, /* 0x3e7409212e7d0322 */ |
1036 |
5.72520794105201454728e-07, /* 0x3ea335ea8a5fcf39 */ |
1037 |
0.00000000000000000000e+00, /* 0x0000000000000000 */ |
1038 |
2.96860689431670420344e-07, /* 0x3e93ec071e938bfe */ |
1039 |
3.54167239176257065345e-07, /* 0x3e97c48bfd9862c6 */ |
1040 |
7.95211265664474710063e-07, /* 0x3eaaaed010f74671 */ |
1041 |
1.72327048595145565621e-07, /* 0x3e87211cbfeb62e0 */ |
1042 |
6.99494915996239297020e-07, /* 0x3ea7789d9660e72d */ |
1043 |
6.32644111701500844315e-07, /* 0x3ea53a5f1d36f1cf */ |
1044 |
6.20124838851440463844e-10, /* 0x3e054eacff2057dc */ |
1045 |
6.13404719757812629969e-07, /* 0x3ea4951b3e6a83cc */ |
1046 |
3.47654909777986407387e-07, /* 0x3e9754aa76884c66 */ |
1047 |
7.83106177002392475763e-07, /* 0x3eaa46d4b1de1074 */ |
1048 |
5.33337372440526357008e-07, /* 0x3ea1e55548f92635 */ |
1049 |
2.01508648555298681765e-08, /* 0x3e55a3070dd17788 */ |
1050 |
5.25472356925843939587e-07, /* 0x3ea1a1c5eedb0801 */ |
1051 |
3.81831102861301692797e-07, /* 0x3e999fcef32422cc */ |
1052 |
6.99220602161420018738e-07, /* 0x3ea776425d6b0199 */ |
1053 |
6.01209702477462624811e-07, /* 0x3ea42c5a1e0191a2 */ |
1054 |
9.01437000591944740554e-08, /* 0x3e7832a0bdff1327 */ |
1055 |
5.10428680864685379950e-08, /* 0x3e6b674743636676 */ |
1056 |
3.47895267104621031421e-07, /* 0x3e9758cb90d2f714 */ |
1057 |
7.80735841510641848628e-07, /* 0x3eaa3278459cde25 */ |
1058 |
1.35158752025506517690e-07, /* 0x3e822404f4a103ee */ |
1059 |
0.00000000000000000000e+00, /* 0x0000000000000000 */ |
1060 |
1.76523947728535489812e-09, /* 0x3e1e539af6892ac5 */ |
1061 |
6.68280121328499932183e-07, /* 0x3ea66c7b872c9cd0 */ |
1062 |
5.70135482405123276616e-07, /* 0x3ea3216d2f43887d */ |
1063 |
1.37705134737562525897e-07, /* 0x3e827b832cbedc0e */ |
1064 |
7.09655107074516613672e-07, /* 0x3ea7cfe41579091d */ |
1065 |
7.20302724551461693011e-07, /* 0x3ea82b5a713c490a */ |
1066 |
4.69926266058212796694e-07, /* 0x3e9f8945932d872e */ |
1067 |
2.19244345915999437026e-07, /* 0x3e8d6d2da9490251 */ |
1068 |
1.91141411617401877927e-07, /* 0x3e89a791a3114e4a */ |
1069 |
5.72297665296622053774e-07, /* 0x3ea333ffe005988d */ |
1070 |
5.61055484436830560103e-07, /* 0x3ea2d36e0ed49ab1 */ |
1071 |
2.76225500213991506100e-07, /* 0x3e92898498f55f9e */ |
1072 |
7.58466189522395692908e-07, /* 0x3ea9732cca1032a3 */ |
1073 |
1.56893371256836029827e-07, /* 0x3e850ed0b02a22d2 */ |
1074 |
4.06038997708867066507e-07, /* 0x3e9b3fb265b1e40a */ |
1075 |
5.51305629612057435809e-07, /* 0x3ea27fade682d1de */ |
1076 |
5.64778487026561123207e-07, /* 0x3ea2f36906f707ba */ |
1077 |
3.92609705553556897517e-07, /* 0x3e9a58fbbee883b6 */ |
1078 |
9.09698438776943827802e-07, /* 0x3eae864005bca6d7 */ |
1079 |
1.05949774066016139743e-07, /* 0x3e7c70d02300f263 */ |
1080 |
7.16578798392844784244e-07, /* 0x3ea80b5d712d8e3e */ |
1081 |
6.86233073531233972561e-07, /* 0x3ea706b27cc7d390 */ |
1082 |
7.99211473033494452908e-07, /* 0x3eaad12c9d849a97 */ |
1083 |
8.65552275731027456121e-07, /* 0x3ead0b09954e764b */ |
1084 |
6.75456120386058448618e-07, /* 0x3ea6aa1fb7826cbd */ |
1085 |
0.00000000000000000000e+00, /* 0x0000000000000000 */ |
1086 |
4.99167184520462138743e-07, /* 0x3ea0bfd03f46763c */ |
1087 |
4.51720373502110930296e-10, /* 0x3dff0abfb4adfb9e */ |
1088 |
1.28874162718371367439e-07, /* 0x3e814c151f991b2e */ |
1089 |
5.85529267186999798656e-07, /* 0x3ea3a5a879b09292 */ |
1090 |
1.01827770937125531924e-07, /* 0x3e7b558d173f9796 */ |
1091 |
2.54736389177809626508e-07, /* 0x3e9118567cd83fb8 */ |
1092 |
6.98925535290464831294e-07, /* 0x3ea773b981896751 */ |
1093 |
1.20940735036524314513e-07, /* 0x3e803b7df49f48a8 */ |
1094 |
5.43759351196479689657e-08, /* 0x3e6d315f22491900 */ |
1095 |
1.11957989042397958409e-07, /* 0x3e7e0db1c5bb84b2 */ |
1096 |
8.47006714134442661218e-07, /* 0x3eac6bbb7644ff76 */ |
1097 |
8.92831044643427836228e-07, /* 0x3eadf55c3afec01f */ |
1098 |
7.77828292464916501663e-07, /* 0x3eaa197e81034da3 */ |
1099 |
6.48469316302918797451e-08, /* 0x3e71683f4920555d */ |
1100 |
2.12579816658859849140e-07, /* 0x3e8c882fd78bb0b0 */ |
1101 |
7.61222472580559138435e-07, /* 0x3ea98ad9eb7b83ec */ |
1102 |
2.86488961857314189607e-07, /* 0x3e9339d7c7777273 */ |
1103 |
2.14637363790165363515e-07, /* 0x3e8ccee237cae6fe */ |
1104 |
5.44137005612605847831e-08, /* 0x3e6d368fe324a146 */ |
1105 |
2.58378284856442408413e-07, /* 0x3e9156e7b6d99b45 */ |
1106 |
3.15848939061134843091e-07, /* 0x3e95323e5310b5c1 */ |
1107 |
6.60530466255089632309e-07, /* 0x3ea629e9db362f5d */ |
1108 |
7.63436345535852301127e-07, /* 0x3ea99dde4728d7ec */ |
1109 |
8.68233432860324345268e-08, /* 0x3e774e746878544d */ |
1110 |
9.45465175398023087082e-07, /* 0x3eafb97be873a87d */ |
1111 |
8.77499534786171267246e-07, /* 0x3ead71a9e23c2f63 */ |
1112 |
2.74055432394999316135e-07, /* 0x3e92643c89cda173 */ |
1113 |
4.72129009349126213532e-07, /* 0x3e9faf1d57a4d56c */ |
1114 |
8.93777032327078947306e-07, /* 0x3eadfd7c7ab7b282 */ |
1115 |
0.00000000000000000000e+00}; /* 0x0000000000000000 */ |
1116 |
|
1117 |
|
1118 |
/* Handle special arguments first */ |
1119 |
|
1120 |
GET_BITS_DP64(x, ux); |
1121 |
ax = ux & (~SIGNBIT_DP64); |
1122 |
|
1123 |
if(ax >= 0x7ff0000000000000) |
1124 |
{ |
1125 |
/* x is either NaN or infinity */ |
1126 |
if (ux & MANTBITS_DP64) |
1127 |
/* x is NaN */ |
1128 |
return x + x; /* Raise invalid if it is a signalling NaN */ |
1129 |
else if (ux & SIGNBIT_DP64) |
1130 |
/* x is negative infinity */ |
1131 |
return nan_with_flags(AMD_F_INVALID); |
1132 |
else |
1133 |
/* x is positive infinity */ |
1134 |
return x; |
1135 |
} |
1136 |
else if (ux & SIGNBIT_DP64) |
1137 |
{ |
1138 |
/* x is negative. */ |
1139 |
if (ux == SIGNBIT_DP64) |
1140 |
/* Handle negative zero first */ |
1141 |
return x; |
1142 |
else |
1143 |
return nan_with_flags(AMD_F_INVALID); |
1144 |
} |
1145 |
else if (ux <= 0x000fffffffffffff) |
1146 |
{ |
1147 |
/* x is denormalised or zero */ |
1148 |
if (ux == 0) |
1149 |
/* x is zero */ |
1150 |
return x; |
1151 |
else |
1152 |
{ |
1153 |
/* x is denormalised; scale it up */ |
1154 |
/* Normalize x by increasing the exponent by 60 |
1155 |
and subtracting a correction to account for the implicit |
1156 |
bit. This replaces a slow denormalized |
1157 |
multiplication by a fast normal subtraction. */ |
1158 |
static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ |
1159 |
denorm = 1; |
1160 |
GET_BITS_DP64(x, ux); |
1161 |
PUT_BITS_DP64(ux | 0x03d0000000000000, x); |
1162 |
x -= corr; |
1163 |
GET_BITS_DP64(x, ux); |
1164 |
} |
1165 |
} |
1166 |
|
1167 |
/* Main algorithm */ |
1168 |
|
1169 |
/* |
1170 |
Find y and e such that x = 2^e * y, where y in [1,4). |
1171 |
This is done using an in-lined variant of splitDouble, |
1172 |
which also ensures that e is even. |
1173 |
*/ |
1174 |
y = x; |
1175 |
ux &= EXPBITS_DP64; |
1176 |
ux >>= EXPSHIFTBITS_DP64; |
1177 |
if (ux & 1) |
1178 |
{ |
1179 |
GET_BITS_DP64(y, u); |
1180 |
u &= (SIGNBIT_DP64 | MANTBITS_DP64); |
1181 |
u |= ONEEXPBITS_DP64; |
1182 |
PUT_BITS_DP64(u, y); |
1183 |
e = ux - EXPBIAS_DP64; |
1184 |
} |
1185 |
else |
1186 |
{ |
1187 |
GET_BITS_DP64(y, u); |
1188 |
u &= (SIGNBIT_DP64 | MANTBITS_DP64); |
1189 |
u |= TWOEXPBITS_DP64; |
1190 |
PUT_BITS_DP64(u, y); |
1191 |
e = ux - EXPBIAS_DP64 - 1; |
1192 |
} |
1193 |
|
1194 |
|
1195 |
/* Find the index of the sub-interval of [1,4) in which y lies. */ |
1196 |
|
1197 |
index = (int)(32.0*y+0.5); |
1198 |
|
1199 |
/* Look up the table values and compute c and r = c/t */ |
1200 |
|
1201 |
rtc_lead = rt_jby32_lead_table_dbl[index-32]; |
1202 |
rtc_trail = rt_jby32_trail_table_dbl[index-32]; |
1203 |
c = 0.03125*index; |
1204 |
r = (y - c)/c; |
1205 |
|
1206 |
/* |
1207 |
Find q = sqrt(1+r) - 1. |
1208 |
From one step of Newton on (q+1)^2 = 1+r |
1209 |
*/ |
1210 |
|
1211 |
p = r*0.5 - r*r*(0.1250079870 - r*(0.6250522999E-01)); |
1212 |
twop = p + p; |
1213 |
q = p - (p*p + (twop - r))/(twop + 2.0); |
1214 |
|
1215 |
/* Reconstruction */ |
1216 |
|
1217 |
rtc = rtc_lead + rtc_trail; |
1218 |
e >>= 1; /* e = e/2 */ |
1219 |
z = rtc_lead + (rtc*q+rtc_trail); |
1220 |
|
1221 |
if (denorm) |
1222 |
{ |
1223 |
/* Scale by 2**(e-30) */ |
1224 |
PUT_BITS_DP64(((long)(e - 30) + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); |
1225 |
z *= r; |
1226 |
} |
1227 |
else |
1228 |
{ |
1229 |
/* Scale by 2**e */ |
1230 |
PUT_BITS_DP64(((long)e + EXPBIAS_DP64) << EXPSHIFTBITS_DP64, r); |
1231 |
z *= r; |
1232 |
} |
1233 |
|
1234 |
return z; |
1235 |
|
1236 |
} |
1237 |
#endif /* SQRT_AMD_INLINE */ |
1238 |
|
1239 |
#if defined(USE_SQRTF_AMD_INLINE) |
1240 |
|
1241 |
static inline float sqrtf_amd_inline(float x) |
1242 |
{ |
1243 |
/* |
1244 |
Computes the square root of x. |
1245 |
|
1246 |
The calculation is carried out in three steps. |
1247 |
|
1248 |
Step 1. Reduction. |
1249 |
The input argument is scaled to the interval [1, 4) by |
1250 |
computing |
1251 |
x = 2^e * y, where y in [1,4). |
1252 |
Furthermore y is decomposed as y = c + t where |
1253 |
c = 1 + j/32, j = 0,1,..,96; and |t| <= 1/64. |
1254 |
|
1255 |
Step 2. Approximation. |
1256 |
An approximation q = sqrt(1 + (t/c)) - 1 is obtained |
1257 |
from a basic series expansion using precomputed values |
1258 |
stored in rt_jby32_lead_table_float and rt_jby32_trail_table_float. |
1259 |
|
1260 |
Step 3. Reconstruction. |
1261 |
The value of sqrt(x) is reconstructed via |
1262 |
sqrt(x) = 2^(e/2) * sqrt(y) |
1263 |
= 2^(e/2) * sqrt(c) * sqrt(y/c) |
1264 |
= 2^(e/2) * sqrt(c) * sqrt(1 + t/c) |
1265 |
= 2^(e/2) * [ sqrt(c) + sqrt(c)*q ] |
1266 |
*/ |
1267 |
|
1268 |
unsigned int ux, ax, u; |
1269 |
float r1, r2, c, y, p, q, r, twop, z, rtc, rtc_lead, rtc_trail; |
1270 |
int e, denorm = 0, index; |
1271 |
|
1272 |
/* Arrays rt_jby32_lead_table_float and rt_jby32_trail_table_float contain |
1273 |
leading and trailing parts respectively of precomputed |
1274 |
values of sqrt(j/32), for j = 32, 33, ..., 128. |
1275 |
rt_jby32_lead_table_float contains the first 13 bits of precision, |
1276 |
and rt_jby32_trail_table_float contains a further 24 bits precision. */ |
1277 |
|
1278 |
static const float rt_jby32_lead_table_float[97] = { |
1279 |
1.00000000000000000000e+00F, /* 0x3f800000 */ |
1280 |
1.01538085937500000000e+00F, /* 0x3f81f800 */ |
1281 |
1.03076171875000000000e+00F, /* 0x3f83f000 */ |
1282 |
1.04565429687500000000e+00F, /* 0x3f85d800 */ |
1283 |
1.06054687500000000000e+00F, /* 0x3f87c000 */ |
1284 |
1.07519531250000000000e+00F, /* 0x3f89a000 */ |
1285 |
1.08959960937500000000e+00F, /* 0x3f8b7800 */ |
1286 |
1.10375976562500000000e+00F, /* 0x3f8d4800 */ |
1287 |
1.11791992187500000000e+00F, /* 0x3f8f1800 */ |
1288 |
1.13183593750000000000e+00F, /* 0x3f90e000 */ |
1289 |
1.14550781250000000000e+00F, /* 0x3f92a000 */ |
1290 |
1.15917968750000000000e+00F, /* 0x3f946000 */ |
1291 |
1.17236328125000000000e+00F, /* 0x3f961000 */ |
1292 |
1.18579101562500000000e+00F, /* 0x3f97c800 */ |
1293 |
1.19873046875000000000e+00F, /* 0x3f997000 */ |
1294 |
1.21191406250000000000e+00F, /* 0x3f9b2000 */ |
1295 |
1.22460937500000000000e+00F, /* 0x3f9cc000 */ |
1296 |
1.23730468750000000000e+00F, /* 0x3f9e6000 */ |
1297 |
1.25000000000000000000e+00F, /* 0x3fa00000 */ |
1298 |
1.26220703125000000000e+00F, /* 0x3fa19000 */ |
1299 |
1.27465820312500000000e+00F, /* 0x3fa32800 */ |
1300 |
1.28686523437500000000e+00F, /* 0x3fa4b800 */ |
1301 |
1.29882812500000000000e+00F, /* 0x3fa64000 */ |
1302 |
1.31079101562500000000e+00F, /* 0x3fa7c800 */ |
1303 |
1.32275390625000000000e+00F, /* 0x3fa95000 */ |
1304 |
1.33447265625000000000e+00F, /* 0x3faad000 */ |
1305 |
1.34619140625000000000e+00F, /* 0x3fac5000 */ |
1306 |
1.35766601562500000000e+00F, /* 0x3fadc800 */ |
1307 |
1.36914062500000000000e+00F, /* 0x3faf4000 */ |
1308 |
1.38061523437500000000e+00F, /* 0x3fb0b800 */ |
1309 |
1.39184570312500000000e+00F, /* 0x3fb22800 */ |
1310 |
1.40307617187500000000e+00F, /* 0x3fb39800 */ |
1311 |
1.41406250000000000000e+00F, /* 0x3fb50000 */ |
1312 |
1.42504882812500000000e+00F, /* 0x3fb66800 */ |
1313 |
1.43603515625000000000e+00F, /* 0x3fb7d000 */ |
1314 |
1.44677734375000000000e+00F, /* 0x3fb93000 */ |
1315 |
1.45751953125000000000e+00F, /* 0x3fba9000 */ |
1316 |
1.46826171875000000000e+00F, /* 0x3fbbf000 */ |
1317 |
1.47900390625000000000e+00F, /* 0x3fbd5000 */ |
1318 |
1.48950195312500000000e+00F, /* 0x3fbea800 */ |
1319 |
1.50000000000000000000e+00F, /* 0x3fc00000 */ |
1320 |
1.51025390625000000000e+00F, /* 0x3fc15000 */ |
1321 |
1.52050781250000000000e+00F, /* 0x3fc2a000 */ |
1322 |
1.53076171875000000000e+00F, /* 0x3fc3f000 */ |
1323 |
1.54101562500000000000e+00F, /* 0x3fc54000 */ |
1324 |
1.55102539062500000000e+00F, /* 0x3fc68800 */ |
1325 |
1.56103515625000000000e+00F, /* 0x3fc7d000 */ |
1326 |
1.57104492187500000000e+00F, /* 0x3fc91800 */ |
1327 |
1.58105468750000000000e+00F, /* 0x3fca6000 */ |
1328 |
1.59082031250000000000e+00F, /* 0x3fcba000 */ |
1329 |
1.60058593750000000000e+00F, /* 0x3fcce000 */ |
1330 |
1.61035156250000000000e+00F, /* 0x3fce2000 */ |
1331 |
1.62011718750000000000e+00F, /* 0x3fcf6000 */ |
1332 |
1.62963867187500000000e+00F, /* 0x3fd09800 */ |
1333 |
1.63916015625000000000e+00F, /* 0x3fd1d000 */ |
1334 |
1.64868164062500000000e+00F, /* 0x3fd30800 */ |
1335 |
1.65820312500000000000e+00F, /* 0x3fd44000 */ |
1336 |
1.66748046875000000000e+00F, /* 0x3fd57000 */ |
1337 |
1.67700195312500000000e+00F, /* 0x3fd6a800 */ |
1338 |
1.68627929687500000000e+00F, /* 0x3fd7d800 */ |
1339 |
1.69555664062500000000e+00F, /* 0x3fd90800 */ |
1340 |
1.70458984375000000000e+00F, /* 0x3fda3000 */ |
1341 |
1.71386718750000000000e+00F, /* 0x3fdb6000 */ |
1342 |
1.72290039062500000000e+00F, /* 0x3fdc8800 */ |
1343 |
1.73193359375000000000e+00F, /* 0x3fddb000 */ |
1344 |
1.74096679687500000000e+00F, /* 0x3fded800 */ |
1345 |
1.75000000000000000000e+00F, /* 0x3fe00000 */ |
1346 |
1.75878906250000000000e+00F, /* 0x3fe12000 */ |
1347 |
1.76757812500000000000e+00F, /* 0x3fe24000 */ |
1348 |
1.77636718750000000000e+00F, /* 0x3fe36000 */ |
1349 |
1.78515625000000000000e+00F, /* 0x3fe48000 */ |
1350 |
1.79394531250000000000e+00F, /* 0x3fe5a000 */ |
1351 |
1.80273437500000000000e+00F, /* 0x3fe6c000 */ |
1352 |
1.81127929687500000000e+00F, /* 0x3fe7d800 */ |
1353 |
1.81982421875000000000e+00F, /* 0x3fe8f000 */ |
1354 |
1.82836914062500000000e+00F, /* 0x3fea0800 */ |
1355 |
1.83691406250000000000e+00F, /* 0x3feb2000 */ |
1356 |
1.84545898437500000000e+00F, /* 0x3fec3800 */ |
1357 |
1.85400390625000000000e+00F, /* 0x3fed5000 */ |
1358 |
1.86230468750000000000e+00F, /* 0x3fee6000 */ |
1359 |
1.87060546875000000000e+00F, /* 0x3fef7000 */ |
1360 |
1.87915039062500000000e+00F, /* 0x3ff08800 */ |
1361 |
1.88745117187500000000e+00F, /* 0x3ff19800 */ |
1362 |
1.89550781250000000000e+00F, /* 0x3ff2a000 */ |
1363 |
1.90380859375000000000e+00F, /* 0x3ff3b000 */ |
1364 |
1.91210937500000000000e+00F, /* 0x3ff4c000 */ |
1365 |
1.92016601562500000000e+00F, /* 0x3ff5c800 */ |
1366 |
1.92822265625000000000e+00F, /* 0x3ff6d000 */ |
1367 |
1.93627929687500000000e+00F, /* 0x3ff7d800 */ |
1368 |
1.94433593750000000000e+00F, /* 0x3ff8e000 */ |
1369 |
1.95239257812500000000e+00F, /* 0x3ff9e800 */ |
1370 |
1.96044921875000000000e+00F, /* 0x3ffaf000 */ |
1371 |
1.96826171875000000000e+00F, /* 0x3ffbf000 */ |
1372 |
1.97631835937500000000e+00F, /* 0x3ffcf800 */ |
1373 |
1.98413085937500000000e+00F, /* 0x3ffdf800 */ |
1374 |
1.99194335937500000000e+00F, /* 0x3ffef800 */ |
1375 |
2.00000000000000000000e+00F}; /* 0x40000000 */ |
1376 |
|
1377 |
static const float rt_jby32_trail_table_float[97] = { |
1378 |
0.00000000000000000000e+00F, /* 0x00000000 */ |
1379 |
1.23941208585165441036e-04F, /* 0x3901f637 */ |
1380 |
1.46876545841223560274e-05F, /* 0x37766aff */ |
1381 |
1.70736297150142490864e-04F, /* 0x393307ad */ |
1382 |
1.13296780909877270460e-04F, /* 0x38ed99bf */ |
1383 |
9.53458802541717886925e-05F, /* 0x38c7f46e */ |
1384 |
1.25126505736261606216e-04F, /* 0x39033464 */ |
1385 |
2.10342666832730174065e-04F, /* 0x395c8f6e */ |
1386 |
1.14066875539720058441e-04F, /* 0x38ef3730 */ |
1387 |
8.72047676239162683487e-05F, /* 0x38b6e1b4 */ |
1388 |
1.36111237225122749805e-04F, /* 0x390eb915 */ |
1389 |
2.26244374061934649944e-05F, /* 0x37bdc99c */ |
1390 |
2.40658700931817293167e-04F, /* 0x397c5954 */ |
1391 |
6.31069415248930454254e-05F, /* 0x38845848 */ |
1392 |
2.27412077947519719601e-04F, /* 0x396e7577 */ |
1393 |
5.90185391047270968556e-06F, /* 0x36c6088a */ |
1394 |
1.35496389702893793583e-04F, /* 0x390e1409 */ |
1395 |
1.32179571664892137051e-04F, /* 0x390a99af */ |
1396 |
0.00000000000000000000e+00F, /* 0x00000000 */ |
1397 |
2.31086043640971183777e-04F, /* 0x39724fb0 */ |
1398 |
9.66752704698592424393e-05F, /* 0x38cabe24 */ |
1399 |
8.85332483449019491673e-05F, /* 0x38b9aaed */ |
1400 |
2.09980673389509320259e-04F, /* 0x395c2e42 */ |
1401 |
2.20044588786549866199e-04F, /* 0x3966bbc5 */ |
1402 |
1.21749282698146998882e-04F, /* 0x38ff53a6 */ |
1403 |
1.62125259521417319775e-04F, /* 0x392a002b */ |
1404 |
9.97955357888713479042e-05F, /* 0x38d14952 */ |
1405 |
1.81545779923908412457e-04F, /* 0x393e5d53 */ |
1406 |
1.65768768056295812130e-04F, /* 0x392dd237 */ |
1407 |
5.48927710042335093021e-05F, /* 0x38663caa */ |
1408 |
9.53875860432162880898e-05F, /* 0x38c80ad2 */ |
1409 |
4.53481625299900770187e-05F, /* 0x383e3438 */ |
1410 |
1.51062369695864617825e-04F, /* 0x391e667f */ |
1411 |
1.70453247847035527229e-04F, /* 0x3932bbb2 */ |
1412 |
1.05505387182347476482e-04F, /* 0x38dd42c6 */ |
1413 |
2.02269104192964732647e-04F, /* 0x39541833 */ |
1414 |
2.18442466575652360916e-04F, /* 0x39650db4 */ |
1415 |
1.55796806211583316326e-04F, /* 0x39235d63 */ |
1416 |
1.60395247803535312414e-05F, /* 0x37868c9e */ |
1417 |
4.49578510597348213196e-05F, /* 0x383c9120 */ |
1418 |
0.00000000000000000000e+00F, /* 0x00000000 */ |
1419 |
1.26840444863773882389e-04F, /* 0x39050079 */ |
1420 |
1.82820076588541269302e-04F, /* 0x393fb364 */ |
1421 |
1.69370483490638434887e-04F, /* 0x3931990b */ |
1422 |
8.78757418831810355186e-05F, /* 0x38b849ee */ |
1423 |
1.83815121999941766262e-04F, /* 0x3940be7f */ |
1424 |
2.14343352126888930798e-04F, /* 0x3960c15b */ |
1425 |
1.80714370799250900745e-04F, /* 0x393d7e25 */ |
1426 |
8.41425862745381891727e-05F, /* 0x38b075b5 */ |
1427 |
1.69945167726837098598e-04F, /* 0x3932334f */ |
1428 |
1.95121858268976211548e-04F, /* 0x394c99a0 */ |
1429 |
1.60778334247879683971e-04F, /* 0x3928969b */ |
1430 |
6.79871009197086095810e-05F, /* 0x388e944c */ |
1431 |
1.61929419846273958683e-04F, /* 0x3929cb99 */ |
1432 |
1.99474830878898501396e-04F, /* 0x39512a1e */ |
1433 |
1.81604162207804620266e-04F, /* 0x393e6cff */ |
1434 |
1.09270178654696792364e-04F, /* 0x38e527fb */ |
1435 |
2.27539261686615645885e-04F, /* 0x396e979b */ |
1436 |
4.90300008095800876617e-05F, /* 0x384da590 */ |
1437 |
6.28985289949923753738e-05F, /* 0x3883e864 */ |
1438 |
2.58551553997676819563e-05F, /* 0x37d8e386 */ |
1439 |
1.82868374395184218884e-04F, /* 0x393fc05b */ |
1440 |
4.64625991298817098141e-05F, /* 0x3842e0d6 */ |
1441 |
1.05703387816902250051e-04F, /* 0x38ddad13 */ |
1442 |
1.17213814519345760345e-04F, /* 0x38f5d0b0 */ |
1443 |
8.17377731436863541603e-05F, /* 0x38ab6aa2 */ |
1444 |
0.00000000000000000000e+00F, /* 0x00000000 */ |
1445 |
1.16847433673683553934e-04F, /* 0x38f50bfd */ |
1446 |
1.88827965757809579372e-04F, /* 0x3946001f */ |
1447 |
2.16612941585481166840e-04F, /* 0x39632298 */ |
1448 |
2.00857131858356297016e-04F, /* 0x39529d2d */ |
1449 |
1.42199307447299361229e-04F, /* 0x39151b56 */ |
1450 |
4.12627305195201188326e-05F, /* 0x382d1185 */ |
1451 |
1.42796401632949709892e-04F, /* 0x3915bb9e */ |
1452 |
2.03253570361994206905e-04F, /* 0x39552077 */ |
1453 |
2.23214170546270906925e-04F, /* 0x396a0e99 */ |
1454 |
2.03244591830298304558e-04F, /* 0x39551e0e */ |
1455 |
1.43898156238719820976e-04F, /* 0x3916e35e */ |
1456 |
4.57155256299301981926e-05F, /* 0x383fbeac */ |
1457 |
1.53365719597786664963e-04F, /* 0x3920d0cc */ |
1458 |
2.23224633373320102692e-04F, /* 0x396a1168 */ |
1459 |
1.16566716314991936088e-05F, /* 0x37439106 */ |
1460 |
7.43694272387074306607e-06F, /* 0x36f98ada */ |
1461 |
2.11048507480882108212e-04F, /* 0x395d4ce7 */ |
1462 |
1.34682719362899661064e-04F, /* 0x390d399e */ |
1463 |
2.29425968427676707506e-05F, /* 0x37c074da */ |
1464 |
1.20421340398024767637e-04F, /* 0x38fc8ab7 */ |
1465 |
1.83421318070031702518e-04F, /* 0x394054c9 */ |
1466 |
2.12376224226318299770e-04F, /* 0x395eb14f */ |
1467 |
2.07710763788782060146e-04F, /* 0x3959ccef */ |
1468 |
1.69840845046564936638e-04F, /* 0x3932174e */ |
1469 |
9.91739216260612010956e-05F, /* 0x38cffb98 */ |
1470 |
2.40249748458154499531e-04F, /* 0x397beb8d */ |
1471 |
1.05178231024183332920e-04F, /* 0x38dc9322 */ |
1472 |
1.82623916771262884140e-04F, /* 0x393f7ebc */ |
1473 |
2.28821940254420042038e-04F, /* 0x396fefec */ |
1474 |
0.00000000000000000000e+00F}; /* 0x00000000 */ |
1475 |
|
1476 |
|
1477 |
/* Handle special arguments first */ |
1478 |
|
1479 |
GET_BITS_SP32(x, ux); |
1480 |
ax = ux & (~SIGNBIT_SP32); |
1481 |
|
1482 |
if(ax >= 0x7f800000) |
1483 |
{ |
1484 |
/* x is either NaN or infinity */ |
1485 |
if (ux & MANTBITS_SP32) |
1486 |
/* x is NaN */ |
1487 |
return x + x; /* Raise invalid if it is a signalling NaN */ |
1488 |
else if (ux & SIGNBIT_SP32) |
1489 |
return nanf_with_flags(AMD_F_INVALID); |
1490 |
else |
1491 |
/* x is positive infinity */ |
1492 |
return x; |
1493 |
} |
1494 |
else if (ux & SIGNBIT_SP32) |
1495 |
{ |
1496 |
/* x is negative. */ |
1497 |
if (x == 0.0F) |
1498 |
/* Handle negative zero first */ |
1499 |
return x; |
1500 |
else |
1501 |
return nanf_with_flags(AMD_F_INVALID); |
1502 |
} |
1503 |
else if (ux <= 0x007fffff) |
1504 |
{ |
1505 |
/* x is denormalised or zero */ |
1506 |
if (ux == 0) |
1507 |
/* x is zero */ |
1508 |
return x; |
1509 |
else |
1510 |
{ |
1511 |
/* x is denormalised; scale it up */ |
1512 |
/* Normalize x by increasing the exponent by 26 |
1513 |
and subtracting a correction to account for the implicit |
1514 |
bit. This replaces a slow denormalized |
1515 |
multiplication by a fast normal subtraction. */ |
1516 |
static const float corr = 7.888609052210118054e-31F; /* 0x0d800000 */ |
1517 |
denorm = 1; |
1518 |
GET_BITS_SP32(x, ux); |
1519 |
PUT_BITS_SP32(ux | 0x0d800000, x); |
1520 |
x -= corr; |
1521 |
GET_BITS_SP32(x, ux); |
1522 |
} |
1523 |
} |
1524 |
|
1525 |
/* Main algorithm */ |
1526 |
|
1527 |
/* |
1528 |
Find y and e such that x = 2^e * y, where y in [1,4). |
1529 |
This is done using an in-lined variant of splitFloat, |
1530 |
which also ensures that e is even. |
1531 |
*/ |
1532 |
y = x; |
1533 |
ux &= EXPBITS_SP32; |
1534 |
ux >>= EXPSHIFTBITS_SP32; |
1535 |
if (ux & 1) |
1536 |
{ |
1537 |
GET_BITS_SP32(y, u); |
1538 |
u &= (SIGNBIT_SP32 | MANTBITS_SP32); |
1539 |
u |= ONEEXPBITS_SP32; |
1540 |
PUT_BITS_SP32(u, y); |
1541 |
e = ux - EXPBIAS_SP32; |
1542 |
} |
1543 |
else |
1544 |
{ |
1545 |
GET_BITS_SP32(y, u); |
1546 |
u &= (SIGNBIT_SP32 | MANTBITS_SP32); |
1547 |
u |= TWOEXPBITS_SP32; |
1548 |
PUT_BITS_SP32(u, y); |
1549 |
e = ux - EXPBIAS_SP32 - 1; |
1550 |
} |
1551 |
|
1552 |
/* Find the index of the sub-interval of [1,4) in which y lies. */ |
1553 |
|
1554 |
index = (int)(32.0F*y+0.5); |
1555 |
|
1556 |
/* Look up the table values and compute c and r = c/t */ |
1557 |
|
1558 |
rtc_lead = rt_jby32_lead_table_float[index-32]; |
1559 |
rtc_trail = rt_jby32_trail_table_float[index-32]; |
1560 |
c = 0.03125F*index; |
1561 |
r = (y - c)/c; |
1562 |
|
1563 |
/* |
1564 |
Find q = sqrt(1+r) - 1. |
1565 |
From one step of Newton on (q+1)^2 = 1+r |
1566 |
*/ |
1567 |
|
1568 |
p = r*0.5F - r*r*(0.1250079870F - r*(0.6250522999e-01F)); |
1569 |
twop = p + p; |
1570 |
q = p - (p*p + (twop - r))/(twop + 2.0); |
1571 |
|
1572 |
/* Reconstruction */ |
1573 |
|
1574 |
rtc = rtc_lead + rtc_trail; |
1575 |
e >>= 1; /* e = e/2 */ |
1576 |
z = rtc_lead + (rtc*q+rtc_trail); |
1577 |
|
1578 |
if (denorm) |
1579 |
{ |
1580 |
/* Scale by 2**(e-13) */ |
1581 |
PUT_BITS_SP32(((e - 13) + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); |
1582 |
z *= r; |
1583 |
} |
1584 |
else |
1585 |
{ |
1586 |
/* Scale by 2**e */ |
1587 |
PUT_BITS_SP32((e + EXPBIAS_SP32) << EXPSHIFTBITS_SP32, r); |
1588 |
z *= r; |
1589 |
} |
1590 |
|
1591 |
return z; |
1592 |
|
1593 |
} |
1594 |
#endif /* SQRTF_AMD_INLINE */ |
1595 |
|
1596 |
#ifdef USE_LOG_KERNEL_AMD |
1597 |
static inline void log_kernel_amd64(double x, unsigned long ux, int *xexp, double *r1, double *r2) |
1598 |
{ |
1599 |
|
1600 |
int expadjust; |
1601 |
double r, z1, z2, correction, f, f1, f2, q, u, v, poly; |
1602 |
int index; |
1603 |
|
1604 |
/* |
1605 |
Computes natural log(x). Algorithm based on: |
1606 |
Ping-Tak Peter Tang |
1607 |
"Table-driven implementation of the logarithm function in IEEE |
1608 |
floating-point arithmetic" |
1609 |
ACM Transactions on Mathematical Software (TOMS) |
1610 |
Volume 16, Issue 4 (December 1990) |
1611 |
*/ |
1612 |
|
1613 |
/* Arrays ln_lead_table and ln_tail_table contain |
1614 |
leading and trailing parts respectively of precomputed |
1615 |
values of natural log(1+i/64), for i = 0, 1, ..., 64. |
1616 |
ln_lead_table contains the first 24 bits of precision, |
1617 |
and ln_tail_table contains a further 53 bits precision. */ |
1618 |
|
1619 |
static const double ln_lead_table[65] = { |
1620 |
0.00000000000000000000e+00, /* 0x0000000000000000 */ |
1621 |
1.55041813850402832031e-02, /* 0x3f8fc0a800000000 */ |
1622 |
3.07716131210327148438e-02, /* 0x3f9f829800000000 */ |
1623 |
4.58095073699951171875e-02, /* 0x3fa7745800000000 */ |
1624 |
6.06245994567871093750e-02, /* 0x3faf0a3000000000 */ |
1625 |
7.52233862876892089844e-02, /* 0x3fb341d700000000 */ |
1626 |
8.96121263504028320312e-02, /* 0x3fb6f0d200000000 */ |
1627 |
1.03796780109405517578e-01, /* 0x3fba926d00000000 */ |
1628 |
1.17783010005950927734e-01, /* 0x3fbe270700000000 */ |
1629 |
1.31576299667358398438e-01, /* 0x3fc0d77e00000000 */ |
1630 |
1.45181953907012939453e-01, /* 0x3fc2955280000000 */ |
1631 |
1.58604979515075683594e-01, /* 0x3fc44d2b00000000 */ |
1632 |
1.71850204467773437500e-01, /* 0x3fc5ff3000000000 */ |
1633 |
1.84922337532043457031e-01, /* 0x3fc7ab8900000000 */ |
1634 |
1.97825729846954345703e-01, /* 0x3fc9525a80000000 */ |
1635 |
2.10564732551574707031e-01, /* 0x3fcaf3c900000000 */ |
1636 |
2.23143517971038818359e-01, /* 0x3fcc8ff780000000 */ |
1637 |
2.35566020011901855469e-01, /* 0x3fce270700000000 */ |
1638 |
2.47836112976074218750e-01, /* 0x3fcfb91800000000 */ |
1639 |
2.59957492351531982422e-01, /* 0x3fd0a324c0000000 */ |
1640 |
2.71933674812316894531e-01, /* 0x3fd1675c80000000 */ |
1641 |
2.83768117427825927734e-01, /* 0x3fd22941c0000000 */ |
1642 |
2.95464158058166503906e-01, /* 0x3fd2e8e280000000 */ |
1643 |
3.07025015354156494141e-01, /* 0x3fd3a64c40000000 */ |
1644 |
3.18453729152679443359e-01, /* 0x3fd4618bc0000000 */ |
1645 |
3.29753279685974121094e-01, /* 0x3fd51aad80000000 */ |
1646 |
3.40926527976989746094e-01, /* 0x3fd5d1bd80000000 */ |
1647 |
3.51976394653320312500e-01, /* 0x3fd686c800000000 */ |
1648 |
3.62905442714691162109e-01, /* 0x3fd739d7c0000000 */ |
1649 |
3.73716354370117187500e-01, /* 0x3fd7eaf800000000 */ |
1650 |
3.84411692619323730469e-01, /* 0x3fd89a3380000000 */ |
1651 |
3.94993782043457031250e-01, /* 0x3fd9479400000000 */ |
1652 |
4.05465066432952880859e-01, /* 0x3fd9f323c0000000 */ |
1653 |
4.15827870368957519531e-01, /* 0x3fda9cec80000000 */ |
1654 |
4.26084339618682861328e-01, /* 0x3fdb44f740000000 */ |
1655 |
4.36236739158630371094e-01, /* 0x3fdbeb4d80000000 */ |
1656 |
4.46287095546722412109e-01, /* 0x3fdc8ff7c0000000 */ |
1657 |
4.56237375736236572266e-01, /* 0x3fdd32fe40000000 */ |
1658 |
4.66089725494384765625e-01, /* 0x3fddd46a00000000 */ |
1659 |
4.75845873355865478516e-01, /* 0x3fde744240000000 */ |
1660 |
4.85507786273956298828e-01, /* 0x3fdf128f40000000 */ |
1661 |
4.95077252388000488281e-01, /* 0x3fdfaf5880000000 */ |
1662 |
5.04556000232696533203e-01, /* 0x3fe02552a0000000 */ |
1663 |
5.13945698738098144531e-01, /* 0x3fe0723e40000000 */ |
1664 |
5.23248136043548583984e-01, /* 0x3fe0be72e0000000 */ |
1665 |
5.32464742660522460938e-01, /* 0x3fe109f380000000 */ |
1666 |
5.41597247123718261719e-01, /* 0x3fe154c3c0000000 */ |
1667 |
5.50647079944610595703e-01, /* 0x3fe19ee6a0000000 */ |
1668 |
5.59615731239318847656e-01, /* 0x3fe1e85f40000000 */ |
1669 |
5.68504691123962402344e-01, /* 0x3fe23130c0000000 */ |
1670 |
5.77315330505371093750e-01, /* 0x3fe2795e00000000 */ |
1671 |
5.86049020290374755859e-01, /* 0x3fe2c0e9e0000000 */ |
1672 |
5.94707071781158447266e-01, /* 0x3fe307d720000000 */ |
1673 |
6.03290796279907226562e-01, /* 0x3fe34e2880000000 */ |
1674 |
6.11801505088806152344e-01, /* 0x3fe393e0c0000000 */ |
1675 |
6.20240390300750732422e-01, /* 0x3fe3d90260000000 */ |
1676 |
6.28608644008636474609e-01, /* 0x3fe41d8fe0000000 */ |
1677 |
6.36907458305358886719e-01, /* 0x3fe4618bc0000000 */ |
1678 |
6.45137906074523925781e-01, /* 0x3fe4a4f840000000 */ |
1679 |
6.53301239013671875000e-01, /* 0x3fe4e7d800000000 */ |
1680 |
6.61398470401763916016e-01, /* 0x3fe52a2d20000000 */ |
1681 |
6.69430613517761230469e-01, /* 0x3fe56bf9c0000000 */ |
1682 |
6.77398800849914550781e-01, /* 0x3fe5ad4040000000 */ |
1683 |
6.85303986072540283203e-01, /* 0x3fe5ee02a0000000 */ |
1684 |
6.93147122859954833984e-01}; /* 0x3fe62e42e0000000 */ |
1685 |
|
1686 |
static const double ln_tail_table[65] = { |
1687 |
0.00000000000000000000e+00, /* 0x0000000000000000 */ |
1688 |
5.15092497094772879206e-09, /* 0x3e361f807c79f3db */ |
1689 |
4.55457209735272790188e-08, /* 0x3e6873c1980267c8 */ |
1690 |
2.86612990859791781788e-08, /* 0x3e5ec65b9f88c69e */ |
1691 |
2.23596477332056055352e-08, /* 0x3e58022c54cc2f99 */ |
1692 |
3.49498983167142274770e-08, /* 0x3e62c37a3a125330 */ |
1693 |
3.23392843005887000414e-08, /* 0x3e615cad69737c93 */ |
1694 |
1.35722380472479366661e-08, /* 0x3e4d256ab1b285e9 */ |
1695 |
2.56504325268044191098e-08, /* 0x3e5b8abcb97a7aa2 */ |
1696 |
5.81213608741512136843e-08, /* 0x3e6f34239659a5dc */ |
1697 |
5.59374849578288093334e-08, /* 0x3e6e07fd48d30177 */ |
1698 |
5.06615629004996189970e-08, /* 0x3e6b32df4799f4f6 */ |
1699 |
5.24588857848400955725e-08, /* 0x3e6c29e4f4f21cf8 */ |
1700 |
9.61968535632653505972e-10, /* 0x3e1086c848df1b59 */ |
1701 |
1.34829655346594463137e-08, /* 0x3e4cf456b4764130 */ |
1702 |
3.65557749306383026498e-08, /* 0x3e63a02ffcb63398 */ |
1703 |
3.33431709374069198903e-08, /* 0x3e61e6a6886b0976 */ |
1704 |
5.13008650536088382197e-08, /* 0x3e6b8abcb97a7aa2 */ |
1705 |
5.09285070380306053751e-08, /* 0x3e6b578f8aa35552 */ |
1706 |
3.20853940845502057341e-08, /* 0x3e6139c871afb9fc */ |
1707 |
4.06713248643004200446e-08, /* 0x3e65d5d30701ce64 */ |
1708 |
5.57028186706125221168e-08, /* 0x3e6de7bcb2d12142 */ |
1709 |
5.48356693724804282546e-08, /* 0x3e6d708e984e1664 */ |
1710 |
1.99407553679345001938e-08, /* 0x3e556945e9c72f36 */ |
1711 |
1.96585517245087232086e-09, /* 0x3e20e2f613e85bda */ |
1712 |
6.68649386072067321503e-09, /* 0x3e3cb7e0b42724f6 */ |
1713 |
5.89936034642113390002e-08, /* 0x3e6fac04e52846c7 */ |
1714 |
2.85038578721554472484e-08, /* 0x3e5e9b14aec442be */ |
1715 |
5.09746772910284482606e-08, /* 0x3e6b5de8034e7126 */ |
1716 |
5.54234668933210171467e-08, /* 0x3e6dc157e1b259d3 */ |
1717 |
6.29100830926604004874e-09, /* 0x3e3b05096ad69c62 */ |
1718 |
2.61974119468563937716e-08, /* 0x3e5c2116faba4cdd */ |
1719 |
4.16752115011186398935e-08, /* 0x3e665fcc25f95b47 */ |
1720 |
2.47747534460820790327e-08, /* 0x3e5a9a08498d4850 */ |
1721 |
5.56922172017964209793e-08, /* 0x3e6de647b1465f77 */ |
1722 |
2.76162876992552906035e-08, /* 0x3e5da71b7bf7861d */ |
1723 |
7.08169709942321478061e-09, /* 0x3e3e6a6886b09760 */ |
1724 |
5.77453510221151779025e-08, /* 0x3e6f0075eab0ef64 */ |
1725 |
4.43021445893361960146e-09, /* 0x3e33071282fb989b */ |
1726 |
3.15140984357495864573e-08, /* 0x3e60eb43c3f1bed2 */ |
1727 |
2.95077445089736670973e-08, /* 0x3e5faf06ecb35c84 */ |
1728 |
1.44098510263167149349e-08, /* 0x3e4ef1e63db35f68 */ |
1729 |
1.05196987538551827693e-08, /* 0x3e469743fb1a71a5 */ |
1730 |
5.23641361722697546261e-08, /* 0x3e6c1cdf404e5796 */ |
1731 |
7.72099925253243069458e-09, /* 0x3e4094aa0ada625e */ |
1732 |
5.62089493829364197156e-08, /* 0x3e6e2d4c96fde3ec */ |
1733 |
3.53090261098577946927e-08, /* 0x3e62f4d5e9a98f34 */ |
1734 |
3.80080516835568242269e-08, /* 0x3e6467c96ecc5cbe */ |
1735 |
5.66961038386146408282e-08, /* 0x3e6e7040d03dec5a */ |
1736 |
4.42287063097349852717e-08, /* 0x3e67bebf4282de36 */ |
1737 |
3.45294525105681104660e-08, /* 0x3e6289b11aeb783f */ |
1738 |
2.47132034530447431509e-08, /* 0x3e5a891d1772f538 */ |
1739 |
3.59655343422487209774e-08, /* 0x3e634f10be1fb591 */ |
1740 |
5.51581770357780862071e-08, /* 0x3e6d9ce1d316eb93 */ |
1741 |
3.60171867511861372793e-08, /* 0x3e63562a19a9c442 */ |
1742 |
1.94511067964296180547e-08, /* 0x3e54e2adf548084c */ |
1743 |
1.54137376631349347838e-08, /* 0x3e508ce55cc8c97a */ |
1744 |
3.93171034490174464173e-09, /* 0x3e30e2f613e85bda */ |
1745 |
5.52990607758839766440e-08, /* 0x3e6db03ebb0227bf */ |
1746 |
3.29990737637586136511e-08, /* 0x3e61b75bb09cb098 */ |
1747 |
1.18436010922446096216e-08, /* 0x3e496f16abb9df22 */ |
1748 |
4.04248680368301346709e-08, /* 0x3e65b3f399411c62 */ |
1749 |
2.27418915900284316293e-08, /* 0x3e586b3e59f65355 */ |
1750 |
1.70263791333409206020e-08, /* 0x3e52482ceae1ac12 */ |
1751 |
5.76999904754328540596e-08}; /* 0x3e6efa39ef35793c */ |
1752 |
|
1753 |
/* Approximating polynomial coefficients for x near 1.0 */ |
1754 |
static const double |
1755 |
ca_1 = 8.33333333333317923934e-02, /* 0x3fb55555555554e6 */ |
1756 |
ca_2 = 1.25000000037717509602e-02, /* 0x3f89999999bac6d4 */ |
1757 |
ca_3 = 2.23213998791944806202e-03, /* 0x3f62492307f1519f */ |
1758 |
ca_4 = 4.34887777707614552256e-04; /* 0x3f3c8034c85dfff0 */ |
1759 |
|
1760 |
/* Approximating polynomial coefficients for other x */ |
1761 |
static const double |
1762 |
cb_1 = 8.33333333333333593622e-02, /* 0x3fb5555555555557 */ |
1763 |
cb_2 = 1.24999999978138668903e-02, /* 0x3f89999999865ede */ |
1764 |
cb_3 = 2.23219810758559851206e-03; /* 0x3f6249423bd94741 */ |
1765 |
|
1766 |
static const unsigned long |
1767 |
log_thresh1 = 0x3fee0faa00000000, |
1768 |
log_thresh2 = 0x3ff1082c00000000; |
1769 |
|
1770 |
/* log_thresh1 = 9.39412117004394531250e-1 = 0x3fee0faa00000000 |
1771 |
log_thresh2 = 1.06449508666992187500 = 0x3ff1082c00000000 */ |
1772 |
if (ux >= log_thresh1 && ux <= log_thresh2) |
1773 |
{ |
1774 |
/* Arguments close to 1.0 are handled separately to maintain |
1775 |
accuracy. |
1776 |
|
1777 |
The approximation in this region exploits the identity |
1778 |
log( 1 + r ) = log( 1 + u/2 ) / log( 1 - u/2 ), where |
1779 |
u = 2r / (2+r). |
1780 |
Note that the right hand side has an odd Taylor series expansion |
1781 |
which converges much faster than the Taylor series expansion of |
1782 |
log( 1 + r ) in r. Thus, we approximate log( 1 + r ) by |
1783 |
u + A1 * u^3 + A2 * u^5 + ... + An * u^(2n+1). |
1784 |
|
1785 |
One subtlety is that since u cannot be calculated from |
1786 |
r exactly, the rounding error in the first u should be |
1787 |
avoided if possible. To accomplish this, we observe that |
1788 |
u = r - r*r/(2+r). |
1789 |
Since x (=1+r) is the input argument, and thus presumed exact, |
1790 |
the formula above approximates u accurately because |
1791 |
u = r - correction, |
1792 |
and the magnitude of "correction" (of the order of r*r) |
1793 |
is small. |
1794 |
With these observations, we will approximate log( 1 + r ) by |
1795 |
r + ( (A1*u^3 + ... + An*u^(2n+1)) - correction ). |
1796 |
|
1797 |
We approximate log(1+r) by an odd polynomial in u, where |
1798 |
u = 2r/(2+r) = r - r*r/(2+r). |
1799 |
*/ |
1800 |
r = x - 1.0; |
1801 |
u = r / (2.0 + r); |
1802 |
correction = r * u; |
1803 |
u = u + u; |
1804 |
v = u * u; |
1805 |
z1 = r; |
1806 |
z2 = (u * v * (ca_1 + v * (ca_2 + v * (ca_3 + v * ca_4))) - correction); |
1807 |
*r1 = z1; |
1808 |
*r2 = z2; |
1809 |
*xexp = 0; |
1810 |
} |
1811 |
else |
1812 |
{ |
1813 |
/* |
1814 |
First, we decompose the argument x to the form |
1815 |
x = 2**M * (F1 + F2), |
1816 |
where 1 <= F1+F2 < 2, M has the value of an integer, |
1817 |
F1 = 1 + j/64, j ranges from 0 to 64, and |F2| <= 1/128. |
1818 |
|
1819 |
Second, we approximate log( 1 + F2/F1 ) by an odd polynomial |
1820 |
in U, where U = 2 F2 / (2 F2 + F1). |
1821 |
Note that log( 1 + F2/F1 ) = log( 1 + U/2 ) - log( 1 - U/2 ). |
1822 |
The core approximation calculates |
1823 |
Poly = [log( 1 + U/2 ) - log( 1 - U/2 )]/U - 1. |
1824 |
Note that log(1 + U/2) - log(1 - U/2) = 2 arctanh ( U/2 ), |
1825 |
thus, Poly = 2 arctanh( U/2 ) / U - 1. |
1826 |
|
1827 |
It is not hard to see that |
1828 |
log(x) = M*log(2) + log(F1) + log( 1 + F2/F1 ). |
1829 |
Hence, we return Z1 = log(F1), and Z2 = log( 1 + F2/F1). |
1830 |
The values of log(F1) are calculated beforehand and stored |
1831 |
in the program. |
1832 |
*/ |
1833 |
|
1834 |
f = x; |
1835 |
if (ux < IMPBIT_DP64) |
1836 |
{ |
1837 |
/* The input argument x is denormalized */ |
1838 |
/* Normalize f by increasing the exponent by 60 |
1839 |
and subtracting a correction to account for the implicit |
1840 |
bit. This replaces a slow denormalized |
1841 |
multiplication by a fast normal subtraction. */ |
1842 |
static const double corr = 2.5653355008114851558350183e-290; /* 0x03d0000000000000 */ |
1843 |
GET_BITS_DP64(f, ux); |
1844 |
ux |= 0x03d0000000000000; |
1845 |
PUT_BITS_DP64(ux, f); |
1846 |
f -= corr; |
1847 |
GET_BITS_DP64(f, ux); |
1848 |
expadjust = 60; |
1849 |
} |
1850 |
else |
1851 |
expadjust = 0; |
1852 |
|
1853 |
/* Store the exponent of x in xexp and put |
1854 |
f into the range [0.5,1) */ |
1855 |
*xexp = (int)((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64 - expadjust; |
1856 |
PUT_BITS_DP64((ux & MANTBITS_DP64) | HALFEXPBITS_DP64, f); |
1857 |
|
1858 |
/* Now x = 2**xexp * f, 1/2 <= f < 1. */ |
1859 |
|
1860 |
/* Set index to be the nearest integer to 128*f */ |
1861 |
r = 128.0 * f; |
1862 |
index = (int)(r + 0.5); |
1863 |
|
1864 |
z1 = ln_lead_table[index-64]; |
1865 |
q = ln_tail_table[index-64]; |
1866 |
f1 = index * 0.0078125; /* 0.0078125 = 1/128 */ |
1867 |
f2 = f - f1; |
1868 |
/* At this point, x = 2**xexp * ( f1 + f2 ) where |
1869 |
f1 = j/128, j = 64, 65, ..., 128 and |f2| <= 1/256. */ |
1870 |
|
1871 |
/* Calculate u = 2 f2 / ( 2 f1 + f2 ) = f2 / ( f1 + 0.5*f2 ) */ |
1872 |
/* u = f2 / (f1 + 0.5 * f2); */ |
1873 |
u = f2 / (f1 + 0.5 * f2); |
1874 |
|
1875 |
/* Here, |u| <= 2(exp(1/16)-1) / (exp(1/16)+1). |
1876 |
The core approximation calculates |
1877 |
poly = [log(1 + u/2) - log(1 - u/2)]/u - 1 */ |
1878 |
v = u * u; |
1879 |
poly = (v * (cb_1 + v * (cb_2 + v * cb_3))); |
1880 |
z2 = q + (u + u * poly); |
1881 |
*r1 = z1; |
1882 |
*r2 = z2; |
1883 |
} |
1884 |
return; |
1885 |
} |
1886 |
#endif /* USE_LOG_KERNEL_AMD */ |
1887 |
|
1888 |
#if defined(USE_REMAINDER_PIBY2F_INLINE) |
1889 |
/* Define this to get debugging print statements activated */ |
1890 |
#define DEBUGGING_PRINT |
1891 |
#undef DEBUGGING_PRINT |
1892 |
|
1893 |
|
1894 |
#ifdef DEBUGGING_PRINT |
1895 |
#include <stdio.h> |
1896 |
char *d2b(long d, int bitsper, int point) |
1897 |
{ |
1898 |
static char buff[200]; |
1899 |
int i, j; |
1900 |
j = bitsper; |
1901 |
if (point >= 0 && point <= bitsper) |
1902 |
j++; |
1903 |
buff[j] = '\0'; |
1904 |
for (i = bitsper - 1; i >= 0; i--) |
1905 |
{ |
1906 |
j--; |
1907 |
if (d % 2 == 1) |
1908 |
buff[j] = '1'; |
1909 |
else |
1910 |
buff[j] = '0'; |
1911 |
if (i == point) |
1912 |
{ |
1913 |
j--; |
1914 |
buff[j] = '.'; |
1915 |
} |
1916 |
d /= 2; |
1917 |
} |
1918 |
return buff; |
1919 |
} |
1920 |
#endif |
1921 |
|
1922 |
/* Given positive argument x, reduce it to the range [-pi/4,pi/4] using |
1923 |
extra precision, and return the result in r. |
1924 |
Return value "region" tells how many lots of pi/2 were subtracted |
1925 |
from x to put it in the range [-pi/4,pi/4], mod 4. */ |
1926 |
static inline void __remainder_piby2f_inline(double x, unsigned long ux, double *r, int *region) |
1927 |
{ |
1928 |
|
1929 |
/* eleven_piby4 is the closest machine number BELOW 11*pi/4 */ |
1930 |
static const double |
1931 |
eleven_piby4 = 8.6393797973719301808159e+00; /* 0x4021475cc9eedf00 */ |
1932 |
|
1933 |
static const double |
1934 |
piby2 = 1.57079632679489655800e+00, /* 0x3ff921fb54442d18 */ |
1935 |
twobypi = 6.36619772367581382433e-01, /* 0x3fe45f306dc9c883 */ |
1936 |
pi = 3.14159265358979311600e+00, /* 0x400921fb54442d18 */ |
1937 |
three_piby2 = 4.71238898038468967400e+00, /* 0x4012d97c7f3321d2 */ |
1938 |
two_pi = 6.28318530717958623200e+00, /* 0x401921fb54442d18 */ |
1939 |
five_piby2 = 7.85398163397448278999e+00; /* 0x401f6a7a2955385e */ |
1940 |
|
1941 |
/* Each of these threshold values is the closest machine |
1942 |
number BELOW a multiple of pi/4, i.e. they are not |
1943 |
rounded to nearest. thresh1 is 1*pi/4, thresh3 is 3*pi/4, etc. |
1944 |
This ensures that we end up in precisely the correct region. */ |
1945 |
static const double |
1946 |
thresh1 = 7.8539816339744827899949e-01, /* 0x3fe921fb54442d18 */ |
1947 |
thresh3 = 2.3561944901923448369984e+00, /* 0x4002d97c7f3321d2 */ |
1948 |
thresh5 = 3.9269908169872413949974e+00, /* 0x400f6a7a2955385e */ |
1949 |
thresh7 = 5.4977871437821379529964e+00, /* 0x4015fdbbe9bba775 */ |
1950 |
thresh9 = 7.0685834705770345109954e+00; /* 0x401c463abeccb2bb */ |
1951 |
|
1952 |
static const double cancellationThresh = 1.0e-5; |
1953 |
int done = 0; |
1954 |
|
1955 |
/* For small values of x, up to 11*pi/4, we do double precision |
1956 |
subtraction of the relevant multiple of pi/2 */ |
1957 |
if (x <= eleven_piby4) /* x <= 11*pi/4 */ |
1958 |
{ |
1959 |
double t, ctest; |
1960 |
|
1961 |
if (x <= thresh5) /* x < 5*pi/4 */ |
1962 |
{ |
1963 |
if (x <= thresh1) /* x < pi/4 */ |
1964 |
{ |
1965 |
/* Quick return if x is already less than pi/4 */ |
1966 |
*r = x; |
1967 |
*region = 0; |
1968 |
return; |
1969 |
} |
1970 |
else if (x <= thresh3) /* x < 3*pi/4 */ |
1971 |
{ |
1972 |
t = x - piby2; |
1973 |
*region = 1; |
1974 |
} |
1975 |
else /* x < 5*pi/4 */ |
1976 |
{ |
1977 |
t = x - pi; |
1978 |
*region = 2; |
1979 |
} |
1980 |
} |
1981 |
else |
1982 |
{ |
1983 |
if (x <= thresh7) /* x < 7*pi/4 */ |
1984 |
{ |
1985 |
t = x - three_piby2; |
1986 |
*region = 3; |
1987 |
} |
1988 |
else if (x <= thresh9) /* x < 9*pi/4 */ |
1989 |
{ |
1990 |
t = x - two_pi; |
1991 |
*region = 0; |
1992 |
} |
1993 |
else /* x < 11*pi/4 */ |
1994 |
{ |
1995 |
t = x - five_piby2; |
1996 |
*region = 1; |
1997 |
} |
1998 |
} |
1999 |
|
2000 |
/* Check for massive cancellation which may happen very close |
2001 |
to multiples of pi/2 */ |
2002 |
if (t < 0.0) |
2003 |
ctest = -t; |
2004 |
else |
2005 |
ctest = t; |
2006 |
#ifdef DEBUGGING_PRINT |
2007 |
printf("Cancellation threshold test = (%g > %g)\n", |
2008 |
ctest, cancellationThresh); |
2009 |
#endif |
2010 |
|
2011 |
/* Check if cancellation error was not too large */ |
2012 |
if (ctest > cancellationThresh) |
2013 |
{ |
2014 |
*r = t; |
2015 |
done = 1; |
2016 |
} |
2017 |
/* Otherwise fall through to the expensive method */ |
2018 |
} |
2019 |
else if (x <= 1.0e6) |
2020 |
{ |
2021 |
/* This range reduction is accurate enough for x up to |
2022 |
approximately 2**(20) except near multiples of pi/2 */ |
2023 |
|
2024 |
/* We perform double precision arithmetic to find the |
2025 |
nearest multiple of pi/2 to x */ |
2026 |
int reg; |
2027 |
double z, w, c, ctest; |
2028 |
|
2029 |
/* Multiply x by 2/pi in double precision, result in z */ |
2030 |
z = x * twobypi; |
2031 |
|
2032 |
#ifdef DEBUGGING_PRINT |
2033 |
printf("z = %30.20e = %s\n", z, double2hex(&z)); |
2034 |
#endif |
2035 |
|
2036 |
/* Find reg, the nearest integer to z */ |
2037 |
reg = (int)(z + 0.5); |
2038 |
|
2039 |
#ifdef DEBUGGING_PRINT |
2040 |
printf("reg = %d\n", reg); |
2041 |
#endif |
2042 |
|
2043 |
/* Subtract reg from z, result in w */ |
2044 |
w = z - reg; |
2045 |
|
2046 |
#ifdef DEBUGGING_PRINT |
2047 |
printf("w = %30.20e = %s\n", w, double2hex(&w)); |
2048 |
#endif |
2049 |
|
2050 |
/* Check for massive cancellation which may happen very close |
2051 |
to multiples of pi/2 */ |
2052 |
if (w < 0.0) |
2053 |
ctest = -w; |
2054 |
else |
2055 |
ctest = w; |
2056 |
|
2057 |
/* If cancellation is not too severe, continue with this method. |
2058 |
Otherwise we fall through to the expensive, accurate method */ |
2059 |
if (ctest > cancellationThresh) |
2060 |
{ |
2061 |
/* Multiply w by pi/2 */ |
2062 |
c = w * piby2; |
2063 |
*r = c; |
2064 |
*region = reg & 3; |
2065 |
|
2066 |
#ifdef DEBUGGING_PRINT |
2067 |
printf("r = %30.20e = %s\n", *r, double2hex(r)); |
2068 |
#endif |
2069 |
done = 1; |
2070 |
} |
2071 |
} |
2072 |
|
2073 |
if (!done) |
2074 |
{ |
2075 |
/* This method simulates multi-precision floating-point |
2076 |
arithmetic and is accurate for all 1 <= x < infinity */ |
2077 |
#if 0 |
2078 |
const int bitsper = 36; |
2079 |
#else |
2080 |
#define bitsper 36 |
2081 |
#endif |
2082 |
unsigned long res[10]; |
2083 |
unsigned long u, carry, mask, mant, nextbits; |
2084 |
int first, last, i, rexp, xexp, resexp, ltb, determ, bc; |
2085 |
double dx; |
2086 |
static const double |
2087 |
piby2 = 1.57079632679489655800e+00; /* 0x3ff921fb54442d18 */ |
2088 |
static unsigned long pibits[] = |
2089 |
{ |
2090 |
0L, |
2091 |
5215L, 13000023176L, 11362338026L, 67174558139L, |
2092 |
34819822259L, 10612056195L, 67816420731L, 57840157550L, |
2093 |
19558516809L, 50025467026L, 25186875954L, 18152700886L |
2094 |
}; |
2095 |
|
2096 |
#ifdef DEBUGGING_PRINT |
2097 |
printf("On entry, x = %25.20e = %s\n", x, double2hex(&x)); |
2098 |
#endif |
2099 |
|
2100 |
xexp = (int)(((ux & EXPBITS_DP64) >> EXPSHIFTBITS_DP64) - EXPBIAS_DP64); |
2101 |
ux = ((ux & MANTBITS_DP64) | IMPBIT_DP64) >> 29; |
2102 |
|
2103 |
#ifdef DEBUGGING_PRINT |
2104 |
printf("ux = %s\n", d2b(ux, 64, -1)); |
2105 |
#endif |
2106 |
|
2107 |
/* Now ux is the mantissa bit pattern of x as a long integer */ |
2108 |
mask = (1L << bitsper) - 1; |
2109 |
|
2110 |
/* Set first and last to the positions of the first |
2111 |
and last chunks of 2/pi that we need */ |
2112 |
first = xexp / bitsper; |
2113 |
resexp = xexp - first * bitsper; |
2114 |
/* 120 is the theoretical maximum number of bits (actually |
2115 |
115 for IEEE single precision) that we need to extract |
2116 |
from the middle of 2/pi to compute the reduced argument |
2117 |
accurately enough for our purposes */ |
2118 |
last = first + 120 / bitsper; |
2119 |
|
2120 |
#ifdef DEBUGGING_PRINT |
2121 |
printf("first = %d, last = %d\n", first, last); |
2122 |
#endif |
2123 |
|
2124 |
/* Do a long multiplication of the bits of 2/pi by the |
2125 |
integer mantissa */ |
2126 |
#if 0 |
2127 |
for (i = last; i >= first; i--) |
2128 |
{ |
2129 |
u = pibits[i] * ux + carry; |
2130 |
res[i - first] = u & mask; |
2131 |
carry = u >> bitsper; |
2132 |
} |
2133 |
res[last - first + 1] = 0; |
2134 |
#else |
2135 |
/* Unroll the loop. This is only correct because we know |
2136 |
that bitsper is fixed as 36. */ |
2137 |
res[4] = 0; |
2138 |
u = pibits[last] * ux; |
2139 |
res[3] = u & mask; |
2140 |
carry = u >> bitsper; |
2141 |
u = pibits[last - 1] * ux + carry; |
2142 |
res[2] = u & mask; |
2143 |
carry = u >> bitsper; |
2144 |
u = pibits[last - 2] * ux + carry; |
2145 |
res[1] = u & mask; |
2146 |
carry = u >> bitsper; |
2147 |
u = pibits[first] * ux + carry; |
2148 |
res[0] = u & mask; |
2149 |
#endif |
2150 |
|
2151 |
#ifdef DEBUGGING_PRINT |
2152 |
printf("resexp = %d\n", resexp); |
2153 |
printf("Significant part of x * 2/pi with binary" |
2154 |
" point in correct place:\n"); |
2155 |
for (i = 0; i <= last - first; i++) |
2156 |
{ |
2157 |
if (i > 0 && i % 5 == 0) |
2158 |
printf("\n "); |
2159 |
if (i == 1) |
2160 |
printf("%s ", d2b(res[i], bitsper, resexp)); |
2161 |
else |
2162 |
printf("%s ", d2b(res[i], bitsper, -1)); |
2163 |
} |
2164 |
printf("\n"); |
2165 |
#endif |
2166 |
|
2167 |
/* Reconstruct the result */ |
2168 |
ltb = (int)((((res[0] << bitsper) | res[1]) |
2169 |
>> (bitsper - 1 - resexp)) & 7); |
2170 |
|
2171 |
/* determ says whether the fractional part is >= 0.5 */ |
2172 |
determ = ltb & 1; |
2173 |
|
2174 |
#ifdef DEBUGGING_PRINT |
2175 |
printf("ltb = %d (last two bits before binary point" |
2176 |
" and first bit after)\n", ltb); |
2177 |
printf("determ = %d (1 means need to negate because the fractional\n" |
2178 |
" part of x * 2/pi is greater than 0.5)\n", determ); |
2179 |
#endif |
2180 |
|
2181 |
i = 1; |
2182 |
if (determ) |
2183 |
{ |
2184 |
/* The mantissa is >= 0.5. We want to subtract it |
2185 |
from 1.0 by negating all the bits */ |
2186 |
*region = ((ltb >> 1) + 1) & 3; |
2187 |
mant = ~(res[1]) & ((1L << (bitsper - resexp)) - 1); |
2188 |
while (mant < 0x0000000000010000) |
2189 |
{ |
2190 |
i++; |
2191 |
mant = (mant << bitsper) | (~(res[i]) & mask); |
2192 |
} |
2193 |
nextbits = (~(res[i+1]) & mask); |
2194 |
} |
2195 |
else |
2196 |
{ |
2197 |
*region = (ltb >> 1); |
2198 |
mant = res[1] & ((1L << (bitsper - resexp)) - 1); |
2199 |
while (mant < 0x0000000000010000) |
2200 |
{ |
2201 |
i++; |
2202 |
mant = (mant << bitsper) | res[i]; |
2203 |
} |
2204 |
nextbits = res[i+1]; |
2205 |
} |
2206 |
|
2207 |
#ifdef DEBUGGING_PRINT |
2208 |
printf("First bits of mant = %s\n", d2b(mant, bitsper, -1)); |
2209 |
#endif |
2210 |
|
2211 |
/* Normalize the mantissa. The shift value 6 here, determined by |
2212 |
trial and error, seems to give optimal speed. */ |
2213 |
bc = 0; |
2214 |
while (mant < 0x0000400000000000) |
2215 |
{ |
2216 |
bc += 6; |
2217 |
mant <<= 6; |
2218 |
} |
2219 |
while (mant < 0x0010000000000000) |
2220 |
{ |
2221 |
bc++; |
2222 |
mant <<= 1; |
2223 |
} |
2224 |
mant |= nextbits >> (bitsper - bc); |
2225 |
|
2226 |
rexp = 52 + resexp - bc - i * bitsper; |
2227 |
|
2228 |
#ifdef DEBUGGING_PRINT |
2229 |
printf("Normalised mantissa = 0x%016lx\n", mant); |
2230 |
printf("Exponent to be inserted on mantissa = rexp = %d\n", rexp); |
2231 |
#endif |
2232 |
|
2233 |
/* Put the result exponent rexp onto the mantissa pattern */ |
2234 |
u = ((unsigned long)rexp + EXPBIAS_DP64) << EXPSHIFTBITS_DP64; |
2235 |
ux = (mant & MANTBITS_DP64) | u; |
2236 |
if (determ) |
2237 |
/* If we negated the mantissa we negate x too */ |
2238 |
ux |= SIGNBIT_DP64; |
2239 |
PUT_BITS_DP64(ux, dx); |
2240 |
|
2241 |
#ifdef DEBUGGING_PRINT |
2242 |
printf("(x*2/pi) = %25.20e = %s\n", dx, double2hex(&dx)); |
2243 |
#endif |
2244 |
|
2245 |
/* x is a double precision version of the fractional part of |
2246 |
x * 2 / pi. Multiply x by pi/2 in double precision |
2247 |
to get the reduced argument r. */ |
2248 |
*r = dx * piby2; |
2249 |
|
2250 |
#ifdef DEBUGGING_PRINT |
2251 |
printf(" r = frac(x*2/pi) * pi/2:\n"); |
2252 |
printf(" r = %25.20e = %s\n", *r, double2hex(r)); |
2253 |
printf("region = (number of pi/2 subtracted from x) mod 4 = %d\n", |
2254 |
*region); |
2255 |
#endif |
2256 |
} |
2257 |
} |
2258 |
#endif /* USE_REMAINDER_PIBY2F_INLINE */ |
2259 |
|
2260 |
#endif /* LIBM_INLINES_AMD_H_INCLUDED */ |