Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 66605 | Differences between
and this patch

Collapse All | Expand All

(-)src/CryptoPP.cpp (-571 / +5 lines)
Lines 2534-2631 Link Here
2534
2534
2535
#ifdef CRYPTOPP_X86ASM_AVAILABLE
2535
#ifdef CRYPTOPP_X86ASM_AVAILABLE
2536
2536
2537
// ************** x86 feature detection ***************
2538
2539
static bool s_sse2Enabled = true;
2540
2541
static void CpuId(word32 input, word32 *output)
2542
{
2543
#ifdef __GNUC__
2544
	__asm__
2545
	(
2546
		// save ebx in case -fPIC is being used
2547
		"push %%ebx; cpuid; mov %%ebx, %%edi; pop %%ebx"
2548
		: "=a" (output[0]), "=D" (output[1]), "=c" (output[2]), "=d" (output[3])
2549
		: "a" (input)
2550
	);
2551
#else
2552
	__asm
2553
	{
2554
		mov eax, input
2555
		cpuid
2556
		mov edi, output
2557
		mov [edi], eax
2558
		mov [edi+4], ebx
2559
		mov [edi+8], ecx
2560
		mov [edi+12], edx
2561
	}
2562
#endif
2563
}
2564
2565
#ifdef SSE2_INTRINSICS_AVAILABLE
2566
#ifndef _MSC_VER
2567
static jmp_buf s_env;
2568
static void SigIllHandler(int)
2569
{
2570
	longjmp(s_env, 1);
2571
}
2572
#endif
2573
2574
static bool HasSSE2()
2575
{
2576
	if (!s_sse2Enabled)
2577
		return false;
2578
2579
	word32 cpuid[4];
2580
	CpuId(1, cpuid);
2581
	if ((cpuid[3] & (1 << 26)) == 0)
2582
		return false;
2583
2584
#ifdef _MSC_VER
2585
    __try
2586
	{
2587
        __asm xorpd xmm0, xmm0        // executing SSE2 instruction
2588
	}
2589
    __except (1)
2590
	{
2591
		return false;
2592
    }
2593
	return true;
2594
#else
2595
	typedef void (*SigHandler)(int);
2596
2597
	SigHandler oldHandler = signal(SIGILL, SigIllHandler);
2598
	if (oldHandler == SIG_ERR)
2599
		return false;
2600
2601
	bool result = true;
2602
	if (setjmp(s_env))
2603
		result = false;
2604
	else
2605
		__asm __volatile ("xorps %xmm0, %xmm0");
2606
2607
	signal(SIGILL, oldHandler);
2608
	return result;
2609
#endif
2610
2611
}
2612
#endif
2613
2614
static bool IsP4()
2615
{
2616
	word32 cpuid[4];
2617
2618
	CpuId(0, cpuid);
2619
	std::swap(cpuid[2], cpuid[3]);
2620
	if (memcmp(cpuid+1, "GenuineIntel", 12) != 0)
2621
		return false;
2622
2623
	CpuId(1, cpuid);
2624
	return ((cpuid[0] >> 8) & 0xf) == 0xf;
2625
2626
}
2627
2628
2629
// ************** Pentium/P4 optimizations ***************
2537
// ************** Pentium/P4 optimizations ***************
2630
2538
2631
class PentiumOptimized : public Portable
2539
class PentiumOptimized : public Portable
Lines 2638-2655 Link Here
2638
	static void CRYPTOPP_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
2546
	static void CRYPTOPP_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
2639
};
2547
};
2640
2548
2641
class P4Optimized
2642
{
2643
public:
2644
	static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
2645
	static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
2646
#ifdef SSE2_INTRINSICS_AVAILABLE
2647
	static void CRYPTOPP_CDECL Multiply4(word *C, const word *A, const word *B);
2648
	static void CRYPTOPP_CDECL Multiply8(word *C, const word *A, const word *B);
2649
	static void CRYPTOPP_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
2650
#endif
2651
};
2652
2653
typedef word (CRYPTOPP_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
2549
typedef word (CRYPTOPP_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
2654
typedef void (CRYPTOPP_CDECL * PMul)(word *C, const word *A, const word *B);
2550
typedef void (CRYPTOPP_CDECL * PMul)(word *C, const word *A, const word *B);
2655
2551
Lines 2660-2700 Link Here
2660
2556
2661
static void SetPentiumFunctionPointers()
2557
static void SetPentiumFunctionPointers()
2662
{
2558
{
2663
	if (IsP4())
2559
	s_pAdd = &PentiumOptimized::Add;
2664
	{
2560
	s_pSub = &PentiumOptimized::Subtract;
2665
		s_pAdd = &P4Optimized::Add;
2666
		s_pSub = &P4Optimized::Subtract;
2667
	}
2668
	else
2669
	{
2670
		s_pAdd = &PentiumOptimized::Add;
2671
		s_pSub = &PentiumOptimized::Subtract;
2672
	}
2673
2561
2674
#ifdef SSE2_INTRINSICS_AVAILABLE
2562
#ifdef SSE2_INTRINSICS_AVAILABLE
2675
	if (HasSSE2())
2563
	s_pMul4 = &PentiumOptimized::Multiply4;
2676
	{
2564
	s_pMul8 = &PentiumOptimized::Multiply8;
2677
		s_pMul4 = &P4Optimized::Multiply4;
2565
	s_pMul8B = &PentiumOptimized::Multiply8Bottom;
2678
		s_pMul8 = &P4Optimized::Multiply8;
2679
		s_pMul8B = &P4Optimized::Multiply8Bottom;
2680
	}
2681
	else
2682
	{
2683
		s_pMul4 = &PentiumOptimized::Multiply4;
2684
		s_pMul8 = &PentiumOptimized::Multiply8;
2685
		s_pMul8B = &PentiumOptimized::Multiply8Bottom;
2686
	}
2687
#endif
2566
#endif
2688
}
2567
}
2689
2568
2690
static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);
2569
static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);
2691
2570
2692
void DisableSSE2()
2693
{
2694
	s_sse2Enabled = false;
2695
	SetPentiumFunctionPointers();
2696
}
2697
2698
class LowLevel : public PentiumOptimized
2571
class LowLevel : public PentiumOptimized
2699
{
2572
{
2700
public:
2573
public:
Lines 2862-2963 Link Here
2862
	AddEpilogue
2735
	AddEpilogue
2863
}
2736
}
2864
2737
2865
// On Pentium 4, the adc and sbb instructions are very expensive, so avoid them.
2866
2867
CRYPTOPP_NAKED word P4Optimized::Add(word *C, const word *A, const word *B, unsigned int N)
2868
{
2869
	AddPrologue
2870
2871
	// now: ebx = B, ecx = C, edx = A, esi = N
2872
	AS2(	xor		eax, eax)
2873
	AS1(	neg		esi)
2874
	AS1(	jz		loopendAddP4)		// if no dwords then nothing to do
2875
2876
	AS2(	mov		edi, [edx])
2877
	AS2(	mov		ebp, [ebx])
2878
	AS1(	jmp		carry1AddP4)
2879
2880
	AS1(loopstartAddP4:)
2881
	AS2(	mov		edi, [edx+8])
2882
	AS2(	add		ecx, 8)
2883
	AS2(	add		edx, 8)
2884
	AS2(	mov		ebp, [ebx])
2885
	AS2(	add		edi, eax)
2886
	AS1(	jc		carry1AddP4)
2887
	AS2(	xor		eax, eax)
2888
2889
	AS1(carry1AddP4:)
2890
	AS2(	add		edi, ebp)
2891
	AS2(	mov		ebp, 1)
2892
	AS2(	mov		[ecx], edi)
2893
	AS2(	mov		edi, [edx+4])
2894
	AS2(	cmovc	eax, ebp)
2895
	AS2(	mov		ebp, [ebx+4])
2896
	AS2(	add		ebx, 8)
2897
	AS2(	add		edi, eax)
2898
	AS1(	jc		carry2AddP4)
2899
	AS2(	xor		eax, eax)
2900
2901
	AS1(carry2AddP4:)
2902
	AS2(	add		edi, ebp)
2903
	AS2(	mov		ebp, 1)
2904
	AS2(	cmovc	eax, ebp)
2905
	AS2(	mov		[ecx+4], edi)
2906
	AS2(	add		esi, 2)
2907
	AS1(	jnz		loopstartAddP4)
2908
2909
	AS1(loopendAddP4:)
2910
2911
	AddEpilogue
2912
}
2913
2914
CRYPTOPP_NAKED word P4Optimized::Subtract(word *C, const word *A, const word *B, unsigned int N)
2915
{
2916
	AddPrologue
2917
2918
	// now: ebx = B, ecx = C, edx = A, esi = N
2919
	AS2(	xor		eax, eax)
2920
	AS1(	neg		esi)
2921
	AS1(	jz		loopendSubP4)		// if no dwords then nothing to do
2922
2923
	AS2(	mov		edi, [edx])
2924
	AS2(	mov		ebp, [ebx])
2925
	AS1(	jmp		carry1SubP4)
2926
2927
	AS1(loopstartSubP4:)
2928
	AS2(	mov		edi, [edx+8])
2929
	AS2(	add		edx, 8)
2930
	AS2(	add		ecx, 8)
2931
	AS2(	mov		ebp, [ebx])
2932
	AS2(	sub		edi, eax)
2933
	AS1(	jc		carry1SubP4)
2934
	AS2(	xor		eax, eax)
2935
2936
	AS1(carry1SubP4:)
2937
	AS2(	sub		edi, ebp)
2938
	AS2(	mov		ebp, 1)
2939
	AS2(	mov		[ecx], edi)
2940
	AS2(	mov		edi, [edx+4])
2941
	AS2(	cmovc	eax, ebp)
2942
	AS2(	mov		ebp, [ebx+4])
2943
	AS2(	add		ebx, 8)
2944
	AS2(	sub		edi, eax)
2945
	AS1(	jc		carry2SubP4)
2946
	AS2(	xor		eax, eax)
2947
2948
	AS1(carry2SubP4:)
2949
	AS2(	sub		edi, ebp)
2950
	AS2(	mov		ebp, 1)
2951
	AS2(	cmovc	eax, ebp)
2952
	AS2(	mov		[ecx+4], edi)
2953
	AS2(	add		esi, 2)
2954
	AS1(	jnz		loopstartSubP4)
2955
2956
	AS1(loopendSubP4:)
2957
2958
	AddEpilogue
2959
}
2960
2961
// multiply assembly code originally contributed by Leonard Janke
2738
// multiply assembly code originally contributed by Leonard Janke
2962
2739
2963
#define MulStartup \
2740
#define MulStartup \
Lines 3293-3625 Link Here
3293
	C[5] = _mm_add_epi64(a3b2, a2b3);
3070
	C[5] = _mm_add_epi64(a3b2, a2b3);
3294
}
3071
}
3295
3072
3296
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
3297
{
3298
	__m128i temp[7];
3299
	const word *w = (word *)temp;
3300
	const __m64 *mw = (__m64 *)w;
3301
3302
	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
3303
3304
	C[0] = w[0];
3305
3306
	__m64 s1, s2;
3307
3308
	__m64 w1 = _mm_cvtsi32_si64(w[1]);
3309
	__m64 w4 = mw[2];
3310
	__m64 w6 = mw[3];
3311
	__m64 w8 = mw[4];
3312
	__m64 w10 = mw[5];
3313
	__m64 w12 = mw[6];
3314
	__m64 w14 = mw[7];
3315
	__m64 w16 = mw[8];
3316
	__m64 w18 = mw[9];
3317
	__m64 w20 = mw[10];
3318
	__m64 w22 = mw[11];
3319
	__m64 w26 = _mm_cvtsi32_si64(w[26]);
3320
3321
	s1 = _mm_add_si64(w1, w4);
3322
	C[1] = _mm_cvtsi64_si32(s1);
3323
	s1 = _mm_srli_si64(s1, 32);
3324
3325
	s2 = _mm_add_si64(w6, w8);
3326
	s1 = _mm_add_si64(s1, s2);
3327
	C[2] = _mm_cvtsi64_si32(s1);
3328
	s1 = _mm_srli_si64(s1, 32);
3329
3330
	s2 = _mm_add_si64(w10, w12);
3331
	s1 = _mm_add_si64(s1, s2);
3332
	C[3] = _mm_cvtsi64_si32(s1);
3333
	s1 = _mm_srli_si64(s1, 32);
3334
3335
	s2 = _mm_add_si64(w14, w16);
3336
	s1 = _mm_add_si64(s1, s2);
3337
	C[4] = _mm_cvtsi64_si32(s1);
3338
	s1 = _mm_srli_si64(s1, 32);
3339
3340
	s2 = _mm_add_si64(w18, w20);
3341
	s1 = _mm_add_si64(s1, s2);
3342
	C[5] = _mm_cvtsi64_si32(s1);
3343
	s1 = _mm_srli_si64(s1, 32);
3344
3345
	s2 = _mm_add_si64(w22, w26);
3346
	s1 = _mm_add_si64(s1, s2);
3347
	C[6] = _mm_cvtsi64_si32(s1);
3348
	s1 = _mm_srli_si64(s1, 32);
3349
3350
	C[7] = _mm_cvtsi64_si32(s1) + w[27];
3351
	_mm_empty();
3352
}
3353
3354
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
3355
{
3356
	__m128i temp[28];
3357
	const word *w = (word *)temp;
3358
	const __m64 *mw = (__m64 *)w;
3359
	const word *x = (word *)temp+7*4;
3360
	const __m64 *mx = (__m64 *)x;
3361
	const word *y = (word *)temp+7*4*2;
3362
	const __m64 *my = (__m64 *)y;
3363
	const word *z = (word *)temp+7*4*3;
3364
	const __m64 *mz = (__m64 *)z;
3365
3366
	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
3367
3368
	P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
3369
3370
	P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
3371
3372
	P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
3373
3374
	C[0] = w[0];
3375
3376
	__m64 s1, s2, s3, s4;
3377
3378
	__m64 w1 = _mm_cvtsi32_si64(w[1]);
3379
	__m64 w4 = mw[2];
3380
	__m64 w6 = mw[3];
3381
	__m64 w8 = mw[4];
3382
	__m64 w10 = mw[5];
3383
	__m64 w12 = mw[6];
3384
	__m64 w14 = mw[7];
3385
	__m64 w16 = mw[8];
3386
	__m64 w18 = mw[9];
3387
	__m64 w20 = mw[10];
3388
	__m64 w22 = mw[11];
3389
	__m64 w26 = _mm_cvtsi32_si64(w[26]);
3390
	__m64 w27 = _mm_cvtsi32_si64(w[27]);
3391
3392
	__m64 x0 = _mm_cvtsi32_si64(x[0]);
3393
	__m64 x1 = _mm_cvtsi32_si64(x[1]);
3394
	__m64 x4 = mx[2];
3395
	__m64 x6 = mx[3];
3396
	__m64 x8 = mx[4];
3397
	__m64 x10 = mx[5];
3398
	__m64 x12 = mx[6];
3399
	__m64 x14 = mx[7];
3400
	__m64 x16 = mx[8];
3401
	__m64 x18 = mx[9];
3402
	__m64 x20 = mx[10];
3403
	__m64 x22 = mx[11];
3404
	__m64 x26 = _mm_cvtsi32_si64(x[26]);
3405
	__m64 x27 = _mm_cvtsi32_si64(x[27]);
3406
3407
	__m64 y0 = _mm_cvtsi32_si64(y[0]);
3408
	__m64 y1 = _mm_cvtsi32_si64(y[1]);
3409
	__m64 y4 = my[2];
3410
	__m64 y6 = my[3];
3411
	__m64 y8 = my[4];
3412
	__m64 y10 = my[5];
3413
	__m64 y12 = my[6];
3414
	__m64 y14 = my[7];
3415
	__m64 y16 = my[8];
3416
	__m64 y18 = my[9];
3417
	__m64 y20 = my[10];
3418
	__m64 y22 = my[11];
3419
	__m64 y26 = _mm_cvtsi32_si64(y[26]);
3420
	__m64 y27 = _mm_cvtsi32_si64(y[27]);
3421
3422
	__m64 z0 = _mm_cvtsi32_si64(z[0]);
3423
	__m64 z1 = _mm_cvtsi32_si64(z[1]);
3424
	__m64 z4 = mz[2];
3425
	__m64 z6 = mz[3];
3426
	__m64 z8 = mz[4];
3427
	__m64 z10 = mz[5];
3428
	__m64 z12 = mz[6];
3429
	__m64 z14 = mz[7];
3430
	__m64 z16 = mz[8];
3431
	__m64 z18 = mz[9];
3432
	__m64 z20 = mz[10];
3433
	__m64 z22 = mz[11];
3434
	__m64 z26 = _mm_cvtsi32_si64(z[26]);
3435
3436
	s1 = _mm_add_si64(w1, w4);
3437
	C[1] = _mm_cvtsi64_si32(s1);
3438
	s1 = _mm_srli_si64(s1, 32);
3439
3440
	s2 = _mm_add_si64(w6, w8);
3441
	s1 = _mm_add_si64(s1, s2);
3442
	C[2] = _mm_cvtsi64_si32(s1);
3443
	s1 = _mm_srli_si64(s1, 32);
3444
3445
	s2 = _mm_add_si64(w10, w12);
3446
	s1 = _mm_add_si64(s1, s2);
3447
	C[3] = _mm_cvtsi64_si32(s1);
3448
	s1 = _mm_srli_si64(s1, 32);
3449
3450
	s3 = _mm_add_si64(x0, y0);
3451
	s2 = _mm_add_si64(w14, w16);
3452
	s1 = _mm_add_si64(s1, s3);
3453
	s1 = _mm_add_si64(s1, s2);
3454
	C[4] = _mm_cvtsi64_si32(s1);
3455
	s1 = _mm_srli_si64(s1, 32);
3456
3457
	s3 = _mm_add_si64(x1, y1);
3458
	s4 = _mm_add_si64(x4, y4);
3459
	s1 = _mm_add_si64(s1, w18);
3460
	s3 = _mm_add_si64(s3, s4);
3461
	s1 = _mm_add_si64(s1, w20);
3462
	s1 = _mm_add_si64(s1, s3);
3463
	C[5] = _mm_cvtsi64_si32(s1);
3464
	s1 = _mm_srli_si64(s1, 32);
3465
3466
	s3 = _mm_add_si64(x6, y6);
3467
	s4 = _mm_add_si64(x8, y8);
3468
	s1 = _mm_add_si64(s1, w22);
3469
	s3 = _mm_add_si64(s3, s4);
3470
	s1 = _mm_add_si64(s1, w26);
3471
	s1 = _mm_add_si64(s1, s3);
3472
	C[6] = _mm_cvtsi64_si32(s1);
3473
	s1 = _mm_srli_si64(s1, 32);
3474
3475
	s3 = _mm_add_si64(x10, y10);
3476
	s4 = _mm_add_si64(x12, y12);
3477
	s1 = _mm_add_si64(s1, w27);
3478
	s3 = _mm_add_si64(s3, s4);
3479
	s1 = _mm_add_si64(s1, s3);
3480
	C[7] = _mm_cvtsi64_si32(s1);
3481
	s1 = _mm_srli_si64(s1, 32);
3482
3483
	s3 = _mm_add_si64(x14, y14);
3484
	s4 = _mm_add_si64(x16, y16);
3485
	s1 = _mm_add_si64(s1, z0);
3486
	s3 = _mm_add_si64(s3, s4);
3487
	s1 = _mm_add_si64(s1, s3);
3488
	C[8] = _mm_cvtsi64_si32(s1);
3489
	s1 = _mm_srli_si64(s1, 32);
3490
3491
	s3 = _mm_add_si64(x18, y18);
3492
	s4 = _mm_add_si64(x20, y20);
3493
	s1 = _mm_add_si64(s1, z1);
3494
	s3 = _mm_add_si64(s3, s4);
3495
	s1 = _mm_add_si64(s1, z4);
3496
	s1 = _mm_add_si64(s1, s3);
3497
	C[9] = _mm_cvtsi64_si32(s1);
3498
	s1 = _mm_srli_si64(s1, 32);
3499
3500
	s3 = _mm_add_si64(x22, y22);
3501
	s4 = _mm_add_si64(x26, y26);
3502
	s1 = _mm_add_si64(s1, z6);
3503
	s3 = _mm_add_si64(s3, s4);
3504
	s1 = _mm_add_si64(s1, z8);
3505
	s1 = _mm_add_si64(s1, s3);
3506
	C[10] = _mm_cvtsi64_si32(s1);
3507
	s1 = _mm_srli_si64(s1, 32);
3508
3509
	s3 = _mm_add_si64(x27, y27);
3510
	s1 = _mm_add_si64(s1, z10);
3511
	s1 = _mm_add_si64(s1, z12);
3512
	s1 = _mm_add_si64(s1, s3);
3513
	C[11] = _mm_cvtsi64_si32(s1);
3514
	s1 = _mm_srli_si64(s1, 32);
3515
3516
	s3 = _mm_add_si64(z14, z16);
3517
	s1 = _mm_add_si64(s1, s3);
3518
	C[12] = _mm_cvtsi64_si32(s1);
3519
	s1 = _mm_srli_si64(s1, 32);
3520
3521
	s3 = _mm_add_si64(z18, z20);
3522
	s1 = _mm_add_si64(s1, s3);
3523
	C[13] = _mm_cvtsi64_si32(s1);
3524
	s1 = _mm_srli_si64(s1, 32);
3525
3526
	s3 = _mm_add_si64(z22, z26);
3527
	s1 = _mm_add_si64(s1, s3);
3528
	C[14] = _mm_cvtsi64_si32(s1);
3529
	s1 = _mm_srli_si64(s1, 32);
3530
3531
	C[15] = z[27] + _mm_cvtsi64_si32(s1);
3532
	_mm_empty();
3533
}
3534
3535
void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
3536
{
3537
	__m128i temp[21];
3538
	const word *w = (word *)temp;
3539
	const __m64 *mw = (__m64 *)w;
3540
	const word *x = (word *)temp+7*4;
3541
	const __m64 *mx = (__m64 *)x;
3542
	const word *y = (word *)temp+7*4*2;
3543
	const __m64 *my = (__m64 *)y;
3544
3545
	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
3546
3547
	P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
3548
3549
	P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
3550
3551
	C[0] = w[0];
3552
3553
	__m64 s1, s2, s3, s4;
3554
3555
	__m64 w1 = _mm_cvtsi32_si64(w[1]);
3556
	__m64 w4 = mw[2];
3557
	__m64 w6 = mw[3];
3558
	__m64 w8 = mw[4];
3559
	__m64 w10 = mw[5];
3560
	__m64 w12 = mw[6];
3561
	__m64 w14 = mw[7];
3562
	__m64 w16 = mw[8];
3563
	__m64 w18 = mw[9];
3564
	__m64 w20 = mw[10];
3565
	__m64 w22 = mw[11];
3566
	__m64 w26 = _mm_cvtsi32_si64(w[26]);
3567
3568
	__m64 x0 = _mm_cvtsi32_si64(x[0]);
3569
	__m64 x1 = _mm_cvtsi32_si64(x[1]);
3570
	__m64 x4 = mx[2];
3571
	__m64 x6 = mx[3];
3572
	__m64 x8 = mx[4];
3573
3574
	__m64 y0 = _mm_cvtsi32_si64(y[0]);
3575
	__m64 y1 = _mm_cvtsi32_si64(y[1]);
3576
	__m64 y4 = my[2];
3577
	__m64 y6 = my[3];
3578
	__m64 y8 = my[4];
3579
3580
	s1 = _mm_add_si64(w1, w4);
3581
	C[1] = _mm_cvtsi64_si32(s1);
3582
	s1 = _mm_srli_si64(s1, 32);
3583
3584
	s2 = _mm_add_si64(w6, w8);
3585
	s1 = _mm_add_si64(s1, s2);
3586
	C[2] = _mm_cvtsi64_si32(s1);
3587
	s1 = _mm_srli_si64(s1, 32);
3588
3589
	s2 = _mm_add_si64(w10, w12);
3590
	s1 = _mm_add_si64(s1, s2);
3591
	C[3] = _mm_cvtsi64_si32(s1);
3592
	s1 = _mm_srli_si64(s1, 32);
3593
3594
	s3 = _mm_add_si64(x0, y0);
3595
	s2 = _mm_add_si64(w14, w16);
3596
	s1 = _mm_add_si64(s1, s3);
3597
	s1 = _mm_add_si64(s1, s2);
3598
	C[4] = _mm_cvtsi64_si32(s1);
3599
	s1 = _mm_srli_si64(s1, 32);
3600
3601
	s3 = _mm_add_si64(x1, y1);
3602
	s4 = _mm_add_si64(x4, y4);
3603
	s1 = _mm_add_si64(s1, w18);
3604
	s3 = _mm_add_si64(s3, s4);
3605
	s1 = _mm_add_si64(s1, w20);
3606
	s1 = _mm_add_si64(s1, s3);
3607
	C[5] = _mm_cvtsi64_si32(s1);
3608
	s1 = _mm_srli_si64(s1, 32);
3609
3610
	s3 = _mm_add_si64(x6, y6);
3611
	s4 = _mm_add_si64(x8, y8);
3612
	s1 = _mm_add_si64(s1, w22);
3613
	s3 = _mm_add_si64(s3, s4);
3614
	s1 = _mm_add_si64(s1, w26);
3615
	s1 = _mm_add_si64(s1, s3);
3616
	C[6] = _mm_cvtsi64_si32(s1);
3617
	s1 = _mm_srli_si64(s1, 32);
3618
3619
	C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
3620
	_mm_empty();
3621
}
3622
3623
#endif	// #ifdef SSE2_INTRINSICS_AVAILABLE
3073
#endif	// #ifdef SSE2_INTRINSICS_AVAILABLE
3624
3074
3625
// ********************************************************
3075
// ********************************************************
Lines 4064-4081 Link Here
4064
	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
3514
	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
4065
	Q[0] = q.GetLowHalf();
3515
	Q[0] = q.GetLowHalf();
4066
	Q[1] = q.GetHighHalf();
3516
	Q[1] = q.GetHighHalf();
4067
4068
#ifndef NDEBUG
4069
	if (B[0] || B[1])
4070
	{
4071
		// multiply quotient and divisor and add remainder, make sure it equals dividend
4072
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
4073
		word P[4];
4074
		Portable::Multiply2(P, Q, B);
4075
		Add(P, P, T, 4);
4076
		assert(memcmp(P, A, 4*WORD_SIZE)==0);
4077
	}
4078
#endif
4079
}
3517
}
4080
3518
4081
// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
3519
// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
Lines 9421-9430 Link Here
9421
8859
9422
//- #include "modes.h"
8860
//- #include "modes.h"
9423
8861
9424
#ifndef NDEBUG
9425
//- #include "des.h"
9426
#endif
9427
9428
NAMESPACE_BEGIN(CryptoPP)
8862
NAMESPACE_BEGIN(CryptoPP)
9429
8863
9430
void CipherModeBase::SetKey(const byte *key, unsigned int length, const NameValuePairs &params)
8864
void CipherModeBase::SetKey(const byte *key, unsigned int length, const NameValuePairs &params)

Return to bug 66605