Gentoo Websites Logo
Go to: Gentoo Home Documentation Forums Lists Bugs Planet Store Wiki Get Gentoo!
View | Details | Raw Unified | Return to bug 66605 | Differences between
and this patch

Collapse All | Expand All

(-)src/CryptoPP.cpp (-415 / +3 lines)
Lines 2562-2616 Link Here
2562
#endif
2562
#endif
2563
}
2563
}
2564
2564
2565
#ifdef SSE2_INTRINSICS_AVAILABLE
2566
#ifndef _MSC_VER
2567
static jmp_buf s_env;
2568
static void SigIllHandler(int)
2569
{
2570
	longjmp(s_env, 1);
2571
}
2572
#endif
2573
2574
static bool HasSSE2()
2575
{
2576
	if (!s_sse2Enabled)
2577
		return false;
2578
2579
	word32 cpuid[4];
2580
	CpuId(1, cpuid);
2581
	if ((cpuid[3] & (1 << 26)) == 0)
2582
		return false;
2583
2584
#ifdef _MSC_VER
2585
    __try
2586
	{
2587
        __asm xorpd xmm0, xmm0        // executing SSE2 instruction
2588
	}
2589
    __except (1)
2590
	{
2591
		return false;
2592
    }
2593
	return true;
2594
#else
2595
	typedef void (*SigHandler)(int);
2596
2597
	SigHandler oldHandler = signal(SIGILL, SigIllHandler);
2598
	if (oldHandler == SIG_ERR)
2599
		return false;
2600
2601
	bool result = true;
2602
	if (setjmp(s_env))
2603
		result = false;
2604
	else
2605
		__asm __volatile ("xorps %xmm0, %xmm0");
2606
2607
	signal(SIGILL, oldHandler);
2608
	return result;
2609
#endif
2610
2611
}
2612
#endif
2613
2614
static bool IsP4()
2565
static bool IsP4()
2615
{
2566
{
2616
	word32 cpuid[4];
2567
	word32 cpuid[4];
Lines 2643-2653 Link Here
2643
public:
2594
public:
2644
	static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
2595
	static word CRYPTOPP_CDECL Add(word *C, const word *A, const word *B, unsigned int N);
2645
	static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
2596
	static word CRYPTOPP_CDECL Subtract(word *C, const word *A, const word *B, unsigned int N);
2646
#ifdef SSE2_INTRINSICS_AVAILABLE
2647
	static void CRYPTOPP_CDECL Multiply4(word *C, const word *A, const word *B);
2648
	static void CRYPTOPP_CDECL Multiply8(word *C, const word *A, const word *B);
2649
	static void CRYPTOPP_CDECL Multiply8Bottom(word *C, const word *A, const word *B);
2650
#endif
2651
};
2597
};
2652
2598
2653
typedef word (CRYPTOPP_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
2599
typedef word (CRYPTOPP_CDECL * PAddSub)(word *C, const word *A, const word *B, unsigned int N);
Lines 2672-2700 Link Here
2672
	}
2618
	}
2673
2619
2674
#ifdef SSE2_INTRINSICS_AVAILABLE
2620
#ifdef SSE2_INTRINSICS_AVAILABLE
2675
	if (HasSSE2())
2621
	s_pMul4 = &PentiumOptimized::Multiply4;
2676
	{
2622
	s_pMul8 = &PentiumOptimized::Multiply8;
2677
		s_pMul4 = &P4Optimized::Multiply4;
2623
	s_pMul8B = &PentiumOptimized::Multiply8Bottom;
2678
		s_pMul8 = &P4Optimized::Multiply8;
2679
		s_pMul8B = &P4Optimized::Multiply8Bottom;
2680
	}
2681
	else
2682
	{
2683
		s_pMul4 = &PentiumOptimized::Multiply4;
2684
		s_pMul8 = &PentiumOptimized::Multiply8;
2685
		s_pMul8B = &PentiumOptimized::Multiply8Bottom;
2686
	}
2687
#endif
2624
#endif
2688
}
2625
}
2689
2626
2690
static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);
2627
static const char s_RunAtStartupSetPentiumFunctionPointers = (SetPentiumFunctionPointers(), 0);
2691
2628
2692
void DisableSSE2()
2693
{
2694
	s_sse2Enabled = false;
2695
	SetPentiumFunctionPointers();
2696
}
2697
2698
class LowLevel : public PentiumOptimized
2629
class LowLevel : public PentiumOptimized
2699
{
2630
{
2700
public:
2631
public:
Lines 3293-3625 Link Here
3293
	C[5] = _mm_add_epi64(a3b2, a2b3);
3224
	C[5] = _mm_add_epi64(a3b2, a2b3);
3294
}
3225
}
3295
3226
3296
void P4Optimized::Multiply4(word *C, const word *A, const word *B)
3297
{
3298
	__m128i temp[7];
3299
	const word *w = (word *)temp;
3300
	const __m64 *mw = (__m64 *)w;
3301
3302
	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
3303
3304
	C[0] = w[0];
3305
3306
	__m64 s1, s2;
3307
3308
	__m64 w1 = _mm_cvtsi32_si64(w[1]);
3309
	__m64 w4 = mw[2];
3310
	__m64 w6 = mw[3];
3311
	__m64 w8 = mw[4];
3312
	__m64 w10 = mw[5];
3313
	__m64 w12 = mw[6];
3314
	__m64 w14 = mw[7];
3315
	__m64 w16 = mw[8];
3316
	__m64 w18 = mw[9];
3317
	__m64 w20 = mw[10];
3318
	__m64 w22 = mw[11];
3319
	__m64 w26 = _mm_cvtsi32_si64(w[26]);
3320
3321
	s1 = _mm_add_si64(w1, w4);
3322
	C[1] = _mm_cvtsi64_si32(s1);
3323
	s1 = _mm_srli_si64(s1, 32);
3324
3325
	s2 = _mm_add_si64(w6, w8);
3326
	s1 = _mm_add_si64(s1, s2);
3327
	C[2] = _mm_cvtsi64_si32(s1);
3328
	s1 = _mm_srli_si64(s1, 32);
3329
3330
	s2 = _mm_add_si64(w10, w12);
3331
	s1 = _mm_add_si64(s1, s2);
3332
	C[3] = _mm_cvtsi64_si32(s1);
3333
	s1 = _mm_srli_si64(s1, 32);
3334
3335
	s2 = _mm_add_si64(w14, w16);
3336
	s1 = _mm_add_si64(s1, s2);
3337
	C[4] = _mm_cvtsi64_si32(s1);
3338
	s1 = _mm_srli_si64(s1, 32);
3339
3340
	s2 = _mm_add_si64(w18, w20);
3341
	s1 = _mm_add_si64(s1, s2);
3342
	C[5] = _mm_cvtsi64_si32(s1);
3343
	s1 = _mm_srli_si64(s1, 32);
3344
3345
	s2 = _mm_add_si64(w22, w26);
3346
	s1 = _mm_add_si64(s1, s2);
3347
	C[6] = _mm_cvtsi64_si32(s1);
3348
	s1 = _mm_srli_si64(s1, 32);
3349
3350
	C[7] = _mm_cvtsi64_si32(s1) + w[27];
3351
	_mm_empty();
3352
}
3353
3354
void P4Optimized::Multiply8(word *C, const word *A, const word *B)
3355
{
3356
	__m128i temp[28];
3357
	const word *w = (word *)temp;
3358
	const __m64 *mw = (__m64 *)w;
3359
	const word *x = (word *)temp+7*4;
3360
	const __m64 *mx = (__m64 *)x;
3361
	const word *y = (word *)temp+7*4*2;
3362
	const __m64 *my = (__m64 *)y;
3363
	const word *z = (word *)temp+7*4*3;
3364
	const __m64 *mz = (__m64 *)z;
3365
3366
	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
3367
3368
	P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
3369
3370
	P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
3371
3372
	P4_Mul(temp+21, (__m128i *)A+1, (__m128i *)B+1);
3373
3374
	C[0] = w[0];
3375
3376
	__m64 s1, s2, s3, s4;
3377
3378
	__m64 w1 = _mm_cvtsi32_si64(w[1]);
3379
	__m64 w4 = mw[2];
3380
	__m64 w6 = mw[3];
3381
	__m64 w8 = mw[4];
3382
	__m64 w10 = mw[5];
3383
	__m64 w12 = mw[6];
3384
	__m64 w14 = mw[7];
3385
	__m64 w16 = mw[8];
3386
	__m64 w18 = mw[9];
3387
	__m64 w20 = mw[10];
3388
	__m64 w22 = mw[11];
3389
	__m64 w26 = _mm_cvtsi32_si64(w[26]);
3390
	__m64 w27 = _mm_cvtsi32_si64(w[27]);
3391
3392
	__m64 x0 = _mm_cvtsi32_si64(x[0]);
3393
	__m64 x1 = _mm_cvtsi32_si64(x[1]);
3394
	__m64 x4 = mx[2];
3395
	__m64 x6 = mx[3];
3396
	__m64 x8 = mx[4];
3397
	__m64 x10 = mx[5];
3398
	__m64 x12 = mx[6];
3399
	__m64 x14 = mx[7];
3400
	__m64 x16 = mx[8];
3401
	__m64 x18 = mx[9];
3402
	__m64 x20 = mx[10];
3403
	__m64 x22 = mx[11];
3404
	__m64 x26 = _mm_cvtsi32_si64(x[26]);
3405
	__m64 x27 = _mm_cvtsi32_si64(x[27]);
3406
3407
	__m64 y0 = _mm_cvtsi32_si64(y[0]);
3408
	__m64 y1 = _mm_cvtsi32_si64(y[1]);
3409
	__m64 y4 = my[2];
3410
	__m64 y6 = my[3];
3411
	__m64 y8 = my[4];
3412
	__m64 y10 = my[5];
3413
	__m64 y12 = my[6];
3414
	__m64 y14 = my[7];
3415
	__m64 y16 = my[8];
3416
	__m64 y18 = my[9];
3417
	__m64 y20 = my[10];
3418
	__m64 y22 = my[11];
3419
	__m64 y26 = _mm_cvtsi32_si64(y[26]);
3420
	__m64 y27 = _mm_cvtsi32_si64(y[27]);
3421
3422
	__m64 z0 = _mm_cvtsi32_si64(z[0]);
3423
	__m64 z1 = _mm_cvtsi32_si64(z[1]);
3424
	__m64 z4 = mz[2];
3425
	__m64 z6 = mz[3];
3426
	__m64 z8 = mz[4];
3427
	__m64 z10 = mz[5];
3428
	__m64 z12 = mz[6];
3429
	__m64 z14 = mz[7];
3430
	__m64 z16 = mz[8];
3431
	__m64 z18 = mz[9];
3432
	__m64 z20 = mz[10];
3433
	__m64 z22 = mz[11];
3434
	__m64 z26 = _mm_cvtsi32_si64(z[26]);
3435
3436
	s1 = _mm_add_si64(w1, w4);
3437
	C[1] = _mm_cvtsi64_si32(s1);
3438
	s1 = _mm_srli_si64(s1, 32);
3439
3440
	s2 = _mm_add_si64(w6, w8);
3441
	s1 = _mm_add_si64(s1, s2);
3442
	C[2] = _mm_cvtsi64_si32(s1);
3443
	s1 = _mm_srli_si64(s1, 32);
3444
3445
	s2 = _mm_add_si64(w10, w12);
3446
	s1 = _mm_add_si64(s1, s2);
3447
	C[3] = _mm_cvtsi64_si32(s1);
3448
	s1 = _mm_srli_si64(s1, 32);
3449
3450
	s3 = _mm_add_si64(x0, y0);
3451
	s2 = _mm_add_si64(w14, w16);
3452
	s1 = _mm_add_si64(s1, s3);
3453
	s1 = _mm_add_si64(s1, s2);
3454
	C[4] = _mm_cvtsi64_si32(s1);
3455
	s1 = _mm_srli_si64(s1, 32);
3456
3457
	s3 = _mm_add_si64(x1, y1);
3458
	s4 = _mm_add_si64(x4, y4);
3459
	s1 = _mm_add_si64(s1, w18);
3460
	s3 = _mm_add_si64(s3, s4);
3461
	s1 = _mm_add_si64(s1, w20);
3462
	s1 = _mm_add_si64(s1, s3);
3463
	C[5] = _mm_cvtsi64_si32(s1);
3464
	s1 = _mm_srli_si64(s1, 32);
3465
3466
	s3 = _mm_add_si64(x6, y6);
3467
	s4 = _mm_add_si64(x8, y8);
3468
	s1 = _mm_add_si64(s1, w22);
3469
	s3 = _mm_add_si64(s3, s4);
3470
	s1 = _mm_add_si64(s1, w26);
3471
	s1 = _mm_add_si64(s1, s3);
3472
	C[6] = _mm_cvtsi64_si32(s1);
3473
	s1 = _mm_srli_si64(s1, 32);
3474
3475
	s3 = _mm_add_si64(x10, y10);
3476
	s4 = _mm_add_si64(x12, y12);
3477
	s1 = _mm_add_si64(s1, w27);
3478
	s3 = _mm_add_si64(s3, s4);
3479
	s1 = _mm_add_si64(s1, s3);
3480
	C[7] = _mm_cvtsi64_si32(s1);
3481
	s1 = _mm_srli_si64(s1, 32);
3482
3483
	s3 = _mm_add_si64(x14, y14);
3484
	s4 = _mm_add_si64(x16, y16);
3485
	s1 = _mm_add_si64(s1, z0);
3486
	s3 = _mm_add_si64(s3, s4);
3487
	s1 = _mm_add_si64(s1, s3);
3488
	C[8] = _mm_cvtsi64_si32(s1);
3489
	s1 = _mm_srli_si64(s1, 32);
3490
3491
	s3 = _mm_add_si64(x18, y18);
3492
	s4 = _mm_add_si64(x20, y20);
3493
	s1 = _mm_add_si64(s1, z1);
3494
	s3 = _mm_add_si64(s3, s4);
3495
	s1 = _mm_add_si64(s1, z4);
3496
	s1 = _mm_add_si64(s1, s3);
3497
	C[9] = _mm_cvtsi64_si32(s1);
3498
	s1 = _mm_srli_si64(s1, 32);
3499
3500
	s3 = _mm_add_si64(x22, y22);
3501
	s4 = _mm_add_si64(x26, y26);
3502
	s1 = _mm_add_si64(s1, z6);
3503
	s3 = _mm_add_si64(s3, s4);
3504
	s1 = _mm_add_si64(s1, z8);
3505
	s1 = _mm_add_si64(s1, s3);
3506
	C[10] = _mm_cvtsi64_si32(s1);
3507
	s1 = _mm_srli_si64(s1, 32);
3508
3509
	s3 = _mm_add_si64(x27, y27);
3510
	s1 = _mm_add_si64(s1, z10);
3511
	s1 = _mm_add_si64(s1, z12);
3512
	s1 = _mm_add_si64(s1, s3);
3513
	C[11] = _mm_cvtsi64_si32(s1);
3514
	s1 = _mm_srli_si64(s1, 32);
3515
3516
	s3 = _mm_add_si64(z14, z16);
3517
	s1 = _mm_add_si64(s1, s3);
3518
	C[12] = _mm_cvtsi64_si32(s1);
3519
	s1 = _mm_srli_si64(s1, 32);
3520
3521
	s3 = _mm_add_si64(z18, z20);
3522
	s1 = _mm_add_si64(s1, s3);
3523
	C[13] = _mm_cvtsi64_si32(s1);
3524
	s1 = _mm_srli_si64(s1, 32);
3525
3526
	s3 = _mm_add_si64(z22, z26);
3527
	s1 = _mm_add_si64(s1, s3);
3528
	C[14] = _mm_cvtsi64_si32(s1);
3529
	s1 = _mm_srli_si64(s1, 32);
3530
3531
	C[15] = z[27] + _mm_cvtsi64_si32(s1);
3532
	_mm_empty();
3533
}
3534
3535
void P4Optimized::Multiply8Bottom(word *C, const word *A, const word *B)
3536
{
3537
	__m128i temp[21];
3538
	const word *w = (word *)temp;
3539
	const __m64 *mw = (__m64 *)w;
3540
	const word *x = (word *)temp+7*4;
3541
	const __m64 *mx = (__m64 *)x;
3542
	const word *y = (word *)temp+7*4*2;
3543
	const __m64 *my = (__m64 *)y;
3544
3545
	P4_Mul(temp, (__m128i *)A, (__m128i *)B);
3546
3547
	P4_Mul(temp+7, (__m128i *)A+1, (__m128i *)B);
3548
3549
	P4_Mul(temp+14, (__m128i *)A, (__m128i *)B+1);
3550
3551
	C[0] = w[0];
3552
3553
	__m64 s1, s2, s3, s4;
3554
3555
	__m64 w1 = _mm_cvtsi32_si64(w[1]);
3556
	__m64 w4 = mw[2];
3557
	__m64 w6 = mw[3];
3558
	__m64 w8 = mw[4];
3559
	__m64 w10 = mw[5];
3560
	__m64 w12 = mw[6];
3561
	__m64 w14 = mw[7];
3562
	__m64 w16 = mw[8];
3563
	__m64 w18 = mw[9];
3564
	__m64 w20 = mw[10];
3565
	__m64 w22 = mw[11];
3566
	__m64 w26 = _mm_cvtsi32_si64(w[26]);
3567
3568
	__m64 x0 = _mm_cvtsi32_si64(x[0]);
3569
	__m64 x1 = _mm_cvtsi32_si64(x[1]);
3570
	__m64 x4 = mx[2];
3571
	__m64 x6 = mx[3];
3572
	__m64 x8 = mx[4];
3573
3574
	__m64 y0 = _mm_cvtsi32_si64(y[0]);
3575
	__m64 y1 = _mm_cvtsi32_si64(y[1]);
3576
	__m64 y4 = my[2];
3577
	__m64 y6 = my[3];
3578
	__m64 y8 = my[4];
3579
3580
	s1 = _mm_add_si64(w1, w4);
3581
	C[1] = _mm_cvtsi64_si32(s1);
3582
	s1 = _mm_srli_si64(s1, 32);
3583
3584
	s2 = _mm_add_si64(w6, w8);
3585
	s1 = _mm_add_si64(s1, s2);
3586
	C[2] = _mm_cvtsi64_si32(s1);
3587
	s1 = _mm_srli_si64(s1, 32);
3588
3589
	s2 = _mm_add_si64(w10, w12);
3590
	s1 = _mm_add_si64(s1, s2);
3591
	C[3] = _mm_cvtsi64_si32(s1);
3592
	s1 = _mm_srli_si64(s1, 32);
3593
3594
	s3 = _mm_add_si64(x0, y0);
3595
	s2 = _mm_add_si64(w14, w16);
3596
	s1 = _mm_add_si64(s1, s3);
3597
	s1 = _mm_add_si64(s1, s2);
3598
	C[4] = _mm_cvtsi64_si32(s1);
3599
	s1 = _mm_srli_si64(s1, 32);
3600
3601
	s3 = _mm_add_si64(x1, y1);
3602
	s4 = _mm_add_si64(x4, y4);
3603
	s1 = _mm_add_si64(s1, w18);
3604
	s3 = _mm_add_si64(s3, s4);
3605
	s1 = _mm_add_si64(s1, w20);
3606
	s1 = _mm_add_si64(s1, s3);
3607
	C[5] = _mm_cvtsi64_si32(s1);
3608
	s1 = _mm_srli_si64(s1, 32);
3609
3610
	s3 = _mm_add_si64(x6, y6);
3611
	s4 = _mm_add_si64(x8, y8);
3612
	s1 = _mm_add_si64(s1, w22);
3613
	s3 = _mm_add_si64(s3, s4);
3614
	s1 = _mm_add_si64(s1, w26);
3615
	s1 = _mm_add_si64(s1, s3);
3616
	C[6] = _mm_cvtsi64_si32(s1);
3617
	s1 = _mm_srli_si64(s1, 32);
3618
3619
	C[7] = _mm_cvtsi64_si32(s1) + w[27] + x[10] + y[10] + x[12] + y[12];
3620
	_mm_empty();
3621
}
3622
3623
#endif	// #ifdef SSE2_INTRINSICS_AVAILABLE
3227
#endif	// #ifdef SSE2_INTRINSICS_AVAILABLE
3624
3228
3625
// ********************************************************
3229
// ********************************************************
Lines 4064-4081 Link Here
4064
	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
3668
	DWord q = DivideFourWordsByTwo<word, DWord>(T, DWord(A[0], A[1]), DWord(A[2], A[3]), DWord(B[0], B[1]));
4065
	Q[0] = q.GetLowHalf();
3669
	Q[0] = q.GetLowHalf();
4066
	Q[1] = q.GetHighHalf();
3670
	Q[1] = q.GetHighHalf();
4067
4068
#ifndef NDEBUG
4069
	if (B[0] || B[1])
4070
	{
4071
		// multiply quotient and divisor and add remainder, make sure it equals dividend
4072
		assert(!T[2] && !T[3] && (T[1] < B[1] || (T[1]==B[1] && T[0]<B[0])));
4073
		word P[4];
4074
		Portable::Multiply2(P, Q, B);
4075
		Add(P, P, T, 4);
4076
		assert(memcmp(P, A, 4*WORD_SIZE)==0);
4077
	}
4078
#endif
4079
}
3671
}
4080
3672
4081
// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
3673
// for use by Divide(), corrects the underestimated quotient {Q1,Q0}
Lines 9421-9430 Link Here
9421
9013
9422
//- #include "modes.h"
9014
//- #include "modes.h"
9423
9015
9424
#ifndef NDEBUG
9425
//- #include "des.h"
9426
#endif
9427
9428
NAMESPACE_BEGIN(CryptoPP)
9016
NAMESPACE_BEGIN(CryptoPP)
9429
9017
9430
void CipherModeBase::SetKey(const byte *key, unsigned int length, const NameValuePairs &params)
9018
void CipherModeBase::SetKey(const byte *key, unsigned int length, const NameValuePairs &params)

Return to bug 66605