Lines 24-42
Link Here
|
24 |
#define CFL_LINE_2 128 |
24 |
#define CFL_LINE_2 128 |
25 |
#define CFL_LINE_3 192 |
25 |
#define CFL_LINE_3 192 |
26 |
|
26 |
|
27 |
typedef vector int8_t int8x16_t; |
27 |
typedef vector signed char int8x16_t; // NOLINT(runtime/int) |
28 |
typedef vector uint8_t uint8x16_t; |
28 |
typedef vector unsigned char uint8x16_t; // NOLINT(runtime/int) |
29 |
typedef vector int16_t int16x8_t; |
29 |
typedef vector signed short int16x8_t; // NOLINT(runtime/int) |
30 |
typedef vector uint16_t uint16x8_t; |
30 |
typedef vector unsigned short uint16x8_t; // NOLINT(runtime/int) |
31 |
typedef vector int32_t int32x4_t; |
31 |
typedef vector signed int int32x4_t; // NOLINT(runtime/int) |
32 |
typedef vector uint32_t uint32x4_t; |
32 |
typedef vector unsigned int uint32x4_t; // NOLINT(runtime/int) |
33 |
typedef vector uint64_t uint64x2_t; |
33 |
typedef vector unsigned long long uint64x2_t; // NOLINT(runtime/int) |
34 |
|
34 |
|
35 |
static INLINE void subtract_average_vsx(int16_t *pred_buf, int width, |
35 |
static INLINE void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst, |
36 |
int height, int round_offset, |
36 |
int width, int height, int round_offset, |
37 |
int num_pel_log2) { |
37 |
int num_pel_log2) { |
38 |
const int16_t *end = pred_buf + height * CFL_BUF_LINE; |
38 |
// int16_t *dst = dst_ptr; |
39 |
const int16_t *sum_buf = pred_buf; |
39 |
const int16_t *dst_end = dst + height * CFL_BUF_LINE; |
|
|
40 |
const int16_t *sum_buf = (const int16_t *)src_ptr; |
41 |
const int16_t *end = sum_buf + height * CFL_BUF_LINE; |
40 |
const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); |
42 |
const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2); |
41 |
const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
43 |
const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, |
42 |
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
44 |
0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 }; |
Lines 71-113
Link Here
|
71 |
const int32x4_t avg = vec_sr(sum_32x4, div_shift); |
73 |
const int32x4_t avg = vec_sr(sum_32x4, div_shift); |
72 |
const int16x8_t vec_avg = vec_pack(avg, avg); |
74 |
const int16x8_t vec_avg = vec_pack(avg, avg); |
73 |
do { |
75 |
do { |
74 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, pred_buf), vec_avg), OFF_0, pred_buf); |
76 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, dst), vec_avg), OFF_0, dst); |
75 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, pred_buf), vec_avg), |
77 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, dst), vec_avg), |
76 |
OFF_0 + CFL_BUF_LINE_BYTES, pred_buf); |
78 |
OFF_0 + CFL_BUF_LINE_BYTES, dst); |
77 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, pred_buf), vec_avg), |
79 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, dst), vec_avg), |
78 |
OFF_0 + CFL_LINE_2, pred_buf); |
80 |
OFF_0 + CFL_LINE_2, dst); |
79 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, pred_buf), vec_avg), |
81 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, dst), vec_avg), |
80 |
OFF_0 + CFL_LINE_3, pred_buf); |
82 |
OFF_0 + CFL_LINE_3, dst); |
81 |
if (width >= 16) { |
83 |
if (width >= 16) { |
82 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, pred_buf), vec_avg), OFF_1, |
84 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, dst), vec_avg), OFF_1, dst); |
83 |
pred_buf); |
85 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, dst), vec_avg), |
84 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, pred_buf), vec_avg), |
86 |
OFF_1 + CFL_LINE_1, dst); |
85 |
OFF_1 + CFL_LINE_1, pred_buf); |
87 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, dst), vec_avg), |
86 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, pred_buf), vec_avg), |
88 |
OFF_1 + CFL_LINE_2, dst); |
87 |
OFF_1 + CFL_LINE_2, pred_buf); |
89 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, dst), vec_avg), |
88 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, pred_buf), vec_avg), |
90 |
OFF_1 + CFL_LINE_3, dst); |
89 |
OFF_1 + CFL_LINE_3, pred_buf); |
|
|
90 |
} |
91 |
} |
91 |
if (width == 32) { |
92 |
if (width == 32) { |
92 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, pred_buf), vec_avg), OFF_2, |
93 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, dst), vec_avg), OFF_2, dst); |
93 |
pred_buf); |
94 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, dst), vec_avg), |
94 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, pred_buf), vec_avg), |
95 |
OFF_2 + CFL_LINE_1, dst); |
95 |
OFF_2 + CFL_LINE_1, pred_buf); |
96 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, dst), vec_avg), |
96 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, pred_buf), vec_avg), |
97 |
OFF_2 + CFL_LINE_2, dst); |
97 |
OFF_2 + CFL_LINE_2, pred_buf); |
98 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, dst), vec_avg), |
98 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, pred_buf), vec_avg), |
99 |
OFF_2 + CFL_LINE_3, dst); |
99 |
OFF_2 + CFL_LINE_3, pred_buf); |
|
|
100 |
|
100 |
|
101 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, pred_buf), vec_avg), OFF_3, |
101 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, dst), vec_avg), OFF_3, dst); |
102 |
pred_buf); |
102 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, dst), vec_avg), |
103 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, pred_buf), vec_avg), |
103 |
OFF_3 + CFL_LINE_1, dst); |
104 |
OFF_3 + CFL_LINE_1, pred_buf); |
104 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, dst), vec_avg), |
105 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, pred_buf), vec_avg), |
105 |
OFF_3 + CFL_LINE_2, dst); |
106 |
OFF_3 + CFL_LINE_2, pred_buf); |
106 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, dst), vec_avg), |
107 |
vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, pred_buf), vec_avg), |
107 |
OFF_3 + CFL_LINE_3, dst); |
108 |
OFF_3 + CFL_LINE_3, pred_buf); |
|
|
109 |
} |
108 |
} |
110 |
} while ((pred_buf += CFL_BUF_LINE * 4) < end); |
109 |
} while ((dst += CFL_BUF_LINE * 4) < dst_end); |
111 |
} |
110 |
} |
112 |
|
111 |
|
113 |
// Declare wrappers for VSX sizes |
112 |
// Declare wrappers for VSX sizes |