diff options
Diffstat (limited to 'libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c')
-rw-r--r-- | libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c | 316 |
1 files changed, 316 insertions, 0 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c new file mode 100644 index 0000000..2e72fec --- /dev/null +++ b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c | |||
@@ -0,0 +1,316 @@ | |||
/* blend pixel --> dst */
2 | |||
3 | #ifdef BUILD_SSE3 | ||
4 | |||
5 | static void | ||
6 | _op_blend_p_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) { | ||
7 | |||
8 | LOOP_ALIGNED_U1_A48_SSE3(d, l, | ||
9 | { /* UOP */ | ||
10 | |||
11 | int alpha = 256 - (*s >> 24); | ||
12 | *d = *s + MUL_256(alpha, *d); | ||
13 | s++; d++; l--; | ||
14 | }, | ||
15 | { /* A4OP */ | ||
16 | |||
17 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
18 | __m128i d0 = _mm_load_si128((__m128i *)d); | ||
19 | |||
20 | __m128i a0 = sub4_alpha_sse3(s0); | ||
21 | __m128i mul0 = mul_256_sse3(a0, d0); | ||
22 | d0 = _mm_add_epi32(mul0, s0); | ||
23 | |||
24 | _mm_store_si128((__m128i *)d, d0); | ||
25 | |||
26 | s += 4; d += 4; l -= 4; | ||
27 | }, | ||
28 | { /* A8OP */ | ||
29 | |||
30 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
31 | __m128i d0 = _mm_load_si128((__m128i *)d); | ||
32 | |||
33 | __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4)); | ||
34 | __m128i d1 = _mm_load_si128((__m128i *)(d+4)); | ||
35 | |||
36 | __m128i a0 = sub4_alpha_sse3(s0); | ||
37 | __m128i a1 = sub4_alpha_sse3(s1); | ||
38 | |||
39 | __m128i mul0 = mul_256_sse3(a0, d0); | ||
40 | __m128i mul1 = mul_256_sse3(a1, d1); | ||
41 | |||
42 | d0 = _mm_add_epi32(mul0, s0); | ||
43 | d1 = _mm_add_epi32(mul1, s1); | ||
44 | |||
45 | _mm_store_si128((__m128i *)d, d0); | ||
46 | _mm_store_si128((__m128i *)(d+4), d1); | ||
47 | |||
48 | s += 8; d += 8; l -= 8; | ||
49 | }) | ||
50 | } | ||
51 | |||
52 | static void | ||
53 | _op_blend_pas_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) { | ||
54 | |||
55 | int alpha; | ||
56 | |||
57 | const __m128i zero = _mm_setzero_si128(); | ||
58 | |||
59 | LOOP_ALIGNED_U1_A48_SSE3(d, l, | ||
60 | { /* UOP */ | ||
61 | switch (*s & 0xff000000) | ||
62 | { | ||
63 | case 0: | ||
64 | break; | ||
65 | case 0xff000000: | ||
66 | *d = *s; | ||
67 | break; | ||
68 | default: | ||
69 | alpha = 256 - (*s >> 24); | ||
70 | *d = *s + MUL_256(alpha, *d); | ||
71 | break; | ||
72 | } | ||
73 | s++; d++; l--; | ||
74 | }, | ||
75 | { /* A4OP */ | ||
76 | |||
77 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
78 | __m128i d0 = _mm_load_si128((__m128i *)d); | ||
79 | |||
80 | __m128i a0 = sub4_alpha_sse3(s0); | ||
81 | __m128i mul0 = mul_256_sse3(a0, d0); | ||
82 | |||
83 | mul0 = _mm_add_epi32(s0, mul0); | ||
84 | |||
85 | __m128i zmask0 = _mm_cmpeq_epi32(_mm_srli_epi32(s0, 24), zero); | ||
86 | __m128i imask0 = ~zmask0; | ||
87 | |||
88 | mul0 = _mm_and_si128(imask0, mul0); | ||
89 | d0 = _mm_and_si128(zmask0, d0); | ||
90 | |||
91 | d0 = _mm_add_epi32(mul0, d0); | ||
92 | |||
93 | _mm_store_si128((__m128i *)d, d0); | ||
94 | |||
95 | s += 4; d += 4; l -= 4; | ||
96 | }, | ||
97 | { /* A8OP */ | ||
98 | |||
99 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
100 | __m128i d0 = _mm_load_si128((__m128i *)d); | ||
101 | |||
102 | __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4)); | ||
103 | __m128i d1 = _mm_load_si128((__m128i *)(d+4)); | ||
104 | |||
105 | __m128i a0 = sub4_alpha_sse3(s0); | ||
106 | __m128i a1 = sub4_alpha_sse3(s1); | ||
107 | |||
108 | __m128i mul0 = mul_256_sse3(a0, d0); | ||
109 | __m128i mul1 = mul_256_sse3(a1, d1); | ||
110 | |||
111 | mul0 = _mm_add_epi32(s0, mul0); | ||
112 | mul1 = _mm_add_epi32(s1, mul1); | ||
113 | |||
114 | __m128i zmask0 = _mm_cmpeq_epi32(_mm_srli_epi32(s0, 24), zero); | ||
115 | __m128i zmask1 = _mm_cmpeq_epi32(_mm_srli_epi32(s1, 24), zero); | ||
116 | |||
117 | __m128i imask0 = ~zmask0; | ||
118 | __m128i imask1 = ~zmask1; | ||
119 | |||
120 | mul0 = _mm_and_si128(imask0, mul0); | ||
121 | d0 = _mm_and_si128(zmask0, d0); | ||
122 | |||
123 | mul1 = _mm_and_si128(imask1, mul1); | ||
124 | d1 = _mm_and_si128(zmask1, d1); | ||
125 | |||
126 | d0 = _mm_add_epi32(mul0, d0); | ||
127 | d1 = _mm_add_epi32(mul1, d1); | ||
128 | |||
129 | _mm_store_si128((__m128i *)d, d0); | ||
130 | _mm_store_si128((__m128i *)(d+4), d1); | ||
131 | |||
132 | s += 8; d += 8; l -= 8; | ||
133 | }) | ||
134 | } | ||
135 | |||
/* No SSE3 routine is provided under this name: it expands to NULL. */
#define _op_blend_pan_dp_sse3    NULL

/* The *_dpan names are plain aliases of the corresponding *_dp names. */
#define _op_blend_p_dpan_sse3    _op_blend_p_dp_sse3
#define _op_blend_pas_dpan_sse3  _op_blend_pas_dp_sse3
#define _op_blend_pan_dpan_sse3  _op_blend_pan_dp_sse3
141 | |||
142 | static void | ||
143 | init_blend_pixel_span_funcs_sse3(void) | ||
144 | { | ||
145 | op_blend_span_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_p_dp_sse3; | ||
146 | op_blend_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pas_dp_sse3; | ||
147 | op_blend_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pan_dp_sse3; | ||
148 | |||
149 | |||
150 | // FIXME: BUGGY BUGGY Core i5 750 (32bit), 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4), ello (text and rectangle) | ||
151 | // op_blend_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_p_dpan_sse3; | ||
152 | op_blend_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pas_dpan_sse3; | ||
153 | op_blend_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pan_dpan_sse3; | ||
154 | } | ||
155 | |||
/*
 * Single-point blend names.  No SSE3 point routine is defined in this file:
 * the two base names expand to NULL and every other name aliases one of
 * them, so all six ultimately expand to NULL.
 */
#define _op_blend_pt_p_dp_sse3      NULL
#define _op_blend_pt_pan_dp_sse3    NULL

#define _op_blend_pt_pas_dp_sse3    _op_blend_pt_p_dp_sse3

#define _op_blend_pt_p_dpan_sse3    _op_blend_pt_p_dp_sse3
#define _op_blend_pt_pan_dpan_sse3  _op_blend_pt_pan_dp_sse3
#define _op_blend_pt_pas_dpan_sse3  _op_blend_pt_pas_dp_sse3
164 | |||
165 | static void | ||
166 | init_blend_pixel_pt_funcs_sse3(void) | ||
167 | { | ||
168 | op_blend_pt_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pt_p_dp_sse3; | ||
169 | op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pt_pas_dp_sse3; | ||
170 | op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pt_pan_dp_sse3; | ||
171 | |||
172 | op_blend_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_p_dpan_sse3; | ||
173 | op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_pas_dpan_sse3; | ||
174 | op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_pan_dpan_sse3; | ||
175 | } | ||
176 | |||
177 | /*-----*/ | ||
178 | |||
/* blend_rel pixel -> dst */
180 | |||
181 | static void | ||
182 | _op_blend_rel_p_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) { | ||
183 | |||
184 | const __m128i ones = _mm_set_epi32(1, 1, 1, 1); | ||
185 | |||
186 | LOOP_ALIGNED_U1_A48_SSE3(d, l, | ||
187 | { /* UOP */ | ||
188 | |||
189 | int alpha = 256 - (*s >> 24); | ||
190 | c = 1 + (*d >> 24); | ||
191 | *d = MUL_256(c, *s) + MUL_256(alpha, *d); | ||
192 | d++; s++; l--; | ||
193 | }, | ||
194 | { /*A4OP */ | ||
195 | |||
196 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
197 | __m128i d0 = _mm_load_si128((__m128i *)d); | ||
198 | |||
199 | __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones); | ||
200 | __m128i a0 = sub4_alpha_sse3(s0); | ||
201 | |||
202 | d0 = _mm_add_epi32(mul_256_sse3(c0, s0), mul_256_sse3(a0, d0)); | ||
203 | |||
204 | _mm_store_si128((__m128i *)d, d0); | ||
205 | |||
206 | d += 4; s += 4; l -= 4; | ||
207 | }, | ||
208 | { /* A8OP */ | ||
209 | |||
210 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
211 | __m128i d0 = _mm_load_si128 ((__m128i *)d); | ||
212 | |||
213 | __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4)); | ||
214 | __m128i d1 = _mm_load_si128 ((__m128i *)(d+4)); | ||
215 | |||
216 | __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones); | ||
217 | __m128i c1 = _mm_add_epi32(_mm_srli_epi32(d1, 24), ones); | ||
218 | |||
219 | __m128i a0 = sub4_alpha_sse3(s0); | ||
220 | __m128i a1 = sub4_alpha_sse3(s1); | ||
221 | |||
222 | d0 = _mm_add_epi32(mul_256_sse3(c0, s0), mul_256_sse3(a0, d0)); | ||
223 | d1 = _mm_add_epi32(mul_256_sse3(c1, s1), mul_256_sse3(a1, d1)); | ||
224 | |||
225 | _mm_store_si128((__m128i *)d, d0); | ||
226 | _mm_store_si128((__m128i *)(d+4), d1); | ||
227 | |||
228 | d += 8; s += 8; l -= 8; | ||
229 | }) | ||
230 | } | ||
231 | |||
232 | static void | ||
233 | _op_blend_rel_pan_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) { | ||
234 | |||
235 | const __m128i ones = _mm_set_epi32(1, 1, 1, 1); | ||
236 | |||
237 | LOOP_ALIGNED_U1_A48_SSE3(d, l, | ||
238 | { /* UOP */ | ||
239 | |||
240 | c = 1 + (*d >> 24); | ||
241 | *d++ = MUL_256(c, *s); | ||
242 | s++; l--; | ||
243 | }, | ||
244 | { /* A4OP */ | ||
245 | |||
246 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
247 | __m128i d0 = _mm_load_si128((__m128i *)d); | ||
248 | |||
249 | __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones); | ||
250 | d0 = mul_256_sse3(c0, s0); | ||
251 | |||
252 | _mm_store_si128((__m128i *)d, d0); | ||
253 | |||
254 | d += 4; s += 4; l -= 4; | ||
255 | }, | ||
256 | { /* A8OP */ | ||
257 | |||
258 | __m128i s0 = _mm_lddqu_si128((__m128i *)s); | ||
259 | __m128i d0 = _mm_load_si128 ((__m128i *)d); | ||
260 | |||
261 | __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4)); | ||
262 | __m128i d1 = _mm_load_si128 ((__m128i *)(d+4)); | ||
263 | |||
264 | __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones); | ||
265 | __m128i c1 = _mm_add_epi32(_mm_srli_epi32(d1, 24), ones); | ||
266 | |||
267 | d0 = mul_256_sse3(c0, s0); | ||
268 | d1 = mul_256_sse3(c1, s1); | ||
269 | |||
270 | _mm_store_si128((__m128i *)d, d0); | ||
271 | _mm_store_si128((__m128i *)(d+4), d1); | ||
272 | |||
273 | d += 8; s += 8; l -= 8; | ||
274 | }) | ||
275 | } | ||
276 | |||
/* The SP_AS-named relative span blender is an alias of the SP-named one. */
#define _op_blend_rel_pas_dp_sse3    _op_blend_rel_p_dp_sse3

/*
 * The *_dpan relative names alias the NON-relative *_dpan names, which in
 * turn alias the plain blend routines (or NULL) defined earlier in the file.
 */
#define _op_blend_rel_p_dpan_sse3    _op_blend_p_dpan_sse3
#define _op_blend_rel_pan_dpan_sse3  _op_blend_pan_dpan_sse3
#define _op_blend_rel_pas_dpan_sse3  _op_blend_pas_dpan_sse3
282 | |||
283 | static void | ||
284 | init_blend_rel_pixel_span_funcs_sse3(void) | ||
285 | { | ||
286 | op_blend_rel_span_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_p_dp_sse3; | ||
287 | op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pas_dp_sse3; | ||
288 | op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pan_dp_sse3; | ||
289 | |||
290 | op_blend_rel_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_p_dpan_sse3; | ||
291 | op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pas_dpan_sse3; | ||
292 | op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pan_dpan_sse3; | ||
293 | } | ||
294 | |||
/*
 * Relative single-point blend names.  The two base names expand to NULL;
 * the *_dpan names alias the non-relative point names, which also expand to
 * NULL in this file.
 */
#define _op_blend_rel_pt_p_dp_sse3      NULL
#define _op_blend_rel_pt_pan_dp_sse3    NULL

#define _op_blend_rel_pt_pas_dp_sse3    _op_blend_rel_pt_p_dp_sse3

#define _op_blend_rel_pt_p_dpan_sse3    _op_blend_pt_p_dpan_sse3
#define _op_blend_rel_pt_pan_dpan_sse3  _op_blend_pt_pan_dpan_sse3
#define _op_blend_rel_pt_pas_dpan_sse3  _op_blend_pt_pas_dpan_sse3
303 | |||
304 | static void | ||
305 | init_blend_rel_pixel_pt_funcs_sse3(void) | ||
306 | { | ||
307 | op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_p_dp_sse3; | ||
308 | op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_pas_dp_sse3; | ||
309 | op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_pan_dp_sse3; | ||
310 | |||
311 | op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_p_dpan_sse3; | ||
312 | op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_pas_dpan_sse3; | ||
313 | op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_pan_dpan_sse3; | ||
314 | } | ||
315 | |||
316 | #endif | ||