diff options
Diffstat (limited to '')
-rw-r--r-- | libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c | 570 |
1 files changed, 0 insertions, 570 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c deleted file mode 100644 index 6e35970..0000000 --- a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_color_neon.c +++ /dev/null | |||
@@ -1,570 +0,0 @@ | |||
1 | /* blend pixel x color --> dst */ | ||
2 | #ifdef BUILD_NEON | ||
3 | /* Note: Optimisation is based on keeping _dest_ aligned: else it's a pair of | ||
4 | * reads, then two writes, a miss on read is 'just' two reads */ | ||
5 | static void | ||
6 | _op_blend_p_c_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) { | ||
7 | #define AP "blend_p_c_dp_" | ||
8 | asm volatile ( | ||
9 | ".fpu neon \n\t" | ||
10 | // Load 'c' | ||
11 | "vdup.u32 q7, %[c] \n\t" | ||
12 | "vmov.i8 q6, #1 \n\t" | ||
13 | |||
14 | // Choose a loop | ||
15 | "andS %[tmp], %[d], $0xf \n\t" | ||
16 | "beq "AP"quadstart \n\t" | ||
17 | |||
18 | "andS %[tmp],%[d], $0x4 \n\t" | ||
19 | "beq "AP"dualloop \n\t" | ||
20 | |||
21 | AP"singleloop:" | ||
22 | "vld1.32 d0[0], [%[s]]! \n\t" | ||
23 | "vld1.32 d2[0], [%[d]] \n\t" | ||
24 | // Mulitply s * c (= sc) | ||
25 | "vmull.u8 q4, d0,d14 \n\t" | ||
26 | // sc in d8 | ||
27 | "vqrshrn.u16 d4, q4, #8 \n\t" | ||
28 | |||
29 | // sca in d9 | ||
30 | "vmvn.u32 d6, d4 \n\t" | ||
31 | "vshr.u32 d6, d6, #24 \n\t" | ||
32 | |||
33 | "vmul.u32 d6, d12, d6 \n\t" | ||
34 | |||
35 | /* d * alpha */ | ||
36 | "vmull.u8 q4, d6, d2 \n\t" | ||
37 | "vqrshrn.u16 d0, q4, #8 \n\t" | ||
38 | |||
39 | "vqadd.u8 d2, d0, d4 \n\t" | ||
40 | |||
41 | // Save dsc + sc | ||
42 | "vst1.32 d2[0], [%[d]]! \n\t" | ||
43 | |||
44 | // Now where? | ||
45 | // Can we go the fast path? | ||
46 | "andS %[tmp], %[d],$0xf \n\t" | ||
47 | "beq "AP"quadstart \n\t" | ||
48 | |||
49 | AP"dualloop: \n\t" | ||
50 | // Check we have enough to bother with! | ||
51 | "sub %[tmp], %[e], %[d] \n\t" | ||
52 | "cmp %[tmp], #16 \n\t" | ||
53 | "blt "AP"loopout \n\t" | ||
54 | |||
55 | // load 's' -> q0, 'd' -> q1 | ||
56 | "vldm %[s]!, {d0} \n\t" | ||
57 | "vldm %[d], {d2} \n\t" | ||
58 | // Mulitply s * c (= sc) | ||
59 | "vmull.u8 q4, d0,d14 \n\t" | ||
60 | // sc in d8 | ||
61 | "vqrshrn.u16 d4, q4, #8 \n\t" | ||
62 | |||
63 | // sca in d9 | ||
64 | "vmvn.u32 d6, d4 \n\t" | ||
65 | "vshr.u32 d6, d6, #24 \n\t" | ||
66 | |||
67 | "vmul.u32 d6, d12, d6 \n\t" | ||
68 | |||
69 | /* d * alpha */ | ||
70 | "vmull.u8 q4, d6, d2 \n\t" | ||
71 | "vqrshrn.u16 d0, q4, #8 \n\t" | ||
72 | |||
73 | "vqadd.u8 d2, d0, d4 \n\t" | ||
74 | |||
75 | // Save dsc + sc | ||
76 | "vst1.32 d2, [%[d]]! \n\t" | ||
77 | |||
78 | AP"quadstart: \n\t" | ||
79 | "sub %[tmp], %[e], %[d] \n\t" | ||
80 | "cmp %[tmp], #16 \n\t" | ||
81 | "blt "AP"loopout \n\t" | ||
82 | |||
83 | "sub %[tmp], %[e], #15 \n\t" | ||
84 | |||
85 | AP"quadloop:\n\t" | ||
86 | // load 's' -> q0, 'd' -> q1 | ||
87 | "vldm %[s]!, {d0,d1} \n\t" | ||
88 | "vldm %[d], {d2,d3} \n\t" | ||
89 | // Mulitply s * c (= sc) | ||
90 | "vmull.u8 q4, d0,d14 \n\t" | ||
91 | "vmull.u8 q5, d1,d14 \n\t" | ||
92 | |||
93 | // Get sc & sc alpha | ||
94 | "vqrshrn.u16 d4, q4, #8 \n\t" | ||
95 | "vqrshrn.u16 d5, q5, #8 \n\t" | ||
96 | // sc is now in q2, 8bpp | ||
97 | // Shift out, then spread alpha for q2 | ||
98 | "vmvn.u32 q3, q2 \n\t" | ||
99 | "vshr.u32 q3, q3, $0x18 \n\t" | ||
100 | "vmul.u32 q3, q6,q3 \n\t" | ||
101 | |||
102 | // Multiply 'd' by sc.alpha (dsca) | ||
103 | "vmull.u8 q4, d6,d2 \n\t" | ||
104 | "vmull.u8 q5, d7,d3 \n\t" | ||
105 | |||
106 | "vqrshrn.u16 d0, q4, #8 \n\t" | ||
107 | "vqrshrn.u16 d1, q5, #8 \n\t" | ||
108 | |||
109 | "vqadd.u8 q1, q0, q2 \n\t" | ||
110 | |||
111 | // Save dsc + sc | ||
112 | "vstm %[d]!, {d2,d3} \n\t" | ||
113 | |||
114 | "cmp %[tmp], %[d] \n\t" | ||
115 | |||
116 | "bhi "AP"quadloop \n\t" | ||
117 | |||
118 | /* Trailing stuff */ | ||
119 | AP"loopout: \n\t" | ||
120 | |||
121 | "cmp %[d], %[e] \n\t" | ||
122 | "beq "AP"done\n\t" | ||
123 | "sub %[tmp],%[e], %[d] \n\t" | ||
124 | "cmp %[tmp],$0x04 \n\t" | ||
125 | "beq "AP"singleloop2 \n\t" | ||
126 | |||
127 | "sub %[tmp], %[e], #7 \n\t" | ||
128 | /* Dual loop */ | ||
129 | AP"dualloop2: \n\t" | ||
130 | "vldm %[s]!, {d0} \n\t" | ||
131 | "vldm %[d], {d2} \n\t" | ||
132 | // Mulitply s * c (= sc) | ||
133 | "vmull.u8 q4, d0,d14 \n\t" | ||
134 | // sc in d8 | ||
135 | "vqrshrn.u16 d4, q4, #8 \n\t" | ||
136 | |||
137 | // sca in d9 | ||
138 | // XXX: I can probably squash one of these 3 | ||
139 | "vmvn.u32 d6, d4 \n\t" | ||
140 | "vshr.u32 d6, d6, #24 \n\t" | ||
141 | "vmul.u32 d6, d6, d12 \n\t" | ||
142 | |||
143 | /* d * alpha */ | ||
144 | "vmull.u8 q4, d6, d2 \n\t" | ||
145 | "vqrshrn.u16 d0, q4, #8 \n\t" | ||
146 | |||
147 | "vqadd.u8 d2, d0, d4 \n\t" | ||
148 | |||
149 | // Save dsc + sc | ||
150 | "vstm %[d]!, {d2} \n\t" | ||
151 | |||
152 | "cmp %[tmp], %[d] \n\t" | ||
153 | "bhi "AP"dualloop2 \n\t" | ||
154 | |||
155 | "cmp %[d], %[e] \n\t" | ||
156 | "beq "AP"done \n\t" | ||
157 | |||
158 | AP"singleloop2: \n\t" | ||
159 | "vld1.32 d0[0], [%[s]]! \n\t" | ||
160 | "vld1.32 d2[0], [%[d]] \n\t" | ||
161 | // Mulitply s * c (= sc) | ||
162 | "vmull.u8 q4, d0,d14 \n\t" | ||
163 | // sc in d8 | ||
164 | "vqrshrn.u16 d4, q4, #8 \n\t" | ||
165 | |||
166 | // sca in d6 | ||
167 | "vmvn.u32 d6, d4 \n\t" | ||
168 | "vshr.u32 d6, d6, #24 \n\t" | ||
169 | "vmul.u32 d6, d12,d6 \n\t" | ||
170 | |||
171 | /* d * alpha */ | ||
172 | "vmull.u8 q4, d6, d2 \n\t" | ||
173 | "vqrshrn.u16 d0, q4, #8 \n\t" | ||
174 | |||
175 | "vqadd.u8 d2, d0, d4 \n\t" | ||
176 | |||
177 | // Save dsc + sc | ||
178 | "vst1.32 d2[0], [%[d]]! \n\t" | ||
179 | |||
180 | |||
181 | AP"done:" | ||
182 | : // No output | ||
183 | // | ||
184 | : [s] "r" (s), [e] "r" (d + l), [d] "r" (d), [c] "r" (c), | ||
185 | [tmp] "r" (12) | ||
186 | : "q0","q1","q2","q3","q4","q5","q6","q7","memory" | ||
187 | ); | ||
188 | #undef AP | ||
189 | } | ||
190 | |||
191 | static void | ||
192 | _op_blend_pan_can_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) { | ||
193 | DATA32 *e; | ||
194 | UNROLL8_PLD_WHILE(d, l, e, | ||
195 | { | ||
196 | *d++ = 0xff000000 + MUL3_SYM(c, *s); | ||
197 | s++; | ||
198 | }); | ||
199 | } | ||
200 | |||
/* Opaque source modulated by the color's alpha only (caa):
 * dst = (s * c.a) + dst * (1 - (s * c.a).alpha).
 * The NEON asm variant below is disabled (see #else comment); the plain
 * C fallback in the #if 1 branch is what actually runs. */
static void
_op_blend_pan_caa_dp_neon(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {
#if 1
   DATA32 *e;
   int alpha;
   /* Map c's alpha byte (0..255) to 1..256 so MUL_256 can use a
    * power-of-two shift instead of a divide by 255. */
   c = 1 + (c & 0xff);
   UNROLL8_PLD_WHILE(d, l, e,
                     {
                        DATA32 sc = MUL_256(c, *s);
                        alpha = 256 - (sc >> 24);
                        *d = sc + MUL_256(alpha, *d);
                        d++;
                        s++;
                     });
#else // the below neon is buggy!! misses rendering of spans, i think with alignment. quick - just disable this.
   /* NOTE(review): besides the span-miss bug noted above, this asm writes
    * q15 (d30/d31) and q8/q9 modifies its input operands (s, d, tmp) in
    * place, yet q15 is absent from the clobber list and s/d/tmp are plain
    * input operands -- further reasons this path must stay disabled until
    * rewritten. 'tmp' is seeded with a dummy value (73) purely to bind a
    * scratch register. */
#define AP "_op_blend_pan_caa_dp_"
   DATA32 *e = d + l, *tmp = (void*)73;
   asm volatile (
        ".fpu neon \n\t"
        /* Set up 'c' */
        "vdup.u8 d14, %[c] \n\t"
        "vmov.i8 d15, #1 \n\t"
        "vaddl.u8 q15, d14, d15 \n\t"
        "vshr.u8 q15,#1 \n\t"

        // Pick a loop
        "andS %[tmp], %[d], $0xf \n\t"
        "beq "AP"quadstart \n\t"

        "andS %[tmp], %[d], $0x4 \n\t"
        "beq "AP"dualstart \n\t"

        AP"singleloop: \n\t"
        "vld1.32 d4[0], [%[d]] \n\t"
        "vld1.32 d0[0], [%[s]]! \n\t"

        // Long version of 'd'
        "vmovl.u8 q8, d4 \n\t"

        // Long version of 's'
        "vmovl.u8 q6, d0 \n\t"

        // d8 = s -d
        "vsub.s16 d8, d12, d16 \n\t"

        // Multiply
        "vmul.s16 d8, d8, d30 \n\t"

        // Shift down
        "vshr.s16 d8, #7 \n\t"

        // Add 'd'
        "vqadd.s16 d8, d8, d16 \n\t"

        // Shrink to save
        "vqmovun.s16 d0, q4 \n\t"
        "vst1.32 d0[0], [%[d]]! \n\t"

        // Now where?
        "andS %[tmp], %[d], $0xf \n\t"
        "beq "AP"quadstart \n\t"

        AP"dualstart: \n\t"
        // Check we have enough
        "sub %[tmp], %[e], %[d] \n\t"
        "cmp %[tmp], #16 \n\t"
        "blt "AP"loopout \n\t"

        AP"dualloop:"
        "vldm %[d], {d4} \n\t"
        "vldm %[s]!, {d0} \n\t"

        // Long version of d
        "vmovl.u8 q8, d4 \n\t"

        // Long version of s
        "vmovl.u8 q6, d0 \n\t"

        // q4/q5 = s-d
        "vsub.s16 q4, q6, q8 \n\t"

        // Multiply
        "vmul.s16 q4, q4,q15 \n\t"

        // Shift down
        "vshr.s16 q4, #7 \n\t"

        // Add d
        "vqadd.s16 q4, q4, q8 \n\t"

        // Shrink to save
        "vqmovun.s16 d0, q4 \n\t"

        "vstm %[d]!, {d0} \n\t"
        AP"quadstart: \n\t"
        "sub %[tmp], %[e], %[d] \n\t"
        "cmp %[tmp], #16 \n\t"
        "blt "AP"loopout \n\t"

        "sub %[tmp], %[e], #15 \n\t"

        AP"quadloop: \n\t"
        // load 's' -> q0, 'd' -> q2
        "vldm %[d], {d4,d5} \n\t"
        "vldm %[s]!, {d0,d1} \n\t"

        // Long version of d
        "vmovl.u8 q8, d4 \n\t"
        "vmovl.u8 q9, d5 \n\t"

        // Long version of s
        "vmovl.u8 q6, d0 \n\t"
        "vmovl.u8 q7, d1 \n\t"

        // q4/q5 = s-d
        "vsub.s16 q4, q6, q8 \n\t"
        "vsub.s16 q5, q7, q9 \n\t"

        // Multiply
        "vmul.s16 q4, q4,q15 \n\t"
        "vmul.s16 q5, q5,q15 \n\t"

        // Shift down
        "vshr.s16 q4, #7 \n\t"
        "vshr.s16 q5, #7 \n\t"

        // Add d
        "vqadd.s16 q4, q4, q8 \n\t"
        "vqadd.s16 q5, q5, q9 \n\t"

        // Shrink to save
        "vqmovun.s16 d0, q4 \n\t"
        "vqmovun.s16 d1, q5 \n\t"
        "vstm %[d]!, {d0,d1} \n\t"
        "cmp %[tmp], %[d] \n\t"

        "bhi "AP"quadloop\n\t"


        "b "AP"done\n\t"
        AP"loopout: \n\t"
        "cmp %[d], %[e] \n\t"
        "beq "AP"done\n\t"
        "sub %[tmp],%[e], %[d] \n\t"
        "cmp %[tmp],$0x04 \n\t"
        "beq "AP"singleloop2 \n\t"

        AP"dualloop2: \n\t"
        "vldm %[d], {d4} \n\t"
        "vldm %[s]!, {d0} \n\t"

        // Long version of d
        "vmovl.u8 q8, d4 \n\t"

        // Long version of s
        "vmovl.u8 q6, d0 \n\t"

        // q4/q5 = s-d
        "vsub.s16 q4, q6, q8 \n\t"

        // Multiply
        "vmul.s16 q4, q4,q15 \n\t"

        // Shift down
        "vshr.s16 q4, #7 \n\t"

        // Add d
        "vqadd.s16 q4, q4, q8 \n\t"

        // Shrink to save
        "vqmovun.s16 d0, q4 \n\t"

        "vstm %[d]!, {d0} \n\t"

        "cmp %[d], %[e] \n\t"
        "beq "AP"done \n\t"

        AP"singleloop2: \n\t"
        "vld1.32 d4[0], [%[d]] \n\t"
        "vld1.32 d0[0], [%[s]]! \n\t"

        // Long version of 'd'
        "vmovl.u8 q8, d4 \n\t"

        // Long version of 's'
        "vmovl.u8 q6, d0 \n\t"

        // d8 = s -d
        "vsub.s16 d8, d12, d16 \n\t"

        // Multiply
        "vmul.s16 d8, d8, d30 \n\t"

        // Shift down
        "vshr.s16 d8, #7 \n\t"

        // Add 'd'
        "vqadd.s16 d8, d8, d16 \n\t"

        // Shrink to save
        "vqmovun.s16 d0, q4 \n\t"

        "vst1.32 d0[0], [%[d]] \n\t"


        AP"done: \n\t"

        // No output
        :
        // Input
        : [s] "r" (s), [d] "r" (d), [e] "r" (e), [c] "r" (c), [tmp] "r" (tmp)
        // Clobbered
        : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "memory"
   );
#undef AP
#endif
}
418 | |||
/* The generic pixel-x-color span blender already handles every
 * source-alpha / color-alpha combination correctly, so alias the
 * specialized DP variants to it rather than duplicating code.  The
 * destination-alpha (dpan) variants in turn reuse the DP entries. */
#define _op_blend_pas_c_dp_neon _op_blend_p_c_dp_neon
#define _op_blend_pan_c_dp_neon _op_blend_p_c_dp_neon
#define _op_blend_p_can_dp_neon _op_blend_p_c_dp_neon
#define _op_blend_pas_can_dp_neon _op_blend_p_c_dp_neon
#define _op_blend_p_caa_dp_neon _op_blend_p_c_dp_neon
#define _op_blend_pas_caa_dp_neon _op_blend_p_c_dp_neon

#define _op_blend_p_c_dpan_neon _op_blend_p_c_dp_neon
#define _op_blend_pas_c_dpan_neon _op_blend_pas_c_dp_neon
#define _op_blend_pan_c_dpan_neon _op_blend_pan_c_dp_neon
#define _op_blend_p_can_dpan_neon _op_blend_p_can_dp_neon
#define _op_blend_pas_can_dpan_neon _op_blend_pas_can_dp_neon
#define _op_blend_pan_can_dpan_neon _op_blend_pan_can_dp_neon
#define _op_blend_p_caa_dpan_neon _op_blend_p_caa_dp_neon
#define _op_blend_pas_caa_dpan_neon _op_blend_pas_caa_dp_neon
#define _op_blend_pan_caa_dpan_neon _op_blend_pan_caa_dp_neon
435 | |||
436 | |||
437 | static void | ||
438 | init_blend_pixel_color_span_funcs_neon(void) | ||
439 | { | ||
440 | op_blend_span_funcs[SP][SM_N][SC][DP][CPU_NEON] = _op_blend_p_c_dp_neon; | ||
441 | op_blend_span_funcs[SP_AS][SM_N][SC][DP][CPU_NEON] = _op_blend_pas_c_dp_neon; | ||
442 | op_blend_span_funcs[SP_AN][SM_N][SC][DP][CPU_NEON] = _op_blend_pan_c_dp_neon; | ||
443 | op_blend_span_funcs[SP][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_p_can_dp_neon; | ||
444 | op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pas_can_dp_neon; | ||
445 | op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pan_can_dp_neon; | ||
446 | op_blend_span_funcs[SP][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_p_caa_dp_neon; | ||
447 | op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pas_caa_dp_neon; | ||
448 | op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pan_caa_dp_neon; | ||
449 | |||
450 | op_blend_span_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_p_c_dpan_neon; | ||
451 | op_blend_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pas_c_dpan_neon; | ||
452 | op_blend_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pan_c_dpan_neon; | ||
453 | op_blend_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_p_can_dpan_neon; | ||
454 | op_blend_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pas_can_dpan_neon; | ||
455 | op_blend_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pan_can_dpan_neon; | ||
456 | op_blend_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_p_caa_dpan_neon; | ||
457 | op_blend_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pas_caa_dpan_neon; | ||
458 | op_blend_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pan_caa_dpan_neon; | ||
459 | } | ||
460 | #endif | ||
461 | |||
462 | #ifdef BUILD_NEON | ||
463 | static void | ||
464 | _op_blend_pt_p_c_dp_neon(DATA32 s, DATA8 m __UNUSED__, DATA32 c, DATA32 *d) { | ||
465 | s = MUL4_SYM(c, s); | ||
466 | c = 256 - (s >> 24); | ||
467 | *d = s + MUL_256(c, *d); | ||
468 | } | ||
469 | |||
/* The single-pixel blender covers all source/color alpha variants, so
 * every pt entry (including the dpan ones) aliases the generic version. */
#define _op_blend_pt_pas_c_dp_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pan_c_dp_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_p_can_dp_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pas_can_dp_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pan_can_dp_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_p_caa_dp_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pas_caa_dp_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pan_caa_dp_neon _op_blend_pt_p_c_dp_neon

#define _op_blend_pt_p_c_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pas_c_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pan_c_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_p_can_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pas_can_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pan_can_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_p_caa_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pas_caa_dpan_neon _op_blend_pt_p_c_dp_neon
#define _op_blend_pt_pan_caa_dpan_neon _op_blend_pt_p_c_dp_neon
488 | |||
489 | static void | ||
490 | init_blend_pixel_color_pt_funcs_neon(void) | ||
491 | { | ||
492 | op_blend_pt_funcs[SP][SM_N][SC][DP][CPU_NEON] = _op_blend_pt_p_c_dp_neon; | ||
493 | op_blend_pt_funcs[SP_AS][SM_N][SC][DP][CPU_NEON] = _op_blend_pt_pas_c_dp_neon; | ||
494 | op_blend_pt_funcs[SP_AN][SM_N][SC][DP][CPU_NEON] = _op_blend_pt_pan_c_dp_neon; | ||
495 | op_blend_pt_funcs[SP][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pt_p_can_dp_neon; | ||
496 | op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pt_pas_can_dp_neon; | ||
497 | op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP][CPU_NEON] = _op_blend_pt_pan_can_dp_neon; | ||
498 | op_blend_pt_funcs[SP][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pt_p_caa_dp_neon; | ||
499 | op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pt_pas_caa_dp_neon; | ||
500 | op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP][CPU_NEON] = _op_blend_pt_pan_caa_dp_neon; | ||
501 | |||
502 | op_blend_pt_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pt_p_c_dpan_neon; | ||
503 | op_blend_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pt_pas_c_dpan_neon; | ||
504 | op_blend_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_pt_pan_c_dpan_neon; | ||
505 | op_blend_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_p_can_dpan_neon; | ||
506 | op_blend_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_pas_can_dpan_neon; | ||
507 | op_blend_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_pan_can_dpan_neon; | ||
508 | op_blend_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_p_caa_dpan_neon; | ||
509 | op_blend_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_pas_caa_dpan_neon; | ||
510 | op_blend_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_pan_caa_dpan_neon; | ||
511 | } | ||
512 | #endif | ||
513 | |||
514 | /*-----*/ | ||
515 | |||
516 | /* blend_rel pixel x color -> dst */ | ||
517 | |||
518 | #ifdef BUILD_NEON | ||
519 | |||
/* For alpha-only destinations, "blend_rel" is identical to plain blend,
 * so the rel span entries alias the corresponding dpan blenders. */
#define _op_blend_rel_p_c_dpan_neon _op_blend_p_c_dpan_neon
#define _op_blend_rel_pas_c_dpan_neon _op_blend_pas_c_dpan_neon
#define _op_blend_rel_pan_c_dpan_neon _op_blend_pan_c_dpan_neon
#define _op_blend_rel_p_can_dpan_neon _op_blend_p_can_dpan_neon
#define _op_blend_rel_pas_can_dpan_neon _op_blend_pas_can_dpan_neon
#define _op_blend_rel_pan_can_dpan_neon _op_blend_pan_can_dpan_neon
#define _op_blend_rel_p_caa_dpan_neon _op_blend_p_caa_dpan_neon
#define _op_blend_rel_pas_caa_dpan_neon _op_blend_pas_caa_dpan_neon
#define _op_blend_rel_pan_caa_dpan_neon _op_blend_pan_caa_dpan_neon
529 | |||
530 | static void | ||
531 | init_blend_rel_pixel_color_span_funcs_neon(void) | ||
532 | { | ||
533 | op_blend_rel_span_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_p_c_dpan_neon; | ||
534 | op_blend_rel_span_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pas_c_dpan_neon; | ||
535 | op_blend_rel_span_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pan_c_dpan_neon; | ||
536 | op_blend_rel_span_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_p_can_dpan_neon; | ||
537 | op_blend_rel_span_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pas_can_dpan_neon; | ||
538 | op_blend_rel_span_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pan_can_dpan_neon; | ||
539 | op_blend_rel_span_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_p_caa_dpan_neon; | ||
540 | op_blend_rel_span_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pas_caa_dpan_neon; | ||
541 | op_blend_rel_span_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pan_caa_dpan_neon; | ||
542 | } | ||
543 | #endif | ||
544 | |||
545 | #ifdef BUILD_NEON | ||
546 | |||
/* Same reasoning as the rel span aliases: blend_rel on an alpha-only
 * destination equals plain blend, so reuse the dpan pt blenders. */
#define _op_blend_rel_pt_p_c_dpan_neon _op_blend_pt_p_c_dpan_neon
#define _op_blend_rel_pt_pas_c_dpan_neon _op_blend_pt_pas_c_dpan_neon
#define _op_blend_rel_pt_pan_c_dpan_neon _op_blend_pt_pan_c_dpan_neon
#define _op_blend_rel_pt_p_can_dpan_neon _op_blend_pt_p_can_dpan_neon
#define _op_blend_rel_pt_pas_can_dpan_neon _op_blend_pt_pas_can_dpan_neon
#define _op_blend_rel_pt_pan_can_dpan_neon _op_blend_pt_pan_can_dpan_neon
#define _op_blend_rel_pt_p_caa_dpan_neon _op_blend_pt_p_caa_dpan_neon
#define _op_blend_rel_pt_pas_caa_dpan_neon _op_blend_pt_pas_caa_dpan_neon
#define _op_blend_rel_pt_pan_caa_dpan_neon _op_blend_pt_pan_caa_dpan_neon
556 | |||
557 | static void | ||
558 | init_blend_rel_pixel_color_pt_funcs_neon(void) | ||
559 | { | ||
560 | op_blend_rel_pt_funcs[SP][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_c_dpan_neon; | ||
561 | op_blend_rel_pt_funcs[SP_AS][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_c_dpan_neon; | ||
562 | op_blend_rel_pt_funcs[SP_AN][SM_N][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_c_dpan_neon; | ||
563 | op_blend_rel_pt_funcs[SP][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_can_dpan_neon; | ||
564 | op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_can_dpan_neon; | ||
565 | op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_can_dpan_neon; | ||
566 | op_blend_rel_pt_funcs[SP][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_caa_dpan_neon; | ||
567 | op_blend_rel_pt_funcs[SP_AS][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_caa_dpan_neon; | ||
568 | op_blend_rel_pt_funcs[SP_AN][SM_N][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_caa_dpan_neon; | ||
569 | } | ||
570 | #endif | ||