diff options
Diffstat (limited to '')
-rw-r--r-- | libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c | 530 |
1 files changed, 530 insertions, 0 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c new file mode 100644 index 0000000..1cb50b6 --- /dev/null +++ b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c | |||
@@ -0,0 +1,530 @@ | |||
1 | /* blend pixel --> dst */ | ||
2 | |||
3 | #ifdef BUILD_NEON | ||
4 | static void | ||
5 | _op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { | ||
6 | #define AP "blend_p_dp_" | ||
7 | asm volatile ( | ||
8 | ".fpu neon \n\t" | ||
9 | //** init | ||
10 | "vmov.i8 q8, $0x1 \n\t" | ||
11 | |||
12 | AP "loopchoose: \n\t" | ||
13 | // If aligned already - straight to octs | ||
14 | "andS %[tmp], %[d],$0x1f \n\t" | ||
15 | "beq "AP"octloops \n\t" | ||
16 | |||
17 | "andS %[tmp], %[d],$0xf \n\t" | ||
18 | "beq "AP"quadloops \n\t" | ||
19 | |||
20 | "andS %[tmp], %[d],$0x4 \n\t" | ||
21 | "beq "AP"dualloop \n\t" | ||
22 | |||
23 | // Only ever executes once, fall through to dual | ||
24 | AP "singleloop: \n\t" | ||
25 | "vld1.32 d0[0], [%[s]]! \n\t" | ||
26 | "vld1.32 d4[0], [%[d]] \n\t" | ||
27 | |||
28 | "vmvn.u8 d8, d0 \n\t" | ||
29 | "vshr.u32 d8, d8, #24 \n\t" | ||
30 | |||
31 | "vmul.u32 d8, d16, d8 \n\t" | ||
32 | |||
33 | "vmull.u8 q6, d4,d8 \n\t" | ||
34 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
35 | // Add to 's' | ||
36 | "vqadd.u8 q2, q4,q0 \n\t" | ||
37 | |||
38 | "vst1.32 d4[0], [%[d]] \n\t" | ||
39 | "add %[d], #4 \n\t" | ||
40 | |||
41 | // Can we go the fast path? | ||
42 | "andS %[tmp], %[d],$0x1f \n\t" | ||
43 | "beq "AP"octloops \n\t" | ||
44 | |||
45 | "andS %[tmp], %[d],$0x0f \n\t" | ||
46 | "beq "AP"quadloops \n\t" | ||
47 | |||
48 | |||
49 | AP "dualloop: \n\t" | ||
50 | "sub %[tmp], %[e], %[d] \n\t" | ||
51 | "cmp %[tmp], #32 \n\t" | ||
52 | "blt "AP"loopout \n\t" | ||
53 | |||
54 | AP "dualloopint: \n\t" | ||
55 | //** Dual Loop | ||
56 | "vldm %[s]!, {d0} \n\t" | ||
57 | "vldr d4, [%[d]] \n\t" | ||
58 | |||
59 | "vmvn.u8 d8, d0 \n\t" | ||
60 | "vshr.u32 d8, d8, #24 \n\t" | ||
61 | |||
62 | "vmul.u32 d8, d16, d8 \n\t" | ||
63 | |||
64 | "vmull.u8 q6, d4,d8 \n\t" | ||
65 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
66 | // Add to 's' | ||
67 | "vqadd.u8 d4, d8,d0 \n\t" | ||
68 | "vstr d4, [%[d]] \n\t" | ||
69 | "add %[d], #8 \n\t" | ||
70 | |||
71 | "ands %[tmp], %[d], $0x1f \n\t" | ||
72 | "beq "AP"octloops \n\t" | ||
73 | |||
74 | AP"quadloops: \n\t" | ||
75 | "sub %[tmp], %[e], %[d] \n\t" | ||
76 | "cmp %[tmp], #32 \n\t" | ||
77 | "blt "AP"loopout \n\t" | ||
78 | |||
79 | "vldm %[s]!, {d0,d1) \n\t" | ||
80 | "vldm %[d], {d4,d5} \n\t" | ||
81 | |||
82 | |||
83 | // Copy s.a into q2 (>> 24) & subtract from 255 | ||
84 | "vmvn.u8 q4, q0 \n\t" | ||
85 | "vshr.u32 q4, q4,$0x18 \n\t" | ||
86 | |||
87 | // Multiply into all fields | ||
88 | "vmul.u32 q4, q8,q4 \n\t" | ||
89 | |||
90 | // a * d (clobbering 'd'/q7) | ||
91 | "vmull.u8 q6, d4,d8 \n\t" | ||
92 | "vmull.u8 q2, d5,d9 \n\t" | ||
93 | |||
94 | // Shift & narrow it | ||
95 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
96 | "vqrshrn.u16 d9, q2, #8 \n\t" | ||
97 | |||
98 | // Add to s | ||
99 | "vqadd.u8 q2, q4,q0 \n\t" | ||
100 | |||
101 | // Write it | ||
102 | "vstm %[d]!, {d4,d5} \n\t" | ||
103 | |||
104 | AP "octloops: \n\t" | ||
105 | "sub %[tmp], %[e], %[d] \n\t" | ||
106 | "cmp %[tmp], #32 \n\t" | ||
107 | "ble "AP"loopout \n\t" | ||
108 | |||
109 | "sub %[tmp],%[e],#64 \n\t" | ||
110 | |||
111 | |||
112 | AP "octloopint:\n\t" | ||
113 | //** Oct loop | ||
114 | "vldm %[s]!, {d0,d1,d2,d3) \n\t" | ||
115 | "vldm %[d], {d4,d5,d6,d7} \n\t" | ||
116 | "pld [%[s], #64] \n\t" | ||
117 | |||
118 | |||
119 | // Copy s.a into q2 (>> 24) & subtract from 255 | ||
120 | "vmvn.u8 q4, q0 \n\t" | ||
121 | "vmvn.u8 q5, q1 \n\t" | ||
122 | "vshr.u32 q4, q4,$0x18 \n\t" | ||
123 | "vshr.u32 q5, q5,$0x18\n\t" | ||
124 | |||
125 | // Multiply into all fields | ||
126 | "vmul.u32 q4, q8,q4 \n\t" | ||
127 | "vmul.u32 q5, q8,q5 \n\t" | ||
128 | |||
129 | |||
130 | // a * d (clobbering 'd'/q7) | ||
131 | "vmull.u8 q6, d4,d8 \n\t" | ||
132 | "vmull.u8 q2, d5,d9 \n\t" | ||
133 | "vmull.u8 q7, d6,d10 \n\t" | ||
134 | "vmull.u8 q3, d7,d11 \n\t" | ||
135 | |||
136 | "cmp %[tmp], %[d]\n\t" | ||
137 | |||
138 | // Shift & narrow it | ||
139 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
140 | "vqrshrn.u16 d9, q2, #8 \n\t" | ||
141 | "vqrshrn.u16 d10, q7, #8 \n\t" | ||
142 | "vqrshrn.u16 d11, q3, #8 \n\t" | ||
143 | |||
144 | |||
145 | // Add to s | ||
146 | "vqadd.u8 q2, q4,q0 \n\t" | ||
147 | "vqadd.u8 q3, q5,q1 \n\t" | ||
148 | |||
149 | // Write it | ||
150 | "vstm %[d]!, {d4,d5,d6,d7} \n\t" | ||
151 | |||
152 | "bhi "AP"octloopint\n\t" | ||
153 | |||
154 | AP "loopout: \n\t" | ||
155 | "cmp %[d], %[e] \n\t" | ||
156 | "beq "AP"done \n\t" | ||
157 | "sub %[tmp],%[e], %[d] \n\t" | ||
158 | "cmp %[tmp],$0x04 \n\t" | ||
159 | "ble "AP"singleloop2 \n\t" | ||
160 | |||
161 | AP "dualloop2: \n\t" | ||
162 | "sub %[tmp],%[e],$0x7 \n\t" | ||
163 | AP "dualloop2int: \n\t" | ||
164 | //** Trailing double | ||
165 | |||
166 | "vldm %[s]!, {d0} \n\t" | ||
167 | "vldm %[d], {d4} \n\t" | ||
168 | |||
169 | "vmvn.u8 d8, d0 \n\t" | ||
170 | "vshr.u32 d8, d8, #24 \n\t" | ||
171 | |||
172 | "vmul.u32 d8, d16, d8 \n\t" | ||
173 | |||
174 | "vmull.u8 q6, d4,d8 \n\t" | ||
175 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
176 | // Add to 's' | ||
177 | "vqadd.u8 d4, d8,d0 \n\t" | ||
178 | |||
179 | "vstr.32 d4, [%[d]] \n\t" | ||
180 | "add %[d], #8 \n\t" | ||
181 | |||
182 | "cmp %[tmp], %[d] \n\t" | ||
183 | "bhi "AP"dualloop2int \n\t" | ||
184 | |||
185 | // Single ?? | ||
186 | "cmp %[e], %[d] \n\t" | ||
187 | "beq "AP"done \n\t" | ||
188 | |||
189 | AP"singleloop2: \n\t" | ||
190 | "vld1.32 d0[0], [%[s]] \n\t" | ||
191 | "vld1.32 d4[0], [%[d]] \n\t" | ||
192 | |||
193 | "vmvn.u8 d8, d0 \n\t" | ||
194 | "vshr.u32 d8, d8, #24 \n\t" | ||
195 | |||
196 | "vmul.u32 d8, d8, d16 \n\t" | ||
197 | |||
198 | "vmull.u8 q6, d8,d4 \n\t" | ||
199 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
200 | // Add to 's' | ||
201 | "vqadd.u8 d0, d0,d8 \n\t" | ||
202 | "vst1.32 d0[0], [%[d]] \n\t" | ||
203 | |||
204 | //** Trailing single | ||
205 | |||
206 | AP"done:\n\t" | ||
207 | //"sub %[tmp], %[e], #4 \n\t" | ||
208 | //"vmov.i32 d0, $0xffff0000 \n\t" | ||
209 | //"vst1.32 d0[0], [%[tmp]] \n\t" | ||
210 | |||
211 | |||
212 | : // output regs | ||
213 | // Input | ||
214 | : [e] "r" (d + l), [d] "r" (d), [s] "r" (s), [c] "r" (c), | ||
215 | [tmp] "r" (7) | ||
216 | : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered | ||
217 | ); | ||
218 | #undef AP | ||
219 | |||
220 | } | ||
221 | |||
222 | static void | ||
223 | _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { | ||
224 | #define AP "blend_pas_dp_" | ||
225 | DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912; | ||
226 | asm volatile ( | ||
227 | ".fpu neon \n\t" | ||
228 | "vmov.i8 q8, #1 \n\t" | ||
229 | AP"loopchoose: \n\t" | ||
230 | // If aliged - go as fast we can | ||
231 | "andS %[tmp], %[d], #31 \n\t" | ||
232 | "beq "AP"quadstart \n\t" | ||
233 | |||
234 | // See if we can at least do our double loop | ||
235 | "andS %[tmp], %[d], $0x7 \n\t" | ||
236 | "beq "AP"dualstart \n\t" | ||
237 | |||
238 | // Ugly single word version | ||
239 | AP "singleloop: \n\t" | ||
240 | "vld1.32 d0[0], [%[s]]! \n\t" | ||
241 | "vld1.32 d4[0], [%[d]] \n\t" | ||
242 | |||
243 | "vmvn.u8 d8, d0 \n\t" | ||
244 | |||
245 | "vshr.u32 d8, d8,$0x18 \n\t" | ||
246 | |||
247 | // Mulitply into all fields | ||
248 | "vmul.u32 d8, d8, d16 \n\t" | ||
249 | |||
250 | // Multiply out | ||
251 | "vmull.u8 q6, d8, d4 \n\t" | ||
252 | |||
253 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
254 | |||
255 | // Add to s | ||
256 | "vqadd.u8 d0, d0,d8 \n\t" | ||
257 | "vst1.32 d0[0], [%[d]]! \n\t" | ||
258 | |||
259 | AP"dualstart: \n\t" | ||
260 | "sub %[tmp], %[e], %[d] \n\t" | ||
261 | "cmp %[tmp], #32 \n\t" | ||
262 | "blt "AP"loopout \n\t" | ||
263 | |||
264 | // If aligned - go as fast we can | ||
265 | "andS %[tmp], %[d], #31 \n\t" | ||
266 | "beq "AP"quadstart \n\t" | ||
267 | |||
268 | |||
269 | AP"dualloop: \n\t" | ||
270 | |||
271 | "vldm %[s]!, {d0) \n\t" | ||
272 | "vldm %[d], {d4} \n\t" | ||
273 | |||
274 | // Subtract from 255 (ie negate) and extract alpha channel | ||
275 | "vmvn.u8 d8, d0 \n\t" | ||
276 | "vshr.u32 d8, d8,$0x18 \n\t" | ||
277 | |||
278 | // Mulitply into all fields | ||
279 | "vmul.u32 d8, d8, d16 \n\t" | ||
280 | |||
281 | // Multiply out | ||
282 | "vmull.u8 q6, d8, d4 \n\t" | ||
283 | |||
284 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
285 | |||
286 | // Add to s | ||
287 | "vqadd.u8 d0, d0,d8 \n\t" | ||
288 | "vstm %[d]!, {d0} \n\t" | ||
289 | |||
290 | "andS %[tmp], %[d], $0x1f \n\t" | ||
291 | "bne "AP"dualloop \n\t" | ||
292 | |||
293 | |||
294 | AP"quadstart: \n\t" | ||
295 | "sub %[tmp], %[e], %[d] \n\t" | ||
296 | "cmp %[tmp], #32 \n\t" | ||
297 | "blt "AP"loopout \n\t" | ||
298 | |||
299 | "sub %[tmp], %[e], #31 \n\t" | ||
300 | |||
301 | AP"quadloop:\n\t" | ||
302 | "vldm %[s]!, {d0,d1,d2,d3) \n\t" | ||
303 | "vldm %[d], {d4,d5,d6,d7} \n\t" | ||
304 | |||
305 | // Subtract from 255 (ie negate) and extract alpha channel | ||
306 | "vmvn.u8 q4, q0 \n\t" | ||
307 | "vmvn.u8 q5, q1 \n\t" | ||
308 | "vshr.u32 q4, q4,$0x18 \n\t" | ||
309 | "vshr.u32 q5, q5,$0x18 \n\t" | ||
310 | |||
311 | // Prepare to preload | ||
312 | "add %[pl], %[s], #32 \n\t" | ||
313 | |||
314 | // Mulitply into all fields | ||
315 | "vmul.u32 q4, q4, q8 \n\t" | ||
316 | "vmul.u32 q5, q5, q8 \n\t" | ||
317 | "pld [%[pl]] \n\t" | ||
318 | |||
319 | // Multiply out | ||
320 | "vmull.u8 q6, d8, d4 \n\t" | ||
321 | "vmull.u8 q7, d10, d6 \n\t" | ||
322 | "vmull.u8 q2, d9, d5 \n\t" | ||
323 | "vmull.u8 q3, d11, d7 \n\t" | ||
324 | |||
325 | "add %[pl], %[d], #32 \n\t" | ||
326 | |||
327 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
328 | "vqrshrn.u16 d10, q7, #8 \n\t" | ||
329 | "vqrshrn.u16 d9, q2, #8 \n\t" | ||
330 | "vqrshrn.u16 d11, q3, #8 \n\t" | ||
331 | "pld [%[pl]] \n\t" | ||
332 | |||
333 | "cmp %[tmp], %[pl] \n\t" | ||
334 | // Add to s | ||
335 | "vqadd.u8 q0, q0,q4 \n\t" | ||
336 | "vqadd.u8 q1, q1,q5 \n\t" | ||
337 | |||
338 | "vstm %[d]!, {d0,d1,d2,d3} \n\t" | ||
339 | |||
340 | "bhi "AP"quadloop \n\t" | ||
341 | |||
342 | AP "loopout: \n\t" | ||
343 | "cmp %[d], %[e] \n\t" | ||
344 | "beq "AP"done \n\t" | ||
345 | |||
346 | "sub %[tmp],%[e], %[d] \n\t" | ||
347 | "cmp %[tmp],$0x04 \n\t" | ||
348 | "beq "AP"singleloop2 \n\t" | ||
349 | |||
350 | "sub %[tmp],%[e],$0x7 \n\t" | ||
351 | |||
352 | AP"dualloop2: \n\t" | ||
353 | "vldm %[s]!, {d0) \n\t" | ||
354 | "vldm %[d], {d4} \n\t" | ||
355 | |||
356 | // Subtract from 255 (ie negate) and extract alpha channel | ||
357 | "vmvn.u8 d8, d0 \n\t" | ||
358 | "vshr.u32 d8, d8,$0x18 \n\t" | ||
359 | |||
360 | // Mulitply into all fields | ||
361 | "vmul.u32 d8, d8, d16 \n\t" | ||
362 | |||
363 | // Multiply out | ||
364 | "vmull.u8 q6, d8, d4 \n\t" | ||
365 | |||
366 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
367 | |||
368 | // Add to s | ||
369 | "vqadd.u8 d0, d0,d8 \n\t" | ||
370 | |||
371 | "vstm %[d]!, {d0} \n\t" | ||
372 | "cmp %[tmp], %[d] \n\t" | ||
373 | |||
374 | "bhi "AP"dualloop2 \n\t" | ||
375 | |||
376 | // Single ?? | ||
377 | "cmp %[e], %[d] \n\t" | ||
378 | "beq "AP"done \n\t" | ||
379 | |||
380 | AP "singleloop2: \n\t" | ||
381 | "vld1.32 d0[0], [%[s]] \n\t" | ||
382 | "vld1.32 d4[0], [%[d]] \n\t" | ||
383 | |||
384 | "vmvn.u8 d8, d0 \n\t" | ||
385 | |||
386 | "vshr.u32 d8, d8,$0x18 \n\t" | ||
387 | |||
388 | // Mulitply into all fields | ||
389 | "vmul.u32 d8, d8, d16 \n\t" | ||
390 | |||
391 | // Multiply out | ||
392 | "vmull.u8 q6, d8, d4 \n\t" | ||
393 | |||
394 | "vqrshrn.u16 d8, q6, #8 \n\t" | ||
395 | |||
396 | // Add to s | ||
397 | "vqadd.u8 d0, d0,d8 \n\t" | ||
398 | |||
399 | "vst1.32 d0[0], [%[d]] \n\t" | ||
400 | AP "done:\n\t" | ||
401 | |||
402 | |||
403 | : /* Out */ | ||
404 | : /* In */ [s] "r" (s), [e] "r" (e), [d] "r" (d), [tmp] "r" (tmp), | ||
405 | [pl] "r" (pl) | ||
406 | : /* Clobbered */ | ||
407 | "q0","q1","q2","q3","q4","q5","q6","q7","q8","memory" | ||
408 | ); | ||
409 | #undef AP | ||
410 | } | ||
411 | |||
/* No dedicated NEON span routine for a fully-opaque (alpha-none) source;
 * NULL here presumably makes the dispatcher fall back to another
 * implementation -- TODO confirm against the generic op tables. */
#define _op_blend_pan_dp_neon NULL

/* Alpha-less destinations reuse the plain-destination routines: when the
 * destination alpha is ignored, the same math applies. */
#define _op_blend_p_dpan_neon _op_blend_p_dp_neon
#define _op_blend_pas_dpan_neon _op_blend_pas_dp_neon
#define _op_blend_pan_dpan_neon _op_blend_pan_dp_neon

/* Register the NEON pixel-span blenders in the global dispatch table,
 * indexed by source kind (SP/SP_AS/SP_AN), mask (SM_N), color (SC_N),
 * destination kind (DP/DP_AN) and CPU feature (CPU_NEON). */
static void
init_blend_pixel_span_funcs_neon(void)
{
   op_blend_span_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_p_dp_neon;
   op_blend_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pas_dp_neon;
   op_blend_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pan_dp_neon;

   op_blend_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_p_dpan_neon;
   op_blend_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pas_dpan_neon;
   op_blend_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pan_dpan_neon;
}
429 | #endif | ||
430 | |||
431 | #ifdef BUILD_NEON | ||
432 | static void | ||
433 | _op_blend_pt_p_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) { | ||
434 | c = 256 - (s >> 24); | ||
435 | *d = s + MUL_256(c, *d); | ||
436 | } | ||
437 | |||
438 | |||
/* No NEON single-pixel op for an opaque (alpha-none) source; NULL
 * presumably lets the dispatcher pick a fallback -- TODO confirm. */
#define _op_blend_pt_pan_dp_neon NULL
/* Alpha-sparse sources use the generic per-pixel blender. */
#define _op_blend_pt_pas_dp_neon _op_blend_pt_p_dp_neon

/* Alpha-less destinations reuse the plain-destination entries. */
#define _op_blend_pt_p_dpan_neon _op_blend_pt_p_dp_neon
#define _op_blend_pt_pan_dpan_neon _op_blend_pt_pan_dp_neon
#define _op_blend_pt_pas_dpan_neon _op_blend_pt_pas_dp_neon

/* Register the NEON single-pixel ("pt") blend ops in the dispatch table. */
static void
init_blend_pixel_pt_funcs_neon(void)
{
   op_blend_pt_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_p_dp_neon;
   op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_pas_dp_neon;
   op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_pan_dp_neon;

   op_blend_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_p_dpan_neon;
   op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_pas_dpan_neon;
   op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_pan_dpan_neon;
}
457 | #endif | ||
458 | |||
459 | /*-----*/ | ||
460 | |||
461 | /* blend_rel pixel -> dst */ | ||
462 | |||
463 | #ifdef BUILD_NEON | ||
464 | static void | ||
465 | _op_blend_rel_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { | ||
466 | DATA32 *e = d + l; | ||
467 | while (d < e) { | ||
468 | l = 256 - (*s >> 24); | ||
469 | c = 1 + (*d >> 24); | ||
470 | *d = MUL_256(c, *s) + MUL_256(l, *d); | ||
471 | d++; | ||
472 | s++; | ||
473 | } | ||
474 | } | ||
475 | |||
476 | static void | ||
477 | _op_blend_rel_pan_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { | ||
478 | DATA32 *e = d + l; | ||
479 | while (d < e) { | ||
480 | c = 1 + (*d >> 24); | ||
481 | *d++ = MUL_256(c, *s); | ||
482 | s++; | ||
483 | } | ||
484 | } | ||
485 | |||
/* Alpha-sparse sources share the generic relative-blend span routine. */
#define _op_blend_rel_pas_dp_neon _op_blend_rel_p_dp_neon

/* For alpha-less destinations the destination alpha is ignored, so the
 * relative ops reuse the plain (non-relative) dpan blenders. */
#define _op_blend_rel_p_dpan_neon _op_blend_p_dpan_neon
#define _op_blend_rel_pan_dpan_neon _op_blend_pan_dpan_neon
#define _op_blend_rel_pas_dpan_neon _op_blend_pas_dpan_neon

/* Register the NEON relative-blend span ops in the dispatch table. */
static void
init_blend_rel_pixel_span_funcs_neon(void)
{
   op_blend_rel_span_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_p_dp_neon;
   op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pas_dp_neon;
   op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pan_dp_neon;

   op_blend_rel_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_p_dpan_neon;
   op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pas_dpan_neon;
   op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pan_dpan_neon;
}
503 | #endif | ||
504 | |||
505 | #ifdef BUILD_NEON | ||
506 | static void | ||
507 | _op_blend_rel_pt_p_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) { | ||
508 | c = 256 - (s >> 24); | ||
509 | *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d); | ||
510 | } | ||
511 | |||
/* Alpha-sparse and opaque sources share the generic relative per-pixel
 * blender. */
#define _op_blend_rel_pt_pas_dp_neon _op_blend_rel_pt_p_dp_neon
#define _op_blend_rel_pt_pan_dp_neon _op_blend_rel_pt_p_dp_neon

/* For alpha-less destinations the destination alpha is ignored, so the
 * relative per-pixel ops reuse the plain (non-relative) dpan entries. */
#define _op_blend_rel_pt_p_dpan_neon _op_blend_pt_p_dpan_neon
#define _op_blend_rel_pt_pas_dpan_neon _op_blend_pt_pas_dpan_neon
#define _op_blend_rel_pt_pan_dpan_neon _op_blend_pt_pan_dpan_neon

/* Register the NEON relative single-pixel ("pt") blend ops in the
 * dispatch table. */
static void
init_blend_rel_pixel_pt_funcs_neon(void)
{
   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_p_dp_neon;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_pas_dp_neon;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_pan_dp_neon;

   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_dpan_neon;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_dpan_neon;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_dpan_neon;
}
530 | #endif | ||