diff options
Diffstat (limited to '')
-rw-r--r-- | libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c | 562 |
1 files changed, 0 insertions, 562 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c deleted file mode 100644 index f5eb480..0000000 --- a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_neon.c +++ /dev/null | |||
@@ -1,562 +0,0 @@ | |||
/* Set to 1 to enable periodic call-count tracing for the blend ops below. */
#define NEONDEBUG 0


#if NEONDEBUG
/* Print file/line/function plus a running invocation count, once every
 * 10000 calls.  'x' annotates the path, e.g. "" -> "optimised",
 * "not" -> "not optimised". */
#define DEBUG_FNCOUNT(x) \
  do { \
   static int _foo = 0; \
   if (_foo++%10000 ==0) \
     printf("%s %+d %s: %d (%s)\n",__FILE__,__LINE__,__FUNCTION__,\
		     _foo, x " optimised");\
  } while (0)
#else
/* Tracing disabled: swallow the argument with no side effects. */
#define DEBUG_FNCOUNT(x) ((void)x)
#endif
15 | |||
16 | |||
17 | /* blend mask x color -> dst */ | ||
18 | |||
#ifdef BUILD_NEON
/* NEON span op: blend (mask x color) onto dst.
 * For each pixel: mc = c * m (symmetric 8-bit multiply), then
 * dst = mc + dst * (255 - mc.alpha), i.e. a plain src-over blend of the
 * mask-scaled color.  Handles unaligned leading pixels (single/dual),
 * a 16-byte-aligned 4-pixel main loop with a zero-mask fast path, and
 * trailing pixels.
 *
 * FIX: the asm post-increments %[d] and %[m] and writes %[tmp]/%[x],
 * but all of them were previously declared as *input* operands (tmp/x
 * even bound to literals).  GCC extended asm forbids writing inputs —
 * that is undefined behavior and can miscompile.  They are now proper
 * "+r" read-write operands backed by locals, and "cc" is clobbered
 * because andS/cmp set the condition flags.  The asm body itself is
 * unchanged.
 */
static void
_op_blend_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
   DATA32 *e = d + l;          /* one past the last destination pixel */
   unsigned int tmp = 0;       /* scratch: alignment tests, span length */
   unsigned int x = 0;         /* scratch: 4 mask bytes in the main loop */

   DEBUG_FNCOUNT("");

#define AP "blend_mas_c_dp_"
   asm volatile (
      ".fpu neon                                   \n\t"
      "   vdup.i32   q15, %[c]                     \n\t"
      "   vmov.i8    q14, #1                       \n\t"

      // If aligned already - straight to quads
      "   andS       %[tmp], %[d],$0xf             \n\t"
      "   beq        "AP"quadloops                 \n\t"

      "   andS       %[tmp], %[d],$0x4             \n\t"
      "   beq        "AP"dualloop                  \n\t"

      AP"singleloop:                               \n\t"
      "   vld1.8     d0[0], [%[m]]!                \n\t"
      "   vld1.32    d4[0], [%[d]]                 \n\t"
      "   vdup.u8    d0, d0[0]                     \n\t"
      "   vmull.u8   q4, d0, d30                   \n\t"
      "   vqrshrn.u16 d12, q4, #8                  \n\t"
      "   vmvn.u16   d14, d12                      \n\t"
      "   vshr.u32   d16, d14, #24                 \n\t"
      "   vmul.u32   d16, d16, d28                 \n\t"
      "   vmull.u8   q7, d16, d4                   \n\t"
      "   vqrshrn.u16 d0, q7, #8                   \n\t"
      "   vqadd.u8   d0, d0, d12                   \n\t"
      "   vst1.32    d0[0], [%[d]]!                \n\t"

      // Can we go the fast path?
      "   andS       %[tmp], %[d],$0xf             \n\t"
      "   beq        "AP"quadloops                 \n\t"

      AP"dualloop:                                 \n\t"
      "   sub        %[tmp], %[e], %[d]            \n\t"
      "   cmp        %[tmp], #16                   \n\t"
      "   blt        "AP"loopout                   \n\t"

      "   vld1.16    d0[0], [%[m]]!                \n\t"
      "   vldm       %[d], {d4}                    \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmul.u32   q0, q14                       \n\t"
      "   vmull.u8   q4, d0, d30                   \n\t"
      "   vqrshrn.u16 d12, q4, #8                  \n\t"
      "   vmvn.u16   d14, d12                      \n\t"
      "   vshr.u32   d16, d14, #24                 \n\t"
      "   vmul.u32   d16, d16, d28                 \n\t"
      "   vmull.u8   q7, d16, d4                   \n\t"
      "   vqrshrn.u16 d0, q7, #8                   \n\t"
      "   vqadd.u8   q0, q0, q6                    \n\t"
      "   vstm       %[d]!, {d0}                   \n\t"

      AP"quadloops:                                \n\t"
      "   sub        %[tmp], %[e], %[d]            \n\t"
      "   cmp        %[tmp], #16                   \n\t"
      "   blt        "AP"loopout                   \n\t"


      "   sub        %[tmp], %[e], #15             \n\t"

      "   sub        %[d], #16                     \n\t"
      AP"fastloop:"
      "   add        %[d], #16                     \n\t"
      "   cmp        %[tmp], %[d]                  \n\t"
      "   ble        "AP"loopout                   \n\t"
      AP"quadloopint:                              \n\t"
      "   ldr        %[x], [%[m]]                  \n\t"
      "   add        %[m], #4                      \n\t"
      "   cmp        %[x], #0                      \n\t"
      "   beq        "AP"fastloop                  \n\t"
      "   vmov.32    d0[0], %[x]                   \n\t"
      "   vldm       %[d], {d4,d5}                 \n\t"

      // Expand M: Fixme: Can we do this quicker?
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmul.u32   q0, q14                       \n\t"

      // Multiply a * c
      "   vmull.u8   q4, d0, d30                   \n\t"
      "   vmull.u8   q5, d1, d31                   \n\t"

      // Shorten
      "   vqrshrn.u16 d12, q4, #8                  \n\t"
      "   vqrshrn.u16 d13, q5, #8                  \n\t"

      // extract negated alpha
      "   vmvn.u16   q7, q6                        \n\t"
      "   vshr.u32   q8, q7, #24                   \n\t"
      "   vmul.u32   q8, q8, q14                   \n\t"

      // Multiply
      "   vmull.u8   q7, d16, d4                   \n\t"
      "   vmull.u8   q8, d17, d5                   \n\t"

      "   vqrshrn.u16 d0, q7, #8                   \n\t"
      "   vqrshrn.u16 d1, q8, #8                   \n\t"

      // Add
      "   vqadd.u8   q0, q0, q6                    \n\t"

      "   vstm       %[d]!, {d0,d1}                \n\t"

      "   cmp        %[tmp], %[d]                  \n\t"
      "   bhi        "AP"quadloopint               \n\t"

      AP"loopout:                                  \n\t"
#if NEONDEBUG
      "cmp           %[d], %[e]                    \n\t"
      "ble           "AP"foo                       \n\t"
      "cmp           %[tmp], %[m]                  \n\t"
      "sub           %[x], %[x]                    \n\t"
      "vst1.32       d0[0], [%[x]]                 \n\t"
      AP"foo:                                      \n\t"
#endif

      "   cmp        %[d], %[e]                    \n\t"
      "   beq        "AP"done                      \n\t"
      "   sub        %[tmp],%[e], %[d]             \n\t"
      "   cmp        %[tmp],#4                     \n\t"
      "   beq        "AP"singleout                 \n\t"

      AP "dualloop2:                               \n\t"
      "sub           %[tmp],%[e],$0x8              \n\t"
      "   vld1.16    d0[0], [%[m]]!                \n\t"
      "   vldm       %[d], {d4}                    \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmul.u32   q0, q14                       \n\t"
      "   vmull.u8   q4, d0, d30                   \n\t"
      "   vqrshrn.u16 d12, q4, #8                  \n\t"
      "   vmvn.u16   d14, d12                      \n\t"
      "   vshr.u32   d16, d14, #24                 \n\t"
      "   vmul.u32   d16, d16, d28                 \n\t"
      "   vmull.u8   q7, d16, d4                   \n\t"
      "   vqrshrn.u16 d0, q7, #8                   \n\t"
      "   vqadd.u8   q0, q0, q6                    \n\t"
      "   vstm       %[d]!, {d0}                   \n\t"

      "   cmp        %[e], %[d]                    \n\t"
      "   beq        "AP"done                      \n\t"

      AP"singleout:                                \n\t"
      "   vld1.8     d0[0], [%[m]]!                \n\t"
      "   vld1.32    d4[0], [%[d]]                 \n\t"
      "   vdup.u8    d0, d0[0]                     \n\t"
      "   vmull.u8   q4, d0, d30                   \n\t"
      "   vqrshrn.u16 d12, q4, #8                  \n\t"
      "   vmvn.u16   d14, d12                      \n\t"
      "   vshr.u32   d16, d14, #24                 \n\t"
      "   vmul.u32   d16, d16, d28                 \n\t"
      "   vmull.u8   q7, d16, d4                   \n\t"
      "   vqrshrn.u16 d0, q7, #8                   \n\t"
      "   vqadd.u8   q0, q0, q6                    \n\t"
      "   vst1.32    d0[0], [%[d]]!                \n\t"

      AP"done:                                     \n\t"
#if NEONDEBUG
      "cmp           %[d], %[e]                    \n\t"
      "beq           "AP"reallydone                \n\t"
      "sub           %[tmp], %[tmp]                \n\t"
      "vst1.32       d0[0], [%[tmp]]               \n\t"
      AP"reallydone:"
#endif
      : // Out: everything the asm writes must be read-write
        [d] "+r" (d), [m] "+r" (m), [tmp] "+r" (tmp), [x] "+r" (x)
      : // In
        [e] "r" (e), [c] "r" (c)
      : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","q14","q15",
        "cc", "memory" // clobbered: flags (andS/cmp) and the dst span
   );
#undef AP
}
#endif
198 | |||
#ifdef BUILD_NEON
/* NEON span op: blend (mask x color) onto dst, for a color whose alpha
 * is 255 ("can" = color-alpha-none).  Uses the interpolation form
 * dst += ((c - dst) * m) >> 8, computed in 16-bit lanes with the mask
 * halved first to keep the signed multiply in range.  Fast paths:
 * mask == 0 skips the pixels, mask == 0xffffffff stores the color
 * directly (kept in q9).
 *
 * FIX: as with _op_blend_mas_c_dp_neon, the asm advances %[d]/%[m] and
 * writes %[tmp]/%[x], but these were declared as input-only operands
 * (tmp was even a DATA32* bound to the literal 7, and e was assigned
 * inside an operand expression).  Writing an input operand is undefined
 * behavior in GCC extended asm.  They are now "+r" read-write operands
 * backed by locals, e is computed in C, the unused 'alpha' local is
 * gone, and "cc" is clobbered for andS/cmp.  The asm body is unchanged.
 */
static void
_op_blend_mas_can_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
   DATA32 *e = d + l;          /* one past the last destination pixel */
   unsigned int tmp = 0;       /* scratch: alignment tests, span length */
   unsigned int x = 0;         /* scratch: 4 mask bytes in the main loop */

   DEBUG_FNCOUNT("");

#define AP "_blend_mas_can_dp_neon_"
   asm volatile (
      ".fpu neon                                   \n\t"
      "vdup.u32      q9, %[c]                      \n\t"
      "vmov.i8       q15, #1                       \n\t"
      "vmov.i8       q14, #0                       \n\t"

      // Make C 16 bit (C in q3/q2)
      "vmovl.u8      q3, d19                       \n\t"
      "vmovl.u8      q2, d18                       \n\t"

      // Which loop to start
      "   andS       %[tmp], %[d],$0xf             \n\t"
      "   beq        "AP"quadloop                  \n\t"

      "   andS       %[tmp], %[d], #4              \n\t"
      "   beq        "AP"dualstart                 \n\t"


      AP"singleloop:                               \n\t"
      "   vld1.8     d0[0], [%[m]]!                \n\t"
      "   vld1.32    d8[0], [%[d]]                 \n\t"
      "   vdup.u8    d0, d0[0]                     \n\t"
      "   vshr.u8    d0, d0, #1                    \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q4, d8                        \n\t"
      "   vsub.s16   q6, q2, q4                    \n\t"
      "   vmul.s16   q6, q0                        \n\t"
      "   vshr.s16   q6, #7                        \n\t"
      "   vadd.s16   q6, q4                        \n\t"
      "   vqmovun.s16 d2, q6                       \n\t"
      "   vst1.32    d2[0], [%[d]]!                \n\t"

      "   andS       %[tmp], %[d], $0xf            \n\t"
      "   beq        "AP"quadloop                  \n\t"

      AP"dualstart:                                \n\t"
      "   sub        %[tmp], %[e], %[d]            \n\t"
      "   cmp        %[tmp], #16                   \n\t"
      "   blt        "AP"loopout                   \n\t"

      AP"dualloop:                                 \n\t"
      "   vld1.16    d0[0], [%[m]]!                \n\t"
      "   vldm       %[d], {d8}                    \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmul.u32   d0, d0, d30                   \n\t"
      "   vshr.u8    d0, d0, #1                    \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q4, d8                        \n\t"
      "   vsub.s16   q6, q2, q4                    \n\t"
      "   vmul.s16   q6, q0                        \n\t"
      "   vshr.s16   q6, #7                        \n\t"
      "   vadd.s16   q6, q4                        \n\t"
      "   vqmovun.s16 d2, q6                       \n\t"
      "   vstm       %[d]!, {d2}                   \n\t"

      AP"quadloop:                                 \n\t"
      "   sub        %[tmp], %[e], %[d]            \n\t"
      "   cmp        %[tmp], #16                   \n\t"
      "   blt        "AP"loopout                   \n\t"
      "   sub        %[tmp], %[e], #15             \n\t"

      "   sub        %[d], #16                     \n\t"
      AP"fastloop:                                 \n\t"
      "   add        %[d], #16                     \n\t"
      "   cmp        %[tmp], %[d]                  \n\t"
      "   blt        "AP"loopout                   \n\t"

      AP"quadloopint:                              \n\t"
      // Load the mask: 4 bytes: It has d0/d1
      "   ldr        %[x], [%[m]]                  \n\t"
      "   add        %[m], #4                      \n\t"

      // Check for shortcuts
      "   cmp        %[x], #0                      \n\t"
      "   beq        "AP"fastloop                  \n\t"

      "   cmp        %[x], $0xffffffff             \n\t"
      "   beq        "AP"quadstore                 \n\t"

      "   vmov.32    d0[0], %[x]                   \n\t"
      // Load d into d8/d9 q4
      "   vldm       %[d], {d8,d9}                 \n\t"

      // Get the alpha channel ready (m)
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmul.u32   q0, q0,q15                    \n\t"
      // Lop a bit off to prevent overflow
      "   vshr.u8    q0, q0, #1                    \n\t"

      // Now make it 16 bit
      "   vmovl.u8   q1, d1                        \n\t"
      "   vmovl.u8   q0, d0                        \n\t"

      // 16 bit 'd'
      "   vmovl.u8   q5, d9                        \n\t"
      "   vmovl.u8   q4, d8                        \n\t"

      // Diff 'd' & 'c'
      "   vsub.s16   q7, q3, q5                    \n\t"
      "   vsub.s16   q6, q2, q4                    \n\t"

      "   vmul.s16   q7, q1                        \n\t"
      "   vmul.s16   q6, q0                        \n\t"

      // Shift results a bit
      "   vshr.s16   q7, #7                        \n\t"
      "   vshr.s16   q6, #7                        \n\t"

      // Add 'd'
      "   vadd.s16   q7, q5                        \n\t"
      "   vadd.s16   q6, q4                        \n\t"

      // Make sure none are negative
      "   vqmovun.s16 d9, q7                       \n\t"
      "   vqmovun.s16 d8, q6                       \n\t"

      "   vstm       %[d]!, {d8,d9}                \n\t"

      "   cmp        %[tmp], %[d]                  \n\t"
      "   bhi        "AP"quadloopint               \n\t"
      "   b          "AP"loopout                   \n\t"

      AP"quadstore:                                \n\t"
      "   vstm       %[d]!, {d18,d19}              \n\t"
      "   cmp        %[tmp], %[d]                  \n\t"
      "   bhi        "AP"quadloopint               \n\t"

      AP"loopout:                                  \n\t"
#if NEONDEBUG
      "cmp           %[d], %[e]                    \n\t"
      "ble           "AP"foo                       \n\t"
      "sub           %[tmp], %[tmp]                \n\t"
      "vst1.32       d0[0], [%[tmp]]               \n\t"
      AP"foo:                                      \n\t"
#endif

      "   cmp        %[e], %[d]                    \n\t"
      "   beq        "AP"done                      \n\t"

      "   sub        %[tmp],%[e], %[d]             \n\t"
      "   cmp        %[tmp],#8                     \n\t"

      "   blt        "AP"onebyte                   \n\t"

      // Load the mask: 2 bytes: It has d0
      "   vld1.16    d0[0], [%[m]]!                \n\t"

      // Load d into d8/d9 q4
      "   vldm       %[d], {d8}                    \n\t"

      // Get the alpha channel ready (m)
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmovl.u8   q0, d0                        \n\t"
      "   vmul.u32   d0, d0, d30                   \n\t"
      // Lop a bit off to prevent overflow
      "   vshr.u8    d0, d0, #1                    \n\t"

      // Now make it 16 bit
      "   vmovl.u8   q0, d0                        \n\t"

      // 16 bit 'd'
      "   vmovl.u8   q4, d8                        \n\t"

      // Diff 'd' & 'c'
      "   vsub.s16   q6, q2, q4                    \n\t"

      "   vmul.s16   q6, q0                        \n\t"

      // Shift results a bit
      "   vshr.s16   q6, #7                        \n\t"

      // Add 'd'
      "vadd.s16      q6, q4                        \n\t"

      // Make sure none are negative
      "vqmovun.s16   d2, q6                        \n\t"

      "vstm          %[d]!, {d2}                   \n\t"

      "cmp           %[e], %[d]                    \n\t"
      "beq           "AP"done                      \n\t"

      AP"onebyte:                                  \n\t"
      "vld1.8        d0[0], [%[m]]!                \n\t"
      "vld1.32       d8[0], [%[d]]                 \n\t"
      "vdup.u8       d0, d0[0]                     \n\t"
      "vshr.u8       d0, d0, #1                    \n\t"
      "vmovl.u8      q0, d0                        \n\t"
      "vmovl.u8      q4, d8                        \n\t"
      "vsub.s16      q6, q2, q4                    \n\t"
      "vmul.s16      q6, q0                        \n\t"
      "vshr.s16      q6, #7                        \n\t"
      "vadd.s16      q6, q4                        \n\t"
      "vqmovun.s16   d2, q6                        \n\t"
      "vst1.32       d2[0], [%[d]]!                \n\t"


      AP"done:                                     \n\t"
#if NEONDEBUG
      "cmp           %[d], %[e]                    \n\t"
      "beq           "AP"reallydone                \n\t"
      "sub           %[m], %[m]                    \n\t"
      "vst1.32       d0[0], [%[m]]                 \n\t"
      AP"reallydone:"
#endif

      : // output regs: everything the asm writes must be read-write
        [d] "+r" (d), [m] "+r" (m), [tmp] "+r" (tmp), [x] "+r" (x)
      : // Input
        [e] "r" (e), [c] "r" (c)
      : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q9","q14","q15",
        "cc", "memory" // clobbered: flags (andS/cmp) and the dst span

   );
#undef AP
}
#endif
428 | |||
#ifdef BUILD_NEON
/* Alias the color-variant slots onto the two real implementations.
 * NOTE(review): SC_N -> "can" path and SC_AA -> generic path is taken
 * on trust from the naming convention — confirm against the generic C
 * table in op_blend_mask_color_.c. */
#define _op_blend_mas_cn_dp_neon _op_blend_mas_can_dp_neon
#define _op_blend_mas_caa_dp_neon _op_blend_mas_c_dp_neon

/* Alpha-only destination (DP_AN) slots reuse the plain-dst versions. */
#define _op_blend_mas_c_dpan_neon _op_blend_mas_c_dp_neon
#define _op_blend_mas_cn_dpan_neon _op_blend_mas_cn_dp_neon
#define _op_blend_mas_can_dpan_neon _op_blend_mas_can_dp_neon
#define _op_blend_mas_caa_dpan_neon _op_blend_mas_caa_dp_neon

/* Register the NEON mask-x-color span blenders in the dispatch table. */
static void
init_blend_mask_color_span_funcs_neon(void)
{
   op_blend_span_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_mas_c_dp_neon;
   op_blend_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_mas_cn_dp_neon;
   op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_mas_can_dp_neon;
   op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_mas_caa_dp_neon;

   op_blend_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_mas_c_dpan_neon;
   op_blend_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_mas_cn_dpan_neon;
   op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_mas_can_dpan_neon;
   op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_mas_caa_dpan_neon;
}
#endif
452 | |||
453 | #ifdef BUILD_NEON | ||
454 | static void | ||
455 | _op_blend_pt_mas_c_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) { | ||
456 | s = MUL_SYM(m, c); | ||
457 | c = 256 - (s >> 24); | ||
458 | *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d); | ||
459 | } | ||
460 | |||
461 | |||
/* All color variants of the point op share the one C implementation. */
#define _op_blend_pt_mas_cn_dp_neon _op_blend_pt_mas_c_dp_neon
#define _op_blend_pt_mas_can_dp_neon _op_blend_pt_mas_c_dp_neon
#define _op_blend_pt_mas_caa_dp_neon _op_blend_pt_mas_c_dp_neon

/* Alpha-only destination (DP_AN) slots reuse the plain-dst versions. */
#define _op_blend_pt_mas_c_dpan_neon _op_blend_pt_mas_c_dp_neon
#define _op_blend_pt_mas_cn_dpan_neon _op_blend_pt_mas_cn_dp_neon
#define _op_blend_pt_mas_can_dpan_neon _op_blend_pt_mas_can_dp_neon
#define _op_blend_pt_mas_caa_dpan_neon _op_blend_pt_mas_caa_dp_neon

/* Register the mask-x-color point blenders in the dispatch table. */
static void
init_blend_mask_color_pt_funcs_neon(void)
{
   op_blend_pt_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_pt_mas_c_dp_neon;
   op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_pt_mas_cn_dp_neon;
   op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_pt_mas_can_dp_neon;
   op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_pt_mas_caa_dp_neon;

   op_blend_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_pt_mas_c_dpan_neon;
   op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_mas_cn_dpan_neon;
   op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_pt_mas_can_dpan_neon;
   op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_pt_mas_caa_dpan_neon;
}
#endif
485 | |||
486 | /*-----*/ | ||
487 | |||
488 | /* blend_rel mask x color -> dst */ | ||
489 | |||
#ifdef BUILD_NEON
/* Span op: blend_rel (mask x color) onto dst.  Plain C fallback — not
 * actually NEON yet, hence DEBUG_FNCOUNT("not").  For each pixel:
 * mc = c scaled by the mask, then the source term is additionally
 * weighted by the destination's own alpha (*d >> 24) before the usual
 * (256 - mc.alpha) destination blend.
 * NOTE(review): 'e' appears to be written by UNROLL8_PLD_WHILE as the
 * loop end pointer — confirm against the macro's definition. */
static void
_op_blend_rel_mas_c_dp_neon(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
   DATA32 *e;
   int alpha;

   DEBUG_FNCOUNT("not");

   UNROLL8_PLD_WHILE(d, l, e,
                     {
                        DATA32 mc = MUL_SYM(*m, c);
                        alpha = 256 - (mc >> 24);
                        *d = MUL_SYM(*d >> 24, mc) + MUL_256(alpha, *d);
                        d++;
                        m++;
                     });
}
507 | |||
/* All color variants of the rel span op share the one C implementation. */
#define _op_blend_rel_mas_cn_dp_neon _op_blend_rel_mas_c_dp_neon
#define _op_blend_rel_mas_can_dp_neon _op_blend_rel_mas_c_dp_neon
#define _op_blend_rel_mas_caa_dp_neon _op_blend_rel_mas_c_dp_neon

/* NOTE(review): for alpha-only destinations blend_rel falls back to the
 * non-rel NEON blenders — presumably equivalent there; confirm against
 * the generic C dispatch table. */
#define _op_blend_rel_mas_c_dpan_neon _op_blend_mas_c_dpan_neon
#define _op_blend_rel_mas_cn_dpan_neon _op_blend_mas_cn_dpan_neon
#define _op_blend_rel_mas_can_dpan_neon _op_blend_mas_can_dpan_neon
#define _op_blend_rel_mas_caa_dpan_neon _op_blend_mas_caa_dpan_neon

/* Register the blend_rel mask-x-color span blenders. */
static void
init_blend_rel_mask_color_span_funcs_neon(void)
{
   op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_rel_mas_c_dp_neon;
   op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_rel_mas_cn_dp_neon;
   op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_rel_mas_can_dp_neon;
   op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_rel_mas_caa_dp_neon;

   op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_rel_mas_c_dpan_neon;
   op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_mas_cn_dpan_neon;
   op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_mas_can_dpan_neon;
   op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_mas_caa_dpan_neon;
}
#endif
531 | |||
532 | #ifdef BUILD_NEON | ||
533 | static void | ||
534 | _op_blend_rel_pt_mas_c_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) { | ||
535 | s = MUL_SYM(m, c); | ||
536 | c = 256 - (s >> 24); | ||
537 | *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d); | ||
538 | } | ||
539 | |||
/* All color variants of the rel point op share the one C implementation. */
#define _op_blend_rel_pt_mas_cn_dp_neon _op_blend_rel_pt_mas_c_dp_neon
#define _op_blend_rel_pt_mas_can_dp_neon _op_blend_rel_pt_mas_c_dp_neon
#define _op_blend_rel_pt_mas_caa_dp_neon _op_blend_rel_pt_mas_c_dp_neon

/* NOTE(review): for alpha-only destinations blend_rel reuses the non-rel
 * point blenders — presumably equivalent there; confirm against the
 * generic C dispatch table. */
#define _op_blend_rel_pt_mas_c_dpan_neon _op_blend_pt_mas_c_dpan_neon
#define _op_blend_rel_pt_mas_cn_dpan_neon _op_blend_pt_mas_cn_dpan_neon
#define _op_blend_rel_pt_mas_can_dpan_neon _op_blend_pt_mas_can_dpan_neon
#define _op_blend_rel_pt_mas_caa_dpan_neon _op_blend_pt_mas_caa_dpan_neon

/* Register the blend_rel mask-x-color point blenders. */
static void
init_blend_rel_mask_color_pt_funcs_neon(void)
{
   op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP][CPU_NEON] = _op_blend_rel_pt_mas_c_dp_neon;
   op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_mas_cn_dp_neon;
   op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_NEON] = _op_blend_rel_pt_mas_can_dp_neon;
   op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_NEON] = _op_blend_rel_pt_mas_caa_dp_neon;

   op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_c_dpan_neon;
   op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_cn_dpan_neon;
   op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_can_dpan_neon;
   op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_NEON] = _op_blend_rel_pt_mas_caa_dpan_neon;
}
#endif