Diffstat (limited to 'libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c')
-rw-r--r--  libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c  530
1 file changed, 530 insertions, 0 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c
new file mode 100644
index 0000000..1cb50b6
--- /dev/null
+++ b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_neon.c
@@ -0,0 +1,530 @@
/* blend pixel --> dst */

#ifdef BUILD_NEON
static void
_op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#define AP "blend_p_dp_"
   asm volatile (
      ".fpu neon                                   \n\t"
      //** init
      "vmov.i8     q8, $0x1                        \n\t"

   AP "loopchoose:                                 \n\t"
      // If aligned already - straight to octs
      "andS        %[tmp], %[d], $0x1f             \n\t"
      "beq         "AP"octloops                    \n\t"

      "andS        %[tmp], %[d], $0xf              \n\t"
      "beq         "AP"quadloops                   \n\t"

      "andS        %[tmp], %[d], $0x4              \n\t"
      "beq         "AP"dualloop                    \n\t"

      // Only ever executes once, fall through to dual
   AP "singleloop:                                 \n\t"
      "vld1.32     d0[0], [%[s]]!                  \n\t"
      "vld1.32     d4[0], [%[d]]                   \n\t"

      "vmvn.u8     d8, d0                          \n\t"
      "vshr.u32    d8, d8, #24                     \n\t"

      "vmul.u32    d8, d16, d8                     \n\t"

      "vmull.u8    q6, d4, d8                      \n\t"
      "vqrshrn.u16 d8, q6, #8                      \n\t"
      // Add to 's'
      "vqadd.u8    q2, q4, q0                      \n\t"

      "vst1.32     d4[0], [%[d]]                   \n\t"
      "add         %[d], #4                        \n\t"

      // Can we go the fast path?
      "andS        %[tmp], %[d], $0x1f             \n\t"
      "beq         "AP"octloops                    \n\t"

      "andS        %[tmp], %[d], $0x0f             \n\t"
      "beq         "AP"quadloops                   \n\t"

   AP "dualloop:                                   \n\t"
      "sub         %[tmp], %[e], %[d]              \n\t"
      "cmp         %[tmp], #32                     \n\t"
      "blt         "AP"loopout                     \n\t"

   AP "dualloopint:                                \n\t"
      //** Dual Loop
      "vldm        %[s]!, {d0}                     \n\t"
      "vldr        d4, [%[d]]                      \n\t"

      "vmvn.u8     d8, d0                          \n\t"
      "vshr.u32    d8, d8, #24                     \n\t"

      "vmul.u32    d8, d16, d8                     \n\t"

      "vmull.u8    q6, d4, d8                      \n\t"
      "vqrshrn.u16 d8, q6, #8                      \n\t"
      // Add to 's'
      "vqadd.u8    d4, d8, d0                      \n\t"
      "vstr        d4, [%[d]]                      \n\t"
      "add         %[d], #8                        \n\t"

      "andS        %[tmp], %[d], $0x1f             \n\t"
      "beq         "AP"octloops                    \n\t"

   AP "quadloops:                                  \n\t"
      "sub         %[tmp], %[e], %[d]              \n\t"
      "cmp         %[tmp], #32                     \n\t"
      "blt         "AP"loopout                     \n\t"

      "vldm        %[s]!, {d0,d1}                  \n\t"
      "vldm        %[d], {d4,d5}                   \n\t"

      // Copy s.a into q4 (>> 24) & subtract from 255
      "vmvn.u8     q4, q0                          \n\t"
      "vshr.u32    q4, q4, $0x18                   \n\t"

      // Multiply into all fields
      "vmul.u32    q4, q8, q4                      \n\t"

      // a * d (clobbering 'd'/q2)
      "vmull.u8    q6, d4, d8                      \n\t"
      "vmull.u8    q2, d5, d9                      \n\t"

      // Shift & narrow it
      "vqrshrn.u16 d8, q6, #8                      \n\t"
      "vqrshrn.u16 d9, q2, #8                      \n\t"

      // Add to s
      "vqadd.u8    q2, q4, q0                      \n\t"

      // Write it
      "vstm        %[d]!, {d4,d5}                  \n\t"

   AP "octloops:                                   \n\t"
      "sub         %[tmp], %[e], %[d]              \n\t"
      "cmp         %[tmp], #32                     \n\t"
      "ble         "AP"loopout                     \n\t"

      "sub         %[tmp], %[e], #64               \n\t"

   AP "octloopint:                                 \n\t"
      //** Oct loop
      "vldm        %[s]!, {d0,d1,d2,d3}            \n\t"
      "vldm        %[d], {d4,d5,d6,d7}             \n\t"
      "pld         [%[s], #64]                     \n\t"

      // Copy s.a into q4/q5 (>> 24) & subtract from 255
      "vmvn.u8     q4, q0                          \n\t"
      "vmvn.u8     q5, q1                          \n\t"
      "vshr.u32    q4, q4, $0x18                   \n\t"
      "vshr.u32    q5, q5, $0x18                   \n\t"

      // Multiply into all fields
      "vmul.u32    q4, q8, q4                      \n\t"
      "vmul.u32    q5, q8, q5                      \n\t"

      // a * d (clobbering 'd'/q2,q3)
      "vmull.u8    q6, d4, d8                      \n\t"
      "vmull.u8    q2, d5, d9                      \n\t"
      "vmull.u8    q7, d6, d10                     \n\t"
      "vmull.u8    q3, d7, d11                     \n\t"

      "cmp         %[tmp], %[d]                    \n\t"

      // Shift & narrow it
      "vqrshrn.u16 d8, q6, #8                      \n\t"
      "vqrshrn.u16 d9, q2, #8                      \n\t"
      "vqrshrn.u16 d10, q7, #8                     \n\t"
      "vqrshrn.u16 d11, q3, #8                     \n\t"

      // Add to s
      "vqadd.u8    q2, q4, q0                      \n\t"
      "vqadd.u8    q3, q5, q1                      \n\t"

      // Write it
      "vstm        %[d]!, {d4,d5,d6,d7}            \n\t"

      "bhi         "AP"octloopint                  \n\t"

   AP "loopout:                                    \n\t"
      "cmp         %[d], %[e]                      \n\t"
      "beq         "AP"done                        \n\t"
      "sub         %[tmp], %[e], %[d]              \n\t"
      "cmp         %[tmp], $0x04                   \n\t"
      "ble         "AP"singleloop2                 \n\t"

   AP "dualloop2:                                  \n\t"
      "sub         %[tmp], %[e], $0x7              \n\t"
   AP "dualloop2int:                               \n\t"
      //** Trailing double

      "vldm        %[s]!, {d0}                     \n\t"
      "vldm        %[d], {d4}                      \n\t"

      "vmvn.u8     d8, d0                          \n\t"
      "vshr.u32    d8, d8, #24                     \n\t"

      "vmul.u32    d8, d16, d8                     \n\t"

      "vmull.u8    q6, d4, d8                      \n\t"
      "vqrshrn.u16 d8, q6, #8                      \n\t"
      // Add to 's'
      "vqadd.u8    d4, d8, d0                      \n\t"

      "vstr        d4, [%[d]]                      \n\t"
      "add         %[d], #8                        \n\t"

      "cmp         %[tmp], %[d]                    \n\t"
      "bhi         "AP"dualloop2int                \n\t"

      // Single ??
      "cmp         %[e], %[d]                      \n\t"
      "beq         "AP"done                        \n\t"

   AP "singleloop2:                                \n\t"
      "vld1.32     d0[0], [%[s]]                   \n\t"
      "vld1.32     d4[0], [%[d]]                   \n\t"

      "vmvn.u8     d8, d0                          \n\t"
      "vshr.u32    d8, d8, #24                     \n\t"

      "vmul.u32    d8, d8, d16                     \n\t"

      "vmull.u8    q6, d8, d4                      \n\t"
      "vqrshrn.u16 d8, q6, #8                      \n\t"
      // Add to 's'
      "vqadd.u8    d0, d0, d8                      \n\t"
      "vst1.32     d0[0], [%[d]]                   \n\t"

      //** Trailing single

   AP "done:                                       \n\t"
//    "sub         %[tmp], %[e], #4                \n\t"
//    "vmov.i32    d0, $0xffff0000                 \n\t"
//    "vst1.32     d0[0], [%[tmp]]                 \n\t"

      : // output regs
      // Input
      : [e] "r" (d + l), [d] "r" (d), [s] "r" (s), [c] "r" (c),
        [tmp] "r" (7)
      : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory" // clobbered
   );
#undef AP
}
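
/* For reference only: a plain C equivalent of what the assembly above
 * computes per pixel - dst = src + dst * (256 - src.alpha) / 256, the same
 * formula the scalar _op_blend_pt_p_dp_neon below uses (the NEON path
 * approximates the divide with a rounding narrowing shift, vqrshrn #8).
 * Sketch only, not compiled in; assumes MUL_256 from evas_blend_ops.h. */
#if 0
static void
_op_blend_p_dp_c_ref(DATA32 *s, DATA32 *d, int l)
{
   DATA32 *e = d + l;
   while (d < e)
     {
        DATA32 ia = 256 - (*s >> 24); /* inverse source alpha, 1..256 */
        *d = *s + MUL_256(ia, *d);    /* src + dst * (256 - sa) / 256 */
        d++; s++;
     }
}
#endif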

static void
_op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
#define AP "blend_pas_dp_"
   DATA32 *e = d + l, *tmp = e + 32, *pl = (void *)912; // tmp/pl: scratch values for the asm below
   asm volatile (
      ".fpu neon                                   \n\t"
      "vmov.i8     q8, #1                          \n\t"
   AP "loopchoose:                                 \n\t"
      // If aligned - go as fast as we can
      "andS        %[tmp], %[d], #31               \n\t"
      "beq         "AP"quadstart                   \n\t"

      // See if we can at least do our double loop
      "andS        %[tmp], %[d], $0x7              \n\t"
      "beq         "AP"dualstart                   \n\t"

      // Ugly single word version
   AP "singleloop:                                 \n\t"
      "vld1.32     d0[0], [%[s]]!                  \n\t"
      "vld1.32     d4[0], [%[d]]                   \n\t"

      "vmvn.u8     d8, d0                          \n\t"

      "vshr.u32    d8, d8, $0x18                   \n\t"

      // Multiply into all fields
      "vmul.u32    d8, d8, d16                     \n\t"

      // Multiply out
      "vmull.u8    q6, d8, d4                      \n\t"

      "vqrshrn.u16 d8, q6, #8                      \n\t"

      // Add to s
      "vqadd.u8    d0, d0, d8                      \n\t"
      "vst1.32     d0[0], [%[d]]!                  \n\t"

   AP "dualstart:                                  \n\t"
      "sub         %[tmp], %[e], %[d]              \n\t"
      "cmp         %[tmp], #32                     \n\t"
      "blt         "AP"loopout                     \n\t"

      // If aligned - go as fast as we can
      "andS        %[tmp], %[d], #31               \n\t"
      "beq         "AP"quadstart                   \n\t"

   AP "dualloop:                                   \n\t"
      "vldm        %[s]!, {d0}                     \n\t"
      "vldm        %[d], {d4}                      \n\t"

      // Subtract from 255 (i.e. negate) and extract alpha channel
      "vmvn.u8     d8, d0                          \n\t"
      "vshr.u32    d8, d8, $0x18                   \n\t"

      // Multiply into all fields
      "vmul.u32    d8, d8, d16                     \n\t"

      // Multiply out
      "vmull.u8    q6, d8, d4                      \n\t"

      "vqrshrn.u16 d8, q6, #8                      \n\t"

      // Add to s
      "vqadd.u8    d0, d0, d8                      \n\t"
      "vstm        %[d]!, {d0}                     \n\t"

      "andS        %[tmp], %[d], $0x1f             \n\t"
      "bne         "AP"dualloop                    \n\t"

   AP "quadstart:                                  \n\t"
      "sub         %[tmp], %[e], %[d]              \n\t"
      "cmp         %[tmp], #32                     \n\t"
      "blt         "AP"loopout                     \n\t"

      "sub         %[tmp], %[e], #31               \n\t"

   AP "quadloop:                                   \n\t"
      "vldm        %[s]!, {d0,d1,d2,d3}            \n\t"
      "vldm        %[d], {d4,d5,d6,d7}             \n\t"

      // Subtract from 255 (i.e. negate) and extract alpha channel
      "vmvn.u8     q4, q0                          \n\t"
      "vmvn.u8     q5, q1                          \n\t"
      "vshr.u32    q4, q4, $0x18                   \n\t"
      "vshr.u32    q5, q5, $0x18                   \n\t"

      // Prepare to preload
      "add         %[pl], %[s], #32                \n\t"

      // Multiply into all fields
      "vmul.u32    q4, q4, q8                      \n\t"
      "vmul.u32    q5, q5, q8                      \n\t"
      "pld         [%[pl]]                         \n\t"

      // Multiply out
      "vmull.u8    q6, d8, d4                      \n\t"
      "vmull.u8    q7, d10, d6                     \n\t"
      "vmull.u8    q2, d9, d5                      \n\t"
      "vmull.u8    q3, d11, d7                     \n\t"

      "add         %[pl], %[d], #32                \n\t"

      "vqrshrn.u16 d8, q6, #8                      \n\t"
      "vqrshrn.u16 d10, q7, #8                     \n\t"
      "vqrshrn.u16 d9, q2, #8                      \n\t"
      "vqrshrn.u16 d11, q3, #8                     \n\t"
      "pld         [%[pl]]                         \n\t"

      "cmp         %[tmp], %[pl]                   \n\t"
      // Add to s
      "vqadd.u8    q0, q0, q4                      \n\t"
      "vqadd.u8    q1, q1, q5                      \n\t"

      "vstm        %[d]!, {d0,d1,d2,d3}            \n\t"

      "bhi         "AP"quadloop                    \n\t"

   AP "loopout:                                    \n\t"
      "cmp         %[d], %[e]                      \n\t"
      "beq         "AP"done                        \n\t"

      "sub         %[tmp], %[e], %[d]              \n\t"
      "cmp         %[tmp], $0x04                   \n\t"
      "beq         "AP"singleloop2                 \n\t"

      "sub         %[tmp], %[e], $0x7              \n\t"

   AP "dualloop2:                                  \n\t"
      "vldm        %[s]!, {d0}                     \n\t"
      "vldm        %[d], {d4}                      \n\t"

      // Subtract from 255 (i.e. negate) and extract alpha channel
      "vmvn.u8     d8, d0                          \n\t"
      "vshr.u32    d8, d8, $0x18                   \n\t"

      // Multiply into all fields
      "vmul.u32    d8, d8, d16                     \n\t"

      // Multiply out
      "vmull.u8    q6, d8, d4                      \n\t"

      "vqrshrn.u16 d8, q6, #8                      \n\t"

      // Add to s
      "vqadd.u8    d0, d0, d8                      \n\t"

      "vstm        %[d]!, {d0}                     \n\t"
      "cmp         %[tmp], %[d]                    \n\t"

      "bhi         "AP"dualloop2                   \n\t"

      // Single ??
      "cmp         %[e], %[d]                      \n\t"
      "beq         "AP"done                        \n\t"

   AP "singleloop2:                                \n\t"
      "vld1.32     d0[0], [%[s]]                   \n\t"
      "vld1.32     d4[0], [%[d]]                   \n\t"

      "vmvn.u8     d8, d0                          \n\t"

      "vshr.u32    d8, d8, $0x18                   \n\t"

      // Multiply into all fields
      "vmul.u32    d8, d8, d16                     \n\t"

      // Multiply out
      "vmull.u8    q6, d8, d4                      \n\t"

      "vqrshrn.u16 d8, q6, #8                      \n\t"

      // Add to s
      "vqadd.u8    d0, d0, d8                      \n\t"

      "vst1.32     d0[0], [%[d]]                   \n\t"
   AP "done:                                       \n\t"

      : /* Out */
      : /* In */ [s] "r" (s), [e] "r" (e), [d] "r" (d), [tmp] "r" (tmp),
                 [pl] "r" (pl)
      : /* Clobbered */
        "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory"
   );
#undef AP
}
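
/* Both span functions above follow the same shape: peel off single
 * (4-byte) and dual (8-byte) pixels until %[d] is 32-byte aligned, run the
 * wide vldm/vstm loop, then drain the remainder. A minimal C sketch of
 * that control flow, with hypothetical _blend_one()/_blend_eight()
 * helpers standing in for the per-pixel math (illustrative only): */
#if 0
static void
_blend_span_sketch(DATA32 *s, DATA32 *d, int l)
{
   DATA32 *e = d + l;
   /* peel until the destination is 32-byte aligned */
   while ((d < e) && (((unsigned long)d) & 0x1f))
     _blend_one(s++, d++);
   /* main loop: 8 pixels (32 bytes) per iteration */
   while ((e - d) >= 8)
     { _blend_eight(s, d); s += 8; d += 8; }
   /* drain the tail */
   while (d < e)
     _blend_one(s++, d++);
}
#endif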

#define _op_blend_pan_dp_neon NULL

#define _op_blend_p_dpan_neon _op_blend_p_dp_neon
#define _op_blend_pas_dpan_neon _op_blend_pas_dp_neon
#define _op_blend_pan_dpan_neon _op_blend_pan_dp_neon

static void
init_blend_pixel_span_funcs_neon(void)
{
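   /* Table indices are [src pixel][src mask][src color][dst][cpu]:
    * SP = pixels with per-pixel alpha, SP_AS = alpha-sparse pixels,
    * SP_AN = fully opaque pixels; SM_N / SC_N = no mask, no color;
    * DP = dst with alpha, DP_AN = dst without alpha (interpretation
    * of the index names, for reference). */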
   op_blend_span_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_p_dp_neon;
   op_blend_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pas_dp_neon;
   op_blend_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pan_dp_neon;

   op_blend_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_p_dpan_neon;
   op_blend_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pas_dpan_neon;
   op_blend_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pan_dpan_neon;
}
#endif

#ifdef BUILD_NEON
static void
_op_blend_pt_p_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
   c = 256 - (s >> 24);
   *d = s + MUL_256(c, *d);
}
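
/* MUL_256(a, c) scales every 8-bit channel of the ARGB word c by a factor
 * a in [1,256], processing the two channel pairs in parallel 16-bit lanes.
 * For reference, its definition in evas_blend_ops.h looks like: */
#if 0
#define MUL_256(a, c) \
 ( (((((c) >> 8) & 0x00ff00ff) * (a)) & 0xff00ff00) + \
   (((((c) & 0x00ff00ff) * (a)) >> 8) & 0x00ff00ff) )
#endif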

#define _op_blend_pt_pan_dp_neon NULL
#define _op_blend_pt_pas_dp_neon _op_blend_pt_p_dp_neon

#define _op_blend_pt_p_dpan_neon _op_blend_pt_p_dp_neon
#define _op_blend_pt_pan_dpan_neon _op_blend_pt_pan_dp_neon
#define _op_blend_pt_pas_dpan_neon _op_blend_pt_pas_dp_neon

static void
init_blend_pixel_pt_funcs_neon(void)
{
   op_blend_pt_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_p_dp_neon;
   op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_pas_dp_neon;
   op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_pan_dp_neon;

   op_blend_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_p_dpan_neon;
   op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_pas_dpan_neon;
   op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_pan_dpan_neon;
}
#endif

/*-----*/

/* blend_rel pixel -> dst */

#ifdef BUILD_NEON
static void
_op_blend_rel_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
   DATA32 *e = d + l;
   while (d < e) {
      l = 256 - (*s >> 24);
      c = 1 + (*d >> 24);
      *d = MUL_256(c, *s) + MUL_256(l, *d);
      d++;
      s++;
   }
}
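
/* The "rel" variant additionally scales the source by the destination
 * alpha: dst = src * (1 + dst.a) / 256 + dst * (256 - src.a) / 256. A
 * worked example under that formula (values illustrative): */
#if 0
static void
_rel_blend_example(void)
{
   DATA32 s = 0xffff0000;   /* opaque red                 */
   DATA32 d = 0xff000000;   /* opaque black               */
   int c = 1 + (d >> 24);   /* 256: dst is fully opaque   */
   int l = 256 - (s >> 24); /* 1:   src is fully opaque   */
   d = MUL_256(c, s) + MUL_256(l, d);
   /* MUL_256(256, s) == s and MUL_256(1, d) == 0, so d == 0xffff0000:
    * an opaque source over an opaque destination simply replaces it. */
}
#endif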

static void
_op_blend_rel_pan_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) {
   DATA32 *e = d + l;
   while (d < e) {
      c = 1 + (*d >> 24);
      *d++ = MUL_256(c, *s);
      s++;
   }
}

#define _op_blend_rel_pas_dp_neon _op_blend_rel_p_dp_neon

#define _op_blend_rel_p_dpan_neon _op_blend_p_dpan_neon
#define _op_blend_rel_pan_dpan_neon _op_blend_pan_dpan_neon
#define _op_blend_rel_pas_dpan_neon _op_blend_pas_dpan_neon

static void
init_blend_rel_pixel_span_funcs_neon(void)
{
   op_blend_rel_span_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_p_dp_neon;
   op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pas_dp_neon;
   op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pan_dp_neon;

   op_blend_rel_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_p_dpan_neon;
   op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pas_dpan_neon;
   op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pan_dpan_neon;
}
#endif

#ifdef BUILD_NEON
static void
_op_blend_rel_pt_p_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) {
   c = 256 - (s >> 24);
   *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d);
}

#define _op_blend_rel_pt_pas_dp_neon _op_blend_rel_pt_p_dp_neon
#define _op_blend_rel_pt_pan_dp_neon _op_blend_rel_pt_p_dp_neon

#define _op_blend_rel_pt_p_dpan_neon _op_blend_pt_p_dpan_neon
#define _op_blend_rel_pt_pas_dpan_neon _op_blend_pt_pas_dpan_neon
#define _op_blend_rel_pt_pan_dpan_neon _op_blend_pt_pan_dpan_neon

static void
init_blend_rel_pixel_pt_funcs_neon(void)
{
   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_p_dp_neon;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_pas_dp_neon;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_pan_dp_neon;

   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_dpan_neon;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_dpan_neon;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_dpan_neon;
}
#endif