Diffstat (limited to 'libraries/evas/src/lib/include/evas_blend_ops.h')
-rw-r--r--  libraries/evas/src/lib/include/evas_blend_ops.h  378
1 file changed, 378 insertions, 0 deletions
diff --git a/libraries/evas/src/lib/include/evas_blend_ops.h b/libraries/evas/src/lib/include/evas_blend_ops.h
new file mode 100644
index 0000000..1ada384
--- /dev/null
+++ b/libraries/evas/src/lib/include/evas_blend_ops.h
@@ -0,0 +1,378 @@
#ifndef EVAS_BLEND_OPS_H
#define EVAS_BLEND_OPS_H

#if defined BUILD_MMX || defined BUILD_SSE
#include "evas_mmx.h"
#endif

#include "config.h"

#ifdef NEED_SSE3
# if defined BUILD_SSE3
# include <immintrin.h>
# endif
#endif

/* src pixel flags: */

/* pixels none */
#define SP_N 0
/* pixels (argb default) */
#define SP 1
/* pixels are rgb (ie. alphas == 255) */
#define SP_AN 2
/* pixel alphas are sparse */
#define SP_AS 3
/* src pixels flags count */
#define SP_LAST 4

/* src mask flags: */

/* mask none */
#define SM_N 0
/* mask (alpha) */
#define SM 1
/* mask alphas are 'trivial' - ie. only 0 or 255 */
#define SM_AT 2
/* mask alphas are sparse */
#define SM_AS 3
/* src mask flags count */
#define SM_LAST 4

/* src color flags: */

/* color is 0xffffffff */
#define SC_N 0
/* color (argb default) */
#define SC 1
/* color is rgb (ie. 0xffrrggbb) */
#define SC_AN 2
/* color is 'alpha' (ie. 0xaaaaaaaa) */
#define SC_AA 3
/* src color flags count */
#define SC_LAST 4

/* dst pixels flags: */

/* pixels (argb default) */
#define DP 0
/* pixels are rgb (ie. alphas == 255) */
#define DP_AN 1
/* dst pixels flags count */
#define DP_LAST 2

/* cpu types flags */

/* none, bad news */
#define CPU_N 0
/* cpu C */
#define CPU_C 1
/* cpu MMX */
#define CPU_MMX 2
/* cpu SSE */
#define CPU_SSE 3
/* cpu SSE2 */
#define CPU_SSE2 4
/* cpu NEON */
#define CPU_NEON 5
/* cpu SSE3 */
#define CPU_SSE3 6
/* cpu flags count */
#define CPU_LAST 7

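/* Taken together, these flags describe one blend case as a
 * (SP, SM, SC, DP, CPU) tuple. Illustrative sketch only - the real
 * tables live in the evas_op_* sources and the table name below is
 * hypothetical:
 *
 *   RGBA_Gfx_Func func = blend_funcs[SP][SM_N][SC][DP][CPU_MMX];
 *
 * i.e. the *_LAST values size per-axis lookup tables of specialized
 * blend loops.
 */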

/* some useful constants */

extern const DATA32 ALPHA_255;
extern const DATA32 ALPHA_256;

/* some useful C macros */

#define MUL4_256(a, r, g, b, c) \
 ( (((((c) >> 8) & 0xff0000) * (a)) & 0xff000000) + \
   (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL3_256(r, g, b, c) \
 ( (((((c) & 0xff0000) * (r)) >> 8) & 0xff0000) + \
   (((((c) & 0xff00) * (g)) >> 8) & 0xff00) + \
   ((((c) & 0xff) * (b)) >> 8) )

#define MUL_256(a, c) \
 ( (((((c) >> 8) & 0x00ff00ff) * (a)) & 0xff00ff00) + \
   (((((c) & 0x00ff00ff) * (a)) >> 8) & 0x00ff00ff) )

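/* The *_256 macros scale packed channels by factors in [0, 256], where
 * 256 leaves a channel unchanged (hence the ALPHA_256 constant above).
 * MUL_256 handles two channel pairs per multiply by splitting ARGB
 * into its A/G and R/B halves. Worked example (illustrative):
 *
 *   MUL_256(128, 0xff804020) == 0x7f402010  -- every channel halved
 */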
#define MUL4_SYM(x, y) \
 ( ((((((x) >> 16) & 0xff00) * (((y) >> 16) & 0xff00)) + 0xff0000) & 0xff000000) + \
   ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff00) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL3_SYM(x, y) \
 ( ((((((x) >> 8) & 0xff00) * (((y) >> 16) & 0xff)) + 0xff00) & 0xff0000) + \
   ((((((x) & 0xff00) * ((y) & 0xff00)) + 0xff00) >> 16) & 0xff00) + \
   (((((x) & 0xff) * ((y) & 0xff)) + 0xff) >> 8) )

#define MUL_SYM(a, x) \
 ( (((((x) >> 8) & 0x00ff00ff) * (a) + 0xff00ff) & 0xff00ff00) + \
   (((((x) & 0x00ff00ff) * (a) + 0xff00ff) >> 8) & 0x00ff00ff) )

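/* The *_SYM ("symmetric") variants take 8-bit factors in [0, 255] and
 * add a +255 rounding bias before the >> 8, i.e. per channel
 * (x * y + 255) >> 8, so that 255 * 255 maps back to 255. In
 * particular (illustrative): MUL4_SYM(0xffffffff, c) == c for any
 * pixel c.
 */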
#define MUL_A_256(a, c) \
 ( ((((c) >> 8) & 0x00ff0000) * (a)) & 0xff000000 )

#define MUL_A_SYM(a, c) \
 ( (((((c) >> 8) & 0x00ff0000) * (a)) + 0x00ff0000) & 0xff000000 )

#define INTERP_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff00ff) - (((c1) >> 8) & 0xff00ff)) * (a)) \
   + ((c1) & 0xff00ff00)) & 0xff00ff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_RGB_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff) - (((c1) >> 8) & 0xff)) * (a)) \
   + ((c1) & 0xff00)) & 0xff00) + \
   (((((((c0) & 0xff00ff) - ((c1) & 0xff00ff)) * (a)) >> 8) \
   + ((c1) & 0xff00ff)) & 0xff00ff) )

#define INTERP_A_256(a, c0, c1) \
 ( (((((((c0) >> 8) & 0xff0000) - (((c1) >> 8) & 0xff0000)) * (a)) \
   + ((c1) & 0xff000000)) & 0xff000000) )

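/* INTERP_256 linearly interpolates between two pixels per channel:
 * roughly c1 + (a * (c0 - c1)) / 256 with a in [0, 256], so a == 256
 * selects c0 and a == 0 selects c1. Illustrative over-blend of source
 * s onto destination d using the source alpha:
 *
 *   DATA32 out = INTERP_256(1 + (s >> 24), s, d);
 */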

/* some useful MMX macros */

#ifdef BUILD_MMX
#define MOV_A2R(a, mma) \
   movd_m2r(a, mma); \
   punpcklwd_r2r(mma, mma); \
   punpckldq_r2r(mma, mma);

#define MOV_P2R(c, mmc, mmz) \
   movd_m2r(c, mmc); \
   punpcklbw_r2r(mmz, mmc);

#define MOV_R2P(mmc, c, mmz) \
   packuswb_r2r(mmz, mmc); \
   movd_r2m(mmc, c);

#define MUL4_256_R2R(mmx, mmy) \
   pmullw_r2r(mmx, mmy); \
   psrlw_i2r(8, mmy);

#define MUL4_SYM_R2R(mmx, mmy, mm255) \
   pmullw_r2r(mmx, mmy); \
   paddw_r2r(mm255, mmy); \
   psrlw_i2r(8, mmy);

#define MOV_RA2R(mmx, mma) \
   movq_r2r(mmx, mma); \
   punpckhwd_r2r(mma, mma); \
   punpckhdq_r2r(mma, mma);

#define MOV_PA2R(c, mma) \
   movd_m2r(c, mma); \
   punpcklbw_r2r(mma, mma); \
   punpckhwd_r2r(mma, mma); \
   punpckhdq_r2r(mma, mma);

#define INTERP_256_R2R(mma, mmx, mmy, mm255) \
   psubw_r2r(mmy, mmx); \
   pmullw_r2r(mma, mmx); \
   psrlw_i2r(8, mmx); \
   paddw_r2r(mmx, mmy); \
   pand_r2r(mm255, mmy);

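/* Typical sequence (illustrative only): unpack a pixel's bytes to
 * words with MOV_P2R, broadcast an alpha to all four words with
 * MOV_A2R, multiply with MUL4_SYM_R2R, then pack back with MOV_R2P:
 *
 *   MOV_P2R(src, mm1, mm0);       // mm0 must hold zero
 *   MOV_A2R(alpha, mm2);
 *   MUL4_SYM_R2R(mm2, mm1, mm5);  // mm5 must hold 0x00ff in each word
 *   MOV_R2P(mm1, *dst, mm0);
 *
 * Note these macros expand to multiple statements and are not
 * do { } while (0) wrapped, so they need care inside bare if/else.
 */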
#endif


/* some useful SSE3 inline functions */

#ifdef NEED_SSE3
#ifdef BUILD_SSE3

static __m128i GA_MASK_SSE3;
static __m128i RB_MASK_SSE3;
static __m128i SYM4_MASK_SSE3;
static __m128i RGB_MASK_SSE3;
//static __m128i A_MASK_SSE3;

static __m128i ALPHA_SSE3;

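/* These are plain 'static' globals, so every translation unit that
 * includes this header gets its own zero-initialized copy; they must
 * be filled in at runtime before use. Judging from the code below (an
 * inference, not something this header states), the intended values
 * would be:
 *
 *   GA_MASK_SSE3   = _mm_set1_epi32(0x00ff00ff);
 *   RB_MASK_SSE3   = _mm_set1_epi32(0xff00ff00);
 *   SYM4_MASK_SSE3 = _mm_set1_epi16(0x00ff);
 *   RGB_MASK_SSE3  = _mm_set1_epi32(0x00ffffff);
 *   ALPHA_SSE3     = _mm_set1_epi32(256);
 */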
EFL_ALWAYS_INLINE __m128i
mul_256_sse3(__m128i a, __m128i c) {

   /* prepare alpha for word multiplication */
   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);
   __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

   /* first half of calc */
   __m128i c0 = c;
   c0 = _mm_srli_epi32(c0, 8);
   c0 = _mm_and_si128(GA_MASK_SSE3, c0);
   c0 = _mm_mullo_epi16(a0, c0);
   c0 = _mm_and_si128(RB_MASK_SSE3, c0);

   /* second half of calc */
   __m128i c1 = c;
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);
   c1 = _mm_mullo_epi16(a0, c1);
   c1 = _mm_srli_epi32(c1, 8);
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);

   /* combine */
   return _mm_add_epi32(c0, c1);
}

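/* Four-pixel SIMD counterpart of the MUL_256 macro above: 'a' carries
 * one [0, 256] factor per 32-bit lane, which gets broadcast to both
 * word positions and multiplied against the G/A and B/R halves of the
 * matching pixel in 'c', mirroring the scalar split. */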
EFL_ALWAYS_INLINE __m128i
sub4_alpha_sse3(__m128i c) {

   __m128i c0 = c;

   c0 = _mm_srli_epi32(c0, 24);
   return _mm_sub_epi32(ALPHA_SSE3, c0);
}

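/* Extracts the alpha byte of each pixel and returns ALPHA_SSE3 minus
 * it, per lane; with ALPHA_SSE3 == 256 (the assumption noted above)
 * this yields the "256 - alpha" factor that the *_256 multipliers
 * expect for destination attenuation in an over-blend. */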
EFL_ALWAYS_INLINE __m128i
interp4_256_sse3(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i zero = _mm_setzero_si128();

   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);

   __m128i a_t = _mm_slli_epi64(a_l, 32);
   __m128i a_t0 = _mm_slli_epi64(a_h, 32);

   a_l = _mm_add_epi32(a_l, a_t);
   a_h = _mm_add_epi32(a_h, a_t0);

   __m128i c0_l = c0;
   __m128i c0_h = c0;

   c0_l = _mm_unpacklo_epi8(c0_l, zero);
   c0_h = _mm_unpackhi_epi8(c0_h, zero);

   __m128i c1_l = c1;
   __m128i c1_h = c1;

   c1_l = _mm_unpacklo_epi8(c1_l, zero);
   c1_h = _mm_unpackhi_epi8(c1_h, zero);

   __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
   __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);

   cl_sub = _mm_mullo_epi16(cl_sub, a_l);
   ch_sub = _mm_mullo_epi16(ch_sub, a_h);

   __m128i c1ls = _mm_slli_epi16(c1_l, 8);
   __m128i c1hs = _mm_slli_epi16(c1_h, 8);

   cl_sub = _mm_add_epi16(cl_sub, c1ls);
   ch_sub = _mm_add_epi16(ch_sub, c1hs);

   cl_sub = _mm_and_si128(cl_sub, RB_MASK_SSE3);
   ch_sub = _mm_and_si128(ch_sub, RB_MASK_SSE3);

   cl_sub = _mm_srli_epi64(cl_sub, 8);
   ch_sub = _mm_srli_epi64(ch_sub, 8);

   cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
   ch_sub = _mm_packus_epi16(ch_sub, ch_sub);

   return (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
}

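/* Four-pixel version of INTERP_256: widens both pixel vectors to
 * words, forms (c0 - c1) * a + (c1 << 8) per channel, masks and
 * shifts back down, then repacks. 'a' again carries one [0, 256]
 * weight per 32-bit lane. */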
EFL_ALWAYS_INLINE __m128i
mul_sym_sse3(__m128i a, __m128i c) {

   /* Prepare alpha for word mult */
   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);
   __m128i a0 = (__m128i) _mm_shuffle_ps( (__m128)a_l, (__m128)a_h, 0x88);

   /* first part */
   __m128i c0 = c;
   c0 = _mm_srli_epi32(c0, 8);
   c0 = _mm_and_si128(GA_MASK_SSE3, c0);
   c0 = _mm_mullo_epi16(a0, c0);
   c0 = _mm_add_epi32(c0, GA_MASK_SSE3);
   c0 = _mm_and_si128(RB_MASK_SSE3, c0);

   /* second part */
   __m128i c1 = c;
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);
   c1 = _mm_mullo_epi16(a0, c1);
   c1 = _mm_add_epi32(c1, GA_MASK_SSE3);
   c1 = _mm_srli_epi32(c1, 8);
   c1 = _mm_and_si128(GA_MASK_SSE3, c1);

   return _mm_add_epi32(c0, c1);
}

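/* SIMD analogue of MUL_SYM: the same split as mul_256_sse3, but with
 * the +0xff per-channel rounding bias added before the shift, so a
 * factor of 255 preserves a 255 channel. */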
EFL_ALWAYS_INLINE __m128i
mul4_sym_sse3(__m128i x, __m128i y) {

   const __m128i zero = _mm_setzero_si128();

   __m128i x_l = _mm_unpacklo_epi8(x, zero);
   __m128i x_h = _mm_unpackhi_epi8(x, zero);

   __m128i y_l = _mm_unpacklo_epi8(y, zero);
   __m128i y_h = _mm_unpackhi_epi8(y, zero);

   __m128i r_l = _mm_mullo_epi16(x_l, y_l);
   __m128i r_h = _mm_mullo_epi16(x_h, y_h);

   r_l = _mm_add_epi16(r_l, SYM4_MASK_SSE3);
   r_h = _mm_add_epi16(r_h, SYM4_MASK_SSE3);

   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   return _mm_packus_epi16(r_l, r_h);
}

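/* SIMD analogue of MUL4_SYM: channel-wise (x * y + 255) >> 8 on four
 * pixel pairs at once; mul3_sym_sse3 below is the same product with
 * the alpha byte masked off, matching MUL3_SYM. */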
EFL_ALWAYS_INLINE __m128i
mul3_sym_sse3(__m128i x, __m128i y) {

   __m128i res = mul4_sym_sse3(x, y);
   return _mm_and_si128(res, RGB_MASK_SSE3);
}

#define LOOP_ALIGNED_U1_A48_SSE3(DEST, LENGTH, UOP, A4OP, A8OP) \
   { \
      while((uintptr_t)DEST & 0xF && LENGTH) UOP \
      \
      while(LENGTH) { \
         switch(LENGTH) { \
            case 3: UOP \
            case 2: UOP \
            case 1: UOP \
               break; \
            case 7: \
            case 6: \
            case 5: \
            case 4: \
               A4OP \
               break; \
            default: \
               A8OP \
               break; \
         } \
      } \
   }

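/* Dispatch helper for SSE spans: UOP handles one pixel (and is also
 * used to reach 16-byte alignment of DEST), A4OP handles four pixels,
 * A8OP eight or more. Each *OP must advance DEST and decrement LENGTH
 * itself, or the loop never terminates. The case 3/2/1 fall-through is
 * deliberate. Illustrative call shape (helper names hypothetical):
 *
 *   LOOP_ALIGNED_U1_A48_SSE3(d, l,
 *      { blend_one(d, s); d++; s++; l--; },
 *      { blend_four_sse3(d, s); d += 4; s += 4; l -= 4; },
 *      { blend_eight_sse3(d, s); d += 8; s += 8; l -= 8; })
 */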

#endif
#endif

#endif