aboutsummaryrefslogtreecommitdiffstatshomepage
path: root/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_sse3.c
diff options
context:
space:
mode:
author David Walter Seikel 2012-01-04 18:41:13 +1000
committer David Walter Seikel 2012-01-04 18:41:13 +1000
commit dd7595a3475407a7fa96a97393bae8c5220e8762 (patch)
tree e341e911d7eb911a51684a7412ef7f7c7605d28e /libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_sse3.c
parent Add the skeleton. (diff)
downloadSledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.zip
SledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.tar.gz
SledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.tar.bz2
SledjHamr-dd7595a3475407a7fa96a97393bae8c5220e8762.tar.xz
Add the base Enlightenment Foundation Libraries - eina, eet, evas, ecore, embryo, and edje.
Note that embryo won't be used, but I'm not sure yet if you can build edje without it.
Diffstat (limited to 'libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_sse3.c')
-rw-r--r--libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_sse3.c320
1 files changed, 320 insertions, 0 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_sse3.c b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_sse3.c
new file mode 100644
index 0000000..83230e5
--- /dev/null
+++ b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_mask_color_sse3.c
@@ -0,0 +1,320 @@
1/* blend mask x color -> dst */
2
3#ifdef BUILD_SSE3
4
5static void
6_op_blend_mas_c_dp_sse3(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
7
8 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
9
10 LOOP_ALIGNED_U1_A48_SSE3(d, l,
11 { /* UOP */
12
13 DATA32 a = *m;
14 DATA32 mc = MUL_SYM(a, c);
15 a = 256 - (mc >> 24);
16 *d = mc + MUL_256(a, *d);
17 m++; d++; l--;
18 },
19 { /* A4OP */
20
21 if ((m[3] | m[2] | m[1] | m[0]) == 0) {
22 m += 4; d += 4; l -= 4;
23 continue;
24 }
25
26 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
27 __m128i d0 = _mm_load_si128((__m128i *)d);
28
29 __m128i mc0 = mul_sym_sse3(m0, c_packed);
30 __m128i a0 = sub4_alpha_sse3(mc0);
31 __m128i mul0 = mul_256_sse3(a0, d0);
32
33 mul0 = _mm_add_epi32(mul0, mc0);
34
35 _mm_store_si128((__m128i *)d, mul0);
36
37 m += 4; d += 4; l -= 4;
38 },
39 { /* A8OP */
40
41 if((m[7] | m[6] | m[5] | m[4] | m[3] | m[2] | m[1] | m[0]) == 0) {
42 m += 8; d += 8; l -= 8;
43 continue;
44 }
45
46 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
47 __m128i d0 = _mm_load_si128((__m128i *)d);
48
49 __m128i m1 = _mm_set_epi32(m[7], m[6], m[5], m[4]);
50 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
51
52 __m128i mc0 = mul_sym_sse3(m0, c_packed);
53 __m128i a0 = sub4_alpha_sse3(mc0);
54 __m128i mul0 = mul_256_sse3(a0, d0);
55
56 mul0 = _mm_add_epi32(mc0, mul0);
57
58 __m128i mc1 = mul_sym_sse3(m1, c_packed);
59 __m128i a1 = sub4_alpha_sse3(mc1);
60 __m128i mul1 = mul_256_sse3(a1, d1);
61
62 mul1 = _mm_add_epi32(mc1, mul1);
63
64 _mm_store_si128((__m128i *)d, mul0);
65 _mm_store_si128((__m128i *)(d+4), mul1);
66
67 m += 8; d += 8; l -= 8;
68 })
69}
70
71static void
72_op_blend_mas_can_dp_sse3(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
73
74 DATA32 alpha;
75
76 const __m128i one = _mm_set_epi32(1, 1, 1, 1);
77 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
78
79 LOOP_ALIGNED_U1_A48_SSE3(d, l,
80 { /* UOP */
81
82 alpha = *m;
83 switch(alpha)
84 {
85 case 0:
86 break;
87 case 255:
88 *d = c;
89 break;
90 default:
91 alpha++;
92 *d = INTERP_256(alpha, c, *d);
93 break;
94 }
95 m++; d++; l--;
96 },
97 { /* A4OP */
98
99 if ((m[3] | m[2] | m[1] | m[0]) == 0) {
100 m += 4; d += 4; l -= 4;
101 continue;
102 }
103
104 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
105 __m128i d0 = _mm_load_si128((__m128i *)d);
106
107 __m128i zm0 = _mm_cmpeq_epi32(m0, _mm_setzero_si128());
108
109 m0 = _mm_add_epi32(one, m0);
110
111 __m128i r0 = interp4_256_sse3(m0, c_packed, d0);
112
113 r0 = _mm_and_si128(~zm0, r0);
114 d0 = _mm_and_si128(zm0, d0);
115
116 d0 = _mm_add_epi32(r0, d0);
117
118 _mm_store_si128((__m128i *)d, d0);
119
120 m += 4; d += 4; l -= 4;
121 },
122 { /* A8OP */
123
124 if ((m[7] | m[6] | m[5] | m[4] | m[3] | m[2] | m[1] | m[0]) == 0) {
125 m += 8; d += 8; l -= 8;
126 continue;
127 }
128
129 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
130 __m128i d0 = _mm_load_si128((__m128i *)d);
131
132 __m128i m1 = _mm_set_epi32(m[7], m[6], m[5], m[4]);
133 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
134
135 __m128i zm0 = _mm_cmpeq_epi32(m0, _mm_setzero_si128());
136 __m128i zm1 = _mm_cmpeq_epi32(m1, _mm_setzero_si128());
137
138 m0 = _mm_add_epi32(one, m0);
139 m1 = _mm_add_epi32(one, m1);
140
141 __m128i r0 = interp4_256_sse3(m0, c_packed, d0);
142 __m128i r1 = interp4_256_sse3(m1, c_packed, d1);
143
144 r0 = _mm_and_si128(~zm0, r0);
145 d0 = _mm_and_si128(zm0, d0);
146
147 r1 = _mm_and_si128(~zm1, r1);
148 d1 = _mm_and_si128(zm1, d1);
149
150 d0 = _mm_add_epi32(d0, r0);
151 d1 = _mm_add_epi32(d1, r1);
152
153 _mm_store_si128((__m128i *)d, d0);
154 _mm_store_si128((__m128i *)(d+4), d1);
155
156 m += 8; d += 8; l -= 8;
157 })
158}
159
/* Span-blend aliases: names that share an implementation with another variant. */

/* dp destination: `cn` resolves to the `can` function, `caa` to the `c` function */
#define _op_blend_mas_cn_dp_sse3      _op_blend_mas_can_dp_sse3
#define _op_blend_mas_caa_dp_sse3     _op_blend_mas_c_dp_sse3

/* dpan destination: every variant resolves to its dp counterpart */
#define _op_blend_mas_c_dpan_sse3     _op_blend_mas_c_dp_sse3
#define _op_blend_mas_cn_dpan_sse3    _op_blend_mas_cn_dp_sse3
#define _op_blend_mas_can_dpan_sse3   _op_blend_mas_can_dp_sse3
#define _op_blend_mas_caa_dpan_sse3   _op_blend_mas_caa_dp_sse3
167
168static void
169init_blend_mask_color_span_funcs_sse3(void)
170{
171 op_blend_span_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_mas_c_dp_sse3;
172 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_mas_cn_dp_sse3;
173 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_mas_can_dp_sse3;
174 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_mas_caa_dp_sse3;
175
176// FIXME: BUGGY BUGGY Core i5 2500 (64bit), gcc version 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4), ello (text)
177// op_blend_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_mas_c_dpan_sse3;
178 op_blend_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_mas_cn_dpan_sse3;
179 op_blend_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_mas_can_dpan_sse3;
180 op_blend_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_mas_caa_dpan_sse3;
181}
182
/* Point-blend entries: no SSE3 implementation is provided, so the two base
 * names are NULL and every other name resolves to one of them. */
#define _op_blend_pt_mas_c_dp_sse3      NULL
#define _op_blend_pt_mas_can_dp_sse3    NULL

/* dp destination: `cn` resolves to `can`, `caa` resolves to `c` */
#define _op_blend_pt_mas_cn_dp_sse3     _op_blend_pt_mas_can_dp_sse3
#define _op_blend_pt_mas_caa_dp_sse3    _op_blend_pt_mas_c_dp_sse3

/* dpan destination: every variant resolves to its dp counterpart */
#define _op_blend_pt_mas_c_dpan_sse3    _op_blend_pt_mas_c_dp_sse3
#define _op_blend_pt_mas_cn_dpan_sse3   _op_blend_pt_mas_cn_dp_sse3
#define _op_blend_pt_mas_can_dpan_sse3  _op_blend_pt_mas_can_dp_sse3
#define _op_blend_pt_mas_caa_dpan_sse3  _op_blend_pt_mas_caa_dp_sse3
193
194static void
195init_blend_mask_color_pt_funcs_sse3(void)
196{
197 op_blend_pt_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_pt_mas_c_dp_sse3;
198 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_pt_mas_cn_dp_sse3;
199 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_pt_mas_can_dp_sse3;
200 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_pt_mas_caa_dp_sse3;
201
202 op_blend_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_pt_mas_c_dpan_sse3;
203 op_blend_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_mas_cn_dpan_sse3;
204 op_blend_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_pt_mas_can_dpan_sse3;
205 op_blend_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_pt_mas_caa_dpan_sse3;
206}
207
208/*-----*/
209
210/* blend_rel mask x color --> dst */
211
212static void
213_op_blend_rel_mas_c_dp_sse3(DATA32 *s __UNUSED__, DATA8 *m, DATA32 c, DATA32 *d, int l) {
214
215 const __m128i c_packed = _mm_set_epi32(c, c, c, c);
216
217 LOOP_ALIGNED_U1_A48_SSE3(d, l,
218 { /* UOP */
219
220 DATA32 mc = MUL_SYM(*m, c);
221 int alpha = 256 - (mc >> 24);
222 *d = MUL_SYM(*d >> 24, mc) + MUL_256(alpha, *d);
223 d++; m++; l--;
224 },
225 { /* A4OP */
226
227 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
228 __m128i d0 = _mm_load_si128((__m128i *) d);
229
230 __m128i mc0 = mul_sym_sse3(m0, c_packed);
231 __m128i a0 = sub4_alpha_sse3(mc0);
232
233 __m128i d0_sym = mul_sym_sse3(_mm_srli_epi32(d0, 24), mc0);
234 d0 = mul_256_sse3(a0, d0);
235
236 d0 = _mm_add_epi32(d0, d0_sym);
237
238 _mm_store_si128((__m128i *)d, d0);
239
240 d += 4; m += 4; l -= 4;
241 },
242 { /* A8OP */
243
244 __m128i m0 = _mm_set_epi32(m[3], m[2], m[1], m[0]);
245 __m128i d0 = _mm_load_si128((__m128i *)d);
246
247 __m128i m1 = _mm_set_epi32(m[7], m[6], m[5], m[4]);
248 __m128i d1 = _mm_load_si128((__m128i *)(d+4));
249
250 __m128i mc0 = mul_sym_sse3(m0, c_packed);
251 __m128i mc1 = mul_sym_sse3(m1, c_packed);
252
253 __m128i a0 = sub4_alpha_sse3(mc0);
254 __m128i a1 = sub4_alpha_sse3(mc1);
255
256 __m128i d0_sym = mul_sym_sse3(_mm_srli_epi32(d0, 24), mc0);
257 __m128i d1_sym = mul_sym_sse3(_mm_srli_epi32(d1, 24), mc1);
258
259 d0 = mul_256_sse3(a0, d0);
260 d1 = mul_256_sse3(a1, d1);
261
262 d0 = _mm_add_epi32(d0, d0_sym);
263 d1 = _mm_add_epi32(d1, d1_sym);
264
265 _mm_store_si128((__m128i *)d, d0);
266 _mm_store_si128((__m128i *)(d+4), d1);
267
268 d += 8; m += 8; l -= 8;
269 })
270}
271
/* blend_rel span aliases. */

/* dp destination: every color variant resolves to the single rel `c` function */
#define _op_blend_rel_mas_cn_dp_sse3      _op_blend_rel_mas_c_dp_sse3
#define _op_blend_rel_mas_can_dp_sse3     _op_blend_rel_mas_c_dp_sse3
#define _op_blend_rel_mas_caa_dp_sse3     _op_blend_rel_mas_c_dp_sse3

/* dpan destination: resolves to the plain (non-rel) blend of the same variant */
#define _op_blend_rel_mas_c_dpan_sse3     _op_blend_mas_c_dpan_sse3
#define _op_blend_rel_mas_cn_dpan_sse3    _op_blend_mas_cn_dpan_sse3
#define _op_blend_rel_mas_can_dpan_sse3   _op_blend_mas_can_dpan_sse3
#define _op_blend_rel_mas_caa_dpan_sse3   _op_blend_mas_caa_dpan_sse3
280
281static void
282init_blend_rel_mask_color_span_funcs_sse3(void)
283{
284 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_rel_mas_c_dp_sse3;
285 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_rel_mas_can_dp_sse3;
286 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_rel_mas_can_dp_sse3;
287 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_rel_mas_caa_dp_sse3;
288
289 op_blend_rel_span_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_rel_mas_c_dpan_sse3;
290 op_blend_rel_span_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_mas_cn_dpan_sse3;
291 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_rel_mas_can_dpan_sse3;
292 op_blend_rel_span_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_mas_caa_dpan_sse3;
293}
294
/* blend_rel point entries: no SSE3 implementation is provided for the base name. */
#define _op_blend_rel_pt_mas_c_dp_sse3      NULL

/* dp destination: every color variant resolves to the rel `c` entry */
#define _op_blend_rel_pt_mas_cn_dp_sse3     _op_blend_rel_pt_mas_c_dp_sse3
#define _op_blend_rel_pt_mas_can_dp_sse3    _op_blend_rel_pt_mas_c_dp_sse3
#define _op_blend_rel_pt_mas_caa_dp_sse3    _op_blend_rel_pt_mas_c_dp_sse3

/* dpan destination: resolves to the plain (non-rel) point blend of the same variant */
#define _op_blend_rel_pt_mas_c_dpan_sse3    _op_blend_pt_mas_c_dpan_sse3
#define _op_blend_rel_pt_mas_cn_dpan_sse3   _op_blend_pt_mas_cn_dpan_sse3
#define _op_blend_rel_pt_mas_can_dpan_sse3  _op_blend_pt_mas_can_dpan_sse3
#define _op_blend_rel_pt_mas_caa_dpan_sse3  _op_blend_pt_mas_caa_dpan_sse3
305
306static void
307init_blend_rel_mask_color_pt_funcs_sse3(void)
308{
309 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP][CPU_SSE3] = _op_blend_rel_pt_mas_c_dp_sse3;
310 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_mas_cn_dp_sse3;
311 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP][CPU_SSE3] = _op_blend_rel_pt_mas_can_dp_sse3;
312 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP][CPU_SSE3] = _op_blend_rel_pt_mas_caa_dp_sse3;
313
314 op_blend_rel_pt_funcs[SP_N][SM_AS][SC][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_c_dpan_sse3;
315 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_cn_dpan_sse3;
316 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AN][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_can_dpan_sse3;
317 op_blend_rel_pt_funcs[SP_N][SM_AS][SC_AA][DP_AN][CPU_SSE3] = _op_blend_rel_pt_mas_caa_dpan_sse3;
318}
319
320#endif