Diffstat (limited to 'libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c')
-rw-r--r--  libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c  316
1 file changed, 316 insertions, 0 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c
new file mode 100644
index 0000000..2e72fec
--- /dev/null
+++ b/libraries/evas/src/lib/engines/common/evas_op_blend/op_blend_pixel_sse3.c
@@ -0,0 +1,316 @@
/* blend pixel --> dst */

#ifdef BUILD_SSE3

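/*
 * NOTE (added for clarity): pixels are premultiplied ARGB32 (DATA32) and the
 * plain blend is "source over destination", i.e. per channel
 *
 *    D = S + MUL_256(256 - A_S, D)
 *
 * where MUL_256(a, c) scales each 8-bit channel of c by a/256. Judging by the
 * aligned loads/stores on d in the vector arms, LOOP_ALIGNED_U1_A48_SSE3()
 * (defined with the other SSE3 blend helpers) runs the UOP arm one pixel at a
 * time until d is 16-byte aligned (and for short tails), the A4OP arm on four
 * aligned pixels, and the A8OP arm on eight pixels per iteration.
 */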
static void
_op_blend_p_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) {

   LOOP_ALIGNED_U1_A48_SSE3(d, l,
      { /* UOP */

         int alpha = 256 - (*s >> 24);
         *d = *s + MUL_256(alpha, *d);
         s++; d++; l--;
      },
      { /* A4OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

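         /* NOTE (added for clarity): s may be unaligned, hence
          * _mm_lddqu_si128(); d is 16-byte aligned in the vector arms, hence
          * _mm_load_si128(). sub4_alpha_sse3() presumably yields 256 - A_S in
          * each 32-bit lane and mul_256_sse3() is the vector form of
          * MUL_256(), so the lines below are the scalar UOP blend applied to
          * four pixels at once. */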
         __m128i a0 = sub4_alpha_sse3(s0);
         __m128i mul0 = mul_256_sse3(a0, d0);
         d0 = _mm_add_epi32(mul0, s0);

         _mm_store_si128((__m128i *)d, d0);

         s += 4; d += 4; l -= 4;
      },
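      /* A8OP is the same blend as A4OP, unrolled over two 16-byte groups
       * (eight pixels) per iteration. */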
      { /* A8OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4));
         __m128i d1 = _mm_load_si128((__m128i *)(d+4));

         __m128i a0 = sub4_alpha_sse3(s0);
         __m128i a1 = sub4_alpha_sse3(s1);

         __m128i mul0 = mul_256_sse3(a0, d0);
         __m128i mul1 = mul_256_sse3(a1, d1);

         d0 = _mm_add_epi32(mul0, s0);
         d1 = _mm_add_epi32(mul1, s1);

         _mm_store_si128((__m128i *)d, d0);
         _mm_store_si128((__m128i *)(d+4), d1);

         s += 8; d += 8; l -= 8;
      })
}

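/*
 * NOTE (added for clarity): the _pas_ ("pixel, alpha sparse") variant expects
 * many fully transparent or fully opaque pixels, so the scalar path
 * special-cases alpha 0 (leave dst untouched) and alpha 0xff (copy src). The
 * vector arms below handle the alpha == 0 case by masking: lanes whose source
 * alpha is 0 keep the destination value, every other lane takes the blended
 * result.
 */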
static void
_op_blend_pas_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c __UNUSED__, DATA32 *d, int l) {

   int alpha;

   const __m128i zero = _mm_setzero_si128();

   LOOP_ALIGNED_U1_A48_SSE3(d, l,
      { /* UOP */
         switch (*s & 0xff000000)
           {
           case 0:
              break;
           case 0xff000000:
              *d = *s;
              break;
           default:
              alpha = 256 - (*s >> 24);
              *d = *s + MUL_256(alpha, *d);
              break;
           }
         s++; d++; l--;
      },
      { /* A4OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i a0 = sub4_alpha_sse3(s0);
         __m128i mul0 = mul_256_sse3(a0, d0);

         mul0 = _mm_add_epi32(s0, mul0);

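         /* zmask0 is all-ones in lanes whose source alpha is 0; imask0 is its
          * complement. The blended value is kept only where alpha != 0 and d0
          * is kept where alpha == 0, matching the scalar switch above.
          * (Operator ~ on an __m128i relies on GCC/Clang vector extensions.) */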
         __m128i zmask0 = _mm_cmpeq_epi32(_mm_srli_epi32(s0, 24), zero);
         __m128i imask0 = ~zmask0;

         mul0 = _mm_and_si128(imask0, mul0);
         d0 = _mm_and_si128(zmask0, d0);

         d0 = _mm_add_epi32(mul0, d0);

         _mm_store_si128((__m128i *)d, d0);

         s += 4; d += 4; l -= 4;
      },
      { /* A8OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4));
         __m128i d1 = _mm_load_si128((__m128i *)(d+4));

         __m128i a0 = sub4_alpha_sse3(s0);
         __m128i a1 = sub4_alpha_sse3(s1);

         __m128i mul0 = mul_256_sse3(a0, d0);
         __m128i mul1 = mul_256_sse3(a1, d1);

         mul0 = _mm_add_epi32(s0, mul0);
         mul1 = _mm_add_epi32(s1, mul1);

         __m128i zmask0 = _mm_cmpeq_epi32(_mm_srli_epi32(s0, 24), zero);
         __m128i zmask1 = _mm_cmpeq_epi32(_mm_srli_epi32(s1, 24), zero);

         __m128i imask0 = ~zmask0;
         __m128i imask1 = ~zmask1;

         mul0 = _mm_and_si128(imask0, mul0);
         d0 = _mm_and_si128(zmask0, d0);

         mul1 = _mm_and_si128(imask1, mul1);
         d1 = _mm_and_si128(zmask1, d1);

         d0 = _mm_add_epi32(mul0, d0);
         d1 = _mm_add_epi32(mul1, d1);

         _mm_store_si128((__m128i *)d, d0);
         _mm_store_si128((__m128i *)(d+4), d1);

         s += 8; d += 8; l -= 8;
      })
}

#define _op_blend_pan_dp_sse3 NULL

#define _op_blend_p_dpan_sse3 _op_blend_p_dp_sse3
#define _op_blend_pas_dpan_sse3 _op_blend_pas_dp_sse3
#define _op_blend_pan_dpan_sse3 _op_blend_pan_dp_sse3

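/*
 * NOTE (added for clarity): the table indices appear to follow the convention
 * of the other blend ops: SP/SP_AS/SP_AN select the source pixel class
 * (generic / alpha sparse / no alpha), SM_N means "no mask", SC_N "no color",
 * DP/DP_AN the destination class, and CPU_SSE3 the CPU slot these functions
 * register themselves in.
 */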
static void
init_blend_pixel_span_funcs_sse3(void)
{
   op_blend_span_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_p_dp_sse3;
   op_blend_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pas_dp_sse3;
   op_blend_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pan_dp_sse3;


// FIXME: BUGGY BUGGY on Core i5 750 (32-bit), gcc 4.5.2 (Ubuntu/Linaro 4.5.2-8ubuntu4), ello (text and rectangle)
//   op_blend_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_p_dpan_sse3;
   op_blend_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pas_dpan_sse3;
   op_blend_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pan_dpan_sse3;
}

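/*
 * NOTE (added for clarity): no SSE3 versions of the _pt_ (single "point")
 * ops are provided, since a one-pixel op gains little from 16-byte vectors;
 * the slots stay NULL and the engine presumably keeps the C/MMX
 * implementations selected elsewhere for them.
 */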
#define _op_blend_pt_p_dp_sse3 NULL

#define _op_blend_pt_pas_dp_sse3 _op_blend_pt_p_dp_sse3
#define _op_blend_pt_pan_dp_sse3 NULL

#define _op_blend_pt_p_dpan_sse3 _op_blend_pt_p_dp_sse3
#define _op_blend_pt_pan_dpan_sse3 _op_blend_pt_pan_dp_sse3
#define _op_blend_pt_pas_dpan_sse3 _op_blend_pt_pas_dp_sse3

static void
init_blend_pixel_pt_funcs_sse3(void)
{
   op_blend_pt_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pt_p_dp_sse3;
   op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pt_pas_dp_sse3;
   op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_pt_pan_dp_sse3;

   op_blend_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_p_dpan_sse3;
   op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_pas_dpan_sse3;
   op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_pt_pan_dpan_sse3;
}

/*-----*/

/* blend_rel pixel -> dst */

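/*
 * NOTE (added for clarity): the "rel" (relative) blend additionally scales
 * the source by the destination alpha, i.e. per channel
 *
 *    D = MUL_256(1 + A_D, S) + MUL_256(256 - A_S, D)
 *
 * which is exactly what the scalar UOP below computes.
 */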
static void
_op_blend_rel_p_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {

   const __m128i ones = _mm_set_epi32(1, 1, 1, 1);

   LOOP_ALIGNED_U1_A48_SSE3(d, l,
      { /* UOP */

         int alpha = 256 - (*s >> 24);
         c = 1 + (*d >> 24);
         *d = MUL_256(c, *s) + MUL_256(alpha, *d);
         d++; s++; l--;
      },
      { /* A4OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones);
         __m128i a0 = sub4_alpha_sse3(s0);

         d0 = _mm_add_epi32(mul_256_sse3(c0, s0), mul_256_sse3(a0, d0));

         _mm_store_si128((__m128i *)d, d0);

         d += 4; s += 4; l -= 4;
      },
      { /* A8OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4));
         __m128i d1 = _mm_load_si128((__m128i *)(d+4));

         __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones);
         __m128i c1 = _mm_add_epi32(_mm_srli_epi32(d1, 24), ones);

         __m128i a0 = sub4_alpha_sse3(s0);
         __m128i a1 = sub4_alpha_sse3(s1);

         d0 = _mm_add_epi32(mul_256_sse3(c0, s0), mul_256_sse3(a0, d0));
         d1 = _mm_add_epi32(mul_256_sse3(c1, s1), mul_256_sse3(a1, d1));

         _mm_store_si128((__m128i *)d, d0);
         _mm_store_si128((__m128i *)(d+4), d1);

         d += 8; s += 8; l -= 8;
      })
}

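/*
 * NOTE (added for clarity): with a fully opaque source (_pan_), 256 - A_S is
 * 1 and MUL_256(1, D) contributes nothing after the >> 8, so the op reduces
 * to D = MUL_256(1 + A_D, S): the source is simply scaled by the destination
 * alpha.
 */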
static void
_op_blend_rel_pan_dp_sse3(DATA32 *s, DATA8 *m __UNUSED__, DATA32 c, DATA32 *d, int l) {

   const __m128i ones = _mm_set_epi32(1, 1, 1, 1);

   LOOP_ALIGNED_U1_A48_SSE3(d, l,
      { /* UOP */

         c = 1 + (*d >> 24);
         *d++ = MUL_256(c, *s);
         s++; l--;
      },
      { /* A4OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones);
         d0 = mul_256_sse3(c0, s0);

         _mm_store_si128((__m128i *)d, d0);

         d += 4; s += 4; l -= 4;
      },
      { /* A8OP */

         __m128i s0 = _mm_lddqu_si128((__m128i *)s);
         __m128i d0 = _mm_load_si128((__m128i *)d);

         __m128i s1 = _mm_lddqu_si128((__m128i *)(s+4));
         __m128i d1 = _mm_load_si128((__m128i *)(d+4));

         __m128i c0 = _mm_add_epi32(_mm_srli_epi32(d0, 24), ones);
         __m128i c1 = _mm_add_epi32(_mm_srli_epi32(d1, 24), ones);

         d0 = mul_256_sse3(c0, s0);
         d1 = mul_256_sse3(c1, s1);

         _mm_store_si128((__m128i *)d, d0);
         _mm_store_si128((__m128i *)(d+4), d1);

         d += 8; s += 8; l -= 8;
      })
}

#define _op_blend_rel_pas_dp_sse3 _op_blend_rel_p_dp_sse3

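/*
 * NOTE (added for clarity): when the destination carries no alpha (dpan, the
 * alpha byte kept at 0xff), 1 + A_D is 256 and MUL_256(256, S) == S, so the
 * "rel" ops degenerate into the plain blends, hence the aliases below.
 */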
#define _op_blend_rel_p_dpan_sse3 _op_blend_p_dpan_sse3
#define _op_blend_rel_pan_dpan_sse3 _op_blend_pan_dpan_sse3
#define _op_blend_rel_pas_dpan_sse3 _op_blend_pas_dpan_sse3

static void
init_blend_rel_pixel_span_funcs_sse3(void)
{
   op_blend_rel_span_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_p_dp_sse3;
   op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pas_dp_sse3;
   op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pan_dp_sse3;

   op_blend_rel_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_p_dpan_sse3;
   op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pas_dpan_sse3;
   op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pan_dpan_sse3;
}

#define _op_blend_rel_pt_p_dp_sse3 NULL
#define _op_blend_rel_pt_pan_dp_sse3 NULL

#define _op_blend_rel_pt_pas_dp_sse3 _op_blend_rel_pt_p_dp_sse3

#define _op_blend_rel_pt_p_dpan_sse3 _op_blend_pt_p_dpan_sse3
#define _op_blend_rel_pt_pan_dpan_sse3 _op_blend_pt_pan_dpan_sse3
#define _op_blend_rel_pt_pas_dpan_sse3 _op_blend_pt_pas_dpan_sse3

static void
init_blend_rel_pixel_pt_funcs_sse3(void)
{
   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_p_dp_sse3;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_pas_dp_sse3;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_SSE3] = _op_blend_rel_pt_pan_dp_sse3;

   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_p_dpan_sse3;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_pas_dpan_sse3;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_SSE3] = _op_blend_rel_pt_pan_dpan_sse3;
}

#endif