diff options
Diffstat (limited to '')
-rw-r--r-- | libraries/evas/src/lib/engines/common/evas_convert_yuv.c | 1258 |
1 files changed, 0 insertions, 1258 deletions
diff --git a/libraries/evas/src/lib/engines/common/evas_convert_yuv.c b/libraries/evas/src/lib/engines/common/evas_convert_yuv.c deleted file mode 100644 index 000cb01..0000000 --- a/libraries/evas/src/lib/engines/common/evas_convert_yuv.c +++ /dev/null | |||
@@ -1,1258 +0,0 @@ | |||
#include "evas_common.h"
#include "evas_convert_yuv.h"

#if defined BUILD_MMX || defined BUILD_SSE
# include "evas_mmx.h"
#endif

#if defined HAVE_ALTIVEC_H
# include <altivec.h>
/* AVV() builds an AltiVec vector literal: Darwin toolchains want
 * parenthesised literals, everything else wants brace initialisers. */
#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif

#endif

#ifdef BUILD_CONVERT_YUV

/* forward declarations for the per-CPU converter backends */
static void _evas_yuv_init (void);
static void _evas_yv12torgb_sse (unsigned char **yuv, unsigned char *rgb, int w, int h);
static void _evas_yv12torgb_mmx (unsigned char **yuv, unsigned char *rgb, int w, int h);
#ifdef BUILD_ALTIVEC
static void _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h);
static void _evas_yv12torgb_diz (unsigned char **yuv, unsigned char *rgb, int w, int h);
#endif
static void _evas_yv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
static void _evas_yuy2torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
static void _evas_nv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
static void _evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h);

/* YUV->RGB coefficients in 16.16 fixed point (BITRES bits of fraction):
 * CRV ~ 1.596, CBU ~ 2.018, CGU ~ 0.391, CGV ~ 0.813, YMUL ~ 1.164
 * (the comments further down on _crv.._cgv give the same values).
 * OFF is the rounding offset (0.5 in 16.16). */
#define CRV 104595
#define CBU 132251
#define CGU 25624
#define CGV 53280
#define YMUL 76283
#define OFF 32768
#define BITRES 16

/* calculation float resolution in bits */
/* ie RES = 6 is 10.6 fixed point */
/* RES = 8 is 8.8 fixed point */
/* RES = 4 is 12.4 fixed point */
/* NB: going above 6 will lead to overflow... :( */
#define RES 6

/* RZ() rescales a 16.16 constant down to the working RES precision */
#define RZ(i) (i >> (BITRES - RES))
#define FOUR(i) {i, i, i, i}

#if defined BUILD_MMX || defined BUILD_SSE
/* 4-lane 16-bit constants for the MMX/SSE code, 8-byte aligned so they
 * can be loaded with movq */
__attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
__attribute__ ((aligned (8))) const volatile unsigned short _const_ymul [4] = FOUR(RZ(YMUL));
__attribute__ ((aligned (8))) const volatile unsigned short _const_128  [4] = FOUR(128);
__attribute__ ((aligned (8))) const volatile unsigned short _const_32   [4] = FOUR(RZ(OFF));
__attribute__ ((aligned (8))) const volatile unsigned short _const_16   [4] = FOUR(16);
__attribute__ ((aligned (8))) const volatile unsigned short _const_ff   [4] = FOUR(-1);

#define CONST_CRVCRV *_const_crvcrv
#define CONST_CBUCBU *_const_cbucbu
#define CONST_CGUCGU *_const_cgucgu
#define CONST_CGVCGV *_const_cgvcgv
#define CONST_YMUL   *_const_ymul
#define CONST_128    *_const_128
#define CONST_32     *_const_32
#define CONST_16     *_const_16
#define CONST_FF     *_const_ff

/* for C non aligned cleanup */
const int _crv = RZ(CRV);   /* 1.596 */
const int _cbu = RZ(CBU);   /* 2.018 */
const int _cgu = RZ(CGU);   /* 0.391 */
const int _cgv = RZ(CGV);   /* 0.813 */

#endif

#ifdef BUILD_ALTIVEC
#ifdef __VEC__
/* AltiVec constants: the same coefficients as above, plus byte-permute
 * masks (pickrg*, pickrgb*) used to interleave the r/g/b result
 * vectors into packed 32-bit pixels. */
const vector unsigned short res     = AVV(RES);
const vector signed short   crv     = AVV(RZ(CRV));
const vector signed short   cbu     = AVV(RZ(CBU));
const vector signed short   cgu     = AVV(RZ(CGU));
const vector signed short   cgv     = AVV(RZ(CGV));
const vector signed short   ymul    = AVV(RZ(YMUL));
const vector signed short   c128    = AVV(128);
const vector signed short   c32     = AVV(RZ(OFF));
const vector signed short   c16     = AVV(16);
const vector unsigned char  zero    = AVV(0);
const vector signed short   maxchar = AVV(255);
const vector unsigned char  pickrg1 = AVV(0, 0x1, 0x11, 0,
					  0, 0x3, 0x13, 0,
					  0, 0x5, 0x15, 0,
					  0, 0x7, 0x17, 0);
const vector unsigned char  pickrg2 = AVV(0, 0x9, 0x19, 0,
					  0, 0xb, 0x1b, 0,
					  0, 0xd, 0x1d, 0,
					  0, 0xf, 0x1f, 0);
const vector unsigned char  pickrgb1 = AVV(0x3, 0x1, 0x2, 0x11,
					   0x7, 0x5, 0x6, 0x13,
					   0xb, 0x9, 0xa, 0x15,
					   0xf, 0xd, 0xe, 0x17);
const vector unsigned char  pickrgb2 = AVV(0x3, 0x1, 0x2, 0x19,
					   0x7, 0x5, 0x6, 0x1b,
					   0xb, 0x9, 0xa, 0x1d,
					   0xf, 0xd, 0xe, 0x1f);
#endif
#endif

#ifdef BUILD_C

/* shortcut speedup lookup-tables: _vNNNN[s] is the pre-scaled product
 * coefficient * (s - bias) for every 8-bit sample value s (bias 16 for
 * luma, 128 for chroma), filled in by _evas_yuv_init() */
static short _v1164[256];
static short _v1596[256];
static short _v813[256];
static short _v391[256];
static short _v2018[256];

/* clamp table over [-384, 640): LUT_CLIP(i) == clamp(i, 0, 255) */
static unsigned char _clip_lut[1024];
#define LUT_CLIP(i) ((_clip_lut+384)[(i)])

#define CMP_CLIP(i) ((i&256)? (~(i>>10)) : i);

/* set once _evas_yuv_init() has filled in the tables above */
static int initted = 0;

#endif
129 | void | ||
130 | evas_common_convert_yuv_420p_601_rgba(DATA8 **src, DATA8 *dst, int w, int h) | ||
131 | { | ||
132 | int mmx, sse, sse2; | ||
133 | |||
134 | #if defined BUILD_MMX || defined BUILD_SSE | ||
135 | evas_common_cpu_can_do(&mmx, &sse, &sse2); | ||
136 | #endif | ||
137 | #ifndef BUILD_SSE | ||
138 | sse = 0; | ||
139 | sse2 = 0; | ||
140 | #endif | ||
141 | #ifndef BUILD_MMX | ||
142 | mmx = 0; | ||
143 | #endif | ||
144 | if (evas_common_cpu_has_feature(CPU_FEATURE_MMX2)) | ||
145 | _evas_yv12torgb_sse(src, dst, w, h); | ||
146 | else if (evas_common_cpu_has_feature(CPU_FEATURE_MMX)) | ||
147 | _evas_yv12torgb_mmx(src, dst, w, h); | ||
148 | #ifdef BUILD_ALTIVEC | ||
149 | if (evas_common_cpu_has_feature(CPU_FEATURE_ALTIVEC)) | ||
150 | _evas_yv12torgb_altivec(src, dst, w, h); | ||
151 | #endif | ||
152 | else | ||
153 | { | ||
154 | #ifdef BUILD_C | ||
155 | if (!initted) _evas_yuv_init(); | ||
156 | initted = 1; | ||
157 | /* FIXME: diz may be faster sometimes */ | ||
158 | _evas_yv12torgb_raster(src, dst, w, h); | ||
159 | #endif | ||
160 | } | ||
161 | } | ||
162 | |||
/* Thanks to Diz for this code. I've munged it a little and turned it into */
/* inline macros. I tried beating it with a different algorithm using MMX */
/* but failed. So here we are. This is the fastest YUV->RGB I know of for */
/* x86. It has an issue that it doesn't convert colours accurately so the */
/* image looks a little "yellowy". This is a result of only 10.6 fixed point */
/* resolution as opposed to 16.16 in the C code. This could be fixed by */
/* processing half the number of pixels per cycle and going up to 32bits */
/* per element during compute, but it would all but negate the speedup */
/* from mmx I think :( It might be possible to use SSE and SSE2 here, but */
/* I haven't tried yet. Let's see. */

/* NB: XviD has almost the same code in its assembly YV12->RGB code. same */
/* algorithm, same constants, same all over actually, except it actually */
/* does a few extra memory accesses that this one doesn't, so in theory */
/* this code should be faster. In the end it's all just an mmx version of */
/* the reference implementation done with fixed point math */
179 | |||
/* YV12 -> packed 32-bit conversion using MMX arithmetic plus the
 * MMX2/SSE movntq non-temporal store.  yuv is a row-pointer table
 * (rows [0,h) Y, then h/2 U rows, then h/2 V rows, chroma half width);
 * rgb receives w * h packed pixels.  Math is 10.6 fixed point
 * (RES == 6) -- see the accuracy note above.  Eight pixels of one row
 * are converted per inner iteration; the trailing (w % 8) pixels are
 * done in C two at a time, so w is assumed even.
 * NOTE(review): the C tail uses LUT_CLIP, whose table is only filled
 * by _evas_yuv_init(), but this path can be reached without that init
 * having run -- confirm the tables are set up before relying on the
 * tail. */
static void
_evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_SSE
   int xx, yy;
   register unsigned char *yp1, *up, *vp;
   unsigned char *dp1;

   /* destination pointers */
   dp1 = rgb;

   for (yy = 0; yy < h; yy++)
     {
	/* plane pointers */
	yp1 = yuv[yy];
	up = yuv[h + (yy / 2)];
	vp = yuv[h + (h / 2) + (yy / 2)];
	for (xx = 0; xx < (w - 7); xx += 8)
	  {
	     /* load 4 U bytes, 4 V bytes and 8 Y bytes */
	     movd_m2r(*up, mm3);
	     movd_m2r(*vp, mm2);
	     movq_m2r(*yp1, mm0);

	     /* widen the U (mm3) and V (mm2) bytes to words */
	     pxor_r2r(mm7, mm7);
	     punpcklbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm7, mm3);

	     /* split Y into words: odd-index pixels in mm0, even in mm1 */
	     movq_r2r(mm0, mm1);
	     psrlw_i2r(8, mm0);
	     psllw_i2r(8, mm1);
	     psrlw_i2r(8, mm1);

	     /* y -= 16 */
	     movq_m2r(CONST_16, mm4);
	     psubsw_r2r(mm4, mm0);
	     psubsw_r2r(mm4, mm1);

	     /* u -= 128, v -= 128 */
	     movq_m2r(CONST_128, mm5);
	     psubsw_r2r(mm5, mm2);
	     psubsw_r2r(mm5, mm3);

	     /* y *= 1.164 (10.6 fixed point) */
	     movq_m2r(CONST_YMUL, mm4);
	     pmullw_r2r(mm4, mm0);
	     pmullw_r2r(mm4, mm1);

	     /* chroma products: mm7 = v*crv, mm6 = u*cbu,
	      * mm5 = u*cgu, mm4 = v*cgv */
	     movq_m2r(CONST_CRVCRV, mm7);
	     pmullw_r2r(mm3, mm7);
	     movq_m2r(CONST_CBUCBU, mm6);
	     pmullw_r2r(mm2, mm6);
	     movq_m2r(CONST_CGUCGU, mm5);
	     pmullw_r2r(mm2, mm5);
	     movq_m2r(CONST_CGVCGV, mm4);
	     pmullw_r2r(mm3, mm4);

	     /* r = y + v*crv, for both halves */
	     movq_r2r(mm0, mm2);
	     paddsw_r2r(mm7, mm2);
	     paddsw_r2r(mm1, mm7);

	     /* scale down to 8 bits and pack the red results */
	     psraw_i2r(RES, mm2);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm2);

	     /* re-interleave the odd/even red bytes into pixel order */
	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm2, mm3);
	     punpckhbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm3, mm7);
	     por_r2r(mm7, mm2);

	     /* g = y - u*cgu - v*cgv + rounding offset (odd half) */
	     movq_r2r(mm0, mm3);
	     psubsw_r2r(mm5, mm3);
	     psubsw_r2r(mm4, mm3);
	     paddsw_m2r(CONST_32, mm3);

	     /* same for the even half */
	     movq_r2r(mm1, mm7);
	     psubsw_r2r(mm5, mm7);
	     psubsw_r2r(mm4, mm7);
	     paddsw_m2r(CONST_32, mm7);

	     /* scale down and pack the green results */
	     psraw_i2r(RES, mm3);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm3);

	     /* re-interleave the odd/even green bytes */
	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm3, mm4);
	     punpckhbw_r2r(mm7, mm3);
	     punpcklbw_r2r(mm4, mm7);
	     por_r2r(mm7, mm3);

	     /* b = y + u*cbu + rounding offset, both halves */
	     movq_m2r(CONST_32, mm4);
	     paddsw_r2r(mm6, mm0);
	     paddsw_r2r(mm6, mm1);
	     paddsw_r2r(mm4, mm0);
	     paddsw_r2r(mm4, mm1);
	     psraw_i2r(RES, mm0);
	     psraw_i2r(RES, mm1);
	     packuswb_r2r(mm1, mm0);

	     /* re-interleave the odd/even blue bytes */
	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm0, mm5);
	     punpckhbw_r2r(mm7, mm0);
	     punpcklbw_r2r(mm5, mm7);
	     por_r2r(mm7, mm0);

	     /* interleave b, g, r and the 0xff alpha bytes (CONST_FF)
	      * into four output quadwords */
	     movq_m2r(CONST_FF, mm1);
	     movq_r2r(mm0, mm5);
	     movq_r2r(mm3, mm6);
	     movq_r2r(mm2, mm7);
	     punpckhbw_r2r(mm3, mm2);
	     punpcklbw_r2r(mm6, mm7);
	     punpckhbw_r2r(mm1, mm0);
	     punpcklbw_r2r(mm1, mm5);

	     movq_r2r(mm7, mm1);
	     punpckhwd_r2r(mm5, mm7);
	     punpcklwd_r2r(mm5, mm1);

	     movq_r2r(mm2, mm4);
	     punpckhwd_r2r(mm0, mm2);
	     punpcklwd_r2r(mm0, mm4);

	     /* stream 8 finished pixels (32 bytes) past the cache */
	     movntq_r2m(mm1, *(dp1));
	     movntq_r2m(mm7, *(dp1 + 8));
	     movntq_r2m(mm4, *(dp1 + 16));
	     movntq_r2m(mm2, *(dp1 + 24));

	     yp1 += 8;
	     up += 4;
	     vp += 4;
	     dp1 += 8 * 4;
	  }
	/* cleanup pixels that aren't a multiple of 8 pixels wide */
	if (xx < w)
	  {
	     int y, u, v, r, g, b;

	     for (; xx < w; xx += 2)
	       {
		  u = (*up++) - 128;
		  v = (*vp++) - 128;

		  /* first of the two pixels sharing this u/v pair */
		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;

		  /* second pixel */
		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;
	       }
	  }
     }
   emms();   /* leave MMX state so FPU code works again */
#else
   /* SSE not compiled in: fall back to the plain MMX converter */
   _evas_yv12torgb_mmx(yuv, rgb, w, h);
#endif
}
342 | |||
/* Plain-MMX variant of _evas_yv12torgb_sse(): identical arithmetic
 * (10.6 fixed point, 8 pixels per inner iteration, C tail for the
 * trailing (w % 8) pixels -- w assumed even), but the final stores are
 * ordinary movq writes instead of the MMX2-only movntq non-temporal
 * stores, and the build-time fallback is the C raster converter.
 * NOTE(review): as in the SSE version, the C tail depends on the
 * LUT_CLIP table built by _evas_yuv_init(); confirm it is initialised
 * before this path runs. */
static void
_evas_yv12torgb_mmx(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_MMX
   int xx, yy;
   register unsigned char *yp1, *up, *vp;
   unsigned char *dp1;

   /* destination pointers */
   dp1 = rgb;

   for (yy = 0; yy < h; yy++)
     {
	/* plane pointers */
	yp1 = yuv[yy];
	up = yuv[h + (yy / 2)];
	vp = yuv[h + (h / 2) + (yy / 2)];
	for (xx = 0; xx < (w - 7); xx += 8)
	  {
	     /* load 4 U bytes, 4 V bytes and 8 Y bytes */
	     movd_m2r(*up, mm3);
	     movd_m2r(*vp, mm2);
	     movq_m2r(*yp1, mm0);

	     /* widen the U (mm3) and V (mm2) bytes to words */
	     pxor_r2r(mm7, mm7);
	     punpcklbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm7, mm3);

	     /* split Y into words: odd-index pixels in mm0, even in mm1 */
	     movq_r2r(mm0, mm1);
	     psrlw_i2r(8, mm0);
	     psllw_i2r(8, mm1);
	     psrlw_i2r(8, mm1);

	     /* y -= 16 */
	     movq_m2r(CONST_16, mm4);
	     psubsw_r2r(mm4, mm0);
	     psubsw_r2r(mm4, mm1);

	     /* u -= 128, v -= 128 */
	     movq_m2r(CONST_128, mm5);
	     psubsw_r2r(mm5, mm2);
	     psubsw_r2r(mm5, mm3);

	     /* y *= 1.164 (10.6 fixed point) */
	     movq_m2r(CONST_YMUL, mm4);
	     pmullw_r2r(mm4, mm0);
	     pmullw_r2r(mm4, mm1);

	     /* chroma products: mm7 = v*crv, mm6 = u*cbu,
	      * mm5 = u*cgu, mm4 = v*cgv */
	     movq_m2r(CONST_CRVCRV, mm7);
	     pmullw_r2r(mm3, mm7);
	     movq_m2r(CONST_CBUCBU, mm6);
	     pmullw_r2r(mm2, mm6);
	     movq_m2r(CONST_CGUCGU, mm5);
	     pmullw_r2r(mm2, mm5);
	     movq_m2r(CONST_CGVCGV, mm4);
	     pmullw_r2r(mm3, mm4);

	     /* r = y + v*crv, for both halves */
	     movq_r2r(mm0, mm2);
	     paddsw_r2r(mm7, mm2);
	     paddsw_r2r(mm1, mm7);

	     /* scale down to 8 bits and pack the red results */
	     psraw_i2r(RES, mm2);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm2);

	     /* re-interleave the odd/even red bytes into pixel order */
	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm2, mm3);
	     punpckhbw_r2r(mm7, mm2);
	     punpcklbw_r2r(mm3, mm7);
	     por_r2r(mm7, mm2);

	     /* g = y - u*cgu - v*cgv + rounding offset (odd half) */
	     movq_r2r(mm0, mm3);
	     psubsw_r2r(mm5, mm3);
	     psubsw_r2r(mm4, mm3);
	     paddsw_m2r(CONST_32, mm3);

	     /* same for the even half */
	     movq_r2r(mm1, mm7);
	     psubsw_r2r(mm5, mm7);
	     psubsw_r2r(mm4, mm7);
	     paddsw_m2r(CONST_32, mm7);

	     /* scale down and pack the green results */
	     psraw_i2r(RES, mm3);
	     psraw_i2r(RES, mm7);
	     packuswb_r2r(mm7, mm3);

	     /* re-interleave the odd/even green bytes */
	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm3, mm4);
	     punpckhbw_r2r(mm7, mm3);
	     punpcklbw_r2r(mm4, mm7);
	     por_r2r(mm7, mm3);

	     /* b = y + u*cbu + rounding offset, both halves */
	     movq_m2r(CONST_32, mm4);
	     paddsw_r2r(mm6, mm0);
	     paddsw_r2r(mm6, mm1);
	     paddsw_r2r(mm4, mm0);
	     paddsw_r2r(mm4, mm1);
	     psraw_i2r(RES, mm0);
	     psraw_i2r(RES, mm1);
	     packuswb_r2r(mm1, mm0);

	     /* re-interleave the odd/even blue bytes */
	     pxor_r2r(mm7, mm7);
	     movq_r2r(mm0, mm5);
	     punpckhbw_r2r(mm7, mm0);
	     punpcklbw_r2r(mm5, mm7);
	     por_r2r(mm7, mm0);

	     /* interleave b, g, r and the 0xff alpha bytes (CONST_FF)
	      * into four output quadwords */
	     movq_m2r(CONST_FF, mm1);
	     movq_r2r(mm0, mm5);
	     movq_r2r(mm3, mm6);
	     movq_r2r(mm2, mm7);
	     punpckhbw_r2r(mm3, mm2);
	     punpcklbw_r2r(mm6, mm7);
	     punpckhbw_r2r(mm1, mm0);
	     punpcklbw_r2r(mm1, mm5);

	     movq_r2r(mm7, mm1);
	     punpckhwd_r2r(mm5, mm7);
	     punpcklwd_r2r(mm5, mm1);

	     movq_r2r(mm2, mm4);
	     punpckhwd_r2r(mm0, mm2);
	     punpcklwd_r2r(mm0, mm4);

	     /* store 8 finished pixels (32 bytes) */
	     movq_r2m(mm1, *(dp1));
	     movq_r2m(mm7, *(dp1 + 8));
	     movq_r2m(mm4, *(dp1 + 16));
	     movq_r2m(mm2, *(dp1 + 24));

	     yp1 += 8;
	     up += 4;
	     vp += 4;
	     dp1 += 8 * 4;
	  }
	/* cleanup pixels that aren't a multiple of 8 pixels wide */
	if (xx < w)
	  {
	     int y, u, v, r, g, b;

	     for (; xx < w; xx += 2)
	       {
		  u = (*up++) - 128;
		  v = (*vp++) - 128;

		  /* first of the two pixels sharing this u/v pair */
		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;

		  /* second pixel */
		  y = RZ(YMUL) * ((*yp1++) - 16);
		  r = LUT_CLIP((y + (_crv * v)) >> RES);
		  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
		  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
		  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);

		  dp1 += 4;
	       }
	  }
     }
   emms();   /* leave MMX state so FPU code works again */
#else
   /* MMX not compiled in: fall back to the C raster converter */
   _evas_yv12torgb_raster(yuv, rgb, w, h);
#endif
}
505 | |||
#ifdef BUILD_ALTIVEC
/* YV12 -> packed 32-bit conversion with AltiVec, processing an 8x2
 * pixel block per inner iteration (two output scanlines per pass).
 * The "/* N *" prefixes on the vector statements are the original
 * author's estimated per-instruction cycle costs.
 *
 * NOTE(review): unlike every other backend in this file, this routine
 * indexes `yuv` as ONE contiguous buffer (Y plane, then U, then V:
 * `yp1 = yuv`, `up = yuv + (w * h)`, ...), not as the row-pointer
 * table that evas_common_convert_yuv_420p_601_rgba() hands to the
 * MMX/C paths -- and `yp1 = yuv` assigns unsigned char ** to
 * unsigned char * without a cast.  Verify the layout this path
 * actually receives before trusting it. */
static void
_evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef __VEC__
   int xx, yy;
   int w2, h2;
   unsigned char *yp1, *yp2, *up, *vp;
   unsigned char *dp1, *dp2;
   vector signed short y, u, v;
   vector signed short r, g, b;
   vector signed short tmp1, tmp2, tmp3;
   vector unsigned char yperm, uperm, vperm, rgb1, rgb2;
   vector unsigned char alpha;

   /* handy halved w & h */
   w2 = w / 2;
   h2 = h / 2;
   /* plane pointers (contiguous-buffer layout -- see NOTE above) */
   yp1 = yuv;
   yp2 = yuv + w;
   up = yuv + (w * h);
   vp = up + (w2 * h2);
   /* destination pointers */
   dp1 = rgb;
   dp2 = rgb + (w * 4);

   /* build the constant 0xff alpha byte-lane pattern once */
   alpha = vec_mergeh((vector unsigned char)AVV(255), zero);
   alpha = (vector unsigned char)vec_mergeh((vector unsigned short)alpha,
					    (vector unsigned short)zero);

   for (yy = 0; yy < h2; yy++)
     {
	for (xx = 0; xx < w2; xx += 4)
	  {
	     /* Cycles */
	     /*
	      * Load 4 y and 4 u & v pixels for the 8x2 pixel block.
	      */
	     /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
	     /* 3 */ tmp1 = (vector signed short)vec_lde(0, (unsigned int *)up);
	     /* 3 */ tmp2 = (vector signed short)vec_lde(0, (unsigned int *)vp);

	     /* Prepare for aligning the data in their vectors */
	     /* 3 */ yperm = vec_lvsl(0, yp1);
	     /* 3 */ uperm = vec_lvsl(0, up);
	     /* 3 */ vperm = vec_lvsl(0, vp);
	     yp1 += 4;

	     /* Save y and load the next 4 y pixels for a total of 8 */
	     /* 2 */ y = vec_perm(tmp3, tmp3, yperm);
	     /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);

	     /* Setup and calculate the 4 u pixels */
	     /* 2 */ tmp1 = vec_perm(tmp1, tmp1, uperm);
	     /* 2 */ tmp2 = vec_perm(tmp2, tmp2, vperm);

	     /* Avoid dependency stalls on yperm and calculate the 4 u values */
	     /* 3 */ yperm = vec_lvsr(12, yp1);
	     /* 1 */ tmp1 = (vector signed short)vec_mergeh((vector unsigned char)tmp1,
							    (vector unsigned char)tmp1);
	     /* 1 */ u = (vector signed short)vec_mergeh(zero,
							 (vector unsigned char)tmp1);

	     /* 1 */ u = vec_sub(u, c128);
	     /* 2 */ tmp3 = vec_perm(tmp3, tmp3, yperm);

	     /* Setup and calculate the 4 v values */
	     /* 1 */ tmp2 = (vector signed short)vec_mergeh((vector unsigned char)tmp2,
							    (vector unsigned char)tmp2);
	     /* 1 */ v = (vector signed short)vec_mergeh(zero,
							 (vector unsigned char)tmp2);
	     /* 4 */ tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
	     /* 1 */ v = vec_sub(v, c128);

	     /* Move the data into y and start loading the next 4 pixels */
	     /* 1 */ y = (vector signed short)vec_mergeh(zero,
							 (vector unsigned char)y);
	     /* 1 */ tmp3 = (vector signed short)vec_mergeh(zero,
							    (vector unsigned char)tmp3);
	     /* 1 */ y = vec_or(y, tmp3);

	     /* Finish calculating y */
	     /* 1 */ y = vec_sub(y, c16);
	     /* 4 */ y = vec_mladd(ymul, y, (vector signed short)zero);

	     /* Perform non-dependent multiplies first. */
	     /* 4 */ tmp1 = vec_mladd(crv, v, y);
	     /* 4 */ tmp2 = vec_mladd(cgv, v, tmp2);
	     /* 4 */ tmp3 = vec_mladd(cbu, u, y);

	     /* Calculate rgb values */
	     /* 1 */ r = vec_sra(tmp1, res);

	     /* 1 */ tmp2 = vec_sub(y, tmp2);
	     /* 1 */ tmp2 = vec_add(tmp2, c32);
	     /* 1 */ g = vec_sra(tmp2, res);

	     /* 1 */ tmp3 = vec_add(tmp3, c32);
	     /* 1 */ b = vec_sra(tmp3, res);

	     /* Bound to 0 <= x <= 255 */
	     /* 1 */ r = vec_min(r, maxchar);
	     /* 1 */ g = vec_min(g, maxchar);
	     /* 1 */ b = vec_min(b, maxchar);
	     /* 1 */ r = vec_max(r, (vector signed short)zero);
	     /* 1 */ g = vec_max(g, (vector signed short)zero);
	     /* 1 */ b = vec_max(b, (vector signed short)zero);

	     /* Combine r, g and b. */
	     /* 2 */ rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
				     pickrg1);
	     /* 2 */ rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
				     pickrg2);

	     /* 2 */ rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
	     /* 2 */ rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);

	     /* 1 */ rgb1 = vec_or(alpha, rgb1);
	     /* 1 */ rgb2 = vec_or(alpha, rgb2);

	     /* 3 */ vec_stl(rgb1, 0, dp1);
	     dp1 += 16;
	     /* 3 */ vec_stl(rgb2, 0, dp1);

	     /*
	      * Begin the second row calculations
	      */

	     /*
	      * Load 4 y pixels for the 8x2 pixel block.
	      */
	     /* 3 */ yperm = vec_lvsl(0, yp2);
	     /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
	     yp2 += 4;

	     /* Save y and load the next 4 y pixels for a total of 8 */
	     /* 2 */ y = vec_perm(tmp3, tmp3, yperm);
	     /* 3 */ yperm = vec_lvsr(12, yp2);
	     /* 3 */ tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
	     /* 1 */ y = (vector signed short)vec_mergeh(zero,
							 (vector unsigned char)y);

	     /* Avoid dependency stalls on yperm */
	     /* 2 */ tmp3 = vec_perm(tmp3, tmp3, yperm);
	     /* 1 */ tmp3 = (vector signed short)vec_mergeh(zero,
							    (vector unsigned char)tmp3);
	     /* 1 */ y = vec_or(y, tmp3);

	     /* Start the calculation for g */
	     /* 4 */ tmp2 = vec_mladd(cgu, u, (vector signed short)zero);

	     /* Finish calculating y */
	     /* 1 */ y = vec_sub(y, c16);
	     /* 4 */ y = vec_mladd(ymul, y, (vector signed short)zero);

	     /* Perform non-dependent multiplies first. */
	     /* 4 */ tmp2 = vec_mladd(cgv, v, tmp2);
	     /* 4 */ tmp1 = vec_mladd(crv, v, y);
	     /* 4 */ tmp3 = vec_mladd(cbu, u, y);

	     /* Calculate rgb values */
	     /* 1 */ r = vec_sra(tmp1, res);

	     /* 1 */ tmp2 = vec_sub(y, tmp2);
	     /* 1 */ tmp2 = vec_add(tmp2, c32);
	     /* 1 */ g = vec_sra(tmp2, res);

	     /* 1 */ tmp3 = vec_add(tmp3, c32);
	     /* 1 */ b = vec_sra(tmp3, res);

	     /* Bound to 0 <= x <= 255 */
	     /* 1 */ r = vec_min(r, maxchar);
	     /* 1 */ g = vec_min(g, maxchar);
	     /* 1 */ b = vec_min(b, maxchar);
	     /* 1 */ r = vec_max(r, (vector signed short)zero);
	     /* 1 */ g = vec_max(g, (vector signed short)zero);
	     /* 1 */ b = vec_max(b, (vector signed short)zero);

	     /* Combine r, g and b. */
	     /* 2 */ rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
				     pickrg1);
	     /* 2 */ rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
				     pickrg2);

	     /* 2 */ rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
	     /* 2 */ rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);

	     /* 1 */ rgb1 = vec_or(alpha, rgb1);
	     /* 1 */ rgb2 = vec_or(alpha, rgb2);

	     /* 3 */ vec_stl(rgb1, 0, dp2);
	     dp2 += 16;
	     /* 3 */ vec_stl(rgb2, 0, dp2);

	     /* Increment the YUV data pointers to the next set of pixels. */
	     yp1 += 4;
	     yp2 += 4;
	     up += 4;
	     vp += 4;

	     /* Move the destination pointers to the next set of pixels. */
	     dp1 += 16;
	     dp2 += 16;
	  }

	/* jump down one line since we are doing 2 at once */
	yp1 += w;
	yp2 += w;
	dp1 += (w * 4);
	dp2 += (w * 4);
     }
#else
   /* AltiVec not compiled in: use Diz's C implementation */
   _evas_yv12torgb_diz(yuv, rgb, w, h);
#endif
}
#endif
723 | |||
/*
 * Fill in the lookup tables used by the C converters: one pre-scaled
 * multiply table per BT.601 coefficient (luma biased by 16, chroma by
 * 128), plus a clamp table mapping the index range [-384, 640) onto
 * [0, 255] for LUT_CLIP().
 */
static void
_evas_yuv_init(void)
{
#ifdef BUILD_C
   int i;

   for (i = 0; i < 256; i++)
     {
	/* 1.164*(Y-16); 1.596/0.813*(V-128); 0.391/2.018*(U-128) */
	_v1164[i] = (int)(1.164 * (i - 16));
	_v1596[i] = (int)(1.596 * (i - 128));
	_v813[i]  = (int)(0.813 * (i - 128));
	_v391[i]  = (int)(0.391 * (i - 128));
	_v2018[i] = (int)(2.018 * (i - 128));
     }

   for (i = -384; i < 640; i++)
     {
	int clamped = i;

	if (clamped < 0) clamped = 0;
	else if (clamped > 255) clamped = 255;
	_clip_lut[i + 384] = clamped;
     }
#endif
}
747 | |||
#ifdef BUILD_ALTIVEC
/*
 * Diz's reference C converter: full-precision 16.16 fixed point, used
 * as the fallback when the AltiVec build has no vector unit.  yuv is a
 * row-pointer table (rows [0,h) Y, then h/2 U rows, then h/2 V rows);
 * a 2x2 pixel block shares one U/V sample, so two scanlines are
 * produced per outer pass.  w and h are assumed even.
 */
static void
_evas_yv12torgb_diz(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_C
   int col, row;
   int luma, cb, cr, rr, gg, bb;
   unsigned char *srcy0, *srcy1, *srcu, *srcv;
   unsigned char *out0, *out1;
   int kcrv, kcbu, kcgu, kcgv;

   /* two destination scanlines per pass */
   out0 = rgb;
   out1 = rgb + (w * 4);

   /* full 16.16 BT.601 coefficients */
   kcrv = CRV;   /* 1.596 */
   kcbu = CBU;   /* 2.018 */
   kcgu = CGU;   /* 0.391 */
   kcgv = CGV;   /* 0.813 */

   for (row = 0; row < h; row += 2)
     {
	/* source row pointers for this scanline pair */
	srcy0 = yuv[row];
	srcy1 = yuv[row + 1];
	srcu = yuv[h + (row / 2)];
	srcv = yuv[h + (h / 2) + (row / 2)];
	for (col = 0; col < w; col += 2)
	  {
	     /* one chroma sample covers this 2x2 block */
	     cb = (*srcu++) - 128;
	     cr = (*srcv++) - 128;

	     /* top-left pixel */
	     luma = YMUL * ((*srcy0++) - 16);
	     rr = LUT_CLIP((luma + (kcrv * cr)) >> 16);
	     gg = LUT_CLIP((luma - (kcgu * cb) - (kcgv * cr) + OFF) >> 16);
	     bb = LUT_CLIP((luma + (kcbu * cb) + OFF) >> 16);
	     *((DATA32 *) out0) = 0xff000000 + RGB_JOIN(rr,gg,bb);
	     out0 += 4;

	     /* top-right pixel */
	     luma = YMUL * ((*srcy0++) - 16);
	     rr = LUT_CLIP((luma + (kcrv * cr)) >> 16);
	     gg = LUT_CLIP((luma - (kcgu * cb) - (kcgv * cr) + OFF) >> 16);
	     bb = LUT_CLIP((luma + (kcbu * cb) + OFF) >> 16);
	     *((DATA32 *) out0) = 0xff000000 + RGB_JOIN(rr,gg,bb);
	     out0 += 4;

	     /* bottom-left pixel */
	     luma = YMUL * ((*srcy1++) - 16);
	     rr = LUT_CLIP((luma + (kcrv * cr)) >> 16);
	     gg = LUT_CLIP((luma - (kcgu * cb) - (kcgv * cr) + OFF) >> 16);
	     bb = LUT_CLIP((luma + (kcbu * cb) + OFF) >> 16);
	     *((DATA32 *) out1) = 0xff000000 + RGB_JOIN(rr,gg,bb);
	     out1 += 4;

	     /* bottom-right pixel */
	     luma = YMUL * ((*srcy1++) - 16);
	     rr = LUT_CLIP((luma + (kcrv * cr)) >> 16);
	     gg = LUT_CLIP((luma - (kcgu * cb) - (kcgv * cr) + OFF) >> 16);
	     bb = LUT_CLIP((luma + (kcbu * cb) + OFF) >> 16);
	     *((DATA32 *) out1) = 0xff000000 + RGB_JOIN(rr,gg,bb);
	     out1 += 4;
	  }
	/* each pass emitted two scanlines; hop over the partner line */
	out0 += (w * 4);
	out1 += (w * 4);
     }
#endif
}
#endif
826 | |||
/*
 * Table-driven C converter: uses the _vNNNN multiply tables and the
 * LUT_CLIP clamp table built by _evas_yuv_init().  yuv is a
 * row-pointer table (rows [0,h) Y, then h/2 U rows, then h/2 V rows);
 * one U/V sample is shared by a 2x2 pixel block, so two output
 * scanlines are produced per outer pass.  w and h are assumed even.
 */
static void
_evas_yv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_C
   int col, row;
   int luma, uu, vv;
   unsigned char *srcy0, *srcy1, *srcu, *srcv;
   unsigned char *out0, *out1;

   /* two destination scanlines per pass */
   out0 = rgb;
   out1 = rgb + (w * 4);

   for (row = 0; row < h; row += 2)
     {
	/* source row pointers for this scanline pair */
	srcy0 = yuv[row];
	srcy1 = yuv[row + 1];
	srcu = yuv[h + (row / 2)];
	srcv = yuv[h + (h / 2) + (row / 2)];
	for (col = 0; col < w; col += 2)
	  {
	     int gsum;

	     /* one chroma sample covers this 2x2 block */
	     uu = *srcu++;
	     vv = *srcv++;

	     /* pre-scaled table lookups, shared by all four pixels */
	     gsum = _v813[vv] + _v391[uu];   /* green chroma term */
	     uu = _v2018[uu];                /* blue chroma term  */
	     vv = _v1596[vv];                /* red chroma term   */

	     /* top-left pixel */
	     luma = _v1164[*srcy0++];
	     *((DATA32 *) out0) = 0xff000000 + RGB_JOIN(LUT_CLIP(luma + vv), LUT_CLIP(luma - gsum), LUT_CLIP(luma + uu));
	     out0 += 4;

	     /* top-right pixel */
	     luma = _v1164[*srcy0++];
	     *((DATA32 *) out0) = 0xff000000 + RGB_JOIN(LUT_CLIP(luma + vv), LUT_CLIP(luma - gsum), LUT_CLIP(luma + uu));
	     out0 += 4;

	     /* bottom-left pixel */
	     luma = _v1164[*srcy1++];
	     *((DATA32 *) out1) = 0xff000000 + RGB_JOIN(LUT_CLIP(luma + vv), LUT_CLIP(luma - gsum), LUT_CLIP(luma + uu));
	     out1 += 4;

	     /* bottom-right pixel */
	     luma = _v1164[*srcy1++];
	     *((DATA32 *) out1) = 0xff000000 + RGB_JOIN(LUT_CLIP(luma + vv), LUT_CLIP(luma - gsum), LUT_CLIP(luma + uu));
	     out1 += 4;
	  }
	/* each pass emitted two scanlines; hop over the partner line */
	out0 += (w * 4);
	out1 += (w * 4);
     }
#endif
}
892 | |||
893 | void | ||
894 | evas_common_convert_yuv_422_601_rgba(DATA8 **src, DATA8 *dst, int w, int h) | ||
895 | { | ||
896 | #ifdef BUILD_C | ||
897 | if (!initted) _evas_yuv_init(); | ||
898 | initted = 1; | ||
899 | _evas_yuy2torgb_raster(src, dst, w, h); | ||
900 | #endif | ||
901 | } | ||
902 | |||
903 | void | ||
904 | evas_common_convert_yuv_420_601_rgba(DATA8 **src, DATA8 *dst, int w, int h) | ||
905 | { | ||
906 | #ifdef BUILD_C | ||
907 | if (!initted) _evas_yuv_init(); | ||
908 | initted = 1; | ||
909 | _evas_nv12torgb_raster(src, dst, w, h); | ||
910 | #endif | ||
911 | } | ||
912 | |||
913 | void | ||
914 | evas_common_convert_yuv_420T_601_rgba(DATA8 **src, DATA8 *dst, int w, int h) | ||
915 | { | ||
916 | #ifdef BUILD_C | ||
917 | if (initted) _evas_yuv_init(); | ||
918 | initted = 1; | ||
919 | _evas_nv12tiledtorgb_raster(src, dst, w, h); | ||
920 | #endif | ||
921 | } | ||
922 | |||
static void
_evas_yuy2torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_C
   int xx, yy;
   unsigned char *dst;

   /* Walk every source line.  YUY2 packs pixels as Y0 U Y1 V, so each
    * 4-byte group expands to 2 output pixels that share one U/V pair. */
   dst = rgb;

   for (yy = 0; yy < h; yy++)
     {
        unsigned char *s;

        s = yuv[yy];

        for (xx = 0; xx < w; xx += 2)
          {
             int luma, cb, cr, vmu;

             /* chroma shared by this 2-pixel pair */
             cb = s[1];
             cr = s[3];

             /* resolve the chroma contributions via the lookup tables */
             vmu = _v813[cr] + _v391[cb];
             cb = _v2018[cb];
             cr = _v1596[cr];

             /* first pixel of the pair */
             luma = _v1164[s[0]];
             *((DATA32 *) dst) = 0xff000000 + RGB_JOIN(LUT_CLIP(luma + cr), LUT_CLIP(luma - vmu), LUT_CLIP(luma + cb));
             dst += 4;

             /* second pixel of the pair */
             luma = _v1164[s[2]];
             *((DATA32 *) dst) = 0xff000000 + RGB_JOIN(LUT_CLIP(luma + cr), LUT_CLIP(luma - vmu), LUT_CLIP(luma + cb));
             dst += 4;

             /* advance to the next Y0 U Y1 V group */
             s += 4;
          }
     }
#endif
}
977 | |||
#ifdef BUILD_C
/* Convert one 2x2 block of 4:2:0 YUV into four ARGB32 pixels.
 *
 * @param yp1 top-row luma pointer; 2 consecutive Y samples are read
 * @param yp2 bottom-row luma pointer; 2 consecutive Y samples are read
 * @param up  pointer to the U sample shared by the whole 2x2 block
 * @param vp  pointer to the V sample shared by the whole 2x2 block
 * @param dp1 top destination row; two 32-bit pixels are written
 * @param dp2 bottom destination row; two 32-bit pixels are written
 *
 * All pointers are passed by value, so the increments below are local:
 * the CALLER must advance its own cursors after each call.
 */
static inline void
_evas_yuv2rgb_420_raster(unsigned char *yp1, unsigned char *yp2, unsigned char *up, unsigned char *vp,
                         unsigned char *dp1, unsigned char *dp2)
{
   int y, u, v;
   int vmu;
   int rgb;

   /* collect the u & v shared by the 4 pixels of this block */
   u = *up;
   v = *vp;

   /* chroma contributions: MEM_BP trades table memory for fewer
    * multiplies; otherwise use the fixed-point coefficients directly */
#ifdef MEM_BP
   vmu = _v813[v] + _v391[u];
   u = _v2018[u];
   v = _v1596[v];
#else
   u -= 128;
   v -= 128;
   vmu = v * CGV + u * CGU;
   u = u * CBU;
   v = v * CRV;
#endif

   /* do the top 2 pixels of the 2x2 block which share u & v */
   /* yuv to rgb */
#ifdef MEM_BP
   y = _v1164[*yp1];
   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
#else
   /* NOTE(review): the red term takes no OFF rounding bias while the
    * green and blue terms do — looks asymmetric; confirm intentional */
   y = (*yp1 - 16 ) * YMUL;
   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
                  LUT_CLIP(((y - vmu + OFF) >> 16)),
                  LUT_CLIP(((y + u + OFF) >> 16)));
#endif
   *((DATA32 *) dp1) = 0xff000000 + rgb;

   dp1 += 4; yp1++;

   /* yuv to rgb */
#ifdef MEM_BP
   y = _v1164[*yp1];
   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
#else
   y = (*yp1 - 16 ) * YMUL;
   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
                  LUT_CLIP(((y - vmu + OFF) >> 16)),
                  LUT_CLIP(((y + u + OFF) >> 16)));
#endif
   *((DATA32 *) dp1) = 0xff000000 + rgb;

   /* do the bottom 2 pixels of the 2x2 block which share u & v */
   /* yuv to rgb */
#ifdef MEM_BP
   y = _v1164[*yp2];
   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
#else
   y = (*yp2 - 16 ) * YMUL;
   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
                  LUT_CLIP(((y - vmu + OFF) >> 16)),
                  LUT_CLIP(((y + u + OFF) >> 16)));
#endif
   *((DATA32 *) dp2) = 0xff000000 + rgb;

   dp2 += 4; yp2++;

   /* yuv to rgb */
#ifdef MEM_BP
   y = _v1164[*yp2];
   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
#else
   y = (*yp2 - 16 ) * YMUL;
   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
                  LUT_CLIP(((y - vmu + OFF) >> 16)),
                  LUT_CLIP(((y + u + OFF) >> 16)));
#endif
   *((DATA32 *) dp2) = 0xff000000 + rgb;
}
#endif
1059 | |||
static void
_evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_C

   /* Convert one 64x32 tiled macroblock: 32 lines of Y (64 bytes each)
    * plus the matching 16 lines of interleaved UV, rendering two output
    * lines per iteration through _evas_yuv2rgb_420_raster(). */
#define HANDLE_MACROBLOCK(YP1, YP2, UP, VP, DP1, DP2)   \
  {                                                     \
     int i;                                             \
     int j;                                             \
                                                        \
     for (i = 0; i < 32; i += 2)                        \
       {                                                \
          for (j = 0; j < 64; j += 2)                   \
            {                                           \
               _evas_yuv2rgb_420_raster(YP1, YP2, UP, VP, DP1, DP2); \
                                                        \
               /* the previous call just rendered 2 pixels per line */ \
               DP1 += 8; DP2 += 8;                      \
                                                        \
               /* and consumed 2 Y per line plus 1 U and 1 V (U & V share one plane) */ \
               YP1 += 2; YP2 += 2; UP += 2; VP += 2;    \
            }                                           \
                                                        \
          DP1 += sizeof (int) * ((w << 1) - 64);        \
          DP2 += sizeof (int) * ((w << 1) - 64);        \
          YP1 += 64;                                    \
          YP2 += 64;                                    \
       }                                                \
  }

   /* One macro block is 32 lines of Y and 16 lines of UV */
   const int offset_value[2] = { 0, 64 * 16 };
   int mb_x, mb_y, mb_w, mb_h;
   int base_h;
   int uv_x, uv_step;
   int stride;

   /* Idea: iterate over each macroblock and convert it into the linear
    * RGB surface using the shared 2x2-block helper above. */

   /* The layout of the Y macroblock order in RGB non tiled space : */
   /* --------------------------------------------------- */
   /* | 0 | 1 | 6 | 7 | 8 | 9 | 14 | 15 | 16 | 17 | */
   /* --------------------------------------------------- */
   /* | 2 | 3 | 4 | 5 | 10 | 11 | 12 | 13 | 18 | 19 | */
   /* --------------------------------------------------- */
   /* | 20 | 21 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 37 | */
   /* --------------------------------------------------- */
   /* | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 38 | 39 | */
   /* --------------------------------------------------- */
   /* | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | */
   /* --------------------------------------------------- */
   /* The layout of the UV macroblock order in the same RGB non tiled space : */
   /* --------------------------------------------------- */
   /* | | | | | | | | | | | */
   /* - 0 - 1 - 6 - 7 - 8 - 9 - 14 - 15 - 16 - 17 - */
   /* | | | | | | | | | | | */
   /* --------------------------------------------------- */
   /* | | | | | | | | | | | */
   /* - 2 - 3 - 4 - 5 - 10 - 11 - 12 - 13 - 18 - 19 - */
   /* | | | | | | | | | | | */
   /* --------------------------------------------------- */
   /* | | | | | | | | | | | */
   /* - 20 - 21 - 22 - 22 - 23 - 24 - 25 - 26 - 27 - 28 - */

   /* assumes w and h are multiples of the 64x32 macroblock size
    * (TODO confirm with callers; a trailing odd macroblock row is
    * handled below, but partial tiles are not) */
   mb_w = w / 64;
   mb_h = h / 32;

   /* number of Y macroblock rows, rounded up, used to locate the UV rows */
   base_h = (mb_h >> 1) + (mb_h & 0x1);
   stride = w * sizeof (int);

   uv_x = 0;

   /* In this tiled format macroblocks are linearized two rows at a time,
    * alternating a Z pattern and its mirror (see the diagram above). */
   for (mb_y = 0; mb_y < (mb_h >> 1); mb_y++)
     {
        int step = 2;
        int offset = 0;
        int x = 0;
        int rmb_x = 0;
        int ry[2];

        /* destination byte offsets of the top and bottom macroblock rows */
        ry[0] = mb_y * 2 * 32 * stride;
        ry[1] = ry[0] + 32 * stride;

        /* the UV walk starts differently on even and odd macroblock rows */
        uv_step = (mb_y & 0x1) == 0 ? 4 : 0;
        uv_x = (mb_y & 0x1) == 0 ? 0 : 2 * 64 * 32;

        for (mb_x = 0; mb_x < mb_w * 2; mb_x++, rmb_x += 64 * 32)
          {
             unsigned char *yp1, *yp2, *up, *vp;
             unsigned char *dp1, *dp2;

             dp1 = rgb + x + ry[offset];
             dp2 = dp1 + stride;

             yp1 = yuv[mb_y] + rmb_x;
             yp2 = yp1 + 64;

             /* The UV plane holds a quarter of the pixel count, but each
              * sample pair occupies two bytes (U and V interleaved). */
             up = yuv[(mb_y >> 1) + base_h] + uv_x + offset_value[offset];
             vp = up + 1;

             HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);

             /* every 4th macroblock flips between the top and bottom
              * destination row and steps back one tile (the Z fold) */
             step++;
             if ((step & 0x3) == 0)
               {
                  offset = 1 - offset;
                  x -= 64 * sizeof (int);
                  uv_x -= 64 * 32;
               }
             else
               {
                  x += 64 * sizeof (int);
                  uv_x += 64 * 32;
               }

             uv_step++;
             if (uv_step == 8)
               {
                  uv_step = 0;
                  uv_x += 4 * 64 * 32;
               }
          }
     }

   /* Odd number of macroblock rows: the last row is stored in plain
    * linear left-to-right order, not Z-folded. */
   if (mb_h & 0x1)
     {
        int x = 0;
        int ry;

        ry = mb_y << 1;

        uv_step = 0;
        uv_x = 0;

        for (mb_x = 0; mb_x < mb_w; mb_x++, x++, uv_x++)
          {
             unsigned char *yp1, *yp2, *up, *vp;
             unsigned char *dp1, *dp2;

             dp1 = rgb + (x * 64 + (ry * 32 * w)) * sizeof (int);
             dp2 = dp1 + sizeof (int) * w;

             yp1 = yuv[mb_y] + mb_x * 64 * 32;
             yp2 = yp1 + 64;

             up = yuv[mb_y / 2 + base_h] + uv_x * 64 * 32;
             vp = up + 1;

             HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);
          }
     }
#endif
}
1216 | |||
static void
_evas_nv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
{
#ifdef BUILD_C
   int xx, yy;
   unsigned char *yp1, *yp2, *up, *vp;
   unsigned char *dp1;
   unsigned char *dp2;

   dp1 = rgb;
   dp2 = dp1 + sizeof (int) * w;

   /* Process two source rows per iteration: each 2x2 pixel block shares
    * one interleaved U/V pair from the half-height UV plane.
    *
    * The previous version incremented yy both in the loop header and in
    * the body (`yuv[yy++]`), obscuring the stride, and carried two dead
    * statements (`yp1 += w; yp2 += w;`) — those pointers are reassigned
    * from yuv[] at the top of every iteration, so the increments had no
    * effect.  This rewrite is behavior-equivalent.
    *
    * NOTE(review): h is assumed even; with an odd h the last pass reads
    * yuv[h] (the first UV row) as luma — same as the original code. */
   for (yy = 0; yy < h; yy += 2)
     {
        yp1 = yuv[yy];
        yp2 = yuv[yy + 1];

        /* NV12: one plane of interleaved U/V rows stored after the Y rows */
        up = yuv[h + (yy >> 1)];
        vp = up + 1;

        for (xx = 0; xx < w; xx += 2)
          {
             _evas_yuv2rgb_420_raster(yp1, yp2, up, vp, dp1, dp2);

             /* the previous call just rendered 2 pixels per line */
             dp1 += 8; dp2 += 8;

             /* and consumed 2 Y per line plus 1 U and 1 V */
             yp1 += 2; yp2 += 2; up += 2; vp += 2;
          }

        /* skip the destination row already written through dp2 */
        dp1 += sizeof (int) * w;
        dp2 += sizeof (int) * w;
     }
#endif
}
1256 | |||
1257 | #endif | ||
1258 | |||