From dd7595a3475407a7fa96a97393bae8c5220e8762 Mon Sep 17 00:00:00 2001
From: David Walter Seikel
Date: Wed, 4 Jan 2012 18:41:13 +1000
Subject: Add the base Enlightenment Foundation Libraries - eina, eet, evas,
 ecore, embryo, and edje.

Note that embryo won't be used, but I'm not sure yet if you can build edje
without it.
---
 .../evas/src/lib/engines/common/evas_convert_yuv.c | 1258 ++++++++++++++++++++
 1 file changed, 1258 insertions(+)
 create mode 100644 libraries/evas/src/lib/engines/common/evas_convert_yuv.c

diff --git a/libraries/evas/src/lib/engines/common/evas_convert_yuv.c b/libraries/evas/src/lib/engines/common/evas_convert_yuv.c
new file mode 100644
index 0000000..a0d155c
--- /dev/null
+++ b/libraries/evas/src/lib/engines/common/evas_convert_yuv.c
@@ -0,0 +1,1258 @@
+#include "evas_common.h"
+#include "evas_convert_yuv.h"
+
+#if defined BUILD_MMX || defined BUILD_SSE
+# include "evas_mmx.h"
+#endif
+
+#if defined HAVE_ALTIVEC_H
+# include <altivec.h>
+#ifdef CONFIG_DARWIN
+#define AVV(x...) (x)
+#else
+#define AVV(x...) {x}
+#endif
+
+#endif
+
+#ifdef BUILD_CONVERT_YUV
+
+static void _evas_yuv_init (void);
+static void _evas_yv12torgb_sse (unsigned char **yuv, unsigned char *rgb, int w, int h);
+static void _evas_yv12torgb_mmx (unsigned char **yuv, unsigned char *rgb, int w, int h);
+#ifdef BUILD_ALTIVEC
+static void _evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h);
+static void _evas_yv12torgb_diz (unsigned char **yuv, unsigned char *rgb, int w, int h);
+#endif
+static void _evas_yv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
+static void _evas_yuy2torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
+static void _evas_nv12torgb_raster (unsigned char **yuv, unsigned char *rgb, int w, int h);
+static void _evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h);
+
+#define CRV    104595
+#define CBU    132251
+#define CGU    25624
+#define CGV    53280
+#define YMUL   76283
+#define OFF    32768
+#define BITRES 16
+
+/* calculation float resolution in bits */
+/* ie RES = 6 is 10.6 fixed point */
+/*    RES = 8 is 8.8 fixed point */
+/*    RES = 4 is 12.4 fixed point */
+/* NB: going above 6 will lead to overflow... :( */
+#define RES 6
+
+#define RZ(i)   (i >> (BITRES - RES))
+#define FOUR(i) {i, i, i, i}
+
+#if defined BUILD_MMX || defined BUILD_SSE
+__attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
+__attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
+__attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
+__attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
+__attribute__ ((aligned (8))) const volatile unsigned short _const_ymul [4] = FOUR(RZ(YMUL));
+__attribute__ ((aligned (8))) const volatile unsigned short _const_128 [4] = FOUR(128);
+__attribute__ ((aligned (8))) const volatile unsigned short _const_32 [4] = FOUR(RZ(OFF));
+__attribute__ ((aligned (8))) const volatile unsigned short _const_16 [4] = FOUR(16);
+__attribute__ ((aligned (8))) const volatile unsigned short _const_ff [4] = FOUR(-1);
+
+#define CONST_CRVCRV *_const_crvcrv
+#define CONST_CBUCBU *_const_cbucbu
+#define CONST_CGUCGU *_const_cgucgu
+#define CONST_CGVCGV *_const_cgvcgv
+#define CONST_YMUL   *_const_ymul
+#define CONST_128    *_const_128
+#define CONST_32     *_const_32
+#define CONST_16     *_const_16
+#define CONST_FF     *_const_ff
+
+/* for C non aligned cleanup */
+const int _crv = RZ(CRV); /* 1.596 */
+const int _cbu = RZ(CBU); /* 2.018 */
+const int _cgu = RZ(CGU); /* 0.391 */
+const int _cgv = RZ(CGV); /* 0.813 */
+
+#endif
+
+#ifdef BUILD_ALTIVEC
+#ifdef __VEC__
+const vector unsigned short res = AVV(RES);
+const vector signed short crv = AVV(RZ(CRV));
+const vector signed short cbu = AVV(RZ(CBU));
+const vector signed short cgu = AVV(RZ(CGU));
+const vector signed short cgv = AVV(RZ(CGV));
+const vector signed short ymul = AVV(RZ(YMUL));
+const vector signed short c128 = AVV(128);
+const vector signed short c32 = AVV(RZ(OFF));
+const vector signed short c16 = AVV(16);
+const vector unsigned char zero = AVV(0);
+const vector signed short maxchar = AVV(255);
+const vector unsigned char pickrg1 = AVV(0, 0x1, 0x11, 0,
+                                         0, 0x3, 0x13, 0,
+                                         0, 0x5, 0x15, 0,
+                                         0, 0x7, 0x17, 0);
+const vector unsigned char pickrg2 = AVV(0, 0x9, 0x19, 0,
+                                         0, 0xb, 0x1b, 0,
+                                         0, 0xd, 0x1d, 0,
+                                         0, 0xf, 0x1f, 0);
+const vector unsigned char pickrgb1 = AVV(0x3, 0x1, 0x2, 0x11,
+                                          0x7, 0x5, 0x6, 0x13,
+                                          0xb, 0x9, 0xa, 0x15,
+                                          0xf, 0xd, 0xe, 0x17);
+const vector unsigned char pickrgb2 = AVV(0x3, 0x1, 0x2, 0x19,
+                                          0x7, 0x5, 0x6, 0x1b,
+                                          0xb, 0x9, 0xa, 0x1d,
+                                          0xf, 0xd, 0xe, 0x1f);
+#endif
+#endif
+
+#ifdef BUILD_C
+
+/* shortcut speedup lookup-tables */
+static short _v1164[256];
+static short _v1596[256];
+static short _v813[256];
+static short _v391[256];
+static short _v2018[256];
+
+static unsigned char _clip_lut[1024];
+#define LUT_CLIP(i) ((_clip_lut+384)[(i)])
+
+#define CMP_CLIP(i) ((i&256)? (~(i>>10)) : i);
+
+static int initted = 0;
+
+#endif
+
+void
+evas_common_convert_yuv_420p_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
+{
+   int mmx, sse, sse2;
+
+#if defined BUILD_MMX || defined BUILD_SSE
+   evas_common_cpu_can_do(&mmx, &sse, &sse2);
+#endif
+#ifndef BUILD_SSE
+   sse = 0;
+   sse2 = 0;
+#endif
+#ifndef BUILD_MMX
+   mmx = 0;
+#endif
+   if (evas_common_cpu_has_feature(CPU_FEATURE_MMX2))
+     _evas_yv12torgb_sse(src, dst, w, h);
+   else if (evas_common_cpu_has_feature(CPU_FEATURE_MMX))
+     _evas_yv12torgb_mmx(src, dst, w, h);
+#ifdef BUILD_ALTIVEC
+   else if (evas_common_cpu_has_feature(CPU_FEATURE_ALTIVEC))
+     _evas_yv12torgb_altivec(src, dst, w, h);
+#endif
+   else
+     {
+#ifdef BUILD_C
+        if (!initted) _evas_yuv_init();
+        initted = 1;
+        /* FIXME: diz may be faster sometimes */
+        _evas_yv12torgb_raster(src, dst, w, h);
+#endif
+     }
+}
+
+/* Thanks to Diz for this code. I've munged it a little and turned it into  */
+/* inline macros. I tried beating it with a different algorithm using MMX   */
+/* but failed. So here we are. This is the fastest YUV->RGB I know of for   */
+/* x86. It has an issue in that it doesn't convert colours accurately, so   */
+/* the image looks a little "yellowy". This is a result of only 10.6 fixed- */
+/* point resolution, as opposed to 16.16 in the C code. This could be fixed */
+/* by processing half the number of pixels per cycle and going up to 32     */
+/* bits per element during compute, but that would all but negate the       */
+/* speedup from MMX, I think :( It might be possible to use SSE and SSE2    */
+/* here, but I haven't tried yet. Let's see.                                */
+
+/* NB: XviD has almost the same code in its assembly YV12->RGB routine -    */
+/* the same algorithm, the same constants, the same all over, actually,     */
+/* except it does a few extra memory accesses that this one doesn't, so in  */
+/* theory this code should be faster. In the end it's all just an MMX       */
+/* version of the reference implementation done with fixed-point math.      */
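+
+/* For reference, the full-resolution conversion that the C paths below     */
+/* implement is, per pixel, a sketch of the 16.16 fixed-point math using    */
+/* the constants defined above (CLIP bounds the result to 0..255; this is   */
+/* exactly what _evas_yv12torgb_diz computes, and the table-driven raster   */
+/* path bakes the same coefficients into its lookup tables):                */
+/*                                                                          */
+/*   y = YMUL * (Y - 16);                                                   */
+/*   r = CLIP((y + CRV * (V - 128)) >> 16);                                 */
+/*   g = CLIP((y - CGU * (U - 128) - CGV * (V - 128) + OFF) >> 16);         */
+/*   b = CLIP((y + CBU * (U - 128) + OFF) >> 16);                           */
+/*                                                                          */
+/* The MMX/SSE code runs the same math at RES(=6)-bit resolution via RZ().  */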
+
+static void
+_evas_yv12torgb_sse(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef BUILD_SSE
+   int xx, yy;
+   register unsigned char *yp1, *up, *vp;
+   unsigned char *dp1;
+
+   /* destination pointers */
+   dp1 = rgb;
+
+   for (yy = 0; yy < h; yy++)
+     {
+        /* plane pointers */
+        yp1 = yuv[yy];
+        up = yuv[h + (yy / 2)];
+        vp = yuv[h + (h / 2) + (yy / 2)];
+        for (xx = 0; xx < (w - 7); xx += 8)
+          {
+             movd_m2r(*up, mm3);
+             movd_m2r(*vp, mm2);
+             movq_m2r(*yp1, mm0);
+
+             pxor_r2r(mm7, mm7);
+             punpcklbw_r2r(mm7, mm2);
+             punpcklbw_r2r(mm7, mm3);
+
+             movq_r2r(mm0, mm1);
+             psrlw_i2r(8, mm0);
+             psllw_i2r(8, mm1);
+             psrlw_i2r(8, mm1);
+
+             movq_m2r(CONST_16, mm4);
+             psubsw_r2r(mm4, mm0);
+             psubsw_r2r(mm4, mm1);
+
+             movq_m2r(CONST_128, mm5);
+             psubsw_r2r(mm5, mm2);
+             psubsw_r2r(mm5, mm3);
+
+             movq_m2r(CONST_YMUL, mm4);
+             pmullw_r2r(mm4, mm0);
+             pmullw_r2r(mm4, mm1);
+
+             movq_m2r(CONST_CRVCRV, mm7);
+             pmullw_r2r(mm3, mm7);
+             movq_m2r(CONST_CBUCBU, mm6);
+             pmullw_r2r(mm2, mm6);
+             movq_m2r(CONST_CGUCGU, mm5);
+             pmullw_r2r(mm2, mm5);
+             movq_m2r(CONST_CGVCGV, mm4);
+             pmullw_r2r(mm3, mm4);
+
+             movq_r2r(mm0, mm2);
+             paddsw_r2r(mm7, mm2);
+             paddsw_r2r(mm1, mm7);
+
+             psraw_i2r(RES, mm2);
+             psraw_i2r(RES, mm7);
+             packuswb_r2r(mm7, mm2);
+
+             pxor_r2r(mm7, mm7);
+             movq_r2r(mm2, mm3);
+             punpckhbw_r2r(mm7, mm2);
+             punpcklbw_r2r(mm3, mm7);
+             por_r2r(mm7, mm2);
+
+             movq_r2r(mm0, mm3);
+             psubsw_r2r(mm5, mm3);
+             psubsw_r2r(mm4, mm3);
+             paddsw_m2r(CONST_32, mm3);
+
+             movq_r2r(mm1, mm7);
+             psubsw_r2r(mm5, mm7);
+             psubsw_r2r(mm4, mm7);
+             paddsw_m2r(CONST_32, mm7);
+
+             psraw_i2r(RES, mm3);
+             psraw_i2r(RES, mm7);
+             packuswb_r2r(mm7, mm3);
+
+             pxor_r2r(mm7, mm7);
+             movq_r2r(mm3, mm4);
+             punpckhbw_r2r(mm7, mm3);
+             punpcklbw_r2r(mm4, mm7);
+             por_r2r(mm7, mm3);
+
+             movq_m2r(CONST_32, mm4);
+             paddsw_r2r(mm6, mm0);
+             paddsw_r2r(mm6, mm1);
+             paddsw_r2r(mm4, mm0);
+             paddsw_r2r(mm4, mm1);
+             psraw_i2r(RES, mm0);
+             psraw_i2r(RES, mm1);
+             packuswb_r2r(mm1, mm0);
+
+             pxor_r2r(mm7, mm7);
+             movq_r2r(mm0, mm5);
+             punpckhbw_r2r(mm7, mm0);
+             punpcklbw_r2r(mm5, mm7);
+             por_r2r(mm7, mm0);
+
+             movq_m2r(CONST_FF, mm1);
+             movq_r2r(mm0, mm5);
+             movq_r2r(mm3, mm6);
+             movq_r2r(mm2, mm7);
+             punpckhbw_r2r(mm3, mm2);
+             punpcklbw_r2r(mm6, mm7);
+             punpckhbw_r2r(mm1, mm0);
+             punpcklbw_r2r(mm1, mm5);
+
+             movq_r2r(mm7, mm1);
+             punpckhwd_r2r(mm5, mm7);
+             punpcklwd_r2r(mm5, mm1);
+
+             movq_r2r(mm2, mm4);
+             punpckhwd_r2r(mm0, mm2);
+             punpcklwd_r2r(mm0, mm4);
+
+             movntq_r2m(mm1, *(dp1));
+             movntq_r2m(mm7, *(dp1 + 8));
+             movntq_r2m(mm4, *(dp1 + 16));
+             movntq_r2m(mm2, *(dp1 + 24));
+
+             yp1 += 8;
+             up += 4;
+             vp += 4;
+             dp1 += 8 * 4;
+          }
+        /* clean up any trailing pixels when w isn't a multiple of 8 */
+        if (xx < w)
+          {
+             int y, u, v, r, g, b;
+
+             for (; xx < w; xx += 2)
+               {
+                  u = (*up++) - 128;
+                  v = (*vp++) - 128;
+
+                  y = RZ(YMUL) * ((*yp1++) - 16);
+                  r = LUT_CLIP((y + (_crv * v)) >> RES);
+                  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
+                  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
+                  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
+
+                  dp1 += 4;
+
+                  y = RZ(YMUL) * ((*yp1++) - 16);
+                  r = LUT_CLIP((y + (_crv * v)) >> RES);
+                  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
+                  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
+                  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
+
+                  dp1 += 4;
+               }
+          }
+     }
+   emms();
+#else
+   _evas_yv12torgb_mmx(yuv, rgb, w, h);
+#endif
+}
+
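+/* The MMX variant below is identical to the SSE one above, except that it  */
+/* stores with plain movq rather than the non-temporal movntq (which is     */
+/* why the dispatcher gates the SSE path on MMX2), and its fallback when    */
+/* MMX isn't built is the C raster code rather than another MMX path.       */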
+static void
+_evas_yv12torgb_mmx(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef BUILD_MMX
+   int xx, yy;
+   register unsigned char *yp1, *up, *vp;
+   unsigned char *dp1;
+
+   /* destination pointers */
+   dp1 = rgb;
+
+   for (yy = 0; yy < h; yy++)
+     {
+        /* plane pointers */
+        yp1 = yuv[yy];
+        up = yuv[h + (yy / 2)];
+        vp = yuv[h + (h / 2) + (yy / 2)];
+        for (xx = 0; xx < (w - 7); xx += 8)
+          {
+             movd_m2r(*up, mm3);
+             movd_m2r(*vp, mm2);
+             movq_m2r(*yp1, mm0);
+
+             pxor_r2r(mm7, mm7);
+             punpcklbw_r2r(mm7, mm2);
+             punpcklbw_r2r(mm7, mm3);
+
+             movq_r2r(mm0, mm1);
+             psrlw_i2r(8, mm0);
+             psllw_i2r(8, mm1);
+             psrlw_i2r(8, mm1);
+
+             movq_m2r(CONST_16, mm4);
+             psubsw_r2r(mm4, mm0);
+             psubsw_r2r(mm4, mm1);
+
+             movq_m2r(CONST_128, mm5);
+             psubsw_r2r(mm5, mm2);
+             psubsw_r2r(mm5, mm3);
+
+             movq_m2r(CONST_YMUL, mm4);
+             pmullw_r2r(mm4, mm0);
+             pmullw_r2r(mm4, mm1);
+
+             movq_m2r(CONST_CRVCRV, mm7);
+             pmullw_r2r(mm3, mm7);
+             movq_m2r(CONST_CBUCBU, mm6);
+             pmullw_r2r(mm2, mm6);
+             movq_m2r(CONST_CGUCGU, mm5);
+             pmullw_r2r(mm2, mm5);
+             movq_m2r(CONST_CGVCGV, mm4);
+             pmullw_r2r(mm3, mm4);
+
+             movq_r2r(mm0, mm2);
+             paddsw_r2r(mm7, mm2);
+             paddsw_r2r(mm1, mm7);
+
+             psraw_i2r(RES, mm2);
+             psraw_i2r(RES, mm7);
+             packuswb_r2r(mm7, mm2);
+
+             pxor_r2r(mm7, mm7);
+             movq_r2r(mm2, mm3);
+             punpckhbw_r2r(mm7, mm2);
+             punpcklbw_r2r(mm3, mm7);
+             por_r2r(mm7, mm2);
+
+             movq_r2r(mm0, mm3);
+             psubsw_r2r(mm5, mm3);
+             psubsw_r2r(mm4, mm3);
+             paddsw_m2r(CONST_32, mm3);
+
+             movq_r2r(mm1, mm7);
+             psubsw_r2r(mm5, mm7);
+             psubsw_r2r(mm4, mm7);
+             paddsw_m2r(CONST_32, mm7);
+
+             psraw_i2r(RES, mm3);
+             psraw_i2r(RES, mm7);
+             packuswb_r2r(mm7, mm3);
+
+             pxor_r2r(mm7, mm7);
+             movq_r2r(mm3, mm4);
+             punpckhbw_r2r(mm7, mm3);
+             punpcklbw_r2r(mm4, mm7);
+             por_r2r(mm7, mm3);
+
+             movq_m2r(CONST_32, mm4);
+             paddsw_r2r(mm6, mm0);
+             paddsw_r2r(mm6, mm1);
+             paddsw_r2r(mm4, mm0);
+             paddsw_r2r(mm4, mm1);
+             psraw_i2r(RES, mm0);
+             psraw_i2r(RES, mm1);
+             packuswb_r2r(mm1, mm0);
+
+             pxor_r2r(mm7, mm7);
+             movq_r2r(mm0, mm5);
+             punpckhbw_r2r(mm7, mm0);
+             punpcklbw_r2r(mm5, mm7);
+             por_r2r(mm7, mm0);
+
+             movq_m2r(CONST_FF, mm1);
+             movq_r2r(mm0, mm5);
+             movq_r2r(mm3, mm6);
+             movq_r2r(mm2, mm7);
+             punpckhbw_r2r(mm3, mm2);
+             punpcklbw_r2r(mm6, mm7);
+             punpckhbw_r2r(mm1, mm0);
+             punpcklbw_r2r(mm1, mm5);
+
+             movq_r2r(mm7, mm1);
+             punpckhwd_r2r(mm5, mm7);
+             punpcklwd_r2r(mm5, mm1);
+
+             movq_r2r(mm2, mm4);
+             punpckhwd_r2r(mm0, mm2);
+             punpcklwd_r2r(mm0, mm4);
+
+             movq_r2m(mm1, *(dp1));
+             movq_r2m(mm7, *(dp1 + 8));
+             movq_r2m(mm4, *(dp1 + 16));
+             movq_r2m(mm2, *(dp1 + 24));
+
+             yp1 += 8;
+             up += 4;
+             vp += 4;
+             dp1 += 8 * 4;
+          }
+        /* clean up any trailing pixels when w isn't a multiple of 8 */
+        if (xx < w)
+          {
+             int y, u, v, r, g, b;
+
+             for (; xx < w; xx += 2)
+               {
+                  u = (*up++) - 128;
+                  v = (*vp++) - 128;
+
+                  y = RZ(YMUL) * ((*yp1++) - 16);
+                  r = LUT_CLIP((y + (_crv * v)) >> RES);
+                  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
+                  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
+                  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
+
+                  dp1 += 4;
+
+                  y = RZ(YMUL) * ((*yp1++) - 16);
+                  r = LUT_CLIP((y + (_crv * v)) >> RES);
+                  g = LUT_CLIP((y - (_cgu * u) - (_cgv * v) + RZ(OFF)) >> RES);
+                  b = LUT_CLIP((y + (_cbu * u) + RZ(OFF)) >> RES);
+                  *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
+
+                  dp1 += 4;
+               }
+          }
+     }
+   emms();
+#else
+   _evas_yv12torgb_raster(yuv, rgb, w, h);
+#endif
+}
+
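+/* NB: unlike the paths above, the AltiVec routine below still appears to   */
+/* address yuv as one contiguous YV12 buffer rather than as an array of     */
+/* row pointers (note the "yp1 = yuv" and "up = yuv + (w * h)" arithmetic); */
+/* it is only reachable when BUILD_ALTIVEC and __VEC__ are defined.         */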
+#ifdef BUILD_ALTIVEC
+static void
+_evas_yv12torgb_altivec(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef __VEC__
+   int xx, yy;
+   int w2, h2;
+   unsigned char *yp1, *yp2, *up, *vp;
+   unsigned char *dp1, *dp2;
+   vector signed short y, u, v;
+   vector signed short r, g, b;
+   vector signed short tmp1, tmp2, tmp3;
+   vector unsigned char yperm, uperm, vperm, rgb1, rgb2;
+   vector unsigned char alpha;
+
+   /* handy halved w & h */
+   w2 = w / 2;
+   h2 = h / 2;
+   /* plane pointers */
+   yp1 = yuv;
+   yp2 = yuv + w;
+   up = yuv + (w * h);
+   vp = up + (w2 * h2);
+   /* destination pointers */
+   dp1 = rgb;
+   dp2 = rgb + (w * 4);
+
+   alpha = vec_mergeh((vector unsigned char)AVV(255), zero);
+   alpha = (vector unsigned char)vec_mergeh((vector unsigned short)alpha,
+                                            (vector unsigned short)zero);
+
+   for (yy = 0; yy < h2; yy++)
+     {
+        for (xx = 0; xx < w2; xx += 4)
+          {
+/* Cycles */
+             /*
+              * Load 4 y and 4 u & v pixels for the 8x2 pixel block.
+              */
+/* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
+/* 3 */      tmp1 = (vector signed short)vec_lde(0, (unsigned int *)up);
+/* 3 */      tmp2 = (vector signed short)vec_lde(0, (unsigned int *)vp);
+
+             /* Prepare for aligning the data in their vectors */
+/* 3 */      yperm = vec_lvsl(0, yp1);
+/* 3 */      uperm = vec_lvsl(0, up);
+/* 3 */      vperm = vec_lvsl(0, vp);
+             yp1 += 4;
+
+             /* Save y and load the next 4 y pixels for a total of 8 */
+/* 2 */      y = vec_perm(tmp3, tmp3, yperm);
+/* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp1);
+
+             /* Setup and calculate the 4 u pixels */
+/* 2 */      tmp1 = vec_perm(tmp1, tmp1, uperm);
+/* 2 */      tmp2 = vec_perm(tmp2, tmp2, vperm);
+
+             /* Avoid dependency stalls on yperm and calculate the 4 u values */
+/* 3 */      yperm = vec_lvsr(12, yp1);
+/* 1 */      tmp1 = (vector signed short)vec_mergeh((vector unsigned char)tmp1,
+                                                    (vector unsigned char)tmp1);
+/* 1 */      u = (vector signed short)vec_mergeh(zero,
+                                                 (vector unsigned char)tmp1);
+
+/* 1 */      u = vec_sub(u, c128);
+/* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
+
+             /* Setup and calculate the 4 v values */
+/* 1 */      tmp2 = (vector signed short)vec_mergeh((vector unsigned char)tmp2,
+                                                    (vector unsigned char)tmp2);
+/* 1 */      v = (vector signed short)vec_mergeh(zero,
+                                                 (vector unsigned char)tmp2);
+/* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
+/* 1 */      v = vec_sub(v, c128);
+
+             /* Move the data into y and start loading the next 4 pixels */
+/* 1 */      y = (vector signed short)vec_mergeh(zero,
+                                                 (vector unsigned char)y);
+/* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
+                                                    (vector unsigned char)tmp3);
+/* 1 */      y = vec_or(y, tmp3);
+
+             /* Finish calculating y */
+/* 1 */      y = vec_sub(y, c16);
+/* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
+
+             /* Perform non-dependent multiplies first. */
+/* 4 */      tmp1 = vec_mladd(crv, v, y);
+/* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
+/* 4 */      tmp3 = vec_mladd(cbu, u, y);
+
+             /* Calculate rgb values */
+/* 1 */      r = vec_sra(tmp1, res);
+
+/* 1 */      tmp2 = vec_sub(y, tmp2);
+/* 1 */      tmp2 = vec_add(tmp2, c32);
+/* 1 */      g = vec_sra(tmp2, res);
+
+/* 1 */      tmp3 = vec_add(tmp3, c32);
+/* 1 */      b = vec_sra(tmp3, res);
+
+             /* Bound to 0 <= x <= 255 */
+/* 1 */      r = vec_min(r, maxchar);
+/* 1 */      g = vec_min(g, maxchar);
+/* 1 */      b = vec_min(b, maxchar);
+/* 1 */      r = vec_max(r, (vector signed short)zero);
+/* 1 */      g = vec_max(g, (vector signed short)zero);
+/* 1 */      b = vec_max(b, (vector signed short)zero);
+
+             /* Combine r, g and b. */
+/* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
+                             pickrg1);
+/* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
+                             pickrg2);
+
+/* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
+/* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
+
+/* 1 */      rgb1 = vec_or(alpha, rgb1);
+/* 1 */      rgb2 = vec_or(alpha, rgb2);
+
+/* 3 */      vec_stl(rgb1, 0, dp1);
+             dp1 += 16;
+/* 3 */      vec_stl(rgb2, 0, dp1);
+
+             /*
+              * Begin the second row calculations
+              */
+
+             /*
+              * Load 4 y pixels for the 8x2 pixel block.
+              */
+/* 3 */      yperm = vec_lvsl(0, yp2);
+/* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
+             yp2 += 4;
+
+             /* Save y and load the next 4 y pixels for a total of 8 */
+/* 2 */      y = vec_perm(tmp3, tmp3, yperm);
+/* 3 */      yperm = vec_lvsr(12, yp2);
+/* 3 */      tmp3 = (vector signed short)vec_lde(0, (unsigned int *)yp2);
+/* 1 */      y = (vector signed short)vec_mergeh(zero,
+                                                 (vector unsigned char)y);
+
+             /* Avoid dependency stalls on yperm */
+/* 2 */      tmp3 = vec_perm(tmp3, tmp3, yperm);
+/* 1 */      tmp3 = (vector signed short)vec_mergeh(zero,
+                                                    (vector unsigned char)tmp3);
+/* 1 */      y = vec_or(y, tmp3);
+
+             /* Start the calculation for g */
+/* 4 */      tmp2 = vec_mladd(cgu, u, (vector signed short)zero);
+
+             /* Finish calculating y */
+/* 1 */      y = vec_sub(y, c16);
+/* 4 */      y = vec_mladd(ymul, y, (vector signed short)zero);
+
+             /* Perform non-dependent multiplies first. */
+/* 4 */      tmp2 = vec_mladd(cgv, v, tmp2);
+/* 4 */      tmp1 = vec_mladd(crv, v, y);
+/* 4 */      tmp3 = vec_mladd(cbu, u, y);
+
+             /* Calculate rgb values */
+/* 1 */      r = vec_sra(tmp1, res);
+
+/* 1 */      tmp2 = vec_sub(y, tmp2);
+/* 1 */      tmp2 = vec_add(tmp2, c32);
+/* 1 */      g = vec_sra(tmp2, res);
+
+/* 1 */      tmp3 = vec_add(tmp3, c32);
+/* 1 */      b = vec_sra(tmp3, res);
+
+             /* Bound to 0 <= x <= 255 */
+/* 1 */      r = vec_min(r, maxchar);
+/* 1 */      g = vec_min(g, maxchar);
+/* 1 */      b = vec_min(b, maxchar);
+/* 1 */      r = vec_max(r, (vector signed short)zero);
+/* 1 */      g = vec_max(g, (vector signed short)zero);
+/* 1 */      b = vec_max(b, (vector signed short)zero);
+
+             /* Combine r, g and b. */
+/* 2 */      rgb1 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
+                             pickrg1);
+/* 2 */      rgb2 = vec_perm((vector unsigned char)r, (vector unsigned char)g,
+                             pickrg2);
+
+/* 2 */      rgb1 = vec_perm(rgb1, (vector unsigned char)b, pickrgb1);
+/* 2 */      rgb2 = vec_perm(rgb2, (vector unsigned char)b, pickrgb2);
+
+/* 1 */      rgb1 = vec_or(alpha, rgb1);
+/* 1 */      rgb2 = vec_or(alpha, rgb2);
+
+/* 3 */      vec_stl(rgb1, 0, dp2);
+             dp2 += 16;
+/* 3 */      vec_stl(rgb2, 0, dp2);
+
+             /* Increment the YUV data pointers to the next set of pixels. */
+             yp1 += 4;
+             yp2 += 4;
+             up += 4;
+             vp += 4;
+
+             /* Move the destination pointers to the next set of pixels. */
+             dp1 += 16;
+             dp2 += 16;
+          }
+
+        /* jump down one line since we are doing 2 at once */
+        yp1 += w;
+        yp2 += w;
+        dp1 += (w * 4);
+        dp2 += (w * 4);
+     }
+#else
+   _evas_yv12torgb_diz(yuv, rgb, w, h);
+#endif
+}
+#endif
+
+static void
+_evas_yuv_init(void)
+{
+#ifdef BUILD_C
+   int i;
+
+   for (i = 0; i < 256; i++)
+     {
+        _v1164[i] = (int)(((float)(i - 16 )) * 1.164);
+
+        _v1596[i] = (int)(((float)(i - 128)) * 1.596);
+        _v813[i]  = (int)(((float)(i - 128)) * 0.813);
+
+        _v391[i]  = (int)(((float)(i - 128)) * 0.391);
+        _v2018[i] = (int)(((float)(i - 128)) * 2.018);
+     }
+
+   for (i = -384; i < 640; i++)
+     {
+        _clip_lut[i+384] = i < 0 ? 0 : (i > 255) ? 255 : i;
+     }
+#endif
+}
+
+#ifdef BUILD_ALTIVEC
+static void
+_evas_yv12torgb_diz(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef BUILD_C
+   int xx, yy;
+   int y, u, v, r, g, b;
+   unsigned char *yp1, *yp2, *up, *vp;
+   unsigned char *dp1, *dp2;
+   int crv, cbu, cgu, cgv;
+
+   /* destination pointers */
+   dp1 = rgb;
+   dp2 = rgb + (w * 4);
+
+   crv = CRV; /* 1.596 */
+   cbu = CBU; /* 2.018 */
+   cgu = CGU; /* 0.391 */
+   cgv = CGV; /* 0.813 */
+
+   for (yy = 0; yy < h; yy += 2)
+     {
+        /* plane pointers */
+        yp1 = yuv[yy];
+        yp2 = yuv[yy + 1];
+        up = yuv[h + (yy / 2)];
+        vp = yuv[h + (h / 2) + (yy / 2)];
+        for (xx = 0; xx < w; xx += 2)
+          {
+             /* collect u & v for the 2x2 pixel block */
+             u = (*up++) - 128;
+             v = (*vp++) - 128;
+
+             /* do the top 2 pixels of the 2x2 block, which share u & v */
+             /* yuv to rgb */
+             y = YMUL * ((*yp1++) - 16);
+             r = LUT_CLIP((y + (crv * v)) >> 16);
+             g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >> 16);
+             b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
+             *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
+
+             dp1 += 4;
+
+             /* yuv to rgb */
+             y = YMUL * ((*yp1++) - 16);
+             r = LUT_CLIP((y + (crv * v)) >> 16);
+             g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >> 16);
+             b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
+             *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(r,g,b);
+
+             dp1 += 4;
+
+             /* do the bottom 2 pixels */
+             /* yuv to rgb */
+             y = YMUL * ((*yp2++) - 16);
+             r = LUT_CLIP((y + (crv * v)) >> 16);
+             g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >> 16);
+             b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
+             *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
+
+             dp2 += 4;
+
+             /* yuv to rgb */
+             y = YMUL * ((*yp2++) - 16);
+             r = LUT_CLIP((y + (crv * v)) >> 16);
+             g = LUT_CLIP((y - (cgu * u) - (cgv * v) + OFF) >> 16);
+             b = LUT_CLIP((y + (cbu * u) + OFF) >> 16);
+             *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(r,g,b);
+
+             dp2 += 4;
+          }
+        /* jump down one line since we are doing 2 at once */
+        dp1 += (w * 4);
+        dp2 += (w * 4);
+     }
+#endif
+}
+#endif
+
+static void
+_evas_yv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef BUILD_C
+   int xx, yy;
+   int y, u, v;
+   unsigned char *yp1, *yp2, *up, *vp;
+   unsigned char *dp1, *dp2;
+
+   /* destination pointers */
+   dp1 = rgb;
+   dp2 = rgb + (w * 4);
+
+   for (yy = 0; yy < h; yy += 2)
+     {
+        /* plane pointers */
+        yp1 = yuv[yy];
+        yp2 = yuv[yy + 1];
+        up = yuv[h + (yy / 2)];
+        vp = yuv[h + (h / 2) + (yy / 2)];
+        for (xx = 0; xx < w; xx += 2)
+          {
+             int vmu;
+
+             /* collect u & v for the 2x2 pixel block */
+             u = *up++;
+             v = *vp++;
+
+             /* save lookups */
+             vmu = _v813[v] + _v391[u];
+             u = _v2018[u];
+             v = _v1596[v];
+
+             /* do the top 2 pixels of the 2x2 block, which share u & v */
+             /* yuv to rgb */
+             y = _v1164[*yp1++];
+             *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+
+             dp1 += 4;
+
+             /* yuv to rgb */
+             y = _v1164[*yp1++];
+             *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+
+             dp1 += 4;
+
+             /* do the bottom 2 pixels */
+             /* yuv to rgb */
+             y = _v1164[*yp2++];
+             *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+
+             dp2 += 4;
+
+             /* yuv to rgb */
+             y = _v1164[*yp2++];
+             *((DATA32 *) dp2) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+
+             dp2 += 4;
+          }
+        /* jump down one line since we are doing 2 at once */
+        dp1 += (w * 4);
+        dp2 += (w * 4);
+     }
+#endif
+}
+
+void
+evas_common_convert_yuv_422_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
+{
+#ifdef BUILD_C
+   if (!initted) _evas_yuv_init();
+   initted = 1;
+   _evas_yuy2torgb_raster(src, dst, w, h);
+#endif
+}
+
+void
+evas_common_convert_yuv_420_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
+{
+#ifdef BUILD_C
+   if (!initted) _evas_yuv_init();
+   initted = 1;
+   _evas_nv12torgb_raster(src, dst, w, h);
+#endif
+}
+
+void
+evas_common_convert_yuv_420T_601_rgba(DATA8 **src, DATA8 *dst, int w, int h)
+{
+#ifdef BUILD_C
+   if (!initted) _evas_yuv_init();
+   initted = 1;
+   _evas_nv12tiledtorgb_raster(src, dst, w, h);
+#endif
+}
+
+static void
+_evas_yuy2torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef BUILD_C
+   int xx, yy;
+   int y, u, v;
+   unsigned char *yp1, *yp2, *up, *vp;
+   unsigned char *dp1;
+
+   /* destination pointers */
+   dp1 = rgb;
+
+   for (yy = 0; yy < h; yy++)
+     {
+        /* plane pointers */
+        unsigned char *line;
+
+        line = yuv[yy];
+        yp1 = line + 0;
+        up = line + 1;
+        yp2 = line + 2;
+        vp = line + 3;
+
+        for (xx = 0; xx < w; xx += 2)
+          {
+             int vmu;
+
+             /* collect u & v for the 2-pixel block */
+             u = *up;
+             v = *vp;
+
+             /* save lookups */
+             vmu = _v813[v] + _v391[u];
+             u = _v2018[u];
+             v = _v1596[v];
+
+             /* do the 2 pixels that share u & v */
+             /* yuv to rgb */
+             y = _v1164[*yp1];
+             *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+
+             dp1 += 4;
+
+             /* yuv to rgb */
+             y = _v1164[*yp2];
+             *((DATA32 *) dp1) = 0xff000000 + RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+
+             dp1 += 4;
+
+             yp1 += 4; yp2 += 4; up += 4; vp += 4;
+          }
+     }
+#endif
+}
+
+#ifdef BUILD_C
+static inline void
+_evas_yuv2rgb_420_raster(unsigned char *yp1, unsigned char *yp2, unsigned char *up, unsigned char *vp,
+                         unsigned char *dp1, unsigned char *dp2)
+{
+   int y, u, v;
+   int vmu;
+   int rgb;
+
+   /* collect u & v, shared by the 4 pixels of the 2x2 block */
+   u = *up;
+   v = *vp;
+
+   /* save lookups */
+#ifdef MEM_BP
+   vmu = _v813[v] + _v391[u];
+   u = _v2018[u];
+   v = _v1596[v];
+#else
+   u -= 128;
+   v -= 128;
+   vmu = v * CGV + u * CGU;
+   u = u * CBU;
+   v = v * CRV;
+#endif
+
+   /* do the top 2 pixels of the 2x2 block, which share u & v */
+   /* yuv to rgb */
+#ifdef MEM_BP
+   y = _v1164[*yp1];
+   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+#else
+   y = (*yp1 - 16) * YMUL;
+   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
+                  LUT_CLIP(((y - vmu + OFF) >> 16)),
+                  LUT_CLIP(((y + u + OFF) >> 16)));
+#endif
+   *((DATA32 *) dp1) = 0xff000000 + rgb;
+
+   dp1 += 4; yp1++;
+
+   /* yuv to rgb */
+#ifdef MEM_BP
+   y = _v1164[*yp1];
+   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+#else
+   y = (*yp1 - 16) * YMUL;
+   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
+                  LUT_CLIP(((y - vmu + OFF) >> 16)),
+                  LUT_CLIP(((y + u + OFF) >> 16)));
+#endif
+   *((DATA32 *) dp1) = 0xff000000 + rgb;
+
+   /* do the bottom 2 pixels of the 2x2 block, which share u & v */
+   /* yuv to rgb */
+#ifdef MEM_BP
+   y = _v1164[*yp2];
+   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+#else
+   y = (*yp2 - 16) * YMUL;
+   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
+                  LUT_CLIP(((y - vmu + OFF) >> 16)),
+                  LUT_CLIP(((y + u + OFF) >> 16)));
+#endif
+   *((DATA32 *) dp2) = 0xff000000 + rgb;
+
+   dp2 += 4; yp2++;
+
+   /* yuv to rgb */
+#ifdef MEM_BP
+   y = _v1164[*yp2];
+   rgb = RGB_JOIN(LUT_CLIP(y + v), LUT_CLIP(y - vmu), LUT_CLIP(y + u));
+#else
+   y = (*yp2 - 16) * YMUL;
+   rgb = RGB_JOIN(LUT_CLIP(((y + v) >> 16)),
+                  LUT_CLIP(((y - vmu + OFF) >> 16)),
+                  LUT_CLIP(((y + u + OFF) >> 16)));
+#endif
+   *((DATA32 *) dp2) = 0xff000000 + rgb;
+}
+#endif
+
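+/* NB: _evas_yuv2rgb_420_raster above selects its math at compile time:     */
+/* with MEM_BP defined it goes through the small lookup tables built by     */
+/* _evas_yuv_init(), otherwise it multiplies directly in 16.16 fixed point. */
+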
+static void
+_evas_nv12tiledtorgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef BUILD_C
+
+#define HANDLE_MACROBLOCK(YP1, YP2, UP, VP, DP1, DP2)                      \
+   {                                                                       \
+      int i;                                                               \
+      int j;                                                               \
+                                                                           \
+      for (i = 0; i < 32; i += 2)                                          \
+        {                                                                  \
+           for (j = 0; j < 64; j += 2)                                     \
+             {                                                             \
+                _evas_yuv2rgb_420_raster(YP1, YP2, UP, VP, DP1, DP2);      \
+                                                                           \
+                /* the previous call just rendered 2 pixels per line */    \
+                DP1 += 8; DP2 += 8;                                        \
+                                                                           \
+                /* and consumed 2 Y from each of the 2 lines, plus one shared U and V - don't forget U & V live in the same plane */ \
+                YP1 += 2; YP2 += 2; UP += 2; VP += 2;                      \
+             }                                                             \
+                                                                           \
+           DP1 += sizeof (int) * ((w << 1) - 64);                          \
+           DP2 += sizeof (int) * ((w << 1) - 64);                          \
+           YP1 += 64;                                                      \
+           YP2 += 64;                                                      \
+        }                                                                  \
+   }
+
+   /* One macroblock is 32 lines of Y and 16 lines of UV */
+   const int offset_value[2] = { 0, 64 * 16 };
+   int mb_x, mb_y, mb_w, mb_h;
+   int base_h;
+   int uv_x, uv_y, uv_step;
+   int stride;
+
+   /* Idea: iterate over each macroblock and convert each of them like _evas_nv12torgb_raster does */
+
+   /* The layout of the Y macroblock order in RGB non-tiled space: */
+   /*  --------------------------------------------------- */
+   /* | 0  | 1  | 6  | 7  | 8  | 9  | 14 | 15 | 16 | 17 |  */
+   /*  --------------------------------------------------- */
+   /* | 2  | 3  | 4  | 5  | 10 | 11 | 12 | 13 | 18 | 19 |  */
+   /*  --------------------------------------------------- */
+   /* | 20 | 21 | 26 | 27 | 28 | 29 | 34 | 35 | 36 | 37 |  */
+   /*  --------------------------------------------------- */
+   /* | 22 | 23 | 24 | 25 | 30 | 31 | 32 | 33 | 38 | 39 |  */
+   /*  --------------------------------------------------- */
+   /* | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 |  */
+   /*  --------------------------------------------------- */
+   /* The layout of the UV macroblock order in the same RGB non-tiled space: */
+   /*  --------------------------------------------------- */
+   /* |    |    |    |    |    |    |    |    |    |    |  */
+   /* - 0  - 1  - 6  - 7  - 8  - 9  - 14 - 15 - 16 - 17 -  */
+   /* |    |    |    |    |    |    |    |    |    |    |  */
+   /*  --------------------------------------------------- */
+   /* |    |    |    |    |    |    |    |    |    |    |  */
+   /* - 2  - 3  - 4  - 5  - 10 - 11 - 12 - 13 - 18 - 19 -  */
+   /* |    |    |    |    |    |    |    |    |    |    |  */
+   /*  --------------------------------------------------- */
+   /* |    |    |    |    |    |    |    |    |    |    |  */
+   /* - 20 - 21 - 22 - 22 - 23 - 24 - 25 - 26 - 27 - 28 -  */
+
+   /* the image dimensions are expected to be a multiple of the 64x32 macroblock size */
+   mb_w = w / 64;
+   mb_h = h / 32;
+
+   base_h = (mb_h >> 1) + (mb_h & 0x1);
+   stride = w * sizeof (int);
+
+   uv_x = 0; uv_y = 0;
+
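+   /* Worked example of the pattern (illustrative): with mb_w = 10, the     */
+   /* loop below emits linear macroblocks 0..7 at (row, column):            */
+   /*   0->(0,0)  1->(0,1)  2->(1,0)  3->(1,1)                              */
+   /*   4->(1,2)  5->(1,3)  6->(0,2)  7->(0,3)                              */
+   /* i.e. a Z over two rows followed by its mirror image, matching the     */
+   /* Y layout table above.                                                 */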
+   /* In this format, macroblocks are linearized over two rows, forming a Z and then its inverse */
+   for (mb_y = 0; mb_y < (mb_h >> 1); mb_y++)
+     {
+        int step = 2;
+        int offset = 0;
+        int x = 0;
+        int rmb_x = 0;
+        int ry[2];
+
+        ry[0] = mb_y * 2 * 32 * stride;
+        ry[1] = ry[0] + 32 * stride;
+
+        uv_step = (mb_y & 0x1) == 0 ? 4 : 0;
+        uv_x = (mb_y & 0x1) == 0 ? 0 : 2 * 64 * 32;
+
+        for (mb_x = 0; mb_x < mb_w * 2; mb_x++, rmb_x += 64 * 32)
+          {
+             unsigned char *yp1, *yp2, *up, *vp;
+             unsigned char *dp1, *dp2;
+
+             dp1 = rgb + x + ry[offset];
+             dp2 = dp1 + stride;
+
+             yp1 = yuv[mb_y] + rmb_x;
+             yp2 = yp1 + 64;
+
+             /* the UV plane has half as many pixels, but each of its samples holds two bytes (U, then V) */
+             up = yuv[(mb_y >> 1) + base_h] + uv_x + offset_value[offset];
+             vp = up + 1;
+
+             HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);
+
+             step++;
+             if ((step & 0x3) == 0)
+               {
+                  offset = 1 - offset;
+                  x -= 64 * sizeof (int);
+                  uv_x -= 64 * 32;
+               }
+             else
+               {
+                  x += 64 * sizeof (int);
+                  uv_x += 64 * 32;
+               }
+
+             uv_step++;
+             if (uv_step == 8)
+               {
+                  uv_step = 0;
+                  uv_x += 4 * 64 * 32;
+               }
+          }
+     }
+
+   if (mb_h & 0x1)
+     {
+        int x = 0;
+        int ry;
+
+        ry = mb_y << 1;
+
+        uv_step = 0;
+        uv_x = 0;
+
+        for (mb_x = 0; mb_x < mb_w; mb_x++, x++, uv_x++)
+          {
+             unsigned char *yp1, *yp2, *up, *vp;
+             unsigned char *dp1, *dp2;
+
+             dp1 = rgb + (x * 64 + (ry * 32 * w)) * sizeof (int);
+             dp2 = dp1 + sizeof (int) * w;
+
+             yp1 = yuv[mb_y] + mb_x * 64 * 32;
+             yp2 = yp1 + 64;
+
+             up = yuv[mb_y / 2 + base_h] + uv_x * 64 * 32;
+             vp = up + 1;
+
+             HANDLE_MACROBLOCK(yp1, yp2, up, vp, dp1, dp2);
+          }
+     }
+#endif
+}
+
+static void
+_evas_nv12torgb_raster(unsigned char **yuv, unsigned char *rgb, int w, int h)
+{
+#ifdef BUILD_C
+   int xx, yy;
+   unsigned char *yp1, *yp2, *up, *vp;
+   unsigned char *dp1;
+   unsigned char *dp2;
+
+   dp1 = rgb;
+   dp2 = dp1 + sizeof (int) * w;
+
+   for (yy = 0; yy < h; yy++)
+     {
+        yp1 = yuv[yy++];
+        yp2 = yuv[yy];
+
+        up = yuv[h + (yy >> 1)];
+        vp = up + 1;
+
+        for (xx = 0; xx < w; xx += 2)
+          {
+             _evas_yuv2rgb_420_raster(yp1, yp2, up, vp, dp1, dp2);
+
+             /* the previous call just rendered 2 pixels per line */
+             dp1 += 8; dp2 += 8;
+
+             /* and consumed 2 Y from each of the 2 lines, plus one shared U and V - don't forget U & V live in the same plane */
+             yp1 += 2; yp2 += 2; up += 2; vp += 2;
+          }
+
+        /* jump one line */
+        dp1 += sizeof (int) * w;
+        dp2 += sizeof (int) * w;
+        yp1 += w;
+        yp2 += w;
+     }
+#endif
+}
+
+#endif
--
cgit v1.1
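
As a usage sketch (an editor's addition, not part of the patch): the YV12 entry
point above takes an array of row pointers rather than raw planes - yuv[0..h-1]
must point at the Y rows, yuv[h..h+h/2-1] at the U rows, and yuv[h+h/2..h+h-1]
at the V rows, as the indexing in _evas_yv12torgb_raster shows. A hypothetical
caller with contiguous planes might adapt like this (convert_frame and the
plane arguments are illustrative names; the real prototype uses DATA8, evas's
byte type, in place of unsigned char):

    #include <stdlib.h>

    /* From the patch; DATA8 is a byte, so plain unsigned char is used here. */
    extern void evas_common_convert_yuv_420p_601_rgba(unsigned char **src,
                                                      unsigned char *dst,
                                                      int w, int h);

    /* Hypothetical helper: wraps contiguous Y/U/V planes (w x h, both even)
     * in the row-pointer layout the converter expects, then converts the
     * frame to premultiplied-alpha-style 0xffRRGGBB pixels in rgba_out. */
    int convert_frame(unsigned char *y_plane, unsigned char *u_plane,
                      unsigned char *v_plane, unsigned char *rgba_out,
                      int w, int h)
    {
       unsigned char **rows;
       int i;

       rows = malloc(2 * h * sizeof(*rows)); /* h Y rows + h/2 U + h/2 V */
       if (!rows) return -1;

       for (i = 0; i < h; i++)               /* rows[0 .. h-1]: Y */
         rows[i] = y_plane + (i * w);
       for (i = 0; i < h / 2; i++)           /* rows[h .. h+h/2-1]: U */
         rows[h + i] = u_plane + (i * (w / 2));
       for (i = 0; i < h / 2; i++)           /* rows[h+h/2 .. h+h-1]: V */
         rows[h + (h / 2) + i] = v_plane + (i * (w / 2));

       evas_common_convert_yuv_420p_601_rgba(rows, rgba_out, w, h);
       free(rows);
       return 0;
    }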