From 825a3d837a33f226c879cd02ad15c3fba57e8b2c Mon Sep 17 00:00:00 2001 From: David Walter Seikel Date: Mon, 23 Jan 2012 23:30:42 +1000 Subject: Update the EFL to what I'm actually using, coz I'm using some stuff not yet released. --- .../src/lib/engines/common/evas_convert_rgb_32.c | 347 +++++++++++++++------ 1 file changed, 256 insertions(+), 91 deletions(-) (limited to 'libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c') diff --git a/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c b/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c index 41dac6f..0401a4a 100644 --- a/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c +++ b/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c @@ -48,15 +48,167 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int #endif #endif +#ifdef TILE_ROTATE +#define FAST_SIMPLE_ROTATE(suffix, pix_type) \ + static void \ + blt_rotated_90_trivial_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (h - y - 1); \ + pix_type *d = dst + (dst_stride * y); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ + } \ + static void \ + blt_rotated_270_trivial_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (src_stride * (w - 1)) + y; \ + pix_type *d = dst + (dst_stride * y); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ + } \ + } \ + static void \ + blt_rotated_90_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \ + if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - \ + (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > w) \ + leading_pixels = w; \ + blt_rotated_90_trivial_##suffix(dst, \ + dst_stride, \ + src, \ + src_stride, \ + leading_pixels, \ + h); \ + dst += leading_pixels; \ + src += leading_pixels * src_stride; \ + w -= leading_pixels; \ + } \ + if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + w) & \ + (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > w) \ + trailing_pixels = w; \ + w -= trailing_pixels; \ + } \ + for (x = 0; x < w; x += TILE_SIZE) \ + { \ + blt_rotated_90_trivial_##suffix(dst + x, \ + dst_stride, \ + src + (src_stride * x), \ + src_stride, \ + TILE_SIZE, \ + h); \ + } \ + if (trailing_pixels) \ + blt_rotated_90_trivial_##suffix(dst + w, \ + dst_stride, \ + src + (w * src_stride), \ + src_stride, \ + trailing_pixels, \ + h); \ + } \ + static void \ + blt_rotated_270_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \ + if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - \ + (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > w) \ + leading_pixels = w; \ + blt_rotated_270_trivial_##suffix(dst, \ + dst_stride, \ + src + (src_stride * 
(w - leading_pixels)), \ + src_stride, \ + leading_pixels, \ + h); \ + dst += leading_pixels; \ + w -= leading_pixels; \ + } \ + if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + w) & \ + (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > w) \ + trailing_pixels = w; \ + w -= trailing_pixels; \ + src += trailing_pixels * src_stride; \ + } \ + for (x = 0; x < w; x += TILE_SIZE) \ + { \ + blt_rotated_270_trivial_##suffix(dst + x, \ + dst_stride, \ + src + (src_stride * (w - x - TILE_SIZE)), \ + src_stride, \ + TILE_SIZE, \ + h); \ + } \ + if (trailing_pixels) \ + blt_rotated_270_trivial_##suffix(dst + w, \ + dst_stride, \ + src - (trailing_pixels * src_stride), \ + src_stride, \ + trailing_pixels, \ + h); \ + } + +FAST_SIMPLE_ROTATE(8888, DATA8) +#endif + + #ifdef BUILD_CONVERT_32_RGB_8888 #ifdef BUILD_CONVERT_32_RGB_ROT270 void evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x __UNUSED__, int dith_y __UNUSED__, DATA8 *pal __UNUSED__) { +#ifdef TILE_ROTATE + blt_rotated_270_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; +#else DATA32 *src_ptr; DATA32 *dst_ptr; int x, y; - + dst_ptr = (DATA32 *)dst; CONVERT_LOOP_START_ROT_270(); @@ -64,6 +216,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int *dst_ptr = *src_ptr; CONVERT_LOOP_END_ROT_270(); +#endif return; } #endif @@ -74,106 +227,118 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int void evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x __UNUSED__, int dith_y __UNUSED__, DATA8 *pal __UNUSED__) { -#ifndef BUILD_NEON +# ifndef BUILD_NEON +# ifdef TILE_ROTATE + blt_rotated_90_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; +# else DATA32 *src_ptr; DATA32 *dst_ptr; int x, y; - + dst_ptr = (DATA32 *)dst; CONVERT_LOOP_START_ROT_90(); *dst_ptr = *src_ptr; CONVERT_LOOP_END_ROT_90(); -#else +# endif + +# else + +# ifdef TILE_ROTATE + blt_rotated_90_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; +# else if ((w & 1) || (h & 1)) { - /* Rarely (if ever) if ever: so slow path is fine */ - DATA32 *src_ptr; - DATA32 *dst_ptr; - int x, y; - - dst_ptr = (DATA32 *)dst; - CONVERT_LOOP_START_ROT_90(); - - *dst_ptr = *src_ptr; - - CONVERT_LOOP_END_ROT_90(); - } else { -#define AP "convert_rgba32_rot_90_" - asm volatile ( - ".fpu neon \n\t" - " mov %[s1], %[src] \n\t" - " add %[s1], %[h],lsl #2 \n\t" - " sub %[s1], #8 \n\t" - - " mov %[s2], %[src] \n\t" - " add %[s2], %[h], lsl #3 \n\t" - " add %[s2], %[sjmp], lsr #1 \n\t" - " sub %[s2], #8 \n\t" - - " mov %[d1], %[dst] \n\t" - - " add %[d2], %[d1], %[djmp] \n\t" - " add %[d2], %[w], lsl #2 \n\t" - - " mov %[sadv], %[h], lsl #3 \n\t" - " add %[sadv], %[sjmp], lsl #1 \n\t" - - " mov %[y], #0 \n\t" - " mov %[x], #0 \n\t" - AP"loop: \n\t" - " vld1.u32 d0, [%[s1]] \n\t" - " vld1.u32 d1, [%[s2]] \n\t" - " add %[x], #2 \n\t" - " add %[s1], %[sadv] \n\t" - " add %[s2], %[sadv] \n\t" - " vtrn.u32 d0, d1 \n\t" - " cmp %[x], %[w] \n\t" - " vst1.u32 d1, [%[d1]]! \n\t" - " vst1.u32 d0, [%[d2]]! 
\n\t" - " blt "AP"loop \n\t" - - " mov %[x], #0 \n\t" - " add %[d1], %[djmp] \n\t" - " add %[d1], %[w], lsl #2 \n\t" - " add %[d2], %[djmp] \n\t" - " add %[d2], %[w], lsl #2 \n\t" - - " mov %[s1], %[src] \n\t" - " add %[s1], %[h], lsl #2 \n\t" - " sub %[s1], %[y], lsl #2 \n\t" - " sub %[s1], #16 \n\t" - - " add %[s2], %[s1], %[h], lsl #2 \n\t" - " add %[s2], %[sjmp], lsl #2 \n\t" - - " add %[y], #2 \n\t" - - " cmp %[y], %[h] \n\t" - " blt "AP"loop \n\t" - - : // Out - : [s1] "r" (1), - [s2] "r" (11), - [d1] "r" (2), - [d2] "r" (12), - [src] "r" (src), - [dst] "r" (dst), - [x] "r" (3), - [y] "r" (4), - [w] "r" (w), - [h] "r" (h), - [sadv] "r" (5), - [sjmp] "r" (src_jump * 4), - [djmp] "r" (dst_jump * 4 * 2) - : "d0", "d1", "memory", "cc"// Clober - - - ); - } -#undef AP -#endif + /* Rarely (if ever) if ever: so slow path is fine */ + DATA32 *src_ptr; + DATA32 *dst_ptr; + int x, y; + + dst_ptr = (DATA32 *)dst; + CONVERT_LOOP_START_ROT_90(); + + *dst_ptr = *src_ptr; + + CONVERT_LOOP_END_ROT_90(); + } + else + { +# define AP "convert_rgba32_rot_90_" + asm volatile ( + ".fpu neon \n\t" + " mov %[s1], %[src] \n\t" + " add %[s1], %[s1], %[h],lsl #2 \n\t" + " sub %[s1], #8 \n\t" + + " mov %[s2], %[src] \n\t" + " add %[s2], %[s2], %[h], lsl #3 \n\t" + " add %[s2], %[s2], %[sjmp], lsr #1 \n\t" + " sub %[s2], #8 \n\t" + + " mov %[d1], %[dst] \n\t" + + " add %[d2], %[d1], %[djmp] \n\t" + " add %[d2], %[d2], %[w], lsl #2 \n\t" + + " mov %[sadv], %[h], lsl #3 \n\t" + " add %[sadv], %[sadv], %[sjmp], lsl #1\n\t" + + " mov %[y], #0 \n\t" + " mov %[x], #0 \n\t" + AP"loop: \n\t" + " vld1.u32 d0, [%[s1]] \n\t" + " vld1.u32 d1, [%[s2]] \n\t" + " add %[x], #2 \n\t" + " add %[s1], %[sadv] \n\t" + " add %[s2], %[sadv] \n\t" + " vtrn.u32 d0, d1 \n\t" + " cmp %[x], %[w] \n\t" + " vst1.u32 d1, [%[d1]]! \n\t" + " vst1.u32 d0, [%[d2]]! \n\t" + " blt "AP"loop \n\t" + + " mov %[x], #0 \n\t" + " add %[d1], %[djmp] \n\t" + " add %[d1], %[d1], %[w], lsl #2 \n\t" + " add %[d2], %[djmp] \n\t" + " add %[d2], %[d2], %[w], lsl #2 \n\t" + + " mov %[s1], %[src] \n\t" + " add %[s1], %[s1], %[h], lsl #2 \n\t" + " sub %[s1], %[s1], %[y], lsl #2 \n\t" + " sub %[s1], #16 \n\t" + + " add %[s2], %[s1], %[h], lsl #2 \n\t" + " add %[s2], %[s2], %[sjmp], lsl #2 \n\t" + + " add %[y], #2 \n\t" + + " cmp %[y], %[h] \n\t" + " blt "AP"loop \n\t" + + : // Out + : [s1] "r" (1), + [s2] "r" (11), + [d1] "r" (2), + [d2] "r" (12), + [src] "r" (src), + [dst] "r" (dst), + [x] "r" (3), + [y] "r" (4), + [w] "r" (w), + [h] "r" (h), + [sadv] "r" (5), + [sjmp] "r" (src_jump * 4), + [djmp] "r" (dst_jump * 4 * 2) + : "d0", "d1", "memory", "cc"// Clober + + + ); + } +# undef AP +# endif +# endif return; } #endif @@ -448,7 +613,7 @@ evas_common_convert_rgba_to_32bpp_rgb_666(DATA32 *src, DATA8 *dst, int src_jump, CONVERT_LOOP_START_ROT_0(); - *dst_ptr = + *dst_ptr = (((R_VAL(src_ptr) << 12) | (B_VAL(src_ptr) >> 2)) & 0x03f03f) | ((G_VAL(src_ptr) << 4) & 0x000fc0); -- cgit v1.1
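The heart of this change is the FAST_SIMPLE_ROTATE macro added above: instead of
rotating the whole image in one pass, it splits the destination into bands whose
width matches one cache line (TILE_CACHE_LINE_SIZE / sizeof(pix_type) pixels),
aligns the band edges to cache-line boundaries via the leading_pixels /
trailing_pixels handling, and rotates one band at a time with the *_trivial
helpers, so each band touches only a narrow strip of source rows that stays
cache-resident while the band is written. The converters then call
blt_rotated_90_8888 / blt_rotated_270_8888 when TILE_ROTATE is defined and keep
the original CONVERT_LOOP_* code as the fallback path.

The standalone program below is a minimal sketch of the same banding idea for a
clockwise 90-degree rotation of 32-bit pixels. It is not part of the patch: the
names rotate90_ref, rotate90_tiled and TILE are illustrative only, it assumes
64-byte cache lines and 4-byte pixels, and it omits the patch's alignment of the
band edges and the 270-degree variant.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Reference rotation: dst is sh pixels wide and sw pixels tall,
 * dst(y, x) = src(sh - 1 - x, y), i.e. a 90-degree clockwise turn. */
static void
rotate90_ref(const uint32_t *src, int sw, int sh, uint32_t *dst)
{
   int x, y;

   for (y = 0; y < sw; y++)
     for (x = 0; x < sh; x++)
       dst[(y * sh) + x] = src[((sh - 1 - x) * sw) + y];
}

/* Same result, but each destination row is emitted in bands of TILE pixels.
 * Within one band only TILE source rows are read, and they are revisited
 * column by column while still cached; this is the locality win that
 * blt_rotated_90_* above chases, minus the cache-line alignment of the
 * band edges. */
static void
rotate90_tiled(const uint32_t *src, int sw, int sh, uint32_t *dst)
{
   enum { TILE = 64 / sizeof(uint32_t) };   /* assumes 64-byte cache lines */
   int x0, x, y;

   for (x0 = 0; x0 < sh; x0 += TILE)
     {
        int xend = (x0 + TILE < sh) ? (x0 + TILE) : sh;

        for (y = 0; y < sw; y++)
          for (x = x0; x < xend; x++)
            dst[(y * sh) + x] = src[((sh - 1 - x) * sw) + y];
     }
}

int
main(void)
{
   const int sw = 123, sh = 77;   /* deliberately not multiples of TILE */
   uint32_t *src = malloc((size_t)sw * sh * sizeof(*src));
   uint32_t *a = malloc((size_t)sw * sh * sizeof(*a));
   uint32_t *b = malloc((size_t)sw * sh * sizeof(*b));
   int i;

   if (!src || !a || !b) return 1;
   for (i = 0; i < (sw * sh); i++) src[i] = (uint32_t)i;

   rotate90_ref(src, sw, sh, a);
   rotate90_tiled(src, sw, sh, b);

   printf("tiled rotation %s the reference\n",
          memcmp(a, b, (size_t)sw * sh * sizeof(*a)) == 0 ?
          "matches" : "DIFFERS from");

   free(src); free(a); free(b);
   return 0;
}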