From 825a3d837a33f226c879cd02ad15c3fba57e8b2c Mon Sep 17 00:00:00 2001 From: David Walter Seikel Date: Mon, 23 Jan 2012 23:30:42 +1000 Subject: Update the EFL to what I'm actually using, coz I'm using some stuff not yet released. --- .../src/lib/engines/common/evas_convert_rgb_32.c | 347 +++++++++++++++------ 1 file changed, 256 insertions(+), 91 deletions(-) (limited to 'libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c') diff --git a/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c b/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c index 41dac6f..0401a4a 100644 --- a/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c +++ b/libraries/evas/src/lib/engines/common/evas_convert_rgb_32.c @@ -48,15 +48,167 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_180 (DATA32 *src, DATA8 *dst, int #endif #endif +#ifdef TILE_ROTATE +#define FAST_SIMPLE_ROTATE(suffix, pix_type) \ + static void \ + blt_rotated_90_trivial_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (h - y - 1); \ + pix_type *d = dst + (dst_stride * y); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s += src_stride; \ + } \ + } \ + } \ + static void \ + blt_rotated_270_trivial_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, y; \ + for (y = 0; y < h; y++) \ + { \ + const pix_type *s = src + (src_stride * (w - 1)) + y; \ + pix_type *d = dst + (dst_stride * y); \ + for (x = 0; x < w; x++) \ + { \ + *d++ = *s; \ + s -= src_stride; \ + } \ + } \ + } \ + static void \ + blt_rotated_90_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \ + if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - \ + (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > w) \ + leading_pixels = w; \ + blt_rotated_90_trivial_##suffix(dst, \ + dst_stride, \ + src, \ + src_stride, \ + leading_pixels, \ + h); \ + dst += leading_pixels; \ + src += leading_pixels * src_stride; \ + w -= leading_pixels; \ + } \ + if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + w) & \ + (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > w) \ + trailing_pixels = w; \ + w -= trailing_pixels; \ + } \ + for (x = 0; x < w; x += TILE_SIZE) \ + { \ + blt_rotated_90_trivial_##suffix(dst + x, \ + dst_stride, \ + src + (src_stride * x), \ + src_stride, \ + TILE_SIZE, \ + h); \ + } \ + if (trailing_pixels) \ + blt_rotated_90_trivial_##suffix(dst + w, \ + dst_stride, \ + src + (w * src_stride), \ + src_stride, \ + trailing_pixels, \ + h); \ + } \ + static void \ + blt_rotated_270_##suffix(pix_type *dst, \ + int dst_stride, \ + const pix_type *src, \ + int src_stride, \ + int w, \ + int h) \ + { \ + int x, leading_pixels = 0, trailing_pixels = 0; \ + const int TILE_SIZE = TILE_CACHE_LINE_SIZE / sizeof(pix_type); \ + if ((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + leading_pixels = TILE_SIZE - \ + (((uintptr_t)dst & (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (leading_pixels > w) \ + leading_pixels = w; \ + blt_rotated_270_trivial_##suffix(dst, \ + dst_stride, \ + src + (src_stride * 
(w - leading_pixels)), \ + src_stride, \ + leading_pixels, \ + h); \ + dst += leading_pixels; \ + w -= leading_pixels; \ + } \ + if ((uintptr_t)(dst + w) & (TILE_CACHE_LINE_SIZE - 1)) \ + { \ + trailing_pixels = (((uintptr_t)(dst + w) & \ + (TILE_CACHE_LINE_SIZE - 1)) / sizeof(pix_type)); \ + if (trailing_pixels > w) \ + trailing_pixels = w; \ + w -= trailing_pixels; \ + src += trailing_pixels * src_stride; \ + } \ + for (x = 0; x < w; x += TILE_SIZE) \ + { \ + blt_rotated_270_trivial_##suffix(dst + x, \ + dst_stride, \ + src + (src_stride * (w - x - TILE_SIZE)), \ + src_stride, \ + TILE_SIZE, \ + h); \ + } \ + if (trailing_pixels) \ + blt_rotated_270_trivial_##suffix(dst + w, \ + dst_stride, \ + src - (trailing_pixels * src_stride), \ + src_stride, \ + trailing_pixels, \ + h); \ + } + +FAST_SIMPLE_ROTATE(8888, DATA8) +#endif + + #ifdef BUILD_CONVERT_32_RGB_8888 #ifdef BUILD_CONVERT_32_RGB_ROT270 void evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x __UNUSED__, int dith_y __UNUSED__, DATA8 *pal __UNUSED__) { +#ifdef TILE_ROTATE + blt_rotated_270_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; +#else DATA32 *src_ptr; DATA32 *dst_ptr; int x, y; - + dst_ptr = (DATA32 *)dst; CONVERT_LOOP_START_ROT_270(); @@ -64,6 +216,7 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int *dst_ptr = *src_ptr; CONVERT_LOOP_END_ROT_270(); +#endif return; } #endif @@ -74,106 +227,118 @@ evas_common_convert_rgba_to_32bpp_rgb_8888_rot_270 (DATA32 *src, DATA8 *dst, int void evas_common_convert_rgba_to_32bpp_rgb_8888_rot_90 (DATA32 *src, DATA8 *dst, int src_jump, int dst_jump, int w, int h, int dith_x __UNUSED__, int dith_y __UNUSED__, DATA8 *pal __UNUSED__) { -#ifndef BUILD_NEON +# ifndef BUILD_NEON +# ifdef TILE_ROTATE + blt_rotated_90_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; +# else DATA32 *src_ptr; DATA32 *dst_ptr; int x, y; - + dst_ptr = (DATA32 *)dst; CONVERT_LOOP_START_ROT_90(); *dst_ptr = *src_ptr; CONVERT_LOOP_END_ROT_90(); -#else +# endif + +# else + +# ifdef TILE_ROTATE + blt_rotated_90_8888((DATA8 *)dst, dst_jump+w, (const DATA8 *)src, src_jump+h, w, h) ; +# else if ((w & 1) || (h & 1)) { - /* Rarely (if ever) if ever: so slow path is fine */ - DATA32 *src_ptr; - DATA32 *dst_ptr; - int x, y; - - dst_ptr = (DATA32 *)dst; - CONVERT_LOOP_START_ROT_90(); - - *dst_ptr = *src_ptr; - - CONVERT_LOOP_END_ROT_90(); - } else { -#define AP "convert_rgba32_rot_90_" - asm volatile ( - ".fpu neon \n\t" - " mov %[s1], %[src] \n\t" - " add %[s1], %[h],lsl #2 \n\t" - " sub %[s1], #8 \n\t" - - " mov %[s2], %[src] \n\t" - " add %[s2], %[h], lsl #3 \n\t" - " add %[s2], %[sjmp], lsr #1 \n\t" - " sub %[s2], #8 \n\t" - - " mov %[d1], %[dst] \n\t" - - " add %[d2], %[d1], %[djmp] \n\t" - " add %[d2], %[w], lsl #2 \n\t" - - " mov %[sadv], %[h], lsl #3 \n\t" - " add %[sadv], %[sjmp], lsl #1 \n\t" - - " mov %[y], #0 \n\t" - " mov %[x], #0 \n\t" - AP"loop: \n\t" - " vld1.u32 d0, [%[s1]] \n\t" - " vld1.u32 d1, [%[s2]] \n\t" - " add %[x], #2 \n\t" - " add %[s1], %[sadv] \n\t" - " add %[s2], %[sadv] \n\t" - " vtrn.u32 d0, d1 \n\t" - " cmp %[x], %[w] \n\t" - " vst1.u32 d1, [%[d1]]! \n\t" - " vst1.u32 d0, [%[d2]]! 
\n\t" - " blt "AP"loop \n\t" - - " mov %[x], #0 \n\t" - " add %[d1], %[djmp] \n\t" - " add %[d1], %[w], lsl #2 \n\t" - " add %[d2], %[djmp] \n\t" - " add %[d2], %[w], lsl #2 \n\t" - - " mov %[s1], %[src] \n\t" - " add %[s1], %[h], lsl #2 \n\t" - " sub %[s1], %[y], lsl #2 \n\t" - " sub %[s1], #16 \n\t" - - " add %[s2], %[s1], %[h], lsl #2 \n\t" - " add %[s2], %[sjmp], lsl #2 \n\t" - - " add %[y], #2 \n\t" - - " cmp %[y], %[h] \n\t" - " blt "AP"loop \n\t" - - : // Out - : [s1] "r" (1), - [s2] "r" (11), - [d1] "r" (2), - [d2] "r" (12), - [src] "r" (src), - [dst] "r" (dst), - [x] "r" (3), - [y] "r" (4), - [w] "r" (w), - [h] "r" (h), - [sadv] "r" (5), - [sjmp] "r" (src_jump * 4), - [djmp] "r" (dst_jump * 4 * 2) - : "d0", "d1", "memory", "cc"// Clober - - - ); - } -#undef AP -#endif + /* Rarely (if ever) if ever: so slow path is fine */ + DATA32 *src_ptr; + DATA32 *dst_ptr; + int x, y; + + dst_ptr = (DATA32 *)dst; + CONVERT_LOOP_START_ROT_90(); + + *dst_ptr = *src_ptr; + + CONVERT_LOOP_END_ROT_90(); + } + else + { +# define AP "convert_rgba32_rot_90_" + asm volatile ( + ".fpu neon \n\t" + " mov %[s1], %[src] \n\t" + " add %[s1], %[s1], %[h],lsl #2 \n\t" + " sub %[s1], #8 \n\t" + + " mov %[s2], %[src] \n\t" + " add %[s2], %[s2], %[h], lsl #3 \n\t" + " add %[s2], %[s2], %[sjmp], lsr #1 \n\t" + " sub %[s2], #8 \n\t" + + " mov %[d1], %[dst] \n\t" + + " add %[d2], %[d1], %[djmp] \n\t" + " add %[d2], %[d2], %[w], lsl #2 \n\t" + + " mov %[sadv], %[h], lsl #3 \n\t" + " add %[sadv], %[sadv], %[sjmp], lsl #1\n\t" + + " mov %[y], #0 \n\t" + " mov %[x], #0 \n\t" + AP"loop: \n\t" + " vld1.u32 d0, [%[s1]] \n\t" + " vld1.u32 d1, [%[s2]] \n\t" + " add %[x], #2 \n\t" + " add %[s1], %[sadv] \n\t" + " add %[s2], %[sadv] \n\t" + " vtrn.u32 d0, d1 \n\t" + " cmp %[x], %[w] \n\t" + " vst1.u32 d1, [%[d1]]! \n\t" + " vst1.u32 d0, [%[d2]]! \n\t" + " blt "AP"loop \n\t" + + " mov %[x], #0 \n\t" + " add %[d1], %[djmp] \n\t" + " add %[d1], %[d1], %[w], lsl #2 \n\t" + " add %[d2], %[djmp] \n\t" + " add %[d2], %[d2], %[w], lsl #2 \n\t" + + " mov %[s1], %[src] \n\t" + " add %[s1], %[s1], %[h], lsl #2 \n\t" + " sub %[s1], %[s1], %[y], lsl #2 \n\t" + " sub %[s1], #16 \n\t" + + " add %[s2], %[s1], %[h], lsl #2 \n\t" + " add %[s2], %[s2], %[sjmp], lsl #2 \n\t" + + " add %[y], #2 \n\t" + + " cmp %[y], %[h] \n\t" + " blt "AP"loop \n\t" + + : // Out + : [s1] "r" (1), + [s2] "r" (11), + [d1] "r" (2), + [d2] "r" (12), + [src] "r" (src), + [dst] "r" (dst), + [x] "r" (3), + [y] "r" (4), + [w] "r" (w), + [h] "r" (h), + [sadv] "r" (5), + [sjmp] "r" (src_jump * 4), + [djmp] "r" (dst_jump * 4 * 2) + : "d0", "d1", "memory", "cc"// Clober + + + ); + } +# undef AP +# endif +# endif return; } #endif @@ -448,7 +613,7 @@ evas_common_convert_rgba_to_32bpp_rgb_666(DATA32 *src, DATA8 *dst, int src_jump, CONVERT_LOOP_START_ROT_0(); - *dst_ptr = + *dst_ptr = (((R_VAL(src_ptr) << 12) | (B_VAL(src_ptr) >> 2)) & 0x03f03f) | ((G_VAL(src_ptr) << 4) & 0x000fc0); -- cgit v1.1
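The heart of this change is the FAST_SIMPLE_ROTATE macro added above: instead of
rotating the whole image in one pass, it splits the destination into bands whose
width matches one cache line (TILE_CACHE_LINE_SIZE / sizeof(pix_type) pixels),
aligns the band edges to cache-line boundaries via the leading_pixels /
trailing_pixels handling, and rotates one band at a time with the *_trivial
helpers, so each band touches only a narrow strip of source rows that stays
cache-resident while the band is written. The converters then call
blt_rotated_90_8888 / blt_rotated_270_8888 when TILE_ROTATE is defined and keep
the original CONVERT_LOOP_* code as the fallback path.

The standalone program below is a minimal sketch of the same banding idea for a
clockwise 90-degree rotation of 32-bit pixels. It is not part of the patch: the
names rotate90_ref, rotate90_tiled and TILE are illustrative only, it assumes
64-byte cache lines and 4-byte pixels, and it omits the patch's alignment of the
band edges and the 270-degree variant.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Reference rotation: dst is sh pixels wide and sw pixels tall,
 * dst(y, x) = src(sh - 1 - x, y), i.e. a 90-degree clockwise turn. */
static void
rotate90_ref(const uint32_t *src, int sw, int sh, uint32_t *dst)
{
   int x, y;

   for (y = 0; y < sw; y++)
     for (x = 0; x < sh; x++)
       dst[(y * sh) + x] = src[((sh - 1 - x) * sw) + y];
}

/* Same result, but each destination row is emitted in bands of TILE pixels.
 * Within one band only TILE source rows are read, and they are revisited
 * column by column while still cached; this is the locality win that
 * blt_rotated_90_* above chases, minus the cache-line alignment of the
 * band edges. */
static void
rotate90_tiled(const uint32_t *src, int sw, int sh, uint32_t *dst)
{
   enum { TILE = 64 / sizeof(uint32_t) };   /* assumes 64-byte cache lines */
   int x0, x, y;

   for (x0 = 0; x0 < sh; x0 += TILE)
     {
        int xend = (x0 + TILE < sh) ? (x0 + TILE) : sh;

        for (y = 0; y < sw; y++)
          for (x = x0; x < xend; x++)
            dst[(y * sh) + x] = src[((sh - 1 - x) * sw) + y];
     }
}

int
main(void)
{
   const int sw = 123, sh = 77;   /* deliberately not multiples of TILE */
   uint32_t *src = malloc((size_t)sw * sh * sizeof(*src));
   uint32_t *a = malloc((size_t)sw * sh * sizeof(*a));
   uint32_t *b = malloc((size_t)sw * sh * sizeof(*b));
   int i;

   if (!src || !a || !b) return 1;
   for (i = 0; i < (sw * sh); i++) src[i] = (uint32_t)i;

   rotate90_ref(src, sw, sh, a);
   rotate90_tiled(src, sw, sh, b);

   printf("tiled rotation %s the reference\n",
          memcmp(a, b, (size_t)sw * sh * sizeof(*a)) == 0 ?
          "matches" : "DIFFERS from");

   free(src); free(a); free(b);
   return 0;
}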