/* blend pixel --> dst */

#ifdef BUILD_NEON
static void
_op_blend_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l)
{
#define AP "blend_p_dp_"
   DATA32 *e = d + l, *tmp;

   asm volatile (
      ".fpu neon                                \n\t"
      //** init
      "vmov.i8     q8, $0x1                     \n\t"

   AP "loopchoose:                              \n\t"
      // If aligned already - straight to octs
      "andS        %[tmp], %[d], $0x1f          \n\t"
      "beq         "AP"octloops                 \n\t"

      "andS        %[tmp], %[d], $0xf           \n\t"
      "beq         "AP"quadloops                \n\t"

      "andS        %[tmp], %[d], $0x4           \n\t"
      "beq         "AP"dualloop                 \n\t"

      // Only ever executes once, fall through to dual
   AP "singleloop:                              \n\t"
      "vld1.32     d0[0], [%[s]]!               \n\t"
      "vld1.32     d4[0], [%[d]]                \n\t"

      "vmvn.u8     d8, d0                       \n\t"
      "vshr.u32    d8, d8, #24                  \n\t"

      "vmul.u32    d8, d16, d8                  \n\t"

      "vmull.u8    q6, d4, d8                   \n\t"
      "vqrshrn.u16 d8, q6, #8                   \n\t"
      // Add to 's'
      "vqadd.u8    q2, q4, q0                   \n\t"
      "vst1.32     d4[0], [%[d]]                \n\t"
      "add         %[d], #4                     \n\t"

      // Can we take the fast path?
      "andS        %[tmp], %[d], $0x1f          \n\t"
      "beq         "AP"octloops                 \n\t"
      "andS        %[tmp], %[d], $0x0f          \n\t"
      "beq         "AP"quadloops                \n\t"

   AP "dualloop:                                \n\t"
      "sub         %[tmp], %[e], %[d]           \n\t"
      "cmp         %[tmp], #32                  \n\t"
      "blt         "AP"loopout                  \n\t"

   AP "dualloopint:                             \n\t"
      //** Dual Loop
      "vldm        %[s]!, {d0}                  \n\t"
      "vldr        d4, [%[d]]                   \n\t"

      "vmvn.u8     d8, d0                       \n\t"
      "vshr.u32    d8, d8, #24                  \n\t"

      "vmul.u32    d8, d16, d8                  \n\t"

      "vmull.u8    q6, d4, d8                   \n\t"
      "vqrshrn.u16 d8, q6, #8                   \n\t"
      // Add to 's'
      "vqadd.u8    d4, d8, d0                   \n\t"

      "vstr        d4, [%[d]]                   \n\t"
      "add         %[d], #8                     \n\t"

      "ands        %[tmp], %[d], $0x1f          \n\t"
      "beq         "AP"octloops                 \n\t"

   AP "quadloops:                               \n\t"
      "sub         %[tmp], %[e], %[d]           \n\t"
      "cmp         %[tmp], #32                  \n\t"
      "blt         "AP"loopout                  \n\t"

      "vldm        %[s]!, {d0,d1}               \n\t"
      "vldm        %[d], {d4,d5}                \n\t"

      // Copy s.a into q2 (>> 24) & subtract from 255
      "vmvn.u8     q4, q0                       \n\t"
      "vshr.u32    q4, q4, $0x18                \n\t"

      // Multiply into all fields
      "vmul.u32    q4, q8, q4                   \n\t"

      // a * d (clobbering 'd'/q7)
      "vmull.u8    q6, d4, d8                   \n\t"
      "vmull.u8    q2, d5, d9                   \n\t"

      // Shift & narrow it
      "vqrshrn.u16 d8, q6, #8                   \n\t"
      "vqrshrn.u16 d9, q2, #8                   \n\t"

      // Add to s
      "vqadd.u8    q2, q4, q0                   \n\t"

      // Write it
      "vstm        %[d]!, {d4,d5}               \n\t"

   AP "octloops:                                \n\t"
      "sub         %[tmp], %[e], %[d]           \n\t"
      "cmp         %[tmp], #32                  \n\t"
      "ble         "AP"loopout                  \n\t"

      "sub         %[tmp], %[e], #64            \n\t"

   AP "octloopint:                              \n\t"
      //** Oct loop
      "vldm        %[s]!, {d0,d1,d2,d3}         \n\t"
      "vldm        %[d], {d4,d5,d6,d7}          \n\t"
      "pld         [%[s], #64]                  \n\t"

      // Copy s.a into q2 (>> 24) & subtract from 255
      "vmvn.u8     q4, q0                       \n\t"
      "vmvn.u8     q5, q1                       \n\t"
      "vshr.u32    q4, q4, $0x18                \n\t"
      "vshr.u32    q5, q5, $0x18                \n\t"

      // Multiply into all fields
      "vmul.u32    q4, q8, q4                   \n\t"
      "vmul.u32    q5, q8, q5                   \n\t"

      // a * d (clobbering 'd'/q7)
      "vmull.u8    q6, d4, d8                   \n\t"
      "vmull.u8    q2, d5, d9                   \n\t"
      "vmull.u8    q7, d6, d10                  \n\t"
      "vmull.u8    q3, d7, d11                  \n\t"

      "cmp         %[tmp], %[d]                 \n\t"

      // Shift & narrow it
      "vqrshrn.u16 d8, q6, #8                   \n\t"
      "vqrshrn.u16 d9, q2, #8                   \n\t"
      "vqrshrn.u16 d10, q7, #8                  \n\t"
      "vqrshrn.u16 d11, q3, #8                  \n\t"

      // Add to s
      "vqadd.u8    q2, q4, q0                   \n\t"
      "vqadd.u8    q3, q5, q1                   \n\t"

      // Write it
      "vstm        %[d]!, {d4,d5,d6,d7}         \n\t"
      "bhi         "AP"octloopint               \n\t"

   AP "loopout:                                 \n\t"
      "cmp         %[d], %[e]                   \n\t"
      "beq         "AP"done                     \n\t"
      "sub         %[tmp], %[e], %[d]           \n\t"
      "cmp         %[tmp], $0x04                \n\t"
      "ble         "AP"singleloop2              \n\t"

   AP "dualloop2:                               \n\t"
      "sub         %[tmp], %[e], $0x7           \n\t"
   AP "dualloop2int:                            \n\t"
      //** Trailing double
      "vldm        %[s]!, {d0}                  \n\t"
      "vldm        %[d], {d4}                   \n\t"

      "vmvn.u8     d8, d0                       \n\t"
      "vshr.u32    d8, d8, #24                  \n\t"

      "vmul.u32    d8, d16, d8                  \n\t"

      "vmull.u8    q6, d4, d8                   \n\t"
      "vqrshrn.u16 d8, q6, #8                   \n\t"
      // Add to 's'
      "vqadd.u8    d4, d8, d0                   \n\t"

      "vstr        d4, [%[d]]                   \n\t"
      "add         %[d], #8                     \n\t"

      "cmp         %[tmp], %[d]                 \n\t"
      "bhi         "AP"dualloop2int             \n\t"

      // Single trailing pixel?
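      // One pixel at most remains here: the tail below applies the same
      // math as the loops above - per channel, dst = src + dst *
      // (255 - src.alpha) / 256, rounded by the narrowing shift
      // (vqrshrn #8) and combined with a saturating add.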
"cmp %[e], %[d] \n\t" "beq "AP"done \n\t" AP"singleloop2: \n\t" "vld1.32 d0[0], [%[s]] \n\t" "vld1.32 d4[0], [%[d]] \n\t" "vmvn.u8 d8, d0 \n\t" "vshr.u32 d8, d8, #24 \n\t" "vmul.u32 d8, d8, d16 \n\t" "vmull.u8 q6, d8,d4 \n\t" "vqrshrn.u16 d8, q6, #8 \n\t" // Add to 's' "vqadd.u8 d0, d0,d8 \n\t" "vst1.32 d0[0], [%[d]] \n\t" //** Trailing single AP"done:\n\t" //"sub %[tmp], %[e], #4 \n\t" //"vmov.i32 d0, $0xffff0000 \n\t" //"vst1.32 d0[0], [%[tmp]] \n\t" : // output regs // Input : [e] "r" (d + l), [d] "r" (d), [s] "r" (s), [c] "r" (c), [tmp] "r" (7) : "q0", "q1", "q2","q3", "q4","q5","q6", "q7","q8","memory" // clobbered ); #undef AP } static void _op_blend_pas_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { #define AP "blend_pas_dp_" DATA32 *e = d + l,*tmp = e + 32,*pl=(void*)912; asm volatile ( ".fpu neon \n\t" "vmov.i8 q8, #1 \n\t" AP"loopchoose: \n\t" // If aliged - go as fast we can "andS %[tmp], %[d], #31 \n\t" "beq "AP"quadstart \n\t" // See if we can at least do our double loop "andS %[tmp], %[d], $0x7 \n\t" "beq "AP"dualstart \n\t" // Ugly single word version AP "singleloop: \n\t" "vld1.32 d0[0], [%[s]]! \n\t" "vld1.32 d4[0], [%[d]] \n\t" "vmvn.u8 d8, d0 \n\t" "vshr.u32 d8, d8,$0x18 \n\t" // Mulitply into all fields "vmul.u32 d8, d8, d16 \n\t" // Multiply out "vmull.u8 q6, d8, d4 \n\t" "vqrshrn.u16 d8, q6, #8 \n\t" // Add to s "vqadd.u8 d0, d0,d8 \n\t" "vst1.32 d0[0], [%[d]]! \n\t" AP"dualstart: \n\t" "sub %[tmp], %[e], %[d] \n\t" "cmp %[tmp], #32 \n\t" "blt "AP"loopout \n\t" // If aligned - go as fast we can "andS %[tmp], %[d], #31 \n\t" "beq "AP"quadstart \n\t" AP"dualloop: \n\t" "vldm %[s]!, {d0) \n\t" "vldm %[d], {d4} \n\t" // Subtract from 255 (ie negate) and extract alpha channel "vmvn.u8 d8, d0 \n\t" "vshr.u32 d8, d8,$0x18 \n\t" // Mulitply into all fields "vmul.u32 d8, d8, d16 \n\t" // Multiply out "vmull.u8 q6, d8, d4 \n\t" "vqrshrn.u16 d8, q6, #8 \n\t" // Add to s "vqadd.u8 d0, d0,d8 \n\t" "vstm %[d]!, {d0} \n\t" "andS %[tmp], %[d], $0x1f \n\t" "bne "AP"dualloop \n\t" AP"quadstart: \n\t" "sub %[tmp], %[e], %[d] \n\t" "cmp %[tmp], #32 \n\t" "blt "AP"loopout \n\t" "sub %[tmp], %[e], #31 \n\t" AP"quadloop:\n\t" "vldm %[s]!, {d0,d1,d2,d3) \n\t" "vldm %[d], {d4,d5,d6,d7} \n\t" // Subtract from 255 (ie negate) and extract alpha channel "vmvn.u8 q4, q0 \n\t" "vmvn.u8 q5, q1 \n\t" "vshr.u32 q4, q4,$0x18 \n\t" "vshr.u32 q5, q5,$0x18 \n\t" // Prepare to preload "add %[pl], %[s], #32 \n\t" // Mulitply into all fields "vmul.u32 q4, q4, q8 \n\t" "vmul.u32 q5, q5, q8 \n\t" "pld [%[pl]] \n\t" // Multiply out "vmull.u8 q6, d8, d4 \n\t" "vmull.u8 q7, d10, d6 \n\t" "vmull.u8 q2, d9, d5 \n\t" "vmull.u8 q3, d11, d7 \n\t" "add %[pl], %[d], #32 \n\t" "vqrshrn.u16 d8, q6, #8 \n\t" "vqrshrn.u16 d10, q7, #8 \n\t" "vqrshrn.u16 d9, q2, #8 \n\t" "vqrshrn.u16 d11, q3, #8 \n\t" "pld [%[pl]] \n\t" "cmp %[tmp], %[pl] \n\t" // Add to s "vqadd.u8 q0, q0,q4 \n\t" "vqadd.u8 q1, q1,q5 \n\t" "vstm %[d]!, {d0,d1,d2,d3} \n\t" "bhi "AP"quadloop \n\t" AP "loopout: \n\t" "cmp %[d], %[e] \n\t" "beq "AP"done \n\t" "sub %[tmp],%[e], %[d] \n\t" "cmp %[tmp],$0x04 \n\t" "beq "AP"singleloop2 \n\t" "sub %[tmp],%[e],$0x7 \n\t" AP"dualloop2: \n\t" "vldm %[s]!, {d0) \n\t" "vldm %[d], {d4} \n\t" // Subtract from 255 (ie negate) and extract alpha channel "vmvn.u8 d8, d0 \n\t" "vshr.u32 d8, d8,$0x18 \n\t" // Mulitply into all fields "vmul.u32 d8, d8, d16 \n\t" // Multiply out "vmull.u8 q6, d8, d4 \n\t" "vqrshrn.u16 d8, q6, #8 \n\t" // Add to s "vqadd.u8 d0, d0,d8 \n\t" "vstm %[d]!, {d0} \n\t" "cmp %[tmp], %[d] \n\t" 
"bhi "AP"dualloop2 \n\t" // Single ?? "cmp %[e], %[d] \n\t" "beq "AP"done \n\t" AP "singleloop2: \n\t" "vld1.32 d0[0], [%[s]] \n\t" "vld1.32 d4[0], [%[d]] \n\t" "vmvn.u8 d8, d0 \n\t" "vshr.u32 d8, d8,$0x18 \n\t" // Mulitply into all fields "vmul.u32 d8, d8, d16 \n\t" // Multiply out "vmull.u8 q6, d8, d4 \n\t" "vqrshrn.u16 d8, q6, #8 \n\t" // Add to s "vqadd.u8 d0, d0,d8 \n\t" "vst1.32 d0[0], [%[d]] \n\t" AP "done:\n\t" : /* Out */ : /* In */ [s] "r" (s), [e] "r" (e), [d] "r" (d), [tmp] "r" (tmp), [pl] "r" (pl) : /* Clobbered */ "q0","q1","q2","q3","q4","q5","q6","q7","q8","memory" ); #undef AP } #define _op_blend_pan_dp_neon NULL #define _op_blend_p_dpan_neon _op_blend_p_dp_neon #define _op_blend_pas_dpan_neon _op_blend_pas_dp_neon #define _op_blend_pan_dpan_neon _op_blend_pan_dp_neon static void init_blend_pixel_span_funcs_neon(void) { op_blend_span_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_p_dp_neon; op_blend_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pas_dp_neon; op_blend_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pan_dp_neon; op_blend_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_p_dpan_neon; op_blend_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pas_dpan_neon; op_blend_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pan_dpan_neon; } #endif #ifdef BUILD_NEON static void _op_blend_pt_p_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d) { c = 256 - (s >> 24); *d = s + MUL_256(c, *d); } #define _op_blend_pt_pan_dp_neon NULL #define _op_blend_pt_pas_dp_neon _op_blend_pt_p_dp_neon #define _op_blend_pt_p_dpan_neon _op_blend_pt_p_dp_neon #define _op_blend_pt_pan_dpan_neon _op_blend_pt_pan_dp_neon #define _op_blend_pt_pas_dpan_neon _op_blend_pt_pas_dp_neon static void init_blend_pixel_pt_funcs_neon(void) { op_blend_pt_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_p_dp_neon; op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_pas_dp_neon; op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_pt_pan_dp_neon; op_blend_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_p_dpan_neon; op_blend_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_pas_dpan_neon; op_blend_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_pt_pan_dpan_neon; } #endif /*-----*/ /* blend_rel pixel -> dst */ #ifdef BUILD_NEON static void _op_blend_rel_p_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { DATA32 *e = d + l; while (d < e) { l = 256 - (*s >> 24); c = 1 + (*d >> 24); *d = MUL_256(c, *s) + MUL_256(l, *d); d++; s++; } } static void _op_blend_rel_pan_dp_neon(DATA32 *s, DATA8 *m, DATA32 c, DATA32 *d, int l) { DATA32 *e = d + l; while (d < e) { c = 1 + (*d >> 24); *d++ = MUL_256(c, *s); s++; } } #define _op_blend_rel_pas_dp_neon _op_blend_rel_p_dp_neon #define _op_blend_rel_p_dpan_neon _op_blend_p_dpan_neon #define _op_blend_rel_pan_dpan_neon _op_blend_pan_dpan_neon #define _op_blend_rel_pas_dpan_neon _op_blend_pas_dpan_neon static void init_blend_rel_pixel_span_funcs_neon(void) { op_blend_rel_span_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_p_dp_neon; op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pas_dp_neon; op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pan_dp_neon; op_blend_rel_span_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_p_dpan_neon; op_blend_rel_span_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pas_dpan_neon; op_blend_rel_span_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pan_dpan_neon; } #endif 
#ifdef BUILD_NEON
static void
_op_blend_rel_pt_p_dp_neon(DATA32 s, DATA8 m, DATA32 c, DATA32 *d)
{
   c = 256 - (s >> 24);
   *d = MUL_SYM(*d >> 24, s) + MUL_256(c, *d);
}

#define _op_blend_rel_pt_pas_dp_neon _op_blend_rel_pt_p_dp_neon
#define _op_blend_rel_pt_pan_dp_neon _op_blend_rel_pt_p_dp_neon

#define _op_blend_rel_pt_p_dpan_neon _op_blend_pt_p_dpan_neon
#define _op_blend_rel_pt_pas_dpan_neon _op_blend_pt_pas_dpan_neon
#define _op_blend_rel_pt_pan_dpan_neon _op_blend_pt_pan_dpan_neon

static void
init_blend_rel_pixel_pt_funcs_neon(void)
{
   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_p_dp_neon;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_pas_dp_neon;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP][CPU_NEON] = _op_blend_rel_pt_pan_dp_neon;

   op_blend_rel_pt_funcs[SP][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_p_dpan_neon;
   op_blend_rel_pt_funcs[SP_AS][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_pas_dpan_neon;
   op_blend_rel_pt_funcs[SP_AN][SM_N][SC_N][DP_AN][CPU_NEON] = _op_blend_rel_pt_pan_dpan_neon;
}
#endif
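/* Illustrative only: one common formulation of the MUL_SYM helper used by
 * _op_blend_rel_pt_p_dp_neon above. This is an assumption about the macro,
 * not a copy of its definition: it takes an alpha in 0..255 and biases the
 * products upward so that mul_sym_ref(255, c) == c exactly. */
static inline DATA32
mul_sym_ref(DATA32 a, DATA32 c)
{
   return (((((c >> 8) & 0x00ff00ff) * a + 0x00ff00ff) & 0xff00ff00) +
           ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff));
}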