pixman: Changes to 'upstream-unstable'
Rebased ref, commits from common ancestor:
commit a0f1b565811388b0567c845b9b7063d5b93d325e
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Sat Oct 29 05:33:44 2011 -0400
Pre-release version bump to 0.23.8
diff --git a/configure.ac b/configure.ac
index 6c88c84..0552563 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
m4_define([pixman_major], 0)
m4_define([pixman_minor], 23)
-m4_define([pixman_micro], 7)
+m4_define([pixman_micro], 8)
m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
commit 498138c293a2abce44ce122114852f4e6c5b87fe
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue Oct 25 08:45:34 2011 -0400
Fix use of uninitialized fields reported by valgrind
In pixman-noop.c and pixman-sse2.c, we are accessing
image->bits.width/height without first making sure the image is a bits
image. The warning is harmless because we never act on this
information without checking that the image is a8r8g8b8, but valgrind
does warn about it.
In pixman-noop.c, just reorder the clauses in the if statement; in
pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag
set.
diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c
index 906a491..f4012d8 100644
--- a/pixman/pixman-noop.c
+++ b/pixman/pixman-noop.c
@@ -76,12 +76,12 @@ noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
iter->get_scanline = _pixman_iter_get_scanline_noop;
}
- else if ((iter->flags & ITER_NARROW) &&
+ else if (image->common.extended_format_code == PIXMAN_a8r8g8b8 &&
+ (iter->flags & ITER_NARROW) &&
(image->common.flags & FLAGS) == FLAGS &&
iter->x >= 0 && iter->y >= 0 &&
iter->x + iter->width <= image->bits.width &&
- iter->y + iter->height <= image->bits.height &&
- image->common.extended_format_code == PIXMAN_a8r8g8b8)
+ iter->y + iter->height <= image->bits.height)
{
iter->buffer =
image->bits.bits + iter->y * image->bits.rowstride + iter->x;
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c419511..8adf541 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5982,7 +5982,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
int height = iter->height;
#define FLAGS \
- (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
if ((iter->flags & ITER_NARROW) &&
(image->common.flags & FLAGS) == FLAGS &&
commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue Oct 18 21:50:18 2011 +0900
ARM: NEON: Fix assembly typo error in src_n_8_8888
Binutils 2.21 does not complain about missing comma between ARM
register and alignement specifier in vld/vst instructions which
causes build error on binutils 2.20.
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index da8f054..87aae1d 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1260,7 +1260,7 @@ generate_composite_function \
PF subges PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d3
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
- vst4.8 {d28, d29, d30, d31}, [DST_W :128]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q8, q8, #8
vrsra.u16 q9, q9, #8
vrsra.u16 q10, q10, #8
commit 19f118f41f8725f22395d31eac5670cb350b55ec
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon Sep 26 18:33:27 2011 +0900
ARM: NEON: Standard fast path src_n_8_8
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s)
- after
L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 1db02db..da8f054 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1292,6 +1292,72 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d16
+ vmull.u8 q1, d25, d16
+ vmull.u8 q2, d26, d16
+ vmull.u8 q3, d27, d16
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+ vrshrn.u16 d28, q0, #8
+ vrshrn.u16 d29, q1, #8
+ vrshrn.u16 d30, q2, #8
+ vrshrn.u16 d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q0, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q1, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q2, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q3, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q0, d24, d16
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q1, d25, d16
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q2, d26, d16
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q3, d27, d16
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d16[0]}, [DUMMY]
+ vdup.8 d16, d16[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3db9adf..ca139de 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon Sep 26 17:03:54 2011 +0900
ARM: NEON: Standard fast path src_n_8_8888
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s)
- after
L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3fcd07d..1db02db 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1219,6 +1219,79 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {d0, d1, d2, d3} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in */
+ vmull.u8 q8, d24, d0
+ vmull.u8 q9, d24, d1
+ vmull.u8 q10, d24, d2
+ vmull.u8 q11, d24, d3
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q8, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q9, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q10, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q11, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q8, d24, d0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q9, d24, d1
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d2
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d3
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W :128]!
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index effb50b..3db9adf 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
commit 26659de6cd2775c83a9a6e6660324d5baacf61f9
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon Sep 26 19:04:53 2011 +0900
ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 40.53 Mpix/s
after : 50.76 Mpix/s
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 82d248e..f7913ad 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -949,7 +949,7 @@ pixman_asm_function fname
vshrn.u32 d0, q0, #16
vshrn.u32 d1, q1, #16
vld1.32 {d2, d3}, [OUT, :128]
- pld [OUT, PF_OFFS]
+ pld [OUT, #(prefetch_offset * 4)]
vshrn.u32 d4, q2, #16
vshr.u16 q15, q12, #8
vshrn.u32 d5, q3, #16
@@ -1061,15 +1061,169 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_head
- bilinear_over_8888_8_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vld1.32 {d3}, [TMP2]
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2}, [TMP3], STRIDE
+ vld1.32 {d3}, [TMP3]
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22[0]}, [MASK]!
+ pld [MASK, #prefetch_offset]
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d16, q0
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail
+ vshll.u16 q9, d6, #8
+ vshll.u16 q10, d2, #8
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vdup.32 d22, d17[1]
+ vmvn.8 d22, d22
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q0, q11, #8
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q0, q11
+ vqadd.u8 q9, q8, q9
+ vuzp.8 d18, d19
+ vuzp.8 d18, d19
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
- bilinear_over_8888_8_8888_process_pixblock_tail
- bilinear_over_8888_8_8888_process_pixblock_head
+ vshll.u16 q9, d6, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vshll.u16 q10, d2, #8
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vld1.32 {d3}, [TMP2]
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, #(prefetch_offset * 4)]
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vld1.32 {d2}, [TMP3], STRIDE
+ vdup.32 d22, d17[1]
+ vld1.32 {d3}, [TMP3]
+ vmvn.8 d22, d22
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q15, q11, #8
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q15, q11
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vqadd.u8 q9, q8, q9
+ vld1.32 {d22[0]}, [MASK]!
+ vuzp.8 d18, d19
+ vadd.u16 q12, q12, q13
+ vuzp.8 d18, d19
+ vmovn.u16 d16, q0
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
/* add_8888_8888 */
commit 4481920f405e47b3a92811a8cb06afbd37dee01b
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Wed Sep 21 15:52:13 2011 +0900
ARM: NEON: Instruction scheduling of bilinear over_8888_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 50.43 Mpix/s
after : 61.09 Mpix/s
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 25bcb24..82d248e 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -893,15 +893,158 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8888_process_pixblock_head
- bilinear_over_8888_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+
+ vld1.32 {d22}, [TMP1], STRIDE
+ vld1.32 {d23}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmull.u8 q8, d22, d28
+ vmlal.u8 q8, d23, d29
+
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmull.u8 q9, d22, d28
+ vmlal.u8 q9, d23, d29
+
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+
+ vshll.u16 q1, d18, #8
+ vmlsl.u16 q1, d18, d31
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail
+ vshll.u16 q2, d20, #8
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vmovn.u16 d7, q2
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vmvn.8 d4, d4
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vrshr.u16 q1, q11, #8
+ vrshr.u16 q10, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q10, q2
+ vqadd.u8 q3, q1, q3
+ vuzp.8 d6, d7
+ vuzp.8 d6, d7
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail_head
- bilinear_over_8888_8888_process_pixblock_tail
- bilinear_over_8888_8888_process_pixblock_head
+ vshll.u16 q2, d20, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vmlsl.u16 q2, d20, d30
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vmovn.u16 d7, q2
+ vld1.32 {d22}, [TMP3], STRIDE
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vld1.32 {d23}, [TMP3]
+ vmvn.8 d4, d4
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vrshr.u16 q1, q11, #8
+ vmlal.u16 q0, d17, d30
+ vrshr.u16 q8, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q8, q2
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vqadd.u8 q3, q1, q3
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vuzp.8 d6, d7
+ vshll.u16 q1, d18, #8
+ vuzp.8 d6, d7
+ vmlsl.u16 q1, d18, d31
+ vadd.u16 q12, q12, q13
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
/* over_8888_8_8888 */
commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Fri Sep 23 00:03:22 2011 +0900
ARM: NEON: Replace old bilinear scanline generator with new template
Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
be replaced with new template just by wrapping existing macros.
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 784e5df..25bcb24 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -582,198 +582,6 @@ fname:
bilinear_store_&dst_fmt 4, q2, q3
.endm
-.macro generate_bilinear_scanline_func_src_dst \
- fname, src_fmt, dst_fmt, op, \
- bpp_shift, prefetch_distance
-
-pixman_asm_function fname
- OUT .req r0
- TOP .req r1
- BOTTOM .req r2
- WT .req r3
- WB .req r4
- X .req r5
- UX .req r6
- WIDTH .req ip
- TMP1 .req r3
- TMP2 .req r4
- PF_OFFS .req r7
- TMP3 .req r8
- TMP4 .req r9
- STRIDE .req r2
-
- mov ip, sp
- push {r4, r5, r6, r7, r8, r9}
- mov PF_OFFS, #prefetch_distance
- ldmia ip, {WB, X, UX, WIDTH}
- mul PF_OFFS, PF_OFFS, UX
-
- .set prefetch_offset, prefetch_distance
-
- sub STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 3f
-
- vdup.u16 q12, X
- vdup.u16 q13, UX
- vdup.u8 d28, WT
- vdup.u8 d29, WB
- vadd.u16 d25, d25, d26
- vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
-
- subs WIDTH, WIDTH, #4
- blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
-0:
- bilinear_interpolate_four_pixels src_fmt, x, dst_fmt, op
- subs WIDTH, WIDTH, #4
- bge 0b
-1:
- tst WIDTH, #2
- beq 2f
- bilinear_interpolate_two_pixels src_fmt, x, dst_fmt, op
-2:
- tst WIDTH, #1
- beq 3f
- bilinear_interpolate_last_pixel src_fmt, x, dst_fmt, op
-3:
- pop {r4, r5, r6, r7, r8, r9}
- bx lr
-
- .unreq OUT
- .unreq TOP
- .unreq WT
- .unreq WB
- .unreq X
- .unreq UX
- .unreq WIDTH
- .unreq TMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
-.endfunc
-
-.endm
-
-.macro generate_bilinear_scanline_func_src_a8_dst \
- fname, src_fmt, dst_fmt, op, \
- bpp_shift, prefetch_distance
-
-pixman_asm_function fname
- OUT .req r0
- MASK .req r1
- TOP .req r2
- BOTTOM .req r3
- WT .req r4
- WB .req r5
- X .req r6
- UX .req r7
- WIDTH .req ip
- TMP1 .req r4
- TMP2 .req r5
- PF_OFFS .req r8
- TMP3 .req r9
- TMP4 .req r10
- STRIDE .req r3
-
- mov ip, sp
- push {r4, r5, r6, r7, r8, r9, r10, ip}
- mov PF_OFFS, #prefetch_distance
- ldmia ip, {WT, WB, X, UX, WIDTH}
- mul PF_OFFS, PF_OFFS, UX
-
- .set prefetch_offset, prefetch_distance
-
- sub STRIDE, BOTTOM, TOP
- .unreq BOTTOM
-
- cmp WIDTH, #0
- ble 3f
-
- vdup.u16 q12, X
- vdup.u16 q13, UX
- vdup.u8 d28, WT
- vdup.u8 d29, WB
- vadd.u16 d25, d25, d26
- vadd.u16 q13, q13, q13
- vshr.u16 q15, q12, #8
- vadd.u16 q12, q12, q13
-
- subs WIDTH, WIDTH, #4
- blt 1f
- mov PF_OFFS, PF_OFFS, asr #(16 - bpp_shift)
-0:
- bilinear_interpolate_four_pixels src_fmt, 8, dst_fmt, op
- subs WIDTH, WIDTH, #4
- bge 0b
-1:
- tst WIDTH, #2
- beq 2f
- bilinear_interpolate_two_pixels src_fmt, 8, dst_fmt, op
-2:
- tst WIDTH, #1
- beq 3f
- bilinear_interpolate_last_pixel src_fmt, 8, dst_fmt, op
-3:
- pop {r4, r5, r6, r7, r8, r9, r10, ip}
- bx lr
-
- .unreq OUT
- .unreq TOP
- .unreq WT
- .unreq WB
- .unreq X
- .unreq UX
- .unreq WIDTH
- .unreq MASK
- .unreq TMP1
- .unreq TMP2
- .unreq PF_OFFS
- .unreq TMP3
- .unreq TMP4
- .unreq STRIDE
-.endfunc
-
-.endm
-
-generate_bilinear_scanline_func_src_dst \
- pixman_scaled_bilinear_scanline_8888_8888_OVER_asm_neon, \
- 8888, 8888, over, 2, 28
-
-generate_bilinear_scanline_func_src_dst \
- pixman_scaled_bilinear_scanline_8888_8888_ADD_asm_neon, \
- 8888, 8888, add, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_SRC_asm_neon, \
- 8888, 8888, src, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_0565_SRC_asm_neon, \
- 8888, 0565, src, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_0565_8_x888_SRC_asm_neon, \
- 0565, 8888, src, 1, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_0565_8_0565_SRC_asm_neon, \
- 0565, 0565, src, 1, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_OVER_asm_neon, \
- 8888, 8888, over, 2, 28
-
-generate_bilinear_scanline_func_src_a8_dst \
- pixman_scaled_bilinear_scanline_8888_8_8888_ADD_asm_neon, \
- 8888, 8888, add, 2, 28
-
.set BILINEAR_FLAG_USE_MASK, 1
.set BILINEAR_FLAG_USE_ALL_NEON_REGS, 2
@@ -855,6 +663,8 @@ pixman_asm_function fname
TMP4 .req r10
STRIDE .req r3
+ .set prefetch_offset, prefetch_distance
+
mov ip, sp
push {r4, r5, r6, r7, r8, r9, r10, ip}
mov PF_OFFS, #prefetch_distance
@@ -968,3 +778,293 @@ pixman_asm_function fname
.endfunc
.endm
+
+/* src_8888_8_8888 */
+.macro bilinear_src_8888_8_8888_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 8888, src
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_head
+ bilinear_src_8888_8_8888_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_8888_process_pixblock_tail_head
+ bilinear_src_8888_8_8888_process_pixblock_tail
+ bilinear_src_8888_8_8888_process_pixblock_head
+.endm
+
+/* src_8888_8_0565 */
+.macro bilinear_src_8888_8_0565_process_last_pixel
+ bilinear_interpolate_last_pixel 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_two_pixels
+ bilinear_interpolate_two_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_four_pixels
+ bilinear_interpolate_four_pixels 8888, 8, 0565, src
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_head
+ bilinear_src_8888_8_0565_process_four_pixels
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail
+.endm
+
+.macro bilinear_src_8888_8_0565_process_pixblock_tail_head
+ bilinear_src_8888_8_0565_process_pixblock_tail
+ bilinear_src_8888_8_0565_process_pixblock_head
+.endm
+
+/* src_0565_8_x888 */
+.macro bilinear_src_0565_8_x888_process_last_pixel
+ bilinear_interpolate_last_pixel 0565, 8, 8888, src
+.endm
+
+.macro bilinear_src_0565_8_x888_process_two_pixels
+ bilinear_interpolate_two_pixels 0565, 8, 8888, src
+.endm
Reply to: