pixman: Changes to 'debian-experimental'
ChangeLog | 198 +++++++
Makefile.am | 4
configure.ac | 2
debian/changelog | 6
pixman/pixman-arm-neon-asm-bilinear.S | 922 ++++++++++++++++++++++++++++------
pixman/pixman-arm-neon-asm.S | 139 +++++
pixman/pixman-arm-neon.c | 10
pixman/pixman-gradient-walker.c | 175 +-----
pixman/pixman-image.c | 73 ++
pixman/pixman-noop.c | 6
pixman/pixman-private.h | 16
pixman/pixman-sse2.c | 2
12 files changed, 1242 insertions(+), 311 deletions(-)
New commits:
commit 39102f8b3e7f36ad912fc95596dcd0a61ae2bab0
Author: Cyril Brulebois <kibi@debian.org>
Date: Tue Nov 1 12:29:25 2011 +0100
Upload to experimental.
diff --git a/debian/changelog b/debian/changelog
index e2f7f36..7bebde7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,8 +1,8 @@
-pixman (0.23.8-1) UNRELEASED; urgency=low
+pixman (0.23.8-1) experimental; urgency=low
* New upstream release.
- -- Cyril Brulebois <kibi@debian.org> Tue, 01 Nov 2011 12:28:45 +0100
+ -- Cyril Brulebois <kibi@debian.org> Tue, 01 Nov 2011 12:29:16 +0100
pixman (0.23.6-1) experimental; urgency=low
commit bfad5455b6885b09fb8a63a7384f077fc0a45741
Author: Cyril Brulebois <kibi@debian.org>
Date: Tue Nov 1 12:28:58 2011 +0100
Bump changelogs.
diff --git a/ChangeLog b/ChangeLog
index 6a10342..fa61d98 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,201 @@
+commit a0f1b565811388b0567c845b9b7063d5b93d325e
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Sat Oct 29 05:33:44 2011 -0400
+
+ Pre-release version bump to 0.23.8
+
+commit 498138c293a2abce44ce122114852f4e6c5b87fe
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Tue Oct 25 08:45:34 2011 -0400
+
+ Fix use of uninitialized fields reported by valgrind
+
+ In pixman-noop.c and pixman-sse2.c, we are accessing
+ image->bits.width/height without first making sure the image is a bits
+ image. The warning is harmless because we never act on this
+ information without checking that the image is a8r8g8b8, but valgrind
+ does warn about it.
+
+ In pixman-noop.c, just reorder the clauses in the if statement; in
+ pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag
+ set.
+
+commit 6131707e8fc39187d1d358481f7c57c57cfab206
+Merge: 3d4d705 ec7c9c2
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Thu Oct 20 09:13:12 2011 -0400
+
+ Merge branch 'gradients'
+
+commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Tue Oct 18 21:50:18 2011 +0900
+
+ ARM: NEON: Fix assembly typo error in src_n_8_8888
+
+ Binutils 2.21 does not complain about a missing comma between the ARM
+ register and the alignment specifier in vld/vst instructions, but
+ the same code causes a build error on binutils 2.20.
+
+commit 19f118f41f8725f22395d31eac5670cb350b55ec
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Mon Sep 26 18:33:27 2011 +0900
+
+ ARM: NEON: Standard fast path src_n_8_8
+
+ Performance numbers of before/after on cortex-a8 @ 1GHz
+
+ - before
+ L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s)
+
+ - after
+ L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s)
+
+commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Mon Sep 26 17:03:54 2011 +0900
+
+ ARM: NEON: Standard fast path src_n_8_8888
+
+ Performance numbers of before/after on cortex-a8 @ 1GHz
+
+ - before
+ L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s)
+
+ - after
+ L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s)
+
+commit 26659de6cd2775c83a9a6e6660324d5baacf61f9
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Mon Sep 26 19:04:53 2011 +0900
+
+ ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
+
+ Instructions are reordered to eliminate pipeline stalls and get
+ better memory access.
+
+ Performance of before/after on cortex-a8 @ 1GHz
+
+ << 2000 x 2000 with scale factor close to 1.x >>
+ before : 40.53 Mpix/s
+ after : 50.76 Mpix/s
+
+commit 4481920f405e47b3a92811a8cb06afbd37dee01b
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Wed Sep 21 15:52:13 2011 +0900
+
+ ARM: NEON: Instruction scheduling of bilinear over_8888_8888
+
+ Instructions are reordered to eliminate pipeline stalls and get
+ better memory access.
+
+ Performance of before/after on cortex-a8 @ 1GHz
+
+ << 2000 x 2000 with scale factor close to 1.x >>
+ before : 50.43 Mpix/s
+ after : 61.09 Mpix/s
+
+commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Fri Sep 23 00:03:22 2011 +0900
+
+ ARM: NEON: Replace old bilinear scanline generator with new template
+
+ Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
+ be replaced with new template just by wrapping existing macros.
+
+commit 6682b2b3597c9f431900bfe7b1b42dfbe006bae5
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Tue Sep 20 21:32:35 2011 +0900
+
+ ARM: NEON: Bilinear macro template for instruction scheduling
+
+ This macro template takes 6 code blocks.
+
+ 1. process_last_pixel
+ 2. process_two_pixels
+ 3. process_four_pixels
+ 4. process_pixblock_head
+ 5. process_pixblock_tail
+ 6. process_pixblock_tail_head
+
+ process_last_pixel does not need to update the horizontal weight; this
+ is done by the template. The two- and four-pixel code blocks should
+ update the horizontal weight themselves. The head/tail/tail_head blocks
+ make up the unrolled core loop. You can apply instruction scheduling
+ to the tail_head blocks.
+
+ You can also specify the size of the pixel block. Supported sizes are 4
+ and 8. If you want to use a mask, pass the BILINEAR_FLAG_USE_MASK flag
+ to the template, then you can use register MASK. When using d8~d15
+ registers, pass BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure the
+ registers are properly saved on the stack and later restored.
+
+commit b5e4355fa4973e3edd4abeb11bdc47c42371cc76
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Tue Sep 20 19:46:25 2011 +0900
+
+ ARM: NEON: Some cleanup of bilinear scanline functions
+
+ Use STRIDE, and do the initial horizontal weight update before
+ entering the interpolation loop. Cache preload for mask and dst.
+
+commit ec7c9c2b6865b48b8bd14e4509538f8fcbe93463
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Fri Oct 14 09:04:48 2011 -0400
+
+ Simplify gradient_walker_reset()
+
+ The code that searches for the closest color stop to the given
+ position is duplicated across the various repeat modes. Replace the
+ switch with two if/else constructions, and put the search code between
+ them.
+
+commit 2d0da8ab8d8fef60ed1bbb9d6b75f66577c3f85d
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Fri Oct 14 09:02:14 2011 -0400
+
+ Use sentinels instead of special casing first and last stops
+
+ When storing the gradient stops internally, allocate two more stops,
+ one before the beginning of the stop list and one after the
+ end. Initialize those stops based on the repeat property of the
+ gradient.
+
+ This allows gradient_walker_reset() to be simplified because it can
+ now simply pick the two closest stops to the position without special
+ casing the first and last stops.
+
+commit 84d6ca7c891601b019d4862a556ed98b7e6fe525
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Fri Oct 14 07:42:00 2011 -0400
+
+ gradient walker: Correct types and fix formatting
+
+ The type of pos in gradient_walker_reset() and gradient_walker_pixel()
+ is pixman_fixed_48_16_t and not pixman_fixed_32_32. The types of the
+ positions in the walker struct are pixman_fixed_t and not int32_t, and
+ need_reset is a boolean, not an integer. The spread field should be
+ called repeat and have the type pixman_repeat_t.
+
+ Also fix some formatting issues, make gradient_walker_reset() static,
+ and delete the pointless PIXMAN_GRADIENT_WALKER_NEED_RESET() macro.
+
+commit ace225b53dee88d134753ac901f26ba3db6781da
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Tue Oct 11 16:12:24 2011 -0400
+
+ Add stable release / development snapshot to draft release notes
+
+ This will hopefully serve as a reminder to me that I should put this
+ information in the release notes.
+
+commit bb7142d361d56d66ac40debb60a7c4d099764ba8
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Tue Oct 11 06:10:39 2011 -0400
+
+ Post-release version bump to 0.23.7
+
commit e20ac40bd30484f0f711b52d0c1993ef08760284
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue Oct 11 06:00:51 2011 -0400
diff --git a/debian/changelog b/debian/changelog
index af38044..e2f7f36 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+pixman (0.23.8-1) UNRELEASED; urgency=low
+
+ * New upstream release.
+
+ -- Cyril Brulebois <kibi@debian.org> Tue, 01 Nov 2011 12:28:45 +0100
+
pixman (0.23.6-1) experimental; urgency=low
[ Rico Tzschichholz ]
commit a0f1b565811388b0567c845b9b7063d5b93d325e
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Sat Oct 29 05:33:44 2011 -0400
Pre-release version bump to 0.23.8
diff --git a/configure.ac b/configure.ac
index 6c88c84..0552563 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
m4_define([pixman_major], 0)
m4_define([pixman_minor], 23)
-m4_define([pixman_micro], 7)
+m4_define([pixman_micro], 8)
m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
commit 498138c293a2abce44ce122114852f4e6c5b87fe
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue Oct 25 08:45:34 2011 -0400
Fix use of uninitialized fields reported by valgrind
In pixman-noop.c and pixman-sse2.c, we are accessing
image->bits.width/height without first making sure the image is a bits
image. The warning is harmless because we never act on this
information without checking that the image is a8r8g8b8, but valgrind
does warn about it.
In pixman-noop.c, just reorder the clauses in the if statement; in
pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag
set.
diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c
index 906a491..f4012d8 100644
--- a/pixman/pixman-noop.c
+++ b/pixman/pixman-noop.c
@@ -76,12 +76,12 @@ noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
iter->get_scanline = _pixman_iter_get_scanline_noop;
}
- else if ((iter->flags & ITER_NARROW) &&
+ else if (image->common.extended_format_code == PIXMAN_a8r8g8b8 &&
+ (iter->flags & ITER_NARROW) &&
(image->common.flags & FLAGS) == FLAGS &&
iter->x >= 0 && iter->y >= 0 &&
iter->x + iter->width <= image->bits.width &&
- iter->y + iter->height <= image->bits.height &&
- image->common.extended_format_code == PIXMAN_a8r8g8b8)
+ iter->y + iter->height <= image->bits.height)
{
iter->buffer =
image->bits.bits + iter->y * image->bits.rowstride + iter->x;
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c419511..8adf541 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5982,7 +5982,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
int height = iter->height;
#define FLAGS \
- (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+ (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
if ((iter->flags & ITER_NARROW) &&
(image->common.flags & FLAGS) == FLAGS &&
commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Tue Oct 18 21:50:18 2011 +0900
ARM: NEON: Fix assembly typo error in src_n_8_8888
Binutils 2.21 does not complain about a missing comma between the ARM
register and the alignment specifier in vld/vst instructions, but
the same code causes a build error on binutils 2.20.
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index da8f054..87aae1d 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1260,7 +1260,7 @@ generate_composite_function \
PF subges PF_CTL, PF_CTL, #0x10
vmull.u8 q11, d24, d3
PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
- vst4.8 {d28, d29, d30, d31}, [DST_W :128]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W, :128]!
vrsra.u16 q8, q8, #8
vrsra.u16 q9, q9, #8
vrsra.u16 q10, q10, #8
commit 19f118f41f8725f22395d31eac5670cb350b55ec
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon Sep 26 18:33:27 2011 +0900
ARM: NEON: Standard fast path src_n_8_8
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 28.05 L2: 28.26 M: 26.97 ( 4.48%) HT: 19.79 VT: 19.14 R: 17.61 RT: 9.88 ( 101Kops/s)
- after
L1:1430.28 L2:1252.10 M:421.93 ( 75.48%) HT:170.16 VT:138.03 R:145.86 RT: 35.51 ( 255Kops/s)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 1db02db..da8f054 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1292,6 +1292,72 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+ vmull.u8 q0, d24, d16
+ vmull.u8 q1, d25, d16
+ vmull.u8 q2, d26, d16
+ vmull.u8 q3, d27, d16
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+ vrshrn.u16 d28, q0, #8
+ vrshrn.u16 d29, q1, #8
+ vrshrn.u16 d30, q2, #8
+ vrshrn.u16 d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q0, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q1, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q2, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q3, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q0, d24, d16
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q1, d25, d16
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q2, d26, d16
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q3, d27, d16
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst1.8 {d28, d29, d30, d31}, [DST_W, :128]!
+ vrsra.u16 q0, q0, #8
+ vrsra.u16 q1, q1, #8
+ vrsra.u16 q2, q2, #8
+ vrsra.u16 q3, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d16[0]}, [DUMMY]
+ vdup.8 d16, d16[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+ FLAG_DST_WRITEONLY, \
+ 32, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8_init, \
+ pixman_composite_src_n_8_8_cleanup, \
+ pixman_composite_src_n_8_8_process_pixblock_head, \
+ pixman_composite_src_n_8_8_process_pixblock_tail, \
+ pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3db9adf..ca139de 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+ uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8, neon_composite_src_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon Sep 26 17:03:54 2011 +0900
ARM: NEON: Standard fast path src_n_8_8888
Performance numbers of before/after on cortex-a8 @ 1GHz
- before
L1: 32.39 L2: 31.79 M: 30.84 ( 13.77%) HT: 21.58 VT: 19.75 R: 18.83 RT: 10.46 ( 106Kops/s)
- after
L1: 516.25 L2: 372.00 M:193.49 ( 85.59%) HT:136.93 VT:109.10 R:104.48 RT: 34.77 ( 253Kops/s)
diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3fcd07d..1db02db 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1219,6 +1219,79 @@ generate_composite_function \
/******************************************************************************/
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+ /* expecting solid source in {d0, d1, d2, d3} */
+ /* mask is in d24 (d25, d26, d27 are unused) */
+
+ /* in */
+ vmull.u8 q8, d24, d0
+ vmull.u8 q9, d24, d1
+ vmull.u8 q10, d24, d2
+ vmull.u8 q11, d24, d3
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+ vrshrn.u16 d28, q8, #8
+ vrshrn.u16 d29, q9, #8
+ vrshrn.u16 d30, q10, #8
+ vrshrn.u16 d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+ fetch_mask_pixblock
+ PF add PF_X, PF_X, #8
+ vrshrn.u16 d28, q8, #8
+ PF tst PF_CTL, #0x0F
+ vrshrn.u16 d29, q9, #8
+ PF addne PF_X, PF_X, #8
+ vrshrn.u16 d30, q10, #8
+ PF subne PF_CTL, PF_CTL, #1
+ vrshrn.u16 d31, q11, #8
+ PF cmp PF_X, ORIG_W
+ vmull.u8 q8, d24, d0
+ PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+ vmull.u8 q9, d24, d1
+ PF subge PF_X, PF_X, ORIG_W
+ vmull.u8 q10, d24, d2
+ PF subges PF_CTL, PF_CTL, #0x10
+ vmull.u8 q11, d24, d3
+ PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+ vst4.8 {d28, d29, d30, d31}, [DST_W :128]!
+ vrsra.u16 q8, q8, #8
+ vrsra.u16 q9, q9, #8
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+ add DUMMY, sp, #ARGS_STACK_OFFSET
+ vld1.32 {d3[0]}, [DUMMY]
+ vdup.8 d0, d3[0]
+ vdup.8 d1, d3[1]
+ vdup.8 d2, d3[2]
+ vdup.8 d3, d3[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+ pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+ FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+ 8, /* number of pixels, processed in a single block */ \
+ 5, /* prefetch distance */ \
+ pixman_composite_src_n_8_8888_init, \
+ pixman_composite_src_n_8_8888_cleanup, \
+ pixman_composite_src_n_8_8888_process_pixblock_head, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail, \
+ pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
+
+/******************************************************************************/
+
.macro pixman_composite_over_n_8_8888_process_pixblock_head
/* expecting deinterleaved source data in {d8, d9, d10, d11} */
/* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index effb50b..3db9adf 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
uint8_t, 1, uint8_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
uint32_t, 1, uint32_t, 1)
@@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
PIXMAN_STD_FAST_PATH (SRC, pixbuf, pixbuf, a8b8g8r8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8r8g8b8, neon_composite_src_rpixbuf_8888),
PIXMAN_STD_FAST_PATH (SRC, rpixbuf, rpixbuf, a8b8g8r8, neon_composite_src_pixbuf_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, neon_composite_src_n_8_8888),
+ PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, neon_composite_src_n_8_8888),
+
PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8, neon_composite_over_n_8_8),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, neon_composite_over_n_8_0565),
PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, neon_composite_over_n_8_0565),
commit 26659de6cd2775c83a9a6e6660324d5baacf61f9
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Mon Sep 26 19:04:53 2011 +0900
ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 40.53 Mpix/s
after : 50.76 Mpix/s
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 82d248e..f7913ad 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -949,7 +949,7 @@ pixman_asm_function fname
vshrn.u32 d0, q0, #16
vshrn.u32 d1, q1, #16
vld1.32 {d2, d3}, [OUT, :128]
- pld [OUT, PF_OFFS]
+ pld [OUT, #(prefetch_offset * 4)]
vshrn.u32 d4, q2, #16
vshr.u16 q15, q12, #8
vshrn.u32 d5, q3, #16
@@ -1061,15 +1061,169 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_head
- bilinear_over_8888_8_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vld1.32 {d3}, [TMP2]
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2}, [TMP3], STRIDE
+ vld1.32 {d3}, [TMP3]
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22[0]}, [MASK]!
+ pld [MASK, #prefetch_offset]
+ vadd.u16 q12, q12, q13
+ vmovn.u16 d16, q0
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail
+ vshll.u16 q9, d6, #8
+ vshll.u16 q10, d2, #8
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vdup.32 d22, d17[1]
+ vmvn.8 d22, d22
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q0, q11, #8
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q0, q11
+ vqadd.u8 q9, q8, q9
+ vuzp.8 d18, d19
+ vuzp.8 d18, d19
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8_8888_process_pixblock_tail_head
- bilinear_over_8888_8_8888_process_pixblock_tail
- bilinear_over_8888_8_8888_process_pixblock_head
+ vshll.u16 q9, d6, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vshll.u16 q10, d2, #8
+ vld1.32 {d0}, [TMP1], STRIDE
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlsl.u16 q9, d6, d30
+ vmlsl.u16 q10, d2, d31
+ vld1.32 {d1}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmlal.u16 q9, d7, d30
+ vmlal.u16 q10, d3, d31
+ vld1.32 {d2}, [TMP2], STRIDE
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vld1.32 {d3}, [TMP2]
+ vdup.32 d22, d22[0]
+ vshrn.u32 d18, q9, #16
+ vshrn.u32 d19, q10, #16
+ vmull.u8 q2, d0, d28
+ vmull.u8 q3, d2, d28
+ vmovn.u16 d17, q9
+ vld1.32 {d18, d19}, [OUT, :128]
+ pld [OUT, #(prefetch_offset * 4)]
+ vmlal.u8 q2, d1, d29
+ vmlal.u8 q3, d3, d29
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vshll.u16 q0, d4, #8
+ vshll.u16 q1, d6, #8
+ vuzp.8 d16, d17
+ vuzp.8 d18, d19
+ vmlsl.u16 q0, d4, d30
+ vmlsl.u16 q1, d6, d31
+ vmull.u8 q10, d16, d22
+ vmull.u8 q11, d17, d22
+ vmlal.u16 q0, d5, d30
+ vmlal.u16 q1, d7, d31
+ vrsra.u16 q10, q10, #8
+ vrsra.u16 q11, q11, #8
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vrshrn.u16 d16, q10, #8
+ vrshrn.u16 d17, q11, #8
+ vld1.32 {d2}, [TMP3], STRIDE
+ vdup.32 d22, d17[1]
+ vld1.32 {d3}, [TMP3]
+ vmvn.8 d22, d22
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d4}, [TMP4], STRIDE
+ vmull.u8 q10, d18, d22
+ vmull.u8 q11, d19, d22
+ vld1.32 {d5}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q3, d2, d28
+ vrshr.u16 q9, q10, #8
+ vrshr.u16 q15, q11, #8
+ vmlal.u8 q3, d3, d29
+ vmull.u8 q1, d4, d28
+ vraddhn.u16 d18, q9, q10
+ vraddhn.u16 d19, q15, q11
+ vmlal.u8 q1, d5, d29
+ vshr.u16 q15, q12, #8
+ vqadd.u8 q9, q8, q9
+ vld1.32 {d22[0]}, [MASK]!
+ vuzp.8 d18, d19
+ vadd.u16 q12, q12, q13
+ vuzp.8 d18, d19
+ vmovn.u16 d16, q0
+ vst1.32 {d18, d19}, [OUT, :128]!
.endm
/* add_8888_8888 */
commit 4481920f405e47b3a92811a8cb06afbd37dee01b
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Wed Sep 21 15:52:13 2011 +0900
ARM: NEON: Instruction scheduling of bilinear over_8888_8888
Instructions are reordered to eliminate pipeline stalls and get
better memory access.
Performance of before/after on cortex-a8 @ 1GHz
<< 2000 x 2000 with scale factor close to 1.x >>
before : 50.43 Mpix/s
after : 61.09 Mpix/s
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 25bcb24..82d248e 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -893,15 +893,158 @@ pixman_asm_function fname
.endm
.macro bilinear_over_8888_8888_process_pixblock_head
- bilinear_over_8888_8888_process_four_pixels
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+
+ vld1.32 {d22}, [TMP1], STRIDE
+ vld1.32 {d23}, [TMP1]
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ vmull.u8 q8, d22, d28
+ vmlal.u8 q8, d23, d29
+
+ vld1.32 {d22}, [TMP2], STRIDE
+ vld1.32 {d23}, [TMP2]
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmull.u8 q9, d22, d28
+ vmlal.u8 q9, d23, d29
+
+ vld1.32 {d22}, [TMP3], STRIDE
+ vld1.32 {d23}, [TMP3]
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vmlal.u16 q0, d17, d30
+
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+
+ vshll.u16 q1, d18, #8
+ vmlsl.u16 q1, d18, d31
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail
+ vshll.u16 q2, d20, #8
+ vmlsl.u16 q2, d20, d30
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vmovn.u16 d7, q2
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vmvn.8 d4, d4
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vrshr.u16 q1, q11, #8
+ vrshr.u16 q10, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q10, q2
+ vqadd.u8 q3, q1, q3
+ vuzp.8 d6, d7
+ vuzp.8 d6, d7
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
.macro bilinear_over_8888_8888_process_pixblock_tail_head
- bilinear_over_8888_8888_process_pixblock_tail
- bilinear_over_8888_8888_process_pixblock_head
+ vshll.u16 q2, d20, #8
+ mov TMP1, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP1, asl #2
+ vmlsl.u16 q2, d20, d30
+ mov TMP2, X, asr #16
+ add X, X, UX
+ add TMP2, TOP, TMP2, asl #2
+ vmlal.u16 q2, d21, d30
+ vshll.u16 q3, d22, #8
+ vld1.32 {d20}, [TMP1], STRIDE
+ vmlsl.u16 q3, d22, d31
+ vmlal.u16 q3, d23, d31
+ vld1.32 {d21}, [TMP1]
+ vmull.u8 q8, d20, d28
+ vmlal.u8 q8, d21, d29
+ vshrn.u32 d0, q0, #16
+ vshrn.u32 d1, q1, #16
+ vld1.32 {d2, d3}, [OUT, :128]
+ pld [OUT, PF_OFFS]
+ vshrn.u32 d4, q2, #16
+ vshr.u16 q15, q12, #8
+ vld1.32 {d22}, [TMP2], STRIDE
+ vshrn.u32 d5, q3, #16
+ vmovn.u16 d6, q0
+ vld1.32 {d23}, [TMP2]
+ vmull.u8 q9, d22, d28
+ mov TMP3, X, asr #16
+ add X, X, UX
+ add TMP3, TOP, TMP3, asl #2
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP4, TOP, TMP4, asl #2
+ vmlal.u8 q9, d23, d29
+ vmovn.u16 d7, q2
+ vld1.32 {d22}, [TMP3], STRIDE
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vuzp.8 d6, d7
+ vuzp.8 d2, d3
+ vdup.32 d4, d7[1]
+ vld1.32 {d23}, [TMP3]
+ vmvn.8 d4, d4
+ vmull.u8 q10, d22, d28
+ vmlal.u8 q10, d23, d29
+ vmull.u8 q11, d2, d4
+ vmull.u8 q2, d3, d4
+ vshll.u16 q0, d16, #8
+ vmlsl.u16 q0, d16, d30
+ vrshr.u16 q1, q11, #8
+ vmlal.u16 q0, d17, d30
+ vrshr.u16 q8, q2, #8
+ vraddhn.u16 d2, q1, q11
+ vraddhn.u16 d3, q8, q2
+ pld [TMP4, PF_OFFS]
+ vld1.32 {d16}, [TMP4], STRIDE
+ vqadd.u8 q3, q1, q3
+ vld1.32 {d17}, [TMP4]
+ pld [TMP4, PF_OFFS]
+ vmull.u8 q11, d16, d28
+ vmlal.u8 q11, d17, d29
+ vuzp.8 d6, d7
+ vshll.u16 q1, d18, #8
+ vuzp.8 d6, d7
+ vmlsl.u16 q1, d18, d31
+ vadd.u16 q12, q12, q13
+ vmlal.u16 q1, d19, d31
+ vshr.u16 q15, q12, #8
+ vadd.u16 q12, q12, q13
+ vst1.32 {d6, d7}, [OUT, :128]!
.endm
/* over_8888_8_8888 */
commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Fri Sep 23 00:03:22 2011 +0900
ARM: NEON: Replace old bilinear scanline generator with new template
Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
be replaced with new template just by wrapping existing macros.
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 784e5df..25bcb24 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -582,198 +582,6 @@ fname:
bilinear_store_&dst_fmt 4, q2, q3
.endm
-.macro generate_bilinear_scanline_func_src_dst \
- fname, src_fmt, dst_fmt, op, \
- bpp_shift, prefetch_distance
-
Reply to: