[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

pixman: Changes to 'debian-experimental'



 ChangeLog                             |  198 +++++++
 Makefile.am                           |    4 
 configure.ac                          |    2 
 debian/changelog                      |    6 
 pixman/pixman-arm-neon-asm-bilinear.S |  922 ++++++++++++++++++++++++++++------
 pixman/pixman-arm-neon-asm.S          |  139 +++++
 pixman/pixman-arm-neon.c              |   10 
 pixman/pixman-gradient-walker.c       |  175 +-----
 pixman/pixman-image.c                 |   73 ++
 pixman/pixman-noop.c                  |    6 
 pixman/pixman-private.h               |   16 
 pixman/pixman-sse2.c                  |    2 
 12 files changed, 1242 insertions(+), 311 deletions(-)

New commits:
commit 39102f8b3e7f36ad912fc95596dcd0a61ae2bab0
Author: Cyril Brulebois <kibi@debian.org>
Date:   Tue Nov 1 12:29:25 2011 +0100

    Upload to experimental.

diff --git a/debian/changelog b/debian/changelog
index e2f7f36..7bebde7 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,8 +1,8 @@
-pixman (0.23.8-1) UNRELEASED; urgency=low
+pixman (0.23.8-1) experimental; urgency=low
 
   * New upstream release.
 
- -- Cyril Brulebois <kibi@debian.org>  Tue, 01 Nov 2011 12:28:45 +0100
+ -- Cyril Brulebois <kibi@debian.org>  Tue, 01 Nov 2011 12:29:16 +0100
 
 pixman (0.23.6-1) experimental; urgency=low
 

commit bfad5455b6885b09fb8a63a7384f077fc0a45741
Author: Cyril Brulebois <kibi@debian.org>
Date:   Tue Nov 1 12:28:58 2011 +0100

    Bump changelogs.

diff --git a/ChangeLog b/ChangeLog
index 6a10342..fa61d98 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,201 @@
+commit a0f1b565811388b0567c845b9b7063d5b93d325e
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Sat Oct 29 05:33:44 2011 -0400
+
+    Pre-release version bump to 0.23.8
+
+commit 498138c293a2abce44ce122114852f4e6c5b87fe
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Tue Oct 25 08:45:34 2011 -0400
+
+    Fix use of uninitialized fields reported by valgrind
+    
+    In pixman-noop.c and pixman-sse2.c, we are accessing
+    image->bits.width/height without first making sure the image is a bits
+    image. The warning is harmless because we never act on this
+    information without checking that the image is a8r8g8b8, but valgrind
+    does warn about it.
+    
+    In pixman-noop.c, just reorder the clauses in the if statement; in
+    pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag
+    set.
+
+commit 6131707e8fc39187d1d358481f7c57c57cfab206
+Merge: 3d4d705 ec7c9c2
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Thu Oct 20 09:13:12 2011 -0400
+
+    Merge branch 'gradients'
+
+commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Tue Oct 18 21:50:18 2011 +0900
+
+    ARM: NEON: Fix assembly typo error in src_n_8_8888
+    
+    Binutils 2.21 does not complain about a missing comma between the ARM
+    register and the alignment specifier in vld/vst instructions, but the
+    same typo causes a build error on binutils 2.20.
+
+commit 19f118f41f8725f22395d31eac5670cb350b55ec
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Mon Sep 26 18:33:27 2011 +0900
+
+    ARM: NEON: Standard fast path src_n_8_8
+    
+    Performance numbers of before/after on cortex-a8 @ 1GHz
+    
+    - before
+    L1:  28.05  L2:  28.26  M: 26.97 (  4.48%)  HT: 19.79  VT: 19.14  R: 17.61  RT:  9.88 ( 101Kops/s)
+    
+    - after
+    L1:1430.28  L2:1252.10  M:421.93 ( 75.48%)  HT:170.16  VT:138.03  R:145.86  RT: 35.51 ( 255Kops/s)
+
+commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Mon Sep 26 17:03:54 2011 +0900
+
+    ARM: NEON: Standard fast path src_n_8_8888
+    
+    Performance numbers of before/after on cortex-a8 @ 1GHz
+    
+    - before
+    L1:  32.39  L2:  31.79  M: 30.84 ( 13.77%)  HT: 21.58  VT: 19.75  R: 18.83  RT: 10.46 ( 106Kops/s)
+    
+    - after
+    L1: 516.25  L2: 372.00  M:193.49 ( 85.59%)  HT:136.93  VT:109.10  R:104.48  RT: 34.77 ( 253Kops/s)
+
+commit 26659de6cd2775c83a9a6e6660324d5baacf61f9
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Mon Sep 26 19:04:53 2011 +0900
+
+    ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
+    
+    Instructions are reordered to eliminate pipeline stalls and get
+    better memory access.
+    
+    Performance of before/after on cortex-a8 @ 1GHz
+    
+    << 2000 x 2000 with scale factor close to 1.x >>
+    before : 40.53 Mpix/s
+    after  : 50.76 Mpix/s
+
+commit 4481920f405e47b3a92811a8cb06afbd37dee01b
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Wed Sep 21 15:52:13 2011 +0900
+
+    ARM: NEON: Instruction scheduling of bilinear over_8888_8888
+    
+    Instructions are reordered to eliminate pipeline stalls and get
+    better memory access.
+    
+    Performance of before/after on cortex-a8 @ 1GHz
+    
+    << 2000 x 2000 with scale factor close to 1.x >>
+    before : 50.43 Mpix/s
+    after  : 61.09 Mpix/s
+
+commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Fri Sep 23 00:03:22 2011 +0900
+
+    ARM: NEON: Replace old bilinear scanline generator with new template
+    
+    Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
+    be replaced with new template just by wrapping existing macros.
+
+commit 6682b2b3597c9f431900bfe7b1b42dfbe006bae5
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Tue Sep 20 21:32:35 2011 +0900
+
+    ARM: NEON: Bilinear macro template for instruction scheduling
+    
+    This macro template takes 6 code blocks.
+    
+    1. process_last_pixel
+    2. process_two_pixels
+    3. process_four_pixels
+    4. process_pixblock_head
+    5. process_pixblock_tail
+    6. process_pixblock_tail_head
+    
+    process_last_pixel does not need to update the horizontal weight; this
+    is done by the template. The two- and four-pixel code blocks should
+    update the horizontal weight themselves. The head/tail/tail_head
+    blocks make up the unrolled core loop. You can apply instruction
+    scheduling to the tail_head blocks.
+    
+    You can also specify the size of the pixel block. Supported sizes are
+    4 and 8. If you want to use a mask, pass the BILINEAR_FLAG_USE_MASK
+    flag to the template; you can then use the MASK register. When using
+    the d8~d15 registers, pass BILINEAR_FLAG_USE_ALL_NEON_REGS to make sure
+    the registers are properly saved on the stack and later restored.
+
+commit b5e4355fa4973e3edd4abeb11bdc47c42371cc76
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date:   Tue Sep 20 19:46:25 2011 +0900
+
+    ARM: NEON: Some cleanup of bilinear scanline functions
+    
+    Use STRIDE, and do the initial horizontal weight update before
+    entering the interpolation loop. Add cache preload for mask and dst.
+
+commit ec7c9c2b6865b48b8bd14e4509538f8fcbe93463
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Fri Oct 14 09:04:48 2011 -0400
+
+    Simplify gradient_walker_reset()
+    
+    The code that searches for the closest color stop to the given
+    position is duplicated across the various repeat modes. Replace the
+    switch with two if/else constructions, and put the search code between
+    them.
+
+commit 2d0da8ab8d8fef60ed1bbb9d6b75f66577c3f85d
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Fri Oct 14 09:02:14 2011 -0400
+
+    Use sentinels instead of special casing first and last stops
+    
+    When storing the gradient stops internally, allocate two more stops,
+    one before the beginning of the stop list and one after the
+    end. Initialize those stops based on the repeat property of the
+    gradient.
+    
+    This allows gradient_walker_reset() to be simplified because it can
+    now simply pick the two closest stops to the position without special
+    casing the first and last stops.
+
+commit 84d6ca7c891601b019d4862a556ed98b7e6fe525
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Fri Oct 14 07:42:00 2011 -0400
+
+    gradient walker: Correct types and fix formatting
+    
+    The type of pos in gradient_walker_reset() and gradient_walker_pixel()
+    is pixman_fixed_48_16_t and not pixman_fixed_32_32. The types of the
+    positions in the walker struct are pixman_fixed_t and not int32_t, and
+    need_reset is a boolean, not an integer. The spread field should be
+    called repeat and have the type pixman_repeat_t.
+    
+    Also fix some formatting issues, make gradient_walker_reset() static,
+    and delete the pointless PIXMAN_GRADIENT_WALKER_NEED_RESET() macro.
+
+commit ace225b53dee88d134753ac901f26ba3db6781da
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Tue Oct 11 16:12:24 2011 -0400
+
+    Add stable release / development snapshot to draft release notes
+    
+    This will hopefully serve as a reminder to me that I should put this
+    information in the release notes.
+
+commit bb7142d361d56d66ac40debb60a7c4d099764ba8
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date:   Tue Oct 11 06:10:39 2011 -0400
+
+    Post-release version bump to 0.23.7
+
 commit e20ac40bd30484f0f711b52d0c1993ef08760284
 Author: Søren Sandmann Pedersen <ssp@redhat.com>
 Date:   Tue Oct 11 06:00:51 2011 -0400
diff --git a/debian/changelog b/debian/changelog
index af38044..e2f7f36 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+pixman (0.23.8-1) UNRELEASED; urgency=low
+
+  * New upstream release.
+
+ -- Cyril Brulebois <kibi@debian.org>  Tue, 01 Nov 2011 12:28:45 +0100
+
 pixman (0.23.6-1) experimental; urgency=low
 
   [ Rico Tzschichholz ]

commit a0f1b565811388b0567c845b9b7063d5b93d325e
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date:   Sat Oct 29 05:33:44 2011 -0400

    Pre-release version bump to 0.23.8

diff --git a/configure.ac b/configure.ac
index 6c88c84..0552563 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
 
 m4_define([pixman_major], 0)
 m4_define([pixman_minor], 23)
-m4_define([pixman_micro], 7)
+m4_define([pixman_micro], 8)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 

commit 498138c293a2abce44ce122114852f4e6c5b87fe
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date:   Tue Oct 25 08:45:34 2011 -0400

    Fix use of uninitialized fields reported by valgrind
    
    In pixman-noop.c and pixman-sse2.c, we are accessing
    image->bits.width/height without first making sure the image is a bits
    image. The warning is harmless because we never act on this
    information without checking that the image is a8r8g8b8, but valgrind
    does warn about it.
    
    In pixman-noop.c, just reorder the clauses in the if statement; in
    pixman-sse2.c require images to have the FAST_PATH_BITS_IMAGE flag
    set.

diff --git a/pixman/pixman-noop.c b/pixman/pixman-noop.c
index 906a491..f4012d8 100644
--- a/pixman/pixman-noop.c
+++ b/pixman/pixman-noop.c
@@ -76,12 +76,12 @@ noop_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
     {
 	iter->get_scanline = _pixman_iter_get_scanline_noop;
     }
-    else if ((iter->flags & ITER_NARROW)				&&
+    else if (image->common.extended_format_code == PIXMAN_a8r8g8b8	&&
+	     (iter->flags & ITER_NARROW)				&&
 	     (image->common.flags & FLAGS) == FLAGS			&&
 	     iter->x >= 0 && iter->y >= 0				&&
 	     iter->x + iter->width <= image->bits.width			&&
-	     iter->y + iter->height <= image->bits.height		&&
-	     image->common.extended_format_code == PIXMAN_a8r8g8b8)
+	     iter->y + iter->height <= image->bits.height)
     {
 	iter->buffer =
 	    image->bits.bits + iter->y * image->bits.rowstride + iter->x;
diff --git a/pixman/pixman-sse2.c b/pixman/pixman-sse2.c
index c419511..8adf541 100644
--- a/pixman/pixman-sse2.c
+++ b/pixman/pixman-sse2.c
@@ -5982,7 +5982,7 @@ sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
     int height = iter->height;
 
 #define FLAGS								\
-    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM)
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | FAST_PATH_BITS_IMAGE)
 
     if ((iter->flags & ITER_NARROW)				&&
 	(image->common.flags & FLAGS) == FLAGS			&&

commit 3d4d705d2ffa4aeab3dc02a23c2aadbea1374a3f
Author: Taekyun Kim <tkq.kim@samsung.com>
Date:   Tue Oct 18 21:50:18 2011 +0900

    ARM: NEON: Fix assembly typo error in src_n_8_8888
    
    Binutils 2.21 does not complain about a missing comma between the ARM
    register and the alignment specifier in vld/vst instructions, but the
    same typo causes a build error on binutils 2.20.

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index da8f054..87aae1d 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1260,7 +1260,7 @@ generate_composite_function \
                                     PF subges PF_CTL, PF_CTL, #0x10
     vmull.u8    q11, d24, d3
                                     PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
-        vst4.8      {d28, d29, d30, d31}, [DST_W :128]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W, :128]!
     vrsra.u16   q8, q8, #8
     vrsra.u16   q9, q9, #8
     vrsra.u16   q10, q10, #8

commit 19f118f41f8725f22395d31eac5670cb350b55ec
Author: Taekyun Kim <tkq.kim@samsung.com>
Date:   Mon Sep 26 18:33:27 2011 +0900

    ARM: NEON: Standard fast path src_n_8_8
    
    Performance numbers of before/after on cortex-a8 @ 1GHz
    
    - before
    L1:  28.05  L2:  28.26  M: 26.97 (  4.48%)  HT: 19.79  VT: 19.14  R: 17.61  RT:  9.88 ( 101Kops/s)
    
    - after
    L1:1430.28  L2:1252.10  M:421.93 ( 75.48%)  HT:170.16  VT:138.03  R:145.86  RT: 35.51 ( 255Kops/s)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 1db02db..da8f054 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1292,6 +1292,72 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_src_n_8_8_process_pixblock_head
+    vmull.u8    q0, d24, d16
+    vmull.u8    q1, d25, d16
+    vmull.u8    q2, d26, d16
+    vmull.u8    q3, d27, d16
+    vrsra.u16   q0, q0,  #8
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail
+    vrshrn.u16  d28, q0, #8
+    vrshrn.u16  d29, q1, #8
+    vrshrn.u16  d30, q2, #8
+    vrshrn.u16  d31, q3, #8
+.endm
+
+.macro pixman_composite_src_n_8_8_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q0, #8
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q1, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q2, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q3, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q0,  d24, d16
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q1,  d25, d16
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q2,  d26, d16
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q3,  d27, d16
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst1.8      {d28, d29, d30, d31}, [DST_W, :128]!
+    vrsra.u16   q0, q0,  #8
+    vrsra.u16   q1, q1,  #8
+    vrsra.u16   q2, q2,  #8
+    vrsra.u16   q3, q3,  #8
+.endm
+
+.macro pixman_composite_src_n_8_8_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d16[0]}, [DUMMY]
+    vdup.8      d16, d16[3]
+.endm
+
+.macro pixman_composite_src_n_8_8_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8_asm_neon, 0, 8, 8, \
+    FLAG_DST_WRITEONLY, \
+    32, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8_init, \
+    pixman_composite_src_n_8_8_cleanup, \
+    pixman_composite_src_n_8_8_process_pixblock_head, \
+    pixman_composite_src_n_8_8_process_pixblock_tail, \
+    pixman_composite_src_n_8_8_process_pixblock_tail_head
+
+/******************************************************************************/
+
 .macro pixman_composite_over_n_8_8888_process_pixblock_head
     /* expecting deinterleaved source data in {d8, d9, d10, d11} */
     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 3db9adf..ca139de 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -92,6 +92,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8,
+                                      uint8_t, 1, uint8_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
                                      uint32_t, 1, uint32_t, 1)
@@ -295,6 +297,7 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8r8g8b8, neon_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8b8g8r8, neon_composite_src_n_8_8888),
     PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8,       neon_composite_src_n_8_8),
 
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),

commit 4db9e2bc13d3ed26416f249e57acec4b41f58b7f
Author: Taekyun Kim <tkq.kim@samsung.com>
Date:   Mon Sep 26 17:03:54 2011 +0900

    ARM: NEON: Standard fast path src_n_8_8888
    
    Performance numbers of before/after on cortex-a8 @ 1GHz
    
    - before
    L1:  32.39  L2:  31.79  M: 30.84 ( 13.77%)  HT: 21.58  VT: 19.75  R: 18.83  RT: 10.46 ( 106Kops/s)
    
    - after
    L1: 516.25  L2: 372.00  M:193.49 ( 85.59%)  HT:136.93  VT:109.10  R:104.48  RT: 34.77 ( 253Kops/s)

diff --git a/pixman/pixman-arm-neon-asm.S b/pixman/pixman-arm-neon-asm.S
index 3fcd07d..1db02db 100644
--- a/pixman/pixman-arm-neon-asm.S
+++ b/pixman/pixman-arm-neon-asm.S
@@ -1219,6 +1219,79 @@ generate_composite_function \
 
 /******************************************************************************/
 
+.macro pixman_composite_src_n_8_8888_process_pixblock_head
+    /* expecting solid source in {d0, d1, d2, d3} */
+    /* mask is in d24 (d25, d26, d27 are unused) */
+
+    /* in */
+    vmull.u8    q8, d24, d0
+    vmull.u8    q9, d24, d1
+    vmull.u8    q10, d24, d2
+    vmull.u8    q11, d24, d3
+    vrsra.u16   q8, q8, #8
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail
+    vrshrn.u16  d28, q8, #8
+    vrshrn.u16  d29, q9, #8
+    vrshrn.u16  d30, q10, #8
+    vrshrn.u16  d31, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_process_pixblock_tail_head
+    fetch_mask_pixblock
+                                    PF add PF_X, PF_X, #8
+        vrshrn.u16  d28, q8, #8
+                                    PF tst PF_CTL, #0x0F
+        vrshrn.u16  d29, q9, #8
+                                    PF addne PF_X, PF_X, #8
+        vrshrn.u16  d30, q10, #8
+                                    PF subne PF_CTL, PF_CTL, #1
+        vrshrn.u16  d31, q11, #8
+                                    PF cmp PF_X, ORIG_W
+    vmull.u8    q8, d24, d0
+                                    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
+    vmull.u8    q9, d24, d1
+                                    PF subge PF_X, PF_X, ORIG_W
+    vmull.u8    q10, d24, d2
+                                    PF subges PF_CTL, PF_CTL, #0x10
+    vmull.u8    q11, d24, d3
+                                    PF ldrgeb DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
+        vst4.8      {d28, d29, d30, d31}, [DST_W :128]!
+    vrsra.u16   q8, q8, #8
+    vrsra.u16   q9, q9, #8
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+.endm
+
+.macro pixman_composite_src_n_8_8888_init
+    add         DUMMY, sp, #ARGS_STACK_OFFSET
+    vld1.32     {d3[0]}, [DUMMY]
+    vdup.8      d0, d3[0]
+    vdup.8      d1, d3[1]
+    vdup.8      d2, d3[2]
+    vdup.8      d3, d3[3]
+.endm
+
+.macro pixman_composite_src_n_8_8888_cleanup
+.endm
+
+generate_composite_function \
+    pixman_composite_src_n_8_8888_asm_neon, 0, 8, 32, \
+    FLAG_DST_WRITEONLY | FLAG_DEINTERLEAVE_32BPP, \
+    8, /* number of pixels, processed in a single block */ \
+    5, /* prefetch distance */ \
+    pixman_composite_src_n_8_8888_init, \
+    pixman_composite_src_n_8_8888_cleanup, \
+    pixman_composite_src_n_8_8888_process_pixblock_head, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail, \
+    pixman_composite_src_n_8_8888_process_pixblock_tail_head, \
+
+/******************************************************************************/
+
 .macro pixman_composite_over_n_8_8888_process_pixblock_head
     /* expecting deinterleaved source data in {d8, d9, d10, d11} */
     /* d8 - blue, d9 - green, d10 - red, d11 - alpha */
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index effb50b..3db9adf 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -90,6 +90,8 @@ PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8,
                                       uint8_t, 1, uint8_t, 1)
 PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, neon, add_n_8_8888,
                                       uint8_t, 1, uint32_t, 1)
+PIXMAN_ARM_BIND_FAST_PATH_N_MASK_DST (0, neon, src_n_8_8888,
+                                      uint8_t, 1, uint32_t, 1)
 
 PIXMAN_ARM_BIND_FAST_PATH_SRC_N_DST (SKIP_ZERO_MASK, neon, over_8888_n_8888,
                                      uint32_t, 1, uint32_t, 1)
@@ -289,6 +291,11 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC,  pixbuf,   pixbuf,   a8b8g8r8, neon_composite_src_rpixbuf_8888),
     PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8r8g8b8, neon_composite_src_rpixbuf_8888),
     PIXMAN_STD_FAST_PATH (SRC,  rpixbuf,  rpixbuf,  a8b8g8r8, neon_composite_src_pixbuf_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8r8g8b8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       a8b8g8r8, neon_composite_src_n_8_8888),
+    PIXMAN_STD_FAST_PATH (SRC,  solid,    a8,       x8b8g8r8, neon_composite_src_n_8_8888),
+
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       a8,       neon_composite_over_n_8_8),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       r5g6b5,   neon_composite_over_n_8_0565),
     PIXMAN_STD_FAST_PATH (OVER, solid,    a8,       b5g6r5,   neon_composite_over_n_8_0565),

commit 26659de6cd2775c83a9a6e6660324d5baacf61f9
Author: Taekyun Kim <tkq.kim@samsung.com>
Date:   Mon Sep 26 19:04:53 2011 +0900

    ARM: NEON: Instruction scheduling of bilinear over_8888_8_8888
    
    Instructions are reordered to eliminate pipeline stalls and get
    better memory access.
    
    Performance of before/after on cortex-a8 @ 1GHz
    
    << 2000 x 2000 with scale factor close to 1.x >>
    before : 40.53 Mpix/s
    after  : 50.76 Mpix/s

diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 82d248e..f7913ad 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -949,7 +949,7 @@ pixman_asm_function fname
     vshrn.u32   d0, q0, #16
     vshrn.u32   d1, q1, #16
     vld1.32     {d2, d3}, [OUT, :128]
-    pld         [OUT, PF_OFFS]
+    pld         [OUT, #(prefetch_offset * 4)]
     vshrn.u32   d4, q2, #16
     vshr.u16    q15, q12, #8
     vshrn.u32   d5, q3, #16
@@ -1061,15 +1061,169 @@ pixman_asm_function fname
 .endm
 
 .macro bilinear_over_8888_8_8888_process_pixblock_head
-    bilinear_over_8888_8_8888_process_four_pixels
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+    vld1.32     {d0}, [TMP1], STRIDE
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+    vld1.32     {d1}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vld1.32     {d2}, [TMP2], STRIDE
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vld1.32     {d3}, [TMP2]
+    vmull.u8    q2, d0, d28
+    vmull.u8    q3, d2, d28
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+    vshll.u16   q0, d4, #8
+    vshll.u16   q1, d6, #8
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+    vshrn.u32   d0, q0, #16
+    vshrn.u32   d1, q1, #16
+    vld1.32     {d2}, [TMP3], STRIDE
+    vld1.32     {d3}, [TMP3]
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d4}, [TMP4], STRIDE
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #8
+    vld1.32     {d22[0]}, [MASK]!
+    pld         [MASK, #prefetch_offset]
+    vadd.u16    q12, q12, q13
+    vmovn.u16   d16, q0
 .endm
 
 .macro bilinear_over_8888_8_8888_process_pixblock_tail
+    vshll.u16   q9, d6, #8
+    vshll.u16   q10, d2, #8
+    vmlsl.u16   q9, d6, d30
+    vmlsl.u16   q10, d2, d31
+    vmlal.u16   q9, d7, d30
+    vmlal.u16   q10, d3, d31
+    vshr.u16    q15, q12, #8
+    vadd.u16    q12, q12, q13
+    vdup.32     d22, d22[0]
+    vshrn.u32   d18, q9, #16
+    vshrn.u32   d19, q10, #16
+    vmovn.u16   d17, q9
+    vld1.32     {d18, d19}, [OUT, :128]
+    pld         [OUT, PF_OFFS]
+    vuzp.8      d16, d17
+    vuzp.8      d18, d19
+    vuzp.8      d16, d17
+    vuzp.8      d18, d19
+    vmull.u8    q10, d16, d22
+    vmull.u8    q11, d17, d22
+    vrsra.u16   q10, q10, #8
+    vrsra.u16   q11, q11, #8
+    vrshrn.u16  d16, q10, #8
+    vrshrn.u16  d17, q11, #8
+    vdup.32     d22, d17[1]
+    vmvn.8      d22, d22
+    vmull.u8    q10, d18, d22
+    vmull.u8    q11, d19, d22
+    vrshr.u16   q9, q10, #8
+    vrshr.u16   q0, q11, #8
+    vraddhn.u16 d18, q9, q10
+    vraddhn.u16 d19, q0, q11
+    vqadd.u8    q9, q8, q9
+    vuzp.8      d18, d19
+    vuzp.8      d18, d19
+    vst1.32     {d18, d19}, [OUT, :128]!
 .endm
 
 .macro bilinear_over_8888_8_8888_process_pixblock_tail_head
-    bilinear_over_8888_8_8888_process_pixblock_tail
-    bilinear_over_8888_8_8888_process_pixblock_head
+                                            vshll.u16   q9, d6, #8
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vshll.u16   q10, d2, #8
+    vld1.32     {d0}, [TMP1], STRIDE
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlsl.u16   q9, d6, d30
+                                            vmlsl.u16   q10, d2, d31
+    vld1.32     {d1}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+                                            vmlal.u16   q9, d7, d30
+                                            vmlal.u16   q10, d3, d31
+    vld1.32     {d2}, [TMP2], STRIDE
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+                                            vshr.u16    q15, q12, #8
+                                            vadd.u16    q12, q12, q13
+    vld1.32     {d3}, [TMP2]
+                                            vdup.32     d22, d22[0]
+                                            vshrn.u32   d18, q9, #16
+                                            vshrn.u32   d19, q10, #16
+    vmull.u8    q2, d0, d28
+    vmull.u8    q3, d2, d28
+                                            vmovn.u16   d17, q9
+                                            vld1.32     {d18, d19}, [OUT, :128]
+                                            pld         [OUT, #(prefetch_offset * 4)]
+    vmlal.u8    q2, d1, d29
+    vmlal.u8    q3, d3, d29
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vshll.u16   q0, d4, #8
+    vshll.u16   q1, d6, #8
+                                            vuzp.8      d16, d17
+                                            vuzp.8      d18, d19
+    vmlsl.u16   q0, d4, d30
+    vmlsl.u16   q1, d6, d31
+                                            vmull.u8    q10, d16, d22
+                                            vmull.u8    q11, d17, d22
+    vmlal.u16   q0, d5, d30
+    vmlal.u16   q1, d7, d31
+                                            vrsra.u16   q10, q10, #8
+                                            vrsra.u16   q11, q11, #8
+    vshrn.u32   d0, q0, #16
+    vshrn.u32   d1, q1, #16
+                                            vrshrn.u16  d16, q10, #8
+                                            vrshrn.u16  d17, q11, #8
+    vld1.32     {d2}, [TMP3], STRIDE
+                                            vdup.32     d22, d17[1]
+    vld1.32     {d3}, [TMP3]
+                                            vmvn.8      d22, d22
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d4}, [TMP4], STRIDE
+                                            vmull.u8    q10, d18, d22
+                                            vmull.u8    q11, d19, d22
+    vld1.32     {d5}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q3, d2, d28
+                                            vrshr.u16   q9, q10, #8
+                                            vrshr.u16   q15, q11, #8
+    vmlal.u8    q3, d3, d29
+    vmull.u8    q1, d4, d28
+                                            vraddhn.u16 d18, q9, q10
+                                            vraddhn.u16 d19, q15, q11
+    vmlal.u8    q1, d5, d29
+    vshr.u16    q15, q12, #8
+                                            vqadd.u8    q9, q8, q9
+    vld1.32     {d22[0]}, [MASK]!
+                                            vuzp.8      d18, d19
+    vadd.u16    q12, q12, q13
+                                            vuzp.8      d18, d19
+    vmovn.u16   d16, q0
+                                            vst1.32     {d18, d19}, [OUT, :128]!
 .endm
 
 /* add_8888_8888 */

commit 4481920f405e47b3a92811a8cb06afbd37dee01b
Author: Taekyun Kim <tkq.kim@samsung.com>
Date:   Wed Sep 21 15:52:13 2011 +0900

    ARM: NEON: Instruction scheduling of bilinear over_8888_8888
    
    Instructions are reordered to eliminate pipeline stalls and get
    better memory access.
    
    Performance of before/after on cortex-a8 @ 1GHz
    
    << 2000 x 2000 with scale factor close to 1.x >>
    before : 50.43 Mpix/s
    after  : 61.09 Mpix/s

diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 25bcb24..82d248e 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -893,15 +893,158 @@ pixman_asm_function fname
 .endm
 
 .macro bilinear_over_8888_8888_process_pixblock_head
-    bilinear_over_8888_8888_process_four_pixels
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+
+    vld1.32     {d22}, [TMP1], STRIDE
+    vld1.32     {d23}, [TMP1]
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    vmull.u8    q8, d22, d28
+    vmlal.u8    q8, d23, d29
+
+    vld1.32     {d22}, [TMP2], STRIDE
+    vld1.32     {d23}, [TMP2]
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmull.u8    q9, d22, d28
+    vmlal.u8    q9, d23, d29
+
+    vld1.32     {d22}, [TMP3], STRIDE
+    vld1.32     {d23}, [TMP3]
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+
+    vshll.u16   q0, d16, #8
+    vmlsl.u16   q0, d16, d30
+    vmlal.u16   q0, d17, d30
+
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d16}, [TMP4], STRIDE
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q11, d16, d28
+    vmlal.u8    q11, d17, d29
+
+    vshll.u16   q1, d18, #8
+    vmlsl.u16   q1, d18, d31
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #8
+    vadd.u16    q12, q12, q13
 .endm
 
 .macro bilinear_over_8888_8888_process_pixblock_tail
+    vshll.u16   q2, d20, #8
+    vmlsl.u16   q2, d20, d30
+    vmlal.u16   q2, d21, d30
+    vshll.u16   q3, d22, #8
+    vmlsl.u16   q3, d22, d31
+    vmlal.u16   q3, d23, d31
+    vshrn.u32   d0, q0, #16
+    vshrn.u32   d1, q1, #16
+    vld1.32     {d2, d3}, [OUT, :128]
+    pld         [OUT, PF_OFFS]
+    vshrn.u32   d4, q2, #16
+    vshr.u16    q15, q12, #8
+    vshrn.u32   d5, q3, #16
+    vmovn.u16   d6, q0
+    vmovn.u16   d7, q2
+    vuzp.8      d6, d7
+    vuzp.8      d2, d3
+    vuzp.8      d6, d7
+    vuzp.8      d2, d3
+    vdup.32     d4, d7[1]
+    vmvn.8      d4, d4
+    vmull.u8    q11, d2, d4
+    vmull.u8    q2, d3, d4
+    vrshr.u16   q1, q11, #8
+    vrshr.u16   q10, q2, #8
+    vraddhn.u16 d2, q1, q11
+    vraddhn.u16 d3, q10, q2
+    vqadd.u8    q3, q1, q3
+    vuzp.8      d6, d7
+    vuzp.8      d6, d7
+    vadd.u16    q12, q12, q13
+    vst1.32     {d6, d7}, [OUT, :128]!
 .endm
 
 .macro bilinear_over_8888_8888_process_pixblock_tail_head
-    bilinear_over_8888_8888_process_pixblock_tail
-    bilinear_over_8888_8888_process_pixblock_head
+                                            vshll.u16   q2, d20, #8
+    mov         TMP1, X, asr #16
+    add         X, X, UX
+    add         TMP1, TOP, TMP1, asl #2
+                                            vmlsl.u16   q2, d20, d30
+    mov         TMP2, X, asr #16
+    add         X, X, UX
+    add         TMP2, TOP, TMP2, asl #2
+                                            vmlal.u16   q2, d21, d30
+                                            vshll.u16   q3, d22, #8
+    vld1.32     {d20}, [TMP1], STRIDE
+                                            vmlsl.u16   q3, d22, d31
+                                            vmlal.u16   q3, d23, d31
+    vld1.32     {d21}, [TMP1]
+    vmull.u8    q8, d20, d28
+    vmlal.u8    q8, d21, d29
+                                            vshrn.u32   d0, q0, #16
+                                            vshrn.u32   d1, q1, #16
+                                            vld1.32     {d2, d3}, [OUT, :128]
+                                            pld         [OUT, PF_OFFS]
+                                            vshrn.u32   d4, q2, #16
+                                            vshr.u16    q15, q12, #8
+    vld1.32     {d22}, [TMP2], STRIDE
+                                            vshrn.u32   d5, q3, #16
+                                            vmovn.u16   d6, q0
+    vld1.32     {d23}, [TMP2]
+    vmull.u8    q9, d22, d28
+    mov         TMP3, X, asr #16
+    add         X, X, UX
+    add         TMP3, TOP, TMP3, asl #2
+    mov         TMP4, X, asr #16
+    add         X, X, UX
+    add         TMP4, TOP, TMP4, asl #2
+    vmlal.u8    q9, d23, d29
+                                            vmovn.u16   d7, q2
+    vld1.32     {d22}, [TMP3], STRIDE
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vuzp.8      d6, d7
+                                            vuzp.8      d2, d3
+                                            vdup.32     d4, d7[1]
+    vld1.32     {d23}, [TMP3]
+                                            vmvn.8      d4, d4
+    vmull.u8    q10, d22, d28
+    vmlal.u8    q10, d23, d29
+                                            vmull.u8    q11, d2, d4
+                                            vmull.u8    q2, d3, d4
+    vshll.u16   q0, d16, #8
+    vmlsl.u16   q0, d16, d30
+                                            vrshr.u16   q1, q11, #8
+    vmlal.u16   q0, d17, d30
+                                            vrshr.u16   q8, q2, #8
+                                            vraddhn.u16 d2, q1, q11
+                                            vraddhn.u16 d3, q8, q2
+    pld         [TMP4, PF_OFFS]
+    vld1.32     {d16}, [TMP4], STRIDE
+                                            vqadd.u8    q3, q1, q3
+    vld1.32     {d17}, [TMP4]
+    pld         [TMP4, PF_OFFS]
+    vmull.u8    q11, d16, d28
+    vmlal.u8    q11, d17, d29
+                                            vuzp.8      d6, d7
+    vshll.u16   q1, d18, #8
+                                            vuzp.8      d6, d7
+    vmlsl.u16   q1, d18, d31
+                                            vadd.u16    q12, q12, q13
+    vmlal.u16   q1, d19, d31
+    vshr.u16    q15, q12, #8
+    vadd.u16    q12, q12, q13
+                                            vst1.32     {d6, d7}, [OUT, :128]!
 .endm
 
 /* over_8888_8_8888 */

commit 1cd916f3a5ebeb943f66eecf0b8ce99af0b95d11
Author: Taekyun Kim <tkq.kim@samsung.com>
Date:   Fri Sep 23 00:03:22 2011 +0900

    ARM: NEON: Replace old bilinear scanline generator with new template
    
    Bilinear scanline functions in pixman-arm-neon-asm-bilinear.S can
    be replaced with the new template just by wrapping existing macros.

diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
index 784e5df..25bcb24 100644
--- a/pixman/pixman-arm-neon-asm-bilinear.S
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -582,198 +582,6 @@ fname:
     bilinear_store_&dst_fmt 4, q2, q3
 .endm
 
-.macro generate_bilinear_scanline_func_src_dst \
-                fname, src_fmt, dst_fmt, op, \
-                bpp_shift, prefetch_distance
-


Reply to: