pixman: Changes to 'debian-unstable'
ChangeLog | 579 ++++++++++++++++++++
Makefile.am | 7
configure.ac | 51 +
debian/changelog | 11
demos/tri-test.c | 2
pixman/Makefile.am | 1
pixman/Makefile.win32 | 6
pixman/pixman-access.c | 97 +++
pixman/pixman-arm-common.h | 90 +++
pixman/pixman-arm-neon-asm-bilinear.S | 768 ++++++++++++++++++++++++++
pixman/pixman-arm-neon-asm.S | 982 +++++++++++++++++++++++++++++++++-
pixman/pixman-arm-neon-asm.h | 17
pixman/pixman-arm-neon.c | 62 ++
pixman/pixman-arm-simd-asm.S | 66 +-
pixman/pixman-arm-simd.c | 9
pixman/pixman-bits-image.c | 20
pixman/pixman-conical-gradient.c | 7
pixman/pixman-fast-path.h | 432 ++++++++++++++
pixman/pixman-general.c | 58 --
pixman/pixman-image.c | 1
pixman/pixman-implementation.c | 46 -
pixman/pixman-linear-gradient.c | 16
pixman/pixman-private.h | 51 -
pixman/pixman-radial-gradient.c | 7
pixman/pixman-solid-fill.c | 17
pixman/pixman-sse2.c | 139 ++++
pixman/pixman-trap.c | 23
pixman/pixman.c | 6
pixman/pixman.h | 6
test/Makefile.am | 2
test/Makefile.win32 | 73 ++
test/affine-test.c | 6
test/blitters-test.c | 13
test/composite-traps-test.c | 8
test/composite.c | 60 +-
test/fetch-test.c | 63 +-
test/scaling-helpers-test.c | 93 +++
test/scaling-test.c | 6
test/stress-test.c | 41 +
test/trap-crasher.c | 20
test/utils.c | 19
test/utils.h | 5
42 files changed, 3679 insertions(+), 307 deletions(-)
New commits:
commit 2296b15c9d4d5002f354695992e12ac5d912677d
Author: Cyril Brulebois <kibi@debian.org>
Date: Fri Apr 29 17:53:20 2011 +0200
Upload to unstable.
diff --git a/debian/changelog b/debian/changelog
index a2680f6..b14d5e2 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-pixman (0.21.8-1) UNRELEASED; urgency=low
+pixman (0.21.8-1) unstable; urgency=low
* New upstream release.
* As seen in the upstream announcement: “When this version of pixman is
@@ -7,7 +7,7 @@ pixman (0.21.8-1) UNRELEASED; urgency=low
* This new release should fix the FTBFS on big endian machines, tests
were failing due to missing swapping (Closes: #622211).
- -- Cyril Brulebois <kibi@debian.org> Fri, 29 Apr 2011 17:52:08 +0200
+ -- Cyril Brulebois <kibi@debian.org> Fri, 29 Apr 2011 17:53:12 +0200
pixman (0.21.6-2) unstable; urgency=low
commit c48a9b803597eebd63b3a77f5cc65c7eb2f98fdf
Author: Cyril Brulebois <kibi@debian.org>
Date: Fri Apr 29 17:53:09 2011 +0200
Mention endianness-related FTBFS fix (Closes: #622211).
diff --git a/debian/changelog b/debian/changelog
index a5fdd88..a2680f6 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -4,6 +4,8 @@ pixman (0.21.8-1) UNRELEASED; urgency=low
* As seen in the upstream announcement: “When this version of pixman is
used with the git version of the X server, trapezoid rendering will be
corrupted. This is a known bug in the X server.”
+ * This new release should fix the FTBFS on big endian machines, tests
+ were failing due to missing swapping (Closes: #622211).
-- Cyril Brulebois <kibi@debian.org> Fri, 29 Apr 2011 17:52:08 +0200
commit fa956ebd6b28216e5144cfdc87f44660256e1b1a
Author: Cyril Brulebois <kibi@debian.org>
Date: Fri Apr 29 17:52:36 2011 +0200
Bump changelogs.
diff --git a/ChangeLog b/ChangeLog
index 17896a2..69d93cb 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,582 @@
+commit 89868e93bd8d66f0fac0f0b42cf7718756992e4e
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Tue Apr 19 00:00:37 2011 -0400
+
+ Pre-release version bump to 0.21.8
+
+commit 33f1652b953467f3910605b3be723e21b3ebe078
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Wed Apr 13 11:57:35 2011 +0900
+
+ ARM: Enable bilinear fast paths using scanline functions in pixman-arm-neon-asm-bilinear.S
+
+ Enable fast paths which are supported by scanline functions in
+ pixman-arm-neon-asm-bilinear.S
+
+commit e8185f1cb43417d9f7b1d2856bb899f1b84fde81
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Wed Apr 13 11:48:40 2011 +0900
+
+ ARM: NEON scanline functions for bilinear scaling
+
+ General fetch->combine->store based bilinear scanline functions.
+ Need further optimizations and eventually will be replaced with optimal
+ functions one by one.
+ General functions should be located in pixman-arm-neon-asm-bilinear.S and
+ optimal functions in pixman-arm-neon-asm.S
+
+ Following general bilinear scanline functions are implemented
+ over_8888_8888
+ add_8888_8888
+ src_8888_8_8888
+ src_8888_8_0565
+ src_0565_8_x888
+ src_0565_8_0565
+ over_8888_8_8888
+ add_8888_8_8888
+
+commit 00939d35628e733fab63606cfb1d7fcb667860d3
+Author: Taekyun Kim <tkq.kim@samsung.com>
+Date: Wed Apr 13 11:43:44 2011 +0900
+
+ ARM: Common macro for scaled bilinear scanline function with A8 mask
+
+ Defining PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST macro for declaration of
+ scaled bilinear scanline functions in common header.
+
+commit b455496890f7f941d561c284aca14783300bedd6
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Fri Mar 11 07:52:57 2011 -0500
+
+ Offset rendering in pixman_composite_trapezoids() by (x_dst, y_dst)
+
+ Previously, this function would do coordinate calculations in such a
+ way that (x_dst, y_dst) would only affect the alignment of the source
+ image, but not of the traps, which would always be considered to be in
+ absolute destination coordinates. This is unlike the
+ pixman_image_composite() function which also registers the mask to the
+ destination.
+
+ This patch makes it so that traps are also offset by (x_dst, y_dst).
+
+ Also add a comment explaining how this function is supposed to
+ operate, and update tri-test.c and composite-traps-test.c to deal with
+ the new semantics.
+
+commit e75e6a4ef5c5a8ac8b0e8464f08f83fd2b6e86ed
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Sat Apr 2 23:24:48 2011 -0400
+
+ ARM: Add 'neon_composite_over_n_8888_0565_ca' fast path
+
+ This improves the performance of the firefox-talos-gfx benchmark with
+ the image16 backend. Benchmark on an 800 MHz ARM Cortex A8:
+
+ Before:
+
+ [ # ] backend test min(s) median(s) stddev. count
+ [ 0] image16 firefox-talos-gfx 121.773 122.218 0.15% 6/6
+
+ After:
+
+ [ # ] backend test min(s) median(s) stddev. count
+ [ 0] image16 firefox-talos-gfx 85.247 85.563 0.22% 6/6
+
+ V2: Slightly better instruction scheduling based on comments from Taekyun Kim.
+ V3: Eliminate all stalls from the inner loop. Also based on comments from Taekyun Kim.
+
+commit 1670b952143284f480c39ff087b5694a64eb7db3
+Author: Gilles Espinasse <g.esp@free.fr>
+Date: Tue Apr 12 22:44:56 2011 +0200
+
+ Fix OpenMP not supported case
+
+ PIXMAN_LINK_WITH_ENV did not fail unless -Wall -Werror is used.
+ So even when the compiler did not support OpenMP, USE_OPENMP was defined.
+ Fix that by running the second OpenMP test only when the first AC_OPENMP finds support
+
+ configure tested in the cases :
+ gcc without libgomp support, no openmp option, --enable-openmp and --disable-openmp
+ gcc with libgomp support, no openmp option, --enable-openmp and --disable-openmp
+
+ Not tested with autoconf version not knowing openmp (<2.62)
+
+ Warn when --enable-openmp is requested but no support is found
+
+ Signed-off-by: Gilles Espinasse <g.esp@free.fr>
+
+commit b9e8f7fb7494e4ee4be56d1555632233a494b28e
+Author: Gilles Espinasse <g.esp@free.fr>
+Date: Tue Apr 12 22:44:25 2011 +0200
+
+ Fix missing AC_MSG_RESULT value from Werror test
+
+ Use the correct variable name
+
+ Signed-off-by: Gilles Espinasse <g.esp@free.fr>
+
+commit caae4e82ffdeebfb9aa98a6c49dd563e065c0959
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon Mar 21 20:25:27 2011 +0200
+
+ ARM: pipelined NEON implementation of bilinear scaled 'src_8888_0565'
+
+ Benchmark on ARM Cortex-A8 r1p3 @600MHz, 32-bit LPDDR @166MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=10020565, speed=33.59 MPix/s
+ after: op=1, src=20028888, dst=10020565, speed=46.25 MPix/s
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=10020565, speed=63.86 MPix/s
+ after: op=1, src=20028888, dst=10020565, speed=84.22 MPix/s
+
+commit d080d59b802c351daed84b92bd4eb20c775b81c7
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 16 17:24:49 2011 +0200
+
+ ARM: pipelined NEON implementation of bilinear scaled 'src_8888_8888'
+
+ Performance of the inner loop when working with the data in L1 cache:
+ ARM Cortex-A8: 41 cycles per 4 pixels (no stalls and partial dual issue)
+ ARM Cortex-A9: 48 cycles per 4 pixels (no stalls)
+
+ It might be still possible to improve performance even more on ARM Cortex-A8
+ with a better use of dual issue.
+
+ Benchmark on ARM Cortex-A8 r1p3 @600MHz, 32-bit LPDDR @166MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=40.38 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=48.47 MPix/s
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=79.68 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=93.11 MPix/s
+
+commit b496a8b279baebb8b9ab4fbcb2101583be08fe3b
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Thu Mar 17 19:42:01 2011 +0200
+
+ ARM: support different levels of loop unrolling in bilinear scaler
+
+ Now an extra 'flag' parameter is supported in bilinear scanline scaling
+ function generation macro. It can be used to enable 4 or 8 pixels per
+ loop iteration unrolling and provide save/restore code for d8-d15
+ registers.
+
+commit 34ca9cf03fa897cd377cdb19acc22e876b2f4b0e
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon Mar 21 18:41:53 2011 +0200
+
+ ARM: use less ARM instructions in NEON bilinear scaling code
+
+ This reduces code size and also puts less pressure on the
+ instruction decoder.
+
+commit 0f7be9f72ef6bfe2555b7f2cc29297c4f4762740
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 16 16:33:41 2011 +0200
+
+ ARM: support for software pipelining in bilinear macros
+
+ Now it's possible to override the main loop of bilinear scaling code
+ with optimized pipelined implementation.
+
+commit 9638af95832563040d6bd861cf4c20ab632058df
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Thu Mar 10 16:12:23 2011 +0200
+
+ ARM: use aligned memory writes in NEON bilinear scaling code
+
+commit 8bba3a0e1e54f03ea78fb44314f3bfa57ec8da31
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Thu Mar 10 15:34:10 2011 +0200
+
+ ARM: tweaked horizontal weights update in NEON bilinear scaling code
+
+ Moving horizontal interpolation weights update instructions from the
+ beginning of loop to its end allows to hide some pipeline stalls and
+ improve performance.
+
+commit a2153222677327be43251012f462d19a7e98ce14
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Sun Apr 3 20:32:30 2011 -0400
+
+ ARM: Tiny improvement in over_n_8888_8888_ca_process_pixblock_head
+
+ Instead of two
+
+ mvn d24, d24
+ mvn d25, d25
+
+ use just one
+
+ mvn q12, q12
+
+ Also move another vmvn instruction into the created pipeline bubble,
+ as pointed out by Siarhei.
+
+commit 44f99735d9c6a897078db12172d9d2d07b204f37
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Sat Apr 2 14:12:12 2011 -0400
+
+ Makefile.am: Put development releases in "snapshots" directory
+
+ Up until now, all pixman releases, both snapshots and releases, were
+ uploaded to the "releases" directory on www.cairographics.org, but
+ it's better to put development snapshots in the "snapshots" directory.
+
+ This patch changes Makefile.am to do that.
+
+commit ad3cbfb073fc325e1b3152898ca71b8255675957
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Tue Mar 22 13:42:05 2011 -0400
+
+ test: Fix infinite loop in composite
+
+ When run in PIXMAN_RANDOMIZE_TESTS mode, this test would go into an
+ infinite loop because the loop started at 'seed' but the stop
+ condition was still N_TESTS.
+
+commit b514e63cfc58af21f7097db5a1b04292a758782a
+Author: Alexandros Frantzis <alexandros.frantzis@linaro.org>
+Date: Fri Mar 18 14:37:27 2011 +0200
+
+ Add support for the r8g8b8a8 and r8g8b8x8 formats to the tests.
+
+commit f05a90e5f8d1d0af60e2c684cbe9f1327c33135a
+Author: Alexandros Frantzis <alexandros.frantzis@linaro.org>
+Date: Fri Mar 18 14:36:15 2011 +0200
+
+ Add simple support for the r8g8b8a8 and r8g8b8x8 formats.
+
+ This format is particularly useful on big-endian architectures, where RGBA in
+ memory/file order corresponds to r8g8b8a8 as an uint32_t. This is important
+ because RGBA is in some cases the only available choice (for example as a pixel
+ format in OpenGL ES 2.0).
+
+commit 7eb0abb5e819046537b9f809c7ec332c6679c557
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Mon Mar 14 14:56:22 2011 -0400
+
+ test: Randomize some tests if PIXMAN_RANDOMIZE_TESTS is set
+
+ This patch makes it so that composite and stress-test will start from a
+ random seed if the PIXMAN_RANDOMIZE_TESTS environment variable is
+ set. Running the test suite in this mode is useful to get more test
+ coverage.
+
+ Also, in stress-test.c make it so that setting the initial seed causes
+ threads to be turned off. This makes it much easier to see when
+ something fails.
+
+commit 6b27768d81c254a4f1d05473157328d5a5d99b9c
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Sat Mar 12 19:42:58 2011 -0500
+
+ Simplify the prototype for iterator initializers.
+
+ All of the information previously passed to the iterator initializers
+ is now available in the iterator itself, so there is no need to pass
+ it as arguments anymore.
+
+commit 74d0f44b6d6d613d24541b849835da0464cc6fd0
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Sat Mar 12 19:12:35 2011 -0500
+
+ Fill out parts of iters in _pixman_implementation_{src,dest}_iter_init()
+
+ This makes _pixman_implementation_{src,dest}_iter_init() responsible
+ for filling parts of the information in the iterators. Specifically,
+ the information passed as arguments is stored in the iterator.
+
+ Also add a height field to pixman_iter_t().
+
+commit be4eaa0e4f79af38b7b89c5b09ca88d3a88d9396
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Sat Mar 12 19:06:02 2011 -0500
+
+ In delegate_{src,dest}_iter_init() call delegate directly.
+
+ There is no reason to go through
+ _pixman_implementation_{src,dest}_iter_init(), especially since
+ _pixman_implementation_src_iter_init() is doing various other checks
+ that only need to be done once.
+
+ Also call delegate->src_iter_init() directly in pixman-sse2.c
+
+commit 70a923882ca24664344ba91a649e7aa12c3063f7
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 9 13:55:48 2011 +0200
+
+ ARM: a bit faster NEON bilinear scaling for r5g6b5 source images
+
+ Instructions scheduling improved in the code responsible for fetching r5g6b5
+ pixels and converting them to the intermediate x8r8g8b8 color format used in
+ the interpolation part of code. Still a lot of NEON stalls are remaining,
+ which can be resolved later by the use of pipelining.
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s
+ op=1, src=10020565, dst=20020888, speed=36.82 MPix/s
+ after: op=1, src=10020565, dst=10020565, speed=41.35 MPix/s
+ op=1, src=10020565, dst=20020888, speed=49.16 MPix/s
+
+commit fe99673719091d4a880d031add1369332a75731b
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 9 13:27:41 2011 +0200
+
+ ARM: NEON optimization for bilinear scaled 'src_0565_0565'
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=10020565, dst=10020565, speed=3.30 MPix/s
+ after: op=1, src=10020565, dst=10020565, speed=32.29 MPix/s
+
+commit 29003c3befe2159396d181ef9ac1caaadcabf382
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 9 13:21:53 2011 +0200
+
+ ARM: NEON optimization for bilinear scaled 'src_0565_x888'
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=10020565, dst=20020888, speed=3.39 MPix/s
+ after: op=1, src=10020565, dst=20020888, speed=36.82 MPix/s
+
+commit 2ee27e7d79637da9173ee1bf3423e5a81534ccb4
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 9 11:53:04 2011 +0200
+
+ ARM: NEON optimization for bilinear scaled 'src_8888_0565'
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=10020565, speed=6.56 MPix/s
+ after: op=1, src=20028888, dst=10020565, speed=61.65 MPix/s
+
+commit 11a0c5badbc59ce967707ef836313cc98f8aec4e
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 9 11:46:48 2011 +0200
+
+ ARM: use common macro template for bilinear scaled 'src_8888_8888'
+
+ This is a cleanup for old and now duplicated code. The performance improvement
+ is mostly coming from the enabled use of software prefetch, but instructions
+ scheduling is also slightly better.
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=53.24 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=74.36 MPix/s
+
+commit 34098dba6763afd3636a14f9c2a079ab08f23b2d
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 9 11:34:15 2011 +0200
+
+ ARM: NEON: common macro template for bilinear scanline scalers
+
+ This allows to generate bilinear scanline scaling functions targeting
+ various source and destination color formats. Right now a8r8g8b8/x8r8g8b8
+ and r5g6b5 color formats are supported. More formats can be added if needed.
+
+commit 66f4ee1b3bccf4516433d61dbf2035551a712fa2
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Wed Mar 9 10:59:46 2011 +0200
+
+ ARM: new bilinear fast path template macro in 'pixman-arm-common.h'
+
+ It can be reused in different ARM NEON bilinear scaling fast path functions.
+
+commit 5921c17639fe5fdc595c850e3347281c1c8746ba
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Sun Mar 6 22:16:32 2011 +0200
+
+ ARM: assembly optimized nearest scaled 'src_8888_8888'
+
+ Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=44.36 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=39.79 MPix/s
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=102.36 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=163.12 MPix/s
+
+commit f3e17872f5522e25da8e32de83e62bee8cc198d7
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon Mar 7 03:10:43 2011 +0200
+
+ ARM: common macro for nearest scaling fast paths
+
+ The code of nearest scaled 'src_0565_0565' function was generalized
+ and moved to a common macro, so that it can be reused for other
+ fast paths.
+
+commit bb3d1b67fd0f42ae00af811c624ea1c44541034d
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Sun Mar 6 16:17:12 2011 +0200
+
+ ARM: use prefetch in nearest scaled 'src_0565_0565'
+
+ Benchmark on ARM Cortex-A8 r1p3 @500MHz, 32-bit LPDDR @166MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=10020565, dst=10020565, speed=75.02 MPix/s
+ after: op=1, src=10020565, dst=10020565, speed=73.63 MPix/s
+
+ Benchmark on ARM Cortex-A8 r2p2 @1GHz, 32-bit LPDDR @200MHz:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=10020565, dst=10020565, speed=176.12 MPix/s
+ after: op=1, src=10020565, dst=10020565, speed=267.50 MPix/s
+
+commit 84e361c8e357e26f299213fbeefe64c73447b116
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Fri Mar 4 15:51:18 2011 -0500
+
+ test: Do endian swapping of the source and destination images.
+
+ Otherwise the test fails on big endian. Fix for bug 34767, reported by
+ Siarhei Siamashka.
+
+commit 84f3c5a71a2de1a96dcf0c7f9ab0a8ee1b1b158f
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Mon Mar 7 13:45:54 2011 -0500
+
+ test: In image_endian_swap() use pixman_image_get_format() to get the bpp.
+
+ There is no reason to pass in the bpp as an argument; it can be gotten
+ directly from the image.
+
+commit 17feaa9c50bb8521b0366345efe181bd99754957
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Tue Feb 22 18:45:03 2011 +0200
+
+ ARM: NEON optimization for bilinear scaled 'src_8888_8888'
+
+ Initial NEON optimization for bilinear scaling. Can be probably
+ improved more.
+
+ Benchmark on ARM Cortex-A8:
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=6.70 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=44.27 MPix/s
+
+commit 350029396d911941591149cc82b5e68a78ad6747
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon Feb 21 20:18:02 2011 +0200
+
+ SSE2 optimization for bilinear scaled 'src_8888_8888'
+
+ A primitive naive implementation of bilinear scaling using SSE2 intrinsics,
+ which only handles one pixel at a time. It is approximately 2x faster than
+ pixman general compositing path. Single pass processing without intermediate
+ temporary buffer contributes to ~15% and loop unrolling contributes to ~20%
+ of this speedup.
+
+ Benchmark on Intel Core i7 (x86-64):
+ Using cairo-perf-trace:
+ before: image firefox-planet-gnome 12.566 12.610 0.23% 6/6
+ after: image firefox-planet-gnome 10.961 11.013 0.19% 5/6
+
+ Microbenchmark (scaling 2000x2000 image with scale factor close to 1x):
+ before: op=1, src=20028888, dst=20028888, speed=70.48 MPix/s
+ after: op=1, src=20028888, dst=20028888, speed=165.38 MPix/s
+
+commit 0df43b8ae5031dd83775d00b57b6bed809db0e89
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon Feb 21 02:07:09 2011 +0200
+
+ test: check correctness of 'bilinear_pad_repeat_get_scanline_bounds'
+
+ Individual correctness check for the new bilinear scaling related
+ supplementary function. This test program uses a bit wider range
+ of input arguments, not covered by other tests.
+
+commit d506bf68fd0e9a1c5dd484daee70631699918387
+Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
+Date: Mon Feb 21 01:29:02 2011 +0200
+
+ Main loop template for fast single pass bilinear scaling
+
+ Can be used for implementing SIMD optimized fast path
+ functions which work with bilinear scaled source images.
+
+ Similar to the template for nearest scaling main loop, the
+ following types of mask are supported:
+ 1. no mask
+ 2. non-scaled a8 mask with SAMPLES_COVER_CLIP flag
+ 3. solid mask
+
+ PAD repeat is fully supported. NONE repeat is partially
+ supported (right now only works if source image has alpha
+ channel or when alpha channel of the source image does not
+ have any effect on the compositing operation).
+
+commit 9ebde285fa990bfa1524f166fbfb1368c346b14a
+Author: Andrea Canciani <ranma42@gmail.com>
+Date: Thu Feb 24 12:53:39 2011 +0100
+
+ test: Silence MSVC warnings
+
+ MSVC does not notice non-returning functions (abort() / assert(0))
+ and warns about paths which end with them in non-void functions:
+
+ c:\cygwin\home\ranma42\code\fdo\pixman\test\fetch-test.c(114) :
+ warning C4715: 'reader' : not all control paths return a value
+ c:\cygwin\home\ranma42\code\fdo\pixman\test\stress-test.c(133) :
+ warning C4715: 'real_reader' : not all control paths return a value
+ c:\cygwin\home\ranma42\code\fdo\pixman\test\composite.c(431) :
+ warning C4715: 'calc_op' : not all control paths return a value
+
+ These warnings can be silenced by adding a return after the
+ termination call.
+
+commit 8868778ea1fdc8e70da76b3b00ea78106c5840d8
+Author: Andrea Canciani <ranma42@gmail.com>
+Date: Tue Feb 22 22:43:48 2011 +0100
+
+ Do not include unused headers
+
+ pixman-combine32.h is included without being used both in
+ pixman-image.c and in pixman-general.c.
+
+commit 72f5e5f608506c18c484bc5bc3e58bd83aeb7691
+Author: Andrea Canciani <ranma42@gmail.com>
+Date: Tue Feb 22 22:04:49 2011 +0100
+
+ test: Add Makefile for Win32
+
+commit 11305b4ecdd36a17592c5c75de9157874853ab20
+Author: Andrea Canciani <ranma42@gmail.com>
+Date: Tue Feb 22 21:46:37 2011 +0100
+
+ test: Fix tests for compilation on Windows
+
+ The Microsoft C compiler cannot handle subobject initialization and
+ Win32 does not provide snprintf.
+
+ Work around these limitations by using normal struct initialization
+ and using sprintf (a manual check shows that the buffer size is
+ sufficient).
+
+commit 20ed723a5a42fb8636bc9a5f32974dec1b66a785
+Author: Andrea Canciani <ranma42@gmail.com>
+Date: Thu Feb 24 10:44:04 2011 +0100
+
+ Fix compilation on Win32
+
+ Makefile.win32 contained a typo and was missing the dependency from
+ the built sources.
+
+commit 48e951000c7ff14f40c671f3efb6abb18162c840
+Author: Søren Sandmann Pedersen <ssp@redhat.com>
+Date: Tue Feb 22 16:13:32 2011 -0500
+
+ Post-release version bump to 0.21.7
+
commit 8b3332166094db657e96c365a524b2cd7513359b
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue Feb 22 15:43:41 2011 -0500
diff --git a/debian/changelog b/debian/changelog
index e26a43b..a5fdd88 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,12 @@
+pixman (0.21.8-1) UNRELEASED; urgency=low
+
+ * New upstream release.
+ * As seen in the upstream announcement: “When this version of pixman is
+ used with the git version of the X server, trapezoid rendering will be
+ corrupted. This is a known bug in the X server.”
+
+ -- Cyril Brulebois <kibi@debian.org> Fri, 29 Apr 2011 17:52:08 +0200
+
pixman (0.21.6-2) unstable; urgency=low
* Upload to unstable.
commit 89868e93bd8d66f0fac0f0b42cf7718756992e4e
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue Apr 19 00:00:37 2011 -0400
Pre-release version bump to 0.21.8
diff --git a/configure.ac b/configure.ac
index 09a4948..0d51bd0 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
m4_define([pixman_major], 0)
m4_define([pixman_minor], 21)
-m4_define([pixman_micro], 7)
+m4_define([pixman_micro], 8)
m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
commit 33f1652b953467f3910605b3be723e21b3ebe078
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Wed Apr 13 11:57:35 2011 +0900
ARM: Enable bilinear fast paths using scanline functions in pixman-arm-neon-asm-bilinear.S
Enable fast paths which are supported by scanline functions in
pixman-arm-neon-asm-bilinear.S
diff --git a/pixman/pixman-arm-neon.c b/pixman/pixman-arm-neon.c
index 77875ad..e5127a6 100644
--- a/pixman/pixman-arm-neon.c
+++ b/pixman/pixman-arm-neon.c
@@ -137,6 +137,23 @@ PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_x888, SRC,
uint16_t, uint32_t)
PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (0, neon, 0565_0565, SRC,
uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, OVER,
+ uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_DST (SKIP_ZERO_SRC, neon, 8888_8888, ADD,
+ uint32_t, uint32_t)
+
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_8888, SRC,
+ uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 8888_8_0565, SRC,
+ uint32_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_x888, SRC,
+ uint16_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (0, neon, 0565_8_0565, SRC,
+ uint16_t, uint16_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, OVER,
+ uint32_t, uint32_t)
+PIXMAN_ARM_BIND_SCALED_BILINEAR_SRC_A8_DST (SKIP_ZERO_SRC, neon, 8888_8_8888, ADD,
+ uint32_t, uint32_t)
void
pixman_composite_src_n_8_asm_neon (int32_t w,
@@ -366,6 +383,28 @@ static const pixman_fast_path_t arm_neon_fast_paths[] =
SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_x888),
SIMPLE_BILINEAR_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_0565),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+ SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8888),
+ SIMPLE_BILINEAR_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8888),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, a8r8g8b8, r5g6b5, neon_8888_8_0565),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, x8r8g8b8, r5g6b5, neon_8888_8_0565),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, x8r8g8b8, neon_0565_8_x888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (SRC, r5g6b5, r5g6b5, neon_0565_8_0565),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, a8r8g8b8, neon_8888_8_8888),
+ SIMPLE_BILINEAR_A8_MASK_FAST_PATH (ADD, a8r8g8b8, x8r8g8b8, neon_8888_8_8888),
+
{ PIXMAN_OP_NONE },
};
commit e8185f1cb43417d9f7b1d2856bb899f1b84fde81
Author: Taekyun Kim <tkq.kim@samsung.com>
Date: Wed Apr 13 11:48:40 2011 +0900
ARM: NEON scanline functions for bilinear scaling
General fetch->combine->store based bilinear scanline functions.
Need further optimizations and eventually will be replaced with optimal
functions one by one.
General functions should be located in pixman-arm-neon-asm-bilinear.S and
optimal functions in pixman-arm-neon-asm.S
Following general bilinear scanline functions are implemented
over_8888_8888
add_8888_8888
src_8888_8_8888
src_8888_8_0565
src_0565_8_x888
src_0565_8_0565
over_8888_8_8888
add_8888_8_8888
diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index d016e9f..be08266 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -115,6 +115,7 @@ libpixman_arm_neon_la_SOURCES = \
pixman-arm-neon.c \
pixman-arm-common.h \
pixman-arm-neon-asm.S \
+ pixman-arm-neon-asm-bilinear.S \
pixman-arm-neon-asm.h
libpixman_arm_neon_la_CFLAGS = $(DEP_CFLAGS)
libpixman_arm_neon_la_LIBADD = $(DEP_LIBS)
diff --git a/pixman/pixman-arm-neon-asm-bilinear.S b/pixman/pixman-arm-neon-asm-bilinear.S
new file mode 100644
index 0000000..9a4a1ff
--- /dev/null
+++ b/pixman/pixman-arm-neon-asm-bilinear.S
@@ -0,0 +1,768 @@
+/*
+ * Copyright © 2011 SCore Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author: Siarhei Siamashka (siarhei.siamashka@nokia.com)
+ * Author: Taekyun Kim (tkq.kim@samsung.com)
+ */
+
+/*
+ * This file contains scaled bilinear scanline functions implemented
+ * using Siarhei's older bilinear macro template.
+ *
+ * << General scanline function procedures >>
+ * 1. bilinear interpolate source pixels
+ * 2. load mask pixels
+ * 3. load destination pixels
+ * 4. duplicate mask to fill whole register
+ * 5. interleave source & destination pixels
+ * 6. apply mask to source pixels
+ * 7. combine source & destination pixels
+ * 8. deinterleave final result
+ * 9. store destination pixels
+ *
+ * All registers with a single number (e.g. src0, tmp0) are 64-bit registers.
+ * Registers with double numbers (e.g. src01, dst01) are 128-bit registers.
+ * All temp registers can be used freely outside the code block.
+ * The symbols (register .req aliases) OUT and MASK are assumed to be defined by the caller of these macro blocks.
+ *
+ * TODOs
+ * Support 0565 pixel format
+ * Optimization for two and last pixel cases
+ *
+ * Remarks
+ * There can be lots of pipeline stalls inside code block and between code blocks.
+ * Further optimizations will be done by new macro templates using head/tail_head/tail scheme.
+ */
+
+/* Prevent the stack from becoming executable for no reason... */
+#if defined(__linux__) && defined (__ELF__)
+.section .note.GNU-stack,"",%progbits
+#endif
+
+.text /* emit everything that follows into the code section */
+.fpu neon /* allow NEON instructions to be assembled */
+.arch armv7a /* assemble for the ARMv7-A instruction set */
+.object_arch armv4 /* ...but record ARMv4 as the architecture in the object file attributes */
+.eabi_attribute 10, 0 /* Tag_FP_arch = 0; presumably so the object is not marked as requiring FP hardware */
+.eabi_attribute 12, 0 /* Tag_Advanced_SIMD_arch = 0; presumably so the object is not marked as requiring NEON */
+.arm /* ARM (not Thumb) instruction encoding */
+.altmacro /* enable alternate macro syntax for the .macro definitions in this file */
+
+#include "pixman-arm-neon-asm.h"
+
+/*
+ * Bilinear macros from pixman-arm-neon-asm.S
+ */
+
+/*
+ * Supplementary macro for setting function attributes.
+ *
+ * Emits the entry label for the function 'fname': opens a .func scope,
+ * exports the symbol with .global and, when __ELF__ is defined, additionally
+ * marks it hidden and gives it the %function symbol type.
+ *
+ * NOTE(review): the .func opened here presumably has to be closed with a
+ * matching .endfunc at the end of each function body - that part is not
+ * visible in this chunk, confirm at the call sites.
+ */
+.macro pixman_asm_function fname
+ .func fname
+ .global fname
+#ifdef __ELF__
+ .hidden fname /* keep the symbol out of the dynamic symbol table */
+ .type fname, %function /* mark the symbol as code rather than data */
+#endif
+fname:
+.endm
+
+/*
+ * Bilinear scaling support code which tries to provide pixel fetching, color
+ * format conversion, and interpolation as separate macros which can be used
+ * as the basic building blocks for constructing bilinear scanline functions.
+ */
+/*
+ * bilinear_load_8888 reg1, reg2, tmp
+ *
+ * Fetches source data for one output pixel from both scanlines of a
+ * 32 bits-per-pixel source and advances the horizontal position.
+ *
+ * reg1 - receives the data loaded from the TOP scanline
+ * reg2 - receives the data loaded from the BOTTOM scanline
+ * tmp - not referenced by this macro (kept so the argument list matches
+ * bilinear_load_0565)
+ *
+ * Uses the register aliases X, UX, TOP, BOTTOM, TMP1 and TMP2, which are
+ * not defined in this macro and must be set up by the caller.
+ * Clobbers TMP1 and TMP2; X is incremented by UX.
+ * NOTE(review): X is presumably a 16.16 fixed-point coordinate - confirm
+ * against the caller.
+ */
+.macro bilinear_load_8888 reg1, reg2, tmp
+ mov TMP2, X, asr #16 /* TMP2 = X >> 16, used as the pixel index */
+ add X, X, UX /* step X for the next pixel */
+ add TMP1, TOP, TMP2, asl #2 /* TMP1 = TOP + index * 4 bytes */
+ add TMP2, BOTTOM, TMP2, asl #2 /* TMP2 = BOTTOM + index * 4 bytes */
+ vld1.32 {reg1}, [TMP1] /* with a 64-bit reg1 this reads two adjacent pixels of the top row */
+ vld1.32 {reg2}, [TMP2] /* same for the bottom row */
+.endm
+/*
+ * bilinear_load_0565 reg1, reg2, tmp
+ *
+ * Counterpart of bilinear_load_8888 for a 16 bits-per-pixel source: loads
+ * 32 bits from the TOP scanline and 32 bits from the BOTTOM scanline into
+ * the two lanes of reg2, then hands the packed data to
+ * convert_four_0565_to_x888_packed, which writes reg1 and reg2.
+ *
+ * reg1, reg2 - outputs of the conversion
+ * tmp - scratch register passed on to the conversion macro
+ *
+ * Uses the caller-defined aliases X, UX, TOP, BOTTOM, TMP1 and TMP2.
+ * Clobbers TMP1 and TMP2; X is incremented by UX.
+ * NOTE(review): convert_four_0565_to_x888_packed is defined outside this
+ * chunk (presumably in pixman-arm-neon-asm.h); by its name it is expected to
+ * leave top-row pixels in reg1 and bottom-row pixels in reg2 in x888 format -
+ * confirm there.
+ */
+.macro bilinear_load_0565 reg1, reg2, tmp
+ mov TMP2, X, asr #16 /* TMP2 = X >> 16, used as the pixel index */
+ add X, X, UX /* step X for the next pixel */
+ add TMP1, TOP, TMP2, asl #1 /* TMP1 = TOP + index * 2 bytes */
+ add TMP2, BOTTOM, TMP2, asl #1 /* TMP2 = BOTTOM + index * 2 bytes */
+ vld1.32 {reg2[0]}, [TMP1] /* lane 0 <- 32 bits (two 16-bit pixels) of the top row */
+ vld1.32 {reg2[1]}, [TMP2] /* lane 1 <- 32 bits (two 16-bit pixels) of the bottom row */
+ convert_four_0565_to_x888_packed reg2, reg1, reg2, tmp
+.endm
+/*
+ * bilinear_load_and_vertical_interpolate_two_8888
+ *
+ * Runs bilinear_load_8888 twice (two consecutive X positions) and blends
+ * the top and bottom row data of each position into a widened accumulator:
+ * acc1 = reg1 * d28 + reg2 * d29
+ * acc2 = reg3 * d28 + reg4 * d29
+ * using unsigned 8-bit x 8-bit -> 16-bit multiplies.
+ *
+ * acc1, acc2 - 128-bit accumulators receiving the weighted sums
+ * reg1..reg4 - 64-bit registers receiving the raw top/bottom loads
+ * tmp1, tmp2 - forwarded as the 'tmp' argument of bilinear_load_8888,
+ * which does not reference it
+ *
+ * d28 multiplies the data loaded from TOP and d29 the data loaded from
+ * BOTTOM; both must be preloaded by the caller.
+ * NOTE(review): d28/d29 presumably hold the replicated vertical weights -
+ * confirm against the scanline function prologue.
+ */
+.macro bilinear_load_and_vertical_interpolate_two_8888 \
+ acc1, acc2, reg1, reg2, reg3, reg4, tmp1, tmp2
+
+ bilinear_load_8888 reg1, reg2, tmp1
+ vmull.u8 acc1, reg1, d28 /* acc1 = top * d28 */
+ vmlal.u8 acc1, reg2, d29 /* acc1 += bottom * d29 */
+ bilinear_load_8888 reg3, reg4, tmp2
+ vmull.u8 acc2, reg3, d28 /* acc2 = top * d28 */
+ vmlal.u8 acc2, reg4, d29 /* acc2 += bottom * d29 */
+.endm
+/*
+ * bilinear_load_and_vertical_interpolate_four_8888
+ *
+ * Four-position variant: expands bilinear_load_and_vertical_interpolate_two_8888
+ * twice, first with the x* argument set and then with the y* argument set,
+ * so four consecutive X positions are loaded and vertically blended.
+ *
+ * xacc2lo/xacc2hi and yacc2lo/yacc2hi land in the tmp1/tmp2 slots of the
+ * two-position macro, which only forwards them to bilinear_load_8888 where
+ * they are not referenced. They are presumably kept so the argument list
+ * is interchangeable with the 0565 variant.
+ */
+.macro bilinear_load_and_vertical_interpolate_four_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi
+ bilinear_load_and_vertical_interpolate_two_8888 \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+.endm
+/*
+ * bilinear_load_and_vertical_interpolate_two_0565
+ *
+ * 16 bits-per-pixel counterpart of the two_8888 macro: computes the source
+ * addresses for two consecutive X positions, loads 32 bits from the top and
+ * bottom scanline for each of them, converts the data with
+ * convert_0565_to_x888, re-interleaves it with vzip.u8 and finally blends
+ * top and bottom rows:
+ * acc1 = reg1 * d28 + reg2 * d29
+ * acc2 = reg3 * d28 + reg4 * d29
+ *
+ * acc1, acc2 - 128-bit accumulators; acc2 is also the load target
+ * reg1..reg4 - 64-bit work registers
+ * acc2lo, acc2hi - the registers the raw 0565 data is loaded into
+ * (presumably the low and high halves of acc2 - confirm
+ * at the call sites)
+ *
+ * Uses the caller-defined aliases X, UX, TOP, BOTTOM and TMP1..TMP4.
+ * Clobbers TMP1..TMP4; X is incremented by 2 * UX.
+ * NOTE(review): convert_0565_to_x888 is defined outside this chunk; the
+ * exact lane layout it produces, and hence what each vzip step achieves,
+ * cannot be verified from here.
+ * NOTE(review): reg4 is not written inside this macro before it is used by
+ * vzip.u8; presumably it only feeds the unused 'x' byte - confirm.
+ */
+.macro bilinear_load_and_vertical_interpolate_two_0565 \
+ acc1, acc2, reg1, reg2, reg3, reg4, acc2lo, acc2hi
+
+ mov TMP2, X, asr #16 /* index of the first position */
+ add X, X, UX
+ mov TMP4, X, asr #16 /* index of the second position */
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1 /* first position, top row (2 bytes per pixel) */
+ add TMP2, BOTTOM, TMP2, asl #1 /* first position, bottom row */
+ add TMP3, TOP, TMP4, asl #1 /* second position, top row */
+ add TMP4, BOTTOM, TMP4, asl #1 /* second position, bottom row */
+ vld1.32 {acc2lo[0]}, [TMP1] /* each load reads 32 bits = two adjacent 16-bit pixels */
+ vld1.32 {acc2hi[0]}, [TMP3]
+ vld1.32 {acc2lo[1]}, [TMP2]
+ vld1.32 {acc2hi[1]}, [TMP4]
+ convert_0565_to_x888 acc2, reg3, reg2, reg1
+ vzip.u8 reg1, reg3 /* byte-interleave the converted data back into per-pixel order */
+ vzip.u8 reg2, reg4
+ vzip.u8 reg3, reg4
+ vzip.u8 reg1, reg2
+ vmull.u8 acc1, reg1, d28 /* acc1 = reg1 * d28 */
+ vmlal.u8 acc1, reg2, d29 /* acc1 += reg2 * d29 */
+ vmull.u8 acc2, reg3, d28 /* acc2 = reg3 * d28 (raw data in acc2 is no longer needed) */
+ vmlal.u8 acc2, reg4, d29 /* acc2 += reg4 * d29 */
+.endm
+
+.macro bilinear_load_and_vertical_interpolate_four_0565 \
+ xacc1, xacc2, xreg1, xreg2, xreg3, xreg4, xacc2lo, xacc2hi \
+ yacc1, yacc2, yreg1, yreg2, yreg3, yreg4, yacc2lo, yacc2hi
+
+ mov TMP2, X, asr #16
+ add X, X, UX
+ mov TMP4, X, asr #16
+ add X, X, UX
+ add TMP1, TOP, TMP2, asl #1
+ add TMP2, BOTTOM, TMP2, asl #1
+ add TMP3, TOP, TMP4, asl #1
+ add TMP4, BOTTOM, TMP4, asl #1
+ vld1.32 {xacc2lo[0]}, [TMP1]
Reply to: