pixman: Changes to 'upstream-experimental'
.gitignore | 2
Makefile.am | 4
configure.ac | 102 +
demos/Makefile.am | 13
demos/checkerboard.c | 71 +
demos/composite-test.c | 55 -
demos/gtk-utils.c | 46
demos/parrot.c | 1079 ++++++++++++++++++++
demos/parrot.jpg |binary
demos/quad2quad.c | 2183 +++++++++++++++++++++++++++++++++++++++++
pixman/Makefile.am | 12
pixman/loongson-mmintrin.h | 273 +++++
pixman/pixman-bits-image.c | 10
pixman/pixman-compiler.h | 4
pixman/pixman-cpu.c | 39
pixman/pixman-fast-path.c | 6
pixman/pixman-mips-dspr2-asm.S | 443 ++++++++
pixman/pixman-mips-dspr2-asm.h | 363 ++++++
pixman/pixman-mips-dspr2.c | 22
pixman/pixman-mips-dspr2.h | 42
pixman/pixman-mmx.c | 665 +++++++++---
pixman/pixman-private.h | 9
test/utils.c | 35
test/utils.h | 9
24 files changed, 5211 insertions(+), 276 deletions(-)
New commits:
commit 1e1a00e964a1d8ef43d6d75c1c3a0b5d518d1979
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue May 15 13:20:09 2012 -0400
Pre-release version bump to 0.25.6
Note that 0.25.4 was a botched release that doesn't have a tag and
doesn't correspond to any commit ID. It was however uploaded and
announced, so I'll just use the 0.25.6 version number.
diff --git a/configure.ac b/configure.ac
index d949839..502815e 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
m4_define([pixman_major], 0)
m4_define([pixman_minor], 25)
-m4_define([pixman_micro], 3)
+m4_define([pixman_micro], 6)
m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
commit b2c16aaadfae64d2573abb537bfedd92c13b8d06
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue May 15 13:19:19 2012 -0400
demos/Makefile.am: Add parrot.c to EXTRA_DIST
To get 'make distcheck' to pass.
diff --git a/demos/Makefile.am b/demos/Makefile.am
index a664d93..8f734cf 100644
--- a/demos/Makefile.am
+++ b/demos/Makefile.am
@@ -22,6 +22,8 @@ DEMOS = \
quad2quad \
checkerboard
+EXTRA_DIST = parrot.c
+
gradient_test_SOURCES = gradient-test.c $(GTK_UTILS)
alpha_test_SOURCES = alpha-test.c $(GTK_UTILS)
composite_test_SOURCES = composite-test.c $(GTK_UTILS)
commit 50d3088d7882e1054a35e917becb7752662da6f0
Author: Matt Turner <mattst88@gmail.com>
Date: Fri May 11 21:59:13 2012 -0400
configure.ac: Rename loongson -> loongson-mmi
Make it match with the other fast paths, and the PIXMAN_DISABLE value is
already loongson-mmi.
diff --git a/configure.ac b/configure.ac
index 57fd060..d949839 100644
--- a/configure.ac
+++ b/configure.ac
@@ -278,7 +278,7 @@ if test "x$LS_CFLAGS" = "x" ; then
fi
have_loongson_mmi=no
-AC_MSG_CHECKING(whether to use Loongson MMI)
+AC_MSG_CHECKING(whether to use Loongson MMI assembler)
xserver_save_CFLAGS=$CFLAGS
CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir"
@@ -301,12 +301,12 @@ int main () {
}]])], have_loongson_mmi=yes)
CFLAGS=$xserver_save_CFLAGS
-AC_ARG_ENABLE(loongson,
- [AC_HELP_STRING([--disable-loongson],
- [disable Loongson fast paths])],
- [enable_loongson=$enableval], [enable_loongson=auto])
+AC_ARG_ENABLE(loongson-mmi,
+ [AC_HELP_STRING([--disable-loongson-mmi],
+ [disable Loongson MMI fast paths])],
+ [enable_loongson_mmi=$enableval], [enable_loongson_mmi=auto])
-if test $enable_loongson = no ; then
+if test $enable_loongson_mmi = no ; then
have_loongson_mmi=disabled
fi
@@ -317,7 +317,7 @@ else
fi
AC_MSG_RESULT($have_loongson_mmi)
-if test $enable_loongson = yes && test $have_loongson_mmi = no ; then
+if test $enable_loongson_mmi = yes && test $have_loongson_mmi = no ; then
AC_MSG_ERROR([Loongson MMI not detected])
fi
commit a0a40cb822bec52494c64e6750be50b734dc29df
Author: Matt Turner <mattst88@gmail.com>
Date: Fri May 11 21:49:42 2012 -0400
configure.ac: Fix loongson-mmi out-of-tree builds
When building out-of-tree, gcc wasn't able to find loongson-mmintrin.h
to compile the test program. Add -I$srcdir to CFLAGS to point gcc to it.
diff --git a/configure.ac b/configure.ac
index 345bc33..57fd060 100644
--- a/configure.ac
+++ b/configure.ac
@@ -281,7 +281,7 @@ have_loongson_mmi=no
AC_MSG_CHECKING(whether to use Loongson MMI)
xserver_save_CFLAGS=$CFLAGS
-CFLAGS=" $LS_CFLAGS $CFLAGS"
+CFLAGS=" $LS_CFLAGS $CFLAGS -I$srcdir"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#ifndef __mips_loongson_vector_rev
#error "Loongson Multimedia Instructions are only available on Loongson"
commit 618a08e6aa03b38e8dc71ac610f7fdd55e8a8558
Author: Nemanja Lukic <nemanja.lukic@rt-rk.com>
Date: Thu May 3 00:03:42 2012 +0200
MIPS: DSPr2: Added over_n_8_8888 and over_n_8_0565 fast paths.
Performance numbers before/after on MIPS-74kc @ 1GHz
Reference (before):
lowlevel-blt-bench:
over_n_8_8888 = L1: 10.40 L2: 9.79 M: 8.47 ( 33.62%) HT: 7.64 VT: 7.59 R: 7.48 RT: 5.30 ( 40Kops/s)
over_n_8_0565 = L1: 7.40 L2: 7.23 M: 6.78 ( 17.94%) HT: 6.23 VT: 6.17 R: 6.14 RT: 4.62 ( 37Kops/s)
Optimized:
lowlevel-blt-bench:
over_n_8_8888 = L1: 27.25 L2: 26.24 M: 18.15 ( 72.12%) HT: 14.52 VT: 14.31 R: 13.83 RT: 7.57 ( 48Kops/s)
over_n_8_0565 = L1: 18.91 L2: 17.59 M: 15.06 ( 39.90%) HT: 12.18 VT: 11.98 R: 11.83 RT: 6.80 ( 46Kops/s)
diff --git a/pixman/pixman-mips-dspr2-asm.S b/pixman/pixman-mips-dspr2-asm.S
index 6a0fc18..68ad33f 100644
--- a/pixman/pixman-mips-dspr2-asm.S
+++ b/pixman/pixman-mips-dspr2-asm.S
@@ -527,3 +527,227 @@ LEAF_MIPS_DSPR2(pixman_composite_over_n_8888_0565_ca_asm_mips)
nop
END(pixman_composite_over_n_8888_0565_ca_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_8888_asm_mips)
+/*
+ * a0 - dst (a8r8g8b8)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+
+ SAVE_REGS_ON_STACK 4, s0, s1, s2, s3, s4
+ beqz a3, 4f
+ nop
+ li t4, 0x00ff00ff
+ li t5, 0xff
+ addiu t0, a3, -1
+ beqz t0, 3f /* last pixel */
+ srl t6, a1, 24 /* t6 = srca */
+ not s4, a1
+ beq t5, t6, 2f /* if (srca == 0xff) */
+ srl s4, s4, 24
+1:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 111f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ and t3, t0, t1
+
+ lw t2, 0(a0) /* t2 = dst */
+ beq t3, t5, 11f /* if (t0 == 0xff) && (t1 == 0xff) */
+ lw t3, 4(a0) /* t3 = dst */
+
+ MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s0, s1, t4, t6, t7, t8, t9, s2, s3
+ not s2, s0
+ not s3, s1
+ srl s2, s2, 24
+ srl s3, s3, 24
+ MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s2, s3, t2, t3, t4, t0, t1, t6, t7, t8, t9
+ addu_s.qb s2, t2, s0
+ addu_s.qb s3, t3, s1
+ sw s2, 0(a0)
+ b 111f
+ sw s3, 4(a0)
+11:
+ MIPS_2xUN8x4_MUL_2xUN8 t2, t3, s4, s4, t2, t3, t4, t0, t1, t6, t7, t8, t9
+ addu_s.qb s2, t2, a1
+ addu_s.qb s3, t3, a1
+ sw s2, 0(a0)
+ sw s3, 4(a0)
+
+111:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 1b
+ addiu a0, a0, 8
+ b 3f
+ nop
+2:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 222f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ and t3, t0, t1
+ beq t3, t5, 22f /* if (t0 == 0xff) && (t1 == 0xff) */
+ nop
+ lw t2, 0(a0) /* t2 = dst */
+ lw t3, 4(a0) /* t3 = dst */
+
+ OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, t2, t3, \
+ t6, t7, t4, t8, t9, s0, s1, s2, s3
+ sw t6, 0(a0)
+ b 222f
+ sw t7, 4(a0)
+22:
+ sw a1, 0(a0)
+ sw a1, 4(a0)
+222:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 2b
+ addiu a0, a0, 8
+3:
+ blez a3, 4f
+ nop
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ beqz t0, 4f /* if (t0 == 0) */
+ addiu a2, a2, 1
+ move t3, a1
+ beq t0, t5, 31f /* if (t0 == 0xff) */
+ lw t1, 0(a0) /* t1 = dst */
+
+ MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t6, t7, t8
+31:
+ not t2, t3
+ srl t2, t2, 24
+ MIPS_UN8x4_MUL_UN8 t1, t2, t1, t4, t6, t7, t8
+ addu_s.qb t2, t1, t3
+ sw t2, 0(a0)
+4:
+ RESTORE_REGS_FROM_STACK 4, s0, s1, s2, s3, s4
+ j ra
+ nop
+
+END(pixman_composite_over_n_8_8888_asm_mips)
+
+LEAF_MIPS_DSPR2(pixman_composite_over_n_8_0565_asm_mips)
+/*
+ * a0 - dst (r5g6b5)
+ * a1 - src (32bit constant)
+ * a2 - mask (a8)
+ * a3 - w
+ */
+ SAVE_REGS_ON_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ beqz a3, 4f
+ nop
+ li t4, 0x00ff00ff
+ li t5, 0xff
+ li t6, 0xf800f800
+ li t7, 0x07e007e0
+ li t8, 0x001F001F
+ addiu t1, a3, -1
+ beqz t1, 3f /* last pixel */
+ srl t0, a1, 24 /* t0 = srca */
+ not v0, a1
+ beq t0, t5, 2f /* if (srca == 0xff) */
+ srl v0, v0, 24
+1:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 111f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ lhu t2, 0(a0) /* t2 = dst */
+ lhu t3, 2(a0) /* t3 = dst */
+ CONVERT_2x0565_TO_2x8888 t2, t3, s0, s1, t7, t8, t9, s2, s3, s4
+ and t9, t0, t1
+ beq t9, t5, 11f /* if (t0 == 0xff) && (t1 == 0xff) */
+ nop
+
+ MIPS_2xUN8x4_MUL_2xUN8 a1, a1, t0, t1, s2, s3, t4, t9, s4, s5, s6, s7, s8
+ not s4, s2
+ not s5, s3
+ srl s4, s4, 24
+ srl s5, s5, 24
+ MIPS_2xUN8x4_MUL_2xUN8 s0, s1, s4, s5, s0, s1, t4, t9, t0, t1, s6, s7, s8
+ addu_s.qb s4, s2, s0
+ addu_s.qb s5, s3, s1
+ CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+ sh t2, 0(a0)
+ b 111f
+ sh t3, 2(a0)
+11:
+ MIPS_2xUN8x4_MUL_2xUN8 s0, s1, v0, v0, s0, s1, t4, t9, t0, t1, s6, s7, s8
+ addu_s.qb s4, a1, s0
+ addu_s.qb s5, a1, s1
+ CONVERT_2x8888_TO_2x0565 s4, s5, t2, t3, t6, t7, t8, s0, s1
+ sh t2, 0(a0)
+ sh t3, 2(a0)
+111:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 1b
+ addiu a0, a0, 4
+ b 3f
+ nop
+2:
+ CONVERT_1x8888_TO_1x0565 a1, s0, s1, s2
+21:
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ lbu t1, 1(a2) /* t1 = mask */
+ or t2, t0, t1
+ beqz t2, 222f /* if (t0 == 0) && (t1 == 0) */
+ addiu a2, a2, 2
+ and t9, t0, t1
+ move s2, s0
+ beq t9, t5, 22f /* if (t0 == 0xff) && (t2 == 0xff) */
+ move s3, s0
+ lhu t2, 0(a0) /* t2 = dst */
+ lhu t3, 2(a0) /* t3 = dst */
+
+ CONVERT_2x0565_TO_2x8888 t2, t3, s2, s3, t7, t8, s4, s5, s6, s7
+ OVER_2x8888_2x8_2x8888 a1, a1, t0, t1, s2, s3, \
+ t2, t3, t4, t9, s4, s5, s6, s7, s8
+ CONVERT_2x8888_TO_2x0565 t2, t3, s2, s3, t6, t7, t8, s4, s5
+22:
+ sh s2, 0(a0)
+ sh s3, 2(a0)
+222:
+ addiu a3, a3, -2
+ addiu t0, a3, -1
+ bgtz t0, 21b
+ addiu a0, a0, 4
+3:
+ blez a3, 4f
+ nop
+ /* a1 = src */
+ lbu t0, 0(a2) /* t0 = mask */
+ beqz t0, 4f /* if (t0 == 0) */
+ nop
+ lhu t1, 0(a0) /* t1 = dst */
+ CONVERT_1x0565_TO_1x8888 t1, t2, t3, t7
+ beq t0, t5, 31f /* if (t0 == 0xff) */
+ move t3, a1
+
+ MIPS_UN8x4_MUL_UN8 a1, t0, t3, t4, t7, t8, t9
+31:
+ not t6, t3
+ srl t6, t6, 24
+ MIPS_UN8x4_MUL_UN8 t2, t6, t2, t4, t7, t8, t9
+ addu_s.qb t1, t2, t3
+ CONVERT_1x8888_TO_1x0565 t1, t2, t3, t7
+ sh t2, 0(a0)
+4:
+ RESTORE_REGS_FROM_STACK 24, v0, s0, s1, s2, s3, s4, s5, s6, s7, s8
+ j ra
+ nop
+
+END(pixman_composite_over_n_8_0565_asm_mips)
diff --git a/pixman/pixman-mips-dspr2-asm.h b/pixman/pixman-mips-dspr2-asm.h
index 12ff42c..8383060 100644
--- a/pixman/pixman-mips-dspr2-asm.h
+++ b/pixman/pixman-mips-dspr2-asm.h
@@ -499,4 +499,71 @@ LEAF_MIPS32R2(symbol) \
precr.qb.ph \d2_8888, \scratch5, \scratch6
.endm
+/*
+ * OVER operation on single a8r8g8b8 source pixel (s_8888) and single a8r8g8b8
+ * destination pixel (d_8888) using a8 mask (m_8). It also requires maskLSR
+ * needed for rounding process. maskLSR must have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+.macro OVER_8888_8_8888 s_8888, \
+ m_8, \
+ d_8888, \
+ out_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, scratch4
+ MIPS_UN8x4_MUL_UN8 \s_8888, \m_8, \
+ \scratch1, \maskLSR, \
+ \scratch2, \scratch3, \scratch4
+
+ not \scratch2, \scratch1
+ srl \scratch2, \scratch2, 24
+
+ MIPS_UN8x4_MUL_UN8 \d_8888, \scratch2, \
+ \d_8888, \maskLSR, \
+ \scratch3, \scratch4, \out_8888
+
+ addu_s.qb \out_8888, \d_8888, \scratch1
+.endm
+
+/*
+ * OVER operation on two a8r8g8b8 source pixels (s1_8888 and s2_8888) and two
+ * a8r8g8b8 destination pixels (d1_8888 and d2_8888) using a8 masks (m1_8 and
+ * m2_8). It also requires maskLSR needed for rounding process. maskLSR must
+ * have following value:
+ * li maskLSR, 0x00ff00ff
+ */
+.macro OVER_2x8888_2x8_2x8888 s1_8888, \
+ s2_8888, \
+ m1_8, \
+ m2_8, \
+ d1_8888, \
+ d2_8888, \
+ out1_8888, \
+ out2_8888, \
+ maskLSR, \
+ scratch1, scratch2, scratch3, \
+ scratch4, scratch5, scratch6
+ MIPS_2xUN8x4_MUL_2xUN8 \s1_8888, \s2_8888, \
+ \m1_8, \m2_8, \
+ \scratch1, \scratch2, \
+ \maskLSR, \
+ \scratch3, \scratch4, \out1_8888, \
+ \out2_8888, \scratch5, \scratch6
+
+ not \scratch3, \scratch1
+ srl \scratch3, \scratch3, 24
+ not \scratch4, \scratch2
+ srl \scratch4, \scratch4, 24
+
+ MIPS_2xUN8x4_MUL_2xUN8 \d1_8888, \d2_8888, \
+ \scratch3, \scratch4, \
+ \d1_8888, \d2_8888, \
+ \maskLSR, \
+ \scratch5, \scratch6, \out1_8888, \
+ \out2_8888, \scratch3, \scratch4
+
+ addu_s.qb \out1_8888, \d1_8888, \scratch1
+ addu_s.qb \out2_8888, \d2_8888, \scratch2
+.endm
+
#endif //PIXMAN_MIPS_DSPR2_ASM_H
diff --git a/pixman/pixman-mips-dspr2.c b/pixman/pixman-mips-dspr2.c
index 018770a..7081734 100644
--- a/pixman/pixman-mips-dspr2.c
+++ b/pixman/pixman-mips-dspr2.c
@@ -53,6 +53,10 @@ PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_8888_ca,
uint32_t, 1, uint32_t, 1)
PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8888_0565_ca,
uint32_t, 1, uint16_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_8888,
+ uint8_t, 1, uint32_t, 1)
+PIXMAN_MIPS_BIND_FAST_PATH_N_MASK_DST (SKIP_ZERO_SRC, over_n_8_0565,
+ uint8_t, 1, uint16_t, 1)
static pixman_bool_t
pixman_fill_mips (uint32_t *bits,
@@ -195,6 +199,12 @@ static const pixman_fast_path_t mips_dspr2_fast_paths[] =
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mips_composite_over_n_8888_8888_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mips_composite_over_n_8888_0565_ca),
PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mips_composite_over_n_8888_0565_ca),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mips_composite_over_n_8_8888),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mips_composite_over_n_8_0565),
+ PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mips_composite_over_n_8_0565),
{ PIXMAN_OP_NONE },
};
commit 7d4beedc612a32b73d7673bbf6447de0f3fca298
Author: Matt Turner <mattst88@gmail.com>
Date: Wed May 9 19:20:55 2012 -0400
mmx: add and use pack_4x565 function
The pack_4x565 function makes use of pack_4xpacked565, which uses pmadd.
Some of the speed up is probably attributable to removing the artificial
serialization imposed by the
vdest = pack_565 (..., vdest, 0);
vdest = pack_565 (..., vdest, 1);
...
pattern.
Loongson:
over_n_0565 = L1: 16.44 L2: 16.42 M: 13.83 ( 9.85%) HT: 12.83 VT: 12.61 R: 12.34 RT: 8.90 ( 93Kops/s)
over_n_0565 = L1: 42.48 L2: 42.53 M: 29.83 ( 21.20%) HT: 23.39 VT: 23.72 R: 21.80 RT: 11.60 ( 113Kops/s)
over_8888_0565 = L1: 15.61 L2: 15.42 M: 12.11 ( 25.79%) HT: 11.07 VT: 10.70 R: 10.37 RT: 7.25 ( 82Kops/s)
over_8888_0565 = L1: 35.01 L2: 35.20 M: 21.42 ( 45.57%) HT: 18.12 VT: 17.61 R: 16.09 RT: 9.01 ( 97Kops/s)
over_n_8_0565 = L1: 15.17 L2: 14.94 M: 12.57 ( 17.86%) HT: 11.96 VT: 11.52 R: 10.79 RT: 7.31 ( 79Kops/s)
over_n_8_0565 = L1: 29.83 L2: 29.79 M: 21.85 ( 30.94%) HT: 18.82 VT: 18.25 R: 16.15 RT: 8.72 ( 91Kops/s)
over_n_8888_0565_ca = L1: 15.25 L2: 15.02 M: 11.64 ( 41.39%) HT: 11.08 VT: 10.72 R: 10.02 RT: 7.00 ( 77Kops/s)
over_n_8888_0565_ca = L1: 30.12 L2: 29.99 M: 19.47 ( 68.99%) HT: 17.05 VT: 16.55 R: 14.67 RT: 8.38 ( 88Kops/s)
ARM/iwMMXt:
over_n_0565 = L1: 19.29 L2: 19.88 M: 17.38 ( 10.54%) HT: 15.53 VT: 16.11 R: 13.69 RT: 11.00 ( 96Kops/s)
over_n_0565 = L1: 36.02 L2: 34.85 M: 28.04 ( 16.97%) HT: 22.12 VT: 24.21 R: 22.36 RT: 12.22 ( 103Kops/s)
over_8888_0565 = L1: 18.38 L2: 16.59 M: 12.34 ( 22.29%) HT: 11.67 VT: 11.71 R: 11.02 RT: 6.89 ( 72Kops/s)
over_8888_0565 = L1: 24.96 L2: 22.17 M: 15.11 ( 26.81%) HT: 14.14 VT: 13.71 R: 13.18 RT: 8.13 ( 78Kops/s)
over_n_8_0565 = L1: 14.65 L2: 12.44 M: 11.56 ( 14.50%) HT: 10.93 VT: 10.39 R: 10.06 RT: 7.05 ( 70Kops/s)
over_n_8_0565 = L1: 18.37 L2: 14.98 M: 13.97 ( 16.51%) HT: 12.67 VT: 10.35 R: 11.80 RT: 8.14 ( 74Kops/s)
over_n_8888_0565_ca = L1: 14.27 L2: 12.93 M: 10.52 ( 33.23%) HT: 9.70 VT: 9.90 R: 9.31 RT: 6.34 ( 65Kops/s)
over_n_8888_0565_ca = L1: 19.69 L2: 17.58 M: 13.40 ( 42.35%) HT: 11.75 VT: 11.33 R: 11.17 RT: 7.49 ( 73Kops/s)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index b14201a..01a2bc9 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -598,6 +598,12 @@ pack_4xpacked565 (__m64 a, __m64 b)
#endif
}
+static force_inline __m64
+pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
+{
+ return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
+}
+
#ifndef _MSC_VER
static force_inline __m64
@@ -1396,16 +1402,14 @@ mmx_composite_over_n_0565 (pixman_implementation_t *imp,
while (w >= 4)
{
- __m64 vdest;
+ __m64 vdest = *(__m64 *)dst;
- vdest = *(__m64 *)dst;
-
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = over (vsrc, vsrca, expand565 (vdest, 0));
+ __m64 v1 = over (vsrc, vsrca, expand565 (vdest, 1));
+ __m64 v2 = over (vsrc, vsrca, expand565 (vdest, 2));
+ __m64 v3 = over (vsrc, vsrca, expand565 (vdest, 3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
dst += 4;
w -= 4;
@@ -1818,22 +1822,19 @@ mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
while (w >= 4)
{
- __m64 vsrc0, vsrc1, vsrc2, vsrc3;
- __m64 vdest;
+ __m64 vdest = *(__m64 *)dst;
- vsrc0 = load8888 ((src + 0));
- vsrc1 = load8888 ((src + 1));
- vsrc2 = load8888 ((src + 2));
- vsrc3 = load8888 ((src + 3));
+ __m64 vsrc0 = load8888 ((src + 0));
+ __m64 vsrc1 = load8888 ((src + 1));
+ __m64 vsrc2 = load8888 ((src + 2));
+ __m64 vsrc3 = load8888 ((src + 3));
- vdest = *(__m64 *)dst;
-
- vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0));
+ __m64 v1 = over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1));
+ __m64 v2 = over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2));
+ __m64 v3 = over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
w -= 4;
dst += 4;
@@ -2368,25 +2369,22 @@ mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
}
else if (m0 | m1 | m2 | m3)
{
- __m64 vdest;
- __m64 vm0, vm1, vm2, vm3;
-
- vdest = *(__m64 *)dst;
+ __m64 vdest = *(__m64 *)dst;
- vm0 = to_m64 (m0);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
- expand565 (vdest, 0)), vdest, 0);
- vm1 = to_m64 (m1);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
- expand565 (vdest, 1)), vdest, 1);
- vm2 = to_m64 (m2);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
- expand565 (vdest, 2)), vdest, 2);
- vm3 = to_m64 (m3);
- vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
- expand565 (vdest, 3)), vdest, 3);
-
- *(__m64 *)dst = vdest;
+ __m64 vm0 = to_m64 (m0);
+ __m64 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0),
+ expand565 (vdest, 0));
+ __m64 vm1 = to_m64 (m1);
+ __m64 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1),
+ expand565 (vdest, 1));
+ __m64 vm2 = to_m64 (m2);
+ __m64 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2),
+ expand565 (vdest, 2));
+ __m64 vm3 = to_m64 (m3);
+ __m64 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3),
+ expand565 (vdest, 3));
+
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);;
}
w -= 4;
@@ -2483,24 +2481,23 @@ mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
if ((a0 & a1 & a2 & a3) == 0xFF)
{
- __m64 vdest;
- vdest = pack_565 (invert_colors (load8888 (&s0)), _mm_setzero_si64 (), 0);
- vdest = pack_565 (invert_colors (load8888 (&s1)), vdest, 1);
- vdest = pack_565 (invert_colors (load8888 (&s2)), vdest, 2);
- vdest = pack_565 (invert_colors (load8888 (&s3)), vdest, 3);
+ __m64 v0 = invert_colors (load8888 (&s0));
+ __m64 v1 = invert_colors (load8888 (&s1));
+ __m64 v2 = invert_colors (load8888 (&s2));
+ __m64 v3 = invert_colors (load8888 (&s3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
}
else if (s0 | s1 | s2 | s3)
{
__m64 vdest = *(__m64 *)dst;
- vdest = pack_565 (over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = over_rev_non_pre (load8888 (&s0), expand565 (vdest, 0));
+ __m64 v1 = over_rev_non_pre (load8888 (&s1), expand565 (vdest, 1));
+ __m64 v2 = over_rev_non_pre (load8888 (&s2), expand565 (vdest, 2));
+ __m64 v3 = over_rev_non_pre (load8888 (&s3), expand565 (vdest, 3));
- *(__m64 *)dst = vdest;
+ *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
}
w -= 4;
@@ -2675,12 +2672,12 @@ mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
{
__m64 vdest = *(__m64 *)q;
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0)), vdest, 0);
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1)), vdest, 1);
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2)), vdest, 2);
- vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3)), vdest, 3);
+ __m64 v0 = in_over (vsrc, vsrca, load8888 (&m0), expand565 (vdest, 0));
+ __m64 v1 = in_over (vsrc, vsrca, load8888 (&m1), expand565 (vdest, 1));
+ __m64 v2 = in_over (vsrc, vsrca, load8888 (&m2), expand565 (vdest, 2));
+ __m64 v3 = in_over (vsrc, vsrca, load8888 (&m3), expand565 (vdest, 3));
- *(__m64 *)q = vdest;
+ *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
}
twidth -= 4;
p += 4;
commit 2beabd9fed76de0023eb36b0c938b8803aa8d129
Author: Matt Turner <mattst88@gmail.com>
Date: Thu May 10 16:15:34 2012 -0400
configure.ac: make -march=loongson2f come before CFLAGS
Otherwise we'd have -march=loongson2f being overridden by automake's
CFLAGS ordering which causes build failures when -march=<not loongson2f>
is specified by the user.
diff --git a/configure.ac b/configure.ac
index 5478734..345bc33 100644
--- a/configure.ac
+++ b/configure.ac
@@ -281,7 +281,7 @@ have_loongson_mmi=no
AC_MSG_CHECKING(whether to use Loongson MMI)
xserver_save_CFLAGS=$CFLAGS
-CFLAGS=" $CFLAGS $LS_CFLAGS"
+CFLAGS=" $LS_CFLAGS $CFLAGS"
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#ifndef __mips_loongson_vector_rev
#error "Loongson Multimedia Instructions are only available on Loongson"
commit dadb9a318b8ca10c65e31e7278f4335a6968d246
Author: Søren Sandmann Pedersen <ssp@redhat.com>
Date: Tue May 8 10:05:18 2012 -0400
Add Makefile.win32 and Makefile.win32.common to EXTRA_DIST
https://bugs.freedesktop.org/show_bug.cgi?id=46905
diff --git a/Makefile.am b/Makefile.am
index df8677a..88ff897 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -21,6 +21,10 @@ RELEASE_XORG_HOST = $(USERNAME)@xorg.freedesktop.org
RELEASE_XORG_DIR = /srv/xorg.freedesktop.org/archive/individual/lib
RELEASE_ANNOUNCE_LIST = cairo-announce@cairographics.org, xorg-announce@lists.freedesktop.org, pixman@lists.freedesktop.org
+EXTRA_DIST = \
+ Makefile.win32 \
+ Makefile.win32.common
+
tar_gz = $(PACKAGE)-$(VERSION).tar.gz
tar_bz2 = $(PACKAGE)-$(VERSION).tar.bz2
commit 3c57ec471e1aacc863747b82bbe0a84c6d776ab7
Author: Matt Turner <mattst88@gmail.com>
Date: Wed May 9 22:50:50 2012 -0400
.gitignore: add demos/checkerboard and demos/quad2quad
diff --git a/.gitignore b/.gitignore
index 60b5bb4..98612c9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,11 +27,13 @@ config.h
config.h.in
.*.swp
demos/alpha-test
+demos/checkerboard
demos/clip-in
demos/clip-test
demos/composite-test
demos/convolution-test
demos/gradient-test
+demos/quad2quad
demos/radial-test
demos/screen-test
demos/trap-test
commit 2d431b53d3cdbf1997e2d3b8e17408c12220c3a1
Author: Matt Turner <mattst88@gmail.com>
Date: Fri Apr 27 14:12:56 2012 -0400
mmx: Use wpackhus in src_x888_0565 on iwMMXt
iwMMXt has an unsigned saturation pack instruction, while MMX/EXT
and Loongson don't.
ARM/iwMMXt:
src_8888_0565 = L1: 110.38 L2: 82.33 M: 40.92 ( 73.22%) HT: 35.63 VT: 32.22 R: 30.07 RT: 18.40 ( 132Kops/s)
src_8888_0565 = L1: 117.91 L2: 83.05 M: 41.52 ( 75.58%) HT: 37.63 VT: 35.40 R: 29.37 RT: 19.39 ( 134Kops/s)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 7fe19d5..b14201a 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -589,9 +589,13 @@ pack_4xpacked565 (__m64 a, __m64 b)
t1 = _mm_or_si64 (t1, g1);
t0 = shift(t0, -5);
+#ifdef USE_ARM_IWMMXT
+ t1 = shift(t1, -5);
+ return _mm_packs_pu32 (t0, t1);
+#else
t1 = shift(t1, -5 + 16);
-
return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+#endif
}
#ifndef _MSC_VER
commit 2ddd1c498b723e8e48a38eef01d5befba30b5259
Author: Matt Turner <mattst88@gmail.com>
Date: Thu Apr 19 17:33:27 2012 -0400
mmx: add src_8888_0565
Uses the pmadd technique described in
http://software.intel.com/sites/landingpage/legacy/mmx/MMX_App_24-16_Bit_Conversion.pdf
The technique uses the packssdw instruction which uses signed
saturation. This works in their example because they pack 888 to 555
leaving the high bit as zero. For packing to 565, it is unsuitable, so
we replace it with an or+shuffle.
Loongson:
src_8888_0565 = L1: 106.13 L2: 83.57 M: 33.46 ( 68.90%) HT: 30.29 VT: 27.67 R: 26.11 RT: 15.06 ( 135Kops/s)
src_8888_0565 = L1: 122.10 L2: 117.53 M: 37.97 ( 78.58%) HT: 33.14 VT: 30.09 R: 29.01 RT: 15.76 ( 139Kops/s)
ARM/iwMMXt:
src_8888_0565 = L1: 67.88 L2: 56.61 M: 31.20 ( 56.74%) HT: 29.22 VT: 27.01 R: 25.39 RT: 19.29 ( 130Kops/s)
src_8888_0565 = L1: 110.38 L2: 82.33 M: 40.92 ( 73.22%) HT: 35.63 VT: 32.22 R: 30.07 RT: 18.40 ( 132Kops/s)
diff --git a/pixman/loongson-mmintrin.h b/pixman/loongson-mmintrin.h
index 76ae892..8295ba0 100644
--- a/pixman/loongson-mmintrin.h
+++ b/pixman/loongson-mmintrin.h
@@ -84,6 +84,17 @@ _mm_empty (void)
}
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_madd_pi16 (__m64 __m1, __m64 __m2)
+{
+ __m64 ret;
+ asm("pmaddhw %0, %1, %2\n\t"
+ : "=f" (ret)
+ : "f" (__m1), "f" (__m2)
+ );
+ return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
__m64 ret;
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 320e20a..7fe19d5 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -179,9 +179,12 @@ typedef struct
mmxdatafield mmx_4x0080;
mmxdatafield mmx_565_rgb;
mmxdatafield mmx_565_unpack_multiplier;
+ mmxdatafield mmx_565_pack_multiplier;
mmxdatafield mmx_565_r;
mmxdatafield mmx_565_g;
mmxdatafield mmx_565_b;
+ mmxdatafield mmx_packed_565_rb;
+ mmxdatafield mmx_packed_565_g;
#ifndef USE_LOONGSON_MMI
mmxdatafield mmx_mask_0;
mmxdatafield mmx_mask_1;
@@ -207,9 +210,12 @@ static const mmx_data_t c =
MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080),
MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f),
MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
+ MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004),
MMXDATA_INIT (.mmx_565_r, 0x000000f800000000),
MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000),
MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8),
+ MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8),
+ MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00),
#ifndef USE_LOONGSON_MMI
MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000),
MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff),
@@ -567,6 +573,27 @@ pack_565 (__m64 pixel, __m64 target, int pos)
#endif
}
+static force_inline __m64
+pack_4xpacked565 (__m64 a, __m64 b)
+{
+ __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
+ __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
+
+ __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
+ __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
+
+ __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
+ __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
+
+ t0 = _mm_or_si64 (t0, g0);
+ t1 = _mm_or_si64 (t1, g1);
+
+ t0 = shift(t0, -5);
+ t1 = shift(t1, -5 + 16);
+
+ return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+}
+
#ifndef _MSC_VER
static force_inline __m64
@@ -2091,6 +2118,60 @@ pixman_fill_mmx (uint32_t *bits,
}
static void
+mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
+ pixman_composite_info_t *info)
+{
+ PIXMAN_COMPOSITE_ARGS (info);
+ uint16_t *dst_line, *dst;
+ uint32_t *src_line, *src, s;
+ int dst_stride, src_stride;
+ int32_t w;
+
+ PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+ PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+ while (height--)
+ {
+ dst = dst_line;
+ dst_line += dst_stride;
+ src = src_line;
+ src_line += src_stride;
+ w = width;
+
+ while (w && (unsigned long)dst & 7)
+ {
+ s = *src++;
+ *dst = CONVERT_8888_TO_0565 (s);
+ dst++;
+ w--;
+ }
+
+ while (w >= 4)
+ {
+ __m64 vdest;
+ __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
+ __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
+
+ vdest = pack_4xpacked565 (vsrc0, vsrc1);
+
+ *(__m64 *)dst = vdest;
+
+ w -= 4;
+ src += 4;
+ dst += 4;
+ }
+
+ while (w)
+ {
+ s = *src++;
+ *dst = CONVERT_8888_TO_0565 (s);
+ dst++;
+ w--;
+ }
+ }
+}
+
+static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
pixman_composite_info_t *info)
{
@@ -3433,6 +3514,10 @@ static const pixman_fast_path_t mmx_fast_paths[] =
PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ),
PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ),
+ PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
+ PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
+ PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ),
+ PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ),
PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ),
commit 3e8fe65a0893fcd82bdea205de49f53be32bb074
Author: Matt Turner <mattst88@gmail.com>
Date: Wed Apr 18 16:24:28 2012 -0400
mmx: add x8f8g8b8 fetcher
Loongson:
Reply to: