
pixman: Changes to 'upstream-unstable'



 configure.ac                   |   22 -
 pixman/pixman-arm-simd-asm.S   |   41 +++
 pixman/pixman-arm-simd.c       |    6 
 pixman/pixman-general.c        |   18 -
 pixman/pixman-implementation.c |   16 +
 pixman/pixman-mmx.c            |   64 -----
 pixman/pixman-vmx.c            |  492 ++++++++++++++++-------------------------
 pixman/pixman.c                |   17 -
 test/Makefile.sources          |    2 
 test/affine-bench.c            |   24 +-
 test/cover-test.c              |  449 +++++++++++++++++++++++++++++++++++++
 test/fence-image-self-test.c   |  239 +++++++++++++++++++
 test/lowlevel-blt-bench.c      |    6 
 test/scaling-test.c            |   66 +++--
 test/utils.c                   |  133 ++++++++++-
 test/utils.h                   |   21 +
 16 files changed, 1199 insertions(+), 417 deletions(-)

New commits:
commit fa71d08a81c9bf3f2366ee45474ff868d9e10b8e
Author: Oded Gabbay <oded.gabbay@gmail.com>
Date:   Fri Oct 23 17:58:49 2015 +0300

    Pre-release version bump to 0.33.4
    
    Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>

diff --git a/configure.ac b/configure.ac
index b04cc69..dcacff1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -54,7 +54,7 @@ AC_PREREQ([2.57])
 
 m4_define([pixman_major], 0)
 m4_define([pixman_minor], 33)
-m4_define([pixman_micro], 3)
+m4_define([pixman_micro], 4)
 
 m4_define([pixman_version],[pixman_major.pixman_minor.pixman_micro])
 

commit 9728241bd098bc4260e6cd83997dfecc64adc356
Author: Andrea Canciani <ranma42@gmail.com>
Date:   Tue Oct 13 13:35:59 2015 +0200

    test: Fix fence-image-self-test on Mac
    
    On Mac OS X, according to the mprotect() manpage, "When a program
    violates the protections of a page, it gets a SIGBUS or SIGSEGV
    signal", but fence-image-self-test was only accepting SIGSEGV as
    notification of an invalid access.
    
    Fixes fence-image-self-test
    
    Reviewed-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
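
For background, the fix amounts to installing the same handler for both
signals before touching the protected page. A minimal, self-contained
sketch of the idea (illustrative only, not pixman's actual test code;
MAP_ANONYMOUS may be spelled MAP_ANON on older BSD/Mac systems):

    #include <signal.h>
    #include <stdint.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    static void
    fault_handler (int sig, siginfo_t *si, void *ctx)
    {
        (void) sig; (void) si; (void) ctx;
        /* SIGSEGV (Linux) or SIGBUS (Mac OS X): either one means
         * the inaccessible page did its job. */
        _exit (EXIT_SUCCESS);
    }

    int
    main (void)
    {
        struct sigaction sa;
        long pagesize = sysconf (_SC_PAGESIZE);
        volatile uint8_t *page;

        memset (&sa, 0, sizeof sa);
        sa.sa_flags = SA_SIGINFO;
        sa.sa_sigaction = fault_handler;
        if (sigaction (SIGSEGV, &sa, NULL) == -1 ||
            sigaction (SIGBUS, &sa, NULL) == -1)
            return EXIT_FAILURE;

        page = mmap (NULL, pagesize, PROT_NONE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (page == MAP_FAILED)
            return EXIT_FAILURE;

        return page[0]; /* faults; delivered as SIGSEGV or SIGBUS */
    }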

diff --git a/test/fence-image-self-test.c b/test/fence-image-self-test.c
index c883038..c80b3cf 100644
--- a/test/fence-image-self-test.c
+++ b/test/fence-image-self-test.c
@@ -73,7 +73,7 @@ prinfo (const char *fmt, ...)
 }
 
 static void
-do_expect_segv (void (*fn)(void *), void *data)
+do_expect_signal (void (*fn)(void *), void *data)
 {
     struct sigaction sa;
 
@@ -82,6 +82,8 @@ do_expect_segv (void (*fn)(void *), void *data)
     sa.sa_sigaction = segv_handler;
     if (sigaction (SIGSEGV, &sa, NULL) == -1)
         die ("sigaction failed", errno);
+    if (sigaction (SIGBUS, &sa, NULL) == -1)
+        die ("sigaction failed", errno);
 
     (*fn)(data);
 
@@ -96,7 +98,7 @@ do_expect_segv (void (*fn)(void *), void *data)
  * to exit with success, and return failure otherwise.
  */
 static pixman_bool_t
-expect_segv (void (*fn)(void *), void *data)
+expect_signal (void (*fn)(void *), void *data)
 {
     pid_t pid, wp;
     int status;
@@ -106,7 +108,7 @@ expect_segv (void (*fn)(void *), void *data)
         die ("fork failed", errno);
 
     if (pid == 0)
-        do_expect_segv (fn, data); /* never returns */
+        do_expect_signal (fn, data); /* never returns */
 
     wp = waitpid (pid, &status, 0);
     if (wp != pid)
@@ -131,9 +133,9 @@ test_read_fault (uint8_t *p, int offset)
 {
     prinfo ("*(uint8_t *)(%p + %d)", p, offset);
 
-    if (expect_segv (read_u8, p + offset))
+    if (expect_signal (read_u8, p + offset))
     {
-        prinfo ("\tSEGV OK\n");
+        prinfo ("\tsignal OK\n");
 
         return TRUE;
     }
diff --git a/test/utils.c b/test/utils.c
index 8657966..f8e42a5 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -471,9 +471,9 @@ fence_image_destroy (pixman_image_t *image, void *data)
  * min_width is only a minimum width for the image. The width is aligned up
  * for the row size to be divisible by both page size and pixel size.
  *
- * If stride_fence is true, the additional page on each row will be armed
- * to cause SIGSEVG on all accesses. This should catch all accesses outside
- * the valid row pixels.
+ * If stride_fence is true, the additional page on each row will be
+ * armed to cause SIGSEGV or SIGBUS on all accesses. This should catch
+ * all accesses outside the valid row pixels.
  */
 pixman_image_t *
 fence_image_create_bits (pixman_format_code_t format,

commit 7de61d8d14e84623b6fa46506eb74f938287f536
Author: Matt Turner <mattst88@gmail.com>
Date:   Sun Oct 11 14:44:46 2015 -0700

    mmx: Use MMX2 intrinsics from xmmintrin.h directly.
    
    We had lots of hacks to handle the inability to include xmmintrin.h
    without compiling with -msse (lest SSE instructions be used in
    pixman-mmx.c). Recent versions of gcc have relaxed this restriction.
    
    Change configure.ac to test that xmmintrin.h can be included and that we
    can use some intrinsics from it, and remove the work-around code from
    pixman-mmx.c.
    
    Evidently allows gcc 4.9.3 to optimize better as well:
    
       text	   data	    bss	    dec	    hex	filename
     657078	  30848	    680	 688606	  a81de	libpixman-1.so.0.33.3 before
     656710	  30848	    680	 688238	  a806e	libpixman-1.so.0.33.3 after
    
    Reviewed-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
    Tested-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
    Signed-off-by: Matt Turner <mattst88@gmail.com>
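
For reference, the removed inline-asm helpers map one-to-one onto the
MMX2 intrinsics that xmmintrin.h provides for __m64 values. A small
compile-check sketch (assuming gcc/clang on x86 with MMX enabled,
along the lines of the configure.ac test in the diff below):

    #include <mmintrin.h>
    #include <xmmintrin.h>

    int
    mmx2_intrinsics_probe (void)
    {
        __m64 v = _mm_cvtsi32_si64 (1);
        __m64 w;
        int   m;

        w = _mm_shuffle_pi16 (v, 5); /* replaces the pshufw asm ("K" constraint) */
        w = _mm_mulhi_pu16 (w, v);   /* replaces the pmulhuw asm */
        m = _mm_movemask_pi8 (w);    /* replaces the pmovmskb asm */

        _mm_empty ();                /* leave the MMX/x87 state clean */
        return m + _mm_cvtsi64_si32 (v);
    }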

diff --git a/configure.ac b/configure.ac
index 424bfd3..b04cc69 100644
--- a/configure.ac
+++ b/configure.ac
@@ -347,21 +347,14 @@ AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
 #error "Need GCC >= 3.4 for MMX intrinsics"
 #endif
 #include <mmintrin.h>
+#include <xmmintrin.h>
 int main () {
     __m64 v = _mm_cvtsi32_si64 (1);
     __m64 w;
 
-    /* Some versions of clang will choke on K */
-    asm ("pshufw %2, %1, %0\n\t"
-        : "=y" (w)
-        : "y" (v), "K" (5)
-    );
-
-    /* Some versions of clang will choke on this */
-    asm ("pmulhuw %1, %0\n\t"
-	: "+y" (w)
-	: "y" (v)
-    );
+    /* Test some intrinsics from xmmintrin.h */
+    w = _mm_shuffle_pi16(v, 5);
+    w = _mm_mulhi_pu16(w, w);
 
     return _mm_cvtsi64_si32 (v);
 }]])], have_mmx_intrinsics=yes)
diff --git a/pixman/pixman-mmx.c b/pixman/pixman-mmx.c
index 05c48a4..88c3a39 100644
--- a/pixman/pixman-mmx.c
+++ b/pixman/pixman-mmx.c
@@ -40,6 +40,9 @@
 #else
 #include <mmintrin.h>
 #endif
+#ifdef USE_X86_MMX
+#include <xmmintrin.h>
+#endif
 #include "pixman-private.h"
 #include "pixman-combine32.h"
 #include "pixman-inlines.h"
@@ -59,66 +62,7 @@ _mm_empty (void)
 }
 #endif
 
-#ifdef USE_X86_MMX
-# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
-#  include <xmmintrin.h>
-# else
-/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
- * instructions to be generated that we don't want. Just duplicate the
- * functions we want to use.  */
-extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_movemask_pi8 (__m64 __A)
-{
-    int ret;
-
-    asm ("pmovmskb %1, %0\n\t"
-	: "=r" (ret)
-	: "y" (__A)
-    );
-
-    return ret;
-}
-
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_mulhi_pu16 (__m64 __A, __m64 __B)
-{
-    asm ("pmulhuw %1, %0\n\t"
-	: "+y" (__A)
-	: "y" (__B)
-    );
-    return __A;
-}
-
-#  ifdef __OPTIMIZE__
-extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
-_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
-{
-    __m64 ret;
-
-    asm ("pshufw %2, %1, %0\n\t"
-	: "=y" (ret)
-	: "y" (__A), "K" (__N)
-    );
-
-    return ret;
-}
-#  else
-#   define _mm_shuffle_pi16(A, N)					\
-    ({									\
-	__m64 ret;							\
-									\
-	asm ("pshufw %2, %1, %0\n\t"					\
-	     : "=y" (ret)						\
-	     : "y" (A), "K" ((const int8_t)N)				\
-	);								\
-									\
-	ret;								\
-    })
-#  endif
-# endif
-#endif
-
-#ifndef _MSC_VER
+#ifndef _MM_SHUFFLE
 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
 #endif

commit 90e62c086766afffd289a321c7de8ea4b5cac87d
Author: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date:   Fri Sep 4 15:39:00 2015 +0300

    vmx: implement fast path vmx_composite_over_n_8888
    
    Running "lowlevel-blt-bench over_n_8888" on Playstation3 3.2GHz,
    Gentoo ppc (32-bit userland) gave the following results:
    
    before:  over_n_8888 =  L1: 147.47  L2: 205.86  M:121.07
    after:   over_n_8888 =  L1: 287.27  L2: 261.09  M:133.48
    
    Cairo non-trimmed benchmarks on POWER8, 3.4GHz 8 Cores:
    
    ocitysmap          659.69  -> 611.71   :  1.08x speedup
    xfce4-terminal-a1  2725.22 -> 2547.47  :  1.07x speedup
    
    Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
    Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
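
The per-pixel operation behind this fast path is plain OVER with a
premultiplied solid source: dst = src + dst * (255 - alpha(src)) / 255
on each channel, which is what the UN8x4_MUL_UN8_ADD_UN8x4 macro in
the scalar edge loops computes. A scalar reference sketch (the helpers
below are illustrative stand-ins, not pixman's macros; pixman's macro
additionally saturates the final add):

    #include <stdint.h>

    /* 8-bit multiply with rounding: roughly (a * b + 127) / 255. */
    static uint8_t
    un8_mul (uint8_t a, uint8_t b)
    {
        uint16_t t = (uint16_t) a * b + 0x80;
        return (uint8_t) ((t + (t >> 8)) >> 8);
    }

    /* OVER with a solid premultiplied a8r8g8b8 source. */
    static uint32_t
    over_n_pixel (uint32_t src, uint32_t dst)
    {
        uint8_t  ia = 0xff - (uint8_t) (src >> 24); /* inverse src alpha */
        uint32_t res = 0;
        int      shift;

        for (shift = 0; shift < 32; shift += 8)
        {
            uint8_t s = (uint8_t) (src >> shift);
            uint8_t d = (uint8_t) (dst >> shift);
            /* For valid premultiplied pixels, s + d*ia/255 <= 255. */
            res |= (uint32_t) (uint8_t) (s + un8_mul (d, ia)) << shift;
        }
        return res;
    }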

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 3eaa866..41efdcf 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -2628,6 +2628,58 @@ vmx_composite_src_x888_8888 (pixman_implementation_t *imp,
 }
 
 static void
+vmx_composite_over_n_8888 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t src, ia;
+    int      i, w, dst_stride;
+    vector unsigned int vdst, vsrc, via;
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (
+	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    vsrc = (vector unsigned int){src, src, src, src};
+    via = negate (splat_alpha (vsrc));
+    ia = ALPHA_8 (~src);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	while (w && ((uintptr_t)dst & 15))
+	{
+	    uint32_t d = *dst;
+	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+	    *dst++ = d;
+	    w--;
+	}
+
+	for (i = w / 4; i > 0; i--)
+	{
+	    vdst = pix_multiply (load_128_aligned (dst), via);
+	    save_128_aligned (dst, pix_add (vsrc, vdst));
+	    dst += 4;
+	}
+
+	for (i = w % 4; --i >= 0;)
+	{
+	    uint32_t d = dst[i];
+	    UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, src);
+	    dst[i] = d;
+	}
+    }
+}
+
+static void
 vmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
 {
@@ -2936,6 +2988,8 @@ FAST_NEAREST_MAINLOOP (vmx_8888_8888_normal_OVER,
 
 static const pixman_fast_path_t vmx_fast_paths[] =
 {
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null, a8r8g8b8, vmx_composite_over_n_8888),
+    PIXMAN_STD_FAST_PATH (OVER, solid,    null, x8r8g8b8, vmx_composite_over_n_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, vmx_composite_over_8888_8888),
     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, vmx_composite_over_8888_8888),

commit 2876d8d3dd6a71cb9eb3ac93e5b9c18b71a452da
Author: Ben Avison <bavison@riscosopen.org>
Date:   Fri Sep 4 03:09:20 2015 +0100

    affine-bench: remove 8e margin from COVER area
    
    Patch "Remove the 8e extra safety margin in COVER_CLIP analysis" reduced
    the required image area for setting the COVER flags in
    pixman.c:analyze_extent(). Do the same reduction in affine-bench.
    
    Leaving the old calculations in place would be very confusing for anyone
    reading the code.
    
    Also add a comment that explains how affine-bench wants to hit the COVER
    paths. This explains why the intricate extent calculations are copied
    from pixman.c.
    
    [Pekka: split patch, change comments, write commit message]
    Signed-off-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
    Reviewed-by: Ben Avison <bavison@riscosopen.org>
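
A worked example of the tightened bounds, using stand-ins for the
pixman.h fixed-point helpers (pixman_fixed_1 = 0x10000, and
pixman_fixed_to_int() is an arithmetic shift right by 16):

    #include <stdint.h>
    #include <stdio.h>

    typedef int32_t pixman_fixed_t;
    #define pixman_fixed_1          ((pixman_fixed_t) 0x10000)
    #define pixman_fixed_to_int(f)  ((int) ((f) >> 16))

    int
    main (void)
    {
        /* A bilinear sample at x touches pixels floor(x - 0.5) and
         * floor(x - 0.5) + 1; for samples spanning [0.5, 3.5]: */
        pixman_fixed_t x1 = pixman_fixed_1 / 2;
        pixman_fixed_t x2 = 3 * pixman_fixed_1 + pixman_fixed_1 / 2;

        int xmin = pixman_fixed_to_int (x1 - pixman_fixed_1 / 2); /* 0 */
        int xmax = pixman_fixed_to_int (x2 + pixman_fixed_1 / 2); /* 4 */

        printf ("COVER requires source pixels [%d, %d]\n", xmin, xmax);
        return 0;
    }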

diff --git a/test/affine-bench.c b/test/affine-bench.c
index 9e0121e..86bf46e 100644
--- a/test/affine-bench.c
+++ b/test/affine-bench.c
@@ -395,14 +395,26 @@ main (int argc, char *argv[])
         return EXIT_FAILURE;
     }
 
+    /* Compute required extents for source and mask image so they qualify
+     * for COVER fast paths and get the flags in pixman.c:analyze_extent().
+     * These computations are for FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR,
+     * but at the same time they also allow COVER_CLIP_NEAREST.
+     */
     compute_transformed_extents (&binfo.transform, &dest_box, &transformed);
-    /* The source area is expanded by a tiny bit (8/65536th pixel)
-     * to match the calculation of the COVER_CLIP flags in analyze_extent()
+    xmin = pixman_fixed_to_int (transformed.x1 - pixman_fixed_1 / 2);
+    ymin = pixman_fixed_to_int (transformed.y1 - pixman_fixed_1 / 2);
+    xmax = pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2);
+    ymax = pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2);
+    /* Note:
+     * The upper limits can be reduced to the following when fetchers
+     * are guaranteed to not access pixels with zero weight. This concerns
+     * particularly all bilinear samplers.
+     *
+     * xmax = pixman_fixed_to_int (transformed.x2 + pixman_fixed_1 / 2 - pixman_fixed_e);
+     * ymax = pixman_fixed_to_int (transformed.y2 + pixman_fixed_1 / 2 - pixman_fixed_e);
+     * This is equivalent to subtracting 0.5 and rounding up, rather than
+     * subtracting 0.5, rounding down and adding 1.
      */
-    xmin = pixman_fixed_to_int (transformed.x1 - 8 * pixman_fixed_e - pixman_fixed_1 / 2);
-    ymin = pixman_fixed_to_int (transformed.y1 - 8 * pixman_fixed_e - pixman_fixed_1 / 2);
-    xmax = pixman_fixed_to_int (transformed.x2 + 8 * pixman_fixed_e + pixman_fixed_1 / 2);
-    ymax = pixman_fixed_to_int (transformed.y2 + 8 * pixman_fixed_e + pixman_fixed_1 / 2);
     binfo.src_x = -xmin;
     binfo.src_y = -ymin;
 

commit 0e2e9751282b19280c92be4a80c5ae476bae0ce4
Author: Ben Avison <bavison@riscosopen.org>
Date:   Fri Sep 4 03:09:20 2015 +0100

    Remove the 8e extra safety margin in COVER_CLIP analysis
    
    As discussed in
    http://lists.freedesktop.org/archives/pixman/2015-August/003905.html
    
    the 8 * pixman_fixed_e (8e) adjustment which was applied to the transformed
    coordinates is a legacy of rounding errors which used to occur in old
    versions of Pixman, but which no longer apply. For any affine
    transform, transforming the upper coordinate is now guaranteed to
    give the same result as transforming the lower coordinate and
    adding (size-1) steps of the increment in source coordinate space.
    No projective
    transform routines use the COVER_CLIP flags, so they cannot be affected.
    
    Proof by Siarhei Siamashka:
    
    Let's take a look at the following affine transformation matrix (with 16.16
    fixed point values) and two vectors:
    
             | a   b     c    |
    M      = | d   e     f    |
             | 0   0  0x10000 |
    
             |  x_dst  |
    P     =  |  y_dst  |
             | 0x10000 |
    
             | 0x10000 |
    ONE_X  = |    0    |
             |    0    |
    
    The current matrix multiplication code does the following calculations:
    
                 | (a * x_dst + b * y_dst + 0x8000) / 0x10000 + c |
        M * P =  | (d * x_dst + e * y_dst + 0x8000) / 0x10000 + f |
                 |                   0x10000                      |
    
    These calculations are not perfectly exact and we may get rounding errors
    because the integer coordinates are adjusted by 0.5 (or 0x8000 in the
    16.16 fixed point format) before doing matrix multiplication. For
    example, if the 'a' coefficient is an odd number and 'b' is zero,
    then we are losing some of the least significant bits when dividing by
    0x10000.
    
    So we need to strictly prove that the following expression is always
    true even though we have to deal with rounding:
    
                                              | a |
        M * (P + ONE_X) - M * P = M * ONE_X = | d |
                                              | 0 |
    
    or
    
       ((a * (x_dst + 0x10000) + b * y_dst + 0x8000) / 0x10000 + c)
      -
       ((a * x_dst             + b * y_dst + 0x8000) / 0x10000 + c)
      =
        a
    
    It's easy to see that this is equivalent to
    
        a + ((a * x_dst + b * y_dst + 0x8000) / 0x10000 + c)
          - ((a * x_dst + b * y_dst + 0x8000) / 0x10000 + c)
      =
        a
    
    Which means that stepping exactly by one pixel horizontally in the
    destination image space (advancing 'x_dst' by 0x10000) is the same as
    changing the transformed 'x_src' coordinate in the source image space
    exactly by 'a'. The same applies to the vertical direction too.
    Repeating these steps, we can reach any pixel in the source image
    space and get exactly the same fixed point coordinates as doing
    matrix multiplications per each pixel.
    
    By the way, the older matrix multiplication implementation, which
    relied on less accurate calculations with three intermediate roundings
    "((a + 0x8000) >> 16) + ((b + 0x8000) >> 16) + ((c + 0x8000) >> 16)",
    also has the same properties. However reverting
        http://cgit.freedesktop.org/pixman/commit/?id=ed39992564beefe6b12f81e842caba11aff98a9c
    and applying this "Remove the 8e extra safety margin in COVER_CLIP
    analysis" patch makes the cover test fail. The real reason why it fails
    is that the old pixman code was using "pixman_transform_point_3d()"
    function
        http://cgit.freedesktop.org/pixman/tree/pixman/pixman-matrix.c?id=pixman-0.28.2#n49
    for getting the transformed coordinate of the top left corner pixel
    in the image scaling code, but at the same time using a different
    "pixman_transform_point()" function
        http://cgit.freedesktop.org/pixman/tree/pixman/pixman-matrix.c?id=pixman-0.28.2#n82
    in the extents calculation code for setting the cover flag. And these
    functions did the intermediate rounding differently. That's why the 8e
    safety margin was needed.
    
    ** proof ends
    
    However, for COVER_CLIP_NEAREST, the actual margins added were not 8e.
    Because the half-way cases round down, that is, coordinate 0 hits pixel
    index -1 while coordinate e hits pixel index 0, the extra safety margins
    were actually 7e to the left and up, and 9e to the right and down. This
    patch removes the 7e and 9e margins and restores the -e adjustment
    required for NEAREST sampling in Pixman. For reference, see
    pixman/rounding.txt.
    
    For COVER_CLIP_BILINEAR, the margins were exactly 8e as there are no
    additional offsets to be restored, so simply removing the 8e additions
    is enough.
    
    Proof:
    
    All implementations must give the same numerical results as
    bits_image_fetch_pixel_nearest() / bits_image_fetch_pixel_bilinear().
    
    The former does
        int x0 = pixman_fixed_to_int (x - pixman_fixed_e);
    which maps directly to the new test for the nearest flag, when you consider
    that x0 must fall in the interval [0,width).
    
    The latter does
        x1 = x - pixman_fixed_1 / 2;
        x1 = pixman_fixed_to_int (x1);
        x2 = x1 + 1;
    When you write a COVER path, you take advantage of the assumption that
    both x1 and x2 fall in the interval [0, width).
    
    As samplers are allowed to fetch the pixel at x2 unconditionally, we
    require
        x1 >= 0
        x2 < width
    so
        x - pixman_fixed_1 / 2 >= 0
        x - pixman_fixed_1 / 2 + pixman_fixed_1 < width * pixman_fixed_1
    so
        pixman_fixed_to_int (x - pixman_fixed_1 / 2) >= 0
        pixman_fixed_to_int (x + pixman_fixed_1 / 2) < width
    which matches the source code lines for the bilinear case, once you delete
    the lines that add the 8e margin.
    
    Signed-off-by: Ben Avison <bavison@riscosopen.org>
    [Pekka: adjusted commit message, left affine-bench changes for another patch]
    [Pekka: add commit message parts from Siarhei]
    Signed-off-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
    Reviewed-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
    Reviewed-by: Ben Avison <bavison@riscosopen.org>
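
The half-way behaviour for NEAREST is easy to verify numerically; a
tiny sketch with stand-ins for the pixman.h macros (pixman_fixed_e is
the smallest fixed-point increment, 1/65536):

    #include <stdint.h>
    #include <stdio.h>

    typedef int32_t pixman_fixed_t;
    #define pixman_fixed_e          ((pixman_fixed_t) 1)
    #define pixman_fixed_to_int(f)  ((int) ((f) >> 16))

    int
    main (void)
    {
        /* NEAREST fetches pixman_fixed_to_int (x - pixman_fixed_e): */
        printf ("x = 0 hits pixel %d\n",
                pixman_fixed_to_int ((pixman_fixed_t) 0 - pixman_fixed_e)); /* -1 */
        printf ("x = e hits pixel %d\n",
                pixman_fixed_to_int (pixman_fixed_e - pixman_fixed_e));     /*  0 */
        return 0;
    }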

diff --git a/pixman/pixman.c b/pixman/pixman.c
index a07c577..f932eac 100644
--- a/pixman/pixman.c
+++ b/pixman/pixman.c
@@ -497,21 +497,12 @@ analyze_extent (pixman_image_t       *image,
     if (!compute_transformed_extents (transform, extents, &transformed))
 	return FALSE;
 
-    /* Expand the source area by a tiny bit so account of different rounding that
-     * may happen during sampling. Note that (8 * pixman_fixed_e) is very far from
-     * 0.5 so this won't cause the area computed to be overly pessimistic.
-     */
-    transformed.x1 -= 8 * pixman_fixed_e;
-    transformed.y1 -= 8 * pixman_fixed_e;
-    transformed.x2 += 8 * pixman_fixed_e;
-    transformed.y2 += 8 * pixman_fixed_e;
-
     if (image->common.type == BITS)
     {
-	if (pixman_fixed_to_int (transformed.x1) >= 0			&&
-	    pixman_fixed_to_int (transformed.y1) >= 0			&&
-	    pixman_fixed_to_int (transformed.x2) < image->bits.width	&&
-	    pixman_fixed_to_int (transformed.y2) < image->bits.height)
+	if (pixman_fixed_to_int (transformed.x1 - pixman_fixed_e) >= 0                &&
+	    pixman_fixed_to_int (transformed.y1 - pixman_fixed_e) >= 0                &&
+	    pixman_fixed_to_int (transformed.x2 - pixman_fixed_e) < image->bits.width &&
+	    pixman_fixed_to_int (transformed.y2 - pixman_fixed_e) < image->bits.height)
 	{
 	    *flags |= FAST_PATH_SAMPLES_COVER_CLIP_NEAREST;
 	}

commit 23525b4ea5bc2dd67f8f65b90d023b6580ecbc36
Author: Ben Avison <bavison@riscosopen.org>
Date:   Tue Sep 22 12:43:25 2015 +0100

    pixman-general: Tighten up calculation of temporary buffer sizes
    
    Each of the alignment operations can add at most 15 bytes to the
    space requirement. This permits some edge cases to use the stack
    buffer where the code would previously have concluded that a heap
    buffer was required.
    
    Reviewed-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
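
The 15-byte bound comes straight from the ALIGN macro in
pixman-general.c, which rounds a pointer up to the next 16-byte
boundary. A small sanity-check sketch:

    #include <assert.h>
    #include <stdint.h>

    /* Same macro as pixman-general.c. */
    #define ALIGN(addr) \
        ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))

    int
    main (void)
    {
        uint8_t buf[64];
        int i;

        /* Rounding up to 16 bytes advances a pointer by 0..15 bytes,
         * so three aligned buffers fit in width * Bpp * 3 plus
         * 15 * 3 bytes of slack. */
        for (i = 0; i < 32; i++)
            assert (ALIGN (buf + i) >= buf + i &&
                    ALIGN (buf + i) - (buf + i) <= 15);

        return 0;
    }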

diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index fa88463..6141cb0 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -158,9 +158,9 @@ general_composite_rect  (pixman_implementation_t *imp,
     if (width <= 0 || _pixman_multiply_overflows_int (width, Bpp * 3))
 	return;
 
-    if (width * Bpp * 3 > sizeof (stack_scanline_buffer) - 32 * 3)
+    if (width * Bpp * 3 > sizeof (stack_scanline_buffer) - 15 * 3)
     {
-	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
+	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 15 * 3);
 
 	if (!scanline_buffer)
 	    return;

commit 8b49d4b6b460d0c9299bca4ccddd7cd00d8f8441
Author: Siarhei Siamashka <siarhei.siamashka@gmail.com>
Date:   Tue Sep 22 04:25:40 2015 +0300

    pixman-general: Fix stack related pointer arithmetic overflow
    
    As https://bugs.freedesktop.org/show_bug.cgi?id=92027#c6 explains,
    the stack is allocated at the very top of the process address space
    in some configurations (32-bit x86 systems with ASLR disabled).
    The careless computations done with the 'dest_buffer' pointer may
    then overflow, defeating the buffer upper-limit check.
    
    The problem can be reproduced using the 'stress-test' program,
    which segfaults when executed via setarch:
    
        export CFLAGS="-O2 -m32" && ./autogen.sh
        ./configure --disable-libpng --disable-gtk && make
        setarch i686 -R test/stress-test
    
    This patch introduces the required corrections. The extra check
    for negative 'width' may be redundant (the invalid 'width' value
    is not supposed to reach here), but it's better to play safe
    when dealing with buffers allocated on the stack.
    
    Reported-by: Ludovic Courtès <ludo@gnu.org>
    Signed-off-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
    Reviewed-by: soren.sandmann@gmail.com
    Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
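
The failure mode is worth spelling out: with the stack at the very top
of the address space, computing 'pointer + size' can wrap around to a
small value, so an upper-bound check built from such pointers passes
exactly when it should fail. A hypothetical illustration of the
before/after check shapes (not the actual pixman code):

    #include <stddef.h>
    #include <stdint.h>

    #define BUF_SIZE 4096

    /* Before: forms an end pointer. If 'stack_buf' sits near
     * UINTPTR_MAX, 'stack_buf + needed' is undefined behavior and in
     * practice wrapped on 32-bit x86, defeating the check. */
    static int
    needs_heap_before (uint8_t *stack_buf, size_t needed)
    {
        return stack_buf + needed > stack_buf + BUF_SIZE;
    }

    /* After: compare sizes only; no out-of-bounds pointer is formed. */
    static int
    needs_heap_after (size_t needed)
    {
        return needed > BUF_SIZE;
    }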

diff --git a/pixman/pixman-general.c b/pixman/pixman-general.c
index 7cdea29..fa88463 100644
--- a/pixman/pixman-general.c
+++ b/pixman/pixman-general.c
@@ -155,23 +155,21 @@ general_composite_rect  (pixman_implementation_t *imp,
 #define ALIGN(addr)							\
     ((uint8_t *)((((uintptr_t)(addr)) + 15) & (~15)))
 
-    src_buffer = ALIGN (scanline_buffer);
-    mask_buffer = ALIGN (src_buffer + width * Bpp);
-    dest_buffer = ALIGN (mask_buffer + width * Bpp);
+    if (width <= 0 || _pixman_multiply_overflows_int (width, Bpp * 3))
+	return;
 
-    if (ALIGN (dest_buffer + width * Bpp) >
-	    scanline_buffer + sizeof (stack_scanline_buffer))
+    if (width * Bpp * 3 > sizeof (stack_scanline_buffer) - 32 * 3)
     {
 	scanline_buffer = pixman_malloc_ab_plus_c (width, Bpp * 3, 32 * 3);
 
 	if (!scanline_buffer)
 	    return;
-
-	src_buffer = ALIGN (scanline_buffer);
-	mask_buffer = ALIGN (src_buffer + width * Bpp);
-	dest_buffer = ALIGN (mask_buffer + width * Bpp);
     }
 
+    src_buffer = ALIGN (scanline_buffer);
+    mask_buffer = ALIGN (src_buffer + width * Bpp);
+    dest_buffer = ALIGN (mask_buffer + width * Bpp);
+
     if (width_flag == ITER_WIDE)
     {
 	/* To make sure there aren't any NANs in the buffers */

commit 4297e9058d252cac653723fe0b1bee559fbac3a4
Author: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
Date:   Thu Sep 17 15:43:27 2015 +0200

    test: add a check for FE_DIVBYZERO
    
    Some architectures, such as Microblaze and Nios2, currently do not
    implement FE_DIVBYZERO, even though they have <fenv.h> and
    feenableexcept(). This commit adds a configure.ac check to verify
    whether FE_DIVBYZERO is defined or not, and if not, disables the
    problematic code in test/utils.c.
    
    Signed-off-by: Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
    Signed-off-by: Marek Vasut <marex@denx.de>
    Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>
    Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>

diff --git a/configure.ac b/configure.ac
index f93cc30..424bfd3 100644
--- a/configure.ac
+++ b/configure.ac
@@ -891,6 +891,11 @@ if test x$have_feenableexcept = xyes; then
    AC_DEFINE(HAVE_FEENABLEEXCEPT, 1, [Whether we have feenableexcept()])
 fi
 
+AC_CHECK_DECL([FE_DIVBYZERO],
+	[AC_DEFINE(HAVE_FEDIVBYZERO, 1, [Whether we have FE_DIVBYZERO])],
+	[],
+	[[#include <fenv.h>]])
+
 AC_CHECK_FUNC(gettimeofday, have_gettimeofday=yes, have_gettimeofday=no)
 AC_CHECK_HEADER(sys/time.h, have_sys_time_h=yes, have_sys_time_h=no)
 if test x$have_gettimeofday = xyes && test x$have_sys_time_h = xyes; then
diff --git a/test/utils.c b/test/utils.c
index 222d4d5..8657966 100644
--- a/test/utils.c
+++ b/test/utils.c
@@ -966,9 +966,11 @@ enable_divbyzero_exceptions (void)
 {
 #ifdef HAVE_FENV_H
 #ifdef HAVE_FEENABLEEXCEPT
+#ifdef HAVE_FEDIVBYZERO
     feenableexcept (FE_DIVBYZERO);
 #endif
 #endif
+#endif
 }
 
 void

commit 8189fad9610981d5b4dcd8f8980ff169110fb33c
Author: Oded Gabbay <oded.gabbay@gmail.com>
Date:   Sun Sep 6 11:45:20 2015 +0300

    vmx: Remove unused expensive functions
    
    Now that we have replaced the expensive functions with
    better-performing alternatives, remove them so they will not be
    used again.
    
    Running Cairo benchmark on trimmed traces gave the following results:
    
    POWER8, 8 cores, 3.4GHz, RHEL 7.2 ppc64le.
    
    Speedups
    ========
    t-firefox-scrolling     1232.30 -> 1096.55 :  1.12x
    t-gnome-terminal-vim    613.86  -> 553.10  :  1.11x
    t-evolution             405.54  -> 371.02  :  1.09x
    t-firefox-talos-gfx     919.31  -> 862.27  :  1.07x
    t-gvim                  653.02  -> 616.85  :  1.06x
    t-firefox-canvas-alpha  941.29  -> 890.42  :  1.06x
    
    Signed-off-by: Oded Gabbay <oded.gabbay@gmail.com>
    Acked-by: Pekka Paalanen <pekka.paalanen@collabora.co.uk>
    Acked-by: Siarhei Siamashka <siarhei.siamashka@gmail.com>

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index 7ef8bed..3eaa866 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -35,7 +35,6 @@
 
 #define AVV(x...) {x}
 
-static vector unsigned int mask_00ff;
 static vector unsigned int mask_ff000000;
 static vector unsigned int mask_red;
 static vector unsigned int mask_green;
@@ -280,20 +279,6 @@ save_128_aligned (uint32_t* data,
 }
 
 static force_inline vector unsigned int
-create_mask_16_128 (uint16_t mask)
-{
-    uint16_t* src;
-    vector unsigned short vsrc;
-    DECLARE_SRC_MASK_VAR;
-
-    src = &mask;
-
-    COMPUTE_SHIFT_MASK (src);
-    LOAD_VECTOR (src);
-    return (vector unsigned int) vec_splat(vsrc, 0);
-}
-
-static force_inline vector unsigned int
 create_mask_1x32_128 (const uint32_t *src)
 {
     vector unsigned int vsrc;
@@ -311,24 +296,6 @@ create_mask_32_128 (uint32_t mask)
 }
 
 static force_inline vector unsigned int
-unpack_32_1x128 (uint32_t data)
-{
-    vector unsigned int vdata = {0, 0, 0, data};
-    vector unsigned short lo;
-
-    lo = (vector unsigned short)
-#ifdef WORDS_BIGENDIAN
-	vec_mergel ((vector unsigned char) AVV(0),
-		    (vector unsigned char) vdata);
-#else
-	vec_mergel ((vector unsigned char) vdata,
-		    (vector unsigned char) AVV(0));
-#endif
-
-    return (vector unsigned int) lo;
-}
-
-static force_inline vector unsigned int
 unpacklo_128_16x8 (vector unsigned int data1, vector unsigned int data2)
 {
     vector unsigned char lo;
@@ -437,38 +404,6 @@ unpack_565_to_8888 (vector unsigned int lo)
     return vec_or (rb, g);
 }
 
-static force_inline uint32_t
-pack_1x128_32 (vector unsigned int data)
-{
-    vector unsigned char vpack;
-
-    vpack = vec_packsu((vector unsigned short) data,
-			(vector unsigned short) AVV(0));
-
-    return vec_extract((vector unsigned int) vpack, 1);
-}
-
-static force_inline vector unsigned int
-pack_2x128_128 (vector unsigned int lo, vector unsigned int hi)
-{
-    vector unsigned char vpack;
-
-    vpack = vec_packsu((vector unsigned short) hi,
-			(vector unsigned short) lo);
-
-    return (vector unsigned int) vpack;
-}
-
-static force_inline void
-negate_2x128 (vector unsigned int  data_lo,
-	      vector unsigned int  data_hi,
-	      vector unsigned int* neg_lo,
-	      vector unsigned int* neg_hi)
-{
-    *neg_lo = vec_xor (data_lo, mask_00ff);
-    *neg_hi = vec_xor (data_hi, mask_00ff);
-}
-
 static force_inline int
 is_opaque (vector unsigned int x)
 {
@@ -499,136 +434,6 @@ is_transparent (vector unsigned int x)
     return (cmp_result & 0x8888) == 0x8888;
 }
 
-static force_inline vector unsigned int
-expand_pixel_8_1x128 (uint8_t data)
-{
-    vector unsigned int vdata;
-
-    vdata = unpack_32_1x128 ((uint32_t) data);
-
-#ifdef WORDS_BIGENDIAN
-    return vec_perm (vdata, vdata,
-		     (vector unsigned char)AVV (
-			 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
-#else
-    return vec_perm (vdata, vdata,
-		     (vector unsigned char)AVV (
-			 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
-			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
-#endif
-}
-
-static force_inline vector unsigned int
-expand_alpha_1x128 (vector unsigned int data)
-{
-#ifdef WORDS_BIGENDIAN
-    return vec_perm (data, data,
-		     (vector unsigned char)AVV (
-			 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
-			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
-#else
-    return vec_perm (data, data,
-		     (vector unsigned char)AVV (
-			 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
-			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
-#endif
-}
-
-static force_inline void
-expand_alpha_2x128 (vector unsigned int  data_lo,
-		    vector unsigned int  data_hi,
-		    vector unsigned int* alpha_lo,
-		    vector unsigned int* alpha_hi)
-{
-
-    *alpha_lo = expand_alpha_1x128(data_lo);
-    *alpha_hi = expand_alpha_1x128(data_hi);
-}
-
-static force_inline void
-expand_alpha_rev_2x128 (vector unsigned int  data_lo,
-			vector unsigned int  data_hi,
-			vector unsigned int* alpha_lo,
-			vector unsigned int* alpha_hi)
-{
-#ifdef WORDS_BIGENDIAN
-    *alpha_lo = vec_perm (data_lo, data_lo,
-		     (vector unsigned char)AVV (
-			 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
-			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
-
-    *alpha_hi = vec_perm (data_hi, data_hi,
-		     (vector unsigned char)AVV (
-			 0x06, 0x07, 0x06, 0x07, 0x06, 0x07, 0x06, 0x07,
-			 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F, 0x0E, 0x0F));
-#else
-    *alpha_lo = vec_perm (data_lo, data_lo,
-		     (vector unsigned char)AVV (
-			 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
-			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
-
-    *alpha_hi = vec_perm (data_hi, data_hi,
-		     (vector unsigned char)AVV (
-			 0x00, 0x01, 0x00, 0x01, 0x00, 0x01, 0x00, 0x01,
-			 0x08, 0x09, 0x08, 0x09, 0x08, 0x09, 0x08, 0x09));
-#endif
-}
-
-static force_inline void
-pix_multiply_2x128 (vector unsigned int* data_lo,
-		    vector unsigned int* data_hi,
-		    vector unsigned int* alpha_lo,
-		    vector unsigned int* alpha_hi,
-		    vector unsigned int* ret_lo,
-		    vector unsigned int* ret_hi)
-{
-    *ret_lo = pix_multiply(*data_lo, *alpha_lo);
-    *ret_hi = pix_multiply(*data_hi, *alpha_hi);
-}
-
-static force_inline void
-over_2x128 (vector unsigned int* src_lo,
-	    vector unsigned int* src_hi,
-	    vector unsigned int* alpha_lo,
-	    vector unsigned int* alpha_hi,
-	    vector unsigned int* dst_lo,
-	    vector unsigned int* dst_hi)
-{
-    vector unsigned int t1, t2;
-
-    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
-
-    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
-
-    *dst_lo = (vector unsigned int)
-		    vec_adds ((vector unsigned char) *src_lo,
-			      (vector unsigned char) *dst_lo);
-
-    *dst_hi = (vector unsigned int)
-		    vec_adds ((vector unsigned char) *src_hi,
-			      (vector unsigned char) *dst_hi);
-}
-
-static force_inline void
-in_over_2x128 (vector unsigned int* src_lo,
-	       vector unsigned int* src_hi,
-	       vector unsigned int* alpha_lo,
-	       vector unsigned int* alpha_hi,
-	       vector unsigned int* mask_lo,
-	       vector unsigned int* mask_hi,
-	       vector unsigned int* dst_lo,
-	       vector unsigned int* dst_hi)
-{
-    vector unsigned int s_lo, s_hi;
-    vector unsigned int a_lo, a_hi;
-
-    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
-    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
-
-    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
-}
-
 static force_inline uint32_t
 core_combine_over_u_pixel_vmx (uint32_t src, uint32_t dst)
 {
@@ -3259,7 +3064,6 @@ _pixman_implementation_create_vmx (pixman_implementation_t *fallback)
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);
 
     /* VMX constants */

