[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

Bug#982425: g++-8: aarch64 -ftree-vectorize generates wrong code



Package: g++-8
Version: 8.4.0-7
Severity: important
Tags: patch upstream

g++-8 generates incorrect code at -O3 when targeting aarch64,
unless tree vectorisation is disabled. This is quite bad and
affects real packages like vlc and apache arrow.

The upstream bug is https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98949
and the fix is merged upstream here:
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=de0ede7625f6c4d4bbd2caaf363032b0da80cf69

This fix ('PR tree-optimization/97236') has already been applied in
gcc-9 9.3.0-22 and gcc-10 10.2.1-6 (cheers for that).

Tested with:
$ g++-8 -O3 test.cc && ./a.out
a.out: test.cc:27: int main(): Assertion `bitmap[7] != 0' failed.
Aborted (core dumped)

$ g++-8 -O3 -fno-tree-vectorize test.cc && ./a.out
(no error, code works)

for test case:
 ==test.cc==
#include <cassert>
#include <cstdint>

int main(void) {
  // Reproducer for GCC PR tree-optimization/97236: at -O3 on aarch64 the
  // tree vectorizer miscompiles the bit-packing loop below, leaving
  // bitmap[7] == 0 even though a[63] == b[63].
  // NOTE(review): do not restructure this code — the bug depends on the
  // exact statement shape the vectorizer sees; only comments are safe to edit.
  uint64_t a[64], b[64];
  for (int i = 0; i < 64; ++i) {
    a[i] = 1;
    b[i] = 2;
  }
  a[63] = b[63];  // only last element is the same

  uint8_t bitmap[8];  // holds 64 bits, bit_i = 1 if (a[i] == b[i]) else 0, i = 0 ~ 63
  int index = 0;  // index to a[], b[]
  for (int byte = 0; byte < 8; ++byte) {
    uint8_t out_results[8]; // holds 8 comparison results temporarily
    for (int bit = 0; bit < 8; ++bit) {
      out_results[bit] = a[index] == b[index];
      ++index;
    }
    // Pack the 8 comparison results into one byte, LSB = out_results[0].
    bitmap[byte] = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 |
                    out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 |
                    out_results[6] << 6 | out_results[7] << 7);
  }

  // Last byte should be non-zero (bit 7 set, since a[63] == b[63]).
  // Originally observed on gcc-9.3 aarch64 -O3; reproduces with g++-8 here.
  assert(bitmap[7] != 0);
  return 0;
}

Attached is a tested patch.

--
Wookey
diff -u gcc-8-8.4.0/debian/changelog gcc-8-8.4.0/debian/changelog
--- gcc-8-8.4.0/debian/changelog
+++ gcc-8-8.4.0/debian/changelog
@@ -1,3 +1,10 @@
+gcc-8 (8.4.0-7.1) UNRELEASED; urgency=medium
+
+  * Non-maintainer upload.
+  * Fix PR tree-optimization/97236 (AArch64)
+
+ -- Wookey <wookey@softiron-wookey.cambridge.arm.com>  Tue, 09 Feb 2021 13:27:54 +0000
+
 gcc-8 (8.4.0-7) unstable; urgency=medium
 
   * Update to git 20210202 from the gcc-8 branch.
diff -u gcc-8-8.4.0/debian/rules.patch gcc-8-8.4.0/debian/rules.patch
--- gcc-8-8.4.0/debian/rules.patch
+++ gcc-8-8.4.0/debian/rules.patch
@@ -83,6 +83,7 @@
 	verbose-lto-linker \
 	libstdc++-futex \
 	pr97528 \
+	pr97236 \
 
 ifeq (,$(filter $(distrelease),precise trusty stretch jessie wheezy))
   debian_patches += pr90050
only in patch2:
unchanged:
--- gcc-8-8.4.0.orig/debian/patches/pr97236.diff
+++ gcc-8-8.4.0/debian/patches/pr97236.diff
@@ -0,0 +1,110 @@
+From: Matthias Klose <doko@ubuntu.com>
+Date: Tue, 6 Oct 2020 11:41:37 +0000 (+0200)
+Subject: Backport fix for PR/tree-optimization/97236 - fix bad use of VMAT_CONTIGUOUS
+X-Git-Url: https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff_plain;h=de0ede7625f6c4d4bbd2caaf363032b0da80cf69;hp=6f2f4412b9542c0f5fb950e8986ed97b6671806d
+
+Backport fix for PR/tree-optimization/97236 - fix bad use of VMAT_CONTIGUOUS
+
+This avoids using VMAT_CONTIGUOUS with single-element interleaving
+when using V1mode vectors.  Instead keep VMAT_ELEMENTWISE but
+continue to avoid load-lanes and gathers.
+
+2020-10-01  Richard Biener  <rguenther@suse.de>
+
+	PR tree-optimization/97236
+	* tree-vect-stmts.c (get_group_load_store_type): Keep
+	VMAT_ELEMENTWISE for single-element vectors.
+
+	* gcc.dg/vect/pr97236.c: New testcase.
+
+(cherry picked from commit 1ab88985631dd2c5a5e3b5c0dce47cf8b6ed2f82)
+---
+
+diff --git a/src/gcc/testsuite/gcc.dg/vect/pr97236.c b/gcc/testsuite/gcc.dg/vect/pr97236.c
+new file mode 100644
+index 00000000000..9d3dc20d953
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.dg/vect/pr97236.c
+@@ -0,0 +1,43 @@
++typedef unsigned char __uint8_t;
++typedef __uint8_t uint8_t;
++typedef struct plane_t {
++  uint8_t *p_pixels;
++  int i_lines;
++  int i_pitch;
++} plane_t;
++
++typedef struct {
++  plane_t p[5];
++} picture_t;
++
++#define N 4
++
++void __attribute__((noipa))
++picture_Clone(picture_t *picture, picture_t *res)
++{
++  for (int i = 0; i < N; i++) {
++    res->p[i].p_pixels = picture->p[i].p_pixels;
++    res->p[i].i_lines = picture->p[i].i_lines;
++    res->p[i].i_pitch = picture->p[i].i_pitch;
++  }
++}
++
++int
++main()
++{
++  picture_t aaa, bbb;
++  uint8_t pixels[10] = {1, 1, 1, 1, 1, 1, 1, 1};
++
++  for (unsigned i = 0; i < N; i++)
++    aaa.p[i].p_pixels = pixels;
++
++  picture_Clone (&aaa, &bbb);
++
++  uint8_t c = 0;
++  for (unsigned i = 0; i < N; i++)
++    c += bbb.p[i].p_pixels[0];
++
++  if (c != N)
++    __builtin_abort ();
++  return 0;
++}
+diff --git a/src/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
+index 7472558048c..ea75c7249aa 100644
+--- a/src/gcc/tree-vect-stmts.c
++++ b/src/gcc/tree-vect-stmts.c
+@@ -2209,25 +2209,23 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
+ 	  /* First cope with the degenerate case of a single-element
+ 	     vector.  */
+ 	  if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
+-	    *memory_access_type = VMAT_CONTIGUOUS;
++	    ;
+ 
+ 	  /* Otherwise try using LOAD/STORE_LANES.  */
+-	  if (*memory_access_type == VMAT_ELEMENTWISE
+-	      && (vls_type == VLS_LOAD
+-		  ? vect_load_lanes_supported (vectype, group_size, masked_p)
+-		  : vect_store_lanes_supported (vectype, group_size,
+-						masked_p)))
++	  else if (vls_type == VLS_LOAD
++		   ? vect_load_lanes_supported (vectype, group_size, masked_p)
++		   : vect_store_lanes_supported (vectype, group_size,
++						 masked_p))
+ 	    {
+ 	      *memory_access_type = VMAT_LOAD_STORE_LANES;
+ 	      overrun_p = would_overrun_p;
+ 	    }
+ 
+ 	  /* If that fails, try using permuting loads.  */
+-	  if (*memory_access_type == VMAT_ELEMENTWISE
+-	      && (vls_type == VLS_LOAD
+-		  ? vect_grouped_load_supported (vectype, single_element_p,
+-						 group_size)
+-		  : vect_grouped_store_supported (vectype, group_size)))
++	  else if (vls_type == VLS_LOAD
++		   ? vect_grouped_load_supported (vectype, single_element_p,
++						  group_size)
++		   : vect_grouped_store_supported (vectype, group_size))
+ 	    {
+ 	      *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
+ 	      overrun_p = would_overrun_p;

Reply to: