Bug#982425: g++-8: aarch64 -ftree-vectorize generates wrong code
Package: g++-8
Version: 8.4.0-7
Severity: important
Tags: patch upstream
g++-8 generates incorrect code on arm64 for -O3 if
targetting aarch64 unless tree vectorisation is disabled. This is
quite bad and affects real packages like vlc and apache arrow.
The upstream bug is https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98949
and the fix is merged upstream here:
https://gcc.gnu.org/git/gitweb.cgi?p=gcc.git;h=de0ede7625f6c4d4bbd2caaf363032b0da80cf69
This has been applied in gcc-9 9.3.0-22 and gcc-10 10.2.1-6 already (cheers for that).
'PR tree-optimization/97236'
Tested with:
$ g++-8 -O3 test.cc && ./a.out
a.out: test.cc:27: int main(): Assertion `bitmap[7] != 0' failed.
Aborted (core dumped)
$ g++-8 -O3 -fno-tree-vectorize test.cc && ./a.out
(no error, code works)
for test case:
==test.cc==
#include <cassert>
#include <cstdint>
int main(void) {
uint64_t a[64], b[64];
for (int i = 0; i < 64; ++i) {
a[i] = 1;
b[i] = 2;
}
a[63] = b[63]; // only last element is the same
uint8_t bitmap[8]; // holds 64 bits, bit_i = 1 if (a[i] == b[i]) else 0, i = 0 ~ 63
int index = 0; // index to a[], b[]
for (int byte = 0; byte < 8; ++byte) {
uint8_t out_results[8]; // holds 8 comparison results temporarily
for (int bit = 0; bit < 8; ++bit) {
out_results[bit] = a[index] == b[index];
++index;
}
bitmap[byte] = (out_results[0] | out_results[1] << 1 | out_results[2] << 2 |
out_results[3] << 3 | out_results[4] << 4 | out_results[5] << 5 |
out_results[6] << 6 | out_results[7] << 7);
}
// last bitmap should be non-zero, fired on gcc-9.3 aarch64 -O3
assert(bitmap[7] != 0);
return 0;
}
Attached is a tested patch.
--
Wookey
diff -u gcc-8-8.4.0/debian/changelog gcc-8-8.4.0/debian/changelog
--- gcc-8-8.4.0/debian/changelog
+++ gcc-8-8.4.0/debian/changelog
@@ -1,3 +1,10 @@
+gcc-8 (8.4.0-7.1) UNRELEASED; urgency=medium
+
+ * Non-maintainer upload.
+ * Fix PR tree-optimization/97236 (AArch64)
+
+ -- Wookey <wookey@softiron-wookey.cambridge.arm.com> Tue, 09 Feb 2021 13:27:54 +0000
+
gcc-8 (8.4.0-7) unstable; urgency=medium
* Update to git 20210202 from the gcc-8 branch.
diff -u gcc-8-8.4.0/debian/rules.patch gcc-8-8.4.0/debian/rules.patch
--- gcc-8-8.4.0/debian/rules.patch
+++ gcc-8-8.4.0/debian/rules.patch
@@ -83,6 +83,7 @@
verbose-lto-linker \
libstdc++-futex \
pr97528 \
+ pr97236 \
ifeq (,$(filter $(distrelease),precise trusty stretch jessie wheezy))
debian_patches += pr90050
only in patch2:
unchanged:
--- gcc-8-8.4.0.orig/debian/patches/pr97236.diff
+++ gcc-8-8.4.0/debian/patches/pr97236.diff
@@ -0,0 +1,110 @@
+From: Matthias Klose <doko@ubuntu.com>
+Date: Tue, 6 Oct 2020 11:41:37 +0000 (+0200)
+Subject: Backport fix for PR/tree-optimization/97236 - fix bad use of VMAT_CONTIGUOUS
+X-Git-Url: https://gcc.gnu.org/git/?p=gcc.git;a=commitdiff_plain;h=de0ede7625f6c4d4bbd2caaf363032b0da80cf69;hp=6f2f4412b9542c0f5fb950e8986ed97b6671806d
+
+Backport fix for PR/tree-optimization/97236 - fix bad use of VMAT_CONTIGUOUS
+
+This avoids using VMAT_CONTIGUOUS with single-element interleaving
+when using V1mode vectors. Instead keep VMAT_ELEMENTWISE but
+continue to avoid load-lanes and gathers.
+
+2020-10-01 Richard Biener <rguenther@suse.de>
+
+ PR tree-optimization/97236
+ * tree-vect-stmts.c (get_group_load_store_type): Keep
+ VMAT_ELEMENTWISE for single-element vectors.
+
+ * gcc.dg/vect/pr97236.c: New testcase.
+
+(cherry picked from commit 1ab88985631dd2c5a5e3b5c0dce47cf8b6ed2f82)
+---
+
+diff --git a/src/gcc/testsuite/gcc.dg/vect/pr97236.c b/gcc/testsuite/gcc.dg/vect/pr97236.c
+new file mode 100644
+index 00000000000..9d3dc20d953
+--- /dev/null
++++ b/src/gcc/testsuite/gcc.dg/vect/pr97236.c
+@@ -0,0 +1,43 @@
++typedef unsigned char __uint8_t;
++typedef __uint8_t uint8_t;
++typedef struct plane_t {
++ uint8_t *p_pixels;
++ int i_lines;
++ int i_pitch;
++} plane_t;
++
++typedef struct {
++ plane_t p[5];
++} picture_t;
++
++#define N 4
++
++void __attribute__((noipa))
++picture_Clone(picture_t *picture, picture_t *res)
++{
++ for (int i = 0; i < N; i++) {
++ res->p[i].p_pixels = picture->p[i].p_pixels;
++ res->p[i].i_lines = picture->p[i].i_lines;
++ res->p[i].i_pitch = picture->p[i].i_pitch;
++ }
++}
++
++int
++main()
++{
++ picture_t aaa, bbb;
++ uint8_t pixels[10] = {1, 1, 1, 1, 1, 1, 1, 1};
++
++ for (unsigned i = 0; i < N; i++)
++ aaa.p[i].p_pixels = pixels;
++
++ picture_Clone (&aaa, &bbb);
++
++ uint8_t c = 0;
++ for (unsigned i = 0; i < N; i++)
++ c += bbb.p[i].p_pixels[0];
++
++ if (c != N)
++ __builtin_abort ();
++ return 0;
++}
+diff --git a/src/gcc/tree-vect-stmts.c b/gcc/tree-vect-stmts.c
+index 7472558048c..ea75c7249aa 100644
+--- a/src/gcc/tree-vect-stmts.c
++++ b/src/gcc/tree-vect-stmts.c
+@@ -2209,25 +2209,23 @@ get_group_load_store_type (gimple *stmt, tree vectype, bool slp,
+ /* First cope with the degenerate case of a single-element
+ vector. */
+ if (known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U))
+- *memory_access_type = VMAT_CONTIGUOUS;
++ ;
+
+ /* Otherwise try using LOAD/STORE_LANES. */
+- if (*memory_access_type == VMAT_ELEMENTWISE
+- && (vls_type == VLS_LOAD
+- ? vect_load_lanes_supported (vectype, group_size, masked_p)
+- : vect_store_lanes_supported (vectype, group_size,
+- masked_p)))
++ else if (vls_type == VLS_LOAD
++ ? vect_load_lanes_supported (vectype, group_size, masked_p)
++ : vect_store_lanes_supported (vectype, group_size,
++ masked_p))
+ {
+ *memory_access_type = VMAT_LOAD_STORE_LANES;
+ overrun_p = would_overrun_p;
+ }
+
+ /* If that fails, try using permuting loads. */
+- if (*memory_access_type == VMAT_ELEMENTWISE
+- && (vls_type == VLS_LOAD
+- ? vect_grouped_load_supported (vectype, single_element_p,
+- group_size)
+- : vect_grouped_store_supported (vectype, group_size)))
++ else if (vls_type == VLS_LOAD
++ ? vect_grouped_load_supported (vectype, single_element_p,
++ group_size)
++ : vect_grouped_store_supported (vectype, group_size))
+ {
+ *memory_access_type = VMAT_CONTIGUOUS_PERMUTE;
+ overrun_p = would_overrun_p;
Reply to: