[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

mesa: Changes to 'upstream-experimental'



Rebased ref, commits from common ancestor:
commit e7a27f70b91e202ad9afc3e67e1080572d4d4a0b
Author: Dave Airlie <airlied@redhat.com>
Date:   Tue Feb 2 17:54:43 2016 +1000

    virgl: mark function as static
    
    This is fallout from the previous changes.
    
    Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=93961
    
    Signed-off-by: Dave Airlie <airlied@redhat.com>

diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index a49a89e..ba00988 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -754,7 +754,7 @@ static void virgl_fence_reference(struct virgl_winsys *vws,
 }
 
 
-struct virgl_winsys *
+static struct virgl_winsys *
 virgl_drm_winsys_create(int drmFD)
 {
    struct virgl_drm_winsys *qdws;

commit 7221b8aec648c0da3f7ce386e143e9fdbb36a0ce
Author: Roland Scheidegger <sroland@vmware.com>
Date:   Tue Feb 2 03:51:22 2016 +0100

    gallivm: add PK2H/UP2H support
    
    Add support for these opcodes, the conversion functions were already
    there albeit need some new packing stuff.
    Just like the tgsi version, piglit won't like it for all the same
    reasons, so it's disabled (UP2H passes piglit arb_shader_language_packing
    tests, albeit since PK2H won't due to those rounding differences I don't
    know if that one works or not as the piglit test is rather difficult to
    deal with).
    
    Reviewed-by: Brian Paul <brianp@vmware.com>

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index daa2043..d80c997 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -273,7 +273,7 @@ lp_build_uninterleave1(struct gallivm_state *gallivm,
    unsigned i;
    assert(num_elems <= LP_MAX_VECTOR_LENGTH);
 
-   for(i = 0; i < num_elems / 2; ++i)
+   for (i = 0; i < num_elems / 2; ++i)
       elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
 
    shuffle = LLVMConstVector(elems, num_elems / 2);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index f6b42ee..43af6b4 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -548,9 +548,10 @@ pk2h_fetch_args(
 }
 
 static void
-pk2h_emit(const struct lp_build_tgsi_action *action,
-          struct lp_build_tgsi_context *bld_base,
-          struct lp_build_emit_data *emit_data)
+pk2h_emit(
+   const struct lp_build_tgsi_action *action,
+   struct lp_build_tgsi_context *bld_base,
+   struct lp_build_emit_data *emit_data)
 {
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    struct lp_type f16i_t;
@@ -575,9 +576,10 @@ static struct lp_build_tgsi_action pk2h_action = {
 /* TGSI_OPCODE_UP2H */
 
 static void
-up2h_emit(const struct lp_build_tgsi_action *action,
-          struct lp_build_tgsi_context *bld_base,
-          struct lp_build_emit_data *emit_data)
+up2h_emit(
+   const struct lp_build_tgsi_action *action,
+   struct lp_build_tgsi_context *bld_base,
+   struct lp_build_emit_data *emit_data)
 {
    struct gallivm_state *gallivm = bld_base->base.gallivm;
    LLVMBuilderRef builder = gallivm->builder;

commit 5171ec9ca92ce489e32c227ae3b4b6df621bbf40
Author: Roland Scheidegger <sroland@vmware.com>
Date:   Fri Jan 29 02:49:22 2016 +0100

    gallivm: add PK2H/UP2H support
    
    Add support for these opcodes, the conversion functions were already
    there albeit need some new packing stuff.
    Just like the tgsi version, piglit won't like it for all the same
    reasons, so it's disabled (UP2H passes piglit arb_shader_language_packing
    tests, albeit since PK2H won't due those rounding differences I don't
    know if that one works or not as the piglit test is rather difficult to
    deal with).

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index 7854142..7cf0dee 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -130,6 +130,7 @@ lp_build_half_to_float(struct gallivm_state *gallivm,
  *
  * Convert float32 to half floats, preserving Infs and NaNs,
  * with rounding towards zero (trunc).
+ * XXX: For GL, would prefer rounding towards nearest(-even).
  */
 LLVMValueRef
 lp_build_float_to_half(struct gallivm_state *gallivm,
@@ -143,6 +144,15 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
    struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
    LLVMValueRef result;
 
+   /*
+    * Note: Newer llvm versions (3.6 or so) support fptrunc to 16 bits
+    * directly, without any (x86 or generic) intrinsics.
+    * Albeit the rounding mode cannot be specified (and is undefined,
+    * though in practice on x86 seems to do nearest-even but it may
+    * be dependent on instruction set support), so is essentially
+    * useless.
+    */
+
    if (util_cpu_caps.has_f16c &&
        (length == 4 || length == 8)) {
       struct lp_type i168_type = lp_type_int_vec(16, 16 * 8);
@@ -187,7 +197,11 @@ lp_build_float_to_half(struct gallivm_state *gallivm,
         LLVMValueRef index = LLVMConstInt(i32t, i, 0);
         LLVMValueRef f32 = LLVMBuildExtractElement(builder, src, index, "");
 #if 0
-        /* XXX: not really supported by backends */
+        /*
+         * XXX: not really supported by backends.
+         * Even if they would now, rounding mode cannot be specified and
+         * is undefined.
+         */
         LLVMValueRef f16 = lp_build_intrinsic_unary(builder, "llvm.convert.to.fp16", i16t, f32);
 #else
         LLVMValueRef f16 = LLVMBuildCall(builder, func, &f32, 1, "");
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 0b0f7f0..daa2043 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -257,6 +257,32 @@ lp_build_concat_n(struct gallivm_state *gallivm,
 
 
 /**
+ * Un-interleave vector.
+ * This will return a vector consisting of every second element
+ * (depending on lo_hi, beginning at 0 or 1).
+ * The returned vector size (elems and width) will only be half
+ * that of the source vector.
+ */
+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+                       unsigned num_elems,
+                       LLVMValueRef a,
+                       unsigned lo_hi)
+{
+   LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];
+   unsigned i;
+   assert(num_elems <= LP_MAX_VECTOR_LENGTH);
+
+   for(i = 0; i < num_elems / 2; ++i)
+      elems[i] = lp_build_const_int32(gallivm, 2*i + lo_hi);
+
+   shuffle = LLVMConstVector(elems, num_elems / 2);
+
+   return LLVMBuildShuffleVector(gallivm->builder, a, a, shuffle, "");
+}
+
+
+/**
  * Interleave vector elements.
  *
  * Matches the PUNPCKLxx and PUNPCKHxx SSE instructions
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.h b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
index 7cede35..367fba1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.h
@@ -58,6 +58,11 @@ lp_build_interleave2(struct gallivm_state *gallivm,
                      LLVMValueRef b,
                      unsigned lo_hi);
 
+LLVMValueRef
+lp_build_uninterleave1(struct gallivm_state *gallivm,
+                       unsigned num_elems,
+                       LLVMValueRef a,
+                       unsigned lo_hi);
 
 void
 lp_build_unpack2(struct gallivm_state *gallivm,
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
index c88dfbf..1cbe47c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c
@@ -248,7 +248,6 @@ lp_build_tgsi_inst_llvm(
    /* Ignore deprecated instructions */
    switch (inst->Instruction.Opcode) {
 
-   case TGSI_OPCODE_UP2H:
    case TGSI_OPCODE_UP2US:
    case TGSI_OPCODE_UP4B:
    case TGSI_OPCODE_UP4UB:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
index 6f75bec..f6b42ee 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
@@ -45,8 +45,10 @@
 #include "lp_bld_arit.h"
 #include "lp_bld_bitarit.h"
 #include "lp_bld_const.h"
+#include "lp_bld_conv.h"
 #include "lp_bld_gather.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_pack.h"
 
 #include "tgsi/tgsi_exec.h"
 
@@ -530,6 +532,75 @@ static struct lp_build_tgsi_action log_action = {
    log_emit	 /* emit */
 };
 
+/* TGSI_OPCODE_PK2H */
+
+static void
+pk2h_fetch_args(
+   struct lp_build_tgsi_context * bld_base,
+   struct lp_build_emit_data * emit_data)
+{
+   /* src0.x */
+   emit_data->args[0] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                            0, TGSI_CHAN_X);
+   /* src0.y */
+   emit_data->args[1] = lp_build_emit_fetch(bld_base, emit_data->inst,
+                                            0, TGSI_CHAN_Y);
+}
+
+static void
+pk2h_emit(const struct lp_build_tgsi_action *action,
+          struct lp_build_tgsi_context *bld_base,
+          struct lp_build_emit_data *emit_data)
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   struct lp_type f16i_t;
+   LLVMValueRef lo, hi, res;
+
+   f16i_t = lp_type_uint_vec(16, bld_base->base.type.length * 32);
+   lo = lp_build_float_to_half(gallivm, emit_data->args[0]);
+   hi = lp_build_float_to_half(gallivm, emit_data->args[1]);
+   /* maybe some interleave doubling vector width would be useful... */
+   lo = lp_build_pad_vector(gallivm, lo, bld_base->base.type.length * 2);
+   hi = lp_build_pad_vector(gallivm, hi, bld_base->base.type.length * 2);
+   res = lp_build_interleave2(gallivm, f16i_t, lo, hi, 0);
+
+   emit_data->output[emit_data->chan] = res;
+}
+
+static struct lp_build_tgsi_action pk2h_action = {
+   pk2h_fetch_args, /* fetch_args */
+   pk2h_emit        /* emit */
+};
+
+/* TGSI_OPCODE_UP2H */
+
+static void
+up2h_emit(const struct lp_build_tgsi_action *action,
+          struct lp_build_tgsi_context *bld_base,
+          struct lp_build_emit_data *emit_data)
+{
+   struct gallivm_state *gallivm = bld_base->base.gallivm;
+   LLVMBuilderRef builder = gallivm->builder;
+   LLVMContextRef context = gallivm->context;
+   LLVMValueRef lo, hi, res[2], arg;
+   unsigned nr = bld_base->base.type.length;
+   LLVMTypeRef i16t = LLVMVectorType(LLVMInt16TypeInContext(context), nr * 2);
+
+   arg = LLVMBuildBitCast(builder, emit_data->args[0], i16t, "");
+   lo = lp_build_uninterleave1(gallivm, nr * 2, arg, 0);
+   hi = lp_build_uninterleave1(gallivm, nr * 2, arg, 1);
+   res[0] = lp_build_half_to_float(gallivm, lo);
+   res[1] = lp_build_half_to_float(gallivm, hi);
+
+   emit_data->output[0] = emit_data->output[2] = res[0];
+   emit_data->output[1] = emit_data->output[3] = res[1];
+}
+
+static struct lp_build_tgsi_action up2h_action = {
+   scalar_unary_fetch_args, /* fetch_args */
+   up2h_emit                /* emit */
+};
+
 /* TGSI_OPCODE_LRP */
 
 static void
@@ -1032,10 +1103,12 @@ lp_set_default_actions(struct lp_build_tgsi_context * bld_base)
    bld_base->op_actions[TGSI_OPCODE_EXP] = exp_action;
    bld_base->op_actions[TGSI_OPCODE_LIT] = lit_action;
    bld_base->op_actions[TGSI_OPCODE_LOG] = log_action;
+   bld_base->op_actions[TGSI_OPCODE_PK2H] = pk2h_action;
    bld_base->op_actions[TGSI_OPCODE_RSQ] = rsq_action;
    bld_base->op_actions[TGSI_OPCODE_SQRT] = sqrt_action;
    bld_base->op_actions[TGSI_OPCODE_POW] = pow_action;
    bld_base->op_actions[TGSI_OPCODE_SCS] = scs_action;
+   bld_base->op_actions[TGSI_OPCODE_UP2H] = up2h_action;
    bld_base->op_actions[TGSI_OPCODE_XPD] = xpd_action;
 
    bld_base->op_actions[TGSI_OPCODE_BREAKC].fetch_args = scalar_unary_fetch_args;

commit dc16086e3b616f767ab31c86505502c4df7739ac
Author: Roland Scheidegger <sroland@vmware.com>
Date:   Tue Feb 2 03:36:19 2016 +0100

    tgsi: add PK2H/UP2H support
    
    The util functions handle the half-float conversion.
    Note that piglit won't like it much due to:
    a) The util functions use magic float mul conversion but when run inside
    softpipe/llvmpipe, denorms are flushed to zero, therefore when the conversion
    is from/to f16 denorm the result will be zero. This is a bug which should be
    fixed in these functions (should not rely on denorms being available), but
    will happen elsewhere just the same (e.g. conversion to f16 render targets).
    b) The util functions use trunc round mode rather than round-to-nearest. This
    is NOT a bug (as it is a d3d10 requirement). This will result of rounding not
    representable finite values to MAX_F16 rather than INFINITY. My belief is the
    piglit tests are wrong here but it's difficult to tell (generally glsl
    rounding mode is undefined, however I'm not sure if rounding mode might need
    to be consistent for different operations). Nevertheless, for gl it would be
    better to use round-to-nearest, but using different rounding for GL and d3d10
    is an unsolved problem (as it affects things like conversion to f16 render
    targets, clear colors, this shader opcode).
    
    Hence for now don't enable the cap bit (so the code is unused).
    (Code is from imirkin, comment from sroland)
    
    Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
    Reviewed-by: Roland Scheidegger <sroland@vmvware.com>
    Reviewed-by: Brian Paul <brianp@vmware.com>

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index f67c162..d898fd6 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -58,6 +58,7 @@
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
+#include "util/u_half.h"
 #include "util/u_memory.h"
 #include "util/u_math.h"
 
@@ -3058,6 +3059,45 @@ exec_dp2(struct tgsi_exec_machine *mach,
 }
 
 static void
+exec_pk2h(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   unsigned chan;
+   union tgsi_exec_channel arg[2], dst;
+
+   fetch_source(mach, &arg[0], &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_FLOAT);
+   fetch_source(mach, &arg[1], &inst->Src[0], TGSI_CHAN_Y, TGSI_EXEC_DATA_FLOAT);
+   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
+      dst.u[chan] = util_float_to_half(arg[0].f[chan]) |
+         (util_float_to_half(arg[1].f[chan]) << 16);
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst, &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_UINT);
+      }
+   }
+}
+
+static void
+exec_up2h(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   unsigned chan;
+   union tgsi_exec_channel arg, dst[2];
+
+   fetch_source(mach, &arg, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_UINT);
+   for (chan = 0; chan < TGSI_QUAD_SIZE; chan++) {
+      dst[0].f[chan] = util_half_to_float(arg.u[chan] & 0xffff);
+      dst[1].f[chan] = util_half_to_float(arg.u[chan] >> 16);
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &dst[chan & 1], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
 exec_scs(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst)
 {
@@ -4339,7 +4379,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_PK2H:
-      assert (0);
+      exec_pk2h(mach, inst);
       break;
 
    case TGSI_OPCODE_PK2US:
@@ -4425,7 +4465,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_UP2H:
-      assert (0);
+      exec_up2h(mach, inst);
       break;
 
    case TGSI_OPCODE_UP2US:
diff --git a/src/gallium/auxiliary/util/u_half.h b/src/gallium/auxiliary/util/u_half.h
index d28fae3..966d213 100644
--- a/src/gallium/auxiliary/util/u_half.h
+++ b/src/gallium/auxiliary/util/u_half.h
@@ -74,7 +74,11 @@ util_float_to_half(float f)
       f32.ui &= round_mask;
       f32.f  *= magic.f;
       f32.ui -= round_mask;
-
+      /*
+       * XXX: The magic mul relies on denorms being available, otherwise
+       * all f16 denorms get flushed to zero - hence when this is used
+       * for tgsi_exec in softpipe we won't get f16 denorms.
+       */
       /*
        * Clamp to max finite value if overflowed.
        * OpenGL has completely undefined rounding behavior for float to
@@ -112,6 +116,7 @@ util_half_to_float(uint16_t f16)
 
    /* Adjust */
    f32.f *= magic.f;
+   /* XXX: The magic mul relies on denorms being available */
 
    /* Inf / NaN */
    if (f32.f >= infnan.f)

commit 99bd96abbb62d2c7da60c6102661b590e05bf143
Author: Roland Scheidegger <sroland@vmware.com>
Date:   Tue Feb 2 03:14:12 2016 +0100

    llvmpipe: drop scissor planes early if the tri is fully inside them
    
    If the tri is fully inside a scissor edge (or rather, we just use the
    bounding box of the tri for the comparison), then we can drop these
    additional scissor "planes" early. We do not even need to allocate
    space for them in the tri.
    
    The math actually appears to be slightly iffy due to bounding boxes
    being rounded, but it doesn't matter in the end.
    
    Those scissor rects are costly - the 4 planes from the scissor are
    already more expensive to calculate than the 3 planes from the tri itself,
    and it also prevents us from using the specialized raster code for small
    tris.
    
    This helps openarena performance by about 8% or so. Of course, it helps
    there that while openarena often enables scissoring (and even moves the
    scissor rect around) I have not seen a single tri actually hit the
    scissor rect, ever.
    
    v2: drop individual scissor edges, and do it earlier, not even allocating
    space for them.
    v3: help the compiler a bit with simpler code, suggested by Brian.
    
    Reviewed-by: Brian Paul <brianp@vmware.com>

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index f425825..f6e1198 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -336,13 +336,6 @@ try_setup_line( struct lp_setup_context *setup,
       layer = MIN2(layer, scene->fb_max_layer);
    }
 
-   if (setup->scissor_test) {
-      nr_planes = 8;
-   }
-   else {
-      nr_planes = 4;
-   }
-
    dx = v1[0][0] - v2[0][0];
    dy = v1[0][1] - v2[0][1];
    area = (dx * dx  + dy * dy);
@@ -591,6 +584,20 @@ try_setup_line( struct lp_setup_context *setup,
    bbox.x0 = MAX2(bbox.x0, 0);
    bbox.y0 = MAX2(bbox.y0, 0);
 
+   nr_planes = 4;
+   /*
+    * Determine how many scissor planes we need, that is drop scissor
+    * edges if the bounding box of the tri is fully inside that edge.
+    */
+   if (setup->scissor_test) {
+      /* why not just use draw_regions */
+      struct u_rect *scissor = &setup->scissors[viewport_index];
+      nr_planes += (bbox.x0 < scissor->x0);
+      nr_planes += (bbox.x1 > scissor->x1);
+      nr_planes += (bbox.y0 < scissor->y0);
+      nr_planes += (bbox.y1 > scissor->y1);
+   }
+
    line = lp_setup_alloc_triangle(scene,
                                   key->num_inputs,
                                   nr_planes,
@@ -708,30 +715,44 @@ try_setup_line( struct lp_setup_context *setup,
     * Note that otherwise, the scissor planes only vary in 'C' value,
     * and even then only on state-changes.  Could alternatively store
     * these planes elsewhere.
+    * (Or only store the c value together with a bit indicating which
+    * scissor edge this is, so rasterization would treat them differently
+    * (easier to evaluate) to ordinary planes.)
     */
-   if (nr_planes == 8) {
-      const struct u_rect *scissor =
-         &setup->scissors[viewport_index];
-
-      plane[4].dcdx = -1 << 8;
-      plane[4].dcdy = 0;
-      plane[4].c = (1-scissor->x0) << 8;
-      plane[4].eo = 1 << 8;
-
-      plane[5].dcdx = 1 << 8;
-      plane[5].dcdy = 0;
-      plane[5].c = (scissor->x1+1) << 8;
-      plane[5].eo = 0;
-
-      plane[6].dcdx = 0;
-      plane[6].dcdy = 1 << 8;
-      plane[6].c = (1-scissor->y0) << 8;
-      plane[6].eo = 1 << 8;
-
-      plane[7].dcdx = 0;
-      plane[7].dcdy = -1 << 8;
-      plane[7].c = (scissor->y1+1) << 8;
-      plane[7].eo = 0;
+   if (nr_planes > 4) {
+      /* why not just use draw_regions */
+      struct u_rect *scissor = &setup->scissors[viewport_index];
+      struct lp_rast_plane *plane_s = &plane[4];
+
+      if (bbox.x0 < scissor->x0) {
+         plane_s->dcdx = -1 << 8;
+         plane_s->dcdy = 0;
+         plane_s->c = (1-scissor->x0) << 8;
+         plane_s->eo = 1 << 8;
+         plane_s++;
+      }
+      if (bbox.x1 > scissor->x1) {
+         plane_s->dcdx = 1 << 8;
+         plane_s->dcdy = 0;
+         plane_s->c = (scissor->x1+1) << 8;
+         plane_s->eo = 0 << 8;
+         plane_s++;
+      }
+      if (bbox.y0 < scissor->y0) {
+         plane_s->dcdx = 0;
+         plane_s->dcdy = 1 << 8;
+         plane_s->c = (1-scissor->y0) << 8;
+         plane_s->eo = 1 << 8;
+         plane_s++;
+      }
+      if (bbox.y1 > scissor->y1) {
+         plane_s->dcdx = 0;
+         plane_s->dcdy = -1 << 8;
+         plane_s->c = (scissor->y1+1) << 8;
+         plane_s->eo = 0;
+         plane_s++;
+      }
+      assert(plane_s == &plane[nr_planes]);
    }
 
    return lp_setup_bin_triangle(setup, line, &bbox, nr_planes, viewport_index);
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 1e3a750..7b00889 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -302,13 +302,6 @@ do_triangle_ccw(struct lp_setup_context *setup,
       layer = MIN2(layer, scene->fb_max_layer);
    }
 
-   if (setup->scissor_test) {
-      nr_planes = 7;
-   }
-   else {
-      nr_planes = 3;
-   }
-
    /* Bounding rectangle (in pixels) */
    {
       /* Yes this is necessary to accurately calculate bounding boxes
@@ -347,6 +340,20 @@ do_triangle_ccw(struct lp_setup_context *setup,
    bbox.x0 = MAX2(bbox.x0, 0);
    bbox.y0 = MAX2(bbox.y0, 0);
 
+   nr_planes = 3;
+   /*
+    * Determine how many scissor planes we need, that is drop scissor
+    * edges if the bounding box of the tri is fully inside that edge.
+    */
+   if (setup->scissor_test) {
+      /* why not just use draw_regions */
+      struct u_rect *scissor = &setup->scissors[viewport_index];
+      nr_planes += (bbox.x0 < scissor->x0);
+      nr_planes += (bbox.x1 > scissor->x1);
+      nr_planes += (bbox.y0 < scissor->y0);
+      nr_planes += (bbox.y1 > scissor->y1);
+   }
+
    tri = lp_setup_alloc_triangle(scene,
                                  key->num_inputs,
                                  nr_planes,
@@ -367,13 +374,11 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    /* Setup parameter interpolants:
     */
-   setup->setup.variant->jit_function( v0,
-				       v1,
-				       v2,
-				       frontfacing,
-				       GET_A0(&tri->inputs),
-				       GET_DADX(&tri->inputs),
-				       GET_DADY(&tri->inputs) );
+   setup->setup.variant->jit_function(v0, v1, v2,
+                                      frontfacing,
+                                      GET_A0(&tri->inputs),
+                                      GET_DADX(&tri->inputs),
+                                      GET_DADY(&tri->inputs));
 
    tri->inputs.frontfacing = frontfacing;
    tri->inputs.disable = FALSE;
@@ -383,9 +388,9 @@ do_triangle_ccw(struct lp_setup_context *setup,
 
    if (0)
       lp_dump_setup_coef(&setup->setup.variant->key,
-			 (const float (*)[4])GET_A0(&tri->inputs),
-			 (const float (*)[4])GET_DADX(&tri->inputs),
-			 (const float (*)[4])GET_DADY(&tri->inputs));
+                         (const float (*)[4])GET_A0(&tri->inputs),
+                         (const float (*)[4])GET_DADX(&tri->inputs),
+                         (const float (*)[4])GET_DADY(&tri->inputs));
 
    plane = GET_PLANES(tri);
 
@@ -672,29 +677,44 @@ do_triangle_ccw(struct lp_setup_context *setup,
     * Note that otherwise, the scissor planes only vary in 'C' value,
     * and even then only on state-changes.  Could alternatively store
     * these planes elsewhere.
+    * (Or only store the c value together with a bit indicating which
+    * scissor edge this is, so rasterization would treat them differently
+    * (easier to evaluate) to ordinary planes.)
     */
-   if (nr_planes == 7) {
-      const struct u_rect *scissor = &setup->scissors[viewport_index];
-
-      plane[3].dcdx = -1 << 8;
-      plane[3].dcdy = 0;
-      plane[3].c = (1-scissor->x0) << 8;
-      plane[3].eo = 1 << 8;
-
-      plane[4].dcdx = 1 << 8;
-      plane[4].dcdy = 0;
-      plane[4].c = (scissor->x1+1) << 8;
-      plane[4].eo = 0;
-
-      plane[5].dcdx = 0;
-      plane[5].dcdy = 1 << 8;
-      plane[5].c = (1-scissor->y0) << 8;
-      plane[5].eo = 1 << 8;
-
-      plane[6].dcdx = 0;
-      plane[6].dcdy = -1 << 8;
-      plane[6].c = (scissor->y1+1) << 8;
-      plane[6].eo = 0;
+   if (nr_planes > 3) {
+      /* why not just use draw_regions */
+      struct u_rect *scissor = &setup->scissors[viewport_index];
+      struct lp_rast_plane *plane_s = &plane[3];
+
+      if (bbox.x0 < scissor->x0) {
+         plane_s->dcdx = -1 << 8;
+         plane_s->dcdy = 0;
+         plane_s->c = (1-scissor->x0) << 8;
+         plane_s->eo = 1 << 8;
+         plane_s++;
+      }
+      if (bbox.x1 > scissor->x1) {
+         plane_s->dcdx = 1 << 8;
+         plane_s->dcdy = 0;
+         plane_s->c = (scissor->x1+1) << 8;
+         plane_s->eo = 0 << 8;
+         plane_s++;
+      }
+      if (bbox.y0 < scissor->y0) {
+         plane_s->dcdx = 0;
+         plane_s->dcdy = 1 << 8;
+         plane_s->c = (1-scissor->y0) << 8;
+         plane_s->eo = 1 << 8;
+         plane_s++;
+      }
+      if (bbox.y1 > scissor->y1) {
+         plane_s->dcdx = 0;
+         plane_s->dcdy = -1 << 8;
+         plane_s->c = (scissor->y1+1) << 8;
+         plane_s->eo = 0;
+         plane_s++;
+      }
+      assert(plane_s == &plane[nr_planes]);
    }
 
    return lp_setup_bin_triangle(setup, tri, &bbox, nr_planes, viewport_index);

commit 9d2a34e1054370663de06124aa1bc6282b450fe1
Author: Roland Scheidegger <sroland@vmware.com>
Date:   Sun Jan 31 01:27:09 2016 +0100

    llvmpipe: minor cleanup of sse2 for calc_fixed_position
    
    Just slightly simpler assembly.
    
    Reviewed-by: Brian Paul <brianp@vmware.com>

diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 907129d..1e3a750 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -984,17 +984,16 @@ calc_fixed_position(struct lp_setup_context *setup,
     * Both should be acceptable, I think.
     */
 #if defined(PIPE_ARCH_SSE)
-   __m128d v0r, v1r, v2r;
+   __m128 v0r, v1r;
    __m128 vxy0xy2, vxy1xy0;
    __m128i vxy0xy2i, vxy1xy0i;
    __m128i dxdy0120, x0x2y0y2, x1x0y1y0, x0120, y0120;
    __m128 pix_offset = _mm_set1_ps(setup->pixel_offset);
    __m128 fixed_one = _mm_set1_ps((float)FIXED_ONE);
-   v0r = _mm_load_sd((const double *)v0[0]);
-   v1r = _mm_load_sd((const double *)v1[0]);
-   v2r = _mm_load_sd((const double *)v2[0]);
-   vxy0xy2 = _mm_castpd_ps(_mm_unpacklo_pd(v0r, v2r));
-   vxy1xy0 = _mm_castpd_ps(_mm_unpacklo_pd(v1r, v0r));
+   v0r = _mm_castpd_ps(_mm_load_sd((double *)v0[0]));
+   vxy0xy2 = _mm_loadh_pi(v0r, (__m64 *)v2[0]);
+   v1r = _mm_castpd_ps(_mm_load_sd((double *)v1[0]));
+   vxy1xy0 = _mm_movelh_ps(v1r, vxy0xy2);
    vxy0xy2 = _mm_sub_ps(vxy0xy2, pix_offset);
    vxy1xy0 = _mm_sub_ps(vxy1xy0, pix_offset);
    vxy0xy2 = _mm_mul_ps(vxy0xy2, fixed_one);

commit 8aa168eb8f0f1f634d0fc6733812c70d3a597518
Author: Roland Scheidegger <sroland@vmware.com>
Date:   Sun Jan 31 01:26:59 2016 +0100

    llvmpipe: use vector loads for (optimized) tri raster funcs
    
    When we switched to 64bit rasterization, we could no longer use straight
    aligned loads for loading the plane data. However, what the code actually
    does for loading 3 planes, is 12 scalar loads + 9 unpacks, and then there's
    another 8 unpacks for the transpose we need (!).
    
    It would be possible to do the (scalar) loads of course already transposed
    (at least saving the additional unpacks), however instead just use
    (un)aligned vector loads, and recalculate the eo values, which is much less
    instructions (note in case of the triangle_32_3_4 case, the eo values are
    not even used, making the scalar loads + unpacks for them all the more
    pointless).
    
    This drops execution time of the triangle_32_3_4 function considerably,
    albeit it doesn't really make a measurable difference (for small tris we're
    essentially limited by vertex throughput in any case), for triangle_32_3_16
    it's essentially noise (the loop is more costly than the initial code there).
    
    (I'm thinking about just ditching storing the eo values in the plane data,
    so could switch back to using aligned planes, however right now they are
    still used in the other raster functions dealing with planes with scalar
    code. Also not touching the ppc code, might not be that bad there in any
    case.)
    
    Reviewed-by: Brian Paul <brianp@vmware.com>

diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index db45cbb..34008e1 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -308,17 +308,4 @@ void
 lp_debug_draw_bins_by_coverage( struct lp_scene *scene );
 
 
-#ifdef PIPE_ARCH_SSE
-#include <emmintrin.h>
-#include "util/u_sse.h"
-
-static inline __m128i
-lp_plane_to_m128i(const struct lp_rast_plane *plane)
-{
-   return _mm_setr_epi32((int32_t)plane->c, (int32_t)plane->dcdx,
-                         (int32_t)plane->dcdy, (int32_t)plane->eo);
-}
-
-#endif
-
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index 0ae6ec2..f4a2f02 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -239,7 +239,7 @@ sign_bits4(const __m128i *cstep, int cdiff)
 
 void
 lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
-                      const union lp_rast_cmd_arg arg)
+                         const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    const struct lp_rast_plane *plane = GET_PLANES(tri);
@@ -250,26 +250,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
    struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
    unsigned nr = 0;
 
-   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
    __m128i zero = _mm_setzero_si128();
 
-   __m128i c;
-   __m128i dcdx;
-   __m128i dcdy;
-   __m128i rej4;
-
-   __m128i dcdx2;
-   __m128i dcdx3;
+   __m128i c, dcdx, dcdy, rej4;
+   __m128i dcdx_neg_mask, dcdy_neg_mask;
+   __m128i dcdx2, dcdx3;
    
    __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
    __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
-   
+
    transpose4_epi32(&p0, &p1, &p2, &zero,
-                    &c, &dcdx, &dcdy, &rej4);
+                    &c, &unused, &dcdx, &dcdy);
+
+   /* recalc eo - easier than trying to load as scalars / shuffle... */
+   dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
+   dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
+   rej4 = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
+                        _mm_and_si128(dcdx_neg_mask, dcdx));
 
    /* Adjust dcdx;
     */
@@ -349,32 +352,29 @@ lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
 
 void
 lp_rast_triangle_32_3_4(struct lp_rasterizer_task *task,
-                     const union lp_rast_cmd_arg arg)
+                        const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    const struct lp_rast_plane *plane = GET_PLANES(tri);
    unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
    unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
 
-   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
-   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
-   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
+   /* p0 and p2 are aligned, p1 is not (plane size 24 bytes). */
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* clo, chi, dcdx, dcdy */
+   __m128i p1 = _mm_loadu_si128((__m128i *)&plane[1]);
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]);
    __m128i zero = _mm_setzero_si128();
 
-   __m128i c;
-   __m128i dcdx;
-   __m128i dcdy;
+   __m128i c, dcdx, dcdy;
+   __m128i dcdx2, dcdx3;
 
-   __m128i dcdx2;
-   __m128i dcdx3;
-   
    __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
    __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
    __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
    __m128i unused;
 
    transpose4_epi32(&p0, &p1, &p2, &zero,
-                    &c, &dcdx, &dcdy, &unused);
+                    &c, &unused, &dcdx, &dcdy);
 
    /* Adjust dcdx;
     */

commit ab30426e335116e29473faaafe8b57ec760516ee
Author: Roland Scheidegger <sroland@vmware.com>
Date:   Fri Jan 29 03:18:36 2016 +0100

    i965: Provide sse2 version for rgba8 <-> bgra8 swizzle
    
    The existing code used ssse3, and because it isn't compiled in a separate
    file compiled with that, it is usually not used (that, of course, could
    be fixed...), whereas sse2 is always present at least with 64bit builds.
    This should be pretty much as fast as the pshufb version, albeit those
    code paths aren't really used on chips without llc in any case.
    
    v2: fix andnot argument order, add comments
    v3: use pshuflw/hw instead of shifts (suggested by Matt Turner), cut comments
    
    Reviewed-by: Matt Turner <mattst88@gmail.com>

diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 108dd87..b52bbfb 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -2773,6 +2773,11 @@ intel_miptree_map(struct brw_context *brw,
    } else if (!(mode & GL_MAP_WRITE_BIT) &&
               !mt->compressed && cpu_has_sse4_1 &&
               (mt->pitch % 16 == 0)) {
+      /*
+       * XXX: without sse4_1, in some situations it would be beneficial
+       * to copy regardless (with an ordinary memcpy) as otherwise mesa
+       * may access uncached memory bytewise.
+       */
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
diff --git a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
index 2383401..671dc59 100644
--- a/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
+++ b/src/mesa/drivers/dri/i965/intel_tiled_memcpy.c
@@ -36,10 +36,13 @@
 #include "brw_context.h"
 #include "intel_tiled_memcpy.h"
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__)
 #include <tmmintrin.h>
+#elif defined(__SSE2__)
+#include <emmintrin.h>
 #endif
 
+
 #define FILE_DEBUG_FLAG DEBUG_TEXTURE
 
 #define ALIGN_DOWN(a, b) ROUND_DOWN_TO(a, b)
@@ -56,23 +59,65 @@ static const uint32_t ytile_width = 128;
 static const uint32_t ytile_height = 32;
 static const uint32_t ytile_span = 16;
 
-#ifdef __SSSE3__
+#if defined(__SSSE3__)
 static const uint8_t rgba8_permutation[16] =
    { 2,1,0,3, 6,5,4,7, 10,9,8,11, 14,13,12,15 };
 
 /* NOTE: dst must be 16-byte aligned. src may be unaligned. */
-#define rgba8_copy_16_aligned_dst(dst, src)                            \
-   _mm_store_si128((__m128i *)(dst),                                   \
-                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)), \
-                                    *(__m128i *) rgba8_permutation))
+static inline void
+rgba8_copy_16_aligned_dst(void *dst, const void *src)
+{
+   _mm_store_si128((__m128i *)(dst),
+                   _mm_shuffle_epi8(_mm_loadu_si128((__m128i *)(src)),
+                                    *(__m128i *)rgba8_permutation));
+}
 
 /* NOTE: src must be 16-byte aligned. dst may be unaligned. */
-#define rgba8_copy_16_aligned_src(dst, src)                            \
-   _mm_storeu_si128((__m128i *)(dst),                                  \
-                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)), \
-                                     *(__m128i *) rgba8_permutation))
+static inline void
+rgba8_copy_16_aligned_src(void *dst, const void *src)
+{
+   _mm_storeu_si128((__m128i *)(dst),
+                    _mm_shuffle_epi8(_mm_load_si128((__m128i *)(src)),
+                                     *(__m128i *)rgba8_permutation));
+}
+
+#elif defined(__SSE2__)
+static inline void
+rgba8_copy_16_aligned_dst(void *dst, const void *src)


Reply to: