[Date Prev][Date Next] [Thread Prev][Thread Next] [Date Index] [Thread Index]

mesa: Changes to 'upstream-experimental'



Rebased ref, commits from common ancestor:
commit 3b48f6a4c06db57a7203d247994b05e55c9418c1
Author: Chris Forbes <chrisf@ijw.co.nz>
Date:   Sun Aug 3 19:55:55 2014 +1200

    mesa: Add a new function for getting the nonconst sampler array index
    
    If the array index is not a constant expression, the existing support
    will assume a zero offset (giving us the sampler index of the base of
    the array).
    
    For dynamically uniform indexing of sampler arrays, we need both that
    and the indexing expression.
    
    Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
    Reviewed-by: Matt Turner <mattst88@gmail.com>
    Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>

diff --git a/src/mesa/program/sampler.cpp b/src/mesa/program/sampler.cpp
index e6532be..29a5408 100644
--- a/src/mesa/program/sampler.cpp
+++ b/src/mesa/program/sampler.cpp
@@ -134,3 +134,14 @@ _mesa_get_sampler_uniform_value(class ir_dereference *sampler,
    return shader_program->UniformStorage[location].sampler[shader].index +
           getname.offset;
 }
+
+
+extern "C" class ir_rvalue *
+_mesa_get_sampler_array_nonconst_index(class ir_dereference *sampler)
+{
+   ir_dereference_array *deref_arr = sampler->as_dereference_array();
+   if (!deref_arr || deref_arr->array_index->as_constant())
+      return NULL;
+
+   return deref_arr->array_index;
+}
diff --git a/src/mesa/program/sampler.h b/src/mesa/program/sampler.h
index 22467e9..8b7c3b6 100644
--- a/src/mesa/program/sampler.h
+++ b/src/mesa/program/sampler.h
@@ -27,3 +27,6 @@ int
 _mesa_get_sampler_uniform_value(class ir_dereference *sampler,
 				struct gl_shader_program *shader_program,
 				const struct gl_program *prog);
+
+class ir_rvalue *
+_mesa_get_sampler_array_nonconst_index(class ir_dereference *sampler);

commit 1b4761bc27a50208dba2bc028c9835fed572e696
Author: Chris Forbes <chrisf@ijw.co.nz>
Date:   Sun Aug 3 17:57:05 2014 +1200

    glsl: Allow dynamically uniform sampler array indexing with 4.0/gs5
    
    V2: Expand comment to explain what dynamically uniform expressions are
    about.
    
    Signed-off-by: Chris Forbes <chrisf@ijw.co.nz>
    Reviewed-by: Matt Turner <mattst88@gmail.com>
    Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>

diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index 50f9987..5ca85f6 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -213,6 +213,13 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
        * as using a loop counter as the index to an array of samplers.  If the
        * loop in unrolled, the code should compile correctly.  Instead, emit a
        * warning.
+       *
+       * In GLSL 4.00 / ARB_gpu_shader5, this requirement is relaxed again to allow
+       * indexing with dynamically uniform expressions. Note that these are not
+       * required to be uniforms or expressions based on them, but merely that the
+       * values must not diverge between shader invocations run together. If the
+       * values *do* diverge, then the behavior of the operation requiring a
+       * dynamically uniform expression is undefined.
        */
       if (array->type->element_type()->is_sampler()) {
 	 if (!state->is_version(130, 100)) {
@@ -227,7 +234,7 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
 				  "expressions will be forbidden in GLSL 1.30 "
 				  "and later");
 	    }
-	 } else {
+	 } else if (!state->is_version(400, 0) && !state->ARB_gpu_shader5_enable) {
 	    _mesa_glsl_error(&loc, state,
 			     "sampler arrays indexed with non-constant "
 			     "expressions is forbidden in GLSL 1.30 and "

commit f525bd01d1430a5e33f57805f50fe4e89aa86ae8
Author: Ilia Mirkin <imirkin@alum.mit.edu>
Date:   Wed Aug 6 23:45:05 2014 -0400

    nvc0/ir: describe the tex arguments for fermi/kepler
    
    Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index ade315d..7da9b0b 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -567,6 +567,31 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
    const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
    const int chipset = prog->getTarget()->getChipset();
 
+   // Arguments to the TEX instruction are a little insane. Even though the
+   // encoding is identical between SM20 and SM30, the arguments mean
+   // different things between Fermi and Kepler+. A lot of arguments are
+   // optional based on flags passed to the instruction. This summarizes the
+   // order of things.
+   //
+   // Fermi:
+   //  array/indirect
+   //  coords
+   //  sample
+   //  lod bias
+   //  depth compare
+   //  offsets:
+   //    - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
+   //    - other: 4 bits each, single reg
+   //
+   // Kepler+:
+   //  indirect handle
+   //  array (+ offsets for txd in upper 16 bits)
+   //  coords
+   //  sample
+   //  lod bias
+   //  depth compare
+   //  offsets (same as fermi, except txd which takes it with array)
+
    if (chipset >= NVISA_GK104_CHIPSET) {
       if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
          // XXX this ignores tsc, and assumes a 1:1 mapping

commit b3cbd862242e0ff75584fef706f2b2a3da8e49f2
Author: Ilia Mirkin <imirkin@alum.mit.edu>
Date:   Wed Jul 9 00:41:11 2014 -0400

    nvc0/ir: add kepler+ support for indirect texture references
    
    Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 4a9e48f..ade315d 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -569,9 +569,17 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
 
    if (chipset >= NVISA_GK104_CHIPSET) {
       if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
-         WARN("indirect TEX not implemented\n");
-      }
-      if (i->tex.r == i->tex.s) {
+         // XXX this ignores tsc, and assumes a 1:1 mapping
+         assert(i->tex.rIndirectSrc >= 0);
+         Value *hnd = loadTexHandle(
+               bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                          i->getIndirectR(), bld.mkImm(2)),
+               i->tex.r);
+         i->tex.r = 0xff;
+         i->tex.s = 0x1f;
+         i->setIndirectR(hnd);
+         i->setIndirectS(NULL);
+      } else if (i->tex.r == i->tex.s) {
          i->tex.r += prog->driver->io.texBindBase / 4;
          i->tex.s  = 0; // only a single cX[] value possible here
       } else {
@@ -595,6 +603,16 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
             i->setSrc(s, i->getSrc(s - 1));
          i->setSrc(0, layer);
       }
+      // Move the indirect reference to the first place
+      if (i->tex.rIndirectSrc >= 0) {
+         Value *hnd = i->getIndirectR();
+
+         i->setIndirectR(NULL);
+         i->moveSources(0, 1);
+         i->setSrc(0, hnd);
+         i->tex.rIndirectSrc = 0;
+         i->tex.sIndirectSrc = -1;
+      }
    } else
    // (nvc0) generate and move the tsc/tic/array source to the front
    if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
@@ -688,14 +706,14 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
             // The offset goes into the upper 16 bits of the array index. So
             // create it if it's not already there, and INSBF it if it already
             // is.
+            s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
             if (i->tex.target.isArray()) {
                bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(0),
                          bld.loadImm(NULL, imm), bld.mkImm(0xc10),
-                         i->getSrc(0));
+                         i->getSrc(s));
             } else {
-               for (int s = dim; s >= 1; --s)
-                  i->setSrc(s, i->getSrc(s - 1));
-               i->setSrc(0, bld.loadImm(NULL, imm << 16));
+               i->moveSources(s, 1);
+               i->setSrc(s, bld.loadImm(NULL, imm << 16));
             }
          } else {
             i->setSrc(s, bld.loadImm(NULL, imm));
@@ -792,6 +810,8 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
    if (chipset >= NVISA_GK104_CHIPSET) {
       if (!txd->tex.target.isArray() && txd->tex.useOffsets)
          expected_args++;
+      if (txd->tex.rIndirectSrc >= 0 || txd->tex.sIndirectSrc >= 0)
+         expected_args++;
    } else {
       if (txd->tex.useOffsets)
          expected_args++;

commit af3619e88043ce85560b8220dc16244f8898a926
Author: Ilia Mirkin <imirkin@alum.mit.edu>
Date:   Wed Aug 6 01:22:49 2014 -0400

    nvc0/ir: add base tex offset for fermi indirect tex case
    
    Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index f010767..4a9e48f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -603,10 +603,18 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
       Value *ticRel = i->getIndirectR();
       Value *tscRel = i->getIndirectS();
 
-      if (ticRel)
+      if (ticRel) {
          i->setSrc(i->tex.rIndirectSrc, NULL);
-      if (tscRel)
+         if (i->tex.r)
+            ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                                ticRel, bld.mkImm(i->tex.r));
+      }
+      if (tscRel) {
          i->setSrc(i->tex.sIndirectSrc, NULL);
+         if (i->tex.s)
+            tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
+                                tscRel, bld.mkImm(i->tex.s));
+      }
 
       Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
       for (int s = dim; s >= 1; --s)

commit f73594778b0ef9804fc7839d5e2051d051d4ef48
Author: Kenneth Graunke <kenneth@whitecape.org>
Date:   Mon Aug 11 15:05:54 2014 -0700

    i965: Revert part of f5cc3fdcf1680b116612fac7c39f1bd79f5e555e.
    
    Fixes non-termination in various Piglit tests.
    
    Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 1b8c987..29d2e02 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -82,7 +82,7 @@ is_expression(const vec4_instruction *const inst)
    case SHADER_OPCODE_COS:
       return inst->mlen == 0;
    default:
-      return !inst->has_side_effects();
+      return false;
    }
 }
 

commit 602a3f92d4d695e116794597db81623a8fd4c653
Author: Eric Anholt <eric@anholt.net>
Date:   Sat Aug 9 11:01:53 2014 -0700

    vc4: Flip which primitives are considered front-facing.
    
    This mostly fixes glxgears rendering.

diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 69fd218..d2c53a5 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -101,7 +101,7 @@ vc4_create_rasterizer_state(struct pipe_context *pctx,
         /* XXX: per_vertex */
         so->point_size = cso->point_size;
 
-        if (!cso->front_ccw)
+        if (cso->front_ccw)
                 so->config_bits[0] |= VC4_CONFIG_BITS_CW_PRIMITIVES;
 
         if (cso->offset_tri)

commit f097516505daaaf5c25c919d56cbce54eb441a48
Author: Eric Anholt <eric@anholt.net>
Date:   Sat Aug 9 11:00:51 2014 -0700

    vc4: Don't forget to set the depth clear value in the packet.
    
    This gets glxgears partially rendering again.

diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index ffcbbb2..8ca4031 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -105,7 +105,7 @@ vc4_setup_rcl(struct vc4_context *vc4)
         cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS);
         cl_u32(&vc4->rcl, vc4->clear_color[0]);
         cl_u32(&vc4->rcl, vc4->clear_color[1]);
-        cl_u32(&vc4->rcl, 0);
+        cl_u32(&vc4->rcl, vc4->clear_depth);
         cl_u8(&vc4->rcl, 0);
 
         cl_start_reloc(&vc4->rcl, 1);

commit e63598aecb5d1cc2a20b8db1ef85790e301f4241
Author: Eric Anholt <eric@anholt.net>
Date:   Tue Aug 5 14:24:29 2014 -0700

    vc4: Add support for gl_FragCoord.
    
    This isn't passing all tests (glsl-fs-fragcoord-zw-ortho, for example),
    but it does get a bunch more tests passing.
    
    v2: Rebase on helpers change.

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 8109f63..d871dcd 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -570,6 +570,20 @@ emit_vertex_input(struct tgsi_to_qir *trans, int attr)
 }
 
 static void
+emit_fragcoord_input(struct tgsi_to_qir *trans, int attr)
+{
+        struct qcompile *c = trans->c;
+
+        trans->inputs[attr * 4 + 0] = qir_FRAG_X(c);
+        trans->inputs[attr * 4 + 1] = qir_FRAG_Y(c);
+        trans->inputs[attr * 4 + 2] =
+                qir_FMUL(c,
+                         qir_FRAG_Z(c),
+                         qir_uniform_f(trans, 1.0 / 0xffffff));
+        trans->inputs[attr * 4 + 3] = qir_FRAG_RCP_W(c);
+}
+
+static void
 emit_fragment_input(struct tgsi_to_qir *trans, int attr)
 {
         struct qcompile *c = trans->c;
@@ -599,7 +613,12 @@ emit_tgsi_declaration(struct tgsi_to_qir *trans,
                      i <= decl->Range.Last;
                      i++) {
                         if (c->stage == QSTAGE_FRAG) {
-                                emit_fragment_input(trans, i);
+                                if (decl->Semantic.Name ==
+                                    TGSI_SEMANTIC_POSITION) {
+                                        emit_fragcoord_input(trans, i);
+                                } else {
+                                        emit_fragment_input(trans, i);
+                                }
                         } else {
                                 emit_vertex_input(trans, i);
                         }
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index 9462da5..6509a2b 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -65,6 +65,11 @@ static const struct qir_op_info qir_op_info[] = {
         [QOP_TLB_COLOR_WRITE] = { "tlb_color", 0, 1, true },
         [QOP_VARY_ADD_C] = { "vary_add_c", 1, 1 },
 
+        [QOP_FRAG_X] = { "frag_x", 1, 0 },
+        [QOP_FRAG_Y] = { "frag_y", 1, 0 },
+        [QOP_FRAG_Z] = { "frag_z", 1, 0 },
+        [QOP_FRAG_RCP_W] = { "frag_rcp_w", 1, 0 },
+
         [QOP_TEX_S] = { "tex_s", 0, 2 },
         [QOP_TEX_T] = { "tex_t", 0, 2 },
         [QOP_TEX_R] = { "tex_r", 0, 2 },
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 5d1f088..7d98062 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -74,6 +74,11 @@ enum qop {
         QOP_TLB_COLOR_WRITE,
         QOP_VARY_ADD_C,
 
+        QOP_FRAG_X,
+        QOP_FRAG_Y,
+        QOP_FRAG_Z,
+        QOP_FRAG_RCP_W,
+
         /** Texture x coordinate parameter write */
         QOP_TEX_S,
         /** Texture y coordinate parameter write */
@@ -204,6 +209,15 @@ bool qir_opt_algebraic(struct qcompile *c);
 bool qir_opt_copy_propagation(struct qcompile *c);
 bool qir_opt_dead_code(struct qcompile *c);
 
+#define QIR_ALU0(name)                                                   \
+static inline struct qreg                                                \
+qir_##name(struct qcompile *c)                                           \
+{                                                                        \
+        struct qreg t = qir_get_temp(c);                                 \
+        qir_emit(c, qir_inst(QOP_##name, t, c->undef, c->undef));        \
+        return t;                                                        \
+}
+
 #define QIR_ALU1(name)                                                   \
 static inline struct qreg                                                \
 qir_##name(struct qcompile *c, struct qreg a)                            \
@@ -257,6 +271,10 @@ QIR_NODST_2(TEX_S)
 QIR_NODST_2(TEX_T)
 QIR_NODST_2(TEX_R)
 QIR_NODST_2(TEX_B)
+QIR_ALU0(FRAG_X)
+QIR_ALU0(FRAG_Y)
+QIR_ALU0(FRAG_Z)
+QIR_ALU0(FRAG_RCP_W)
 
 static inline struct qreg
 qir_CMP(struct qcompile *c, struct qreg cmp, struct qreg a, struct qreg b)
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 33abf6d..63f37dd 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -213,7 +213,8 @@ vc4_generate_code(struct qcompile *c)
                         if (qinst->src[i].file == QFILE_TEMP)
                                 reg_uses_remaining[qinst->src[i].index]++;
                 }
-                if (qinst->op == QOP_TLB_PASSTHROUGH_Z_WRITE)
+                if (qinst->op == QOP_TLB_PASSTHROUGH_Z_WRITE ||
+                    qinst->op == QOP_FRAG_Z)
                         reg_in_use[3 + 32 + QPU_R_FRAG_PAYLOAD_ZW] = true;
         }
 
@@ -460,6 +461,33 @@ vc4_generate_code(struct qcompile *c)
 
                         break;
 
+                case QOP_FRAG_X:
+                        queue(c, qpu_inst(qpu_a_ITOF(dst,
+                                                     qpu_ra(QPU_R_XY_PIXEL_COORD)),
+                                          qpu_m_NOP()));
+                        break;
+
+                case QOP_FRAG_Y:
+                        queue(c, qpu_inst(qpu_a_ITOF(dst,
+                                                     qpu_rb(QPU_R_XY_PIXEL_COORD)),
+                                          qpu_m_NOP()));
+                        break;
+
+                case QOP_FRAG_Z:
+                        queue(c, qpu_inst(qpu_a_ITOF(dst,
+                                                     qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)),
+                                          qpu_m_NOP()));
+                        break;
+
+                case QOP_FRAG_RCP_W:
+                        queue(c, qpu_inst(qpu_a_MOV(qpu_rb(QPU_W_SFU_RECIP),
+                                                    qpu_ra(QPU_R_FRAG_PAYLOAD_ZW)),
+                                          qpu_m_NOP()));
+
+                        queue(c, qpu_inst(qpu_a_MOV(dst, qpu_r4()),
+                                          qpu_m_NOP()));
+                        break;
+
                 case QOP_TLB_PASSTHROUGH_Z_WRITE:
                         queue(c, qpu_inst(qpu_a_MOV(qpu_ra(QPU_W_TLB_Z),
                                                     qpu_rb(QPU_R_FRAG_PAYLOAD_ZW)),
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 940596e..4ddc9ac 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -102,6 +102,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_MAX_VIEWPORTS:
                 return 1;
 
+        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
+        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
+                return 1;
+
                 /* Unsupported features. */
         case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
         case PIPE_CAP_ANISOTROPIC_FILTER:
@@ -111,8 +115,6 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_TEXTURE_SWIZZLE:
         case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
         case PIPE_CAP_MIXED_COLORBUFFER_FORMATS:
-        case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
-        case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
         case PIPE_CAP_SEAMLESS_CUBE_MAP:
         case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
         case PIPE_CAP_TGSI_INSTANCEID:

commit d34fbdda120b2b0e0a33a4a18fa112e58f8a2f4a
Author: Eric Anholt <eric@anholt.net>
Date:   Tue Aug 5 14:23:40 2014 -0700

    vc4: Refactor shader input setup again.
    
    This makes some space for handling special inputs like fragcoords.

diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 8252630..8109f63 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -570,6 +570,24 @@ emit_vertex_input(struct tgsi_to_qir *trans, int attr)
 }
 
 static void
+emit_fragment_input(struct tgsi_to_qir *trans, int attr)
+{
+        struct qcompile *c = trans->c;
+
+        for (int i = 0; i < 4; i++) {
+                struct qreg vary = {
+                        QFILE_VARY,
+                        attr * 4 + i
+                };
+
+                /* XXX: multiply by W */
+                trans->inputs[attr * 4 + i] =
+                        qir_VARY_ADD_C(c, qir_MOV(c, vary));
+                c->num_inputs++;
+        }
+}
+
+static void
 emit_tgsi_declaration(struct tgsi_to_qir *trans,
                       struct tgsi_full_declaration *decl)
 {
@@ -577,23 +595,12 @@ emit_tgsi_declaration(struct tgsi_to_qir *trans,
 
         switch (decl->Declaration.File) {
         case TGSI_FILE_INPUT:
-                if (c->stage == QSTAGE_FRAG) {
-                        for (int i = decl->Range.First * 4;
-                             i < (decl->Range.Last + 1) * 4;
-                             i++) {
-                                struct qreg vary = {
-                                        QFILE_VARY,
-                                        i
-                                };
-                                trans->inputs[i] =
-                                        qir_VARY_ADD_C(c, qir_MOV(c, vary));
-
-                                c->num_inputs++;
-                        }
-                } else {
-                        for (int i = decl->Range.First;
-                             i <= decl->Range.Last;
-                             i++) {
+                for (int i = decl->Range.First;
+                     i <= decl->Range.Last;
+                     i++) {
+                        if (c->stage == QSTAGE_FRAG) {
+                                emit_fragment_input(trans, i);
+                        } else {
                                 emit_vertex_input(trans, i);
                         }
                 }

commit a7faca5d2716c5f87f228c6f82eaf10373154852
Author: Eric Anholt <eric@anholt.net>
Date:   Tue Aug 5 11:00:51 2014 -0700

    vc4: Clean up the tile alloc buffer size.
    
    This prevents some simulator assertion failures, but it does mean (since
    I've dropped the "* 16" padding) that on real hardware you need a kernel
    that does overflow memory management (currently, "drm/vc4: Add support for
    binner overflow memory allocation." in my kernel tree).

diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 2fb57aa..ec218d3 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -45,7 +45,15 @@ vc4_start_draw(struct vc4_context *vc4)
         uint32_t tilew = align(width, 64) / 64;
         uint32_t tileh = align(height, 64) / 64;
 
-        uint32_t tile_alloc_size = 32 * tilew * tileh * 16;
+        /* Tile alloc memory setup: We use an initial alloc size of 32b.  The
+         * hardware then aligns that to 256b (we use 4096, because all of our
+         * BO allocations align to that anyway), then for some reason the
+         * simulator wants an extra page available, even if you have overflow
+         * memory set up.
+         */
+        uint32_t tile_alloc_size = 32 * tilew * tileh;
+        tile_alloc_size = align(tile_alloc_size, 4096);
+        tile_alloc_size += 4096;
         uint32_t tile_state_size = 48 * tilew * tileh;
         if (!vc4->tile_alloc || vc4->tile_alloc->size < tile_alloc_size) {
                 vc4_bo_unreference(&vc4->tile_alloc);

commit 7050ab510decce2606ffcd0298b3c7fb13a1401e
Author: Eric Anholt <eric@anholt.net>
Date:   Tue Aug 5 11:00:08 2014 -0700

    vc4: Clarify some values implicitly chosen for binning config.
    
    These #defines are 0, but spelling them out should help the math above make more sense.

diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index a76880c..2fb57aa 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -67,7 +67,10 @@ vc4_start_draw(struct vc4_context *vc4)
         cl_reloc(vc4, &vc4->bcl, vc4->tile_state, 0);
         cl_u8(&vc4->bcl, tilew);
         cl_u8(&vc4->bcl, tileh);
-        cl_u8(&vc4->bcl, VC4_BIN_CONFIG_AUTO_INIT_TSDA);
+        cl_u8(&vc4->bcl,
+              VC4_BIN_CONFIG_AUTO_INIT_TSDA |
+              VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 |
+              VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32);
 
         cl_u8(&vc4->bcl, VC4_PACKET_START_TILE_BINNING);
 

commit ed5cb5d7d5d3d3cfeafcf67b2020044da9825abe
Author: Eric Anholt <eric@anholt.net>
Date:   Tue Aug 5 10:54:56 2014 -0700

    vc4: Improve simulator memory allocation.
    
    This should reduce a bunch of spurious failures in sim.

diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index fc3d554..827d617 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -32,6 +32,8 @@
 #include "vc4_simulator_validate.h"
 #include "simpenrose/simpenrose.h"
 
+#define OVERFLOW_SIZE (32 * 1024 * 1024)
+
 static struct drm_gem_cma_object *
 vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo)
 {
@@ -234,7 +236,7 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args,
         struct exec_info exec;
         struct drm_device local_dev = {
                 .vc4 = vc4,
-                .simulator_mem_next = 0,
+                .simulator_mem_next = OVERFLOW_SIZE,
         };
         struct drm_device *dev = &local_dev;
         int ret;
@@ -290,9 +292,23 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args,
 void
 vc4_simulator_init(struct vc4_screen *screen)
 {
-        simpenrose_init_hardware();
-        screen->simulator_mem_base = simpenrose_get_mem_start();
-        screen->simulator_mem_size = simpenrose_get_mem_size();
+        screen->simulator_mem_size = 256 * 1024 * 1024;
+        screen->simulator_mem_base = malloc(screen->simulator_mem_size);
+
+        /* We supply our own memory so that we can have more aperture
+         * available (256MB instead of simpenrose's default 64MB).
+         */
+        simpenrose_init_hardware_supply_mem(screen->simulator_mem_base,
+                                            screen->simulator_mem_size);
+
+        /* Carve out low memory for tile allocation overflow.  The kernel
+         * should be automatically handling overflow memory setup on real
+         * hardware, but for simulation we just get one shot to set up enough
+         * overflow memory before execution.  This overflow mem will be used
+         * up over the whole lifetime of simpenrose (not reused on each
+         * flush), so it had better be big.
+         */
+        simpenrose_supply_overflow_mem(0, OVERFLOW_SIZE);
 }
 
 #endif /* USE_VC4_SIMULATOR */

commit f5f8dd29c30e727c10ff087321d87c40c064de04
Author: Eric Anholt <eric@anholt.net>
Date:   Mon Aug 4 18:30:33 2014 -0700

    vc4: Handle stride==0 in VBO validation

diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.c b/src/gallium/drivers/vc4/vc4_simulator_validate.c
index 421107a..038de0f 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.c
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.c
@@ -942,18 +942,20 @@ validate_shader_rec(struct drm_device *dev,
 			stride |= (*(uint32_t *)(pkt_u + 100 + i * 4)) & ~0xff;
 
 		if (vbo->base.size < offset ||
-		    vbo->base.size - offset < attr_size ||
-		    stride == 0) {
+		    vbo->base.size - offset < attr_size) {
 			DRM_ERROR("BO offset overflow (%d + %d > %d)\n",
 				  offset, attr_size, vbo->base.size);
 			return -EINVAL;
 		}
 
-		max_index = (vbo->base.size - offset - attr_size) / stride;
-		if (state->max_index > max_index) {
-			DRM_ERROR("primitives use index %d out of supplied %d\n",
-				  state->max_index, max_index);
-			return -EINVAL;
+		if (stride != 0) {
+			max_index = ((vbo->base.size - offset - attr_size) /
+				     stride);
+			if (state->max_index > max_index) {
+				DRM_ERROR("primitives use index %d out of supplied %d\n",
+					  state->max_index, max_index);
+				return -EINVAL;
+			}
 		}
 
 		*(uint32_t *)(pkt_v + o) = vbo->paddr + offset;

commit 0f034055f96b9dd7b1c54e8fa5422d22c26f2269
Author: Eric Anholt <eric@anholt.net>
Date:   Mon Aug 4 16:38:07 2014 -0700

    vc4: Stash some debug code for looking at what BOs are at what hindex.
    
    When you're debugging validation, it's nice to know what the BOs are for.

diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index faec853..a87cdfa 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -106,6 +106,7 @@ vc4_bo_open_name(struct vc4_screen *screen, uint32_t name,
         bo->screen = screen;
         bo->handle = o.handle;
         bo->size = o.size;
+        bo->name = "winsys";
 
 #ifdef USE_VC4_SIMULATOR
         vc4_bo_map(bo);
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 8038fee5..fc3d554 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -75,6 +75,10 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct exec_info *exec)
                 struct vc4_bo *bo = bos[i];
                 struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo);
 
+#if 0
+                fprintf(stderr, "bo hindex %d: %s\n", i, bo->name);
+#endif
+
                 vc4_bo_map(bo);
                 memcpy(obj->vaddr, bo->map, bo->size);
 

commit 8ebfa8fdb27bb5efaeda4fe567622d5de4779342
Author: Eric Anholt <eric@anholt.net>
Date:   Mon Aug 4 13:01:29 2014 -0700

    vc4: Use GEM under simulation even for non-winsys BOs.
    
    In addition to reducing sim-specific code, it also avoids our local handle
    allocation conflicting with the host GEM's handle numbering, which was
    causing vc4_gem_hindex() to not distinguish between winsys BOs and the
    same-numbered non-winsys bo.

diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 581ba89..faec853 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -45,7 +45,6 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
         bo->size = size;
         bo->name = name;
 
-#ifndef USE_VC4_SIMULATOR
         struct drm_mode_create_dumb create;
         memset(&create, 0, sizeof(create));
 
@@ -59,12 +58,6 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
 
         bo->handle = create.handle;
         assert(create.size >= size);
-#else /* USE_VC4_SIMULATOR */
-        static int next_handle = 0;
-        bo->handle = next_handle++;
-
-        bo->map = malloc(size);
-#endif /* USE_VC4_SIMULATOR */
 
         return bo;
 }
@@ -72,20 +65,23 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
 void
 vc4_bo_free(struct vc4_bo *bo)
 {
-#ifndef USE_VC4_SIMULATOR
         struct vc4_screen *screen = bo->screen;
 
-        if (bo->map)
+        if (bo->map) {
+#ifdef USE_VC4_SIMULATOR
+                if (bo->simulator_winsys_map) {
+                        free(bo->map);
+                        bo->map = bo->simulator_winsys_map;
+                }
+#endif
                 munmap(bo->map, bo->size);
+        }
 
         struct drm_gem_close c;
         c.handle = bo->handle;
         int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c);
         if (ret != 0)
                 fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno));
-#else
-        free(bo->map);
-#endif
 
         free(bo);
 }
@@ -137,7 +133,6 @@ vc4_bo_alloc_mem(struct vc4_screen *screen, const void *data, uint32_t size,
 bool
 vc4_bo_flink(struct vc4_bo *bo, uint32_t *name)
 {
-#ifndef USE_VC4_SIMULATOR
         struct drm_gem_flink flink = {
                 .handle = bo->handle,
         };
@@ -150,7 +145,6 @@ vc4_bo_flink(struct vc4_bo *bo, uint32_t *name)
         }
 
         *name = flink.name;
-#endif /* USE_VC4_SIMULATOR */
 
         return true;
 }
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 88eda4f..8038fee5 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -75,6 +75,7 @@ vc4_simulator_pin_bos(struct drm_device *dev, struct exec_info *exec)
                 struct vc4_bo *bo = bos[i];
                 struct drm_gem_cma_object *obj = vc4_wrap_bo_with_cma(dev, bo);
 
+                vc4_bo_map(bo);
                 memcpy(obj->vaddr, bo->map, bo->size);
 
                 exec->bo[i].bo = obj;

commit cdc208bdaf90017c2e1aaa54d2318b956e801ca0
Author: Eric Anholt <eric@anholt.net>
Date:   Mon Aug 4 13:00:56 2014 -0700

    vc4: Don't forget to unmap the GEM BO when freeing.
    
    Otherwise it'll stick around forever.

diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 653787e..581ba89 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -75,6 +75,9 @@ vc4_bo_free(struct vc4_bo *bo)
 #ifndef USE_VC4_SIMULATOR
         struct vc4_screen *screen = bo->screen;
 
+        if (bo->map)
+                munmap(bo->map, bo->size);
+
         struct drm_gem_close c;
         c.handle = bo->handle;
         int ret = drmIoctl(screen->fd, DRM_IOCTL_GEM_CLOSE, &c);

commit d2cc7f97df655bbca6486fbe81e35139215d7b72
Author: Eric Anholt <eric@anholt.net>
Date:   Sat Aug 2 21:28:34 2014 -0700

    vc4: Add validation of raster-format textures.
    
    ... and reject everything else, for now.
    
    v2: Rebase on v2 of the rendering config validation change.

diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.c b/src/gallium/drivers/vc4/vc4_simulator_validate.c
index 241ca17..421107a 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.c
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.c
@@ -101,8 +101,9 @@ gl_shader_rec_size(uint32_t pointer_bits)
 }
 
 static bool
-check_fbo_size(struct exec_info *exec, struct drm_gem_cma_object *fbo,
-	       uint32_t offset, uint8_t tiling_format, uint8_t cpp)
+check_tex_size(struct exec_info *exec, struct drm_gem_cma_object *fbo,
+	       uint32_t offset, uint8_t tiling_format,
+	       uint32_t width, uint32_t height, uint8_t cpp)
 {
 	uint32_t width_align, height_align;
 	uint32_t aligned_row_len, aligned_h, size;
@@ -125,14 +126,14 @@ check_fbo_size(struct exec_info *exec, struct drm_gem_cma_object *fbo,
 		return false;
 	}
 
-	/* The values are limited by the packet bitfields, so we don't need to
-	 * worry as much about integer overflow.
+	/* The values are limited by the packet/texture parameter bitfields,
+	 * so we don't need to worry as much about integer overflow.
 	 */
-	BUG_ON(exec->fb_width > 65535);
-	BUG_ON(exec->fb_height > 65535);
+	BUG_ON(width > 65535);
+	BUG_ON(height > 65535);
 
-	aligned_row_len = roundup(exec->fb_width * cpp, width_align);
-	aligned_h = roundup(exec->fb_height, height_align);
+	aligned_row_len = roundup(width * cpp, width_align);
+	aligned_h = roundup(height, height_align);
 
 	if (INT_MAX / aligned_row_len < aligned_h) {
 		DRM_ERROR("Overflow in fbo size (%d * %d)\n",
@@ -144,8 +145,7 @@ check_fbo_size(struct exec_info *exec, struct drm_gem_cma_object *fbo,
 	if (size + offset < size ||
 	    size + offset > fbo->base.size) {
 		DRM_ERROR("Overflow in %dx%d fbo size (%d + %d > %d)\n",
-			  exec->fb_width, exec->fb_height, size, offset,
-			  fbo->base.size);
+			  width, height, size, offset, fbo->base.size);
 		return false;
 	}
 
@@ -247,11 +247,11 @@ validate_loadstore_tile_buffer_general(VALIDATE_ARGS)
 
 	offset = *(uint32_t *)(untrusted + 2);
 
-	if (!check_fbo_size(exec, fbo, offset,
+	if (!check_tex_size(exec, fbo, offset,
 			    ((packet_b0 &
 			      VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK) >>
 			     VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT),
-			    cpp)) {
+			    exec->fb_width, exec->fb_height, cpp)) {
 		return -EINVAL;
 	}
 
@@ -499,11 +499,11 @@ validate_tile_rendering_mode_config(VALIDATE_ARGS)
 	}
 
 	offset = *(uint32_t *)untrusted;
-	if (!check_fbo_size(exec, fbo, offset,
+	if (!check_tex_size(exec, fbo, offset,
 			    ((flags &
 			      VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK) >>
 			     VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT),
-			    cpp)) {
+			    exec->fb_width, exec->fb_height, cpp)) {
 		return -EINVAL;
 	}
 
@@ -699,14 +699,91 @@ reloc_tex(struct exec_info *exec,
 
 {
 	struct drm_gem_cma_object *tex;
-	uint32_t unvalidated_p0 = *(uint32_t *)(uniform_data_u +
-						sample->p_offset[0]);
+	uint32_t p0 = *(uint32_t *)(uniform_data_u + sample->p_offset[0]);
+	uint32_t p1 = *(uint32_t *)(uniform_data_u + sample->p_offset[1]);
 	uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0];
+	uint32_t offset = p0 & ~0xfff;
+	uint32_t miplevels = (p0 & 0x15);
+	uint32_t width = (p1 >> 8) & 2047;
+	uint32_t height = (p1 >> 20) & 2047;
+	uint32_t type, cpp, tiling_format;
+	int i;
+
+	if (width == 0)
+		width = 2048;
+	if (height == 0)
+		height = 2048;
+
+	if (p0 & (1 << 9)) {
+		DRM_ERROR("Cube maps unsupported\n");
+		return false;
+	}
+
+	type = ((p0 >> 4) & 15) | ((p1 >> 31) << 4);
+
+	switch (type) {
+	case 0: /* RGBA8888 */
+	case 1: /* RGBX8888 */
+	case 16: /* RGBA32R */
+		cpp = 4;
+		break;
+	case 2: /* RGBA4444 */
+	case 3: /* RGBA5551 */
+	case 4: /* RGB565 */
+	case 7: /* LUMALPHA */
+	case 9: /* S16F */
+	case 11: /* S16 */
+		cpp = 2;


Reply to: