nv50: prepare for having multiple functions

At some point we'll want to support real subroutines instead of just inlining them into the main shader. Since recursive calls are forbidden, we can just save all used registers to a fixed local memory region and restore them on a return, no need for a stack pointer.
author: Christoph Bumiller <e0425955@student.tuwien.ac.at> 2010-09-07 15:40:34 +0200
committer: Christoph Bumiller <e0425955@student.tuwien.ac.at> 2010-09-09 19:21:34 +0200
commit: d91b8865ec2bb41f9b58ad5ce2df7f6f48f98281 (patch)
tree: bc6ff85f2f891b9b05124cc1f8ec874159a3ace0
parent: 217542a061ef31150b1b04f1b45b6099bcc153fe (diff)
8 files changed, 171 insertions, 52 deletions
diff --git a/src/gallium/drivers/nv50/nv50_pc.c b/src/gallium/drivers/nv50/nv50_pc.c
index e34c0553eb..c54f16e4c5 100644
--- a/src/gallium/drivers/nv50/nv50_pc.c
+++ b/src/gallium/drivers/nv50/nv50_pc.c
@@ -304,7 +304,7 @@ nv_pc_pass_in_order(struct nv_basic_block *root, nv_pc_pass_func f, void *priv)
 }
 
 static void
-nv_do_print_program(void *priv, struct nv_basic_block *b)
+nv_do_print_function(void *priv, struct nv_basic_block *b)
 {
    struct nv_instruction *i = b->phi;
 
@@ -323,11 +323,23 @@ nv_do_print_program(void *priv, struct nv_basic_block *b)
 }
 
 void
-nv_print_program(struct nv_basic_block *root)
+nv_print_function(struct nv_basic_block *root)
 {
-   nv_pc_pass_in_order(root, nv_do_print_program, root);
+   if (root->subroutine)
+      debug_printf("SUBROUTINE %i\n", root->subroutine);
+   else
+      debug_printf("MAIN\n");
 
-   debug_printf("END\n\n");
+   nv_pc_pass_in_order(root, nv_do_print_function, root);
+}
+
+void
+nv_print_program(struct nv_pc *pc)
+{
+   int i;
+   for (i = 0; i < pc->num_subroutines + 1; ++i)
+      if (pc->root[i])
+         nv_print_function(pc->root[i]);
 }
 
 static INLINE void
@@ -388,11 +400,18 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (!pc)
       return 1;
 
+   pc->root = CALLOC(ti->subr_nr + 1, sizeof(pc->root[0]));
+   if (!pc->root) {
+      FREE(pc);
+      return 1;
+   }
+   pc->num_subroutines = ti->subr_nr;
+
    ret = nv50_tgsi_to_nc(pc, ti);
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* optimization */
@@ -400,7 +419,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* register allocation */
@@ -408,7 +427,7 @@ nv50_generate_code(struct nv50_translation_info *ti)
    if (ret)
       goto out;
 #ifdef NV50PC_DEBUG
-   nv_print_program(pc->root);
+   nv_print_program(pc);
 #endif
 
    /* prepare for emission */
@@ -441,16 +460,19 @@ nv50_generate_code(struct nv50_translation_info *ti)
 
 out:
    nv_pc_free_refs(pc);
-   if (ret) {
+
+   if (pc->bb_list)
+      FREE(pc->bb_list);
+
+   if (ret) { /* on success, these will be referenced by nv50_program */
       if (pc->emit)
-         free(pc->emit);
+         FREE(pc->emit);
       if (pc->immd_buf)
-         free(pc->immd_buf);
+         FREE(pc->immd_buf);
       if (pc->fixups)
-         free(pc->fixups);
+         FREE(pc->fixups);
    }
-   free(pc);
-
+   FREE(pc);
    return ret;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_pc.h b/src/gallium/drivers/nv50/nv50_pc.h
index 703d32d334..d9cc775572 100644
--- a/src/gallium/drivers/nv50/nv50_pc.h
+++ b/src/gallium/drivers/nv50/nv50_pc.h
@@ -282,7 +282,7 @@ struct nv_basic_block {
    ubyte in_kind[8];
 
    int id;
-   struct nv_basic_block *last_visitor;
+   int subroutine;
    uint priv;
    uint pass_seq;
 
@@ -314,10 +314,10 @@ nv_fixup_apply(uint32_t *bin, struct nv_fixup *fixup, uint32_t data)
    bin[fixup->offset / 4] = val;
 }
 
-struct nv_pc {
-   struct nv50_translation_info *ti;
+struct nv50_translation_info;
 
-   struct nv_basic_block *root;
+struct nv_pc {
+   struct nv_basic_block **root;
    struct nv_basic_block *current_block;
    struct nv_basic_block *parent_block;
 
@@ -332,6 +332,7 @@ struct nv_pc {
    int num_instructions;
    int num_refs;
    int num_blocks;
+   int num_subroutines;
 
    int max_reg[4];
 
@@ -463,7 +464,8 @@ void nv_print_instruction(struct nv_instruction *);
 
 /* nv50_pc.c */
 
-void nv_print_program(struct nv_basic_block *b);
+void nv_print_function(struct nv_basic_block *root);
+void nv_print_program(struct nv_pc *);
 
 boolean nv_op_commutative(uint opcode);
 int nv50_indirect_opnd(struct nv_instruction *);
diff --git a/src/gallium/drivers/nv50/nv50_pc_optimize.c b/src/gallium/drivers/nv50/nv50_pc_optimize.c
index 1ed5032175..4f5bdc1f9f 100644
--- a/src/gallium/drivers/nv50/nv50_pc_optimize.c
+++ b/src/gallium/drivers/nv50/nv50_pc_optimize.c
@@ -213,23 +213,36 @@ nv_pc_pass_pre_emission(void *priv, struct nv_basic_block *b)
    pc->bin_size += b->bin_size *= 4;
 }
 
-int
-nv_pc_exec_pass2(struct nv_pc *pc)
+static int
+nv_pc_pass2(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pass pass;
 
    pass.pc = pc;
 
    pc->pass_seq++;
-   nv_pass_flatten(&pass, pc->root);
+
+   nv_pass_flatten(&pass, root);
+
+   nv_pc_pass_in_order(root, nv_pc_pass_pre_emission, pc);
+
+   return 0;
+}
+
+int
+nv_pc_exec_pass2(struct nv_pc *pc)
+{
+   int i, ret;
 
    NV50_DBGMSG("preparing %u blocks for emission\n", pc->num_blocks);
 
-   pc->bb_list = CALLOC(pc->num_blocks, sizeof(struct nv_basic_block *));
-   pc->num_blocks = 0;
+   pc->bb_list = CALLOC(pc->num_blocks, sizeof(pc->bb_list[0]));
 
-   nv_pc_pass_in_order(pc->root, nv_pc_pass_pre_emission, pc);
+   pc->num_blocks = 0;
 
+   for (i = 0; i < pc->num_subroutines + 1; ++i)
+      if (pc->root[i] && (ret = nv_pc_pass2(pc, pc->root[i])))
+         return ret;
    return 0;
 }
 
@@ -1032,8 +1045,8 @@ nv_pass_cse(struct nv_pass *ctx, struct nv_basic_block *b)
    return 0;
 }
 
-int
-nv_pc_exec_pass0(struct nv_pc *pc)
+static int
+nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pass_reld_elim *reldelim;
    struct nv_pass pass;
@@ -1047,35 +1060,35 @@ nv_pc_exec_pass0(struct nv_pc *pc)
     * to whether sources are supported memory loads.
     */
    pc->pass_seq++;
-   ret = nv_pass_lower_arith(&pass, pc->root);
+   ret = nv_pass_lower_arith(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_fold_loads(&pass, pc->root);
+   ret = nv_pass_fold_loads(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_fold_stores(&pass, pc->root);
+   ret = nv_pass_fold_stores(&pass, root);
    if (ret)
       return ret;
 
    reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
    reldelim->pc = pc;
    pc->pass_seq++;
-   ret = nv_pass_reload_elim(reldelim, pc->root);
+   ret = nv_pass_reload_elim(reldelim, root);
    FREE(reldelim);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_cse(&pass, pc->root);
+   ret = nv_pass_cse(&pass, root);
    if (ret)
       return ret;
 
    pc->pass_seq++;
-   ret = nv_pass_lower_mods(&pass, pc->root);
+   ret = nv_pass_lower_mods(&pass, root);
    if (ret)
       return ret;
 
@@ -1083,14 +1096,25 @@ nv_pc_exec_pass0(struct nv_pc *pc)
    do {
       dce.removed = 0;
       pc->pass_seq++;
-      ret = nv_pass_dce(&dce, pc->root);
+      ret = nv_pass_dce(&dce, root);
       if (ret)
          return ret;
    } while (dce.removed);
 
-   ret = nv_pass_tex_mask(&pass, pc->root);
+   ret = nv_pass_tex_mask(&pass, root);
    if (ret)
       return ret;
 
    return ret;
 }
+
+int
+nv_pc_exec_pass0(struct nv_pc *pc)
+{
+   int i, ret;
+
+   for (i = 0; i < pc->num_subroutines + 1; ++i)
+      if (pc->root[i] && (ret = nv_pc_pass0(pc, pc->root[i])))
+         return ret;
+   return 0;
+}
diff --git a/src/gallium/drivers/nv50/nv50_pc_regalloc.c b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
index d401706b5b..2998343db5 100644
--- a/src/gallium/drivers/nv50/nv50_pc_regalloc.c
+++ b/src/gallium/drivers/nv50/nv50_pc_regalloc.c
@@ -874,8 +874,8 @@ pass_linear_scan(struct nv_pc_pass *ctx, int iter)
    return 0;
 }
 
-int
-nv_pc_exec_pass1(struct nv_pc *pc)
+static int
+nv_pc_pass1(struct nv_pc *pc, struct nv_basic_block *root)
 {
    struct nv_pc_pass *ctx;
    int i, ret;
@@ -890,12 +890,12 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    ctx->insns = CALLOC(NV_PC_MAX_INSTRUCTIONS, sizeof(struct nv_instruction *));
 
    pc->pass_seq++;
-   ret = pass_generate_phi_movs(ctx, pc->root);
+   ret = pass_generate_phi_movs(ctx, root);
    assert(!ret);
 
    for (i = 0; i < pc->loop_nesting_bound; ++i) {
       pc->pass_seq++;
-      ret = pass_build_live_sets(ctx, pc->root);
+      ret = pass_build_live_sets(ctx, root);
       assert(!ret && "live sets");
       if (ret) {
          NOUVEAU_ERR("failed to build live sets (iteration %d)\n", i);
@@ -904,10 +904,10 @@ nv_pc_exec_pass1(struct nv_pc *pc)
    }
 
    pc->pass_seq++;
-   nv_pc_pass_in_order(pc->root, pass_order_instructions, ctx);
+   nv_pc_pass_in_order(root, pass_order_instructions, ctx);
 
    pc->pass_seq++;
-   ret = pass_build_intervals(ctx, pc->root);
+   ret = pass_build_intervals(ctx, root);
    assert(!ret && "build intervals");
    if (ret) {
       NOUVEAU_ERR("failed to build live intervals\n");
@@ -944,3 +944,14 @@ out:
    FREE(ctx);
    return ret;
 }
+
+int
+nv_pc_exec_pass1(struct nv_pc *pc)
+{
+   int i, ret;
+
+   for (i = 0; i < pc->num_subroutines + 1; ++i)
+      if (pc->root[i] && (ret = nv_pc_pass1(pc, pc->root[i])))
+         return ret;
+   return 0;
+}
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index d7d3030e2f..925028700c 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -147,10 +147,17 @@ prog_inst(struct nv50_translation_info *ti,
    int s, c, k;
    unsigned mask;
 
+   if (inst->Instruction.Opcode == TGSI_OPCODE_BGNSUB) {
+      ti->subr[ti->subr_nr].pos = id - 1;
+      ti->subr[ti->subr_nr].id = ti->subr_nr + 1; /* id 0 is main program */
+      ++ti->subr_nr;
+   }
+
    if (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      dst = &inst->Dst[0].Register;
+
       for (c = 0; c < 4; ++c) {
-         dst = &inst->Dst[0].Register;
-         if (inst->Dst[0].Register.Indirect)
+         if (dst->Indirect)
             nv50_indirect_outputs(ti, id);
          if (!(dst->WriteMask & (1 << c)))
             continue;
@@ -182,6 +189,44 @@ prog_inst(struct nv50_translation_info *ti,
    }
 }
 
+/* Probably should introduce something like struct tgsi_function_declaration
+ * instead of trying to guess inputs/outputs.
+ */
+static void
+prog_subroutine_inst(struct nv50_subroutine *subr,
+                     const struct tgsi_full_instruction *inst)
+{
+   const struct tgsi_dst_register *dst;
+   const struct tgsi_src_register *src;
+   int s, c, k;
+   unsigned mask;
+
+   for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
+      src = &inst->Src[s].Register;
+      if (src->File != TGSI_FILE_TEMPORARY)
+         continue;
+      mask = nv50_tgsi_src_mask(inst, s);
+
+      assert(!inst->Src[s].Register.Indirect);
+
+      for (c = 0; c < 4; ++c) {
+         k = tgsi_util_get_full_src_register_swizzle(&inst->Src[s], c);
+
+         if ((mask & (1 << c)) && k < TGSI_SWIZZLE_W)
+            if (!(subr->retv[src->Index / 32][k] & (1 << (src->Index % 32))))
+               subr->argv[src->Index / 32][k] |= 1 << (src->Index % 32);
+      }
+   }
+
+   if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
+      dst = &inst->Dst[0].Register;
+
+      for (c = 0; c < 4; ++c)
+         if (dst->WriteMask & (1 << c))
+            subr->retv[dst->Index / 32][c] |= 1 << (dst->Index % 32);
+   }
+}
+
 static void
 prog_immediate(struct nv50_translation_info *ti,
                const struct tgsi_full_immediate *imm)
@@ -482,7 +527,7 @@ nv50_prog_scan(struct nv50_translation_info *ti)
 {
    struct nv50_program *p = ti->p;
    struct tgsi_parse_context parse;
-   int ret;
+   int ret, i;
 
    p->vp.edgeflag = 0x40;
    p->vp.psiz = 0x40;
@@ -496,6 +541,9 @@ nv50_prog_scan(struct nv50_translation_info *ti)
    tgsi_dump(p->pipe.tokens, 0);
 #endif
 
+   ti->subr =
+      CALLOC(ti->scan.opcode_count[TGSI_OPCODE_BGNSUB], sizeof(ti->subr[0]));
+
    ti->immd32 = (uint32_t *)MALLOC(ti->scan.immediate_count * 16);
    ti->immd32_ty = (ubyte *)MALLOC(ti->scan.immediate_count * sizeof(ubyte));
 
@@ -519,6 +567,13 @@ nv50_prog_scan(struct nv50_translation_info *ti)
       }
    }
 
+   /* Scan to determine which registers are inputs/outputs of a subroutine. */
+   for (i = 0; i < ti->subr_nr; ++i) {
+      int pc = ti->subr[i].id;
+      while (ti->insns[pc].Instruction.Opcode != TGSI_OPCODE_ENDSUB)
+         prog_subroutine_inst(&ti->subr[i], &ti->insns[pc++]);
+   }
+
    p->in_nr = ti->scan.file_max[TGSI_FILE_INPUT] + 1;
    p->out_nr = ti->scan.file_max[TGSI_FILE_OUTPUT] + 1;
 
@@ -572,6 +627,8 @@ out:
       FREE(ti->immd32_ty);
    if (ti->insns)
       FREE(ti->insns);
+   if (ti->subr)
+      FREE(ti->subr);
    FREE(ti);
    return ret ? FALSE : TRUE;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.h b/src/gallium/drivers/nv50/nv50_program.h
index 3c3f1f7f97..918baf325f 100644
--- a/src/gallium/drivers/nv50/nv50_program.h
+++ b/src/gallium/drivers/nv50/nv50_program.h
@@ -27,6 +27,8 @@
 #include "tgsi/tgsi_scan.h"
 #include "nouveau/nouveau_class.h"
 
+#define NV50_CAP_MAX_PROGRAM_TEMPS (128 / 4)
+
 struct nv50_varying {
    uint8_t id; /* tgsi index */
    uint8_t hw; /* hw index, nv50 wants flat FP inputs last */
@@ -92,13 +94,13 @@ struct nv50_program {
 #define NV50_INTERP_FLAT     (1 << 1)
 #define NV50_INTERP_CENTROID (1 << 2)
 
-#define NV50_PROG_MAX_SUBROUTINES 8
-
 /* analyze TGSI and see which TEMP[] are used as subroutine inputs/outputs */
 struct nv50_subroutine {
-   int id;
-   uint32_t argv[4][1]; /* 4 bitmasks, for each of xyzw, only allow 32 TEMPs */
-   uint32_t retv[4][1];
+   unsigned id;
+   unsigned pos;
+   /* function inputs and outputs */
+   uint32_t argv[NV50_CAP_MAX_PROGRAM_TEMPS][4];
+   uint32_t retv[NV50_CAP_MAX_PROGRAM_TEMPS][4];
 };
 
 struct nv50_translation_info {
@@ -119,8 +121,8 @@ struct nv50_translation_info {
    unsigned immd32_nr;
    ubyte *immd32_ty;
    ubyte edgeflag_out;
-   struct nv50_subroutine subr[NV50_PROG_MAX_SUBROUTINES];
-   int subr_nr;
+   struct nv50_subroutine *subr;
+   unsigned subr_nr;
 };
 
 int nv50_generate_code(struct nv50_translation_info *ti);
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index fc75d81d54..c1efa443da 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -26,6 +26,7 @@
 #include "nv50_context.h"
 #include "nv50_screen.h"
 #include "nv50_resource.h"
+#include "nv50_program.h"
 
 #include "nouveau/nouveau_stateobj.h"
 
@@ -152,7 +153,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 0;
 	case PIPE_CAP_MAX_VS_TEMPS:
 	case PIPE_CAP_MAX_FS_TEMPS: /* no spilling atm */
-		return 128 / 4;
+		return NV50_CAP_MAX_PROGRAM_TEMPS;
 	case PIPE_CAP_DEPTH_CLAMP:
 		return 1;
 	default:
diff --git a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
index 386dbda423..dea8fa0663 100644
--- a/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
+++ b/src/gallium/drivers/nv50/nv50_tgsi_to_nc.c
@@ -1850,7 +1850,7 @@ nv50_tgsi_to_nc(struct nv_pc *pc, struct nv50_translation_info *ti)
    struct bld_context *bld = CALLOC_STRUCT(bld_context);
    int c;
 
-   pc->root = pc->current_block = new_basic_block(pc);
+   pc->root[0] = pc->current_block = new_basic_block(pc);
 
    bld->pc = pc;
    bld->ti = ti;
author	Christoph Bumiller <e0425955@student.tuwien.ac.at>	2010-09-07 15:40:34 +0200
committer	Christoph Bumiller <e0425955@student.tuwien.ac.at>	2010-09-09 19:21:34 +0200
commit	d91b8865ec2bb41f9b58ad5ce2df7f6f48f98281 (patch)
tree	bc6ff85f2f891b9b05124cc1f8ec874159a3ace0
parent	217542a061ef31150b1b04f1b45b6099bcc153fe (diff)