예제 #1
0
static void ei_if(struct r300_vertex_program_compiler * compiler,
					struct rc_instruction *rci,
					unsigned int * inst,
					unsigned int branch_depth)
{
	unsigned int predicate_opcode;
	int is_math = 0;

	if (!compiler->Base.is_r500) {
		rc_error(&compiler->Base,"Opcode IF not supported\n");
		return;
	}

	/* Reserve a temporary to use as our predicate stack counter, if we
	 * don't already have one. */
	if (!compiler->PredicateMask) {
		unsigned int writemasks[RC_REGISTER_MAX_INDEX];
		struct rc_instruction * inst;
		unsigned int i;
		memset(writemasks, 0, sizeof(writemasks));
		for(inst = compiler->Base.Program.Instructions.Next;
				inst != &compiler->Base.Program.Instructions;
							inst = inst->Next) {
			rc_for_all_writes_mask(inst, mark_write, writemasks);
		}
		for(i = 0; i < compiler->Base.max_temp_regs; i++) {
			unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
			/* Only the W component can be used fo the predicate
			 * stack counter. */
			if (mask & RC_MASK_W) {
				compiler->PredicateMask = RC_MASK_W;
				compiler->PredicateIndex = i;
				break;
			}
		}
		if (i == compiler->Base.max_temp_regs) {
			rc_error(&compiler->Base, "No free temporary to use for"
					" predicate stack counter.\n");
			return;
		}
	}
	predicate_opcode =
			branch_depth ? VE_PRED_SET_NEQ_PUSH : ME_PRED_SET_NEQ;

	rci->U.I.SrcReg[0].Swizzle = RC_MAKE_SWIZZLE_SMEAR(GET_SWZ(rci->U.I.SrcReg[0].Swizzle,0));
	if (branch_depth == 0) {
		is_math = 1;
		predicate_opcode = ME_PRED_SET_NEQ;
		inst[1] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
		inst[2] = 0;
	} else {
		predicate_opcode = VE_PRED_SET_NEQ_PUSH;
		inst[1] = t_pred_src(compiler);
		inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
	}

	inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
	inst[3] = 0;

}
예제 #2
0
static void ei_mad(struct r300_vertex_program_code *vp,
				      struct rc_sub_instruction *vpi,
				      unsigned int * inst)
{
	/* Remarks about hardware limitations of MAD
	 * (please preserve this comment, as this information is _NOT_
	 * in the documentation provided by AMD).
	 *
	 * As described in the documentation, MAD with three unique temporary
	 * source registers requires the use of the macro version.
	 *
	 * However (and this is not mentioned in the documentation), apparently
	 * the macro version is _NOT_ a full superset of the normal version.
	 * In particular, the macro version does not always work when relative
	 * addressing is used in the source operands.
	 *
	 * This limitation caused incorrect rendering in Sauerbraten's OpenGL
	 * assembly shader path when using medium quality animations
	 * (i.e. animations with matrix blending instead of quaternion blending).
	 *
	 * Unfortunately, I (nha) have been unable to extract a Piglit regression
	 * test for this issue - for some reason, it is possible to have vertex
	 * programs whose prefix is *exactly* the same as the prefix of the
	 * offending program in Sauerbraten up to the offending instruction
	 * without causing any trouble.
	 *
	 * Bottom line: Only use the macro version only when really necessary;
	 * according to AMD docs, this should improve performance by one clock
	 * as a nice side bonus.
	 */
	if (vpi->SrcReg[0].File == RC_FILE_TEMPORARY &&
	    vpi->SrcReg[1].File == RC_FILE_TEMPORARY &&
	    vpi->SrcReg[2].File == RC_FILE_TEMPORARY &&
	    vpi->SrcReg[0].Index != vpi->SrcReg[1].Index &&
	    vpi->SrcReg[0].Index != vpi->SrcReg[2].Index &&
	    vpi->SrcReg[1].Index != vpi->SrcReg[2].Index) {
		inst[0] = PVS_OP_DST_OPERAND(PVS_MACRO_OP_2CLK_MADD,
				0,
				1,
				t_dst_index(vp, &vpi->DstReg),
				t_dst_mask(vpi->DstReg.WriteMask),
				t_dst_class(vpi->DstReg.File));
	} else {
		inst[0] = PVS_OP_DST_OPERAND(VE_MULTIPLY_ADD,
				0,
				0,
				t_dst_index(vp, &vpi->DstReg),
				t_dst_mask(vpi->DstReg.WriteMask),
				t_dst_class(vpi->DstReg.File));
	}
	inst[1] = t_src(vp, &vpi->SrcReg[0]);
	inst[2] = t_src(vp, &vpi->SrcReg[1]);
	inst[3] = t_src(vp, &vpi->SrcReg[2]);
}
예제 #3
0
static void ei_vector2(struct r300_vertex_program_code *vp,
				unsigned int hw_opcode,
				struct rc_sub_instruction *vpi,
				unsigned int * inst)
{
	inst[0] = PVS_OP_DST_OPERAND(hw_opcode,
				     0,
				     0,
				     t_dst_index(vp, &vpi->DstReg),
				     t_dst_mask(vpi->DstReg.WriteMask),
				     t_dst_class(vpi->DstReg.File));
	inst[1] = t_src(vp, &vpi->SrcReg[0]);
	inst[2] = t_src(vp, &vpi->SrcReg[1]);
	inst[3] = __CONST(1, RC_SWIZZLE_ZERO);
}
예제 #4
0
/**
 * Generate an R200 vertex program from Mesa's internal representation.
 *
 * \return  GL_TRUE for success, GL_FALSE for failure.
 */
static GLboolean r200_translate_vertex_program(struct gl_context *ctx, struct r200_vertex_program *vp)
{
   struct gl_vertex_program *mesa_vp = &vp->mesa_program;
   struct prog_instruction *vpi;
   int i;
   VERTEX_SHADER_INSTRUCTION *o_inst;
   unsigned long operands;
   int are_srcs_scalar;
   unsigned long hw_op;
   int dofogfix = 0;
   int fog_temp_i = 0;
   int free_inputs;
   int array_count = 0;
   int u_temp_used;

   vp->native = GL_FALSE;
   vp->translated = GL_TRUE;
   vp->fogmode = ctx->Fog.Mode;

   if (mesa_vp->Base.NumInstructions == 0)
      return GL_FALSE;

#if 0
   if ((mesa_vp->Base.InputsRead &
      ~(VERT_BIT_POS | VERT_BIT_NORMAL | VERT_BIT_COLOR0 | VERT_BIT_COLOR1 |
      VERT_BIT_FOG | VERT_BIT_TEX0 | VERT_BIT_TEX1 | VERT_BIT_TEX2 |
      VERT_BIT_TEX3 | VERT_BIT_TEX4 | VERT_BIT_TEX5)) != 0) {
      if (R200_DEBUG & RADEON_FALLBACKS) {
	 fprintf(stderr, "can't handle vert prog inputs 0x%x\n",
	    mesa_vp->Base.InputsRead);
      }
      return GL_FALSE;
   }
#endif

   if ((mesa_vp->Base.OutputsWritten &
      ~((1 << VARYING_SLOT_POS) | (1 << VARYING_SLOT_COL0) | (1 << VARYING_SLOT_COL1) |
      (1 << VARYING_SLOT_FOGC) | (1 << VARYING_SLOT_TEX0) | (1 << VARYING_SLOT_TEX1) |
      (1 << VARYING_SLOT_TEX2) | (1 << VARYING_SLOT_TEX3) | (1 << VARYING_SLOT_TEX4) |
      (1 << VARYING_SLOT_TEX5) | (1 << VARYING_SLOT_PSIZ))) != 0) {
      if (R200_DEBUG & RADEON_FALLBACKS) {
	 fprintf(stderr, "can't handle vert prog outputs 0x%llx\n",
                 (unsigned long long) mesa_vp->Base.OutputsWritten);
      }
      return GL_FALSE;
   }

   /* Initial value should be last tmp reg that hw supports.
      Strangely enough r300 doesnt mind even though these would be out of range.
      Smart enough to realize that it doesnt need it? */
   int u_temp_i = R200_VSF_MAX_TEMPS - 1;
   struct prog_src_register src[3];
   struct prog_dst_register dst;

/* FIXME: is changing the prog safe to do here? */
   if (mesa_vp->IsPositionInvariant &&
      /* make sure we only do this once */
       !(mesa_vp->Base.OutputsWritten & (1 << VARYING_SLOT_POS))) {
	 _mesa_insert_mvp_code(ctx, mesa_vp);
      }

   /* for fogc, can't change mesa_vp, as it would hose swtnl, and exp with
      base e isn't directly available neither. */
   if ((mesa_vp->Base.OutputsWritten & (1 << VARYING_SLOT_FOGC)) && !vp->fogpidx) {
      struct gl_program_parameter_list *paramList;
      gl_state_index tokens[STATE_LENGTH] = { STATE_FOG_PARAMS, 0, 0, 0, 0 };
      paramList = mesa_vp->Base.Parameters;
      vp->fogpidx = _mesa_add_state_reference(paramList, tokens);
   }

   vp->pos_end = 0;
   mesa_vp->Base.NumNativeInstructions = 0;
   if (mesa_vp->Base.Parameters)
      mesa_vp->Base.NumNativeParameters = mesa_vp->Base.Parameters->NumParameters;
   else
      mesa_vp->Base.NumNativeParameters = 0;

   for(i = 0; i < VERT_ATTRIB_MAX; i++)
      vp->inputs[i] = -1;
   for(i = 0; i < 15; i++)
      vp->inputmap_rev[i] = 255;
   free_inputs = 0x2ffd;

/* fglrx uses fixed inputs as follows for conventional attribs.
   generic attribs use non-fixed assignment, fglrx will always use the
   lowest attrib values available. We'll just do the same.
   There are 12 generic attribs possible, corresponding to attrib 0, 2-11
   and 13 in a hw vertex prog.
   attr 1 and 12 aren't used for generic attribs as those cannot be made vec4
   (correspond to vertex normal/weight - maybe weight actually could be made vec4).
   Additionally, not more than 12 arrays in total are possible I think.
   attr 0 is pos, R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0
   attr 2-5 use colors 0-3 (R200_VTX_FP_RGBA << R200_VTX_COLOR_0/1/2/3_SHIFT in R200_SE_VTX_FMT_0)
   attr 6-11 use tex 0-5 (4 << R200_VTX_TEX0/1/2/3/4/5_COMP_CNT_SHIFT in R200_SE_VTX_FMT_1)
   attr 13 uses vtx1 pos (R200_VTX_XY1|R200_VTX_Z1|R200_VTX_W1 in R200_SE_VTX_FMT_0)
*/

/* attr 4,5 and 13 are only used with generic attribs.
   Haven't seen attr 14 used, maybe that's for the hw pointsize vec1 (which is
   not possibe to use with vertex progs as it is lacking in vert prog specification) */
/* may look different when using idx buf / input_route instead of se_vtx_fmt? */
   if (mesa_vp->Base.InputsRead & VERT_BIT_POS) {
      vp->inputs[VERT_ATTRIB_POS] = 0;
      vp->inputmap_rev[0] = VERT_ATTRIB_POS;
      free_inputs &= ~(1 << 0);
      array_count++;
   }
   if (mesa_vp->Base.InputsRead & VERT_BIT_WEIGHT) {
      vp->inputs[VERT_ATTRIB_WEIGHT] = 12;
      vp->inputmap_rev[1] = VERT_ATTRIB_WEIGHT;
      array_count++;
   }
   if (mesa_vp->Base.InputsRead & VERT_BIT_NORMAL) {
      vp->inputs[VERT_ATTRIB_NORMAL] = 1;
      vp->inputmap_rev[2] = VERT_ATTRIB_NORMAL;
      array_count++;
   }
   if (mesa_vp->Base.InputsRead & VERT_BIT_COLOR0) {
      vp->inputs[VERT_ATTRIB_COLOR0] = 2;
      vp->inputmap_rev[4] = VERT_ATTRIB_COLOR0;
      free_inputs &= ~(1 << 2);
      array_count++;
   }
   if (mesa_vp->Base.InputsRead & VERT_BIT_COLOR1) {
      vp->inputs[VERT_ATTRIB_COLOR1] = 3;
      vp->inputmap_rev[5] = VERT_ATTRIB_COLOR1;
      free_inputs &= ~(1 << 3);
      array_count++;
   }
   if (mesa_vp->Base.InputsRead & VERT_BIT_FOG) {
      vp->inputs[VERT_ATTRIB_FOG] = 15; array_count++;
      vp->inputmap_rev[3] = VERT_ATTRIB_FOG;
      array_count++;
   }
   /* VERT_ATTRIB_TEX0-5 */
   for (i = 0; i <= 5; i++) {
      if (mesa_vp->Base.InputsRead & VERT_BIT_TEX(i)) {
	 vp->inputs[VERT_ATTRIB_TEX(i)] = i + 6;
	 vp->inputmap_rev[8 + i] = VERT_ATTRIB_TEX(i);
	 free_inputs &= ~(1 << (i + 6));
	 array_count++;
      }
   }
   /* using VERT_ATTRIB_TEX6/7 would be illegal */
   for (; i < VERT_ATTRIB_TEX_MAX; i++) {
      if (mesa_vp->Base.InputsRead & VERT_BIT_TEX(i)) {
          if (R200_DEBUG & RADEON_FALLBACKS) {
              fprintf(stderr, "texture attribute %d in vert prog\n", i);
          }
          return GL_FALSE;
      }
   }
   /* completely ignore aliasing? */
   for (i = 0; i < VERT_ATTRIB_GENERIC_MAX; i++) {
      int j;
   /* completely ignore aliasing? */
      if (mesa_vp->Base.InputsRead & VERT_BIT_GENERIC(i)) {
	 array_count++;
	 if (array_count > 12) {
	    if (R200_DEBUG & RADEON_FALLBACKS) {
	       fprintf(stderr, "more than 12 attribs used in vert prog\n");
	    }
	    return GL_FALSE;
	 }
	 for (j = 0; j < 14; j++) {
	    /* will always find one due to limited array_count */
	    if (free_inputs & (1 << j)) {
	       free_inputs &= ~(1 << j);
	       vp->inputs[VERT_ATTRIB_GENERIC(i)] = j;
	       if (j == 0) {
                  /* mapped to pos */
                  vp->inputmap_rev[j] = VERT_ATTRIB_GENERIC(i);
	       } else if (j < 12) {
                  /* mapped to col/tex */
                  vp->inputmap_rev[j + 2] = VERT_ATTRIB_GENERIC(i);
	       } else {
                  /* mapped to pos1 */
                  vp->inputmap_rev[j + 1] = VERT_ATTRIB_GENERIC(i);
               }
	       break;
	    }
	 }
      }
   }

   if (!(mesa_vp->Base.OutputsWritten & (1 << VARYING_SLOT_POS))) {
      if (R200_DEBUG & RADEON_FALLBACKS) {
	 fprintf(stderr, "can't handle vert prog without position output\n");
      }
      return GL_FALSE;
   }
   if (free_inputs & 1) {
      if (R200_DEBUG & RADEON_FALLBACKS) {
	 fprintf(stderr, "can't handle vert prog without position input\n");
      }
      return GL_FALSE;
   }

   o_inst = vp->instr;
   for (vpi = mesa_vp->Base.Instructions; vpi->Opcode != OPCODE_END; vpi++, o_inst++){
      operands = op_operands(vpi->Opcode);
      are_srcs_scalar = operands & SCALAR_FLAG;
      operands &= OP_MASK;

      for(i = 0; i < operands; i++) {
	 src[i] = vpi->SrcReg[i];
	 /* hack up default attrib values as per spec as swizzling.
	    normal, fog, secondary color. Crazy?
	    May need more if we don't submit vec4 elements? */
	 if (src[i].File == PROGRAM_INPUT) {
	    if (src[i].Index == VERT_ATTRIB_NORMAL) {
	       int j;
	       for (j = 0; j < 4; j++) {
		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
		  }
	       }
	    }
	    else if (src[i].Index == VERT_ATTRIB_COLOR1) {
	       int j;
	       for (j = 0; j < 4; j++) {
		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
		  }
	       }
	    }
	    else if (src[i].Index == VERT_ATTRIB_FOG) {
	       int j;
	       for (j = 0; j < 4; j++) {
		  if (GET_SWZ(src[i].Swizzle, j) == SWIZZLE_W) {
		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
		     src[i].Swizzle |= SWIZZLE_ONE << (j*3);
		  }
		  else if ((GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Y) ||
			    GET_SWZ(src[i].Swizzle, j) == SWIZZLE_Z) {
		     src[i].Swizzle &= ~(SWIZZLE_W << (j*3));
		     src[i].Swizzle |= SWIZZLE_ZERO << (j*3);
		  }
	       }
	    }
	 }
      }

      if(operands == 3){
	 if( CMP_SRCS(src[1], src[2]) || CMP_SRCS(src[0], src[2]) ){
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
		VSF_FLAG_ALL);

	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[2]),
		  SWIZZLE_X, SWIZZLE_Y,
		  SWIZZLE_Z, SWIZZLE_W,
		  t_src_class(src[2].File), VSF_FLAG_NONE) | (src[2].RelAddr << 4);

	    o_inst->src1 = ZERO_SRC_0;
	    o_inst->src2 = UNUSED_SRC_1;
	    o_inst++;

	    src[2].File = PROGRAM_TEMPORARY;
	    src[2].Index = u_temp_i;
	    src[2].RelAddr = 0;
	    u_temp_i--;
	 }
      }

      if(operands >= 2){
	 if( CMP_SRCS(src[1], src[0]) ){
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
		(u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
		VSF_FLAG_ALL);

	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		  SWIZZLE_X, SWIZZLE_Y,
		  SWIZZLE_Z, SWIZZLE_W,
		  t_src_class(src[0].File), VSF_FLAG_NONE) | (src[0].RelAddr << 4);

	    o_inst->src1 = ZERO_SRC_0;
	    o_inst->src2 = UNUSED_SRC_1;
	    o_inst++;

	    src[0].File = PROGRAM_TEMPORARY;
	    src[0].Index = u_temp_i;
	    src[0].RelAddr = 0;
	    u_temp_i--;
	 }
      }

      dst = vpi->DstReg;
      if (dst.File == PROGRAM_OUTPUT &&
	  dst.Index == VARYING_SLOT_FOGC &&
	  dst.WriteMask & WRITEMASK_X) {
	  fog_temp_i = u_temp_i;
	  dst.File = PROGRAM_TEMPORARY;
	  dst.Index = fog_temp_i;
	  dofogfix = 1;
	  u_temp_i--;
      }

      /* These ops need special handling. */
      switch(vpi->Opcode){
      case OPCODE_POW:
/* pow takes only one argument, first scalar is in slot x, 2nd in slot z (other slots don't matter).
   So may need to insert additional instruction */
	 if ((src[0].File == src[1].File) &&
	     (src[0].Index == src[1].Index)) {
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
		   t_dst_mask(dst.WriteMask));
	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
		   SWIZZLE_ZERO,
		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
		   SWIZZLE_ZERO,
		   t_src_class(src[0].File),
		   src[0].Negate) | (src[0].RelAddr << 4);
	    o_inst->src1 = UNUSED_SRC_0;
	    o_inst->src2 = UNUSED_SRC_0;
	 }
	 else {
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
		   (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
		   VSF_FLAG_ALL);
	    o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		   t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
		   SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO,
		   t_src_class(src[0].File),
		   src[0].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
	    o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
		   SWIZZLE_ZERO, SWIZZLE_ZERO,
		   t_swizzle(GET_SWZ(src[1].Swizzle, 0)), SWIZZLE_ZERO,
		   t_src_class(src[1].File),
		   src[1].Negate ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
	    o_inst->src2 = UNUSED_SRC_1;
	    o_inst++;

	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_POW, t_dst(&dst),
		   t_dst_mask(dst.WriteMask));
	    o_inst->src0 = MAKE_VSF_SOURCE(u_temp_i,
		   VSF_IN_COMPONENT_X,
		   VSF_IN_COMPONENT_Y,
		   VSF_IN_COMPONENT_Z,
		   VSF_IN_COMPONENT_W,
		   VSF_IN_CLASS_TMP,
		   VSF_FLAG_NONE);
	    o_inst->src1 = UNUSED_SRC_0;
	    o_inst->src2 = UNUSED_SRC_0;
	    u_temp_i--;
	 }
	 goto next;

      case OPCODE_MOV://ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{} {ZERO ZERO ZERO ZERO} 
      case OPCODE_SWZ:
	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
		t_dst_mask(dst.WriteMask));
	 o_inst->src0 = t_src(vp, &src[0]);
	 o_inst->src1 = ZERO_SRC_0;
	 o_inst->src2 = UNUSED_SRC_1;
	 goto next;

      case OPCODE_MAD:
	 /* only 2 read ports into temp memory thus may need the macro op MAD_2
	    instead (requiring 2 clocks) if all inputs are in temp memory
	    (and, only if they actually reference 3 distinct temps) */
	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
	    src[1].File == PROGRAM_TEMPORARY &&
	    src[2].File == PROGRAM_TEMPORARY &&
	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index)) &&
	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[2].RelAddr << 8) | src[2].Index)) &&
	    (((src[1].RelAddr << 8) | src[1].Index) != ((src[2].RelAddr << 8) | src[2].Index))) ?
	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;

	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
	    t_dst_mask(dst.WriteMask));
	 o_inst->src0 = t_src(vp, &src[0]);
#if 0
if ((o_inst - vp->instr) == 31) {
/* fix up the broken vertex program of quake4 demo... */
o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
			SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X,
			t_src_class(src[1].File),
			src[1].Negate) | (src[1].RelAddr << 4);
o_inst->src2 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
			SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y,
			t_src_class(src[1].File),
			src[1].Negate) | (src[1].RelAddr << 4);
}
else {
	 o_inst->src1 = t_src(vp, &src[1]);
	 o_inst->src2 = t_src(vp, &src[2]);
}
#else
	 o_inst->src1 = t_src(vp, &src[1]);
	 o_inst->src2 = t_src(vp, &src[2]);
#endif
	 goto next;

      case OPCODE_DP3://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ZERO} PARAM 0{} {X Y Z ZERO} 
	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
		t_dst_mask(dst.WriteMask));

	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
		SWIZZLE_ZERO,
		t_src_class(src[0].File),
		src[0].Negate) | (src[0].RelAddr << 4);

	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
		SWIZZLE_ZERO,
		t_src_class(src[1].File),
		src[1].Negate) | (src[1].RelAddr << 4);

	 o_inst->src2 = UNUSED_SRC_1;
	 goto next;

      case OPCODE_DPH://DOT RESULT 1.X Y Z W PARAM 0{} {X Y Z ONE} PARAM 0{} {X Y Z W} 
	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_DOT, t_dst(&dst),
		t_dst_mask(dst.WriteMask));

	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
		VSF_IN_COMPONENT_ONE,
		t_src_class(src[0].File),
		src[0].Negate) | (src[0].RelAddr << 4);
	 o_inst->src1 = t_src(vp, &src[1]);
	 o_inst->src2 = UNUSED_SRC_1;
	 goto next;

      case OPCODE_SUB://ADD RESULT 1.X Y Z W TMP 0{} {X Y Z W} PARAM 1{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
		t_dst_mask(dst.WriteMask));

	 o_inst->src0 = t_src(vp, &src[0]);
	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
		t_swizzle(GET_SWZ(src[1].Swizzle, 0)),
		t_swizzle(GET_SWZ(src[1].Swizzle, 1)),
		t_swizzle(GET_SWZ(src[1].Swizzle, 2)),
		t_swizzle(GET_SWZ(src[1].Swizzle, 3)),
		t_src_class(src[1].File),
		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);
	 o_inst->src2 = UNUSED_SRC_1;
	 goto next;

      case OPCODE_ABS://MAX RESULT 1.X Y Z W PARAM 0{} {X Y Z W} PARAM 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W
	 o_inst->op=MAKE_VSF_OP(R200_VPI_OUT_OP_MAX, t_dst(&dst),
		t_dst_mask(dst.WriteMask));

	 o_inst->src0=t_src(vp, &src[0]);
	 o_inst->src1=MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		t_swizzle(GET_SWZ(src[0].Swizzle, 0)),
		t_swizzle(GET_SWZ(src[0].Swizzle, 1)),
		t_swizzle(GET_SWZ(src[0].Swizzle, 2)),
		t_swizzle(GET_SWZ(src[0].Swizzle, 3)),
		t_src_class(src[0].File),
		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[0].RelAddr << 4);
	 o_inst->src2 = UNUSED_SRC_1;
	 goto next;

      case OPCODE_FLR:
      /* FRC TMP 0.X Y Z W PARAM 0{} {X Y Z W} 
         ADD RESULT 1.X Y Z W PARAM 0{} {X Y Z W} TMP 0{X Y Z W } {X Y Z W} neg Xneg Yneg Zneg W */

	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_FRC,
	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
	    t_dst_mask(dst.WriteMask));

	 o_inst->src0 = t_src(vp, &src[0]);
	 o_inst->src1 = UNUSED_SRC_0;
	 o_inst->src2 = UNUSED_SRC_1;
	 o_inst++;

	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD, t_dst(&dst),
		t_dst_mask(dst.WriteMask));

	 o_inst->src0 = t_src(vp, &src[0]);
	 o_inst->src1 = MAKE_VSF_SOURCE(u_temp_i,
		VSF_IN_COMPONENT_X,
		VSF_IN_COMPONENT_Y,
		VSF_IN_COMPONENT_Z,
		VSF_IN_COMPONENT_W,
		VSF_IN_CLASS_TMP,
		/* Not 100% sure about this */
		(!src[0].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE/*VSF_FLAG_ALL*/);

	 o_inst->src2 = UNUSED_SRC_0;
	 u_temp_i--;
	 goto next;

      case OPCODE_XPD:
	 /* mul r0, r1.yzxw, r2.zxyw
	    mad r0, -r2.yzxw, r1.zxyw, r0
	  */
	 hw_op=(src[0].File == PROGRAM_TEMPORARY &&
	    src[1].File == PROGRAM_TEMPORARY &&
	    (((src[0].RelAddr << 8) | src[0].Index) != ((src[1].RelAddr << 8) | src[1].Index))) ?
	    R200_VPI_OUT_OP_MAD_2 : R200_VPI_OUT_OP_MAD;

	 o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
	    (u_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
	    t_dst_mask(dst.WriteMask));

	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
		t_src_class(src[0].File),
		src[0].Negate) | (src[0].RelAddr << 4);

	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
		t_src_class(src[1].File),
		src[1].Negate) | (src[1].RelAddr << 4);

	 o_inst->src2 = UNUSED_SRC_1;
	 o_inst++;
	 u_temp_i--;

	 o_inst->op = MAKE_VSF_OP(hw_op, t_dst(&dst),
		t_dst_mask(dst.WriteMask));

	 o_inst->src0 = MAKE_VSF_SOURCE(t_src_index(vp, &src[1]),
		t_swizzle(GET_SWZ(src[1].Swizzle, 1)), // y
		t_swizzle(GET_SWZ(src[1].Swizzle, 2)), // z
		t_swizzle(GET_SWZ(src[1].Swizzle, 0)), // x
		t_swizzle(GET_SWZ(src[1].Swizzle, 3)), // w
		t_src_class(src[1].File),
		(!src[1].Negate) ? VSF_FLAG_ALL : VSF_FLAG_NONE) | (src[1].RelAddr << 4);

	 o_inst->src1 = MAKE_VSF_SOURCE(t_src_index(vp, &src[0]),
		t_swizzle(GET_SWZ(src[0].Swizzle, 2)), // z
		t_swizzle(GET_SWZ(src[0].Swizzle, 0)), // x
		t_swizzle(GET_SWZ(src[0].Swizzle, 1)), // y
		t_swizzle(GET_SWZ(src[0].Swizzle, 3)), // w
		t_src_class(src[0].File),
		src[0].Negate) | (src[0].RelAddr << 4);

	 o_inst->src2 = MAKE_VSF_SOURCE(u_temp_i+1,
		VSF_IN_COMPONENT_X,
		VSF_IN_COMPONENT_Y,
		VSF_IN_COMPONENT_Z,
		VSF_IN_COMPONENT_W,
		VSF_IN_CLASS_TMP,
		VSF_FLAG_NONE);
	 goto next;

      case OPCODE_END:
	 assert(0);
      default:
	 break;
      }

      o_inst->op = MAKE_VSF_OP(t_opcode(vpi->Opcode), t_dst(&dst),
	    t_dst_mask(dst.WriteMask));

      if(are_srcs_scalar){
	 switch(operands){
	    case 1:
		o_inst->src0 = t_src_scalar(vp, &src[0]);
		o_inst->src1 = UNUSED_SRC_0;
		o_inst->src2 = UNUSED_SRC_1;
	    break;

	    case 2:
		o_inst->src0 = t_src_scalar(vp, &src[0]);
		o_inst->src1 = t_src_scalar(vp, &src[1]);
		o_inst->src2 = UNUSED_SRC_1;
	    break;

	    case 3:
		o_inst->src0 = t_src_scalar(vp, &src[0]);
		o_inst->src1 = t_src_scalar(vp, &src[1]);
		o_inst->src2 = t_src_scalar(vp, &src[2]);
	    break;

	    default:
		fprintf(stderr, "illegal number of operands %lu\n", operands);
		exit(-1);
	    break;
	 }
      } else {
	 switch(operands){
	    case 1:
		o_inst->src0 = t_src(vp, &src[0]);
		o_inst->src1 = UNUSED_SRC_0;
		o_inst->src2 = UNUSED_SRC_1;
	    break;

	    case 2:
		o_inst->src0 = t_src(vp, &src[0]);
		o_inst->src1 = t_src(vp, &src[1]);
		o_inst->src2 = UNUSED_SRC_1;
	    break;

	    case 3:
		o_inst->src0 = t_src(vp, &src[0]);
		o_inst->src1 = t_src(vp, &src[1]);
		o_inst->src2 = t_src(vp, &src[2]);
	    break;

	    default:
		fprintf(stderr, "illegal number of operands %lu\n", operands);
		exit(-1);
	    break;
	 }
      }
      next:

      if (dofogfix) {
	 o_inst++;
	 if (vp->fogmode == GL_EXP) {
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
		VSF_FLAG_X);
	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
	    o_inst->src2 = UNUSED_SRC_1;
	    o_inst++;
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
		R200_VSF_OUT_CLASS_RESULT_FOGC,
		VSF_FLAG_X);
	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
	    o_inst->src1 = UNUSED_SRC_0;
	    o_inst->src2 = UNUSED_SRC_1;
	 }
	 else if (vp->fogmode == GL_EXP2) {
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
		VSF_FLAG_X);
	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, X, X, X, X, PARAM, NONE);
	    o_inst->src2 = UNUSED_SRC_1;
	    o_inst++;
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
		VSF_FLAG_X);
	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
	    o_inst->src1 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
	    o_inst->src2 = UNUSED_SRC_1;
	    o_inst++;
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_EXP_E,
		R200_VSF_OUT_CLASS_RESULT_FOGC,
		VSF_FLAG_X);
	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
	    o_inst->src1 = UNUSED_SRC_0;
	    o_inst->src2 = UNUSED_SRC_1;
	 }
	 else { /* fogmode == GL_LINEAR */
		/* could do that with single op (dot) if using params like
		   with fixed function pipeline fog */
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_ADD,
		(fog_temp_i << R200_VPI_OUT_REG_INDEX_SHIFT) | R200_VSF_OUT_CLASS_TMP,
		VSF_FLAG_X);
	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, ALL);
	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, Z, Z, Z, Z, PARAM, NONE);
	    o_inst->src2 = UNUSED_SRC_1;
	    o_inst++;
	    o_inst->op = MAKE_VSF_OP(R200_VPI_OUT_OP_MUL,
		R200_VSF_OUT_CLASS_RESULT_FOGC,
		VSF_FLAG_X);
	    o_inst->src0 = EASY_VSF_SOURCE(fog_temp_i, X, X, X, X, TMP, NONE);
	    o_inst->src1 = EASY_VSF_SOURCE(vp->fogpidx, W, W, W, W, PARAM, NONE);
	    o_inst->src2 = UNUSED_SRC_1;

	 }
         dofogfix = 0;
      }

      u_temp_used = (R200_VSF_MAX_TEMPS - 1) - u_temp_i;
      if (mesa_vp->Base.NumNativeTemporaries <
	 (mesa_vp->Base.NumTemporaries + u_temp_used)) {
	 mesa_vp->Base.NumNativeTemporaries =
	    mesa_vp->Base.NumTemporaries + u_temp_used;
      }
      if ((mesa_vp->Base.NumTemporaries + u_temp_used) > R200_VSF_MAX_TEMPS) {
	 if (R200_DEBUG & RADEON_FALLBACKS) {
	    fprintf(stderr, "Ran out of temps, num temps %d, us %d\n", mesa_vp->Base.NumTemporaries, u_temp_used);
	 }
	 return GL_FALSE;
      }
      u_temp_i = R200_VSF_MAX_TEMPS - 1;
      if(o_inst - vp->instr >= R200_VSF_MAX_INST) {
	 mesa_vp->Base.NumNativeInstructions = 129;
	 if (R200_DEBUG & RADEON_FALLBACKS) {
	    fprintf(stderr, "more than 128 native instructions\n");
	 }
	 return GL_FALSE;
      }
      if ((o_inst->op & R200_VSF_OUT_CLASS_MASK) == R200_VSF_OUT_CLASS_RESULT_POS) {
	 vp->pos_end = (o_inst - vp->instr);
      }
   }

   vp->native = GL_TRUE;
   mesa_vp->Base.NumNativeInstructions = (o_inst - vp->instr);
#if 0
   fprintf(stderr, "hw program:\n");
   for(i=0; i < vp->program.length; i++)
      fprintf(stderr, "%08x\n", vp->instr[i]);
#endif
   return GL_TRUE;
}