static INLINE void
emit_elt32(void* priv, unsigned start, unsigned vc)
{
	struct push_context* ctx = priv;
	struct nouveau_channel *chan = ctx->chan;
	uint32_t *elts = (uint32_t *)ctx->idxbuf + start;
	int idxbias = ctx->idxbias;

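	/* split the indices into NI packets of at most 2047 words each; when an
	 * index bias is set it is applied on the CPU while copying */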
	while (vc) {
		unsigned push = MIN2(vc, 2047);

		OUT_RING(chan, RING_3D_NI(NV30_3D_VB_ELEMENT_U32, push));
		assert(AVAIL_RING(chan) >= push);
		if(idxbias)
		{
			for(unsigned i = 0; i < push; ++i)
				OUT_RING(chan, elts[i] + idxbias);
		}
		else
			OUT_RINGp(chan, elts, push);

		vc -= push;
		elts += push;
	}
}
Example #2
static void
nvc0_m2mf_push_rect(struct pipe_screen *pscreen,
                    const struct nv50_m2mf_rect *dst,
                    const void *data,
                    unsigned nblocksx, unsigned nblocksy)
{
   /* assumption: the pasted snippet left chan uninitialized */
   struct nouveau_channel *chan = nouveau_screen(pscreen)->channel;
   const uint8_t *src = (const uint8_t *)data;
   const int cpp = dst->cpp;
   const int line_len = nblocksx * cpp;
   int dy = dst->y;

   assert(nouveau_bo_tile_layout(dst->bo));

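   /* program the tiled destination layout once; the per-chunk commands in the
    * loop below only update the target position and line count */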
   BEGIN_RING(chan, RING_MF(TILING_MODE_OUT), 5);
   OUT_RING  (chan, dst->tile_mode);
   OUT_RING  (chan, dst->width * cpp);
   OUT_RING  (chan, dst->height);
   OUT_RING  (chan, dst->depth);
   OUT_RING  (chan, dst->z);

   while (nblocksy) {
      int line_count, words;
      int size = MIN2(AVAIL_RING(chan), NV04_PFIFO_MAX_PACKET_LEN);

      /* leave room for the 12 command words emitted for each chunk below */
      if (size < (12 + 1)) {
         FIRE_RING(chan);
         continue;
      }
      line_count = ((size - 12) * 4) / line_len;
      line_count = MIN2(line_count, (int)nblocksy);
      words = (line_count * line_len + 3) / 4;

      BEGIN_RING(chan, RING_MF(OFFSET_OUT_HIGH), 2);
      OUT_RELOCh(chan, dst->bo, dst->base, dst->domain | NOUVEAU_BO_WR);
      OUT_RELOCl(chan, dst->bo, dst->base, dst->domain | NOUVEAU_BO_WR);

      BEGIN_RING(chan, RING_MF(TILING_POSITION_OUT_X), 2);
      OUT_RING  (chan, dst->x * cpp);
      OUT_RING  (chan, dy);
      BEGIN_RING(chan, RING_MF(LINE_LENGTH_IN), 2);
      OUT_RING  (chan, line_len);
      OUT_RING  (chan, line_count);
      BEGIN_RING(chan, RING_MF(EXEC), 1);
      OUT_RING  (chan, (1 << NVC0_M2MF_EXEC_INC__SHIFT) |
                 NVC0_M2MF_EXEC_PUSH | NVC0_M2MF_EXEC_LINEAR_IN);

      BEGIN_RING_NI(chan, RING_MF(DATA), words);
      OUT_RINGp (chan, src, words);

      dy += line_count;
      src += line_len * line_count;
      nblocksy -= line_count;
   }
}
static void
nvfx_render_flush(struct draw_stage *stage, unsigned flags)
{
	struct nvfx_render_stage *rs = nvfx_render_stage(stage);
	struct nvfx_context *nvfx = rs->nvfx;
	struct nouveau_channel *chan = nvfx->screen->base.channel;

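	/* if a primitive is still open, close it with the BEGIN_END STOP value */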
	if (rs->prim != NV30_3D_VERTEX_BEGIN_END_STOP) {
		assert(AVAIL_RING(chan) >= 2);
		OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
		OUT_RING(chan, NV30_3D_VERTEX_BEGIN_END_STOP);
		rs->prim = NV30_3D_VERTEX_BEGIN_END_STOP;
	}
}
static void
nv50_draw_arrays_instanced(struct pipe_context *pipe,
			   unsigned mode, unsigned start, unsigned count,
			   unsigned startInstance, unsigned instanceCount)
{
	struct nv50_context *nv50 = nv50_context(pipe);
	struct nouveau_channel *chan = nv50->screen->tesla->channel;
	struct nouveau_grobj *tesla = nv50->screen->tesla;
	struct instance a[16];
	unsigned prim = nv50_prim(mode);

	instance_init(nv50, a, startInstance);
	if (!nv50_state_validate(nv50, 10 + 16*3))
		return;

	if (nv50->vbo_fifo) {
		nv50_push_elements_instanced(pipe, NULL, 0, 0, mode, start,
					     count, startInstance,
					     instanceCount);
		return;
	}

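	/* write the starting instance index into the AUX constant buffer:
	 * CB_ADDR selects buffer and position, the following word is the data */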
	BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
	OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
	OUT_RING  (chan, startInstance);
	while (instanceCount--) {
		if (AVAIL_RING(chan) < (7 + 16*3)) {
			FIRE_RING(chan);
			if (!nv50_state_validate(nv50, 7 + 16*3)) {
				assert(0);
				return;
			}
		}
		instance_step(nv50, a);

		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
		OUT_RING  (chan, prim);
		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BUFFER_FIRST, 2);
		OUT_RING  (chan, start);
		OUT_RING  (chan, count);
		BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
		OUT_RING  (chan, 0);

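		/* all instances after the first set the non-zero-instance bit */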
		prim |= (1 << 28);
	}
}
Example #5
void
nvc0_m2mf_push_linear(struct nouveau_context *nv,
                      struct nouveau_bo *dst, unsigned offset, unsigned domain,
                      unsigned size, void *data)
{
   struct nouveau_channel *chan = nv->screen->channel;
   uint32_t *src = (uint32_t *)data;
   unsigned count = (size + 3) / 4;

   MARK_RING (chan, 8, 2);

   BEGIN_RING(chan, RING_MF(OFFSET_OUT_HIGH), 2);
   OUT_RELOCh(chan, dst, offset, domain | NOUVEAU_BO_WR);
   OUT_RELOCl(chan, dst, offset, domain | NOUVEAU_BO_WR);
   BEGIN_RING(chan, RING_MF(LINE_LENGTH_IN), 2);
   OUT_RING  (chan, size);
   OUT_RING  (chan, 1);
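   /* launch the transfer; the data is pushed inline through the DATA method
    * packets emitted in the loop below */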
   BEGIN_RING(chan, RING_MF(EXEC), 1);
   OUT_RING  (chan, 0x100111);

   while (count) {
      unsigned nr = AVAIL_RING(chan);

      if (nr < 9) {
         FIRE_RING(chan);
         nouveau_bo_validate(chan, dst, NOUVEAU_BO_WR);
         continue;
      }
      nr = MIN2(count, nr - 1);
      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);
   
      BEGIN_RING_NI(chan, RING_MF(DATA), nr);
      OUT_RINGp (chan, src, nr);

      src += nr;
      count -= nr;
   }
}
Example #6
void
nvc0_cb_push(struct nouveau_context *nv,
             struct nouveau_bo *bo, unsigned domain,
             unsigned base, unsigned size,
             unsigned offset, unsigned words, const uint32_t *data)
{
   struct nouveau_channel *chan = nv->screen->channel;

   assert(!(offset & 3));
   size = align(size, 0x100);

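   /* bind the destination range: CB_SIZE followed by the buffer address relocs */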
   MARK_RING (chan, 16, 2);
   BEGIN_RING(chan, RING_3D(CB_SIZE), 3);
   OUT_RING  (chan, size);
   OUT_RELOCh(chan, bo, base, domain | NOUVEAU_BO_WR);
   OUT_RELOCl(chan, bo, base, domain | NOUVEAU_BO_WR);

   while (words) {
      unsigned nr = AVAIL_RING(chan);
      nr = MIN2(nr, words);
      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN - 1);

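      /* increment-once packet: the first word sets CB_POS, the remaining data
       * words all land on the following method */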
      BEGIN_RING_1I(chan, RING_3D(CB_POS), nr + 1);
      OUT_RING  (chan, offset);
      OUT_RINGp (chan, data, nr);

      words -= nr;
      data += nr;
      offset += nr * 4;

      if (words) {
         MARK_RING(chan, 6, 1);
         nouveau_bo_validate(chan, bo, domain | NOUVEAU_BO_WR);
      }
   }
}
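A minimal usage sketch for nvc0_cb_push; the buffer object, domain, sizes and data below are assumptions for illustration, not taken from the driver:

static void
example_upload_uniforms(struct nouveau_context *nv, struct nouveau_bo *cb_bo,
                        const uint32_t *values)
{
   /* hypothetical call: push 16 words at byte offset 0x40 of a constant
    * buffer whose bound size is 0x200 bytes and which lives in VRAM */
   nvc0_cb_push(nv, cb_bo, NOUVEAU_BO_VRAM, 0, 0x200, 0x40, 16, values);
}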
Example #7
void
nvc0_m2mf_push_linear(struct nouveau_context *nv,
                      struct nouveau_bo *dst, unsigned offset, unsigned domain,
                      unsigned size, const void *data)
{
   struct nouveau_channel *chan = nv->screen->channel;
   const uint32_t *src = (const uint32_t *)data;
   unsigned count = (size + 3) / 4;

   while (count) {
      unsigned nr;

      MARK_RING (chan, 16, 2);

      nr = AVAIL_RING(chan) - 9;
      nr = MIN2(count, nr);
      nr = MIN2(nr, NV04_PFIFO_MAX_PACKET_LEN);

      BEGIN_RING(chan, RING_MF(OFFSET_OUT_HIGH), 2);
      OUT_RELOCh(chan, dst, offset, domain | NOUVEAU_BO_WR);
      OUT_RELOCl(chan, dst, offset, domain | NOUVEAU_BO_WR);
      BEGIN_RING(chan, RING_MF(LINE_LENGTH_IN), 2);
      OUT_RING  (chan, nr * 4);
      OUT_RING  (chan, 1);
      BEGIN_RING(chan, RING_MF(EXEC), 1);
      OUT_RING  (chan, 0x100111);

      /* must not be interrupted (trap on QUERY fence, 0x50 works however) */
      BEGIN_RING_NI(chan, RING_MF(DATA), nr);
      OUT_RINGp (chan, src, nr);

      count -= nr;
      src += nr;
      offset += nr * 4;
   }
}
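A similar sketch for the linear M2MF push path above; the buffer, domain and payload are assumed:

static void
example_push_words(struct nouveau_context *nv, struct nouveau_bo *bo)
{
   /* hypothetical payload; NOUVEAU_BO_GART is an arbitrary choice here */
   static const uint32_t data[4] = { 0x3f800000, 0, 0, 0x3f800000 };

   nvc0_m2mf_push_linear(nv, bo, 0, NOUVEAU_BO_GART, sizeof(data), data);
}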
static INLINE void
nvfx_render_prim(struct draw_stage *stage, struct prim_header *prim,
	       unsigned mode, unsigned count)
{
	struct nvfx_render_stage *rs = nvfx_render_stage(stage);
	struct nvfx_context *nvfx = rs->nvfx;

	struct nvfx_screen *screen = nvfx->screen;
	struct nouveau_channel *chan = screen->base.channel;
	boolean no_elements = nvfx->vertprog->draw_no_elements;
	unsigned num_attribs = nvfx->vertprog->draw_elements;

	/* we need to account for the flush here as well, even though it is done
	 * after this function
	 */
	if (AVAIL_RING(chan) < ((1 + count * num_attribs * 4) + 6 + 64)) {
		nvfx_render_flush(stage, 0);
		FIRE_RING(chan);
		nvfx_state_emit(nvfx);

		assert(AVAIL_RING(chan) >= ((1 + count * num_attribs * 4) + 6 + 64));
	}

	/* Switch primitive modes if necessary */
	if (rs->prim != mode) {
		if (rs->prim != NV30_3D_VERTEX_BEGIN_END_STOP) {
			OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
			OUT_RING(chan, NV30_3D_VERTEX_BEGIN_END_STOP);
		}

		/* XXX: repeating any command many times seems to (mostly) avoid corruption
		 * that would otherwise happen; this workaround seems to cause issues on
		 * nv3x, and also to be unneeded there */
		if(nvfx->is_nv4x)
		{
			int i;
			for(i = 0; i < 32; ++i)
			{
				OUT_RING(chan, RING_3D(0x1dac, 1));
				OUT_RING(chan, 0);
			}
		}

		OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
		OUT_RING  (chan, mode);
		rs->prim = mode;
	}

	OUT_RING(chan, RING_3D_NI(NV30_3D_VERTEX_DATA, num_attribs * 4 * count));
	if(no_elements) {
		OUT_RING(chan, 0);
		OUT_RING(chan, 0);
		OUT_RING(chan, 0);
		OUT_RING(chan, 0);
	} else {
		for (unsigned i = 0; i < count; ++i)
		{
			struct vertex_header* v = prim->v[i];
			/* TODO: disable divide where it's causing the problem, and remove this hack */
			OUT_RING(chan, fui(v->data[0][0] / v->data[0][3]));
			OUT_RING(chan, fui(v->data[0][1] / v->data[0][3]));
			OUT_RING(chan, fui(v->data[0][2] / v->data[0][3]));
			OUT_RING(chan, fui(1.0f / v->data[0][3]));
			OUT_RINGp(chan, &v->data[1][0], 4 * (num_attribs - 1));
		}
	}
}
void
nv50_constbufs_validate(struct nv50_context *nv50)
{
   struct nouveau_channel *chan = nv50->screen->base.channel;
   unsigned s;

   for (s = 0; s < 3; ++s) {
      struct nv04_resource *res;
      int i;
      unsigned p, b;

      if (s == PIPE_SHADER_FRAGMENT)
         p = NV50_3D_SET_PROGRAM_CB_PROGRAM_FRAGMENT;
      else
      if (s == PIPE_SHADER_GEOMETRY)
         p = NV50_3D_SET_PROGRAM_CB_PROGRAM_GEOMETRY;
      else
         p = NV50_3D_SET_PROGRAM_CB_PROGRAM_VERTEX;

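      /* handle each dirty constant buffer slot, lowest index first */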
      while (nv50->constbuf_dirty[s]) {
         struct nouveau_bo *bo;
         unsigned start = 0;
         unsigned words = 0;

         i = ffs(nv50->constbuf_dirty[s]) - 1;
         nv50->constbuf_dirty[s] &= ~(1 << i);

         res = nv04_resource(nv50->constbuf[s][i]);
         if (!res) {
            if (i != 0) {
               BEGIN_RING(chan, RING_3D(SET_PROGRAM_CB), 1);
               OUT_RING  (chan, (i << 8) | p | 0);
            }
            continue;
         }

         if (i == 0) {
            b = NV50_CB_PVP + s;

            /* always upload GL uniforms through CB DATA */
            bo = nv50->screen->uniforms;
            words = res->base.width0 / 4;
         } else {
            b = s * 16 + i;

            assert(0);

            if (!nouveau_resource_mapped_by_gpu(&res->base)) {
               nouveau_buffer_migrate(&nv50->base, res, NOUVEAU_BO_VRAM);

               BEGIN_RING(chan, RING_3D(CODE_CB_FLUSH), 1);
               OUT_RING  (chan, 0);
            }
            MARK_RING (chan, 6, 2);
            BEGIN_RING(chan, RING_3D(CB_DEF_ADDRESS_HIGH), 3);
            OUT_RESRCh(chan, res, 0, NOUVEAU_BO_RD);
            OUT_RESRCl(chan, res, 0, NOUVEAU_BO_RD);
            OUT_RING  (chan, (b << 16) | (res->base.width0 & 0xffff));
            BEGIN_RING(chan, RING_3D(SET_PROGRAM_CB), 1);
            OUT_RING  (chan, (b << 12) | (i << 8) | p | 1);

            bo = res->bo;

            nv50_bufctx_add_resident(nv50, NV50_BUFCTX_CONSTANT, res,
                                     res->domain | NOUVEAU_BO_RD);
         }

         if (words) {
            MARK_RING(chan, 8, 1);

            nouveau_bo_validate(chan, bo, res->domain | NOUVEAU_BO_WR);
         }

         while (words) {
            unsigned nr = AVAIL_RING(chan);

            if (nr < 16) {
               FIRE_RING(chan);
               nouveau_bo_validate(chan, bo, res->domain | NOUVEAU_BO_WR);
               continue;
            }
            nr = MIN2(MIN2(nr - 3, words), NV04_PFIFO_MAX_PACKET_LEN);

            BEGIN_RING(chan, RING_3D(CB_ADDR), 1);
            OUT_RING  (chan, (start << 8) | b);
            BEGIN_RING_NI(chan, RING_3D(CB_DATA(0)), nr);
            OUT_RINGp (chan, &res->data[start * 4], nr);

            start += nr;
            words -= nr;
         }
      }
   }
}
Example #10
void
nv50_push_elements_instanced(struct pipe_context *pipe,
                             struct pipe_resource *idxbuf,
                             unsigned idxsize, int idxbias,
                             unsigned mode, unsigned start, unsigned count,
                             unsigned i_start, unsigned i_count)
{
   struct nv50_context *nv50 = nv50_context(pipe);
   struct nouveau_grobj *tesla = nv50->screen->tesla;
   struct nouveau_channel *chan = tesla->channel;
   struct push_context ctx;
   const unsigned p_overhead = 4 + /* begin/end */
                               4; /* potential edgeflag enable/disable */
   const unsigned v_overhead = 1 + /* VERTEX_DATA packet header */
                               2; /* potential edgeflag modification */
   struct util_split_prim s;
   unsigned vtx_size;
   boolean nzi = FALSE;
   int i;

   ctx.nv50 = nv50;
   ctx.attr_nr = 0;
   ctx.idxbuf = NULL;
   ctx.vtx_size = 0;
   ctx.edgeflag = 0.5f;
   ctx.edgeflag_attr = nv50->vertprog->vp.edgeflag;

   /* map vertex buffers, determine vertex size */
   for (i = 0; i < nv50->vtxelt->num_elements; i++) {
      struct pipe_vertex_element *ve = &nv50->vtxelt->pipe[i];
      struct pipe_vertex_buffer *vb = &nv50->vtxbuf[ve->vertex_buffer_index];
      struct nouveau_bo *bo = nv50_resource(vb->buffer)->bo;
      unsigned size, nr_components, n;

      if (!(nv50->vbo_fifo & (1 << i)))
         continue;
      n = ctx.attr_nr++;

      if (nouveau_bo_map(bo, NOUVEAU_BO_RD)) {
         assert(bo->map);
         return;
      }
      ctx.attr[n].map = (uint8_t *)bo->map + vb->buffer_offset + ve->src_offset;
      nouveau_bo_unmap(bo);

      ctx.attr[n].stride = vb->stride;
      ctx.attr[n].divisor = ve->instance_divisor;
      if (ctx.attr[n].divisor) {
         ctx.attr[n].step = i_start % ve->instance_divisor;
         ctx.attr[n].map = (uint8_t *)ctx.attr[n].map + i_start * vb->stride;
      }

      size = util_format_get_component_bits(ve->src_format,
                                            UTIL_FORMAT_COLORSPACE_RGB, 0);
      nr_components = util_format_get_nr_components(ve->src_format);
      switch (size) {
      case 8:
         switch (nr_components) {
         case 1: ctx.attr[n].push = emit_b08_1; break;
         case 2: ctx.attr[n].push = emit_b16_1; break;
         case 3: ctx.attr[n].push = emit_b08_3; break;
         case 4: ctx.attr[n].push = emit_b32_1; break;
         }
         ctx.vtx_size++;
         break;
      case 16:
         switch (nr_components) {
         case 1: ctx.attr[n].push = emit_b16_1; break;
         case 2: ctx.attr[n].push = emit_b32_1; break;
         case 3: ctx.attr[n].push = emit_b16_3; break;
         case 4: ctx.attr[n].push = emit_b32_2; break;
         }
         ctx.vtx_size += (nr_components + 1) >> 1;
         break;
      case 32:
         switch (nr_components) {
         case 1: ctx.attr[n].push = emit_b32_1; break;
         case 2: ctx.attr[n].push = emit_b32_2; break;
         case 3: ctx.attr[n].push = emit_b32_3; break;
         case 4: ctx.attr[n].push = emit_b32_4; break;
         }
         ctx.vtx_size += nr_components;
         break;
      default:
         assert(0);
         return;
      }
   }
   vtx_size = ctx.vtx_size + v_overhead;

   /* map index buffer, if present */
   if (idxbuf) {
      struct nouveau_bo *bo = nv50_resource(idxbuf)->bo;

      if (nouveau_bo_map(bo, NOUVEAU_BO_RD)) {
         assert(bo->map);
         return;
      }
      ctx.idxbuf = bo->map;
      ctx.idxbias = idxbias;
      ctx.idxsize = idxsize;
      nouveau_bo_unmap(bo);
   }

   s.priv = &ctx;
   s.edge = emit_edgeflag;
   if (idxbuf) {
      if (idxsize == 1)
         s.emit = idxbias ? emit_elt08_biased : emit_elt08;
      else
      if (idxsize == 2)
         s.emit = idxbias ? emit_elt16_biased : emit_elt16;
      else
         s.emit = idxbias ? emit_elt32_biased : emit_elt32;
   } else
      s.emit = emit_verts;

   /* per-instance loop */
   BEGIN_RING(chan, tesla, NV50TCL_CB_ADDR, 2);
   OUT_RING  (chan, NV50_CB_AUX | (24 << 8));
   OUT_RING  (chan, i_start);
   while (i_count--) {
      unsigned max_verts;
      boolean done;

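      /* advance instanced attributes whose divisor period has elapsed */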
      for (i = 0; i < ctx.attr_nr; i++) {
         if (!ctx.attr[i].divisor ||
              ctx.attr[i].divisor != ++ctx.attr[i].step)
            continue;
         ctx.attr[i].step = 0;
         ctx.attr[i].map = (uint8_t *)ctx.attr[i].map + ctx.attr[i].stride;
      }

      util_split_prim_init(&s, mode, start, count);
      do {
         if (AVAIL_RING(chan) < p_overhead + (6 * vtx_size)) {
            FIRE_RING(chan);
            if (!nv50_state_validate(nv50, p_overhead + (6 * vtx_size))) {
               assert(0);
               return;
            }
         }

         max_verts  = AVAIL_RING(chan);
         max_verts -= p_overhead;
         max_verts /= vtx_size;

         BEGIN_RING(chan, tesla, NV50TCL_VERTEX_BEGIN, 1);
         OUT_RING  (chan, nv50_prim(s.mode) | (nzi ? (1 << 28) : 0));
         done = util_split_prim_next(&s, max_verts);
         BEGIN_RING(chan, tesla, NV50TCL_VERTEX_END, 1);
         OUT_RING  (chan, 0);
      } while (!done);

      nzi = TRUE;
   }
}
void
nvfx_push_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
{
	struct nvfx_context *nvfx = nvfx_context(pipe);
	struct nouveau_channel *chan = nvfx->screen->base.channel;
	struct push_context ctx;
	struct util_split_prim s;
	unsigned instances_left = info->instance_count;
	int vtx_value;
	unsigned hw_mode = nvgl_primitive(info->mode);
	int i;
	struct
	{
		uint8_t* map;
		unsigned step;
	} per_instance[16];
	unsigned p_overhead = 64 /* magic fix */
			+ 4 /* begin/end */
			+ 4; /* potential edgeflag enable/disable */

	ctx.chan = nvfx->screen->base.channel;
	ctx.translate = nvfx->vtxelt->translate;
	ctx.idxbuf = NULL;
	ctx.vertex_length = nvfx->vtxelt->vertex_length;
	ctx.max_vertices_per_packet = nvfx->vtxelt->max_vertices_per_packet;
	ctx.edgeflag = 0.5f;
	// TODO: figure out if we really want to handle this, and do so in that case
	ctx.edgeflag_attr = 0xff; // nvfx->vertprog->cfg.edgeflag_in;

	if(!nvfx->use_vertex_buffers)
	{
		for(i = 0; i < nvfx->vtxelt->num_per_vertex_buffer_infos; ++i)
		{
			struct nvfx_per_vertex_buffer_info* vbi = &nvfx->vtxelt->per_vertex_buffer_info[i];
			struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[vbi->vertex_buffer_index];
			uint8_t* data = nvfx_buffer(vb->buffer)->data + vb->buffer_offset;
			if(info->indexed)
				data += info->index_bias * vb->stride;
			ctx.translate->set_buffer(ctx.translate, i, data, vb->stride, ~0);
		}

		if(ctx.edgeflag_attr < 16)
			vtx_value = -(ctx.vertex_length + 3);  /* vertex data and edgeflag header and value */
		else
		{
			p_overhead += 1; /* initial vertex_data header */
			vtx_value = -ctx.vertex_length;  /* vertex data only */
		}

		if (info->indexed) {
			// XXX: this case is broken and probably needs a new VTX_ATTR push path
			if (nvfx->idxbuf.index_size == 1)
				s.emit = emit_vertices_lookup8;
			else if (nvfx->idxbuf.index_size == 2)
				s.emit = emit_vertices_lookup16;
			else
				s.emit = emit_vertices_lookup32;
		} else
			s.emit = emit_vertices;
	}
	else
	{
		if(!info->indexed || nvfx->use_index_buffer)
		{
			s.emit = info->indexed ? emit_ib_ranges : emit_vb_ranges;
			p_overhead += 3;
			vtx_value = 0;
		}
		else if (nvfx->idxbuf.index_size == 4)
		{
			s.emit = emit_elt32;
			p_overhead += 1;
			vtx_value = 8;
		}
		else
		{
			s.emit = (nvfx->idxbuf.index_size == 2) ? emit_elt16 : emit_elt8;
			p_overhead += 3;
			vtx_value = 7;
		}
	}

	ctx.idxbias = info->index_bias;
	if(nvfx->use_vertex_buffers)
		ctx.idxbias -= nvfx->base_vertex;

	/* map index buffer, if present */
	if (info->indexed && !nvfx->use_index_buffer)
		ctx.idxbuf = nvfx_buffer(nvfx->idxbuf.buffer)->data + nvfx->idxbuf.offset;

	s.priv = &ctx;
	s.edge = emit_edgeflag;

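	/* emit the values of per-instance attributes for the starting instance
	 * as constant vertex attributes */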
	for (i = 0; i < nvfx->vtxelt->num_per_instance; ++i)
	{
		struct nvfx_per_instance_element *ve = &nvfx->vtxelt->per_instance[i];
		struct pipe_vertex_buffer *vb = &nvfx->vtxbuf[ve->base.vertex_buffer_index];
		float v[4];
		per_instance[i].step = info->start_instance % ve->instance_divisor;
		per_instance[i].map = nvfx_buffer(vb->buffer)->data + vb->buffer_offset + ve->base.src_offset;

		nvfx->vtxelt->per_instance[i].base.fetch_rgba_float(v, per_instance[i].map, 0, 0);

		WAIT_RING(chan, 5);
		nvfx_emit_vtx_attr(chan, nvfx->vtxelt->per_instance[i].base.idx, v, nvfx->vtxelt->per_instance[i].base.ncomp);
	}

	/* per-instance loop */
	while (instances_left--) {
		int max_verts;
		boolean done;

		util_split_prim_init(&s, info->mode, info->start, info->count);
		nvfx_state_emit(nvfx);
		for(;;) {
			max_verts  = AVAIL_RING(chan);
			max_verts -= p_overhead;

			/* if vtx_value < 0, each vertex is -vtx_value words long
			 * otherwise, each vertex is 2^(vtx_value) / 255 words long (this is an approximation)
			 */
			if(vtx_value < 0)
			{
				max_verts /= -vtx_value;
				max_verts -= (max_verts >> 10); /* vertex data headers */
			}
			else
			{
				if(max_verts >= (1 << 23)) /* avoid overflow here */
					max_verts = (1 << 23);
				max_verts = (max_verts * 255) >> vtx_value;
			}

			//printf("avail %u max_verts %u\n", AVAIL_RING(chan), max_verts);

			if(max_verts >= 16)
			{
				/* XXX: repeating any command many times seems to (mostly) avoid
				 * corruption that would otherwise happen; this workaround seems to
				 * cause issues on nv3x, and also to be unneeded there */
				if(nvfx->is_nv4x)
				{
					int i;
					for(i = 0; i < 32; ++i)
					{
						OUT_RING(chan, RING_3D(0x1dac, 1));
						OUT_RING(chan, 0);
					}
				}

				OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
				OUT_RING(chan, hw_mode);
				done = util_split_prim_next(&s, max_verts);
				OUT_RING(chan, RING_3D(NV30_3D_VERTEX_BEGIN_END, 1));
				OUT_RING(chan, 0);

				if(done)
					break;
			}

			FIRE_RING(chan);
			nvfx_state_emit(nvfx);
		}
	}
}

void
nv50_upload_sifc(struct nv50_context *nv50,
		 struct nouveau_bo *bo, unsigned dst_offset, unsigned reloc,
		 unsigned dst_format, int dst_w, int dst_h, int dst_pitch,
		 void *src, unsigned src_format, int src_pitch,
		 int x, int y, int w, int h, int cpp)
{
	struct nouveau_channel *chan = nv50->screen->base.channel;
	struct nouveau_grobj *eng2d = nv50->screen->eng2d;
	unsigned line_dwords = (w * cpp + 3) / 4;

	reloc |= NOUVEAU_BO_WR;

	MARK_RING (chan, 32, 2); /* flush on lack of space or relocs */

	if (bo->tile_flags) {
		BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 5);
		OUT_RING  (chan, dst_format);
		OUT_RING  (chan, 0);
		OUT_RING  (chan, bo->tile_mode << 4);
		OUT_RING  (chan, 1);
		OUT_RING  (chan, 0);
	} else {
		BEGIN_RING(chan, eng2d, NV50_2D_DST_FORMAT, 2);
		OUT_RING  (chan, dst_format);
		OUT_RING  (chan, 1);
		BEGIN_RING(chan, eng2d, NV50_2D_DST_PITCH, 1);
		OUT_RING  (chan, dst_pitch);
	}

	BEGIN_RING(chan, eng2d, NV50_2D_DST_WIDTH, 4);
	OUT_RING  (chan, dst_w);
	OUT_RING  (chan, dst_h);
	OUT_RELOCh(chan, bo, dst_offset, reloc);
	OUT_RELOCl(chan, bo, dst_offset, reloc);

	/* NV50_2D_OPERATION_SRCCOPY assumed already set */

	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_BITMAP_ENABLE, 2);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, src_format);
	BEGIN_RING(chan, eng2d, NV50_2D_SIFC_WIDTH, 10);
	OUT_RING  (chan, w);
	OUT_RING  (chan, h);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, 1);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, 1);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, x);
	OUT_RING  (chan, 0);
	OUT_RING  (chan, y);

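	/* stream the source image line by line through the 2D engine's SIFC data port */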
	while (h--) {
		const uint32_t *p = src;
		unsigned count = line_dwords;

		while (count) {
			unsigned nr = MIN2(count, 1792);

			if (AVAIL_RING(chan) <= nr) {
				FIRE_RING (chan);

				BEGIN_RING(chan, eng2d,
					   NV50_2D_DST_ADDRESS_HIGH, 2);
				OUT_RELOCh(chan, bo, dst_offset, reloc);
				OUT_RELOCl(chan, bo, dst_offset, reloc);
			}
			assert(AVAIL_RING(chan) > nr);

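			/* (2 << 29) marks the method as non-incrementing, so every data
			 * word goes to SIFC_DATA */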
			BEGIN_RING(chan, eng2d,
				   NV50_2D_SIFC_DATA | (2 << 29), nr);
			OUT_RINGp (chan, p, nr);

			p += nr;
			count -= nr;
		}

		src = (uint8_t *) src + src_pitch;
	}
}