Ejemplo n.º 1
0
static int
debug_triangle(int tilex, int tiley,
               const union lp_rast_cmd_arg arg,
               struct tile *tile,
               char val)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   unsigned plane_mask = arg.triangle.plane_mask;
   const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
   struct lp_rast_plane plane[8];
   int x, y;
   int count = 0;
   unsigned i, nr_planes = 0;
   boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;

   if (tri->inputs.disable) {
      /* This triangle was partially binned and has been disabled */
      return 0;
   }

   while (plane_mask) {
      plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)];
      plane[nr_planes].c = (plane[nr_planes].c +
                            IMUL64(plane[nr_planes].dcdy, tiley) -
                            IMUL64(plane[nr_planes].dcdx, tilex));
      nr_planes++;
   }

   for(y = 0; y < TILE_SIZE; y++)
   {
      for(x = 0; x < TILE_SIZE; x++)
      {
         for (i = 0; i < nr_planes; i++)
            if (plane[i].c <= 0)
               goto out;
         
         plot(tile, x, y, val, blend);
         count++;

      out:
         for (i = 0; i < nr_planes; i++)
            plane[i].c -= plane[i].dcdx;
      }

      for (i = 0; i < nr_planes; i++) {
         plane[i].c += IMUL64(plane[i].dcdx, TILE_SIZE);
         plane[i].c += plane[i].dcdy;
      }
   }
   return count;
}
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;
   
   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);


   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
      
      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;
   
   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;
   
   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}
Ejemplo n.º 4
0
static boolean
try_setup_line( struct lp_setup_context *setup,
               const float (*v1)[4],
               const float (*v2)[4])
{
   struct llvmpipe_context *lp_context = (struct llvmpipe_context *)setup->pipe;
   struct lp_scene *scene = setup->scene;
   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
   struct lp_rast_triangle *line;
   struct lp_rast_plane *plane;
   struct lp_line_info info;
   float width = MAX2(1.0, setup->line_width);
   const struct u_rect *scissor;
   struct u_rect bbox, bboxpos;
   boolean s_planes[4];
   unsigned tri_bytes;
   int x[4]; 
   int y[4];
   int i;
   int nr_planes = 4;
   unsigned viewport_index = 0;
   unsigned layer = 0;
   
   /* linewidth should be interpreted as integer */
   int fixed_width = util_iround(width) * FIXED_ONE;

   float x_offset=0;
   float y_offset=0;
   float x_offset_end=0;
   float y_offset_end=0;
      
   float x1diff;
   float y1diff;
   float x2diff;
   float y2diff;
   float dx, dy;
   float area;
   const float (*pv)[4];

   boolean draw_start;
   boolean draw_end;
   boolean will_draw_start;
   boolean will_draw_end;

   if (0)
      print_line(setup, v1, v2);

   if (setup->flatshade_first) {
      pv = v1;
   }
   else {
      pv = v2;
   }
   if (setup->viewport_index_slot > 0) {
      unsigned *udata = (unsigned*)pv[setup->viewport_index_slot];
      viewport_index = lp_clamp_viewport_idx(*udata);
   }
   if (setup->layer_slot > 0) {
      layer = *(unsigned*)pv[setup->layer_slot];
      layer = MIN2(layer, scene->fb_max_layer);
   }

   dx = v1[0][0] - v2[0][0];
   dy = v1[0][1] - v2[0][1];
   area = (dx * dx  + dy * dy);
   if (area == 0) {
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   info.oneoverarea = 1.0f / area;
   info.dx = dx;
   info.dy = dy;
   info.v1 = v1;
   info.v2 = v2;

  
   /* X-MAJOR LINE */
   if (fabsf(dx) >= fabsf(dy)) {
      float dydx = dy / dx;

      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;

      if (y2diff==-0.5 && dy<0){
         y2diff = 0.5;
      }
      
      /* 
       * Diamond exit rule test for starting point 
       */    
      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
         draw_start = TRUE;
      }
      else if (sign(x1diff) == sign(-dx)) {
         draw_start = FALSE;
      }
      else if (sign(-y1diff) != sign(dy)) {
         draw_start = TRUE;
      }
      else {
         /* do intersection test */
         float yintersect = fracf(v1[0][1]) + x1diff * dydx;
         draw_start = (yintersect < 1.0 && yintersect > 0.0);
      }


      /* 
       * Diamond exit rule test for ending point 
       */    
      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
         draw_end = FALSE;
      }
      else if (sign(x2diff) != sign(-dx)) {
         draw_end = FALSE;
      }
      else if (sign(-y2diff) == sign(dy)) {
         draw_end = TRUE;
      }
      else {
         /* do intersection test */
         float yintersect = fracf(v2[0][1]) + x2diff * dydx;
         draw_end = (yintersect < 1.0 && yintersect > 0.0);
      }

      /* Are we already drawing start/end?
       */
      will_draw_start = sign(-x1diff) != sign(dx);
      will_draw_end = (sign(x2diff) == sign(-dx)) || x2diff==0;

      if (dx < 0) {
         /* if v2 is to the right of v1, swap pointers */
         const float (*temp)[4] = v1;
         v1 = v2;
         v2 = temp;
         dx = -dx;
         dy = -dy;
         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            x_offset_end = - x1diff - 0.5;
            y_offset_end = x_offset_end * dydx;

         }
         if (will_draw_end != draw_end) {
            x_offset = - x2diff - 0.5;
            y_offset = x_offset * dydx;
         }

      }
      else{
         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            x_offset = - x1diff + 0.5;
            y_offset = x_offset * dydx;
         }
         if (will_draw_end != draw_end) {
            x_offset_end = - x2diff + 0.5;
            y_offset_end = x_offset_end * dydx;
         }
      }
  
      /* x/y positions in fixed point */
      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
      
      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) - fixed_width/2;
      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) - fixed_width/2;
      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) + fixed_width/2;
      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) + fixed_width/2;
      
   }
   else {
      const float dxdy = dx / dy;

      /* Y-MAJOR LINE */      
      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;

      if (x2diff==-0.5 && dx<0) {
         x2diff = 0.5;
      }

      /* 
       * Diamond exit rule test for starting point 
       */    
      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
         draw_start = TRUE;
      }
      else if (sign(-y1diff) == sign(dy)) {
         draw_start = FALSE;
      }
      else if (sign(x1diff) != sign(-dx)) {
         draw_start = TRUE;
      }
      else {
         /* do intersection test */
         float xintersect = fracf(v1[0][0]) + y1diff * dxdy;
         draw_start = (xintersect < 1.0 && xintersect > 0.0);
      }

      /* 
       * Diamond exit rule test for ending point 
       */    
      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
         draw_end = FALSE;
      }
      else if (sign(-y2diff) != sign(dy) ) {
         draw_end = FALSE;
      }
      else if (sign(x2diff) == sign(-dx) ) {
         draw_end = TRUE;
      }
      else {
         /* do intersection test */
         float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
         draw_end = (xintersect < 1.0 && xintersect >= 0.0);
      }

      /* Are we already drawing start/end?
       */
      will_draw_start = sign(y1diff) == sign(dy);
      will_draw_end = (sign(-y2diff) == sign(dy)) || y2diff==0;

      if (dy > 0) {
         /* if v2 is on top of v1, swap pointers */
         const float (*temp)[4] = v1;
         v1 = v2;
         v2 = temp; 
         dx = -dx;
         dy = -dy;

         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            y_offset_end = - y1diff + 0.5;
            x_offset_end = y_offset_end * dxdy;
         }
         if (will_draw_end != draw_end) {
            y_offset = - y2diff + 0.5;
            x_offset = y_offset * dxdy;
         }
      }
      else {
         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            y_offset = - y1diff - 0.5;
            x_offset = y_offset * dxdy;
                     
         }
         if (will_draw_end != draw_end) {
            y_offset_end = - y2diff - 0.5;
            x_offset_end = y_offset_end * dxdy;
         }
      }

      /* x/y positions in fixed point */
      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) - fixed_width/2;
      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) + fixed_width/2;
      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) + fixed_width/2;
     
      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset); 
      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset);
   }

   /* Bounding rectangle (in pixels) */
   {
      /* Yes this is necessary to accurately calculate bounding boxes
       * with the two fill-conventions we support.  GL (normally) ends
       * up needing a bottom-left fill convention, which requires
       * slightly different rounding.
       */
      int adj = (setup->bottom_edge_rule != 0) ? 1 : 0;

      bbox.x0 = (MIN4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
      bbox.x1 = (MAX4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
      bbox.y0 = (MIN4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
      bbox.y1 = (MAX4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;

      /* Inclusive coordinates:
       */
      bbox.x1--;
      bbox.y1--;
   }

   if (bbox.x1 < bbox.x0 ||
       bbox.y1 < bbox.y0) {
      if (0) debug_printf("empty bounding box\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   if (!u_rect_test_intersection(&setup->draw_regions[viewport_index], &bbox)) {
      if (0) debug_printf("offscreen\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   bboxpos = bbox;

   /* Can safely discard negative regions:
    */
   bboxpos.x0 = MAX2(bboxpos.x0, 0);
   bboxpos.y0 = MAX2(bboxpos.y0, 0);

   nr_planes = 4;
   /*
    * Determine how many scissor planes we need, that is drop scissor
    * edges if the bounding box of the tri is fully inside that edge.
    */
   if (setup->scissor_test) {
      /* why not just use draw_regions */
      scissor = &setup->scissors[viewport_index];
      scissor_planes_needed(s_planes, &bboxpos, scissor);
      nr_planes += s_planes[0] + s_planes[1] + s_planes[2] + s_planes[3];
   }

   line = lp_setup_alloc_triangle(scene,
                                  key->num_inputs,
                                  nr_planes,
                                  &tri_bytes);
   if (!line)
      return FALSE;

#ifdef DEBUG
   line->v[0][0] = v1[0][0];
   line->v[1][0] = v2[0][0];   
   line->v[0][1] = v1[0][1];
   line->v[1][1] = v2[0][1];
#endif

   LP_COUNT(nr_tris);

   if (lp_context->active_statistics_queries &&
       !llvmpipe_rasterization_disabled(lp_context)) {
      lp_context->pipeline_statistics.c_primitives++;
   }

   /* calculate the deltas */
   plane = GET_PLANES(line);
   plane[0].dcdy = x[0] - x[1];
   plane[1].dcdy = x[1] - x[2];
   plane[2].dcdy = x[2] - x[3];
   plane[3].dcdy = x[3] - x[0];

   plane[0].dcdx = y[0] - y[1];
   plane[1].dcdx = y[1] - y[2];
   plane[2].dcdx = y[2] - y[3];
   plane[3].dcdx = y[3] - y[0];

   if (draw_will_inject_frontface(lp_context->draw) &&
       setup->face_slot > 0) {
      line->inputs.frontfacing = v1[setup->face_slot][0];
   } else {
      line->inputs.frontfacing = TRUE;
   }

   /* Setup parameter interpolants:
    */
   info.a0 = GET_A0(&line->inputs);
   info.dadx = GET_DADX(&line->inputs);
   info.dady = GET_DADY(&line->inputs);
   info.frontfacing = line->inputs.frontfacing;
   setup_line_coefficients(setup, &info); 

   line->inputs.disable = FALSE;
   line->inputs.opaque = FALSE;
   line->inputs.layer = layer;
   line->inputs.viewport_index = viewport_index;

   /*
    * XXX: this code is mostly identical to the one in lp_setup_tri, except it
    * uses 4 planes instead of 3. Could share the code (including the sse
    * assembly, in fact we'd get the 4th plane for free).
    * The only difference apart from storing the 4th plane would be some
    * different shuffle for calculating dcdx/dcdy.
    */
   for (i = 0; i < 4; i++) {

      /* half-edge constants, will be iterated over the whole render
       * target.
       */
      plane[i].c = IMUL64(plane[i].dcdx, x[i]) - IMUL64(plane[i].dcdy, y[i]);

      /* correct for top-left vs. bottom-left fill convention.
       */
      if (plane[i].dcdx < 0) {
         /* both fill conventions want this - adjust for left edges */
         plane[i].c++;
      }
      else if (plane[i].dcdx == 0) {
         if (setup->pixel_offset == 0) {
            /* correct for top-left fill convention:
             */
            if (plane[i].dcdy > 0) plane[i].c++;
         }
         else {
            /* correct for bottom-left fill convention:
             */
            if (plane[i].dcdy < 0) plane[i].c++;
         }
      }

      plane[i].dcdx *= FIXED_ONE;
      plane[i].dcdy *= FIXED_ONE;

      /* find trivial reject offsets for each edge for a single-pixel
       * sized block.  These will be scaled up at each recursive level to
       * match the active blocksize.  Scaling in this way works best if
       * the blocks are square.
       */
      plane[i].eo = 0;
      if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
      if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
   }


   /* 
    * When rasterizing scissored tris, use the intersection of the
    * triangle bounding box and the scissor rect to generate the
    * scissor planes.
    *
    * This permits us to cut off the triangle "tails" that are present
    * in the intermediate recursive levels caused when two of the
    * triangles edges don't diverge quickly enough to trivially reject
    * exterior blocks from the triangle.
    *
    * It's not really clear if it's worth worrying about these tails,
    * but since we generate the planes for each scissored tri, it's
    * free to trim them in this case.
    * 
    * Note that otherwise, the scissor planes only vary in 'C' value,
    * and even then only on state-changes.  Could alternatively store
    * these planes elsewhere.
    * (Or only store the c value together with a bit indicating which
    * scissor edge this is, so rasterization would treat them differently
    * (easier to evaluate) to ordinary planes.)
    */
   if (nr_planes > 4) {
      struct lp_rast_plane *plane_s = &plane[4];

      if (s_planes[0]) {
         plane_s->dcdx = -1 << 8;
         plane_s->dcdy = 0;
         plane_s->c = (1-scissor->x0) << 8;
         plane_s->eo = 1 << 8;
         plane_s++;
      }
      if (s_planes[1]) {
         plane_s->dcdx = 1 << 8;
         plane_s->dcdy = 0;
         plane_s->c = (scissor->x1+1) << 8;
         plane_s->eo = 0 << 8;
         plane_s++;
      }
      if (s_planes[2]) {
         plane_s->dcdx = 0;
         plane_s->dcdy = 1 << 8;
         plane_s->c = (1-scissor->y0) << 8;
         plane_s->eo = 1 << 8;
         plane_s++;
      }
      if (s_planes[3]) {
         plane_s->dcdx = 0;
         plane_s->dcdy = -1 << 8;
         plane_s->c = (scissor->y1+1) << 8;
         plane_s->eo = 0;
         plane_s++;
      }
      assert(plane_s == &plane[nr_planes]);
   }

   return lp_setup_bin_triangle(setup, line, &bbox, &bboxpos, nr_planes, viewport_index);
}
Ejemplo n.º 5
0
static boolean
try_setup_point( struct lp_setup_context *setup,
                 const float (*v0)[4] )
{
   /* x/y positions in fixed point */
   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
   const int sizeAttr = setup->psize;
   const float size
      = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0]
      : setup->point_size;
   
   /* Point size as fixed point integer, remove rounding errors 
    * and gives minimum width for very small points
    */
   int fixed_width = MAX2(FIXED_ONE,
                          (subpixel_snap(size) + FIXED_ONE/2 - 1) & ~(FIXED_ONE-1));

   const int x0 = subpixel_snap(v0[0][0] - setup->pixel_offset) - fixed_width/2;
   const int y0 = subpixel_snap(v0[0][1] - setup->pixel_offset) - fixed_width/2;
     
   struct lp_scene *scene = setup->scene;
   struct lp_rast_triangle *point;
   unsigned bytes;
   struct u_rect bbox;
   unsigned nr_planes = 4;
   struct point_info info;


   /* Bounding rectangle (in pixels) */
   {
      /* Yes this is necessary to accurately calculate bounding boxes
       * with the two fill-conventions we support.  GL (normally) ends
       * up needing a bottom-left fill convention, which requires
       * slightly different rounding.
       */
      int adj = (setup->pixel_offset != 0) ? 1 : 0;

      bbox.x0 = (x0 + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
      bbox.x1 = (x0 + fixed_width + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
      bbox.y0 = (y0 + (FIXED_ONE-1)) >> FIXED_ORDER;
      bbox.y1 = (y0 + fixed_width + (FIXED_ONE-1)) >> FIXED_ORDER;

      /* Inclusive coordinates:
       */
      bbox.x1--;
      bbox.y1--;
   }
   
   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
      if (0) debug_printf("offscreen\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   u_rect_find_intersection(&setup->draw_region, &bbox);

   point = lp_setup_alloc_triangle(scene,
                                   key->num_inputs,
                                   nr_planes,
                                   &bytes);
   if (!point)
      return FALSE;

#ifdef DEBUG
   point->v[0][0] = v0[0][0];
   point->v[0][1] = v0[0][1];
#endif

   info.v0 = v0;
   info.dx01 = 0;
   info.dx12 = fixed_width;
   info.dy01 = fixed_width;
   info.dy12 = 0;
   info.a0 = GET_A0(&point->inputs);
   info.dadx = GET_DADX(&point->inputs);
   info.dady = GET_DADY(&point->inputs);
   
   /* Setup parameter interpolants:
    */
   setup_point_coefficients(setup, &info);

   point->inputs.frontfacing = TRUE;
   point->inputs.disable = FALSE;
   point->inputs.opaque = FALSE;

   {
      struct lp_rast_plane *plane = GET_PLANES(point);

      plane[0].dcdx = -1;
      plane[0].dcdy = 0;
      plane[0].c = 1-bbox.x0;
      plane[0].eo = 1;

      plane[1].dcdx = 1;
      plane[1].dcdy = 0;
      plane[1].c = bbox.x1+1;
      plane[1].eo = 0;

      plane[2].dcdx = 0;
      plane[2].dcdy = 1;
      plane[2].c = 1-bbox.y0;
      plane[2].eo = 1;

      plane[3].dcdx = 0;
      plane[3].dcdy = -1;
      plane[3].c = bbox.y1+1;
      plane[3].eo = 0;
   }

   return lp_setup_bin_triangle(setup, point, &bbox, nr_planes);
}
Ejemplo n.º 6
0
static boolean
try_setup_line( struct lp_setup_context *setup,
               const float (*v1)[4],
               const float (*v2)[4])
{
   struct lp_scene *scene = setup->scene;
   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
   struct lp_rast_triangle *line;
   struct lp_rast_plane *plane;
   struct lp_line_info info;
   float width = MAX2(1.0, setup->line_width);
   struct u_rect bbox;
   unsigned tri_bytes;
   int x[4]; 
   int y[4];
   int i;
   int nr_planes = 4;
   
   /* linewidth should be interpreted as integer */
   int fixed_width = util_iround(width) * FIXED_ONE;

   float x_offset=0;
   float y_offset=0;
   float x_offset_end=0;
   float y_offset_end=0;
      
   float x1diff;
   float y1diff;
   float x2diff;
   float y2diff;
   float dx, dy;
   float area;

   boolean draw_start;
   boolean draw_end;
   boolean will_draw_start;
   boolean will_draw_end;

   if (0)
      print_line(setup, v1, v2);

   if (setup->scissor_test) {
      nr_planes = 8;
   }
   else {
      nr_planes = 4;
   }


   dx = v1[0][0] - v2[0][0];
   dy = v1[0][1] - v2[0][1];
   area = (dx * dx  + dy * dy);
   if (area == 0) {
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   info.oneoverarea = 1.0f / area;
   info.dx = dx;
   info.dy = dy;
   info.v1 = v1;
   info.v2 = v2;

  
   /* X-MAJOR LINE */
   if (fabsf(dx) >= fabsf(dy)) {
      float dydx = dy / dx;

      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;

      if (y2diff==-0.5 && dy<0){
         y2diff = 0.5;
      }
      
      /* 
       * Diamond exit rule test for starting point 
       */    
      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
         draw_start = TRUE;
      }
      else if (sign(x1diff) == sign(-dx)) {
         draw_start = FALSE;
      }
      else if (sign(-y1diff) != sign(dy)) {
         draw_start = TRUE;
      }
      else {
         /* do intersection test */
         float yintersect = fracf(v1[0][1]) + x1diff * dydx;
         draw_start = (yintersect < 1.0 && yintersect > 0.0);
      }


      /* 
       * Diamond exit rule test for ending point 
       */    
      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
         draw_end = FALSE;
      }
      else if (sign(x2diff) != sign(-dx)) {
         draw_end = FALSE;
      }
      else if (sign(-y2diff) == sign(dy)) {
         draw_end = TRUE;
      }
      else {
         /* do intersection test */
         float yintersect = fracf(v2[0][1]) + x2diff * dydx;
         draw_end = (yintersect < 1.0 && yintersect > 0.0);
      }

      /* Are we already drawing start/end?
       */
      will_draw_start = sign(-x1diff) != sign(dx);
      will_draw_end = (sign(x2diff) == sign(-dx)) || x2diff==0;

      if (dx < 0) {
         /* if v2 is to the right of v1, swap pointers */
         const float (*temp)[4] = v1;
         v1 = v2;
         v2 = temp;
         dx = -dx;
         dy = -dy;
         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            x_offset_end = - x1diff - 0.5;
            y_offset_end = x_offset_end * dydx;

         }
         if (will_draw_end != draw_end) {
            x_offset = - x2diff - 0.5;
            y_offset = x_offset * dydx;
         }

      }
      else{
         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            x_offset = - x1diff + 0.5;
            y_offset = x_offset * dydx;
         }
         if (will_draw_end != draw_end) {
            x_offset_end = - x2diff + 0.5;
            y_offset_end = x_offset_end * dydx;
         }
      }
  
      /* x/y positions in fixed point */
      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset);
      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset);
      
      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) - fixed_width/2;
      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) - fixed_width/2;
      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset) + fixed_width/2;
      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset) + fixed_width/2;
      
   }
   else {
      const float dxdy = dx / dy;

      /* Y-MAJOR LINE */      
      x1diff = v1[0][0] - (float) floor(v1[0][0]) - 0.5;
      y1diff = v1[0][1] - (float) floor(v1[0][1]) - 0.5;
      x2diff = v2[0][0] - (float) floor(v2[0][0]) - 0.5;
      y2diff = v2[0][1] - (float) floor(v2[0][1]) - 0.5;

      if (x2diff==-0.5 && dx<0) {
         x2diff = 0.5;
      }

      /* 
       * Diamond exit rule test for starting point 
       */    
      if (fabsf(x1diff) + fabsf(y1diff) < 0.5) {
         draw_start = TRUE;
      }
      else if (sign(-y1diff) == sign(dy)) {
         draw_start = FALSE;
      }
      else if (sign(x1diff) != sign(-dx)) {
         draw_start = TRUE;
      }
      else {
         /* do intersection test */
         float xintersect = fracf(v1[0][0]) + y1diff * dxdy;
         draw_start = (xintersect < 1.0 && xintersect > 0.0);
      }

      /* 
       * Diamond exit rule test for ending point 
       */    
      if (fabsf(x2diff) + fabsf(y2diff) < 0.5) {
         draw_end = FALSE;
      }
      else if (sign(-y2diff) != sign(dy) ) {
         draw_end = FALSE;
      }
      else if (sign(x2diff) == sign(-dx) ) {
         draw_end = TRUE;
      }
      else {
         /* do intersection test */
         float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
         draw_end = (xintersect < 1.0 && xintersect >= 0.0);
      }

      /* Are we already drawing start/end?
       */
      will_draw_start = sign(y1diff) == sign(dy);
      will_draw_end = (sign(-y2diff) == sign(dy)) || y2diff==0;

      if (dy > 0) {
         /* if v2 is on top of v1, swap pointers */
         const float (*temp)[4] = v1;
         v1 = v2;
         v2 = temp; 
         dx = -dx;
         dy = -dy;

         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            y_offset_end = - y1diff + 0.5;
            x_offset_end = y_offset_end * dxdy;
         }
         if (will_draw_end != draw_end) {
            y_offset = - y2diff + 0.5;
            x_offset = y_offset * dxdy;
         }
      }
      else {
         /* Otherwise shift planes appropriately */
         if (will_draw_start != draw_start) {
            y_offset = - y1diff - 0.5;
            x_offset = y_offset * dxdy;
                     
         }
         if (will_draw_end != draw_end) {
            y_offset_end = - y2diff - 0.5;
            x_offset_end = y_offset_end * dxdy;
         }
      }

      /* x/y positions in fixed point */
      x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) - fixed_width/2;
      x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
      x[2] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) + fixed_width/2;
      x[3] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) + fixed_width/2;
     
      y[0] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset); 
      y[1] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
      y[2] = subpixel_snap(v2[0][1] + y_offset_end - setup->pixel_offset);
      y[3] = subpixel_snap(v1[0][1] + y_offset     - setup->pixel_offset);
   }



   LP_COUNT(nr_tris);

 
   /* Bounding rectangle (in pixels) */
   {
      /* Yes this is necessary to accurately calculate bounding boxes
       * with the two fill-conventions we support.  GL (normally) ends
       * up needing a bottom-left fill convention, which requires
       * slightly different rounding.
       */
      int adj = (setup->pixel_offset != 0) ? 1 : 0;

      bbox.x0 = (MIN4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
      bbox.x1 = (MAX4(x[0], x[1], x[2], x[3]) + (FIXED_ONE-1)) >> FIXED_ORDER;
      bbox.y0 = (MIN4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;
      bbox.y1 = (MAX4(y[0], y[1], y[2], y[3]) + (FIXED_ONE-1) + adj) >> FIXED_ORDER;

      /* Inclusive coordinates:
       */
      bbox.x1--;
      bbox.y1--;
   }

   if (bbox.x1 < bbox.x0 ||
       bbox.y1 < bbox.y0) {
      if (0) debug_printf("empty bounding box\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   if (!u_rect_test_intersection(&setup->draw_region, &bbox)) {
      if (0) debug_printf("offscreen\n");
      LP_COUNT(nr_culled_tris);
      return TRUE;
   }

   /* Can safely discard negative regions:
    */
   bbox.x0 = MAX2(bbox.x0, 0);
   bbox.y0 = MAX2(bbox.y0, 0);

   line = lp_setup_alloc_triangle(scene,
                                  key->num_inputs,
                                  nr_planes,
                                  &tri_bytes);
   if (!line)
      return FALSE;

#ifdef DEBUG
   line->v[0][0] = v1[0][0];
   line->v[1][0] = v2[0][0];   
   line->v[0][1] = v1[0][1];
   line->v[1][1] = v2[0][1];
#endif

   /* calculate the deltas */
   plane = GET_PLANES(line);
   plane[0].dcdy = x[0] - x[1];
   plane[1].dcdy = x[1] - x[2];
   plane[2].dcdy = x[2] - x[3];
   plane[3].dcdy = x[3] - x[0];

   plane[0].dcdx = y[0] - y[1];
   plane[1].dcdx = y[1] - y[2];
   plane[2].dcdx = y[2] - y[3];
   plane[3].dcdx = y[3] - y[0];


   /* Setup parameter interpolants:
    */
   info.a0 = GET_A0(&line->inputs);
   info.dadx = GET_DADX(&line->inputs);
   info.dady = GET_DADY(&line->inputs);
   setup_line_coefficients(setup, &info); 

   line->inputs.frontfacing = TRUE;
   line->inputs.disable = FALSE;
   line->inputs.opaque = FALSE;

   for (i = 0; i < 4; i++) {

      /* half-edge constants, will be interated over the whole render
       * target.
       */
      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];

      
      /* correct for top-left vs. bottom-left fill convention.  
       *
       * note that we're overloading gl_rasterization_rules to mean
       * both (0.5,0.5) pixel centers *and* bottom-left filling
       * convention.
       *
       * GL actually has a top-left filling convention, but GL's
       * notion of "top" differs from gallium's...
       *
       * Also, sometimes (in FBO cases) GL will render upside down
       * to its usual method, in which case it will probably want
       * to use the opposite, top-left convention.
       */         
      if (plane[i].dcdx < 0) {
         /* both fill conventions want this - adjust for left edges */
         plane[i].c++;            
      }
      else if (plane[i].dcdx == 0) {
         if (setup->pixel_offset == 0) {
            /* correct for top-left fill convention:
             */
            if (plane[i].dcdy > 0) plane[i].c++;
         }
         else {
            /* correct for bottom-left fill convention:
             */
            if (plane[i].dcdy < 0) plane[i].c++;
         }
      }

      plane[i].dcdx *= FIXED_ONE;
      plane[i].dcdy *= FIXED_ONE;

      /* find trivial reject offsets for each edge for a single-pixel
       * sized block.  These will be scaled up at each recursive level to
       * match the active blocksize.  Scaling in this way works best if
       * the blocks are square.
       */
      plane[i].eo = 0;
      if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
      if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
   }


   /* 
    * When rasterizing scissored tris, use the intersection of the
    * triangle bounding box and the scissor rect to generate the
    * scissor planes.
    *
    * This permits us to cut off the triangle "tails" that are present
    * in the intermediate recursive levels caused when two of the
    * triangles edges don't diverge quickly enough to trivially reject
    * exterior blocks from the triangle.
    *
    * It's not really clear if it's worth worrying about these tails,
    * but since we generate the planes for each scissored tri, it's
    * free to trim them in this case.
    * 
    * Note that otherwise, the scissor planes only vary in 'C' value,
    * and even then only on state-changes.  Could alternatively store
    * these planes elsewhere.
    */
   if (nr_planes == 8) {
      const struct u_rect *scissor = &setup->scissor;

      plane[4].dcdx = -1;
      plane[4].dcdy = 0;
      plane[4].c = 1-scissor->x0;
      plane[4].eo = 1;

      plane[5].dcdx = 1;
      plane[5].dcdy = 0;
      plane[5].c = scissor->x1+1;
      plane[5].eo = 0;

      plane[6].dcdx = 0;
      plane[6].dcdy = 1;
      plane[6].c = 1-scissor->y0;
      plane[6].eo = 1;

      plane[7].dcdx = 0;
      plane[7].dcdy = -1;
      plane[7].c = scissor->y1+1;
      plane[7].eo = 0;
   }

   return lp_setup_bin_triangle(setup, line, &bbox, nr_planes);
}
Ejemplo n.º 7
0
void
lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = vec_splats((unsigned char) 0);

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   __m128i vshuf_mask0;
   __m128i vshuf_mask1;
   __m128i vshuf_mask2;

#ifdef PIPE_ARCH_LITTLE_ENDIAN
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908);
#else
   vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F);
   vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B);
   vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607);
#endif

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = vec_sub_epi32(zero, dcdx);

   c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x)));
   c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y)));
   rej4 = vec_slli_epi32(rej4, 2);

   /*
    * Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1));
   rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1));

   dcdx2 = vec_add_epi32(dcdx, dcdx);
   dcdx3 = vec_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = vec_add_epi32(cx, rej4);
         __m128i rej_masks = vec_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (vec_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0);
            __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1);
            __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2);

            __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0);

            __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1);
            __m128i c_01 = vec_packs_epi32(c_0, c_1);

            __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2);

            __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0));
            __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1));
            __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2));

            __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3);
            __m128i c_23 = vec_packs_epi32(c_2, c_3);
            __m128i c_0123 = vec_packs_epi16(c_01, c_23);

            unsigned mask = vec_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2));
      }

      c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}