Example #1
File: clpf_rdo.c Project: brion/aomedia
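// Measure, over a size x size block at (x0, y0), the accumulated squared
// error against the source for the unfiltered reconstruction (sum[0]) and
// for CLPF strengths 1, 2 and 4 (sum[1], sum[2], sum[3]).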
void aom_clpf_detect_multi_c(const uint8_t *rec, const uint8_t *org,
                             int rstride, int ostride, int x0, int y0,
                             int width, int height, int *sum, int size,
                             unsigned int bd) {
  int x, y;

  for (y = y0; y < y0 + size; y++) {
    for (x = x0; x < x0 + size; x++) {
      const int O = org[y * ostride + x];
      const int X = rec[y * rstride + x];
      const int A = rec[AOMMAX(0, y - 2) * rstride + x];
      const int B = rec[AOMMAX(0, y - 1) * rstride + x];
      const int C = rec[y * rstride + AOMMAX(0, x - 2)];
      const int D = rec[y * rstride + AOMMAX(0, x - 1)];
      const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)];
      const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)];
      const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x];
      const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x];
      const int delta1 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 1, bd);
      const int delta2 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 2, bd);
      const int delta3 = av1_clpf_sample(X, A, B, C, D, E, F, G, H, 4, bd);
      const int F1 = X + delta1;
      const int F2 = X + delta2;
      const int F3 = X + delta3;
      sum[0] += (O - X) * (O - X);
      sum[1] += (O - F1) * (O - F1);
      sum[2] += (O - F2) * (O - F2);
      sum[3] += (O - F3) * (O - F3);
    }
  }
}
Example #2
File: clpf_rdo.c Project: brion/aomedia
// Identical to aom_clpf_detect_c() apart from "rec" and "org".
void aom_clpf_detect_hbd_c(const uint16_t *rec, const uint16_t *org,
                           int rstride, int ostride, int x0, int y0, int width,
                           int height, int *sum0, int *sum1,
                           unsigned int strength, int size, unsigned int bd) {
  const int shift = bd - 8;
  int x, y;
  for (y = y0; y < y0 + size; y++) {
    for (x = x0; x < x0 + size; x++) {
      const int O = org[y * ostride + x] >> shift;
      const int X = rec[y * rstride + x] >> shift;
      const int A = rec[AOMMAX(0, y - 2) * rstride + x] >> shift;
      const int B = rec[AOMMAX(0, y - 1) * rstride + x] >> shift;
      const int C = rec[y * rstride + AOMMAX(0, x - 2)] >> shift;
      const int D = rec[y * rstride + AOMMAX(0, x - 1)] >> shift;
      const int E = rec[y * rstride + AOMMIN(width - 1, x + 1)] >> shift;
      const int F = rec[y * rstride + AOMMIN(width - 1, x + 2)] >> shift;
      const int G = rec[AOMMIN(height - 1, y + 1) * rstride + x] >> shift;
      const int H = rec[AOMMIN(height - 1, y + 2) * rstride + x] >> shift;
      const int delta = av1_clpf_sample(X, A, B, C, D, E, F, G, H,
                                        strength >> shift, bd - shift);
      const int Y = X + delta;
      *sum0 += (O - X) * (O - X);
      *sum1 += (O - Y) * (O - Y);
    }
  }
}
Example #3
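// Choose encoder speed features that depend on the coded frame size, then
// apply the knock-on restrictions (split masks, sub-8x8 thresholds, and the
// motion vector unit-test hooks).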
void av1_set_speed_features_framesize_dependent(AV1_COMP *cpi) {
  SPEED_FEATURES *const sf = &cpi->sf;
  const AV1EncoderConfig *const oxcf = &cpi->oxcf;
  AV1_COMMON *const cm = &cpi->common;
  RD_OPT *const rd = &cpi->rd;
  int i;

// Limit memory usage for high resolutions
#if CONFIG_EXT_REFS
  // TODO(zoeliu): Temporary solution to resolve the insufficient RAM issue for
  //               ext-refs. Need to work with @yunqingwang to have a more
  //               effective solution.
  if (AOMMIN(cm->width, cm->height) > 720) {
    // Turn off the use of upsampled references for HD resolution
    sf->use_upsampled_references = 0;
  } else if ((AOMMIN(cm->width, cm->height) > 540) &&
             (oxcf->profile != PROFILE_0)) {
    sf->use_upsampled_references = 0;
  }
#else
  if (AOMMIN(cm->width, cm->height) > 1080) {
    sf->use_upsampled_references = 0;
  } else if ((AOMMIN(cm->width, cm->height) > 720) &&
             (oxcf->profile != PROFILE_0)) {
    sf->use_upsampled_references = 0;
  }
#endif  // CONFIG_EXT_REFS

  if (oxcf->mode == GOOD) {
    set_good_speed_feature_framesize_dependent(cpi, sf, oxcf->speed);
  }

  if (sf->disable_split_mask == DISABLE_ALL_SPLIT) {
    sf->adaptive_pred_interp_filter = 0;
  }

  // Check for masked out split cases.
  for (i = 0; i < MAX_REFS; ++i) {
    if (sf->disable_split_mask & (1 << i)) {
      rd->thresh_mult_sub8x8[i] = INT_MAX;
    }
  }

  // This is only used in the motion vector unit test.
  if (cpi->oxcf.motion_vector_unit_test == 1)
    cpi->find_fractional_mv_step = av1_return_max_sub_pixel_mv;
  else if (cpi->oxcf.motion_vector_unit_test == 2)
    cpi->find_fractional_mv_step = av1_return_min_sub_pixel_mv;
}
Example #4
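// Refill the bool reader's bit window: read bytes (big-endian) from the
// bitstream, decrypting first if a callback is set, until 'value' holds as
// many fresh bits as will fit.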
void aom_dk_reader_fill(struct aom_dk_reader *r) {
  const uint8_t *const buffer_end = r->buffer_end;
  const uint8_t *buffer = r->buffer;
  const uint8_t *buffer_start = buffer;
  BD_VALUE value = r->value;
  int count = r->count;
  const size_t bytes_left = buffer_end - buffer;
  const size_t bits_left = bytes_left * CHAR_BIT;
  int shift = BD_VALUE_SIZE - CHAR_BIT - (count + CHAR_BIT);

  if (r->decrypt_cb) {
    size_t n = AOMMIN(sizeof(r->clear_buffer), bytes_left);
    r->decrypt_cb(r->decrypt_state, buffer, r->clear_buffer, (int)n);
    buffer = r->clear_buffer;
    buffer_start = r->clear_buffer;
  }
  if (bits_left > BD_VALUE_SIZE) {
    const int bits = (shift & 0xfffffff8) + CHAR_BIT;
    BD_VALUE nv;
    BD_VALUE big_endian_values;
    memcpy(&big_endian_values, buffer, sizeof(BD_VALUE));
#if SIZE_MAX == 0xffffffffffffffffULL
    big_endian_values = HToBE64(big_endian_values);
#else
    big_endian_values = HToBE32(big_endian_values);
#endif
    nv = big_endian_values >> (BD_VALUE_SIZE - bits);
    count += bits;
    buffer += (bits >> 3);
    value = r->value | (nv << (shift & 0x7));
  } else {
    const int bits_over = (int)(shift + CHAR_BIT - (int)bits_left);
    int loop_end = 0;
    if (bits_over >= 0) {
      count += LOTS_OF_BITS;
      loop_end = bits_over;
    }

    if (bits_over < 0 || bits_left) {
      while (shift >= loop_end) {
        count += CHAR_BIT;
        value |= (BD_VALUE)*buffer++ << shift;
        shift -= CHAR_BIT;
      }
    }
  }

  // 'buffer' may point into 'clear_buffer' after decryption, so advance
  // 'r->buffer' by the distance moved rather than assigning 'buffer' to it.
  r->buffer_end = buffer_end;
  r->buffer += buffer - buffer_start;
  r->value = value;
  r->count = count;
}
Example #5
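// Scan a column of mode-info units at 'col_offset' relative to
// (mi_row, mi_col) and append matching reference-MV candidates to
// ref_mv_stack. Returns the number of NEWMV candidates encountered.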
static uint8_t scan_col_mbmi(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                             const int mi_row, const int mi_col, int block,
                             const MV_REFERENCE_FRAME rf[2],
                             int col_offset,
                             CANDIDATE_MV *ref_mv_stack,
                             uint8_t *refmv_count) {
  const TileInfo *const tile = &xd->tile;
  int i;
  uint8_t newmv_count = 0;

  for (i = 0; i < xd->n8_h && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
    POSITION mi_pos;
    mi_pos.row = i;
    mi_pos.col = col_offset;

    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, &mi_pos)) {
      const MODE_INFO *const candidate_mi =
          xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
      const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
      const int len = AOMMIN(xd->n8_h,
                        num_8x8_blocks_high_lookup[candidate_mbmi->sb_type]);
      newmv_count += add_ref_mv_candidate(xd, candidate_mi, candidate_mbmi, rf,
                                          refmv_count, ref_mv_stack,
                                          cm->allow_high_precision_mv, len,
                                          block, mi_pos.col);
      i += len;
    } else {
      ++i;
    }
  }
  return newmv_count;
}
Example #6
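// Walk the palette color-index map in anti-diagonal (wavefront) order. In
// rate mode (calc_rate) accumulate the cost of coding each index; otherwise
// emit one token per index and optionally update the per-context CDFs.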
static int cost_and_tokenize_map(Av1ColorMapParam *param, TOKENEXTRA **t,
                                 int plane, int calc_rate, int allow_update_cdf,
                                 FRAME_COUNTS *counts, MapCdf map_pb_cdf) {
  const uint8_t *const color_map = param->color_map;
  MapCdf map_cdf = param->map_cdf;
  ColorCost color_cost = param->color_cost;
  const int plane_block_width = param->plane_width;
  const int rows = param->rows;
  const int cols = param->cols;
  const int n = param->n_colors;
  const int palette_size_idx = n - PALETTE_MIN_SIZE;
  int this_rate = 0;
  uint8_t color_order[PALETTE_MAX_SIZE];

  (void)plane;
  (void)counts;

  for (int k = 1; k < rows + cols - 1; ++k) {
    for (int j = AOMMIN(k, cols - 1); j >= AOMMAX(0, k - rows + 1); --j) {
      int i = k - j;
      int color_new_idx;
      const int color_ctx = av1_get_palette_color_index_context(
          color_map, plane_block_width, i, j, n, color_order, &color_new_idx);
      assert(color_new_idx >= 0 && color_new_idx < n);
      if (calc_rate) {
        this_rate += (*color_cost)[palette_size_idx][color_ctx][color_new_idx];
      } else {
        (*t)->token = color_new_idx;
        (*t)->color_map_cdf = map_pb_cdf[palette_size_idx][color_ctx];
        ++(*t);
        if (allow_update_cdf)
          update_cdf(map_cdf[palette_size_idx][color_ctx], color_new_idx, n);
#if CONFIG_ENTROPY_STATS
        if (plane) {
          ++counts->palette_uv_color_index[palette_size_idx][color_ctx]
                                          [color_new_idx];
        } else {
          ++counts->palette_y_color_index[palette_size_idx][color_ctx]
                                         [color_new_idx];
        }
#endif
      }
    }
  }
  if (calc_rate) return this_rate;
  return 0;
}
Example #7
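// Gather candidate motion vectors for 'ref_frame' at (mi_row, mi_col) and
// derive the mode context; with CONFIG_REF_MV, also build the ranked
// reference-MV stack and flag the all-zero-candidate case.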
void av1_find_mv_refs(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
#if CONFIG_REF_MV
                      uint8_t *ref_mv_count,
                      CANDIDATE_MV *ref_mv_stack,
#endif
                      int_mv *mv_ref_list, int mi_row, int mi_col,
                      find_mv_refs_sync sync, void *const data,
                      int16_t *mode_context) {
#if CONFIG_REF_MV
  int idx, all_zero = 1;
  if (ref_frame <= ALTREF_FRAME)
    find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col,
                     sync, data, mode_context);
#else
  find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1, mi_row, mi_col, sync,
                   data, mode_context);
#endif

#if CONFIG_REF_MV
  setup_ref_mv_list(cm, xd, ref_frame,
                    ref_mv_count, ref_mv_stack,
                    mv_ref_list, -1,
                    mi_row, mi_col, mode_context);

  if (*ref_mv_count >= 2) {
    for (idx = 0; idx < AOMMIN(3, *ref_mv_count); ++idx) {
      if (ref_mv_stack[idx].this_mv.as_int != 0)
        all_zero = 0;
      if (ref_frame > ALTREF_FRAME)
        if (ref_mv_stack[idx].comp_mv.as_int != 0)
          all_zero = 0;
    }
  } else if (ref_frame <= ALTREF_FRAME) {
    for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx)
      if (mv_ref_list[idx].as_int != 0)
        all_zero = 0;
  }

  if (all_zero)
    mode_context[ref_frame] |= (1 << ALL_ZERO_FLAG_OFFSET);
#endif
}
Example #8
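// For the second block of a rectangular partition: if the neighboring
// candidate predicts from 'ref_frame' and its MV already sits near the top
// of the ref-MV stack, set the matching skip-NEARESTMV/NEARMV context bits.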
static void handle_sec_rect_block(const MB_MODE_INFO * const candidate,
                                  uint8_t refmv_count,
                                  CANDIDATE_MV *ref_mv_stack,
                                  MV_REFERENCE_FRAME ref_frame,
                                  int16_t *mode_context) {
  int rf, idx;

  for (rf = 0; rf < 2; ++rf) {
    if (candidate->ref_frame[rf] == ref_frame) {
      const int list_range = AOMMIN(refmv_count, MAX_MV_REF_CANDIDATES);
      const int_mv pred_mv = candidate->mv[rf];
      for (idx = 0; idx < list_range; ++idx)
        if (pred_mv.as_int == ref_mv_stack[idx].this_mv.as_int)
          break;

      if (idx < list_range) {
        if (idx == 0)
          mode_context[ref_frame] |= (1 << SKIP_NEARESTMV_OFFSET);
        else if (idx == 1)
          mode_context[ref_frame] |= (1 << SKIP_NEARMV_OFFSET);
      }
    }
  }
}
Example #9
// Update the segmentation map, and related quantities: cyclic refresh map,
// refresh sb_index, and target number of blocks to be refreshed.
// The map is set to either 0/CR_SEGMENT_ID_BASE (no refresh) or to
// 1/CR_SEGMENT_ID_BOOST1 (refresh) for each superblock.
// Blocks labeled as BOOST1 may later get set to BOOST2 (during the
// encoding of the superblock).
static void cyclic_refresh_update_map(AV1_COMP *const cpi) {
  AV1_COMMON *const cm = &cpi->common;
  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
  unsigned char *const seg_map = cpi->segmentation_map;
  int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
  int xmis, ymis, x, y;
  memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
  sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
  sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
  sbs_in_frame = sb_cols * sb_rows;
  // Number of target blocks to get the q delta (segment 1).
  block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
  // Set the segmentation map: cycle through the superblocks, starting at
  // cr->sb_index, and stopping when either block_count blocks have been found
  // to be refreshed, or we have passed through the whole frame.
  assert(cr->sb_index < sbs_in_frame);
  i = cr->sb_index;
  cr->target_num_seg_blocks = 0;
  do {
    int sum_map = 0;
    // Get the mi_row/mi_col corresponding to superblock index i.
    int sb_row_index = (i / sb_cols);
    int sb_col_index = i - sb_row_index * sb_cols;
    int mi_row = sb_row_index * MI_BLOCK_SIZE;
    int mi_col = sb_col_index * MI_BLOCK_SIZE;
    int qindex_thresh =
        cpi->oxcf.content == AOM_CONTENT_SCREEN
            ? av1_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
            : 0;
    assert(mi_row >= 0 && mi_row < cm->mi_rows);
    assert(mi_col >= 0 && mi_col < cm->mi_cols);
    bl_index = mi_row * cm->mi_cols + mi_col;
    // Loop through all 8x8 blocks in superblock and update map.
    xmis =
        AOMMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
    ymis =
        AOMMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
    for (y = 0; y < ymis; y++) {
      for (x = 0; x < xmis; x++) {
        const int bl_index2 = bl_index + y * cm->mi_cols + x;
        // If the block is a candidate for clean-up, mark it for possible
        // boost/refresh (segment 1). The segment id may get reset to 0
        // later if the block gets coded as anything other than ZEROMV.
        if (cr->map[bl_index2] == 0) {
          if (cr->last_coded_q_map[bl_index2] > qindex_thresh) sum_map++;
        } else if (cr->map[bl_index2] < 0) {
          cr->map[bl_index2]++;
        }
      }
    }
    // Enforce constant segment over superblock.
    // If segment is at least half of superblock, set to 1.
    if (sum_map >= xmis * ymis / 2) {
      for (y = 0; y < ymis; y++)
        for (x = 0; x < xmis; x++) {
          seg_map[bl_index + y * cm->mi_cols + x] = CR_SEGMENT_ID_BOOST1;
        }
      cr->target_num_seg_blocks += xmis * ymis;
    }
    i++;
    if (i == sbs_in_frame) {
      i = 0;
    }
  } while (cr->target_num_seg_blocks < block_count && i != cr->sb_index);
  cr->sb_index = i;
}
Example #10
// Prior to coding a given prediction block, of size bsize at (mi_row, mi_col),
// check if we should reset the segment_id, and update the cyclic_refresh map
// and segmentation map.
void av1_cyclic_refresh_update_segment(AV1_COMP *const cpi,
                                       MB_MODE_INFO *const mbmi, int mi_row,
                                       int mi_col, BLOCK_SIZE bsize,
                                       int64_t rate, int64_t dist, int skip) {
  const AV1_COMMON *const cm = &cpi->common;
  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
  const int bw = num_8x8_blocks_wide_lookup[bsize];
  const int bh = num_8x8_blocks_high_lookup[bsize];
  const int xmis = AOMMIN(cm->mi_cols - mi_col, bw);
  const int ymis = AOMMIN(cm->mi_rows - mi_row, bh);
  const int block_index = mi_row * cm->mi_cols + mi_col;
  const int refresh_this_block =
      candidate_refresh_aq(cr, mbmi, rate, dist, bsize);
  // Default is to not update the refresh map.
  int new_map_value = cr->map[block_index];
  int x = 0;
  int y = 0;

  // If this block is labeled for refresh, check if we should reset the
  // segment_id.
  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
    mbmi->segment_id = refresh_this_block;
    // Reset segment_id if the block will be skipped.
    if (skip) mbmi->segment_id = CR_SEGMENT_ID_BASE;
  }

  // Update the cyclic refresh map, to be used for setting the segmentation
  // map for the next frame. If the block will be refreshed this frame, mark
  // it as clean. The magnitude of the negative value influences how long we
  // wait before considering the block for refresh again.
  if (cyclic_refresh_segment_id_boosted(mbmi->segment_id)) {
    new_map_value = -cr->time_for_refresh;
  } else if (refresh_this_block) {
    // Else if it is accepted as candidate for refresh, and has not already
    // been refreshed (marked as 1) then mark it as a candidate for cleanup
    // for future time (marked as 0), otherwise don't update it.
    if (cr->map[block_index] == 1) new_map_value = 0;
  } else {
    // Leave it marked as block that is not candidate for refresh.
    new_map_value = 1;
  }

  // Update entries in the cyclic refresh map with new_map_value, and
  // copy mbmi->segment_id into global segmentation map.
  for (y = 0; y < ymis; y++)
    for (x = 0; x < xmis; x++) {
      int map_offset = block_index + y * cm->mi_cols + x;
      cr->map[map_offset] = new_map_value;
      cpi->segmentation_map[map_offset] = mbmi->segment_id;
      // Inter skip blocks were clearly not coded at the current qindex, so
      // don't update the map for them. For cases where motion is non-zero or
      // the reference frame isn't the previous frame, the previous value in
      // the map for this spatial location is not entirely correct.
      if ((!is_inter_block(mbmi) || !skip) &&
          mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
        cr->last_coded_q_map[map_offset] = clamp(
            cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
      } else if (is_inter_block(mbmi) && skip &&
                 mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
        cr->last_coded_q_map[map_offset] =
            AOMMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
                         0, MAXQ),
                   cr->last_coded_q_map[map_offset]);
      }
    }
}
Example #11
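// Frame-size-dependent speed features for GOOD quality mode; the
// restrictions tighten as the speed setting increases.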
static void set_good_speed_feature_framesize_dependent(AV1_COMP *cpi,
                                                       SPEED_FEATURES *sf,
                                                       int speed) {
  AV1_COMMON *const cm = &cpi->common;

  if (speed >= 1) {
    if (AOMMIN(cm->width, cm->height) >= 720) {
      sf->disable_split_mask =
          cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
      sf->partition_search_breakout_dist_thr = (1 << 23);
    } else {
      sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
      sf->partition_search_breakout_dist_thr = (1 << 21);
    }
  }

  if (speed >= 2) {
    if (AOMMIN(cm->width, cm->height) >= 720) {
      sf->disable_split_mask =
          cm->show_frame ? DISABLE_ALL_SPLIT : DISABLE_ALL_INTER_SPLIT;
      sf->adaptive_pred_interp_filter = 0;
      sf->partition_search_breakout_dist_thr = (1 << 24);
      sf->partition_search_breakout_rate_thr = 120;
    } else {
      sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
      sf->partition_search_breakout_dist_thr = (1 << 22);
      sf->partition_search_breakout_rate_thr = 100;
    }
    sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
  }

  if (speed >= 3) {
    if (AOMMIN(cm->width, cm->height) >= 720) {
      sf->disable_split_mask = DISABLE_ALL_SPLIT;
      sf->schedule_mode_search = cm->base_qindex < 220 ? 1 : 0;
      sf->partition_search_breakout_dist_thr = (1 << 25);
      sf->partition_search_breakout_rate_thr = 200;
    } else {
      sf->max_intra_bsize = BLOCK_32X32;
      sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
      sf->schedule_mode_search = cm->base_qindex < 175 ? 1 : 0;
      sf->partition_search_breakout_dist_thr = (1 << 23);
      sf->partition_search_breakout_rate_thr = 120;
    }
  }

  // If this is a two pass clip that fits the criteria for animated or
  // graphics content, or if the image edge is internal to the coded area,
  // then reset disable_split_mask for speeds 1-4.
  if ((speed >= 1) && (cpi->oxcf.pass == 2) &&
      ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
       (av1_internal_image_edge(cpi)))) {
    sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
  }

  if (speed >= 4) {
    if (AOMMIN(cm->width, cm->height) >= 720) {
      sf->partition_search_breakout_dist_thr = (1 << 26);
    } else {
      sf->partition_search_breakout_dist_thr = (1 << 24);
    }
    sf->disable_split_mask = DISABLE_ALL_SPLIT;
  }
}
Example #12
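// In-place singular value decomposition: factor the m x n matrix u as
// u * diag(w) * v^T. Returns 0 on success, 1 on allocation failure or if a
// singular value fails to converge within max_its iterations.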
static int svdcmp(double **u, int m, int n, double w[], double **v) {
  const int max_its = 30;
  int flag, i, its, j, jj, k, l, nm;
  double anorm, c, f, g, h, s, scale, x, y, z;
  double *rv1 = (double *)aom_malloc(sizeof(*rv1) * (n + 1));
  if (!rv1) return 1;
  g = scale = anorm = 0.0;
  for (i = 0; i < n; i++) {
    l = i + 1;
    rv1[i] = scale * g;
    g = s = scale = 0.0;
    if (i < m) {
      for (k = i; k < m; k++) scale += fabs(u[k][i]);
      if (scale != 0.) {
        for (k = i; k < m; k++) {
          u[k][i] /= scale;
          s += u[k][i] * u[k][i];
        }
        f = u[i][i];
        g = -sign(sqrt(s), f);
        h = f * g - s;
        u[i][i] = f - g;
        for (j = l; j < n; j++) {
          for (s = 0.0, k = i; k < m; k++) s += u[k][i] * u[k][j];
          f = s / h;
          for (k = i; k < m; k++) u[k][j] += f * u[k][i];
        }
        for (k = i; k < m; k++) u[k][i] *= scale;
      }
    }
    w[i] = scale * g;
    g = s = scale = 0.0;
    if (i < m && i != n - 1) {
      for (k = l; k < n; k++) scale += fabs(u[i][k]);
      if (scale != 0.) {
        for (k = l; k < n; k++) {
          u[i][k] /= scale;
          s += u[i][k] * u[i][k];
        }
        f = u[i][l];
        g = -sign(sqrt(s), f);
        h = f * g - s;
        u[i][l] = f - g;
        for (k = l; k < n; k++) rv1[k] = u[i][k] / h;
        for (j = l; j < m; j++) {
          for (s = 0.0, k = l; k < n; k++) s += u[j][k] * u[i][k];
          for (k = l; k < n; k++) u[j][k] += s * rv1[k];
        }
        for (k = l; k < n; k++) u[i][k] *= scale;
      }
    }
    anorm = fmax(anorm, (fabs(w[i]) + fabs(rv1[i])));
  }

  for (i = n - 1; i >= 0; i--) {
    if (i < n - 1) {
      if (g != 0.) {
        for (j = l; j < n; j++) v[j][i] = (u[i][j] / u[i][l]) / g;
        for (j = l; j < n; j++) {
          for (s = 0.0, k = l; k < n; k++) s += u[i][k] * v[k][j];
          for (k = l; k < n; k++) v[k][j] += s * v[k][i];
        }
      }
      for (j = l; j < n; j++) v[i][j] = v[j][i] = 0.0;
    }
    v[i][i] = 1.0;
    g = rv1[i];
    l = i;
  }
  for (i = AOMMIN(m, n) - 1; i >= 0; i--) {
    l = i + 1;
    g = w[i];
    for (j = l; j < n; j++) u[i][j] = 0.0;
    if (g != 0.) {
      g = 1.0 / g;
      for (j = l; j < n; j++) {
        for (s = 0.0, k = l; k < m; k++) s += u[k][i] * u[k][j];
        f = (s / u[i][i]) * g;
        for (k = i; k < m; k++) u[k][j] += f * u[k][i];
      }
      for (j = i; j < m; j++) u[j][i] *= g;
    } else {
      for (j = i; j < m; j++) u[j][i] = 0.0;
    }
    ++u[i][i];
  }
  for (k = n - 1; k >= 0; k--) {
    for (its = 0; its < max_its; its++) {
      flag = 1;
      for (l = k; l >= 0; l--) {
        nm = l - 1;
        if ((double)(fabs(rv1[l]) + anorm) == anorm || nm < 0) {
          flag = 0;
          break;
        }
        if ((double)(fabs(w[nm]) + anorm) == anorm) break;
      }
      if (flag) {
        c = 0.0;
        s = 1.0;
        for (i = l; i <= k; i++) {
          f = s * rv1[i];
          rv1[i] = c * rv1[i];
          if ((double)(fabs(f) + anorm) == anorm) break;
          g = w[i];
          h = pythag(f, g);
          w[i] = h;
          h = 1.0 / h;
          c = g * h;
          s = -f * h;
          for (j = 0; j < m; j++) {
            y = u[j][nm];
            z = u[j][i];
            u[j][nm] = y * c + z * s;
            u[j][i] = z * c - y * s;
          }
        }
      }
      z = w[k];
      if (l == k) {
        if (z < 0.0) {
          w[k] = -z;
          for (j = 0; j < n; j++) v[j][k] = -v[j][k];
        }
        break;
      }
      if (its == max_its - 1) {
        aom_free(rv1);
        return 1;
      }
      assert(k > 0);
      x = w[l];
      nm = k - 1;
      y = w[nm];
      g = rv1[nm];
      h = rv1[k];
      f = ((y - z) * (y + z) + (g - h) * (g + h)) / (2.0 * h * y);
      g = pythag(f, 1.0);
      f = ((x - z) * (x + z) + h * ((y / (f + sign(g, f))) - h)) / x;
      c = s = 1.0;
      for (j = l; j <= nm; j++) {
        i = j + 1;
        g = rv1[i];
        y = w[i];
        h = s * g;
        g = c * g;
        z = pythag(f, h);
        rv1[j] = z;
        c = f / z;
        s = h / z;
        f = x * c + g * s;
        g = g * c - x * s;
        h = y * s;
        y *= c;
        for (jj = 0; jj < n; jj++) {
          x = v[jj][j];
          z = v[jj][i];
          v[jj][j] = x * c + z * s;
          v[jj][i] = z * c - x * s;
        }
        z = pythag(f, h);
        w[j] = z;
        if (z != 0.) {
          z = 1.0 / z;
          c = f * z;
          s = h * z;
        }
        f = c * g + s * y;
        x = c * y - s * g;
        for (jj = 0; jj < m; jj++) {
          y = u[jj][j];
          z = u[jj][i];
          u[jj][j] = y * c + z * s;
          u[jj][i] = z * c - y * s;
        }
      }
      rv1[l] = 0.0;
      rv1[k] = f;
      w[k] = x;
    }
  }
  aom_free(rv1);
  return 0;
}
Example #13
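// For sub-8x8 partitions: derive the nearest/near MVs for sub-block 'block',
// combining frame-level candidates with the MVs of already-coded sub-blocks.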
void av1_append_sub8x8_mvs_for_idx(AV1_COMMON *cm, MACROBLOCKD *xd, int block,
                                   int ref, int mi_row, int mi_col,
                                   int_mv *nearest_mv, int_mv *near_mv) {
  int_mv mv_list[MAX_MV_REF_CANDIDATES];
  MODE_INFO *const mi = xd->mi[0];
  b_mode_info *bmi = mi->bmi;
  int n;

#if CONFIG_REF_MV
  CANDIDATE_MV ref_mv_stack[MAX_REF_MV_STACK_SIZE];
  CANDIDATE_MV tmp_mv;
  uint8_t ref_mv_count = 0, idx;
  uint8_t above_count = 0, left_count = 0;
  MV_REFERENCE_FRAME rf[2] = { mi->mbmi.ref_frame[ref], NONE };
#endif

  assert(MAX_MV_REF_CANDIDATES == 2);

  find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block, mi_row,
                   mi_col, NULL, NULL, NULL);

#if CONFIG_REF_MV
  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
                -1, 0, ref_mv_stack, &ref_mv_count);
  above_count = ref_mv_count;

  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
                0, -1, ref_mv_stack, &ref_mv_count);
  left_count = ref_mv_count - above_count;

  if (above_count > 1 && left_count > 0) {
    tmp_mv = ref_mv_stack[1];
    ref_mv_stack[1] = ref_mv_stack[above_count];
    ref_mv_stack[above_count] = tmp_mv;
  }

  for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, ref_mv_count); ++idx) {
    mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
    clamp_mv_ref(&mv_list[idx].as_mv,
                 xd->n8_w << 3, xd->n8_h << 3, xd);
  }
#endif

  near_mv->as_int = 0;
  switch (block) {
    case 0:
      nearest_mv->as_int = mv_list[0].as_int;
      near_mv->as_int = mv_list[1].as_int;
      break;
    case 1:
    case 2:
      nearest_mv->as_int = bmi[0].as_mv[ref].as_int;
      for (n = 0; n < MAX_MV_REF_CANDIDATES; ++n)
        if (nearest_mv->as_int != mv_list[n].as_int) {
          near_mv->as_int = mv_list[n].as_int;
          break;
        }
      break;
    case 3: {
      int_mv candidates[2 + MAX_MV_REF_CANDIDATES];
      candidates[0] = bmi[1].as_mv[ref];
      candidates[1] = bmi[0].as_mv[ref];
      candidates[2] = mv_list[0];
      candidates[3] = mv_list[1];

      nearest_mv->as_int = bmi[2].as_mv[ref].as_int;
      for (n = 0; n < 2 + MAX_MV_REF_CANDIDATES; ++n)
        if (nearest_mv->as_int != candidates[n].as_int) {
          near_mv->as_int = candidates[n].as_int;
          break;
        }
      break;
    }
    default: assert(0 && "Invalid block index.");
  }
}
Example #14
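// Build the ranked reference-MV candidate stack for 'ref_frame': scan the
// nearest row/column (and top-right corner), weight those candidates, add
// temporal and outer-area candidates, derive the mode context, then sort
// the stack by weight.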
static void setup_ref_mv_list(const AV1_COMMON *cm, const MACROBLOCKD *xd,
                              MV_REFERENCE_FRAME ref_frame,
                              uint8_t *refmv_count,
                              CANDIDATE_MV *ref_mv_stack,
                              int_mv *mv_ref_list,
                              int block, int mi_row, int mi_col,
                              int16_t *mode_context) {
  int idx, nearest_refmv_count = 0;
  uint8_t newmv_count = 0;
  CANDIDATE_MV tmp_mv;
  int len, nr_len;

  const MV_REF *const prev_frame_mvs_base = cm->use_prev_frame_mvs ?
      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;

  const int bs = AOMMAX(xd->n8_w, xd->n8_h);
  const int has_tr = has_top_right(xd, mi_row, mi_col, bs);
  MV_REFERENCE_FRAME rf[2];

  av1_set_ref_frame(rf, ref_frame);
  mode_context[ref_frame] = 0;
  *refmv_count = 0;

  // Scan the first above row mode info.
  newmv_count += scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf,
                               -1, ref_mv_stack, refmv_count);
  // Scan the first left column mode info.
  newmv_count += scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf,
                               -1, ref_mv_stack, refmv_count);

  // Check top-right boundary
  if (has_tr)
    newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
                                 -1, 1, ref_mv_stack, refmv_count);

  nearest_refmv_count = *refmv_count;

  for (idx = 0; idx < nearest_refmv_count; ++idx)
    ref_mv_stack[idx].weight += REF_CAT_LEVEL;

  if (prev_frame_mvs_base && cm->show_frame && cm->last_show_frame &&
      rf[1] == NONE) {
    int blk_row, blk_col;
    int coll_blk_count = 0;

    for (blk_row = 0; blk_row < xd->n8_h; ++blk_row) {
      for (blk_col = 0; blk_col < xd->n8_w; ++blk_col) {
        coll_blk_count += add_col_ref_mv(cm, prev_frame_mvs_base, xd,
                                         mi_row, mi_col, ref_frame,
                                         blk_row, blk_col,
                                         refmv_count, ref_mv_stack,
                                         mode_context);
      }
    }
    if (coll_blk_count == 0)
      mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
  } else {
    mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
  }

  // Scan the second outer area.
  for (idx = 2; idx <= 4; ++idx) {
    scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf,
                  -idx, ref_mv_stack, refmv_count);
    scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf,
                  -idx, ref_mv_stack, refmv_count);
  }

  switch (nearest_refmv_count) {
    case 0:
      mode_context[ref_frame] |= 0;
      if (*refmv_count >= 1)
        mode_context[ref_frame] |= 1;

      if (*refmv_count == 1)
        mode_context[ref_frame] |= (1 << REFMV_OFFSET);
      else if (*refmv_count >= 2)
        mode_context[ref_frame] |= (2 << REFMV_OFFSET);
      break;
    case 1:
      mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;

      if (*refmv_count == 1)
        mode_context[ref_frame] |= (3 << REFMV_OFFSET);
      else if (*refmv_count >= 2)
        mode_context[ref_frame] |= (4 << REFMV_OFFSET);
      break;

    case 2:
    default:
      if (newmv_count >= 2)
        mode_context[ref_frame] |= 4;
      else if (newmv_count == 1)
        mode_context[ref_frame] |= 5;
      else
        mode_context[ref_frame] |= 6;

      mode_context[ref_frame] |= (5 << REFMV_OFFSET);
      break;
  }

  // Rank the likelihood and assign nearest and near mvs.
  len = nearest_refmv_count;
  while (len > 0) {
    nr_len = 0;
    for (idx = 1; idx < len; ++idx) {
      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
        tmp_mv = ref_mv_stack[idx - 1];
        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
        ref_mv_stack[idx] = tmp_mv;
        nr_len = idx;
      }
    }
    len = nr_len;
  }

  len = *refmv_count;
  while (len > nearest_refmv_count) {
    nr_len = nearest_refmv_count;
    for (idx = nearest_refmv_count + 1; idx < len; ++idx) {
      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
        tmp_mv = ref_mv_stack[idx - 1];
        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
        ref_mv_stack[idx] = tmp_mv;
        nr_len = idx;
      }
    }
    len = nr_len;
  }

  if (xd->is_sec_rect) {
    if (xd->n8_w < xd->n8_h) {
      const MODE_INFO *const candidate_mi = xd->mi[-1];
      const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
      handle_sec_rect_block(candidate_mbmi, nearest_refmv_count, ref_mv_stack,
                            ref_frame, mode_context);
    }

    if (xd->n8_w > xd->n8_h) {
      const MODE_INFO *const candidate_mi = xd->mi[-xd->mi_stride];
      const MB_MODE_INFO *const candidate_mbmi = &candidate_mi->mbmi;
      handle_sec_rect_block(candidate_mbmi, nearest_refmv_count, ref_mv_stack,
                            ref_frame, mode_context);
    }
  }

  if (rf[1] > NONE) {
    for (idx = 0; idx < *refmv_count; ++idx) {
      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv,
                   xd->n8_w << 3, xd->n8_h << 3, xd);
      clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv,
                   xd->n8_w << 3, xd->n8_h << 3, xd);
    }
  } else {
    for (idx = 0; idx < AOMMIN(MAX_MV_REF_CANDIDATES, *refmv_count); ++idx) {
      mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
      clamp_mv_ref(&mv_ref_list[idx].as_mv,
                   xd->n8_w << 3, xd->n8_h << 3, xd);
    }
  }
}
Example #15
/* SSE2 version of the rotzoom/affine warp filter */
void av1_warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
                          int stride, uint8_t *pred, int p_col, int p_row,
                          int p_width, int p_height, int p_stride,
                          int subsampling_x, int subsampling_y, int ref_frm,
                          int16_t alpha, int16_t beta, int16_t gamma,
                          int16_t delta) {
  __m128i tmp[15];
  int i, j, k;

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      // (x, y) coordinates of the center of this block in the destination
      // image
      int32_t dst_x = p_col + j + 4;
      int32_t dst_y = p_row + i + 4;

      int32_t x4, y4, ix4, sx4, iy4, sy4;
      if (subsampling_x)
        x4 = ROUND_POWER_OF_TWO_SIGNED(
            mat[2] * 2 * dst_x + mat[3] * 2 * dst_y + mat[0] +
                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
            1);
      else
        x4 = mat[2] * dst_x + mat[3] * dst_y + mat[0];

      if (subsampling_y)
        y4 = ROUND_POWER_OF_TWO_SIGNED(
            mat[4] * 2 * dst_x + mat[5] * 2 * dst_y + mat[1] +
                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
            1);
      else
        y4 = mat[4] * dst_x + mat[5] * dst_y + mat[1];

      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Horizontal filter
      for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
        int iy = iy4 + k;
        if (iy < 0)
          iy = 0;
        else if (iy > height - 1)
          iy = height - 1;

        // If the block is aligned such that, after clamping, every sample
        // would be taken from the leftmost/rightmost column, then we can
        // skip the expensive horizontal filter.
        if (ix4 <= -7) {
          tmp[k + 7] = _mm_set1_epi16(
              ref[iy * stride] *
              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
        } else if (ix4 >= width + 6) {
          tmp[k + 7] = _mm_set1_epi16(
              ref[iy * stride + (width - 1)] *
              (1 << (WARPEDPIXEL_FILTER_BITS - HORSHEAR_REDUCE_PREC_BITS)));
        } else {
          int sx = sx4 + alpha * (-4) + beta * k +
                   // Include rounding and offset here
                   (1 << (WARPEDDIFF_PREC_BITS - 1)) +
                   (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

          // Load source pixels
          __m128i zero = _mm_setzero_si128();
          __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));

          // Filter even-index pixels
          __m128i tmp_0 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_2 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_4 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_6 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

          // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
          __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
          // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
          __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
          __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
          // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
          __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

          // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
          __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
          // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
          __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
          // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
          __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
          // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
          __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

          __m128i round_const =
              _mm_set1_epi32((1 << HORSHEAR_REDUCE_PREC_BITS) >> 1);

          // Calculate filtered results
          __m128i src_0 = _mm_unpacklo_epi8(src, zero);
          __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
          __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
          __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
          __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
          __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
          __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
          __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                           _mm_add_epi32(res_2, res_6));
          res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                    HORSHEAR_REDUCE_PREC_BITS);

          // Filter odd-index pixels
          __m128i tmp_1 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_3 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_5 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
          __m128i tmp_7 = _mm_loadu_si128(
              (__m128i *)(filter + ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

          __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
          __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
          __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
          __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

          __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
          __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
          __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
          __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

          __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
          __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
          __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
          __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
          __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
          __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
          __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
          __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                          _mm_add_epi32(res_3, res_7));
          res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                   HORSHEAR_REDUCE_PREC_BITS);

          // Combine results into one register.
          // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
          // as this order helps with the vertical filter.
          tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
        }
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + gamma * (-4) + delta * k +
                 (1 << (WARPEDDIFF_PREC_BITS - 1)) +
                 (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        __m128i *src = tmp + (k + 4);
        __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                         _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(filter + ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                        _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        // Round and pack into 8 bits
        __m128i round_const =
            _mm_set1_epi32((1 << VERSHEAR_REDUCE_PREC_BITS) >> 1);

        __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), VERSHEAR_REDUCE_PREC_BITS);
        __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), VERSHEAR_REDUCE_PREC_BITS);

        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

        // Store, blending with 'pred' if needed
        __m128i *p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

        // Note: If we're outputting a 4x4 block, we need to be very careful
        // to only output 4 pixels at this point, to avoid encode/decode
        // mismatches when encoding with multiple threads.
        if (p_width == 4) {
          if (ref_frm) {
            const __m128i orig = _mm_cvtsi32_si128(*(uint32_t *)p);
            res_8bit = _mm_avg_epu8(res_8bit, orig);
          }
          *(uint32_t *)p = _mm_cvtsi128_si32(res_8bit);
        } else {
          if (ref_frm) res_8bit = _mm_avg_epu8(res_8bit, _mm_loadl_epi64(p));
          _mm_storel_epi64(p, res_8bit);
        }
      }
    }
  }
}
Example #16
// Setup cyclic background refresh: set delta q and segmentation map.
void av1_cyclic_refresh_setup(AV1_COMP *const cpi) {
  AV1_COMMON *const cm = &cpi->common;
  const RATE_CONTROL *const rc = &cpi->rc;
  CYCLIC_REFRESH *const cr = cpi->cyclic_refresh;
  struct segmentation *const seg = &cm->seg;
  const int apply_cyclic_refresh = apply_cyclic_refresh_bitrate(cm, rc);
  if (cm->current_video_frame == 0) cr->low_content_avg = 0.0;
  // Don't apply refresh on key frame or enhancement layer frames.
  if (!apply_cyclic_refresh || cm->frame_type == KEY_FRAME) {
    // Set segmentation map to 0 and disable.
    unsigned char *const seg_map = cpi->segmentation_map;
    memset(seg_map, 0, cm->mi_rows * cm->mi_cols);
    av1_disable_segmentation(&cm->seg);
    if (cm->frame_type == KEY_FRAME) {
      memset(cr->last_coded_q_map, MAXQ,
             cm->mi_rows * cm->mi_cols * sizeof(*cr->last_coded_q_map));
      cr->sb_index = 0;
    }
    return;
  } else {
    int qindex_delta = 0;
    int qindex2;
    const double q = av1_convert_qindex_to_q(cm->base_qindex, cm->bit_depth);
    aom_clear_system_state();
    // Set rate threshold to some multiple (set to 2 for now) of the target
    // rate (target is given by sb64_target_rate and scaled by 256).
    cr->thresh_rate_sb = ((int64_t)(rc->sb64_target_rate) << 8) << 2;
    // Distortion threshold, quadratic in Q, scale factor to be adjusted.
    // q will not exceed 457, so (q * q) is within 32bit; see:
    // av1_convert_qindex_to_q(), av1_ac_quant(), ac_qlookup*[].
    cr->thresh_dist_sb = ((int64_t)(q * q)) << 2;

    // Set up segmentation.
    // Clear down the segment map.
    av1_enable_segmentation(&cm->seg);
    av1_clearall_segfeatures(seg);
    // Select delta coding method.
    seg->abs_delta = SEGMENT_DELTADATA;

    // Note: setting temporal_update has no effect, as the seg-map coding
    // method (temporal or spatial) is determined in
    // av1_choose_segmap_coding_method(), based on the coding cost of each
    // method. When error_resilient mode is on, last_frame_seg_map is set to
    // 0, so if temporal coding is used, it is relative to an all-zero
    // previous map.
    // seg->temporal_update = 0;

    // Segment BASE "Q" feature is disabled so it defaults to the baseline Q.
    av1_disable_segfeature(seg, CR_SEGMENT_ID_BASE, SEG_LVL_ALT_Q);
    // Use segment BOOST1 for in-frame Q adjustment.
    av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q);
    // Use segment BOOST2 for more aggressive in-frame Q adjustment.
    av1_enable_segfeature(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q);

    // Set the q delta for segment BOOST1.
    qindex_delta = compute_deltaq(cpi, cm->base_qindex, cr->rate_ratio_qdelta);
    cr->qindex_delta[1] = qindex_delta;

    // Compute rd-mult for segment BOOST1.
    qindex2 = clamp(cm->base_qindex + cm->y_dc_delta_q + qindex_delta, 0, MAXQ);

    cr->rdmult = av1_compute_rd_mult(cpi, qindex2);

    av1_set_segdata(seg, CR_SEGMENT_ID_BOOST1, SEG_LVL_ALT_Q, qindex_delta);

    // Set a more aggressive (higher) q delta for segment BOOST2.
    qindex_delta = compute_deltaq(
        cpi, cm->base_qindex,
        AOMMIN(CR_MAX_RATE_TARGET_RATIO,
               0.1 * cr->rate_boost_fac * cr->rate_ratio_qdelta));
    cr->qindex_delta[2] = qindex_delta;
    av1_set_segdata(seg, CR_SEGMENT_ID_BOOST2, SEG_LVL_ALT_Q, qindex_delta);

    // Update the segmentation and refresh map.
    cyclic_refresh_update_map(cpi);
  }
}
Example #17
File: dering.c Project: brion/aomedia
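// Apply the deringing filter to the frame one superblock at a time, using
// line and column buffers of saved pixels so that filtering never reads
// already-filtered output.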
void av1_dering_frame(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                      MACROBLOCKD *xd, int global_level) {
  int r, c;
  int sbr, sbc;
  int nhsb, nvsb;
  int16_t src[OD_DERING_INBUF_SIZE];
  int16_t *linebuf[3];
  int16_t colbuf[3][OD_BSIZE_MAX + 2 * OD_FILT_VBORDER][OD_FILT_HBORDER];
  dering_list dlist[MAX_MIB_SIZE * MAX_MIB_SIZE];
  unsigned char *row_dering, *prev_row_dering, *curr_row_dering;
  int dering_count;
  int dir[OD_DERING_NBLOCKS][OD_DERING_NBLOCKS] = { { 0 } };
  int stride;
  int bsize[3];
  int dec[3];
  int pli;
  int dering_left;
  int coeff_shift = AOMMAX(cm->bit_depth - 8, 0);
  int nplanes;
  if (xd->plane[1].subsampling_x == xd->plane[1].subsampling_y &&
      xd->plane[2].subsampling_x == xd->plane[2].subsampling_y)
    nplanes = 3;
  else
    nplanes = 1;
  nvsb = (cm->mi_rows + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
  nhsb = (cm->mi_cols + MAX_MIB_SIZE - 1) / MAX_MIB_SIZE;
  av1_setup_dst_planes(xd->plane, frame, 0, 0);
  row_dering = aom_malloc(sizeof(*row_dering) * (nhsb + 2) * 2);
  memset(row_dering, 1, sizeof(*row_dering) * (nhsb + 2) * 2);
  prev_row_dering = row_dering + 1;
  curr_row_dering = prev_row_dering + nhsb + 2;
  for (pli = 0; pli < nplanes; pli++) {
    dec[pli] = xd->plane[pli].subsampling_x;
    bsize[pli] = OD_DERING_SIZE_LOG2 - dec[pli];
  }
  stride = (cm->mi_cols << bsize[0]) + 2 * OD_FILT_HBORDER;
  for (pli = 0; pli < nplanes; pli++) {
    linebuf[pli] =
        aom_malloc(sizeof(*linebuf[pli]) * OD_FILT_VBORDER * stride);
  }
  for (sbr = 0; sbr < nvsb; sbr++) {
    for (pli = 0; pli < nplanes; pli++) {
      for (r = 0; r < (MAX_MIB_SIZE << bsize[pli]) + 2 * OD_FILT_VBORDER; r++) {
        for (c = 0; c < OD_FILT_HBORDER; c++) {
          colbuf[pli][r][c] = OD_DERING_VERY_LARGE;
        }
      }
    }
    dering_left = 1;
    for (sbc = 0; sbc < nhsb; sbc++) {
      int level;
      int nhb, nvb;
      int cstart = 0;
      if (!dering_left) cstart = -OD_FILT_HBORDER;
      nhb = AOMMIN(MAX_MIB_SIZE, cm->mi_cols - MAX_MIB_SIZE * sbc);
      nvb = AOMMIN(MAX_MIB_SIZE, cm->mi_rows - MAX_MIB_SIZE * sbr);
      level = compute_level_from_index(
          global_level, cm->mi_grid_visible[MAX_MIB_SIZE * sbr * cm->mi_stride +
                                            MAX_MIB_SIZE * sbc]
                            ->mbmi.dering_gain);
      curr_row_dering[sbc] = 0;
      if (level == 0 ||
          (dering_count = sb_compute_dering_list(
               cm, sbr * MAX_MIB_SIZE, sbc * MAX_MIB_SIZE, dlist)) == 0) {
        dering_left = 0;
        continue;
      }
      curr_row_dering[sbc] = 1;
      for (pli = 0; pli < nplanes; pli++) {
        int16_t dst[OD_BSIZE_MAX * OD_BSIZE_MAX];
        int threshold;
        int coffset;
        int rend, cend;
        if (sbc == nhsb - 1)
          cend = (nhb << bsize[pli]);
        else
          cend = (nhb << bsize[pli]) + OD_FILT_HBORDER;
        if (sbr == nvsb - 1)
          rend = (nvb << bsize[pli]);
        else
          rend = (nvb << bsize[pli]) + OD_FILT_VBORDER;
        coffset = sbc * MAX_MIB_SIZE << bsize[pli];
        if (sbc == nhsb - 1) {
          /* On the last superblock column, fill in the right border with
             OD_DERING_VERY_LARGE to avoid filtering with the outside. */
          for (r = 0; r < rend + OD_FILT_VBORDER; r++) {
            for (c = cend; c < (nhb << bsize[pli]) + OD_FILT_HBORDER; ++c) {
              src[r * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER] =
                  OD_DERING_VERY_LARGE;
            }
          }
        }
        if (sbr == nvsb - 1) {
          /* On the last superblock row, fill in the bottom border with
             OD_DERING_VERY_LARGE to avoid filtering with the outside. */
          for (r = rend; r < rend + OD_FILT_VBORDER; r++) {
            for (c = 0; c < (nhb << bsize[pli]) + 2 * OD_FILT_HBORDER; c++) {
              src[(r + OD_FILT_VBORDER) * OD_FILT_BSTRIDE + c] =
                  OD_DERING_VERY_LARGE;
            }
          }
        }
        /* Copy in the pixels we need from the current superblock for
           deringing.*/
        copy_sb8_16(
            cm,
            &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER + cstart],
            OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
            (MAX_MIB_SIZE << bsize[pli]) * sbr, coffset + cstart,
            xd->plane[pli].dst.stride, rend, cend - cstart);
        if (!prev_row_dering[sbc]) {
          copy_sb8_16(cm, &src[OD_FILT_HBORDER], OD_FILT_BSTRIDE,
                      xd->plane[pli].dst.buf,
                      (MAX_MIB_SIZE << bsize[pli]) * sbr - OD_FILT_VBORDER,
                      coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER,
                      nhb << bsize[pli]);
        } else if (sbr > 0) {
          for (r = 0; r < OD_FILT_VBORDER; r++) {
            for (c = 0; c < nhb << bsize[pli]; c++) {
              src[r * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER] =
                  linebuf[pli][r * stride + coffset + c];
            }
          }
        } else {
          for (r = 0; r < OD_FILT_VBORDER; r++) {
            for (c = 0; c < nhb << bsize[pli]; c++) {
              src[r * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER] =
                  OD_DERING_VERY_LARGE;
            }
          }
        }
        if (!prev_row_dering[sbc - 1]) {
          copy_sb8_16(cm, src, OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
                      (MAX_MIB_SIZE << bsize[pli]) * sbr - OD_FILT_VBORDER,
                      coffset - OD_FILT_HBORDER, xd->plane[pli].dst.stride,
                      OD_FILT_VBORDER, OD_FILT_HBORDER);
        } else if (sbr > 0 && sbc > 0) {
          for (r = 0; r < OD_FILT_VBORDER; r++) {
            for (c = -OD_FILT_HBORDER; c < 0; c++) {
              src[r * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER] =
                  linebuf[pli][r * stride + coffset + c];
            }
          }
        } else {
          for (r = 0; r < OD_FILT_VBORDER; r++) {
            for (c = -OD_FILT_HBORDER; c < 0; c++) {
              src[r * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER] =
                  OD_DERING_VERY_LARGE;
            }
          }
        }
        if (!prev_row_dering[sbc + 1]) {
          copy_sb8_16(cm, &src[OD_FILT_HBORDER + (nhb << bsize[pli])],
                      OD_FILT_BSTRIDE, xd->plane[pli].dst.buf,
                      (MAX_MIB_SIZE << bsize[pli]) * sbr - OD_FILT_VBORDER,
                      coffset + (nhb << bsize[pli]), xd->plane[pli].dst.stride,
                      OD_FILT_VBORDER, OD_FILT_HBORDER);
        } else if (sbr > 0 && sbc < nhsb - 1) {
          for (r = 0; r < OD_FILT_VBORDER; r++) {
            for (c = nhb << bsize[pli];
                 c < (nhb << bsize[pli]) + OD_FILT_HBORDER; c++) {
              src[r * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER] =
                  linebuf[pli][r * stride + coffset + c];
            }
          }
        } else {
          for (r = 0; r < OD_FILT_VBORDER; r++) {
            for (c = nhb << bsize[pli];
                 c < (nhb << bsize[pli]) + OD_FILT_HBORDER; c++) {
              src[r * OD_FILT_BSTRIDE + c + OD_FILT_HBORDER] =
                  OD_DERING_VERY_LARGE;
            }
          }
        }
        if (dering_left) {
          /* If we deringed the superblock on the left then we need to copy in
             saved pixels. */
          for (r = 0; r < rend + OD_FILT_VBORDER; r++) {
            for (c = 0; c < OD_FILT_HBORDER; c++) {
              src[r * OD_FILT_BSTRIDE + c] = colbuf[pli][r][c];
            }
          }
        }
        for (r = 0; r < rend + OD_FILT_VBORDER; r++) {
          for (c = 0; c < OD_FILT_HBORDER; c++) {
            /* Saving pixels in case we need to dering the superblock on the
               right. */
            colbuf[pli][r][c] =
                src[r * OD_FILT_BSTRIDE + c + (nhb << bsize[pli])];
          }
        }
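        /* Save the bottom OD_FILT_VBORDER rows of this superblock while they
           are still unfiltered; they become the top border when the next
           superblock row is deringed. */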
        copy_sb8_16(cm, &linebuf[pli][coffset], stride, xd->plane[pli].dst.buf,
                    (MAX_MIB_SIZE << bsize[pli]) * (sbr + 1) - OD_FILT_VBORDER,
                    coffset, xd->plane[pli].dst.stride, OD_FILT_VBORDER,
                    (nhb << bsize[pli]));

        /* FIXME: This is a temporary hack that uses more conservative
           deringing for chroma. */
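        /* (level * 5 + 4) >> 3 rounds level * 5 / 8 to the nearest integer,
           so chroma gets roughly 5/8 of the luma strength. */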
        if (pli)
          threshold = (level * 5 + 4) >> 3 << coeff_shift;
        else
          threshold = level << coeff_shift;
        if (threshold == 0) continue;
        od_dering(
            dst, &src[OD_FILT_VBORDER * OD_FILT_BSTRIDE + OD_FILT_HBORDER],
            dec[pli], dir, pli, dlist, dering_count, threshold, coeff_shift);
#if CONFIG_AOM_HIGHBITDEPTH
        if (cm->use_highbitdepth) {
          copy_dering_16bit_to_16bit(
              (int16_t *)&CONVERT_TO_SHORTPTR(
                  xd->plane[pli]
                      .dst.buf)[xd->plane[pli].dst.stride *
                                    (MAX_MIB_SIZE * sbr << bsize[pli]) +
                                (sbc * MAX_MIB_SIZE << bsize[pli])],
              xd->plane[pli].dst.stride, dst, dlist, dering_count,
              3 - dec[pli]);
        } else {
#endif
          copy_dering_16bit_to_8bit(
              &xd->plane[pli].dst.buf[xd->plane[pli].dst.stride *
                                          (MAX_MIB_SIZE * sbr << bsize[pli]) +
                                      (sbc * MAX_MIB_SIZE << bsize[pli])],
              xd->plane[pli].dst.stride, dst, dlist, dering_count, bsize[pli]);
#if CONFIG_AOM_HIGHBITDEPTH
        }
#endif
      }
      dering_left = 1;
    }
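    /* This row's dering decisions become the "previous row" flags for the
       next superblock row. */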
    {
      unsigned char *tmp;
      tmp = prev_row_dering;
      prev_row_dering = curr_row_dering;
      curr_row_dering = tmp;
    }
  }
Example #18
File: clpf_rdo.c  Project: brion/aomedia
// Calculate the square error of all filter settings.  Result:
// res[0][0]   : unfiltered
// res[0][1-3] : strength=1,2,4, no signals
// (Only for luma:)
// res[1][0]   : (bit count, fb size = 128)
// res[1][1-3] : strength=1,2,4, fb size = 128
// res[1][4]   : unfiltered, including skip
// res[1][5-7] : strength=1,2,4, including skip, fb_size = 128
// res[2][0]   : (bit count, fb size = 64)
// res[2][1-3] : strength=1,2,4, fb size = 64
// res[3][0]   : (bit count, fb size = 32)
// res[3][1-3] : strength=1,2,4, fb size = 32
static int clpf_rdo(int y, int x, const YV12_BUFFER_CONFIG *rec,
                    const YV12_BUFFER_CONFIG *org, const AV1_COMMON *cm,
                    unsigned int block_size, unsigned int fb_size_log2, int w,
                    int h, int64_t res[4][8], int plane) {
  int c, m, n, filtered = 0;
  int sum[8];
  const int subx = plane != AOM_PLANE_Y && rec->subsampling_x;
  const int suby = plane != AOM_PLANE_Y && rec->subsampling_y;
  int bslog = get_msb(block_size);
  uint8_t *rec_buffer =
      plane != AOM_PLANE_Y
          ? (plane == AOM_PLANE_U ? rec->u_buffer : rec->v_buffer)
          : rec->y_buffer;
  uint8_t *org_buffer =
      plane != AOM_PLANE_Y
          ? (plane == AOM_PLANE_U ? org->u_buffer : org->v_buffer)
          : org->y_buffer;
  int rec_width = plane != AOM_PLANE_Y ? rec->uv_crop_width : rec->y_crop_width;
  int rec_height =
      plane != AOM_PLANE_Y ? rec->uv_crop_height : rec->y_crop_height;
  int rec_stride = plane != AOM_PLANE_Y ? rec->uv_stride : rec->y_stride;
  int org_stride = plane != AOM_PLANE_Y ? org->uv_stride : org->y_stride;
  sum[0] = sum[1] = sum[2] = sum[3] = sum[4] = sum[5] = sum[6] = sum[7] = 0;
  if (plane == AOM_PLANE_Y &&
      fb_size_log2 > (unsigned int)get_msb(MAX_FB_SIZE) - 3) {
    int w1, h1, w2, h2, i, sum1, sum2, sum3, oldfiltered;

    filtered = fb_size_log2-- == MAX_FB_SIZE_LOG2;
    w1 = AOMMIN(1 << (fb_size_log2 - bslog), w);
    h1 = AOMMIN(1 << (fb_size_log2 - bslog), h);
    w2 = AOMMIN(w - (1 << (fb_size_log2 - bslog)), w >> 1);
    h2 = AOMMIN(h - (1 << (fb_size_log2 - bslog)), h >> 1);
    i = get_msb(MAX_FB_SIZE) - fb_size_log2;
    sum1 = (int)res[i][1];
    sum2 = (int)res[i][2];
    sum3 = (int)res[i][3];
    oldfiltered = (int)res[i][0];
    res[i][0] = 0;

    filtered |= clpf_rdo(y, x, rec, org, cm, block_size, fb_size_log2, w1, h1,
                         res, plane);
    if (1 << (fb_size_log2 - bslog) < w)
      filtered |= clpf_rdo(y, x + (1 << fb_size_log2), rec, org, cm, block_size,
                           fb_size_log2, w2, h1, res, plane);
    if (1 << (fb_size_log2 - bslog) < h) {
      filtered |= clpf_rdo(y + (1 << fb_size_log2), x, rec, org, cm, block_size,
                           fb_size_log2, w1, h2, res, plane);
      filtered |=
          clpf_rdo(y + (1 << fb_size_log2), x + (1 << fb_size_log2), rec, org,
                   cm, block_size, fb_size_log2, w2, h2, res, plane);
    }

    // Correct sums for unfiltered blocks
    res[i][1] = AOMMIN(sum1 + res[i][0], res[i][1]);
    res[i][2] = AOMMIN(sum2 + res[i][0], res[i][2]);
    res[i][3] = AOMMIN(sum3 + res[i][0], res[i][3]);
    if (i == 1) {
      res[i][5] = AOMMIN(sum1 + res[i][4], res[i][5]);
      res[i][6] = AOMMIN(sum2 + res[i][4], res[i][6]);
      res[i][7] = AOMMIN(sum3 + res[i][4], res[i][7]);
    }

    res[i][0] = oldfiltered + filtered;  // Number of signal bits

    return filtered;
  }
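
A minimal sketch (not from the source) of how a caller could consume the
res[] layout documented above: res[0][0] holds the unfiltered SSE and
res[0][1-3] the SSE for strengths 1, 2 and 4, so choosing a strength is an
argmin over those entries.  The helper name is hypothetical, and it ignores
the signaling cost that the real RDO also weighs in.

#include <stdint.h>

static int clpf_pick_strength(const int64_t res[4][8]) {
  static const int strengths[4] = { 0, 1, 2, 4 }; /* 0 = unfiltered */
  int best = 0, i;
  for (i = 1; i < 4; i++)
    if (res[0][i] < res[0][best]) best = i;
  return strengths[best];
}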
Example #19
int64_t refine_integerized_param(WarpedMotionParams *wm,
                                 TransformationType wmtype, int use_hbd, int bd,
                                 uint8_t *ref, int r_width, int r_height,
                                 int r_stride, uint8_t *dst, int d_width,
                                 int d_height, int d_stride, int n_refinements,
                                 int64_t best_frame_error) {
  static const int max_trans_model_params[TRANS_TYPES] = { 0, 2, 4, 6 };
  const int border = ERRORADV_BORDER;
  int i = 0, p;
  int n_params = max_trans_model_params[wmtype];
  int32_t *param_mat = wm->wmmat;
  int64_t step_error, best_error;
  int32_t step;
  int32_t *param;
  int32_t curr_param;
  int32_t best_param;

  force_wmtype(wm, wmtype);
  best_error = av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
                              dst + border * d_stride + border, border, border,
                              d_width - 2 * border, d_height - 2 * border,
                              d_stride, 0, 0, best_frame_error);
  best_error = AOMMIN(best_error, best_frame_error);
  step = 1 << (n_refinements - 1);
  for (i = 0; i < n_refinements; i++, step >>= 1) {
    for (p = 0; p < n_params; ++p) {
      int step_dir = 0;
      // Skip searches for parameters that are forced to be 0
      param = param_mat + p;
      curr_param = *param;
      best_param = curr_param;
      // look to the left
      *param = add_param_offset(p, curr_param, -step);
      step_error =
          av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
                         dst + border * d_stride + border, border, border,
                         d_width - 2 * border, d_height - 2 * border, d_stride,
                         0, 0, best_error);
      if (step_error < best_error) {
        best_error = step_error;
        best_param = *param;
        step_dir = -1;
      }

      // look to the right
      *param = add_param_offset(p, curr_param, step);
      step_error =
          av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
                         dst + border * d_stride + border, border, border,
                         d_width - 2 * border, d_height - 2 * border, d_stride,
                         0, 0, best_error);
      if (step_error < best_error) {
        best_error = step_error;
        best_param = *param;
        step_dir = 1;
      }
      *param = best_param;

      // look to the direction chosen above repeatedly until error increases
      // for the biggest step size
      while (step_dir) {
        *param = add_param_offset(p, best_param, step * step_dir);
        step_error =
            av1_warp_error(wm, use_hbd, bd, ref, r_width, r_height, r_stride,
                           dst + border * d_stride + border, border, border,
                           d_width - 2 * border, d_height - 2 * border,
                           d_stride, 0, 0, best_error);
        if (step_error < best_error) {
          best_error = step_error;
          best_param = *param;
        } else {
          *param = best_param;
          step_dir = 0;
        }
      }
    }
  }
  force_wmtype(wm, wmtype);
  wm->wmtype = get_gmtype(wm);
  return best_error;
}
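
The loop above is an integer coordinate descent: the step starts at
1 << (n_refinements - 1) and halves on every pass, each parameter is probed
one step to either side, and the search then walks in the winning direction
until the error stops improving.  A minimal, hypothetical 1-D version of the
same schedule, with err() standing in for av1_warp_error():

#include <stdint.h>

static int64_t refine_1d(int32_t *param, int n_refinements,
                         int64_t (*err)(int32_t)) {
  int64_t best_error = err(*param);
  int32_t step;
  for (step = 1 << (n_refinements - 1); step > 0; step >>= 1) {
    int dir = 0;
    int64_t e;
    /* Probe one step to each side of the current value. */
    e = err(*param - step);
    if (e < best_error) {
      best_error = e;
      dir = -1;
    }
    e = err(*param + step);
    if (e < best_error) {
      best_error = e;
      dir = 1;
    }
    if (dir) {
      *param += dir * step;
      /* Walk in the winning direction while the error keeps improving. */
      while ((e = err(*param + dir * step)) < best_error) {
        best_error = e;
        *param += dir * step;
      }
    }
  }
  return best_error;
}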
Example #20
void av1_tokenize_sb_vartx(const AV1_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                           RUN_TYPE dry_run, int mi_row, int mi_col,
                           BLOCK_SIZE bsize, int *rate,
                           uint8_t allow_update_cdf) {
  const AV1_COMMON *const cm = &cpi->common;
  const int num_planes = av1_num_planes(cm);
  MACROBLOCK *const x = &td->mb;
  MACROBLOCKD *const xd = &x->e_mbd;
  MB_MODE_INFO *const mbmi = xd->mi[0];
  (void)t;
  struct tokenize_b_args arg = { cpi, td, t, 0, allow_update_cdf };
  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return;

  if (mbmi->skip) {
    av1_reset_skip_context(xd, mi_row, mi_col, bsize, num_planes);
    return;
  }

  for (int plane = 0; plane < num_planes; ++plane) {
    if (!is_chroma_reference(mi_row, mi_col, bsize,
                             xd->plane[plane].subsampling_x,
                             xd->plane[plane].subsampling_y)) {
      continue;
    }
    const struct macroblockd_plane *const pd = &xd->plane[plane];
    const BLOCK_SIZE bsizec =
        scale_chroma_bsize(bsize, pd->subsampling_x, pd->subsampling_y);
    const BLOCK_SIZE plane_bsize =
        get_plane_block_size(bsizec, pd->subsampling_x, pd->subsampling_y);
    const int mi_width = block_size_wide[plane_bsize] >> tx_size_wide_log2[0];
    const int mi_height = block_size_high[plane_bsize] >> tx_size_high_log2[0];
    const TX_SIZE max_tx_size = get_vartx_max_txsize(xd, plane_bsize, plane);
    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
    int bw = block_size_wide[txb_size] >> tx_size_wide_log2[0];
    int bh = block_size_high[txb_size] >> tx_size_high_log2[0];
    int idx, idy;
    int block = 0;
    int step = tx_size_wide_unit[max_tx_size] * tx_size_high_unit[max_tx_size];

    const BLOCK_SIZE max_unit_bsize =
        get_plane_block_size(BLOCK_64X64, pd->subsampling_x, pd->subsampling_y);
    int mu_blocks_wide =
        block_size_wide[max_unit_bsize] >> tx_size_wide_log2[0];
    int mu_blocks_high =
        block_size_high[max_unit_bsize] >> tx_size_high_log2[0];

    mu_blocks_wide = AOMMIN(mi_width, mu_blocks_wide);
    mu_blocks_high = AOMMIN(mi_height, mu_blocks_high);

    for (idy = 0; idy < mi_height; idy += mu_blocks_high) {
      for (idx = 0; idx < mi_width; idx += mu_blocks_wide) {
        int blk_row, blk_col;
        const int unit_height = AOMMIN(mu_blocks_high + idy, mi_height);
        const int unit_width = AOMMIN(mu_blocks_wide + idx, mi_width);
        for (blk_row = idy; blk_row < unit_height; blk_row += bh) {
          for (blk_col = idx; blk_col < unit_width; blk_col += bw) {
            tokenize_vartx(td, t, dry_run, max_tx_size, plane_bsize, blk_row,
                           blk_col, block, plane, &arg);
            block += step;
          }
        }
      }
    }
  }
  if (rate) *rate += arg.this_rate;
}
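
For a sense of the unit arithmetic above: tx_size_wide_log2[0] is the log2
width of the smallest transform (4x4), i.e. 2, so mi_width, mi_height, bw and
bh all count 4x4 units.  For a 64x64 luma block with max_tx_size TX_32X32,
mi_width = 64 >> 2 = 16, bw = bh = 32 >> 2 = 8, and step = 8 * 8 = 64, so
each tokenize_vartx() call advances the coefficient-block index by one full
32x32 transform.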
Example #21
void av1_encode_tiles_mt(AV1_COMP *cpi) {
  AV1_COMMON *const cm = &cpi->common;
  const int tile_cols = cm->tile_cols;
  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  const int num_workers = AOMMIN(cpi->oxcf.max_threads, tile_cols);
  int i;

  av1_init_tile_data(cpi);

  // Only run once to create threads and allocate thread data.
  if (cpi->num_workers == 0) {
    CHECK_MEM_ERROR(cm, cpi->workers,
                    aom_malloc(num_workers * sizeof(*cpi->workers)));

    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
                    aom_calloc(num_workers, sizeof(*cpi->tile_thr_data)));

    for (i = 0; i < num_workers; i++) {
      AVxWorker *const worker = &cpi->workers[i];
      EncWorkerData *const thread_data = &cpi->tile_thr_data[i];

      ++cpi->num_workers;
      winterface->init(worker);

      thread_data->cpi = cpi;

      if (i < num_workers - 1) {
        // Allocate thread data.
        CHECK_MEM_ERROR(cm, thread_data->td,
                        aom_memalign(32, sizeof(*thread_data->td)));
        av1_zero(*thread_data->td);

        // Set up pc_tree.
        thread_data->td->leaf_tree = NULL;
        thread_data->td->pc_tree = NULL;
        av1_setup_pc_tree(cm, thread_data->td);

        // Set up variance tree if needed.
        if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
          av1_setup_var_tree(cm, thread_data->td);

        // Allocate frame counters in thread data.
        CHECK_MEM_ERROR(cm, thread_data->td->counts,
                        aom_calloc(1, sizeof(*thread_data->td->counts)));

        // Create threads
        if (!winterface->reset(worker))
          aom_internal_error(&cm->error, AOM_CODEC_ERROR,
                             "Tile encoder thread creation failed");
      } else {
        // Main thread acts as a worker and uses the thread data in cpi.
        thread_data->td = &cpi->td;
      }

      winterface->sync(worker);
    }
  }

  for (i = 0; i < num_workers; i++) {
    AVxWorker *const worker = &cpi->workers[i];
    EncWorkerData *thread_data;

    worker->hook = (AVxWorkerHook)enc_worker_hook;
    worker->data1 = &cpi->tile_thr_data[i];
    worker->data2 = NULL;
    thread_data = (EncWorkerData *)worker->data1;

    // Before encoding a frame, copy the thread data from cpi.
    if (thread_data->td != &cpi->td) {
      thread_data->td->mb = cpi->td.mb;
      thread_data->td->rd_counts = cpi->td.rd_counts;
    }
    if (thread_data->td->counts != &cpi->common.counts) {
      memcpy(thread_data->td->counts, &cpi->common.counts,
             sizeof(cpi->common.counts));
    }

#if CONFIG_PALETTE
    // Allocate buffers used by palette coding mode.
    if (cpi->common.allow_screen_content_tools && i < num_workers - 1) {
      MACROBLOCK *x = &thread_data->td->mb;
      CHECK_MEM_ERROR(cm, x->palette_buffer,
                      aom_memalign(16, sizeof(*x->palette_buffer)));
    }
#endif  // CONFIG_PALETTE
  }

  // Encode a frame
  for (i = 0; i < num_workers; i++) {
    AVxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;

    // Set the starting tile for each thread.
    thread_data->start = i;

    if (i == cpi->num_workers - 1)
      winterface->execute(worker);
    else
      winterface->launch(worker);
  }

  // Encoding ends.
  for (i = 0; i < num_workers; i++) {
    AVxWorker *const worker = &cpi->workers[i];
    winterface->sync(worker);
  }

  for (i = 0; i < num_workers; i++) {
    AVxWorker *const worker = &cpi->workers[i];
    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;

    // Accumulate counters.
    if (i < cpi->num_workers - 1) {
      av1_accumulate_frame_counts(&cm->counts, thread_data->td->counts);
      accumulate_rd_opt(&cpi->td, thread_data->td);
    }
  }
}
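
Note the launch/execute split above: the last worker runs through execute(),
which invokes the hook synchronously on the calling thread, so the main
thread encodes a share of the tiles instead of idling; the other workers are
launch()ed onto their own threads and joined with sync().  The loop filter
below uses the same pattern.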
Example #22
static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, AV1_COMMON *cm,
                                struct macroblockd_plane *planes, int start,
                                int stop, int y_only, AVxWorker *workers,
                                int nworkers, AV1LfSync *lf_sync) {
#if CONFIG_EXT_PARTITION
  printf(
      "STOPPING: This code has not been modified to work with the "
      "extended coding unit size experiment");
  exit(EXIT_FAILURE);
#endif  // CONFIG_EXT_PARTITION

  const AVxWorkerInterface *const winterface = aom_get_worker_interface();
  // Number of superblock rows and cols
  const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2;
  // Decoder may allocate more threads than number of tiles based on user's
  // input.
  const int tile_cols = cm->tile_cols;
  const int num_workers = AOMMIN(nworkers, tile_cols);
  int i;

  if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
      num_workers > lf_sync->num_workers) {
    av1_loop_filter_dealloc(lf_sync);
    av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
  }

// Set up loopfilter thread data.
// The decoder is capping num_workers because it has been observed that using
// more threads on the loopfilter than there are cores will hurt performance
// on Android. This is because the system will only schedule the tile decode
// workers on cores equal to the number of tile columns. Then if the decoder
// tries to use more threads for the loopfilter, it will hurt performance
// because of contention. If the multithreading code changes in the future
// then the number of workers used by the loopfilter should be revisited.

#if CONFIG_PARALLEL_DEBLOCKING
  // Initialize cur_sb_col to -1 for all SB rows.
  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  // Filter all the vertical edges in the whole frame
  for (i = 0; i < num_workers; ++i) {
    AVxWorker *const worker = &workers[i];
    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

    worker->hook = (AVxWorkerHook)loop_filter_ver_row_worker;
    worker->data1 = lf_sync;
    worker->data2 = lf_data;

    // Loopfilter data
    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
    lf_data->start = start + i * cm->mib_size;
    lf_data->stop = stop;
    lf_data->y_only = y_only;

    // Start loopfiltering
    if (i == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait till all rows are finished
  for (i = 0; i < num_workers; ++i) {
    winterface->sync(&workers[i]);
  }

  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
  // Filter all the horizontal edges in the whole frame
  for (i = 0; i < num_workers; ++i) {
    AVxWorker *const worker = &workers[i];
    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

    worker->hook = (AVxWorkerHook)loop_filter_hor_row_worker;
    worker->data1 = lf_sync;
    worker->data2 = lf_data;

    // Loopfilter data
    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
    lf_data->start = start + i * cm->mib_size;
    lf_data->stop = stop;
    lf_data->y_only = y_only;

    // Start loopfiltering
    if (i == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait till all rows are finished
  for (i = 0; i < num_workers; ++i) {
    winterface->sync(&workers[i]);
  }
#else   // CONFIG_PARALLEL_DEBLOCKING
  // Initialize cur_sb_col to -1 for all SB rows.
  memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);

  for (i = 0; i < num_workers; ++i) {
    AVxWorker *const worker = &workers[i];
    LFWorkerData *const lf_data = &lf_sync->lfdata[i];

    worker->hook = (AVxWorkerHook)loop_filter_row_worker;
    worker->data1 = lf_sync;
    worker->data2 = lf_data;

    // Loopfilter data
    av1_loop_filter_data_reset(lf_data, frame, cm, planes);
    lf_data->start = start + i * cm->mib_size;
    lf_data->stop = stop;
    lf_data->y_only = y_only;

    // Start loopfiltering
    if (i == num_workers - 1) {
      winterface->execute(worker);
    } else {
      winterface->launch(worker);
    }
  }

  // Wait till all rows are finished
  for (i = 0; i < num_workers; ++i) {
    winterface->sync(&workers[i]);
  }
#endif  // CONFIG_PARALLEL_DEBLOCKING
}
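
With CONFIG_PARALLEL_DEBLOCKING the frame is filtered in two frame-wide
passes, all vertical edges first and then all horizontal edges, so vertical
filtering is complete everywhere before any horizontal filtering starts;
without it, a single pass handles both edge directions per superblock row and
relies on lf_sync's row-progress ordering.  In both paths,
lf_data->start = start + i * cm->mib_size staggers the workers by one
superblock row each, and cur_sb_col is reset to -1 so the per-row progress
counters start clean.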