예제 #1
/*
 * op_plan_core: return the execution plan for a loop over a set, reusing a
 * cached plan when one matches and building a new one otherwise.
 *
 * NOTE(review): this listing appears to have lost lines relative to a
 * compilable version (closing braces, several loop bodies and a few comment
 * delimiters are absent), so the notes below describe only what the remaining
 * statements show. Restore the missing lines from the upstream file before
 * building.
 *
 * Visible steps:
 *   1. exec_length starts at set->size and is increased by set->exec_size
 *      for an argument with opt set, idx != -1 and an access other than
 *      OP_READ.
 *   2. OP_plans[0 .. OP_plan_index) is searched for a plan with the same
 *      name, set, nargs, ninds, part_size and per-argument dat/map/idx/acc;
 *      a match is returned directly.
 *   3. Otherwise a new entry OP_plans[ip] is filled in: block size choice,
 *      staged-indirection bookkeeping, per-block lists of distinct indirect
 *      elements (qsort, then de-duplicate), thread colouring inside each
 *      block, an optional permutation by colour (OP_STAGE_PERMUTE and
 *      OP_COLOR2), block colouring, shared-memory sizes and data-transfer
 *      estimates.
 *   4. With OP_diags > 1 a summary is printed; op_plan_check is called; the
 *      elapsed wall time is added to the matching OP_kernels entry and to
 *      OP_plan_time.
 *
 * Parameters:
 *   name      - kernel name; compared with strcmp and stored in the plan by
 *               pointer (not copied), so presumably it must outlive the plan.
 *   set       - set being iterated; size, core_size and exec_size are read.
 *   part_size - requested block size; 0 selects one automatically.
 *   nargs     - number of entries in args and inds.
 *   args      - argument descriptors (opt, idx, acc, dat, map, argtype read).
 *   ninds     - number of indirection sets; values in inds index arrays of
 *               this length.
 *   inds      - per-argument indirection index; negative means the argument
 *               is directly addressed.
 *   staging   - compared against OP_STAGE_INC, OP_STAGE_ALL,
 *               OP_STAGE_PERMUTE and OP_COLOR2.
 *
 * Returns: pointer to the cached or newly built entry in OP_plans.
 */
op_plan *op_plan_core(char const *name, op_set set, int part_size, int nargs,
                      op_arg *args, int ninds, int *inds, int staging) {
  // set exec length
  int exec_length = set->size;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_READ) {
      exec_length += set->exec_size;

  /* first look for an existing execution plan */

  int ip = 0, match = 0;

  while (match == 0 && ip < OP_plan_index) {
    if ((strcmp(name, OP_plans[ip].name) == 0) && (set == OP_plans[ip].set) &&
        (nargs == OP_plans[ip].nargs) && (ninds == OP_plans[ip].ninds) &&
        (part_size == OP_plans[ip].part_size)) {
      match = 1;
      for (int m = 0; m < nargs; m++) {
        if (args[m].dat != NULL && OP_plans[ip].dats[m] != NULL)
          match = match && (args[m].dat->size == OP_plans[ip].dats[m]->size) &&
                  (args[m].dat->dim == OP_plans[ip].dats[m]->dim) &&
                  (args[m].map == OP_plans[ip].maps[m]) &&
                  (args[m].idx == OP_plans[ip].idxs[m]) &&
                  (args[m].acc == OP_plans[ip].accs[m]);
          match = match && (args[m].dat == OP_plans[ip].dats[m]) &&
                  (args[m].map == OP_plans[ip].maps[m]) &&
                  (args[m].idx == OP_plans[ip].idxs[m]) &&
                  (args[m].acc == OP_plans[ip].accs[m]);

  if (match) {
    if (OP_diags > 3)
      printf(" old execution plan #%d\n", ip);
    return &(OP_plans[ip]);
  } else {
    if (OP_diags > 1)
      printf(" new execution plan #%d for kernel %s\n", ip, name);
  double wall_t1, wall_t2, cpu_t1, cpu_t2;
  op_timers_core(&cpu_t1, &wall_t1);
  /* work out worst case shared memory requirement per element */

  int halo_exchange = 0;
  for (int i = 0; i < nargs; i++) {
    if (args[i].opt && args[i].idx != -1 && args[i].acc != OP_WRITE &&
        args[i].acc != OP_INC) {
      halo_exchange = 1;

  int maxbytes = 0;
  for (int m = 0; m < nargs; m++) {
    if (args[m].opt && inds[m] >= 0) {
      if ((staging == OP_STAGE_INC && args[m].acc == OP_INC) ||
          (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))
        maxbytes += args[m].dat->size;

  /* set blocksize and number of blocks; adaptive size based on 48kB of shared
   * memory */

  int bsize = part_size; // blocksize
  if (bsize == 0 && maxbytes > 0)
    bsize = MAX((24 * 1024 / (64 * maxbytes)) * 64,
                256); // 48kB exactly is too much, make it 24
  else if (bsize == 0 && maxbytes == 0)
    bsize = 256;

  // If we do 1 level of coloring, do it in one go
  if (staging == OP_COLOR2)
    bsize = exec_length;

  int nblocks = 0;

  int indirect_reduce = 0;
  for (int m = 0; m < nargs; m++) {
    indirect_reduce |=
        (args[m].acc != OP_READ && args[m].argtype == OP_ARG_GBL);
  indirect_reduce &= (ninds > 0);

  /* Work out indirection arrays for OP_INCs */
  int ninds_staged = 0; // number of distinct (unique dat) indirect incs
  int *inds_staged = (int *)op_malloc(nargs * sizeof(int));
  int *inds_to_inds_staged = (int *)op_malloc(ninds * sizeof(int));

  for (int i = 0; i < nargs; i++)
    inds_staged[i] = -1;
  for (int i = 0; i < ninds; i++)
    inds_to_inds_staged[i] = -1;
  for (int i = 0; i < nargs; i++) {
    if (inds[i] >= 0 &&
        ((staging == OP_STAGE_INC && args[i].acc == OP_INC) ||
         (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE))) {
      if (inds_to_inds_staged[inds[i]] == -1) {
        inds_to_inds_staged[inds[i]] = ninds_staged;
        inds_staged[i] = ninds_staged;
      } else {
        inds_staged[i] = inds_to_inds_staged[inds[i]];

  int *invinds_staged = (int *)op_malloc(ninds_staged * sizeof(int));
  for (int i = 0; i < ninds_staged; i++)
    invinds_staged[i] = -1;
  for (int i = 0; i < nargs; i++)
    if (inds[i] >= 0 &&
        ((staging == OP_STAGE_INC && args[i].acc == OP_INC) ||
         (staging == OP_STAGE_ALL || staging == OP_STAGE_PERMUTE)) &&
        invinds_staged[inds_staged[i]] == -1)
      invinds_staged[inds_staged[i]] = i;

  int prev_offset = 0;
  int next_offset = 0;

  while (next_offset < exec_length) {
    prev_offset = next_offset;
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
               indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length &&
               prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;

  // If we do 1 level of coloring, we have a single "block"
  if (staging == OP_COLOR2) {
    nblocks = 1;
    prev_offset = 0;
    next_offset = exec_length;

  /* enlarge OP_plans array if needed */

  if (ip == OP_plan_max) {
    // printf("allocating more memory for OP_plans %d\n", OP_plan_max);
    OP_plan_max += 10;
    // NOTE(review): the result overwrites OP_plans directly, so the previous
    // array is unreachable if the call returns NULL - confirm this is intended.
    OP_plans = (op_plan *)op_realloc(OP_plans, OP_plan_max * sizeof(op_plan));
    if (OP_plans == NULL) {
      printf(" op_plan error -- error reallocating memory for OP_plans\n");

  /* allocate memory for new execution plan and store input arguments */

  OP_plans[ip].dats = (op_dat *)op_malloc(nargs * sizeof(op_dat));
  OP_plans[ip].idxs = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].optflags = (int *)op_malloc(nargs * sizeof(int));
  OP_plans[ip].maps = (op_map *)op_malloc(nargs * sizeof(op_map));
  OP_plans[ip].accs = (op_access *)op_malloc(nargs * sizeof(op_access));
  // NOTE(review): this buffer is replaced further down by the local
  // inds_staged array; unless a dropped line frees it, the allocation here
  // looks leaked, and its op_access element type differs from the int array
  // stored later - confirm.
  OP_plans[ip].inds_staged =
      (op_access *)op_malloc(ninds_staged * sizeof(op_access));

  OP_plans[ip].nthrcol = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].thrcol = (int *)op_malloc(exec_length * sizeof(int));
  OP_plans[ip].col_reord = (int *)op_malloc((exec_length + 16) * sizeof(int));
  OP_plans[ip].col_offsets = NULL;
  OP_plans[ip].offset = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ind_maps = (int **)op_malloc(ninds_staged * sizeof(int *));
  OP_plans[ip].ind_offs =
      (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].ind_sizes =
      (int *)op_malloc(nblocks * ninds_staged * sizeof(int));
  OP_plans[ip].nindirect = (int *)op_calloc(ninds, sizeof(int));
  OP_plans[ip].loc_maps = (short **)op_malloc(nargs * sizeof(short *));
  OP_plans[ip].nelems = (int *)op_malloc(nblocks * sizeof(int));
  OP_plans[ip].ncolblk =
      (int *)op_calloc(exec_length, sizeof(int)); /* max possibly needed */
  OP_plans[ip].blkmap = (int *)op_calloc(nblocks, sizeof(int));

  int *offsets = (int *)op_malloc((ninds_staged + 1) * sizeof(int));
  offsets[0] = 0;
  for (int m = 0; m < ninds_staged; m++) {
    int count = 0;
    for (int m2 = 0; m2 < nargs; m2++)
      if (inds_staged[m2] == m)
    offsets[m + 1] = offsets[m] + count;
  OP_plans[ip].ind_map =
      (int *)op_malloc(offsets[ninds_staged] * exec_length * sizeof(int));
  for (int m = 0; m < ninds_staged; m++) {
    OP_plans[ip].ind_maps[m] = &OP_plans[ip].ind_map[exec_length * offsets[m]];

  int counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0)
      OP_plans[ip].loc_maps[m] = NULL;

    OP_plans[ip].dats[m] = args[m].dat;
    OP_plans[ip].idxs[m] = args[m].idx;
    OP_plans[ip].optflags[m] = args[m].opt;
    OP_plans[ip].maps[m] = args[m].map;
    OP_plans[ip].accs[m] = args[m].acc;

  OP_plans[ip].loc_map =
      (short *)op_malloc(counter * exec_length * sizeof(short));
  counter = 0;
  for (int m = 0; m < nargs; m++) {
    if (inds_staged[m] >= 0) {
      OP_plans[ip].loc_maps[m] = &OP_plans[ip].loc_map[exec_length * (counter)];

  OP_plans[ip].name = name;
  OP_plans[ip].set = set;
  OP_plans[ip].nargs = nargs;
  OP_plans[ip].ninds = ninds;
  OP_plans[ip].ninds_staged = ninds_staged;
  OP_plans[ip].part_size = part_size;
  OP_plans[ip].nblocks = nblocks;
  OP_plans[ip].ncolors_core = 0;
  OP_plans[ip].ncolors_owned = 0;
  OP_plans[ip].count = 1;
  OP_plans[ip].inds_staged = inds_staged;


  /* define aliases */

  op_dat *dats = OP_plans[ip].dats;
  int *idxs = OP_plans[ip].idxs;
  op_map *maps = OP_plans[ip].maps;
  op_access *accs = OP_plans[ip].accs;

  int *offset = OP_plans[ip].offset;
  int *nelems = OP_plans[ip].nelems;
  int **ind_maps = OP_plans[ip].ind_maps;
  int *ind_offs = OP_plans[ip].ind_offs;
  int *ind_sizes = OP_plans[ip].ind_sizes;
  int *nindirect = OP_plans[ip].nindirect;

  /* allocate working arrays */
  uint **work;
  work = (uint **)op_malloc(ninds * sizeof(uint *));

  for (int m = 0; m < ninds; m++) {
    int m2 = 0;
    while (inds[m2] != m)
    if (args[m2].opt == 0) {
      work[m] = NULL;

    int to_size = (maps[m2]->to)->exec_size + (maps[m2]->to)->nonexec_size +
    work[m] = (uint *)op_malloc(to_size * sizeof(uint));

  int *work2;
  work2 =
      (int *)op_malloc(nargs * bsize * sizeof(int)); /* max possibly needed */

  /* process set one block at a time */

  float total_colors = 0;

  prev_offset = 0;
  next_offset = 0;
  for (int b = 0; b < nblocks; b++) {
    prev_offset = next_offset;
    if (prev_offset + bsize >= set->core_size && prev_offset < set->core_size) {
      next_offset = set->core_size;
    } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
               indirect_reduce) {
      next_offset = set->size;
    } else if (prev_offset + bsize >= exec_length &&
               prev_offset < exec_length) {
      next_offset = exec_length;
    } else {
      next_offset = prev_offset + bsize;

    if (staging == OP_COLOR2) {
      prev_offset = 0;
      next_offset = exec_length;
    int bs = next_offset - prev_offset;

    offset[b] = prev_offset; /* offset for block */
    nelems[b] = bs;          /* size of block */

    /* loop over indirection sets */
    for (int m = 0; m < ninds; m++) {
      int m2 = 0;
      while (inds[m2] != m)
      int m3 = inds_staged[m2];
      if (m3 < 0)
      if (args[m2].opt == 0) {
        if (b == 0) {
          ind_offs[m3 + b * ninds_staged] = 0;
          ind_sizes[m3 + b * ninds_staged] = 0;
        } else {
          ind_offs[m3 + b * ninds_staged] =
              ind_offs[m3 + (b - 1) * ninds_staged];
          ind_sizes[m3 + b * ninds_staged] = 0;
      /* build the list of elements indirectly referenced in this block */

      int ne = 0; /* number of elements */
      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            work2[ne++] = maps[m2]->map[idxs[m2] + e * maps[m2]->dim];

      /* sort them, then eliminate duplicates */

      qsort(work2, ne, sizeof(int), comp);

      int nde = 0;
      int p = 0;
      while (p < ne) {
        work2[nde] = work2[p];
        while (p < ne && work2[p] == work2[nde])
      ne = nde; /* number of distinct elements */

         if (OP_diags > 5) { printf(" indirection set %d: ",m); for (int e=0;
         e<ne; e++) printf("
         %d",work2[e]); printf(" \n"); } */

      /* store mapping and renumbered mappings in execution plan */

      for (int e = 0; e < ne; e++) {
        ind_maps[m3][nindirect[m]++] = work2[e];
        work[m][work2[e]] = e; // inverse mapping

      for (int m2 = 0; m2 < nargs; m2++) {
        if (inds[m2] == m) {
          for (int e = prev_offset; e < next_offset; e++)
            OP_plans[ip].loc_maps[m2][e] =
                (short)(work[m][maps[m2]->map[idxs[m2] + e * maps[m2]->dim]]);

      if (b == 0) {
        ind_offs[m3 + b * ninds_staged] = 0;
        ind_sizes[m3 + b * ninds_staged] = nindirect[m];
      } else {
        ind_offs[m3 + b * ninds_staged] =
            ind_offs[m3 + (b - 1) * ninds_staged] +
            ind_sizes[m3 + (b - 1) * ninds_staged];
        ind_sizes[m3 + b * ninds_staged] =
            nindirect[m] - ind_offs[m3 + b * ninds_staged];

    /* now colour main set elements */

    for (int e = prev_offset; e < next_offset; e++)
      OP_plans[ip].thrcol[e] = -1;

    int repeat = 1;
    int ncolor = 0;
    int ncolors = 0;

    while (repeat) {
      repeat = 0;

      for (int m = 0; m < nargs; m++) {
        if (inds[m] >= 0 && args[m].opt)
          for (int e = prev_offset; e < next_offset; e++)
            work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] =
                0; /* zero out color array */

      for (int e = prev_offset; e < next_offset; e++) {
        if (OP_plans[ip].thrcol[e] == -1) {
          int mask = 0;
          if (staging == OP_COLOR2 && halo_exchange && e >= set->core_size &&
              ncolor == 0)
            mask = 1;
          for (int m = 0; m < nargs; m++)
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
              mask |=
                      [maps[m]->map[idxs[m] +
                                    e * maps[m]->dim]]; /* set bits of mask */

          int color = ffs(~mask) - 1; /* find first bit not set */
          if (color == -1) {          /* run out of colors on this pass */
            repeat = 1;
          } else {
            OP_plans[ip].thrcol[e] = ncolor + color;
            mask = 1 << color;
            ncolors = MAX(ncolors, ncolor + color + 1);

            for (int m = 0; m < nargs; m++)
              if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |=
                    mask; /* set color bit */

      ncolor += 32; /* increment base level */

    OP_plans[ip].nthrcol[b] =
        ncolors; /* number of thread colors in this block */
    total_colors += ncolors;

    // if(ncolors>1) printf(" number of colors in this block = %d \n",ncolors);

  /* create element permutation by color */
  if (staging == OP_STAGE_PERMUTE || staging == OP_COLOR2) {
    int size_of_col_offsets = 0;
    for (int b = 0; b < nblocks; b++) {
      size_of_col_offsets += OP_plans[ip].nthrcol[b] + 1;
    // allocate
    OP_plans[ip].col_offsets = (int **)op_malloc(nblocks * sizeof(int *));
    // NOTE(review): the elements are int but the size uses sizeof(int *);
    // this looks like an over-allocation - confirm.
    int *col_offsets = (int *)op_malloc(size_of_col_offsets * sizeof(int *));

    size_of_col_offsets = 0;
    op_keyvalue *kv = (op_keyvalue *)op_malloc(bsize * sizeof(op_keyvalue));
    for (int b = 0; b < nblocks; b++) {
      int ncolor = OP_plans[ip].nthrcol[b];
      for (int e = 0; e < nelems[b]; e++) {
        kv[e].key = OP_plans[ip].thrcol[offset[b] + e];
        kv[e].value = e;
      qsort(kv, nelems[b], sizeof(op_keyvalue), comp2);
      OP_plans[ip].col_offsets[b] = col_offsets + size_of_col_offsets;
      OP_plans[ip].col_offsets[b][0] = 0;
      size_of_col_offsets += (ncolor + 1);

      // Set up permutation and pointers to beginning of each color
      ncolor = 0;
      for (int e = 0; e < nelems[b]; e++) {
        OP_plans[ip].thrcol[offset[b] + e] = kv[e].key;
        OP_plans[ip].col_reord[offset[b] + e] = kv[e].value;
        if (e > 0)
          if (kv[e].key > kv[e - 1].key) {
            OP_plans[ip].col_offsets[b][ncolor] = e;
      OP_plans[ip].col_offsets[b][ncolor + 1] = nelems[b];
    for (int i = exec_length; i < exec_length + 16; i++)
      OP_plans[ip].col_reord[i] = 0;

  /* color the blocks, after initialising colors to 0 */

  int *blk_col;

  blk_col = (int *)op_malloc(nblocks * sizeof(int));
  for (int b = 0; b < nblocks; b++)
    blk_col[b] = -1;

  int repeat = 1;
  int ncolor = 0;
  int ncolors = 0;

  while (repeat) {
    repeat = 0;

    for (int m = 0; m < nargs; m++) {
      if (inds[m] >= 0 && args[m].opt) {
        int to_size = (maps[m]->to)->exec_size + (maps[m]->to)->nonexec_size +
        for (int e = 0; e < to_size; e++)
          work[inds[m]][e] = 0; // zero out color arrays
    prev_offset = 0;
    next_offset = 0;
    for (int b = 0; b < nblocks; b++) {
      prev_offset = next_offset;

      if (prev_offset + bsize >= set->core_size &&
          prev_offset < set->core_size) {
        next_offset = set->core_size;
      } else if (prev_offset + bsize >= set->size && prev_offset < set->size &&
                 indirect_reduce) {
        next_offset = set->size;
      } else if (prev_offset + bsize >= exec_length &&
                 prev_offset < exec_length) {
        next_offset = exec_length;
      } else {
        next_offset = prev_offset + bsize;
      if (blk_col[b] == -1) { // color not yet assigned to block
        uint mask = 0;
        if (next_offset > set->core_size) { // should not use block colors from
                                            // the core set when doing the
                                            // non_core ones
          if (prev_offset <= set->core_size)
            OP_plans[ip].ncolors_core = ncolors;
          for (int shifter = 0; shifter < OP_plans[ip].ncolors_core; shifter++)
            mask |= 1 << shifter;
          if (prev_offset == set->size && indirect_reduce)
            OP_plans[ip].ncolors_owned = ncolors;
          for (int shifter = OP_plans[ip].ncolors_core;
               indirect_reduce && shifter < OP_plans[ip].ncolors_owned;
            mask |= 1 << shifter;

        for (int m = 0; m < nargs; m++) {
          if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
            for (int e = prev_offset; e < next_offset; e++)
              mask |= work[inds[m]]
                          [maps[m]->map[idxs[m] + e * maps[m]->dim]]; // set
                                                                      // bits of
                                                                      // mask

        int color = ffs(~mask) - 1; // find first bit not set
        if (color == -1) {          // run out of colors on this pass
          repeat = 1;
        } else {
          blk_col[b] = ncolor + color;
          mask = 1 << color;
          ncolors = MAX(ncolors, ncolor + color + 1);

          for (int m = 0; m < nargs; m++) {
            if (inds[m] >= 0 && (accs[m] == OP_INC || accs[m] == OP_RW) &&
              for (int e = prev_offset; e < next_offset; e++)
                work[inds[m]][maps[m]->map[idxs[m] + e * maps[m]->dim]] |= mask;

    ncolor += 32; // increment base level

  /* store block mapping and number of blocks per color */

  if (indirect_reduce && OP_plans[ip].ncolors_owned == 0)
    OP_plans[ip].ncolors_owned =
        ncolors; // no MPI, so get the reduction arrays after everything is done
  OP_plans[ip].ncolors = ncolors;
  if (staging == OP_COLOR2)
    OP_plans[ip].ncolors = OP_plans[ip].nthrcol[0];

  /*for(int col = 0; col = OP_plans[ip].ncolors;col++) //should initialize to
    zero because op_calloc returns garbage!!
    OP_plans[ip].ncolblk[col] = 0;

  for (int b = 0; b < nblocks; b++)
    OP_plans[ip].ncolblk[blk_col[b]]++; // number of blocks of each color

  for (int c = 1; c < ncolors; c++)
    OP_plans[ip].ncolblk[c] += OP_plans[ip].ncolblk[c - 1]; // cumsum

  for (int c = 0; c < ncolors; c++)
    work2[c] = 0;

  for (int b = 0; b < nblocks; b++) {
    int c = blk_col[b];
    int b2 = work2[c]; // number of preceding blocks of this color
    if (c > 0)
      b2 += OP_plans[ip].ncolblk[c - 1]; // plus previous colors

    OP_plans[ip].blkmap[b2] = b;

    work2[c]++; // increment counter

  for (int c = ncolors - 1; c > 0; c--)
    OP_plans[ip].ncolblk[c] -= OP_plans[ip].ncolblk[c - 1]; // undo cumsum

  /* reorder blocks by color? */

  /* work out shared memory requirements */
  OP_plans[ip].nsharedCol = (int *)op_malloc(ncolors * sizeof(int));
  float total_shared = 0;
  for (int col = 0; col < ncolors; col++) {
    OP_plans[ip].nsharedCol[col] = 0;
    for (int b = 0; b < nblocks; b++) {
      if (blk_col[b] == col) {
        int nbytes = 0;
        for (int m = 0; m < ninds_staged; m++) {
          int m2 = 0;
          while (inds_staged[m2] != m)
          if (args[m2].opt == 0)

          nbytes +=
              ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
        OP_plans[ip].nsharedCol[col] =
            MAX(OP_plans[ip].nsharedCol[col], nbytes);
        total_shared += nbytes;

  OP_plans[ip].nshared = 0;
  total_shared = 0;

  for (int b = 0; b < nblocks; b++) {
    int nbytes = 0;
    for (int m = 0; m < ninds_staged; m++) {
      int m2 = 0;
      while (inds_staged[m2] != m)
      if (args[m2].opt == 0)

      nbytes += ROUND_UP_64(ind_sizes[m + b * ninds_staged] * dats[m2]->size);
    OP_plans[ip].nshared = MAX(OP_plans[ip].nshared, nbytes);
    total_shared += nbytes;

  /* work out total bandwidth requirements */

  OP_plans[ip].transfer = 0;
  OP_plans[ip].transfer2 = 0;
  float transfer3 = 0;

  if (staging != OP_COLOR2 && staging != OP_STAGE_INC) {
    for (int b = 0; b < nblocks; b++) {
      for (int m = 0; m < nargs; m++) // for each argument
        if (args[m].opt) {
          if (inds[m] < 0) // if it is directly addressed
            float fac = 2.0f;
            if (accs[m] == OP_READ ||
                accs[m] == OP_WRITE) // if you only read or write it
              fac = 1.0f;
            if (dats[m] != NULL) {
              OP_plans[ip].transfer +=
                  fac * nelems[b] * dats[m]->size; // cost of reading it all
              OP_plans[ip].transfer2 += fac * nelems[b] * dats[m]->size;
              transfer3 += fac * nelems[b] * dats[m]->size;
          } else // if it is indirectly addressed: cost of reading the pointer
                 // to it
            OP_plans[ip].transfer += nelems[b] * sizeof(short);
            OP_plans[ip].transfer2 += nelems[b] * sizeof(short);
            transfer3 += nelems[b] * sizeof(short);
      for (int m = 0; m < ninds; m++) // for each indirect mapping
        int m2 = 0;
        while (inds[m2] != m) // find the first argument that uses this mapping
        if (args[m2].opt == 0)
        float fac = 2.0f;
        if (accs[m2] == OP_READ || accs[m2] == OP_WRITE) // only read it
          fac = 1.0f;
        // NOTE(review): this test appears to sit inside the branch guarded by
        // staging != OP_STAGE_INC above, so it looks unreachable - confirm.
        if (staging == OP_STAGE_INC && accs[m2] != OP_INC) {
          OP_plans[ip].transfer += 1;
          OP_plans[ip].transfer2 += 1;
        OP_plans[ip].transfer +=
            fac * ind_sizes[m + b * ninds] *
            dats[m2]->size; // simply read all data one by one

        /* work out how many cache lines are used by indirect addressing */

        int i_map, l_new, l_old;
        int e0 = ind_offs[m + b * ninds];       // where it starts
        int e1 = e0 + ind_sizes[m + b * ninds]; // where it ends

        l_old = -1;

        for (int e = e0; e < e1;
             e++) // iterate through every indirectly accessed data element
          i_map = ind_maps[m][e]; // the pointer to the data element
          l_new = (i_map * dats[m2]->size) /
                  OP_cache_line_size; // which cache line it is on (full size,
                                      // dim*sizeof(type))
          if (l_new > l_old) // if it is on a further cache line (that is not
                             // yet loaded, - i_map is ordered)
            OP_plans[ip].transfer2 +=
                fac * OP_cache_line_size; // load the cache line
          l_old = l_new;
          l_new = ((i_map + 1) * dats[m2]->size - 1) /
                  OP_cache_line_size; // the last byte of the data
          OP_plans[ip].transfer2 += fac * (l_new - l_old) *
                                    OP_cache_line_size; // again, if not loaded,
                                                        // load it (can be
                                                        // multiple cache lines)
          l_old = l_new;

        l_old = -1;

        for (int e = e0; e < e1; e++) {
          i_map = ind_maps[m][e]; // pointer to the data element
          l_new = (i_map * dats[m2]->size) /
                  (dats[m2]->dim * OP_cache_line_size); // which cache line the
                                                        // first dimension of
                                                        // the data is on
          if (l_new > l_old)
            transfer3 +=
                fac * dats[m2]->dim *
                OP_cache_line_size; // if not loaded yet, load all cache lines
          l_old = l_new;
          l_new =
              ((i_map + 1) * dats[m2]->size - 1) /
              (dats[m2]->dim * OP_cache_line_size); // primitive type's last byte
          transfer3 += fac * (l_new - l_old) * dats[m2]->dim *
                       OP_cache_line_size; // load it
          l_old = l_new;

        /* also include mappings to load/store data */

        fac = 1.0f;
        if (accs[m2] == OP_RW)
          fac = 2.0f;
        OP_plans[ip].transfer += fac * ind_sizes[m + b * ninds] * sizeof(int);
        OP_plans[ip].transfer2 += fac * ind_sizes[m + b * ninds] * sizeof(int);
        transfer3 += fac * ind_sizes[m + b * ninds] * sizeof(int);

  /* print out useful information */

  if (OP_diags > 1) {
    printf(" number of blocks       = %d \n", nblocks);
    printf(" number of block colors = %d \n", OP_plans[ip].ncolors);
    printf(" maximum block size     = %d \n", bsize);
    printf(" average thread colors  = %.2f \n", total_colors / nblocks);
    printf(" shared memory required = ");
    for (int i = 0; i < ncolors - 1; i++)
      printf(" %.2f KB,", OP_plans[ip].nsharedCol[i] / 1024.0f);
    printf(" %.2f KB\n", OP_plans[ip].nsharedCol[ncolors - 1] / 1024.0f);
    printf(" average data reuse     = %.2f \n",
           maxbytes * (exec_length / total_shared));
    printf(" data transfer (used)   = %.2f MB \n",
           OP_plans[ip].transfer / (1024.0f * 1024.0f));
    printf(" data transfer (total)  = %.2f MB \n",
           OP_plans[ip].transfer2 / (1024.0f * 1024.0f));
    printf(" SoA/AoS transfer ratio = %.2f \n\n",
           transfer3 / OP_plans[ip].transfer2);

  /* validate plan info */

  op_plan_check(OP_plans[ip], ninds_staged, inds_staged);

  /* free work arrays */

  for (int m = 0; m < ninds; m++)
  op_timers_core(&cpu_t2, &wall_t2);
  for (int i = 0; i < OP_kern_max; i++) {
    if (strcmp(name, OP_kernels[i].name) == 0) {
      OP_kernels[i].plan_time += wall_t2 - wall_t1;
  /* return pointer to plan */
  OP_plan_time += wall_t2 - wall_t1;
  return &(OP_plans[ip]);
예제 #2
int RAND_poll(void)
    HCRYPTPROV hProvider = 0;
    DWORD w;
    int good = 0;

    /* Determine the OS version we are on so we can turn off things
     * that do not work properly.
    OSVERSIONINFO osverinfo ;
    osverinfo.dwOSVersionInfoSize = sizeof(OSVERSIONINFO) ;
    GetVersionEx( &osverinfo ) ;

#if defined(OPENSSL_SYS_WINCE)
# if defined(_WIN32_WCE) && _WIN32_WCE>=300
    /* Even though MSDN says _WIN32_WCE>=210, it doesn't seem to be available
     * in commonly available implementations prior 300... */
        BYTE buf[64]; /* ARRAY OK 2009-06-05 yngve */
        /* poll the CryptoAPI PRNG */
        /* The CryptoAPI returns sizeof(buf) bytes of randomness */
        if (CryptAcquireContextW(&hProvider, NULL, NULL, PROV_RSA_FULL,
            if (CryptGenRandom(hProvider, sizeof(buf), buf))
                RAND_add(buf, sizeof(buf), sizeof(buf));
            CryptReleaseContext(hProvider, 0);
# endif
     * None of below libraries are present on Windows CE, which is
     * why we #ifndef the whole section. This also excuses us from
     * handling the GetProcAddress issue. The trouble is that in
     * real Win32 API GetProcAddress is available in ANSI flavor
     * only. In WinCE on the other hand GetProcAddress is a macro
     * most commonly defined as GetProcAddressW, which accepts
     * Unicode argument. If we were to call GetProcAddress under
     * WinCE, I'd recommend to either redefine GetProcAddress as
     * GetProcAddressA (there seem to be one in common CE spec) or
     * implement own shim routine, which would accept ANSI argument
     * and expand it to Unicode.
        /* load functions dynamically - not available on all systems */
        HMODULE advapi = LoadLibrary(TEXT("ADVAPI32.DLL"));
        HMODULE kernel = LoadLibrary(TEXT("KERNEL32.DLL"));
        HMODULE user = NULL;
        HMODULE netapi = LoadLibrary(TEXT("NETAPI32.DLL"));
        NETSTATGET netstatget = NULL;
        NETFREE netfree = NULL;
        BYTE buf[64]; /* ARRAY OK 2009-06-05 yngve */

        if (netapi)
            netstatget = (NETSTATGET) GetProcAddress(netapi,"NetStatisticsGet");
            netfree = (NETFREE) GetProcAddress(netapi,"NetApiBufferFree");

        if (netstatget && netfree)
            LPBYTE outbuf;
            /* NetStatisticsGet() is a Unicode only function
             * STAT_WORKSTATION_0 contains 45 fields and STAT_SERVER_0
             * contains 17 fields.  We treat each field as a source of
             * one byte of entropy.

            if (netstatget(NULL, L"LanmanWorkstation", 0, 0, &outbuf) == 0)
                RAND_add(outbuf, sizeof(STAT_WORKSTATION_0), 45);
            if (netstatget(NULL, L"LanmanServer", 0, 0, &outbuf) == 0)
                RAND_add(outbuf, sizeof(STAT_SERVER_0), 17);

        if (netapi)

        /* It appears like this can cause an exception deep within ADVAPI32.DLL
         * at random times on Windows 2000.  Reported by Jeffrey Altman.
         * Only use it on NT.
        /* Wolfgang Marczy <*****@*****.**> reports that
         * the RegQueryValueEx call below can hang on NT4.0 (SP6).
         * So we don't use this at all for now. */
#if 0
        if ( osverinfo.dwPlatformId == VER_PLATFORM_WIN32_NT &&
                osverinfo.dwMajorVersion < 5)
            /* Read Performance Statistics from NT/2000 registry
             * The size of the performance data can vary from call
             * to call so we must guess the size of the buffer to use
             * and increase its size if we get an ERROR_MORE_DATA
             * return instead of ERROR_SUCCESS.
            LONG   rc=ERROR_MORE_DATA;
            char * buf=NULL;
            DWORD bufsz=0;
            DWORD length;

            while (rc == ERROR_MORE_DATA)
                buf = op_realloc(buf,bufsz+8192);
                if (!buf)
                bufsz += 8192;

                length = bufsz;
                rc = RegQueryValueEx(HKEY_PERFORMANCE_DATA, TEXT("Global"),
                                     NULL, NULL, buf, &length);
            if (rc == ERROR_SUCCESS)
                /* For entropy count assume only least significant
                * byte of each DWORD is random.
                RAND_add(&length, sizeof(length), 0);
                RAND_add(buf, length, length / 4.0);

                /* Close the Registry Key to allow Windows to cleanup/close
                 * the open handle
                 * Note: The 'HKEY_PERFORMANCE_DATA' key is implicitly opened
                 *       when the RegQueryValueEx above is done.  However, if
                 *       it is not explicitly closed, it can cause disk
                 *       partition manipulation problems.
            if (buf)

        if (advapi)
             * If it's available, then it's available in both ANSI
             * and UNICODE flavors even in Win9x, documentation says.
             * We favor Unicode...
            acquire = (CRYPTACQUIRECONTEXTW) GetProcAddress(advapi,
            gen = (CRYPTGENRANDOM) GetProcAddress(advapi,
            release = (CRYPTRELEASECONTEXT) GetProcAddress(advapi,

        if (acquire && gen && release)
            /* poll the CryptoAPI PRNG */
            /* The CryptoAPI returns sizeof(buf) bytes of randomness */
            if (acquire(&hProvider, NULL, NULL, PROV_RSA_FULL,
                if (gen(hProvider, sizeof(buf), buf) != 0)
                    RAND_add(buf, sizeof(buf), 0);
                    good = 1;
#if 0
                    printf("randomness from PROV_RSA_FULL\n");
                release(hProvider, 0);

            /* poll the Pentium PRG with CryptoAPI */
            if (acquire(&hProvider, 0, INTEL_DEF_PROV, PROV_INTEL_SEC, 0))
                if (gen(hProvider, sizeof(buf), buf) != 0)
                    RAND_add(buf, sizeof(buf), sizeof(buf));
                    good = 1;
#if 0
                    printf("randomness from PROV_INTEL_SEC\n");
                release(hProvider, 0);

        if (advapi)

        if (
            (osverinfo.dwPlatformId != VER_PLATFORM_WIN32_NT || !OPENSSL_isservice()) &&
            (user = LoadLibrary(TEXT("USER32.DLL"))))
            GETCURSORINFO cursor;
            GETQUEUESTATUS queue;

            win = (GETFOREGROUNDWINDOW) GetProcAddress(user, "GetForegroundWindow");
            cursor = (GETCURSORINFO) GetProcAddress(user, "GetCursorInfo");
            queue = (GETQUEUESTATUS) GetProcAddress(user, "GetQueueStatus");

            if (win)
                /* window handle */
                HWND h = win();
                RAND_add(&h, sizeof(h), 0);
            if (cursor)
                /* unfortunately, it's not safe to call GetCursorInfo()
                 * on NT4 even though it exists in SP3 (or SP6) and
                 * higher.
                if ( osverinfo.dwPlatformId == VER_PLATFORM_WIN32_NT &&
                        osverinfo.dwMajorVersion < 5)
                    cursor = 0;
            if (cursor)
                /* cursor position */
                /* assume 2 bytes of entropy */
                CURSORINFO ci;
                ci.cbSize = sizeof(CURSORINFO);
                if (cursor(&ci))
                    RAND_add(&ci, ci.cbSize, 2);

            if (queue)
                /* message queue status */
                /* assume 1 byte of entropy */
                w = queue(QS_ALLEVENTS);
                RAND_add(&w, sizeof(w), 1);


        /* Toolhelp32 snapshot: enumerate processes, threads, modules and heap
         * http://msdn.microsoft.com/library/psdk/winbase/toolhelp_5pfd.htm
         * (Win 9x and 2000 only, not available on NT)
         * This seeding method was proposed in Peter Gutmann, Software
         * Generation of Practically Strong Random Numbers,
         * http://www.usenix.org/publications/library/proceedings/sec98/gutmann.html
         * revised version at http://www.cryptoengines.com/~peter/06_random.pdf
         * (The assignment of entropy estimates below is arbitrary, but based
         * on Peter's analysis the full poll appears to be safe. Additional
         * interactive seeding is encouraged.)

        if (kernel)
            CLOSETOOLHELP32SNAPSHOT close_snap;
            HANDLE handle;

            HEAP32FIRST heap_first;
            HEAP32NEXT heap_next;
            HEAP32LIST heaplist_first, heaplist_next;
            PROCESS32 process_first, process_next;
            THREAD32 thread_first, thread_next;
            MODULE32 module_first, module_next;

            HEAPLIST32 hlist;
            HEAPENTRY32 hentry;
            PROCESSENTRY32 p;
            THREADENTRY32 t;
            MODULEENTRY32 m;
            DWORD starttime = 0;

            snap = (CREATETOOLHELP32SNAPSHOT)
                   GetProcAddress(kernel, "CreateToolhelp32Snapshot");
            close_snap = (CLOSETOOLHELP32SNAPSHOT)
                         GetProcAddress(kernel, "CloseToolhelp32Snapshot");
            heap_first = (HEAP32FIRST) GetProcAddress(kernel, "Heap32First");
            heap_next = (HEAP32NEXT) GetProcAddress(kernel, "Heap32Next");
            heaplist_first = (HEAP32LIST) GetProcAddress(kernel, "Heap32ListFirst");
            heaplist_next = (HEAP32LIST) GetProcAddress(kernel, "Heap32ListNext");
            process_first = (PROCESS32) GetProcAddress(kernel, "Process32First");
            process_next = (PROCESS32) GetProcAddress(kernel, "Process32Next");
            thread_first = (THREAD32) GetProcAddress(kernel, "Thread32First");
            thread_next = (THREAD32) GetProcAddress(kernel, "Thread32Next");
            module_first = (MODULE32) GetProcAddress(kernel, "Module32First");
            module_next = (MODULE32) GetProcAddress(kernel, "Module32Next");

            if (snap && heap_first && heap_next && heaplist_first &&
                    heaplist_next && process_first && process_next &&
                    thread_first && thread_next && module_first &&
                    module_next && (handle = snap(TH32CS_SNAPALL,0))
                    != INVALID_HANDLE_VALUE)
                /* heap list and heap walking */
                /* HEAPLIST32 contains 3 fields that will change with
                 * each entry.  Consider each field a source of 1 byte
                 * of entropy.
                 * HEAPENTRY32 contains 5 fields that will change with
                 * each entry.  Consider each field a source of 1 byte
                 * of entropy.
                ZeroMemory(&hlist, sizeof(HEAPLIST32));
                hlist.dwSize = sizeof(HEAPLIST32);
                if (good) starttime = GetTickCount();
#ifdef _MSC_VER
                if (heaplist_first(handle, &hlist))
                       following discussion on dev ML, exception on WinCE (or other Win
                       platform) is theoretically of unknown origin; prevent infinite
                       loop here when this theoretical case occurs; otherwise cope with
                       the expected (MSDN documented) exception-throwing behaviour of
                       Heap32Next() on WinCE.

                       based on patch in original message by Tanguy Fautré (2009/03/02)
                           Subject: RAND_poll() and CreateToolhelp32Snapshot() stability
                    int ex_cnt_limit = 42;
                        RAND_add(&hlist, hlist.dwSize, 3);
                            ZeroMemory(&hentry, sizeof(HEAPENTRY32));
                            hentry.dwSize = sizeof(HEAPENTRY32);
                            if (heap_first(&hentry,
                                int entrycnt = 80;
                                             hentry.dwSize, 5);
                                while (heap_next(&hentry)
                                        && (!good || (GetTickCount()-starttime)<MAXDELAY)
                                        && --entrycnt > 0);
                        __except (EXCEPTION_EXECUTE_HANDLER)
                            /* ignore access violations when walking the heap list */
                    } while (heaplist_next(handle, &hlist)
                             && (!good || (GetTickCount()-starttime)<MAXDELAY)
                             && ex_cnt_limit > 0);

                if (heaplist_first(handle, &hlist))
                    int total_entrycnt = 1024;
                    DWORD end_of_run = GetTickCount()+300; /* Milliseconds // Will not work properly when it is about to wrap around, which happens every 49 days */
                        RAND_add(&hlist, hlist.dwSize, 3);
                        hentry.dwSize = sizeof(HEAPENTRY32);
                        if (heap_first(&hentry,
                            int entrycnt = 80;
                            do {
                                         hentry.dwSize, 5);

                                if(total_entrycnt %6 == 0 && GetTickCount() > end_of_run)
                                    total_entrycnt =0;

                            } while (heap_next(&hentry)
                                     && --entrycnt > 0 && --total_entrycnt > 0);
                    } while (heaplist_next(handle, &hlist)
                             && (!good || (GetTickCount()-starttime)<MAXDELAY));

                /* process walking */
                /* PROCESSENTRY32 contains 9 fields that will change
                 * with each entry.  Consider each field a source of
                 * 1 byte of entropy.
                p.dwSize = sizeof(PROCESSENTRY32);

                if (good) starttime = GetTickCount();
                if (process_first(handle, &p))
                        RAND_add(&p, p.dwSize, 9);
                    while (process_next(handle, &p) && (!good || (GetTickCount()-starttime)<MAXDELAY));

                /* thread walking */
                /* THREADENTRY32 contains 6 fields that will change
                 * with each entry.  Consider each field a source of
                 * 1 byte of entropy.
                t.dwSize = sizeof(THREADENTRY32);
                if (good) starttime = GetTickCount();
                if (thread_first(handle, &t))
                        RAND_add(&t, t.dwSize, 6);
                    while (thread_next(handle, &t) && (!good || (GetTickCount()-starttime)<MAXDELAY));

                /* module walking */
                /* MODULEENTRY32 contains 9 fields that will change
                 * with each entry.  Consider each field a source of
                 * 1 byte of entropy.
                m.dwSize = sizeof(MODULEENTRY32);
                if (good) starttime = GetTickCount();
                if (module_first(handle, &m))
                        RAND_add(&m, m.dwSize, 9);
                    while (module_next(handle, &m)
                            && (!good || (GetTickCount()-starttime)<MAXDELAY));
                if (close_snap)

