Exemplo n.º 1
0
/**
 * Append to ctx->s the kernel statements that initialise, for the current
 * thread, the per-axis variables declared by the index-declaration stage:
 *
 *   - i<N>Dim / i<N>SStep : read from srcSize[] / srcSteps[] through
 *                           ctx->axisList[N], for all ctx->nds axes.
 *   - i<N>MStep/i<N>AStep : read from dstMaxSteps[] / dstArgmaxSteps[],
 *                           for the first ctx->ndd axes.
 *   - i<N>PDim            : cumulative product of the dimensions that follow
 *                           axis N, for axes ctx->ndd .. ctx->nds-1.
 *   - i<N>Start / i<N>End : loop bounds; derived from gi<H>/ci<H> when
 *                           axisInSet() reports the axis as part of
 *                           ctx->hwAxisList, otherwise 0 .. i<N>Dim.
 *
 * @param ctx  Generator context; only its string buffer is written to.
 */
static void  maxandargmaxAppendRangeCalculations(maxandargmax_ctx*  ctx){
	size_t hwDim;
	int    i;

	/* Use internal remapping when computing the ranges for this thread. */
	strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n");

	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dDim     = srcSize[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dSStep   = srcSteps[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dMStep   = dstMaxSteps[%d];\n", i, i);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dAStep   = dstArgmaxSteps[%d];\n", i, i);
	}
	for(i=ctx->nds-1;i>=ctx->ndd;i--){
		/**
		 * If this is the last index, it's the first cumulative dimension
		 * product we generate, and thus we initialize to 1.
		 */

		if(i == ctx->nds-1){
			strb_appendf(&ctx->s, "\ti%dPDim    = 1;\n", i);
		}else{
			strb_appendf(&ctx->s, "\ti%dPDim    = i%dPDim * i%dDim;\n", i, i+1, i+1);
		}
	}
	for(i=0;i<ctx->nds;i++){
		/**
		 * Up to 3 dimensions get to rely on hardware loops.
		 * The others, if any, have to use software looping beginning at 0.
		 *
		 * hwDim is a size_t, so it must be converted before being handed to
		 * a "%d" conversion; passing it unconverted through the variadic
		 * call is a type mismatch.
		 */

		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dStart   = gi%d * ci%d;\n", i, (int)hwDim, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dStart   = 0;\n", i);
		}
	}
	for(i=0;i<ctx->nds;i++){
		/**
		 * Up to 3 dimensions get to rely on hardware loops.
		 * The others, if any, have to use software looping beginning at 0.
		 */

		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dEnd     = i%dStart + ci%d;\n", i, i, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dEnd     = i%dStart + i%dDim;\n", i, i, i);
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
Exemplo n.º 2
0
/**
 * Append the kernel's two typedefs to ctx->s: T, spelled from
 * ctx->dstMaxType, and X, spelled from ctx->dstArgmaxType, followed by
 * three blank lines.
 */
static void  maxandargmaxAppendTypedefs         (maxandargmax_ctx*  ctx){
	strb* out = &ctx->s;
	int   blank;

	strb_appends(out, "/* Typedefs */\n");
	strb_appendf(out, "typedef %s     T;/* The type of the array being processed. */\n", ctx->dstMaxType);
	strb_appendf(out, "typedef %s     X;/* Index type: signed 32/64-bit. */\n",          ctx->dstArgmaxType);

	/* Vertical spacing before whatever is emitted next. */
	for(blank=0;blank<3;blank++){
		strb_appends(out, "\n");
	}
}
Exemplo n.º 3
0
/**
 * Append the "FREE LOOPS" banner comment to ctx->s, then the loop macro
 * definitions, the outer loop nest and the matching macro undefinitions,
 * in that order.
 */
static void  maxandargmaxAppendLoops            (maxandargmax_ctx*  ctx){
	static const char* const banner[] = {
		"\t/**\n",
		"\t * FREE LOOPS.\n",
		"\t */\n",
		"\t\n"
	};
	size_t line;

	for(line=0;line<sizeof(banner)/sizeof(banner[0]);line++){
		strb_appends(&ctx->s, banner[line]);
	}

	/* The macros only need to exist while the loop nest is being emitted. */
	maxandargmaxAppendLoopMacroDefs  (ctx);
	maxandargmaxAppendLoopOuter      (ctx);
	maxandargmaxAppendLoopMacroUndefs(ctx);
}
Exemplo n.º 4
0
/**
 * Append the complete kernel text to ctx->s by running every generation
 * stage in order: typedefs, prototype, then - between the opening and
 * closing braces of the function body - offsets, index declarations,
 * range calculations and the loops.
 */
static void  maxandargmaxAppendKernel           (maxandargmax_ctx*  ctx){
	strb* out = &ctx->s;

	/* Everything that precedes the function body. */
	maxandargmaxAppendTypedefs(ctx);
	maxandargmaxAppendPrototype(ctx);

	/* Function body. */
	strb_appends(out, "{\n");
	maxandargmaxAppendOffsets(ctx);
	maxandargmaxAppendIndexDeclarations(ctx);
	maxandargmaxAppendRangeCalculations(ctx);
	maxandargmaxAppendLoops(ctx);
	strb_appends(out, "}\n");
}
Exemplo n.º 5
0
/**
 * Append the outer loop nest to ctx->s: one "FOROVER(n){ESCAPE(n)" header
 * for each of the first ctx->ndd axes, then the inner loop section, then
 * one closing brace per header.
 */
static void  maxandargmaxAppendLoopOuter        (maxandargmax_ctx*  ctx){
	strb* out = &ctx->s;
	int   axis;
	int   open;

	/* Headers, outermost axis first. */
	axis = 0;
	while(axis < ctx->ndd){
		strb_appendf(out, "\tFOROVER(%d){ESCAPE(%d)\n", axis, axis);
		axis++;
	}

	/* Body of the innermost outer loop. */
	maxandargmaxAppendLoopInner(ctx);

	/* One trailer for every header emitted above. */
	open = ctx->ndd;
	while(open > 0){
		strb_appends(out, "\t}\n");
		open--;
	}
}
Exemplo n.º 6
0
/**
 * Append a comma-separated list of numbered identifiers to s.
 *
 * The output is: prologue, then for every n in [startIdx, endIdx) the text
 * "<prefix><n><suffix>" with a "," after every element except the last,
 * then epilogue. An empty range emits only prologue and epilogue.
 *
 * @param s         Destination string builder.
 * @param prologue  Text emitted before the list; NULL is treated as "".
 * @param prefix    Text placed before each number; NULL is treated as "".
 * @param startIdx  First number (inclusive).
 * @param endIdx    Last number (exclusive).
 * @param suffix    Text placed after each number; NULL is treated as "".
 * @param epilogue  Text emitted after the list; NULL is treated as "".
 */
static void  appendIdxes                        (strb*              s,
                                                 const char*        prologue,
                                                 const char*        prefix,
                                                 int                startIdx,
                                                 int                endIdx,
                                                 const char*        suffix,
                                                 const char*        epilogue){
	int idx;

	/* Normalise absent pieces to the empty string. */
	if(!prologue){prologue = "";}
	if(!prefix  ){prefix   = "";}
	if(!suffix  ){suffix   = "";}
	if(!epilogue){epilogue = "";}

	strb_appends(s, prologue);
	for(idx=startIdx;idx<endIdx;idx++){
		/* No separator after the final element. */
		const char* sep = (idx == endIdx-1) ? "" : ",";

		strb_appendf(s, "%s%d%s%s", prefix, idx, suffix, sep);
	}
	strb_appends(s, epilogue);
}
Exemplo n.º 7
0
/**
 * Append to ctx->s one #undef line for each of the six loop helper macros
 * (FOROVER, ESCAPE, SRCINDEXER, RDXINDEXER, DSTMINDEXER, DSTAINDEXER).
 */
static void  maxandargmaxAppendLoopMacroUndefs  (maxandargmax_ctx*  ctx){
	static const char* const undefs[] = {
		"#undef FOROVER\n",
		"#undef ESCAPE\n",
		"#undef SRCINDEXER\n",
		"#undef RDXINDEXER\n",
		"#undef DSTMINDEXER\n",
		"#undef DSTAINDEXER\n"
	};
	size_t n;

	for(n=0;n<sizeof(undefs)/sizeof(undefs[0]);n++){
		strb_appends(&ctx->s, undefs[n]);
	}
}
Exemplo n.º 8
0
/**
 * Append to ctx->s the kernel statements that advance src, dstMax and
 * dstArgmax by srcOff, dstMaxOff and dstArgmaxOff respectively; the
 * generated code does the addition through a char pointer cast.
 */
static void  maxandargmaxAppendOffsets          (maxandargmax_ctx*  ctx){
	static const char* const lines[] = {
		"\t/* Add offsets */\n",
		"\tsrc       = (const GLOBAL_MEM T*)((const GLOBAL_MEM char*)src       + srcOff);\n",
		"\tdstMax    = (GLOBAL_MEM T*)      ((GLOBAL_MEM char*)      dstMax    + dstMaxOff);\n",
		"\tdstArgmax = (GLOBAL_MEM X*)      ((GLOBAL_MEM char*)      dstArgmax + dstArgmaxOff);\n",
		"\t\n",
		"\t\n"
	};
	size_t n;

	for(n=0;n<sizeof(lines)/sizeof(lines[0]);n++){
		strb_appends(&ctx->s, lines[n]);
	}
}
Exemplo n.º 9
0
/**
 * Append to ctx->s the #define lines for the loop helper macros:
 *
 *   FOROVER(idx)      for-loop header running i<idx> over [Start, End).
 *   ESCAPE(idx)       "continue" when i<idx> is at or past i<idx>Dim.
 *   SRCINDEXER(...)   dereferences src at the sum of i<n>*i<n>SStep over
 *                     all ctx->nds axes.
 *   RDXINDEXER(...)   sum of i<n>*i<n>PDim over axes ctx->ndd..ctx->nds-1.
 *   DSTMINDEXER(...)  dereferences dstMax at the sum of i<n>*i<n>MStep
 *                     over the first ctx->ndd axes.
 *   DSTAINDEXER(...)  same as DSTMINDEXER, for dstArgmax and i<n>AStep.
 *
 * Every sum is emitted one term per (backslash-continued) line and is
 * closed with a literal 0 term.
 */
static void  maxandargmaxAppendLoopMacroDefs    (maxandargmax_ctx*  ctx){
	strb* out = &ctx->s;
	int   d;

	/* FOROVER and ESCAPE are fixed text. */
	strb_appends(out, "#define FOROVER(idx)    for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");
	strb_appends(out, "#define ESCAPE(idx)     if(i##idx >= i##idx##Dim){continue;}\n");

	/* SRCINDEXER: every source axis contributes a term. */
	appendIdxes (out, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ")   (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + ");
	d = 0;
	while(d < ctx->nds){
		strb_appendf(out, "i%d*i%dSStep + \\\n                                            ", d, d);
		d++;
	}
	strb_appends(out, "0))\n");

	/* RDXINDEXER: only the trailing (reduced) axes contribute. */
	appendIdxes (out, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ")              (");
	d = ctx->ndd;
	while(d < ctx->nds){
		strb_appendf(out, "i%d*i%dPDim + \\\n                                        ", d, d);
		d++;
	}
	strb_appends(out, "0)\n");

	/* DSTMINDEXER: only the leading (destination) axes contribute. */
	appendIdxes (out, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ")        (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + ");
	d = 0;
	while(d < ctx->ndd){
		strb_appendf(out, "i%d*i%dMStep + \\\n                                                  ", d, d);
		d++;
	}
	strb_appends(out, "0))\n");

	/* DSTAINDEXER: same axes as DSTMINDEXER, argmax strides. */
	appendIdxes (out, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ")        (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + ");
	d = 0;
	while(d < ctx->ndd){
		strb_appendf(out, "i%d*i%dAStep + \\\n                                                     ", d, d);
		d++;
	}
	strb_appends(out, "0))\n");
}
Exemplo n.º 10
0
/**
 * Append the reduction section of the kernel to ctx->s:
 *
 *   1. initialise maxV / maxI from the element at the start of the
 *      reduction range (indices i<n>Start for axes ctx->ndd..ctx->nds-1);
 *   2. open one FOROVER/ESCAPE loop per reduction axis;
 *   3. load V and, when V > maxV, update maxV and maxI;
 *   4. close the loops;
 *   5. write maxV and maxI back through DSTMINDEXER / DSTAINDEXER.
 */
static void  maxandargmaxAppendLoopInner        (maxandargmax_ctx*  ctx){
	strb* out = &ctx->s;
	int   axis;

	/* Prologue: seed the running maximum. */
	strb_appends(out, "\t/**\n");
	strb_appends(out, "\t * Reduction initialization.\n");
	strb_appends(out, "\t */\n");
	strb_appends(out, "\t\n");

	/*
	 * The SRCINDEXER argument list is built in two halves (free indices,
	 * then reduction start indices); the joining comma is only needed when
	 * both halves are non-empty.
	 */
	appendIdxes (out, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", "");
	if(ctx->ndd && ctx->ndr){
		strb_appends(out, ",");
	}
	appendIdxes (out, "", "i", ctx->ndd, ctx->nds, "Start", ");\n");

	appendIdxes (out, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n");

	strb_appends(out, "\t\n");
	strb_appends(out, "\t/**\n");
	strb_appends(out, "\t * REDUCTION LOOPS.\n");
	strb_appends(out, "\t */\n");
	strb_appends(out, "\t\n");

	/* Loop headers, one per reduction axis. */
	axis = ctx->ndd;
	while(axis < ctx->nds){
		strb_appendf(out, "\tFOROVER(%d){ESCAPE(%d)\n", axis, axis);
		axis++;
	}

	/* Loop body: compare-and-update. */
	appendIdxes (out, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n");
	strb_appends(out, "\t\n");
	strb_appends(out, "\tif(V > maxV){\n");
	strb_appends(out, "\t\tmaxV = V;\n");
	appendIdxes (out, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n");
	strb_appends(out, "\t}\n");

	/* Loop trailers, one per header. */
	axis = ctx->ndd;
	while(axis < ctx->nds){
		strb_appends(out, "\t}\n");
		axis++;
	}
	strb_appends(out, "\t\n");

	/* Epilogue: store the results. */
	strb_appends(out, "\t/**\n");
	strb_appends(out, "\t * Destination writeback.\n");
	strb_appends(out, "\t */\n");
	strb_appends(out, "\t\n");
	appendIdxes (out, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n");
	appendIdxes (out, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n");
}
Exemplo n.º 11
0
/**
 * Append to ctx->s the declarations of every index variable the kernel
 * body uses:
 *
 *   - the 3D coordinates bi*, bd*, ti* and the derived gi* values;
 *   - one ci<n> = chunkSize[n] per hardware axis (ctx->ndh of them);
 *   - per-axis i<n>, i<n>Dim, i<n>Start, i<n>End, i<n>SStep for all
 *     ctx->nds axes; i<n>MStep, i<n>AStep for the first ctx->ndd axes;
 *     i<n>PDim for axes ctx->ndd..ctx->nds-1.
 *
 * Each declaration line is skipped when its axis range is empty, so that
 * no bare "X ;" statement is generated.
 */
static void  maxandargmaxAppendIndexDeclarations(maxandargmax_ctx*  ctx){
	int i;
	strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n");

	strb_appends(&ctx->s, "\tX bi0 = GID_0,        bi1 = GID_1,        bi2 = GID_2;\n");
	strb_appends(&ctx->s, "\tX bd0 = LDIM_0,       bd1 = LDIM_1,       bd2 = LDIM_2;\n");
	strb_appends(&ctx->s, "\tX ti0 = LID_0,        ti1 = LID_1,        ti2 = LID_2;\n");
	strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0,  gi1 = bi1*bd1+ti1,  gi2 = bi2*bd2+ti2;\n");
	if(ctx->ndh>0){
		strb_appends(&ctx->s, "\tX ");
		for(i=0;i<ctx->ndh;i++){
			/* i is an int, so it is formatted with %d (not %u). */
			strb_appendf(&ctx->s, "ci%d = chunkSize[%d]%s",
			             i, i, (i==ctx->ndh-1) ? ";\n" : ", ");
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n");

	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "",        ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "Dim",     ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "Start",   ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "End",     ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "SStep",   ";\n");}
	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->ndd, "MStep",   ";\n");}
	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->ndd, "AStep",   ";\n");}
	if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim",    ";\n");}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
Exemplo n.º 12
0
/**
 * Append the kernel's function prototype (name "maxandargmax", eleven
 * parameters, one per line) to ctx->s. No trailing newline or opening
 * brace is emitted after the closing parenthesis.
 */
static void  maxandargmaxAppendPrototype        (maxandargmax_ctx*  ctx){
	static const char* const proto[] = {
		"KERNEL void maxandargmax(const GLOBAL_MEM T*        src,\n",
		"                         const X         srcOff,\n",
		"                         const GLOBAL_MEM X*        srcSteps,\n",
		"                         const GLOBAL_MEM X*        srcSize,\n",
		"                         const GLOBAL_MEM X*        chunkSize,\n",
		"                         GLOBAL_MEM T*              dstMax,\n",
		"                         const X         dstMaxOff,\n",
		"                         const GLOBAL_MEM X*        dstMaxSteps,\n",
		"                         GLOBAL_MEM X*              dstArgmax,\n",
		"                         const X         dstArgmaxOff,\n",
		"                         const GLOBAL_MEM X*        dstArgmaxSteps)"
	};
	size_t n;

	for(n=0;n<sizeof(proto)/sizeof(proto[0]);n++){
		strb_appends(&ctx->s, proto[n]);
	}
}
Exemplo n.º 13
0
/*
 * Create a kernel object for a CUDA context from source text, PTX or a
 * prebuilt binary, and look up the entry point `fname` in it.
 *
 * Parameters:
 *   c         the cuda_context to build for.
 *   count     number of entries in `strings` (0 is rejected with
 *             GA_VALUE_ERROR).
 *   strings   the source fragments, concatenated in order.
 *   lengths   optional per-fragment lengths; NULL, or an entry equal to 0,
 *             means the fragment is NUL-terminated.
 *   fname     name passed to cuModuleGetFunction.
 *   argcount  number of kernel arguments.
 *   types     `argcount` type codes, copied into the kernel object.
 *   flags     GA_USE_* flags:
 *               GA_USE_OPENCL  -> GA_DEVSUP_ERROR;
 *               GA_USE_BINARY  -> must be the only flag, with exactly one
 *                                 blob of non-zero length;
 *               GA_USE_DOUBLE  -> requires compute capability >= 1.3;
 *               GA_USE_COMPLEX -> currently rejected (GA_DEVSUP_ERROR);
 *               GA_USE_CLUDA   -> CUDA_PREAMBLE is prepended;
 *               GA_USE_PTX     -> the assembled text is loaded as is,
 *                                 without invoking the compiler.
 *   ret       error-code output, written through the FAIL macro
 *             (presumably only when non-NULL; FAIL is defined elsewhere).
 *   err_str   optional; on a compile failure receives a heap-allocated
 *             message containing the numbered source and the compiler
 *             log, which the caller must free().
 *
 * Returns the new kernel (refcnt 1, holding a reference on the context)
 * or NULL on failure.
 *
 * Compiled binaries are looked up in, and added to, `compile_cache`
 * (declared outside this function), keyed by source text and ctx->bin_id.
 */
static gpukernel *cuda_newkernel(void *c, unsigned int count,
                                 const char **strings, const size_t *lengths,
                                 const char *fname, unsigned int argcount,
                                 const int *types, int flags, int *ret,
                                 char **err_str) {
    cuda_context *ctx = (cuda_context *)c;
    strb sb = STRB_STATIC_INIT;
    char *bin, *log = NULL;
    srckey k, *ak;
    binval *av;
    gpukernel *res;
    size_t bin_len = 0, log_len = 0;
    CUdevice dev;
    unsigned int i;
    int ptx_mode = 0;
    int binary_mode = 0;
    int major, minor;

    if (count == 0) FAIL(NULL, GA_VALUE_ERROR);

    if (flags & GA_USE_OPENCL)
      FAIL(NULL, GA_DEVSUP_ERROR);

    if (flags & GA_USE_BINARY) {
      // GA_USE_BINARY is exclusive
      if (flags & ~GA_USE_BINARY)
        FAIL(NULL, GA_INVALID_ERROR);
      // We need the length for binary data and there is only one blob.
      if (count != 1 || lengths == NULL || lengths[0] == 0)
        FAIL(NULL, GA_VALUE_ERROR);
    }

    cuda_enter(ctx);

    ctx->err = cuCtxGetDevice(&dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }
    ctx->err = cuDeviceComputeCapability(&major, &minor, dev);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    // GA_USE_CLUDA is done later
    // GA_USE_SMALL will always work
    if (flags & GA_USE_DOUBLE) {
      if (major < 1 || (major == 1 && minor < 3)) {
        cuda_exit(ctx);
        FAIL(NULL, GA_DEVSUP_ERROR);
      }
    }
    if (flags & GA_USE_COMPLEX) {
      // just for now since it is most likely broken
      cuda_exit(ctx);
      FAIL(NULL, GA_DEVSUP_ERROR);
    }
    // GA_USE_HALF should always work

    if (flags & GA_USE_PTX) {
      ptx_mode = 1;
    } else if (flags & GA_USE_BINARY) {
      binary_mode = 1;
    }

    if (binary_mode) {
      bin = memdup(strings[0], lengths[0]);
      bin_len = lengths[0];
      if (bin == NULL) {
        cuda_exit(ctx);
        FAIL(NULL, GA_MEMORY_ERROR);
      }
    } else {
      if (flags & GA_USE_CLUDA) {
        strb_appends(&sb, CUDA_PREAMBLE);
      }

      if (lengths == NULL) {
        for (i = 0; i < count; i++) {
          strb_appends(&sb, strings[i]);
        }
      } else {
        for (i = 0; i < count; i++) {
          if (lengths[i] == 0)
            strb_appends(&sb, strings[i]);
          else
            strb_appendn(&sb, strings[i], lengths[i]);
        }
      }

      strb_append0(&sb);

      if (strb_error(&sb)) {
        strb_clear(&sb);
        cuda_exit(ctx);
        // Report the failure through ret like every other error path.
        FAIL(NULL, GA_MEMORY_ERROR);
      }

      if (ptx_mode) {
        // Ownership of the string buffer moves to bin; sb is not cleared.
        bin = sb.s;
        bin_len = sb.l;
      } else {
        bin = NULL;
        if (compile_cache != NULL) {
          k.src = sb.s;
          k.len = sb.l;
          memcpy(k.arch, ctx->bin_id, BIN_ID_LEN);
          av = cache_get(compile_cache, &k);
          if (av != NULL) {
            // A failed copy leaves bin NULL and falls back to compiling.
            bin = memdup(av->bin, av->len);
            bin_len = av->len;
          }
        }
        if (bin == NULL) {
          bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len,
                              &log, &log_len, ret);
        }
        if (bin == NULL) {
          if (err_str != NULL) {
            strb debug_msg = STRB_STATIC_INIT;

            // We're substituting debug_msg for a string with this first line:
            strb_appends(&debug_msg, "CUDA kernel build failure ::\n");

            /* Delete the final NUL */
            sb.l--;
            gpukernel_source_with_line_numbers(1, (const char **)&sb.s,
                                               &sb.l, &debug_msg);

            if (log != NULL) {
              strb_appends(&debug_msg, "\nCompiler log:\n");
              strb_appendn(&debug_msg, log, log_len);
            }
            *err_str = strb_cstr(&debug_msg);
            // *err_str will be free()d by the caller (see docs in kernel.h)
          }
          // Release the log whether or not the caller asked for a message.
          free(log);
          strb_clear(&sb);
          cuda_exit(ctx);
          return NULL;
        }
        // The log is only used for the failure message above; drop it now
        // so a successful build that produced one does not leak it.
        free(log);
        log = NULL;

        if (compile_cache == NULL)
          compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free,
                                     bin_free);

        // Caching is best-effort: any allocation failure just skips it.
        if (compile_cache != NULL) {
          ak = malloc(sizeof(*ak));
          av = malloc(sizeof(*av));
          if (ak == NULL || av == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->src = memdup(sb.s, sb.l);
          if (ak->src == NULL) {
            free(ak);
            free(av);
            goto done;
          }
          ak->len = sb.l;
          memmove(ak->arch, ctx->bin_id, BIN_ID_LEN);
          av->len = bin_len;
          av->bin = memdup(bin, bin_len);
          if (av->bin == NULL) {
            src_free(ak);
            free(av);
            goto done;
          }
          cache_add(compile_cache, ak, av);
        }
      done:
        strb_clear(&sb);
      }
    }

    res = calloc(1, sizeof(*res));
    if (res == NULL) {
      free(bin);
      cuda_exit(ctx);
      FAIL(NULL, GA_SYS_ERROR);
    }

    // From here on res owns bin; error paths release it via
    // _cuda_freekernel (presumably - its definition is not in view).
    res->bin_sz = bin_len;
    res->bin = bin;

    res->refcnt = 1;
    res->argcount = argcount;
    // NOTE(review): calloc(0, ...) may legitimately return NULL, which
    // would be reported as a memory error when argcount == 0 - confirm
    // whether zero-argument kernels need to be supported.
    res->types = calloc(argcount, sizeof(int));
    if (res->types == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }
    memcpy(res->types, types, argcount*sizeof(int));
    res->args = calloc(argcount, sizeof(void *));
    if (res->args == NULL) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_MEMORY_ERROR);
    }

    ctx->err = cuModuleLoadData(&res->m, bin);

    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    ctx->err = cuModuleGetFunction(&res->k, res->m, fname);
    if (ctx->err != CUDA_SUCCESS) {
      _cuda_freekernel(res);
      cuda_exit(ctx);
      FAIL(NULL, GA_IMPL_ERROR);
    }

    res->ctx = ctx;
    ctx->refcnt++;
    cuda_exit(ctx);
    TAG_KER(res);
    return res;
}
Exemplo n.º 14
0
/*
 * Generate the source of the "take1" kernel and initialise `k` with it.
 *
 * The kernel signature is built from the element types of `a` (result),
 * `v` (source) and `ind` (indices), plus one (stride, dimension) pair per
 * dimension of `v`. `addr32` selects the integer types used for index
 * arithmetic inside the kernel: ga_uint/ga_int when non-zero,
 * ga_size/ga_ssize otherwise.
 *
 * Returns GA_MEMORY_ERROR when the argument-type array or the source
 * buffer cannot be allocated, otherwise whatever GpuKernel_init returns;
 * `err_str` is forwarded to GpuKernel_init untouched.
 */
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb src = STRB_STATIC_INIT;
  const char *usz = addr32 ? "ga_uint" : "ga_size";
  const char *isz = addr32 ? "ga_int" : "ga_ssize";
  unsigned int nargs = 9 + 2 * v->nd;
  unsigned int dim;
  unsigned int filled = 0;
  int flags = GA_USE_CLUDA;
  int status;
  int *arg_types;

  arg_types = calloc(nargs, sizeof(int));
  if (arg_types == NULL)
    return GA_MEMORY_ERROR;

  /* Signature: result and source buffers with their offsets... */
  strb_appendf(&src, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(v->typecode)->cluda_name);
  arg_types[filled++] = GA_BUFFER;
  arg_types[filled++] = GA_SIZE;
  arg_types[filled++] = GA_BUFFER;
  arg_types[filled++] = GA_SIZE;

  /* ...one stride/dimension pair per dimension of v... */
  for (dim = 0; dim < v->nd; dim++) {
    strb_appendf(&src, " ga_ssize s%u, ga_size d%u,", dim, dim);
    arg_types[filled++] = GA_SSIZE;
    arg_types[filled++] = GA_SIZE;
  }

  /* ...then the index buffer, the two extents and the error flag. */
  strb_appendf(&src, " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
               gpuarray_get_type(ind->typecode)->cluda_name);
  arg_types[filled++] = GA_BUFFER;
  arg_types[filled++] = GA_SIZE;
  arg_types[filled++] = GA_SIZE;
  arg_types[filled++] = GA_SIZE;
  arg_types[filled++] = GA_BUFFER;
  assert(filled == nargs);

  /* Thread coordinates and early exit. */
  strb_appendf(&src, "  const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               "  const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               "  const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               "  const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               "  %s i0, i1;\n", usz, usz, usz, usz, usz);
  strb_appends(&src, "  if (idx0 >= n0 || idx1 >= n1) return;\n");

  /* Apply the byte offsets of the result and index buffers. */
  strb_appendf(&src, "  r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               "  ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(ind->typecode)->cluda_name);

  /* Outer loop over indices, with wrap-around and range check. */
  strb_appendf(&src, "  for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               "    %s ii0 = ind[i0];\n"
               "    %s pos0 = v_off;\n"
               "    if (ii0 < 0) ii0 += d0;\n"
               "    if ((ii0 < 0) || (ii0 >= d0)) {\n"
               "      *err = -1;\n"
               "      continue;\n"
               "    }\n"
               "    pos0 += ii0 * (%s)s0;\n"
               "    for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               "      %s p = pos0;\n", isz, usz, usz, usz);

  /*
   * Unravel i1 over the trailing dimensions, last dimension first.
   * Dimension 1 takes whatever remains, so it needs no modulo.
   */
  if (v->nd > 1) {
    strb_appendf(&src, "      %s pos, ii = i1;\n", usz);
    for (dim = v->nd - 1; dim >= 1; dim--) {
      if (dim > 1)
        strb_appendf(&src, "      pos = ii %% (%s)d%u;\n"
                     "      ii /= (%s)d%u;\n", usz, dim, usz, dim);
      else
        strb_appends(&src, "      pos = ii;\n");
      strb_appendf(&src, "      p += pos * (%s)s%u;\n", isz, dim);
    }
  }

  /* Copy one element and close every open scope. */
  strb_appendf(&src, "      r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n",
               usz, gpuarray_get_type(v->typecode)->cluda_name);
  strb_appends(&src, "    }\n"
               "  }\n"
               "}\n");

  if (strb_error(&src)) {
    status = GA_MEMORY_ERROR;
  } else {
    flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
    status = GpuKernel_init(k, ctx, 1, (const char **)&src.s, &src.l, "take1",
                            nargs, arg_types, flags, err_str);
  }

  free(arg_types);
  strb_clear(&src);
  return status;
}