/**
 * Append to ctx->s the kernel statements that initialise, for this thread,
 * the variables i<N>Dim, i<N>SStep, i<N>MStep, i<N>AStep, i<N>PDim,
 * i<N>Start and i<N>End.
 *
 * NOTE(review): the loop headers, `else` branches, comment delimiters and
 * closing braces were missing from this function. The loop bounds were
 * reconstructed from the declarations emitted by
 * maxandargmaxAppendIndexDeclarations(): Dim/SStep/Start/End exist for
 * indices [0, nds), MStep/AStep for [0, ndd) and PDim for [ndd, nds).
 * Confirm against the upstream source.
 *
 * @param ctx  Code-generation context; text is appended to ctx->s.
 */

static void  maxandargmaxAppendRangeCalculations(maxandargmax_ctx*  ctx){
	size_t hwDim;
	int    i;

	/* Use internal remapping when computing the ranges for this thread. */
	strb_appends(&ctx->s, "\t/* Compute ranges for this thread. */\n");

	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dDim     = srcSize[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\ti%dSStep   = srcSteps[%d];\n", i, ctx->axisList[i]);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dMStep   = dstMaxSteps[%d];\n", i, i);
	}
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\ti%dAStep   = dstArgmaxSteps[%d];\n", i, i);
	}

	/* Descending order: i<N>PDim is defined in terms of i<N+1>PDim. */
	for(i=ctx->nds-1;i>=ctx->ndd;i--){
		/**
		 * If this is the last index, it's the first cumulative dimension
		 * product we generate, and thus we initialize to 1.
		 */

		if(i == ctx->nds-1){
			strb_appendf(&ctx->s, "\ti%dPDim    = 1;\n", i);
		}else{
			strb_appendf(&ctx->s, "\ti%dPDim    = i%dPDim * i%dDim;\n", i, i+1, i+1);
		}
	}

	for(i=0;i<ctx->nds;i++){
		/**
		 * Up to 3 dimensions get to rely on hardware loops.
		 * The others, if any, have to use software looping beginning at 0.
		 */

		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			/* hwDim is a size_t; cast so it matches the %d conversions. */
			strb_appendf(&ctx->s, "\ti%dStart   = gi%d * ci%d;\n", i, (int)hwDim, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dStart   = 0;\n", i);
		}
	}

	for(i=0;i<ctx->nds;i++){
		/**
		 * Up to 3 dimensions get to rely on hardware loops.
		 * The others, if any, have to use software looping beginning at 0.
		 */

		if(axisInSet(ctx->axisList[i], ctx->hwAxisList, ctx->ndh, &hwDim)){
			strb_appendf(&ctx->s, "\ti%dEnd     = i%dStart + ci%d;\n", i, i, (int)hwDim);
		}else{
			strb_appendf(&ctx->s, "\ti%dEnd     = i%dStart + i%dDim;\n", i, i, i);
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
/**
 * Append to ctx->s the typedefs used by the generated kernel:
 * T is defined as ctx->dstMaxType and X as ctx->dstArgmaxType.
 * Three blank lines follow the typedefs.
 *
 * @param ctx  Code-generation context; text is appended to ctx->s.
 */

static void  maxandargmaxAppendTypedefs         (maxandargmax_ctx*  ctx){
	strb_appends(&ctx->s, "/* Typedefs */\n");
	strb_appendf(&ctx->s, "typedef %s     T;/* The type of the array being processed. */\n", ctx->dstMaxType);
	strb_appendf(&ctx->s, "typedef %s     X;/* Index type: signed 32/64-bit. */\n",          ctx->dstArgmaxType);
	strb_appends(&ctx->s, "\n");
	strb_appends(&ctx->s, "\n");
	strb_appends(&ctx->s, "\n");
}
/**
 * Append to ctx->s the preprocessor macros used by the generated kernel:
 * FOROVER, ESCAPE, SRCINDEXER, RDXINDEXER, DSTMINDEXER and DSTAINDEXER.
 *
 * Each *INDEXER macro takes the index list produced by appendIdxes() and
 * expands to a sum of "i<N>*i<N><Suffix>" terms, one per index, ending in
 * a literal 0.
 *
 * NOTE(review): the comment delimiters and the `for` headers were missing.
 * Each loop below iterates over the same index range as the appendIdxes()
 * call that opens the corresponding macro -- confirm against upstream.
 *
 * @param ctx  Code-generation context; text is appended to ctx->s.
 */

static void  maxandargmaxAppendLoopMacroDefs    (maxandargmax_ctx*  ctx){
	int i;

	/**
	 * FOROVER Macro
	 */

	strb_appends(&ctx->s, "#define FOROVER(idx)    for(i##idx = i##idx##Start; i##idx < i##idx##End; i##idx++)\n");

	/**
	 * ESCAPE Macro
	 */

	strb_appends(&ctx->s, "#define ESCAPE(idx)     if(i##idx >= i##idx##Dim){continue;}\n");

	/**
	 * SRCINDEXER Macro
	 */

	appendIdxes (&ctx->s, "#define SRCINDEXER(", "i", 0, ctx->nds, "", ")   (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)src + ");
	for(i=0;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "i%d*i%dSStep + \\\n                                            ", i, i);
	}
	strb_appends(&ctx->s, "0))\n");

	/**
	 * RDXINDEXER Macro
	 */

	appendIdxes (&ctx->s, "#define RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ")              (");
	for(i=ctx->ndd;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "i%d*i%dPDim + \\\n                                        ", i, i);
	}
	strb_appends(&ctx->s, "0)\n");

	/**
	 * DSTMINDEXER Macro
	 */

	appendIdxes (&ctx->s, "#define DSTMINDEXER(", "i", 0, ctx->ndd, "", ")        (*(GLOBAL_MEM T*)((GLOBAL_MEM char*)dstMax + ");
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "i%d*i%dMStep + \\\n                                                  ", i, i);
	}
	strb_appends(&ctx->s, "0))\n");

	/**
	 * DSTAINDEXER Macro
	 */

	appendIdxes (&ctx->s, "#define DSTAINDEXER(", "i", 0, ctx->ndd, "", ")        (*(GLOBAL_MEM X*)((GLOBAL_MEM char*)dstArgmax + ");
	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "i%d*i%dAStep + \\\n                                                     ", i, i);
	}
	strb_appends(&ctx->s, "0))\n");
}
/* Forward declaration: the definition appears later in this file. */
static void  maxandargmaxAppendLoopInner        (maxandargmax_ctx*  ctx);

/**
 * Append to ctx->s the outer loop nest of the generated kernel: one
 * "FOROVER(n){ESCAPE(n)" header per outer index, then the inner loop text,
 * then one closing brace per header.
 *
 * NOTE(review): the loop headers and the statement under "Inner Loop
 * Generation" were missing from this function. The range [0, ndd) is
 * inferred from maxandargmaxAppendLoopInner(), which emits loops for
 * [ndd, nds) and uses i0..i<ndd-1> without assigning them; the call to
 * maxandargmaxAppendLoopInner() is inferred from the section comment.
 * Confirm both against the upstream source.
 *
 * @param ctx  Code-generation context; text is appended to ctx->s.
 */

static void  maxandargmaxAppendLoopOuter        (maxandargmax_ctx*  ctx){
	int i;

	/**
	 * Outer Loop Header Generation
	 */

	for(i=0;i<ctx->ndd;i++){
		strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i);
	}

	/**
	 * Inner Loop Generation
	 */

	maxandargmaxAppendLoopInner(ctx);

	/**
	 * Outer Loop Trailer Generation
	 */

	for(i=0;i<ctx->ndd;i++){
		strb_appends(&ctx->s, "\t}\n");
	}
}
/**
 * Append to ctx->s the declarations of every index variable used by the
 * generated kernel:
 *   - bi/bd/ti/gi 0..2, initialised from GID_*, LDIM_* and LID_*;
 *   - ci<N> = chunkSize[N] for N in [0, ndh), only when ndh > 0;
 *   - i<N>, i<N>Dim, i<N>Start, i<N>End, i<N>SStep for N in [0, nds);
 *   - i<N>MStep, i<N>AStep for N in [0, ndd);
 *   - i<N>PDim for N in [ndd, nds).
 *
 * NOTE(review): the `if`/`for` around the ci<N> declarations were missing.
 * They are reconstructed from the `i==ctx->ndh-1` terminator test and the
 * original indentation -- confirm against upstream.
 *
 * @param ctx  Code-generation context; text is appended to ctx->s.
 */

static void  maxandargmaxAppendIndexDeclarations(maxandargmax_ctx*  ctx){
	int i;
	strb_appends(&ctx->s, "\t/* GPU kernel coordinates. Always 3D. */\n");

	strb_appends(&ctx->s, "\tX bi0 = GID_0,        bi1 = GID_1,        bi2 = GID_2;\n");
	strb_appends(&ctx->s, "\tX bd0 = LDIM_0,       bd1 = LDIM_1,       bd2 = LDIM_2;\n");
	strb_appends(&ctx->s, "\tX ti0 = LID_0,        ti1 = LID_1,        ti2 = LID_2;\n");
	strb_appends(&ctx->s, "\tX gi0 = bi0*bd0+ti0,  gi1 = bi1*bd1+ti1,  gi2 = bi2*bd2+ti2;\n");

	/* Without the guard, ndh == 0 would emit a dangling "X " with no terminator. */
	if(ctx->ndh>0){
		strb_appends(&ctx->s, "\tX ");
		for(i=0;i<ctx->ndh;i++){
			/* i is an int, so %d (not %u) is the matching conversion. */
			strb_appendf(&ctx->s, "ci%d = chunkSize[%d]%s",
			             i, i, (i==ctx->ndh-1) ? ";\n" : ", ");
		}
	}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t/* Free indices & Reduction indices */\n");

	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "",        ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "Dim",     ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "Start",   ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "End",     ";\n");}
	if(ctx->nds > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->nds, "SStep",   ";\n");}
	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->ndd, "MStep",   ";\n");}
	if(ctx->ndd > 0){appendIdxes (&ctx->s, "\tX ", "i", 0,               ctx->ndd, "AStep",   ";\n");}
	if(ctx->nds > ctx->ndd){appendIdxes (&ctx->s, "\tX ", "i", ctx->ndd, ctx->nds, "PDim",    ";\n");}

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t\n");
}
/* Example #6 */
/**
 * Append the given source sections to `src`, one output line per input
 * line, each prefixed with a zero-padded line number and a tab.
 *
 * Numbering starts at 1 and continues across sections. Every emitted line
 * is terminated with '\n', including a final line that had no newline.
 *
 * NOTE(review): the strlen() fallback statement, the line-counter increment
 * and the closing braces were missing from this function. They are restored
 * from the inline comments and the function's purpose -- confirm against
 * upstream. strlen() requires <string.h>; the enclosing file is assumed to
 * include it already.
 *
 * @param count  Number of sections in `news`.
 * @param news   Array of `count` source strings.
 * @param newl   Lengths of the sections, or NULL. A NULL array or a zero
 *               entry means the section length is taken with strlen().
 * @param src    String builder receiving the numbered text; must not be NULL.
 */
void gpukernel_source_with_line_numbers(unsigned int count, const char **news, size_t *newl,
                                        strb *src) {
  unsigned int section, line;
  size_t len, i, j; /* size_t so positions compare cleanly against len */

  assert(src != NULL);

  line = 1;  /* start the line counter at 1 */
  for (section = 0; section < count; section++) {
    len = (newl == NULL) ? 0 : newl[section];
    if (len == 0) {
      /* Either newl == NULL or this section has length zero: use strlen(). */
      len = strlen(news[section]);
    }

    i = 0;  /* position of line-starts within news[section] */
    while (i < len) {
      strb_appendf(src, "%04u\t", line);

      /* Look for the next line-end. */
      for (j = i; j < len && news[section][j] != '\n'; j++) {
        /* empty */
      }
      strb_appendn(src, news[section] + i, (j - i));
      strb_appendc(src, '\n');

      i = j + 1;  /* character after the newline */
      line++;
    }
  }
}
/* Example #7 */
/**
 * Append to `sb` C-like statements that turn a flat element index `i` into
 * an offset accumulated in the variable named `id`, using a temporary named
 * "<id>i".
 *
 * For each dimension from nd-1 down to 1 the emitted code adds or subtracts
 * (<id>i % dims[d]) * |str[d]| and divides <id>i by dims[d]; dimension 0
 * then contributes <id>i * |str[0]|. Nothing is emitted when nd == 0.
 *
 * The closing braces of the loop, the `if` and the function were missing
 * and have been restored.
 *
 * @param sb    String builder receiving the generated text.
 * @param nd    Number of dimensions.
 * @param dims  Dimension sizes, nd entries.
 * @param str   Strides, nd entries; the sign selects "+=" or "-=".
 * @param id    Name of the offset variable in the generated code.
 */
void gpuarray_elem_perdim(strb *sb, unsigned int nd,
                          const size_t *dims, const ssize_t *str,
                          const char *id) {
  int i;

  if (nd > 0) {
    strb_appendf(sb, "int %si = i;", id);

    for (i = nd-1; i > 0; i--) {
      strb_appendf(sb, "%s %c= ((%si %% %" SPREFIX "u) * "
                   "%" SPREFIX "d);%si = %si / %" SPREFIX "u;", id,
                   (str[i] < 0 ? '-' : '+'), id, dims[i],
                   ssabs(str[i]), id, id, dims[i]);
    }

    /* Dimension 0 needs no modulo/divide: <id>i is already its coordinate. */
    strb_appendf(sb, "%s %c= (%si * %" SPREFIX "d);", id,
                 (str[0] < 0 ? '-' : '+'), id, ssabs(str[0]));
  }
}
/**
 * Append to `sb` PTX text that turns the flat index `i` into an offset
 * accumulated in the register named `id`, using a temporary named "<id>i"
 * and the scratch register rl1.
 *
 * For each dimension from nd-1 down to 1 the emitted text is a rem / mad.lo
 * / div triple using dims[d] and str[d]; dimension 0 gets a single mad.lo
 * with str[0]. Nothing is emitted when nd == 0.
 *
 * The closing braces of the loop, the `if` and the function were missing
 * and have been restored.
 *
 * @param sb    String builder receiving the generated text.
 * @param nd    Number of dimensions.
 * @param dims  Dimension sizes, nd entries.
 * @param str   Strides, nd entries.
 * @param id    Name of the offset register in the generated code.
 * @param bits  Width used for the instruction type suffixes (u<bits>, s<bits>).
 */
static void cuda_perdim_ptx(strb *sb, unsigned int nd,
			    const size_t *dims, const ssize_t *str,
			    const char *id, unsigned int bits) {
  int i;

  if (nd > 0) {
    strb_appendf(sb, "mov.u%u %si, i;\n", bits, id);
    for (i = nd-1; i > 0; i--) {
      strb_appendf(sb, "rem.u%u rl1, %si, %" SPREFIX "uU;\n"
		   "mad.lo.s%u %s, rl1, %" SPREFIX "d, %s;\n"
		   "div.u%u %si, %si, %" SPREFIX "uU;\n",
		   bits, id, dims[i],
		   bits, id, str[i], id,
		   bits, id, id, dims[i]);
    }

    strb_appendf(sb, "mad.lo.s%u %s, %si, %" SPREFIX "d, %s;\n",
		 bits, id, id, str[0], id);
  }
}
/**
 * Append to `s` a comma-separated list of identifiers of the form
 * <prefix><N><suffix> for N in [startIdx, endIdx), wrapped between
 * `prologue` and `epilogue`.
 *
 * A NULL prologue, prefix, suffix or epilogue is treated as "". When
 * startIdx >= endIdx only the prologue and epilogue are appended.
 *
 * The `for` header and the closing braces were missing (leaving `i`
 * uninitialised); the range is restored from the startIdx/endIdx parameters
 * and the `i==endIdx-1` terminator test.
 *
 * @param s         String builder receiving the text.
 * @param prologue  Text emitted before the list (may be NULL).
 * @param prefix    Text emitted before each index number (may be NULL).
 * @param startIdx  First index, inclusive.
 * @param endIdx    Last index, exclusive.
 * @param suffix    Text emitted after each index number (may be NULL).
 * @param epilogue  Text emitted after the list (may be NULL).
 */

static void  appendIdxes                        (strb*              s,
                                                 const char*        prologue,
                                                 const char*        prefix,
                                                 int                startIdx,
                                                 int                endIdx,
                                                 const char*        suffix,
                                                 const char*        epilogue){
	int i;

	prologue = prologue ? prologue : "";
	prefix   = prefix   ? prefix   : "";
	suffix   = suffix   ? suffix   : "";
	epilogue = epilogue ? epilogue : "";

	strb_appends(s, prologue);
	for(i=startIdx;i<endIdx;i++){
		/* &","[0] is "," and &","[1] is "": no separator after the last item. */
		strb_appendf(s, "%s%d%s%s", prefix, i, suffix, &","[i==endIdx-1]);
	}
	strb_appends(s, epilogue);
}
/* Example #10 */
/**
 * Append to ctx->s the reduction part of the generated kernel:
 *   1. initialise maxV from SRCINDEXER and maxI from RDXINDEXER, both
 *      evaluated at the i<N>Start values of the indices [ndd, nds);
 *   2. open one "FOROVER(n){ESCAPE(n)" loop per index in [ndd, nds);
 *   3. in the loop body, load V and update maxV/maxI when V > maxV;
 *   4. close the loops and write maxV/maxI through DSTMINDEXER/DSTAINDEXER.
 *
 * NOTE(review): the comment delimiters, the two `for` headers and the
 * closing brace were missing. The range [ndd, nds) is taken from the index
 * range passed to RDXINDEXER in this function -- confirm against upstream.
 *
 * @param ctx  Code-generation context; text is appended to ctx->s.
 */

static void  maxandargmaxAppendLoopInner        (maxandargmax_ctx*  ctx){
	int i;

	/**
	 * Inner Loop Prologue
	 */

	strb_appends(&ctx->s, "\t/**\n");
	strb_appends(&ctx->s, "\t * Reduction initialization.\n");
	strb_appends(&ctx->s, "\t */\n");
	strb_appends(&ctx->s, "\t\n");

	/* The argument list is built in two halves; the comma joins them only if both are non-empty. */
	appendIdxes (&ctx->s, "\tT maxV = SRCINDEXER(", "i", 0, ctx->ndd, "", "");
	if(ctx->ndd && ctx->ndr){strb_appends(&ctx->s, ",");}
	appendIdxes (&ctx->s, "", "i", ctx->ndd, ctx->nds, "Start", ");\n");

	appendIdxes (&ctx->s, "\tX maxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "Start", ");\n");

	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\t/**\n");
	strb_appends(&ctx->s, "\t * REDUCTION LOOPS.\n");
	strb_appends(&ctx->s, "\t */\n");
	strb_appends(&ctx->s, "\t\n");

	/**
	 * Inner Loop Header Generation
	 */

	for(i=ctx->ndd;i<ctx->nds;i++){
		strb_appendf(&ctx->s, "\tFOROVER(%d){ESCAPE(%d)\n", i, i);
	}

	/**
	 * Inner Loop Body Generation
	 */

	appendIdxes (&ctx->s, "\tT V = SRCINDEXER(", "i", 0, ctx->nds, "", ");\n");
	strb_appends(&ctx->s, "\t\n");
	strb_appends(&ctx->s, "\tif(V > maxV){\n");
	strb_appends(&ctx->s, "\t\tmaxV = V;\n");
	appendIdxes (&ctx->s, "\t\tmaxI = RDXINDEXER(", "i", ctx->ndd, ctx->nds, "", ");\n");
	strb_appends(&ctx->s, "\t}\n");

	/**
	 * Inner Loop Trailer Generation
	 */

	for(i=ctx->ndd;i<ctx->nds;i++){
		strb_appends(&ctx->s, "\t}\n");
	}
	strb_appends(&ctx->s, "\t\n");

	/**
	 * Inner Loop Epilogue Generation
	 */

	strb_appends(&ctx->s, "\t/**\n");
	strb_appends(&ctx->s, "\t * Destination writeback.\n");
	strb_appends(&ctx->s, "\t */\n");
	strb_appends(&ctx->s, "\t\n");
	appendIdxes (&ctx->s, "\tDSTMINDEXER(", "i", 0, ctx->ndd, "", ") = maxV;\n");
	appendIdxes (&ctx->s, "\tDSTAINDEXER(", "i", 0, ctx->ndd, "", ") = maxI;\n");
}
/* Example #11 */
/*
 * Build the PTX source of an "extcpy" kernel into a local string builder
 * and hand it to cuda_newkernel(), storing the resulting kernel in *v.
 *
 * The source is assembled from ELEM_HEADER_PTX, two cuda_perdim_ptx() calls
 * (registers "a_p" for the input and "b_p" for the output), a load/convert/
 * store sequence, and ELEM_FOOTER_PTX. Kernel flags start at GA_USE_PTX and
 * gain GA_USE_DOUBLE / GA_USE_HALF / GA_USE_SMALL / GA_USE_COMPLEX depending
 * on a->itype and a->otype.
 *
 * Returns GA_DEVSUP_ERROR when map_t() yields NULL for either type;
 * otherwise returns the `res` value filled in by cuda_newkernel().
 *
 * NOTE(review): this excerpt appears to be missing lines -- `else` keywords,
 * closing braces, parts of the load/store format string and its argument
 * list, and the `fail` label targeted by `goto fail`. The notes below mark
 * the affected spots; confirm everything against the upstream source.
 */
static inline int gen_extcopy_kernel(const extcopy_args *a,
				     cuda_context *ctx, gpukernel **v,
				     size_t nEls) {
  strb sb = STRB_STATIC_INIT;
  int res = GA_SYS_ERROR;
  int flags = GA_USE_PTX;
  /* Pointer width in bits; used for the type suffixes in the generated PTX. */
  unsigned int bits = sizeof(void *)*8;
  int types[2];
  const char *in_t, *in_ld_t;
  const char *out_t, *out_ld_t;
  const char *rmod;

  in_t = map_t(a->itype);
  out_t = map_t(a->otype);
  /* Since float16 ('f16') is not a fully-supported type we need to use
     it as b16 (basically uint16) for read and write operations. */
  /* NOTE(review): as written, the second assignment of each pair always
     runs and overrides the "b16" choice; an `else` between them appears to
     have been lost. */
  if (a->itype == GA_HALF)
    in_ld_t = "b16";
    in_ld_t = in_t;
  if (a->otype == GA_HALF)
    out_ld_t = "b16";
    out_ld_t = out_t;
  rmod = get_rmod(a->itype, a->otype);
  if (in_t == NULL || out_t == NULL) return GA_DEVSUP_ERROR;

  strb_appendf(&sb, ELEM_HEADER_PTX, "4.1", ctx->bin_id,
               bits, bits, bits, bits, in_t, out_t, bits,
               bits, bits, bits, bits, nEls, bits, bits);

  cuda_perdim_ptx(&sb, a->ind, a->idims, a->istr, "a_p", bits);
  cuda_perdim_ptx(&sb, a->ond, a->odims, a->ostr, "b_p", bits);

  /* NOTE(review): the lines starting with " tmpa" and " [rp1+" carry no
     conversion for in_ld_t / out_ld_t although both are passed below, and
     the argument list does not line up with the conversions shown.
     Presumably a load/store mnemonic containing a %s and some `bits`
     arguments are missing -- confirm against upstream. */
  strb_appendf(&sb, "ld.param.u%u rp1, [a_data];\n"
	       "cvt.s%u.s%u rp2, a_p;\n"
	       "add.s%u rp1, rp1, rp2;\n"
	       " tmpa, [rp1+%" SPREFIX "u];\n"
	       "cvt%s.%s.%s tmpb, tmpa;\n"
	       "ld.param.u%u rp1, [b_data];\n"
	       "cvt.s%u.s%u rp2, b_p;\n"
	       "add.s%u rp1, rp1, rp2;\n"
	       " [rp1+%" SPREFIX "u], tmpb;\n", bits,
	       bits, bits,
	       in_ld_t, a->ioff,
	       rmod, out_t, in_t,
	       bits, bits,
	       out_ld_t, a->ooff);

  strb_appendf(&sb, ELEM_FOOTER_PTX, bits, bits, nEls);

  /* NOTE(review): no `fail` label is visible in this excerpt. */
  if (strb_error(&sb))
    goto fail;

  /* NOTE(review): the closing brace of each of the following `if` blocks is
     not visible in this excerpt. */
  if (a->itype == GA_DOUBLE || a->otype == GA_DOUBLE ||
      a->itype == GA_CDOUBLE || a->otype == GA_CDOUBLE) {
    flags |= GA_USE_DOUBLE;

  if (a->otype == GA_HALF || a->itype == GA_HALF) {
    flags |= GA_USE_HALF;

  if (gpuarray_get_elsize(a->otype) < 4 || gpuarray_get_elsize(a->itype) < 4) {
    /* Should check for non-mod4 strides too */
    flags |= GA_USE_SMALL;

  if (a->otype == GA_CFLOAT || a->itype == GA_CFLOAT ||
      a->otype == GA_CDOUBLE || a->itype == GA_CDOUBLE) {
    flags |= GA_USE_COMPLEX;

  /* Both kernel arguments are declared as buffers. */
  types[0] = types[1] = GA_BUFFER;
  res = GA_NO_ERROR;
  *v = cuda_newkernel(ctx, 1, (const char **)&sb.s, &sb.l, "extcpy",
                      2, types, flags, &res, NULL);
  return res;
/* Example #12 */
/*
 * Build the source of a "take1" kernel into a local string builder and
 * initialise `k` from it with GpuKernel_init().
 *
 * The kernel signature has 9 + 2 * v->nd arguments: r and its offset, v and
 * its offset, one (stride, dimension) pair per dimension of v, ind and its
 * offset, n0, n1 and an error pointer. The matching argument type codes are
 * recorded in `atypes` in the same order. `addr32` selects ga_uint/ga_int
 * instead of ga_size/ga_ssize as the index types used inside the kernel.
 *
 * Returns GA_MEMORY_ERROR if `atypes` cannot be allocated; otherwise the
 * value returned by GpuKernel_init().
 *
 * NOTE(review): this excerpt appears to be missing lines -- the trailing
 * arguments of several strb_appendf() calls, closing braces, an `else`, the
 * end of the final strb_appends() call, and the `bail` label targeted by
 * `goto bail`. No release of `atypes` or `sb` is visible here either.
 * Confirm everything against the upstream source.
 */
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb sb = STRB_STATIC_INIT;
  int *atypes;
  char *sz, *ssz;
  unsigned int i, i2;
  unsigned int nargs, apos;
  int flags = GA_USE_CLUDA;
  int res;

  /* 9 fixed kernel arguments plus a (stride, dim) pair per dimension of v. */
  nargs = 9 + 2 * v->nd;

  atypes = calloc(nargs, sizeof(int));
  if (atypes == NULL)
    return GA_MEMORY_ERROR;

  /* NOTE(review): the closing brace of the `else` block is not visible. */
  if (addr32) {
    sz = "ga_uint";
    ssz = "ga_int";
  } else {
    sz = "ga_size";
    ssz = "ga_ssize";

  apos = 0;
  /* NOTE(review): the two %s arguments and the closing ");" of this call
     are not visible in this excerpt. */
  strb_appendf(&sb, "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  /* NOTE(review): the closing brace of this loop is not visible. */
  for (i = 0; i < v->nd; i++) {
    strb_appendf(&sb, " ga_ssize s%u, ga_size d%u,", i, i);
    atypes[apos++] = GA_SSIZE;
    atypes[apos++] = GA_SIZE;
  /* NOTE(review): the %s argument and the closing ");" of this call are not
     visible in this excerpt. */
  strb_appendf(&sb, " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
  atypes[apos++] = GA_BUFFER;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_SIZE;
  atypes[apos++] = GA_BUFFER;
  /* Every slot of atypes must have been filled exactly once. */
  assert(apos == nargs);
  strb_appendf(&sb, "  const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               "  const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               "  const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               "  const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               "  %s i0, i1;\n", sz, sz, sz, sz, sz);
  strb_appends(&sb, "  if (idx0 >= n0 || idx1 >= n1) return;\n");
  /* NOTE(review): the two %s arguments and the closing ");" of this call
     are not visible in this excerpt. */
  strb_appendf(&sb, "  r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               "  ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
  /* The emitted loop reads ind[i0], wraps a negative index by adding d0,
     and on an out-of-range index sets *err = -1 and skips that i0. */
  strb_appendf(&sb, "  for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               "    %s ii0 = ind[i0];\n"
               "    %s pos0 = v_off;\n"
               "    if (ii0 < 0) ii0 += d0;\n"
               "    if ((ii0 < 0) || (ii0 >= d0)) {\n"
               "      *err = -1;\n"
               "      continue;\n"
               "    }\n"
               "    pos0 += ii0 * (%s)s0;\n"
               "    for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               "      %s p = pos0;\n", ssz, sz, sz, sz);
  /* For v with more than one dimension, emit code that decomposes i1 over
     dimensions nd-1 .. 1 and accumulates the byte position in p. */
  /* NOTE(review): an `else` before the "pos = ii" append and the closing
     braces of the loop and the `if` are not visible in this excerpt. */
  if (v->nd > 1) {
    strb_appendf(&sb, "      %s pos, ii = i1;\n", sz);
    for (i2 = v->nd; i2 > 1; i2--) {
      i = i2 - 1;
      if (i > 1)
        strb_appendf(&sb, "      pos = ii %% (%s)d%u;\n"
                     "      ii /= (%s)d%u;\n", sz, i, sz, i);
        strb_appends(&sb, "      pos = ii;\n");
      strb_appendf(&sb, "      p += pos * (%s)s%u;\n", ssz, i);
  strb_appendf(&sb, "      r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n",
               sz, gpuarray_get_type(v->typecode)->cluda_name);
  /* NOTE(review): the end of this call (final string piece and ");") is not
     visible in this excerpt. */
  strb_appends(&sb, "    }\n"
               "  }\n"
  /* NOTE(review): the closing brace of this `if` and the `bail` label are
     not visible in this excerpt. */
  if (strb_error(&sb)) {
    res = GA_MEMORY_ERROR;
    goto bail;
  flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
  res = GpuKernel_init(k, ctx, 1, (const char **)&sb.s, &sb.l, "take1",
                       nargs, atypes, flags, err_str);
  return res;