/*
 * Reset the tag set and fill it with the defaults: the source name and the
 * configured host name (each only when non-NULL), the fields of the current
 * local time, and the current time as formatted by rfc822time().
 */
void
default_tags(struct strb **tags, const char *src)
{
	struct tm	*now_tm;
	time_t		 now;
	char		 stamp[128];

	strb_clear(tags);

	if (src != NULL)
		add_tag(tags, "source", "%s", src);
	if (conf.host_name != NULL)
		add_tag(tags, "hostname", "%s", conf.host_name);

	now = time(NULL);
	now_tm = localtime(&now);
	if (now_tm != NULL) {
		/*
		 * struct tm counts almost everything from zero (months
		 * included); only the day of the month starts at one.
		 *
		 * To make things clearer, strftime(3) reports values the
		 * way a person would expect... except that its day of the
		 * week is 0-6 while its day of the year is 1-366.
		 *
		 * So the month and the day of the year are bumped by one
		 * here, and the year is rebased from its 1900 offset.
		 */
		add_tag(tags, "hour", "%.2d", now_tm->tm_hour);
		add_tag(tags, "minute", "%.2d", now_tm->tm_min);
		add_tag(tags, "second", "%.2d", now_tm->tm_sec);
		add_tag(tags, "day", "%.2d", now_tm->tm_mday);
		add_tag(tags, "month", "%.2d", now_tm->tm_mon + 1);
		add_tag(tags, "year", "%.4d", 1900 + now_tm->tm_year);
		add_tag(tags, "year2", "%.2d", now_tm->tm_year % 100);
		add_tag(tags, "dayofweek", "%d", now_tm->tm_wday);
		add_tag(tags, "dayofyear", "%.2d", now_tm->tm_yday + 1);
		add_tag(tags, "quarter", "%d", now_tm->tm_mon / 3 + 1);
	}

	/* The RFC 822 date is added even if localtime() failed above. */
	if (rfc822time(now, stamp, sizeof stamp) != NULL)
		add_tag(tags, "rfc822date", "%s", stamp);
}
/*
 * Generate the PTX source for the "extcpy" kernel described by `a` and build
 * it with cuda_newkernel(), storing the resulting kernel in *v.
 *
 * Returns GA_DEVSUP_ERROR when map_t() has no name for the input or output
 * type, GA_SYS_ERROR when the string builder reports an error, and otherwise
 * the value left in the result code by cuda_newkernel() (which starts out as
 * GA_NO_ERROR).  *v is only written when cuda_newkernel() is reached.
 */
static inline int gen_extcopy_kernel(const extcopy_args *a,
                                     cuda_context *ctx, gpukernel **v,
                                     size_t nEls) {
  strb ptx = STRB_STATIC_INIT;
  const unsigned int bits = sizeof(void *)*8;
  const char *in_t = map_t(a->itype);
  const char *out_t = map_t(a->otype);
  /*
   * float16 ('f16') is not a fully-supported type, so loads and stores of
   * half values go through b16 (basically uint16) instead.
   */
  const char *in_ld_t = (a->itype == GA_HALF) ? "b16" : in_t;
  const char *out_ld_t = (a->otype == GA_HALF) ? "b16" : out_t;
  const char *rmod = get_rmod(a->itype, a->otype);
  int types[2] = { GA_BUFFER, GA_BUFFER };
  int flags = GA_USE_PTX;
  int res = GA_SYS_ERROR;

  if (in_t == NULL || out_t == NULL)
    return GA_DEVSUP_ERROR;

  strb_appendf(&ptx, ELEM_HEADER_PTX, "4.1", ctx->bin_id,
               bits, bits, bits, bits, in_t, out_t,
               bits, bits, bits, bits, bits, nEls, bits, bits);

  /* Per-dimension offset computation for the input, then the output. */
  cuda_perdim_ptx(&ptx, a->ind, a->idims, a->istr, "a_p", bits);
  cuda_perdim_ptx(&ptx, a->ond, a->odims, a->ostr, "b_p", bits);

  /* Load one element, convert it, and store it at the output position. */
  strb_appendf(&ptx,
               "ld.param.u%u rp1, [a_data];\n"
               "cvt.s%u.s%u rp2, a_p;\n"
               "add.s%u rp1, rp1, rp2;\n"
               "ld.global.%s tmpa, [rp1+%" SPREFIX "u];\n"
               "cvt%s.%s.%s tmpb, tmpa;\n"
               "ld.param.u%u rp1, [b_data];\n"
               "cvt.s%u.s%u rp2, b_p;\n"
               "add.s%u rp1, rp1, rp2;\n"
               "st.global.%s [rp1+%" SPREFIX "u], tmpb;\n",
               bits, bits, bits, bits, in_ld_t, a->ioff,
               rmod, out_t, in_t,
               bits, bits, bits, bits, out_ld_t, a->ooff);

  strb_appendf(&ptx, ELEM_FOOTER_PTX, bits, bits, nEls);

  if (!strb_error(&ptx)) {
    /* Tell the kernel builder which type features the source relies on. */
    if (a->itype == GA_DOUBLE || a->otype == GA_DOUBLE ||
        a->itype == GA_CDOUBLE || a->otype == GA_CDOUBLE)
      flags |= GA_USE_DOUBLE;

    if (a->otype == GA_HALF || a->itype == GA_HALF)
      flags |= GA_USE_HALF;

    /* Should check for non-mod4 strides too */
    if (gpuarray_get_elsize(a->otype) < 4 ||
        gpuarray_get_elsize(a->itype) < 4)
      flags |= GA_USE_SMALL;

    if (a->otype == GA_CFLOAT || a->itype == GA_CFLOAT ||
        a->otype == GA_CDOUBLE || a->itype == GA_CDOUBLE)
      flags |= GA_USE_COMPLEX;

    res = GA_NO_ERROR;
    *v = cuda_newkernel(ctx, 1, (const char **)&ptx.s, &ptx.l, "extcpy",
                        2, types, flags, &res, NULL);
  }

  strb_clear(&ptx);
  return res;
}
static gpukernel *cuda_newkernel(void *c, unsigned int count, const char **strings, const size_t *lengths, const char *fname, unsigned int argcount, const int *types, int flags, int *ret, char **err_str) { cuda_context *ctx = (cuda_context *)c; strb sb = STRB_STATIC_INIT; char *bin, *log = NULL; srckey k, *ak; binval *av; gpukernel *res; size_t bin_len = 0, log_len = 0; CUdevice dev; unsigned int i; int ptx_mode = 0; int binary_mode = 0; int major, minor; if (count == 0) FAIL(NULL, GA_VALUE_ERROR); if (flags & GA_USE_OPENCL) FAIL(NULL, GA_DEVSUP_ERROR); if (flags & GA_USE_BINARY) { // GA_USE_BINARY is exclusive if (flags & ~GA_USE_BINARY) FAIL(NULL, GA_INVALID_ERROR); // We need the length for binary data and there is only one blob. if (count != 1 || lengths == NULL || lengths[0] == 0) FAIL(NULL, GA_VALUE_ERROR); } cuda_enter(ctx); ctx->err = cuCtxGetDevice(&dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuDeviceComputeCapability(&major, &minor, dev); if (ctx->err != CUDA_SUCCESS) { cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } // GA_USE_CLUDA is done later // GA_USE_SMALL will always work if (flags & GA_USE_DOUBLE) { if (major < 1 || (major == 1 && minor < 3)) { cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } } if (flags & GA_USE_COMPLEX) { // just for now since it is most likely broken cuda_exit(ctx); FAIL(NULL, GA_DEVSUP_ERROR); } // GA_USE_HALF should always work if (flags & GA_USE_PTX) { ptx_mode = 1; } else if (flags & GA_USE_BINARY) { binary_mode = 1; } if (binary_mode) { bin = memdup(strings[0], lengths[0]); bin_len = lengths[0]; if (bin == NULL) { cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } } else { if (flags & GA_USE_CLUDA) { strb_appends(&sb, CUDA_PREAMBLE); } if (lengths == NULL) { for (i = 0; i < count; i++) strb_appends(&sb, strings[i]); } else { for (i = 0; i < count; i++) { if (lengths[i] == 0) strb_appends(&sb, strings[i]); else strb_appendn(&sb, strings[i], lengths[i]); } } strb_append0(&sb); if 
(strb_error(&sb)) { strb_clear(&sb); cuda_exit(ctx); return NULL; } if (ptx_mode) { bin = sb.s; bin_len = sb.l; } else { bin = NULL; if (compile_cache != NULL) { k.src = sb.s; k.len = sb.l; memcpy(k.arch, ctx->bin_id, BIN_ID_LEN); av = cache_get(compile_cache, &k); if (av != NULL) { bin = memdup(av->bin, av->len); bin_len = av->len; } } if (bin == NULL) { bin = call_compiler(sb.s, sb.l, ctx->bin_id, &bin_len, &log, &log_len, ret); } if (bin == NULL) { if (err_str != NULL) { strb debug_msg = STRB_STATIC_INIT; // We're substituting debug_msg for a string with this first line: strb_appends(&debug_msg, "CUDA kernel build failure ::\n"); /* Delete the final NUL */ sb.l--; gpukernel_source_with_line_numbers(1, (const char **)&sb.s, &sb.l, &debug_msg); if (log != NULL) { strb_appends(&debug_msg, "\nCompiler log:\n"); strb_appendn(&debug_msg, log, log_len); free(log); } *err_str = strb_cstr(&debug_msg); // *err_str will be free()d by the caller (see docs in kernel.h) } strb_clear(&sb); cuda_exit(ctx); return NULL; } if (compile_cache == NULL) compile_cache = cache_twoq(16, 16, 16, 8, src_eq, src_hash, src_free, bin_free); if (compile_cache != NULL) { ak = malloc(sizeof(*ak)); av = malloc(sizeof(*av)); if (ak == NULL || av == NULL) { free(ak); free(av); goto done; } ak->src = memdup(sb.s, sb.l); if (ak->src == NULL) { free(ak); free(av); goto done; } ak->len = sb.l; memmove(ak->arch, ctx->bin_id, BIN_ID_LEN); av->len = bin_len; av->bin = memdup(bin, bin_len); if (av->bin == NULL) { src_free(ak); free(av); goto done; } cache_add(compile_cache, ak, av); } done: strb_clear(&sb); } } res = calloc(1, sizeof(*res)); if (res == NULL) { free(bin); cuda_exit(ctx); FAIL(NULL, GA_SYS_ERROR); } res->bin_sz = bin_len; res->bin = bin; res->refcnt = 1; res->argcount = argcount; res->types = calloc(argcount, sizeof(int)); if (res->types == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } memcpy(res->types, types, argcount*sizeof(int)); res->args = 
calloc(argcount, sizeof(void *)); if (res->args == NULL) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_MEMORY_ERROR); } ctx->err = cuModuleLoadData(&res->m, bin); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } ctx->err = cuModuleGetFunction(&res->k, res->m, fname); if (ctx->err != CUDA_SUCCESS) { _cuda_freekernel(res); cuda_exit(ctx); FAIL(NULL, GA_IMPL_ERROR); } res->ctx = ctx; ctx->refcnt++; cuda_exit(ctx); TAG_KER(res); return res; }
/*
 * Emit the source of the "take1" kernel for the arrays a (result), v
 * (source) and ind (indices) and initialise `k` from it.
 *
 * The generated signature has 9 fixed arguments plus a stride/dimension
 * pair for each of v's dimensions; the matching argument type list is built
 * alongside the source text.  `addr32` picks ga_uint/ga_int instead of
 * ga_size/ga_ssize for the index arithmetic inside the kernel.
 *
 * Returns GA_MEMORY_ERROR if the type list cannot be allocated or the
 * string builder reports an error, otherwise the result of GpuKernel_init().
 */
static int gen_take1_kernel(GpuKernel *k, gpucontext *ctx, char **err_str,
                            GpuArray *a, const GpuArray *v,
                            const GpuArray *ind, int addr32) {
  strb src = STRB_STATIC_INIT;
  const char *utype = addr32 ? "ga_uint" : "ga_size";
  const char *stype = addr32 ? "ga_int" : "ga_ssize";
  unsigned int nargs = 9 + 2 * v->nd;
  unsigned int slot = 0;
  unsigned int dim;
  int flags = GA_USE_CLUDA;
  int *atypes;
  int res;

  atypes = calloc(nargs, sizeof(int));
  if (atypes == NULL)
    return GA_MEMORY_ERROR;

  /* Signature: result and source buffers with their byte offsets... */
  strb_appendf(&src,
               "KERNEL void take1(GLOBAL_MEM %s *r, ga_size r_off, "
               "GLOBAL_MEM const %s *v, ga_size v_off,",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(v->typecode)->cluda_name);
  atypes[slot++] = GA_BUFFER;
  atypes[slot++] = GA_SIZE;
  atypes[slot++] = GA_BUFFER;
  atypes[slot++] = GA_SIZE;

  /* ...one stride/dimension pair per dimension of v... */
  for (dim = 0; dim < v->nd; dim++) {
    strb_appendf(&src, " ga_ssize s%u, ga_size d%u,", dim, dim);
    atypes[slot++] = GA_SSIZE;
    atypes[slot++] = GA_SIZE;
  }

  /* ...then the index buffer, the two extents and the error flag. */
  strb_appendf(&src,
               " GLOBAL_MEM const %s *ind, ga_size i_off, "
               "ga_size n0, ga_size n1, GLOBAL_MEM int* err) {\n",
               gpuarray_get_type(ind->typecode)->cluda_name);
  atypes[slot++] = GA_BUFFER;
  atypes[slot++] = GA_SIZE;
  atypes[slot++] = GA_SIZE;
  atypes[slot++] = GA_SIZE;
  atypes[slot++] = GA_BUFFER;
  assert(slot == nargs);

  /* Body prologue: thread coordinates, bounds check, pointer rebasing. */
  strb_appendf(&src,
               " const %s idx0 = LDIM_0 * GID_0 + LID_0;\n"
               " const %s numThreads0 = LDIM_0 * GDIM_0;\n"
               " const %s idx1 = LDIM_1 * GID_1 + LID_1;\n"
               " const %s numThreads1 = LDIM_1 * GDIM_1;\n"
               " %s i0, i1;\n",
               utype, utype, utype, utype, utype);
  strb_appends(&src, " if (idx0 >= n0 || idx1 >= n1) return;\n");
  strb_appendf(&src,
               " r = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)r) + r_off);\n"
               " ind = (GLOBAL_MEM %s *)(((GLOBAL_MEM char *)ind) + i_off);\n",
               gpuarray_get_type(a->typecode)->cluda_name,
               gpuarray_get_type(ind->typecode)->cluda_name);

  /* Outer loop over indices; negative indices wrap once, then are checked. */
  strb_appendf(&src,
               " for (i0 = idx0; i0 < n0; i0 += numThreads0) {\n"
               " %s ii0 = ind[i0];\n"
               " %s pos0 = v_off;\n"
               " if (ii0 < 0) ii0 += d0;\n"
               " if ((ii0 < 0) || (ii0 >= d0)) {\n"
               " *err = -1;\n"
               " continue;\n"
               " }\n"
               " pos0 += ii0 * (%s)s0;\n"
               " for (i1 = idx1; i1 < n1; i1 += numThreads1) {\n"
               " %s p = pos0;\n",
               stype, utype, utype, utype);

  /*
   * Decompose i1 into per-dimension positions, walking from the last
   * dimension of v down to dimension 1 (dimension 0 is handled by ii0).
   */
  if (v->nd > 1) {
    strb_appendf(&src, " %s pos, ii = i1;\n", utype);
    for (dim = v->nd - 1; dim >= 1; dim--) {
      if (dim > 1) {
        strb_appendf(&src,
                     " pos = ii %% (%s)d%u;\n"
                     " ii /= (%s)d%u;\n",
                     utype, dim, utype, dim);
      } else {
        strb_appends(&src, " pos = ii;\n");
      }
      strb_appendf(&src, " p += pos * (%s)s%u;\n", stype, dim);
    }
  }

  strb_appendf(&src,
               " r[i0*((%s)n1) + i1] = *((GLOBAL_MEM %s *)(((GLOBAL_MEM char *)v) + p));\n",
               utype, gpuarray_get_type(v->typecode)->cluda_name);
  strb_appends(&src,
               " }\n"
               " }\n"
               "}\n");

  if (strb_error(&src)) {
    res = GA_MEMORY_ERROR;
  } else {
    flags |= gpuarray_type_flags(a->typecode, v->typecode, GA_BYTE, -1);
    res = GpuKernel_init(k, ctx, 1, (const char **)&src.s, &src.l, "take1",
                         nargs, atypes, flags, err_str);
  }

  free(atypes);
  strb_clear(&src);
  return res;
}
/*
 * Allocate a zero-filled block of STRBOFFSET bytes, store it in *sbp and
 * then run strb_clear() on it.
 */
void
strb_create(struct strb **sbp)
{
	struct strb	*fresh;

	fresh = xcalloc(1, STRBOFFSET);
	*sbp = fresh;
	strb_clear(sbp);
}