int main(int argc, char **argv) { // // NN, Q, CC, SEED, MAX_PASSES // int c; opterr = 0; SEED = time(NULL); while ((c = getopt(argc,argv,"N:Q:C:S:P:T:")) != -1) { switch (c) { case 'N': NN = atoi(optarg); break; case 'Q': Q = atoi(optarg); break; case 'C': CC = atof(optarg); break; case 'S': SEED = atoi(optarg); break; case 'P': MAX_PASSES = atoi(optarg); break; case 'T': TL = atof(optarg); break; } } weights = new MyRandom(1,2009); qs = new MyRandom(-1 * Q, Q); prob = new MyRandom(0, 1 ); printf("NN = %d, Q = %d, CC = %f, SEED = %d, MAX_PASSES = %d, TL = %f\n",NN,Q,CC,SEED,MAX_PASSES,TL); genTest(); //doTheFlop(); return 0; }
// generate code for a captured BB void generate(Rewriter* r, CBB* cbb) { uint8_t* buf; uint64_t buf0; int used, i, usedTotal; if (cbb == 0) return; if (r->cs == 0) return; if (r->showEmuSteps) printf("Generating code for BB %s (%d instructions)\n", cbb_prettyName(cbb), cbb->count); usedTotal = 0; buf0 = (uint64_t) reserveCodeStorage(r->cs, 0); // remember start address for(i = 0; i < cbb->count; i++) { Instr* instr = cbb->instr + i; buf = reserveCodeStorage(r->cs, 15); if (instr->ptLen > 0) { used = genPassThrough(buf, instr); } else { switch(instr->type) { case IT_ADD: used = genAdd(buf, &(instr->src), &(instr->dst)); break; case IT_CLTQ: used = genCltq(buf, instr->vtype); break; case IT_CQTO: used = genCqto(buf, instr->vtype); break; case IT_CMP: used = genCmp(buf, &(instr->src), &(instr->dst)); break; case IT_DEC: used = genDec(buf, &(instr->dst)); break; case IT_IMUL: used = genIMul(buf, &(instr->src), &(instr->dst)); break; case IT_IDIV1: used = genIDiv1(buf, &(instr->dst)); break; case IT_INC: used = genInc(buf, &(instr->dst)); break; case IT_XOR: used = genXor(buf, &(instr->src), &(instr->dst)); break; case IT_OR: used = genOr(buf, &(instr->src), &(instr->dst)); break; case IT_AND: used = genAnd(buf, &(instr->src), &(instr->dst)); break; case IT_SHL: used = genShl(buf, &(instr->src), &(instr->dst)); break; case IT_SHR: used = genShr(buf, &(instr->src), &(instr->dst)); break; case IT_SAR: used = genSar(buf, &(instr->src), &(instr->dst)); break; case IT_LEA: used = genLea(buf, &(instr->src), &(instr->dst)); break; case IT_MOV: case IT_MOVSX: // converting move used = genMov(buf, &(instr->src), &(instr->dst)); break; case IT_CMOVO: case IT_CMOVNO: case IT_CMOVC: case IT_CMOVNC: case IT_CMOVZ: case IT_CMOVNZ: case IT_CMOVBE: case IT_CMOVA: case IT_CMOVS: case IT_CMOVNS: case IT_CMOVP: case IT_CMOVNP: case IT_CMOVL: case IT_CMOVGE: case IT_CMOVLE: case IT_CMOVG: used = genCMov(buf, instr->type, &(instr->src), &(instr->dst)); break; case IT_POP: used = genPop(buf, &(instr->dst)); break; case IT_PUSH: used = genPush(buf, &(instr->dst)); break; case IT_RET: used = genRet(buf); break; case IT_SUB: used = genSub(buf, &(instr->src), &(instr->dst)); break; case IT_TEST: used = genTest(buf, &(instr->src), &(instr->dst)); break; case IT_HINT_CALL: case IT_HINT_RET: used = 0; break; default: assert(0); } } assert(used < 15); instr->addr = (uint64_t) buf; instr->len = used; usedTotal += used; if (r->showEmuSteps) { printf(" I%2d : %-32s", i, instr2string(instr, 1, 0)); printf(" (%s)+%lx %s\n", cbb_prettyName(cbb), instr->addr - buf0, bytes2string(instr, 0, used)); } useCodeStorage(r->cs, used); } if (r->showEmuSteps) { if (instrIsJcc(cbb->endType)) { assert(cbb->nextBranch != 0); assert(cbb->nextFallThrough != 0); printf(" I%2d : %s (%s),", i, instrName(cbb->endType, 0), cbb_prettyName(cbb->nextBranch)); printf(" fall-through to (%s)\n", cbb_prettyName(cbb->nextFallThrough)); } } // add padding space after generated code for jump instruction buf = useCodeStorage(r->cs, 10); cbb->size = usedTotal; // start address of generated code. // if CBB had no instruction, this points to the padding buffer cbb->addr1 = (cbb->count == 0) ? ((uint64_t)buf) : cbb->instr[0].addr; }
int main(int argc, char *argv[]) { char out[1024*1024]; CLBLASKernExtra kextra; BlasGenSettings gset; TileMulOpts mulOpts; int i; cl_uint blockM = 4, blockN = 4, blockK = 8; struct KgenContext *ctx = createKgenContext(out, sizeof(out), 1); FType alpha; cl_int err; unsigned int iterNum = 1; const char* const shortOptions = "hd:f:l:t:a:b:s:g:i:c:ov"; const struct option longOptions[] = { {"help", no_argument, NULL, 'h'}, {"device", required_argument, NULL, 'd'}, {"fetch", required_argument, NULL, 'f'}, {"local", required_argument, NULL, 'l'}, {"type", required_argument, NULL, 't'}, {"a", required_argument, NULL, 'a'}, {"b", required_argument, NULL, 'b'}, {"skew", required_argument, NULL, 's'}, {"globalcycling", required_argument, NULL, 'g'}, {"iter", required_argument, NULL, 'i'}, {"core", required_argument, NULL, 'c'}, {"old", no_argument, NULL, 'o'}, {"verbose", no_argument, NULL, 'v'}, {NULL, 0, NULL, 0} }; int nextOption; cl_device_type deviceType = CL_DEVICE_TYPE_GPU; bool verbose = false; SubproblemDim *subdims = gset.subdims; bool separateFetch = false; memset(&gset, 0, sizeof(gset)); memset(&mulOpts, 0, sizeof(mulOpts)); memset(&kextra, 0, sizeof(kextra)); gset.kextra = &kextra; gset.flags |= BGF_WHOLE_A; mulOpts.core = TILEMUL_MAD; mulOpts.flags = TILEMUL_FORCE_VECTORIZATION; kextra.vecLen = 1; kextra.dtype = TYPE_FLOAT; alpha.f = 1; // parse command line do { nextOption = getopt_long(argc, argv, shortOptions, longOptions, NULL); switch (nextOption) { case 'h': printUsage(argv[0], EXIT_SUCCESS); break; case 'd': if (!strcmp("cpu", optarg)) { deviceType = CL_DEVICE_TYPE_CPU; } else if (!strcmp("gpu", optarg)) { deviceType = CL_DEVICE_TYPE_GPU; } else { printf("Unknown device type %s. Supported values are \"cpu\" " "and \"gpu\".\n", optarg); exit(EXIT_FAILURE); } break; case 'f': kextra.vecLen = atoi(optarg); break; case 'l': if (!strcmp(optarg, "A")) { mulOpts.memA = CLMEM_LOCAL_MEMORY; } else if (!strcmp(optarg, "B")) { mulOpts.memB = CLMEM_LOCAL_MEMORY; } else { printf("Wrong matrix specified: %s. Supported values are " "A, B.\n", optarg); exit(EXIT_FAILURE); } break; case 't': if (!strcmp(optarg, "s")) { kextra.dtype = TYPE_FLOAT; alpha.f = 1; } else if (!strcmp(optarg, "d")) { kextra.dtype = TYPE_DOUBLE; alpha.d = 1; } else if (!strcmp(optarg, "c")) { kextra.dtype = TYPE_COMPLEX_FLOAT; alpha.f2.s[0] = 1; alpha.f2.s[1] = 0; } else if (!strcmp(optarg, "z")) { kextra.dtype = TYPE_COMPLEX_DOUBLE; alpha.d2.s[0] = 1; alpha.d2.s[1] = 0; } else { printf("Wrong type specified: %s. Supported values are " "s, d, c, z.\n", optarg); exit(EXIT_FAILURE); } break; case 'a': if (!strcmp(optarg, "r")) { mulOpts.flags &= ~TILEMUL_TRA; } else if (!strcmp(optarg, "c")) { mulOpts.flags |= TILEMUL_TRA; } else { printf("Wrong tile a parameter specified: %s. Supported values " "are \"r\", \"c\".\n", optarg); exit(EXIT_FAILURE); } break; case 'b': if (!strcmp(optarg, "r")) { mulOpts.flags &= ~TILEMUL_TRB; } else if (!strcmp(optarg, "c")) { mulOpts.flags |= TILEMUL_TRB; } else { printf("Wrong tile b order specified: %s. Supported values " "are \"r\", \"c\".\n", optarg); exit(EXIT_FAILURE); } break; case 's': if (!strcmp(optarg, "a")) { mulOpts.flags |= TILEMUL_SKEW_A; } else if (!strcmp(optarg, "b")) { mulOpts.flags |= TILEMUL_SKEW_B; } else if (!strcmp(optarg, "k")) { mulOpts.flags |= TILEMUL_SKEW_K; } else { printf("Wrong skew parameter specified: %s. Supported values " "are \"a\", \"b\", \"k\"\n", optarg); exit(EXIT_FAILURE); } break; case 'g': if (!strcmp(optarg, "a")) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_A; } else if (!strcmp(optarg, "b")) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_B; } else if (!strcmp(optarg, "k")) { mulOpts.flags |= TILEMUL_GLOBAL_CYCLIC_K; } else { printf("Wrong global cycling parameter specified: %s. " "Supported values are \"a\", \"b\", \"k\"\n", optarg); exit(EXIT_FAILURE); } break; case 'i': iterNum = atoi(optarg); break; case 'c': if (!strcmp("muladd", optarg)) { mulOpts.core = TILEMUL_MULADD; } else if (!strcmp("mad", optarg)) { mulOpts.core = TILEMUL_MAD; } else if (!strcmp("dot", optarg)) { mulOpts.core = TILEMUL_DOT; } else { printf("Unknown multiplier core %s. Supported values" " are \"muladd\", \"mad\" and \"dot\".\n", optarg); exit(EXIT_FAILURE); } break; case 'o': separateFetch = false; break; case 'v': verbose = true; break; case -1: break; default: printUsage(argv[0], EXIT_FAILURE); break; } } while (nextOption != -1); if (optind + 2 >= argc) { printf("Error: Not all sizes are specified\n"); printUsage(argv[0], EXIT_FAILURE); } blockM = atoi(argv[optind]); blockN = atoi(argv[optind + 1]); blockK = atoi(argv[optind + 2]); if ((mulOpts.memA == CLMEM_LOCAL_MEMORY || mulOpts.memB == CLMEM_LOCAL_MEMORY) && ((mulOpts.flags & TILEMUL_GLOBAL_CYCLIC) != 0)) { printf("One of matrixes is in local memory, " "disabling global cycling\n"); mulOpts.flags &= ~TILEMUL_GLOBAL_CYCLIC; } if (mulOpts.flags & TILEMUL_TRA) { kextra.flags |= KEXTRA_TRANS_A; } if (mulOpts.flags & TILEMUL_TRB) { kextra.flags |= KEXTRA_TRANS_B; } subdims[0].y = blockM * ITEM_WORK_M; subdims[0].x = blockN * ITEM_WORK_N; subdims[0].bwidth = blockK * ITEM_BLOCKS_K; subdims[1].y = blockM; subdims[1].x = blockN; subdims[1].bwidth = blockK; memset(out, 0, sizeof(out)); i = isDoubleBasedType(kextra.dtype); kgenDeclareUptrs(ctx, i); genTest(ctx, &gset, &mulOpts, separateFetch); destroyKgenContext(ctx); printf("Kernel code: \n\"%s\"\n", out); err = run(out, subdims[0].y, subdims[0].x, subdims[0].bwidth, alpha, &gset, mulOpts.flags, deviceType, verbose, iterNum); if (err != CL_SUCCESS) { printf("Test run failed, error %d\n", err); return EXIT_FAILURE; } return EXIT_SUCCESS; }