void readDataFromLMem(uint64_t *dataOut, int size, int sizeBytes, int burstLengthInBytes, max_engine_t *engine, max_file_t *maxfile) { max_actions_t *actions = max_actions_init(maxfile, NULL); max_set_ticks(actions, "KernelLMem_Write_CommandAndDataStream", 0); max_set_uint64t(actions, "KernelLMem_Write_CommandAndDataStream", "totalBursts", size * 8 / burstLengthInBytes); max_set_uint64t(actions, "KernelLMem_Write_CommandAndDataStream", "wordsPerBurst", burstLengthInBytes / 8); max_set_ticks(actions, "KernelLMem_Read_CommandAndDataStream", size); max_set_uint64t(actions, "KernelLMem_Read_CommandAndDataStream", "totalBursts", size * 8 / burstLengthInBytes); max_set_uint64t(actions, "KernelLMem_Read_CommandAndDataStream", "wordsPerBurst", burstLengthInBytes / 8); max_run(engine, actions); max_reset_engine(engine); max_queue_output(actions, "toCpu", dataOut, sizeBytes); max_run(engine, actions); max_actions_free(actions); }
void AirfoilDFEInterface::runOutputAction () { max_actions_t * act; act = max_actions_init(maxfile, NULL); max_queue_output(act, "qCPUOut", dfeQ, qDatSize); max_lmem_linear(act, "qRead", memAddresses[q], qDatSize); max_ignore_lmem(act, "setupWrite"); max_ignore_lmem(act, "updateQ"); max_ignore_lmem(act, "updateQold"); max_ignore_lmem(act, "updateSaveQold"); max_ignore_lmem(act, "adtQ"); max_ignore_lmem(act, "adtDxRead"); max_ignore_lmem(act, "resReadOnly"); max_ignore_kernel(act, "AirfoilDFEResKernel"); max_ignore_kernel(act, "AirfoilDFEAdtKernel"); max_ignore_kernel(act, "AirfoilDFEUpdateKernel"); max_run(engine, act); max_actions_free(act); int thispart = 1; int thisind = 0; for (int i = 0; i < (*domain).ncellcomputedfe; i++){ int cellpart = readlocs[2*i]; int cellind = readlocs[2*i+1]; for (int j = 0; j < 4; j ++) (*domain).q[cellpart][cellind*4+j] = dfeQ[i*4+j]; thisind++; if (thisind == (*domain).ncell[thispart]){ thispart++; thisind = 0; } } }
int main(void) { const int no = 50; const int k = 2; const int row = 24; const int col = 24; const int batch_size = 384; int z2_offset = 0; int z2_size = no*row*col*batch_size*sizeof(real); int sel_offset = z2_offset+z2_size; int sel_size = no*row*col*batch_size/8; int z_offset = sel_offset+sel_size; int z_size = no*row*col/k/k*batch_size*sizeof(real); int a_offset = z_offset+z_size; int a_size = no*row*col/k/k*batch_size*sizeof(real); real* z2 = (real*)malloc(z2_size); uchar* sel = (uchar*)malloc(sel_size); real* z = (real*)malloc(z_size); real* a = (real*)malloc(a_size); max_file_t *maxfile = CNN_FW_MaxPool_V0_DP_L0_0_init(); max_engine_t *engine = max_load(maxfile, "*"); printf("Writing to LMem.\n"); max_actions_t* act = max_actions_init(maxfile, "writeLMem"); max_set_param_uint64t(act, "offset", z2_offset); max_set_param_uint64t(act, "size", z2_size); max_queue_input(act, "cpu_to_lmem_at_cpu", z2, z2_size); max_run(engine, act); printf("Running on DFE.\n"); act = max_actions_init(maxfile, "default"); max_set_param_uint64t(act, "no", no); max_set_param_uint64t(act, "z2_offset", z2_offset); max_set_param_uint64t(act, "sel_offset", sel_offset); max_set_param_uint64t(act, "z_offset", z_offset); max_set_param_uint64t(act, "a_offset", a_offset); max_run(engine, act); printf("Reading from LMemBytes.\n"); act = max_actions_init(maxfile, "readLMemBytes"); max_set_param_uint64t(act, "offset", sel_offset); max_set_param_uint64t(act, "size", sel_size); max_queue_output(act, "lmem_to_cpu_at_cpu", sel, sel_size); max_run(engine, act); printf("Reading from LMem.\n"); act = max_actions_init(maxfile, "readLMem"); max_set_param_uint64t(act, "offset", z_offset); max_set_param_uint64t(act, "size", z_size); max_queue_output(act, "lmem_to_cpu_at_cpu", z, z_size); max_run(engine, act); printf("Reading from LMem.\n"); act = max_actions_init(maxfile, "readLMem"); max_set_param_uint64t(act, "offset", a_offset); max_set_param_uint64t(act, "size", a_size); max_queue_output(act, 
"lmem_to_cpu_at_cpu", a, a_size); max_run(engine, act); max_unload(engine); printf("Done.\n"); free(z2); free(sel); free(z); free(a); return 0; }
/** * Runs the main action to compute a predictor or corrector step */ void AirfoilDFEInterface::runMainAction(int k, double cfl, double gam, double gm1, double eps, double *rms) { int cpuresind = 0; int schedind = 0; for (int d = 1; d < (*domain).ndomain; d++){ for (int res_edge_iter = 0; res_edge_iter < (*domain).nedge[d]; res_edge_iter++){ int thispart = d; int thisind = res_edge_iter; for (int i = 0; i < 2; i++){ int thiscellpart = (*domain).ecellpart[thispart][thisind*2+i]; int thiscellind = (*domain).ecellind[thispart][thisind*2+i]; if (reads[7*schedind+3] == 1){ for (int j = 0; j < 4; j++) { cpu_res_qpadt[cpuresind*5+j] = (*domain).q[thiscellpart][4*thiscellind+j]; } cpu_res_qpadt[cpuresind*5+4] = (*domain).adt[thiscellpart][thiscellind]; cpuresind ++; } schedind ++ ; } } } max_actions_t * act = max_actions_init(maxfile, NULL); max_set_ticks(act, "AirfoilDFEAdtKernel", (*domain).ncellcomputedfe); max_set_uint64t(act, "AirfoilDFEAdtKernel", "numTicks", (*domain).ncellcomputedfe); max_set_double(act, "AirfoilDFEAdtKernel", "cfl", cfl); max_set_double(act, "AirfoilDFEAdtKernel", "gam", gam); max_set_double(act, "AirfoilDFEAdtKernel", "gm1", gm1); max_lmem_linear(act, "adtQ", memAddresses[q], memAddresses[q+1] - memAddresses[q]); max_lmem_linear(act, "adtDxRead", memAddresses[adtDx], adtDxDatSize); max_set_ticks(act, "AirfoilDFEResKernel", resFlushTicks); max_set_double(act, "AirfoilDFEResKernel", "gm1", gm1); max_set_double(act, "AirfoilDFEResKernel", "eps", eps); max_set_uint64t(act, "AirfoilDFEResKernel", "nTicks", resFlushTicks); max_queue_input(act, "cpu_qpadt_to_res", cpu_res_qpadt, cpuQpadtSize); max_lmem_linear(act, "resReadOnly", memAddresses[resReadOnly], resReadOnlyDatSize); max_queue_output(act,"cpu_res_from_res", dfe_res_res, passtorescount*sizeof(double)*4); double * rmsOut = (double *) malloc(16*sizeof(double)); max_set_ticks(act,"AirfoilDFEUpdateKernel", (*domain).ncellcomputedfe); max_set_uint64t(act, "AirfoilDFEUpdateKernel", "numCells", 
(*domain).ncellcomputedfe); max_set_uint64t(act, "AirfoilDFEUpdateKernel", "doSaveQold", k==1); max_lmem_linear(act, "updateQ", memAddresses[q], memAddresses[q+1] - memAddresses[q]); max_lmem_linear(act, "updateQold", memAddresses[qold], memAddresses[qold+1] - memAddresses[qold]); max_queue_output(act,"rmsOut", rmsOut, 16*sizeof(double)); if (k == 0) { max_ignore_lmem(act, "updateSaveQold"); } else { max_lmem_linear(act, "updateSaveQold", memAddresses[qold], memAddresses[qold+1] - memAddresses[qold]); } max_ignore_lmem(act, "setupWrite"); max_ignore_lmem(act, "qRead"); max_run(engine, act); max_actions_free(act); for (int i = 0; i < 16; i++) (*rms) += rmsOut[i]; cpuresind = 0; schedind = 0; for (int d = 1; d < (*domain).ndomain; d++){ for (int res_edge_iter = 0; res_edge_iter < (*domain).nedge[d]; res_edge_iter++){ int thispart = d; int thisind = res_edge_iter; for (int i = 0; i < 2; i++){ int thiscellpart = (*domain).ecellpart[thispart][thisind*2+i]; int thiscellind = (*domain).ecellind[thispart][thisind*2+i]; if (reads[7*schedind+3] == 1){ for (int j = 0; j < 4; j++) { (*domain).res[thiscellpart][4*thiscellind+j] += dfe_res_res[cpuresind*4+j]; } cpuresind ++; } schedind ++ ; } } } }
/*
 * Host test for the 'read_fifo' stream.
 *
 * Loads the maxfile, sends one config word (wordCount = MAX_DEPTH,
 * base = configBase) over the "configWord" low-latency stream, waits for a
 * key press, then streams MAX_DEPTH entries from "read_fifo" and checks them:
 * the first 64-bit word of entry i must equal configBase + i and every
 * remaining 64-bit word of the entry must be zero.
 *
 * Returns 0 on success, 1 if the comparison failed, -1 on setup errors
 * (a failed device open terminates via exit(-1)).
 */
int main(int argc, char *argv[]) {
    (void) argc;
    (void) argv;

    max_file_t *maxfile = INIT_NAME();
    if(!maxfile) {
        printf("Failed to init MAX file\n");
        return -1;
    }

    max_config_set_bool(MAX_CONFIG_PRINTF_TO_STDOUT, true);

    const char *device_name = "*";
    printf("Opening device: %s\n", device_name);
    max_engine_t *engine = max_load(maxfile, device_name);
    if(!engine) {
        printf("Failed to open Max device\n");
        exit(-1);
    }

    max_reset_engine(engine);

    /*
     * SLiC only produces debug output once an action has been run, so run an
     * empty action up front.
     */
    max_actions_t *action = max_actions_init(maxfile, NULL);
    max_run(engine, action);
    max_actions_free(action);

    srand(time(NULL));

    /* Host buffers: both allocations are now checked before use. */
    single_entry_t *outputData = (single_entry_t *)calloc(MAX_DEPTH, sizeof(single_entry_t));
    if (!outputData) {
        printf("Failed to allocate output buffer\n");
        return -1;
    }

    void *configWordBuffer = NULL;
    if (posix_memalign(&configWordBuffer, 4096, 512 * sizeof(configWord_t)) != 0) {
        printf("Failed to allocate config word buffer\n");
        free(outputData);
        return -1;
    }
    max_llstream_t *configWordStream = max_llstream_setup(engine, "configWord", 512, sizeof(configWord_t), configWordBuffer);

    uint64_t configBase = 0;

    printf("Sending config word...\n");
    void *configWordSlot;
    /* Poll until one slot of the low-latency stream becomes available. */
    while (max_llstream_write_acquire(configWordStream, 1, &configWordSlot) != 1) usleep(10);
    configWord_t *configWord = (configWord_t *)configWordSlot;
    configWord->wordCount = MAX_DEPTH;
    configWord->base = configBase;
    max_llstream_write(configWordStream, 1);

    /* Wait for the operator before starting the read-back. */
    getchar();

    printf("Streaming 'read_fifo'...\n");
    fflush(stdout);
    action = max_actions_init(maxfile, NULL);
    max_queue_output(action, "read_fifo", outputData, sizeof(single_entry_t) * MAX_DEPTH);
    max_disable_reset(action);
    max_disable_validation(action);
    max_enable_partial_memory(action);
    max_run(engine, action);
    max_actions_free(action);

    printf("Comparing...\n");
    fflush(stdout);

    const size_t quadsPerEntry = sizeof(single_entry_t) / sizeof(uint64_t);
    uint8_t fail = 0;
    for (size_t entryIx = 0; entryIx < MAX_DEPTH; entryIx++) {
        uint64_t *output = (uint64_t *)outputData[entryIx].data;
        uint64_t expected = (configBase + entryIx);

        /* Format specifiers now match the argument types: %zu for size_t,
         * %llx with an explicit unsigned long long cast for uint64_t. */
        if (expected != output[0]) {
            fail = 1;
            printf("[Entry: %zu, Quad: %zu] Mismatch: input 0x%llx, output 0x%llx\n",
                   entryIx, (size_t)0,
                   (unsigned long long)expected, (unsigned long long)output[0]);
        }

        /* Remaining quads are only inspected while no failure has been seen. */
        for (size_t q = 1; !fail && q < quadsPerEntry; q++) {
            if (0 != output[q]) {
                fail = 1;
                printf("[Entry: %zu, Quad: %zu] Mismatch: input 0x%llx, output 0x%llx\n",
                       entryIx, q,
                       0ULL, (unsigned long long)output[q]);
            }
        }
    }

    printf("%s\n", fail ? "FAILED!" : "Success");

    /* NOTE(review): configWordBuffer was handed to max_llstream_setup and is
     * presumably still registered with the engine, so it is left for process
     * exit rather than freed while the stream exists. */
    free(outputData);
    return fail;
}
void my_process(int data_x_offset,const cateType* data_y,int mb_idx,real learning_rate){ { int t = K_fw_l0_conv; load_engine(t); printf("Running on DFE: fw_l0_conv"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "ni", 1); max_set_param_uint64t(act, "no", NKERS[0]); max_queue_input(act, "b", layer0_b, layer0_b_size); max_queue_input(act, "w", layer0_w, layer0_w_size); max_set_param_uint64t(act, "x_offset", data_x_offset+mb_idx*layer0_x_size); max_set_param_uint64t(act, "z_offset", layer0_z2_offset); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_fw_l0_maxpool; load_engine(t); printf("Running on DFE: fw_l0_maxpool"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "no", NKERS[0]); max_set_param_uint64t(act, "z2_offset", layer0_z2_offset); max_set_param_uint64t(act, "sel_offset", layer0_sel_offset); max_set_param_uint64t(act, "z_offset", layer0_z_offset); max_set_param_uint64t(act, "a_offset", layer0_a_offset); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_fw_l1_conv; load_engine(t); printf("Running on DFE: fw_l1_conv"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "ni", NKERS[0]); max_set_param_uint64t(act, "no", NKERS[1]); max_queue_input(act, "b", layer1_b, layer1_b_size); max_queue_input(act, "w", layer1_w, layer1_w_size); max_set_param_uint64t(act, "x_offset", layer1_x_offset); max_set_param_uint64t(act, "z_offset", layer1_z2_offset); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_fw_l1_maxpool; load_engine(t); printf("Running on DFE: fw_l1_maxpool"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "no", NKERS[1]); max_set_param_uint64t(act, "z2_offset", layer1_z2_offset); 
max_set_param_uint64t(act, "sel_offset", layer1_sel_offset); max_set_param_uint64t(act, "z_offset", layer1_z_offset); max_set_param_uint64t(act, "a_offset", layer1_a_offset); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_fw_l3_softmax; load_engine(t); printf("Running on DFE: fw_l3_softmax"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "ni", NKERS[2]); max_set_param_uint64t(act, "x_offset", layer3_x_offset); max_queue_input(act, "w", layer3_w, layer3_w_size); max_queue_input(act, "b", layer3_b, layer3_b_size); max_set_param_uint64t(act, "softmax_offset", layer3_sm_offset); max_queue_output(act, "pred", layer3_pred, layer3_pred_size); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } //TODO: learning rate<0 exit { int t = K_bp_l3_softmax; load_engine(t); printf("Running on DFE: bp_l3_softmax"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "ni", NKERS[2]); max_set_param_uint64t(act, "x_offset", layer3_x_offset); max_queue_input(act, "w", layer3_w, layer3_w_size); max_set_param_uint64t(act, "softmax_offset", layer3_sm_offset); max_queue_input(act, "std", data_y+mb_idx*layer3_pred_size, layer3_pred_size); max_queue_output(act, "w_grad", layer3_w_grad, layer3_w_grad_size); max_queue_output(act, "b_grad", layer3_b_grad, layer3_b_grad_size); max_set_param_uint64t(act, "x_grad_offset", layer3_x_grad_offset); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_bp_l1_maxpool; load_engine(t); printf("Running on DFE: bp_l1_maxpool"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "no", NKERS[1]); max_set_param_uint64t(act, "a_grad_offset", layer1_a_grad_offset); max_set_param_uint64t(act, "z_offset", layer1_z_offset); max_set_param_uint64t(act, "sel_offset", layer1_sel_offset); 
max_set_param_uint64t(act, "z2_grad_offset", layer1_z2_grad_offset); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_bp_l1_conv; load_engine(t); printf("Running on DFE: bp_l1_conv"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "ni", NKERS[0]); max_set_param_uint64t(act, "no", NKERS[1]); max_set_param_uint64t(act, "z_grad_offset", layer1_z2_grad_offset); max_set_param_uint64t(act, "x_offset", layer1_x_offset); max_set_param_uint64t(act, "x_grad_offset", layer1_x_grad_offset); max_queue_input(act, "w", layer1_w, layer1_w_size); max_queue_output(act, "w_grad", layer1_w_grad, layer1_w_grad_size); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_bp_l0_maxpool; load_engine(t); printf("Running on DFE: bp_l0_maxpool"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "no", NKERS[0]); max_set_param_uint64t(act, "a_grad_offset", layer0_a_grad_offset); max_set_param_uint64t(act, "z_offset", layer0_z_offset); max_set_param_uint64t(act, "sel_offset", layer0_sel_offset); max_set_param_uint64t(act, "z2_grad_offset", layer0_z2_grad_offset); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } { int t = K_bp_l0_conv; load_engine(t); printf("Running on DFE: bp_l0_conv"); mark_timer(false,1); max_actions_t* act = max_actions_init(max_files[t], "default"); max_set_param_uint64t(act, "ni", 1); max_set_param_uint64t(act, "no", NKERS[0]); max_set_param_uint64t(act, "z_grad_offset", layer0_z2_grad_offset); max_set_param_uint64t(act, "x_offset", data_x_offset+mb_idx*layer0_x_size); max_set_param_uint64t(act, "x_grad_offset", layer0_x_grad_offset); max_queue_input(act, "w", layer0_w, layer0_w_size); max_queue_output(act, "w_grad", layer0_w_grad, layer0_w_grad_size); max_run(max_engines[t], act); max_actions_free(act); mark_timer(true,1); } }