void load_maxfiles(){ fprintf(stdout,"Init maxfiles\n"); max_files[K_fw_l0_conv] = CNN_FW_Conv_V0_DP_L0_0_init(); max_files[K_fw_l0_maxpool] = CNN_FW_MaxPool_V0_DP_L0_0_init(); max_files[K_fw_l1_conv] = CNN_FW_Conv_V0_DP_L1_0_init(); max_files[K_fw_l1_maxpool] = CNN_FW_MaxPool_V0_DP_L1_0_init(); max_files[K_fw_l2_mlp] = NULL; max_files[K_fw_l3_softmax] = CNN_FW_Softmax_V0_DP_L3_0_init(); max_files[K_bp_l3_softmax] = CNN_BP_Softmax_V0_DP_L3_0_init(); max_files[K_bp_l2_mlp] = NULL; max_files[K_bp_l1_maxpool] = CNN_BP_MaxPool_V0_DP_L1_0_init(); max_files[K_bp_l1_conv] = CNN_BP_Conv_V0_DP_L1_0_init(); max_files[K_bp_l0_maxpool] = CNN_BP_MaxPool_V0_DP_L0_0_init(); max_files[K_bp_l0_conv] = CNN_BP_Conv_V0_DP_L0_0_init(); for (int i=0;i<K_TOTAL;++i){ max_engines[i] = NULL; } cur_engine = -1; int t = K_fw_l0_conv; load_engine(t); { fprintf(stdout,"Writing to LMem : train_set_x\n"); max_actions_t* act; act = max_actions_init(max_files[t], "writeLMem"); max_set_param_uint64t(act, "offset", train_set_x_offset); max_set_param_uint64t(act, "size", train_set_x_size); max_queue_input(act, "cpu_to_lmem_at_cpu", train_set_x, train_set_x_size); max_run(max_engines[t], act); max_actions_free(act); } { fprintf(stdout,"Writing to LMem : valid_set_x\n"); max_actions_t* act; act = max_actions_init(max_files[t], "writeLMem"); max_set_param_uint64t(act, "offset", valid_set_x_offset); max_set_param_uint64t(act, "size", valid_set_x_size); max_queue_input(act, "cpu_to_lmem_at_cpu", valid_set_x, valid_set_x_size); max_run(max_engines[t], act); max_actions_free(act); } { fprintf(stdout,"Writing to LMem : test_set_x\n"); max_actions_t* act; act = max_actions_init(max_files[t], "writeLMem"); max_set_param_uint64t(act, "offset", test_set_x_offset); max_set_param_uint64t(act, "size", test_set_x_size); max_queue_input(act, "cpu_to_lmem_at_cpu", test_set_x, test_set_x_size); max_run(max_engines[t], act); max_actions_free(act); } }
/*
 * Configures the LMem write/read command kernels and streams a host buffer
 * to the DFE on the "fromCpu" stream.
 *
 * dataIn              host buffer queued on "fromCpu"
 * size                tick count for KernelLMem_Write_CommandAndDataStream;
 *                     also used to derive totalBursts = size * 8 / burstLengthInBytes
 * sizeBytes           length passed to max_queue_input for dataIn
 * burstLengthInBytes  divisor for totalBursts and source of wordsPerBurst
 *                     (burstLengthInBytes / 8); must be positive
 * engine, maxfile     loaded engine and the maxfile it was loaded from
 *
 * Invalid arguments (NULL engine/maxfile, negative sizes, non-positive burst
 * length) are reported on stderr and the function returns without touching
 * the device; previously a zero burst length divided by zero.
 */
void writeDataToLMem(uint64_t *dataIn, int size, int sizeBytes, int burstLengthInBytes, max_engine_t *engine, max_file_t *maxfile)
{
    printf("size=%d, sizeBytes=%d, burstLengthInBytes=%d\n", size, sizeBytes, burstLengthInBytes);

    if (engine == NULL || maxfile == NULL) {
        fprintf(stderr, "writeDataToLMem: engine and maxfile must not be NULL\n");
        return;
    }
    if (size < 0 || sizeBytes < 0 || burstLengthInBytes <= 0) {
        fprintf(stderr, "writeDataToLMem: size and sizeBytes must be >= 0 and burstLengthInBytes > 0\n");
        return;
    }

    /* Computed in 64 bits so that size * 8 cannot overflow int. */
    const uint64_t totalBursts = (uint64_t)size * 8 / (uint64_t)burstLengthInBytes;
    const uint64_t wordsPerBurst = (uint64_t)burstLengthInBytes / 8;

    printf("Performing max_actions_init()\n");
    max_actions_t *actions = max_actions_init(maxfile, NULL);
    printf("Done\n");

    max_set_ticks(actions, "KernelLMem_Write_CommandAndDataStream", size);
    max_set_uint64t(actions, "KernelLMem_Write_CommandAndDataStream", "totalBursts", totalBursts);
    max_set_uint64t(actions, "KernelLMem_Write_CommandAndDataStream", "wordsPerBurst", wordsPerBurst);

    /* The read kernel gets zero ticks but still receives its scalar inputs. */
    max_set_ticks(actions, "KernelLMem_Read_CommandAndDataStream", 0);
    max_set_uint64t(actions, "KernelLMem_Read_CommandAndDataStream", "totalBursts", totalBursts);
    max_set_uint64t(actions, "KernelLMem_Read_CommandAndDataStream", "wordsPerBurst", wordsPerBurst);

    /*
     * NOTE(review): the actions are run once before "fromCpu" is queued, the
     * engine is reset, and the same actions are then run a second time.
     * This ordering is preserved from the original; confirm it is intended
     * and not leftover debugging.
     */
    max_run(engine, actions);
    max_reset_engine(engine);

    max_queue_input(actions, "fromCpu", dataIn, sizeBytes);
    max_lmem_set_interrupt_on(actions, "toLmem");

    printf("Performing max_run()\n");
    max_run(engine, actions);
    printf("Done\n");

    max_actions_free(actions);
}
int main(void) { const int no = 50; const int k = 2; const int row = 24; const int col = 24; const int batch_size = 384; int z2_offset = 0; int z2_size = no*row*col*batch_size*sizeof(real); int sel_offset = z2_offset+z2_size; int sel_size = no*row*col*batch_size/8; int z_offset = sel_offset+sel_size; int z_size = no*row*col/k/k*batch_size*sizeof(real); int a_offset = z_offset+z_size; int a_size = no*row*col/k/k*batch_size*sizeof(real); real* z2 = (real*)malloc(z2_size); uchar* sel = (uchar*)malloc(sel_size); real* z = (real*)malloc(z_size); real* a = (real*)malloc(a_size); max_file_t *maxfile = CNN_FW_MaxPool_V0_DP_L0_0_init(); max_engine_t *engine = max_load(maxfile, "*"); printf("Writing to LMem.\n"); max_actions_t* act = max_actions_init(maxfile, "writeLMem"); max_set_param_uint64t(act, "offset", z2_offset); max_set_param_uint64t(act, "size", z2_size); max_queue_input(act, "cpu_to_lmem_at_cpu", z2, z2_size); max_run(engine, act); printf("Running on DFE.\n"); act = max_actions_init(maxfile, "default"); max_set_param_uint64t(act, "no", no); max_set_param_uint64t(act, "z2_offset", z2_offset); max_set_param_uint64t(act, "sel_offset", sel_offset); max_set_param_uint64t(act, "z_offset", z_offset); max_set_param_uint64t(act, "a_offset", a_offset); max_run(engine, act); printf("Reading from LMemBytes.\n"); act = max_actions_init(maxfile, "readLMemBytes"); max_set_param_uint64t(act, "offset", sel_offset); max_set_param_uint64t(act, "size", sel_size); max_queue_output(act, "lmem_to_cpu_at_cpu", sel, sel_size); max_run(engine, act); printf("Reading from LMem.\n"); act = max_actions_init(maxfile, "readLMem"); max_set_param_uint64t(act, "offset", z_offset); max_set_param_uint64t(act, "size", z_size); max_queue_output(act, "lmem_to_cpu_at_cpu", z, z_size); max_run(engine, act); printf("Reading from LMem.\n"); act = max_actions_init(maxfile, "readLMem"); max_set_param_uint64t(act, "offset", a_offset); max_set_param_uint64t(act, "size", a_size); max_queue_output(act, 
"lmem_to_cpu_at_cpu", a, a_size); max_run(engine, act); max_unload(engine); printf("Done.\n"); free(z2); free(sel); free(z); free(a); return 0; }
/** * Runs the main action to compute a predictor or corrector step */ void AirfoilDFEInterface::runMainAction(int k, double cfl, double gam, double gm1, double eps, double *rms) { int cpuresind = 0; int schedind = 0; for (int d = 1; d < (*domain).ndomain; d++){ for (int res_edge_iter = 0; res_edge_iter < (*domain).nedge[d]; res_edge_iter++){ int thispart = d; int thisind = res_edge_iter; for (int i = 0; i < 2; i++){ int thiscellpart = (*domain).ecellpart[thispart][thisind*2+i]; int thiscellind = (*domain).ecellind[thispart][thisind*2+i]; if (reads[7*schedind+3] == 1){ for (int j = 0; j < 4; j++) { cpu_res_qpadt[cpuresind*5+j] = (*domain).q[thiscellpart][4*thiscellind+j]; } cpu_res_qpadt[cpuresind*5+4] = (*domain).adt[thiscellpart][thiscellind]; cpuresind ++; } schedind ++ ; } } } max_actions_t * act = max_actions_init(maxfile, NULL); max_set_ticks(act, "AirfoilDFEAdtKernel", (*domain).ncellcomputedfe); max_set_uint64t(act, "AirfoilDFEAdtKernel", "numTicks", (*domain).ncellcomputedfe); max_set_double(act, "AirfoilDFEAdtKernel", "cfl", cfl); max_set_double(act, "AirfoilDFEAdtKernel", "gam", gam); max_set_double(act, "AirfoilDFEAdtKernel", "gm1", gm1); max_lmem_linear(act, "adtQ", memAddresses[q], memAddresses[q+1] - memAddresses[q]); max_lmem_linear(act, "adtDxRead", memAddresses[adtDx], adtDxDatSize); max_set_ticks(act, "AirfoilDFEResKernel", resFlushTicks); max_set_double(act, "AirfoilDFEResKernel", "gm1", gm1); max_set_double(act, "AirfoilDFEResKernel", "eps", eps); max_set_uint64t(act, "AirfoilDFEResKernel", "nTicks", resFlushTicks); max_queue_input(act, "cpu_qpadt_to_res", cpu_res_qpadt, cpuQpadtSize); max_lmem_linear(act, "resReadOnly", memAddresses[resReadOnly], resReadOnlyDatSize); max_queue_output(act,"cpu_res_from_res", dfe_res_res, passtorescount*sizeof(double)*4); double * rmsOut = (double *) malloc(16*sizeof(double)); max_set_ticks(act,"AirfoilDFEUpdateKernel", (*domain).ncellcomputedfe); max_set_uint64t(act, "AirfoilDFEUpdateKernel", "numCells", 
(*domain).ncellcomputedfe); max_set_uint64t(act, "AirfoilDFEUpdateKernel", "doSaveQold", k==1); max_lmem_linear(act, "updateQ", memAddresses[q], memAddresses[q+1] - memAddresses[q]); max_lmem_linear(act, "updateQold", memAddresses[qold], memAddresses[qold+1] - memAddresses[qold]); max_queue_output(act,"rmsOut", rmsOut, 16*sizeof(double)); if (k == 0) { max_ignore_lmem(act, "updateSaveQold"); } else { max_lmem_linear(act, "updateSaveQold", memAddresses[qold], memAddresses[qold+1] - memAddresses[qold]); } max_ignore_lmem(act, "setupWrite"); max_ignore_lmem(act, "qRead"); max_run(engine, act); max_actions_free(act); for (int i = 0; i < 16; i++) (*rms) += rmsOut[i]; cpuresind = 0; schedind = 0; for (int d = 1; d < (*domain).ndomain; d++){ for (int res_edge_iter = 0; res_edge_iter < (*domain).nedge[d]; res_edge_iter++){ int thispart = d; int thisind = res_edge_iter; for (int i = 0; i < 2; i++){ int thiscellpart = (*domain).ecellpart[thispart][thisind*2+i]; int thiscellind = (*domain).ecellind[thispart][thisind*2+i]; if (reads[7*schedind+3] == 1){ for (int j = 0; j < 4; j++) { (*domain).res[thiscellpart][4*thiscellind+j] += dfe_res_res[cpuresind*4+j]; } cpuresind ++; } schedind ++ ; } } } }
/** * The initial action to set up arrays in lmem for main compute */ void AirfoilDFEInterface::runSetupAction () { max_actions_t * act; act = max_actions_init(maxfile, NULL); max_queue_input(act, "setupCPU", dfeAdtDX, adtDxDatSize); max_lmem_linear(act, "setupWrite", memAddresses[adtDx], adtDxDatSize); max_ignore_lmem(act, "adtDxRead"); max_ignore_lmem(act, "resReadOnly"); max_ignore_lmem(act, "qRead"); max_ignore_lmem(act, "updateQ"); max_ignore_lmem(act, "updateQold"); max_ignore_lmem(act, "adtQ"); max_ignore_lmem(act, "updateSaveQold"); max_ignore_kernel(act, "AirfoilDFEResKernel"); max_ignore_kernel(act, "AirfoilDFEAdtKernel"); max_ignore_kernel(act, "AirfoilDFEUpdateKernel"); max_run(engine, act); max_actions_free(act); act = max_actions_init(maxfile, NULL); max_queue_input(act, "setupCPU", dfeQ, qDatSize); max_lmem_linear(act, "setupWrite", memAddresses[q], qDatSize); max_ignore_lmem(act, "adtDxRead"); max_ignore_lmem(act, "resReadOnly"); max_ignore_lmem(act, "qRead"); max_ignore_lmem(act, "updateQ"); max_ignore_lmem(act, "updateQold"); max_ignore_lmem(act, "updateSaveQold"); max_ignore_lmem(act, "adtQ"); max_ignore_kernel(act, "AirfoilDFEResKernel"); max_ignore_kernel(act, "AirfoilDFEAdtKernel"); max_ignore_kernel(act, "AirfoilDFEUpdateKernel"); max_run(engine, act); max_actions_free(act); act = max_actions_init(maxfile, NULL); max_queue_input(act, "setupCPU", dfeQ, qDatSize); max_lmem_linear(act, "setupWrite", memAddresses[qold], qDatSize); max_ignore_lmem(act, "adtDxRead"); max_ignore_lmem(act, "resReadOnly"); max_ignore_lmem(act, "qRead"); max_ignore_lmem(act, "updateQ"); max_ignore_lmem(act, "updateQold"); max_ignore_lmem(act, "adtQ"); max_ignore_lmem(act, "updateSaveQold"); max_ignore_kernel(act, "AirfoilDFEResKernel"); max_ignore_kernel(act, "AirfoilDFEAdtKernel"); max_ignore_kernel(act, "AirfoilDFEUpdateKernel"); max_run(engine, act); max_actions_free(act); act = max_actions_init(maxfile, NULL); max_queue_input(act, "setupCPU", dfeResReadOnly, 
resReadOnlyDatSize); max_lmem_linear(act, "setupWrite", memAddresses[resReadOnly], resReadOnlyDatSize); max_ignore_lmem(act, "adtDxRead"); max_ignore_lmem(act, "resReadOnly"); max_ignore_lmem(act, "qRead"); max_ignore_lmem(act, "updateQ"); max_ignore_lmem(act, "updateQold"); max_ignore_lmem(act, "adtQ"); max_ignore_lmem(act, "updateSaveQold"); max_ignore_kernel(act, "AirfoilDFEResKernel"); max_ignore_kernel(act, "AirfoilDFEAdtKernel"); max_ignore_kernel(act, "AirfoilDFEUpdateKernel"); max_run(engine, act); max_actions_free(act); }
/*
 * Runs one minibatch through the DFE kernels: five forward stages
 * (l0 conv, l0 maxpool, l1 conv, l1 maxpool, l3 softmax) followed by five
 * back-propagation stages in reverse layer order. There is no DFE stage for
 * the l2 MLP layer in either direction.
 *
 * data_x_offset  base LMem offset of the input set; the minibatch input is
 *                read from data_x_offset + mb_idx * layer0_x_size
 * data_y         labels; the slice starting at data_y + mb_idx * layer3_pred_size
 *                is streamed to the softmax back-prop stage as "std"
 * mb_idx         minibatch index
 * learning_rate  not used by this function yet (see the TODO below)
 *
 * Each stage loads its engine, runs the "default" interface between a pair
 * of mark_timer() calls, and frees its action set.
 */
void my_process(int data_x_offset,const cateType* data_y,int mb_idx,real learning_rate)
{
    /* Input offset of this minibatch; used by the first and the last stage. */
    const uint64_t minibatch_x_offset = data_x_offset+mb_idx*layer0_x_size;

    /* ---- forward: layer 0 convolution ---- */
    {
        int kernel = K_fw_l0_conv;
        load_engine(kernel);
        printf("Running on DFE: fw_l0_conv");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "ni", 1);
        max_set_param_uint64t(actions, "no", NKERS[0]);
        max_queue_input(actions, "b", layer0_b, layer0_b_size);
        max_queue_input(actions, "w", layer0_w, layer0_w_size);
        max_set_param_uint64t(actions, "x_offset", minibatch_x_offset);
        max_set_param_uint64t(actions, "z_offset", layer0_z2_offset);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- forward: layer 0 max-pooling ---- */
    {
        int kernel = K_fw_l0_maxpool;
        load_engine(kernel);
        printf("Running on DFE: fw_l0_maxpool");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "no", NKERS[0]);
        max_set_param_uint64t(actions, "z2_offset", layer0_z2_offset);
        max_set_param_uint64t(actions, "sel_offset", layer0_sel_offset);
        max_set_param_uint64t(actions, "z_offset", layer0_z_offset);
        max_set_param_uint64t(actions, "a_offset", layer0_a_offset);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- forward: layer 1 convolution ---- */
    {
        int kernel = K_fw_l1_conv;
        load_engine(kernel);
        printf("Running on DFE: fw_l1_conv");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "ni", NKERS[0]);
        max_set_param_uint64t(actions, "no", NKERS[1]);
        max_queue_input(actions, "b", layer1_b, layer1_b_size);
        max_queue_input(actions, "w", layer1_w, layer1_w_size);
        max_set_param_uint64t(actions, "x_offset", layer1_x_offset);
        max_set_param_uint64t(actions, "z_offset", layer1_z2_offset);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- forward: layer 1 max-pooling ---- */
    {
        int kernel = K_fw_l1_maxpool;
        load_engine(kernel);
        printf("Running on DFE: fw_l1_maxpool");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "no", NKERS[1]);
        max_set_param_uint64t(actions, "z2_offset", layer1_z2_offset);
        max_set_param_uint64t(actions, "sel_offset", layer1_sel_offset);
        max_set_param_uint64t(actions, "z_offset", layer1_z_offset);
        max_set_param_uint64t(actions, "a_offset", layer1_a_offset);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- forward: layer 3 softmax, predictions returned in layer3_pred ---- */
    {
        int kernel = K_fw_l3_softmax;
        load_engine(kernel);
        printf("Running on DFE: fw_l3_softmax");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "ni", NKERS[2]);
        max_set_param_uint64t(actions, "x_offset", layer3_x_offset);
        max_queue_input(actions, "w", layer3_w, layer3_w_size);
        max_queue_input(actions, "b", layer3_b, layer3_b_size);
        max_set_param_uint64t(actions, "softmax_offset", layer3_sm_offset);
        max_queue_output(actions, "pred", layer3_pred, layer3_pred_size);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    //TODO: learning rate<0 exit

    /* ---- backward: layer 3 softmax, produces w/b gradients ---- */
    {
        int kernel = K_bp_l3_softmax;
        load_engine(kernel);
        printf("Running on DFE: bp_l3_softmax");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "ni", NKERS[2]);
        max_set_param_uint64t(actions, "x_offset", layer3_x_offset);
        max_queue_input(actions, "w", layer3_w, layer3_w_size);
        max_set_param_uint64t(actions, "softmax_offset", layer3_sm_offset);
        /* NOTE(review): layer3_pred_size is used both as an element stride on
         * data_y and as the stream length -- confirm the units agree. */
        max_queue_input(actions, "std", data_y+mb_idx*layer3_pred_size, layer3_pred_size);
        max_queue_output(actions, "w_grad", layer3_w_grad, layer3_w_grad_size);
        max_queue_output(actions, "b_grad", layer3_b_grad, layer3_b_grad_size);
        max_set_param_uint64t(actions, "x_grad_offset", layer3_x_grad_offset);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- backward: layer 1 max-pooling ---- */
    {
        int kernel = K_bp_l1_maxpool;
        load_engine(kernel);
        printf("Running on DFE: bp_l1_maxpool");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "no", NKERS[1]);
        max_set_param_uint64t(actions, "a_grad_offset", layer1_a_grad_offset);
        max_set_param_uint64t(actions, "z_offset", layer1_z_offset);
        max_set_param_uint64t(actions, "sel_offset", layer1_sel_offset);
        max_set_param_uint64t(actions, "z2_grad_offset", layer1_z2_grad_offset);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- backward: layer 1 convolution, produces w gradient ---- */
    {
        int kernel = K_bp_l1_conv;
        load_engine(kernel);
        printf("Running on DFE: bp_l1_conv");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "ni", NKERS[0]);
        max_set_param_uint64t(actions, "no", NKERS[1]);
        max_set_param_uint64t(actions, "z_grad_offset", layer1_z2_grad_offset);
        max_set_param_uint64t(actions, "x_offset", layer1_x_offset);
        max_set_param_uint64t(actions, "x_grad_offset", layer1_x_grad_offset);
        max_queue_input(actions, "w", layer1_w, layer1_w_size);
        max_queue_output(actions, "w_grad", layer1_w_grad, layer1_w_grad_size);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- backward: layer 0 max-pooling ---- */
    {
        int kernel = K_bp_l0_maxpool;
        load_engine(kernel);
        printf("Running on DFE: bp_l0_maxpool");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "no", NKERS[0]);
        max_set_param_uint64t(actions, "a_grad_offset", layer0_a_grad_offset);
        max_set_param_uint64t(actions, "z_offset", layer0_z_offset);
        max_set_param_uint64t(actions, "sel_offset", layer0_sel_offset);
        max_set_param_uint64t(actions, "z2_grad_offset", layer0_z2_grad_offset);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }

    /* ---- backward: layer 0 convolution, produces w gradient ---- */
    {
        int kernel = K_bp_l0_conv;
        load_engine(kernel);
        printf("Running on DFE: bp_l0_conv");
        mark_timer(false,1);
        max_actions_t *actions = max_actions_init(max_files[kernel], "default");
        max_set_param_uint64t(actions, "ni", 1);
        max_set_param_uint64t(actions, "no", NKERS[0]);
        max_set_param_uint64t(actions, "z_grad_offset", layer0_z2_grad_offset);
        max_set_param_uint64t(actions, "x_offset", minibatch_x_offset);
        max_set_param_uint64t(actions, "x_grad_offset", layer0_x_grad_offset);
        max_queue_input(actions, "w", layer0_w, layer0_w_size);
        max_queue_output(actions, "w_grad", layer0_w_grad, layer0_w_grad_size);
        max_run(max_engines[kernel], actions);
        max_actions_free(actions);
        mark_timer(true,1);
    }
}