/*
 * Load convolutional layer parameters from an open darknet weights file.
 *
 * File layout (all 32-bit floats): biases[n], then — if the layer is
 * batch-normalized and scales are not suppressed — scales[n],
 * rolling_mean[n], rolling_variance[n], followed by weights[num].
 * If l.numload is set, only that many filters are loaded.
 *
 * Fixes: fread() results are now summed and checked so a truncated or
 * corrupt weights file produces a diagnostic instead of silently leaving
 * buffers stale; the dead `if(0)` debug blocks and the empty
 * `if(l.binary)` branch were removed.
 */
void load_convolutional_weights(layer l, FILE *fp)
{
    if(l.numload) l.n = l.numload;
    int num = l.c/l.groups*l.n*l.size*l.size;

    size_t got = 0;                       /* floats actually read */
    size_t expected = (size_t)l.n;        /* floats we intend to read */

    got += fread(l.biases, sizeof(float), l.n, fp);
    if (l.batch_normalize && (!l.dontloadscales)){
        got += fread(l.scales, sizeof(float), l.n, fp);
        got += fread(l.rolling_mean, sizeof(float), l.n, fp);
        got += fread(l.rolling_variance, sizeof(float), l.n, fp);
        expected += 3*(size_t)l.n;
    }
    got += fread(l.weights, sizeof(float), num, fp);
    expected += (size_t)num;

    if (got != expected) {
        fprintf(stderr,
            "load_convolutional_weights: short read (%zu of %zu floats) -- "
            "weights file truncated or layer mismatch\n", got, expected);
    }

    if (l.flipped) {
        /* Weights were stored transposed (e.g. converted from another
         * framework); put them in darknet's native orientation. */
        transpose_matrix(l.weights, l.c*l.size*l.size, l.n);
    }
#ifdef GPU
    if(gpu_index >= 0){
        push_convolutional_layer(l);      /* mirror the CPU buffers to GPU */
    }
#endif
}
/*
 * CPU forward pass for a convolutional layer (network_state API):
 * output = activation(batchnorm?(weights * im2col(input)) + bias).
 * The binary path uses pre-binarized char filters (l.cfilters) with
 * gemm_bin plus a per-filter scale, then returns early.
 */
void forward_convolutional_layer(convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i;

    /* Clear the output; the gemm below accumulates into it (beta == 1). */
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    /*
    if(l.binary){
        binarize_filters(l.filters, l.n, l.c*l.size*l.size, l.binary_filters);
        binarize_filters2(l.filters, l.n, l.c*l.size*l.size, l.cfilters, l.scales);
        swap_binary(&l);
    }
    */

    if(l.binary){
        /* GEMM shape: (n filters) x (spatial positions), contracted over
         * the size*size*c patch dimension. */
        int m = l.n;
        int k = l.size*l.size*l.c;
        int n = out_h*out_w;

        char *a = l.cfilters;       /* binarized filter weights */
        float *b = state.workspace; /* im2col scratch buffer */
        float *c = l.output;

        for(i = 0; i < l.batch; ++i){
            /* Unroll input patches into columns, then multiply. */
            im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm_bin(m,n,k,1,a,k,b,n,c,n);
            c += n*m;
            state.input += l.c*l.h*l.w;
        }
        /* Binary filters carry a per-filter magnitude in l.scales. */
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
        add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
        activate_array(l.output, m*n*l.batch, l.activation);
        return;
    }

    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;

    float *a = l.filters;
    float *b = state.workspace;
    float *c = l.output;

    for(i = 0; i < l.batch; ++i){
        im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
        c += n*m;
        state.input += l.c*l.h*l.w;
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    }
    /* Bias is added after (optional) batch normalization. */
    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    activate_array(l.output, m*n*l.batch, l.activation);
}
/*
 * CPU forward pass for a deconvolutional (transposed convolution) layer.
 * For each batch item: workspace = weights^T * input (gemm with A
 * transposed), then col2im_cpu scatters the column buffer into the
 * spatial output. Bias is applied only when batch norm is disabled —
 * presumably the batchnorm path handles bias itself; verify against
 * forward_batchnorm_layer.
 */
void forward_deconvolutional_layer(const layer l, network net)
{
    int i;

    /* GEMM shape: (size*size*n) rows x (h*w) cols, contracted over the
     * input channel dimension k = c. */
    int m = l.size * l.size * l.n;
    int n = l.h * l.w;
    int k = l.c;

    fill_cpu(l.outputs * l.batch, 0, l.output, 1);

    for (i = 0; i < l.batch; ++i) {
        real_t *a = l.weights;
        real_t *b = net.input + i * l.c * l.h * l.w;
        real_t *c = net.workspace;

        /* TA=1: weights are stored in the forward-convolution layout. */
        gemm_cpu(1, 0, m, n, k, 1, a, m, b, n, 0, c, n);

        /* Scatter columns back into the (larger) output image. */
        col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size,
                l.stride, l.pad, l.output + i * l.outputs);
    }
    if (l.batch_normalize) {
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_w * l.out_h);
    }
    activate_array(l.output, l.batch * l.n * l.out_w * l.out_h, l.activation);
}
/*
 * CPU forward pass for a fully-connected layer:
 * output = activation(batchnorm?(input * W^T) + bias).
 */
void forward_connected_layer(connected_layer l, network_state state)
{
    int i;
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    /* output[batch x outputs] = input[batch x inputs] * weights^T
     * (TB=1: weights stored as [outputs x inputs]). */
    int m = l.batch;
    int k = l.inputs;
    int n = l.outputs;
    float *a = state.input;
    float *b = l.weights;
    float *c = l.output;
    gemm(0,1,m,n,k,1,a,k,b,k,1,c,n);

    if(l.batch_normalize){
        if(state.train){
            /* Statistics over the current mini-batch. */
            mean_cpu(l.output, l.batch, l.outputs, 1, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.outputs, 1, l.variance);

            /* Exponential moving average for inference:
             * rolling = .95*rolling + .05*batch. */
            scal_cpu(l.outputs, .95, l.rolling_mean, 1);
            axpy_cpu(l.outputs, .05, l.mean, 1, l.rolling_mean, 1);
            scal_cpu(l.outputs, .95, l.rolling_variance, 1);
            axpy_cpu(l.outputs, .05, l.variance, 1, l.rolling_variance, 1);

            /* Keep pre- and post-normalization activations for backprop. */
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.outputs, 1);
            copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
        } else {
            /* Inference uses the accumulated rolling statistics. */
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.outputs, 1);
        }
        scale_bias(l.output, l.scales, l.batch, l.outputs, 1);
    }
    /* Per-sample bias add, then the nonlinearity. */
    for(i = 0; i < l.batch; ++i){
        axpy_cpu(l.outputs, 1, l.biases, 1, l.output + i*l.outputs, 1);
    }
    activate_array(l.output, l.outputs*l.batch, l.activation);
}
/*
 * CPU forward pass for a deconvolutional layer (network_state variant).
 * workspace = weights^T * input per batch item, then col2im scatters the
 * columns into the spatial output.
 * NOTE(review): col2im_cpu is called with pad = 0 here even though the
 * layer has l.pad — confirm this matches the corresponding backward pass
 * before changing.
 */
void forward_deconvolutional_layer(const layer l, network_state state)
{
    int i;
    int out_h = l.out_h;
    int out_w = l.out_w;
    int size = out_h*out_w;

    /* GEMM shape: (size*size*n) x (h*w), contracted over input channels. */
    int m = l.size*l.size*l.n;
    int n = l.h*l.w;
    int k = l.c;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    for(i = 0; i < l.batch; ++i){
        float *a = l.weights;
        float *b = state.input + i*l.c*l.h*l.w;
        float *c = state.workspace;

        /* TA=1: weights stored in forward-convolution orientation. */
        gemm(1,0,m,n,k,1,a,m,b,n,0,c,n);

        col2im_cpu(c, l.n, out_h, out_w, l.size, l.stride, 0, l.output+i*l.n*size);
    }
    if(l.batch_normalize){
        forward_batchnorm_layer(l, state);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }
    activate_array(l.output, l.batch*l.n*size, l.activation);
}
/*
 * Spherical linear interpolation between two n-dimensional vectors.
 * Writes the interpolant at parameter s (0 => start, 1 => end) into out
 * and renormalizes it to unit magnitude.
 *
 * Fixes: the dot product is clamped into [-1, 1] before acos() so
 * floating-point drift on near-unit vectors cannot produce NaN, and a
 * sin(omega) == 0 case (parallel or anti-parallel inputs) now falls back
 * to linear interpolation instead of dividing by zero.
 */
void slerp(float *start, float *end, float s, int n, float *out)
{
    float d = dot_cpu(n, start, 1, end, 1);
    if (d > 1) d = 1;          /* clamp: acos domain is [-1, 1] */
    if (d < -1) d = -1;
    float omega = acos(d);
    float so = sin(omega);

    fill_cpu(n, 0, out, 1);
    if (so == 0) {
        /* Degenerate case: vectors are (anti)parallel; slerp reduces to
         * lerp (any great circle works for anti-parallel inputs). */
        axpy_cpu(n, 1 - s, start, 1, out, 1);
        axpy_cpu(n, s, end, 1, out, 1);
    } else {
        axpy_cpu(n, sin((1 - s) * omega) / so, start, 1, out, 1);
        axpy_cpu(n, sin(s * omega) / so, end, 1, out, 1);
    }

    /* Project the result back onto the unit sphere. */
    float mag = mag_array(out, n);
    scale_array(out, n, 1. / mag);
}
/*
 * CPU forward pass for a vanilla RNN layer, unrolled over l.steps steps.
 * Each step: hidden = input_layer(x_t) + self_layer(hidden_prev)
 * (plus hidden_prev itself when l.shortcut is set), then
 * output = output_layer(hidden).
 */
void forward_rnn_layer(layer l, network_state state)
{
    network_state s = { 0 };
    s.train = state.train;
    int i;
    layer input_layer = *(l.input_layer);
    layer self_layer = *(l.self_layer);
    layer output_layer = *(l.output_layer);

    /* Zero the sub-layers' gradients over the whole unrolled sequence. */
    fill_cpu(l.outputs * l.batch * l.steps, 0, output_layer.delta, 1);
    fill_cpu(l.hidden * l.batch * l.steps, 0, self_layer.delta, 1);
    fill_cpu(l.hidden * l.batch * l.steps, 0, input_layer.delta, 1);
    if (state.train) fill_cpu(l.hidden * l.batch, 0, l.state, 1);

    for (i = 0; i < l.steps; ++i) {
        s.input = state.input;
        forward_connected_layer(input_layer, s);

        s.input = l.state;
        forward_connected_layer(self_layer, s);

        /* During training every step's hidden state is kept (needed for
         * backprop through time), so advance the state pointer before
         * writing the new state. */
        float *old_state = l.state;
        if (state.train) l.state += l.hidden * l.batch;
        if (l.shortcut) {
            /* Residual connection from the previous hidden state. */
            copy_cpu(l.hidden * l.batch, old_state, 1, l.state, 1);
        } else {
            fill_cpu(l.hidden * l.batch, 0, l.state, 1);
        }
        /* New hidden state = input contribution + recurrent contribution. */
        axpy_cpu(l.hidden * l.batch, 1, input_layer.output, 1, l.state, 1);
        axpy_cpu(l.hidden * l.batch, 1, self_layer.output, 1, l.state, 1);

        s.input = l.state;
        forward_connected_layer(output_layer, s);

        /* Advance to the next timestep in every per-step buffer. */
        state.input += l.inputs * l.batch;
        increment_layer(&input_layer, 1);
        increment_layer(&self_layer, 1);
        increment_layer(&output_layer, 1);
    }
}
/*
 * Run a forward pass over every layer (network_state API).
 * Each layer's delta buffer is zeroed first (backward accumulates into
 * it), and each layer's output becomes the next layer's input.
 */
void forward_network(network net, network_state state)
{
    state.workspace = net.workspace;
    int i;
    for(i = 0; i < net.n; ++i){
        state.index = i;
        layer l = net.layers[i];
        if(l.delta){
            fill_cpu(l.outputs * l.batch, 0, l.delta, 1);
        }
        l.forward(l, state);
        /* Chain: this layer's output feeds the next layer. */
        state.input = l.output;
    }
}
/*
 * Run a forward pass over every layer (pointer-free network API).
 * Layers flagged with l.truth feed net.truth for downstream cost layers;
 * the network cost is totaled at the end.
 */
void forward_network(network net)
{
    int i;
    for(i = 0; i < net.n; ++i){
        net.index = i;
        layer l = net.layers[i];
        if(l.delta){
            /* Gradients accumulate during backward; start from zero. */
            fill_cpu(l.outputs * l.batch, 0, l.delta, 1);
        }
        l.forward(l, net);
        /* Chain: this layer's output feeds the next layer. */
        net.input = l.output;
        if(l.truth) {
            net.truth = l.output;
        }
    }
    calc_network_cost(net);
}
/*
 * CPU forward pass for a (possibly grouped) convolutional layer.
 * XNOR mode binarizes both the weights and the input before the GEMM,
 * temporarily swapping the binary weights in via swap_binary().
 */
void forward_convolutional_layer(convolutional_layer l, network net)
{
    int i, j;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    if(l.xnor){
        binarize_weights(l.weights, l.n, l.c/l.groups*l.size*l.size, l.binary_weights);
        swap_binary(&l);   /* use binary weights for the forward pass */
        binarize_cpu(net.input, l.c*l.h*l.w*l.batch, l.binary_input);
        net.input = l.binary_input;
    }

    /* Per-group GEMM shape: (n/groups filters) x (out spatial positions),
     * contracted over each group's size*size*(c/groups) patch. */
    int m = l.n/l.groups;
    int k = l.size*l.size*l.c/l.groups;
    int n = l.out_w*l.out_h;
    for(i = 0; i < l.batch; ++i){
        for(j = 0; j < l.groups; ++j){
            float *a = l.weights + j*l.nweights/l.groups;
            float *b = net.workspace;
            float *c = l.output + (i*l.groups + j)*n*m;

            im2col_cpu(net.input + (i*l.groups + j)*l.c/l.groups*l.h*l.w,
                l.c/l.groups, l.h, l.w, l.size, l.stride, l.pad, b);
            gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
        }
    }

    if(l.batch_normalize){
        forward_batchnorm_layer(l, net);
    } else {
        add_bias(l.output, l.biases, l.batch, l.n, l.out_h*l.out_w);
    }

    activate_array(l.output, l.outputs*l.batch, l.activation);
    if(l.binary || l.xnor) swap_binary(&l);   /* restore float weights */
}
/*
 * CPU forward pass for a convolutional layer (older darknet fork using
 * l.filters / l.col_image and inline batch normalization).
 * The disabled block below is a naive direct-convolution reference
 * implementation ("add by fanghao") kept for debugging.
 */
void forward_convolutional_layer(const convolutional_layer l, network_state state)
{
    int out_h = convolutional_out_height(l);
    int out_w = convolutional_out_width(l);
    int i;

    fill_cpu(l.outputs*l.batch, 0, l.output, 1);

    /* GEMM shape: (n filters) x (spatial positions), contracted over the
     * size*size*c patch dimension. */
    int m = l.n;
    int k = l.size*l.size*l.c;
    int n = out_h*out_w;
    float *a = l.filters;
    float *b = l.col_image;
    float *c = l.output;
    // printf("the l.size is %i \n", l.size);
    ///*
    //printf("the m,k,n is %i,%i,%i \n", m,k,n);
    for(i = 0; i < l.batch; ++i){
        im2col_cpu(state.input, l.c, l.h, l.w, l.size, l.stride, l.pad, b);
        gemm(0,0,m,n,k,1,a,k,b,n,1,c,n);
        c += n*m;
        state.input += l.c*l.h*l.w;
    }
    //*/

    /* Disabled reference direct convolution — add by fanghao.
    int ii,jj,kk,mm,pp,tt;
    int lcc = l.c;
    int lhh = l.h;
    int lww = l.w;
    int kernel = l.size;
    int pad;
    if(l.pad)
        pad = l.size/2;
    else
        pad = l.pad;
    lhh += 2*pad;
    lww += 2*pad;
    float *dataP;
    dataP = (float *)calloc(lcc*lhh*lww, sizeof(float));
    for(ii=0; ii < lcc; ii++)
        for(jj=pad; jj<lhh-pad; jj++)
            for(kk=pad; kk<lww-pad; kk++)
                dataP[ii*lhh*lww + jj*lww + kk] = state.input[ii*(lhh - 2*pad)*(lww-2*pad) + (jj - pad)*(lww - 2*pad) + kk-pad];

    for(ii=0; ii<m; ii++)
        for(jj=0; jj<out_h; jj++)
            for(kk=0; kk<out_w; kk++) {
                float tempAcc = 0.0;
                for(mm=0; mm<lcc; mm++)
                    for(pp=0; pp<kernel; pp++)
                        for(tt=0; tt<kernel; tt++)
                            tempAcc += a[ii*lcc*kernel*kernel+mm*kernel*kernel+pp*kernel+tt]*dataP[mm*lhh*lww+(l.stride*jj+pp)*lww+l.stride*kk+tt];
                c[ii*out_h*out_w+jj*out_w+kk] = tempAcc;
            }
    // c += n*m;
    //state.input += l.c*l.h*l.w;
    */

    if(l.batch_normalize){
        if(state.train){
            /* Normalize with the current mini-batch statistics.
             * NOTE(review): unlike forward_connected_layer, the rolling
             * mean/variance are NOT updated here — confirm that is done
             * elsewhere before trusting inference mode. */
            mean_cpu(l.output, l.batch, l.n, l.out_h*l.out_w, l.mean);
            variance_cpu(l.output, l.mean, l.batch, l.n, l.out_h*l.out_w, l.variance);
            normalize_cpu(l.output, l.mean, l.variance, l.batch, l.n, l.out_h*l.out_w);
        } else {
            normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.n, l.out_h*l.out_w);
        }
        scale_bias(l.output, l.scales, l.batch, l.n, out_h*out_w);
    }
    add_bias(l.output, l.biases, l.batch, l.n, out_h*out_w);
    activate_array(l.output, m*n*l.batch, l.activation);
}
/*
 * Adversarial (DCGAN-style) training loop. GPU-only (compiled out
 * otherwise). gnet is the generator, anet the adversary/discriminator.
 * Per iteration: feed the generator unit-normalized Gaussian noise, run
 * the discriminator on the generated images to obtain an adversarial
 * gradient (imerror), backprop that through the generator, then train
 * the discriminator on real+generated data. Checkpoints are written to
 * backup_directory every 1000/10000 iterations.
 */
void train_dcgan(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);
    //float orig_rate = anet->learning_rate;

    int i, j, k;
    /* Find the generator's image-producing layer (first with 3 channels). */
    layer imlayer = {0};
    for (i = 0; i < gnet->n; ++i) {
        if (gnet->layers[i].out_c == 3) {
            imlayer = gnet->layers[i];
            break;
        }
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    //int N = plist->size;
    char **paths = (char **)list_to_array(plist);

    /* Async loader: real images labeled as a 1-class classification set. */
    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    gnet->train = 1;
    anet->train = 1;

    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    float *imerror = cuda_make_array(0, y_size);   /* adversarial gradient */

    //int ay_size = anet->truths*anet->batch;
    float aloss_avg = -1;

    //data generated = copy_data(train);

    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        //translate_data_rows(train, -.5);
        //scale_data_rows(train, 2);
        load_thread = load_data_in_thread(args);
        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /* Label real images 1, generated images 0. */
        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();

        for(j = 0; j < gnet->subdivisions; ++j){
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            /* Generator input: per-sample unit-normalized Gaussian noise. */
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            for(z = 0; z < gnet->batch; ++z){
                float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
            }
            /*
               for(z = 0; z < 100; ++z){
                   printf("%f, ", gnet->input[z]);
               }
               printf("\n");
               printf("input: %f %f\n", mean_array(gnet->input, x_size), variance_array(gnet->input, x_size));
             */
            //cuda_push_array(gnet->input_gpu, gnet->input, x_size);
            //cuda_push_array(gnet->truth_gpu, gnet->truth, y_size);
            *gnet->seen += gnet->batch;
            forward_network(gnet);

            /* Run the discriminator on the generated batch with target
             * "real" (1) and capture its input gradient in imerror. */
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);
            //float genaloss = *anet->cost / anet->batch;
            //printf("%f\n", genaloss);

            /* Replace the generator's own output gradient with the
             * adversarial gradient from the discriminator. */
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);

            //printf("realness %f\n", cuda_mag_array(imerror, imlayer.outputs*imlayer.batch));
            //printf("features %f\n", cuda_mag_array(gnet->layers[gnet->n-1].delta_gpu, imlayer.outputs*imlayer.batch));

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);

            backward_network(gnet);

            /*
               for(k = 0; k < gnet->n; ++k){
                   layer l = gnet->layers[k];
                   cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
                   printf("%d: %f %f\n", k, mean_array(l.output, l.outputs*l.batch), variance_array(l.output, l.outputs*l.batch));
               }
             */

            /* Stash the generated images as discriminator training data. */
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        /* Train the discriminator on real + generated images. */
        data merge = concat_data(train, gen);
        //randomize_data(merge);
        float aloss = train_network(anet, merge);

        //translate_image(im, 1);
        //scale_image(im, .5);
        //translate_image(im2, 1);
        //scale_image(im2, .5);
#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif

        /*
           if(aloss < .1){
               anet->learning_rate = 0;
           } else if (aloss > .3){
               anet->learning_rate = orig_rate;
           }
         */

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;

        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);
#endif
    free_network(gnet);
    free_network(anet);
}
/*
 * Progressive-growing GAN training loop. GPU-only (compiled out
 * otherwise). Like train_dcgan, but the generator's image layer is the
 * final layer, and each iteration blends network resolutions with
 * alpha/beta weights that ramp linearly over the first half of training
 * (via set_network_alpha_beta).
 */
void train_prog(char *cfg, char *weight, char *acfg, char *aweight, int clear, int display, char *train_images, int maxbatch)
{
#ifdef GPU
    char *backup_directory = "/home/kunle12/backup/";
    srand(time(0));
    char *base = basecfg(cfg);
    char *abase = basecfg(acfg);
    printf("%s\n", base);
    network *gnet = load_network(cfg, weight, clear);
    network *anet = load_network(acfg, aweight, clear);

    int i, j, k;
    layer imlayer = gnet->layers[gnet->n-1];   /* generator output layer */
    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", gnet->learning_rate, gnet->momentum, gnet->decay);
    int imgs = gnet->batch*gnet->subdivisions;
    i = *gnet->seen/imgs;
    data train, buffer;

    list *plist = get_paths(train_images);
    char **paths = (char **)list_to_array(plist);

    /* Async loader: real images labeled as a 1-class classification set. */
    load_args args= get_base_args(anet);
    args.paths = paths;
    args.n = imgs;
    args.m = plist->size;
    args.d = &buffer;
    args.type = CLASSIFICATION_DATA;
    args.threads=16;
    args.classes = 1;
    char *ls[2] = {"imagenet", "zzzzzzzz"};
    args.labels = ls;

    pthread_t load_thread = load_data_in_thread(args);
    clock_t time;

    gnet->train = 1;
    anet->train = 1;

    int x_size = gnet->inputs*gnet->batch;
    int y_size = gnet->truths*gnet->batch;
    float *imerror = cuda_make_array(0, y_size);   /* adversarial gradient */
    float aloss_avg = -1;

    if (maxbatch == 0) maxbatch = gnet->max_batches;
    while (get_current_batch(gnet) < maxbatch) {
        {
            /* Linear fade-in: alpha ramps 0 -> 1 over the first half of
             * training; the nets blend resolutions with opposite weights. */
            int cb = get_current_batch(gnet);
            float alpha = (float) cb / (maxbatch/2);
            if(alpha > 1) alpha = 1;
            float beta = 1 - alpha;
            printf("%f %f\n", alpha, beta);
            set_network_alpha_beta(gnet, alpha, beta);
            set_network_alpha_beta(anet, beta, alpha);
        }

        i += 1;
        time=clock();
        pthread_join(load_thread, 0);
        train = buffer;
        load_thread = load_data_in_thread(args);
        printf("Loaded: %lf seconds\n", sec(clock()-time));

        /* Label real images 1, generated images 0. */
        data gen = copy_data(train);
        for (j = 0; j < imgs; ++j) {
            train.y.vals[j][0] = 1;
            gen.y.vals[j][0] = 0;
        }
        time=clock();

        for (j = 0; j < gnet->subdivisions; ++j) {
            get_next_batch(train, gnet->batch, j*gnet->batch, gnet->truth, 0);
            /* Generator input: Gaussian noise (NOT unit-normalized here,
             * unlike train_dcgan — the normalization is commented out). */
            int z;
            for(z = 0; z < x_size; ++z){
                gnet->input[z] = rand_normal();
            }
            /*
               for(z = 0; z < gnet->batch; ++z){
                   float mag = mag_array(gnet->input + z*gnet->inputs, gnet->inputs);
                   scale_array(gnet->input + z*gnet->inputs, gnet->inputs, 1./mag);
               }
             */

            *gnet->seen += gnet->batch;
            forward_network(gnet);

            /* Discriminator pass on generated images with target "real";
             * its input gradient (imerror) drives the generator update. */
            fill_gpu(imlayer.outputs*imlayer.batch, 0, imerror, 1);
            fill_cpu(anet->truths*anet->batch, 1, anet->truth, 1);
            copy_cpu(anet->inputs*anet->batch, imlayer.output, 1, anet->input, 1);
            anet->delta_gpu = imerror;
            forward_network(anet);
            backward_network(anet);
            //float genaloss = *anet->cost / anet->batch;

            /* Replace the generator's output gradient with imerror. */
            scal_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1);
            scal_gpu(imlayer.outputs*imlayer.batch, 0, gnet->layers[gnet->n-1].delta_gpu, 1);

            axpy_gpu(imlayer.outputs*imlayer.batch, 1, imerror, 1, gnet->layers[gnet->n-1].delta_gpu, 1);

            backward_network(gnet);

            /* Stash generated images as discriminator training data. */
            for(k = 0; k < gnet->batch; ++k){
                int index = j*gnet->batch + k;
                copy_cpu(gnet->outputs, gnet->output + k*gnet->outputs, 1, gen.X.vals[index], 1);
            }
        }
        harmless_update_network_gpu(anet);

        /* Train the discriminator on real + generated images. */
        data merge = concat_data(train, gen);
        float aloss = train_network(anet, merge);

#ifdef OPENCV
        if(display){
            image im = float_to_image(anet->w, anet->h, anet->c, gen.X.vals[0]);
            image im2 = float_to_image(anet->w, anet->h, anet->c, train.X.vals[0]);
            show_image(im, "gen", 1);
            show_image(im2, "train", 1);
            save_image(im, "gen");
            save_image(im2, "train");
        }
#endif

        update_network_gpu(gnet);

        free_data(merge);
        free_data(train);
        free_data(gen);
        if (aloss_avg < 0) aloss_avg = aloss;
        aloss_avg = aloss_avg*.9 + aloss*.1;

        printf("%d: adv: %f | adv_avg: %f, %f rate, %lf seconds, %d images\n", i, aloss, aloss_avg, get_current_rate(gnet), sec(clock()-time), i*imgs);
        if(i%10000==0){
            char buff[256];
            sprintf(buff, "%s/%s_%d.weights", backup_directory, base, i);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s_%d.weights", backup_directory, abase, i);
            save_weights(anet, buff);
        }
        if(i%1000==0){
            char buff[256];
            sprintf(buff, "%s/%s.backup", backup_directory, base);
            save_weights(gnet, buff);
            sprintf(buff, "%s/%s.backup", backup_directory, abase);
            save_weights(anet, buff);
        }
    }
    char buff[256];
    sprintf(buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(gnet, buff);
#endif
    free_network( gnet );
    free_network( anet );
}
/* * Routine used to setup a newly inserted CPU in preparation for starting * it running code. */ int mp_cpu_configure(int cpuid) { md_t *mdp; mde_cookie_t rootnode, cpunode = MDE_INVAL_ELEM_COOKIE; int listsz, i; mde_cookie_t *listp = NULL; int num_nodes; uint64_t cpuid_prop; cpu_t *cpu; processorid_t id; ASSERT(MUTEX_HELD(&cpu_lock)); if ((mdp = md_get_handle()) == NULL) return (ENODEV); rootnode = md_root_node(mdp); ASSERT(rootnode != MDE_INVAL_ELEM_COOKIE); num_nodes = md_node_count(mdp); ASSERT(num_nodes > 0); listsz = num_nodes * sizeof (mde_cookie_t); listp = kmem_zalloc(listsz, KM_SLEEP); num_nodes = md_scan_dag(mdp, rootnode, md_find_name(mdp, "cpu"), md_find_name(mdp, "fwd"), listp); if (num_nodes < 0) return (ENODEV); for (i = 0; i < num_nodes; i++) { if (md_get_prop_val(mdp, listp[i], "id", &cpuid_prop)) break; if (cpuid_prop == (uint64_t)cpuid) { cpunode = listp[i]; break; } } if (cpunode == MDE_INVAL_ELEM_COOKIE) return (ENODEV); kmem_free(listp, listsz); mpo_cpu_add(mdp, cpuid); /* * Note: uses cpu_lock to protect cpunodes * which will be modified inside of fill_cpu and * setup_exec_unit_mappings. */ fill_cpu(mdp, cpunode); /* * Adding a CPU may cause the execution unit sharing * relationships to change. Update the mappings in * the cpunode structures. */ setup_chip_mappings(mdp); setup_exec_unit_mappings(mdp); /* propagate the updated mappings to the CPU structures */ for (id = 0; id < NCPU; id++) { if ((cpu = cpu_get(id)) == NULL) continue; cpu_map_exec_units(cpu); } (void) md_fini_handle(mdp); if ((i = setup_cpu_common(cpuid)) != 0) { (void) cleanup_cpu_common(cpuid); return (i); } return (0); }
/*
 * map_wellknown - map known devices & registers
 *
 * Recursively walks the children of curnode in the OBP device tree,
 * mapping addresses for each named node and invoking the appropriate
 * handler for cpu, tod, and memory-controller nodes.
 */
static void
map_wellknown(pnode_t curnode)
{
	extern int status_okay(int, char *, int);
	char tmp_name[MAXSYSNAME];
	int sok;

#ifdef VPRINTF
	VPRINTF("map_wellknown(%x)\n", curnode);
#endif /* VPRINTF */

	for (curnode = CHILD(curnode); curnode;
	    curnode = NEXT(curnode)) {
		/*
		 * prune subtree if status property indicating not okay
		 */
		sok = status_okay((int)curnode, (char *)NULL, 0);
		if (!sok) {
			char devtype_buf[OBP_MAXPROPNAME];
			int size;

#ifdef VPRINTF
			VPRINTF("map_wellknown: !okay status property\n");
#endif /* VPRINTF */
			/*
			 * a status property indicating bad memory will be
			 * associated with a node which has a "device_type"
			 * property with a value of "memory-controller"
			 */
			if ((size = GETPROPLEN(curnode,
			    OBP_DEVICETYPE)) == -1)
				continue;
			if (size > OBP_MAXPROPNAME) {
				cmn_err(CE_CONT, "node %x '%s' prop too "
				    "big\n", curnode, OBP_DEVICETYPE);
				continue;
			}
			if (GETPROP(curnode, OBP_DEVICETYPE,
			    devtype_buf) == -1) {
				cmn_err(CE_CONT, "node %x '%s' get failed\n",
				    curnode, OBP_DEVICETYPE);
				continue;
			}
			if (strcmp(devtype_buf, "memory-controller") != 0)
				continue;
			/*
			 * ...else fall thru and process the node...
			 */
		}
		bzero(tmp_name, MAXSYSNAME);
		if (GETPROP(curnode, OBP_NAME, (caddr_t)tmp_name) != -1)
			fill_address(curnode, tmp_name);
		if (GETPROP(curnode, OBP_DEVICETYPE, tmp_name) != -1 &&
		    strcmp(tmp_name, "cpu") == 0) {
			fill_cpu(curnode);
		}
		/*
		 * NOTE(review): at this point tmp_name holds OBP_DEVICETYPE
		 * if the GETPROP above succeeded, otherwise it still holds
		 * OBP_NAME (or zeros). The "tod" / "memory-controller"
		 * matches below depend on that ordering — confirm intended.
		 */
		if (strcmp(tmp_name, "tod") == 0)
			have_tod(curnode);
		if (sok && (strcmp(tmp_name, "memory-controller") == 0) &&
		    (&plat_fill_mc != NULL))
			plat_fill_mc(curnode);
		/* recurse into this child's subtree */
		map_wellknown(curnode);
	}
}
/*
 * CPU forward pass for an LSTM layer, unrolled over l.steps timesteps.
 * Gates: f (forget), i (input), g (candidate), o (output). Each gate is
 * the sum of a recurrent connected layer (w*) applied to the previous
 * hidden state and an input connected layer (u*) applied to x_t:
 *   c_t = f * c_{t-1} + i * g
 *   h_t = o * tanh(c_t)
 */
void forward_lstm_layer(layer l, network state)
{
    network s = { 0 };
    s.train = state.train;
    int i;
    layer wf = *(l.wf);
    layer wi = *(l.wi);
    layer wg = *(l.wg);
    layer wo = *(l.wo);

    layer uf = *(l.uf);
    layer ui = *(l.ui);
    layer ug = *(l.ug);
    layer uo = *(l.uo);

    /* Zero every sub-layer's gradients for the whole unrolled sequence. */
    fill_cpu(l.outputs * l.batch * l.steps, 0, wf.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wi.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wg.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, wo.delta, 1);

    fill_cpu(l.outputs * l.batch * l.steps, 0, uf.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ui.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, ug.delta, 1);
    fill_cpu(l.outputs * l.batch * l.steps, 0, uo.delta, 1);
    if (state.train) {
        fill_cpu(l.outputs * l.batch * l.steps, 0, l.delta, 1);
    }

    for (i = 0; i < l.steps; ++i) {
        /* Recurrent contributions from the previous hidden state. */
        s.input = l.h_cpu;
        forward_connected_layer(wf, s);
        forward_connected_layer(wi, s);
        forward_connected_layer(wg, s);
        forward_connected_layer(wo, s);

        /* Input contributions from x_t. */
        s.input = state.input;
        forward_connected_layer(uf, s);
        forward_connected_layer(ui, s);
        forward_connected_layer(ug, s);
        forward_connected_layer(uo, s);

        /* Pre-activation gate values: gate = w*.output + u*.output. */
        copy_cpu(l.outputs * l.batch, wf.output, 1, l.f_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uf.output, 1, l.f_cpu, 1);

        copy_cpu(l.outputs * l.batch, wi.output, 1, l.i_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ui.output, 1, l.i_cpu, 1);

        copy_cpu(l.outputs * l.batch, wg.output, 1, l.g_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, ug.output, 1, l.g_cpu, 1);

        copy_cpu(l.outputs * l.batch, wo.output, 1, l.o_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, uo.output, 1, l.o_cpu, 1);

        /* Gate nonlinearities: sigmoid for f/i/o, tanh for candidate g. */
        activate_array(l.f_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.i_cpu, l.outputs * l.batch, LOGISTIC);
        activate_array(l.g_cpu, l.outputs * l.batch, TANH);
        activate_array(l.o_cpu, l.outputs * l.batch, LOGISTIC);

        /* Cell state update: c = f*c + i*g. */
        copy_cpu(l.outputs * l.batch, l.i_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.g_cpu, 1, l.temp_cpu, 1);
        mul_cpu(l.outputs * l.batch, l.f_cpu, 1, l.c_cpu, 1);
        axpy_cpu(l.outputs * l.batch, 1, l.temp_cpu, 1, l.c_cpu, 1);

        /* Hidden state: h = o * tanh(c). */
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.h_cpu, 1);
        activate_array(l.h_cpu, l.outputs * l.batch, TANH);
        mul_cpu(l.outputs * l.batch, l.o_cpu, 1, l.h_cpu, 1);

        /* Record this step's cell and hidden state in the layer outputs. */
        copy_cpu(l.outputs * l.batch, l.c_cpu, 1, l.cell_cpu, 1);
        copy_cpu(l.outputs * l.batch, l.h_cpu, 1, l.output, 1);

        /* Advance all per-step buffers to the next timestep. */
        state.input += l.inputs * l.batch;
        l.output += l.outputs * l.batch;
        l.cell_cpu += l.outputs * l.batch;

        increment_layer(&wf, 1);
        increment_layer(&wi, 1);
        increment_layer(&wg, 1);
        increment_layer(&wo, 1);

        increment_layer(&uf, 1);
        increment_layer(&ui, 1);
        increment_layer(&ug, 1);
        increment_layer(&uo, 1);
    }
}
/*
 * Forward pass for an instance-segmentation (iseg) layer.
 * Output channels: l.classes per-pixel class logits followed by
 * l.extra ("ids") embedding channels. Ground truth per batch item is up
 * to 90 instances, each encoded as a class id followed by a w*h mask
 * (class id < 0 terminates the list).
 * Builds deltas that (a) push class outputs toward the truth masks,
 * (b) shrink embedding magnitudes, and (c) pull each pixel's embedding
 * toward its own instance's mean embedding and away from others'.
 */
void forward_iseg_layer(const layer l, network net)
{
    double time = what_time_is_it_now();
    int i, b, j, k;
    int ids = l.extra;                /* number of embedding channels */
    memcpy(l.output, net.input, l.outputs * l.batch * sizeof(real_t));
    memset(l.delta, 0, l.outputs * l.batch * sizeof(real_t));

#ifndef GPU
    /* On CPU the class channels still need the logistic activation
     * (the GPU path presumably applies it elsewhere — verify). */
    for (b = 0; b < l.batch; ++b) {
        int index = b * l.outputs;
        activate_array(l.output + index, l.classes * l.w * l.h, LOGISTIC);
    }
#endif

    for (b = 0; b < l.batch; ++b) {
        // a priori, each pixel has no class
        for (i = 0; i < l.classes; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + i * l.w * l.h + k;
                l.delta[index] = 0 - l.output[index];
            }
        }

        // a priori, embedding should be small magnitude
        for (i = 0; i < ids; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + (i + l.classes) * l.w * l.h + k;
                l.delta[index] = .1 * (0 - l.output[index]);
            }
        }

        memset(l.counts, 0, 90 * sizeof(int));
        for (i = 0; i < 90; ++i) {
            fill_cpu(ids, 0, l.sums[i], 1);

            int c = net.truth[b * l.truths + i * (l.w * l.h + 1)];
            if (c < 0) break;   /* negative class id terminates the list */
            // add up metric embeddings for each instance
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + c * l.w * l.h + k;
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1
                        + k];
                if (v) {
                    /* Pixel belongs to this instance: supervise the class
                     * output and accumulate its embedding vector. */
                    l.delta[index] = v - l.output[index];
                    axpy_cpu(ids, 1,
                            l.output + b * l.outputs + l.classes * l.w * l.h
                                    + k, l.w * l.h, l.sums[i], 1);
                    ++l.counts[i];
                }
            }
        }

        /* Mean-squared distance of each instance's pixels from its mean
         * embedding (diagnostic only; printed below).
         * NOTE(review): calloc result is not checked before use. */
        real_t *mse = calloc(90, sizeof(real_t));
        for (i = 0; i < 90; ++i) {
            int c = net.truth[b * l.truths + i * (l.w * l.h + 1)];
            if (c < 0) break;
            for (k = 0; k < l.w * l.h; ++k) {
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1
                        + k];
                if (v) {
                    int z;
                    real_t sum = 0;
                    for (z = 0; z < ids; ++z) {
                        int index = b * l.outputs
                                + (l.classes + z) * l.w * l.h + k;
                        sum += pow(
                                l.sums[i][z] / l.counts[i] - l.output[index],
                                2);
                    }
                    mse[i] += sum;
                }
            }
            mse[i] /= l.counts[i];
        }

        // Calculate average embedding
        for (i = 0; i < 90; ++i) {
            if (!l.counts[i])
                continue;
            scal_cpu(ids, 1.f / l.counts[i], l.sums[i], 1);
            /* Debug print only on the first batch item of GPU 0. */
            if (b == 0 && net.gpu_index == 0) {
                printf("%4d, %6.3f, ", l.counts[i], mse[i]);
                for (j = 0; j < ids; ++j) {
                    printf("%6.3f,", l.sums[i][j]);
                }
                printf("\n");
            }
        }
        free(mse);

        // Calculate embedding loss
        for (i = 0; i < 90; ++i) {
            if (!l.counts[i])
                continue;
            for (k = 0; k < l.w * l.h; ++k) {
                real_t v = net.truth[b * l.truths + i * (l.w * l.h + 1) + 1
                        + k];
                if (v) {
                    for (j = 0; j < 90; ++j) {
                        if (!l.counts[j])
                            continue;
                        int z;
                        for (z = 0; z < ids; ++z) {
                            int index = b * l.outputs
                                    + (l.classes + z) * l.w * l.h + k;
                            real_t diff = l.sums[j][z] - l.output[index];
                            /* Pull toward own instance mean (j == i),
                             * push away from other instances' means. */
                            if (j == i)
                                l.delta[index] += diff < 0 ? -.1 : .1;
                            else
                                l.delta[index] += -(diff < 0 ? -.1 : .1);
                        }
                    }
                }
            }
        }

        /* Downscale the embedding gradient relative to the class loss. */
        for (i = 0; i < ids; ++i) {
            for (k = 0; k < l.w * l.h; ++k) {
                int index = b * l.outputs + (i + l.classes) * l.w * l.h + k;
                l.delta[index] *= .01;
            }
        }
    }

    /* Total cost is the squared magnitude of the delta buffer. */
    *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
    printf("took %lf sec\n", what_time_is_it_now() - time);
}