void forward_softmax_layer_gpu(const softmax_layer l, network_state state) { if(l.softmax_tree){ int i; int count = 0; for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; softmax_gpu(state.input + count, group_size, l.batch, l.inputs, 1, 0, 1, l.temperature, l.output_gpu + count); count += group_size; } } else { softmax_gpu(state.input, l.inputs/l.groups, l.batch, l.inputs, l.groups, l.inputs/l.groups, 1, l.temperature, l.output_gpu); } }
void forward_region_layer_gpu(const region_layer l, network_state state) { /* if(!state.train){ copy_ongpu(l.batch*l.inputs, state.input, 1, l.output_gpu, 1); return; } */ flatten_ongpu(state.input, l.h*l.w, l.n*(l.coords + l.classes + 1), l.batch, 1, l.output_gpu); if(l.softmax_tree){ int i; int count = 5; for (i = 0; i < l.softmax_tree->groups; ++i) { int group_size = l.softmax_tree->group_size[i]; softmax_gpu(l.output_gpu+count, group_size, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + count); count += group_size; } }else if (l.softmax){ softmax_gpu(l.output_gpu+5, l.classes, l.classes + 5, l.w*l.h*l.n*l.batch, 1, l.output_gpu + 5); } float *in_cpu = calloc(l.batch*l.inputs, sizeof(float)); float *truth_cpu = 0; if(state.truth){ int num_truth = l.batch*l.truths; truth_cpu = calloc(num_truth, sizeof(float)); cuda_pull_array(state.truth, truth_cpu, num_truth); } cuda_pull_array(l.output_gpu, in_cpu, l.batch*l.inputs); //cudaStreamSynchronize(get_cuda_stream()); network_state cpu_state = state; cpu_state.train = state.train; cpu_state.truth = truth_cpu; cpu_state.input = in_cpu; forward_region_layer(l, cpu_state); //cuda_push_array(l.output_gpu, l.output, l.batch*l.outputs); free(cpu_state.input); if(!state.train) return; cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); //cudaStreamSynchronize(get_cuda_stream()); if(cpu_state.truth) free(cpu_state.truth); }