void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
    int out_h = convolutional_out_height(*l);

    l->out_w = out_w;
    l->out_h = out_h;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    l->col_image = realloc(l->col_image, out_h*out_w*l->size*l->size*l->c*sizeof(float));
    l->output = realloc(l->output, l->batch*out_h*out_w*l->n*sizeof(float));
    l->delta = realloc(l->delta, l->batch*out_h*out_w*l->n*sizeof(float));

#ifdef GPU
    cuda_free(l->col_image_gpu);
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->col_image_gpu = cuda_make_array(l->col_image, out_h*out_w*l->size*l->size*l->c);
    l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
    l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
#endif
}
void time_ongpu(int TA, int TB, int m, int k, int n)
{
    int iter = 10;
    float *a = random_matrix(m,k);
    float *b = random_matrix(k,n);

    int lda = (!TA)?k:m;
    int ldb = (!TB)?n:k;

    float *c = random_matrix(m,n);

    float *a_cl = cuda_make_array(a, m*k);
    float *b_cl = cuda_make_array(b, k*n);
    float *c_cl = cuda_make_array(c, m*n);

    int i;
    clock_t start = clock(), end;
    for(i = 0; i<iter; ++i){
        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
        cudaThreadSynchronize(); /* deprecated; cudaDeviceSynchronize() is the modern equivalent */
    }
    double flop = ((double)m)*n*(2.*k + 2.)*iter;
    double gflop = flop/pow(10., 9);
    end = clock();
    double seconds = sec(end-start);
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n", m,k,k,n, TA, TB, seconds, gflop/seconds);
    cuda_free(a_cl);
    cuda_free(b_cl);
    cuda_free(c_cl);
    free(a);
    free(b);
    free(c);
}
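/* A minimal benchmark driver for time_ongpu, a sketch only: the function name
 * run_gemm_benchmarks and the matrix sizes below are illustrative, not from
 * the source. Each call times 10 GEMMs (with ALPHA = BETA = 1, so C += A*B)
 * for one transpose combination. */
void run_gemm_benchmarks()
{
    time_ongpu(0, 0, 512, 256, 1152);   /* C += A  * B  */
    time_ongpu(1, 0, 512, 256, 1152);   /* C += A' * B  */
    time_ongpu(0, 1, 512, 256, 1152);   /* C += A  * B' */
    time_ongpu(1, 1, 512, 256, 1152);   /* C += A' * B' */
}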
void resize_reorg_old_layer(layer *l, int w, int h)
{
    int stride = l->stride;
    int c = l->c;

    l->h = h;
    l->w = w;

    if(l->reverse){
        l->out_w = w*stride;
        l->out_h = h*stride;
        l->out_c = c/(stride*stride);
    }else{
        l->out_w = w/stride;
        l->out_h = h/stride;
        l->out_c = c*(stride*stride);
    }

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->outputs;
    int output_size = l->outputs * l->batch;

    l->output = realloc(l->output, output_size * sizeof(float));
    l->delta = realloc(l->delta, output_size * sizeof(float));

#ifdef GPU
    cuda_free(l->output_gpu);
    cuda_free(l->delta_gpu);
    l->output_gpu = cuda_make_array(l->output, output_size);
    l->delta_gpu = cuda_make_array(l->delta, output_size);
#endif
}
void resize_maxpool_layer(maxpool_layer *l, int w, int h)
{
    int stride = l->stride;
    l->h = h;
    l->w = w;
    l->inputs = h*w*l->c;

    l->out_w = (w-1)/stride + 1;
    l->out_h = (h-1)/stride + 1;
    l->outputs = l->out_w * l->out_h * l->c;
    int output_size = l->outputs * l->batch;

    l->indexes = realloc(l->indexes, output_size * sizeof(int));
    l->output = realloc(l->output, output_size * sizeof(float));
    l->delta = realloc(l->delta, output_size * sizeof(float));

#ifdef GPU
    cuda_free((float *)l->indexes_gpu);
    cuda_free(l->output_gpu);
    cuda_free(l->delta_gpu);
    l->indexes_gpu = cuda_make_int_array(output_size);
    l->output_gpu = cuda_make_array(l->output, output_size);
    l->delta_gpu = cuda_make_array(l->delta, output_size);
#endif
}
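/* Worked example of the pooling geometry above: with w = 13 and stride = 2,
 * out_w = (13 - 1)/2 + 1 = 7. The expression (w-1)/stride + 1 is the integer
 * form of ceil(w/stride), so a window starts at every stride-th column and no
 * trailing input column is dropped. */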
static gpudata *cuda_transfer(gpudata *src, size_t offset, size_t sz,
                              void *dst_c, int may_share) {
  cuda_context *ctx = src->ctx;
  cuda_context *dst_ctx = (cuda_context *)dst_c;
  gpudata *dst;

  ASSERT_BUF(src);
  ASSERT_CTX(ctx);
  ASSERT_CTX(dst_ctx);

  if (ctx == dst_ctx) {
    if (may_share && offset == 0) {
      cuda_retain(src);
      return src;
    }
    dst = cuda_alloc(ctx, sz, NULL, 0, NULL);
    if (dst == NULL) return NULL;
    cuda_enter(ctx);

    cuda_wait(src, CUDA_WAIT_READ);
    cuda_wait(dst, CUDA_WAIT_WRITE);

    ctx->err = cuMemcpyDtoDAsync(dst->ptr, src->ptr+offset, sz, ctx->s);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_exit(ctx);
      cuda_free(dst);
      return NULL;
    }

    cuda_record(src, CUDA_WAIT_READ);
    cuda_record(dst, CUDA_WAIT_WRITE);
    cuda_exit(ctx);
    return dst;
  }

  dst = cuda_alloc(dst_ctx, sz, NULL, 0, NULL);
  if (dst == NULL) return NULL;

  cuda_enter(ctx);

  cuda_waits(src, CUDA_WAIT_READ, dst_ctx->mem_s);
  cuda_waits(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);

  ctx->err = cuMemcpyPeerAsync(dst->ptr, dst->ctx->ctx, src->ptr+offset,
                               src->ctx->ctx, sz, dst_ctx->mem_s);
  if (ctx->err != CUDA_SUCCESS) {
    cuda_free(dst);
    cuda_exit(ctx);
    return NULL;
  }

  cuda_records(dst, CUDA_WAIT_WRITE, dst_ctx->mem_s);
  cuda_records(src, CUDA_WAIT_READ, dst_ctx->mem_s);

  cuda_exit(ctx);
  return dst;
}
void resize_cost_layer(cost_layer *l, int inputs)
{
    l->inputs = inputs;
    l->outputs = inputs;
    l->delta = realloc(l->delta, inputs*l->batch*sizeof(float));
    l->output = realloc(l->output, inputs*l->batch*sizeof(float));
#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);
    l->delta_gpu = cuda_make_array(l->delta, inputs*l->batch);
    l->output_gpu = cuda_make_array(l->output, inputs*l->batch);
#endif
}
void free_network(network net)
{
    int i;
    for(i = 0; i < net.n; ++i){
        free_layer(net.layers[i]);
    }
    free(net.layers);

    if(net.input) free(net.input);
    if(net.truth) free(net.truth);
#ifdef GPU
    if(net.input_gpu) cuda_free(net.input_gpu);
    if(net.truth_gpu) cuda_free(net.truth_gpu);
#endif
}
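/* A hedged usage sketch for free_network: parse_network_cfg and load_weights
 * are darknet's standard loaders (this era of the API passes network by
 * value); the cfg/weights paths and the wrapper name are placeholders. */
void load_and_free_example()
{
    network net = parse_network_cfg("cfg/example.cfg");
    load_weights(&net, "example.weights");
    /* ... run the network ... */
    free_network(net);
}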
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    float *A_gpu = cuda_make_array(A, (TA ? lda*K : lda*M));
    float *B_gpu = cuda_make_array(B, (TB ? ldb*N : ldb*K));
    float *C_gpu = cuda_make_array(C, ldc*M);

    gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);

    cuda_pull_array(C_gpu, C, ldc*M);
    cuda_free(A_gpu);
    cuda_free(B_gpu);
    cuda_free(C_gpu);
}
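/* A minimal sketch of calling gemm_gpu for C = 1*A*B + 0*C with row-major
 * 2x3 and 3x2 operands; the wrapper name is illustrative. With TA = TB = 0
 * the leading dimensions are the row widths: lda = K, ldb = N, ldc = N. */
void gemm_gpu_example()
{
    float A[] = {1, 2, 3,
                 4, 5, 6};      /* 2x3 */
    float B[] = {7,  8,
                 9,  10,
                 11, 12};       /* 3x2 */
    float C[4] = {0};           /* 2x2 result */
    gemm_gpu(0, 0, 2, 2, 3, 1, A, 3, B, 2, 0, C, 2);
    /* C now holds {58, 64, 139, 154} */
}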
void resize_dropout_layer(dropout_layer *l, int inputs)
{
    /* The original reallocated rand with the stale l->inputs while the GPU
       buffer used the new size; store the new size first so both agree. */
    l->inputs = inputs;
    l->rand = realloc(l->rand, inputs*l->batch*sizeof(float));
#ifdef GPU
    cuda_free(l->rand_gpu);
    l->rand_gpu = cuda_make_array(l->rand, inputs*l->batch);
#endif
}
void resize_iseg_layer(layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    l->outputs = h * w * l->c;
    l->inputs = l->outputs;

    l->output = realloc(l->output, l->batch * l->outputs * sizeof(real_t));
    l->delta = realloc(l->delta, l->batch * l->outputs * sizeof(real_t));

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu = cuda_make_array(l->delta, l->batch * l->outputs);
    l->output_gpu = cuda_make_array(l->output, l->batch * l->outputs);
#endif
}
void resize_shortcut_layer(layer *l, int w, int h)
{
    assert(l->w == l->out_w);
    assert(l->h == l->out_h);
    l->w = l->out_w = w;
    l->h = l->out_h = h;
    l->outputs = w * h * l->out_c;
    l->inputs = l->outputs;
    l->delta = realloc(l->delta, l->outputs * l->batch * sizeof(real_t));
    l->output = realloc(l->output, l->outputs * l->batch * sizeof(real_t));

#ifdef GPU
    cuda_free(l->output_gpu);
    cuda_free(l->delta_gpu);
    l->output_gpu = cuda_make_array(l->output, l->outputs * l->batch);
    l->delta_gpu = cuda_make_array(l->delta, l->outputs * l->batch);
#endif
}
void resize_region_layer(layer *l, int w, int h)
{
    l->w = w;
    l->h = h;

    l->outputs = h*w*l->n*(l->classes + l->coords + 1);
    l->inputs = l->outputs;

    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
    l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
    l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
#endif
}
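/* Worked example of the output count above: each of the n anchors at each
 * grid cell predicts its box coordinates, an objectness score, and one score
 * per class. For a 13x13 grid with n = 5 anchors, 20 classes, and 4
 * coordinates, outputs = 13*13*5*(20 + 4 + 1) = 21125 floats per image. */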
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size, int iters)
{
    int iter = 0;
    for (iter = 0; iter < iters; ++iter) {
        image delta = make_image(recon.w, recon.h, recon.c);

        NETWORK_STATE(state);
#ifdef GPU
        state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
        state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
        state.truth = cuda_make_array(features, get_network_output_size(net));

        forward_network_gpu(net, state);
        backward_network_gpu(net, state);

        cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

        cuda_free(state.input);
        cuda_free(state.delta);
        cuda_free(state.truth);
#else
        state.input = recon.data;
        state.delta = delta.data;
        state.truth = features;

        forward_network(net, state);
        backward_network(net, state);
#endif

        fltadd(update.data, delta.data, recon.w * recon.h * recon.c);
        smooth(recon, update, lambda, smooth_size);

        fltaddmul(recon.data, update.data, recon.w * recon.h * recon.c, rate);
        scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

        //float mag = mag_array(recon.data, recon.w*recon.h*recon.c);
        //scal_cpu(recon.w*recon.h*recon.c, 600/mag, recon.data, 1);

        constrain_image(recon);
        free_image(delta);
    }
}
static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags,
                           int *ret) {
  gpudata *res;
  cuda_context *ctx = (cuda_context *)c;
  int fl = CU_EVENT_DISABLE_TIMING;

  if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR);
  if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) ==
      (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY))
    FAIL(NULL, GA_VALUE_ERROR);
  /* TODO: figure out how to make this work */
  if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR);

  res = malloc(sizeof(*res));
  if (res == NULL) FAIL(NULL, GA_SYS_ERROR);

  res->refcnt = 1;
  res->sz = size;
  res->flags = flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY);

  cuda_enter(ctx);
  if (ctx->err != CUDA_SUCCESS) {
    free(res);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  if (ctx->flags & GA_CTX_MULTI_THREAD)
    fl |= CU_EVENT_BLOCKING_SYNC;
  ctx->err = cuEventCreate(&res->ev, fl);
  if (ctx->err != CUDA_SUCCESS) {
    free(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }

  if (size == 0) size = 1;

  ctx->err = cuMemAlloc(&res->ptr, size);
  if (ctx->err != CUDA_SUCCESS) {
    cuEventDestroy(res->ev);
    free(res);
    cuda_exit(ctx);
    FAIL(NULL, GA_IMPL_ERROR);
  }
  res->ctx = ctx;
  ctx->refcnt++;

  if (flags & GA_BUFFER_INIT) {
    ctx->err = cuMemcpyHtoD(res->ptr, data, size);
    if (ctx->err != CUDA_SUCCESS) {
      cuda_free(res);
      FAIL(NULL, GA_IMPL_ERROR);
    }
  }

  /* assumed tail (the snippet was cut off here): release the context and
     return the buffer, mirroring the function's other exit paths */
  cuda_exit(ctx);
  return res;
}
void reconstruct_picture(network net, float *features, image recon, image update, float rate, float momentum, float lambda, int smooth_size)
{
    scale_image(recon, 2);       /* map pixel values from [0,1] to [0,2]... */
    translate_image(recon, -1);  /* ...then to [-1,1] for the network */
    image delta = make_image(recon.w, recon.h, recon.c);

    network_state state = {0};
#ifdef GPU
    state.input = cuda_make_array(recon.data, recon.w*recon.h*recon.c);
    state.delta = cuda_make_array(delta.data, delta.w*delta.h*delta.c);
    state.truth = cuda_make_array(features, get_network_output_size(net));

    forward_network_gpu(net, state);
    backward_network_gpu(net, state);

    cuda_pull_array(state.delta, delta.data, delta.w*delta.h*delta.c);

    cuda_free(state.input);
    cuda_free(state.delta);
    cuda_free(state.truth);
#else
    state.input = recon.data;
    state.delta = delta.data;
    state.truth = features;

    forward_network(net, state);
    backward_network(net, state);
#endif

    axpy_cpu(recon.w*recon.h*recon.c, 1, delta.data, 1, update.data, 1);
    smooth(recon, update, lambda, smooth_size);

    axpy_cpu(recon.w*recon.h*recon.c, rate, update.data, 1, recon.data, 1);
    scal_cpu(recon.w*recon.h*recon.c, momentum, update.data, 1);

    translate_image(recon, 1);   /* undo the [-1,1] mapping... */
    scale_image(recon, .5);      /* ...back to [0,1] */
    constrain_image(recon);
    free_image(delta);
}
void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
    int out_h = convolutional_out_height(*l);

    l->out_w = out_w;
    l->out_h = out_h;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
    l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
    if(l->batch_normalize){
        l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
        l->x_norm = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
    }

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
    l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);

    if(l->batch_normalize){
        cuda_free(l->x_gpu);
        cuda_free(l->x_norm_gpu);

        l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
        l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
    }
#ifdef CUDNN
    cudnn_convolutional_setup(l);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
}
void free_network(network net)
{
    int i;
    for (i = 0; i < net.n; ++i) {
        free_layer(net.layers[i]);
    }
    free(net.layers);

    free(net.scales);
    free(net.steps);
    free(net.seen);

#ifdef GPU
    if (gpu_index >= 0) cuda_free(net.workspace);
    else free(net.workspace);
    if (net.input_state_gpu) cuda_free(net.input_state_gpu);

    if (*net.input_gpu) cuda_free(*net.input_gpu);
    if (*net.truth_gpu) cuda_free(*net.truth_gpu);
    if (net.input_gpu) free(net.input_gpu);
    if (net.truth_gpu) free(net.truth_gpu);

    if (*net.input16_gpu) cuda_free(*net.input16_gpu);
    if (*net.output16_gpu) cuda_free(*net.output16_gpu);
    if (net.input16_gpu) free(net.input16_gpu);
    if (net.output16_gpu) free(net.output16_gpu);
    if (net.max_input16_size) free(net.max_input16_size);
    if (net.max_output16_size) free(net.max_output16_size);
#else
    free(net.workspace);
#endif
}
void resize_deconvolutional_layer(layer *l, int h, int w)
{
    l->h = h;
    l->w = w;
    l->out_h = (l->h) * l->stride + l->size/2 - l->pad;
    l->out_w = (l->w) * l->stride + l->size/2 - l->pad;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    l->output = realloc(l->output, l->batch*l->outputs*sizeof(float));
    l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float));
    if(l->batch_normalize){
        l->x = realloc(l->x, l->batch*l->outputs*sizeof(float));
        l->x_norm = realloc(l->x_norm, l->batch*l->outputs*sizeof(float));
    }

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
    l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);

    if(l->batch_normalize){
        cuda_free(l->x_gpu);
        cuda_free(l->x_norm_gpu);

        l->x_gpu = cuda_make_array(l->output, l->batch*l->outputs);
        l->x_norm_gpu = cuda_make_array(l->output, l->batch*l->outputs);
    }
#ifdef CUDNN
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetTensor4dDescriptor(l->normTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, 1, l->out_c, 1, 1);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
}
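/* Worked example of the deconvolutional output size above: with h = 13,
 * stride = 2, size = 4, and pad = 1, out_h = 13*2 + 4/2 - 1 = 27 (the integer
 * division size/2 contributes the half-kernel overhang). */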
static gpudata *cuda_alloc(void *c, size_t size, void *data, int flags,
                           int *ret) {
  gpudata *res = NULL, *prev = NULL;
  cuda_context *ctx = (cuda_context *)c;
  size_t asize;
  int err;

  if ((flags & GA_BUFFER_INIT) && data == NULL) FAIL(NULL, GA_VALUE_ERROR);
  if ((flags & (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY)) ==
      (GA_BUFFER_READ_ONLY|GA_BUFFER_WRITE_ONLY))
    FAIL(NULL, GA_VALUE_ERROR);
  /* TODO: figure out how to make this work */
  if (flags & GA_BUFFER_HOST) FAIL(NULL, GA_DEVSUP_ERROR);

  /* We don't want to manage really small allocations so we round up
   * to a multiple of FRAG_SIZE. This also ensures that if we split a
   * block, the next block starts properly aligned for any data type. */
  if (!(ctx->flags & GA_CTX_DISABLE_ALLOCATION_CACHE)) {
    asize = roundup(size, FRAG_SIZE);
    find_best(ctx, &res, &prev, asize);
  } else {
    asize = size;
  }

  if (res == NULL) {
    err = allocate(ctx, &res, &prev, asize);
    if (err != GA_NO_ERROR)
      FAIL(NULL, err);
  }

  err = extract(res, prev, asize);
  if (err != GA_NO_ERROR)
    FAIL(NULL, err);

  /* It's out of the freelist, so add a ref */
  res->ctx->refcnt++;
  /* We consider this buffer allocated and ready to go */
  res->refcnt = 1;

  if (flags & GA_BUFFER_INIT) {
    err = cuda_write(res, 0, data, size);
    if (err != GA_NO_ERROR) {
      cuda_free(res);
      FAIL(NULL, err);
    }
  }

  return res;
}
void resize_crop_layer(layer *l, int w, int h)
{
    l->w = w;
    l->h = h;

    l->out_w = l->scale * w;
    l->out_h = l->scale * h;

    l->inputs = l->w * l->h * l->c;
    l->outputs = l->out_h * l->out_w * l->out_c;

    l->output = realloc(l->output, l->batch * l->outputs * sizeof(real_t));
#ifdef GPU
    cuda_free(l->output_gpu);
    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
#endif
}
int resize_network(network *net, int w, int h)
{
    int i;
    //if(w == net->w && h == net->h) return 0;
    net->w = w;
    net->h = h;
    int inputs = 0;
    size_t workspace_size = 0;
    //fprintf(stderr, "Resizing to %d x %d...", w, h);
    //fflush(stderr);
    for (i = 0; i < net->n; ++i){
        layer l = net->layers[i];
        if(l.type == CONVOLUTIONAL){
            resize_convolutional_layer(&l, w, h);
        }else if(l.type == CROP){
            resize_crop_layer(&l, w, h);
        }else if(l.type == MAXPOOL){
            resize_maxpool_layer(&l, w, h);
        }else if(l.type == AVGPOOL){
            resize_avgpool_layer(&l, w, h);
        }else if(l.type == NORMALIZATION){
            resize_normalization_layer(&l, w, h);
        }else if(l.type == COST){
            resize_cost_layer(&l, inputs);
        }else{
            error("Cannot resize this type of layer");
        }
        if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
        inputs = l.outputs;
        net->layers[i] = l;
        w = l.out_w;
        h = l.out_h;
        if(l.type == AVGPOOL) break;
    }
#ifdef GPU
    cuda_free(net->workspace);
    /* workspace_size is in bytes; round up to a whole number of floats */
    net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
#else
    free(net->workspace);
    net->workspace = calloc(1, workspace_size);
#endif
    //fprintf(stderr, " Done!\n");
    return 0;
}
static void cuda_free_ctx(cuda_context *ctx) {
  gpuarray_blas_ops *blas_ops;

  ASSERT_CTX(ctx);
  ctx->refcnt--;
  if (ctx->refcnt == 0) {
    assert(ctx->enter == 0 && "Context was active when freed!");
    if (ctx->blas_handle != NULL) {
      ctx->err = cuda_property(ctx, NULL, NULL, GA_CTX_PROP_BLAS_OPS,
                               &blas_ops);
      blas_ops->teardown(ctx);
    }
    ctx->refcnt = 2; /* Prevent recursive calls */
    cuda_free(ctx->errbuf);
    cuStreamDestroy(ctx->s);
    if (!(ctx->flags & DONTFREE))
      cuCtxDestroy(ctx->ctx);
    cache_free(ctx->extcopy_cache);
    CLEAR(ctx);
    free(ctx);
  }
}
static void cuda_float_free(float *x)
{
    cuda_free((void *)x);
}
void optimize_picture(network *net, image orig, int max_layer, float scale, float rate, float thresh, int norm)
{
    //scale_image(orig, 2);
    //translate_image(orig, -1);
    net->n = max_layer + 1;

    int dx = rand()%16 - 8;
    int dy = rand()%16 - 8;
    int flip = rand()%2;

    image crop = crop_image(orig, dx, dy, orig.w, orig.h);
    image im = resize_image(crop, (int)(orig.w * scale), (int)(orig.h * scale));
    if(flip) flip_image(im);

    resize_network(net, im.w, im.h);
    layer_t last = net->layers[net->n-1];
    //net->layers[net->n - 1].activation = LINEAR;

    image delta = make_image(im.w, im.h, im.c);

    NETWORK_STATE(state);
#ifdef GPU
    state.input = cuda_make_array(im.data, im.w*im.h*im.c);
    state.delta = cuda_make_array(im.data, im.w*im.h*im.c);

    forward_network_gpu(*net, state);
    copy_ongpu(last.outputs, last.output_gpu, 1, last.delta_gpu, 1);

    cuda_pull_array(last.delta_gpu, last.delta, last.outputs);
    calculate_loss(last.delta, last.delta, last.outputs, thresh);
    cuda_push_array(last.delta_gpu, last.delta, last.outputs);

    backward_network_gpu(*net, state);

    cuda_pull_array(state.delta, delta.data, im.w*im.h*im.c);

    cuda_free(state.input);
    cuda_free(state.delta);
#else
    state.input = im.data;
    state.delta = delta.data;

    forward_network(*net, state);
    fltcpy(last.delta, last.output, last.outputs);
    calculate_loss(last.output, last.delta, last.outputs, thresh);
    backward_network(*net, state);
#endif

    if(flip) flip_image(delta);
    //normalize_array(delta.data, delta.w*delta.h*delta.c);
    image resized = resize_image(delta, orig.w, orig.h);
    image out = crop_image(resized, -dx, -dy, orig.w, orig.h);

    /*
    image g = grayscale_image(out);
    free_image(out);
    out = g;
    */

    //rate = rate / abs_mean(out.data, out.w*out.h*out.c);

    if(norm) normalize_array(out.data, out.w*out.h*out.c);

    fltaddmul(orig.data, out.data, orig.w * orig.h * orig.c, rate);

    /*
    normalize_array(orig.data, orig.w*orig.h*orig.c);
    scale_image(orig, sqrt(var));
    translate_image(orig, mean);
    */

    //translate_image(orig, 1);
    //scale_image(orig, .5);
    //normalize_image(orig);

    constrain_image(orig);

    free_image(crop);
    free_image(im);
    free_image(delta);
    free_image(resized);
    free_image(out);
}
int resize_network(network *net, int w, int h)
{
#ifdef GPU
    cuda_set_device(net->gpu_index);
    if(gpu_index >= 0){
        cuda_free(net->workspace);
        if (net->input_gpu) {
            cuda_free(*net->input_gpu);
            *net->input_gpu = 0;
            cuda_free(*net->truth_gpu);
            *net->truth_gpu = 0;
        }
    }
#endif
    int i;
    //if(w == net->w && h == net->h) return 0;
    net->w = w;
    net->h = h;
    int inputs = 0;
    size_t workspace_size = 0;
    //fprintf(stderr, "Resizing to %d x %d...\n", w, h);
    //fflush(stderr);
    for (i = 0; i < net->n; ++i){
        layer l = net->layers[i];
        //printf(" %d: layer = %d,", i, l.type);
        if(l.type == CONVOLUTIONAL){
            resize_convolutional_layer(&l, w, h);
        }else if(l.type == CROP){
            resize_crop_layer(&l, w, h);
        }else if(l.type == MAXPOOL){
            resize_maxpool_layer(&l, w, h);
        }else if(l.type == REGION){
            resize_region_layer(&l, w, h);
        }else if (l.type == YOLO) {
            resize_yolo_layer(&l, w, h);
        }else if(l.type == ROUTE){
            resize_route_layer(&l, net);
        }else if (l.type == SHORTCUT) {
            resize_shortcut_layer(&l, w, h);
        }else if (l.type == UPSAMPLE) {
            resize_upsample_layer(&l, w, h);
        }else if(l.type == REORG){
            resize_reorg_layer(&l, w, h);
        }else if(l.type == AVGPOOL){
            resize_avgpool_layer(&l, w, h);
        }else if(l.type == NORMALIZATION){
            resize_normalization_layer(&l, w, h);
        }else if(l.type == COST){
            resize_cost_layer(&l, inputs);
        }else{
            fprintf(stderr, "Resizing type %d \n", (int)l.type);
            error("Cannot resize this type of layer");
        }
        if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
        inputs = l.outputs;
        net->layers[i] = l;
        w = l.out_w;
        h = l.out_h;
        if(l.type == AVGPOOL) break;
    }
#ifdef GPU
    if(gpu_index >= 0){
        printf(" try to allocate workspace = %zu * sizeof(float), ", workspace_size / sizeof(float) + 1);
        net->workspace = cuda_make_array(0, workspace_size/sizeof(float) + 1);
        printf(" CUDA allocate done! \n");
    }else {
        free(net->workspace);
        net->workspace = calloc(1, workspace_size);
    }
#else
    free(net->workspace);
    net->workspace = calloc(1, workspace_size);
#endif
    //fprintf(stderr, " Done!\n");
    return 0;
}
void resize_convolutional_layer(convolutional_layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    int out_w = convolutional_out_width(*l);
    int out_h = convolutional_out_height(*l);

    l->out_w = out_w;
    l->out_h = out_h;

    l->outputs = l->out_h * l->out_w * l->out_c;
    l->inputs = l->w * l->h * l->c;

    l->output = realloc(l->output, l->batch*out_h*out_w*l->n*sizeof(float));
    l->delta = realloc(l->delta, l->batch*out_h*out_w*l->n*sizeof(float));

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu = cuda_make_array(l->delta, l->batch*out_h*out_w*l->n);
    l->output_gpu = cuda_make_array(l->output, l->batch*out_h*out_w*l->n);
#ifdef CUDNN
    cudnnSetTensor4dDescriptor(l->dsrcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->ddstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->dfilterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    cudnnSetTensor4dDescriptor(l->srcTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->c, l->h, l->w);
    cudnnSetTensor4dDescriptor(l->dstTensorDesc, CUDNN_TENSOR_NCHW, CUDNN_DATA_FLOAT, l->batch, l->out_c, l->out_h, l->out_w);
    cudnnSetFilter4dDescriptor(l->filterDesc, CUDNN_DATA_FLOAT, CUDNN_TENSOR_NCHW, l->n, l->c, l->size, l->size);

    int padding = l->pad ? l->size/2 : 0;
    cudnnSetConvolution2dDescriptor(l->convDesc, padding, padding, l->stride, l->stride, 1, 1, CUDNN_CROSS_CORRELATION);
    cudnnGetConvolutionForwardAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->filterDesc,
            l->convDesc,
            l->dstTensorDesc,
            CUDNN_CONVOLUTION_FWD_PREFER_FASTEST,
            0,
            &l->fw_algo);
    cudnnGetConvolutionBackwardDataAlgorithm(cudnn_handle(),
            l->filterDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dsrcTensorDesc,
            CUDNN_CONVOLUTION_BWD_DATA_PREFER_FASTEST,
            0,
            &l->bd_algo);
    cudnnGetConvolutionBackwardFilterAlgorithm(cudnn_handle(),
            l->srcTensorDesc,
            l->ddstTensorDesc,
            l->convDesc,
            l->dfilterDesc,
            CUDNN_CONVOLUTION_BWD_FILTER_PREFER_FASTEST,
            0,
            &l->bf_algo);
#endif
#endif
    l->workspace_size = get_workspace_size(*l);
}
void free_layer(layer l)
{
    if(l.type == DROPOUT){
        if(l.rand) free(l.rand);
#ifdef GPU
        if(l.rand_gpu) cuda_free(l.rand_gpu);
#endif
        return;
    }
    if(l.cweights) free(l.cweights);
    if(l.indexes) free(l.indexes);
    if(l.input_layers) free(l.input_layers);
    if(l.input_sizes) free(l.input_sizes);
    if(l.map) free(l.map);
    if(l.rand) free(l.rand);
    if(l.cost) free(l.cost);
    if(l.state) free(l.state);
    if(l.prev_state) free(l.prev_state);
    if(l.forgot_state) free(l.forgot_state);
    if(l.forgot_delta) free(l.forgot_delta);
    if(l.state_delta) free(l.state_delta);
    if(l.concat) free(l.concat);
    if(l.concat_delta) free(l.concat_delta);
    if(l.binary_weights) free(l.binary_weights);
    if(l.biases) free(l.biases);
    if(l.bias_updates) free(l.bias_updates);
    if(l.scales) free(l.scales);
    if(l.scale_updates) free(l.scale_updates);
    if(l.weights) free(l.weights);
    if(l.weight_updates) free(l.weight_updates);
    if(l.col_image) free(l.col_image);
    if(l.delta) free(l.delta);
    if(l.output) free(l.output);
    if(l.squared) free(l.squared);
    if(l.norms) free(l.norms);
    if(l.spatial_mean) free(l.spatial_mean);
    if(l.mean) free(l.mean);
    if(l.variance) free(l.variance);
    if(l.mean_delta) free(l.mean_delta);
    if(l.variance_delta) free(l.variance_delta);
    if(l.rolling_mean) free(l.rolling_mean);
    if(l.rolling_variance) free(l.rolling_variance);
    if(l.x) free(l.x);
    if(l.x_norm) free(l.x_norm);
    if(l.m) free(l.m);
    if(l.v) free(l.v);
    if(l.z_cpu) free(l.z_cpu);
    if(l.r_cpu) free(l.r_cpu);
    if(l.h_cpu) free(l.h_cpu);
    if(l.binary_input) free(l.binary_input);

#ifdef GPU
    if(l.indexes_gpu) cuda_free((float *)l.indexes_gpu);

    if(l.z_gpu) cuda_free(l.z_gpu);
    if(l.r_gpu) cuda_free(l.r_gpu);
    if(l.h_gpu) cuda_free(l.h_gpu);
    if(l.m_gpu) cuda_free(l.m_gpu);
    if(l.v_gpu) cuda_free(l.v_gpu);
    if(l.prev_state_gpu) cuda_free(l.prev_state_gpu);
    if(l.forgot_state_gpu) cuda_free(l.forgot_state_gpu);
    if(l.forgot_delta_gpu) cuda_free(l.forgot_delta_gpu);
    if(l.state_gpu) cuda_free(l.state_gpu);
    if(l.state_delta_gpu) cuda_free(l.state_delta_gpu);
    if(l.gate_gpu) cuda_free(l.gate_gpu);
    if(l.gate_delta_gpu) cuda_free(l.gate_delta_gpu);
    if(l.save_gpu) cuda_free(l.save_gpu);
    if(l.save_delta_gpu) cuda_free(l.save_delta_gpu);
    if(l.concat_gpu) cuda_free(l.concat_gpu);
    if(l.concat_delta_gpu) cuda_free(l.concat_delta_gpu);
    if(l.binary_input_gpu) cuda_free(l.binary_input_gpu);
    if(l.binary_weights_gpu) cuda_free(l.binary_weights_gpu);
    if(l.mean_gpu) cuda_free(l.mean_gpu);
    if(l.variance_gpu) cuda_free(l.variance_gpu);
    if(l.rolling_mean_gpu) cuda_free(l.rolling_mean_gpu);
    if(l.rolling_variance_gpu) cuda_free(l.rolling_variance_gpu);
    if(l.variance_delta_gpu) cuda_free(l.variance_delta_gpu);
    if(l.mean_delta_gpu) cuda_free(l.mean_delta_gpu);
    if(l.col_image_gpu) cuda_free(l.col_image_gpu);
    if(l.x_gpu) cuda_free(l.x_gpu);
    if(l.x_norm_gpu) cuda_free(l.x_norm_gpu);
    if(l.weights_gpu) cuda_free(l.weights_gpu);
    if(l.weight_updates_gpu) cuda_free(l.weight_updates_gpu);
    if(l.biases_gpu) cuda_free(l.biases_gpu);
    if(l.bias_updates_gpu) cuda_free(l.bias_updates_gpu);
    if(l.scales_gpu) cuda_free(l.scales_gpu);
    if(l.scale_updates_gpu) cuda_free(l.scale_updates_gpu);
    if(l.output_gpu) cuda_free(l.output_gpu);
    if(l.delta_gpu) cuda_free(l.delta_gpu);
    if(l.rand_gpu) cuda_free(l.rand_gpu);
    if(l.squared_gpu) cuda_free(l.squared_gpu);
    if(l.norms_gpu) cuda_free(l.norms_gpu);
#endif
}
int resize_network(network *net, int w, int h)
{
#ifdef GPU
    cuda_set_device(net->gpu_index);
    cuda_free(net->workspace);
#endif
    int i;
    //if(w == net->w && h == net->h) return 0;
    net->w = w;
    net->h = h;
    int inputs = 0;
    size_t workspace_size = 0;
    //fprintf(stderr, "Resizing to %d x %d...\n", w, h);
    //fflush(stderr);
    for (i = 0; i < net->n; ++i){
        layer l = net->layers[i];
        if(l.type == CONVOLUTIONAL){
            resize_convolutional_layer(&l, w, h);
        }else if(l.type == CROP){
            resize_crop_layer(&l, w, h);
        }else if(l.type == MAXPOOL){
            resize_maxpool_layer(&l, w, h);
        }else if(l.type == REGION){
            resize_region_layer(&l, w, h);
        }else if(l.type == ROUTE){
            resize_route_layer(&l, net);
        }else if(l.type == REORG){
            resize_reorg_layer(&l, w, h);
        }else if(l.type == AVGPOOL){
            resize_avgpool_layer(&l, w, h);
        }else if(l.type == NORMALIZATION){
            resize_normalization_layer(&l, w, h);
        }else if(l.type == COST){
            resize_cost_layer(&l, inputs);
        }else{
            error("Cannot resize this type of layer");
        }
        if(l.workspace_size > workspace_size) workspace_size = l.workspace_size;
        inputs = l.outputs;
        net->layers[i] = l;
        w = l.out_w;
        h = l.out_h;
        if(l.type == AVGPOOL) break;
    }
    layer out = get_network_output_layer(*net);
    net->inputs = net->layers[0].inputs;
    net->outputs = out.outputs;
    net->truths = out.outputs;
    if(net->layers[net->n-1].truths) net->truths = net->layers[net->n-1].truths;
    net->output = out.output;
    free(net->input);
    free(net->truth);
    net->input = calloc(net->inputs*net->batch, sizeof(float));
    net->truth = calloc(net->truths*net->batch, sizeof(float));
#ifdef GPU
    if(gpu_index >= 0){
        cuda_free(net->input_gpu);
        cuda_free(net->truth_gpu);
        net->input_gpu = cuda_make_array(net->input, net->inputs*net->batch);
        net->truth_gpu = cuda_make_array(net->truth, net->truths*net->batch);
        net->workspace = cuda_make_array(0, (workspace_size-1)/sizeof(float)+1);
    }else {
        free(net->workspace);
        net->workspace = calloc(1, workspace_size);
    }
#else
    free(net->workspace);
    net->workspace = calloc(1, workspace_size);
#endif
    //fprintf(stderr, " Done!\n");
    return 0;
}
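/* A hedged usage sketch for resize_network, in the spirit of darknet's
 * multi-scale training: pick a new square input resolution and rebuild every
 * layer in place. The wrapper name and the dimension range are illustrative,
 * not from the source. */
void random_resize_example(network *net)
{
    int dim = (rand() % 10 + 10) * 32;  /* a multiple of 32 in [320, 608] */
    resize_network(net, dim, dim);
}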