void forward_deconvolutional_layer(const layer l, network net) { int i; int m = l.size * l.size * l.n; int n = l.h * l.w; int k = l.c; fill_cpu(l.outputs * l.batch, 0, l.output, 1); for (i = 0; i < l.batch; ++i) { real_t *a = l.weights; real_t *b = net.input + i * l.c * l.h * l.w; real_t *c = net.workspace; gemm_cpu(1, 0, m, n, k, 1, a, m, b, n, 0, c, n); col2im_cpu(net.workspace, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, l.output + i * l.outputs); } if (l.batch_normalize) { forward_batchnorm_layer(l, net); } else { add_bias(l.output, l.biases, l.batch, l.n, l.out_w * l.out_h); } activate_array(l.output, l.batch * l.n * l.out_w * l.out_h, l.activation); }
void gemm(int TA, int TB, int M, int N, int K, float ALPHA, float *A, int lda, float *B, int ldb, float BETA, float *C, int ldc) { gemm_cpu( TA, TB, M, N, K, ALPHA,A,lda, B, ldb,BETA,C,ldc); }
void backward_deconvolutional_layer(layer l, network net) { int i; gradient_array(l.output, l.outputs*l.batch, l.activation, l.delta); if(l.batch_normalize){ backward_batchnorm_layer(l, net); } else { backward_bias(l.bias_updates, l.delta, l.batch, l.n, l.out_w*l.out_h); } //if(net.delta) memset(net.delta, 0, l.batch*l.h*l.w*l.c*sizeof(float)); for(i = 0; i < l.batch; ++i){ int m = l.c; int n = l.size*l.size*l.n; int k = l.h*l.w; float *a = net.input + i*m*k; float *b = net.workspace; float *c = l.weight_updates; im2col_cpu(l.delta + i*l.outputs, l.out_c, l.out_h, l.out_w, l.size, l.stride, l.pad, b); gemm_cpu(0,1,m,n,k,1,a,k,b,k,1,c,n); if(net.delta){ int m = l.c; int n = l.h*l.w; int k = l.size*l.size*l.n; float *a = l.weights; float *b = net.workspace; float *c = net.delta + i*n*m; gemm_cpu(0,0,m,n,k,1,a,k,b,n,1,c,n); } } }
void time_random_matrix(int TA, int TB, int m, int k, int n) { float *a; if(!TA) a = random_matrix(m,k); else a = random_matrix(k,m); int lda = (!TA)?k:m; float *b; if(!TB) b = random_matrix(k,n); else b = random_matrix(n,k); int ldb = (!TB)?n:k; float *c = random_matrix(m,n); int i; clock_t start = clock(), end; for(i = 0; i<10; ++i){ gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); } end = clock(); printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf ms\n",m,k,k,n, TA, TB, (float)(end-start)/CLOCKS_PER_SEC); free(a); free(b); free(c); }
void test_gpu_accuracy(int TA, int TB, int m, int k, int n) { srand(0); float *a; if(!TA) a = random_matrix(m,k); else a = random_matrix(k,m); int lda = (!TA)?k:m; float *b; if(!TB) b = random_matrix(k,n); else b = random_matrix(n,k); int ldb = (!TB)?n:k; float *c = random_matrix(m,n); float *c_gpu = random_matrix(m,n); memset(c, 0, m*n*sizeof(float)); memset(c_gpu, 0, m*n*sizeof(float)); int i; //pm(m,k,b); gemm_gpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c_gpu,n); //printf("GPU\n"); //pm(m, n, c_gpu); gemm_cpu(TA,TB,m,n,k,1,a,lda,b,ldb,1,c,n); //printf("\n\nCPU\n"); //pm(m, n, c); double sse = 0; for(i = 0; i < m*n; ++i) { //printf("%f %f\n", c[i], c_gpu[i]); sse += pow(c[i]-c_gpu[i], 2); } printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %g SSE\n",m,k,k,n, TA, TB, sse/(m*n)); free(a); free(b); free(c); free(c_gpu); }