Example #1
0
/*
 * Backward pass for a fully connected layer on the GPU.
 *
 * Bounds the incoming deltas (constrain_ongpu with limit 5), scales them
 * by the activation gradient, accumulates bias and weight gradients, and
 * propagates the delta to the previous layer when state.delta is set.
 */
void backward_connected_layer_gpu(connected_layer l, network_state state)
{
    int total = l.outputs*l.batch;
    int b;

    constrain_ongpu(total, 5, l.delta_gpu, 1);
    gradient_array_ongpu(l.output_gpu, total, l.activation, l.delta_gpu);

    /* Bias gradients: sum the deltas over the batch, one axpy per item. */
    for(b = 0; b < l.batch; ++b){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + b*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    if(l.batch_normalize) backward_batchnorm_layer_gpu(l, state);

    /* Weight gradients: delta^T * input, accumulated (beta = 1). */
    gemm_ongpu(1, 0, l.outputs, l.inputs, l.batch, 1,
               l.delta_gpu, l.outputs,
               state.input, l.inputs,
               1, l.weight_updates_gpu, l.inputs);

    /* Delta for the previous layer: delta * weights, accumulated. */
    if(state.delta){
        gemm_ongpu(0, 0, l.batch, l.inputs, l.outputs, 1,
                   l.delta_gpu, l.outputs,
                   l.weights_gpu, l.inputs,
                   1, state.delta, l.inputs);
    }
}
/*
 * Forward pass of a locally connected layer on the GPU (stream-aware).
 * Every spatial location has its own filter bank, so one small
 * (l.n x 1) GEMM is issued per location instead of a single large one.
 * All work is queued on state.st_handle.stream.
 */
void forward_local_layer_gpu(const local_layer l, network_state state)
{
	int out_h = local_out_height(l);
	int out_w = local_out_width(l);
	int locations = out_h*out_w;
	int filter_len = l.size*l.size*l.c;   /* elements in one filter column */
	int b, loc;

	/* Seed each output map with the biases. */
	for(b = 0; b < l.batch; ++b){
		copy_ongpu(l.outputs, l.biases_gpu, 1, l.output_gpu + b*l.outputs, 1, state.st_handle.stream);
	}

	for(b = 0; b < l.batch; ++b){
		/* Unfold this input image into columns. */
		im2col_ongpu(state.input + b*l.w*l.h*l.c, l.c, l.h, l.w,
				l.size, l.stride, l.pad, l.col_image_gpu, state.st_handle.stream);

		float *out = l.output_gpu + b*l.outputs;
		/* One GEMM per output location, each with its own weights. */
		for(loc = 0; loc < locations; ++loc){
			gemm_ongpu(0, 0, l.n, 1, filter_len, 1,
					l.weights_gpu + loc*filter_len*l.n, filter_len,
					l.col_image_gpu + loc, locations,
					1, out + loc, locations, state.st_handle);
		}
	}

	activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, state.st_handle.stream);
}
Example #3
0
/*
 * Benchmark gemm_ongpu on an (m x k) * (k x n) product and print GFLOPS.
 *
 * TA/TB select transposed operands; the leading dimensions are derived
 * from the transpose flags the same way the gemm wrappers expect.
 * Runs `iter` multiplies, synchronizing after each so the wall clock
 * covers the actual kernel time, then frees all host and device buffers.
 */
void time_ongpu(int TA, int TB, int m, int k, int n)
{
    int iter = 10;
    float *a = random_matrix(m,k);
    float *b = random_matrix(k,n);
    float *c = random_matrix(m,n);

    int lda = (!TA)?k:m;
    int ldb = (!TB)?n:k;

    float *a_cl = cuda_make_array(a, m*k);
    float *b_cl = cuda_make_array(b, k*n);
    float *c_cl = cuda_make_array(c, m*n);

    int i;
    clock_t start = clock(), end;
    for(i = 0; i < iter; ++i){
        gemm_ongpu(TA,TB,m,n,k,1,a_cl,lda,b_cl,ldb,1,c_cl,n);
        /* cudaThreadSynchronize() is deprecated (removed in CUDA 10);
         * cudaDeviceSynchronize() is the supported replacement. */
        cudaDeviceSynchronize();
    }
    /* Sample the clock immediately after the loop, before any host-side
     * bookkeeping, so the arithmetic below is not counted as GPU time. */
    end = clock();
    double seconds = sec(end-start);

    double flop = ((double)m)*n*(2.*k + 2.)*iter;
    double gflop = flop/pow(10., 9);
    printf("Matrix Multiplication %dx%d * %dx%d, TA=%d, TB=%d: %lf s, %lf GFLOPS\n",m,k,k,n, TA, TB, seconds, gflop/seconds);
    cuda_free(a_cl);
    cuda_free(b_cl);
    cuda_free(c_cl);
    free(a);
    free(b);
    free(c);
}
/*
 * Backward pass of a locally connected layer on the GPU (stream-aware
 * variant: every call runs on state.st_handle.stream).
 *
 * Computes bias gradients, per-location weight gradients, and — when
 * state.delta is set — the gradient w.r.t. the layer input via col2im.
 */
void backward_local_layer_gpu(local_layer l, network_state state)
{
	int locations = l.out_w*l.out_h;
	int filter_len = l.size*l.size*l.c;   /* elements per filter column */
	int b, loc;

	/* delta *= activation'(output) */
	gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu, state.st_handle.stream);

	/* Bias gradients accumulate across the batch. */
	for(b = 0; b < l.batch; ++b){
		axpy_ongpu(l.outputs, 1, l.delta_gpu + b*l.outputs, 1, l.bias_updates_gpu, 1, state.st_handle.stream);
	}

	for(b = 0; b < l.batch; ++b){
		float *delta = l.delta_gpu + b*l.outputs;

		/* Rebuild the im2col buffer for this input image. */
		im2col_ongpu(state.input + b*l.w*l.h*l.c, l.c, l.h, l.w,
				l.size, l.stride, l.pad, l.col_image_gpu, state.st_handle.stream);

		/* Per-location weight gradients (rank-1 updates, beta = 1). */
		for(loc = 0; loc < locations; ++loc){
			gemm_ongpu(0, 1, l.n, filter_len, 1, 1,
					delta + loc, locations,
					l.col_image_gpu + loc, locations,
					1, l.weight_updates_gpu + loc*filter_len*l.n, filter_len, state.st_handle);
		}

		if(state.delta){
			/* Input gradients overwrite the column buffer (beta = 0),
			 * then col2im folds it back into image layout. */
			for(loc = 0; loc < locations; ++loc){
				gemm_ongpu(1, 0, filter_len, 1, l.n, 1,
						l.weights_gpu + loc*filter_len*l.n, filter_len,
						delta + loc, locations,
						0, l.col_image_gpu + loc, locations, state.st_handle);
			}

			col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + b*l.c*l.h*l.w, state.st_handle.stream);
		}
	}
}
/*
 * Backward pass of a locally connected layer on the GPU (default-stream
 * variant using the filters_gpu / filter_updates_gpu field names).
 */
void backward_local_layer_gpu(local_layer l, network_state state)
{
    int i, j;
    int locations = l.out_w*l.out_h;
    int filter_size = l.size*l.size*l.c;

    /* delta *= activation'(output) */
    gradient_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation, l.delta_gpu);

    /* Bias gradients: sum deltas over the batch. */
    for(i = 0; i < l.batch; ++i){
        axpy_ongpu(l.outputs, 1, l.delta_gpu + i*l.outputs, 1, l.bias_updates_gpu, 1);
    }

    for(i = 0; i < l.batch; ++i){
        /* Unfold this input image into the shared column buffer. */
        im2col_ongpu(state.input + i*l.w*l.h*l.c, l.c, l.h, l.w,
                l.size, l.stride, l.pad, l.col_image_gpu);

        /* Filter gradients: one rank-1 GEMM per output location. */
        for(j = 0; j < locations; ++j){
            gemm_ongpu(0, 1, l.n, filter_size, 1, 1,
                    l.delta_gpu + i*l.outputs + j, locations,
                    l.col_image_gpu + j, locations,
                    1, l.filter_updates_gpu + j*filter_size*l.n, filter_size);
        }

        if(state.delta){
            /* Input gradients: overwrite the column buffer (beta = 0),
             * then scatter it back to image layout with col2im. */
            for(j = 0; j < locations; ++j){
                gemm_ongpu(1, 0, filter_size, 1, l.n, 1,
                        l.filters_gpu + j*filter_size*l.n, filter_size,
                        l.delta_gpu + i*l.outputs + j, locations,
                        0, l.col_image_gpu + j, locations);
            }

            col2im_ongpu(l.col_image_gpu, l.c, l.h, l.w, l.size, l.stride, l.pad, state.delta + i*l.c*l.h*l.w);
        }
    }
}
Example #6
0
/*
 * Host-memory wrapper around gemm_ongpu.
 *
 * Copies A, B and C from host to device, computes
 * C = ALPHA * op(A) * op(B) + BETA * C (op() selected by TA/TB),
 * pulls the result back into the host C, and frees all device buffers.
 */
void gemm_gpu(int TA, int TB, int M, int N, int K, float ALPHA,
        float *A, int lda,
        float *B, int ldb,
        float BETA,
        float *C, int ldc)
{
    /* Element counts depend on whether each operand is transposed. */
    int a_len = TA ? lda*K : lda*M;
    int b_len = TB ? ldb*N : ldb*K;

    float *A_gpu = cuda_make_array(A, a_len);
    float *B_gpu = cuda_make_array(B, b_len);
    float *C_gpu = cuda_make_array(C, ldc*M);

    gemm_ongpu(TA, TB, M, N, K, ALPHA, A_gpu, lda, B_gpu, ldb, BETA, C_gpu, ldc);

    cuda_pull_array(C_gpu, C, ldc*M);
    cuda_free(A_gpu);
    cuda_free(B_gpu);
    cuda_free(C_gpu);
}
/*
 * Forward pass of a fully connected layer on the GPU:
 * output = activation(batchnorm?(input * W^T) + bias).
 */
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    int b;

    /* Zero the output; the GEMM below accumulates into it (beta = 1). */
    fill_ongpu(l.outputs*l.batch, 0, l.output_gpu, 1);

    /* output += input * weights^T */
    gemm_ongpu(0, 1, l.batch, l.outputs, l.inputs, 1,
               state.input, l.inputs,
               l.weights_gpu, l.inputs,
               1, l.output_gpu, l.outputs);

    if(l.batch_normalize) forward_batchnorm_layer_gpu(l, state);

    /* Add the bias vector to every row of the batch. */
    for(b = 0; b < l.batch; ++b){
        axpy_ongpu(l.outputs, 1, l.biases_gpu, 1, l.output_gpu + b*l.outputs, 1);
    }

    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);
}
Example #8
0
/*
 * Forward pass of a fully connected layer on the GPU (variant that seeds
 * the output with biases via copy_ongpu_offset instead of a later axpy).
 */
void forward_connected_layer_gpu(connected_layer l, network_state state)
{
    int b;

    /* Broadcast the bias vector into every row of the output buffer. */
    for(b = 0; b < l.batch; ++b){
        copy_ongpu_offset(l.outputs, l.biases_gpu, 0, 1, l.output_gpu, b*l.outputs, 1);
    }

    /* output += input * weights^T (beta = 1 keeps the seeded biases). */
    gemm_ongpu(0, 1, l.batch, l.outputs, l.inputs, 1,
               state.input, l.inputs,
               l.weights_gpu, l.inputs,
               1, l.output_gpu, l.outputs);

    activate_array_ongpu(l.output_gpu, l.outputs*l.batch, l.activation);

/*  Debug: pull outputs to the host and print their mean.
    cuda_pull_array(l.output_gpu, l.output, l.outputs*l.batch);
    float avg = mean_array(l.output, l.outputs*l.batch);
    printf("%f\n", avg);
    */
}