示例#1
0
/*
 * Benchmark driver comparing two pipelines on N x N matrices of doubles.
 *
 * Reads N from stdin, allocates the host buffers, fills A and B through
 * initial(), then times two phases with clock():
 *   1. "cpu" phase: cublas_gemm() inside an OpenACC data region, followed by
 *      dot_cpu / axpy_cpu / norm_cpu / copy_cpu / scal_cpu on the host,
 *      producing D_cpu.
 *   2. "gpu" phase: gpu_cublas1() on device-resident data, after which D_gpu
 *      is copied back to the host.
 * Finally prints error(D_cpu, D_gpu, N2) and both timings.
 *
 * Returns 0 on success, 1 on invalid input or allocation failure.
 */
int main()
{
	/* Largest N whose square still fits in a 32-bit signed int: floor(sqrt(2^31 - 1)). */
	const int max_n = 46340;
	int N, N2;

	printf(" \n Input matrix size N x N, N = ");
	/* Without this check a failed parse would leave N uninitialized. */
	if (scanf("%d", &N) != 1) {
		fprintf(stderr, " invalid input: expected an integer matrix size \n");
		return 1;
	}
	printf(" N = %d \n \n", N);
	/* Reject sizes that are non-positive or would overflow N*N in int. */
	if (N <= 0 || N > max_n) {
		fprintf(stderr, " invalid matrix size: N must be between 1 and %d \n", max_n);
		return 1;
	}
	N2 = N*N;

	double *A, *B, *C_cpu, *C_gpu, *D_cpu, *D_gpu, t1, t2, cpu_time, gpu_time;
	double r_cpu, *r_gpu, nrmC_cpu, *nrmC_gpu;

	A = (double *) malloc(N2*sizeof(double));
	B = (double *) malloc(N2*sizeof(double));
	C_cpu = (double *) malloc(N2*sizeof(double));
	C_gpu = (double *) malloc(N2*sizeof(double));
	D_cpu = (double *) malloc(N2*sizeof(double));
	D_gpu = (double *) malloc(N2*sizeof(double));

	r_gpu = (double *) malloc(1*sizeof(double));
	nrmC_gpu = (double *) malloc(1*sizeof(double));

	if (!A || !B || !C_cpu || !C_gpu || !D_cpu || !D_gpu || !r_gpu || !nrmC_gpu) {
		fprintf(stderr, " out of memory allocating %d x %d matrices \n", N, N);
		/* free(NULL) is a no-op, so every pointer can be released unconditionally. */
		free(A);
		free(B);
		free(C_cpu);
		free(C_gpu);
		free(D_cpu);
		free(D_gpu);
		free(r_gpu);
		free(nrmC_gpu);
		return 1;
	}

	initial(A, B, N);

	t1 = clock();

	/* Structured data region: A and B are copied in, C_cpu is copied out at the closing brace. */
	#pragma acc data copyin(A[0:N2], B[0:N2]) copyout(C_cpu[0:N2])
	{
		cublas_gemm(A, B, C_cpu, N);
	}
	r_cpu = dot_cpu(C_cpu, B, N2);
	axpy_cpu(-1.0*r_cpu, B, C_cpu, N2);
	nrmC_cpu = norm_cpu(C_cpu, N2);
	copy_cpu(C_cpu, D_cpu, N2);
	scal_cpu(1.0/nrmC_cpu, D_cpu, N2);

	t2 = clock();
	cpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;

	t1 = clock();

	/*
	 * "enter data" is an unstructured directive: the device copies stay alive
	 * until a matching "exit data", not until the end of the next statement.
	 */
	#pragma acc enter data copyin(A[0:N2], B[0:N2]) create(C_gpu[0:N2], r_gpu[0], nrmC_gpu[0], D_gpu[0:N2])
	gpu_cublas1(A, B, C_gpu, D_gpu, r_gpu, nrmC_gpu, N, N2);
	#pragma acc update host(D_gpu[0:N2])

	t2 = clock();
	gpu_time = 1.0*(t2 - t1)/CLOCKS_PER_SEC;
	printf(" gpu part success \n");


	printf(" \n error = %f \n", error(D_cpu, D_gpu, N2));
	printf(" gpu time = %f, cpu times = %f \n", gpu_time, cpu_time);

	/* Release the device copies created by the "enter data" directive above. */
	#pragma acc exit data delete(A[0:N2], B[0:N2], C_gpu[0:N2], r_gpu[0], nrmC_gpu[0], D_gpu[0:N2])

	free(A);
	free(B);
	free(C_cpu);
	free(C_gpu);
	free(D_cpu);
	free(D_gpu);
	free(r_gpu);
	free(nrmC_gpu);

	return 0;
}
示例#2
0
文件: lsd.c 项目: kunle12/darknet
/*
 * Spherical interpolation between two n-element vectors, written to out.
 *
 * start, end: input vectors of n floats. The angle between them is taken as
 *             acos of their dot product, so they are presumably expected to
 *             be unit length (TODO confirm against callers).
 * s:          interpolation parameter; s = 0 weights start, s = 1 weights end.
 * n:          number of elements in start, end and out.
 * out:        destination, overwritten and then rescaled by 1/mag_array(out, n).
 *
 * Degenerate cases that previously produced NaN/inf are handled explicitly:
 *   - a dot product rounded slightly outside [-1, 1] is clamped before acos;
 *   - when sin(omega) is ~0 (vectors identical or opposite) the weights fall
 *     back to plain linear interpolation (1 - s, s);
 *   - a zero magnitude result is left unscaled instead of dividing by zero.
 */
void slerp(float *start, float *end, float s, int n, float *out)
{
    float cos_omega = dot_cpu(n, start, 1, end, 1);
    /* acos is only defined on [-1, 1]; float rounding can overshoot. */
    if (cos_omega > 1) cos_omega = 1;
    if (cos_omega < -1) cos_omega = -1;

    float omega = acos(cos_omega);
    float so = sin(omega);

    float w_start, w_end;
    if (fabs(so) < 1e-6f) {
        /* sin(omega) ~ 0: the slerp weights are 0/0, use their linear limit. */
        w_start = 1 - s;
        w_end = s;
    } else {
        w_start = sin((1-s)*omega)/so;
        w_end = sin(s*omega)/so;
    }

    fill_cpu(n, 0, out, 1);
    axpy_cpu(n, w_start, start, 1, out, 1);
    axpy_cpu(n, w_end, end, 1, out, 1);

    float mag = mag_array(out, n);
    /* Skip the rescale when the result has no magnitude to normalize by. */
    if (mag > 0) scale_array(out, n, 1./mag);
}
示例#3
0
/*
 * Compares a device array against a host reference and reports the error.
 *
 * x_gpu: device-side array, fetched into a temporary host buffer through
 *        cuda_pull_array().
 * x:     host-side reference array of n floats.
 * n:     number of elements to compare.
 * s:     label printed in the "Error <s>: <value>" line.
 *
 * Prints sqrt(err / n) and returns err, where err is dot_cpu of the
 * difference buffer with itself (presumably the sum of squared differences;
 * depends on the axpy_cpu/dot_cpu helpers defined elsewhere).
 * Returns 0 for n == 0 and -1 if the temporary buffer cannot be allocated.
 */
float cuda_compare(float *x_gpu, float *x, size_t n, char *s) {
	/* Nothing to compare; avoids calloc(0, ...) and the 0/0 in err / n. */
	if (n == 0) {
		printf("Error %s: %f\n", s, 0.0);
		return 0;
	}

	/* Explicit cast keeps this valid under C++/CUDA compilers as well as C. */
	float *tmp = (float *) calloc(n, sizeof(float));
	if (!tmp) {
		fprintf(stderr, "cuda_compare: failed to allocate %lu floats\n", (unsigned long) n);
		return -1;
	}

	cuda_pull_array(x_gpu, tmp, n);
	//int i;
	//for(i = 0; i < n; ++i) printf("%f %f\n", tmp[i], x[i]);
	axpy_cpu(n, -1, x, 1, tmp, 1);
	float err = dot_cpu(n, tmp, 1, tmp, 1);
	printf("Error %s: %f\n", s, sqrt(err / n));
	free(tmp);
	return err;
}
示例#4
0
/*
 * GPU forward pass of the cost layer.
 *
 * Does nothing when state.truth is null. For MASKED cost, mask_ongpu() is
 * applied to state.input using state.truth first. Then state.truth is copied
 * into l.delta_gpu and state.input is accumulated into it with a factor of -1
 * (presumably yielding truth - input; depends on axpy_ongpu). The result is
 * pulled into l.delta on the host and its dot product with itself is stored
 * in l.output[0].
 */
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    const int count = l.batch*l.inputs;

    if (state.truth) {
        if (l.cost_type == MASKED) {
            mask_ongpu(count, state.input, state.truth);
        }

        copy_ongpu(count, state.truth, 1, l.delta_gpu, 1);
        axpy_ongpu(count, -1, state.input, 1, l.delta_gpu, 1);

        /* The reduction runs on the host, so bring the delta back first. */
        cuda_pull_array(l.delta_gpu, l.delta, count);
        l.output[0] = dot_cpu(count, l.delta, 1, l.delta, 1);
    }
}
示例#5
0
/*
 * CPU forward pass of the cost layer.
 *
 * Does nothing when state.truth is null. For MASKED cost, every input element
 * whose truth value equals 0 is overwritten with 0. Then state.truth is copied
 * into l.delta and state.input is accumulated into it with a factor of -1
 * (presumably yielding truth - input; depends on axpy_cpu). The dot product of
 * l.delta with itself is stored in l.output[0].
 */
void forward_cost_layer(cost_layer l, network_state state)
{
    const int total = l.batch*l.inputs;
    int idx;

    if (!state.truth) return;

    if (l.cost_type == MASKED) {
        for (idx = 0; idx < total; ++idx) {
            if (state.truth[idx] != 0) continue;
            state.input[idx] = 0;
        }
    }

    copy_cpu(total, state.truth, 1, l.delta, 1);
    axpy_cpu(total, -1, state.input, 1, l.delta, 1);
    l.output[0] = dot_cpu(total, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}
示例#6
0
/*
 * GPU forward pass of the cost layer, with SMOOTH cost support.
 *
 * Does nothing when state.truth is null. For MASKED cost, mask_ongpu() is
 * applied to state.input with SECRET_NUM and state.truth. For SMOOTH cost the
 * delta is produced by smooth_l1_gpu(); otherwise state.truth is copied into
 * l.delta_gpu and state.input is accumulated into it with a factor of -1.
 * The delta is then pulled into l.delta on the host and its dot product with
 * itself is stored in l.output[0].
 */
void forward_cost_layer_gpu(cost_layer l, network_state state)
{
    const int count = l.batch*l.inputs;

    if (!state.truth) return;

    if (l.cost_type == MASKED) {
        mask_ongpu(count, state.input, SECRET_NUM, state.truth);
    }

    if (l.cost_type != SMOOTH) {
        copy_ongpu(count, state.truth, 1, l.delta_gpu, 1);
        axpy_ongpu(count, -1, state.input, 1, l.delta_gpu, 1);
    } else {
        smooth_l1_gpu(count, state.input, state.truth, l.delta_gpu);
    }

    /* The reduction runs on the host, so bring the delta back first. */
    cuda_pull_array(l.delta_gpu, l.delta, count);
    l.output[0] = dot_cpu(count, l.delta, 1, l.delta, 1);
}
示例#7
0
/*
 * CPU forward pass of the cost layer, with SMOOTH cost support.
 *
 * Does nothing when state.truth is null. For MASKED cost, every input element
 * whose truth value equals SECRET_NUM is overwritten with SECRET_NUM. For
 * SMOOTH cost the delta is produced by smooth_l1_cpu(); otherwise state.truth
 * is copied into l.delta and state.input is accumulated into it with a factor
 * of -1. The dot product of l.delta with itself is stored in l.output[0].
 */
void forward_cost_layer(cost_layer l, network_state state)
{
    const int total = l.batch*l.inputs;
    int idx;

    if (!state.truth) return;

    if (l.cost_type == MASKED) {
        for (idx = 0; idx < total; ++idx) {
            if (state.truth[idx] != SECRET_NUM) continue;
            state.input[idx] = SECRET_NUM;
        }
    }

    if (l.cost_type != SMOOTH) {
        copy_cpu(total, state.truth, 1, l.delta, 1);
        axpy_cpu(total, -1, state.input, 1, l.delta, 1);
    } else {
        smooth_l1_cpu(total, state.input, state.truth, l.delta);
    }

    l.output[0] = dot_cpu(total, l.delta, 1, l.delta, 1);
    //printf("cost: %f\n", *l.output);
}