Beispiel #1
0
void CompareLocale() {
  json::Value output;
  for (auto& pair : compareList) {
    json::Value lhs, rhs;
    File flhs("locale" / pair.name);
    File frhs("locale_diff" / pair.name);
    if (!frhs) continue;
    if (flhs) json::parse(flhs, lhs, json::mJSCall);
    json::parse(frhs, rhs, json::mJSCall);
    pair.func(lhs, rhs, output);
  }
  json::write(File("locale_diff/stringlist.js", "w"), output, json::mJSON);
}
void main(int argc, char** argv)
{
	//int k = atoi(argv[1]);	
	//int  N  = pow(2,k);
	int	N=1024;
	int	k=10;
	
	float * a = (float *) malloc(sizeof(float)*N* N * 2);
	float * b = (float *) malloc(sizeof(float) *N*N * 2);
	float * c = (float *) malloc(sizeof(float) * N*N* 2);
	float p = 2*M_PI ;	
	for (int i =0; i< N*N; i++)
	{
		a[2*i] = 1;
		a[2*i+1] = 0;
		b[2*i] = 1;
		b[2*i+1] = 0;
	}
#if 0 
	srand(1);
	for(int i =0;i<N*N;i++)
	{	
		a[2*i]=sin(i%N *2 *M_PI);
		//printf("%f\n",uu[2*i]);
		a[2*i+1] =0 ;
	}
#endif
	print_platforms_devices();

	cl_context ctx;
	cl_command_queue queue;
	create_context_on("NVIDIA","GeForce GTX 590",0,&ctx,&queue,0);

	cl_int status;
	cl_mem buf_a = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_b = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float)  * N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");
	
	cl_mem buf_c = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) * N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_d = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float)*N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");
	cl_mem buf_e = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_f = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	cl_mem buf_g = clCreateBuffer(ctx, CL_MEM_READ_WRITE, 
	sizeof(float) *N *N* 2 , 0, &status);
	CHECK_CL_ERROR(status, "clCreateBuffer");

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_a, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float) *N*N*2, a,
	0, NULL, NULL));

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float) *N *N* 2, b,
	0, NULL, NULL));

	CALL_CL_GUARDED(clEnqueueWriteBuffer, (
	queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0,
	sizeof(float)  *N* N*2, c,
	0, NULL, NULL));

	char *knl_text = read_file("vec_add.cl");
	cl_kernel vec_add = kernel_from_string(ctx, knl_text, "sum", NULL);
	free(knl_text);

	knl_text = read_file("mat_etr_mul.cl");
	cl_kernel mat_etr_mul = kernel_from_string(ctx, knl_text, "mult", NULL);
	free(knl_text);


	knl_text = read_file("radix-4-float.cl");
	cl_kernel fft1D = kernel_from_string(ctx, knl_text, "fft1D", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-init.cl");
	cl_kernel fft_init = kernel_from_string(ctx, knl_text, "fft1D_init", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-interm.cl");
	cl_kernel fft_interm = kernel_from_string(ctx, knl_text, "fft1D", NULL);
	free(knl_text);

	knl_text = read_file("transpose-soln-gpu.cl");
	cl_kernel mat_trans = kernel_from_string(ctx, knl_text, "transpose", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-modi.cl");
	cl_kernel fft_init_w = kernel_from_string(ctx, knl_text, "fft1D_init", NULL);
	free(knl_text);

	knl_text = read_file("vec_zero.cl");
	cl_kernel vec_zero = kernel_from_string(ctx, knl_text, "zero", NULL);
	free(knl_text);

	knl_text = read_file("reduction.cl");
	cl_kernel reduct_mul = kernel_from_string(ctx, knl_text, "reduction_mult", NULL);
	free(knl_text);

	knl_text = read_file("reduction1D.cl");
	cl_kernel reduct = kernel_from_string(ctx, knl_text, "reduction", NULL);
	free(knl_text);

	knl_text = read_file("reduction-init.cl");
	cl_kernel reduct_init = kernel_from_string(ctx, knl_text, "reduction_init", NULL);
	free(knl_text);


	knl_text = read_file("reduct-energy.cl");
	cl_kernel reduct_eng = kernel_from_string(ctx, knl_text, "reduction_eng", NULL);
	free(knl_text);

	knl_text = read_file("resid.cl");
	cl_kernel resid = kernel_from_string(ctx, knl_text, "resid", NULL);
	free(knl_text);

	knl_text = read_file("resid-init.cl");
	cl_kernel resid_init = kernel_from_string(ctx, knl_text, "resid_init", NULL);
	free(knl_text);


	knl_text = read_file("radix-4-big.cl");
	cl_kernel fft_big = kernel_from_string(ctx, knl_text, "fft1D_big", NULL);
	free(knl_text);
	knl_text = read_file("radix-4-big-clean.cl");
	cl_kernel fft_clean = kernel_from_string(ctx, knl_text, "fft1D_clean", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-2D.cl");
	cl_kernel fft_2D = kernel_from_string(ctx, knl_text, "fft2D_big", NULL);
	free(knl_text);

	knl_text = read_file("radix-4-2D-clean.cl");
	cl_kernel fft_2D_clean = kernel_from_string(ctx, knl_text, "fft2D_clean", NULL);
	free(knl_text);


	knl_text = read_file("mat-trans-3D.cl");
	cl_kernel mat_trans_3D = kernel_from_string(ctx, knl_text, "transpose_3D", NULL);
	free(knl_text);
	int Ns =1 ;
	int direction = 1;
	timestamp_type time1, time2;
	
	struct parameter param;

	param.N = N;
	param.epsilon = 0.1;
	param.s =1;
	
	float kk =1e-4;



	param.h = 2*PI/N;
	param.N = N;
	
 	param.maxCG = 1000;
	param.maxN = 5;
	
	//Minimum and starting time step
	float mink = 1e-7;
	float startk = 1e-4;

	// Tolerances
	param.Ntol = 1e-4;
	param.cgtol = 1e-7;
	float ksafety = 0.8;
	float kfact = 1.3;
	float kfact2 = 1/1.3;
	float Nfact = 0.7;
	float CGfact = 0.7;
	double elapsed ;

	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);

//for(int s=0;s<100;s++)
	//fft_1D_big(buf_a,buf_b,buf_c,N,fft_big,fft_clean,mat_trans,queue,direction,0);
	//fft_1D_new(buf_a,buf_b,buf_c,N,fft_init,fft_interm, fft1D,queue,direction,0);
	//fft_1D(buf_a,buf_b,buf_c,N,fft_init, fft1D,queue,direction,0);
	//fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1);
	//fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1);
	//fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction);
	//fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean,
			//mat_trans,mat_trans_3D,queue,direction);
	//fft_w(buf_a,buf_b,buf_c,buf_d,buf_e,N,0.1,0,1,fft_init_w,fft_init,fft1D,mat_trans,queue);
#if 0
	frhs(buf_a,buf_b,buf_c,buf_d,buf_e,&param,fft1D_init,fft1D,mat_trans,
		 vec_add, queue);
#endif
#if 0	
	float E1 = energy(buf_a, buf_b, buf_c,buf_d, buf_e,buf_f,1e-4, 
				&param, fft_init,fft1D,mat_trans,reduct_eng,
				reduct,queue);
#endif

	
	//float reside = residual(buf_a,buf_b,resid,resid_init,queue,N*N);
	/*fft_d_q(buf_a,buf_b,buf_c,buf_d, N,0.1,k ,1, 
		 fft1D_init,
		fft1D,mat_trans,queue);*/
	//for(int j= 0;j<N;j++)
	//{
		//fft_1D_w_orig(buf_a,buf_b,buf_c,N,fft1D_init,fft1D,queue,1,j);
	//}
	//fft_shar(buf_a,buf_b,buf_c,buf_d,N,0.1,0,1,fft1D_init,fft1D,mat_trans,queue);
	//mat__trans(buf_a,buf_b,N,mat_trans,queue,4,0.1,0,1);
	//double elapsed = reduction_mult(buf_a, buf_b,buf_c,N*N,reduct_mul,reduct,queue);

	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft_1D_big(buf_a,buf_b,buf_c,N*N,fft_big,fft_clean,mat_trans,queue,direction,0);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Hierarchy 1D FFT of size %d  array  on gpu takes %f s\n", N*N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");

	


	CALL_CL_GUARDED(clFinish, (queue));

	
	get_timestamp(&time1);
	fft2D(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft1D,mat_trans,queue, 1);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Navie 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");
	//printf("data access from global achieve %f GB/s\n",sizeof(float)*2*16*N*N/elapsed*1e-9);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_new(buf_a,buf_b,buf_c,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, 1);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("local data exchange 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");


	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_big(buf_a,buf_b,buf_c,buf_d,N,fft_big,fft_clean,mat_trans,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Hierarchy 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");


	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time1);
	fft2D_big_new(buf_a,buf_b,buf_c,buf_d,N,fft_2D,fft_2D_clean,
			mat_trans,mat_trans_3D,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	printf("Using 2D kernel 2D FFT of size %d * %d matrix  on gpu takes %f s\n", N,N,elapsed);
	printf("achieve %f GFLOPS \n",6*2*N*N*k/elapsed*1e-9);
	printf("---------------------------------------------\n");



	get_timestamp(&time1);






	direction = -1;
	//fft_1D(buf_b,buf_c,buf_d,N,fft_init, fft1D,queue,direction,0);
	fft2D(buf_b,buf_c,buf_d,buf_e,N,fft_init,fft1D,mat_trans,queue, direction);
	//fft2D_new(buf_b,buf_c,buf_e,buf_d,N,fft_init,fft_interm,fft1D,mat_trans,queue, -1);
	//fft2D_big(buf_b,buf_c,buf_d,buf_e,N,fft_big,fft_clean,mat_trans,queue,direction);
	CALL_CL_GUARDED(clFinish, (queue));
	get_timestamp(&time2);
	elapsed = timestamp_diff_in_seconds(time1,time2);
	//printf("1D inverse %f s\n", elapsed);
	#if 0
	float test;
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, buf_b, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		sizeof(float), &test,
        	0, NULL, NULL));
	

		printf("test success and %f \n",test);		
	

	#endif
	#if 0
	CALL_CL_GUARDED(clFinish, (queue));
	CALL_CL_GUARDED(clEnqueueReadBuffer, (
        	queue, buf_c, /*blocking*/ CL_TRUE, /*offset*/ 0,
       		2*N*N* sizeof(float), c,
        	0, NULL, NULL));
	

	/*for(int i =0; i<  N; i++)
	{
		printf("a%f+ i*",a[2*i]);		
		printf("%f\n",a[2*i+1]);
	}*/
	int T = 10<N? 10:N ;
	for(int i =0; i<  T; i++)
	{
		printf("%f + i*",a[2*i]);		
		printf("%f\t",a[2*i+1]);
		printf("%f + i*",c[2*i]);		
		printf("%f\n",c[2*i+1]);
	}

	#endif 
/*	for( Ns = 1;Ns < N; Ns *= 2 )
	{
		for (int j = 0; j<N/2; j++)
		{
			fftiteration(j,N,Ns,a,b);
		}
		float * d;
		d = a ;
		a = b;
		b = d;
		//printf("ok\n");

	}

*/


	
	CALL_CL_GUARDED(clReleaseMemObject, (buf_a));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_b));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_c));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_d));
	CALL_CL_GUARDED(clReleaseMemObject, (buf_e));
	CALL_CL_GUARDED(clReleaseKernel, (fft1D));
	CALL_CL_GUARDED(clReleaseKernel, (fft_init));
	CALL_CL_GUARDED(clReleaseKernel, (vec_add));
	CALL_CL_GUARDED(clReleaseKernel, (reduct_mul));
	CALL_CL_GUARDED(clReleaseKernel, (reduct));
	CALL_CL_GUARDED(clReleaseKernel, (mat_trans));
	CALL_CL_GUARDED(clReleaseCommandQueue, (queue));
	CALL_CL_GUARDED(clReleaseContext, (ctx));

}
/*---------------------------------RK5------------------------------*/
/*
 * Performs one adaptive step of an embedded 5th/4th-order Runge-Kutta scheme
 * (the coefficients below match the Dormand-Prince 5(4) tableau).
 *
 * yold               state at time t (read only)
 * ynew               receives the state at time t + (step actually used)
 * h                  in: step size to try; out: suggested size for the next
 *                    step. A rejected trial divides h by 5 and retries.
 * t                  current time
 * numberofequations  number of entries in yold / ynew
 * frhs               right-hand side, called as frhs(y, t, dydt)
 *
 * Returns the step size that was actually used for the accepted step.
 * For numberofequations <= 0 nothing is done and h is returned unchanged.
 *
 * If h drops below 1e-10 while retrying, a diagnostic is written to stderr
 * and the process is terminated with EXIT_FAILURE. A failed allocation of the
 * work array surfaces as the exception thrown by operator new[].
 */
double ode5(double yold[], double ynew[], double &h, double t, 
		//&h - Call by reference - Increments h in main
        int numberofequations, void frhs(double[], double, double[])) {
    int i;
    const int n = numberofequations;

    //Values from Numerical Recipes

    const double c2 = 0.2, c3 = 0.3, c4 = 0.8, c5 = 8.0 / 9.0, //Constants
            a21 = 0.2, //Coefficients
            a31 = 3.0 / 40.0, a32 = 9.0 / 40.0,
            a41 = 44.0 / 45.0, a42 = -56.0 / 15.0, a43 = 32.0 / 9.0,

            a51 = 19372.0 / 6561.0, a52 = -25360.0 / 2187.0,
            a53 = 64448.0 / 6561.0, a54 = -212.0 / 729.0,

            a61 = 9017.0 / 3168.0, a62 = -355.0 / 33.0, a63 = 46732.0 / 5247.0,
            a64 = 49.0 / 176.0, a65 = -5103.0 / 18656.0,

            a71 = 35.0 / 384.0, a73 = 500.0 / 1113.0, a74 = 125.0 / 192.0,
            a75 = -2187.0 / 6784.0, a76 = 11.0 / 84.0;
    /*Error value coefficients*/
    const double e1 = 71.0 / 57600.0, e3 = -71.0 / 16695.0, e4 = 71.0 / 1920.0,
            e5 = -17253.0 / 339200.0, e6 = 22.0 / 525.0, e7 = -1.0 / 40.0;

    const double hmin = 1.0e-10;
    //Minimum size to avoid getting stuck in an infinite loop
    const double tolerable_error = 1.0e-7; //Error limit
    const double error_min = 1.0e-6; //Error limit for scale
    //Growth applied when the error estimate is exactly zero; mirrors the
    //factor of 5 used when a step is rejected.
    const double growth_when_exact = 5.0;

    //Without any equation there is nothing to integrate, and the error
    //search below would read past the (empty) work array.
    if (n <= 0) {
        return h;
    }

    //One allocation sliced into ten arrays of n entries each. Plain new[]
    //reports failure by throwing, so no null check is needed here.
    double *k1 = new double[10 * n];
    double *k2 = k1 + n;
    double *k3 = k2 + n;
    double *k4 = k3 + n;
    double *k5 = k4 + n;
    double *k6 = k5 + n;
    double *temp = k6 + n;
    double *ynewstar = temp + n; //For the embedded 4th order
    double *error = ynewstar + n; //For the error array
    double *scale = error + n; //For the scale array

    frhs(yold, t, k1); // Get the RHS (independent of h, so computed once)

    double h_remember; //Step size of the trial currently in progress
    double max_error_i; //To find maximum error

    for (;;) {
        h_remember = h;

        for (i = 0; i < n; i++)//Step 1
        {
            temp[i] = yold[i] + a21 * h * k1[i];
        }
        frhs(temp, t + c2*h, k2); //Step 2

        for (i = 0; i < n; i++) {
            temp[i] = yold[i] + h * (a31 * k1[i] + a32 * k2[i]);
        }
        frhs(temp, t + c3*h, k3); //Step 3

        for (i = 0; i < n; i++) {
            temp[i] = yold[i] + h * (a41 * k1[i] + a42 * k2[i] + a43 * k3[i]);
        }
        frhs(temp, t + c4*h, k4); //Step 4

        for (i = 0; i < n; i++) {
            temp[i] = yold[i] + h * (a51 * k1[i] + a52 * k2[i] +
                    a53 * k3[i] + a54 * k4[i]);
        }
        frhs(temp, t + c5*h, k5); //Step 5

        for (i = 0; i < n; i++) {
            temp[i] = yold[i] + h * (a61 * k1[i] + a62 * k2[i] + a63 * k3[i]
                                    + a64 * k4[i] + a65 * k5[i]);
        }
        frhs(temp, t + h, k6); //Step 6

        for (i = 0; i < n; i++) {
            ynew[i] = yold[i] + h * (a71 * k1[i] + a73 * k3[i] + a74 * k4[i]
                                    + a75 * k5[i] + a76 * k6[i]);
        }

        frhs(ynew, t + h, ynewstar); //Final

        for (i = 0; i < n; i++)//to set up the scale
            scale[i] = fabs(yold[i]) + fabs(h * k1[i]) + fabs(error_min);

        for (i = 0; i < n; i++)//Estimate error
        {
            error[i] = (h * (e1 * k1[i] + e3 * k3[i] + e4 * k4[i]
                    + e5 * k5[i] + e6 * k6[i] + e7 * ynewstar[i])) / scale[i];
        }

        //Finding the maximum error
        max_error_i = fabs(error[0]);
        for (i = 1; i < n; i++) {
            if (fabs(error[i]) > max_error_i) {
                max_error_i = fabs(error[i]);
            }
        }

        if (max_error_i > tolerable_error) {
            //Trial rejected: shrink the step and redo all stages.
            h = h / 5;

            if (h < hmin) {
                fprintf(stderr,
                        "ode5(): step size %g fell below the minimum %g at t = %g\n",
                        h, hmin, t);
                delete[] k1;
                exit(EXIT_FAILURE);
            }
            continue;
        }
        break; //Trial accepted
    }

    if (max_error_i < tolerable_error) {
        if (max_error_i > 0.0) {
            h = h * (pow(tolerable_error / max_error_i, 0.2));
        } else {
            //A zero error estimate would divide by zero above and turn h
            //into infinity; grow by a bounded factor instead.
            h = h * growth_when_exact;
        }
    }

    delete[] k1; //Free memory
    return (h_remember);
}//end ode5