Exemplo n.º 1
0
int main(int argc, char* argv[])
{
    acc_set_device_num(0, acc_device_nvidia);

    // read command line arguments
    readcmdline(&options, argc, argv);
    int nx = options.nx;
    int ny = options.ny;
    int N  = options.N;
    int nt = options.nt;

    printf("========================================================================\n");
    printf("                      Welcome to mini-stencil!\n");
    printf("mesh :: %d * %d, dx = %f\n", nx, ny, options.dx);
    printf("time :: %d, time steps from 0 .. %f\n", nt, options.nt * options.dt);
    printf("========================================================================\n");

    // allocate global fields
    x_new = (double*) malloc(sizeof(double)*nx*ny);
    x_old = (double*) malloc(sizeof(double)*nx*ny); 
    bndN  = (double*) malloc(sizeof(double)*nx);
    bndS  = (double*) malloc(sizeof(double)*nx); 
    bndE  = (double*) malloc(sizeof(double)*ny); 
    bndW  = (double*) malloc(sizeof(double)*ny); 

    double* b      = (double*) malloc(N*sizeof(double));
    double* deltax = (double*) malloc(N*sizeof(double));

    // set dirichlet boundary conditions to 0 all around
    memset(x_old, 0, sizeof(double) * nx * ny);
    memset(bndN, 0, sizeof(double) * nx);
    memset(bndS, 0, sizeof(double) * nx);
    memset(bndE, 0, sizeof(double) * ny);
    memset(bndW, 0, sizeof(double) * ny);
    memset(deltax, 0, sizeof(double) * N);

    // set the initial condition
    // a circle of concentration 0.1 centred at (xdim/4, ydim/4) with radius
    // no larger than 1/8 of both xdim and ydim
    memset(x_new, 0, sizeof(double) * nx * ny);
    double xc = 1.0 / 4.0;
    double yc = (ny - 1) * options.dx / 4;
    double radius = fmin(xc, yc) / 2.0;
    int i,j;
    //
    for (j = 0; j < ny; j++)
    {
        double y = (j - 1) * options.dx;
        for (i = 0; i < nx; i++)
        {
            double x = (i - 1) * options.dx;
            if ((x - xc) * (x - xc) + (y - yc) * (y - yc) < radius * radius)
                //((double(*)[nx])x_new)[j][i] = 0.1;
                x_new[i+j*nx] = 0.1;
        }
    }

    flops_bc = 0;
    flops_diff = 0;
    flops_blas1 = 0;
    verbose_output = 0;
    iters_cg = 0;
    iters_newton = 0;

    // initialize temporary storage fields used by the cg solver
    // I do this here so that the fields are persistent between calls
    // to the CG solver. This is useful if we want to avoid malloc/free calls
    // on the device for the OpenACC implementation (feel free to suggest a better
    // method for doing this)
    printf("INITIALIZING CG STATE\n");
    Ap    = (double*) malloc(N*sizeof(double));
    r     = (double*) malloc(N*sizeof(double)); 
    p     = (double*) malloc(N*sizeof(double));
    Fx    = (double*) malloc(N*sizeof(double));
    Fxold = (double*) malloc(N*sizeof(double));
    v     = (double*) malloc(N*sizeof(double));
    xold  = (double*) malloc(N*sizeof(double));

    int cg_converged = 1;

    double timespent;

    // start timer
    timespent = -omp_get_wtime();

    // main timeloop
    double tolerance = 1.e-6;
    int timestep;

    for (timestep = 1; timestep <= nt; timestep++)
      {
	// set x_new and x_old to be the solution
	ss_copy(x_old, x_new, N);

	double residual;
	int    converged = 0;
	int    it = 1;
	for ( ; it <= 50; it++)
	  {
	    // compute residual : requires both x_new and x_old
	    diffusion(x_new, b);
	    residual = ss_norm2(b, N);

	    // check for convergence
	    if (residual < tolerance)
	      {
		converged = 1;
		break;
	      }

	    // solve linear system to get -deltax
	    ss_cg(deltax, b, 200, tolerance, &cg_converged);

	    // check that the CG solver converged
	    if (!cg_converged) break;

	    // update solution
	    ss_axpy(x_new, -1.0, deltax, N);
	  }
	iters_newton += it;

	// output some statistics
	//if (converged && verbose_output)
	if (converged && verbose_output)
	  printf("step %d required %d iterations for residual %E\n", timestep, it, residual);
	if (!converged)
	  {
	    fprintf(stderr, "step %d ERROR : nonlinear iterations failed to converge\n", timestep);
	    break;
	  }
      }

    // get times
    timespent += omp_get_wtime();
    unsigned long long flops_total = flops_diff + flops_blas1;

    ////////////////////////////////////////////////////////////////////
    // write final solution to BOV file for visualization
    ////////////////////////////////////////////////////////////////////

    // binary data
    {
        FILE* output = fopen("output.bin", "w");
        fwrite(x_new, sizeof(double), nx * ny, output);
        fclose(output);
    }

    // metadata
    {
        FILE* output = fopen("output.bov", "wb");
        fprintf(output, "TIME: 0.0\n");
        fprintf(output, "DATA_FILE: output.bin\n");
        fprintf(output, "DATA_SIZE: %d, %d, 1\n", nx, ny);
        fprintf(output, "DATA_FORMAT: DOUBLE\n");
        fprintf(output, "VARIABLE: phi\n");
        fprintf(output, "DATA_ENDIAN: LITTLE\n");
        fprintf(output, "CENTERING: nodal\n");
        //fprintf(output, "BYTE_OFFSET: 4\n");
        fprintf(output, "BRICK_SIZE: 1.0 %f 1.0\n", (ny - 1) * options.dx);
        fclose(output);
    }

    // print table sumarizing results
    printf("--------------------------------------------------------------------------------\n");
    printf("simulation took %f seconds (%f GFLOP/s)\n", timespent, flops_total / 1e9 / timespent);
    printf("%u conjugate gradient iterations\n", iters_cg);
    printf("%u newton iterations\n", iters_newton);
    printf("--------------------------------------------------------------------------------\n");

    // deallocate global fields
    free (x_new);
    free (x_old);
    free (bndN);
    free (bndS);
    free (bndE);
    free (bndW);

    printf("Goodbye!\n");

    return 0;
}
Exemplo n.º 2
0
int main(int argc, char* argv[])
{
    // read command line arguments
    readcmdline(&options, argc, argv);
    int nx = options.nx;
    int ny = options.ny;
    int N  = options.N;
    int nt = options.nt;

    printf("========================================================================\n");
    printf("                      Welcome to mini-stencil!\n");
    printf("mesh :: %d * %d, dx = %f\n", nx, ny, options.dx);
    printf("time :: %d, time steps from 0 .. %f\n", nt, options.nt * options.dt);
    printf("========================================================================\n");

    // allocate global fields
    x_new = (double*) malloc(sizeof(double)*nx*ny);
    x_old = (double*) malloc(sizeof(double)*nx*ny); 
    bndN  = (double*) malloc(sizeof(double)*nx);
    bndS  = (double*) malloc(sizeof(double)*nx); 
    bndE  = (double*) malloc(sizeof(double)*ny); 
    bndW  = (double*) malloc(sizeof(double)*ny); 
	
    double* b      = (double*) malloc(N*sizeof(double));
    double* deltax = (double*) malloc(N*sizeof(double));

	FILE *fp;
	char *source_str_diffusion_center,*source_str_diffusion_east_west,*source_str_diffusion_north_south_corners,*source_str_merged_blas1, *source_str_merged_blas2,*source_str_merged_blas3,*source_str_merged_blas4;
	size_t source_size[8];
	
	fp= fopen("operators.cl","r");
	if (!fp){
		printf("Failed to load kernel/ \n");
		exit(1);
	}
	source_str_diffusion_center=(char*)malloc(MAX_SOURCE_SIZE);
	source_size[0]=fread(source_str_diffusion_center,1,MAX_SOURCE_SIZE,fp);
	fclose(fp);
	
	fp= fopen("operators1.cl","r");
	if (!fp){
		printf("Failed to load kernel/ \n");
		exit(1);
	}
	source_str_diffusion_east_west=(char*)malloc(MAX_SOURCE_SIZE);
	source_size[1]=fread(source_str_diffusion_east_west,1,MAX_SOURCE_SIZE,fp);
	fclose(fp);
	
	fp= fopen("merged_blas1.cl","r");
	if (!fp){
		printf("Failed to load kernel/ \n");
		exit(1);
	}
	source_str_merged_blas1=(char*)malloc(MAX_SOURCE_SIZE);
	source_size[2]=fread(source_str_merged_blas1,1,MAX_SOURCE_SIZE,fp);
	fclose(fp);
	
	fp= fopen("merged_blas2.cl","r");
	if (!fp){
		printf("Failed to load kernel/ \n");
		exit(1);
	}
	source_str_merged_blas2=(char*)malloc(MAX_SOURCE_SIZE);
	source_size[3]=fread(source_str_merged_blas2,1,MAX_SOURCE_SIZE,fp);
	fclose(fp);
	
	fp= fopen("merged_blas3.cl","r");
	if (!fp){
		printf("Failed to load kernel/ \n");
		exit(1);
	}
	source_str_merged_blas3=(char*)malloc(MAX_SOURCE_SIZE);
	source_size[4]=fread(source_str_merged_blas3,1,MAX_SOURCE_SIZE,fp);
	fclose(fp);
	
	fp= fopen("merged_blas4.cl","r");
	if (!fp){
		printf("Failed to load kernel/ \n");
		exit(1);
	}
	source_str_merged_blas4=(char*)malloc(MAX_SOURCE_SIZE);
	source_size[5]=fread(source_str_merged_blas4,1,MAX_SOURCE_SIZE,fp);
	fclose(fp);
	
	fp= fopen("operators2.cl","r");
	if (!fp){
		printf("Failed to load kernel/ \n");
		exit(1);
	}
	source_str_diffusion_north_south_corners=(char*)malloc(MAX_SOURCE_SIZE);
	source_size[6]=fread(source_str_diffusion_north_south_corners,1,MAX_SOURCE_SIZE,fp);
	fclose(fp);
	
	
	
	cl_platform_id platform_id=NULL;
	cl_device_id device_id = NULL;
	cl_uint ret_num_devices;
	cl_uint ret_num_platforms;
	cl_int ret = clGetPlatformIDs (1,&platform_id, & ret_num_platforms);
	ret = clGetDeviceIDs (platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices);
	if (ret!=CL_SUCCESS)
		printf("NO DEVICES");
		
	cl_context context = clCreateContext (NULL, 1, &device_id, NULL, NULL, &ret);
	if (ret!=CL_SUCCESS)
		printf("NO CONTEXT");
	cl_command_queue command_queue=clCreateCommandQueue (context, device_id, 0 , &ret);
	if (ret!=CL_SUCCESS)
		printf("NO QUEUE");
	
	//initializing clblas
	cl_int err1 = clAmdBlasSetup();
    if (err1 != CL_SUCCESS) {
        printf("clAmdBlasSetup() failed with %d\n", err1);
        clReleaseCommandQueue(command_queue);
        clReleaseContext(context);
        return 1;
    }
	
	cl_event event = NULL;
	
	cl_mem x_new_device = clCreateBuffer(context, CL_MEM_READ_WRITE, nx*ny*sizeof(double), NULL, &ret);
	cl_mem x_old_device = clCreateBuffer(context, CL_MEM_READ_WRITE, nx*ny*sizeof(double), NULL, &ret);
	cl_mem b_device = clCreateBuffer(context, CL_MEM_READ_WRITE, N*sizeof(double), NULL, &ret);
	cl_mem deltax_device = clCreateBuffer(context, CL_MEM_READ_WRITE, N*sizeof(double), NULL, &ret);
	cl_mem residual_device = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double), NULL, &ret);
	cl_mem scratchBuff_nrm2 = clCreateBuffer(context, CL_MEM_READ_WRITE, 2*N*sizeof(double), NULL, &ret);
	cl_double residual;
	
	cl_mem bnd_device = clCreateBuffer(context, CL_MEM_READ_WRITE, (2*nx+2*ny)*sizeof(double), NULL, &ret);
	if (ret!=CL_SUCCESS)
		printf("NO BUFFERS");
		
    // set dirichlet boundary conditions to 0 all around
    memset(bndN, 0, sizeof(double) * nx);
    memset(bndS, 0, sizeof(double) * nx);
    memset(bndE, 0, sizeof(double) * ny);
    memset(bndW, 0, sizeof(double) * ny);

	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, 0 ,nx*sizeof(double), bndN,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, nx*sizeof(double) ,nx*sizeof(double), bndS,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, 2*nx*sizeof(double),ny*sizeof(double), bndW,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, 2*nx*sizeof(double)+ny*sizeof(double) ,ny*sizeof(double), bndE,0, NULL, NULL);
	if (ret!=CL_SUCCESS)
		printf("NO COPYING");
	
	//TODO::clEnqueueFillBuffer(command_queue,bndN_device,,0,nx*sizeof(double)NULL, NULL) - only for OpenCL 1.2
	
    // set the initial condition
    // a circle of concentration 0.1 centred at (xdim/4, ydim/4) with radius
    // no larger than 1/8 of both xdim and ydim
    memset(x_new, 0, sizeof(double) * nx * ny);
    double xc = 1.0 / 4.0;
    double yc = (ny - 1) * options.dx / 4;
    double radius = fmin(xc, yc) / 2.0;
    int i,j;
    //
    for (j = 0; j < ny; j++)
    {
        double y = (j - 1) * options.dx;
        for (i = 0; i < nx; i++)
        {
            double x = (i - 1) * options.dx;
            if ((x - xc) * (x - xc) + (y - yc) * (y - yc) < radius * radius)
                //((double(*)[nx])x_new)[j][i] = 0.1;
                x_new[i+j*nx] = 0.1;
        }
    }

	ret=clEnqueueWriteBuffer(command_queue, x_new_device, CL_TRUE, 0 ,nx*ny*sizeof(double), x_new,0, NULL, NULL);
	if (ret!=CL_SUCCESS)
		printf("NO COPYING");
	
		
    double time_in_bcs = 0.0;
    double time_in_diff = 0.0;
    flops_bc = 0;
    flops_diff = 0;
    flops_blas1 = 0;
    verbose_output = 0;
    iters_cg = 0;
    iters_newton = 0;

    
    // main timeloop
    double alpha = options.alpha;
    cl_double alpha_device=alpha;
	double tolerance = 1.e-6;
    int timestep;
	
	cl_program program[8];
	program[0]=clCreateProgramWithSource(context,1, (const char **)&source_str_diffusion_center, (const size_t*)&source_size[0], &ret);
	if (ret!=CL_SUCCESS)
		printf("NO PROGRAM");
	program[1]=clCreateProgramWithSource(context,1, (const char **)&source_str_diffusion_east_west, (const size_t*)&source_size[1], &ret);
	if (ret!=CL_SUCCESS)
		printf("NO PROGRAM");
	program[2]=clCreateProgramWithSource(context,1, (const char **)&source_str_merged_blas1, (const size_t*)&source_size[2], &ret);
	if (ret!=CL_SUCCESS)
		printf("NO PROGRAM");
	program[3]=clCreateProgramWithSource(context,1, (const char **)&source_str_merged_blas2, (const size_t*)&source_size[3], &ret);
	if (ret!=CL_SUCCESS)
		printf("NO PROGRAM");
	program[4]=clCreateProgramWithSource(context,1, (const char **)&source_str_merged_blas3, (const size_t*)&source_size[4], &ret);
	if (ret!=CL_SUCCESS)
		printf("NO PROGRAM");
	program[5]=clCreateProgramWithSource(context,1, (const char **)&source_str_merged_blas4, (const size_t*)&source_size[5], &ret);
	if (ret!=CL_SUCCESS)
		printf("NO PROGRAM");
	program[6]=clCreateProgramWithSource(context,1, (const char **)&source_str_diffusion_north_south_corners, (const size_t*)&source_size[6], &ret);
	if (ret!=CL_SUCCESS)
		printf("NO PROGRAM");
	
	const char options_cl[] ="";
	ret=clBuildProgram(program[0],1, &device_id, options_cl, NULL, NULL);
	char err=ret;
	if (ret!=CL_SUCCESS)
		printf("%d",err);
	ret=clBuildProgram(program[1],1, &device_id, options_cl, NULL, NULL);
	err=ret;
	if (ret!=CL_SUCCESS)
		printf("%d",err);
	ret=clBuildProgram(program[2],1, &device_id, options_cl, NULL, NULL);
	err=ret;
	if (ret!=CL_SUCCESS)
		printf("%d",err);
	ret=clBuildProgram(program[3],1, &device_id, options_cl, NULL, NULL);
	err=ret;
	if (ret!=CL_SUCCESS)
		printf("%d",err);
	ret=clBuildProgram(program[4],1, &device_id, options_cl, NULL, NULL);
	err=ret;
	if (ret!=CL_SUCCESS)
		printf("%d",err);
	ret=clBuildProgram(program[5],1, &device_id, options_cl, NULL, NULL);
	err=ret;
	if (ret!=CL_SUCCESS)
		printf("%d",err);
	ret=clBuildProgram(program[6],1, &device_id, options_cl, NULL, NULL);
	err=ret;
	if (ret!=CL_SUCCESS)
		printf("%d",err);
	
	cl_kernel kernel[8];
	kernel[0]= clCreateKernel(program[0], "cl_diffusion_center", &ret);
	if (ret!=CL_SUCCESS)
		printf("NO KERNEL 0");
	kernel[1]= clCreateKernel(program[1], "cl_diffusion_east_west", &ret);
	if (ret!=CL_SUCCESS)
		printf("NO KERNEL 1");
	kernel[2]= clCreateKernel(program[2], "cl_merged_blas1", &ret);
	if (ret!=CL_SUCCESS)
		printf("NO KERNEL 2");
	kernel[3]= clCreateKernel(program[3], "cl_merged_blas2", &ret);
		if (ret!=CL_SUCCESS)
		printf("NO KERNEL 3");
	kernel[4]= clCreateKernel(program[4], "cl_merged_blas3", &ret);
		if (ret!=CL_SUCCESS)
		printf("NO KERNEL 4");	
	kernel[5]= clCreateKernel(program[5], "cl_merged_blas4", &ret);
		if (ret!=CL_SUCCESS)
		printf("NO KERNEL 5");	
	kernel[6]= clCreateKernel(program[6], "cl_diffusion_north_south_corners", &ret);
		if (ret!=CL_SUCCESS)
		printf("NO KERNEL 6");
	
	ret=clSetKernelArg(kernel[0],0,sizeof(cl_mem),(void*)&x_new_device);
	ret=clSetKernelArg(kernel[0],1,sizeof(cl_mem),(void*)&b_device);
	ret=clSetKernelArg(kernel[0],2,sizeof(cl_mem),(void*)&x_old_device);
	ret=clSetKernelArg(kernel[0],3,sizeof(cl_mem),(void*)&bnd_device);
	ret=clSetKernelArg(kernel[0],4,sizeof(int),&nx);
	ret=clSetKernelArg(kernel[0],5,sizeof(int),&ny);
	cl_double dxs = (1000.0*options.dx*options.dx);
	ret=clSetKernelArg(kernel[0],6,sizeof(double),&dxs);
	ret=clSetKernelArg(kernel[0],7,sizeof(double),&alpha_device);
	
	if (ret!=CL_SUCCESS)
		printf("NO ARGS");
	size_t global_item_size1[2]={nx,ny};
	size_t local_item_size1[2]={16,16};

	ret=clSetKernelArg(kernel[1],0,sizeof(cl_mem),(void*)&x_new_device);
	ret=clSetKernelArg(kernel[1],1,sizeof(cl_mem),(void*)&b_device);
	ret=clSetKernelArg(kernel[1],2,sizeof(cl_mem),(void*)&x_old_device);
	ret=clSetKernelArg(kernel[1],3,sizeof(cl_mem),(void*)&bnd_device);
	ret=clSetKernelArg(kernel[1],4,sizeof(int),&nx);
	ret=clSetKernelArg(kernel[1],5,sizeof(int),&ny);
	dxs = (1000.0*options.dx*options.dx);
	ret=clSetKernelArg(kernel[1],6,sizeof(double),&dxs);
	ret=clSetKernelArg(kernel[1],7,sizeof(double),&alpha_device);
	
	if (ret!=CL_SUCCESS)
		printf("NO ARGS");
	size_t global_item_size2[2]={(ny-2),2};
	size_t local_item_size2[2]={256,1};
	
	ret=clSetKernelArg(kernel[6],0,sizeof(cl_mem),(void*)&x_new_device);
	ret=clSetKernelArg(kernel[6],1,sizeof(cl_mem),(void*)&b_device);
	ret=clSetKernelArg(kernel[6],2,sizeof(cl_mem),(void*)&x_old_device);
	ret=clSetKernelArg(kernel[6],3,sizeof(cl_mem),(void*)&bnd_device);
	ret=clSetKernelArg(kernel[6],4,sizeof(int),&nx);
	ret=clSetKernelArg(kernel[6],5,sizeof(int),&ny);
	dxs = (1000.0*options.dx*options.dx);
	ret=clSetKernelArg(kernel[6],6,sizeof(double),&dxs);
	ret=clSetKernelArg(kernel[6],7,sizeof(double),&alpha_device);
	
	if (ret!=CL_SUCCESS)
		printf("NO ARGS");
	size_t global_item_size3[2]={(nx),2};
	size_t local_item_size3[2]={256,1};
	
	// start timer
    double timespent = -omp_get_wtime();
	//TODO::EXECUTE THE KERNEL!!!!
	ret=clEnqueueWriteBuffer(command_queue, x_new_device, CL_TRUE, 0 ,(nx*ny)*sizeof(double), x_new,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, x_old_device, CL_TRUE, 0 ,(nx*ny)*sizeof(double), x_old,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, 0 , nx*sizeof(double), bndN,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, nx*sizeof(double) ,(nx)*sizeof(double), bndS,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, (2*nx)*sizeof(double) ,(ny)*sizeof(double), bndW,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, bnd_device, CL_TRUE, (2*nx+ny)*sizeof(double) ,(ny)*sizeof(double), bndE,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, b_device, CL_TRUE, 0 ,(nx*ny)*sizeof(double), b,0, NULL, NULL);
	ret=clEnqueueWriteBuffer(command_queue, deltax_device, CL_TRUE, 0 ,(nx*ny)*sizeof(double), deltax,0, NULL, NULL);
		
    
	for (timestep = 1; timestep <= nt; timestep++)
    {
        // set x_new and x_old to be the solution
        //ss_copy(x_old, x_new, N);
			
		ret=clEnqueueCopyBuffer(command_queue, x_new_device,x_old_device,0,0,N*sizeof(double), 0, NULL, NULL);
		double residual;
        int    converged = 0;
        int    it = 1;
        for ( ; it <= 50; it++)
        {
            // compute residual : requires both x_new and x_old
            //diffusion(x_new, b);
			ret=clSetKernelArg(kernel[0],0,sizeof(cl_mem),(void*)&x_new_device);
			ret=clSetKernelArg(kernel[0],1,sizeof(cl_mem),(void*)&b_device);
			ret=clSetKernelArg(kernel[1],0,sizeof(cl_mem),(void*)&x_new_device);
			ret=clSetKernelArg(kernel[1],1,sizeof(cl_mem),(void*)&b_device);
			ret=clSetKernelArg(kernel[6],0,sizeof(cl_mem),(void*)&x_new_device);
			ret=clSetKernelArg(kernel[6],1,sizeof(cl_mem),(void*)&b_device);
			ret= clEnqueueNDRangeKernel(command_queue, kernel[0], 2, NULL, global_item_size1, local_item_size1, 0 ,NULL, NULL);
			ret= clEnqueueNDRangeKernel(command_queue, kernel[1], 2, NULL, global_item_size2, local_item_size2, 0 ,NULL, NULL);
			ret= clEnqueueNDRangeKernel(command_queue, kernel[6], 2, NULL, global_item_size3, local_item_size3, 0 ,NULL, NULL);
									
			//ss_norm2(b, N);
			ret=clAmdBlasDnrm2(N,residual_device, 0, b_device,0,1, scratchBuff_nrm2,1,&command_queue,0,NULL,NULL);
			ret=clEnqueueReadBuffer(command_queue, residual_device, CL_TRUE, 0 ,sizeof(double), &residual,0, NULL, NULL);
			
            // check for convergence
            if (residual < tolerance)
            {
                converged = 1;
                break;
            }

            // solve linear system to get -deltax
            int cg_converged = 0;
            ss_cg(200, tolerance, &cg_converged,bnd_device,x_new_device,x_old_device,b_device,deltax_device,&command_queue,&context,program,kernel,dxs,alpha_device,nx,ny);
			
            // check that the CG solver converged
            if (!cg_converged) break;

            // update solution
            //ss_axpy(x_new, -1.0, deltax, N);
			cl_double daxpy_alpha = -1.0;
			err=clAmdBlasDaxpy(nx*ny, daxpy_alpha, deltax_device, 0, 1, x_new_device, 0, 1, 1, &command_queue,0, NULL, NULL);
        }
        iters_newton += it;

        // output some statistics
        //if (converged && verbose_output)
        if (converged && verbose_output)
            printf("step %d required %d iterations for residual %E\n", timestep, it, residual);
        if (!converged)
        {
            fprintf(stderr, "step %d ERROR : nonlinear iterations failed to converge\n", timestep);
            break;
        }
    }
	ret=clEnqueueReadBuffer(command_queue, x_new_device, CL_TRUE, 0 ,N*sizeof(double), x_new,0, NULL, NULL);
			
	 // get times
    timespent += omp_get_wtime();
    unsigned long long flops_total = flops_diff + flops_blas1;

	clAmdBlasTeardown();
	ret=clFlush(command_queue);
	ret=clFinish(command_queue);
	
	int loop_temp;
	for (loop_temp=0;loop_temp<7;loop_temp++){
		ret=clReleaseKernel(kernel[loop_temp]);
		ret=clReleaseProgram(program[loop_temp]);
	}
	ret=clReleaseMemObject(x_new_device);
	ret=clReleaseMemObject(x_old_device);
	ret=clReleaseMemObject(bnd_device);
	ret=clReleaseMemObject(b_device);
	ret=clReleaseMemObject(deltax_device);
	ret=clReleaseCommandQueue(command_queue);
	ret=clReleaseContext(context);
	
	
   
    ////////////////////////////////////////////////////////////////////
    // write final solution to BOV file for visualization
    ////////////////////////////////////////////////////////////////////

    // binary data
    {
        FILE* output = fopen("output.bin", "w");
        fwrite(x_new, sizeof(double), nx * ny, output);
        fclose(output);
    }

    // metadata
    {
        FILE* output = fopen("output.bov", "wb");
        fprintf(output, "TIME: 0.0\n");
        fprintf(output, "DATA_FILE: output.bin\n");
        fprintf(output, "DATA_SIZE: %d, %d, 1\n", nx, ny);
        fprintf(output, "DATA_FORMAT: DOUBLE\n");
        fprintf(output, "VARIABLE: phi\n");
        fprintf(output, "DATA_ENDIAN: LITTLE\n");
        fprintf(output, "CENTERING: nodal\n");
        //fprintf(output, "BYTE_OFFSET: 4\n");
        fprintf(output, "BRICK_SIZE: 1.0 %f 1.0\n", (ny - 1) * options.dx);
        fclose(output);
    }

    // print table sumarizing results
    printf("--------------------------------------------------------------------------------\n");
    printf("simulation took %f seconds\n", timespent);
    printf("%d conjugate gradient iterations\n", (int)iters_cg);
    printf("%d newton iterations\n", (int)iters_newton);
    printf("--------------------------------------------------------------------------------\n");

    // deallocate global fields
    free (x_new);
    free (x_old);
    free (bndN);
    free (bndS);
    free (bndE);
    free (bndW);

    printf("Goodbye!\n");

    return 0;
}