Esempio n. 1
0
int main(int argc, char** argv) 
{

   int step,burst;

   int nparticle = 8192; /* MUST be a nice power of two for simplicity */
   int nstep = 500;
   int nburst = 20; /* MUST divide the value of nstep without remainder */
   int nthread = 64; /* chosen for ATI Radeon HD 5870 */

   float dt = 0.0001;
   float eps = 0.0001;
   cl_float4* pos1 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);
   cl_float4* pos2 = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);
   cl_float4* vel = (cl_float4*)clmalloc(stdgpu,nparticle*sizeof(cl_float4),0);

   nbody_init(nparticle,pos1,vel);

   void* h = clopen(stdgpu,"nbody_kern.cl",CLLD_NOW);
   cl_kernel krn = clsym(stdgpu,h,"nbody_kern",CLLD_NOW);

   clndrange_t ndr = clndrange_init1d(0,nparticle,nthread);

   clarg_set(stdgpu,krn,0,dt);
   clarg_set(stdgpu,krn,1,eps);
   clarg_set_global(stdgpu,krn,4,vel);
   clarg_set_local(stdgpu,krn,5,nthread*sizeof(cl_float4));

	clmsync(stdgpu,0,pos1,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
	clmsync(stdgpu,0,vel,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   for(step=0; step<nstep; step+=nburst) {

      for(burst=0; burst<nburst; burst+=2) {

         clarg_set_global(stdgpu,krn,2,pos1);
         clarg_set_global(stdgpu,krn,3,pos2);
         clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT);

         clarg_set_global(stdgpu,krn,2,pos2);
         clarg_set_global(stdgpu,krn,3,pos1);
         clfork(stdgpu,0,krn,&ndr,CL_EVENT_NOWAIT);
      
      }

      clmsync(stdgpu,0,pos1,CL_MEM_HOST|CL_EVENT_NOWAIT);

      clwait(stdgpu,0,CL_KERNEL_EVENT|CL_MEM_EVENT);

   }

   nbody_output(nparticle,pos1,vel);

   clclose(stdgpu,h);

   clfree(pos1);
   clfree(pos2);
   clfree(vel);  
}
Esempio n. 2
0
File: nD.cpp Progetto: nickoppen/nD
void exKernel(void * clHandle, clndrange_t * ndr, char * kernelName, int rows, int cols, cl_uchar * wrkArea, char * debugFile)
{
	cl_kernel krn;
	char msg[255];

	sprintf(msg, "Getting the kernel %s with clsym\n", kernelName);
	debugdebug(debugFile, msg);

	krn = clsym(stdacc, clHandle, kernelName, CLLD_NOW);   // grab the kernel from the program just loaded

	sprintf(msg, "calling clForka with rows: %d and cols: %d\n", rows, cols);
	debugdebug(debugFile, msg);
	clforka(stdacc, 0, krn, ndr, CL_EVENT_WAIT, 1, rows, cols, wrkArea);

	debugdebug(debugFile, (char*)"Transferring memory contents from the Epiphany using clmsync\n");
	clmsync(stdacc, 0, wrkArea, CL_MEM_HOST|CL_EVENT_WAIT);

}
Esempio n. 3
0
int main()
{
   cl_uint n = 64;

#if(1)

	/* use default contexts, if no GPU use CPU */
   CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu;

   unsigned int devnum = 0;

   void* clh = clopen(cp,"matvecmult.cl",CLLD_NOW);
   cl_kernel krn = clsym(cp,clh,"matvecmult_kern",0);

   /* allocate OpenCL device-sharable memory */
   cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);

   clndrange_t ndr = clndrange_init1d( 0, n, 64);

   /* initialize vectors a[] and b[], zero c[] */
   int i,j; 
   for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;

   /* define the computational domain and workgroup size */
   //clndrange_t ndr = clndrange_init1d( 0, n, 64);

   /* non-blocking sync vectors a and b to device memory (copy to GPU)*/
   clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   /* set the kernel arguments */
   clarg_set(cp,krn,0,n);
   clarg_set_global(cp,krn,1,aa);
   clarg_set_global(cp,krn,2,b);
   clarg_set_global(cp,krn,3,c);

   /* non-blocking fork of the OpenCL kernel to execute on the GPU */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* non-blocking sync vector c to host memory (copy back to host) */
   clmsync(cp,0,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* force execution of operations in command queue (non-blocking call) */
   clflush(cp,devnum,0);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]);

   clfree(aa);
   clfree(b);
   clfree(c);

   clclose(cp,clh);

#endif

}
Esempio n. 4
0
int main()
{
   cl_uint n = 1024;

	/* use default contexts, if no GPU use CPU */
   CLCONTEXT* cp = (stdgpu)? stdgpu : stdcpu;

   unsigned int devnum = 0;

#ifdef __FreeBSD__
   void* clh = clopen(cp,"matvecmult_special.cl",CLLD_NOW);
   cl_kernel krn = clsym(cp,clh,"matvecmult_special_kern",0);
#else
   cl_kernel krn = clsym(cp,0,"matvecmult_special_kern",0);
#endif

   /* allocate OpenCL device-sharable memory */
   cl_float* aa = (float*)clmalloc(cp,n*n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);

   /* initialize vectors a[] and b[], zero c[] */
   int i,j; 
   for(i=0;i<n;i++) for(j=0;j<n;j++) aa[i*n+j] = 1.1f*i*j;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;


	/***
	 *** Create a image2d allocation to be used as a read-only table.
	 *** The table will consist of a 24x24 array of float coefficients.
	 *** The clmctl() call is used to set the type and shape of the table.
	 *** Note that we will only use the first component of the float4 elements.
	 ***/	
	cl_float4* table 
		= (cl_float4*)clmalloc(cp,24*24*sizeof(cl_float4),CL_MEM_DETACHED);
	clmctl(table,CL_MCTL_SET_IMAGE2D,24,24,0);
	clmattach(cp,table);

	/* initialize the table to some contrived values */
	for(i=0;i<24;i++) for(j=0;j<24;j++) table[i*24+j].x = 0.125f*(i-j);


   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 64);

   /* non-blocking sync vectors a and b to device memory (copy to GPU)*/
   clmsync(cp,devnum,aa,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   clmsync(cp,devnum,table,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   /* set the kernel arguments */
   clarg_set(cp,krn,0,n);
   clarg_set_global(cp,krn,1,aa);
   clarg_set_global(cp,krn,2,b);
   clarg_set_global(cp,krn,3,c);
   clarg_set_global(cp,krn,4,table);

   /* non-blocking fork of the OpenCL kernel to execute on the GPU */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* non-blocking sync vector c to host memory (copy back to host) */
   clmsync(cp,0,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f\n",i,b[i],c[i]);

   clfree(aa);
   clfree(b);
   clfree(c);

#ifdef __FreeBSD__
	clclose(cp,clh);
#endif

}
Esempio n. 5
0
int main()
{
   cl_uint n = 1024;

	/* use default contexts, if no ACCELERATOR use CPU */
   CLCONTEXT* cp = (stdacc)? stdacc : stdcpu;

   unsigned int devnum = 0;

	/******************************************************************
	 *** this example requires the .cl file to be available at run-time
	 *** and shows how to pass compiler options to the OCL compiler
	 ******************************************************************/

   void* clh = clopen(cp,"outerprod.cl",CLLD_NOBUILD);
   clbuild(cp,clh,"-D COEF=2", 0);
   cl_kernel krn = clsym(cp,clh,"outerprod_kern",0);

	if (!krn) { fprintf(stderr,"error: no OpenCL kernel\n"); exit(-1); }

   /* allocate OpenCL device-sharable memory */
   cl_float* a = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* b = (float*)clmalloc(cp,n*sizeof(cl_float),0);
   cl_float* c = (float*)clmalloc(cp,n*sizeof(cl_float),0);

   /* initialize vectors a[] and b[], zero c[] */
   int i; 
   for(i=0;i<n;i++) a[i] = 1.1f*i;
   for(i=0;i<n;i++) b[i] = 2.2f*i;
   for(i=0;i<n;i++) c[i] = 0.0f;

   /* non-blocking sync vectors a and b to device memory (copy to GPU)*/
   clmsync(cp,devnum,a,CL_MEM_DEVICE|CL_EVENT_WAIT);
   clmsync(cp,devnum,b,CL_MEM_DEVICE|CL_EVENT_WAIT);

   /* define the computational domain and workgroup size */
   clndrange_t ndr = clndrange_init1d( 0, n, 16);

   /* set the kernel arguments */
   clarg_set_global(cp,krn,0,a);
   clarg_set_global(cp,krn,1,b);
   clarg_set_global(cp,krn,2,c);

   /* non-blocking fork of the OpenCL kernel to execute on the GPU */
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   /* non-blocking sync vector c to host memory (copy back to host) */
   clmsync(cp,0,c,CL_MEM_HOST|CL_EVENT_NOWAIT);

   /* block on completion of operations in command queue */
   clwait(cp,devnum,CL_ALL_EVENT);

   for(i=0;i<n;i++) printf("%d %f %f %f\n",i,a[i],b[i],c[i]);

   clfree(a);
   clfree(b);
   clfree(c);

   clclose(cp,clh);
}
Esempio n. 6
0
/*
 * Calculates the path loss matrix for many transmitters and saves the
 * output to one common raster map.
 *
 * int nrows .............. the number of rows in the DEM.
 * int ncols .............. the number of columns in the DEM.
 * int tx_coord_col ....... transmitter column coordinate (in raster cells).
 * int tx_coord_row ....... transmitter row coordinate (in raster cells).
 * int tx_elevation ....... transmitter height above sea level.
 * int anthena_height ..... height of transmitter anthena above ground.
 * float rx_height ........ receiver height above ground.
 * float frequency ........ transmitter frequency in Mhz.
 * int radius ............. calculation radius around to the transmitter
 *                          (in raster cells).
 * int infd ............... file descriptor to read from the DEM.
 * int outfd .............. file descriptor to write resulting raster.
 * RASTER_MAP_TYPE rtype .. DEM element data type.
 *
 */
void hata_cl (const int nrows, const int ncols,
              const int tx_coord_col, const int tx_coord_row,
              const int tx_elevation, const int tx_anthena_height,
              const float rx_height, const float frequency,
              const int radius,
              const int infd, const int outfd, 
              RASTER_MAP_TYPE rtype)
{
    int i;

    /* number of processing tiles needed around each transmitter */
    int ntile = (radius*2) / _HATA_WITEM_PER_DIM_ + 1;

    G_message ("Receiver height above ground is %5.2f mts", rx_height);
    G_message ("Transmitter frequency is %7.2f Mhz", frequency);
    G_message ("Calculation radius is %d raster cells", radius);

    CONTEXT * ctx = init_context ( );

    /* initialize raster data */
    void *dem_in = load_raster (ctx, nrows, ncols, infd, rtype);
    void *dem_out = null_raster (ctx, nrows, ncols, rtype, NULL);

    /* load, compile and link the OpenCL kernel file */
    void* h = clopen (ctx, NULL, CLLD_NOW);
    /* select a kernel function from the loaded file */
    cl_kernel krn = clsym (ctx, h, "hata_urban_kern", CLLD_NOW);

    /* defines a 2D matrix of work groups & work items */
    size_t global_wsize = ntile * _HATA_WITEM_PER_DIM_;
    size_t local_wsize = _HATA_WITEM_PER_DIM_;

    if ((global_wsize % local_wsize) != 0)
        G_fatal_error ("Work group dimensions are incorrect.");
    else
        G_message ("Creating %d work-group(s) of [%d x %d] threads each", 
                   global_wsize/local_wsize,
                   local_wsize, local_wsize);

    clndrange_t ndr = clndrange_init2d (0,global_wsize,local_wsize, 
                                        0,global_wsize,local_wsize);

    // correction factor ahr
    float ahr = (1.1f*log10(frequency) - 0.7f)*rx_height - 
                (1.56f*log10(frequency) - 0.8f);

    /* set kernel parameters and local memory size */
    clarg_set (ctx, krn, 0, ncols);
    clarg_set (ctx, krn, 3, rx_height);
    clarg_set (ctx, krn, 4, frequency);
    clarg_set (ctx, krn, 5, ahr);

    size_t lmem_size = _HATA_WITEM_PER_DIM_*_HATA_WITEM_PER_DIM_*
                       sizeof(cl_float2);

    if (lmem_size > _HATA_MAX_LOCAL_MEM_)
        G_fatal_error ("Allocated local memory (%d) exceeds %d bytes.", lmem_size,
                                                                        _HATA_MAX_LOCAL_MEM_);

    clarg_set_local (ctx, krn, 8, lmem_size);

    /* transfer data to the device */
    clmsync (ctx, 0, dem_in, CL_MEM_DEVICE|CL_EVENT_NOWAIT);
    clmsync (ctx, 0, dem_out, CL_MEM_DEVICE|CL_EVENT_NOWAIT);

    /* set remaining kernel parameters */
    clarg_set_global (ctx, krn, 6, dem_in);
    clarg_set_global (ctx, krn, 7, dem_out);

    G_message ("Simulating ...");

    /* test transmitters

         ASNEBE | 1468 | 1200
         ASKZK  | 1395 | 1180
         ASPAR  | 1400 | 1245
         ASMARG | 1460 | 1203
         ASTARA | 1422 | 1235
         ASKZ   | 1396 | 1181
         ASENTP | 1387 | 1185
         ASISKP | 1399 | 1189
         ASEME  | 1448 | 1262
         ASOSTR | 1498 | 1253
         ASMELT | 1429 | 1196
         ASMART | 1446 | 1213
         ASLOM  | 1425 | 1226
         ASAZU  | 1421 | 1235
         ASONCE | 1433 | 1208
         ASMAR  | 1404 | 1150
         ASTEP  | 1450 | 1235
         ASAVLJ | 1417 | 1182
         ASTOZI | 1436 | 1198
         ASTRAL | 1422 | 1227
         ASTEG  | 1404 | 1193
    */
    const int ntest_tx = 21;
    cl_int2 *test_tx = malloc (ntest_tx * sizeof(cl_int2));

    test_tx[0] = (cl_int2) {1468,1200};
    test_tx[1] = (cl_int2) {1395,1180};
    test_tx[2] = (cl_int2) {1400,1245};
    test_tx[3] = (cl_int2) {1460,1203};
    test_tx[4] = (cl_int2) {1422,1235};
    test_tx[5] = (cl_int2) {1396,1181};
    test_tx[6] = (cl_int2) {1387,1185};
    test_tx[7] = (cl_int2) {1399,1189};
    test_tx[8] = (cl_int2) {1448,1262};
    test_tx[9] = (cl_int2) {1498,1253};
    test_tx[10] = (cl_int2) {1429,1196};
    test_tx[11] = (cl_int2) {1446,1213};
    test_tx[12] = (cl_int2) {1425,1226};
    test_tx[13] = (cl_int2) {1421,1235};
    test_tx[14] = (cl_int2) {1433,1208};
    test_tx[15] = (cl_int2) {1404,1150};
    test_tx[16] = (cl_int2) {1450,1235};
    test_tx[17] = (cl_int2) {1417,1182};
    test_tx[18] = (cl_int2) {1436,1198};
    test_tx[19] = (cl_int2) {1422,1227};
    test_tx[20] = (cl_int2) {1404,1193};

    /* for each transmitter */
    for (i = 0; i < ntest_tx; i++)
    {
        /* display completion percentage */
        G_percent (i, ntest_tx, 1);

        /* Transmitter coordinates */
        cl_int2 tx_coord = test_tx[i];

        //G_message ("The transmitter is located at %d, %d", tx_coord.x,
        //                                                   tx_coord.y);
        /* Transmitter heights */
        //G_message ("Transmitter height above sea level is %d mts", tx_elevation);
        //G_message ("Transmitter anthena height above ground is %d mts", tx_anthena_height);

        /* transmitter data */ 
        cl_int4 tx_data = (cl_int4) {tx_coord.x, tx_coord.y,
                                     tx_elevation, tx_anthena_height};

        /* tile offset within the raster */
        cl_int2 tx_offset = (cl_int2) {tx_coord.x - radius,
                                       tx_coord.y - radius};

        /* set (more) remaining kernel parameters */
        clarg_set (ctx, krn, 1, tx_data);
        clarg_set (ctx, krn, 2, tx_offset);

        /* start kernel execution */
        clfork (ctx, 0, krn, &ndr, CL_EVENT_NOWAIT);

        clwait (ctx, 0, CL_KERNEL_EVENT|CL_MEM_EVENT|CL_EVENT_RELEASE);
    }

    /* sync memory */
    clmsync (ctx, 0, dem_out, CL_MEM_HOST|CL_EVENT_WAIT);

    /* write the calculation output */
    save_raster (dem_out, nrows, ncols, outfd);

    /* free allocated resources */
    free (test_tx);
    clclose (ctx, h);
    clfree (dem_out);
    clfree (dem_in);
}
Esempio n. 7
0
/*
 * Calculates an interference matrix resulting from the tx power of all
 * transmitters combined with their respective path losses and the thermal
 * noise. The output is saved in one raster map.
 *
 * int nrows .............. the number of rows in the DEM.
 * int ncols .............. the number of columns in the DEM.
 * int tx_coord_col ....... transmitter column coordinate (in raster cells).
 * int tx_coord_row ....... transmitter row coordinate (in raster cells).
 * int tx_elevation ....... transmitter height above sea level.
 * int anthena_height ..... height of transmitter anthena above ground.
 * float rx_height ........ receiver height above ground.
 * float frequency ........ transmitter frequency in Mhz.
 * int radius ............. calculation radius around to the transmitter
 *                          (in raster cells).
 * int infd ............... file descriptor to read from the DEM.
 * int outfd .............. file descriptor to write the interference
 *                          data to.
 * RASTER_MAP_TYPE rtype .. DEM element data type.
 *
 */
void hata_interference_cl (const int nrows, const int ncols,
                           const int tx_coord_col, const int tx_coord_row,
                           const int tx_elevation, const int tx_anthena_height,
                           const float rx_height, const float frequency,
                           const int radius,
                           const int infd, const int outfd, 
                           RASTER_MAP_TYPE rtype)
{
    int i;

    /* number of processing tiles needed around each transmitter */
    int ntile = (radius*2) / _HATA_WITEM_PER_DIM_ + 1;

    G_message ("Receiver height above ground is %5.2f mts", rx_height);
    G_message ("Transmitter frequency is %7.2f Mhz", frequency);
    G_message ("Calculation radius is %d raster cells", radius);

    CONTEXT * ctx = init_context ( );

    /* initialize raster data */
    void *dem_in = load_raster (ctx, nrows, ncols, infd, rtype);

    /* interference matrix */
    void *qrm_out = alloc_raster (ctx, nrows, ncols, rtype, 
                                  (FCELL)_HATA_THERMAL_NOISE_, NULL);

    /* load, compile and link the OpenCL kernel file */
    void* h = clopen (ctx, NULL, CLLD_NOW);

    /* select a kernel function from the loaded file */
    cl_kernel krn = clsym (ctx, h, "hata_urban_interference", CLLD_NOW);

    /* defines a 2D matrix of work groups & work items */
    size_t global_wsize = ntile * _HATA_WITEM_PER_DIM_;
    size_t local_wsize = _HATA_WITEM_PER_DIM_;

    if ((global_wsize % local_wsize) != 0)
        G_fatal_error ("Work group dimensions are incorrect.");
    else
        G_message ("Creating %d work-group(s) of [%d x %d] threads each", 
                   global_wsize/local_wsize,
                   local_wsize, local_wsize);

    clndrange_t ndr = clndrange_init2d (0,global_wsize,local_wsize, 
                                        0,global_wsize,local_wsize);

    // correction factor ahr
    float ahr = (1.1f*log10(frequency) - 0.7f)*rx_height - 
                (1.56f*log10(frequency) - 0.8f);

    /* set kernel parameters and local memory size */
    clarg_set (ctx, krn, 0, ncols);
    clarg_set (ctx, krn, 3, rx_height);
    clarg_set (ctx, krn, 4, frequency);
    clarg_set (ctx, krn, 5, ahr);

    size_t lmem_size = _HATA_WITEM_PER_DIM_*_HATA_WITEM_PER_DIM_*
                       sizeof(cl_float2);

    if (lmem_size > _HATA_MAX_LOCAL_MEM_)
        G_fatal_error ("Allocated local memory (%d) exceeds %d bytes.", lmem_size,
                                                                        _HATA_MAX_LOCAL_MEM_);

    clarg_set_local (ctx, krn, 8, lmem_size);

    /* transfer data to the device */
    clmsync (ctx, 0, dem_in, CL_MEM_DEVICE|CL_EVENT_NOWAIT);
    clmsync (ctx, 0, qrm_out, CL_MEM_DEVICE|CL_EVENT_WAIT);

    /* set remaining kernel parameters */
    clarg_set_global (ctx, krn, 6, dem_in);
    clarg_set_global (ctx, krn, 7, qrm_out);

    G_message ("Simulating ...");

    /**
     * Test transmitters 
     * 
     * ASNEBE  | 1468 | 1200
     * ASKZK   | 1395 | 1180
     * ALEK    | 1408 | 1211
     * ANDREJ  | 1431 | 1212
     * AMALENA | 1462 | 1272
     * APOLJE  | 1483 | 1227
     * AMONS   | 1381 | 1230
     * ANTENA  | 1397 | 1238
     * ATOMAC  | 1443 | 1194
     * ATUNEL  | 1424 | 1239
     * AKOSET  | 1398 | 1206
     * ASPAR   | 1400 | 1245
     * ACHEMO  | 1431 | 1226
     * ALIVAD  | 1423 | 1248
     * ASMARG  | 1460 | 1203
     * ABTC    | 1451 | 1215
     * AZIMA   | 1471 | 1236
     * ASTARA  | 1422 | 1235
     * AVEGAS  | 1434 | 1257
     * APOVSE  | 1436 | 1232
     * ABTCST  | 1449 | 1213
     * AMOBI   | 1429 | 1222
     * APODUT  | 1378 | 1202
     * AOBZEL  | 1444 | 1222
     * AGRIC   | 1385 | 1207
     * ADELO   | 1421 | 1222
     * AJEZA   | 1459 | 1182
     * ASKZ    | 1396 | 1181
     * ATLK    | 1422 | 1225
     * AGR     | 1424 | 1220
     * ADNEVN  | 1426 | 1232
     * AMHOTE  | 1409 | 1212
     * AVIC    | 1414 | 1236
     * AJOZEF  | 1429 | 1234
     * ATVSLO  | 1425 | 1228
     * ACRNUC  | 1445 | 1172
     * ANGEL   | 1508 | 1219
     * AKOSEG  | 1401 | 1210
     * AMSC    | 1429 | 1222
     * AGRAM   | 1421 | 1195
     * ARAKTK  | 1436 | 1249
     * AIMKO   | 1445 | 1179
     * AKOVIN  | 1412 | 1214
     * ADOLGI  | 1388 | 1250
     * APETER  | 1431 | 1231
     * ASENTP  | 1387 | 1185
     * AOBI    | 1443 | 1268
     * AMZS    | 1427 | 1203
     * AVICTK  | 1403 | 1240
     * AZUR    | 1397 | 1238
     * AKAMNA  | 1388 | 1200
     * APOLCE  | 1482 | 1228
     * AKASEL  | 1497 | 1232
     * ASISKP  | 1399 | 1189
     * AKOZAR  | 1373 | 1247
     * ASEME   | 1448 | 1262
     * AURSKA  | 1424 | 1220
     * ABEZIT  | 1427 | 1199
     * AVEROV  | 1416 | 1201
     * ATEGR   | 1419 | 1209
     * ATRUB   | 1423 | 1231
     * AKOLIN  | 1436 | 1221
     * AVELAN  | 1439 | 1218
     * ABEZI   | 1431 | 1216
     * AGAMGD  | 1418 | 1148
     * AGURS   | 1431 | 1237
     * AGPG    | 1419 | 1237
     * AKOSME  | 1397 | 1210
     * AZALOG  | 1500 | 1225
     * ASOSTR  | 1498 | 1253
     * AROZNA  | 1404 | 1235
     * AVRHO   | 1380 | 1242
     * ARAK    | 1430 | 1244
     * AVILA   | 1404 | 1230
     * AKOLEZ  | 1413 | 1241
     * AJEZKZ  | 1427 | 1185
     * AVIZMA  | 1386 | 1173
     * AKOTNI  | 1429 | 1226
     * AMOSTE  | 1443 | 1227
     * ATOPNI  | 1425 | 1212
     * AGZ     | 1429 | 1209
     * APRZAC  | 1386 | 1192
     * ALITIJ  | 1455 | 1235
     * ATEHNO  | 1387 | 1233
     * ASMELT  | 1429 | 1196
     * ACRGMA  | 1439 | 1166
     * AFORD   | 1439 | 1215
     * ADRAVC  | 1396 | 1197
     * ABROD   | 1388 | 1168
     * ATABOR  | 1429 | 1228
     * ATRNOV  | 1421 | 1242
     * AGOLOV  | 1467 | 1263
     * ARESEV  | 1438 | 1228
     * ABTCMI  | 1450 | 1216
     * ASMART  | 1446 | 1213
     * APROGA  | 1382 | 1251
     * ABRDO   | 1382 | 1235
     * AKONEX  | 1387 | 1237
     * AZITO   | 1456 | 1213
     * ASLOM   | 1425 | 1226
     * APLAMA  | 1448 | 1202
     * AGRAD   | 1470 | 1218
     * AFUZIN  | 1463 | 1228
     * ASAZU   | 1421 | 1235
     * ATOTRA  | 1463 | 1233
     * AHALA   | 1414 | 1221
     * ASONCE  | 1433 | 1208
     * ALMLEK  | 1422 | 1202
     * ACERNE  | 1415 | 1212
     * ADRAV   | 1396 | 1201
     * ALGRAD  | 1424 | 1234
     * ALPP    | 1406 | 1205
     * ANAMA   | 1420 | 1231
     * ATIVO   | 1418 | 1226
     * AZAPUZ  | 1392 | 1196
     * ASMAR   | 1404 | 1150
     * ACANK   | 1416 | 1233
     * ATACGD  | 1390 | 1156
     * APOPTV  | 1435 | 1211
     * AFRANK  | 1417 | 1220
     * ASTEP   | 1450 | 1235
     * AEJATA  | 1445 | 1218
     * AZADOB  | 1482 | 1206
     * AELNA   | 1436 | 1177
     * AZELEZ  | 1428 | 1218
     * AMURGL  | 1415 | 1246
     * ACIGO   | 1423 | 1225
     * AIJS    | 1407 | 1241
     * AELOK   | 1459 | 1225
     * ATACBR  | 1385 | 1160
     * AJELSA  | 1416 | 1259
     * AVRHOV  | 1394 | 1242
     * ABIZOV  | 1472 | 1247
     * ACITYP  | 1454 | 1214
     * ASAVLJ  | 1417 | 1182
     * AKLINI  | 1434 | 1228
     * AKODEL  | 1439 | 1236
     * ASTOZI  | 1436 | 1198
     * AKORA   | 1422 | 1226
     * AHLEV   | 1388 | 1163
     * ATOPLA  | 1451 | 1224
     * AMALEN  | 1461 | 1273
     * AVECNA  | 1391 | 1222
     * ANADGO  | 1462 | 1178
     * ABOKAL  | 1379 | 1231
     * AVARNO  | 1404 | 1250
     * ASTRAL  | 1422 | 1227
     * AVEVCE  | 1488 | 1234
     * ARNC    | 1423 | 1225
     * AGEOPL  | 1406 | 1198
     * AMOBLI  | 1415 | 1200
     * AVLADA  | 1415 | 1234
     * ASTEG   | 1404 | 1193
     * AKOSEZ  | 1392 | 1207
     */

    const int ntest_tx = 154;
    cl_int2 *test_tx = malloc (ntest_tx * sizeof(cl_int2));

    test_tx[0] = (cl_int2) {1468,1200};
    test_tx[1] = (cl_int2) {1395,1180};
    test_tx[2] = (cl_int2) {1408,1211};
    test_tx[3] = (cl_int2) {1431,1212};
    test_tx[4] = (cl_int2) {1462,1272};
    test_tx[5] = (cl_int2) {1483,1227};
    test_tx[6] = (cl_int2) {1381,1230};
    test_tx[7] = (cl_int2) {1397,1238};
    test_tx[8] = (cl_int2) {1443,1194};
    test_tx[9] = (cl_int2) {1424,1239};
    test_tx[10] = (cl_int2) {1398,1206};
    test_tx[11] = (cl_int2) {1400,1245};
    test_tx[12] = (cl_int2) {1431,1226};
    test_tx[13] = (cl_int2) {1423,1248};
    test_tx[14] = (cl_int2) {1460,1203};
    test_tx[15] = (cl_int2) {1451,1215};
    test_tx[16] = (cl_int2) {1471,1236};
    test_tx[17] = (cl_int2) {1422,1235};
    test_tx[18] = (cl_int2) {1434,1257};
    test_tx[19] = (cl_int2) {1436,1232};
    test_tx[20] = (cl_int2) {1449,1213};
    test_tx[21] = (cl_int2) {1429,1222};
    test_tx[22] = (cl_int2) {1378,1202};
    test_tx[23] = (cl_int2) {1444,1222};
    test_tx[24] = (cl_int2) {1385,1207};
    test_tx[25] = (cl_int2) {1421,1222};
    test_tx[26] = (cl_int2) {1459,1182};
    test_tx[27] = (cl_int2) {1396,1181};
    test_tx[28] = (cl_int2) {1422,1225};
    test_tx[29] = (cl_int2) {1424,1220};
    test_tx[30] = (cl_int2) {1426,1232};
    test_tx[31] = (cl_int2) {1409,1212};
    test_tx[32] = (cl_int2) {1414,1236};
    test_tx[33] = (cl_int2) {1429,1234};
    test_tx[34] = (cl_int2) {1425,1228};
    test_tx[35] = (cl_int2) {1445,1172};
    test_tx[36] = (cl_int2) {1508,1219};
    test_tx[37] = (cl_int2) {1401,1210};
    test_tx[38] = (cl_int2) {1429,1222};
    test_tx[39] = (cl_int2) {1421,1195};
    test_tx[40] = (cl_int2) {1436,1249};
    test_tx[41] = (cl_int2) {1445,1179};
    test_tx[42] = (cl_int2) {1412,1214};
    test_tx[43] = (cl_int2) {1388,1250};
    test_tx[44] = (cl_int2) {1431,1231};
    test_tx[45] = (cl_int2) {1387,1185};
    test_tx[46] = (cl_int2) {1443,1268};
    test_tx[47] = (cl_int2) {1427,1203};
    test_tx[48] = (cl_int2) {1403,1240};
    test_tx[49] = (cl_int2) {1397,1238};
    test_tx[50] = (cl_int2) {1388,1200};
    test_tx[51] = (cl_int2) {1482,1228};
    test_tx[52] = (cl_int2) {1497,1232};
    test_tx[53] = (cl_int2) {1399,1189};
    test_tx[54] = (cl_int2) {1373,1247};
    test_tx[55] = (cl_int2) {1448,1262};
    test_tx[56] = (cl_int2) {1424,1220};
    test_tx[57] = (cl_int2) {1427,1199};
    test_tx[58] = (cl_int2) {1416,1201};
    test_tx[59] = (cl_int2) {1419,1209};
    test_tx[60] = (cl_int2) {1423,1231};
    test_tx[61] = (cl_int2) {1436,1221};
    test_tx[62] = (cl_int2) {1439,1218};
    test_tx[63] = (cl_int2) {1431,1216};
    test_tx[64] = (cl_int2) {1418,1148};
    test_tx[65] = (cl_int2) {1431,1237};
    test_tx[66] = (cl_int2) {1419,1237};
    test_tx[67] = (cl_int2) {1397,1210};
    test_tx[68] = (cl_int2) {1500,1225};
    test_tx[69] = (cl_int2) {1498,1253};
    test_tx[70] = (cl_int2) {1404,1235};
    test_tx[71] = (cl_int2) {1380,1242};
    test_tx[72] = (cl_int2) {1430,1244};
    test_tx[73] = (cl_int2) {1404,1230};
    test_tx[74] = (cl_int2) {1413,1241};
    test_tx[75] = (cl_int2) {1427,1185};
    test_tx[76] = (cl_int2) {1386,1173};
    test_tx[77] = (cl_int2) {1429,1226};
    test_tx[78] = (cl_int2) {1443,1227};
    test_tx[79] = (cl_int2) {1425,1212};
    test_tx[80] = (cl_int2) {1429,1209};
    test_tx[81] = (cl_int2) {1386,1192};
    test_tx[82] = (cl_int2) {1455,1235};
    test_tx[83] = (cl_int2) {1387,1233};
    test_tx[84] = (cl_int2) {1429,1196};
    test_tx[85] = (cl_int2) {1439,1166};
    test_tx[86] = (cl_int2) {1439,1215};
    test_tx[87] = (cl_int2) {1396,1197};
    test_tx[88] = (cl_int2) {1388,1168};
    test_tx[89] = (cl_int2) {1429,1228};
    test_tx[90] = (cl_int2) {1421,1242};
    test_tx[91] = (cl_int2) {1467,1263};
    test_tx[92] = (cl_int2) {1438,1228};
    test_tx[93] = (cl_int2) {1450,1216};
    test_tx[94] = (cl_int2) {1446,1213};
    test_tx[95] = (cl_int2) {1382,1251};
    test_tx[96] = (cl_int2) {1382,1235};
    test_tx[97] = (cl_int2) {1387,1237};
    test_tx[98] = (cl_int2) {1456,1213};
    test_tx[99] = (cl_int2) {1425,1226};
    test_tx[100] = (cl_int2) {1448,1202};
    test_tx[101] = (cl_int2) {1470,1218};
    test_tx[102] = (cl_int2) {1463,1228};
    test_tx[103] = (cl_int2) {1421,1235};
    test_tx[104] = (cl_int2) {1463,1233};
    test_tx[105] = (cl_int2) {1414,1221};
    test_tx[106] = (cl_int2) {1433,1208};
    test_tx[107] = (cl_int2) {1422,1202};
    test_tx[108] = (cl_int2) {1415,1212};
    test_tx[109] = (cl_int2) {1396,1201};
    test_tx[110] = (cl_int2) {1424,1234};
    test_tx[111] = (cl_int2) {1406,1205};
    test_tx[112] = (cl_int2) {1420,1231};
    test_tx[113] = (cl_int2) {1418,1226};
    test_tx[114] = (cl_int2) {1392,1196};
    test_tx[115] = (cl_int2) {1404,1150};
    test_tx[116] = (cl_int2) {1416,1233};
    test_tx[117] = (cl_int2) {1390,1156};
    test_tx[118] = (cl_int2) {1435,1211};
    test_tx[119] = (cl_int2) {1417,1220};
    test_tx[120] = (cl_int2) {1450,1235};
    test_tx[121] = (cl_int2) {1445,1218};
    test_tx[122] = (cl_int2) {1482,1206};
    test_tx[123] = (cl_int2) {1436,1177};
    test_tx[124] = (cl_int2) {1428,1218};
    test_tx[125] = (cl_int2) {1415,1246};
    test_tx[126] = (cl_int2) {1423,1225};
    test_tx[127] = (cl_int2) {1407,1241};
    test_tx[128] = (cl_int2) {1459,1225};
    test_tx[129] = (cl_int2) {1385,1160};
    test_tx[130] = (cl_int2) {1416,1259};
    test_tx[131] = (cl_int2) {1394,1242};
    test_tx[132] = (cl_int2) {1472,1247};
    test_tx[133] = (cl_int2) {1454,1214};
    test_tx[134] = (cl_int2) {1417,1182};
    test_tx[135] = (cl_int2) {1434,1228};
    test_tx[136] = (cl_int2) {1439,1236};
    test_tx[137] = (cl_int2) {1436,1198};
    test_tx[138] = (cl_int2) {1422,1226};
    test_tx[139] = (cl_int2) {1388,1163};
    test_tx[140] = (cl_int2) {1451,1224};
    test_tx[141] = (cl_int2) {1461,1273};
    test_tx[142] = (cl_int2) {1391,1222};
    test_tx[143] = (cl_int2) {1462,1178};
    test_tx[144] = (cl_int2) {1379,1231};
    test_tx[145] = (cl_int2) {1404,1250};
    test_tx[146] = (cl_int2) {1422,1227};
    test_tx[147] = (cl_int2) {1488,1234};
    test_tx[148] = (cl_int2) {1423,1225};
    test_tx[149] = (cl_int2) {1406,1198};
    test_tx[150] = (cl_int2) {1415,1200};
    test_tx[151] = (cl_int2) {1415,1234};
    test_tx[152] = (cl_int2) {1404,1193};
    test_tx[153] = (cl_int2) {1392,1207};

    /* for each transmitter */
    for (i = 0; i < ntest_tx; i++)
    {
        /* display completion percentage */
        G_percent (i, ntest_tx, 1);

        /* Transmitter coordinates */
        cl_int2 tx_coord = test_tx[i];

        G_message ("The transmitter is located at %d, %d", tx_coord.x,
                                                           tx_coord.y);
        /* Transmitter heights */
        //G_message ("Transmitter height above sea level is %d mts", tx_elevation);
        //G_message ("Transmitter anthena height above ground is %d mts", tx_anthena_height);

        /* transmitter data */ 
        cl_int4 tx_data = (cl_int4) {tx_coord.x, tx_coord.y,
                                     tx_elevation, tx_anthena_height};

        /* tile offset within the raster */
        cl_int2 tx_offset = (cl_int2) {tx_coord.x - radius,
                                       tx_coord.y - radius};

        /* set (more) remaining kernel parameters */
        clarg_set (ctx, krn, 1, tx_data);
        clarg_set (ctx, krn, 2, tx_offset);

        /* start kernel execution */
        clfork (ctx, 0, krn, &ndr, CL_EVENT_NOWAIT);

        clwait (ctx, 0, CL_KERNEL_EVENT|CL_MEM_EVENT|CL_EVENT_RELEASE);
    }

    /* sync memory */
    clmsync (ctx, 0, qrm_out, CL_MEM_HOST|CL_EVENT_WAIT);

    /* write the calculation output */
    save_raster (qrm_out, nrows, ncols, outfd);

    /* free allocated resources */
    free (test_tx);
    clclose (ctx, h);
    clfree (qrm_out);
    clfree (dem_in);

}
int main()
{
   cl_uint n = 1024;

	// use default contexts, if no ACCELERATOR use CPU 
   CLCONTEXT* cp = (stdacc)? stdacc : stdcpu;

   unsigned int devnum = 0;

   cl_kernel krn = clsym(cp,0,"matvecmult_kern",0);

   // allocate matrix and vectors using clmulti_array 
	typedef clmulti_array<cl_float,1> array1_t;
	typedef clmulti_array<cl_float,2> array2_t;
	array2_t aa(boost::extents[n][n]);
	array1_t b(boost::extents[n]);
	array1_t c(boost::extents[n]);

   // initialize matrix a[] and vector b[], zero c[] 
   for(int i=0;i<n;i++) for(int j=0;j<n;j++) aa[i][j] = 1.1f*i*j;
   for(int i=0;i<n;i++) b[i] = 2.2f*i;
   for(int i=0;i<n;i++) c[i] = 0.0f;

   // attach the vectors to the STDCL context 
   aa.clmattach(cp);
   b.clmattach(cp);
   c.clmattach(cp);

   // define the computational domain and workgroup size 
   clndrange_t ndr = clndrange_init1d( 0, n, 16);

   // non-blocking sync vectors a and b to device memory (copy to GPU)
   aa.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   b.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   // set the kernel arguments
   clarg_set(cp,krn,0,n);
   aa.clarg_set_global(cp,krn,1);
   b.clarg_set_global(cp,krn,2);
   c.clarg_set_global(cp,krn,3);

   // non-blocking fork of the OpenCL kernel to execute on the GPU
   clfork(cp,devnum,krn,&ndr,CL_EVENT_NOWAIT);

   // non-blocking sync vector c to host memory (copy back to host)
   c.clmsync(cp,0,CL_MEM_HOST|CL_EVENT_NOWAIT);

   // force execution of operations in command queue, non-blocking call
   clflush(cp,devnum,0);

   // block on completion of all operations in the command queue
   clwait(cp,devnum,CL_ALL_EVENT);

   for(int i=0;i<n;i++) printf("%f %f\n",b[i],c[i]);


   ///////////////////////////////////////////////////////////
   ///// now resize the vectors by adding some more values ...
   ///////////////////////////////////////////////////////////
  
	n *= 3;
 
   // OPTIONAL: for better performance, detach containers from STDCL context
   aa.clmdetach();
   b.clmdetach();
   c.clmdetach();


   // increase size of vectors three-fold 
   // ... note that *all* boost multi_array operations are valid
	aa.resize(boost::extents[n][n]);
	b.resize(boost::extents[n]);
	c.resize(boost::extents[n]);
   for(int i=0;i<n;i++) for(int j=0;j<n;j++) aa[i][j] = 1.1f*i*j;
   for(int i=0;i<n;i++) b[i] = 2.2f*i;
   for(int i=0;i<n;i++) c[i] = 0.0f;


   // OPTIONAL: ... if you dettached the containers, you must re-attach them
   aa.clmattach(cp);
   b.clmattach(cp);
   c.clmattach(cp);


   // now follow same steps used above to sync memory, execute kernel, etc.

   aa.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);
   b.clmsync(cp,devnum,CL_MEM_DEVICE|CL_EVENT_NOWAIT);

   clndrange_t ndr_threefold = clndrange_init1d( 0, n, 64);
   
   clarg_set(cp,krn,0,n);
   aa.clarg_set_global(cp,krn,1);
   b.clarg_set_global(cp,krn,2);
   c.clarg_set_global(cp,krn,3);

   clfork(cp,devnum,krn,&ndr_threefold,CL_EVENT_NOWAIT);

   c.clmsync(cp,0,CL_MEM_HOST|CL_EVENT_NOWAIT);

	clflush(cp,devnum,0);

   clwait(cp,devnum,CL_ALL_EVENT);

   for(int i=0;i<n;i++) printf("%f %f\n",b[i],c[i]);

}