예제 #1
0
/*
 * Demo driver: builds a small ROWS x COLS integer matrix, hands it to
 * print(), runs transpose2() from the source matrix into a COLS x ROWS
 * destination, then hands the destination to print().
 *
 * Both matrices are passed as a pointer to their first element.
 * argc and argv are not used.  Always returns 0.
 */
int
main(int argc, char **argv)
{
   int src[ROWS][COLS] = {{1, 2, 3}, {4, 5, 6}};
   int dst[COLS][ROWS];

   print(&src[0][0], ROWS, COLS);

   /* Disabled alternative taking explicit dimensions:
    *    transpose(*src, *dst, ROWS, COLS);
    */
   transpose2(&src[0][0], &dst[0][0]);

   print(&dst[0][0], COLS, ROWS);

   return 0;
}
예제 #2
0
/*
 * Benchmark driver for the matrix transposition variants.
 *
 * Command line:
 *   -n EXP   base-2 logarithm of the matrix dimension; the matrix is
 *            (1 << EXP) x (1 << EXP) ints
 *   -v VAR   variant selector: 0 calls transpose1(), 1 calls transpose2()
 *
 * Allocates page-aligned source and destination matrices, fills the source,
 * zeroes the destination, flushes the cache and times one transposition.
 *
 * Returns 0 on success and 1 on bad arguments, on a matrix too large to
 * represent, or on allocation failure.
 */
int main(int argc, char **argv) {
  int opt, exp = -1, var = -1;
  bool err = false;

  while ((opt = getopt(argc, argv, "n:v:")) != -1) {
    if (opt == 'n')
      exp = atoi(optarg);
    else if (opt == 'v')
      var = atoi(optarg);
    else
      err = true;
  }

  if (err || exp < 0 || var < 0 || var >= 2) {
    fprintf(stderr, "Usage: %s -n log2(size) -v variant\n", argv[0]);
    return 1;
  }

  /* 1 << exp is undefined once the result no longer fits in an int.
   * POSIX fixes CHAR_BIT at 8, so an int has sizeof(int) * 8 bits. */
  bool too_large = exp >= (int)(sizeof(int) * 8) - 1;
  int n = too_large ? 0 : 1 << exp;
  size_t dim = (size_t)n;

  /* Do the byte-count arithmetic in size_t and refuse sizes that wrap. */
  if (!too_large && dim > ((size_t)-1 / sizeof(int)) / dim)
    too_large = true;

  if (too_large) {
    fprintf(stderr, "%s: matrix dimension 2^%d is too large\n", argv[0], exp);
    return 1;
  }

  size_t size = dim * dim * sizeof(int);
  size_t align = (size_t)getpagesize();

  /* posix_memalign() wants a real void **; go through void * temporaries
   * instead of casting the address of an int *. */
  void *src_mem = NULL, *dst_mem = NULL;

  if (posix_memalign(&src_mem, align, size) != 0) {
    fprintf(stderr, "%s: cannot allocate %zu bytes for the source matrix\n",
            argv[0], size);
    return 1;
  }
  if (posix_memalign(&dst_mem, align, size) != 0) {
    fprintf(stderr, "%s: cannot allocate %zu bytes for the destination matrix\n",
            argv[0], size);
    free(src_mem);
    return 1;
  }

  int *src = src_mem, *dst = dst_mem;

  printf("Generate matrix %d x %d (%zu KiB)\n", n, n, size >> 10);

  fill(src, n);
  bzero(dst, size);
  flush_cache();

  printf("Performing matrix transposition.\n");

  _timer_t timer;
  timer_reset(&timer);
  timer_start(&timer);
  if (var == 0)
    transpose1(dst, src, n);
  else
    transpose2(dst, src, n);
  timer_stop(&timer);
  timer_print(&timer);

  free(src);
  free(dst);

  return 0;
}
예제 #3
0
/*
 * Test driver comparing a blocked 2d transpose on the host with a 2d
 * transpose on the GPU.
 *
 * Uses fixed sizes: an nx x ny = 512 x 512 float array, host block sizes
 * mx = my = 16 and a GPU block size of 64.  Prints the time reported by
 * dtimer() for an empty kernel, for the host transpose and for the GPU
 * transpose, then the largest absolute difference between the host result
 * (a2) and the result copied back from the GPU (c2).
 *
 * argc and argv are not used.  Returns 0 on success; exits with status 1
 * if host allocation, CUDA initialization or GPU allocation fails.
 */
int main(int argc, char *argv[]) {
   int nx = 512, ny = 512, mx = 16, my = 16;
   int nblock = 64;
   int j, k, irc;
   float eps, epsmax;
   double dtime;
   struct timeval itime;
   float *a2 = NULL, *b2 = NULL, *c2 = NULL;
   float *g_a2 = NULL, *g_b2 = NULL;
   size_t nelem = (size_t) nx * (size_t) ny;
/* allocate host data */
   a2 = malloc(nelem * sizeof *a2);
   b2 = malloc(nelem * sizeof *b2);
   c2 = malloc(nelem * sizeof *c2);
/* the initialization loop below writes through a2 and b2 unconditionally */
   if (a2 == NULL || b2 == NULL || c2 == NULL) {
      printf("Host allocate error!\n");
      free(a2);
      free(b2);
      free(c2);
      exit(1);
   }

/* set up GPU */
   irc = 0;
   setgbsize(nblock);
   init_cu(0,&irc);
   if (irc != 0) {
      printf("CUDA initialization error!\n");
      exit(1);
   }

/* allocate 2d data on GPU */
/* irc is checked after each call in case a later call overwrites an */
/* error reported by an earlier one */
   gpu_fallocate(&g_a2,ny*nx,&irc);
   if (irc != 0) {
      printf("GPU allocate error!\n");
      exit(1);
   }
   gpu_fallocate(&g_b2,nx*ny,&irc);
   if (irc != 0) {
      printf("GPU allocate error!\n");
      exit(1);
   }

/* initialize 2d data on host: b2 holds 1..nx*ny, a2 is cleared */
   for (k = 0; k < ny; k++) {
      for (j = 0; j < nx; j++) {
         b2[j+nx*k] = (float) (j + nx*k + 1);
         a2[k+ny*j] = 0.0;
      }
   }
   gpu_fcopyin(a2,g_a2,ny*nx);

/* measure overhead time by running empty kernel */
   dtimer(&dtime,&itime,-1);
   emptykernel();
   dtimer(&dtime,&itime,1);
   printf("C empty kernel time=%e\n",(float)dtime);

/* segmented 2d transpose on host with block size mx, my */
   dtimer(&dtime,&itime,-1);
/* transpose0(a2,b2,nx,ny); */
   transpose2(a2,b2,mx,my,nx,ny);
   dtimer(&dtime,&itime,1);
   printf("C 2d transpose time=%e\n",(float)dtime);

/* 2d transpose on GPU with block size mx, mx */
   gpu_fcopyin(b2,g_b2,nx*ny);
   dtimer(&dtime,&itime,-1);
   gpu_transpose2(g_a2,g_b2,mx,nx,ny);
   dtimer(&dtime,&itime,1);
   printf("GPU 2d transpose time=%e\n",(float)dtime);
   gpu_fcopyout(c2,g_a2,nx*ny);

/* Check for correctness: compare a2 (host) with c2 (copy of g_a2) */
   epsmax = 0.0;
   for (k = 0; k < ny; k++) {
      for (j = 0; j < nx; j++) {
         eps = a2[j+nx*k] - c2[j+nx*k];
         if (eps < 0.0)
            eps = -eps;
         if (eps > epsmax)
            epsmax = eps;
      }
   }
   printf("2d transpose maximum difference = %e\n",epsmax);

/* deallocate memory on GPU */
   gpu_deallocate((void *)g_a2,&irc);
   gpu_deallocate((void *)g_b2,&irc);
/* close down GPU */
   end_cu();

/* deallocate host data */
   free(a2);
   free(b2);
   free(c2);

   return 0;
}
예제 #4
0
/*
 * Return a buffer holding nelem elements of elsize bytes each.
 *
 * If buf is non-NULL and *count already equals nelem, buf is returned
 * unchanged with its previous contents.  Otherwise buf is freed and a new
 * zero-filled buffer is allocated.  *count is updated to the element count
 * of the returned buffer, or to 0 when allocation fails and NULL is
 * returned.
 */
static void *
rotate_get_buffer (void *buf, size_t *count, size_t nelem, size_t elsize)
{
    if (buf != NULL && *count == nelem)
        return buf;

    free (buf);
    buf = calloc (nelem, elsize);
    *count = (buf != NULL) ? nelem : 0;

    return buf;
}

/*
 * Set up the input/output pointers for one flip or transpose of an
 * nx x ny raster and dispatch to the routine selected by dir.
 *
 *   iadr  input pixels; stored in the file-scope input pointer that
 *         matches type (ubip, usip or ulip)
 *   oadr  receives the address of the output buffer
 *   type  element size selector: 1 = uchar, 2 = ushort, 4 = ulong;
 *         any other value sets up no buffers and leaves *oadr untouched
 *   nx,ny raster dimensions
 *   dir   1 = flipx, 2 = flipy, 3 = transpose1, 4 = transpose2;
 *         any other value performs no operation
 *
 * The output buffer is owned by this function: it is cached in a static
 * pointer per element type, reused while nx*ny stays the same for that
 * type, and freed and reallocated when it changes.  Callers must not free
 * it, and a pointer obtained from an earlier call may be invalidated by a
 * later call with a different size.
 *
 * If the output buffer cannot be allocated, *oadr is set to NULL and the
 * function returns without calling the flip/transpose routine.
 */
void
rotate (void *iadr, XLONG **oadr, int type, int nx, int ny, int dir)
{
    /* One cached buffer and element count per type, so that alternating
     * between types or sizes can never hand out a missing or stale-sized
     * buffer. */
    static uchar  *bufc = NULL;
    static ushort *bufs = NULL;
    static ulong  *bufl = NULL;
    static size_t  cntc = 0, cnts = 0, cntl = 0;

    size_t nelem = (size_t) (nx * ny);

    switch (type) {

    case (1):			/* uchar */
        ubip = (uchar *) iadr;
        bufc = rotate_get_buffer (bufc, &cntc, nelem, sizeof(uchar));
        ubop = bufc;
        *oadr = (XLONG *) bufc;
        if (bufc == NULL)
            return;
        break;
    case (2):			/* ushort */
        usip = (ushort *) iadr;
        bufs = rotate_get_buffer (bufs, &cnts, nelem, sizeof(ushort));
        usop = bufs;
        *oadr = (XLONG *) bufs;
        if (bufs == NULL)
            return;
        break;
    case (4):			/* ulong */
        ulip = (ulong *) iadr;
        bufl = rotate_get_buffer (bufl, &cntl, nelem, sizeof(ulong));
        ulop = bufl;
        *oadr = (XLONG *) bufl;
        if (bufl == NULL)
            return;
        break;
    default:
        break;
    }

    switch (dir) {
    case (1):			/* flip x axis */
        flipx(type, nx, ny);
        break;
    case (2):			/* flip y axis */
        flipy(type, nx, ny);
        break;
    case (3):			/* transpose 1st diag */
        transpose1(type, nx, ny);
        break;
    case (4):			/* transpose 2nd diag */
        transpose2(type, nx, ny);
        break;
    default:
        break;
    }
}