Beispiel #1
0
int main (int argv, char **argc)
{
    btest=(double*)malloc(sizeof(double)*IDA);
    atest=(double*)malloc(sizeof(double)*IDA);
#ifdef __NVCUDA__
	acc_init( acc_device_nvcuda );
#endif 
#ifdef __NVOPENCL__
	acc_init( acc_device_nvocl );
	//acc_list_devices_spec( acc_device_nvocl );
#endif 



	printf(" *******************************************************\n"); 

	delaylength = 500;
	innerreps = 100;
	// GENERATE REFERENCE TIME 
	//	refer();   

	// TEST  PRIVATE 
	//estprivnew(); 

	// TEST  FIRSTPRIVATE 
	//estfirstprivnew(); 

#ifdef OMPVER2
	// TEST  COPYPRIVATE / 
	//estcopyprivnew(); 
#endif

	//TEST  COPYIN 
	copyintest(); 

	//  TEST  COPYOUT 
	copyouttest();

	//  TEST  CREATE 
	createtest();


	// TEST  REDUCTION 
	reductiontest(); 

	// TEST  REDUCTION 
	kerneltest(); 


	// TEST  PRIVATE  
	//	privatetest(); 



	delaylength = 500;
	innerreps = 100;

} 
Beispiel #2
0
int
main (int argc, char **argv)
{
    void *d;
    acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
    devtype = acc_device_nvidia;

    if (acc_get_num_devices (acc_device_nvidia) == 0)
        return 0;
#endif

    acc_init (devtype);

    d = acc_malloc (0);
    if (d != NULL)
        abort ();

    acc_free (0);

    acc_shutdown (devtype);

    acc_set_device_type (devtype);

    d = acc_malloc (0);
    if (d != NULL)
        abort ();

    acc_shutdown (devtype);

    acc_init (devtype);

    d = acc_malloc (1024);
    if (d == NULL)
        abort ();

    acc_free (d);

    acc_shutdown (devtype);

    acc_set_device_type (devtype);

    d = acc_malloc (1024);
    if (d == NULL)
        abort ();

    acc_free (d);

    acc_shutdown (devtype);

    return 0;
}
Beispiel #3
0
//!
//! @brief This function initializes the hardware/software resources
//! required for device mouse HID task.
//!
void device_mouse_hid_task_init(void)
{
  sof_cnt = 0;

#if BOARD == EVK1101
  // Initialize accelerometer driver
  acc_init();
#endif

#ifndef FREERTOS_USED
  #if USB_HOST_FEATURE == true
  // If both device and host features are enabled, check if device mode is engaged
  // (accessing the USB registers of a non-engaged mode, even with load operations,
  // may corrupt USB FIFO data).
  if (Is_usb_device())
  #endif  // USB_HOST_FEATURE == true
    Usb_enable_sof_interrupt();
#endif  // FREERTOS_USED

#ifdef FREERTOS_USED
  xTaskCreate(device_mouse_hid_task,
              configTSK_USB_DHID_MOUSE_NAME,
              configTSK_USB_DHID_MOUSE_STACK_SIZE,
              NULL,
              configTSK_USB_DHID_MOUSE_PRIORITY,
              NULL);
#endif  // FREERTOS_USED
}
Beispiel #4
0
int main(int argc, char **argv)
{   
    /* 模块初始化 */
    exc_init();                                          /* 中断初始化 */
    sys_timer_init();                                    /* 系统时钟初始化 */
    light_init();                                        /* LED灯初始化 */
    switch_init();                                       /* 开关初始化 */
    speaker_init();                                      /* 蜂鸣器初始化 */
    motor_init();                                        /* 电机初始化 */
    decoder_init();                                      /* 编码器初始化 */
    gyro_init();                                         /* 陀螺仪初始化 */
    acc_init();                                          /* 加速度传感器初始化 */
    serial_initialize((intptr_t)(NULL));                 /* 初始化串口 */
    //sd_init(&Fatfs);                                     /* 初始化SD卡,并创建文件 */
    //sd_create_file(&test_data, test_data_name);
    
    /* 命令注册 */
    help_cmd_initialize((intptr_t)(NULL));
    light_cmd_initialize((intptr_t)(NULL));
    switch_cmd_initialize((intptr_t)(NULL));
    speaker_cmd_initialize((intptr_t)(NULL));
    motor_cmd_initialize((intptr_t)(NULL));
    decoder_cmd_initialize((intptr_t)(NULL));
    //sd_cmd_initialize((intptr_t)(NULL));

    printf("\n Welcome to k60 software platform!");
    printf("\n Press 'help' to get the help! \n");
    
    light_open(LIGHT4); 

    /* ntshell测试 */
    task_ntshell((intptr_t)(NULL));
}
Beispiel #5
0
void board_init(void)
{
#ifdef KEY_RC_BOARD
    /* On board Button initialization */
	ioport_configure_pin(BUTTON_IRQ_PIN_1,IOPORT_DIR_INPUT | IOPORT_PULL_UP);
	ioport_configure_pin(BUTTON_IRQ_PIN_2,IOPORT_DIR_INPUT | IOPORT_PULL_UP);
	ioport_configure_pin(BUTTON_IRQ_PIN_3,IOPORT_DIR_INPUT | IOPORT_PULL_UP);
	
    set_button_pins_for_normal_mode();

    /* Initialize the IRQ lines' interrupt behaviour. */
    DISABLE_ALL_BUTTON_IRQS();
		
	/* LED Init */
	/* LCD initialization for inactive use */    
    /* On board LED initialization */
	ioport_configure_pin(LCD_CS_ON_BOARD,	
	IOPORT_DIR_OUTPUT |  IOPORT_INIT_HIGH);
	
	ioport_set_port_dir(IOPORT_PORTE,KEY_RC_IO_MASK,IOPORT_DIR_OUTPUT);
	ioport_set_port_level(IOPORT_PORTE,KEY_RC_IO_MASK,KEY_RC_IO_MASK);
	ioport_set_pin_dir(IOPORT_CREATE_PIN(PORTG , 2),IOPORT_DIR_INPUT);
	ioport_set_pin_mode(IOPORT_CREATE_PIN(PORTG , 2), IOPORT_MODE_PULLUP);	

    LATCH_INIT();   
	/* Init ADC for the Accelerometer */
	 adc_init();

	// LATCH_INIT();
	 /* Enable Accelerometer by enabling the PWR pin in the Latch */
	 acc_init();
 
	update_latch_status();   

    /* Apply latch pulse to set LED status */
    pulse_latch();
	
#else
    /* To identify if it is a plain or STB*/
    board_identify();

	/* On board LED initialization */
	ioport_configure_pin(LED0_RCB,IOPORT_DIR_OUTPUT |  IOPORT_INIT_HIGH);
	ioport_configure_pin(LED1_RCB,IOPORT_DIR_OUTPUT |  IOPORT_INIT_HIGH);

	ioport_configure_pin(LED2_RCB,IOPORT_DIR_OUTPUT |  IOPORT_INIT_HIGH);

	/* On board Switch initialization */
	ioport_configure_pin(GPIO_PUSH_BUTTON_0,IOPORT_DIR_INPUT | IOPORT_PULL_UP);      

#ifdef BREAKOUT_BOARD
  //Enable RCB_BB RS232 level converter
	ioport_set_port_dir(IOPORT_PORTD,BB_SIO_MASK,IOPORT_DIR_OUTPUT);
	ioport_set_port_level(IOPORT_PORTD,BB_SIO_MASK,BB_SIO_VAL);
#endif
#endif

}
Beispiel #6
0
int
main (int argc, char **argv)
{
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (devtype) == 0)
    return 0;
#endif

  acc_init (devtype);

  acc_init (devtype);

  return 0;
}
Beispiel #7
0
int main()
{
	int i, j, k;
	TYPE sum, known_sum;
	int NI;
    int error;
	TYPE *input;
    TYPE rounding_error = 1.E-9;
    struct timeval tim;
    double start, end;
    
	NI = 1<<20;

    error = 0;
	input = (TYPE*)malloc(NI*sizeof(TYPE));
    acc_init(acc_device_default);

    srand((unsigned)time(0));
	for(i=0; i<NI; i++)
	{
		input[i] = (TYPE)rand()/(TYPE)RAND_MAX + 0.1;
	}
	
    sum = 0;
    gettimeofday(&tim, NULL);
    start = tim.tv_sec*1000 + (tim.tv_usec/1000.0);
  #pragma acc parallel copyin(input[0:NI]) \
                       num_gangs(192) \
                       num_workers(1) \
                       vector_length(128) 
  {
    #pragma acc loop gang vector reduction(+:sum)
	for(i=0; i<NI; i++)
	    sum += input[i];
  }
    gettimeofday(&tim, NULL);
    end = tim.tv_sec*1000 + (tim.tv_usec/1000.0);
	
    known_sum = 0;
	for(i=0; i<NI; i++)
	{
		known_sum += input[i];
	}
	
    if(fabs(sum - known_sum) > rounding_error)
    {
        error++;
		printf("same_gang_vector + FAILED! sum=%d, known_sum=%d\n", sum, known_sum);
    }
    
    printf("same_gang_vector + execution time is :%.2lf: ms\n", end-start);

    if(error == 0)
        printf("same_gang_vector + SUCCESS!\n");

    free(input);
}
Beispiel #8
0
int main(){
    int SIZE=1024;
    float *a=(float*)malloc(sizeof(float)*SIZE);
    float *b=(float*)malloc(sizeof(float)*SIZE);
    float *c=(float*)malloc(sizeof(float)*SIZE);
    int i,k,j;

    #ifdef __NVCUDA__
    acc_init( acc_device_nvcuda );
    #endif 
    #ifdef __NVOPENCL__
    acc_init( acc_device_nvocl );
    //acc_list_devices_spec( acc_device_nvocl );
    #endif 

    #pragma acc kernels create(a[0:SIZE],b[0:SIZE],c[0:SIZE]) copyout(a[0:SIZE],b[0:SIZE],c[0:SIZE])
    {
        #pragma acc loop independent
        for(i=0; i<SIZE; i++){
            a[i]=i;
            b[i]=SIZE-i;
        }
    }

    #pragma acc kernels present(a,b,c) copyout(c[0:SIZE])
    {
        #pragma acc loop independent
        for(i=0; i<SIZE; i++){
            c[i]=a[i]+b[i];
        }
    }

    for(i=0; i<SIZE; i++){
        if(c[i]!=(a[i]+b[i])){
            fprintf(stderr,"failed to perform the sum at %d. %6.4f + %6.4f =%6.4f\n",i,a[i],b[i],c[i]);
            return -1;
        }
    }
    fprintf(stderr,"Multiple kernels test was successfull!\n");
    return 0;
}
Beispiel #9
0
attribute_hidden void
goacc_lazy_initialize (void)
{
  struct goacc_thread *thr = goacc_thread ();

  if (thr && thr->dev)
    return;

  if (!cached_base_dev)
    acc_init (acc_device_default);
  else
    goacc_attach_host_thread_to_device (-1);
}
Beispiel #10
0
/**
 * \brief Initialize re200b sensor in order to detect motion
 * \param ul_acc_minus ACC minus input, use ACC peripheral definition in header.
 * \param ul_acc_plus ACC plus input, use ACC peripheral definition in header.
 */
void re200b_motion_detect_init(uint32_t ul_acc_minus, uint32_t ul_acc_plus)
{
	pmc_enable_periph_clk(ID_ACC);

	/* Initialize ACC */
	acc_init(ACC, ul_acc_plus, ul_acc_minus, ACC_MR_EDGETYP_ANY, ACC_MR_INV_DIS);

	/* clear status */
	acc_get_interrupt_status(ACC);

	/* reset event flags */
	g_compare_result = CMP_EQUAL;
	g_ul_compare_event_flag = false;
}
Beispiel #11
0
/*******************************************************************************
* Function Name  : Axis3_Test
* Description    : Light Sensor Test.
* Input          : None
* Output         : None
* Return         : None
*******************************************************************************/
void Axis3_Test(void)
{
  char		buf[24];
  int32_t	xoff = 0;
  int32_t	yoff = 0;
  int32_t	zoff = 0;
  int8_t	x = 0;
  int8_t	y = 0;
  int8_t	z = 0;

   
  OLED_ClearScreen();
  OLED_DisStrLine(0, 0, "Axis-3");
  

  I2CInit(I2CMASTER, 0);

  acc_init();
  /* Assume base board in zero-g position when reading first value. */
  acc_read(&x, &y, &z);
  xoff = 0-x;
  yoff = 0-y;
  zoff = 0-z;

 // while(1)
 // {	
    /* Accelerometer */
    acc_read(&x, &y, &z);
    x = x+xoff;
    y = y+yoff;
    z = z+zoff;

    snprintf(buf, 20, "Acc x: %d  ", x);
    OLED_DisStrLine(2, 0, (uint8_t *)buf);
	printf("\r\nAcc x: %d,  ", x);

	snprintf(buf, 20, "Acc y: %d  ", y);
    OLED_DisStrLine(3, 0, (uint8_t *)buf);
	printf("Acc y: %d,  ", y);

	snprintf(buf, 20, "Acc z: %d  ", z);
    OLED_DisStrLine(4, 0, (uint8_t *)buf);
	printf("Acc z: %d. ", z);
	
	delay_ms(250);

  //  if(KEY_Read() == KEY_ESC)
  //	 break;
 // }
}
Beispiel #12
0
int main()
{
#ifdef __NVCUDA__
    acc_init( acc_device_nvcuda );
#endif
#ifdef __NVOPENCL__
    acc_init( acc_device_nvocl );
    //acc_list_devices_spec( acc_device_nvocl );
#endif

    float a[SIZE], b[SIZE], c[SIZE];
    int i;
    for(i=0; i<2*SIZE; i++)
    {
        a[i%SIZE]=i;
        b[i%SIZE]=a[(i*13)%SIZE];
        c[i%SIZE]=0;
    }

#pragma acc kernels copyin(a,b) copyout(c)
#pragma acc loop independent
    for(int i=0; i<SIZE; i++)
    {
        c[i] = funct (a[i], b[i]);
        //c[i] = a[i]>b[i]?a[i]:b[i];
    }

    for (i = 0; i < SIZE; ++i) {
        if(c[i]!= funct(a[i],b[i])) {
            fprintf(stderr,"Error %d %16.10f!=%16.10f \n", i, c[i], funct(a[i],b[i]));
            return -1;
        }
    }
    fprintf(stderr,"'function call in kernels region' test was successful!\n");
    return 0;
}
Beispiel #13
0
int
main (int argc, char **argv)
{
  float atime;
  CUstream stream;
  CUresult r;

  acc_init (acc_device_nvidia);

  (void) acc_get_device_num (acc_device_nvidia);

  init_timers (1);

  stream = (CUstream) acc_get_cuda_stream (0);
  if (stream != NULL)
    abort ();

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  if (!acc_set_cuda_stream (0, stream))
    abort ();

  start_timer (0);

  acc_wait_all_async (0);

  acc_wait (0);

  atime = stop_timer (0);

  if (0.010 < atime)
    {
      fprintf (stderr, "actual time too long\n");
      abort ();
    }

  fini_timers ();

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
int main()
{
	int i;
	int error;
	int NI;
	int *input;
    int sum;

	NI = 2048;

    acc_init(acc_device_default);
	input = (int*)malloc(NI*sizeof(int));

	for(i=0; i<NI; i++)
	{
	    input[i] = i%10;
	}

    sum = 0;
  #pragma acc parallel copyin(input[0:NI])
  {
    #pragma acc loop gang worker vector reduction(+:sum)
    for(i=0; i<NI; i++)
	{
	    sum += input[i];
	}
  }

	error = 0;
	
    int known_sum = 0;
	for(i=0; i<NI; i++)
	    known_sum += input[i];

	if(known_sum != sum)
		error++;

	free(input);

	printf("Test for same line gang worker vector reduction\n");
	if(error == 0)
		printf("SUCCESSFUL!\n");
	else
		printf("Reduction + FAILED! error=%d\n", error);

}
Beispiel #15
0
int
main (int argc, char **argv)
{
  CUstream stream;
  CUresult r;
  struct timeval tv1, tv2;
  time_t t1;

  acc_init (acc_device_nvidia);

  stream = (CUstream) acc_get_cuda_stream (0);
  if (stream != NULL)
    abort ();

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuStreamCreate failed: %d\n", r);
      abort ();
    }

  if (!acc_set_cuda_stream (0, stream))
    abort ();

  gettimeofday (&tv1, NULL);

  acc_wait_all_async (0);

  acc_wait (0);

  gettimeofday (&tv2, NULL);

  t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec);

  if (t1 > 1000)
    {
      fprintf (stderr, "too long\n");
      abort ();
    }

  acc_shutdown (acc_device_nvidia);

  exit (0);
}
Beispiel #16
0
int
main (int argc, char **argv)
{
    const int N = 256;
    int i;
    unsigned char *h;
    void *d;

    acc_init (acc_device_nvidia);

    h = (unsigned char *) malloc (N);

    for (i = 0; i < N; i++)
    {
        h[i] = i;
    }

    d = acc_malloc (N);

    acc_memcpy_to_device (d, h, N);

    memset (&h[0], 0, N);

    acc_memcpy_to_device (d, h, N << 1);

    acc_memcpy_from_device (h, d, N);

    for (i = 0; i < N; i++)
    {
        if (h[i] != i)
            abort ();
    }

    acc_free (d);

    free (h);

    acc_shutdown (acc_device_nvidia);

    return 0;
}
Beispiel #17
0
int main(int argc, char *argv[])
{
    int n;
    double *A, *B, *C;
    double start, end;
  	struct timeval tim;

    if (argc != 2) {
        fprintf(stderr, "Usage: matmul <n>\n");
        exit(1);
    }
    n = atoi(argv[1]);

    A = malloc(n * n * sizeof(double));
    B = malloc(n * n * sizeof(double));
    C = malloc(n * n * sizeof(double));

    initA(A, n);
    initB(B, n);
    initC(C, n);
    //verify(A, n);
    //verify(B, n);

	acc_init(acc_device_default);

    /* sequential run */
    gettimeofday(&tim, NULL);
    start = tim.tv_sec + (tim.tv_usec/1000000.0);
    iter_matmul(A, B, C, n);
    gettimeofday(&tim, NULL);
    end = tim.tv_sec + (tim.tv_usec/1000000.0);

	printf("Execution time is: %.2f s\n", end-start);
	
	verify(C, n);

    free(C);
    free(B);
    free(A);
    return 0;
}
Beispiel #18
0
int
main (int argc, char **argv)
{
  acc_device_t devtype = acc_device_host;

#if ACC_DEVICE_TYPE_nvidia
  devtype = acc_device_nvidia;

  if (acc_get_num_devices (acc_device_nvidia) == 0)
    return 0;
#endif

  acc_init (devtype);

  acc_shutdown (devtype);

  fprintf (stderr, "CheCKpOInT\n");
  acc_shutdown (devtype);

  return 0;
}
Beispiel #19
0
/*! \brief This is an example demonstrating the accelerometer
 *         functionalities using the accelerometer driver.
 */
int main(void)
{
  volatile unsigned long i;

  // switch to oscillator 0
  pm_switch_to_osc0(&AVR32_PM, FOSC0, OSC0_STARTUP);

  // Initialize the debug USART module.
  init_dbg_rs232(FOSC0);

  acc_init();

  // do a loop
  for (;;)
  {
    // slow down operations
    for ( i=0 ; i < 50000 ; i++);

    // display a header to user
    print_dbg("\x1B[2J\x1B[H\r\n\r\nAccelerometer Example\r\n\r\n");

    // Get accelerometer acquisition and process data
    acc_update();

    // test for fast or slow changes
    // depending on that, play with angles or moves
    if ( is_acc_slow() )
    {
      print_mouse() ;
      print_angles() ;
    }
    else
      print_move() ;

    // MEUH
    // only text here , needs to be combined with PWM
    // meuh_stop is the "end" signal from PWM
    meuh_en = is_acc_meuh( meuh_stop ) ;
  }
}
Beispiel #20
0
int
newepoch_init(epoch_item_t **ep, setup_params_t *setup)
{
     epoch_item_t *out = NULL;

      *ep = malloc(sizeof(epoch_item_t));
      out = *ep;

      if(!out) {
            pbgp_error("newepoch_init :: %s\n",strerror(errno));
            return -1;
      }
      bzero(out,sizeof(epoch_item_t));
      acc_init(&out->acc,setup->pairing);
      
      mpz_init(out->s_new); 
      mpz_init(out->s_rvk); 
      mpz_init(out->s_acc); 

      return 0;

}
Beispiel #21
0
static void initGPU(int argc, char** argv) {
        // gets the device id (if specified) to run on
        int devId = -1;
        if (argc > 1) {
                devId = atoi(argv[1]);
                int devCount = acc_get_num_devices(acc_device_nvidia);
                if (devId < 0 || devId >= devCount) {
                        printf("The specified device ID is not supported.\n");
                        exit(1);
                }
        }
        if (devId != -1) {
                acc_set_device_num(devId, acc_device_nvidia);
        }
        // creates a context on the GPU just to
        // exclude initialization time from computations
        acc_init(acc_device_nvidia);

        // print device id
        devId = acc_get_device_num(acc_device_nvidia);
        printf("Running on GPU with ID %d.\n\n", devId);

}
Beispiel #22
0
int main(int argc, char* argv[])
{
	int i, j, k, it;
    //FILE *fp;
	//int fd;
	real mean;
	struct timeval tim;
	double start, end;

	if (argc != 5)
	{
		printf("Usage: %s <nx> <ny> <ns> <nt>\n", argv[0]);
		exit(1);
	}

	srand(17);
	parse_arg(nx, argv[1]);
	parse_arg(ny, argv[2]);
	parse_arg(ns, argv[3]);
	parse_arg(nt, argv[4]);

	real alpha = real_rand();
	real beta = real_rand();

	printf("alpha = %f, beta = %f\n", alpha, beta);

	unsigned int szarray = (unsigned int)nx * ny * ns;
	unsigned int szarrayb = szarray * sizeof(real);

	real* w0 = (real*)malloc(szarrayb);
	real* w1 = (real*)malloc(szarrayb);

	if (!w0 || !w1)
	{
		printf("Error allocating memory for arrays: %p, %p\n", w0, w1);
		exit(1);
	}

	for (i = 0; i < szarray; i++)
	{
		w0[i] = real_rand();
		w1[i] = real_rand();
	}

	// 1) Perform an empty offload, that should strip
	// the initialization time from further offloads.
	acc_init(acc_device_default);
	
	gettimeofday(&tim, NULL);
	start = tim.tv_sec + (tim.tv_usec/1000000.0);

	// 2) Allocate data on device, but do not copy anything.
	#pragma acc data create (w0[0:szarray], w1[0:szarray])
	{
		// 3) Transfer data from host to device and leave it there,
		// i.e. do not allocate deivce memory buffers.
		#pragma acc update device(w0[0:szarray], w1[0:szarray])
		// 4) Perform data processing iterations, keeping all data
		// on device.
		{
			for (it = 0; it < nt; it++)
			{
				laplacian(nx, ny, ns, alpha, beta, w0, w1);
				real* w = w0; w0 = w1; w1 = w;
			}
		}

		// 5) Transfer output data back from device to host.
		#pragma acc update host (w0[0:szarray])
	}
	
	gettimeofday(&tim, NULL);
    end = tim.tv_sec + (tim.tv_usec/1000000.0);
#if 0	
	fd = creat(argv[5], 00666);
	fd = open(argv[5], O_WRONLY);
	write(fd, w0, szarrayb);
	close(fd);
    fp = fopen(argv[5], "w");
    fprintf(fp, "%d %d %d\n", ns, ny, nx);
	for(k = 0; k < ns; k++){
    	for (j = 0; j < ny; j++) {
        	for (i = 0; i < nx; i++) {
            	fprintf(fp, "%d %d %d %f\n", k, j, i, w0[k*nx*ny + j*nx + i]);
        	}
    	}
	}
    fclose(fp);
#endif
	
	mean = 0.0f;
	for (i = 0; i < szarray; i++)
		mean += w0[i];

	printf("Final mean = %f\n", mean/szarray);
	printf("Time for computing: %.2f s\n",end-start);

	free(w0);
	free(w1);

	return 0;
}
Beispiel #23
0
int main()
{
	int i, j, k;
	TYPE product, known_product;
	int NI, NJ, NK;
    int error;
	TYPE *input, *temp;
    struct timeval tim;
    double start, end;

	NK = 32;
	NJ = 1<<10;
	NI = 1<<10;

    error = 0;

	input = (TYPE*)malloc(NK*NJ*NI*sizeof(TYPE));
	temp = (TYPE*)malloc(NK*sizeof(TYPE));

    acc_init(acc_device_default);

    srand((unsigned)time(0));
	for(k=0; k<NK; k++)
	{
		for(j=0; j<NJ; j++)
		{
			for(i=0; i<NI; i++)
            {
				input[k*NJ*NI + j*NI + i] = rand()%5 + 1;
            }
		}
	}
  
    gettimeofday(&tim, NULL);
    start = tim.tv_sec*1000 + (tim.tv_usec/1000.0);
 #pragma acc parallel copyin(input[0:NK*NJ*NI]) \
  					  copyout(temp[0:NK]) 
  {
    #pragma acc loop gang
    for(k=0; k<NK; k++)
	{
        TYPE j_product = 1;
		#pragma acc loop worker reduction(*:j_product)
		for(j=0; j<NJ; j++)
		{
			#pragma acc loop vector
			for(i=0; i<NI; i++)
				j_product *= input[k*NJ*NI + j*NI + i];
		}
	    temp[k] = j_product;
	}
  }
    gettimeofday(&tim, NULL);
    end = tim.tv_sec*1000 + (tim.tv_usec/1000.0);
	
    for(k=0; k<NK; k++)
	{
		TYPE j_product = 1;
		for(j=0; j<NJ; j++)
		{
			for(i=0; i<NI; i++)
			    j_product *= input[k*NJ*NI + j*NI + i];
		}
		if(temp[k] != j_product)
        {
			error++;
		    printf("worker_vector * FAILED!\n");
        }
	}
    
    printf("worker_vector * execution time is :%.2lf: ms\n", end-start);

    if(error == 0)
        printf("worker_vector * SUCCESS!\n");

    free(input);
    free(temp);
}
Beispiel #24
0
int main()
{
    int i=0;

    #ifdef __NVCUDA__
    acc_init( acc_device_nvcuda );
    #endif 
    #ifdef __NVOPENCL__
    acc_init( acc_device_nvocl );
    //acc_list_devices_spec( acc_device_nvocl );
    #endif 
    
    size_t free=-1, total=-1;
    acc_get_mem_info(&free, &total);
    printf("device memory info> free/total %d/%d [%6.4f percent free]\n",free,total,free/(float)total*100);

    float a[SIZE];
    float b[SIZE];
    float c[SIZE];

    // Initialize matrices.
    for (i = 0; i < SIZE; ++i) {
        //B
        a[i] = (float)i ;
        b[i] = (float)2*i;
        c[i] = 0.0f;
    }// B

    unsigned long long int tic, toc;
    // Compute vector Add
    float sum=0, maX;
    int k;
    
#pragma acc enter data copyin(a,b) create(c)

    for(k=0; k<3; k++){
        printf("Calculation on GPU ... ");
        tic = clock();
        sum=0;
        maX=-1;
        #pragma acc kernels present(a,b,c)
        #pragma acc loop independent 
        for (i = 0; i < SIZE; ++i) {
            float x=0;
            x = a[i] + b[i] ;
            c[i] = x;
        }
        toc = clock();
        printf(" %6.4f ms\n",(toc-tic)/(float)1000);
    }

#pragma acc exit data copyout(c)

    // ****************
    // double-check the OpenACC result sequentially on the host
    // ****************
    // Perform the add
    printf("Calculation on CPU ... ");
    tic = clock();
    float cpuMax=-1, cpuSum=0;
    for (i = 0; i < SIZE; ++i) {
        //F
        if(c[i]!= (a[i]+b[i])) {
            fprintf(stderr,"Error %d %16.10f!=%16.10f \n", i, c[i], a[i]+b[i]);
            exit(1);
        }
    }//F
    toc = clock();
    printf(" %6.4f ms\n",(toc-tic)/(float)1000);
    fprintf(stderr,"OpenACC API test was successful!\n");

    printf("Shutting down the device...");
    acc_shutdown(acc_device_nvcuda);
    printf("[done]\n");

    return 0;
}
Beispiel #25
0
/**
 *  \brief ACC example application entry point.
 *
 *  \return Unused (ANSI-C compatibility).
 */
int main(void)
{
	uint32_t uc_key;
	int16_t s_volt = 0;
	uint32_t ul_value = 0;
	volatile uint32_t ul_status = 0x0;
	int32_t l_volt_dac0 = 0;

	/* Initialize the system */
	sysclk_init();
	board_init();

	/* Initialize debug console */
	configure_console();

	/* Output example information */
	puts(STRING_HEADER);

	/* Initialize DACC */
	/* Enable clock for DACC */
	pmc_enable_periph_clk(ID_DACC);
	/* Reset DACC registers */
	dacc_reset(DACC);
	/* External trigger mode disabled. DACC in free running mode. */
	dacc_disable_trigger(DACC, DACC_CHANNEL_0);
	/* Half word transfer mode */
	dacc_set_transfer_mode(DACC, 0);
#if (SAM3S) || (SAM3XA)
	/* Power save:
	 * sleep mode  - 0 (disabled)
	 * fast wakeup - 0 (disabled)
	 */
	dacc_set_power_save(DACC, 0, 0);
#endif

	/* Enable output channel DACC_CHANNEL */
	dacc_enable_channel(DACC, DACC_CHANNEL_0);
	/* Setup analog current */
	dacc_set_analog_control(DACC, DACC_ANALOG_CONTROL);

	/* Set DAC0 output at ADVREF/2. The DAC formula is:
	 *
	 * (5/6 * VOLT_REF) - (1/6 * VOLT_REF)     volt - (1/6 * VOLT_REF)
	 * ----------------------------------- = --------------------------
	 *              MAX_DIGITAL                       digit
	 *
	 * Here, digit = MAX_DIGITAL/2
	 */
	dacc_write_conversion_data(DACC, MAX_DIGITAL / 2, DACC_CHANNEL_0);
	l_volt_dac0 = (MAX_DIGITAL / 2) * (2 * VOLT_REF / 3) / MAX_DIGITAL +
			VOLT_REF / 6;

	/* Enable clock for AFEC */
	afec_enable(AFEC0);

	struct afec_config afec_cfg;

	afec_get_config_defaults(&afec_cfg);
	/* Initialize AFEC */
	afec_init(AFEC0, &afec_cfg);

	struct afec_ch_config afec_ch_cfg;
	afec_ch_get_config_defaults(&afec_ch_cfg);
	afec_ch_cfg.gain = AFEC_GAINVALUE_0;
	afec_ch_set_config(AFEC0, AFEC_CHANNEL_POTENTIOMETER, &afec_ch_cfg);
	/*
	 * Because the internal ADC offset is 0x200, it should cancel it and shift
	 * down to 0.
	 */
	afec_channel_set_analog_offset(AFEC0, AFEC_CHANNEL_POTENTIOMETER, 0x200);

	afec_set_trigger(AFEC0, AFEC_TRIG_SW);

	/* Enable channel for potentiometer. */
	afec_channel_enable(AFEC0, AFEC_CHANNEL_POTENTIOMETER);

	/* Enable clock for ACC */
	pmc_enable_periph_clk(ID_ACC);
	
	/* Initialize ACC */
	acc_init(ACC, ACC_MR_SELPLUS_AFE0_AD0, ACC_MR_SELMINUS_DAC0,
			ACC_MR_EDGETYP_ANY, ACC_MR_INV_DIS);

	/* Enable ACC interrupt */
	NVIC_EnableIRQ(ACC_IRQn);

	/* Enable */
	acc_enable_interrupt(ACC);

	dsplay_menu();

	while (1) {
		while (usart_read(CONSOLE_UART, &uc_key)) {
		}

		printf("input: %c\r\n", uc_key);

		switch (uc_key) {
		case 's':
		case 'S':
			printf("Input DAC0 output voltage (%d~%d mv): ",
					(VOLT_REF / 6), (VOLT_REF * 5 / 6));
			s_volt = get_input_voltage();
			puts("\r");

			if (s_volt > 0) {
				l_volt_dac0 = s_volt;
				/* The DAC formula is:
				 *
				 * (5/6 * VOLT_REF) - (1/6 * VOLT_REF)     volt - (1/6 * VOLT_REF)
				 * ----------------------------------- = --------------------------
				 *              MAX_DIGITAL                       digit
				 *
				 */
				ul_value = ((s_volt - (VOLT_REF / 6))
					* (MAX_DIGITAL * 6) / 4) / VOLT_REF;
				dacc_write_conversion_data(DACC, ul_value, DACC_CHANNEL_0);
				puts("-I- Set ok\r");
			} else {
				puts("-I- Input voltage is invalid\r");
			}
			break;
		case 'v':
		case 'V':
			/* Start conversion */
			afec_start_software_conversion(AFEC0);
			ul_status = afec_get_interrupt_status(AFEC0);
			while ((ul_status & AFEC_ISR_EOC0) != AFEC_ISR_EOC0) {
				ul_status = afec_get_interrupt_status(AFEC0);
			}
			/* Conversion is done */
			ul_value = afec_channel_get_value(AFEC0, AFEC_CHANNEL_POTENTIOMETER);

			/*
			 * Convert AFEC sample data to voltage value:
			 * voltage value = (sample data / max. resolution) * reference voltage
			 */
			s_volt = (ul_value * VOLT_REF) / MAX_DIGITAL;
			printf("-I- Voltage on potentiometer(AD0) is %d mv\n\r", s_volt);
			printf("-I- Voltage on DAC0 is %ld mv \n\r", (long)l_volt_dac0);
			break;
			
		case 'm':
		case 'M':
			dsplay_menu();
			break;
		}
	}
}
Beispiel #26
0
int main()
{
    int i;

    float a[SIZE];
    float b[SIZE];
    float c[SIZE];
    float seq[SIZE];
    /*
    float Papi[SIZE][SIZE];
    float *onedim;
    float *twodim;
    float temp[3]={a[0],b[0],c[0]};
    */
    #ifdef __NVCUDA__
    acc_init( acc_device_nvcuda );
    #endif 
    #ifdef __NVOPENCL__
    acc_init( acc_device_nvocl );
    acc_list_devices_spec( acc_device_nvocl );
    #endif 


    // Initialize matrices.
    for (i = 0; i < SIZE; ++i) {
        //B
            a[i] = (float)i ;
            b[i] = (float)2*i;
            c[i] = 0.0f;
    }// B

    unsigned long long int tic, toc;
    // Compute vector Add
    int d[1]={0};
    int k;
    for(k=0; k<3; k++){
        //C
        printf("Calculation on GPU ... ");
        tic = clock();
        #pragma acc data pcopyin(a[0:SIZE],b[0:SIZE]) pcopyout(c[0:SIZE]) pcopy(d[0:1])
        {
            # pragma acc kernels 
            {
                #pragma acc loop independent 
                {
                    for (i = 0; i < SIZE; ++i) {
                        #pragma acc atomic capture
                        {
                            d[0]+=inc_step();
                        } 
                        c[i] = a[i] + b[i] ;
                    }
                }
            }
        }
        toc = clock();
        printf(" %6.4f ms\n",(toc-tic)/(float)1000);
    }

    // ****************
    // double-check the OpenACC result sequentially on the host
    // ****************
    // Perform the add
    printf("Calculation on CPU ... ");
    tic = clock();
    for (i = 0; i < SIZE; ++i) {
            seq[i] = a[i] + b[i] ;
            if(c[i]!= seq[i]) {
                fprintf(stderr,"Error %d %16.10f!=%16.10f \n", i, c[i], seq[i]);
                return -1;
            }
    }
    toc = clock();
    printf(" %6.4f ms\n",(toc-tic)/(float)1000);
    printf("atomic sum> %d (should be %d)\n", d[0], 3*SIZE*inc_step());
    if(d[0]==3*SIZE*inc_step()){
        fprintf(stderr,"OpenACC atomic operation test was successful!\n");
    }else{
        fprintf(stderr,"OpenACC atomic operation test failed!\n");
    }
    return 0;
}
Beispiel #27
0
int
main (int argc, char **argv)
{
  CUdevice dev;
  CUfunction delay;
  CUmodule module;
  CUresult r;
  CUstream stream;
  unsigned long *a, *d_a, dticks;
  int nbytes;
  float atime, dtime;
  void *kargs[2];
  int clkrate;
  int devnum, nprocs;

  acc_init (acc_device_nvidia);

  devnum = acc_get_device_num (acc_device_nvidia);

  r = cuDeviceGet (&dev, devnum);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGet failed: %d\n", r);
      abort ();
    }

  r =
    cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
			  dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r);
      abort ();
    }

  r = cuModuleLoad (&module, "subr.ptx");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleLoad failed: %d\n", r);
      abort ();
    }

  r = cuModuleGetFunction (&delay, module, "delay");
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuModuleGetFunction failed: %d\n", r);
      abort ();
    }

  nbytes = nprocs * sizeof (unsigned long);

  dtime = 200.0;

  dticks = (unsigned long) (dtime * clkrate);

  a = (unsigned long *) malloc (nbytes);
  d_a = (unsigned long *) acc_malloc (nbytes);

  acc_map_data (a, d_a, nbytes);

  kargs[0] = (void *) &d_a;
  kargs[1] = (void *) &dticks;

  r = cuStreamCreate (&stream, CU_STREAM_DEFAULT);
  if (r != CUDA_SUCCESS)
	{
	  fprintf (stderr, "cuStreamCreate failed: %d\n", r);
	  abort ();
	}

  acc_set_cuda_stream (0, stream);

  init_timers (1);

  start_timer (0);

  r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0);
  if (r != CUDA_SUCCESS)
    {
      fprintf (stderr, "cuLaunchKernel failed: %d\n", r);
      abort ();
    }

  acc_wait (1);

  atime = stop_timer (0);

  if (atime < dtime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  start_timer (0);

  acc_wait (1);

  atime = stop_timer (0);

  if (0.010 < atime)
    {
      fprintf (stderr, "actual time < delay time\n");
      abort ();
    }

  acc_unmap_data (a);

  fini_timers ();

  free (a);
  acc_free (d_a);

  acc_shutdown (acc_device_nvidia);

  return 0;
}
Beispiel #28
0
int main(int argc, char* argv[])
{
  int i;
  unsigned int SIZEX = 0;
  unsigned int SIZEY = 0;
  unsigned int SIZEZ = 0;
  if (argc == 4) {
    sscanf(argv [1], "%d", &SIZEX);
    sscanf(argv [2], "%d", &SIZEY);
    sscanf(argv [3], "%d", &SIZEZ);
  }else{
    printf("usage: %s xdim ydim zdim\n", argv [0]);
    return -1;
  }

  unsigned int SIZE = SIZEX * SIZEY * SIZEZ;
  assert(SIZE > 0);
  printf("allocation size> %d\n", SIZE);

  float *a = (float*)malloc(sizeof(float) * SIZE);
  float *b = (float*)malloc(sizeof(float) * SIZE);
  float *c = (float*)malloc(sizeof(float) * SIZE);


    #ifdef __NVCUDA__
  acc_init(acc_device_nvcuda);
    #endif
    #ifdef __NVOPENCL__
  
    #define DEVICE_TYPE acc_device_nvocl 
  printf("compiled for ocl\n");
  acc_init(DEVICE_TYPE);
  acc_list_devices_spec(DEVICE_TYPE);
    #endif


  
  for (i = 0; i < SIZE; ++i) {
    a [i] = (float)i;
    b [i] = (float)2 * i;
    c [i] = 0.0f;
  }

  
  int k;
  double revsum = 0;
  int iter = 30;
    

	ipmacc_prompt((char*)"IPMACC: memory allocation c\n");
// ISPC host and device are the same, skipping memory allocation
ipmacc_prompt((char*)"IPMACC: memory allocation a\n");
// ISPC host and device are the same, skipping memory allocation
ipmacc_prompt((char*)"IPMACC: memory allocation b\n");
// ISPC host and device are the same, skipping memory allocation
	ipmacc_prompt((char*)"IPMACC: memory copyin a\n");
// ISPC host and device are the same, skipping copyin
ipmacc_prompt((char*)"IPMACC: memory copyin b\n");
// ISPC host and device are the same, skipping copyin


{


  
		for(k = 0; k < iter; k++)
 {
    reset_and_start_timer();
        


/* kernel call statement*/
{

unsigned int __ispc_n_threads = sysconf(_SC_NPROCESSORS_ONLN); // acc_get_n_cores(acc_device_intelispc);
if(getenv("IPMACC_VERBOSE")) printf("IPMACC: Launching ISPC kernel> %d threads + SIMD \n", __ispc_n_threads);
__generated_kernel_launch_0(a,c,b,SIZE);
}
/* kernel call statement*/
// ISPC target is synchronized with CPU
// skipping synchronization



    double dt = get_elapsed_msec();
    revsum += 1.0 / dt;
    printf("@time of openacc run:\t\t\t%.3f msec\n", dt);
  }


}
	ipmacc_prompt((char*)"IPMACC: memory copyout c\n");
// ISPC host and device are the same, skipping copyout



  printf("harmonic mean openacc run> %.3f msec\n", iter / revsum);

  
  
  
  
  for (i = 0; i < SIZE; ++i) {
    if (c [i] != (a [i] + b [i])) {
      fprintf(stdout, "Error %d %16.10f!=%16.10f \n", i, c [i], a [i] + b [i]);
      return -1;
    }
  }

  fprintf(stdout, "OpenACC vectoradd test was successful!\n");
  return 0;
}
Beispiel #29
0
int main()
{
	int i, j, k;
	int sum, known_sum;
	int NI, NJ, NK;
    int error;
	REAL *input, *temp;
    REAL frounding_error = 1.E-9;
    struct timeval tim;
    double start, end;
	
	NK = 2;
	NJ = 32;
    NI = 1<<20;
    
    error = 0;
	input = (REAL*)malloc(NK*NJ*NI*sizeof(REAL));
	temp = (REAL*)malloc(NK*NJ*NI*sizeof(REAL));
    
    acc_init(acc_device_default);

    srand((unsigned)time(0));
	for(k=0; k<NK; k++)
	{
		for(j=0; j<NJ; j++)
		{
			for(i=0; i<NI; i++)
            {
				input[k*NJ*NI + j*NI + i] = (REAL)rand()/(REAL)RAND_MAX + 0.1;
            }
		}
	}
    
    gettimeofday(&tim, NULL);
    start = tim.tv_sec*1000 + (tim.tv_usec/1000.0);
  #pragma acc parallel copyin(input[0:NK*NJ*NI]) \
  					   copyout(temp[0:NK*NJ*NI]) \
                       num_gangs(192) \
                       num_workers(8) \
                       vector_length(128) 
  {
    #pragma acc loop gang
    for(k=0; k<NK; k++)
	{
		#pragma acc loop worker
		for(j=0; j<NJ; j++)
		{
            REAL i_sum = 0.0;
			#pragma acc loop vector reduction(+:i_sum)
			for(i=0; i<NI; i++)
				i_sum += input[k*NJ*NI + j*NI + i];
            temp[k*NJ*NI + j*NI] = i_sum;
		}
	}
  }
    gettimeofday(&tim, NULL);
    end = tim.tv_sec*1000 + (tim.tv_usec/1000.0);

	for(k=0; k<NK; k++)
	{
		for(j=0; j<NJ; j++)
		{
            REAL i_sum = 0.0;
			for(i=0; i<NI; i++)
				i_sum += input[k*NJ*NI + j*NI + i];

            if(fabsf(temp[k*NJ*NI + j*NI]-i_sum) > frounding_error)
            {
			    error++;
		        printf("vecotr + FAILED!\n");
            }
		}
	}
    
    printf("vector + execution time is :%.2lf: ms\n", end-start);
    if(error == 0)
        printf("vector + SUCCESS!\n");
    
    free(input);
    free(temp);
}
Beispiel #30
0
int main(int argc, char* argv[])
{
	if (argc != 5)
	{
		printf("Usage: %s <nx> <ny> <ns> <nt>\n", argv[0]);
		exit(1);
	}

	const char* no_timing = getenv("NO_TIMING");

#if defined(_OPENACC)
	char* regcount_fname = getenv("OPENACC_PROFILING_FNAME");
	if (regcount_fname)
	{
		char* regcount_lineno = getenv("OPENACC_PROFILING_LINENO");
		int lineno = -1;
		if (regcount_lineno)
			lineno = atoi(regcount_lineno);
		//kernelgen_enable_openacc_regcount(regcount_fname, lineno);
	}
#endif

	parse_arg(nx, argv[1]);
	parse_arg(ny, argv[2]);
	parse_arg(ns, argv[3]);
	parse_arg(nt, argv[4]);

	size_t szarray = (size_t)nx * ny * ns;
	size_t szarrayb = szarray * sizeof(real);

	real* x = (real*)memalign(MEMALIGN, szarrayb);
	real* y = (real*)memalign(MEMALIGN, szarrayb);
	real* xy = (real*)memalign(MEMALIGN, szarrayb);

	if (!x || !y || !xy)
	{
		printf("Error allocating memory for arrays: %p, %p, %p\n", x, y, xy);
		exit(1);
	}

	real mean = 0.0f;
	for (int i = 0; i < szarray; i++)
	{
		x[i] = real_rand();
		y[i] = real_rand();
		xy[i] = real_rand();
		mean += x[i] + y[i] + xy[i];
	}
	printf("initial mean = %f\n", mean / szarray / 3);

	//
	// MIC or OPENACC:
	//
	// 1) Perform an empty offload, that should strip
	// the initialization time from further offloads.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec init_s, init_f;
#if defined(_MIC)
	get_time(&init_s);
	#pragma offload target(mic) \
		nocopy(x:length(szarray) alloc_if(0) free_if(0)), \
		nocopy(y:length(szarray) alloc_if(0) free_if(0)), \
		nocopy(xy:length(szarray) alloc_if(0) free_if(0))
	{ }
	get_time(&init_f);
#endif
#if defined(_OPENACC)
	get_time(&init_s);
	acc_init(acc_device_default);
	get_time(&init_f);
#endif
	double init_t = get_time_diff((struct timespec*)&init_s, (struct timespec*)&init_f);
	if (!no_timing) printf("init time = %f sec\n", init_t);
#endif

	volatile struct timespec total_s, total_f;
	get_time(&total_s);
	//
	// MIC or OPENACC:
	//
	// 2) Allocate data on device, but do not copy anything.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec alloc_s, alloc_f;
#if defined(_MIC)
	get_time(&alloc_s);
	#pragma offload target(mic) \
		nocopy(x:length(szarray) alloc_if(1) free_if(0)), \
		nocopy(y:length(szarray) alloc_if(1) free_if(0)), \
		nocopy(xy:length(szarray) alloc_if(1) free_if(0))
	{ }
	get_time(&alloc_f);
#endif
#if defined(_OPENACC)
	get_time(&alloc_s);
	#pragma acc data create (x[0:szarray], y[0:szarray], xy[0:szarray])
	{
	get_time(&alloc_f);
#endif
	double alloc_t = get_time_diff((struct timespec*)&alloc_s, (struct timespec*)&alloc_f);
	if (!no_timing) printf("device buffer alloc time = %f sec\n", alloc_t);
#endif

	//
	// MIC or OPENACC:
	//
	// 3) Transfer data from host to device and leave it there,
	// i.e. do not allocate deivce memory buffers.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec load_s, load_f;
#if defined(_MIC)
	get_time(&load_s);
	#pragma offload target(mic) \
		in(x:length(szarray) alloc_if(0) free_if(0)), \
		in(y:length(szarray) alloc_if(0) free_if(0)), \
		in(xy:length(szarray) alloc_if(0) free_if(0))
	{ }
	get_time(&load_f);
#endif
#if defined(_OPENACC)
	get_time(&load_s);
	#pragma acc update device(x[0:szarray], y[0:szarray], xy[0:szarray])
	get_time(&load_f);
#endif
	double load_t = get_time_diff((struct timespec*)&load_s, (struct timespec*)&load_f);
	if (!no_timing) printf("data load time = %f sec (%f GB/sec)\n", load_t, 2 * szarrayb / (load_t * 1024 * 1024 * 1024));
#endif

	//
	// 4) Perform data processing iterations, keeping all data
	// on device.
	//
	volatile struct timespec compute_s, compute_f;
	get_time(&compute_s);
#if defined(_MIC)
	#pragma offload target(mic) \
		nocopy(x:length(szarray) alloc_if(0) free_if(0)), \
		nocopy(y:length(szarray) alloc_if(0) free_if(0)), \
		nocopy(xy:length(szarray) alloc_if(0) free_if(0))
#endif
	{
		for (int it = 0; it < nt; it++)
		{
#if defined(_PATUS)
			real* dummy;
			#pragma omp parallel
			sincos_patus(&dummy, x, y, xy, nx, ny, ns);
#else
			sincos_(&nx, &ny, &ns, x, y, xy);
#endif
		}
	}
	get_time(&compute_f);
	double compute_t = get_time_diff((struct timespec*)&compute_s, (struct timespec*)&compute_f);
	if (!no_timing) printf("compute time = %f sec\n", compute_t);

	//
	// MIC or OPENACC:
	//
	// 5) Transfer output data back from device to host.
	//
#if defined(_MIC) || defined(_OPENACC)
	volatile struct timespec save_s, save_f;
#if defined(_MIC)
	get_time(&save_s);
	#pragma offload target(mic) \
		out(xy:length(szarray) alloc_if(0) free_if(0))
	{ }
	get_time(&save_f);
#endif
#if defined(_OPENACC)
	get_time(&save_s);
	#pragma acc update host (xy[0:szarray])
	get_time(&save_f);
#endif
	double save_t = get_time_diff((struct timespec*)&save_s, (struct timespec*)&save_f);
	if (!no_timing) printf("data save time = %f sec (%f GB/sec)\n", save_t, szarrayb / (save_t * 1024 * 1024 * 1024));
#endif

	//
	// MIC or OPENACC:
	//
	// 6) Deallocate device data buffers.
	// OPENACC does not seem to have explicit deallocation.
	//
#if defined(_OPENACC)
	}
#endif
#if defined(_MIC)
	volatile struct timespec free_s, free_f;
	get_time(&free_s);
	#pragma offload target(mic) \
		nocopy(x:length(szarray) alloc_if(0) free_if(1)), \
		nocopy(y:length(szarray) alloc_if(0) free_if(1)), \
		nocopy(xy:length(szarray) alloc_if(0) free_if(1))
	{ }
	get_time(&free_f);
	double free_t = get_time_diff((struct timespec*)&free_s, (struct timespec*)&free_f);
	// if (!no_timing) printf("device buffer free time = %f sec\n", free_t);
#endif

	get_time(&total_f);
	if (!no_timing) printf("device buffer free time = %f sec\n", get_time_diff((struct timespec*)&total_s, (struct timespec*)&total_f));

	// For the final mean - account only the norm of the top
	// most level (tracked by swapping idxs array of indexes).
	mean = 0.0f;
	for (int i = 0; i < szarray; i++)
		mean += xy[i];
	printf("final mean = %f\n", mean / szarray);

	free(x);
	free(y);
	free(xy);

	fflush(stdout);

	return 0;
}