int main (int argv, char **argc) { btest=(double*)malloc(sizeof(double)*IDA); atest=(double*)malloc(sizeof(double)*IDA); #ifdef __NVCUDA__ acc_init( acc_device_nvcuda ); #endif #ifdef __NVOPENCL__ acc_init( acc_device_nvocl ); //acc_list_devices_spec( acc_device_nvocl ); #endif printf(" *******************************************************\n"); delaylength = 500; innerreps = 100; // GENERATE REFERENCE TIME // refer(); // TEST PRIVATE //estprivnew(); // TEST FIRSTPRIVATE //estfirstprivnew(); #ifdef OMPVER2 // TEST COPYPRIVATE / //estcopyprivnew(); #endif //TEST COPYIN copyintest(); // TEST COPYOUT copyouttest(); // TEST CREATE createtest(); // TEST REDUCTION reductiontest(); // TEST REDUCTION kerneltest(); // TEST PRIVATE // privatetest(); delaylength = 500; innerreps = 100; }
int main (int argc, char **argv) { void *d; acc_device_t devtype = acc_device_host; #if ACC_DEVICE_TYPE_nvidia devtype = acc_device_nvidia; if (acc_get_num_devices (acc_device_nvidia) == 0) return 0; #endif acc_init (devtype); d = acc_malloc (0); if (d != NULL) abort (); acc_free (0); acc_shutdown (devtype); acc_set_device_type (devtype); d = acc_malloc (0); if (d != NULL) abort (); acc_shutdown (devtype); acc_init (devtype); d = acc_malloc (1024); if (d == NULL) abort (); acc_free (d); acc_shutdown (devtype); acc_set_device_type (devtype); d = acc_malloc (1024); if (d == NULL) abort (); acc_free (d); acc_shutdown (devtype); return 0; }
//! //! @brief This function initializes the hardware/software resources //! required for device mouse HID task. //! void device_mouse_hid_task_init(void) { sof_cnt = 0; #if BOARD == EVK1101 // Initialize accelerometer driver acc_init(); #endif #ifndef FREERTOS_USED #if USB_HOST_FEATURE == true // If both device and host features are enabled, check if device mode is engaged // (accessing the USB registers of a non-engaged mode, even with load operations, // may corrupt USB FIFO data). if (Is_usb_device()) #endif // USB_HOST_FEATURE == true Usb_enable_sof_interrupt(); #endif // FREERTOS_USED #ifdef FREERTOS_USED xTaskCreate(device_mouse_hid_task, configTSK_USB_DHID_MOUSE_NAME, configTSK_USB_DHID_MOUSE_STACK_SIZE, NULL, configTSK_USB_DHID_MOUSE_PRIORITY, NULL); #endif // FREERTOS_USED }
int main(int argc, char **argv) { /* 模块初始化 */ exc_init(); /* 中断初始化 */ sys_timer_init(); /* 系统时钟初始化 */ light_init(); /* LED灯初始化 */ switch_init(); /* 开关初始化 */ speaker_init(); /* 蜂鸣器初始化 */ motor_init(); /* 电机初始化 */ decoder_init(); /* 编码器初始化 */ gyro_init(); /* 陀螺仪初始化 */ acc_init(); /* 加速度传感器初始化 */ serial_initialize((intptr_t)(NULL)); /* 初始化串口 */ //sd_init(&Fatfs); /* 初始化SD卡,并创建文件 */ //sd_create_file(&test_data, test_data_name); /* 命令注册 */ help_cmd_initialize((intptr_t)(NULL)); light_cmd_initialize((intptr_t)(NULL)); switch_cmd_initialize((intptr_t)(NULL)); speaker_cmd_initialize((intptr_t)(NULL)); motor_cmd_initialize((intptr_t)(NULL)); decoder_cmd_initialize((intptr_t)(NULL)); //sd_cmd_initialize((intptr_t)(NULL)); printf("\n Welcome to k60 software platform!"); printf("\n Press 'help' to get the help! \n"); light_open(LIGHT4); /* ntshell测试 */ task_ntshell((intptr_t)(NULL)); }
void board_init(void) { #ifdef KEY_RC_BOARD /* On board Button initialization */ ioport_configure_pin(BUTTON_IRQ_PIN_1,IOPORT_DIR_INPUT | IOPORT_PULL_UP); ioport_configure_pin(BUTTON_IRQ_PIN_2,IOPORT_DIR_INPUT | IOPORT_PULL_UP); ioport_configure_pin(BUTTON_IRQ_PIN_3,IOPORT_DIR_INPUT | IOPORT_PULL_UP); set_button_pins_for_normal_mode(); /* Initialize the IRQ lines' interrupt behaviour. */ DISABLE_ALL_BUTTON_IRQS(); /* LED Init */ /* LCD initialization for inactive use */ /* On board LED initialization */ ioport_configure_pin(LCD_CS_ON_BOARD, IOPORT_DIR_OUTPUT | IOPORT_INIT_HIGH); ioport_set_port_dir(IOPORT_PORTE,KEY_RC_IO_MASK,IOPORT_DIR_OUTPUT); ioport_set_port_level(IOPORT_PORTE,KEY_RC_IO_MASK,KEY_RC_IO_MASK); ioport_set_pin_dir(IOPORT_CREATE_PIN(PORTG , 2),IOPORT_DIR_INPUT); ioport_set_pin_mode(IOPORT_CREATE_PIN(PORTG , 2), IOPORT_MODE_PULLUP); LATCH_INIT(); /* Init ADC for the Accelerometer */ adc_init(); // LATCH_INIT(); /* Enable Accelerometer by enabling the PWR pin in the Latch */ acc_init(); update_latch_status(); /* Apply latch pulse to set LED status */ pulse_latch(); #else /* To identify if it is a plain or STB*/ board_identify(); /* On board LED initialization */ ioport_configure_pin(LED0_RCB,IOPORT_DIR_OUTPUT | IOPORT_INIT_HIGH); ioport_configure_pin(LED1_RCB,IOPORT_DIR_OUTPUT | IOPORT_INIT_HIGH); ioport_configure_pin(LED2_RCB,IOPORT_DIR_OUTPUT | IOPORT_INIT_HIGH); /* On board Switch initialization */ ioport_configure_pin(GPIO_PUSH_BUTTON_0,IOPORT_DIR_INPUT | IOPORT_PULL_UP); #ifdef BREAKOUT_BOARD //Enable RCB_BB RS232 level converter ioport_set_port_dir(IOPORT_PORTD,BB_SIO_MASK,IOPORT_DIR_OUTPUT); ioport_set_port_level(IOPORT_PORTD,BB_SIO_MASK,BB_SIO_VAL); #endif #endif }
int main (int argc, char **argv) { acc_device_t devtype = acc_device_host; #if ACC_DEVICE_TYPE_nvidia devtype = acc_device_nvidia; if (acc_get_num_devices (devtype) == 0) return 0; #endif acc_init (devtype); acc_init (devtype); return 0; }
int main() { int i, j, k; TYPE sum, known_sum; int NI; int error; TYPE *input; TYPE rounding_error = 1.E-9; struct timeval tim; double start, end; NI = 1<<20; error = 0; input = (TYPE*)malloc(NI*sizeof(TYPE)); acc_init(acc_device_default); srand((unsigned)time(0)); for(i=0; i<NI; i++) { input[i] = (TYPE)rand()/(TYPE)RAND_MAX + 0.1; } sum = 0; gettimeofday(&tim, NULL); start = tim.tv_sec*1000 + (tim.tv_usec/1000.0); #pragma acc parallel copyin(input[0:NI]) \ num_gangs(192) \ num_workers(1) \ vector_length(128) { #pragma acc loop gang vector reduction(+:sum) for(i=0; i<NI; i++) sum += input[i]; } gettimeofday(&tim, NULL); end = tim.tv_sec*1000 + (tim.tv_usec/1000.0); known_sum = 0; for(i=0; i<NI; i++) { known_sum += input[i]; } if(fabs(sum - known_sum) > rounding_error) { error++; printf("same_gang_vector + FAILED! sum=%d, known_sum=%d\n", sum, known_sum); } printf("same_gang_vector + execution time is :%.2lf: ms\n", end-start); if(error == 0) printf("same_gang_vector + SUCCESS!\n"); free(input); }
int main(){ int SIZE=1024; float *a=(float*)malloc(sizeof(float)*SIZE); float *b=(float*)malloc(sizeof(float)*SIZE); float *c=(float*)malloc(sizeof(float)*SIZE); int i,k,j; #ifdef __NVCUDA__ acc_init( acc_device_nvcuda ); #endif #ifdef __NVOPENCL__ acc_init( acc_device_nvocl ); //acc_list_devices_spec( acc_device_nvocl ); #endif #pragma acc kernels create(a[0:SIZE],b[0:SIZE],c[0:SIZE]) copyout(a[0:SIZE],b[0:SIZE],c[0:SIZE]) { #pragma acc loop independent for(i=0; i<SIZE; i++){ a[i]=i; b[i]=SIZE-i; } } #pragma acc kernels present(a,b,c) copyout(c[0:SIZE]) { #pragma acc loop independent for(i=0; i<SIZE; i++){ c[i]=a[i]+b[i]; } } for(i=0; i<SIZE; i++){ if(c[i]!=(a[i]+b[i])){ fprintf(stderr,"failed to perform the sum at %d. %6.4f + %6.4f =%6.4f\n",i,a[i],b[i],c[i]); return -1; } } fprintf(stderr,"Multiple kernels test was successfull!\n"); return 0; }
attribute_hidden void goacc_lazy_initialize (void) { struct goacc_thread *thr = goacc_thread (); if (thr && thr->dev) return; if (!cached_base_dev) acc_init (acc_device_default); else goacc_attach_host_thread_to_device (-1); }
/** * \brief Initialize re200b sensor in order to detect motion * \param ul_acc_minus ACC minus input, use ACC peripheral definition in header. * \param ul_acc_plus ACC plus input, use ACC peripheral definition in header. */ void re200b_motion_detect_init(uint32_t ul_acc_minus, uint32_t ul_acc_plus) { pmc_enable_periph_clk(ID_ACC); /* Initialize ACC */ acc_init(ACC, ul_acc_plus, ul_acc_minus, ACC_MR_EDGETYP_ANY, ACC_MR_INV_DIS); /* clear status */ acc_get_interrupt_status(ACC); /* reset event flags */ g_compare_result = CMP_EQUAL; g_ul_compare_event_flag = false; }
/******************************************************************************* * Function Name : Axis3_Test * Description : Light Sensor Test. * Input : None * Output : None * Return : None *******************************************************************************/ void Axis3_Test(void) { char buf[24]; int32_t xoff = 0; int32_t yoff = 0; int32_t zoff = 0; int8_t x = 0; int8_t y = 0; int8_t z = 0; OLED_ClearScreen(); OLED_DisStrLine(0, 0, "Axis-3"); I2CInit(I2CMASTER, 0); acc_init(); /* Assume base board in zero-g position when reading first value. */ acc_read(&x, &y, &z); xoff = 0-x; yoff = 0-y; zoff = 0-z; // while(1) // { /* Accelerometer */ acc_read(&x, &y, &z); x = x+xoff; y = y+yoff; z = z+zoff; snprintf(buf, 20, "Acc x: %d ", x); OLED_DisStrLine(2, 0, (uint8_t *)buf); printf("\r\nAcc x: %d, ", x); snprintf(buf, 20, "Acc y: %d ", y); OLED_DisStrLine(3, 0, (uint8_t *)buf); printf("Acc y: %d, ", y); snprintf(buf, 20, "Acc z: %d ", z); OLED_DisStrLine(4, 0, (uint8_t *)buf); printf("Acc z: %d. ", z); delay_ms(250); // if(KEY_Read() == KEY_ESC) // break; // } }
int main() { #ifdef __NVCUDA__ acc_init( acc_device_nvcuda ); #endif #ifdef __NVOPENCL__ acc_init( acc_device_nvocl ); //acc_list_devices_spec( acc_device_nvocl ); #endif float a[SIZE], b[SIZE], c[SIZE]; int i; for(i=0; i<2*SIZE; i++) { a[i%SIZE]=i; b[i%SIZE]=a[(i*13)%SIZE]; c[i%SIZE]=0; } #pragma acc kernels copyin(a,b) copyout(c) #pragma acc loop independent for(int i=0; i<SIZE; i++) { c[i] = funct (a[i], b[i]); //c[i] = a[i]>b[i]?a[i]:b[i]; } for (i = 0; i < SIZE; ++i) { if(c[i]!= funct(a[i],b[i])) { fprintf(stderr,"Error %d %16.10f!=%16.10f \n", i, c[i], funct(a[i],b[i])); return -1; } } fprintf(stderr,"'function call in kernels region' test was successful!\n"); return 0; }
int main (int argc, char **argv) { float atime; CUstream stream; CUresult r; acc_init (acc_device_nvidia); (void) acc_get_device_num (acc_device_nvidia); init_timers (1); stream = (CUstream) acc_get_cuda_stream (0); if (stream != NULL) abort (); r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); start_timer (0); acc_wait_all_async (0); acc_wait (0); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time too long\n"); abort (); } fini_timers (); acc_shutdown (acc_device_nvidia); exit (0); }
int main() { int i; int error; int NI; int *input; int sum; NI = 2048; acc_init(acc_device_default); input = (int*)malloc(NI*sizeof(int)); for(i=0; i<NI; i++) { input[i] = i%10; } sum = 0; #pragma acc parallel copyin(input[0:NI]) { #pragma acc loop gang worker vector reduction(+:sum) for(i=0; i<NI; i++) { sum += input[i]; } } error = 0; int known_sum = 0; for(i=0; i<NI; i++) known_sum += input[i]; if(known_sum != sum) error++; free(input); printf("Test for same line gang worker vector reduction\n"); if(error == 0) printf("SUCCESSFUL!\n"); else printf("Reduction + FAILED! error=%d\n", error); }
int main (int argc, char **argv) { CUstream stream; CUresult r; struct timeval tv1, tv2; time_t t1; acc_init (acc_device_nvidia); stream = (CUstream) acc_get_cuda_stream (0); if (stream != NULL) abort (); r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } if (!acc_set_cuda_stream (0, stream)) abort (); gettimeofday (&tv1, NULL); acc_wait_all_async (0); acc_wait (0); gettimeofday (&tv2, NULL); t1 = ((tv2.tv_sec - tv1.tv_sec) * 1000000) + (tv2.tv_usec - tv1.tv_usec); if (t1 > 1000) { fprintf (stderr, "too long\n"); abort (); } acc_shutdown (acc_device_nvidia); exit (0); }
int main (int argc, char **argv) { const int N = 256; int i; unsigned char *h; void *d; acc_init (acc_device_nvidia); h = (unsigned char *) malloc (N); for (i = 0; i < N; i++) { h[i] = i; } d = acc_malloc (N); acc_memcpy_to_device (d, h, N); memset (&h[0], 0, N); acc_memcpy_to_device (d, h, N << 1); acc_memcpy_from_device (h, d, N); for (i = 0; i < N; i++) { if (h[i] != i) abort (); } acc_free (d); free (h); acc_shutdown (acc_device_nvidia); return 0; }
int main(int argc, char *argv[]) { int n; double *A, *B, *C; double start, end; struct timeval tim; if (argc != 2) { fprintf(stderr, "Usage: matmul <n>\n"); exit(1); } n = atoi(argv[1]); A = malloc(n * n * sizeof(double)); B = malloc(n * n * sizeof(double)); C = malloc(n * n * sizeof(double)); initA(A, n); initB(B, n); initC(C, n); //verify(A, n); //verify(B, n); acc_init(acc_device_default); /* sequential run */ gettimeofday(&tim, NULL); start = tim.tv_sec + (tim.tv_usec/1000000.0); iter_matmul(A, B, C, n); gettimeofday(&tim, NULL); end = tim.tv_sec + (tim.tv_usec/1000000.0); printf("Execution time is: %.2f s\n", end-start); verify(C, n); free(C); free(B); free(A); return 0; }
int main (int argc, char **argv) { acc_device_t devtype = acc_device_host; #if ACC_DEVICE_TYPE_nvidia devtype = acc_device_nvidia; if (acc_get_num_devices (acc_device_nvidia) == 0) return 0; #endif acc_init (devtype); acc_shutdown (devtype); fprintf (stderr, "CheCKpOInT\n"); acc_shutdown (devtype); return 0; }
/*! \brief This is an example demonstrating the accelerometer * functionalities using the accelerometer driver. */ int main(void) { volatile unsigned long i; // switch to oscillator 0 pm_switch_to_osc0(&AVR32_PM, FOSC0, OSC0_STARTUP); // Initialize the debug USART module. init_dbg_rs232(FOSC0); acc_init(); // do a loop for (;;) { // slow down operations for ( i=0 ; i < 50000 ; i++); // display a header to user print_dbg("\x1B[2J\x1B[H\r\n\r\nAccelerometer Example\r\n\r\n"); // Get accelerometer acquisition and process data acc_update(); // test for fast or slow changes // depending on that, play with angles or moves if ( is_acc_slow() ) { print_mouse() ; print_angles() ; } else print_move() ; // MEUH // only text here , needs to be combined with PWM // meuh_stop is the "end" signal from PWM meuh_en = is_acc_meuh( meuh_stop ) ; } }
int newepoch_init(epoch_item_t **ep, setup_params_t *setup) { epoch_item_t *out = NULL; *ep = malloc(sizeof(epoch_item_t)); out = *ep; if(!out) { pbgp_error("newepoch_init :: %s\n",strerror(errno)); return -1; } bzero(out,sizeof(epoch_item_t)); acc_init(&out->acc,setup->pairing); mpz_init(out->s_new); mpz_init(out->s_rvk); mpz_init(out->s_acc); return 0; }
static void initGPU(int argc, char** argv) { // gets the device id (if specified) to run on int devId = -1; if (argc > 1) { devId = atoi(argv[1]); int devCount = acc_get_num_devices(acc_device_nvidia); if (devId < 0 || devId >= devCount) { printf("The specified device ID is not supported.\n"); exit(1); } } if (devId != -1) { acc_set_device_num(devId, acc_device_nvidia); } // creates a context on the GPU just to // exclude initialization time from computations acc_init(acc_device_nvidia); // print device id devId = acc_get_device_num(acc_device_nvidia); printf("Running on GPU with ID %d.\n\n", devId); }
int main(int argc, char* argv[]) { int i, j, k, it; //FILE *fp; //int fd; real mean; struct timeval tim; double start, end; if (argc != 5) { printf("Usage: %s <nx> <ny> <ns> <nt>\n", argv[0]); exit(1); } srand(17); parse_arg(nx, argv[1]); parse_arg(ny, argv[2]); parse_arg(ns, argv[3]); parse_arg(nt, argv[4]); real alpha = real_rand(); real beta = real_rand(); printf("alpha = %f, beta = %f\n", alpha, beta); unsigned int szarray = (unsigned int)nx * ny * ns; unsigned int szarrayb = szarray * sizeof(real); real* w0 = (real*)malloc(szarrayb); real* w1 = (real*)malloc(szarrayb); if (!w0 || !w1) { printf("Error allocating memory for arrays: %p, %p\n", w0, w1); exit(1); } for (i = 0; i < szarray; i++) { w0[i] = real_rand(); w1[i] = real_rand(); } // 1) Perform an empty offload, that should strip // the initialization time from further offloads. acc_init(acc_device_default); gettimeofday(&tim, NULL); start = tim.tv_sec + (tim.tv_usec/1000000.0); // 2) Allocate data on device, but do not copy anything. #pragma acc data create (w0[0:szarray], w1[0:szarray]) { // 3) Transfer data from host to device and leave it there, // i.e. do not allocate deivce memory buffers. #pragma acc update device(w0[0:szarray], w1[0:szarray]) // 4) Perform data processing iterations, keeping all data // on device. { for (it = 0; it < nt; it++) { laplacian(nx, ny, ns, alpha, beta, w0, w1); real* w = w0; w0 = w1; w1 = w; } } // 5) Transfer output data back from device to host. #pragma acc update host (w0[0:szarray]) } gettimeofday(&tim, NULL); end = tim.tv_sec + (tim.tv_usec/1000000.0); #if 0 fd = creat(argv[5], 00666); fd = open(argv[5], O_WRONLY); write(fd, w0, szarrayb); close(fd); fp = fopen(argv[5], "w"); fprintf(fp, "%d %d %d\n", ns, ny, nx); for(k = 0; k < ns; k++){ for (j = 0; j < ny; j++) { for (i = 0; i < nx; i++) { fprintf(fp, "%d %d %d %f\n", k, j, i, w0[k*nx*ny + j*nx + i]); } } } fclose(fp); #endif mean = 0.0f; for (i = 0; i < szarray; i++) mean += w0[i]; printf("Final mean = %f\n", mean/szarray); printf("Time for computing: %.2f s\n",end-start); free(w0); free(w1); return 0; }
int main() { int i, j, k; TYPE product, known_product; int NI, NJ, NK; int error; TYPE *input, *temp; struct timeval tim; double start, end; NK = 32; NJ = 1<<10; NI = 1<<10; error = 0; input = (TYPE*)malloc(NK*NJ*NI*sizeof(TYPE)); temp = (TYPE*)malloc(NK*sizeof(TYPE)); acc_init(acc_device_default); srand((unsigned)time(0)); for(k=0; k<NK; k++) { for(j=0; j<NJ; j++) { for(i=0; i<NI; i++) { input[k*NJ*NI + j*NI + i] = rand()%5 + 1; } } } gettimeofday(&tim, NULL); start = tim.tv_sec*1000 + (tim.tv_usec/1000.0); #pragma acc parallel copyin(input[0:NK*NJ*NI]) \ copyout(temp[0:NK]) { #pragma acc loop gang for(k=0; k<NK; k++) { TYPE j_product = 1; #pragma acc loop worker reduction(*:j_product) for(j=0; j<NJ; j++) { #pragma acc loop vector for(i=0; i<NI; i++) j_product *= input[k*NJ*NI + j*NI + i]; } temp[k] = j_product; } } gettimeofday(&tim, NULL); end = tim.tv_sec*1000 + (tim.tv_usec/1000.0); for(k=0; k<NK; k++) { TYPE j_product = 1; for(j=0; j<NJ; j++) { for(i=0; i<NI; i++) j_product *= input[k*NJ*NI + j*NI + i]; } if(temp[k] != j_product) { error++; printf("worker_vector * FAILED!\n"); } } printf("worker_vector * execution time is :%.2lf: ms\n", end-start); if(error == 0) printf("worker_vector * SUCCESS!\n"); free(input); free(temp); }
int main() { int i=0; #ifdef __NVCUDA__ acc_init( acc_device_nvcuda ); #endif #ifdef __NVOPENCL__ acc_init( acc_device_nvocl ); //acc_list_devices_spec( acc_device_nvocl ); #endif size_t free=-1, total=-1; acc_get_mem_info(&free, &total); printf("device memory info> free/total %d/%d [%6.4f percent free]\n",free,total,free/(float)total*100); float a[SIZE]; float b[SIZE]; float c[SIZE]; // Initialize matrices. for (i = 0; i < SIZE; ++i) { //B a[i] = (float)i ; b[i] = (float)2*i; c[i] = 0.0f; }// B unsigned long long int tic, toc; // Compute vector Add float sum=0, maX; int k; #pragma acc enter data copyin(a,b) create(c) for(k=0; k<3; k++){ printf("Calculation on GPU ... "); tic = clock(); sum=0; maX=-1; #pragma acc kernels present(a,b,c) #pragma acc loop independent for (i = 0; i < SIZE; ++i) { float x=0; x = a[i] + b[i] ; c[i] = x; } toc = clock(); printf(" %6.4f ms\n",(toc-tic)/(float)1000); } #pragma acc exit data copyout(c) // **************** // double-check the OpenACC result sequentially on the host // **************** // Perform the add printf("Calculation on CPU ... "); tic = clock(); float cpuMax=-1, cpuSum=0; for (i = 0; i < SIZE; ++i) { //F if(c[i]!= (a[i]+b[i])) { fprintf(stderr,"Error %d %16.10f!=%16.10f \n", i, c[i], a[i]+b[i]); exit(1); } }//F toc = clock(); printf(" %6.4f ms\n",(toc-tic)/(float)1000); fprintf(stderr,"OpenACC API test was successful!\n"); printf("Shutting down the device..."); acc_shutdown(acc_device_nvcuda); printf("[done]\n"); return 0; }
/** * \brief ACC example application entry point. * * \return Unused (ANSI-C compatibility). */ int main(void) { uint32_t uc_key; int16_t s_volt = 0; uint32_t ul_value = 0; volatile uint32_t ul_status = 0x0; int32_t l_volt_dac0 = 0; /* Initialize the system */ sysclk_init(); board_init(); /* Initialize debug console */ configure_console(); /* Output example information */ puts(STRING_HEADER); /* Initialize DACC */ /* Enable clock for DACC */ pmc_enable_periph_clk(ID_DACC); /* Reset DACC registers */ dacc_reset(DACC); /* External trigger mode disabled. DACC in free running mode. */ dacc_disable_trigger(DACC, DACC_CHANNEL_0); /* Half word transfer mode */ dacc_set_transfer_mode(DACC, 0); #if (SAM3S) || (SAM3XA) /* Power save: * sleep mode - 0 (disabled) * fast wakeup - 0 (disabled) */ dacc_set_power_save(DACC, 0, 0); #endif /* Enable output channel DACC_CHANNEL */ dacc_enable_channel(DACC, DACC_CHANNEL_0); /* Setup analog current */ dacc_set_analog_control(DACC, DACC_ANALOG_CONTROL); /* Set DAC0 output at ADVREF/2. The DAC formula is: * * (5/6 * VOLT_REF) - (1/6 * VOLT_REF) volt - (1/6 * VOLT_REF) * ----------------------------------- = -------------------------- * MAX_DIGITAL digit * * Here, digit = MAX_DIGITAL/2 */ dacc_write_conversion_data(DACC, MAX_DIGITAL / 2, DACC_CHANNEL_0); l_volt_dac0 = (MAX_DIGITAL / 2) * (2 * VOLT_REF / 3) / MAX_DIGITAL + VOLT_REF / 6; /* Enable clock for AFEC */ afec_enable(AFEC0); struct afec_config afec_cfg; afec_get_config_defaults(&afec_cfg); /* Initialize AFEC */ afec_init(AFEC0, &afec_cfg); struct afec_ch_config afec_ch_cfg; afec_ch_get_config_defaults(&afec_ch_cfg); afec_ch_cfg.gain = AFEC_GAINVALUE_0; afec_ch_set_config(AFEC0, AFEC_CHANNEL_POTENTIOMETER, &afec_ch_cfg); /* * Because the internal ADC offset is 0x200, it should cancel it and shift * down to 0. */ afec_channel_set_analog_offset(AFEC0, AFEC_CHANNEL_POTENTIOMETER, 0x200); afec_set_trigger(AFEC0, AFEC_TRIG_SW); /* Enable channel for potentiometer. */ afec_channel_enable(AFEC0, AFEC_CHANNEL_POTENTIOMETER); /* Enable clock for ACC */ pmc_enable_periph_clk(ID_ACC); /* Initialize ACC */ acc_init(ACC, ACC_MR_SELPLUS_AFE0_AD0, ACC_MR_SELMINUS_DAC0, ACC_MR_EDGETYP_ANY, ACC_MR_INV_DIS); /* Enable ACC interrupt */ NVIC_EnableIRQ(ACC_IRQn); /* Enable */ acc_enable_interrupt(ACC); dsplay_menu(); while (1) { while (usart_read(CONSOLE_UART, &uc_key)) { } printf("input: %c\r\n", uc_key); switch (uc_key) { case 's': case 'S': printf("Input DAC0 output voltage (%d~%d mv): ", (VOLT_REF / 6), (VOLT_REF * 5 / 6)); s_volt = get_input_voltage(); puts("\r"); if (s_volt > 0) { l_volt_dac0 = s_volt; /* The DAC formula is: * * (5/6 * VOLT_REF) - (1/6 * VOLT_REF) volt - (1/6 * VOLT_REF) * ----------------------------------- = -------------------------- * MAX_DIGITAL digit * */ ul_value = ((s_volt - (VOLT_REF / 6)) * (MAX_DIGITAL * 6) / 4) / VOLT_REF; dacc_write_conversion_data(DACC, ul_value, DACC_CHANNEL_0); puts("-I- Set ok\r"); } else { puts("-I- Input voltage is invalid\r"); } break; case 'v': case 'V': /* Start conversion */ afec_start_software_conversion(AFEC0); ul_status = afec_get_interrupt_status(AFEC0); while ((ul_status & AFEC_ISR_EOC0) != AFEC_ISR_EOC0) { ul_status = afec_get_interrupt_status(AFEC0); } /* Conversion is done */ ul_value = afec_channel_get_value(AFEC0, AFEC_CHANNEL_POTENTIOMETER); /* * Convert AFEC sample data to voltage value: * voltage value = (sample data / max. resolution) * reference voltage */ s_volt = (ul_value * VOLT_REF) / MAX_DIGITAL; printf("-I- Voltage on potentiometer(AD0) is %d mv\n\r", s_volt); printf("-I- Voltage on DAC0 is %ld mv \n\r", (long)l_volt_dac0); break; case 'm': case 'M': dsplay_menu(); break; } } }
int main() { int i; float a[SIZE]; float b[SIZE]; float c[SIZE]; float seq[SIZE]; /* float Papi[SIZE][SIZE]; float *onedim; float *twodim; float temp[3]={a[0],b[0],c[0]}; */ #ifdef __NVCUDA__ acc_init( acc_device_nvcuda ); #endif #ifdef __NVOPENCL__ acc_init( acc_device_nvocl ); acc_list_devices_spec( acc_device_nvocl ); #endif // Initialize matrices. for (i = 0; i < SIZE; ++i) { //B a[i] = (float)i ; b[i] = (float)2*i; c[i] = 0.0f; }// B unsigned long long int tic, toc; // Compute vector Add int d[1]={0}; int k; for(k=0; k<3; k++){ //C printf("Calculation on GPU ... "); tic = clock(); #pragma acc data pcopyin(a[0:SIZE],b[0:SIZE]) pcopyout(c[0:SIZE]) pcopy(d[0:1]) { # pragma acc kernels { #pragma acc loop independent { for (i = 0; i < SIZE; ++i) { #pragma acc atomic capture { d[0]+=inc_step(); } c[i] = a[i] + b[i] ; } } } } toc = clock(); printf(" %6.4f ms\n",(toc-tic)/(float)1000); } // **************** // double-check the OpenACC result sequentially on the host // **************** // Perform the add printf("Calculation on CPU ... "); tic = clock(); for (i = 0; i < SIZE; ++i) { seq[i] = a[i] + b[i] ; if(c[i]!= seq[i]) { fprintf(stderr,"Error %d %16.10f!=%16.10f \n", i, c[i], seq[i]); return -1; } } toc = clock(); printf(" %6.4f ms\n",(toc-tic)/(float)1000); printf("atomic sum> %d (should be %d)\n", d[0], 3*SIZE*inc_step()); if(d[0]==3*SIZE*inc_step()){ fprintf(stderr,"OpenACC atomic operation test was successful!\n"); }else{ fprintf(stderr,"OpenACC atomic operation test failed!\n"); } return 0; }
int main (int argc, char **argv) { CUdevice dev; CUfunction delay; CUmodule module; CUresult r; CUstream stream; unsigned long *a, *d_a, dticks; int nbytes; float atime, dtime; void *kargs[2]; int clkrate; int devnum, nprocs; acc_init (acc_device_nvidia); devnum = acc_get_device_num (acc_device_nvidia); r = cuDeviceGet (&dev, devnum); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGet failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&nprocs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuDeviceGetAttribute (&clkrate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuDeviceGetAttribute failed: %d\n", r); abort (); } r = cuModuleLoad (&module, "subr.ptx"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleLoad failed: %d\n", r); abort (); } r = cuModuleGetFunction (&delay, module, "delay"); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuModuleGetFunction failed: %d\n", r); abort (); } nbytes = nprocs * sizeof (unsigned long); dtime = 200.0; dticks = (unsigned long) (dtime * clkrate); a = (unsigned long *) malloc (nbytes); d_a = (unsigned long *) acc_malloc (nbytes); acc_map_data (a, d_a, nbytes); kargs[0] = (void *) &d_a; kargs[1] = (void *) &dticks; r = cuStreamCreate (&stream, CU_STREAM_DEFAULT); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuStreamCreate failed: %d\n", r); abort (); } acc_set_cuda_stream (0, stream); init_timers (1); start_timer (0); r = cuLaunchKernel (delay, 1, 1, 1, 1, 1, 1, 0, stream, kargs, 0); if (r != CUDA_SUCCESS) { fprintf (stderr, "cuLaunchKernel failed: %d\n", r); abort (); } acc_wait (1); atime = stop_timer (0); if (atime < dtime) { fprintf (stderr, "actual time < delay time\n"); abort (); } start_timer (0); acc_wait (1); atime = stop_timer (0); if (0.010 < atime) { fprintf (stderr, "actual time < delay time\n"); abort (); } acc_unmap_data (a); fini_timers (); free (a); acc_free (d_a); acc_shutdown (acc_device_nvidia); return 0; }
int main(int argc, char* argv[]) { int i; unsigned int SIZEX = 0; unsigned int SIZEY = 0; unsigned int SIZEZ = 0; if (argc == 4) { sscanf(argv [1], "%d", &SIZEX); sscanf(argv [2], "%d", &SIZEY); sscanf(argv [3], "%d", &SIZEZ); }else{ printf("usage: %s xdim ydim zdim\n", argv [0]); return -1; } unsigned int SIZE = SIZEX * SIZEY * SIZEZ; assert(SIZE > 0); printf("allocation size> %d\n", SIZE); float *a = (float*)malloc(sizeof(float) * SIZE); float *b = (float*)malloc(sizeof(float) * SIZE); float *c = (float*)malloc(sizeof(float) * SIZE); #ifdef __NVCUDA__ acc_init(acc_device_nvcuda); #endif #ifdef __NVOPENCL__ #define DEVICE_TYPE acc_device_nvocl printf("compiled for ocl\n"); acc_init(DEVICE_TYPE); acc_list_devices_spec(DEVICE_TYPE); #endif for (i = 0; i < SIZE; ++i) { a [i] = (float)i; b [i] = (float)2 * i; c [i] = 0.0f; } int k; double revsum = 0; int iter = 30; ipmacc_prompt((char*)"IPMACC: memory allocation c\n"); // ISPC host and device are the same, skipping memory allocation ipmacc_prompt((char*)"IPMACC: memory allocation a\n"); // ISPC host and device are the same, skipping memory allocation ipmacc_prompt((char*)"IPMACC: memory allocation b\n"); // ISPC host and device are the same, skipping memory allocation ipmacc_prompt((char*)"IPMACC: memory copyin a\n"); // ISPC host and device are the same, skipping copyin ipmacc_prompt((char*)"IPMACC: memory copyin b\n"); // ISPC host and device are the same, skipping copyin { for(k = 0; k < iter; k++) { reset_and_start_timer(); /* kernel call statement*/ { unsigned int __ispc_n_threads = sysconf(_SC_NPROCESSORS_ONLN); // acc_get_n_cores(acc_device_intelispc); if(getenv("IPMACC_VERBOSE")) printf("IPMACC: Launching ISPC kernel> %d threads + SIMD \n", __ispc_n_threads); __generated_kernel_launch_0(a,c,b,SIZE); } /* kernel call statement*/ // ISPC target is synchronized with CPU // skipping synchronization double dt = get_elapsed_msec(); revsum += 1.0 / dt; printf("@time of openacc run:\t\t\t%.3f msec\n", dt); } } ipmacc_prompt((char*)"IPMACC: memory copyout c\n"); // ISPC host and device are the same, skipping copyout printf("harmonic mean openacc run> %.3f msec\n", iter / revsum); for (i = 0; i < SIZE; ++i) { if (c [i] != (a [i] + b [i])) { fprintf(stdout, "Error %d %16.10f!=%16.10f \n", i, c [i], a [i] + b [i]); return -1; } } fprintf(stdout, "OpenACC vectoradd test was successful!\n"); return 0; }
int main() { int i, j, k; int sum, known_sum; int NI, NJ, NK; int error; REAL *input, *temp; REAL frounding_error = 1.E-9; struct timeval tim; double start, end; NK = 2; NJ = 32; NI = 1<<20; error = 0; input = (REAL*)malloc(NK*NJ*NI*sizeof(REAL)); temp = (REAL*)malloc(NK*NJ*NI*sizeof(REAL)); acc_init(acc_device_default); srand((unsigned)time(0)); for(k=0; k<NK; k++) { for(j=0; j<NJ; j++) { for(i=0; i<NI; i++) { input[k*NJ*NI + j*NI + i] = (REAL)rand()/(REAL)RAND_MAX + 0.1; } } } gettimeofday(&tim, NULL); start = tim.tv_sec*1000 + (tim.tv_usec/1000.0); #pragma acc parallel copyin(input[0:NK*NJ*NI]) \ copyout(temp[0:NK*NJ*NI]) \ num_gangs(192) \ num_workers(8) \ vector_length(128) { #pragma acc loop gang for(k=0; k<NK; k++) { #pragma acc loop worker for(j=0; j<NJ; j++) { REAL i_sum = 0.0; #pragma acc loop vector reduction(+:i_sum) for(i=0; i<NI; i++) i_sum += input[k*NJ*NI + j*NI + i]; temp[k*NJ*NI + j*NI] = i_sum; } } } gettimeofday(&tim, NULL); end = tim.tv_sec*1000 + (tim.tv_usec/1000.0); for(k=0; k<NK; k++) { for(j=0; j<NJ; j++) { REAL i_sum = 0.0; for(i=0; i<NI; i++) i_sum += input[k*NJ*NI + j*NI + i]; if(fabsf(temp[k*NJ*NI + j*NI]-i_sum) > frounding_error) { error++; printf("vecotr + FAILED!\n"); } } } printf("vector + execution time is :%.2lf: ms\n", end-start); if(error == 0) printf("vector + SUCCESS!\n"); free(input); free(temp); }
int main(int argc, char* argv[]) { if (argc != 5) { printf("Usage: %s <nx> <ny> <ns> <nt>\n", argv[0]); exit(1); } const char* no_timing = getenv("NO_TIMING"); #if defined(_OPENACC) char* regcount_fname = getenv("OPENACC_PROFILING_FNAME"); if (regcount_fname) { char* regcount_lineno = getenv("OPENACC_PROFILING_LINENO"); int lineno = -1; if (regcount_lineno) lineno = atoi(regcount_lineno); //kernelgen_enable_openacc_regcount(regcount_fname, lineno); } #endif parse_arg(nx, argv[1]); parse_arg(ny, argv[2]); parse_arg(ns, argv[3]); parse_arg(nt, argv[4]); size_t szarray = (size_t)nx * ny * ns; size_t szarrayb = szarray * sizeof(real); real* x = (real*)memalign(MEMALIGN, szarrayb); real* y = (real*)memalign(MEMALIGN, szarrayb); real* xy = (real*)memalign(MEMALIGN, szarrayb); if (!x || !y || !xy) { printf("Error allocating memory for arrays: %p, %p, %p\n", x, y, xy); exit(1); } real mean = 0.0f; for (int i = 0; i < szarray; i++) { x[i] = real_rand(); y[i] = real_rand(); xy[i] = real_rand(); mean += x[i] + y[i] + xy[i]; } printf("initial mean = %f\n", mean / szarray / 3); // // MIC or OPENACC: // // 1) Perform an empty offload, that should strip // the initialization time from further offloads. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec init_s, init_f; #if defined(_MIC) get_time(&init_s); #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(0) free_if(0)), \ nocopy(y:length(szarray) alloc_if(0) free_if(0)), \ nocopy(xy:length(szarray) alloc_if(0) free_if(0)) { } get_time(&init_f); #endif #if defined(_OPENACC) get_time(&init_s); acc_init(acc_device_default); get_time(&init_f); #endif double init_t = get_time_diff((struct timespec*)&init_s, (struct timespec*)&init_f); if (!no_timing) printf("init time = %f sec\n", init_t); #endif volatile struct timespec total_s, total_f; get_time(&total_s); // // MIC or OPENACC: // // 2) Allocate data on device, but do not copy anything. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec alloc_s, alloc_f; #if defined(_MIC) get_time(&alloc_s); #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(1) free_if(0)), \ nocopy(y:length(szarray) alloc_if(1) free_if(0)), \ nocopy(xy:length(szarray) alloc_if(1) free_if(0)) { } get_time(&alloc_f); #endif #if defined(_OPENACC) get_time(&alloc_s); #pragma acc data create (x[0:szarray], y[0:szarray], xy[0:szarray]) { get_time(&alloc_f); #endif double alloc_t = get_time_diff((struct timespec*)&alloc_s, (struct timespec*)&alloc_f); if (!no_timing) printf("device buffer alloc time = %f sec\n", alloc_t); #endif // // MIC or OPENACC: // // 3) Transfer data from host to device and leave it there, // i.e. do not allocate deivce memory buffers. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec load_s, load_f; #if defined(_MIC) get_time(&load_s); #pragma offload target(mic) \ in(x:length(szarray) alloc_if(0) free_if(0)), \ in(y:length(szarray) alloc_if(0) free_if(0)), \ in(xy:length(szarray) alloc_if(0) free_if(0)) { } get_time(&load_f); #endif #if defined(_OPENACC) get_time(&load_s); #pragma acc update device(x[0:szarray], y[0:szarray], xy[0:szarray]) get_time(&load_f); #endif double load_t = get_time_diff((struct timespec*)&load_s, (struct timespec*)&load_f); if (!no_timing) printf("data load time = %f sec (%f GB/sec)\n", load_t, 2 * szarrayb / (load_t * 1024 * 1024 * 1024)); #endif // // 4) Perform data processing iterations, keeping all data // on device. // volatile struct timespec compute_s, compute_f; get_time(&compute_s); #if defined(_MIC) #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(0) free_if(0)), \ nocopy(y:length(szarray) alloc_if(0) free_if(0)), \ nocopy(xy:length(szarray) alloc_if(0) free_if(0)) #endif { for (int it = 0; it < nt; it++) { #if defined(_PATUS) real* dummy; #pragma omp parallel sincos_patus(&dummy, x, y, xy, nx, ny, ns); #else sincos_(&nx, &ny, &ns, x, y, xy); #endif } } get_time(&compute_f); double compute_t = get_time_diff((struct timespec*)&compute_s, (struct timespec*)&compute_f); if (!no_timing) printf("compute time = %f sec\n", compute_t); // // MIC or OPENACC: // // 5) Transfer output data back from device to host. // #if defined(_MIC) || defined(_OPENACC) volatile struct timespec save_s, save_f; #if defined(_MIC) get_time(&save_s); #pragma offload target(mic) \ out(xy:length(szarray) alloc_if(0) free_if(0)) { } get_time(&save_f); #endif #if defined(_OPENACC) get_time(&save_s); #pragma acc update host (xy[0:szarray]) get_time(&save_f); #endif double save_t = get_time_diff((struct timespec*)&save_s, (struct timespec*)&save_f); if (!no_timing) printf("data save time = %f sec (%f GB/sec)\n", save_t, szarrayb / (save_t * 1024 * 1024 * 1024)); #endif // // MIC or OPENACC: // // 6) Deallocate device data buffers. // OPENACC does not seem to have explicit deallocation. // #if defined(_OPENACC) } #endif #if defined(_MIC) volatile struct timespec free_s, free_f; get_time(&free_s); #pragma offload target(mic) \ nocopy(x:length(szarray) alloc_if(0) free_if(1)), \ nocopy(y:length(szarray) alloc_if(0) free_if(1)), \ nocopy(xy:length(szarray) alloc_if(0) free_if(1)) { } get_time(&free_f); double free_t = get_time_diff((struct timespec*)&free_s, (struct timespec*)&free_f); // if (!no_timing) printf("device buffer free time = %f sec\n", free_t); #endif get_time(&total_f); if (!no_timing) printf("device buffer free time = %f sec\n", get_time_diff((struct timespec*)&total_s, (struct timespec*)&total_f)); // For the final mean - account only the norm of the top // most level (tracked by swapping idxs array of indexes). mean = 0.0f; for (int i = 0; i < szarray; i++) mean += xy[i]; printf("final mean = %f\n", mean / szarray); free(x); free(y); free(xy); fflush(stdout); return 0; }