/**
 * Allocates the three matrix buffers owned by this object.
 *
 * Every buffer holds matrixSize * matrixSize elements. A and B are handed to
 * randomInit() to be filled; C is only allocated here and its contents are
 * left as the allocator produced them.
 */
Workers::Workers()
{
    const int elementCount = matrixSize * matrixSize;

    A = new ElementType[elementCount];
    B = new ElementType[elementCount];
    C = new ElementType[elementCount];

    randomInit(A, elementCount);
    randomInit(B, elementCount);
}
/*
 * Program entry point.
 *
 * Runs randomInit(), sets the PB2..PB5 bits in DDRB, plays the start-up demo
 * and then loops forever: each round calls generate() for the current length
 * and check() on it. A non-zero check() result ends the game via gameOver()
 * and restarts the length at 1; otherwise the length grows by one per round.
 */
int main(void)
{
    /* Bits for PB2..PB5, applied to DDRB in one read-modify-write.
     * NOTE(review): presumably these are the LED pins being made outputs - confirm. */
    const int directionBits = (1 << PB2) | (1 << PB3) | (1 << PB4) | (1 << PB5);
    unsigned char roundLength;

    randomInit();
    DDRB |= directionBits;

    startDemo(3, 0b00000011);
    _delay_ms(500);

    for (roundLength = 1; ; roundLength++) {
        /* Disabled button/LED echo test:
        unsigned char buttons = getButton();
        ledSet(buttons, 1);
        ledSet(~buttons, 0);*/

        generate(roundLength);

        if (check(roundLength)) {
            gameOver(roundLength - 1);
            /* The loop increment turns this into 1 for the next round. */
            roundLength = 0;
        }

        _delay_ms(500);
    }

    return 0;
}
/**
 * Initialize the given weight matrix.
 *
 * W is first filled by RandomInitialization constructed with the bounds
 * (-gamma, gamma); every element w is then replaced by
 * (b / (k * rows)) * sqrt(w + 1).
 *
 * @param W Weight matrix to initialize.
 * @param rows Number of rows.
 * @param cols Number of columns.
 */
void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
{
  RandomInitialization uniformInit(-gamma, gamma);
  uniformInit.Initialize(W, rows, cols);

  // Same element-wise result as scale * sqrt(W + 1), done in two steps.
  W = arma::sqrt(W + 1);
  W *= (b / (k * rows));
}
/**
 * Initialize the given weight matrix.
 *
 * A row vector of candidate bounds is computed as
 * s * sqrt(3 / (rows * dataSum)); its smallest entry becomes the symmetric
 * bound passed to RandomInitialization, which then fills W.
 *
 * @param W Weight matrix to initialize.
 * @param rows Number of rows.
 * @param cols Number of columns.
 */
void Initialize(arma::Mat<eT>& W, const size_t rows, const size_t cols)
{
  arma::Row<eT> candidateBounds = s * arma::sqrt(3 / (rows * dataSum));

  // The tightest candidate is used as the bound on both sides of zero.
  const double bound = candidateBounds.min();

  RandomInitialization uniformInit(-bound, bound);
  uniformInit.Initialize(W, rows, cols);
}
// ------------------------------------- // Main function // ------------------------------------- void appMain(void) { uint16_t i; // timers (used to get an extra interrupt context) alarmInit(&timer, onTimer, NULL); alarmSchedule(&timer, 1000); // radio radioSetReceiveHandle(radioRecvCb); radioOn(); for (i = 0; i < BUFFER_SIZE; i++) { buffer[i] = i; } randomInit(); // SELECT_FLASH; // extFlashBulkErase(); // UNSELECT_FLASH; for (i = 0; ; i++) { uint32_t address = i * 64ul; SELECT_FLASH; if (IS_ALIGNED(address, EXT_FLASH_SECTOR_SIZE)) { PRINTF("erase address %lu\n", address); flashErase(address); } PRINTF("write address %lu\n", address); flashWrite(address); if (address > 0) { PRINTF("verify...\n"); flashRead(address - 64); } UNSELECT_FLASH; msleep(randomInRange(400, 1000)); PRINTF("send smth to radio...\n"); radioSend("hello world", sizeof("hello world")); greenLedToggle(); } }
/*
 * Small driver for the grid update routines.
 *
 * Two WDEFAULT x HDEFAULT integer grids are zeroed, g1 is populated by
 * randomInit() using a freshly seeded GSL generator, and the grid is then
 * advanced 1 + nsteps times with updateGrid(), printing every result.
 */
int main (int argc, char* argv[]){
  int g1[WDEFAULT][HDEFAULT], g2[WDEFAULT][HDEFAULT];
  const int nsteps = 2;
  const gsl_rng_type *rngType;
  gsl_rng *rng;
  int step;

  /* start from two all-zero grids */
  memset(g1, 0, sizeof(g1));
  memset(g2, 0, sizeof(g2));

  /* default GSL generator, seeded from get_seed_noblock() */
  gsl_rng_env_setup();
  rngType = gsl_rng_default;
  rng = gsl_rng_alloc(rngType);
  gsl_rng_set(rng, get_seed_noblock());

  randomInit(g1, 0.55, rng);
  printGrid(g1);

  /* first generation: g1 -> g2 */
  updateGrid(g1, g2);
  printf("# updated \n");
  printGrid(g2);

  for (step = 0; step < nsteps; step++) {
    /* the newest generation becomes the input of the next update */
    memcpy(g1, g2, sizeof(g1));
    updateGrid(g1, g2);
    printGrid(g2);
  }

  gsl_rng_free(rng);
  return EXIT_SUCCESS;
}
/**
 * \brief Initializes all the subsystems for this Droplet. This function MUST be called
 * by the user before using any other functions in the API.
 *
 * Interrupts are disabled with cli() on entry and turned back on by enableInterrupts()
 * once the clock, ID, scheduler, PC comm, RGB LED, power and I2C set-up calls have run;
 * everything after that point is initialized with interrupts enabled.
 *
 * The call order matters: the inline notes below record the known ordering constraints
 * (microphone after the IR sensors, random seeding after the ADCs).
 * Each INIT_DEBUG_PRINT marks the step that has just completed.
 */
static void initAllSystems(void){
	cli();
	Config32MHzClock();

	calculateIdNumber();

	schedulerInit();			INIT_DEBUG_PRINT("SCHEDULER INIT\r\n");
	pcCommInit();				INIT_DEBUG_PRINT("PC COM INIT\r\n");
	rgbLEDinit();				INIT_DEBUG_PRINT("LED INIT\r\n");
	powerInit();				INIT_DEBUG_PRINT("POWER INIT\r\n");
	i2cInit();					INIT_DEBUG_PRINT("I2C INIT\r\n");

	// From here on the remaining subsystems are brought up with interrupts enabled.
	enableInterrupts();

	rangeAlgsInit();			INIT_DEBUG_PRINT("RANGE ALGORITHMS INIT\r\n");
	rgbSensorInit();			INIT_DEBUG_PRINT("RGB SENSE INIT\r\n");
	irLedInit();				INIT_DEBUG_PRINT("IR LED INIT\r\n");
	irSensorInit();				INIT_DEBUG_PRINT("IR SENSE INIT\r\n");

	// Audio hardware is only initialized on builds that define AUDIO_DROPLET.
	#ifdef AUDIO_DROPLET
		speakerInit();			INIT_DEBUG_PRINT("SPEAKER INIT\r\n");
		micInit();				INIT_DEBUG_PRINT("MIC INIT\r\n"); //Must occur after ir_sensor_init.
	#endif

	motorInit();				INIT_DEBUG_PRINT("MOTOR INIT\r\n");
	randomInit();				INIT_DEBUG_PRINT("RAND INIT\r\n"); //This uses adc readings for a random seed, and so requires that the adcs have been initialized.
	localizationInit();			INIT_DEBUG_PRINT("LOCALIZATION INIT\r\n");

	// Firefly synchronization is optional and compiled in only with SYNCHRONIZED.
	#ifdef SYNCHRONIZED
		fireflySyncInit();
	#endif

	setAllirPowers(256);
	startupLightSequence();

	irCommInit();				INIT_DEBUG_PRINT("IR COM INIT\r\n");

	// The microphone interrupt is enabled last, after every other subsystem is up.
	#ifdef AUDIO_DROPLET
		enableMicInterrupt();
	#endif
}
int main(void) { char buf[80]; system("clear"); printf("\n\n\n ### GPU ENABLED CODE\n"); printf("\n ### Host creating two input vectors and one output vector\n"); unsigned i, LIST_SIZE; FILE *fp0; fp0 = fopen("data.txt", "r"); if (!fp0) { fprintf(stderr, "Failed to load data.txt.\n"); exit(1); } fgets(buf, sizeof(buf), fp0); sscanf(buf,"%d", &LIST_SIZE); printf(" ### Vector_size:%d elements, Elem_size:%lu byte\n", LIST_SIZE, sizeof(int)); fclose(fp0); int *A = (int*)malloc(sizeof(int)*LIST_SIZE); if (A == NULL) { fprintf(stderr, "failed to allocate memory.\n"); return -1; } int *B = (int*)malloc(sizeof(int)*LIST_SIZE); if (B == NULL) { fprintf(stderr, "failed to allocate memory.\n"); return -1; } int *C = (int*)malloc(sizeof(int)*LIST_SIZE); if (C == NULL) { fprintf(stderr, "failed to allocate memory.\n"); return -1; } printf("\n Host initialising input vector values\n"); for(i = 0; i < LIST_SIZE; i++) { A[i] = /*randomInit()*/ i; B[i] = /*randomInit()*/ LIST_SIZE-i; loadBar(i, LIST_SIZE, 100, 2); } printf("\n Setting up GPU\n"); // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; fp = fopen("vec.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); printf("Check platform id %s\n", getErrorString(ret)); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); printf("device_id:%u, device_count:%u\n", (int) device_id, ret_num_devices); printf("Check device id %s\n", getErrorString(ret)); // Create an OpenCL context cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); printf("Check context %s\n", 
getErrorString(ret)); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); printf("Check command queue %s\n", getErrorString(ret)); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); printf("Check a_mem_obj %s\n", getErrorString(ret)); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); printf("Check b_mem_obj %s\n", getErrorString(ret)); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); printf("Check c_mem_obj %s\n", getErrorString(ret)); // <- // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); printf("Check program %s\n", getErrorString(ret)); // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); // ret = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); printf("Check build program %s, device_id:%d \n", getErrorString(ret),(int)device_id); // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "vector_add", &ret); printf("Check kernel create %s\n", getErrorString(ret)); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); printf("Check kernel arg 0 %s\n", getErrorString(ret)); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); printf("Check kernel arg 1 %s\n", getErrorString(ret)); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); ret |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &LIST_SIZE); printf("Check kernel args 0-3 %s\n", getErrorString(ret)); // COPY EXECUTE BLOCK ////////////////////////////////////////////// // ->Copy the lists A and B to their respective memory buffers ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * 
sizeof(int), A, 0, NULL, NULL); printf("Check Read B %s\n", getErrorString(ret)); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL); printf("Check Read B %s\n", getErrorString(ret)); // Execute the OpenCL kernel on the list printf("\n GPU adding the vectors \n"); size_t global_item_size = LIST_SIZE; // Process the entire lists size_t local_item_size = 1; // Process one item at a time ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); printf("Check kernel EnqueueNDRange %s\n", getErrorString(ret)); // Read the memory buffer c_mem_obj on the device to the local C ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL); printf("Check Read Buffer C %s\n", getErrorString(ret)); //////////////////////////////////////////////////////////////////// // Display the first 10 result to the screen for(i = 0; i < 10; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); printf("\n"); // Display the last 10 result to the screen for(i = LIST_SIZE-10; i < LIST_SIZE; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); //// SECOND TIME //////////////////////////////////////////////////////// ///////////////////////////////////////////////////////////////////////// printf("\n Host initialising input vectors randomly \n"); for(i = 0; i < LIST_SIZE; i++) { A[i] = randomInit(); B[i] = randomInit(); loadBar(i, LIST_SIZE, 100, 2); } // COPY EXECUTE BLOCK RE-RUN //////////////////////////////////////// // ->Copy the lists A and B to their respective memory buffers ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL); printf("Check Read B %s\n", getErrorString(ret)); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL); printf("Check Read B %s\n", getErrorString(ret)); // Execute the OpenCL kernel on the list ret = 
clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); printf("Check kernel EnqueueNDRange %s\n", getErrorString(ret)); // Read the memory buffer c_mem_obj on the device to the local C ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL); printf("Check Read Buffer C %s\n", getErrorString(ret)); //////////////////////////////////////////////////////////////////// // Display the first 10 result to the screen for(i = 0; i < 10; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); printf("\n"); // Display the last 10 result to the screen for(i = LIST_SIZE-10; i < LIST_SIZE; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); ////// SECOND TIME ENDS///////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////// printf("\n Cleaning up GPU queues\n"); // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); printf("\n Host freeing up the vectors\n"); free(A); free(B); free(C); return 0; }
int main(void) { int N = 100; int M = 20; // The number of classes determines the number of problems Problem* probs = (Problem*) malloc(M * sizeof(Problem)); // Initialize the parameters for the SVM one class Parameter param; // Node *x_space; // Type of SVM param.svm_type = 0; param.kernel_type = RBF; param.degree = 3; param.gamma = 0; param.coef0 = 0; // param.nu = 0.05; // param.cache_size = 1000; // param.C = 1; // param.eps = 1e-3; // param.p = 0.1; // param.shrinking = 1; // param.probability = 0; // param.nr_weight = 0; // param.weight_label = NULL; // param.weight = NULL; // Generate random Matrix and vector unsigned long vector_start_time = start_timer(); unsigned long int mem_size_X = sizeof(float) * M * N; unsigned long int mem_size_K = sizeof(float) * M * M; unsigned long int mem_size_Y = sizeof(float) * M; float *Y = (float *) malloc(mem_size_Y); float *X = (float *) malloc(mem_size_X); float *K = (float *) malloc(mem_size_K); // initialize host memory srand(time(NULL )); randomInit(X, M * N); srand(time(NULL )); randomInit(Y, M); Node* x_space = malloc(N * sizeof(Node*)); for (unsigned int i = 0; i < M; i++) { probs[i].idx = i; probs[i].y = Y[i]; for (unsigned int j = 0; j < N; j++) { x_space[j].idx = j; x_space[j].value = X[i * N + j]; } probs[i].x = x_space; } stop_timer(vector_start_time, "Vector generation"); printf("\nUsing LINEAR\n"); unsigned long ker_start_time = start_timer(); //formK(K, probs, M); stop_timer(ker_start_time, "Time LINEAR\t"); printf("\nUsing POLY\n"); ker_start_time = start_timer(); stop_timer(ker_start_time, "Time POLY\t"); printf("\nUsing RBF\n"); ker_start_time = start_timer(); stop_timer(ker_start_time, "Time RBF\t"); printf("\nUsing SIGMOID\n"); ker_start_time = start_timer(); stop_timer(ker_start_time, "Time SIGMOID\t"); return EXIT_SUCCESS; }
int main(int argc, char** argv) { // set seed for rand() srand(2006); // 1. allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float)* size_A; float* h_A = (float*)malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float)* size_B; float* h_B = (float*)malloc(mem_size_B); // 2. initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); int i,j,k; // 3. print out A and B //printf("\n\nMatrix A\n"); /*for (i = 0; i < size_A; i++) { printf("%f ", h_A[i]); if (((i + 1) % WA) == 0) printf("\n"); } printf("\n\nMatrix B\n"); for (i = 0; i < size_B; i++) { printf("%f ", h_B[i]); if (((i + 1) % WB) == 0) printf("\n"); }*/ // 4. allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float)* size_C; float* h_C = (float*)malloc(mem_size_C); // 5. Initialize OpenCL // OpenCL specific variables cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; cl_platform_id platform_id = NULL; // cl_uint ret_num_devices; // cl_uint ret_num_platforms; // size_t dataBytes; size_t kernelLength; cl_int errcode; cl_device_id device_id = NULL; // cl_int ret; // // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; /*****************************************/ /* Initialize OpenCL */ /*****************************************/ //clGPUContext = clCreateContextFromType(0, // CL_DEVICE_TYPE_GPU, // NULL, NULL, &errcode); //shrCheckError(errcode, CL_SUCCESS); // get the list of GPU devices associated // with context ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices); /* Create OpenCL context */ clGPUContext = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); /* Create Command Queue */ clCommandQue = clCreateCommandQueue(clGPUContext, device_id, 0, &ret); /* errcode = 
clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); */ // shrCheckError(errcode, CL_SUCCESS); //Create a command-queue // clCommandQue = clCreateCommandQueue(clGPUContext, // clDevices[0], 0, &errcode); // shrCheckError(errcode, CL_SUCCESS); // Setup device memory d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &ret); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &ret); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &ret); //printf("6. Load and build OpenCL kernel\n"); // 6. Load and build OpenCL kernel FILE *fp; char fileName[] = "./kernel.cl"; char *source_str; size_t source_size; /* Load the source code containing the kernel*/ fp = fopen(fileName, "r"); if (!fp) { printf("Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); clProgram = clCreateProgramWithSource(clGPUContext, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); // shrCheckError(errcode, CL_SUCCESS); clKernel = clCreateKernel(clProgram, "matrixMul", &ret); // shrCheckError(errcode, CL_SUCCESS); // 7. 
Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; ret = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); ret |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); ret |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); ret |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); ret |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); // shrCheckError(errcode, CL_SUCCESS); localWorkSize[0] = 3; localWorkSize[1] = 3; globalWorkSize[0] = 3; globalWorkSize[1] = 3; ret = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); // shrCheckError(errcode, CL_SUCCESS); // 8. Retrieve result from device ret = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); //shrCheckError(errcode, CL_SUCCESS); // 9. print out the results /*printf("\n\nMatrix C (Results)\n"); for (i = 0; i < size_C; i++) { printf("%f ", h_C[i]); if (((i + 1) % WC) == 0) printf("\n"); } printf("\n");*/ // 10. clean up memory free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); //free(clDevices); //free(clMatrixMul); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue); }
int main(int argc, char *argv[]) { int i,j,k; machineInformation currentMachine; counterSessionInfo session; initializeCUDA(); // Set machine information from CounterHomeBrew.h currentMachine.cpu_model = CPU_MODEL; currentMachine.num_sockets = NUM_SOCKETS; currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET; currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET; currentMachine.num_cores = NUM_CORES; currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS??? currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX; currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX; // Set session events, umasks and counters used // int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80}; // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01}; session.core_gen_counter_num_used = 5; int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1}; int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x37}; int32 cbo_umasks[] = {0xf}; session.cbo_filter = 0x1f; for (i = 0; i < session.core_gen_counter_num_used; i++) { session.core_event_numbers[i] = core_event_numbers[i]; session.core_umasks[i] = core_umasks[i]; } for (i = 0; i < session.cbo_counter_num_used; i++) { session.cbo_event_numbers[i] = cbo_event_numbers[i]; session.cbo_umasks[i] = cbo_umasks[i]; } int fd[NUM_CORES]; // Arrays to hold counter data... counterData before; counterData after; // some data for doing a naive matmul to test flop counting... // initloop(N); // M,N,K are multiples of the block size.... 
int gpuOuter = atoi(argv[1]); int gpuInner = atoi(argv[2]); int cpuInner = atoi(argv[3]); double minRuntime = atoi(argv[4]); int Md = atoi(argv[5])*block_size; int Nd = atoi(argv[6])*block_size; int Kd = atoi(argv[7])*block_size; int Mh = atoi(argv[8]); int Nh = atoi(argv[9]); int Kh = atoi(argv[10]); char *ts1,*ts2,*ts3,*ts4; char *ts5,*ts6,*ts7,*ts8; double fineTimeStamps[8]; double gTime = 0.0; double cTime = 0.0; double seconds = 0.0; int num_iters; uint64 *coreSums; coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64)); uint64 *sums; sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64)); float *Atmp = NULL; float *Btmp = NULL; float *Ctmp = NULL; Atmp = (float*) malloc( Mh * Nh * sizeof(float) ); Btmp = (float*) malloc( Nh * sizeof(float) ); Ctmp = (float*) malloc( Mh * sizeof(float) ); randomInit(Atmp,Mh*Nh); randomInit(Btmp,Nh); for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) { seconds = 0.0; for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 ); seconds = read_timer()-seconds; } // num_iters /= 2; free(Atmp); free(Btmp); free(Ctmp); int readyThreads = 0; #pragma omp parallel { int threadNum = omp_get_thread_num(); int numThreads = omp_get_num_threads(); assert(numThreads==2); if (threadNum == 0) { cudaError_t error; int memSizeA = sizeof(float)*Md*Nd; int memSizeB = sizeof(float)*Nd; int memSizeC = sizeof(float)*Md; float *Ahost,*Bhost,*Chost; // use pinned memory on the host for BW and asynch memory transfers.. 
int flags = cudaHostAllocDefault; ts5 = getTimeStamp(); fineTimeStamps[0] = read_timer(); error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} // set local arrays randomInit(Ahost,Md*Nd); randomInit(Bhost,Nd); // allocate device memory float *Adevice,*Bdevice,*Cdevice; error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} fineTimeStamps[1] = read_timer(); ts6 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready GPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);}; //#pragma omp single //{ cudaStream_t stream1; cudaStreamCreate ( &stream1) ; ts3 = getTimeStamp(); fineTimeStamps[2] = read_timer(); gTime = read_timer(); for (int i = 0; i < gpuOuter; i++) GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1); cudaStreamSynchronize(stream1); gTime = read_timer() - gTime; fineTimeStamps[3] = read_timer(); ts4 = getTimeStamp(); cudaFreeHost(Ahost); cudaFreeHost(Bhost); cudaFreeHost(Chost); } else { // uint64 
min_iters = strtoull(argv[4],NULL,0); float *A = NULL; float *B = NULL; float *C = NULL; ts7 = getTimeStamp(); fineTimeStamps[4] = read_timer(); A = (float*) malloc( Mh * Nh * sizeof(float) ); B = (float*) malloc( Nh * sizeof(float) ); C = (float*) malloc( Mh * sizeof(float) ); randomInit(A,Mh*Nh); randomInit(B,Nh); fineTimeStamps[5] = read_timer(); ts8 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready CPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);}; // open the msr files for each core on the machine for (i = 0; i < currentMachine.num_cores; i++) open_msr_file(i,&fd[i]); int socketsProgrammed = 0; for (i = 0; i < currentMachine.num_cores; i++) { int currentCoreFD = fd[i]; stopCounters(i, currentCoreFD, ¤tMachine, &session); programCoreFixedCounters(currentCoreFD); programGeneralPurposeRegisters(currentCoreFD, ¤tMachine, &session); /* Program the Uncore as desired...*/ // Only program the first physical core on each socket. // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm. if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) { programUncoreCounters( currentCoreFD, ¤tMachine, &session); socketsProgrammed++; } } seconds = 0.0; // start the programmed counters... 
for (i = 0; i < currentMachine.num_cores; i++) startCounters( i, fd[i], ¤tMachine, &session); /* READ COUNTERS BEFORE STUFF */ readCounters(fd,¤tMachine,&session, &before); ts1 = getTimeStamp(); fineTimeStamps[6] = read_timer(); seconds = read_timer(); /* DO STUFF */ for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 ); /* END DOING STUFF */ seconds = read_timer()-seconds; fineTimeStamps[7] = read_timer(); ts2 = getTimeStamp(); /* READ COUNTERS AFTER STUFF */ for (i = 0; i < currentMachine.num_cores; i++) stopCounters(i,fd[i],¤tMachine, &session); // printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds); readCounters(fd,¤tMachine,&session,&after); diffCounterData(¤tMachine, &session, &after, &before, &after); for (i = 0; i < currentMachine.num_sockets; i++) { // printf("Socket %d\n",i); for (j = 0; j < currentMachine.num_cores_per_socket; j++) { // printf("%d,",j); for (k = 0; k < session.core_gen_counter_num_used; k++){ // printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]); // bug in the indexing of the core sums??? // coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; } // printf("\n"); } } for (i = 0; i < currentMachine.num_sockets; i++) { // printf("%d,",i); for (j = 0; j < currentMachine.num_cbos; j++) { // printf("%d,",j); for (k = 0; k < session.cbo_counter_num_used; k++) { // printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]); // bug in the indexing of the core sums??? 
// sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; } } } // printf("\n"); // Stop counters, reset PMU, close msr files cleanup(fd,¤tMachine,&session); free(A); free(B); free(C); } } // end parallel region printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0); for (int i = 0; i < 8; i++) printf("%f,",fineTimeStamps[i]); for (j = 0; j < session.core_gen_counter_num_used; j++) printf("%llu,",coreSums[j]); for (j = 0; j < session.cbo_counter_num_used; j++) if (j == session.cbo_counter_num_used-1) printf("%llu",sums[j]); else printf("%llu,",sums[j]); printf("\n"); free(sums); free(coreSums); return 0; }
// Default constructor: its only action is to call randomInit().
// What randomInit() sets up is defined elsewhere and not visible from here.
Random::Random() { randomInit(); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { cudaDeviceProp deviceProp; checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID)); // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; // set seed for rand() srand(2006); // allocate host memory for matrices A and B unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // set seed for rand() srand(2006); // initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate device memory float *d_A, *d_B, *d_C; unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; unsigned int mem_size_C = sizeof(float) * size_C; // allocate host memory for the result float *h_C = (float *) malloc(mem_size_C); float *h_CUBLAS = (float *) malloc(mem_size_C); checkCudaErrors(cudaMalloc((void **) &d_A, mem_size_A)); checkCudaErrors(cudaMalloc((void **) &d_B, mem_size_B)); checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice)); checkCudaErrors(cudaMalloc((void **) &d_C, mem_size_C)); // setup execution parameters dim3 threads(block_size, block_size); dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // execute the kernel int nIter = 30; // CUBLAS version 2.0 { const float alpha = 1.0f; const float beta = 0.0f; cublasHandle_t handle; cudaEvent_t start, stop; checkCudaErrors(cublasCreate(&handle)); 
//Perform warmup operation with cublas checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA)); // Allocate CUDA events that we'll use for timing checkCudaErrors(cudaEventCreate(&start)); checkCudaErrors(cudaEventCreate(&stop)); // Record the start event checkCudaErrors(cudaEventRecord(start, NULL)); for (int j = 0; j < nIter; j++) { //note cublas is column primary! //need to transpose the order checkCudaErrors(cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA)); } printf("done.\n"); // Record the stop event checkCudaErrors(cudaEventRecord(stop, NULL)); // Wait for the stop event to complete checkCudaErrors(cudaEventSynchronize(stop)); float msecTotal = 0.0f; checkCudaErrors(cudaEventElapsedTime(&msecTotal, start, stop)); // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); // copy result from device to host checkCudaErrors(cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost)); // Destroy the handle checkCudaErrors(cublasDestroy(handle)); } // compute reference solution printf("Computing result using host CPU..."); float *reference = (float *)malloc(mem_size_C); matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); printf("done.\n"); // check result (CUBLAS) bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true) { printDiff(reference, h_CUBLAS, matrix_size.uiWC, 
matrix_size.uiHC, 100, 1.0e-5f); } printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); printf("\nNOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.\n"); // clean up memory free(h_A); free(h_B); free(h_C); free(reference); checkCudaErrors(cudaFree(d_A)); checkCudaErrors(cudaFree(d_B)); checkCudaErrors(cudaFree(d_C)); // cudaDeviceReset causes the driver to clean up all state. While // not mandatory in normal operation, it is good practice. It is also // needed to ensure correct operation when the application is being // profiled. Calling cudaDeviceReset causes all profile data to be // flushed before the application exits cudaDeviceReset(); if (resCUBLAS == true) { return EXIT_SUCCESS; // return value = 1 } else { return EXIT_FAILURE; // return value = 0 } }
int main(int argc, char* argv[]) { int num_ker=0,num_queue; num_ker=atoi(argv[2]); num_queue=atoi(argv[3]); //variables /*#define WA 1024 #define HA 1024 #define WB 1024 #define HB WA #define WC WB #define HC HA */ struct timeval tim,ftim; double t1,t2,tim1,tim2; // gettimeofday(&tim, NULL); // t1=tim.tv_sec+(tim.tv_usec/1000000.0); gettimeofday(&ftim, NULL); tim1=ftim.tv_sec+(ftim.tv_usec/1000000.0); int m,WA,HA,WB,HB,WC,HC; m = atoi(argv[5]); WA=(256*m); HA = WA; WB = WA; HB = WB; WC = WA; HC = WA; // set seed for rand() srand(2006); // 1. allocate host memory for matrices A and B //automate the size of the matrix unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(int) * size_A; int* h_A = (int*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(int) * size_B; int* h_B = (int*) malloc(mem_size_B); // 2. initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); /* // 3. print out A and B printf("\n\nMatrix A\n"); for(i = 0; i < size_A; i++) { printf("%f ", h_A[i]); if(((i + 1) % WA) == 0) printf("\n"); } printf("\n\nMatrix B\n"); for(i = 0; i < size_B; i++) { printf("%f ", h_B[i]); if(((i + 1) % WB) == 0) printf("\n"); } */ // 4. allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(int) * size_C; int* h_C = (int*) malloc(mem_size_C); // 5. 
Initialize OpenCL // OpenCL specific variables cl_context clGPUContext; // cl_command_queue* clCommandQue; //cl_program clProgram; //cl_kernel clKernel; cl_platform_id* cpPlatform; // OpenCL platform cl_uint platformCount; //keeps the divice count size_t dataBytes; size_t kernelLength; cl_int errcode; // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; /*****************************************/ /* Initialize OpenCL */ /*****************************************/ //cl_platform_id* cpPlatform; // OpenCL platform //cl_device_id device_id;// = (cl_device_id)malloc(sizeof(cl_device_id)); // Bind to platform // errcode = clGetPlatformIDs(1, &cpPlatform, NULL); clGetPlatformIDs(0, NULL, &platformCount); cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here cl_device_id device_id; int choice =atoi(argv[1]); if(choice ==1) { // Length of vectors // n = 64; // Connect to a compute device // we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here //depending what device are we working on // we can these multiple times depending on requirements errcode = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL); if (errcode != CL_SUCCESS) printf("Error: Failed to create a device group!\n"); } else { // errcode = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device errcode = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (errcode != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); } } //printf("here"); // Create a context clGPUContext = clCreateContext(0, 1, &device_id, NULL, NULL, &errcode); // Create a command queue //printf("here"); /*clGPUContext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); //shrCheckError(errcode, CL_SUCCESS); // get the list of GPU devices associated // with context errcode = 
clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); //shrCheckError(errcode, CL_SUCCESS); */ //malloc for command queue, kernel and program cl_kernel *clKernel=(cl_kernel *)malloc(num_ker * sizeof(cl_kernel)); cl_program *clProgram=(cl_program *)malloc(num_ker * sizeof(cl_kernel)); cl_command_queue * clCommandQue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue)); //Create a command-queue for(i=0;i<num_queue;i++) { clCommandQue[i] = clCreateCommandQueue(clGPUContext, device_id, 0, &errcode); } //shrCheckError(errcode, CL_SUCCESS); /* // Setup device memory d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, printf("\nhere"); CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); */ char *file="matxm.cl"; char *KernelSource = load_program_source(file); for(i=0;i<num_ker;i++) { clProgram[i] = clCreateProgramWithSource(clGPUContext, 1, (const char **) & KernelSource, NULL, &errcode); //shrCheckError(errcode, CL_SUCCESS); errcode = clBuildProgram(clProgram[i], 0, NULL, NULL, NULL, NULL); //shrCheckError(errcode, CL_SUCCESS); clKernel[i] = clCreateKernel(clProgram[i], "matrixMul", &errcode); } //shrCheckError(errcode, CL_SUCCESS); // Setup device memory d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_B, h_B, &errcode); // Write our data set into the input array in device memory for(i=0;i<num_queue;i++){ errcode = clEnqueueWriteBuffer(clCommandQue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL); errcode = 
clEnqueueWriteBuffer(clCommandQue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL); } // 7. Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; for(i=0;i<num_ker;i++) { errcode = clSetKernelArg(clKernel[i], 0, sizeof(cl_mem), (void *)&d_C); errcode = clSetKernelArg(clKernel[i], 1, sizeof(cl_mem), (void *)&d_A); errcode = clSetKernelArg(clKernel[i], 2, sizeof(cl_mem), (void *)&d_B); errcode = clSetKernelArg(clKernel[i], 3, sizeof(int), (void *)&wA); errcode = clSetKernelArg(clKernel[i], 4, sizeof(int), (void *)&wC); } // shrCheckError(errcode, CL_SUCCESS); //struct timespec start, finish; int value; value =atoi(argv[4]); localWorkSize[0] = value ; localWorkSize[1] = value ; globalWorkSize[0] = HA; globalWorkSize[1] = HA; //clFinish(clCommandQue); //timer starting // clock_gettime(CLOCK_MONOTONIC, &start); //struct timeval tim; //double t1,t2; // gettimeofday(&tim, NULL); // t1=tim.tv_sec+(tim.tv_usec/1000000.0); gettimeofday(&tim, NULL); t1=tim.tv_sec+(tim.tv_usec/1000000.0); //multikernels inside queues int j=0; for(j=0;j<num_queue;j++) { for(i=0;i<num_ker;i++){ errcode = clEnqueueNDRangeKernel(clCommandQue[j], clKernel[i], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); } } for(i=0;i<num_queue;i++) { clFinish(clCommandQue[i]); } gettimeofday(&tim, NULL); t2=tim.tv_sec+(tim.tv_usec/1000000.0); printf("%.6lf\t",(t2-t1)); // shrCheckError(errcode, CL_SUCCESS); /* clock_gettime(CLOCK_MONOTONIC, &finish); elapsed = (finish.tv_sec - start.tv_sec); elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0; printf("Work Item/threads = %d \n",value); printf("time taken by GPU = %le\n ",elapsed); */ // 8. 
Retrieve result from device for(i=0;i<num_queue;i++) { errcode = clEnqueueReadBuffer(clCommandQue[i], d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); //shrCheckError(errcode, CL_SUCCESS); } for(i=0;i<num_queue;i++) { clFinish(clCommandQue[i]); } // shrCheckError(errcode, CL_SUCCESS); //clock_gettime(CLOCK_MONOTONIC, &finish); // elapsed = (finish.tv_sec - start.tv_sec); // elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0; //printf("Work Item/threads = %d \n",value); //printf("time taken by GPU = %le\n ",elapsed); // 9. print out the results /*printf("\n\nMatrix C (Results)\n"); for(i = 0; i < size_C; i++) { printf("%f ", h_C[i]); if(((i + 1) % WC) == 0) printf("\n"); } printf("\n");*/ // 10. clean up memory free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); // free(device_id); free(KernelSource); clReleaseContext(clGPUContext); for(i=0;i<num_ker;i++) { clReleaseKernel(clKernel[i]); clReleaseProgram(clProgram[i]); } for(i=0;i<num_queue;i++){ clReleaseCommandQueue(clCommandQue[i]); } gettimeofday(&ftim, NULL); tim2=ftim.tv_sec+(ftim.tv_usec/1000000.0); printf("%.6lf\t",(tim2-tim1)); printf("\n"); exit(0); }
int main(int argc, char** argv) { srand(2006); unsigned int size_A = WA*HA; unsigned int mem_size_A = sizeof(float)*size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB*HB; unsigned int mem_size_B = sizeof(float)*size_B; float* h_B = (float*) malloc(mem_size_B); randomInit(h_A, size_A); randomInit(h_B, size_B); printf("\n\nMathrix A\n"); for (int i = 0; i < size_A; i++) { printf("%f ", h_A[i]); if (((i+1)%WA)==0) { printf("\n"); } } printf("\n\nMatrix B\n"); for (int i = 0; i < size_B; i++) { printf("%f ", h_B[i]); if (((i+1)%WB)==0) { printf("\n"); } } unsigned int size_C = WC*HC; unsigned int mem_size_C = sizeof(float)*size_C; float* h_C = (float*) malloc(mem_size_C); for (int i = 0; i < n; ++i) { for (int j = 0; j < m; ++j) { for (int k = 0; k < p; ++k) { a[i+n*j] += b[i+n*k]*c[k+p*j]; } } } printf("\n\nMatric C (Results)\n"); for (int i = 0; i < size_C; i++) { printf("%f ", h_C[i]); if (((i+1)%WC)==0) { printf("\n"); } } printf("\n"); free(h_A); free(h_B); free(h_C); return 0; }
/**
 * Stores the three constructor arguments in the members of the same name and
 * then calls randomInit().
 *
 * @param width  value copied into the `width` member
 * @param height value copied into the `height` member
 * @param mode   value copied into the `mode` member
 */
Map::Map(int width, int height, int mode)
    : width(width),
      height(height),
      mode(mode)
{
    randomInit();
}
int main(int argc, char** argv) { srand(1000); int i; unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); randomInit(h_A, size_A); randomInit(h_B, size_B); unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; cl_event mm; size_t dataBytes; size_t kernelLength; cl_int errcode; cl_mem d_A; cl_mem d_B; cl_mem d_C; clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); FILE* fp = fopen("hw2.cl", "r"); fseek (fp , 0 , SEEK_END); const size_t lSize = ftell(fp); rewind(fp); unsigned char* buffer; buffer = (unsigned char*) malloc (lSize); fread(buffer, 1, lSize, fp); fclose(fp); cl_int status; clProgram = clCreateProgramWithBinary(clGPUContext, 1, (const cl_device_id *)clDevices, &lSize, (const unsigned char**)&buffer, &status, &errcode); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); clKernel = clCreateKernel(clProgram, "MM", &errcode); size_t globalWorkSize[2]; int wA = WA; int wC = WC; 
errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); globalWorkSize[0] = 16; globalWorkSize[1] = 16; cl_ulong time_start, time_end, total_time = 0; errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, NULL, 0, NULL, &mm); printf("Average time = %lu\n"); clFinish(clCommandQue); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("Average time = %lu\n", total_time); errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); free(clDevices); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue); }
//////////////////////////////////////////////////////////////////////////////// //! Run a simple test matrix multiply using CUBLAS //////////////////////////////////////////////////////////////////////////////// int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) { cudaDeviceProp deviceProp; cudaError_t error; error = cudaGetDeviceProperties(&deviceProp, devID); if (error != cudaSuccess) { printf("cudaGetDeviceProperties returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // use a larger block size for Fermi and above int block_size = (deviceProp.major < 2) ? 16 : 32; // set seed for rand() srand(2006); // allocate host memory for matrices A and B unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA; unsigned int mem_size_A = sizeof(float) * size_A; float *h_A = (float *)malloc(mem_size_A); unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB; unsigned int mem_size_B = sizeof(float) * size_B; float *h_B = (float *)malloc(mem_size_B); // set seed for rand() srand(2006); // initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); // allocate device memory float *d_A, *d_B, *d_C; unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC; unsigned int mem_size_C = sizeof(float) * size_C; // allocate host memory for the result float *h_C = (float *) malloc(mem_size_C); float *h_CUBLAS = (float *) malloc(mem_size_C); error = cudaMalloc((void **) &d_A, mem_size_A); if (error != cudaSuccess) { printf("cudaMalloc d_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_B, mem_size_B); if (error != cudaSuccess) { printf("cudaMalloc d_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMalloc((void **) &d_C, mem_size_C); if (error != cudaSuccess) { printf("cudaMalloc d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } // create and start timer StopWatchInterface *timerMemIn = 
NULL; sdkCreateTimer(&timerMemIn); // start the timer sdkStartTimer(&timerMemIn); // copy host memory to device error = cudaMemcpy(d_A, h_A, mem_size_A, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy d_A h_A returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } error = cudaMemcpy(d_B, h_B, mem_size_B, cudaMemcpyHostToDevice); if (error != cudaSuccess) { printf("cudaMemcpy d_B h_B returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } sdkStopTimer(&timerMemIn); printf("\nMemory H2D Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemIn)); sdkDeleteTimer(&timerMemIn); // setup execution parameters dim3 threads(block_size, block_size); dim3 grid(matrix_size.uiWC / threads.x, matrix_size.uiHC / threads.y); // create and start timer printf("Computing result using CUBLAS..."); // execute the kernel int nIter = 30; // CUBLAS version 2.0 { cublasHandle_t handle; cublasStatus_t ret; ret = cublasCreate(&handle); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasCreate returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } const float alpha = 1.0f; const float beta = 0.0f; //Perform warmup operation with cublas ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } // Allocate CUDA events that we'll use for timing cudaEvent_t start; error = cudaEventCreate(&start); if (error != cudaSuccess) { fprintf(stderr, "Failed to create start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } cudaEvent_t stop; error = cudaEventCreate(&stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to create stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Record the start 
event error = cudaEventRecord(start, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record start event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } for (int j = 0; j < nIter; j++) { //note cublas is column primary! //need to transpose the order ret = cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA, matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA, &beta, d_C, matrix_size.uiWA); if (ret != CUBLAS_STATUS_SUCCESS) { printf("cublasSgemm returned error code %d, line(%d)\n", ret, __LINE__); exit(EXIT_FAILURE); } } printf("done.\n"); // Record the stop event error = cudaEventRecord(stop, NULL); if (error != cudaSuccess) { fprintf(stderr, "Failed to record stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Wait for the stop event to complete error = cudaEventSynchronize(stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to synchronize on the stop event (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } float msecTotal = 0.0f; error = cudaEventElapsedTime(&msecTotal, start, stop); if (error != cudaSuccess) { fprintf(stderr, "Failed to get time elapsed between events (error code %s)!\n", cudaGetErrorString(error)); exit(EXIT_FAILURE); } // Compute and print the performance float msecPerMatrixMul = msecTotal / nIter; double flopsPerMatrixMul = 2.0 * (double)matrix_size.uiWA * (double)matrix_size.uiHA * (double)matrix_size.uiWB; double gigaFlops = (flopsPerMatrixMul * 1.0e-9f) / (msecPerMatrixMul / 1000.0f); printf( "Performance= %.2f GFlop/s, Time= %.3f msec, Size= %.0f Ops\n", gigaFlops, msecPerMatrixMul, flopsPerMatrixMul); // create and start timer StopWatchInterface *timerMemOut = NULL; sdkCreateTimer(&timerMemOut); // start the timer sdkStartTimer(&timerMemOut); // copy result from device to host error = cudaMemcpy(h_CUBLAS, d_C, mem_size_C, cudaMemcpyDeviceToHost); sdkStopTimer(&timerMemOut); printf("\Memory D2H 
Transferring time: %f (ms)\n", sdkGetTimerValue(&timerMemOut)); sdkDeleteTimer(&timerMemOut); if (error != cudaSuccess) { printf("cudaMemcpy h_CUBLAS d_C returned error code %d, line(%d)\n", error, __LINE__); exit(EXIT_FAILURE); } checkError(cublasDestroy(handle), "cublasDestroy() error!\n"); } // compute reference solution printf("Computing result using host CPU..."); float *reference = (float *)malloc(mem_size_C); // create and start timer StopWatchInterface *timer = NULL; sdkCreateTimer(&timer); // start the timer sdkStartTimer(&timer); matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB); sdkStopTimer(&timer); printf("\nCPU Processing time: %f (ms)\n", sdkGetTimerValue(&timer)); sdkDeleteTimer(&timer); printf("done.\n"); // check result (CUBLAS) bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f); if (resCUBLAS != true) { printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100, 1.0e-5f); } printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n", (true == resCUBLAS) ? "PASS" : "FAIL"); // clean up memory free(h_A); free(h_B); free(h_C); free(reference); cudaFree(d_A); cudaFree(d_B); cudaFree(d_C); cudaDeviceReset(); if (resCUBLAS == true) { return EXIT_SUCCESS; // return value = 1 } else { return EXIT_FAILURE; // return value = 0 } }
int main( int argc, char* argv[] ) { // Length of vectors int m = atoi(argv[4]); unsigned int n=(256*m); //matrix variable // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; //########################Vector Add Variables // Host input vectors int *h_a; int *h_b; // Host output vector int *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; // cl_kernel *kernel; cl_platform_id* cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context //cl_command_queue* queue; // command queue //cl_command_queue queue; // command queue // cl_program *program; // program cl_platform_id* platforms; // platform id, // differnt for all the device we have in the system cl_uint platformCount; //keeps the divice count // Size, in bytes, of each vector size_t bytes = n*sizeof(int); // Allocate memory for each vector on host h_a = (int*)malloc(bytes); h_b = (int*)malloc(bytes); h_c = (int*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = i; h_b[i] = i; // printf("%d ",h_a[i]); } size_t globalSize, localSize; //similar to cuda cl_int err;//for errors int workgrp; int wrkitm; int num_ker; num_ker=atoi(argv[2]); wrkitm=atoi(argv[3]);// i have tried automating lots of data, // Number of work items in each local work group localSize = wrkitm ; // Number of total work items - localSize must be devisor globalSize = n; //################################# Done vector ################### //#############Matrix Multiplication Variables ############### int WA,HA,WB,HB,WC,HC; WA = n; HA = WA; WB = WA; HB = WB; WC = WA; HC = WA; // set seed for rand() srand(2006); // 1. 
allocate host memory for matrices A and B //automate the size of the matrix unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); // 4. allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); // 2. initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); //######################## matrix done ####################### //mallocing for array of queues (break through) cl_command_queue * queue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue)); cl_kernel *kernel=(cl_kernel *)malloc(num_ker * sizeof(cl_kernel)); cl_program *program=(cl_program *)malloc(num_ker * sizeof(cl_kernel)); //defining platform clGetPlatformIDs(0, NULL, &platformCount); cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here int choice = atoi(argv[1]); if(choice ==1) { // we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here // we can these multiple times depending on requirements err = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL); if (err != CL_SUCCESS) printf("Error: Failed to create a device group!\n"); } else { // Get ID for the device err = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); } } context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); //malloc file and kernel variable char **file=(char **)malloc(num_ker * sizeof(char *)); char **KernelSource=(char **)malloc(num_ker * sizeof(char *)); for(i=0;i<num_ker;i++) { queue[i] = clCreateCommandQueue(context, device_id, 0, &err); } 
file[0]="vectadd.cl"; KernelSource[0] = load_program_source(file[0]); file[1]="matxm.cl"; KernelSource[1] = load_program_source(file[1]); for(i=0;i<num_ker;i++) { // Create the compute program from the source buffer program[i] = clCreateProgramWithSource(context, 1, (const char **) & KernelSource[i], NULL, &err); // Build the program executable clBuildProgram(program[i], 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel[i] = clCreateKernel(program[i], file[i], &err); } //Vector Start // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); //vector finsih //matrix start d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, h_A, &err); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_B, h_B, &err); //matrix finish // Write our data set into the input array in device memory for(i=0;i<num_ker;i++) { if(i=0)//for vectorADD { err = clEnqueueWriteBuffer(queue[i], d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue[i], d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &d_a); err = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &d_b); err = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &d_c); err = clSetKernelArg(kernel[i], 3, sizeof(unsigned int), &n); // Get the maximum work group size for executing the kernel on the device if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! 
%d\n", err); exit(1); } } else if(i=1) { err = clEnqueueWriteBuffer(queue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL); //size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&d_C); err = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&d_A); err = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&d_B); err = clSetKernelArg(kernel[i], 3, sizeof(int), (void *)&wA); err = clSetKernelArg(kernel[i], 4, sizeof(int), (void *)&wC); } } /* // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Get the maximum work group size for executing the kernel on the device if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! 
%d\n", err); exit(1); } */ //struct timeval tim; //double t1,t2; //gettimeofday(&tim, NULL); //t1=tim.tv_sec+(tim.tv_usec/1000000.0); //need to work on work size############################# for(i=0;i<num_ker;i++) { err = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL, &globalSize, &localSize, 0, NULL, NULL); } //for(i=0;i<num_ker;i++) //clFinish(queue[i]); //gettimeofday(&tim, NULL); // t2=tim.tv_sec+(tim.tv_usec/1000000.0); //printf("GPU time %.4lf\t",(t2-t1)); for(i=0;i<num_ker;++i) { if(i=0) { clEnqueueReadBuffer(queue[i], d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); } else if(i=1) { err = clEnqueueReadBuffer(queue[i], d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); } } for(i=0;i<num_ker;++i) { clFinish(queue[i]); } // release OpenCL resources free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); // clReleaseProgram(program); // clReleaseKernel(kernel); for(i=0;i<num_ker;++i) { clReleaseCommandQueue(queue[i]); clReleaseKernel(kernel[i]); clReleaseProgram(program[i]); } clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
//---------------------------------------------------------- // System initialization //---------------------------------------------------------- static inline void initSystem(void) { bool success; (void)success; // disable interrupts: disabled on msp430 by default, but other systems might need this DISABLE_INTS(); // stop the watchdog: GCC disables it by default, but other compilers might not be so helpful watchdogStop(); // TODO: init dynamic memory // platformMemInit(); // basic, platform-specific initialization: timers, platform-specific drivers (?) initPlatform(); // start energy accounting (as soon as timers are initialized) energyConsumerOn(ENERGY_CONSUMER_MCU); #ifdef USE_PRINT // init printing to serial (makes sense only after clock has been calibrated) if (printInit != NULL) printInit(); #endif INIT_PRINTF("starting MansOS...\n"); #ifdef USE_LEDS INIT_PRINTF("init LED(s)...\n"); ledsInit(); #endif #ifdef USE_BEEPER beeperInit(); #endif #ifdef RAMTEXT_START if ((MemoryAddress_t)&_end > RAMTEXT_START) { // Panic right aways on RAM overflow. 
// In case this happens, you might want to increase the address // specified by CONST_RAMTEXT_START in config file assertionFailed("Overflow between .data and .ramtext sections", __FILE__, __LINE__); } #endif #ifdef USE_ADC if (adcInit != NULL) { INIT_PRINTF("init ADC...\n"); adcInit(); } #endif #ifdef USE_RANDOM INIT_PRINTF("init RNG...\n"); randomInit(); #endif #if USE_ALARMS INIT_PRINTF("init alarms...\n"); initAlarms(); #endif #ifdef USE_RADIO INIT_PRINTF("init radio...\n"); radioInit(); #endif #ifdef USE_ADDRESSING INIT_PRINTF("init communication stack...\n"); networkingInit(); #endif #ifdef USE_EXT_FLASH INIT_PRINTF("init external flash...\n"); extFlashInit(); #endif #ifdef USE_SDCARD INIT_PRINTF("init SD card...\n"); sdcardInit(); #endif #ifdef USE_EEPROM INIT_PRINTF("init EEPROM...\n"); eepromInit(); #endif #ifdef USE_ISL29003 INIT_PRINTF("init ISL light sensor...\n"); success = islInit(); if (!success) { INIT_PRINTF("ISL init failed!\n"); } #endif #ifdef USE_ADS1115 INIT_PRINTF("init ADS111x ADC converter chip...\n"); adsInit(); #endif #if USE_ADS8638 INIT_PRINTF("init ADS8638 ADC converter chip...\n"); ads8638Init(); #endif #if USE_ADS8328 INIT_PRINTF("init ADS8328 ADC converter chip...\n"); ads8328Init(); #endif #if USE_AD5258 INIT_PRINTF("init AD5258 digital potentiometer...\n"); ad5258Init(); #endif #if USE_DAC7718 INIT_PRINTF("init DAC7718 DAC converter chip...\n"); dac7718Init(); #endif #if USE_ISL1219 INIT_PRINTF("init ISL1219 real-time clock chip...\n"); isl1219Init(); #endif #ifdef USE_HUMIDITY INIT_PRINTF("init humidity sensor...\n"); humidityInit(); #endif #ifdef USE_ACCEL INIT_PRINTF("init accelerometer...\n"); accelInit(); #endif #ifdef USE_TIMESYNC INIT_PRINTF("init base station time sync...\n"); timesyncInit(); #endif #ifdef USE_SMP INIT_PRINTF("init SSMP...\n"); smpInit(); #endif #ifdef USE_REPROGRAMMING INIT_PRINTF("init reprogramming...\n"); bootParamsInit(); #endif #ifdef USE_DCO_RECALIBRATION extern void dcoRecalibrationInit(void); 
INIT_PRINTF("init DCO recalibration...\n"); dcoRecalibrationInit(); #endif #ifdef USE_FS INIT_PRINTF("init file system...\n"); fsInit(); #endif #ifdef USE_FATFS INIT_PRINTF("init FAT file system...\n"); fatFsInit(); INIT_PRINTF("init POSIX-like file routines...\n"); posixStdioInit(); #endif #ifdef USE_WMP INIT_PRINTF("init WMP...\n"); wmpInit(); #endif #ifdef USE_SEAL_NET INIT_PRINTF("init SEAL networking...\n"); sealNetInit(); #endif INIT_PRINTF("starting the application...\n"); }