void Mesh::NEListToArray() {
  std::vector< std::set<size_t> >::const_iterator vec_it;
  std::set<size_t>::const_iterator set_it;
  size_t offset = 0;
  size_t index = 0;

  for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++)
    offset += vec_it->size();

  NEListArray_size = offset;

  cuda_check(cudaHostAlloc((void **)&NEListIndex_pinned,
                           sizeof(size_t) * (NNodes+1), cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&NEListArray_pinned,
                           sizeof(size_t) * NEListArray_size, cudaHostAllocPortable));

  offset = 0;
  for(vec_it = NEList.begin(); vec_it != NEList.end(); vec_it++) {
    NEListIndex_pinned[index++] = offset;

    for(set_it = vec_it->begin(); set_it != vec_it->end(); set_it++)
      NEListArray_pinned[offset++] = *set_it;
  }

  assert(index == NEList.size());
  NEListIndex_pinned[index] = offset;
}
void GRTRegenerateChains::AllocatePerGPUMemory(GRTRegenerateThreadRunData *data) {
  // Malloc space on the GPU for everything.
  cudaError_t err;

  // Default flags are for memory on device and copying things.
  unsigned int flags = 0;

  // If we are using zero copy, set the zero copy flag.
  if (this->CommandLineData->GetUseZeroCopy()) {
    flags = cudaHostAllocMapped;
  }

  // Malloc device hash space.
  CH_CUDA_SAFE_CALL(cudaMalloc((void **)&this->DEVICE_Hashes[data->threadID],
      this->NumberOfHashes * this->HashLengthBytes * sizeof(unsigned char)));
  CH_CUDA_SAFE_CALL(cudaMemcpy(this->DEVICE_Hashes[data->threadID], this->HashList,
      this->NumberOfHashes * this->HashLengthBytes * sizeof(unsigned char),
      cudaMemcpyHostToDevice));

  //this->HOST_Success[data->threadID] = new unsigned char [this->NumberOfHashes * sizeof(unsigned char)];
  cudaHostAlloc((void **)&this->HOST_Success[data->threadID],
      this->NumberOfHashes * sizeof(unsigned char), flags);
  memset(this->HOST_Success[data->threadID], 0, this->NumberOfHashes * sizeof(unsigned char));

  this->HOST_Success_Reported[data->threadID] = new unsigned char [this->NumberOfHashes * sizeof(unsigned char)];
  memset(this->HOST_Success_Reported[data->threadID], 0, this->NumberOfHashes * sizeof(unsigned char));

  // If zero copy is in use, get the device pointer
  if (this->CommandLineData->GetUseZeroCopy()) {
    cudaHostGetDevicePointer((void **)&this->DEVICE_Success[data->threadID],
        this->HOST_Success[data->threadID], 0);
  } else {
    CH_CUDA_SAFE_CALL(cudaMalloc((void **)&this->DEVICE_Success[data->threadID],
        this->NumberOfHashes * sizeof(unsigned char)));
    CH_CUDA_SAFE_CALL(cudaMemset(this->DEVICE_Success[data->threadID], 0,
        this->NumberOfHashes * sizeof(unsigned char)));
  }

  //this->HOST_Passwords[data->threadID] = new unsigned char[MAX_PASSWORD_LEN * this->NumberOfHashes * sizeof(unsigned char)];
  cudaHostAlloc((void **)&this->HOST_Passwords[data->threadID],
      MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char), flags);
  memset(this->HOST_Passwords[data->threadID], 0,
      MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char));

  if (this->CommandLineData->GetUseZeroCopy()) {
    cudaHostGetDevicePointer((void **)&this->DEVICE_Passwords[data->threadID],
        this->HOST_Passwords[data->threadID], 0);
  } else {
    CH_CUDA_SAFE_CALL(cudaMalloc((void **)&this->DEVICE_Passwords[data->threadID],
        MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char)));
    CH_CUDA_SAFE_CALL(cudaMemset(this->DEVICE_Passwords[data->threadID], 0,
        MAX_PASSWORD_LENGTH * this->NumberOfHashes * sizeof(unsigned char)));
  }

  err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("Thread %d: CUDA error 5: %s. Exiting.\n", data->threadID,
        cudaGetErrorString(err));
    return;
  }
  //printf("Memory for thread %d allocated.\n", data->threadID);
}
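// Illustrative teardown sketch for the allocation pattern above (not from the original
// source; the struct and function names here are hypothetical). Buffers created with
// cudaHostAlloc are always released with cudaFreeHost, regardless of flags, while only
// the explicit cudaMalloc allocations from the non-zero-copy branch are cudaFree'd.
#include <cuda_runtime.h>

struct PerGpuBuffers {
    unsigned char *hostSuccess;    // from cudaHostAlloc (pinned, possibly mapped)
    unsigned char *deviceSuccess;  // from cudaMalloc, or a mapped alias of hostSuccess
    bool zeroCopy;                 // true if deviceSuccess is just the mapped alias
};

void freePerGpuBuffers(PerGpuBuffers &b)
{
    // Pinned host memory goes back through cudaFreeHost, whatever cudaHostAlloc* flags were used.
    cudaFreeHost(b.hostSuccess);
    b.hostSuccess = nullptr;

    // Only free a real device allocation; a pointer obtained via
    // cudaHostGetDevicePointer must not be passed to cudaFree.
    if (!b.zeroCopy) {
        cudaFree(b.deviceSuccess);
    }
    b.deviceSuccess = nullptr;
}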
TEST(HostAlloc, NullArguments) {
    cudaError_t ret;

    ret = cudaHostAlloc(NULL, 0, 0);
    EXPECT_EQ(cudaErrorInvalidValue, ret);

    ret = cudaHostAlloc(NULL, 4, 0);
    EXPECT_EQ(cudaErrorInvalidValue, ret);

    ret = cudaFreeHost(NULL);
    EXPECT_EQ(cudaSuccess, ret);
}
void alloc_data_host(DataArray* data_arr) {
    // When allocating, we don't know in advance which pointer will be returned,
    // so we pass a pointer to a pointer:
    //   data_arr->data_r is a double complex**
    //   *(data_arr->data_r) is whatever lives at that address, i.e. a double complex*
    cudaHostAlloc((void**) data_arr->data_r, sizeof(double complex)*N,
                  cudaHostAllocDefault); // pinned memory <- see also cudaMallocHost (could be faster)
    cudaHostAlloc((void**) data_arr->data_k, sizeof(double complex)*N,
                  cudaHostAllocDefault); // pinned memory

    // In case of pageable memory (slower / not asynchronous):
    //*(data_arr->data_r) = (double complex*) malloc( (size_t) sizeof(double complex)*data_arr->size );
    //*(data_arr->data_k) = (double complex*) malloc( (size_t) sizeof(double complex)*data_arr->size );
}
TEST_P(MemcpyAsync, H2DTransfers) {
    const size_t param = GetParam();
    const size_t alloc = 1 << param;

    cudaError_t ret;
    void *d1, *h1;
    ret = cudaMalloc(&d1, alloc);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaHostAlloc(&h1, alloc, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    cudaStream_t stream;
    ret = cudaStreamCreate(&stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaMemcpyAsync(d1, h1, alloc, cudaMemcpyHostToDevice, stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamSynchronize(stream);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFree(d1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaFreeHost(h1);
    ASSERT_EQ(cudaSuccess, ret);

    ret = cudaStreamDestroy(stream);
    ASSERT_EQ(cudaSuccess, ret);
}
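// Minimal standalone sketch of the pattern the test above exercises: cudaMemcpyAsync
// only overlaps with other work when the host buffer is page-locked. Buffer names and
// sizes here are illustrative, not taken from the test suite.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    const size_t bytes = 1 << 20;
    float *h_buf = nullptr, *d_buf = nullptr;

    // Page-locked host memory: required for truly asynchronous H2D copies.
    cudaHostAlloc(reinterpret_cast<void**>(&h_buf), bytes, cudaHostAllocDefault);
    cudaMalloc(reinterpret_cast<void**>(&d_buf), bytes);

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // The copy is enqueued on the stream and returns immediately;
    // the CPU is free to do other work until the synchronize below.
    cudaMemcpyAsync(d_buf, h_buf, bytes, cudaMemcpyHostToDevice, stream);

    // ... independent CPU work could run here ...

    cudaStreamSynchronize(stream);
    printf("copy complete\n");

    cudaStreamDestroy(stream);
    cudaFree(d_buf);
    cudaFreeHost(h_buf);
    return 0;
}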
template <typename T>
Args3D<T>::Args3D(int _width, int _height, int _depth) {
    width = _width;
    height = _height;
    depth = _depth;
    gpuErrchk( cudaHostAlloc((void **) &hostArray,
                             width*height*depth*sizeof(T),
                             cudaHostAllocWriteCombined | cudaHostAllocMapped) );
    gpuErrchk( cudaHostGetDevicePointer(&deviceArray, hostArray, 0) );
}
Waifu2x::eWaifu2xError Waifu2x::ProcessNet(std::shared_ptr<cNet> net, const int crop_w, const int crop_h,
    const bool use_tta, const int batch_size, cv::Mat &im)
{
    Waifu2x::eWaifu2xError ret;

    CudaDeviceSet devset(mProcess, mGPUNo);

    const auto OutputMemorySize = net->GetOutputMemorySize(crop_w, crop_h, OuterPadding, batch_size);
    if (OutputMemorySize > mOutputBlockSize)
    {
        if (mIsCuda)
        {
            CUDA_HOST_SAFE_FREE(mOutputBlock);
            CUDA_CHECK_WAIFU2X(cudaHostAlloc(&mOutputBlock, OutputMemorySize, cudaHostAllocDefault));
        }
        else
        {
            SAFE_DELETE_WAIFU2X(mOutputBlock);
            mOutputBlock = new float[OutputMemorySize];
        }

        mOutputBlockSize = OutputMemorySize;
    }

    ret = net->ReconstructImage(use_tta, crop_w, crop_h, OuterPadding, batch_size, mOutputBlock, im, im);
    if (ret != Waifu2x::eWaifu2xError_OK)
        return ret;

    return Waifu2x::eWaifu2xError_OK;
}
template <typename T>
Args<T>::Args(int _width) {
    width = _width;
    gpuErrchk( cudaDeviceReset() );
    gpuErrchk( cudaSetDeviceFlags(cudaDeviceMapHost) );
    gpuErrchk( cudaHostAlloc((void **) &hostArray,
                             width*sizeof(T),
                             cudaHostAllocWriteCombined | cudaHostAllocMapped) );
    gpuErrchk( cudaHostGetDevicePointer(&deviceArray, hostArray, 0) );
}
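// The two constructors above request cudaHostAllocWriteCombined, which speeds up PCIe
// transfers but makes host-side reads of the same buffer very slow (WC memory bypasses
// the CPU caches). A hedged rule of thumb, sketched with hypothetical buffer names:
// use write-combined only for host-to-device staging buffers, and plain pinned memory
// for anything the CPU reads back.
#include <cuda_runtime.h>

// Illustrative allocation policy, not taken from the snippets above.
void allocStagingBuffers(float **h_upload, float **h_readback, size_t n)
{
    // Written by the CPU, read by the GPU: write-combined is a good fit.
    cudaHostAlloc(reinterpret_cast<void**>(h_upload),
                  n * sizeof(float),
                  cudaHostAllocWriteCombined);

    // Read back and inspected by the CPU: keep it cacheable, just pinned.
    cudaHostAlloc(reinterpret_cast<void**>(h_readback),
                  n * sizeof(float),
                  cudaHostAllocDefault);
}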
TEST(HostAlloc, MappedPointer) {
    cudaError_t ret;
    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    void * ptr;
    ret = cudaHostAlloc(&ptr, 4, cudaHostAllocMapped);
    ASSERT_EQ(cudaSuccess, ret);

    /*
     * Try to retrieve the device pointer, expecting a result according to
     * prop.canMapHostMemory.
     */
    void * device_ptr;
    ret = cudaHostGetDevicePointer(&device_ptr, ptr, 0);
    if (prop.canMapHostMemory) {
        EXPECT_EQ(cudaSuccess, ret);
        EXPECT_FALSE(device_ptr == NULL);
    } else {
        EXPECT_EQ(cudaErrorMemoryAllocation, ret);
    }

    ret = cudaFreeHost(ptr);
    ASSERT_EQ(cudaSuccess, ret);
}
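// When cudaHostGetDevicePointer succeeds as in the test above, a kernel can dereference
// the returned alias directly and the results appear in the host buffer without an
// explicit copy. Minimal zero-copy sketch; the kernel and names are illustrative.
#include <cuda_runtime.h>
#include <cstdio>

// Trivial kernel that writes through the mapped alias of host memory.
__global__ void fill(int *mapped, int n) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) mapped[i] = i;
}

int main() {
    const int n = 256;
    int *h = nullptr, *d_alias = nullptr;

    cudaSetDeviceFlags(cudaDeviceMapHost);   // must be set before the context is created
    cudaHostAlloc(reinterpret_cast<void**>(&h), n * sizeof(int), cudaHostAllocMapped);
    cudaHostGetDevicePointer(reinterpret_cast<void**>(&d_alias), h, 0);

    fill<<<(n + 127) / 128, 128>>>(d_alias, n);
    cudaDeviceSynchronize();                 // make the device writes visible to the host

    printf("h[42] = %d\n", h[42]);           // read directly, no cudaMemcpy
    cudaFreeHost(h);
    return 0;
}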
void * CudaHostPinnedSpace::allocate( const size_t arg_alloc_size ) const
{
  void * ptr = NULL;

  CUDA_SAFE_CALL( cudaHostAlloc( &ptr, arg_alloc_size , cudaHostAllocDefault ) );

  return ptr ;
}
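// The excerpt shows only the allocation side; the counterpart of cudaHostAlloc is
// cudaFreeHost. A plausible release routine in the same style (the deallocate signature
// is assumed for illustration, not copied from the original source).
void CudaHostPinnedSpace::deallocate( void * const arg_alloc_ptr, const size_t /* arg_alloc_size */ ) const
{
  if ( arg_alloc_ptr != NULL ) {
    CUDA_SAFE_CALL( cudaFreeHost( arg_alloc_ptr ) );
  }
}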
static void *alloc_pinned_mem(int size)
{
    void *ret;

    checkCudaErrors(cudaHostAlloc(&ret, size, cudaHostAllocPortable));

    return ret;
}
void* cuda_hostalloc(long N)
{
    void* ptr;
    if (cudaSuccess != cudaHostAlloc(&ptr, N, cudaHostAllocDefault))
        abort();

    insert(ptr, N, false);
    return ptr;
}
void* CUDAPageLockedMemAllocator::Malloc(size_t size, int deviceId)
{
    void* p;
    cudaSetDevice(deviceId);

    // Note: I ask for cudaHostAllocDefault but cudaHostGetFlags() shows that it is allocated as 'cudaHostAllocMapped'
    cudaHostAlloc(&p, size, cudaHostAllocDefault) || "Malloc in CUDAPageLockedMemAllocator failed";

    return p;
}
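// The comment above can be checked directly: cudaHostGetFlags reports the flags that are
// actually in effect for a pinned allocation, which on UVA-capable systems typically
// include cudaHostAllocMapped even when only cudaHostAllocDefault was requested.
// Small standalone check, illustrative only.
#include <cuda_runtime.h>
#include <cstdio>

int main() {
    void* p = nullptr;
    cudaHostAlloc(&p, 4096, cudaHostAllocDefault);

    unsigned int flags = 0;
    cudaHostGetFlags(&flags, p);    // flags actually applied to this allocation

    printf("portable=%d mapped=%d write-combined=%d\n",
           (flags & cudaHostAllocPortable)      != 0,
           (flags & cudaHostAllocMapped)        != 0,
           (flags & cudaHostAllocWriteCombined) != 0);

    cudaFreeHost(p);
    return 0;
}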
void Mesh::pin_data() {
  NNListToArray();
  NEListToArray();

  size_t ENList_bytes = sizeof(size_t) * ENList.size();
  size_t coords_bytes = sizeof(float) * coords.size();
  size_t metric_bytes = sizeof(float) * metric.size();
  size_t normal_bytes = sizeof(float) * normals.size();

  cuda_check(cudaHostAlloc((void **)&ENList_pinned, ENList_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&coords_pinned, coords_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&metric_pinned, metric_bytes, cudaHostAllocPortable));
  cuda_check(cudaHostAlloc((void **)&normals_pinned, normal_bytes, cudaHostAllocPortable));

  memcpy(ENList_pinned, &ENList[0], ENList_bytes);
  memcpy(coords_pinned, &coords[0], coords_bytes);
  memcpy(metric_pinned, &metric[0], metric_bytes);
  memcpy(normals_pinned, &normals[0], normal_bytes);
}
/** constructor
 *
 * @param size extent for each dimension (in elements)
 */
MappedBufferIntern(DataSpace<DIM> size):
    DeviceBuffer<TYPE, DIM>(size, size),
    pointer(nullptr),
    ownPointer(true)
{
#if( PMACC_CUDA_ENABLED == 1 )
    CUDA_CHECK((cuplaError_t)cudaHostAlloc(&pointer, size.productOfComponents() * sizeof (TYPE), cudaHostAllocMapped));
#else
    pointer = new TYPE[size.productOfComponents()];
#endif
    reset(false);
}
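// The excerpt shows only the constructor; whichever branch allocated the buffer must be
// mirrored when it is torn down. Hedged sketch of what the matching cleanup could look
// like under the same preprocessor switch (member names reused from the constructor,
// the destructor body itself is assumed, not copied from the original).
virtual ~MappedBufferIntern()
{
    if (pointer && ownPointer)
    {
#if( PMACC_CUDA_ENABLED == 1 )
        // Memory from cudaHostAlloc must go back through cudaFreeHost.
        CUDA_CHECK((cuplaError_t)cudaFreeHost(pointer));
#else
        // The host-only branch used new[], so release with delete[].
        delete[] pointer;
#endif
    }
}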
void *hostAlloc(size_t size, unsigned int flags)
{
    void *ptr;
    cudaError_t cudaError = cudaHostAlloc(&ptr, size, flags);
    if (cudaError != cudaSuccess) {
        throw cudaError;
    }
    return ptr;
}
TEST(HostAlloc, MallocFree) {
    cudaError_t ret;
    int * ptr;

    ret = cudaHostAlloc((void **) &ptr, sizeof(*ptr), 0);
    ASSERT_EQ(cudaSuccess, ret);
    ASSERT_FALSE(NULL == ptr);

    *ptr = 0;

    ret = cudaFreeHost(ptr);
    EXPECT_EQ(cudaSuccess, ret);
}
inline void* allocate(size_t num_bytes)
{
    void* result = nullptr;

    cudaError_t error = cudaHostAlloc(&result, num_bytes, cudaHostAllocPortable);

    if (error != cudaSuccess)
    {
        throw thrust::system_error(error, thrust::cuda_category(),
                                   "pinned_resource::allocate(): cudaHostAlloc");
    }

    return result;
}
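// For completeness, the resource's release side would hand the pointer back with
// cudaFreeHost. Short sketch in the same style; this counterpart is assumed for
// illustration, not taken from the library source.
inline void deallocate(void* ptr, size_t /* num_bytes */)
{
    cudaError_t error = cudaFreeHost(ptr);

    if (error != cudaSuccess)
    {
        throw thrust::system_error(error, thrust::cuda_category(),
                                   "pinned_resource::deallocate(): cudaFreeHost");
    }
}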
void init()
{
    // Allocate memory - host (pinned and mapped into the device address space)
    //a_h = (float *)malloc(N*sizeof(float));
    //b_h = (float *)malloc(N*sizeof(float));
    //r_h = (float *)malloc(N*sizeof(float));
    int size = N*sizeof(float);

    cudaHostAlloc((void **)&a_h, size, cudaHostAllocMapped);
    checkCUDAError("cudaHostAllocMapped");
    cudaHostAlloc((void **)&b_h, size, cudaHostAllocMapped);
    checkCUDAError("cudaHostAllocMapped");
    cudaHostAlloc((void **)&r_h, size, cudaHostAllocMapped);
    checkCUDAError("cudaHostAllocMapped");

    // Allocate memory - CUDA (device aliases of the mapped host buffers)
    //cutilSafeCall(cudaMalloc((void **) &a_d, N*sizeof(float)));
    //cutilSafeCall(cudaMalloc((void **) &b_d, N*sizeof(float)));
    //cutilSafeCall(cudaMalloc((void **) &r_d, N*sizeof(float)));
    cudaHostGetDevicePointer((void **)&a_d, (void *)a_h, 0);
    checkCUDAError("cudaHostGetDevicePointer");
    cudaHostGetDevicePointer((void **)&b_d, (void *)b_h, 0);
    checkCUDAError("cudaHostGetDevicePointer");
    cudaHostGetDevicePointer((void **)&r_d, (void *)r_h, 0);
    checkCUDAError("cudaHostGetDevicePointer");

    control = (float *)malloc(N*sizeof(float));

    // Initialize vectors
    for (int i = 0; i < N; i++) {
        a_h[i] = (float)(i % 13)+1;
        b_h[i] = (float)(i % 3)+1;
    }
}
TEST(HostAlloc, FlagRetrieval) {
    cudaError_t ret;
    void * ptrs[8];
    unsigned int flags[8];

    int device;
    ret = cudaGetDevice(&device);
    ASSERT_EQ(cudaSuccess, ret);

    struct cudaDeviceProp prop;
    ret = cudaGetDeviceProperties(&prop, device);
    ASSERT_EQ(cudaSuccess, ret);

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        unsigned int flag = cudaHostAllocDefault;
        if (i & 0x1) {
            flag |= cudaHostAllocPortable;
        }
        if (i & 0x2) {
            flag |= cudaHostAllocMapped;
        }
        if (i & 0x4) {
            flag |= cudaHostAllocWriteCombined;
        }

        ret = cudaHostAlloc(&ptrs[i], 4, flag);
        ASSERT_EQ(cudaSuccess, ret);
        flags[i] = flag;
    }

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        unsigned int flag;
        ret = cudaHostGetFlags(&flag, ptrs[i]);
        ASSERT_EQ(cudaSuccess, ret);

        const unsigned int expected = flags[i] |
            (prop.canMapHostMemory ? cudaHostAllocMapped : 0);
        EXPECT_EQ(expected, flag);
    }

    for (size_t i = 0; i < (sizeof(flags) / sizeof(flags[0])); i++) {
        ret = cudaFreeHost(ptrs[i]);
        EXPECT_EQ(cudaSuccess, ret);
    }
}
// Allocate ZeroCopy mapped memory, shared between CUDA and CPU.
bool GIEFeatExtractor::cudaAllocMapped( void** cpuPtr, void** gpuPtr, size_t size )
{
    if( !cpuPtr || !gpuPtr || size == 0 )
        return false;

    //CUDA(cudaSetDeviceFlags(cudaDeviceMapHost));

    if( CUDA_FAILED(cudaHostAlloc(cpuPtr, size, cudaHostAllocMapped)) )
        return false;

    if( CUDA_FAILED(cudaHostGetDevicePointer(gpuPtr, *cpuPtr, 0)) )
        return false;

    memset(*cpuPtr, 0, size);
    std::cout << "cudaAllocMapped : " << size << " bytes" << std::endl;
    return true;
}
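// Short usage sketch for a helper with this shape: both pointers refer to the same
// physical pinned allocation, so a single cudaFreeHost on the host pointer releases it.
// The caller below is hypothetical and only illustrates the calling convention; it is
// not part of GIEFeatExtractor.
void runWithMappedBuffer(GIEFeatExtractor& extractor, size_t bytes)
{
    void* cpuPtr = nullptr;   // host-side view of the buffer
    void* gpuPtr = nullptr;   // device-side alias of the same memory

    if (!extractor.cudaAllocMapped(&cpuPtr, &gpuPtr, bytes))
        return;

    // ... fill cpuPtr on the host, pass gpuPtr to kernels ...

    // One allocation, one free: release via the host pointer only.
    cudaFreeHost(cpuPtr);
}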
int main() {
    // Height of the capillary interface for the grid
    CpuPtr_2D height_distribution(nx, ny, 0, true);
    computeRandomHeights(0, H, height_distribution);

    // Density difference between brine and CO2
    float delta_rho = 500;
    // Gravitational acceleration
    float g = 9.87;
    // Non-dimensional constant that scales the strength of the capillary forces
    float c_cap = 1.0/6.0;

    // Permeability data (in real simulations this will be a table based on rock data; here we use a random distribution)
    float k_data[10] = {0.9352, 1.0444, 0.9947, 0.9305, 0.9682, 1.0215, 0.9383, 1.0477, 0.9486, 1.0835};
    float k_heights[10] = {10, 20, 25, 100, 155, 193, 205, 245, 267, 300};

    // Inside Kernel
    // Converting the permeability data into a table of even subintervals in the z-directions
    //float k_values[n+1];
    //kDistribution(dz, h, k_heights, k_data, k_values);

    // MOBILITY
    // The mobility is a function of the saturation, which is directly related to the capillary pressure
    // Pressure at capillary interface, which is known
    float p_ci = 1;

    // Table of capillary pressure values for our subintervals along the z-axis ranging from 0 to h
    float resolution = 0.01;
    int size = 1/resolution + 1;
    float p_cap_ref_table[size];
    float s_b_ref_table[size];
    createReferenceTable(g, H, delta_rho, c_cap, resolution, p_cap_ref_table, s_b_ref_table);

    // Set block and grid sizes and initialize gpu pointer
    dim3 grid;
    dim3 block;
    computeGridBlock(grid, block, nx, ny, block_x, block_y);

    // Allocate and set data on the GPU
    GpuPtr_2D Lambda_device(nx, ny, 0, NULL);
    GpuPtr_1D k_data_device(10, k_data);
    GpuPtr_1D k_heights_device(10, k_heights);
    GpuPtr_1D p_cap_ref_table_device(size, p_cap_ref_table);
    GpuPtr_1D s_b_ref_table_device(size, s_b_ref_table);

    CoarsePermIntegrationKernelArgs *args;
    cudaHostAlloc(&args, sizeof(CoarsePermIntegrationKernelArgs), cudaHostAllocWriteCombined);

    // Set arguments and run coarse integration kernel
    CoarsePermIntegrationArgs coarse_perm_int_args;
    setCoarsePermIntegrationArgs(coarse_perm_int_args,
                                 Lambda_device.getRawPtr(),
                                 k_data_device.getRawPtr(),
                                 k_heights_device.getRawPtr(),
                                 p_cap_ref_table_device.getRawPtr(),
                                 s_b_ref_table_device.getRawPtr(),
                                 nx, ny, 0);
    callCoarseIntegrationKernel(grid, block, coarse_perm_int_args);

    float p_cap_values[n+1];
    computeCapillaryPressure(p_ci, g, delta_rho, h, dz, n, p_cap_values);
    float s_b_values[n+1];
    inverseCapillaryPressure(n, g, h, delta_rho, c_cap, p_cap_values, s_b_values);
    printArray(n+1, s_b_values);

    // End point mobility lambda'_b, a known quantity
    float lambda_end_point = 1;
    float lambda_values[n+1];
    computeMobility(n, s_b_values, lambda_end_point, lambda_values);

    // Multiply permeability values with lambda values
    float f_values[n+1];
    multiply(n+1, lambda_values, k_values, f_values);

    // Numerical integral with trapezoidal
    float K = trapezoidal(dz, n, k_values);
    float L = trapezoidal(dz, n, f_values)/K;
    printf("Value of integral K. %.4f", K);
    printf("Value of integral L. %.4f", L);
}
magma_int_t magma_d_initP2P ( magma_int_t *bw_bmark, magma_int_t *num_gpus ){ // Number of GPUs printf("Checking for multiple GPUs...\n"); int gpu_n; (cudaGetDeviceCount(&gpu_n)); printf("CUDA-capable device count: %i\n", gpu_n); if (gpu_n < 2) { printf("Two or more Tesla(s) with (SM 2.0)" " class GPUs are required for P2P.\n"); } // Query device properties cudaDeviceProp prop[64]; int gpuid_tesla[64]; // find the first two GPU's that can support P2P int gpu_count = 0; // GPUs that meet the criteria for (int i=0; i < gpu_n; i++) { (cudaGetDeviceProperties(&prop[i], i)); // Only Tesla boards based on Fermi can support P2P { // This is an array of P2P capable GPUs gpuid_tesla[gpu_count++] = i; } } *num_gpus=gpu_n; for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { // Check possibility for peer access printf("\nChecking GPU(s) for support of peer to peer memory access...\n"); int can_access_peer_0_1, can_access_peer_1_0; // In this case we just pick the first two that we can support (cudaDeviceCanAccessPeer(&can_access_peer_0_1, gpuid_tesla[i], gpuid_tesla[j])); (cudaDeviceCanAccessPeer(&can_access_peer_1_0, gpuid_tesla[j], gpuid_tesla[i])); // Output results from P2P capabilities printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid_tesla[i]].name, gpuid_tesla[i], prop[gpuid_tesla[j]].name, gpuid_tesla[j] , can_access_peer_0_1 ? "Yes" : "No"); printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n", prop[gpuid_tesla[j]].name, gpuid_tesla[j], prop[gpuid_tesla[i]].name, gpuid_tesla[i], can_access_peer_1_0 ? "Yes" : "No"); if (can_access_peer_0_1 == 0 || can_access_peer_1_0 == 0) { printf("Two or more Tesla(s) with class" " GPUs are required for P2P to run.\n"); printf("Support for UVA requires a Tesla with SM 2.0 capabilities.\n"); printf("Peer to Peer access is not available between" " GPU%d <-> GPU%d, waiving test.\n", gpuid_tesla[i], gpuid_tesla[j]); printf("PASSED\n"); //exit(EXIT_SUCCESS); } } } // Enable peer access for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { printf("Enabling peer access between GPU%d and GPU%d...\n", gpuid_tesla[i], gpuid_tesla[j]); (cudaSetDevice(gpuid_tesla[i])); (cudaDeviceEnablePeerAccess(gpuid_tesla[j], 0)); (cudaSetDevice(gpuid_tesla[j])); (cudaDeviceEnablePeerAccess(gpuid_tesla[i], 0)); magma_dcheckerr("P2P"); } } magma_dcheckerr("P2P successful"); // Enable peer access for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { // Check that we got UVA on both devices printf("Checking GPU%d and GPU%d for UVA capabilities...\n", gpuid_tesla[i], gpuid_tesla[j]); //const bool has_uva = (prop[gpuid_tesla[i]].unifiedAddressing && // prop[gpuid_tesla[j]].unifiedAddressing); printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[i]].name, gpuid_tesla[i], (prop[gpuid_tesla[i]].unifiedAddressing ? "Yes" : "No") ); printf("> %s (GPU%d) supports UVA: %s\n", prop[gpuid_tesla[j]].name, gpuid_tesla[j], (prop[gpuid_tesla[j]].unifiedAddressing ? 
"Yes" : "No") ); } } if(*bw_bmark==1){ // P2P memcopy() benchmark for(int i=0; i<gpu_n; i++) { for(int j=i+1; j<gpu_n; j++) { // Allocate buffers const size_t buf_size = 1024 * 1024 * 16 * sizeof(float); printf("Allocating buffers (%iMB on GPU%d, GPU%d and CPU Host)...\n", int(buf_size / 1024 / 1024), gpuid_tesla[i], gpuid_tesla[j]); (cudaSetDevice(gpuid_tesla[i])); float* g0; (cudaMalloc(&g0, buf_size)); (cudaSetDevice(gpuid_tesla[j])); float* g1; (cudaMalloc(&g1, buf_size)); float* h0; (cudaMallocHost(&h0, buf_size)); // Automatically portable with UVA // Create CUDA event handles printf("Creating event handles...\n"); cudaEvent_t start_event, stop_event; float time_memcpy; int eventflags = cudaEventBlockingSync; (cudaEventCreateWithFlags(&start_event, eventflags)); (cudaEventCreateWithFlags(&stop_event, eventflags)); (cudaEventRecord(start_event, 0)); for (int k=0; k<100; k++) { // With UVA we don't need to specify source and target devices, the // runtime figures this out by itself from the pointers // Ping-pong copy between GPUs if (k % 2 == 0) (cudaMemcpy(g1, g0, buf_size, cudaMemcpyDefault)); else (cudaMemcpy(g0, g1, buf_size, cudaMemcpyDefault)); } (cudaEventRecord(stop_event, 0)); (cudaEventSynchronize(stop_event)); (cudaEventElapsedTime(&time_memcpy, start_event, stop_event)); printf("cudaMemcpyPeer / cudaMemcpy between" "GPU%d and GPU%d: %.2fGB/s\n", gpuid_tesla[i], gpuid_tesla[j], (1.0f / (time_memcpy / 1000.0f)) * ((100.0f * buf_size)) / 1024.0f / 1024.0f / 1024.0f); // Cleanup and shutdown printf("Cleanup of P2P benchmark...\n"); (cudaEventDestroy(start_event)); (cudaEventDestroy(stop_event)); (cudaSetDevice(gpuid_tesla[i])); (magma_free( g0) ); (cudaSetDevice(gpuid_tesla[j])); (magma_free( g1) ); (magma_free_cpu( h0) ); } } // host-device memcopy() benchmark for(int j=0; j<gpu_n; j++) { cudaSetDevice(gpuid_tesla[j]); int *h_data_source; int *h_data_sink; int *h_data_in[STREAM_COUNT]; int *d_data_in[STREAM_COUNT]; int *h_data_out[STREAM_COUNT]; int *d_data_out[STREAM_COUNT]; cudaEvent_t cycleDone[STREAM_COUNT]; cudaStream_t stream[STREAM_COUNT]; cudaEvent_t start, stop; // Allocate resources int memsize; memsize = 1000000 * sizeof(int); h_data_source = (int*) malloc(memsize); h_data_sink = (int*) malloc(memsize); for( int i =0; i<STREAM_COUNT; ++i ) { ( cudaHostAlloc(&h_data_in[i], memsize, cudaHostAllocDefault) ); ( cudaMalloc(&d_data_in[i], memsize) ); ( cudaHostAlloc(&h_data_out[i], memsize, cudaHostAllocDefault) ); ( cudaMalloc(&d_data_out[i], memsize) ); ( cudaStreamCreate(&stream[i]) ); ( cudaEventCreate(&cycleDone[i]) ); cudaEventRecord(cycleDone[i], stream[i]); } cudaEventCreate(&start); cudaEventCreate(&stop); // Time host-device copies cudaEventRecord(start,0); ( cudaMemcpyAsync(d_data_in[0], h_data_in[0], memsize, cudaMemcpyHostToDevice,0) ); cudaEventRecord(stop,0); cudaEventSynchronize(stop); float memcpy_h2d_time; cudaEventElapsedTime(&memcpy_h2d_time, start, stop); cudaEventRecord(start,0); ( cudaMemcpyAsync(h_data_out[0], d_data_out[0], memsize, cudaMemcpyDeviceToHost, 0) ); cudaEventRecord(stop,0); cudaEventSynchronize(stop); float memcpy_d2h_time; cudaEventElapsedTime(&memcpy_d2h_time, start, stop); cudaEventSynchronize(stop); printf("Measured timings (throughput):\n"); printf(" Memcpy host to device GPU %d \t: %f ms (%f GB/s)\n", j, memcpy_h2d_time, (memsize * 1e-6)/ memcpy_h2d_time ); printf(" Memcpy device GPU %d to host\t: %f ms (%f GB/s)\n", j, memcpy_d2h_time, (memsize * 1e-6)/ memcpy_d2h_time); // Free resources free( h_data_source ); free( 
h_data_sink ); for( int i =0; i<STREAM_COUNT; ++i ) { magma_free_cpu( h_data_in[i] ); magma_free( d_data_in[i] ); magma_free_cpu( h_data_out[i] ); magma_free( d_data_out[i] ); cudaStreamDestroy(stream[i]); cudaEventDestroy(cycleDone[i]); } cudaEventDestroy(start); cudaEventDestroy(stop); } }//end if-loop bandwidth_benchmark magma_dcheckerr("P2P established"); return MAGMA_SUCCESS; }
//////////////////////////////////////////////////////////////////////////////// // Program Main //////////////////////////////////////////////////////////////////////////////// int main(int argc, char *argv[]) { int Nx, Ny, Nz, max_iters; int blockX, blockY, blockZ; if (argc == 8) { Nx = atoi(argv[1]); Ny = atoi(argv[2]); Nz = atoi(argv[3]); max_iters = atoi(argv[4]); blockX = atoi(argv[5]); blockY = atoi(argv[6]); blockZ = atoi(argv[7]); } else { printf("Usage: %s nx ny nz i block_x block_y block_z number_of_threads\n", argv[0]); exit(1); } // Get the number of GPUS int number_of_devices; checkCuda(cudaGetDeviceCount(&number_of_devices)); if (number_of_devices < 2) { printf("Less than two devices were found.\n"); printf("Exiting...\n"); return -1; } // Decompose along the Z-axis int _Nz = Nz/number_of_devices; // Define constants const _DOUBLE_ L = 1.0; const _DOUBLE_ h = L/(Nx+1); const _DOUBLE_ dt = h*h/6.0; const _DOUBLE_ beta = dt/(h*h); const _DOUBLE_ c0 = beta; const _DOUBLE_ c1 = (1-6*beta); // Check if ECC is turned on ECCCheck(number_of_devices); // Set the number of OpenMP threads omp_set_num_threads(number_of_devices); #pragma omp parallel { unsigned int tid = omp_get_num_threads(); #pragma omp single { printf("Number of OpenMP threads: %d\n", tid); } } // CPU memory operations int dt_size = sizeof(_DOUBLE_); _DOUBLE_ *u_new, *u_old; u_new = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); u_old = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); init(u_old, u_new, h, Nx, Ny, Nz); // Allocate and generate arrays on the host size_t pitch_bytes; size_t pitch_gc_bytes; _DOUBLE_ *h_Unew, *h_Uold; _DOUBLE_ *h_s_Uolds[number_of_devices], *h_s_Unews[number_of_devices]; _DOUBLE_ *left_send_buffer[number_of_devices], *left_receive_buffer[number_of_devices]; _DOUBLE_ *right_send_buffer[number_of_devices], *right_receive_buffer[number_of_devices]; h_Unew = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); h_Uold = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(Nz+2)); init(h_Uold, h_Unew, h, Nx, Ny, Nz); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); h_s_Unews[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2)); h_s_Uolds[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_Nz+2)); right_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); left_send_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); right_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); left_receive_buffer[tid] = (_DOUBLE_ *)malloc(sizeof(_DOUBLE_)*(Nx+2)*(Ny+2)*(_GC_DEPTH)); checkCuda(cudaHostAlloc((void**)&h_s_Unews[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&h_s_Uolds[tid], dt_size*(Nx+2)*(Ny+2)*(_Nz+2), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&right_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&left_send_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&right_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); checkCuda(cudaHostAlloc((void**)&left_receive_buffer[tid], dt_size*(Nx+2)*(Ny+2)*(_GC_DEPTH), cudaHostAllocPortable)); init_subdomain(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid); } // GPU memory operations _DOUBLE_ *d_s_Unews[number_of_devices], *d_s_Uolds[number_of_devices]; _DOUBLE_ *d_right_send_buffer[number_of_devices], 
*d_left_send_buffer[number_of_devices]; _DOUBLE_ *d_right_receive_buffer[number_of_devices], *d_left_receive_buffer[number_of_devices]; #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); CopyToConstantMemory(c0, c1); checkCuda(cudaMallocPitch((void**)&d_s_Uolds[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2))); checkCuda(cudaMallocPitch((void**)&d_s_Unews[tid], &pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2))); checkCuda(cudaMallocPitch((void**)&d_left_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_right_receive_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_left_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); checkCuda(cudaMallocPitch((void**)&d_right_send_buffer[tid], &pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH))); } // Copy data from host to the device double HtD_timer = 0.; HtD_timer -= omp_get_wtime(); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(d_s_Uolds[tid], pitch_bytes, h_s_Uolds[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault)); checkCuda(cudaMemcpy2D(d_s_Unews[tid], pitch_bytes, h_s_Unews[tid], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_Nz+2)), cudaMemcpyDefault)); } HtD_timer += omp_get_wtime(); int pitch = pitch_bytes/dt_size; int gc_pitch = pitch_gc_bytes/dt_size; // GPU kernel launch parameters dim3 threads_per_block(blockX, blockY, blockZ); unsigned int blocksInX = getBlock(Nx, blockX); unsigned int blocksInY = getBlock(Ny, blockY); unsigned int blocksInZ = getBlock(_Nz-2, k_loop); dim3 thread_blocks(blocksInX, blocksInY, blocksInZ); dim3 thread_blocks_halo(blocksInX, blocksInY); double compute_timer = 0.; compute_timer -= omp_get_wtime(); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); for(int iterations = 0; iterations < max_iters; iterations++) { // Compute inner nodes checkCuda(cudaSetDevice(tid)); ComputeInnerPoints(thread_blocks, threads_per_block, d_s_Unews[tid], d_s_Uolds[tid], pitch, Nx, Ny, _Nz); // Copy right boundary data to host if (tid == 0) { checkCuda(cudaSetDevice(tid)); CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0); checkCuda(cudaMemcpy2D(right_send_buffer[tid], dt_size*(Nx+2), d_right_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault)); } // Copy left boundary data to host if (tid == 1) { checkCuda(cudaSetDevice(tid)); CopyBoundaryRegionToGhostCell(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_send_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1); checkCuda(cudaMemcpy2D(left_send_buffer[tid], dt_size*(Nx+2), d_left_send_buffer[tid], pitch_gc_bytes, dt_size*(Nx+2), (Ny+2)*(_GC_DEPTH), cudaMemcpyDefault)); } #pragma omp barrier // Copy right boundary data to device 1 if (tid == 1) { checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(d_left_receive_buffer[tid], pitch_gc_bytes, right_send_buffer[tid-1], dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault)); CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_left_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 1); } // Copy left boundary data to device 0 if (tid == 0) { checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(d_right_receive_buffer[tid], pitch_gc_bytes, left_send_buffer[tid+1], 
dt_size*(Nx+2), dt_size*(Nx+2), ((Ny+2)*(_GC_DEPTH)), cudaMemcpyDefault)); CopyGhostCellToBoundaryRegion(thread_blocks_halo, threads_per_block, d_s_Unews[tid], d_right_receive_buffer[tid], Nx, Ny, _Nz, pitch, gc_pitch, 0); } // Swap pointers on the host #pragma omp barrier checkCuda(cudaSetDevice(tid)); checkCuda(cudaDeviceSynchronize()); swap(_DOUBLE_*, d_s_Unews[tid], d_s_Uolds[tid]); } } compute_timer += omp_get_wtime(); // Copy data from device to host double DtH_timer = 0; DtH_timer -= omp_get_wtime(); #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); checkCuda(cudaMemcpy2D(h_s_Uolds[tid], dt_size*(Nx+2), d_s_Uolds[tid], pitch_bytes, dt_size*(Nx+2), (Ny+2)*(_Nz+2), cudaMemcpyDeviceToHost)); } DtH_timer += omp_get_wtime(); // Merge sub-domains into a one big domain #pragma omp parallel { unsigned int tid = omp_get_thread_num(); merge_domains(h_s_Uolds[tid], h_Uold, Nx, Ny, _Nz, tid); } // Calculate on host #if defined(DEBUG) || defined(_DEBUG) cpu_heat3D(u_new, u_old, c0, c1, max_iters, Nx, Ny, Nz); #endif float gflops = CalcGflops(compute_timer, max_iters, Nx, Ny, Nz); PrintSummary("3D Heat (7-pt)", "Plane sweeping", compute_timer, HtD_timer, DtH_timer, gflops, max_iters, Nx); _DOUBLE_ t = max_iters * dt; CalcError(h_Uold, u_old, t, h, Nx, Ny, Nz); #if defined(DEBUG) || defined(_DEBUG) //exportToVTK(h_Uold, h, "heat3D.vtk", Nx, Ny, Nz); #endif #pragma omp parallel { unsigned int tid = omp_get_thread_num(); checkCuda(cudaSetDevice(tid)); checkCuda(cudaFree(d_s_Unews[tid])); checkCuda(cudaFree(d_s_Uolds[tid])); checkCuda(cudaFree(d_right_send_buffer[tid])); checkCuda(cudaFree(d_left_send_buffer[tid])); checkCuda(cudaFree(d_right_receive_buffer[tid])); checkCuda(cudaFree(d_left_receive_buffer[tid])); checkCuda(cudaFreeHost(h_s_Unews[tid])); checkCuda(cudaFreeHost(h_s_Uolds[tid])); checkCuda(cudaFreeHost(left_send_buffer[tid])); checkCuda(cudaFreeHost(right_send_buffer[tid])); checkCuda(cudaFreeHost(left_receive_buffer[tid])); checkCuda(cudaFreeHost(right_receive_buffer[tid])); checkCuda(cudaDeviceReset()); } free(u_old); free(u_new); return 0; }
extern "C" double gsum(UDF_INIT *initid, UDF_ARGS *args, char *is_null, char *error) { DBUG_ENTER("udf_sum::gsum"); double* h_in; double* h_out; double* d_in; double* d_out; unsigned int index = 0; cudaHostAlloc((void**)&h_in, MAXIMUM_ELEMENTS_IN_CACHE*sizeof(double), cudaHostAllocWriteCombined | cudaHostAllocMapped ); CUDA_CHECK_ERRORS("cudaHostAlloc -> h_in"); cudaHostAlloc((void**)&h_out, CUDA_BLOCK_SIZE*sizeof(double), cudaHostAllocWriteCombined | cudaHostAllocMapped ); CUDA_CHECK_ERRORS("cudaHostAlloc -> h_out"); cudaHostGetDevicePointer((void**)&d_in, h_in, 0); cudaHostGetDevicePointer((void**)&d_out, h_out, 0); char* column_name = (char*) args->args[0]; char* table_name = (char*) args->args[1]; char* schema_name = (char*) args->args[2]; DBUG_PRINT("info", ("column_name [%s], table_name [%s], schema_name [%s]", column_name, table_name, schema_name)); fprintf(stderr, "column_name [%s], table_name [%s], schema_name [%s]\n", column_name, table_name, schema_name); fflush(stderr); THD *thd = current_thd; TABLE_LIST* table_list = new TABLE_LIST; memset((char*) table_list, 0, sizeof(TABLE_LIST)); DBUG_PRINT("info", ("table_list->init_one_table")); table_list->init_one_table(schema_name, strlen(schema_name), table_name, strlen(table_name), table_name, TL_READ); DBUG_PRINT("info", ("open_and_lock_tables")); open_and_lock_tables(thd, table_list, FALSE, MYSQL_OPEN_IGNORE_FLUSH | MYSQL_LOCK_IGNORE_TIMEOUT); TABLE* table = table_list->table; clock_t cpu_clock; cpu_clock = clock(); table->file->ha_rnd_init(true); while (table->file->ha_rnd_next(table->record[0]) == 0){ h_in[index++] = table->field[1]->val_real(); } table->file->ha_rnd_end(); cpu_clock = clock() - cpu_clock; fprintf(stderr, "gsum -> index [%d]\n", index); fprintf(stderr, "gsum -> fill cache within [%f seconds]\n", ((float)cpu_clock)/CLOCKS_PER_SEC); fflush(stderr); cudaEvent_t start, stop; float elapsedTime; cudaEventCreate(&start); cudaEventCreate(&stop); cudaEventRecord(start, 0); ReductionTask reduction_task(MAXIMUM_ELEMENTS_IN_CACHE, sizeof(double), CUDA_BLOCK_SIZE, CUDA_THREAD_PER_BLOCK_SIZE, R_SUM, R_DOUBLE); reductionWorkerUsingMappedMemory<double>(d_in, d_out, &reduction_task); cudaEventRecord(stop, 0); cudaEventSynchronize(stop); cudaEventElapsedTime(&elapsedTime, start, stop); double gpu_sum = 0; for (unsigned int i = 0; i < CUDA_BLOCK_SIZE; i++) { gpu_sum += ((double*)h_out)[i]; } float bandwidthInMBs = (1e3f * MAXIMUM_ELEMENTS_IN_CACHE*sizeof(double)) / (elapsedTime * (float)(1 << 20)); fprintf(stderr, "gpu result [%f], gpu time [%f seconds] bandwidth (mb) [%f]\n", gpu_sum, elapsedTime/1000.0, bandwidthInMBs); fflush(stderr); cudaFreeHost(h_in); CUDA_CHECK_ERRORS("cudaFreeHost -> h_in"); cudaFreeHost(h_out); CUDA_CHECK_ERRORS("cudaFreeHost -> h_out"); thd->cleanup_after_query(); DBUG_PRINT("info", ("about to delete table_list")); delete table_list; DBUG_RETURN(gpu_sum); }
cudaError_t cudaMallocHost(void **pHost, size_t size)
{
    return cudaHostAlloc(pHost, size, cudaHostAllocDefault);
}
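// Many snippets in this collection pair a manual cudaHostAlloc with a later cudaFreeHost;
// wrapping the pair in a small RAII type avoids leaks on early returns. Minimal sketch;
// the PinnedBuffer name is made up for illustration.
#include <cuda_runtime.h>
#include <cstddef>
#include <stdexcept>

// Minimal RAII wrapper around cudaHostAlloc / cudaFreeHost.
template <typename T>
class PinnedBuffer {
public:
    explicit PinnedBuffer(std::size_t count, unsigned int flags = cudaHostAllocDefault)
        : ptr_(nullptr), count_(count)
    {
        if (cudaHostAlloc(reinterpret_cast<void**>(&ptr_), count * sizeof(T), flags) != cudaSuccess)
            throw std::runtime_error("cudaHostAlloc failed");
    }

    ~PinnedBuffer() { cudaFreeHost(ptr_); }              // released exactly once

    PinnedBuffer(const PinnedBuffer&) = delete;          // no accidental double frees
    PinnedBuffer& operator=(const PinnedBuffer&) = delete;

    T* data() { return ptr_; }
    std::size_t size() const { return count_; }

private:
    T* ptr_;
    std::size_t count_;
};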
void MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory() { trace_printf("MFNHashTypePlainCUDA::allocateThreadAndDeviceMemory()\n"); /** * Error variable - stores the result of the various mallocs & such. */ cudaError_t err, err2; /** * Flags for calling cudaHostMalloc - will be set to cudaHostAllocMapped * if we are mapping memory to the host with zero copy. */ unsigned int cudaHostMallocFlags = 0; if (this->useZeroCopy) { cudaHostMallocFlags |= cudaHostAllocMapped; } /* * Malloc the device hashlist space. This is the number of available hashes * times the hash length in bytes. The data will be copied later. */ err = cudaMalloc((void **)&this->DeviceHashlistAddress, this->activeHashesProcessed.size() * this->hashLengthBytes); if (err != cudaSuccess) { printf("Unable to allocate %d bytes for device hashlist! Exiting!\n", this->activeHashesProcessed.size() * this->hashLengthBytes); printf("return code: %d\n", err); exit(1); } /* * Allocate the host/device space for the success list (flags for found passwords). * This is a byte per password. To avoid atomic write issues, each password * gets a full addressible byte, and the GPU handles the dependencies between * multiple threads trying to set a flag in the same segment of memory. * * On the host, it will be allocated as mapped memory if we are using zerocopy. * * As this region of memory is frequently copied back to the host, mapping it * improves performance. In theory. */ err = cudaHostAlloc((void **)&this->HostSuccessAddress, this->activeHashesProcessed.size(), cudaHostMallocFlags); if (err != cudaSuccess) { printf("Unable to allocate %d bytes for success flags! Exiting!\n", this->activeHashesProcessed.size()); printf("return code: %d\n", err); exit(1); } // Clear host success flags region - if we are mapping the memory, the GPU // will directly write this. memset(this->HostSuccessAddress, 0, this->activeHashesProcessed.size()); // Allocate memory for the reported flags. this->HostSuccessReportedAddress = new uint8_t [this->activeHashesProcessed.size()]; memset(this->HostSuccessReportedAddress, 0, this->activeHashesProcessed.size()); // If zero copy is in use, get the device pointer for the success data, else // malloc a region of memory on the device. if (this->useZeroCopy) { err = cudaHostGetDevicePointer((void **)&this->DeviceSuccessAddress, this->HostSuccessAddress, 0); err2 = cudaSuccess; } else { err = cudaMalloc((void **)&this->DeviceSuccessAddress, this->activeHashesProcessed.size()); err2 = cudaMemset(this->DeviceSuccessAddress, 0, this->activeHashesProcessed.size()); } if ((err != cudaSuccess) || (err2 != cudaSuccess)) { printf("Unable to allocate %d bytes for device success list! Exiting!\n", this->activeHashesProcessed.size()); printf("return code: %d\n", err); printf("return code: %d\n", err2); exit(1); } /* * Allocate memory for the found passwords. As this is commonly copied * back and forth, it will be made zero copy if requested. * * This requires (number hashes * passwordLength) bytes of data. */ err = cudaHostAlloc((void **)&this->HostFoundPasswordsAddress, this->passwordLength * this->activeHashesProcessed.size() , cudaHostMallocFlags); if (err != cudaSuccess) { printf("Unable to allocate %d bytes for host password list! Exiting!\n", this->passwordLength * this->activeHashesProcessed.size()); printf("return code: %d\n", err); exit(1); } // Clear the host found password space. 
memset(this->HostFoundPasswordsAddress, 0, this->passwordLength * this->activeHashesProcessed.size()); if (this->useZeroCopy) { err = cudaHostGetDevicePointer((void **)&this->DeviceFoundPasswordsAddress, this->HostFoundPasswordsAddress, 0); err2 = cudaSuccess; } else { err = cudaMalloc((void **)&this->DeviceFoundPasswordsAddress, this->passwordLength * this->activeHashesProcessed.size()); err2 = cudaMemset(this->DeviceFoundPasswordsAddress, 0, this->passwordLength * this->activeHashesProcessed.size()); } if ((err != cudaSuccess) || (err2 != cudaSuccess)) { printf("Unable to allocate %d bytes for device password list! Exiting!\n", this->passwordLength * this->activeHashesProcessed.size()); printf("return code: %d\n", err); printf("return code: %d\n", err2); exit(1); } /** * Allocate space for host and device start positions. To improve performance, * this space is now aligned for improved coalescing performance. All the * position 0 elements are together, followed by all the position 1 elements, * etc. * * This memory can be allocated as write combined, as it is not read by * the host ever - only written. Since it is regularly transferred to the * GPU, this should help improve performance. */ err = cudaHostAlloc((void**)&this->HostStartPointAddress, this->TotalKernelWidth * this->passwordLength, cudaHostAllocWriteCombined | cudaHostMallocFlags); err2 = cudaMalloc((void **)&this->DeviceStartPointAddress, this->TotalKernelWidth * this->passwordLength); if ((err != cudaSuccess) || (err2 != cudaSuccess)) { printf("Unable to allocate %d bytes for host/device startpos list! Exiting!\n", this->TotalKernelWidth * this->passwordLength); printf("return code: %d\n", err); printf("return code: %d\n", err2); exit(1); } /** * Allocate space for the device start password values. This is a copy of * the MFNHashTypePlain::HostStartPasswords32 vector for the GPU. */ err = cudaMalloc((void **)&this->DeviceStartPasswords32Address, this->TotalKernelWidth * this->passwordLengthWords); if ((err != cudaSuccess)) { printf("Unable to allocate %d bytes for host/device startpos list! Exiting!\n", this->TotalKernelWidth * this->passwordLengthWords); printf("return code: %d\n", err); exit(1); } /** * Finally, attempt to allocate space for the giant device bitmaps. There * are 4x128MB bitmaps, and any number can be allocated. If they are not * fully allocated, their address is set to null as a indicator to the device * that there is no data present. Attempt to allocate as many as possible. * * This will be accessed regularly, so should probably not be zero copy. * Also, I'm not sure how mapping host memory into multiple threads would * work. Typically, if the GPU doesn't have enough RAM for the full * set of bitmaps, it's a laptop, and therefore may be short on host RAM * for the pinned access. * * If there is an error in allocation, call cudaGetLastError() to clear it - * we know there has been an error, and do not want it to persist. 
*/ err = cudaMalloc((void **)&this->DeviceBitmap128mb_a_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap A\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap A\n"); this->DeviceBitmap128mb_a_Address = 0; cudaGetLastError(); } err = cudaMalloc((void **)&this->DeviceBitmap128mb_b_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap B\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap B\n"); this->DeviceBitmap128mb_b_Address = 0; cudaGetLastError(); } err = cudaMalloc((void **)&this->DeviceBitmap128mb_c_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap C\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap C\n"); this->DeviceBitmap128mb_c_Address = 0; cudaGetLastError(); } err = cudaMalloc((void **)&this->DeviceBitmap128mb_d_Address, 128 * 1024 * 1024); if (err == cudaSuccess) { memalloc_printf("Successfully allocated Bitmap D\n"); } else { memalloc_printf("Unable to allocate 128MB bitmap D\n"); this->DeviceBitmap128mb_d_Address = 0; cudaGetLastError(); } //printf("Thread %d memory allocated successfully\n", this->threadId); }
Waifu2x::eWaifu2xError Waifu2x::init(int argc, char** argv, const std::string &Mode, const int NoiseLevel,
    const std::string &ModelDir, const std::string &Process, const int CropSize, const int BatchSize)
{
    Waifu2x::eWaifu2xError ret;

    if (is_inited)
        return eWaifu2xError_OK;

    try
    {
        mode = Mode;
        noise_level = NoiseLevel;
        model_dir = ModelDir;
        process = Process;

        crop_size = CropSize;
        batch_size = BatchSize;

        inner_padding = layer_num;
        outer_padding = 1;

        output_size = crop_size - offset * 2;
        input_block_size = crop_size + (inner_padding + outer_padding) * 2;
        original_width_height = 128 + layer_num * 2;
        output_block_size = crop_size + (inner_padding + outer_padding - layer_num) * 2;

        std::call_once(waifu2x_once_flag, [argc, argv]()
        {
            assert(argc >= 1);

            int tmpargc = 1;
            char* tmpargvv[] = { argv[0] };
            char** tmpargv = tmpargvv;
            // Initialize glog and friends
            caffe::GlobalInit(&tmpargc, &tmpargv);
        });

        const auto cuDNNCheckStartTime = std::chrono::system_clock::now();

        if (process == "gpu")
            process = "cudnn";

        const auto cuDNNCheckEndTime = std::chrono::system_clock::now();

        boost::filesystem::path mode_dir_path(model_dir);
        if (!mode_dir_path.is_absolute()) // If model_dir is a relative path, convert it to an absolute one
        {
            // First, look for it under the current directory
            mode_dir_path = boost::filesystem::absolute(model_dir);
            if (!boost::filesystem::exists(mode_dir_path) && argc >= 1) // If not found, infer the executable's folder from argv[0] and look under that folder
            {
                boost::filesystem::path a0(argv[0]);
                if (a0.is_absolute())
                    mode_dir_path = a0.branch_path() / model_dir;
            }
        }

        if (!boost::filesystem::exists(mode_dir_path))
            return eWaifu2xError_FailedOpenModelFile;

        if (process == "cpu")
        {
            caffe::Caffe::set_mode(caffe::Caffe::CPU);
            isCuda = false;
        }
        else
        {
            caffe::Caffe::set_mode(caffe::Caffe::GPU);
            isCuda = true;
        }

        if (mode == "noise" || mode == "noise_scale" || mode == "auto_scale")
        {
            const std::string model_path = (mode_dir_path / "srcnn.prototxt").string();
            const std::string param_path = (mode_dir_path / ("noise" + std::to_string(noise_level) + "_model.json")).string();

            ret = ConstractNet(net_noise, model_path, param_path, process);
            if (ret != eWaifu2xError_OK)
                return ret;
        }

        if (mode == "scale" || mode == "noise_scale" || mode == "auto_scale")
        {
            const std::string model_path = (mode_dir_path / "srcnn.prototxt").string();
            const std::string param_path = (mode_dir_path / "scale2.0x_model.json").string();

            ret = ConstractNet(net_scale, model_path, param_path, process);
            if (ret != eWaifu2xError_OK)
                return ret;
        }

        const int input_block_plane_size = input_block_size * input_block_size * input_plane;
        const int output_block_plane_size = output_block_size * output_block_size * input_plane;

        if (isCuda)
        {
            CUDA_CHECK_WAIFU2X(cudaHostAlloc(&input_block, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
            CUDA_CHECK_WAIFU2X(cudaHostAlloc(&dummy_data, sizeof(float) * input_block_plane_size * batch_size, cudaHostAllocWriteCombined));
            CUDA_CHECK_WAIFU2X(cudaHostAlloc(&output_block, sizeof(float) * output_block_plane_size * batch_size, cudaHostAllocDefault));
        }
        else
        {
            input_block = new float[input_block_plane_size * batch_size];
            dummy_data = new float[input_block_plane_size * batch_size];
            output_block = new float[output_block_plane_size * batch_size];
        }

        for (size_t i = 0; i < input_block_plane_size * batch_size; i++)
            dummy_data[i] = 0.0f;

        is_inited = true;
    }
    catch (...)
    {
        return eWaifu2xError_InvalidParameter;
    }

    return eWaifu2xError_OK;
}
uint8_t* allocPageLocked(size_t size)
{
    void* ptr;
    checkCudaError(cudaHostAlloc(&ptr, size, cudaHostAllocPortable), "cudaHostAlloc");
    return static_cast<uint8_t*>(ptr);
}
int main(int argc, char *argv[]) { int i,j,k; machineInformation currentMachine; counterSessionInfo session; initializeCUDA(); // Set machine information from CounterHomeBrew.h currentMachine.cpu_model = CPU_MODEL; currentMachine.num_sockets = NUM_SOCKETS; currentMachine.num_phys_cores_per_socket = NUM_PHYS_CORES_PER_SOCKET; currentMachine.num_cores_per_socket = NUM_CORES_PER_SOCKET; currentMachine.num_cores = NUM_CORES; currentMachine.num_cbos = NUM_PHYS_CORES_PER_SOCKET; // should multiply by NUM_SOCKETS??? currentMachine.core_gen_counter_num_max = CORE_GEN_COUNTER_MAX; currentMachine.cbo_counter_num_max = CBO_COUNTER_NUM_MAX; // Set session events, umasks and counters used // int32 core_event_numbers[] = {FP_COMP_OPS_EXE_EVTNR,SIMD_FP_256_EVTNR,0x51,0xF1,0x80}; // int32 core_umasks[] = {FP_COMP_OPS_EXE_SCALAR_DOUBLE_UMASK,SIMD_FP_256_PACKED_DOUBLE_UMASK,0x01, 0x07,0x01}; session.core_gen_counter_num_used = 5; int32 core_event_numbers[] = {0x10,0x10,0x11,0x51,0xF1}; int32 core_umasks[] = {0x20,0x40,0x01,0x01, 0x07}; session.cbo_counter_num_used = 1; int32 cbo_event_numbers[] = {0x37}; int32 cbo_umasks[] = {0xf}; session.cbo_filter = 0x1f; for (i = 0; i < session.core_gen_counter_num_used; i++) { session.core_event_numbers[i] = core_event_numbers[i]; session.core_umasks[i] = core_umasks[i]; } for (i = 0; i < session.cbo_counter_num_used; i++) { session.cbo_event_numbers[i] = cbo_event_numbers[i]; session.cbo_umasks[i] = cbo_umasks[i]; } int fd[NUM_CORES]; // Arrays to hold counter data... counterData before; counterData after; // some data for doing a naive matmul to test flop counting... // initloop(N); // M,N,K are multiples of the block size.... int gpuOuter = atoi(argv[1]); int gpuInner = atoi(argv[2]); int cpuInner = atoi(argv[3]); double minRuntime = atoi(argv[4]); int Md = atoi(argv[5])*block_size; int Nd = atoi(argv[6])*block_size; int Kd = atoi(argv[7])*block_size; int Mh = atoi(argv[8]); int Nh = atoi(argv[9]); int Kh = atoi(argv[10]); char *ts1,*ts2,*ts3,*ts4; char *ts5,*ts6,*ts7,*ts8; double fineTimeStamps[8]; double gTime = 0.0; double cTime = 0.0; double seconds = 0.0; int num_iters; uint64 *coreSums; coreSums = (uint64*)calloc(currentMachine.num_sockets*session.core_gen_counter_num_used,sizeof(uint64)); uint64 *sums; sums = (uint64*)calloc(currentMachine.num_sockets*session.cbo_counter_num_used,sizeof(uint64)); float *Atmp = NULL; float *Btmp = NULL; float *Ctmp = NULL; Atmp = (float*) malloc( Mh * Nh * sizeof(float) ); Btmp = (float*) malloc( Nh * sizeof(float) ); Ctmp = (float*) malloc( Mh * sizeof(float) ); randomInit(Atmp,Mh*Nh); randomInit(Btmp,Nh); for (num_iters = cpuInner; seconds < minRuntime; num_iters *=2) { seconds = 0.0; for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, Atmp,Mh, Btmp,1, 1, Ctmp,1 ); seconds = read_timer()-seconds; } // num_iters /= 2; free(Atmp); free(Btmp); free(Ctmp); int readyThreads = 0; #pragma omp parallel { int threadNum = omp_get_thread_num(); int numThreads = omp_get_num_threads(); assert(numThreads==2); if (threadNum == 0) { cudaError_t error; int memSizeA = sizeof(float)*Md*Nd; int memSizeB = sizeof(float)*Nd; int memSizeC = sizeof(float)*Md; float *Ahost,*Bhost,*Chost; // use pinned memory on the host for BW and asynch memory transfers.. 
int flags = cudaHostAllocDefault; ts5 = getTimeStamp(); fineTimeStamps[0] = read_timer(); error = cudaHostAlloc((void**)&Ahost,memSizeA,flags);if (error != cudaSuccess){printf("cudaHostMalloc Ahost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Bhost,memSizeB,flags);if (error != cudaSuccess){printf("cudaHostMalloc Bhost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaHostAlloc((void**)&Chost,memSizeC,flags);if (error != cudaSuccess){printf("cudaHostMalloc Chost returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} // set local arrays randomInit(Ahost,Md*Nd); randomInit(Bhost,Nd); // allocate device memory float *Adevice,*Bdevice,*Cdevice; error = cudaMalloc((void**)&Adevice,memSizeA); if (error != cudaSuccess){printf("cudaMalloc Adevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Bdevice,memSizeB); if (error != cudaSuccess){printf("cudaMalloc Bdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} error = cudaMalloc((void**)&Cdevice,memSizeC); if (error != cudaSuccess){printf("cudaMalloc Cdevice returned error code %d, line(%d)\n", error, __LINE__);exit(EXIT_FAILURE);} fineTimeStamps[1] = read_timer(); ts6 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready GPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 0: %d\n",readyThreads);}; //#pragma omp single //{ cudaStream_t stream1; cudaStreamCreate ( &stream1) ; ts3 = getTimeStamp(); fineTimeStamps[2] = read_timer(); gTime = read_timer(); for (int i = 0; i < gpuOuter; i++) GPUsgemv(gpuInner,Md,Nd,Kd,Adevice,Bdevice,Cdevice,Ahost,Bhost,Chost,&stream1); cudaStreamSynchronize(stream1); gTime = read_timer() - gTime; fineTimeStamps[3] = read_timer(); ts4 = getTimeStamp(); cudaFreeHost(Ahost); cudaFreeHost(Bhost); cudaFreeHost(Chost); } else { // uint64 min_iters = strtoull(argv[4],NULL,0); float *A = NULL; float *B = NULL; float *C = NULL; ts7 = getTimeStamp(); fineTimeStamps[4] = read_timer(); A = (float*) malloc( Mh * Nh * sizeof(float) ); B = (float*) malloc( Nh * sizeof(float) ); C = (float*) malloc( Mh * sizeof(float) ); randomInit(A,Mh*Nh); randomInit(B,Nh); fineTimeStamps[5] = read_timer(); ts8 = getTimeStamp(); #pragma omp critical { readyThreads += 1; } // fprintf(stderr,"Incremented ready CPU\n"); while (readyThreads < 2){sleep(1);fprintf(stderr,"Thread 1: %d\n",readyThreads);}; // open the msr files for each core on the machine for (i = 0; i < currentMachine.num_cores; i++) open_msr_file(i,&fd[i]); int socketsProgrammed = 0; for (i = 0; i < currentMachine.num_cores; i++) { int currentCoreFD = fd[i]; stopCounters(i, currentCoreFD, ¤tMachine, &session); programCoreFixedCounters(currentCoreFD); programGeneralPurposeRegisters(currentCoreFD, ¤tMachine, &session); /* Program the Uncore as desired...*/ // Only program the first physical core on each socket. // NOTE: Some assumptions about topology here...check /proc/cpuinfo to confirm. if (i % currentMachine.num_phys_cores_per_socket == 0 && socketsProgrammed < currentMachine.num_sockets) { programUncoreCounters( currentCoreFD, ¤tMachine, &session); socketsProgrammed++; } } seconds = 0.0; // start the programmed counters... 
for (i = 0; i < currentMachine.num_cores; i++) startCounters( i, fd[i], ¤tMachine, &session); /* READ COUNTERS BEFORE STUFF */ readCounters(fd,¤tMachine,&session, &before); ts1 = getTimeStamp(); fineTimeStamps[6] = read_timer(); seconds = read_timer(); /* DO STUFF */ for (i =0; i < num_iters; i++) BLASFUNC( CblasColMajor,CblasNoTrans,Mh,Nh, 1, A,Mh, B,1, 1, C,1 ); /* END DOING STUFF */ seconds = read_timer()-seconds; fineTimeStamps[7] = read_timer(); ts2 = getTimeStamp(); /* READ COUNTERS AFTER STUFF */ for (i = 0; i < currentMachine.num_cores; i++) stopCounters(i,fd[i],¤tMachine, &session); // printf("num_iters = %"PRIu64", runtime is %g\n",num_iters,seconds); readCounters(fd,¤tMachine,&session,&after); diffCounterData(¤tMachine, &session, &after, &before, &after); for (i = 0; i < currentMachine.num_sockets; i++) { // printf("Socket %d\n",i); for (j = 0; j < currentMachine.num_cores_per_socket; j++) { // printf("%d,",j); for (k = 0; k < session.core_gen_counter_num_used; k++){ // printf("%"PRIu64",",after.generalCore[i*currentMachine.num_cores_per_socket + j][k]); // bug in the indexing of the core sums??? // coreSums[i*session.core_gen_counter_num_used + k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; coreSums[k] += after.generalCore[i*currentMachine.num_cores_per_socket + j][k]; } // printf("\n"); } } for (i = 0; i < currentMachine.num_sockets; i++) { // printf("%d,",i); for (j = 0; j < currentMachine.num_cbos; j++) { // printf("%d,",j); for (k = 0; k < session.cbo_counter_num_used; k++) { // printf("%llu,",after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]); // bug in the indexing of the core sums??? // sums[i*session.cbo_counter_num_used + k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; sums[k] += after.cboUncore[i*currentMachine.num_phys_cores_per_socket + j][k]; } } } // printf("\n"); // Stop counters, reset PMU, close msr files cleanup(fd,¤tMachine,&session); free(A); free(B); free(C); } } // end parallel region printf("%s,%s,%s,%s,%s,%s,%s,%s,%d,%d,%d,%d,%d,%d,%d,%d,%d,%f,%f,%f,",ts7,ts8,ts1,ts2,ts5,ts6,ts3,ts4,Mh,Nh,Kh,Md/block_size,Nd/block_size,Kd/block_size,num_iters,gpuOuter,gpuInner,seconds,gTime,(float)(gpuOuter*(Md*Kd+Nd+Md))/16.0); for (int i = 0; i < 8; i++) printf("%f,",fineTimeStamps[i]); for (j = 0; j < session.core_gen_counter_num_used; j++) printf("%llu,",coreSums[j]); for (j = 0; j < session.cbo_counter_num_used; j++) if (j == session.cbo_counter_num_used-1) printf("%llu",sums[j]); else printf("%llu,",sums[j]); printf("\n"); free(sums); free(coreSums); return 0; }