virtual int2 split_kernel_global_size(device_memory& kg,
                                      device_memory& data,
                                      DeviceTask * /*task*/)
{
  cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
  /* Use small global size on CPU devices as it seems to be much faster. */
  if(type == CL_DEVICE_TYPE_CPU) {
    VLOG(1) << "Global size: (64, 64).";
    return make_int2(64, 64);
  }

  cl_ulong max_buffer_size;
  clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);

  if(DebugFlags().opencl.mem_limit) {
    max_buffer_size = min(max_buffer_size,
                          cl_ulong(DebugFlags().opencl.mem_limit - device->stats.mem_used));
  }

  VLOG(1) << "Maximum device allocation size: "
          << string_human_readable_number(max_buffer_size) << " bytes. ("
          << string_human_readable_size(max_buffer_size) << ").";

  /* Limit to 2gb, as we shouldn't need more than that and some devices may support much more. */
  max_buffer_size = min(max_buffer_size / 2, (cl_ulong)2l*1024*1024*1024);

  size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size);
  int2 global_size = make_int2(max(round_down((int)sqrt(num_elements), 64), 64),
                               (int)sqrt(num_elements));
  VLOG(1) << "Global size: " << global_size << ".";
  return global_size;
}
void AdlPrimitivesDemo::render()
{
  int size = 1024*256;
  // int size = 1024*64;
  size = NEXTMULTIPLEOF( size, 512 );

  int* host1 = new int[size];
  int2* host2 = new int2[size];
  int4* host4 = new int4[size];
  for(int i=0; i<size; i++)
  {
    host1[i] = getRandom(0,0xffff);
    host2[i] = make_int2( host1[i], i );
    host4[i] = make_int4( host2[i].x, host2[i].y, host2[i].x, host2[i].y );
  }

  Buffer<int> buf1( m_deviceData, size );
  Buffer<int2> buf2( m_deviceData, size );
  Buffer<int4> buf4( m_deviceData, size );
  buf1.write( host1, size );
  buf2.write( host2, size );
  buf4.write( host4, size );

  Stopwatch sw( m_deviceData );

  m_nTxtLines = 0;
  sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "%d elems", size);

  // testSort( (Buffer<SortData>&)buf2, size, sw );
  testFill1( buf1, size, sw );
  testFill2( buf2, size, sw );
  testFill4( buf4, size, sw );
  test( buf2, size, sw );

  delete [] host1;
  delete [] host2;
  delete [] host4;
}
/**
 * @brief Exchanges all necessary halos between neighboring MPI processes
 *
 * @param[in] cartComm       The Cartesian MPI communicator
 * @param[in] domSize        The 2D size of the local domain
 * @param[in] topIndex       The 2D index of the calling MPI process in the topology
 * @param[in] neighbors      The list of ranks which are direct neighbors to the caller
 * @param[in] copyStream     The stream used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[in, out] devBlocks     The 2 device blocks that are updated during the Jacobi run
 * @param[in, out] devSideEdges  The 2 side edges (parallel to the Y direction) that hold the packed halo values before sending them
 * @param[in, out] devHaloLines  The 2 halo lines (parallel to the Y direction) that hold the packed halo values after receiving them
 * @param[in, out] hostSendLines The 2 host send buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @param[in, out] hostRecvLines The 2 host receive buffers that are used during the halo exchange by the normal CUDA & MPI version
 * @return The time spent during the MPI transfers
 */
double TransferAllHalos(MPI_Comm cartComm, const int2 * domSize, const int2 * topIndex, const int * neighbors,
    cudaStream_t copyStream, real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2],
    real * hostSendLines[2], real * hostRecvLines[2])
{
  real * devSendLines[2] = {devBlocks[0] + domSize->x + 3,
                            devBlocks[0] + domSize->y * (domSize->x + 2) + 1};
  real * devRecvLines[2] = {devBlocks[0] + 1,
                            devBlocks[0] + (domSize->y + 1) * (domSize->x + 2) + 1};
  int yNeighbors[2] = {neighbors[DIR_TOP], neighbors[DIR_BOTTOM]};
  int xNeighbors[2] = {neighbors[DIR_LEFT], neighbors[DIR_RIGHT]};
  int2 order = make_int2(topIndex->x % 2, topIndex->y % 2);
  double transferTime;

  // Populate the block's side edges
  CopyDevSideEdgesFromBlock(devBlocks[0], devSideEdges, domSize, neighbors, copyStream);

  // Exchange data with the top and bottom neighbors
  transferTime = MPI_Wtime();
  ExchangeHalos(cartComm, devSendLines[ order.y ], hostSendLines[0], hostRecvLines[0],
      devRecvLines[ order.y ], yNeighbors[ order.y ], domSize->x);
  ExchangeHalos(cartComm, devSendLines[1 - order.y], hostSendLines[0], hostRecvLines[0],
      devRecvLines[1 - order.y], yNeighbors[1 - order.y], domSize->x);

  SafeCudaCall(cudaStreamSynchronize(copyStream));

  // Exchange data with the left and right neighbors
  ExchangeHalos(cartComm, devSideEdges[ order.x ], hostSendLines[1], hostRecvLines[1],
      devHaloLines[ order.x ], xNeighbors[ order.x ], domSize->y);
  ExchangeHalos(cartComm, devSideEdges[1 - order.x], hostSendLines[1], hostRecvLines[1],
      devHaloLines[1 - order.x], xNeighbors[1 - order.x], domSize->y);
  transferTime = MPI_Wtime() - transferTime;

  // Copy the received halos to the device block
  CopyDevHalosToBlock(devBlocks[0], devHaloLines[0], devHaloLines[1], domSize, neighbors);

  return transferTime;
}
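ExchangeHalos itself is not part of this excerpt. As a rough illustration only, a staged device-to-host exchange of one halo line could look like the sketch below; the helper name, the synchronous copies, and MPI_REAL_TYPE are assumptions, and the sample's real helper may pack data differently or use CUDA-aware MPI.

// Hypothetical sketch of a staged halo exchange helper; not the sample's implementation.
// MPI_REAL_TYPE stands for the MPI datatype matching 'real' (e.g. MPI_FLOAT or MPI_DOUBLE).
void ExchangeHalosSketch(MPI_Comm comm, real * devSend, real * hostSend, real * hostRecv,
                         real * devRecv, int neighbor, int count)
{
    if (neighbor == MPI_PROC_NULL)
        return;                                   // no neighbor on this side

    size_t bytes = count * sizeof(real);
    cudaMemcpy(hostSend, devSend, bytes, cudaMemcpyDeviceToHost);
    MPI_Sendrecv(hostSend, count, MPI_REAL_TYPE, neighbor, 0,
                 hostRecv, count, MPI_REAL_TYPE, neighbor, 0,
                 comm, MPI_STATUS_IGNORE);
    cudaMemcpy(devRecv, hostRecv, bytes, cudaMemcpyHostToDevice);
}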
bool CPUSplitKernel::enqueue_split_kernel_data_init(const KernelDimensions &dim,
                                                    RenderTile &rtile,
                                                    int num_global_elements,
                                                    device_memory &kernel_globals,
                                                    device_memory &data,
                                                    device_memory &split_data,
                                                    device_memory &ray_state,
                                                    device_memory &queue_index,
                                                    device_memory &use_queues_flags,
                                                    device_memory &work_pool_wgs)
{
  KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
  kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

  for (int y = 0; y < dim.global_size[1]; y++) {
    for (int x = 0; x < dim.global_size[0]; x++) {
      kg->global_id = make_int2(x, y);

      device->data_init_kernel()((KernelGlobals *)kernel_globals.device_pointer,
                                 (KernelData *)data.device_pointer,
                                 (void *)split_data.device_pointer,
                                 num_global_elements,
                                 (char *)ray_state.device_pointer,
                                 rtile.start_sample,
                                 rtile.start_sample + rtile.num_samples,
                                 rtile.x,
                                 rtile.y,
                                 rtile.w,
                                 rtile.h,
                                 rtile.offset,
                                 rtile.stride,
                                 (int *)queue_index.device_pointer,
                                 dim.global_size[0] * dim.global_size[1],
                                 (char *)use_queues_flags.device_pointer,
                                 (uint *)work_pool_wgs.device_pointer,
                                 rtile.num_samples,
                                 (float *)rtile.buffer);
    }
  }

  return true;
}
virtual int2 split_kernel_global_size(device_memory& kg,
                                      device_memory& data,
                                      DeviceTask * /*task*/)
{
  cl_device_type type = OpenCLInfo::get_device_type(device->cdDevice);
  /* Use small global size on CPU devices as it seems to be much faster. */
  if(type == CL_DEVICE_TYPE_CPU) {
    VLOG(1) << "Global size: (64, 64).";
    return make_int2(64, 64);
  }

  cl_ulong max_buffer_size;
  clGetDeviceInfo(device->cdDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_ulong), &max_buffer_size, NULL);
  VLOG(1) << "Maximum device allocation size: "
          << string_human_readable_number(max_buffer_size) << " bytes. ("
          << string_human_readable_size(max_buffer_size) << ").";

  size_t num_elements = max_elements_for_max_buffer_size(kg, data, max_buffer_size / 2);
  int2 global_size = make_int2(round_down((int)sqrt(num_elements), 64),
                               (int)sqrt(num_elements));
  VLOG(1) << "Global size: " << global_size << ".";
  return global_size;
}
virtual bool enqueue(const KernelDimensions &dim, device_memory &kernel_globals, device_memory &data)
{
  if (!func) {
    return false;
  }

  KernelGlobals *kg = (KernelGlobals *)kernel_globals.device_pointer;
  kg->global_size = make_int2(dim.global_size[0], dim.global_size[1]);

  for (int y = 0; y < dim.global_size[1]; y++) {
    for (int x = 0; x < dim.global_size[0]; x++) {
      kg->global_id = make_int2(x, y);

      func(kg, (KernelData *)data.device_pointer);
    }
  }

  return true;
}
/**
 * @brief This allocates and initializes all the relevant data buffers before the Jacobi run
 *
 * @param[in] topSizeY   The size of the topology in the Y direction
 * @param[in] topIdxY    The Y index of the calling MPI process in the topology
 * @param[in] domSize    The size of the local domain (for which only the current MPI process is responsible)
 * @param[in] neighbors  The neighbor ranks, according to the topology
 * @param[out] copyStream    The stream (created here) used to overlap top & bottom halo exchange with side halo copy to host memory
 * @param[out] devBlocks     The 2 device blocks that will be updated during the Jacobi run
 * @param[out] devSideEdges  The 2 side edges (parallel to the Y direction) that will hold the packed halo values before sending them
 * @param[out] devHaloLines  The 2 halo lines (parallel to the Y direction) that will hold the packed halo values after receiving them
 * @param[out] hostSendLines The 2 host send buffers that will be used during the halo exchange by the normal CUDA & MPI version
 * @param[out] hostRecvLines The 2 host receive buffers that will be used during the halo exchange by the normal CUDA & MPI version
 * @param[out] devResidue    The global device residue, which will be updated after every Jacobi iteration
 */
void InitializeDataChunk(int topSizeY, int topIdxY, const int2 * domSize, const int * neighbors,
    cudaStream_t * copyStream, real * devBlocks[2], real * devSideEdges[2], real * devHaloLines[2],
    real * hostSendLines[2], real * hostRecvLines[2], real ** devResidue)
{
  const real PI = (real)3.1415926535897932384626;
  const real E_M_PI = (real)exp(-PI);
  size_t blockBytes = (domSize->x + 2) * (domSize->y + 2) * sizeof(real);
  size_t sideLineBytes = domSize->y * sizeof(real);
  int2 borderBounds = make_int2(topIdxY * domSize->y, (topIdxY + 1) * domSize->y);
  int borderSpan = domSize->y * topSizeY - 1;
  real * hostBlock = SafeHostAlloc(blockBytes);

  // Clearing the block also sets the boundary conditions for top and bottom edges to 0
  memset(hostBlock, 0, blockBytes);
  InitExchangeBuffers(hostSendLines, hostRecvLines, 0, domSize->x * sizeof(real));
  InitExchangeBuffers(hostSendLines, hostRecvLines, 1, sideLineBytes);

  // Set the boundary conditions for the left edge
  if (!HasNeighbor(neighbors, DIR_LEFT))
  {
    for (int j = borderBounds.x, idx = domSize->x + 3; j < borderBounds.y; ++j, idx += domSize->x + 2)
    {
      hostBlock[idx] = (real)sin(PI * j / borderSpan);
    }
  }

  // Set the boundary conditions for the right edge
  if (!HasNeighbor(neighbors, DIR_RIGHT))
  {
    for (int j = borderBounds.x, idx = ((domSize->x + 2) << 1) - 2; j < borderBounds.y; ++j, idx += domSize->x + 2)
    {
      hostBlock[idx] = (real)sin(PI * j / borderSpan) * E_M_PI;
    }
  }

  // Perform device memory allocation and initialization
  for (int i = 0; i < 2; ++i)
  {
    SafeCudaCall(cudaMalloc((void **)&devBlocks[i], blockBytes));
    SafeCudaCall(cudaMalloc((void **)&devSideEdges[i], sideLineBytes));
    SafeCudaCall(cudaMalloc((void **)&devHaloLines[i], sideLineBytes));
    SafeCudaCall(cudaMemset(devSideEdges[i], 0, sideLineBytes));
  }

  SafeCudaCall(cudaMalloc((void **)devResidue, sizeof(real)));
  SafeCudaCall(cudaMemcpy(devBlocks[0], hostBlock, blockBytes, cudaMemcpyHostToDevice));
  SafeCudaCall(cudaMemcpy(devBlocks[1], devBlocks[0], blockBytes, cudaMemcpyDeviceToDevice));
  SafeCudaCall(cudaStreamCreate(copyStream));
  SafeHostFree(hostBlock);
}
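The boundary loops above index into a padded block of (domSize->x + 2) by (domSize->y + 2) values stored row-major; the small helper below only makes that index arithmetic explicit (BlockIndex is illustrative, not part of the sample).

// Row-major index into the padded (domSize.x + 2) x (domSize.y + 2) block.
// Left-edge loop start:  domSize.x + 3            == 1 * (domSize.x + 2) + 1          -> row 1, first interior column
// Right-edge loop start: 2 * (domSize.x + 2) - 2  == 1 * (domSize.x + 2) + domSize.x  -> row 1, last interior column
// Both loops then step by (domSize.x + 2), i.e. one padded row per iteration.
static inline int BlockIndex(const int2 * domSize, int x, int y)
{
    return y * (domSize->x + 2) + x;
}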
void AdlPrimitivesDemo::testFill2( Buffer<int2>& buf, int size, Stopwatch& sw )
{
  MyFill::Data* sortData = MyFill::allocate( m_deviceData );

  sw.start();
  MyFill::execute( sortData, buf, make_int2(12, 13), size );
  sw.stop();

  MyFill::deallocate( sortData );

  {
    float t = sw.getMs();
    sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "Fill int2: %3.2fGB/s (%3.2fms)",
        size/t/1000/1000*8, t);
  }
}
/**
 * @brief Generates the 2D topology and establishes the neighbor relationships between MPI processes
 *
 * @param[in, out] rank   The rank of the calling MPI process
 * @param[in] size        The total number of MPI processes available
 * @param[in] topSize     The desired topology size (this must match the number of available MPI processes)
 * @param[out] neighbors  The list that will be populated with the direct neighbors of the calling MPI process
 * @param[out] topIndex   The 2D index that the calling MPI process will have in the topology
 * @param[out] cartComm   The Cartesian MPI communicator
 * @return STATUS_OK if the topology was applied, STATUS_ERR otherwise
 */
int ApplyTopology(int * rank, int size, const int2 * topSize, int * neighbors, int2 * topIndex, MPI_Comm * cartComm)
{
  int topologySize = topSize->x * topSize->y;
  int dimSize[2] = {topSize->x, topSize->y};
  int usePeriods[2] = {0, 0}, newCoords[2];
  int oldRank = * rank;

  // The number of MPI processes must fill the topology
  if (size != topologySize)
  {
    OneErrPrintf(* rank == MPI_MASTER_RANK, "Error: The number of MPI processes (%d) doesn't match "
        "the topology size (%d).\n", size, topologySize);
    return STATUS_ERR;
  }

  // Create a Cartesian communicator
  MPI_Cart_create(MPI_COMM_WORLD, 2, dimSize, usePeriods, 1, cartComm);

  // Update the rank to be relevant to the new communicator
  MPI_Comm_rank(* cartComm, rank);

  if ((* rank) != oldRank)
  {
    printf("Rank change: from %d to %d\n", oldRank, * rank);
  }

  // Obtain the 2D coordinates in the new communicator
  MPI_Cart_coords(* cartComm, * rank, 2, newCoords);
  * topIndex = make_int2(newCoords[0], newCoords[1]);

  // Obtain the direct neighbor ranks
  MPI_Cart_shift(* cartComm, 0, 1, neighbors + DIR_LEFT, neighbors + DIR_RIGHT);
  MPI_Cart_shift(* cartComm, 1, 1, neighbors + DIR_TOP, neighbors + DIR_BOTTOM);

  // Setting the device here will have effect only for the normal CUDA & MPI version
  SetDeviceAfterInit(* rank);

  return STATUS_OK;
}
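A minimal, hypothetical call site for ApplyTopology, assuming an 8-process run arranged as a 4 x 2 grid; the MPI boilerplate and the solver call are placeholders, not taken from the sample.

// Hypothetical driver; DIR_* sizes, STATUS_OK and make_int2 come from the snippets above.
int main(int argc, char ** argv)
{
    int rank, size, neighbors[4];
    int2 topIndex;
    MPI_Comm cartComm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int2 topSize = make_int2(4, 2);   // must match the number of launched processes (8 here)
    if (ApplyTopology(&rank, size, &topSize, neighbors, &topIndex, &cartComm) != STATUS_OK)
    {
        MPI_Finalize();
        return 1;
    }

    // ... run the Jacobi solver using cartComm, topIndex and neighbors ...

    MPI_Finalize();
    return 0;
}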
int2 KITTIReader::List2Int2(const QStringList & n) { return make_int2((int)n.at(0).trimmed().toFloat(), (int)n.at(1).trimmed().toFloat()); }
__host__ int2 make_int2( const Vector2i& v ) { return make_int2( v.x, v.y ); }
__host__ __device__ inline int2 operator/( const int a, const int2 b ) { return make_int2( a / b.x, a / b.y ); }
__host__ __device__ inline int2 operator/( const int2 a, const int b ) { return make_int2( a.x / b, a.y / b ); }
__host__ __device__ inline int2 operator*( const int2 a, const int b ) { return make_int2( a.x * b, a.y * b ); }
/// Subtract operator __host__ __device__ inline int2 operator-( const int2 a, const int2 b ) { return make_int2( a.x - b.x, a.y - b.y ); }
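Together with make_int2, these overloads give component-wise arithmetic on int2; a small illustrative usage follows (the values are arbitrary).

// Usage example for the int2 helpers above (hypothetical values).
void tile_math_example()
{
    int2 image_size = make_int2(1920, 1080);

    int2 tiles   = image_size / 64;        // operator/(int2, int)  -> (30, 16)
    int2 covered = tiles * 64;             // operator*(int2, int)  -> (1920, 1024)
    int2 rest    = image_size - covered;   // operator-(int2, int2) -> (0, 56)
    (void)rest;
}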
/* If sliced is false, splits the image into tiles and assigns an equal number of tiles to every render device.
 * If sliced is true, slices the image into as many pieces as there are devices rendering it. */
int TileManager::gen_tiles(bool sliced)
{
  int resolution = state.resolution_divider;
  int image_w = max(1, params.width/resolution);
  int image_h = max(1, params.height/resolution);
  int2 center = make_int2(image_w/2, image_h/2);

  state.tiles.clear();

  int num_logical_devices = preserve_tile_device? num_devices: 1;
  int num = min(image_h, num_logical_devices);
  int slice_num = sliced? num: 1;
  int tile_index = 0;

  state.tiles.clear();
  state.tiles.resize(num);
  vector<list<Tile> >::iterator tile_list = state.tiles.begin();

  if(tile_order == TILE_HILBERT_SPIRAL) {
    assert(!sliced);

    /* Size of blocks in tiles, must be a power of 2 */
    const int hilbert_size = (max(tile_size.x, tile_size.y) <= 12)? 8: 4;

    int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
    int tile_h = (tile_size.y >= image_h)? 1: (image_h + tile_size.y - 1)/tile_size.y;
    int tiles_per_device = (tile_w * tile_h + num - 1) / num;
    int cur_device = 0, cur_tiles = 0;

    int2 block_size = tile_size * make_int2(hilbert_size, hilbert_size);
    /* Number of blocks to fill the image */
    int blocks_x = (block_size.x >= image_w)? 1: (image_w + block_size.x - 1)/block_size.x;
    int blocks_y = (block_size.y >= image_h)? 1: (image_h + block_size.y - 1)/block_size.y;
    int n = max(blocks_x, blocks_y) | 0x1; /* Side length of the spiral (must be odd) */
    /* Offset of spiral (to keep it centered) */
    int2 offset = make_int2((image_w - n*block_size.x)/2, (image_h - n*block_size.y)/2);
    offset = (offset / tile_size) * tile_size; /* Round to tile border. */

    int2 block = make_int2(0, 0); /* Current block */
    SpiralDirection prev_dir = DIRECTION_UP, dir = DIRECTION_UP;
    for(int i = 0;;) {
      /* Generate the tiles in the current block. */
      for(int hilbert_index = 0; hilbert_index < hilbert_size*hilbert_size; hilbert_index++) {
        int2 tile, hilbert_pos = hilbert_index_to_pos(hilbert_size, hilbert_index);
        /* Rotate block according to spiral direction. */
        if(prev_dir == DIRECTION_UP && dir == DIRECTION_UP) {
          tile = make_int2(hilbert_pos.y, hilbert_pos.x);
        }
        else if(dir == DIRECTION_LEFT || prev_dir == DIRECTION_LEFT) {
          tile = hilbert_pos;
        }
        else if(dir == DIRECTION_DOWN) {
          tile = make_int2(hilbert_size-1-hilbert_pos.y, hilbert_size-1-hilbert_pos.x);
        }
        else {
          tile = make_int2(hilbert_size-1-hilbert_pos.x, hilbert_size-1-hilbert_pos.y);
        }

        int2 pos = block*block_size + tile*tile_size + offset;
        /* Only add tiles which are in the image (tiles outside of the image
         * can be generated since the spiral is always square). */
        if(pos.x >= 0 && pos.y >= 0 && pos.x < image_w && pos.y < image_h) {
          int w = min(tile_size.x, image_w - pos.x);
          int h = min(tile_size.y, image_h - pos.y);
          tile_list->push_front(Tile(tile_index, pos.x, pos.y, w, h, cur_device));
          cur_tiles++;
          tile_index++;

          if(cur_tiles == tiles_per_device) {
            tile_list++;
            cur_tiles = 0;
            cur_device++;
          }
        }
      }

      /* Stop as soon as the spiral has reached the center block. */
      if(block.x == (n-1)/2 && block.y == (n-1)/2)
        break;

      /* Advance to next block. */
      prev_dir = dir;
      switch(dir) {
        case DIRECTION_UP:
          block.y++;
          if(block.y == (n-i-1)) {
            dir = DIRECTION_LEFT;
          }
          break;
        case DIRECTION_LEFT:
          block.x++;
          if(block.x == (n-i-1)) {
            dir = DIRECTION_DOWN;
          }
          break;
        case DIRECTION_DOWN:
          block.y--;
          if(block.y == i) {
            dir = DIRECTION_RIGHT;
          }
          break;
        case DIRECTION_RIGHT:
          block.x--;
          if(block.x == i+1) {
            dir = DIRECTION_UP;
            i++;
          }
          break;
      }
    }
    return tile_index;
  }

  for(int slice = 0; slice < slice_num; slice++) {
    int slice_y = (image_h/slice_num)*slice;
    int slice_h = (slice == slice_num-1)? image_h - slice*(image_h/slice_num): image_h/slice_num;

    int tile_w = (tile_size.x >= image_w)? 1: (image_w + tile_size.x - 1)/tile_size.x;
    int tile_h = (tile_size.y >= slice_h)? 1: (slice_h + tile_size.y - 1)/tile_size.y;

    int tiles_per_device = (tile_w * tile_h + num - 1) / num;
    int cur_device = 0, cur_tiles = 0;

    for(int tile_y = 0; tile_y < tile_h; tile_y++) {
      for(int tile_x = 0; tile_x < tile_w; tile_x++, tile_index++) {
        int x = tile_x * tile_size.x;
        int y = tile_y * tile_size.y;
        int w = (tile_x == tile_w-1)? image_w - x: tile_size.x;
        int h = (tile_y == tile_h-1)? slice_h - y: tile_size.y;

        tile_list->push_back(Tile(tile_index, x, y + slice_y, w, h, sliced? slice: cur_device));

        if(!sliced) {
          cur_tiles++;

          if(cur_tiles == tiles_per_device) {
            /* Tiles are already generated in Bottom-to-Top order, so no sort is necessary in that case. */
            if(tile_order != TILE_BOTTOM_TO_TOP) {
              tile_list->sort(TileComparator(tile_order, center));
            }
            tile_list++;
            cur_tiles = 0;
            cur_device++;
          }
        }
      }
    }

    if(sliced) {
      tile_list++;
    }
  }

  return tile_index;
}
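hilbert_index_to_pos is referenced above but not included here; the conversion it presumably performs is the standard Hilbert-curve index-to-position mapping, sketched below (a generic version, the actual Cycles helper may differ in orientation conventions).

/* Sketch of a standard Hilbert-curve index -> (x, y) conversion for an n x n grid
 * (n a power of two). Not the real hilbert_index_to_pos, just the textbook mapping. */
static int2 hilbert_index_to_pos_sketch(int n, int d)
{
  int2 r = make_int2(0, 0);
  for(int s = 1; s < n; s *= 2) {
    int rx = 1 & (d/2);
    int ry = 1 & (d ^ rx);
    /* Rotate/flip the quadrant. */
    if(ry == 0) {
      if(rx == 1) {
        r.x = s-1 - r.x;
        r.y = s-1 - r.y;
      }
      int t = r.x; r.x = r.y; r.y = t;
    }
    r.x += s*rx;
    r.y += s*ry;
    d /= 4;
  }
  return r;
}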
int2 CPUSplitKernel::split_kernel_local_size() { return make_int2(1, 1); }
int2 CPUSplitKernel::split_kernel_global_size(device_memory & /*kg*/, device_memory & /*data*/, DeviceTask * /*task*/) { return make_int2(1, 1); }
void getBlobStridesAndReceptiveFields(caffe::Net<float> & net,
                                      const std::vector<std::string> & blobsToVisualize,
                                      std::map<std::string,int> & strides,
                                      std::map<std::string,int2> & receptiveFields) {

    const int nInputBlobs = net.input_blob_indices().size();
    if (nInputBlobs == 0) {
        std::cerr << "there are no input blobs - where to start?" << std::endl;
        return;
    }

    std::string inputBlobName(net.blob_names()[net.input_blob_indices()[0]]);
    strides[inputBlobName] = 1;
    receptiveFields[inputBlobName] = make_int2(1,1);

    boost::shared_ptr<caffe::Blob<float> > inputBlob = net.blob_by_name(inputBlobName);
    int2 inputSize = make_int2(inputBlob->width(),inputBlob->height());

    const int nLayers = net.layers().size();
    for (int i=0; i<nLayers; ++i) {

        boost::shared_ptr<caffe::Layer<float> > layer = net.layers()[i];
        std::string layerType(layer->type());

        bool isConv = layerType == std::string("Convolution");
        bool isPool = layerType == std::string("Pooling");

        if (isConv || isPool) {

            caffe::Blob<float> * inputBlob = net.bottom_vecs()[i][0];
            int inputBlobNum = getBlobNumber(net,inputBlob);
            assert(inputBlobNum >= 0);
            std::string inputBlobName = net.blob_names()[inputBlobNum];

            caffe::Blob<float> * outputBlob = net.top_vecs()[i][0];
            int outputBlobNum = getBlobNumber(net,outputBlob);
            assert(outputBlobNum >= 0);
            std::string outputBlobName = net.blob_names()[outputBlobNum];

            int2 kernelSize, stride;
            if (isConv) {
                caffe::ConvolutionParameter convParam = layer->layer_param().convolution_param();
                kernelSize = convParam.has_kernel_size() ? make_int2(convParam.kernel_size())
                                                         : make_int2(convParam.kernel_w(),convParam.kernel_h());
                stride = convParam.has_stride() ? make_int2(convParam.stride())
                                                : make_int2(convParam.stride_w(),convParam.stride_h());
            } else if (isPool) {
                caffe::PoolingParameter poolParam = layer->layer_param().pooling_param();
                kernelSize = poolParam.has_kernel_size() ? make_int2(poolParam.kernel_size())
                                                         : make_int2(poolParam.kernel_w(),poolParam.kernel_h());
                stride = poolParam.has_stride() ? make_int2(poolParam.stride())
                                                : make_int2(poolParam.stride_w(),poolParam.stride_h());
            }

            if (strides.find(inputBlobName) != strides.end()) {
                const int strideIn = strides[inputBlobName];
                const int2 fieldIn = receptiveFields[inputBlobName];
                strides[outputBlobName] = strideIn*stride.x;
                receptiveFields[outputBlobName] = strideIn*(kernelSize - make_int2(1)) + fieldIn;
            }

        } else if (layerType == std::string("InnerProduct")) {

            caffe::Blob<float> * inputBlob = net.bottom_vecs()[i][0];
            int inputBlobNum = getBlobNumber(net,inputBlob);
            assert(inputBlobNum >= 0);
            std::string inputBlobName = net.blob_names()[inputBlobNum];

            caffe::Blob<float> * outputBlob = net.top_vecs()[i][0];
            int outputBlobNum = getBlobNumber(net,outputBlob);
            assert(outputBlobNum >= 0);
            std::string outputBlobName = net.blob_names()[outputBlobNum];

            if (strides.find(inputBlobName) != strides.end()) {
                strides[outputBlobName] = strides[inputBlobName];
                receptiveFields[outputBlobName] = inputSize;
            }
        }
    }
}
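The update in the Convolution/Pooling branch follows the usual recurrences stride_out = stride_in * s and rf_out = stride_in * (k - 1) + rf_in. A short worked example for a hypothetical conv(k = 5, s = 2) followed by pool(k = 2, s = 2):

// Worked example of the stride / receptive-field recurrence above (hypothetical layers).
#include <cstdio>

int main() {
    int stride = 1, rf = 1;          // input blob: stride 1, receptive field 1x1

    // conv: kernel 5, stride 2  ->  rf = 1*(5-1) + 1 = 5, stride = 1*2 = 2
    rf = stride * (5 - 1) + rf;
    stride = stride * 2;

    // pool: kernel 2, stride 2  ->  rf = 2*(2-1) + 5 = 7, stride = 2*2 = 4
    rf = stride * (2 - 1) + rf;
    stride = stride * 2;

    std::printf("stride = %d, receptive field = %dx%d\n", stride, rf, rf);  // stride = 4, 7x7
    return 0;
}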
virtual int2 split_kernel_local_size() { return make_int2(64, 1); }
int main() {
  //Checks for memory leaks in debug mode
  _CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);

  glfwInit();
  glfwWindowHint(GLFW_CONTEXT_VERSION_MAJOR, 4);
  glfwWindowHint(GLFW_CONTEXT_VERSION_MINOR, 4);
  glfwWindowHint(GLFW_OPENGL_PROFILE, GLFW_OPENGL_CORE_PROFILE);
  glfwWindowHint(GLFW_RESIZABLE, GL_FALSE);

  GLFWwindow* window = glfwCreateWindow(width, height, "Hikari", nullptr, nullptr);
  glfwMakeContextCurrent(window);

  //Set callbacks for keyboard and mouse
  glfwSetInputMode(window, GLFW_CURSOR, GLFW_CURSOR_DISABLED);

  glewExperimental = GL_TRUE;
  glewInit();
  glGetError();

  //Define the viewport dimensions
  glViewport(0, 0, width, height);

  //Initialize cuda->opengl context
  cudaCheck(cudaGLSetGLDevice(0));
  cudaGraphicsResource *resource;

  //Create a texture to store ray tracing result
  GLuint tex;
  glActiveTexture(GL_TEXTURE0);
  glGenTextures(1, &tex);
  glBindTexture(GL_TEXTURE_2D, tex);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
  glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
  glTexImage2D(GL_TEXTURE_2D, 0, GL_RGBA32F, width, height, 0, GL_RGBA, GL_FLOAT, NULL);
  cudaCheck(cudaGraphicsGLRegisterImage(&resource, tex, GL_TEXTURE_2D, cudaGraphicsMapFlagsWriteDiscard));
  glBindTexture(GL_TEXTURE_2D, 0);

  Shader final = Shader("fsQuad.vert", "fsQuad.frag");
  FullscreenQuad fsQuad = FullscreenQuad();

  float4* buffer;
  cudaCheck(cudaMalloc((void**)&buffer, width * height * sizeof(float4)));
  cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4)));

  //Mesh
  float3 offset = make_float3(0);
  float3 scale = make_float3(15);
  Mesh cBox("objs/Avent", 0, scale, offset);
  offset = make_float3(0, 55, 0);
  scale = make_float3(100);
  Mesh light("objs/plane", (int)cBox.triangles.size(), scale, offset);

  cBox.triangles.insert(cBox.triangles.end(), light.triangles.begin(), light.triangles.end());
  cBox.aabbs.insert(cBox.aabbs.end(), light.aabbs.begin(), light.aabbs.end());
  std::cout << "Num triangles: " << cBox.triangles.size() << std::endl;
  cBox.root = AABB(fminf(cBox.root.minBounds, light.root.minBounds), fmaxf(cBox.root.maxBounds, light.root.maxBounds));

  BVH bvh(cBox.aabbs, cBox.triangles, cBox.root);

  Camera cam(make_float3(14, 15, 80), make_int2(width, height), 45.0f, 0.04f, 80.0f);
  Camera* dCam;
  cudaCheck(cudaMalloc((void**)&dCam, sizeof(Camera)));
  cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice));

  cudaCheck(cudaGraphicsMapResources(1, &resource, 0));
  cudaArray* pixels;
  cudaCheck(cudaGraphicsSubResourceGetMappedArray(&pixels, resource, 0, 0));
  cudaResourceDesc viewCudaArrayResourceDesc;
  viewCudaArrayResourceDesc.resType = cudaResourceTypeArray;
  viewCudaArrayResourceDesc.res.array.array = pixels;
  cudaSurfaceObject_t viewCudaSurfaceObject;
  cudaCheck(cudaCreateSurfaceObject(&viewCudaSurfaceObject, &viewCudaArrayResourceDesc));
  cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0));

  while (!glfwWindowShouldClose(window)) {
    float currentFrame = float(glfwGetTime());
    deltaTime = currentFrame - lastFrame;
    lastFrame = currentFrame;

    //Check and call events
    glfwPollEvents();
    handleInput(window, cam);

    if (cam.moved) {
      frameNumber = 0;
      cudaCheck(cudaMemset(buffer, 0, width * height * sizeof(float4)));
    }
    cam.rebuildCamera();
    cudaCheck(cudaMemcpy(dCam, &cam, sizeof(Camera), cudaMemcpyHostToDevice));

    frameNumber++;
    if (frameNumber < 20000) {
      cudaCheck(cudaGraphicsMapResources(1, &resource, 0));

      std::chrono::time_point<std::chrono::system_clock> start, end;
      start = std::chrono::system_clock::now();

      render(cam, dCam, viewCudaSurfaceObject, buffer, bvh.dTriangles, bvh.dNodes, frameNumber, cam.moved);

      end = std::chrono::system_clock::now();
      std::chrono::duration<double> elapsed = end - start;
      std::cout << "Frame: " << frameNumber << " --- Elapsed time: " << elapsed.count() << "s\n";

      cudaCheck(cudaGraphicsUnmapResources(1, &resource, 0));
    }
    cam.moved = false;

    glUseProgram(final.program);
    glActiveTexture(GL_TEXTURE0);
    glBindTexture(GL_TEXTURE_2D, tex);
    glClear(GL_COLOR_BUFFER_BIT);
    final.setUniformi("tRender", 0);
    fsQuad.render();
    //std::cout << glGetError() << std::endl;

    //Swap the buffers
    glfwSwapBuffers(window);
    glfwSetCursorPos(window, lastX, lastY);
  }
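cudaCheck is used throughout this program but its definition is not included in the excerpt; a common definition of such an error-checking macro looks like the sketch below (an assumption, not necessarily the project's version).

// Hypothetical cudaCheck definition; the project's real macro may differ.
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

#define cudaCheck(call)                                                        \
    do {                                                                       \
        cudaError_t err_ = (call);                                             \
        if (err_ != cudaSuccess) {                                             \
            std::fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                         cudaGetErrorString(err_), __FILE__, __LINE__);        \
            std::exit(EXIT_FAILURE);                                           \
        }                                                                      \
    } while (0)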
void RGBDCamera::update(const RawFrame* this_frame) {
  //Check the timestamp, and skip if we have already seen this frame
  if (this_frame->timestamp <= latest_stamp_) {
    return;
  } else {
    latest_stamp_ = this_frame->timestamp;
  }

  //Apply bilateral filter to incoming depth
  uint16_t* filtered_depth;
  cudaMalloc((void**)&filtered_depth, this_frame->width*this_frame->height*sizeof(uint16_t));
  bilateralFilter(this_frame->depth, filtered_depth, this_frame->width, this_frame->height);

  //Convert the input color data to intensity
  float* temp_intensity;
  cudaMalloc((void**)&temp_intensity, this_frame->width*this_frame->height*sizeof(float));
  colorToIntensity(this_frame->color, temp_intensity, this_frame->width*this_frame->height);

  //Create pyramids
  for (int i = 0; i < PYRAMID_DEPTH; i++) {
    //Fill in sizes the first two times through
    if (pass_ < 2) {
      current_icp_frame_[i] = new ICPFrame(this_frame->width/pow(2,i), this_frame->height/pow(2,i));
      current_rgbd_frame_[i] = new RGBDFrame(this_frame->width/pow(2,i), this_frame->height/pow(2,i));
    }

    //Add ICP data
    generateVertexMap(filtered_depth, current_icp_frame_[i]->vertex, current_icp_frame_[i]->width,
        current_icp_frame_[i]->height, focal_length_, make_int2(this_frame->width, this_frame->height));
    generateNormalMap(current_icp_frame_[i]->vertex, current_icp_frame_[i]->normal,
        current_icp_frame_[i]->width, current_icp_frame_[i]->height);

    //Add RGBD data
    cudaMemcpy(current_rgbd_frame_[i]->vertex, current_icp_frame_[i]->vertex,
        current_rgbd_frame_[i]->width*current_rgbd_frame_[i]->height*sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
    cudaMemcpy(current_rgbd_frame_[i]->intensity, temp_intensity,
        current_rgbd_frame_[i]->width*current_rgbd_frame_[i]->height*sizeof(float), cudaMemcpyDeviceToDevice);

    //Downsample depth and color if not the last iteration
    if (i != (PYRAMID_DEPTH-1)) {
      subsampleDepth(filtered_depth, current_icp_frame_[i]->width, current_icp_frame_[i]->height);
      subsample(temp_intensity, current_rgbd_frame_[i]->width, current_rgbd_frame_[i]->height);
      cudaDeviceSynchronize();
    }
  }

  //Clear the filtered depth and temporary color since they are no longer needed
  cudaFree(filtered_depth);
  cudaFree(temp_intensity);

  if (pass_ >= 1) {
    glm::mat4 update_trans(1.0f);

    //Loop through pyramids backwards (coarse first)
    for (int i = PYRAMID_DEPTH - 1; i >= 0; i--) {
      //Get a copy of the ICP frame for this pyramid level
      ICPFrame icp_f(current_icp_frame_[i]->width, current_icp_frame_[i]->height);
      cudaMemcpy(icp_f.vertex, current_icp_frame_[i]->vertex,
          icp_f.width*icp_f.height*sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
      cudaMemcpy(icp_f.normal, current_icp_frame_[i]->normal,
          icp_f.width*icp_f.height*sizeof(glm::vec3), cudaMemcpyDeviceToDevice);

      //Get a copy of the RGBD frame for this pyramid level
      //RGBDFrame rgbd_f(current_rgbd_frame_[i]->width, current_rgbd_frame_[i]->height);
      //cudaMemcpy(rgbd_f.vertex, current_rgbd_frame_[i]->vertex, rgbd_f.width*rgbd_f.height*sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
      //cudaMemcpy(rgbd_f.intensity, current_rgbd_frame_[i]->intensity, rgbd_f.width*rgbd_f.height*sizeof(float), cudaMemcpyDeviceToDevice);

      //Apply the most recent update to the points/normals
      if (i < (PYRAMID_DEPTH-1)) {
        transformVertexMap(icp_f.vertex, update_trans, icp_f.width*icp_f.height);
        transformNormalMap(icp_f.normal, update_trans, icp_f.width*icp_f.height);
        cudaDeviceSynchronize();
      }

      //Loop through iterations
      for (int j = 0; j < PYRAMID_ITERS[i]; j++) {
        //Get the Geometric ICP cost values
        float A1[6 * 6];
        float b1[6];
        computeICPCost2(last_icp_frame_[i], icp_f, A1, b1);

        //Get the Photometric RGB-D cost values
        //float A2[6*6];
        //float b2[6];
        //compueRGBDCost(last_rgbd_frame_, rgbd_f, A2, b2);

        //Combine the two
        //for (size_t k = 0; k < 6; k++) {
        //  for (size_t l = 0; l < 6; l++) {
        //    A1[6 * k + l] += A2[6 * k + l];
        //  }
        //  b1[k] += b2[k];
        //}

        //Solve for the optimized camera transformation
        float x[6];
        solveCholesky(6, A1, b1, x);

        //Check for NaN/divergence
        if (isnan(x[0]) || isnan(x[1]) || isnan(x[2]) || isnan(x[3]) || isnan(x[4]) || isnan(x[5])) {
          printf("Camera tracking is lost.\n");
          break;
        }

        //Update position/orientation of the camera
        glm::mat4 this_trans = glm::rotate(glm::mat4(1.0f), -x[2] * 180.0f / 3.14159f, glm::vec3(0.0f, 0.0f, 1.0f))
            * glm::rotate(glm::mat4(1.0f), -x[1] * 180.0f / 3.14159f, glm::vec3(0.0f, 1.0f, 0.0f))
            * glm::rotate(glm::mat4(1.0f), -x[0] * 180.0f / 3.14159f, glm::vec3(1.0f, 0.0f, 0.0f))
            * glm::translate(glm::mat4(1.0f), glm::vec3(x[3], x[4], x[5]));
        update_trans = this_trans * update_trans;

        //Apply the update to the points/normals
        if (j < (PYRAMID_ITERS[i] - 1)) {
          transformVertexMap(icp_f.vertex, this_trans, icp_f.width*icp_f.height);
          transformNormalMap(icp_f.normal, this_trans, icp_f.width*icp_f.height);
          cudaDeviceSynchronize();
        }
      }
    }

    //Update the global transform with the result
    position_ = glm::vec3(glm::vec4(position_, 1.0f) * update_trans);
    orientation_ = glm::mat3(glm::mat4(orientation_) * update_trans);
  }

  if (pass_ < 2) {
    pass_++;
  }

  //Swap current and last frames
  for (int i = 0; i < PYRAMID_DEPTH; i++) {
    ICPFrame* temp = current_icp_frame_[i];
    current_icp_frame_[i] = last_icp_frame_[i];
    last_icp_frame_[i] = temp;
    //TODO: Longterm, only RGBD should do this. ICP should not swap, as last_frame should be updated by a different function
    RGBDFrame* temp2 = current_rgbd_frame_[i];
    current_rgbd_frame_[i] = last_rgbd_frame_[i];
    last_rgbd_frame_[i] = temp2;
  }
}
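solveCholesky(6, A1, b1, x) solves the 6 x 6 normal equations built from the ICP cost; a generic textbook Cholesky solve of that form is sketched below (not the project's implementation).

// Generic Cholesky solve for a symmetric positive-definite system A x = b,
// with A stored n x n row-major. Illustrative only, not the project's solveCholesky.
#include <cmath>
#include <vector>

void solveCholeskySketch(int n, const float* A, const float* b, float* x) {
    std::vector<float> L(n * n, 0.0f);

    // Factor A = L * L^T (lower-triangular L).
    for (int i = 0; i < n; i++) {
        for (int j = 0; j <= i; j++) {
            float sum = A[i * n + j];
            for (int k = 0; k < j; k++)
                sum -= L[i * n + k] * L[j * n + k];
            L[i * n + j] = (i == j) ? std::sqrt(sum) : sum / L[j * n + j];
        }
    }

    // Forward substitution: L y = b.
    std::vector<float> y(n);
    for (int i = 0; i < n; i++) {
        float sum = b[i];
        for (int k = 0; k < i; k++)
            sum -= L[i * n + k] * y[k];
        y[i] = sum / L[i * n + i];
    }

    // Back substitution: L^T x = y.
    for (int i = n - 1; i >= 0; i--) {
        float sum = y[i];
        for (int k = i + 1; k < n; k++)
            sum -= L[k * n + i] * x[k];
        x[i] = sum / L[i * n + i];
    }
}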
int2 operator+ (const int2& a, const int2& b) { int2 sum = make_int2(a.x + b.x, a.y + b.y); return sum; }
/**
 * @brief Computes the inverse 5/3 wavelet transform.
 *
 * @param idata    Input data.
 * @param odata    Output data.
 * @param img_size Struct with input image width and height.
 * @param step     Struct with output image width and height.
 */
__global__ void iwt53_new(const float *idata, float *odata, const int2 img_size, const int2 step)
{
    // shared memory for part of the signal
    __shared__ int shared[MEMSIZE][MEMSIZE + 1];

    // LL subband dimensions - ceil of input image dimensions
    // const int2 ll_sub = make_int2((int) ceilf(img_size.x / 2.0), (int) ceilf(img_size.y / 2.0));
    const int2 ll_sub = make_int2((img_size.x + 1) >> 1, (img_size.y + 1) >> 1);

    // Input x, y block dimension
    // Width
    // bidx.x - left block
    // bidx.y - right block
    const int2 bidx = make_int2(blockIdx.x * BLOCKSIZEX, ll_sub.x + blockIdx.x * BLOCKSIZEX);
    // Height
    // bidy.x - top block
    // bidy.y - bottom block
    const int2 bidy = make_int2(blockIdx.y * BLOCKSIZEY, ll_sub.y + blockIdx.y * BLOCKSIZEY);

    // Even thread id
    const short tidx2 = threadIdx.x * 2;
    // thread id
    short2 tid = make_short2(threadIdx.x, threadIdx.y);

    /* Compute patch offset and size */
    // p_size_x.x - left part block x size
    // p_size_x.y - right part block x size
    const short2 p_size_x = make_short2(ll_sub.x - bidx.x < BLOCKSIZEX ? ll_sub.x - bidx.x : BLOCKSIZEX,
                                        img_size.x - bidx.y < BLOCKSIZEX ? img_size.x - bidx.y : BLOCKSIZEX);
    // p_size_y.x - top part block y size
    // p_size_y.y - bottom part block y size
    const short2 p_size_y = make_short2(ll_sub.y - bidy.x < BLOCKSIZEY ? ll_sub.y - bidy.x : BLOCKSIZEY,
                                        img_size.y - bidy.y < BLOCKSIZEY ? img_size.y - bidy.y : BLOCKSIZEY);
    // summary size
    const short2 p_size_sum = make_short2(p_size_x.x + p_size_x.y, p_size_y.x + p_size_y.y);

    // Threads offset to read margins
    short p_offset_y_t;

    // Allocate registers in order to compute even and odd pixels.
    int pix_neighborhood[6];
    // Minimize registers usage. Right | bottom offset. Odd | even result pixels.
    int results[6];

    read_data_new<int, MEMSIZE + 1>(1, tid, bidx, bidy, p_size_x, p_size_y, ll_sub, img_size, step.x, idata,
                                    shared, OFFSET_53/2);
    __syncthreads();

    // thread x id
    tid.x = threadIdx.x;
    // thread y id
    tid.y = threadIdx.y;
    // Row number
    p_offset_y_t = 0;

    // Process columns
    iprocess_53_new<MEMSIZE + 1>(tidx2, tid.y, p_offset_y_t, p_size_sum.y, p_size_sum.x + 2 * OFFSET_53,
                                 pix_neighborhood, shared, results);
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;
    p_offset_y_t = 0;

    // Save results and rotate
    while (tid.y < p_size_sum.x + 2 * OFFSET_53 && 2 * tid.x < p_size_sum.y)
    {
        // Can not dynamically index registers, avoid local memory usage.
        // shared[tid.x][tid.y] = k2 * results[0 + p_offset_y * 2];
        // if(tid.x + BLOCKSIZEX < p_size_sum.y)
        //     shared[tid.x + BLOCKSIZEX][tid.y] = k1 * results[1 + p_offset_y * 2];
        save_to_shared_new<int, MEMSIZE + 1, (MEMSIZE + (BLOCKSIZEY - 1)) / BLOCKSIZEY>
            (1, make_short2(2 * tid.x, tid.y), make_short2(2 * tid.x + 1, tid.y), 2 * tid.x + 1,
             p_offset_y_t, p_size_sum.y, results, shared);
        p_offset_y_t++;
        tid.y += BLOCKSIZEY;
    }
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;
    // Row number
    p_offset_y_t = 0;

    // Process rows
    iprocess_53_new<MEMSIZE + 1>(tidx2, tid.y, p_offset_y_t, p_size_sum.x, p_size_sum.y,
                                 pix_neighborhood, shared, results);
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;
    // Row number
    p_offset_y_t = 0;

    // Save results
    while (2 * tid.x < p_size_sum.x && tid.y < p_size_sum.y)
    {
        // Can not dynamically index registers, avoid local memory usage.
        // shared[tid.x][tid.y] = k2 * results[0 + p_offset_y * 2];
        // if(tid.x + BLOCKSIZEX < p_size_sum.y)
        //     shared[tid.x + BLOCKSIZEX][tid.y] = k1 * results[1 + p_offset_y * 2];
        save_to_shared_new<int, MEMSIZE + 1, (MEMSIZE + (BLOCKSIZEY - 1)) / BLOCKSIZEY>
            (1, make_short2(tid.y, 2 * tid.x), make_short2(tid.y, 2 * tid.x + 1), 2 * tid.x + 1,
             p_offset_y_t, p_size_sum.x, results, shared);
        p_offset_y_t++;
        tid.y += BLOCKSIZEY;
    }
    __syncthreads();

    tid.x = threadIdx.x;
    tid.y = threadIdx.y;

    // Save to GM
    save_data_new<int, MEMSIZE + 1>(tid, p_size_sum, bidx.x, bidy.x, img_size, step.x, odata, shared);
}
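iprocess_53_new is not included in this excerpt; along one dimension it presumably applies the reversible CDF 5/3 inverse lifting steps. A plain CPU reference makes those steps explicit (a generic sketch under that assumption, using a low/high coefficient split and symmetric boundary extension; not the kernel's shared-memory code).

// CPU reference for one line of the inverse 5/3 (reversible) lifting.
// 'low' holds ceil(len/2) approximation coefficients, 'high' holds floor(len/2) detail
// coefficients, 'out' receives the reconstructed line of length low.size() + high.size().
#include <vector>

static void iwt53_line(const std::vector<int>& low, const std::vector<int>& high, std::vector<int>& out)
{
    const int len = (int)(low.size() + high.size());
    out.resize(len);

    // Undo the update step: even samples x[2n] = s[n] - ((d[n-1] + d[n] + 2) >> 2).
    for (int n = 0; n < (int)low.size(); n++) {
        int dl = high.empty() ? 0 : high[n > 0 ? n - 1 : 0];                                  // d[n-1], mirrored at the left edge
        int dr = high.empty() ? 0 : high[n < (int)high.size() ? n : (int)high.size() - 1];    // d[n], mirrored at the right edge
        out[2 * n] = low[n] - ((dl + dr + 2) >> 2);
    }

    // Undo the predict step: odd samples x[2n+1] = d[n] + ((x[2n] + x[2n+2]) >> 1).
    for (int n = 0; n < (int)high.size(); n++) {
        int el = out[2 * n];                                                                  // x[2n]
        int er = (2 * n + 2 < len) ? out[2 * n + 2] : out[2 * n];                             // x[2n+2], mirrored at the right edge
        out[2 * n + 1] = high[n] + ((el + er) >> 1);
    }
}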
void AdlPrimitivesDemo::test( Buffer<int2>& buf, int size, Stopwatch& sw )
{
  Kernel* kernel = KernelManager::query( m_deviceData,
      "..\\..\\AdlDemos\\TestBed\\Demos\\AdlPrimitivesDemoKernel", "FillInt4Kernel" );

  Buffer<int4> constBuffer( m_deviceData, 1, BufferBase::BUFFER_CONST );

  int numGroups = (size+128*4-1)/(128*4);
  Buffer<u32> workBuffer0( m_deviceData, numGroups*(16) );
  Buffer<u32> workBuffer1( m_deviceData, numGroups*(16) );
  Buffer<int2> sortBuffer( m_deviceData, size );

  {
    int2* host = new int2[size];
    for(int i=0; i<size; i++)
    {
      host[i] = make_int2( getRandom(0, 0xf), i );
    }
    sortBuffer.write( host, size );
    DeviceUtils::waitForCompletion( m_deviceData );
    delete [] host;
  }

  int4 constData;
  {
    constData.x = size;
    constData.y = 0;
    constData.z = numGroups;
    constData.w = 0;
  }

  sw.start();

  int nThreads = size/4;
  {
    BufferInfo bInfo[] = { BufferInfo( &buf ), BufferInfo( &workBuffer0 ), BufferInfo( &workBuffer1 ) };

    Launcher launcher( m_deviceData, kernel );
    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
    launcher.setConst( constBuffer, constData );
    launcher.launch1D( nThreads, 128 );
  }
  sw.split();

  {
    constData.w = 1;
    int nThreads = size/4;
    BufferInfo bInfo[] = { BufferInfo( &buf ), BufferInfo( &workBuffer0 ), BufferInfo( &workBuffer1 ) };

    Launcher launcher( m_deviceData, kernel );
    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
    launcher.setConst( constBuffer, constData );
    launcher.launch1D( nThreads, 128 );
  }
  sw.split();

  {
    constData.w = 2;
    int nThreads = size/4;
    BufferInfo bInfo[] = { BufferInfo( &sortBuffer ), BufferInfo( &workBuffer0 ), BufferInfo( &workBuffer1 ) };

    Launcher launcher( m_deviceData, kernel );
    launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(Launcher::BufferInfo) );
    launcher.setConst( constBuffer, constData );
    launcher.launch1D( nThreads, 128 );
  }
  sw.stop();

  {
    int2* host = new int2[size];
    buf.read( host, size );
    DeviceUtils::waitForCompletion( m_deviceData );
    for(int i=0; i<128*4-1; i++)
    {
      ADLASSERT( host[i].x <= host[i+1].x );
    }
    delete [] host;
  }

  {
    float t[3];
    sw.getMs(t, 3);
    // (byte * nElems)
    sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "LoadStore: %3.2fGB/s (%3.2fns)",
        (4*8*2)*nThreads/t[0]/1000/1000, t[0]*1000.f);
    sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "GenHistog: %3.2fGB/s (%3.2fns)",
        (4*(8*2+2))*nThreads/t[1]/1000/1000, t[1]*1000.f);
    sprintf_s(m_txtBuffer[m_nTxtLines++], LINE_CAPACITY, "FullSort: %3.2fGB/s (%3.2fns)",
        (4*(8*2+2))*nThreads/t[2]/1000/1000, t[2]*1000.f);
  }
}