void SpatialBatchNormalization::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  Tensor<float>* out = TO_TENSOR_PTR(output.get());
  RASSERT(in->dim() >= 3);
  RASSERT(in->size()[2] == nfeats_);

  if (output != nullptr && in->dim() != out->dim()) {
    output = nullptr;
  }

  // Check that the input and output size are the same.
  if (output != nullptr) {
    if (in->size()[0] != out->size()[0] || in->size()[1] != out->size()[1] ||
        in->size()[2] != out->size()[2]) {
      output = nullptr;
    }
  }

  if (output == nullptr) {
    output.reset(new Tensor<float>(in->dim(), in->size()));
  }
}
static __inline__ RunStore* runstore_from_runs( Run* runs ) {
    RASSERT(runs != RUNS_EMPTY);
    RASSERT(runs != RUNS_RECT);
    return (RunStore*)runs - 1;
}
static void region_operator_init( RegionOperator* o, Region* r1, Region* r2 ) {
    int run1_count, run2_count;
    int maxruns;

    RASSERT( !region_isEmpty(r1) );
    RASSERT( !region_isEmpty(r2) );

    if (region_isRect(r1)) {
        run1_count = RUNS_RECT_COUNT;
        o->runs1 = o->runs1_rect;
        runs_set_rect( o->runs1, &r1->bounds );
    } else {
        o->runs1 = r1->runs;
        run1_count = runs_get_count(r1->runs);
    }

    if (region_isRect(r2)) {
        run2_count = RUNS_RECT_COUNT;
        o->runs2 = o->runs2_rect;
        runs_set_rect( o->runs2, &r2->bounds );
    } else {
        o->runs2 = r2->runs;
        run2_count = runs_get_count(r2->runs);
    }

    maxruns = run1_count < run2_count ? run2_count : run1_count;
    o->store = runstore_alloc( 3*maxruns );
    o->runs_base = runstore_to_runs(o->store);
}
void skin_region_translate( SkinRegion* r, int dx, int dy ) {
    Run* runs;

    if (region_isEmpty(r))
        return;

    skin_rect_translate( &r->bounds, dx, dy );
    if (region_isRect(r))
        return;

    /* Non-rectangular regions are stored as a list of bands:
     * ytop, ybot, then (xleft, xright) pairs closed by XSENTINEL;
     * the band list itself is closed by YSENTINEL. */
    runs = region_edit(r);
    while (runs[0] != YSENTINEL) {
        int ytop = runs[0];
        int ybot = runs[1];
        RASSERT(ybot != YSENTINEL);
        runs[0] = (Run)(ytop + dy);
        runs[1] = (Run)(ybot + dy);
        runs += 2;
        while (runs[0] != XSENTINEL) {
            int xleft = runs[0];
            int xright = runs[1];
            RASSERT(xright != YSENTINEL);
            runs[0] = (Run)(xleft + dx);
            runs[1] = (Run)(xright + dx);
            runs += 2;
        }
        runs += 1;  /* skip XSENTINEL */
    }
}
void OpenCLContext::runKernel(const uint32_t device_index, const uint32_t dim,
                              const uint32_t* global_work_size,
                              const bool blocking) {
  // You must call OpenCL::useKernel() first.
  RASSERT(cur_kernel_ != nullptr);
  RASSERT(device_index < devices_.size());
  RASSERT(dim <= 3);  // OpenCL doesn't support greater than 3 dims!

  cl::NDRange offset = cl::NullRange;
  cl::NDRange global_work;
  cl::NDRange local_work = cl::NullRange;  // Let OpenCL choose.
  switch (dim) {
    case 1:
      global_work = cl::NDRange(global_work_size[0]);
      break;
    case 2:
      global_work = cl::NDRange(global_work_size[0], global_work_size[1]);
      break;
    case 3:
      global_work = cl::NDRange(global_work_size[0], global_work_size[1],
                                global_work_size[2]);
      break;
  }
  cl::Event cur_event;
  CHECK_ERROR(queues_[device_index].enqueueNDRangeKernel(
      cur_kernel_->kernel(), offset, global_work, local_work, nullptr,
      &cur_event));
  if (blocking) {
    cur_event.wait();
  }
}
void SpatialLPPooling::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 2 || in->dim() == 3);

  if (output != nullptr && TO_TENSOR_PTR(output.get())->dim() != in->dim()) {
    // Input dimension has changed!
    cleanup();
  }

  if (output != nullptr) {
    // Check that the dimensions above the lowest 2 match.
    for (uint32_t i = 2; i < in->dim() && output != nullptr; i++) {
      if (TO_TENSOR_PTR(output.get())->size()[i] != in->size()[i]) {
        cleanup();
      }
    }
  }

  if (output != nullptr) {
    // Check that the lowest 2 dimensions are the correct size.
    if (TO_TENSOR_PTR(output.get())->size()[0] != in->size()[0] / poolsize_u_ ||
        TO_TENSOR_PTR(output.get())->size()[1] != in->size()[1] / poolsize_v_) {
      cleanup();
    }
  }

  if (output == nullptr) {
    // Check that the width and height are a multiple of the poolsize.
    RASSERT(in->size()[0] % poolsize_u_ == 0 &&
            in->size()[1] % poolsize_v_ == 0);

    std::unique_ptr<uint32_t[]> out_size(new uint32_t[in->dim()]);
    out_size[0] = in->size()[0] / poolsize_u_;
    out_size[1] = in->size()[1] / poolsize_v_;
    for (uint32_t i = 2; i < in->dim(); i++) {
      out_size[i] = in->size()[i];
    }

    output.reset(new Tensor<float>(in->dim(), out_size.get()));
    input_cpu_.reset(new float[in->nelems()]);
    output_cpu_.reset(new float[TO_TENSOR_PTR(output.get())->nelems()]);
  }

  uint32_t n_threads = 1;
  if (in->dim() > 2) {
    n_threads = TO_TENSOR_PTR(output.get())->size()[2];
  }
  if (thread_cbs_.size() != n_threads) {
    // clear() (not empty(), which only tests for emptiness) so that the stale
    // callbacks are discarded before the list is rebuilt.
    thread_cbs_.clear();
    for (uint32_t f = 0; f < n_threads; f++) {
      thread_cbs_.push_back(std::unique_ptr<jcl::threading::Callback<void>>(
          MakeCallableMany(&SpatialLPPooling::forwardPropThread, this, f)));
    }
  }
}
void SelectTable::forwardProp(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TABLE_DATA);

  Table* in = (Table*)input.get();
  // Check that the input table isn't too small.
  RASSERT(in->tableSize() > index_);

  output = (*in)(index_);
}
void ParallelTable::forwardProp(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TABLE_DATA);

  Table* in = (Table*)input.get();
  // Make sure table size matches the number of parallel stages:
  RASSERT(in->tableSize() == network_.size());

  for (uint32_t i = 0; i < network_.size(); i++) {
    network_[i]->forwardProp((*in)(i));
  }
  initOutput();  // Init output just copies the pointers from the output
                 // of all the parallel stages and fills up a table with them.
}
uint32_t OpenCLContext::queryMaxWorkgroupSizeForCurKernel(
    const uint32_t device_index) {
  // You must call OpenCL::useKernel() first.
  RASSERT(cur_kernel_ != nullptr);
  RASSERT(device_index < devices_.size());

  size_t max_workgroup_size;
  cl_int rc = cur_kernel_->kernel().getWorkGroupInfo<size_t>(
      devices_[device_index], CL_KERNEL_WORK_GROUP_SIZE, &max_workgroup_size);
  RASSERT(rc == CL_SUCCESS);
  return (uint32_t)max_workgroup_size;
}
void JoinTable::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TABLE_DATA);  // Table expected

  Table* in = TO_TABLE_PTR(input.get());
  RASSERT(in->tableSize() > 0);

  // Check that it is a table of FloatTensors.
  for (uint32_t i = 0; i < in->tableSize(); i++) {
    // Table of float tensors expected.
    RASSERT((*in)(i)->type() == TENSOR_DATA);
  }

  uint32_t dim = TO_TENSOR_PTR((*in)(0).get())->dim();
  RASSERT(dim > dimension_);  // Otherwise input is smaller than join dimension
  uint32_t jdim = dim - dimension_ - 1;  // dimension_=0 is the top dim

  // Make sure the dimensions OTHER than the join dimension are all the same.
  for (uint32_t d = 0; d < dim; d++) {
    if (d != jdim) {
      for (uint32_t j = 1; j < in->tableSize(); j++) {
        // Sizes must match.
        RASSERT(TO_TENSOR_PTR((*in)(j).get())->size()[d] ==
                TO_TENSOR_PTR((*in)(0).get())->size()[d]);
      }
      if (output != nullptr &&
          TO_TENSOR_PTR(output.get())->size()[d] !=
              TO_TENSOR_PTR((*in)(0).get())->size()[d]) {
        output = nullptr;
      }
    }
  }

  // The output's join dimension is the sum of the join dimensions of every
  // table entry (including the first one).
  uint32_t nelems_jdim = 0;
  for (uint32_t j = 0; j < in->tableSize(); j++) {
    nelems_jdim += TO_TENSOR_PTR((*in)(j).get())->size()[jdim];
  }

  if (output != nullptr &&
      TO_TENSOR_PTR(output.get())->size()[jdim] != nelems_jdim) {
    output = nullptr;
  }

  if (output == nullptr) {
    std::unique_ptr<uint32_t[]> size(new uint32_t[dim]);
    memcpy(size.get(), TO_TENSOR_PTR((*in)(0).get())->size(),
           sizeof(size[0]) * dim);
    // Write the summed size into the C-order join index (jdim), the same index
    // the consistency check above compares against.
    size[jdim] = nelems_jdim;
    output = std::shared_ptr<TorchData>(new Tensor<float>(dim, size.get()));
  }
}
void Select::forwardProp(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  // For now we only support the Select operation along the outer dimension.
  // In torch indexing this is always 1.
  RASSERT(this->dimension_ == 1);

  if (src_tensor_ != input.get()) {
    // Only create the tensor slice if the input has changed.
    src_tensor_ = TO_TENSOR_PTR(input.get());
    // Note the index is torch 1-indexed.
    output = Tensor<float>::selectOuterDim(*src_tensor_, this->index_ - 1);
  }
}
static Run* runs_copy_scanline( Run* dst, Run* src ) {
    RASSERT(src[0] != YSENTINEL);
    RASSERT(src[1] != YSENTINEL);
    dst[0] = src[0];
    dst[1] = src[1];
    src += 2;
    dst += 2;
    do {
        *dst++ = *src++;
    } while (src[-1] != XSENTINEL);
    return dst;
}
void comm_mutex_rw::rd_lock() {
    /// possible dead locks: th1:R-lock, th2:wr_lock(), th1:rd_lock()
    DWORD t = GetCurrentThreadId();
    if( Waccess(t) ) {
        /// already W-locked
        _nRin++;
        RASSERT( _nWlocks != 0 );
        return;
    }

    comm_mutex_rw_LOCK_IN;
    _nRin++;
    RASSERT( _nWlocks == 0 );
    ResetEvent( (HANDLE)_cndAccess );
    comm_mutex_rw_UNLOCK_IN;
}
bool comm_mutex_rw::try_wr_lock() {
#if( _WIN32_WINNT >= 0x0400 )
    DWORD t = GetCurrentThreadId();
    if( Waccess(t) ) {
        /// already W-locked ==> guard only one W-lock
        _nWlocks++;
        return true;
    }

    if( TryEnterCriticalSection((CRITICAL_SECTION*)&_mxIN) ) {
        /// lock in mutex
        RASSERT( _nWlocks == 0 );

        if( TryEnterCriticalSection((CRITICAL_SECTION*)&_mxOUT) ) {
            /// lock out mutex
            if( Raccess() ) {
                /// no readers ?
                _nWlocks++;
                _W_owner = t;
                comm_mutex_rw_UNLOCK_OUT;
                return true;
            }
            comm_mutex_rw_UNLOCK_OUT;
        }
        comm_mutex_rw_UNLOCK_IN;
    }
#endif
    return false;
}
CLDevice OpenCLContext::CLDeviceType2CLDevice(const cl_device_type device) {
  CLDevice ret;
  switch (device) {
    case CL_DEVICE_TYPE_DEFAULT:
      ret = CLDeviceDefault;
      break;
    case CL_DEVICE_TYPE_CPU:
      ret = CLDeviceCPU;
      break;
    case CL_DEVICE_TYPE_GPU:
      ret = CLDeviceGPU;
      break;
    case CL_DEVICE_TYPE_ACCELERATOR:
      ret = CLDeviceAccelerator;
      break;
    case CL_DEVICE_TYPE_ALL:
      ret = CLDeviceAll;
      break;
    default:
      std::cout << "Invalid enumerant" << std::endl;
      RASSERT(false);
      ret = CLDeviceDefault;
      break;
  }
  return ret;
}
void InitJTorch(const bool use_cpu, const uint32_t requested_deviceid,
                const bool verbose_startup) {
  std::lock_guard<std::mutex> lck(cl_context_lock_);
  // Check we haven't already called init.
  RASSERT(cl_context == nullptr);

  if (verbose_startup) {
    std::cout << "Valid OpenCL devices attached:" << std::endl;
    const uint32_t num_devices = jcl::OpenCLContext::printDevices();
    static_cast<void>(num_devices);
  }

  jcl::CLDevice device = use_cpu ? jcl::CLDeviceCPU : jcl::CLDeviceGPU;
  jcl::CLVendor vendor = jcl::CLVendorAny;

  const bool device_exists =
      jcl::OpenCLContext::queryDeviceExists(device, vendor);
  if (!device_exists) {
    if (use_cpu) {
      std::cerr << "No CPU devices attached.";
    } else {
      std::cerr << "No GPU devices attached.";
    }
  }
  RASSERT(device_exists);

  // Otherwise, initialize the context.
  cl_context.reset(new jcl::OpenCLContext());
  cl_context->init(device, jcl::CLVendorAny, verbose_startup);

  // Make sure the user is requesting a device id that exists.
  RASSERT(requested_deviceid < cl_context->getNumDevices());
  deviceid = requested_deviceid;

  std::cout << "Jtorch is using device " << deviceid << ": "
            << cl_context->getDeviceName(deviceid) << std::endl;

  // Startup clblas.
  // TODO(tompson): I have NO idea what device ID this will run on.
  const cl_int blas_ret = clblasSetup();
  const bool blas_ok = (blas_ret == CL_SUCCESS);
  if (!blas_ok) {
    std::cout << "ERROR - InitJTorchInternal: clblasSetup returned error: "
              << jcl::OpenCLContext::getErrorString(blas_ret);
  }
  RASSERT(blas_ok);
}
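// --- Hedged usage sketch (editor addition, not part of the library) ----------
// A minimal jtorch startup sequence based only on the InitJTorch() signature
// above. The header path, the jtorch namespace, and the ShutdownJTorch()
// teardown call are assumptions and may differ in the actual project.
#include "jtorch/jtorch.h"  // assumed header

int main() {
  // Ask for a GPU device (use_cpu = false), use OpenCL device index 0, and
  // print the attached devices during startup.
  jtorch::InitJTorch(false, 0, true);

  // ... construct TorchStage modules and call forwardProp() here ...

  jtorch::ShutdownJTorch();  // assumed teardown (releases the CL context and clBLAS)
  return 0;
}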
static void runs_coalesce_band( Run* *psrc_spans, Run* *pdst_spans, SkinBox* minmax ) {
    Run* sspan = *psrc_spans;
    Run* dspan = *pdst_spans;
    int pleft = sspan[0];
    int pright = sspan[1];
    int xleft, xright;

    RASSERT(pleft != XSENTINEL);
    RASSERT(pright != XSENTINEL);
    RASSERT(pleft < pright);

    if (pleft < minmax->x1) minmax->x1 = pleft;

    sspan += 2;
    xleft = sspan[0];

    while (xleft != XSENTINEL) {
        xright = sspan[1];
        RASSERT(xright != XSENTINEL);
        RASSERT(xleft < xright);

        if (xleft == pright) {
            /* the current span touches the pending one: merge them */
            pright = xright;
        } else {
            /* flush the pending span, then make the current span pending */
            dspan[0] = (Run) pleft;
            dspan[1] = (Run) pright;
            dspan += 2;
            pleft = xleft;
            pright = xright;
        }
        sspan += 2;
        xleft = sspan[0];
    }

    /* flush the last pending span and close the band */
    dspan[0] = (Run) pleft;
    dspan[1] = (Run) pright;
    dspan[2] = XSENTINEL;
    dspan += 3;
    sspan += 1; /* skip XSENTINEL */

    if (pright > minmax->x2) minmax->x2 = pright;

    *psrc_spans = sspan;
    *pdst_spans = dspan;
}
void comm_mutex_rw::wr_unlock() {
    RASSERT( _nWlocks );
    _nWlocks--;
    if( _nWlocks == 0 ) {
        _W_owner = 0;
        comm_mutex_rw_UNLOCK_IN;
    }
}
static RunStore* runstore_alloc( int count ) {
    RunStore* s = malloc( sizeof(*s) + sizeof(Run)*count );
    RASSERT(s != NULL);
    s->count = count;
    s->refcount = 1;
    return s;
}
static int runs_coalesce( Run* dst, Run* src, SkinBox* minmax ) {
    Run* prev = NULL;
    Run* dst0 = dst;
    int ytop = src[0];
    int ybot;

    while (ytop != YSENTINEL) {
        Run* sspan = src + 2;
        Run* dspan = dst + 2;

        ybot = src[1];

        RASSERT( ytop < ybot );
        RASSERT( ybot != YSENTINEL );
        RASSERT( src[2] != XSENTINEL );

        if (ytop < minmax->y1) minmax->y1 = ytop;
        if (ybot > minmax->y2) minmax->y2 = ybot;

        dst[0] = (Run) ytop;
        dst[1] = (Run) ybot;

        runs_coalesce_band( &sspan, &dspan, minmax );

        if (prev && prev[1] == dst[0] && (dst-prev) == (dspan-dst) &&
            !memcmp(prev+2, dst+2, (dspan-dst-2)*sizeof(Run))) {
            /* coalesce two identical bands */
            prev[1] = dst[1];
        } else {
            prev = dst;
            dst = dspan;
        }
        src = sspan;
        ytop = src[0];
    }
    dst[0] = YSENTINEL;
    return (dst + 1 - dst0);
}
void SpatialMaxPooling::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  RASSERT(in->dim() == 2 || in->dim() == 3);

  // We'll essentially do ceil_mode = false from torch.
  const uint32_t iwidth = in->size()[0];
  const uint32_t iheight = in->size()[1];
  uint32_t oheight = (long)(floor((float)(iheight - kh_ + 2*padh_) / dh_)) + 1;
  uint32_t owidth = (long)(floor((float)(iwidth - kw_ + 2*padw_) / dw_)) + 1;

  if (output != nullptr && TO_TENSOR_PTR(output.get())->dim() != in->dim()) {
    // Input dimension has changed!
    output = nullptr;
  }

  if (output != nullptr) {
    // Check that the dimensions above the lowest 2 match.
    for (uint32_t i = 2; i < in->dim() && output != nullptr; i++) {
      if (TO_TENSOR_PTR(output.get())->size()[i] != in->size()[i]) {
        output = nullptr;
      }
    }
  }

  if (output != nullptr) {
    // Check that the lowest 2 dimensions are the correct size.
    if (TO_TENSOR_PTR(output.get())->size()[0] != owidth ||
        TO_TENSOR_PTR(output.get())->size()[1] != oheight) {
      output = nullptr;
    }
  }

  if (output == nullptr) {
    std::unique_ptr<uint32_t[]> out_size(new uint32_t[in->dim()]);
    out_size[0] = owidth;
    out_size[1] = oheight;
    for (uint32_t i = 2; i < in->dim(); i++) {
      out_size[i] = in->size()[i];
    }
    output.reset(new Tensor<float>(in->dim(), out_size.get()));
  }
}
void OpenCLContext::runKernel(const uint32_t device_index, const uint32_t dim,
                              const uint32_t* global_work_size,
                              const uint32_t* local_work_size,
                              const bool blocking) {
  // You must call OpenCL::useKernel() first.
  RASSERT(cur_kernel_ != nullptr);
  RASSERT(device_index < devices_.size());
  RASSERT(dim <= 3);  // OpenCL doesn't support greater than 3 dims!

  uint32_t total_worksize = 1;
  for (uint32_t i = 0; i < dim; i++) {
    // Check that: Global workgroup size is evenly divisible by the local work
    // group size!
    RASSERT((global_work_size[i] % local_work_size[i]) == 0);
    total_worksize *= local_work_size[i];
    // Check that: Local workgroup size is not greater than
    // devices_max_workitem_size_
    RASSERT((uint32_t)local_work_size[i] <=
            (uint32_t)devices_max_workitem_size_[device_index][i]);
  }
  // Check that: Local workgroup size is not greater than
  // CL_DEVICE_MAX_WORK_GROUP_SIZE!
  RASSERT(total_worksize <=
          (uint32_t)devices_max_workgroup_size_[device_index]);
  uint32_t max_size = queryMaxWorkgroupSizeForCurKernel(device_index);
  // Check that: Local workgroup size is not greater than
  // CL_KERNEL_WORK_GROUP_SIZE!
  RASSERT(total_worksize <= (uint32_t)max_size);

  cl::NDRange offset = cl::NullRange;
  cl::NDRange global_work;
  cl::NDRange local_work;
  switch (dim) {
    case 1:
      global_work = cl::NDRange(global_work_size[0]);
      local_work = cl::NDRange(local_work_size[0]);
      break;
    case 2:
      global_work = cl::NDRange(global_work_size[0], global_work_size[1]);
      local_work = cl::NDRange(local_work_size[0], local_work_size[1]);
      break;
    case 3:
      global_work = cl::NDRange(global_work_size[0], global_work_size[1],
                                global_work_size[2]);
      local_work = cl::NDRange(local_work_size[0], local_work_size[1],
                               local_work_size[2]);
      break;
  }
  cl::Event cur_event;
  CHECK_ERROR(queues_[device_index].enqueueNDRangeKernel(
      cur_kernel_->kernel(), offset, global_work, local_work, nullptr,
      &cur_event));
  if (blocking) {
    cur_event.wait();
  }
}
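// --- Hedged usage sketch (editor addition, not part of the library) ----------
// Shows how a caller can satisfy the asserts in runKernel() above: bind a
// kernel with useKernel(), query the per-kernel workgroup limit, then pick a
// local size that evenly divides the (rounded-up) global size. The kernel
// name, argument setup, and header path are hypothetical.
#include <algorithm>  // std::min
#include <cstdint>

#include "jcl/opencl_context.h"  // assumed header for jcl::OpenCLContext

void LaunchExampleKernel(jcl::OpenCLContext* context,
                         const uint32_t device_index, const uint32_t nelems) {
  // useKernel() must have been called first (see the RASSERTs above); the
  // source file and kernel name here are placeholders only.
  // context->useKernel("vector_ops.cl", "VecAdd");

  const uint32_t max_wg =
      context->queryMaxWorkgroupSizeForCurKernel(device_index);
  const uint32_t local = std::min<uint32_t>(max_wg, 256u);

  // Round the global size up to a multiple of the local size, otherwise the
  // divisibility RASSERT in runKernel() fires.
  const uint32_t global = ((nelems + local - 1) / local) * local;

  const uint32_t global_work_size[1] = {global};
  const uint32_t local_work_size[1] = {local};
  context->runKernel(device_index, 1, global_work_size, local_work_size,
                     /*blocking=*/true);
}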
static Run* runs_next_scanline( Run* runs ) {
    RASSERT(runs[0] != YSENTINEL && runs[1] != YSENTINEL);
    runs += 2;
    do {
        runs += 1;
    } while (runs[-1] != XSENTINEL);
    return runs;
}
// kernel1d default is either TorchStage::gaussian1D<float>(n) or just a
// vector of 1 values.
SpatialDivisiveNormalization::SpatialDivisiveNormalization(
    const std::shared_ptr<Tensor<float>> kernel, const float threshold)
    : TorchStage() {
  RASSERT(kernel->dim() <= 2);  // Averaging kernel must be 1D or 2D!
  // Averaging kernel must have odd size!
  RASSERT(kernel->size()[0] % 2 != 0 &&
          !(kernel->dim() == 2 && kernel->size()[1] % 2 == 0));

  kernel_.reset(Tensor<float>::clone(*kernel));
  kernel_norm_ = nullptr;  // Normalization is input size dependent.

  output = nullptr;
  std_coef_.reset(nullptr);
  std_pass1_.reset(nullptr);
  std_pass2_.reset(nullptr);
  std_.reset(nullptr);

  threshold_ = threshold;
}
wtnt_t *newwtnt() {
    wtnt_t *wnptr;

    wnptr = valloc((size_t)sizeof(wtnt_t));
    RASSERT((wnptr != WNULL), "Cannot allocate space for write notices!");
    wnptr->more = WNULL;
    wnptr->wtntc = 0;
    return (wnptr);
}
/*-----------------------------------------------------------*/
address_t newtwin() {
    address_t twin;
    int allocsize;

    allocsize = Pagesize;
    twin = (address_t)valloc((size_t)allocsize);
    RASSERT((twin != (address_t)NULL), "Cannot allocate twin space!");
    return (twin);
}
// kernel1d default is either TorchStage::gaussian1D<float>(n) or just a
// vector of 1 values.
SpatialSubtractiveNormalization::SpatialSubtractiveNormalization(
    const std::shared_ptr<Tensor<float>> kernel)
    : TorchStage() {
  RASSERT(kernel->dim() <= 2);
  // Averaging kernel must have odd size!
  RASSERT(kernel->size()[0] % 2 != 0 &&
          !(kernel->dim() == 2 && kernel->size()[1] % 2 == 0));

  // Clone and normalize the input kernel.
  kernel_.reset(Tensor<float>::clone(*kernel.get()));
  float sum = Tensor<float>::slowSum(*kernel_);
  Tensor<float>::div(*kernel_, sum);

  output = nullptr;
  mean_coef_.reset(nullptr);
  mean_pass1_.reset(nullptr);
  mean_pass2_.reset(nullptr);
  mean_.reset(nullptr);
}
void comm_mutex_rw::wr_lock() {
    /// possible dead locks: th1:W-lock, th2:rd_lock(), th1:wr_lock()
    DWORD t = GetCurrentThreadId();
    if( Waccess(t) ) {
        /// already W-locked
        _nWlocks++;
        return;
    }

    comm_mutex_rw_LOCK_IN;
    RASSERT( _nWlocks == 0 );

    comm_mutex_rw_LOCK_OUT;
    _nWlocks++;
    _W_owner = t;
    comm_mutex_rw_UNLOCK_OUT;

    int ret = WaitForSingleObject( (HANDLE)_cndAccess, INFINITE );
    RASSERT( ret == WAIT_OBJECT_0 );
    RASSERT( Raccess() );
}
void Reshape::init(std::shared_ptr<TorchData> input) {
  RASSERT(input->type() == TorchDataType::TENSOR_DATA);

  Tensor<float>* in = TO_TENSOR_PTR(input.get());
  int32_t nelems = outNElem();
  static_cast<void>(nelems);
  // Check the input size.
  RASSERT(in->nelems() == static_cast<uint32_t>(nelems));

  if (output != nullptr) {
    Tensor<float>* out = TO_TENSOR_PTR(output.get());
    if (out->storage() != in->storage()) {
      // The tensors don't share the same storage! Reinitialize the view.
      output = nullptr;
    }
  }

  if (output == nullptr) {
    output = Tensor<float>::view(*in, odim_, osize_.get());
  }
}
bool comm_mutex_rw::try_rd_lock() {
#if( _WIN32_WINNT >= 0x0400 )
    DWORD t = GetCurrentThreadId();
    if( Waccess(t) ) {
        /// already W-locked
        _nRin++;
        RASSERT( _nWlocks != 0 );
        return true;
    }
    else {
        if( TryEnterCriticalSection((CRITICAL_SECTION*)&_mxIN) ) {
            _nRin++;
            RASSERT( _nWlocks == 0 );
            comm_mutex_rw_UNLOCK_IN;
            return true;
        }
    }
#endif
    return false;
}
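// --- Hedged usage sketch (editor addition, not part of the library) ----------
// A minimal RAII guard for the writer side of comm_mutex_rw, built only on the
// wr_lock()/wr_unlock() pair defined above. Because wr_lock() is recursive for
// the owning thread, nested guards on the same thread are safe. A matching
// reader guard would need rd_unlock(), which is not shown here and is assumed
// to exist in the class.
class write_guard {
public:
    explicit write_guard( comm_mutex_rw& m ) : _m(m) { _m.wr_lock(); }
    ~write_guard() { _m.wr_unlock(); }

    write_guard( const write_guard& ) = delete;
    write_guard& operator=( const write_guard& ) = delete;

private:
    comm_mutex_rw& _m;
};

// Usage:
//   {
//       write_guard guard(shared_rw_mutex);  // blocks until all readers drain
//       // ... mutate the shared structure ...
//   }                                        // wr_unlock() runs on scope exit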