void DataChannelMPI::allGather(std::vector<at::Tensor>& output, at::Tensor& input,
                               THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  if (output.size() != group_pair.second.size())
    throw std::logic_error("allGather: number of output tensors and group size does not match");

  for (auto out_tensor : output)
    assertSameSizeAndType(out_tensor, input, "allGather");

  auto recv_buffer = _newLikeFlat(output);
  auto contig_input = input.contiguous();

  MPI_Allgather(
    contig_input.data_ptr(), contig_input.numel(), mpi_datatype.at(contig_input.type().scalarType()),
    recv_buffer.data_ptr(), contig_input.numel(), mpi_datatype.at(recv_buffer.type().scalarType()),
    comm);

  for (size_t i = 0; i < output.size(); ++i)
    output[i].copy_(recv_buffer[i]);
}
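The function above gathers every rank's contribution into one flat receive buffer and then copies slice i back into output[i]. A minimal self-contained sketch of that buffer layout, using plain std::vector instead of at::Tensor and MPI (split_flat_buffer and world_size are illustrative names, not part of the original API):

#include <cassert>
#include <cstddef>
#include <vector>

// Illustrative stand-in for the flat receive buffer produced by _newLikeFlat:
// each rank contributes `numel` elements, and rank r's data occupies
// the half-open range [r*numel, (r+1)*numel).
std::vector<std::vector<float>> split_flat_buffer(const std::vector<float>& flat,
                                                  std::size_t world_size) {
  assert(world_size > 0 && flat.size() % world_size == 0);
  const std::size_t numel = flat.size() / world_size;
  std::vector<std::vector<float>> outputs(world_size);
  for (std::size_t r = 0; r < world_size; ++r)
    outputs[r].assign(flat.begin() + r * numel, flat.begin() + (r + 1) * numel);
  return outputs;
}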
void dissemble(PFBlock fb) {
  Opcodes opcode;
  int rmode = 0, rdata = 0, k = 0;  // initialized: the NOP branch below does not set rmode
  string name;
  Instruction* pi = fb->pstart;
  while (pi != end_of_code) {
    opcode = (Opcodes)pi->opcode;
    // *add 1.2.4 HALT+data is not always a breakpoint!
    // (it is used as a NOP + <any useful tag data>)
    if (opcode == HALT) {
      if (pi->data < MAX_BREAKPOINTS) {
        Breakpoint* pb = Breakpoint::from_id(pi->data);
        Instruction ai = pb->saved_instruction();
        std::cout << "*";
        opcode = (Opcodes)ai.opcode;
        rmode = ai.rmode;
        rdata = ai.data;
      } else {
        opcode = NOP;
        rdata = pi->data;
      }
    } else {
      rmode = pi->rmode;
      rdata = pi->data;
    }
    name = get_opcode_name(opcode);
    std::cout << k++ << ' ' << name << '\t';
    if (opcode == CCALL || opcode == CALL || opcode == CALLD || opcode == CALLN) {
      FBlock* pfb;
      void* data = data_ptr(rdata);
      if (opcode == CALLN)
        pfb = Builtin::imported_fblock_from_function((void*)((NFBlock*)data)->pfn);
      else
        pfb = PFBlock(data_ptr(rdata));
      if (pfb)
        Function::from_fun_block(pfb)->dump(std::cout);
    } else if (opcode == JSWITCH) {
      int* swb = (int*)data_ptr(rdata);
      int sz = *swb++;
      int def = *swb++;
      std::cout << '(' << sz << ',' << def << ") ";
      for (int i = 0; i < sz; i++)
        std::cout << *swb++ << ' ' << *swb++ << ' ';
    } else if (opcode == TOSD || opcode == TPODS) {
      PClass pc = *(PClass*)data_ptr(rdata);
      std::cout << pc->name();
    } else {
      if (rmode)
        switch (rmode) {
          case DIRECT: std::cout << "D "; break;
          case SREL:   std::cout << "R "; break;
          case OREL:   std::cout << "S "; break;
        }
      if (rdata != 0)
        std::cout << rdata;
    }
    std::cout << std::endl;
    if (opcode == RET || opcode == RETI || opcode == RETD)
      break;
    pi++;
  }
}
void* kma_malloc(kma_size_t size) {
  if (size >= PAGE_SIZE)
    return NULL;

  if (entry_page == NULL)
    init_first_page();

  kma_frame* current = (kma_frame*)entry_page->ptr;

  // go until last in the list or we find one that fits
  bool fits = (current->occupied == FREE && frame_size(current) >= size);
  while (current->last == NOT_LAST && !fits) {
    current = current->next;
    fits = (current->occupied == FREE && frame_size(current) >= size);
  }

  void* ret_addr;
  // if it fits...
  if (fits) {
    allocate_frame(current, size);
    ret_addr = data_ptr(current);
  // if not...
  } else {
    // if nothing in the resource map fits, we need to allocate a new page;
    // if we are here, current should be last
    kma_page_t* new_page = get_page();
    void* next = ((char*)new_page->ptr) + new_page->size;
    kma_frame* new_frame = write_new_frame(new_page->ptr, new_page, current, next, FREE, LAST);
    allocate_frame(new_frame, size);
    current->last = NOT_LAST;
    ret_addr = data_ptr(new_frame);
  }
  // print_debug();
  return ret_addr;
}
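kma_malloc above does a first-fit walk over an intrusive list of frame headers and only grows the heap when nothing fits. A standalone sketch of that search policy, with a simplified Frame struct standing in for kma_frame (the field names are illustrative assumptions, not the project's actual layout):

#include <cstddef>

// Illustrative frame header: a singly linked list of regions with a size and a free flag.
struct Frame {
  std::size_t size;  // usable bytes in this frame
  bool free;         // true if the frame is not allocated
  Frame* next;       // nullptr marks the last frame (the code above uses a LAST flag instead)
};

// First-fit: return the first free frame large enough for `request`, or nullptr.
Frame* first_fit(Frame* head, std::size_t request) {
  for (Frame* cur = head; cur != nullptr; cur = cur->next)
    if (cur->free && cur->size >= request)
      return cur;
  return nullptr;  // caller would grow the heap (get_page) and append a new frame
}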
std::vector<BackendChunk*> SimpleBackend::Create(const std::vector<BackendChunk*>& input,
    const std::vector<Scale>& result_sizes, std::shared_ptr<ComputeFn> fn) {
  auto current_device_id = MinervaSystem::Instance().current_device_id();
  std::vector<BackendChunk*> result_chunks;
  Task* task = new Task();
  for (auto i : input) {
    auto c = CHECK_NOTNULL(dynamic_cast<SimpleChunk*>(i));
    task->inputs.emplace_back(c->data(), 0);
  }
  for (auto s : result_sizes) {
    auto data_id = MinervaSystem::Instance().GenerateDataId();
    std::shared_ptr<PhysicalData> data_ptr(
        new PhysicalData(s, current_device_id, data_id),
        [&] (PhysicalData* d) {
          device_manager_.FreeData(d->data_id);
          delete d;
        });
    SimpleChunk* o = new SimpleChunk(data_ptr);
    result_chunks.emplace_back(o);
    task->outputs.emplace_back(o->data(), 0);
  }
  task->op = PhysicalOp{fn, current_device_id};
  task->id = 0;
  DLOG(INFO) << "executing task name=" << fn->Name() << " to device #" << current_device_id;
  // wait for finish
  unique_lock<mutex> ul(finish_mutex_);
  device_manager_.GetDevice(current_device_id)->PushTask(task);
  finished_flag_ = false;
  while (!finished_flag_) {
    finish_cond_.wait(ul);
  }
  return result_chunks;
}
bool pic_data_p(pic_state *pic, pic_value obj, const pic_data_type *type) {
  if (pic_type(pic, obj) != PIC_TYPE_DATA) {
    return false;
  }
  return type == NULL || data_ptr(pic, obj)->type == type;
}
static Tensor new_from_data(const Type& type, int device, PyObject* data) {
  if (PySequence_Check(data)) {
    return new_from_sequence(type, device, data);
  } else {
    // could use scalarTensor but using store_scalar for consistency with the sequence path;
    // this has stricter checking (i.e. a floating-point number passed to an integral type will error).
    auto tensor = type.tensor({});
    torch::utils::store_scalar((char*)tensor.data_ptr(), type.scalarType(), data);
    return tensor;
  }
}
std::vector<BackendChunk*> SimpleBackend::Create(const std::vector<BackendChunk*>& input,
    const std::vector<Scale>& result_sizes, std::shared_ptr<ComputeFn> fn) {
  auto current_device_id = MinervaSystem::Instance().current_device_id();
  std::vector<BackendChunk*> result_chunks;
  Task* task = new Task();
  task->light = true;
  //printf("Backend collecting task inputs\n");
  for (auto i : input) {
    auto c = CHECK_NOTNULL(dynamic_cast<SimpleChunk*>(i));
    task->inputs.emplace_back(c->data(), 0);
  }
  //printf("Backend collecting %lu task outputs\n", result_sizes.size());
  for (auto s : result_sizes) {
    auto data_id = MinervaSystem::Instance().GenerateDataId();
#ifdef HAS_MPI
    int current_device_id = MinervaSystem::Instance().current_device_id();
    int currentrank = MinervaSystem::Instance().device_manager().GetDevice(current_device_id)->rank();
    //printf("[%d] Backend Creating new PhysicalData on rank %d\n", MinervaSystem::Instance().rank(), currentrank);
    std::shared_ptr<PhysicalData> data_ptr(
        new PhysicalData(s, currentrank, current_device_id, data_id),
        [&] (PhysicalData* d) {
          device_manager_.FreeData(d->data_id);
          delete d;
        });
#else
    std::shared_ptr<PhysicalData> data_ptr(
        new PhysicalData(s, current_device_id, data_id),
        [&] (PhysicalData* d) {
          device_manager_.FreeData(d->data_id);
          delete d;
        });
#endif
    SimpleChunk* o = new SimpleChunk(data_ptr);
    result_chunks.emplace_back(o);
    task->outputs.emplace_back(o->data(), 0);
  }
  task->op = PhysicalOp{fn, current_device_id};
  task->id = MinervaSystem::Instance().GenerateTaskId();
  DLOG(INFO) << "executing task name=" << fn->Name() << " to device #" << current_device_id;
  // wait for finish
  // unique_lock<mutex> ul(finish_mutex_);
  // finished_flag_.store(false);
  device_manager_.GetDevice(current_device_id)->PushTask(task);
  // while (!finished_flag_.load()) {
  //   finish_cond_.wait(ul);
  // }
  return result_chunks;
}
// C[x] += A[x]*B[x]
// (if not 4-dimensional, then indexing [x] is ignored (e.g. for weight matrices))
void affine_y_x(int x_A, Ndarray* A, int x_B, Ndarray* B,
                int x_C, /*out*/ Ndarray* C,
                bool transpose_A = false, bool transpose_B = false) {
  const float* data_A = data_ptr(A, x_A);
  const float* data_B = data_ptr(B, x_B);
  float* data_C = data_ptr(C, x_C);

  int A_dim[2], B_dim[2];
  lastTwoDims(A, A_dim);
  lastTwoDims(B, B_dim);

  int ldB = B_dim[1];
  int ldA = A_dim[1];
  char transA = transpose_A ? 'T' : 'N';
  char transB = transpose_B ? 'T' : 'N';
  if (transpose_A)
    std::swap(A_dim[0], A_dim[1]);
  if (transpose_B)
    std::swap(B_dim[0], B_dim[1]);

  const float alpha = 1;
  const float beta = 1;
  Ndarray_sgemm(transB, transA, B_dim[1], A_dim[0], A_dim[1], &alpha,
                data_B, ldB, data_A, ldA, &beta, data_C, B_dim[1]);
}
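Note that the sgemm call passes the operands in (B, A) order with B's dimensions first: this is the usual trick for driving a column-major GEMM with row-major data, since a column-major routine computing C^T = B^T A^T writes exactly the bytes of the row-major product C = A B. A self-contained reference for the accumulation the call performs (naive loops, row-major, no BLAS; the helper name is illustrative):

#include <vector>

// Reference for C += A * B with row-major storage:
// A is n x k, B is k x m, C is n x m (beta == 1 keeps the existing contents of C).
void naive_gemm_accumulate(int n, int k, int m,
                           const std::vector<float>& A,
                           const std::vector<float>& B,
                           std::vector<float>& C) {
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < m; ++j) {
      float acc = 0.f;
      for (int l = 0; l < k; ++l)
        acc += A[i * k + l] * B[l * m + j];
      C[i * m + j] += acc;  // beta = 1: accumulate into the existing C
    }
}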
static Tensor new_from_sequence(ScalarType scalarType, PyObject* data) {
  if (!PySequence_Check(data)) {
    throw TypeError("new(): data must be a sequence (got %s)", Py_TYPE(data)->tp_name);
  }
  if (THPUtils_checkString(data)) {
    throw TypeError("new(): invalid data type '%s'", Py_TYPE(data)->tp_name);
  }
#ifdef WITH_NUMPY
  if (PyArray_Check(data)) {
    return autograd::make_variable(tensor_from_numpy(data), false);
  }
#endif
  auto sizes = compute_sizes(data);
  auto tensor = autograd::make_variable(CPU(scalarType).tensor(sizes), false);
  recursive_store(
      (char*)tensor.data_ptr(), tensor.sizes(), tensor.strides(), 0,
      scalarType, tensor.type().elementSizeInBytes(), data);
  return tensor;
}
/**
 * Select histogram block i, store to ihist.
 *
 * @param ihist histogram i
 * @param i     histogram block index
 * @param j     histogram component index (select only a component);
 *              -1 to select complete block
 * @return O_K if ok, NOT_EXEC if not executed
 */
INT16 CGEN_PUBLIC CHistogram::SelectBlock(CData* ihist, INT32 i, INT32 j) {
  INT32 l, rl;

  if (CheckHisto() != O_K) return (NOT_EXEC);
  if (ihist == NULL) return (NOT_EXEC);
  if (i < 0 || i >= m_nhist) return (NOT_EXEC);

  rl = BytesPerBlock();
  data_reset(ihist);

  if (j < 0) {
    data_scopy(m_hist, ihist);
    data_arr_alloc(ihist, m_bins);
    copy_data_descr(m_hist, ihist);
    set_data_nblock(ihist, 1);
    dl_memmove((char*)data_ptr(ihist), (char*)xaddr(m_hist, i * m_bins, 0), rl);
    return (O_K);
  }

  if (j > -1 && j < data_dim(m_hist)) {
    comp_mdef(ihist, 1, T_DOUBLE);
    data_arr_alloc(ihist, m_bins);
    copy_comp_text(m_hist, j, ihist, 0, 1);
    copy_data_descr(m_hist, ihist);
    set_data_nblock(ihist, 1);
    for (l = 0; l < m_bins; l++)
      dstore(dfetch(m_hist, i * m_bins + l, j), ihist, l, 0);
    return (O_K);
  }

  return (NOT_EXEC);
}
void DataChannelMPI::gather(std::vector<at::Tensor>& output, at::Tensor& input,
                            rank_type dst_rank, THDGroup group_id) {
  const auto& group_pair = _groups.at(group_id);
  const auto& comm = group_pair.first;
  if (comm == MPI_COMM_NULL)
    return;

  at::Tensor recv_buffer;
  void* recvbuf = nullptr;
  if (_rank != dst_rank) {
    if (output.size() > 0)
      throw std::logic_error("gather: number of output tensors should be 0 for non-root");
  } else {
    if (output.size() != group_pair.second.size())
      throw std::logic_error("gather: number of output tensors and group size does not match");

    for (auto out_tensor : output)
      assertSameSizeAndType(out_tensor, input, "gather");

    recv_buffer = _newLikeFlat(output);
    recvbuf = recv_buffer.data_ptr();
  }

  rank_type group_dst_rank = group_pair.second.mustGetGroupRank(dst_rank);
  auto contig_input = input.contiguous();

  MPI_Gather(
    contig_input.data_ptr(), input.numel(), mpi_datatype.at(input.type().scalarType()),
    recvbuf, input.numel(), mpi_datatype.at(input.type().scalarType()),
    group_dst_rank, comm);

  // NOTE: this is a no-op in all processes except dst_rank
  for (size_t i = 0; i < output.size(); ++i)
    output[i].copy_(recv_buffer[i]);
}
const unsigned char* tissuestack::imaging::DicomFileWrapper::getData() {
  // the dicom image
  std::unique_ptr<DicomImage> dcmTmp(
      new DicomImage(this->_file_name.c_str(), CIF_AcrNemaCompatibility));

  unsigned long data_size = dcmTmp->getOutputDataSize(
      (this->getAllocatedBits() <= 8 || this->isColor()) ? 8 : 16);
  unsigned long image_size = this->getWidth() * this->getHeight();
  if (data_size == 0)
    data_size = image_size * (this->getAllocatedBits() / 2);
  if (image_size == 0)
    image_size = data_size;

  // we converge on an RGB format, 8 bit per channel, which is what our raw format is
  const unsigned long finished_image_size = image_size * 3;
  std::unique_ptr<unsigned char[]> data_ptr(new unsigned char[finished_image_size]);

  double min = pow(2, this->getAllocatedBits()) - 1;
  double max = 0;
  dcmTmp->getMinMaxValues(min, max);
  int two_power8 = static_cast<int>(pow(2, 8));
  int two_power_16 = static_cast<int>(pow(2, 16));

  if (this->isColor()) // COLOR
  {
    if (dcmTmp->getOutputData(data_ptr.get(), data_size, 8, 0, 0) <= 0)
      return nullptr;
    return data_ptr.release();
  }

  if (this->getAllocatedBits() <= 8) // 8 BIT
  {
    if (this->containsSignedData()) // SIGNED
    {
      std::unique_ptr<char[]> tmp_ptr(new char[image_size]);
      if (dcmTmp->getOutputData(tmp_ptr.get(), data_size, 8, 0, 0) <= 0)
        return nullptr;

      // convert to unsigned
      min = pow(2, this->getAllocatedBits()) - 1;
      max = 0;
      for (unsigned long i = 0; i < image_size; i++) {
        const unsigned long rgbOffset = i * 3;
        const unsigned char val =
            static_cast<unsigned char>(static_cast<int>(tmp_ptr.get()[i]) + (two_power8 / 2));
        data_ptr.get()[rgbOffset] = data_ptr.get()[rgbOffset + 1] =
            data_ptr.get()[rgbOffset + 2] = val;
        if (val < min) min = val;
        if (val > max) max = val;
      }
      // now that we have min and max ... fit to contrast range linearly
      for (unsigned long i = 0; i < finished_image_size; i++)
        data_ptr.get()[i] = static_cast<unsigned char>(
            ((static_cast<double>(data_ptr.get()[i]) - min) * (two_power8 - 1)) / (max - min));
      // return
      return data_ptr.release();
    }

    // UNSIGNED
    std::unique_ptr<unsigned char[]> tmp_ptr(new unsigned char[image_size]);
    if (dcmTmp->getOutputData(tmp_ptr.get(), data_size, 8, 0, 0) <= 0)
      return nullptr;

    // make RGB data
    for (unsigned long i = 0; i < image_size; i++) {
      const unsigned long rgbOffset = i * 3;
      data_ptr.get()[rgbOffset] = data_ptr.get()[rgbOffset + 1] =
          data_ptr.get()[rgbOffset + 2] = tmp_ptr.get()[i];
    }
    return data_ptr.release();
  }

  if (this->getAllocatedBits() > 8) // 12/16 BITS
  {
    std::unique_ptr<unsigned short[]> tmp_16bit_ptr(new unsigned short[finished_image_size]);
    if (this->containsSignedData()) // SIGNED
    {
      std::unique_ptr<short[]> tmp_ptr(new short[image_size]);
      if (dcmTmp->getOutputData(tmp_ptr.get(), data_size, 16, 0, 0) <= 0)
        return nullptr;

      // convert to unsigned
      min = pow(2, this->getAllocatedBits()) - 1;
      max = 0;
      for (unsigned long i = 0; i < image_size; i++) {
        const unsigned long rgbOffset = i * 3;
        const unsigned short val =
            static_cast<unsigned short>(static_cast<int>(tmp_ptr.get()[i]) + (two_power_16 / 2));
        tmp_16bit_ptr.get()[rgbOffset] = tmp_16bit_ptr.get()[rgbOffset + 1] =
            tmp_16bit_ptr.get()[rgbOffset + 2] = val;
        if (val < min) min = val;
        if (val > max) max = val;
      }
      // now that we have min and max ... fit to contrast range linearly to make it 8 bit
      for (unsigned long i = 0; i < finished_image_size; i++)
        data_ptr.get()[i] = static_cast<unsigned char>(
            ((static_cast<double>(tmp_16bit_ptr.get()[i]) - min) * (two_power8 - 1)) / (max - min));
      return data_ptr.release();
    }

    // UNSIGNED
    if (dcmTmp->getOutputData(tmp_16bit_ptr.get(), data_size, 16, 0, 0) <= 0)
      return nullptr;

    // turn 16 bit into 8 bit
    for (unsigned long i = 0; i < image_size; i++) {
      const unsigned long rgbOffset = i * 3;
      data_ptr.get()[rgbOffset] = data_ptr.get()[rgbOffset + 1] =
          data_ptr.get()[rgbOffset + 2] = static_cast<unsigned char>(
              (static_cast<double>(tmp_16bit_ptr.get()[i]) * (two_power8 - 1)) / (two_power_16 - 1));
    }
  }
  return data_ptr.release();
}
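Both signed branches above end with the same linear contrast stretch: every sample is mapped from the observed [min, max] range onto the full 8-bit range. A tiny standalone sketch of that mapping (the function name and the divide-by-zero guard are illustrative additions, not part of the original code):

#include <algorithm>
#include <cstdint>
#include <vector>

// Linearly rescale samples from [min_val, max_val] to [0, 255].
std::vector<std::uint8_t> stretch_to_8bit(const std::vector<double>& samples,
                                          double min_val, double max_val) {
  std::vector<std::uint8_t> out(samples.size());
  const double range = std::max(max_val - min_val, 1e-12);  // guard against a flat image
  for (std::size_t i = 0; i < samples.size(); ++i)
    out[i] = static_cast<std::uint8_t>((samples[i] - min_val) * 255.0 / range);
  return out;
}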
at::Tensor mkldnn_convolution(
    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
    IntList padding, IntList stride, IntList dilation) {
  auto output = input.type().tensor(conv_output_size(
      input.sizes(), weight.sizes(), padding, stride, dilation));

  auto cpu_engine = CpuEngine::Instance().get_engine();

  int32_t n = input.size(0);
  int32_t ic = input.size(1);
  int32_t ih = input.size(2);
  int32_t iw = input.size(3);

  int32_t oc = output.size(1);
  int32_t oh = output.size(2);
  int32_t ow = output.size(3);

  int32_t kh = weight.size(2);
  int32_t kw = weight.size(3);

  int32_t sh = stride[0];
  int32_t sw = stride[1];
  int32_t ph = padding[0];
  int32_t pw = padding[1];

  auto data_t = memory::data_type::f32;
  auto format_any = memory::format::any;
  auto format_nchw = memory::format::nchw;
  auto format_oihw = memory::format::oihw;
  auto format_x = memory::format::x;

  memory::dims input_tz = {n, ic, ih, iw};
  memory::dims weight_tz = {oc, ic, kh, kw};
  memory::dims bias_tz = {oc};
  memory::dims output_tz = {n, oc, oh, ow};
  memory::dims _stride = {sh, sw};
  memory::dims _padding = {ph, pw};

  auto input_md = memory::desc({input_tz}, data_t, format_any);
  auto weight_md = memory::desc({weight_tz}, data_t, format_any);
  auto bias_md = memory::desc({bias_tz}, data_t, format_any);
  auto output_md = memory::desc({output_tz}, data_t, format_any);

  std::shared_ptr<convolution_forward::desc> conv_forward_desc;
  if (bias.defined()) {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, bias_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
  conv_forward_pd.reset(new convolution_forward::primitive_desc(
      *conv_forward_desc, cpu_engine));

  auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
      input.data_ptr());
  auto weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
      weight.data_ptr());
  auto output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
      output.data_ptr());

  std::vector<primitive> net;

  auto input_pd = conv_forward_pd->src_primitive_desc();
  auto input_memory = input_usr_memory;
  if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
    input_memory = memory(input_pd);
    net.push_back(reorder(input_usr_memory, input_memory));
  }

  auto weight_pd = conv_forward_pd->weights_primitive_desc();
  auto weight_memory = weight_usr_memory;
  if (weight_usr_memory.get_primitive_desc() != memory::primitive_desc(weight_pd)) {
    weight_memory = memory(weight_pd);
    net.push_back(reorder(weight_usr_memory, weight_memory));
  }

  auto output_pd = conv_forward_pd->dst_primitive_desc();
  auto output_memory = output_usr_memory;
  if (output_usr_memory.get_primitive_desc() != memory::primitive_desc(output_pd)) {
    output_memory = memory(output_pd);
  }

  std::shared_ptr<convolution_forward> conv_forward;
  std::shared_ptr<memory> bias_usr_memory;
  if (bias.defined()) {
    bias_usr_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
        bias.data_ptr()));
    conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
        weight_memory, *bias_usr_memory, output_memory));
  } else {
    conv_forward.reset(new convolution_forward(*conv_forward_pd, input_memory,
        weight_memory, output_memory));
  }
  net.push_back(*conv_forward);

  if (output_memory != output_usr_memory) {
    net.push_back(reorder(output_memory, output_usr_memory));
  }

  Stream::Instance().get_stream().submit(net);

  return output;
}
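conv_output_size itself is not shown in this snippet; for reference, the conventional output-extent arithmetic for a strided, padded, dilated convolution is sketched below (conv_out_extent is an illustrative helper written from the standard formula, not the function's actual source):

#include <cstdint>

// Standard convolution output extent for one spatial dimension:
// out = (in + 2*pad - effective_kernel) / stride + 1, where the effective
// kernel extent with dilation d is d*(k-1)+1.
int64_t conv_out_extent(int64_t in, int64_t kernel, int64_t pad,
                        int64_t stride, int64_t dilation) {
  const int64_t effective_kernel = dilation * (kernel - 1) + 1;
  return (in + 2 * pad - effective_kernel) / stride + 1;
}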
// NB: CuDNN only implements the backward algorithm for batchnorm
// in training mode (evaluation mode batchnorm has a different algorithm),
// which is why this doesn't accept a 'training' parameter.
std::tuple<Tensor, Tensor, Tensor> cudnn_batch_norm_backward(
    const Tensor& input_t, const Tensor& grad_output_t, const Tensor& weight_t,
    // Unused: but we require them to be passed so that double backwards
    // has access
    const Tensor& running_mean, const Tensor& running_var,
    const Tensor& save_mean_t, const Tensor& save_var_t,
    double epsilon) {
  TensorArg input{ input_t, "input", 1 },
            grad_output{ grad_output_t, "grad_output", 2 },
            weight{ weight_t, "weight", 3 },
            save_mean{ save_mean_t, "save_mean", 4 },
            save_var{ save_var_t, "save_var", 5 };
  CheckedFrom c = "cudnn_batch_norm_backward";
  setCuDNNStreamToCurrent();

  checkAllDefined(c, {input, grad_output, weight, save_mean, save_var});
  checkAllSameGPU(c, {input, grad_output, weight, save_mean, save_var});
  if (input->type().scalarType() == ScalarType::Half) {
    checkScalarType(c, weight, ScalarType::Float);
  } else {
    checkAllSameType(c, {input, weight});
  }
  checkAllSameType(c, {input, grad_output});
  checkAllSameType(c, {weight, save_mean, save_var});
  // TODO: is weight required to be contiguous?
  checkAllContiguous(c, {input, grad_output, save_mean, save_var});
  checkDimRange(c, input, 2, 6 /* exclusive */);
  checkSameSize(c, input, grad_output);
  auto num_features = input->size(1);
  for (auto t : {weight, save_mean, save_var}) {
    checkNumel(c, t, num_features);
  }

  cudnnBatchNormMode_t mode;
  if (input->dim() == 2) {
    mode = CUDNN_BATCHNORM_PER_ACTIVATION;
  } else {
#if CUDNN_VERSION >= 7003
    mode = CUDNN_BATCHNORM_SPATIAL_PERSISTENT;
#else
    mode = CUDNN_BATCHNORM_SPATIAL;
#endif
  }

  auto grad_input_t = input->type().tensor(input->sizes());
  auto grad_weight_t = weight->type().tensor(weight->sizes());
  auto grad_bias_t = weight->type().tensor(weight->sizes());

  auto handle = getCudnnHandle();
  auto dataType = getCudnnDataType(*input);

  TensorDescriptor idesc{ *input, 4 };  // input, output, grad_output descriptor
  TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 };  // descriptor for weight, bias, save_mean, etc.

  Constant one(dataType, 1);
  Constant zero(dataType, 0);

  CUDNN_CHECK(cudnnBatchNormalizationBackward(
      handle, mode, &one, &zero, &one, &zero,
      idesc.desc(), input->data_ptr(),
      idesc.desc(), grad_output->data_ptr(),
      idesc.desc(), grad_input_t.data_ptr(),
      wdesc.desc(), weight->data_ptr(),
      grad_weight_t.data_ptr(), grad_bias_t.data_ptr(),
      epsilon, save_mean->data_ptr(), save_var->data_ptr()));

  return std::tuple<Tensor, Tensor, Tensor>{grad_input_t, grad_weight_t, grad_bias_t};
}
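For reference, the training-mode gradients that a batchnorm backward pass produces per channel can be written out directly. A naive single-channel C++ sketch of those standard formulas, purely illustrative and not the cuDNN implementation (in particular, it takes the plain batch mean/variance as inputs rather than cuDNN's saved buffers):

#include <cmath>
#include <vector>

// Training-mode batchnorm backward for one channel with N elements.
// x: inputs, dy: upstream gradients, mean/var: (biased) batch statistics from the forward pass,
// gamma: the channel's weight. Fills dgamma/dbeta and returns dx.
std::vector<double> batchnorm_backward_1ch(const std::vector<double>& x,
                                           const std::vector<double>& dy,
                                           double mean, double var, double eps,
                                           double gamma,
                                           double& dgamma, double& dbeta) {
  const double n = static_cast<double>(x.size());
  const double istd = 1.0 / std::sqrt(var + eps);
  dgamma = 0.0;
  dbeta = 0.0;
  for (std::size_t i = 0; i < x.size(); ++i) {
    const double xhat = (x[i] - mean) * istd;  // normalized input
    dbeta += dy[i];                            // grad_bias  = sum(dy)
    dgamma += dy[i] * xhat;                    // grad_weight = sum(dy * xhat)
  }
  std::vector<double> dx(x.size());
  for (std::size_t i = 0; i < x.size(); ++i) {
    const double xhat = (x[i] - mean) * istd;
    // grad_input accounts for dy flowing through xhat, the batch mean, and the batch variance:
    dx[i] = gamma * istd * (dy[i] - dbeta / n - xhat * dgamma / n);
  }
  return dx;
}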
void * pic_data(pic_state *PIC_UNUSED(pic), pic_value data) { return data_ptr(pic, data)->data; }
/**
 * Update histogram data set by data records from x
 *
 * Uses the following fields of class CVh:
 *   hist      - histogram data (must be double, may be empty)
 *   minp      - minima-maxima (must be double)
 *   icomp     - index component (may be -1, if not index in data x)
 *   nind      - index number (only for initialization required)
 *   m_bins    - histogram resolution (only for initialization required)
 *   indexlist - index list of vectors
 *
 * If the index list cannot be created, an index-independent histogram is made:
 *   - no label, no index comp. or no label file
 *   - label, but no ltab
 *
 * mode - 1 constant data interval (minm->dim == 1, minm->nvec == 1)
 *        2 component spec. quantization (minm->nvec == 2)
 *        3 index - comp. spec. quantization (minm->nvec > 2)
 *
 * @param x  Feature vector sequence
 * @param wv Weighting vector sequence (may be NULL)
 * @return O_K if ok or NOT_EXEC if not executed
 */
INT16 CGEN_PUBLIC CHistogram::UpdateHist(CData* x, CData* wv) {
  INT32 t, il, wdim, T;
  FLOAT64 cw;
  FLOAT64 *xp, *wvp, *p_mm;
  INT16 lflag, flag_w, mflag;
  INT32* qx;
  INT16 ierr;

  IDENTIFY("CVh::UpdateHist");

  /* Check input data */
  if (data_empty(x) == TRUE) return IERROR(this, HIS_NOINP, 0, 0, 0);
  if (data_ndim(x) == 0) return IERROR(this, HIS_NODAT, 0, 0, 0);
  T = data_nrec(x);
  m_count = 0;

  /* Check index list data */
  lflag = 0;
  if (data_empty(m_indexlist) != TRUE) lflag = 1;

  /* Use minp data */
  SetupHmode();
  if (m_hmode <= 0) return NOT_EXEC;
  p_mm = (FLOAT64*)data_ptr(m_minmax);

  /* Prepare histogram array */
  Prepare(x, wv);
  if ((ierr = CheckConsist()) != O_K) return NOT_EXEC;

  /* Check weight data */
  flag_w = 0;
  wvp = NULL;
  wdim = data_ndim(wv);
  if (data_empty(wv) != TRUE && lflag == 0) {
    if (wdim < m_nhist) return (NOT_EXEC);
    flag_w = 1;
    wvp = (FLOAT64*)dl_calloc(wdim, sizeof(FLOAT64), "UpdateHist");
  }
  mflag = 0;
  if (data_empty(wv) != TRUE && lflag == 1) {
    if (wdim == 1) mflag = 1;
  }
  if (flag_w == 1 || mflag == 1)
    if (data_nrec(wv) < T) {
      T = data_nrec(wv);
      printf("histogram update: fewer weights than data ... only %d records", T);
    }

  /* Aux. arrays */
  qx = (INT32*)dl_calloc(m_hdim + 2, sizeof(INT32), "UpdateHist");
  xp = (FLOAT64*)dl_calloc(m_hdim + 2, sizeof(FLOAT64), "UpdateHist");

  /* Hard update, index driven */
  cw = 1.;
  if (flag_w == 0) {
    il = 0;
    for (t = 0; t < T; t++) {
      dvec_fetch(xp, x, t, m_hdim, m_icomp);
      if (lflag != 0) il = (INT32)dfetch(m_indexlist, t, 0);
      if (il > -1 && il < m_nhist) {
        m_count++;
        if (mflag == 1) cw = dfetch(wv, t, 0);
        QuantVec(xp, qx, p_mm, il);
        IncrementHisto(qx, cw, il);
      }
    }
  }

  /* Soft updating weighted by w-array */
  if (flag_w == 1) {
    for (t = 0; t < T; t++) {
      dvec_fetch(wvp, wv, t, wdim, -1);
      dvec_fetch(xp, x, t, m_hdim, m_icomp);
      for (il = 0; il < m_nhist; il++) {
        m_count++;
        QuantVec(xp, qx, p_mm, il);
        IncrementHisto(qx, wvp[il], il);
      }
    }
  }

  /* Ending activities */
  dl_free(qx);
  dl_free(xp);
  if (flag_w == 1) dl_free(wvp);
  return O_K;
}
double stfnum::lmFit( const Vector_double& data, double dt,
                      const stfnum::storedFunc& fitFunc, const Vector_double& opts,
                      bool use_scaling,
                      Vector_double& p, std::string& info, int& warning )
{
    // Basic range checking:
    if (fitFunc.pInfo.size() != p.size()) {
        std::string msg("Error in stfnum::lmFit()\n"
                        "function parameters (p_fit) and parameters entered (p) have different sizes");
        throw std::runtime_error(msg);
    }
    if ( opts.size() != 6 ) {
        std::string msg("Error in stfnum::lmFit()\n"
                        "wrong number of options");
        throw std::runtime_error(msg);
    }

    bool constrained = false;
    std::vector< double > constrains_lm_lb( fitFunc.pInfo.size() );
    std::vector< double > constrains_lm_ub( fitFunc.pInfo.size() );

    bool can_scale = use_scaling;

    for ( unsigned n_p=0; n_p < fitFunc.pInfo.size(); ++n_p ) {
        if ( fitFunc.pInfo[n_p].constrained ) {
            constrained = true;
            constrains_lm_lb[n_p] = fitFunc.pInfo[n_p].constr_lb;
            constrains_lm_ub[n_p] = fitFunc.pInfo[n_p].constr_ub;
        } else {
            constrains_lm_lb[n_p] = -DBL_MAX;
            constrains_lm_ub[n_p] = DBL_MAX;
        }
        if ( can_scale ) {
            if (fitFunc.pInfo[n_p].scale == stfnum::noscale) {
                can_scale = false;
            }
        }
    }

    // Store the functions at global scope:
    saveFunc(fitFunc.func);
    saveJac(fitFunc.jac);

    double info_id[LM_INFO_SZ];
    Vector_double data_ptr(data);
    Vector_double xyscale(4);
    if (can_scale) {
        xyscale = get_scale(data_ptr, dt);
    }

    // The parameters need to be separated into two parts:
    // Those that are to be fitted and those that the client wants
    // to keep constant. Since there is no native support to
    // do so in Lourakis' routines, the workaround is a little
    // tricky, making (ab)use of the *void pointer:

    // number of parameters that need to be fitted:
    int n_fitted = 0;
    for ( unsigned n_p=0; n_p < fitFunc.pInfo.size(); ++n_p ) {
        n_fitted += fitFunc.pInfo[n_p].toFit;
    }

    // parameters that need to be fitted:
    Vector_double p_toFit(n_fitted);
    std::deque<bool> p_fit_bool( fitFunc.pInfo.size() );

    // parameters that are held constant:
    Vector_double p_const( fitFunc.pInfo.size()-n_fitted );
    for ( unsigned n_p=0, n_c=0, n_f=0; n_p < fitFunc.pInfo.size(); ++n_p ) {
        if (fitFunc.pInfo[n_p].toFit) {
            p_toFit[n_f++] = p[n_p];
            if (can_scale) {
                p_toFit[n_f-1] = fitFunc.pInfo[n_p].scale(p_toFit[n_f-1], xyscale[0],
                                                          xyscale[1], xyscale[2], xyscale[3]);
            }
        } else {
            p_const[n_c++] = p[n_p];
            if (can_scale) {
                p_const[n_c-1] = fitFunc.pInfo[n_p].scale(p_const[n_c-1], xyscale[0],
                                                          xyscale[1], xyscale[2], xyscale[3]);
            }
        }
        p_fit_bool[n_p] = fitFunc.pInfo[n_p].toFit;
    }

    // size * dt_new = 1 -> dt_new = 1.0/size
    double dt_finfo = dt;
    if (can_scale)
        dt_finfo = 1.0/data_ptr.size();

    fitInfo fInfo( p_fit_bool, p_const, dt_finfo );

    // make l-value of opts:
    Vector_double opts_l(5);
    for (std::size_t n=0; n < 4; ++n) opts_l[n] = opts[n];
    opts_l[4] = -1e-6;

    int it = 0;
    if (p_toFit.size() != 0 && data_ptr.size() != 0) {
        double old_info_id[LM_INFO_SZ];

        // initialize with initial parameter guess:
        Vector_double old_p_toFit(p_toFit);

#ifdef _DEBUG
        std::ostringstream optsMsg;
        optsMsg << "\nopts: ";
        for (std::size_t n_p=0; n_p < opts.size(); ++n_p) optsMsg << opts[n_p] << "\t";
        optsMsg << "\n" << "data_ptr[" << data_ptr.size()-1 << "]=" << data_ptr[data_ptr.size()-1] << "\n";
        optsMsg << "constrains_lm_lb: ";
        for (std::size_t n_p=0; n_p < constrains_lm_lb.size(); ++n_p) optsMsg << constrains_lm_lb[n_p] << "\t";
        optsMsg << "\n" << "constrains_lm_ub: ";
        for (std::size_t n_p=0; n_p < constrains_lm_ub.size(); ++n_p) optsMsg << constrains_lm_ub[n_p] << "\t";
        optsMsg << "\n\n";
        std::cout << optsMsg.str();
#endif

        while ( 1 ) {
#ifdef _DEBUG
            std::ostringstream paramMsg;
            paramMsg << "Pass: " << it << "\t";
            paramMsg << "p_toFit: ";
            for (std::size_t n_p=0; n_p < p_toFit.size(); ++n_p) paramMsg << p_toFit[n_p] << "\t";
            paramMsg << "\n";
            std::cout << paramMsg.str().c_str();
#endif
            if ( !fitFunc.hasJac ) {
                if ( !constrained ) {
                    dlevmar_dif( c_func_lour, &p_toFit[0], &data_ptr[0], n_fitted,
                                 (int)data.size(), (int)opts[4], &opts_l[0], info_id,
                                 NULL, NULL, &fInfo );
                } else {
                    dlevmar_bc_dif( c_func_lour, &p_toFit[0], &data_ptr[0], n_fitted,
                                    (int)data.size(), &constrains_lm_lb[0], &constrains_lm_ub[0],
                                    NULL, (int)opts[4], &opts_l[0], info_id,
                                    NULL, NULL, &fInfo );
                }
            } else {
                if ( !constrained ) {
                    dlevmar_der( c_func_lour, c_jac_lour, &p_toFit[0], &data_ptr[0], n_fitted,
                                 (int)data.size(), (int)opts[4], &opts_l[0], info_id,
                                 NULL, NULL, &fInfo );
                } else {
                    dlevmar_bc_der( c_func_lour, c_jac_lour, &p_toFit[0], &data_ptr[0], n_fitted,
                                    (int)data.size(), &constrains_lm_lb[0], &constrains_lm_ub[0],
                                    NULL, (int)opts[4], &opts_l[0], info_id,
                                    NULL, NULL, &fInfo );
                }
            }
            it++;
            if ( info_id[1] != info_id[1] ) {
                // restore previous parameters if new chisqr is NaN:
                p_toFit = old_p_toFit;
            } else {
                double dchisqr = (info_id[0] - info_id[1]) / info_id[1]; // (old chisqr - new chisqr) / new_chisqr

                if ( dchisqr < 0 ) {
                    // restore previous results and exit if new chisqr is larger:
                    for ( int n_i = 0; n_i < LM_INFO_SZ; ++n_i ) info_id[n_i] = old_info_id[n_i];
                    p_toFit = old_p_toFit;
                    break;
                }
                if ( dchisqr < 1e-5 ) {
                    // Keep current results and exit if change in chisqr is below threshold
                    break;
                }
                // otherwise, store results and continue iterating:
                for ( int n_i = 0; n_i < LM_INFO_SZ; ++n_i ) old_info_id[n_i] = info_id[n_i];
                old_p_toFit = p_toFit;
            }
            if ( it >= opts[5] )
                // Exit if maximal number of iterations is reached
                break;
            // decrease initial step size for next iteration:
            opts_l[0] *= 1e-4;
        }
    } else {
        std::runtime_error e("Array of size zero in lmFit");
        throw e;
    }

    // copy back the fitted parameters to p:
    for ( unsigned n_p=0, n_f=0, n_c=0; n_p < fitFunc.pInfo.size(); ++n_p ) {
        if (fitFunc.pInfo[n_p].toFit) {
            p[n_p] = p_toFit[n_f++];
        } else {
            p[n_p] = p_const[n_c++];
        }
        if (can_scale) {
            p[n_p] = fitFunc.pInfo[n_p].unscale(p[n_p], xyscale[0],
                                                xyscale[1], xyscale[2], xyscale[3]);
        }
    }

    std::ostringstream str_info;
    str_info << "Passes: " << it;
    // levmar reports the iteration count of the last pass in info[5]
    // and the reason for terminating in info[6]:
    str_info << "\nIterations during last pass: " << info_id[5];
    str_info << "\nStopping reason during last pass:";
    switch ((int)info_id[6]) {
    case 1:
        str_info << "\nStopped by small gradient of squared error.";
        warning = 0;
        break;
    case 2:
        str_info << "\nStopped by small rel. parameter change.";
        warning = 0;
        break;
    case 3:
        str_info << "\nReached max. number of iterations. Restart\n"
                 << "with smarter initial parameters and / or with\n"
                 << "increased initial scaling factor and / or with\n"
                 << "increased max. number of iterations.";
        warning = 3;
        break;
    case 4:
        str_info << "\nSingular matrix. Restart from current parameters\n"
                 << "with increased initial scaling factor.";
        warning = 4;
        break;
    case 5:
        str_info << "\nNo further error reduction is possible.\n"
                 << "Restart with increased initial scaling factor.";
        warning = 5;
        break;
    case 6:
        str_info << "\nStopped by small squared error.";
        warning = 0;
        break;
    case 7:
        str_info << "\nStopped by invalid (i.e. NaN or Inf) \"func\" values.\n";
        str_info << "This is a user error.";
        warning = 7;
        break;
    default:
        str_info << "\nUnknown reason for stopping the fit.";
        warning = -1;
    }
    if (use_scaling && !can_scale) {
        str_info << "\nCouldn't use scaling because one or more "
                 << "of the parameters don't allow it.";
    }
    info = str_info.str();
    return info_id[1];
}
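The outer while loop above implements a restart policy around the levmar solver: keep re-running the fit while the relative chi-square improvement stays above 1e-5, and back out when the error grows or turns NaN. A compact standalone sketch of that policy with the solver abstracted behind a callback (run_one_pass is a placeholder, not part of stimfit or levmar, and the step-size shrinking between passes is omitted):

#include <cmath>
#include <functional>

// One "pass" returns the new chi-square for the current parameters (which it may update in place).
// Restart passes until the relative improvement drops below `tol`, the error grows,
// or `max_passes` is reached. Returns the number of passes performed.
int iterate_until_converged(const std::function<double()>& run_one_pass,
                            double initial_chisqr, double tol, int max_passes) {
    double old_chisqr = initial_chisqr;
    int passes = 0;
    while (passes < max_passes) {
        const double new_chisqr = run_one_pass();
        ++passes;
        if (!std::isnan(new_chisqr)) {
            const double rel_improvement = (old_chisqr - new_chisqr) / new_chisqr;
            if (rel_improvement < 0.0 || rel_improvement < tol)
                break;               // worse result, or improvement below threshold
            old_chisqr = new_chisqr; // accept and keep iterating
        }
        // on NaN: the code above restores the previous parameters and tries another pass
    }
    return passes;
}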
const float* data_ptr(const Ndarray* a, int x) { return data_ptr((Ndarray*) a, x); }
std::tuple<at::Tensor, at::Tensor> mkldnn_convolution_backward_weights(
    IntList weight_size, const at::Tensor& grad_output, const at::Tensor& input,
    IntList padding, IntList stride, IntList dilation, bool bias_defined) {
  auto grad_weight = grad_output.type().tensor(weight_size);

  Tensor grad_bias;
  if (bias_defined) {
    grad_bias = grad_output.type().tensor({grad_output.size(1)});
  }

  auto cpu_engine = CpuEngine::Instance().get_engine();

  int32_t n = input.size(0);
  int32_t ic = input.size(1);
  int32_t ih = input.size(2);
  int32_t iw = input.size(3);

  int32_t oc = grad_output.size(1);
  int32_t oh = grad_output.size(2);
  int32_t ow = grad_output.size(3);

  int32_t kh = grad_weight.size(2);
  int32_t kw = grad_weight.size(3);

  int32_t sh = stride[0];
  int32_t sw = stride[1];
  int32_t ph = padding[0];
  int32_t pw = padding[1];

  auto data_t = memory::data_type::f32;
  auto format_any = memory::format::any;
  auto format_nchw = memory::format::nchw;
  auto format_oihw = memory::format::oihw;
  auto format_x = memory::format::x;

  memory::dims input_tz = {n, ic, ih, iw};
  memory::dims weight_tz = {oc, ic, kh, kw};
  memory::dims bias_tz = {oc};
  memory::dims output_tz = {n, oc, oh, ow};
  memory::dims _stride = {sh, sw};
  memory::dims _padding = {ph, pw};

  memory::desc input_md({input_tz}, data_t, format_any);
  memory::desc weight_md({weight_tz}, data_t, format_any);
  memory::desc bias_md({bias_tz}, data_t, format_any);
  memory::desc output_md({output_tz}, data_t, format_any);

  // need to re-create conv_forward_pd to feed conv_backward_weight_pd
  std::shared_ptr<convolution_forward::desc> conv_forward_desc;
  if (bias_defined) {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, bias_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_forward_desc.reset(new convolution_forward::desc(prop_kind::forward,
        convolution_direct, input_md, weight_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_forward::primitive_desc> conv_forward_pd;
  conv_forward_pd.reset(new convolution_forward::primitive_desc(
      *conv_forward_desc, cpu_engine));

  std::shared_ptr<convolution_backward_weights::desc> conv_backward_weight_desc;
  if (bias_defined) {
    conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
        convolution_direct, input_md, weight_md, bias_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  } else {
    conv_backward_weight_desc.reset(new convolution_backward_weights::desc(
        convolution_direct, input_md, weight_md, output_md,
        _stride, _padding, _padding, padding_kind::zero));
  }

  std::shared_ptr<convolution_backward_weights::primitive_desc> conv_backward_weight_pd;
  conv_backward_weight_pd.reset(new convolution_backward_weights::primitive_desc(
      *conv_backward_weight_desc, cpu_engine, *conv_forward_pd));

  auto input_usr_memory = memory({{{input_tz}, data_t, format_nchw}, cpu_engine},
      input.data_ptr());
  auto grad_output_usr_memory = memory({{{output_tz}, data_t, format_nchw}, cpu_engine},
      grad_output.data_ptr());
  auto grad_weight_usr_memory = memory({{{weight_tz}, data_t, format_oihw}, cpu_engine},
      grad_weight.data_ptr());
  std::shared_ptr<memory> grad_bias_memory;

  std::vector<primitive> net;

  auto input_pd = conv_backward_weight_pd->src_primitive_desc();
  auto input_memory = input_usr_memory;
  if (input_usr_memory.get_primitive_desc() != memory::primitive_desc(input_pd)) {
    input_memory = memory(input_pd);
    net.push_back(reorder(input_usr_memory, input_memory));
  }

  auto grad_output_pd = conv_backward_weight_pd->diff_dst_primitive_desc();
  auto grad_output_memory = grad_output_usr_memory;
  if (grad_output_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_output_pd)) {
    grad_output_memory = memory(grad_output_pd);
    net.push_back(reorder(grad_output_usr_memory, grad_output_memory));
  }

  auto grad_weight_pd = conv_backward_weight_pd->diff_weights_primitive_desc();
  auto grad_weight_memory = grad_weight_usr_memory;
  if (grad_weight_usr_memory.get_primitive_desc() != memory::primitive_desc(grad_weight_pd)) {
    grad_weight_memory = memory(grad_weight_pd);
  }

  std::shared_ptr<convolution_backward_weights> conv_backward_weight;
  if (bias_defined) {
    grad_bias_memory.reset(new memory({{{bias_tz}, data_t, format_x}, cpu_engine},
        grad_bias.data_ptr()));
    conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
        input_memory, grad_output_memory, grad_weight_memory, *grad_bias_memory));
  } else {
    conv_backward_weight.reset(new convolution_backward_weights(*conv_backward_weight_pd,
        input_memory, grad_output_memory, grad_weight_memory));
  }
  net.push_back(*conv_backward_weight);

  if (grad_weight_memory != grad_weight_usr_memory) {
    net.push_back(reorder(grad_weight_memory, grad_weight_usr_memory));
  }

  Stream::Instance().get_stream().submit(net);

  return std::tuple<at::Tensor, at::Tensor>{grad_weight, grad_bias};
}