// Makes sure a host buffer backs this SyncedMemory.
//
// Only the UNINITIALIZED state requires work: a buffer of size_ bytes is
// obtained through CaffeMallocHost, cleared to zero, and the object is marked
// as owning it with the head at the CPU. Every other state is left untouched.
inline void SyncedMemory::to_cpu() {
  if (head_ != UNINITIALIZED) {
    // HEAD_AT_CPU (and any other state) needs no action here.
    return;
  }

  CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
  caffe_memset(size_, 0, cpu_ptr_);
  own_cpu_data_ = true;
  head_ = HEAD_AT_CPU;
}
// Returns the host-side pointer of this SyncedMemory.
//
// - Empty object (size_ == 0): defers to to_cpu().
// - Non-empty object whose head is still UNINITIALIZED: the size_ bytes at
//   cpu_ptr_ are cleared to zero before the pointer is handed out.
// - Otherwise the pointer is returned as is.
//
// NOTE(review): the non-empty path neither allocates cpu_ptr_ nor changes
// head_, so it presumably relies on the buffer being provided elsewhere, and
// it re-zeroes the buffer on every call while head_ stays UNINITIALIZED.
// Confirm against the rest of the class.
void* SyncedMemory::cpu_data() {
  if (size_ != 0) {
    if (head_ == UNINITIALIZED) {
      caffe_memset(size_, 0, cpu_ptr_);
    }
    return (void*)cpu_ptr_;
  }

  to_cpu();
  return (void*)cpu_ptr_;
}
void BatchTripletLossLayer<Dtype>::Backward_cpu(
    const vector<Blob<Dtype>*>& top, const vector<bool>& propagate_down,
    const vector<Blob<Dtype>*>& bottom) {
  // CPU backward pass.
  //
  // Builds a num x num coefficient matrix in aggregator_ from the stored
  // triplets_ and pos_pairs_, then writes the feature gradient as
  //   bottom[0].diff = coefficients * bottom[0].data
  // (caffe_cpu_gemm with M = num, N = dim, K = num).
  //
  // Backpropagating to bottom[1] (the labels) is a fatal error.
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (!propagate_down[0]) {
    return;
  }

  Blob<Dtype>* features = bottom[0];
  const Dtype* feat_values = features->cpu_data();
  Dtype* feat_grad = features->mutable_cpu_diff();
  const int total = features->count();
  const int num = features->num();
  const int dim = total / num;

  // Clear the whole num x num coefficient matrix (row-major, `num` per row).
  const int row_bytes = num * sizeof(Dtype);
  Dtype* coeff = static_cast<Dtype*>(aggregator_->mutable_cpu_data());
  caffe_memset(num * row_bytes, 0, coeff);

  // Triplet term, weighted by mu_ and averaged over the number of triplets.
  const Dtype triplet_w = Dtype(2) / triplets_.size() * mu_;
  for (size_t t = 0; t < triplets_.size(); ++t) {
    const int q = triplets_[t].first_;
    const int p = triplets_[t].second_;
    const int n = triplets_[t].third_;
    Dtype* row_q = coeff + q * num;
    Dtype* row_p = coeff + p * num;
    Dtype* row_n = coeff + n * num;

    row_q[n] += triplet_w;
    row_q[p] -= triplet_w;

    row_p[p] += triplet_w;
    row_p[q] -= triplet_w;

    row_n[q] += triplet_w;
    row_n[n] -= triplet_w;
  }

  // Positive-pair term, weighted by (1 - mu_) and averaged over the pairs.
  const Dtype pair_w = Dtype(2) / pos_pairs_.size() * (Dtype(1) - mu_);
  for (size_t k = 0; k < pos_pairs_.size(); ++k) {
    const int q = pos_pairs_[k].first;
    const int p = pos_pairs_[k].second;
    Dtype* row_q = coeff + q * num;
    Dtype* row_p = coeff + p * num;

    row_q[q] += pair_w;
    row_q[p] -= pair_w;

    row_p[p] += pair_w;
    row_p[q] -= pair_w;
  }

  caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, num, Dtype(1), coeff,
                 feat_values, Dtype(0), feat_grad);
}
void RDMABuffer::Write(bool data) { struct ibv_sge list; list.addr = (uint64_t) addr_; list.length = size_; list.lkey = self_->lkey; struct ibv_send_wr wr; caffe_memset(sizeof(wr), 0, &wr); wr.wr_id = (uint64_t) this; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM; wr.send_flags = IBV_SEND_SIGNALED; wr.imm_data = id_; if (!data) { // ctrl signal wr.imm_data += CTRL_ID_OFFSET; } wr.wr.rdma.remote_addr = (uint64_t) peer_->addr; wr.wr.rdma.rkey = peer_->rkey; struct ibv_send_wr *bad_wr; // lock the channel since there may be multiple threads calling write() boost::mutex::scoped_lock lock(channel_->mutex_); CHECK(!ibv_post_send(channel_->qp_, &wr, &bad_wr)) << "Failed to post send"; // TODO poll only every N writes to improve performance for (;;) { ibv_wc wc; int ne = ibv_poll_cq(channel_->write_cq_, 1, &wc); CHECK_GE(ne, 0); if (ne) { CHECK(wc.wr_id == (uint64_t)this) << "Oops. Polled a Work Completion belongs to a different buffer"; break; } } }
void RDMAChannel::RecvMR(int id) { memory_regions_[id] = new ibv_mr(); // Map the memory region itself so that it can be received ibv_mr* init = ibv_reg_mr(adapter_.pd_, memory_regions_[id], sizeof(ibv_mr), IBV_ACCESS_LOCAL_WRITE); region_regions_[id] = init; struct ibv_sge list; list.addr = (uint64_t) memory_regions_[id]; list.length = sizeof(ibv_mr); list.lkey = init->lkey; struct ibv_recv_wr wr; caffe_memset(sizeof(wr), 0, &wr); wr.wr_id = (uint64_t) this; wr.sg_list = &list; wr.num_sge = 1; struct ibv_recv_wr* bad_wr; CHECK(!ibv_post_recv(qp_, &wr, &bad_wr)); }
// Opens an IPv4 TCP socket and stores its descriptor in fd_.
//
// host:   name or address to resolve; an empty string is passed to
//         getaddrinfo() as NULL.
// port:   TCP port number.
// listen: when true the socket is bound (with SO_REUSEADDR) and put into
//         listening mode; when false it is connected to host:port.
//
// Every address returned by getaddrinfo() is tried in order until one can be
// bound/connected. Fails fatally (CHECK) if name resolution fails, if no
// address works, or if listen() fails.
Socket::Socket(const string& host, int port, bool listen) {
  addrinfo* res = NULL;
  addrinfo hints;
  caffe_memset(sizeof(addrinfo), 0, &hints);
  if (listen) {
    hints.ai_flags = AI_PASSIVE;
  }
  hints.ai_family = AF_INET;
  hints.ai_socktype = SOCK_STREAM;

  string p = boost::lexical_cast<string>(port);
  const char* server = host.size() ? host.c_str() : NULL;
  // getaddrinfo() returns 0 on success and a non-zero EAI_* code on failure.
  // The sign of those codes is platform-specific, so only zero is accepted;
  // on failure `res` must not be walked or freed.
  const int rc = getaddrinfo(server, p.c_str(), &hints, &res);
  CHECK_EQ(rc, 0) << gai_strerror(rc) << " for " << host << ":" << port;

  fd_ = -1;
  for (addrinfo* t = res; t; t = t->ai_next) {
    fd_ = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
    if (fd_ < 0) {
      continue;
    }
    if (listen) {
      // Best effort: a failure to set SO_REUSEADDR does not prevent binding.
      int reuse = 1;
      setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof reuse);
      if (!bind(fd_, t->ai_addr, t->ai_addrlen)) {
        break;
      }
    } else {
      if (!connect(fd_, t->ai_addr, t->ai_addrlen)) {
        break;
      }
    }
    // This candidate did not work; release it and try the next one.
    close(fd_);
    fd_ = -1;
  }
  freeaddrinfo(res);

  string verb(listen ? "listen" : "connect");
  CHECK_GE(fd_, 0) << "Could not " << verb << " to " << host << ":" << port;

  if (listen) {
    // `listen` the parameter hides the function, hence the global qualifier.
    CHECK_EQ(::listen(fd_, 1), 0) << "Could not listen on port " << port;
    LOG(INFO)<< "Listening to port " << port;
  }
}
inline void SyncedMemory::to_cpu() { switch (head_) { case UNINITIALIZED: { CaffeMallocHost(&cpu_ptr_, size_); caffe_memset(size_, 0, cpu_ptr_); head_ = HEAD_AT_CPU; own_cpu_data_ = true; break; } case HEAD_AT_GPU: { #ifndef CPU_ONLY if (cpu_ptr_ == nullptr) { CaffeMallocHost(&cpu_ptr_, size_); own_cpu_data_ = true; } if (device_context_->backend() == Backend::BACKEND_CUDA) { #ifdef USE_CUDA caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_); #endif // USE_CUDA } else { #ifdef USE_GREENTEA viennacl::ocl::context ctx = viennacl::ocl::get_context( device_context_->id()); greentea_gpu_memcpy(size_, (cl_mem) gpu_ptr_, 0, cpu_ptr_, &ctx); ctx.get_queue().finish(); #endif } head_ = SYNCED; #else NO_GPU; #endif // !CPU_ONLY break; } case HEAD_AT_CPU: case SYNCED: break; } }
void FlowWarpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  // CPU backward pass of the bilinear flow warp.
  //
  // bottom[0] is the source image, bottom[1] the flow (x displacement in
  // channel 0, y displacement in channel 1) and top[0] the warped result.
  // For every output pixel (x, y) whose sample position (x + fx, y + fy)
  // lies inside the image:
  //   * the incoming gradient is scattered to the four neighbouring source
  //     pixels with the bilinear weights (image gradient);
  //   * the flow gradient is the incoming gradient times the horizontal /
  //     vertical difference of the interpolated source image.
  // Pixels that sample outside the image contribute no gradient.
  //
  // Gradients of inputs with propagate_down[i] == false are zeroed.
  const int width = top[0]->width();
  const int height = top[0]->height();
  const int channels = top[0]->channels();
  const int num = top[0]->num();
  const int wh_size = width * height;
  const int whc_size = width * height * channels;

  const Dtype* warped_diff = top[0]->cpu_diff();        // dest
  const Dtype* image_data = bottom[0]->cpu_data();      // source image
  Dtype* image_diff = bottom[0]->mutable_cpu_diff();    // source image
  const Dtype* flow_data = bottom[1]->cpu_data();       // source flow
  Dtype* flow_diff = bottom[1]->mutable_cpu_diff();     // source flow

  // The image gradient is accumulated with +=, so it has to start at zero.
  caffe_memset(sizeof(Dtype) * num * whc_size, 0, image_diff);
  // The flow gradient is only assigned for pixels that sample inside the
  // image; clear it first so skipped pixels end up with a zero gradient
  // instead of whatever the diff buffer held before this call.
  caffe_memset(bottom[1]->count() * sizeof(Dtype), 0, flow_diff);

  for (int n = 0; n < num; n++) {
    const int off = whc_size * n;          // start of sample n in the image
    const int flow_off = 2 * wh_size * n;  // start of sample n in the flow

    for (int x = 0; x < width; x++) {
      for (int y = 0; y < height; y++) {
        const int pix = y * width + x;

        float fx = flow_data[flow_off + pix];
        float fy = flow_data[flow_off + wh_size + pix];

        float x2 = float(x) + fx;
        float y2 = float(y) + fy;

        // Written as a negated conjunction so that NaN coordinates are
        // skipped as well.
        if (!(x2 >= 0 && y2 >= 0 && x2 < width && y2 < height)) {
          continue;
        }

        // Integer corners of the sampled cell, clamped at the far border.
        int ix2_L = int(x2);
        int iy2_T = int(y2);
        int ix2_R = min(ix2_L + 1, width - 1);
        int iy2_B = min(iy2_T + 1, height - 1);

        float alpha = x2 - ix2_L;
        float beta = y2 - iy2_T;

        const int idx_TL = iy2_T * width + ix2_L;
        const int idx_TR = iy2_T * width + ix2_R;
        const int idx_BL = iy2_B * width + ix2_L;
        const int idx_BR = iy2_B * width + ix2_R;

        // Image gradient: scatter with the bilinear weights.
        for (int c = 0; c < channels; c++) {
          const int ch = off + c * wh_size;
          float warped_diff_value = warped_diff[ch + pix];
          image_diff[ch + idx_TL] += warped_diff_value * (1-alpha)*(1-beta);
          image_diff[ch + idx_TR] += warped_diff_value * alpha*(1-beta);
          image_diff[ch + idx_BL] += warped_diff_value * (1-alpha)*beta;
          image_diff[ch + idx_BR] += warped_diff_value * alpha*beta;
        }

        // Flow gradient, x component: horizontal image difference blended
        // between the top and bottom rows.
        float gamma = iy2_B - y2;
        float bot_diff = 0;
        for (int c = 0; c < channels; c++) {
          const int ch = off + c * wh_size;
          float temp = 0;
          temp += gamma * (image_data[ch + idx_TR] - image_data[ch + idx_TL]);
          temp += (1-gamma) *
                  (image_data[ch + idx_BR] - image_data[ch + idx_BL]);
          bot_diff += warped_diff[ch + pix] * temp;
        }
        flow_diff[flow_off + pix] = bot_diff;

        // Flow gradient, y component: vertical image difference blended
        // between the left and right columns.
        gamma = ix2_R - x2;
        bot_diff = 0;
        for (int c = 0; c < channels; c++) {
          const int ch = off + c * wh_size;
          float temp = 0;
          temp += gamma * (image_data[ch + idx_BL] - image_data[ch + idx_TL]);
          temp += (1-gamma) *
                  (image_data[ch + idx_BR] - image_data[ch + idx_TR]);
          bot_diff += warped_diff[ch + pix] * temp;
        }
        flow_diff[flow_off + wh_size + pix] = bot_diff;
      }
    }
  }

  if (!propagate_down[0]) {
    caffe_memset(bottom[0]->count() * sizeof(Dtype), 0, image_diff);
  }
  if (!propagate_down[1]) {
    caffe_memset(bottom[1]->count() * sizeof(Dtype), 0, flow_diff);
  }
}