Esempio n. 1
0
// Ensures a host-side buffer is available for this memory object.
//
// If nothing has been allocated so far (head_ == UNINITIALIZED), a host
// buffer is obtained through CaffeMallocHost, cleared with caffe_memset,
// flagged as owned by this object, and the head is moved to the CPU.
// For HEAD_AT_CPU (or any other state) nothing is done.
inline void SyncedMemory::to_cpu() {
  if (head_ != UNINITIALIZED) {
    // Host buffer already exists; no work required.
    return;
  }

  CaffeMallocHost(&cpu_ptr_, size_, &cpu_malloc_use_cuda_);
  caffe_memset(size_, 0, cpu_ptr_);
  own_cpu_data_ = true;
  head_ = HEAD_AT_CPU;
}
// Returns the host pointer backing this memory object.
//
// - When size_ is zero the request is forwarded to to_cpu().
// - Otherwise, if the object is still UNINITIALIZED, the existing host
//   buffer is cleared in place with caffe_memset.
//
// NOTE(review): the non-empty path never allocates cpu_ptr_ and never
// advances head_; presumably the buffer is provided elsewhere (constructor or
// an external setter) -- confirm against the rest of the class.
void* SyncedMemory::cpu_data() {
  const bool is_empty = (size_ == 0);
  if (is_empty) {
    to_cpu();
  } else if (head_ == UNINITIALIZED) {
    caffe_memset(size_, 0, cpu_ptr_);
  }
  return static_cast<void*>(cpu_ptr_);
}
// CPU backward pass of the batch triplet loss.
//
// The gradient with respect to the features (bottom[0]) is expressed as a
// matrix product  feat_diff = A * feat_data, where A is a num x num matrix of
// coefficients accumulated from the stored triplets (weighted by mu_) and the
// stored positive pairs (weighted by 1 - mu_). A is built in the scratch
// buffer held by aggregator_.
//
// top            unused here.
// propagate_down [0] enables the feature gradient; [1] (labels) must be
//                false, otherwise the layer aborts with LOG(FATAL).
// bottom         bottom[0] holds the features whose diff is overwritten.
//
// NOTE(review): the result is not scaled by the top diff (loss weight) --
// confirm this is intended.
void BatchTripletLossLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
    const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  if (propagate_down[1]) {
    LOG(FATAL) << this->type()
               << " Layer cannot backpropagate to label inputs.";
  }
  if (!propagate_down[0]) {
    return;
  }

  Blob<Dtype>* features = bottom[0];
  const Dtype* feature_values = features->cpu_data();
  Dtype* feature_grad = features->mutable_cpu_diff();
  const int total = features->count();
  const int num = features->num();
  const int dim = total / num;

  // Clear the num x num coefficient matrix before accumulating into it.
  const int row_bytes = num * sizeof(Dtype);
  Dtype* coeff = static_cast<Dtype*>(aggregator_->mutable_cpu_data());
  caffe_memset(num * row_bytes, 0, coeff);

  // Triplet term: each (query, positive, negative) triplet contributes to the
  // rows of all three samples involved.
  const Dtype triplet_w = Dtype(2) / triplets_.size() * mu_;
  for (size_t t = 0; t < triplets_.size(); ++t) {
    const int q = triplets_[t].first_;
    const int p = triplets_[t].second_;
    const int n = triplets_[t].third_;

    Dtype* q_row = coeff + q * num;
    Dtype* p_row = coeff + p * num;
    Dtype* n_row = coeff + n * num;

    q_row[n] += triplet_w;
    q_row[p] -= triplet_w;

    p_row[p] += triplet_w;
    p_row[q] -= triplet_w;

    n_row[q] += triplet_w;
    n_row[n] -= triplet_w;
  }

  // Pair term: pulls every (query, positive) pair together.
  const Dtype pair_w = Dtype(2) / pos_pairs_.size() * (Dtype(1) - mu_);
  for (size_t k = 0; k < pos_pairs_.size(); ++k) {
    const int q = pos_pairs_[k].first;
    const int p = pos_pairs_[k].second;

    Dtype* q_row = coeff + q * num;
    Dtype* p_row = coeff + p * num;

    q_row[q] += pair_w;
    q_row[p] -= pair_w;

    p_row[p] += pair_w;
    p_row[q] -= pair_w;
  }

  // feature_grad (num x dim) = coeff (num x num) * feature_values (num x dim)
  caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, num, dim, num,
      Dtype(1), coeff, feature_values, Dtype(0), feature_grad);
}
Esempio n. 4
0
void RDMABuffer::Write(bool data) {
  struct ibv_sge list;
  list.addr = (uint64_t) addr_;
  list.length = size_;
  list.lkey = self_->lkey;

  struct ibv_send_wr wr;
  caffe_memset(sizeof(wr), 0, &wr);
  wr.wr_id = (uint64_t) this;
  wr.sg_list = &list;
  wr.num_sge = 1;
  wr.opcode = IBV_WR_RDMA_WRITE_WITH_IMM;
  wr.send_flags = IBV_SEND_SIGNALED;
  wr.imm_data = id_;
  if (!data) {
  // ctrl signal
    wr.imm_data += CTRL_ID_OFFSET;
  }

  wr.wr.rdma.remote_addr = (uint64_t) peer_->addr;
  wr.wr.rdma.rkey = peer_->rkey;

  struct ibv_send_wr *bad_wr;

  // lock the channel since there may be multiple threads calling write()
  boost::mutex::scoped_lock lock(channel_->mutex_);
  CHECK(!ibv_post_send(channel_->qp_, &wr, &bad_wr)) << "Failed to post send";

  // TODO poll only every N writes to improve performance
  for (;;) {
    ibv_wc wc;
    int ne = ibv_poll_cq(channel_->write_cq_, 1, &wc);
    CHECK_GE(ne, 0);
    if (ne) {
      CHECK(wc.wr_id == (uint64_t)this) << "Oops. Polled a Work Completion belongs to a different buffer";
      break;
    }
  }
}
Esempio n. 5
0
// Posts a receive that will be filled with a memory-region descriptor
// (ibv_mr) and stores the result in slot `id`.
//
// A fresh ibv_mr is allocated as the landing area, that landing area is
// itself registered so it can be written into, and a receive work request
// pointing at it is posted on qp_.
//
// id  index into memory_regions_ / region_regions_ to populate.
//
// Aborts (via CHECK) if the registration or the post fails.
void RDMAChannel::RecvMR(int id) {
  memory_regions_[id] = new ibv_mr();

  // Map the memory region itself so that it can be received
  ibv_mr* init = ibv_reg_mr(adapter_.pd_, memory_regions_[id], sizeof(ibv_mr),
                            IBV_ACCESS_LOCAL_WRITE);
  // ibv_reg_mr() returns NULL on failure; fail loudly instead of
  // dereferencing a null pointer when reading init->lkey below.
  CHECK(init != NULL) << "Failed to register memory region";
  region_regions_[id] = init;

  struct ibv_sge list;
  list.addr = (uint64_t) memory_regions_[id];
  list.length = sizeof(ibv_mr);
  list.lkey = init->lkey;

  struct ibv_recv_wr wr;
  caffe_memset(sizeof(wr), 0, &wr);
  wr.wr_id = (uint64_t) this;
  wr.sg_list = &list;
  wr.num_sge = 1;

  struct ibv_recv_wr* bad_wr;
  CHECK(!ibv_post_recv(qp_, &wr, &bad_wr));
}
Esempio n. 6
0
// Opens a TCP/IPv4 socket.
//
// host    name or address to resolve; an empty string is passed to
//         getaddrinfo() as NULL.
// port    TCP port number.
// listen  true  -> bind to the resolved address (with SO_REUSEADDR) and start
//                  listening with a backlog of 1;
//         false -> connect to the resolved address.
//
// Every address returned by getaddrinfo() is tried in order until one
// succeeds. Aborts (via CHECK) when resolution fails or when no address could
// be bound / connected. On success fd_ holds the open descriptor.
Socket::Socket(const string& host, int port, bool listen) {
  addrinfo* res = NULL;
  addrinfo hints;
  caffe_memset(sizeof(addrinfo), 0, &hints);
  if (listen) {
    hints.ai_flags = AI_PASSIVE;
  }
  hints.ai_family = AF_INET;
  hints.ai_socktype = SOCK_STREAM;
  string p = boost::lexical_cast<string>(port);
  const char* server = host.size() ? host.c_str() : NULL;
  const int rc = getaddrinfo(server, p.c_str(), &hints, &res);
  // getaddrinfo() returns 0 on success and a non-zero EAI_* code on failure.
  // The sign of those codes differs between platforms, so only an exact
  // comparison with 0 reliably detects errors (and protects `res` below).
  CHECK_EQ(rc, 0) << gai_strerror(rc) << " for " << host << ":" << port;
  fd_ = -1;
  for (addrinfo* t = res; t; t = t->ai_next) {
    fd_ = socket(t->ai_family, t->ai_socktype, t->ai_protocol);
    if (fd_ >= 0) {
      if (listen) {
        int reuse = 1;
        setsockopt(fd_, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof reuse);
        if (!bind(fd_, t->ai_addr, t->ai_addrlen))
          break;
      } else {
        if (!connect(fd_, t->ai_addr, t->ai_addrlen))
          break;
      }
      // This candidate did not work; release it and try the next address.
      close(fd_);
      fd_ = -1;
    }
  }
  freeaddrinfo(res);
  string verb(listen ? "listen" : "connect");
  CHECK_GE(fd_, 0) << "Could not " << verb << " to " << host << ":" << port;
  if (listen) {
    LOG(INFO)<< "Listening to port " << port;
    ::listen(fd_, 1);
  }
}
Esempio n. 7
0
// Makes the host copy of this memory object usable.
//
// UNINITIALIZED : allocate a host buffer, clear it, take ownership and move
//                 the head to the CPU.
// HEAD_AT_GPU   : (GPU builds) allocate the host buffer if it does not exist
//                 yet, copy the device contents into it using the backend
//                 selected by device_context_, then mark the state SYNCED.
//                 In CPU_ONLY builds this state triggers NO_GPU.
// HEAD_AT_CPU / SYNCED : nothing to do.
inline void SyncedMemory::to_cpu() {
  if (head_ == UNINITIALIZED) {
    CaffeMallocHost(&cpu_ptr_, size_);
    caffe_memset(size_, 0, cpu_ptr_);
    own_cpu_data_ = true;
    head_ = HEAD_AT_CPU;
    return;
  }

  if (head_ != HEAD_AT_GPU) {
    // Host data is already current.
    return;
  }

#ifndef CPU_ONLY
  if (cpu_ptr_ == nullptr) {
    CaffeMallocHost(&cpu_ptr_, size_);
    own_cpu_data_ = true;
  }

  const bool on_cuda = (device_context_->backend() == Backend::BACKEND_CUDA);
  if (on_cuda) {
#ifdef USE_CUDA
    caffe_gpu_memcpy(size_, gpu_ptr_, cpu_ptr_);
#endif  // USE_CUDA
  } else {
#ifdef USE_GREENTEA
    viennacl::ocl::context ocl_ctx = viennacl::ocl::get_context(
        device_context_->id());
    greentea_gpu_memcpy(size_, (cl_mem) gpu_ptr_, 0, cpu_ptr_, &ocl_ctx);
    // Wait for the queued transfer to finish before declaring the copy done.
    ocl_ctx.get_queue().finish();
#endif  // USE_GREENTEA
  }
  head_ = SYNCED;
#else
  NO_GPU;
#endif  // !CPU_ONLY
}
Esempio n. 8
0
// CPU backward pass of the flow-warp layer.
//
// For every output pixel (x, y) the flow (fx, fy) read from bottom[1] gives
// the source position (x + fx, y + fy) in bottom[0]. When that position lies
// inside the image:
//   * the incoming gradient is scattered onto the four neighbouring source
//     pixels using the bilinear weights (image gradient), and
//   * the gradient with respect to both flow components is computed from the
//     differences of the neighbouring source pixels (flow gradient).
// Pixels whose source position lies outside the image contribute nothing, so
// both diffs are cleared up front; otherwise their flow gradient would keep
// whatever value was left in the buffer by an earlier pass.
//
// top            top[0] supplies the shape and the incoming gradient.
// propagate_down when [i] is false the corresponding bottom diff is zeroed.
// bottom         bottom[0] = source image, bottom[1] = flow (the two flow
//                components are read as consecutive planes per sample).
void FlowWarpLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom)
{
    const int width = top[0]->width();
    const int height = top[0]->height();
    const int channels = top[0]->channels();
    const int num = top[0]->num();
    const int wh_size = width * height;
    const int whc_size = width * height * channels;

    const Dtype* warped_diff = top[0]->cpu_diff(); // dest
    const Dtype* image_data = bottom[0]->cpu_data(); // source image
    Dtype* image_diff = bottom[0]->mutable_cpu_diff(); // source image
    const Dtype* flow_data = bottom[1]->cpu_data(); // source flow
    Dtype* flow_diff = bottom[1]->mutable_cpu_diff(); // source flow

    // image_diff is accumulated into, flow_diff is only written for in-bounds
    // pixels: both have to start from zero.
    caffe_memset(bottom[0]->count()*sizeof(Dtype), 0, image_diff);
    caffe_memset(bottom[1]->count()*sizeof(Dtype), 0, flow_diff);

    for(int n=0; n<num; n++)
    {
        const int off = whc_size * n;          // start of sample n in the image
        const int flow_off = 2 * wh_size * n;  // start of sample n in the flow
        for(int x=0; x<width; x++)
            for(int y=0; y<height; y++)
            {
                const int pix = y*width + x;
                const int flow_u = flow_off + pix;            // horizontal flow
                const int flow_v = flow_off + wh_size + pix;  // vertical flow

                float fx = flow_data[flow_u];
                float fy = flow_data[flow_v];

                float x2 = float(x) + fx;
                float y2 = float(y) + fy;

                // Source position outside the image: no gradient.
                if(!(x2>=0 && y2>=0 && x2<width && y2<height))
                    continue;

                // Integer neighbours (left/right, top/bottom), clamped to the
                // last column/row.
                const int ix2_L = int(x2);
                const int iy2_T = int(y2);
                const int ix2_R = min(ix2_L+1, width-1);
                const int iy2_B = min(iy2_T+1, height-1);

                const float alpha = x2-ix2_L;
                const float beta = y2-iy2_T;

                // Offsets of the four neighbours inside one channel plane.
                const int idx_TL = iy2_T*width + ix2_L;
                const int idx_TR = iy2_T*width + ix2_R;
                const int idx_BL = iy2_B*width + ix2_L;
                const int idx_BR = iy2_B*width + ix2_R;

                // Image gradient: scatter with the bilinear weights.
                for(int c=0; c<channels; c++)
                {
                    const int plane = off + c*wh_size;
                    float warped_diff_value = warped_diff[plane + pix];
                    image_diff[plane + idx_TL] += warped_diff_value * (1-alpha)*(1-beta);
                    image_diff[plane + idx_TR] += warped_diff_value * alpha*(1-beta);
                    image_diff[plane + idx_BL] += warped_diff_value * (1-alpha)*beta;
                    image_diff[plane + idx_BR] += warped_diff_value * alpha*beta;
                }

                // Gradient w.r.t. the horizontal flow: horizontal differences
                // of the source, blended between the top and bottom rows.
                float gamma = iy2_B - y2;
                float bot_diff = 0;
                for(int c=0; c<channels; c++)
                {
                    const int plane = off + c*wh_size;
                    float temp = 0;
                    temp += gamma *     (image_data[plane + idx_TR] - image_data[plane + idx_TL]);
                    temp += (1-gamma) * (image_data[plane + idx_BR] - image_data[plane + idx_BL]);

                    bot_diff += warped_diff[plane + pix] * temp;
                }
                flow_diff[flow_u] = bot_diff;

                // Gradient w.r.t. the vertical flow: vertical differences of
                // the source, blended between the left and right columns.
                gamma = ix2_R - x2;
                bot_diff = 0;
                for(int c=0; c<channels; c++)
                {
                    const int plane = off + c*wh_size;
                    float temp = 0;
                    temp += gamma *     (image_data[plane + idx_BL] - image_data[plane + idx_TL]);
                    temp += (1-gamma) * (image_data[plane + idx_BR] - image_data[plane + idx_TR]);

                    bot_diff += warped_diff[plane + pix] * temp;
                }
                flow_diff[flow_v] = bot_diff;
            }
    }

    // Inputs that do not need a gradient get an all-zero diff.
    if(!propagate_down[0]) caffe_memset(bottom[0]->count()*sizeof(Dtype), 0, image_diff);
    if(!propagate_down[1]) caffe_memset(bottom[1]->count()*sizeof(Dtype), 0, flow_diff);
}