Example #1
void enableProfiler(ProfilerState new_state) {
  TORCH_ASSERT(new_state != ProfilerState::Disabled);
#ifndef WITH_CUDA
  if (new_state == ProfilerState::NVTX)
    throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
#endif
  if (state != ProfilerState::Disabled && new_state != state) {
      throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
  }
  state = new_state;

#ifdef WITH_CUDA
  if (state == ProfilerState::CUDA) {
    // event recording appears to have some startup overhead, so we need
    // to generate some dummy events first before recording synchronization
    // events
    for (int i = 0; i < 5; i++) {
      onEachDevice([](int d) {
          mark("__cuda_startup");
          cudaDeviceSynchronize();
      });
    }

    // CUDA events must be on the same device to be compared, so we record
    // a start event for each GPU. We then use this event to synchronize
    // time on the GPU with the CPU clock.
    onEachDevice([](int d) {
        mark("__cuda_start_event");
    });
  }
#endif
  mark("__start_profile", false);
}
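
The onEachDevice helper used above is not shown. A minimal sketch of what such a helper might look like, built on the CUDA runtime API (the signature and the device-restore behavior here are assumptions for illustration, not the actual PyTorch implementation):

#include <cuda_runtime.h>
#include <functional>
#include <stdexcept>

// Hypothetical sketch: run `op` once on every visible CUDA device and
// restore the caller's current device afterwards.
static void onEachDevice(const std::function<void(int)>& op) {
  int prev_device = 0;
  if (cudaGetDevice(&prev_device) != cudaSuccess)
    throw std::runtime_error("cudaGetDevice failed");
  int num_devices = 0;
  if (cudaGetDeviceCount(&num_devices) != cudaSuccess)
    throw std::runtime_error("cudaGetDeviceCount failed");
  for (int d = 0; d < num_devices; d++) {
    cudaSetDevice(d);
    op(d);  // e.g. mark("__cuda_start_event"), as in enableProfiler
  }
  cudaSetDevice(prev_device);  // restore the original device
}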
Example #2
PyObject * THPStorage_(New)(THStorage *ptr)
{
  TORCH_ASSERT(ptr);
  PyTypeObject *type = (PyTypeObject *)THPStorageClass;
  PyObject *obj = type->tp_alloc(type, 0);
  if (obj) {
    ((THPStorage *)obj)->cdata = ptr;
  } else {
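    // allocation failed: release the storage so its reference doesn't leak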
    THStorage_(free)(LIBRARY_STATE ptr);
  }
  return obj;
}
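
The pattern here, allocate through tp_alloc, attach the native pointer on success, and release it on failure, is standard CPython extension practice. A self-contained sketch of the same pattern with hypothetical NativeHandle/release_handle names (not PyTorch API):

#include <Python.h>

// Hypothetical native resource and its destructor, for illustration only.
struct NativeHandle { int payload; };
static void release_handle(NativeHandle *h) { delete h; }

typedef struct {
  PyObject_HEAD
  NativeHandle *cdata;
} WrapperObject;

// Wrap a native handle in a Python object; if allocation fails, the
// handle is released so ownership never leaks.
static PyObject *Wrapper_New(PyTypeObject *type, NativeHandle *ptr)
{
  PyObject *obj = type->tp_alloc(type, 0);
  if (obj) {
    ((WrapperObject *)obj)->cdata = ptr;  // the Python object now owns ptr
  } else {
    release_handle(ptr);                  // avoid leaking on failure
  }
  return obj;
}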
Example #3
void InputBuffer::add(size_t pos, Variable var) {
  TORCH_ASSERT(pos < buffer.size());
  if (!var.defined()) {
    return;
  }
  auto& old_var = buffer[pos];
  if (!old_var.defined()) {
    buffer[pos] = std::move(var);
  } else {
    AutoGPU auto_gpu(var);
    // ATen doesn't route sparse additions correctly, so when the
    // accumulated value is sparse, the incoming variable goes first
    if (old_var.type().is_sparse()) {
      buffer[pos] = var + old_var;
    } else {
      buffer[pos] = old_var + var;
    }
  }
}
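
The accumulation logic, skip undefined inputs, move the first value into its slot, and add subsequent values, can be exercised in isolation. A standalone sketch with a hypothetical Value type standing in for a tensor (not the ATen API):

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Hypothetical tensor stand-in: default-constructed means "undefined".
struct Value {
  bool defined;
  double data;
  Value() : defined(false), data(0.0) {}
  explicit Value(double d) : defined(true), data(d) {}
};

struct SimpleInputBuffer {
  std::vector<Value> buffer;
  explicit SimpleInputBuffer(std::size_t size) : buffer(size) {}

  // Mirrors InputBuffer::add above: ignore undefined inputs, move the
  // first value in, and accumulate later ones by addition.
  void add(std::size_t pos, Value var) {
    assert(pos < buffer.size());
    if (!var.defined) return;
    auto& old_var = buffer[pos];
    if (!old_var.defined) {
      buffer[pos] = std::move(var);
    } else {
      buffer[pos] = Value(old_var.data + var.data);
    }
  }
};

// Usage: two gradients arriving at the same input slot are summed.
// SimpleInputBuffer buf(1);
// buf.add(0, Value(1.5));
// buf.add(0, Value(2.0));  // buf.buffer[0].data == 3.5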
Example #4
at::Type& get_default_tensor_type() {
  TORCH_ASSERT(default_tensor_type);
  return *default_tensor_type;
}
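
The getter assumes the global was initialized earlier. A minimal sketch of the surrounding state, with a hypothetical set_default_tensor_type (the real initialization in PyTorch may differ):

static at::Type* default_tensor_type = nullptr;

// Hypothetical setter paired with the getter above: install the type once
// at startup so later calls to get_default_tensor_type() succeed.
void set_default_tensor_type(at::Type& type) {
  default_tensor_type = &type;
}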