// Turn the profiler on in the requested mode.
// `new_state` must not be Disabled; use the corresponding disable path to stop
// profiling. Switching between kinds (e.g. NVTX -> CPU) while already running
// is rejected, because the two modes produce incompatible event streams.
// NOTE(review): mutation of the global `state` is unsynchronized here —
// presumably callers serialize calls (e.g. under the GIL); confirm.
void enableProfiler(ProfilerState new_state) {
  TORCH_ASSERT(new_state != ProfilerState::Disabled);
#ifndef WITH_CUDA
  // NVTX annotations need the CUDA toolkit; fail fast in CPU-only builds.
  if (new_state == ProfilerState::NVTX)
    throw std::runtime_error("Can't use NVTX profiler - PyTorch was compiled without CUDA");
#endif
  if (state != ProfilerState::Disabled && new_state != state) {
    throw std::runtime_error("can't change kind of profiling (e.g. NVTX to CPU) while profiler is running");
  }
  state = new_state;
#ifdef WITH_CUDA
  if(state == ProfilerState::CUDA) {
    // event recording appears to have some startup overhead, so we need to
    // generate some dummy events first before recording synchronization events
    for(int i = 0; i < 5; i++) {
      onEachDevice([](int d) {
        mark("__cuda_startup");
        cudaDeviceSynchronize();
      });
    }
    // cuda events must be on the same device, so we need a start event recorded
    // for each gpu. we then use this event to synchronize time on the GPU
    // with the CPU clock.
    onEachDevice([](int d) {
      mark("__cuda_start_event");
    });
  }
#endif
  // Baseline marker for the whole profile run.
  // NOTE(review): the second argument presumably suppresses per-device CUDA
  // event recording for this mark — confirm against mark()'s signature.
  mark("__start_profile", false);
}
// Wrap a THStorage in a newly allocated Python THPStorage object.
// Takes ownership of `ptr` (which must be non-NULL): on success the new
// Python object holds it in `cdata`; if tp_alloc fails, the storage
// reference is released via THStorage_(free) so it is not leaked.
// Returns a new Python reference, or NULL on allocation failure (tp_alloc
// is expected to have set the Python error in that case).
PyObject * THPStorage_(New)(THStorage *ptr)
{
  TORCH_ASSERT(ptr);
  PyTypeObject *type = (PyTypeObject *)THPStorageClass;
  PyObject *obj = type->tp_alloc(type, 0);
  if (obj) {
    ((THPStorage *)obj)->cdata = ptr;
  } else {
    // Allocation failed: drop the storage reference we were handed.
    THStorage_(free)(LIBRARY_STATE ptr);
  }
  return obj;
}
// Accumulate `var` into slot `pos` of the input buffer.
// An undefined incoming variable is a no-op. An empty slot simply takes
// ownership of the value by move; otherwise the incoming variable is added
// to the existing one (with an AutoGPU guard constructed from `var`).
void InputBuffer::add(size_t pos, Variable var) {
  TORCH_ASSERT(pos < buffer.size());
  if (!var.defined()) return;

  auto& existing = buffer[pos];
  if (!existing.defined()) {
    existing = std::move(var);
    return;
  }

  AutoGPU device_guard(var);
  // ATen doesn't route sparse additions correctly...
  // so keep the sparse operand on the right-hand side as a workaround.
  buffer[pos] = existing.type().is_sparse() ? var + existing
                                            : existing + var;
}
// Return the process-wide default tensor type.
// The assertion guards against use before a default has been installed
// (the `default_tensor_type` pointer would still be null).
at::Type& get_default_tensor_type() {
  TORCH_ASSERT(default_tensor_type);
  at::Type& type = *default_tensor_type;
  return type;
}