void MPICommunicatorImpl::Initialize(const std::vector<NDArrayViewPtr>& values) { assert(CPUDEVICE < 0); // just in case somebody decides to change CPUDEVICE macro. DeviceDescriptor lastGpuDevice = DeviceDescriptor::CPUDevice(); m_gpuDataTransferers.resize(values.size()); m_intermediateCPUBuffers.resize(values.size()); for (auto i = 0; i < values.size(); ++i) { auto view = values[i]; auto device = view->Device(); // Make sure none of the values are sparse - we currently do not support aggregation of sparse matrices if (view->GetStorageFormat() != StorageFormat::Dense) RuntimeError("Aggregation for sparse matrices is currently not supported!"); // TODO: device.Type should be called Kind. if (device.Type() != DeviceKind::GPU) { m_intermediateCPUBuffers[i] = Buffer(); m_gpuDataTransferers[i] = nullptr; } else { if (lastGpuDevice.Type() == DeviceKind::CPU) lastGpuDevice = device; else if (device.Id() != lastGpuDevice.Id()) // For the time being, assume all devices have the same id. LogicError("Not all values are on the same GPU device id"); auto requiredSize = GetBufferSize(view); m_gpuDataTransferers[i] = std::make_shared<GPUDataTransferer>(device.Id(), true); if (m_intermediateCPUBuffers[i].totalSize < requiredSize) m_intermediateCPUBuffers[i] = AllocateIntermediateBuffer(device.Id(), requiredSize); } } }
/// <summary> /// Shows how to use Clone() to share function parameters among multi evaluation threads. /// </summary> /// <description> /// It first creates a new function with parameters, then spawns multi threads. Each thread uses Clone() to create a new /// instance of function and then use this instance to do evaluation. /// All cloned functions share the same parameters. /// </description> void MultiThreadsEvaluationWithClone(const DeviceDescriptor& device, const int threadCount) { using namespace std::placeholders; const size_t inputDim = 937; const size_t numOutputClasses = 9304; const size_t numHiddenLayers = 6; const size_t hiddenLayersDim = 2048; auto inputVar = InputVariable({inputDim}, DataType::Float, L"features"); assert(numHiddenLayers >= 1); auto classifierRoot = SetupFullyConnectedDNNLayer(inputVar, hiddenLayersDim, device, std::bind(Sigmoid, _1, L"")); for (size_t i = 1; i < numHiddenLayers; ++i) { classifierRoot = SetupFullyConnectedDNNLayer(classifierRoot, hiddenLayersDim, device, std::bind(Sigmoid, _1, L"")); } auto outputTimesParam = Parameter(NDArrayView::RandomUniform<float>({numOutputClasses, hiddenLayersDim}, -0.5, 0.5, 1, device)); auto classifierFunc = Times(outputTimesParam, classifierRoot, L"classifierOutput"); // Now test the structure if (classifierFunc->Parameters().size() != ((numHiddenLayers * 2) + 1)) { throw std::runtime_error("MultiThreadsEvaluationWithClone: Function does not have expected Parameter count"); } OutputFunctionInfo(classifierFunc); fprintf(stderr, "MultiThreadsEvaluationWithClone on device=%d\n", device.Id()); // Run evaluation in parallel std::vector<std::thread> threadList(threadCount); for (int th = 0; th < threadCount; ++th) { threadList[th] = std::thread(RunEvaluationClassifier, classifierFunc->Clone(), device); } for (int th = 0; th < threadCount; ++th) { threadList[th].join(); fprintf(stderr, "thread %d joined.\n", th); fflush(stderr); } }
/// <summary> /// Shows how to use LoadLegacyModel() and Clone() to share function parameters among multi evaluation threads. /// </summary> /// <description> /// It first loads a model, then spawns multi threads. Each thread uses Clone() to create a new /// instance of function and then use this instance to do evaluation. /// All cloned functions share the same parameters. /// Note: It uses the model trained by Examples\Image\GettingStarted\01_OneHidden.cntk as example. Instructions /// to train the model is described in Examples\Image\GettingStarted\README.md. /// The pre-trained model file 01_OneHidden needs to be in the current directory. /// </description> void MultiThreadsEvaluationWithLoadModel(const DeviceDescriptor& device, const int threadCount) { // The model file will be trained and copied to the current runtime directory first. auto modelFuncPtr = CNTK::Function::LoadModel(DataType::Float, L"01_OneHidden", device); OutputFunctionInfo(modelFuncPtr); fprintf(stderr, "MultiThreadsEvaluationWithLoadModel on device=%d\n", device.Id()); // Run evaluation in parallel. std::vector<std::thread> threadList(threadCount); for (int th = 0; th < threadCount; ++th) { threadList[th] = std::thread(RunEvaluationOneHidden, modelFuncPtr->Clone(), device); } for (int th = 0; th < threadCount; ++th) { threadList[th].join(); fprintf(stderr, "thread %d joined.\n", th); fflush(stderr); } }