void TestReader(const ConfigParameters& configBase) { // int nonexistant = configBase("nonexistant"); // use to test global exception handler ConfigParameters config(configBase("mnistTest")); ConfigParameters readerConfig(config("reader")); readerConfig.Insert("traceLevel", config("traceLevel", "0")); size_t mbSize = config("minibatchSize"); size_t epochSize = config("epochSize", "0"); if (epochSize == 0) { epochSize = requestDataSize; } DataReader dataReader(readerConfig); // get names of features and labels std::vector<std::wstring> featureNames; std::vector<std::wstring> labelNames; GetFileConfigNames(readerConfig, featureNames, labelNames); // setup minibatch matrices int deviceId = 0; auto featuresMatrix = make_shared<Matrix<ElemType>>(deviceId); auto labelsMatrix = make_shared<Matrix<ElemType>>(deviceId); MBLayoutPtr pMBLayout = make_shared<MBLayout>(); StreamMinibatchInputs matrices; matrices.AddInput(featureNames[0], featuresMatrix, pMBLayout, TensorShape()); matrices.AddInput(labelNames[0], labelsMatrix , pMBLayout, TensorShape()); auto start = std::chrono::system_clock::now(); int epochs = config("maxEpochs"); epochs *= 2; for (int epoch = 0; epoch < epochs; epoch++) { dataReader.StartMinibatchLoop(mbSize, epoch, epochSize); int i = 0; while (dataReader.GetMinibatch(matrices)) { Matrix<ElemType>& features = matrices.GetInputMatrix<ElemType>(featureNames[0]); Matrix<ElemType>& labels = matrices.GetInputMatrix<ElemType>(labelNames[0]); if (labels.GetNumRows() == 0) { fprintf(stderr, "%4d: features dim: %lu x %lu - [%.8g, %.8g, ...]\n", i++, features.GetNumRows(), features.GetNumCols(), features(0, 0), features(0, 1)); } else { fprintf(stderr, "%4d: features dim: %lu x %lu - [%.8g, %.8g, ...] 
label dim: %lu x %lu - [%d, %d, ...]\n", i++, features.GetNumRows(), features.GetNumCols(), features(0, 0), features(0, 1), labels.GetNumRows(), labels.GetNumCols(), (int) labels(0, 0), (int) labels(0, 1)); } } } auto end = std::chrono::system_clock::now(); auto elapsed = end - start; fprintf(stderr, "%f seconds elapsed", (float) (std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count()) / 1000); }
void DoCommand(const ConfigParameters& configRoot) { ConfigArray command = configRoot("command", "train"); ConfigParameters config = configRoot(command[0]); ConfigParameters readerConfig(config("reader")); readerConfig.Insert("traceLevel", config("traceLevel", "0")); ConfigArray minibatchSize = config("minibatchSize", "256"); intargvector mbSizeArr = minibatchSize; size_t mbSize = 20000; // mbSizeArr[0]; size_t epochSize = config("epochSize", "0"); if (epochSize == 0) { epochSize = requestDataSize; } ConfigParameters configFeatures = readerConfig(L"features"); size_t dimFeatures = configFeatures("dim"); ConfigParameters configLabels = readerConfig(L"labels"); size_t dimLabels = configLabels("labelDim"); ConfigParameters configSgd = config("SGD"); std::wstring modelPath = configSgd("modelPath"); StreamMinibatchInputs inputMatrices; StreamMinibatchInputs outputMatrices; std::wstring inputName = L"features"; std::wstring outputName = L"CE.BFF.FF.P"; int deviceId = 0; auto matrix = make_shared<Matrix<ElemType>>(dimFeatures, mbSize, deviceId); MBLayoutPtr pMBLayout = make_shared<MBLayout>(); inputMatrices.AddInput(inputName, matrix, pMBLayout, TensorShape(dimFeatures)); outputMatrices.AddInput(outputName, make_shared<Matrix<ElemType>>(dimLabels, mbSize, deviceId), pMBLayout, TensorShape(dimLabels)); std::map<std::wstring, std::vector<ElemType>*> input; std::map<std::wstring, std::vector<ElemType>*> output; std::vector<ElemType>* arr = input[inputName] = new std::vector<ElemType>(dimFeatures * mbSize); output[outputName] = new std::vector<ElemType>(dimLabels * mbSize); Eval<ElemType> eval(config); auto dataReader = make_shared<DataReader>(readerConfig); eval.CreateNetwork(Microsoft::MSR::CNTK::ToLegacyString(Microsoft::MSR::CNTK::ToUTF8(modelPath))); dataReader->StartMinibatchLoop(mbSize, 0, inputMatrices.GetStreamDescriptions(), epochSize); eval.StartEvaluateMinibatchLoop(outputName); while (dataReader->GetMinibatch(inputMatrices)) { void* data = (void*) arr->data(); 
size_t dataSize = arr->size() * sizeof(ElemType); void* mat = &(*matrix)(0, 0); size_t matSize = matrix->GetNumElements() * sizeof(ElemType); memcpy_s(data, dataSize, mat, matSize); eval.Evaluate(input, output); } }
void TestSequenceReader(const ConfigParameters& configBase) { // int nonexistant = configBase("nonexistant"); // use to test global exception handler ConfigParameters config = configBase("sequenceTest"); size_t mbSize = config("minibatchSize"); size_t epochSize = config("epochSize", "0"); if (epochSize == 0) { epochSize = requestDataSize; } for (int fileType = 0; fileType < 2; ++fileType) { ConfigParameters readerConfig = config(fileType ? "readerSequence" : "readerSentence"); readerConfig.Insert("traceLevel", config("traceLevel", "0")); std::vector<std::wstring> featureNames; std::vector<std::wstring> labelNames; GetFileConfigNames(readerConfig, featureNames, labelNames); DataReader dataReader(readerConfig); // get names of features and labels std::vector<std::wstring> files; files.push_back(readerConfig(L"file")); // setup minibatch matrices auto featuresMatrix = make_shared<Matrix<ElemType>>(); auto labelsMatrix = make_shared<Matrix<ElemType>>(); StreamMinibatchInputs matrices; matrices.AddInputMatrix(featureNames[0], featuresMatrix); matrices.AddInputMatrix(labelNames[1] , labelsMatrix); auto start = std::chrono::system_clock::now(); int epochs = config("maxEpochs"); epochs *= 2; for (int epoch = 0; epoch < epochs; epoch++) { dataReader.StartMinibatchLoop(mbSize, epoch, epochSize); for (int i = 0; dataReader.GetMinibatch(matrices); i++) { auto& features = matrices.GetInputMatrix<ElemType>(featureNames[0]); auto& labels = matrices.GetInputMatrix<ElemType>(labelNames[1]); fprintf(stderr, "%4d: features dim: %lu x %lu - [%.8g, %.8g, ...] label dim: %d x %d - [%d, %d, ...]\n", i, features.GetNumRows(), features.GetNumCols(), features(0, 0), features(0, 1), labels.GetNumRows(), labels.GetNumCols(), (int) labels(0, 0), (int) labels(0, 1)); } } auto end = std::chrono::system_clock::now(); auto elapsed = end - start; fprintf(stderr, "%f seconds elapsed", (float) (std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count()) / 1000); } }
// Delivers the next minibatch into `matrices`.
//
// With prefetching enabled: waits for the pending asynchronous
// FillMatrices() call (starting one first if none is pending), returns false
// if it produced 0 samples, otherwise initializes m_pMBLayout in frame mode
// for that many samples, copies every buffered stream that `matrices` asks for
// into the corresponding input matrix, fills the optional "DSSMLabel" input,
// and kicks off the next asynchronous read.
//
// Returns true when a minibatch was delivered, false when the read returned 0.
//
// NOTE(review): when m_prefetchEnabled is false nothing is read or filled and
// the function still returns true — confirm that callers never hit that path.
bool LibSVMBinaryReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
{
#if DEBUG
    span minibatch_span(*reader_series, 1, L"Get Minibatch: %ld", cur_read);
#endif
    size_t actualMBSize = 0;
    if (m_prefetchEnabled)
    {
        if (!m_pendingAsyncGetMinibatch.valid())
        {
            // no read in flight yet: validate the matrices and start the first one
            CheckDataMatrices(matrices);
            m_pendingAsyncGetMinibatch = std::async(std::launch::async, [this]()
                                                    {
                                                        return m_dataInput->FillMatrices(m_dataMatrices);
                                                    });
        }
#if DEBUG
        reader_series->write_flag(_T("before get."));
#endif
        // blocks until the background read has finished
        actualMBSize = m_pendingAsyncGetMinibatch.get();
#if DEBUG
        reader_series->write_flag(_T("after get."));
#endif
        if (actualMBSize == 0)
        {
            return false;
        }

        m_pMBLayout->InitAsFrameMode(actualMBSize);
#if DEBUG
        reader_series->write_flag(_T("starting fill."));
#endif
        // iterate by const reference: copying each map entry per minibatch is unnecessary
        for (const auto& matrix : m_dataMatrices)
        {
            if (matrices.HasInput(matrix.first))
                matrix.second->Fill(&matrices.GetInputMatrix<ElemType>(matrix.first));
        }
#if DEBUG
        reader_series->write_flag(_T("done fill."));
#endif
        if (matrices.HasInput(L"DSSMLabel"))
            DoDSSMMatrix(matrices.GetInputMatrix<ElemType>(L"DSSMLabel"), actualMBSize);

        // start reading the next minibatch while the caller processes this one
        m_pendingAsyncGetMinibatch = std::async(std::launch::async, [this]()
                                                {
                                                    // CheckDataMatrices(matrices);
                                                    return m_dataInput->FillMatrices(m_dataMatrices);
                                                });
    }
#if DEBUG
    cur_read++;
#endif
    return true;
}
// Hands the prefetched minibatch to the network's input matrices.
//
// Waits for the in-flight prefetch task, copies each requested stream into its
// matrix and layout, and (unless the epoch ended) starts the next asynchronous
// ReadMinibatch(). Calls RuntimeError if an input name is unknown to the
// reader, or if two inputs share a layout but the reader produced different
// layouts for them.
//
// Returns true if the minibatch contained data; false if the epoch had already
// ended or the minibatch was empty.
bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
{
    // TODO: verify that the set of matrix names is identical
    // to the set of reader input names. Warn if it's a subset, throw
    // if it's a superset.
    if (m_endOfEpoch)
    {
        return false;
    }

    // Check that all matrices have the same device id.
    // If not we should inject the IMemoryProvider per stream.
    // Guarded so that an empty input set does not dereference begin().
    if (matrices.begin() != matrices.end())
    {
        int deviceId = matrices.begin()->second.matrix->GetDeviceId();
        for (const auto& mx : matrices)
            assert(mx.second.matrix->GetDeviceId() == deviceId), UNUSED(deviceId);
    }

    assert(m_prefetchTask.valid());

    Minibatch minibatch = m_prefetchTask.get();
    if (minibatch.m_endOfEpoch)
    {
        m_endOfEpoch = true;
        if (minibatch.m_data.empty())
        {
            return false;
        }
    }

    // Reset stale mb layouts.
    // BUGBUG: This seems incorrect. (1) layouts should all be updated below, and (2) some of these layouts are the same, we are resetting them twice.
    for (const auto& iter : matrices)
    {
        iter.second.pMBLayout->Init(1, 0);
    }

    // a map to generate error messages when checking layout constraints.
    map<wstring, wstring> layoutToInputMap;
    if (!minibatch.m_data.empty())
    {
        // TODO: Use alternating pinned buffer in the packer, do not copy anything, but pack into the pinned memory.
        // Copy returned minibatch to the matrices.
        for (const auto& mx : matrices)
        {
            // single lookup; the iterator is reused for the stream id below
            const auto streamIdIter = m_nameToStreamId.find(mx.first);
            if (streamIdIter == m_nameToStreamId.end())
            {
                string inputNames = EnumerateInputs(m_nameToStreamId);
                RuntimeError("Could not map input '%ls' to the reader. Reader outputs only [%s].",
                             mx.first.c_str(), inputNames.c_str());
            }

            size_t streamId = streamIdIter->second;

            const auto& stream = minibatch.m_data[streamId];
            m_numParallelSequences = stream->m_layout->GetNumParallelSequences();

            // This assert no longer holds - different inputs have different sequence lengths, resulting in different number
            // of parallel samples.
            // assert(m_numParallelSequences == minibatch.m_data.front()->m_layout->GetNumParallelSequences());

            auto& layout = mx.second.pMBLayout;

            if (layout->GetNumCols() == 0)
            {
                // layout is empty, copy layout info from the reader
                layout->CopyFrom(stream->m_layout, /*keepName*/ true);
                layoutToInputMap[layout->GetAxisName()] = mx.first;
            }
            else if (*layout != *stream->m_layout) // this does a deep value-level comparison
            {
                RuntimeError("Dynamic axis layout '%ls' is shared between inputs '%ls' and '%ls', but layouts generated "
                             "from the input data are incompatible on this axis. Are you using different sequence lengths? "
                             "Did you consider adding a DynamicAxis() to the Input nodes?",
                             layout->GetAxisName(),
                             layoutToInputMap[layout->GetAxisName()].c_str(),
                             mx.first.c_str());
            }

            size_t sampleSize = m_streams[streamId]->m_sampleLayout->GetNumElements();
            auto& matrix = matrices.GetInputMatrix<ElemType>(mx.first);
            FillMatrixFromStream(m_streams[streamId]->m_storageType, &matrix, sampleSize, stream);
        }
    }

    if (!m_endOfEpoch)
    {
        // Starting the prefetch task. There is always a single async read in flight.
        // When the network requests a new minibatch, we wait for the current async to finish,
        // return the result and kick off a new one.
        m_prefetchTask = std::async(m_launchType, [this]()
                                    {
                                        return m_reader->ReadMinibatch();
                                    });
    }

    return !minibatch.m_data.empty();
}
void DoCreateLabelMap(const ConfigParameters& config) { // this gets the section name we are interested in std::string section = config(L"section"); // get that section (probably a peer config section, which works thanks to heirarchal symbol resolution) ConfigParameters configSection(config(section)); ConfigParameters readerConfig(configSection("reader")); readerConfig.Insert("allowMapCreation", "true"); size_t minibatchSize = config(L"minibatchSize", "2048"); int traceLevel = config(L"traceLevel", "0"); std::vector<std::wstring> featureNames; std::vector<std::wstring> labelNames; GetFileConfigNames(readerConfig, featureNames, labelNames); // setup minibatch matrices auto featuresMatrix = make_shared<Matrix<ElemType>>(CPUDEVICE); auto labelsMatrix = make_shared<Matrix<ElemType>>(CPUDEVICE); StreamMinibatchInputs matrices; matrices.AddInputMatrix(featureNames[0], featuresMatrix); if (labelNames.size() == 0) RuntimeError("CreateLabelMap: no labels found to process"); // now create the reader and loop through the entire dataset to get all the labels auto start = std::chrono::system_clock::now(); for (const std::wstring& labelsName : labelNames) { // take the last label file defined (the other one might be input) matrices.AddInputMatrix(labelsName, labelsMatrix); // get the label mapping file name ConfigParameters labelConfig(readerConfig(labelsName)); std::string labelMappingFile; if (labelConfig.ExistsCurrent(L"labelMappingFile")) labelMappingFile = labelConfig(L"labelMappingFile"); else if (readerConfig.ExistsCurrent(L"labelMappingFile")) labelMappingFile = labelConfig(L"labelMappingFile"); else RuntimeError("CreateLabelMap: No labelMappingFile defined"); if (fexists(labelMappingFile)) { fprintf(stderr, "CreateLabelMap: the label mapping file '%s' already exists, no work to do.\n", labelMappingFile.c_str()); return; } fprintf(stderr, "CreateLabelMap: Creating the mapping file '%s' \n", labelMappingFile.c_str()); DataReader dataReader(readerConfig); 
dataReader.StartMinibatchLoop(minibatchSize, 0, requestDataSize); int count = 0; while (dataReader.GetMinibatch(matrices)) { Matrix<ElemType>& features = matrices.GetInputMatrix<ElemType>(featureNames[0]); count += features.GetNumCols(); if (traceLevel > 1) fprintf(stderr, "."); // progress meter } dataReader.StartMinibatchLoop(minibatchSize, 1, requestDataSize); // print the results if (traceLevel > 0) fprintf(stderr, "\nread %d labels and produced %s\n", count, labelMappingFile.c_str()); } auto end = std::chrono::system_clock::now(); auto elapsed = end - start; if (traceLevel > 1) fprintf(stderr, "%f seconds elapsed\n", (float) (std::chrono::duration_cast<std::chrono::milliseconds>(elapsed).count()) / 1000); }