Esempio n. 1
0
// Smoke test for the reader configured under the "mnistTest" section.
//
// Streams minibatches for 2 * "maxEpochs" epochs and, for each minibatch,
// prints the matrix dimensions and the first two values of row 0 to stderr.
// The elapsed wall-clock time is printed at the end.
//
// configBase - root configuration. The "mnistTest" section must provide
//              "reader", "minibatchSize" and "maxEpochs"; "epochSize" and
//              "traceLevel" are optional and default to "0". An epochSize of
//              0 is replaced by requestDataSize.
void TestReader(const ConfigParameters& configBase)
{
    // int nonexistant = configBase("nonexistant");  // use to test global exception handler
    ConfigParameters config(configBase("mnistTest"));
    ConfigParameters readerConfig(config("reader"));
    readerConfig.Insert("traceLevel", config("traceLevel", "0"));

    size_t mbSize = config("minibatchSize");
    size_t epochSize = config("epochSize", "0");
    if (epochSize == 0)
    {
        epochSize = requestDataSize;
    }

    DataReader dataReader(readerConfig);

    // get names of features and labels
    // NOTE(review): only the first feature and first label stream are used;
    // both vectors are indexed without an emptiness check.
    std::vector<std::wstring> featureNames;
    std::vector<std::wstring> labelNames;
    GetFileConfigNames(readerConfig, featureNames, labelNames);

    // setup minibatch matrices; features and labels share one layout object
    int deviceId = 0;
    auto featuresMatrix = make_shared<Matrix<ElemType>>(deviceId);
    auto labelsMatrix   = make_shared<Matrix<ElemType>>(deviceId);
    MBLayoutPtr pMBLayout = make_shared<MBLayout>();
    StreamMinibatchInputs matrices;
    matrices.AddInput(featureNames[0], featuresMatrix, pMBLayout, TensorShape());
    matrices.AddInput(labelNames[0],   labelsMatrix  , pMBLayout, TensorShape());

    auto start = std::chrono::system_clock::now();
    int epochs = config("maxEpochs");
    epochs *= 2;
    for (int epoch = 0; epoch < epochs; epoch++)
    {
        dataReader.StartMinibatchLoop(mbSize, epoch, epochSize);
        int i = 0; // minibatch counter within the current epoch
        while (dataReader.GetMinibatch(matrices))
        {
            Matrix<ElemType>& features = matrices.GetInputMatrix<ElemType>(featureNames[0]);
            Matrix<ElemType>& labels   = matrices.GetInputMatrix<ElemType>(labelNames[0]);

            // Dimensions are cast to size_t and printed with %zu so that the
            // argument type always matches the conversion specifier.
            const size_t featureRows = static_cast<size_t>(features.GetNumRows());
            const size_t featureCols = static_cast<size_t>(features.GetNumCols());

            if (labels.GetNumRows() == 0)
            {
                fprintf(stderr, "%4d: features dim: %zu x %zu - [%.8g, %.8g, ...]\n",
                        i++, featureRows, featureCols, features(0, 0), features(0, 1));
            }
            else
            {
                const size_t labelRows = static_cast<size_t>(labels.GetNumRows());
                const size_t labelCols = static_cast<size_t>(labels.GetNumCols());
                fprintf(stderr, "%4d: features dim: %zu x %zu - [%.8g, %.8g, ...] label dim: %zu x %zu - [%d, %d, ...]\n",
                        i++, featureRows, featureCols, features(0, 0), features(0, 1),
                        labelRows, labelCols, static_cast<int>(labels(0, 0)), static_cast<int>(labels(0, 1)));
            }
        }
    }
    auto end = std::chrono::system_clock::now();
    // Report with millisecond resolution, expressed in seconds.
    const auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
    fprintf(stderr, "%f seconds elapsed", static_cast<float>(elapsedMs) / 1000);
}
Esempio n. 2
0
void DoCommand(const ConfigParameters& configRoot)
{
    ConfigArray command = configRoot("command", "train");
    ConfigParameters config = configRoot(command[0]);
    ConfigParameters readerConfig(config("reader"));
    readerConfig.Insert("traceLevel", config("traceLevel", "0"));

    ConfigArray minibatchSize = config("minibatchSize", "256");
    intargvector mbSizeArr = minibatchSize;
    size_t mbSize = 20000; // mbSizeArr[0];
    size_t epochSize = config("epochSize", "0");
    if (epochSize == 0)
    {
        epochSize = requestDataSize;
    }

    ConfigParameters configFeatures = readerConfig(L"features");
    size_t dimFeatures = configFeatures("dim");
    ConfigParameters configLabels = readerConfig(L"labels");
    size_t dimLabels = configLabels("labelDim");
    ConfigParameters configSgd = config("SGD");
    std::wstring modelPath = configSgd("modelPath");

    StreamMinibatchInputs inputMatrices;
    StreamMinibatchInputs outputMatrices;
    std::wstring inputName = L"features";
    std::wstring outputName = L"CE.BFF.FF.P";
    int deviceId = 0;
    auto matrix = make_shared<Matrix<ElemType>>(dimFeatures, mbSize, deviceId);
    MBLayoutPtr pMBLayout = make_shared<MBLayout>();
    inputMatrices.AddInput(inputName, matrix, pMBLayout, TensorShape(dimFeatures));
    outputMatrices.AddInput(outputName, make_shared<Matrix<ElemType>>(dimLabels, mbSize, deviceId), pMBLayout, TensorShape(dimLabels));

    std::map<std::wstring, std::vector<ElemType>*> input;
    std::map<std::wstring, std::vector<ElemType>*> output;
    std::vector<ElemType>* arr = input[inputName] = new std::vector<ElemType>(dimFeatures * mbSize);
    output[outputName] = new std::vector<ElemType>(dimLabels * mbSize);

    Eval<ElemType> eval(config);

    auto dataReader = make_shared<DataReader>(readerConfig);
    eval.CreateNetwork(Microsoft::MSR::CNTK::ToLegacyString(Microsoft::MSR::CNTK::ToUTF8(modelPath)));
    dataReader->StartMinibatchLoop(mbSize, 0, inputMatrices.GetStreamDescriptions(), epochSize);
    eval.StartEvaluateMinibatchLoop(outputName);
    while (dataReader->GetMinibatch(inputMatrices))
    {
        void* data = (void*) arr->data();
        size_t dataSize = arr->size() * sizeof(ElemType);
        void* mat = &(*matrix)(0, 0);
        size_t matSize = matrix->GetNumElements() * sizeof(ElemType);
        memcpy_s(data, dataSize, mat, matSize);
        eval.Evaluate(input, output);
    }
}
Esempio n. 3
0
// Smoke test for the two sequence reader configurations under "sequenceTest".
//
// Runs once with the "readerSentence" section and once with the
// "readerSequence" section. Each run streams minibatches for 2 * "maxEpochs"
// epochs, prints matrix dimensions plus the first two values of row 0 to
// stderr, and finally prints the elapsed wall-clock time.
//
// configBase - root configuration. The "sequenceTest" section must provide
//              "minibatchSize", "maxEpochs" and both reader sections;
//              "epochSize" and "traceLevel" are optional and default to "0".
//              An epochSize of 0 is replaced by requestDataSize.
void TestSequenceReader(const ConfigParameters& configBase)
{
    // int nonexistant = configBase("nonexistant");  // use to test global exception handler
    ConfigParameters config = configBase("sequenceTest");

    size_t mbSize = config("minibatchSize");
    size_t epochSize = config("epochSize", "0");
    if (epochSize == 0)
    {
        epochSize = requestDataSize;
    }

    // fileType 0 -> "readerSentence", fileType 1 -> "readerSequence"
    for (int fileType = 0; fileType < 2; ++fileType)
    {
        ConfigParameters readerConfig = config(fileType ? "readerSequence" : "readerSentence");
        readerConfig.Insert("traceLevel", config("traceLevel", "0"));

        // get names of features and labels
        std::vector<std::wstring> featureNames;
        std::vector<std::wstring> labelNames;
        GetFileConfigNames(readerConfig, featureNames, labelNames);

        DataReader dataReader(readerConfig);

        // NOTE(review): `files` is not used below; the lookup is kept because
        // it may be relied on to reject a config without a "file" entry - confirm.
        std::vector<std::wstring> files;
        files.push_back(readerConfig(L"file"));

        // setup minibatch matrices
        // NOTE(review): the SECOND label stream (index 1) is used, so the
        // reader config is expected to declare at least two label sections.
        auto featuresMatrix = make_shared<Matrix<ElemType>>();
        auto labelsMatrix   = make_shared<Matrix<ElemType>>();
        StreamMinibatchInputs matrices;
        matrices.AddInputMatrix(featureNames[0], featuresMatrix);
        matrices.AddInputMatrix(labelNames[1]  , labelsMatrix);

        auto start = std::chrono::system_clock::now();
        int epochs = config("maxEpochs");
        epochs *= 2;
        for (int epoch = 0; epoch < epochs; epoch++)
        {
            dataReader.StartMinibatchLoop(mbSize, epoch, epochSize);
            for (int i = 0; dataReader.GetMinibatch(matrices); i++)
            {
                auto& features = matrices.GetInputMatrix<ElemType>(featureNames[0]);
                auto& labels   = matrices.GetInputMatrix<ElemType>(labelNames[1]);

                // All dimensions are cast to size_t and printed with %zu so
                // the argument types match the conversion specifiers (the
                // label dimensions were previously printed with %d).
                const size_t featureRows = static_cast<size_t>(features.GetNumRows());
                const size_t featureCols = static_cast<size_t>(features.GetNumCols());
                const size_t labelRows   = static_cast<size_t>(labels.GetNumRows());
                const size_t labelCols   = static_cast<size_t>(labels.GetNumCols());
                fprintf(stderr, "%4d: features dim: %zu x %zu - [%.8g, %.8g, ...] label dim: %zu x %zu - [%d, %d, ...]\n",
                        i, featureRows, featureCols, features(0, 0), features(0, 1),
                        labelRows, labelCols, static_cast<int>(labels(0, 0)), static_cast<int>(labels(0, 1)));
            }
        }
        auto end = std::chrono::system_clock::now();
        // Report with millisecond resolution, expressed in seconds.
        const auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        fprintf(stderr, "%f seconds elapsed", static_cast<float>(elapsedMs) / 1000);
    }
}
Esempio n. 4
0
// Delivers the next minibatch into `matrices`.
//
// With prefetching enabled, exactly one background read is kept in flight:
// this call waits for the pending read, copies its result into the caller's
// matrices and immediately schedules the next read. The value produced by the
// background read is used as the minibatch size; a value of 0 makes this
// function return false.
//
// NOTE(review): when m_prefetchEnabled is false nothing is read and the
// function still returns true - confirm that this path is intentional.
bool LibSVMBinaryReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
{
#if DEBUG
    span minibatch_span(*reader_series, 1, L"Get Minibatch: %ld", cur_read);
#endif
    // Schedules an asynchronous FillMatrices() over the internal buffers.
    const auto schedulePrefetch = [this]()
    {
        return std::async(std::launch::async, [this]()
                          {
                              return m_dataInput->FillMatrices(m_dataMatrices);
                          });
    };

    if (m_prefetchEnabled)
    {
        // Nothing in flight yet (first call, or the previous result was
        // already consumed): validate the buffers and start a read now.
        if (!m_pendingAsyncGetMinibatch.valid())
        {
            CheckDataMatrices(matrices);
            m_pendingAsyncGetMinibatch = schedulePrefetch();
        }
#if DEBUG
        reader_series->write_flag(_T("before get."));
#endif
        const size_t samplesRead = m_pendingAsyncGetMinibatch.get();
#if DEBUG
        reader_series->write_flag(_T("after get."));
#endif
        if (samplesRead == 0)
        {
            return false;
        }

        m_pMBLayout->InitAsFrameMode(samplesRead);
#if DEBUG
        reader_series->write_flag(_T("starting fill."));
#endif
        // Hand over every internal buffer the caller asked for.
        for (const auto& buffered : m_dataMatrices)
        {
            if (!matrices.HasInput(buffered.first))
            {
                continue;
            }
            buffered.second->Fill(&matrices.GetInputMatrix<ElemType>(buffered.first));
        }
#if DEBUG
        reader_series->write_flag(_T("done fill."));
#endif
        if (matrices.HasInput(L"DSSMLabel"))
        {
            DoDSSMMatrix(matrices.GetInputMatrix<ElemType>(L"DSSMLabel"), samplesRead);
        }

        // Start reading the following minibatch while the caller works on this one.
        m_pendingAsyncGetMinibatch = schedulePrefetch();
    }
#if DEBUG
    cur_read++;
#endif
    return true;
}
Esempio n. 5
0
// Copies the minibatch produced by the pending prefetch task into `matrices`
// and, unless the epoch has ended, starts prefetching the next one.
//
// Returns true when the delivered minibatch contains data, false when the
// epoch had already ended or the minibatch is empty.
// Raises RuntimeError when an input name is unknown to the reader or when two
// inputs sharing one layout object receive incompatible layouts.
bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
{
    // TODO: verify that the set of matrix names is identical
    // to the set of reader input names. Warn if it's a subset, throw
    // if it's a superset.

    if (m_endOfEpoch)
        return false;

    // All matrices are expected to live on the same device.
    // If not we should inject the IMemoryProvider per stream.
    int deviceId = matrices.begin()->second.matrix->GetDeviceId();
    for (const auto& input : matrices)
    {
        assert(input.second.matrix->GetDeviceId() == deviceId);
        UNUSED(deviceId);
    }

    assert(m_prefetchTask.valid());

    Minibatch minibatch = m_prefetchTask.get();
    const bool hasData = !minibatch.m_data.empty();
    if (minibatch.m_endOfEpoch)
    {
        // Remember the end of the epoch; a final minibatch may still carry data.
        m_endOfEpoch = true;
        if (!hasData)
            return false;
    }

    // Reset stale mb layouts.
    // BUGBUG: This seems incorrect. (1) layouts should all be updated below, and (2) some of these layouts are the same, we are resetting them twice.
    for (const auto& input : matrices)
        input.second.pMBLayout->Init(1, 0);

    // Axis name -> first input that populated the layout; used for error messages only.
    map<wstring, wstring> layoutToInputMap;
    if (hasData)
    {
        // TODO: Use alternating pinned buffer in the packer, do not copy anything, but pack into the pinned memory.
        // Copy returned minibatch to the matrices.
        for (const auto& input : matrices)
        {
            const auto& inputName = input.first;

            if (m_nameToStreamId.find(inputName) == m_nameToStreamId.end())
            {
                string inputNames = EnumerateInputs(m_nameToStreamId);
                RuntimeError("Could not map input '%ls' to the reader. Reader outputs only [%s].", 
                    inputName.c_str(), inputNames.c_str());
            }

            const size_t streamId = m_nameToStreamId[inputName];
            const auto& stream = minibatch.m_data[streamId];

            // Different inputs may have different sequence lengths, so this
            // value is not required to agree across streams.
            m_numParallelSequences = stream->m_layout->GetNumParallelSequences();

            auto& layout = input.second.pMBLayout;
            if (layout->GetNumCols() == 0)
            {
                // First input on this layout: take over the reader's layout.
                layout->CopyFrom(stream->m_layout, /*keepName*/ true);
                layoutToInputMap[layout->GetAxisName()] = inputName;
            }
            else if (*layout != *stream->m_layout) // deep value-level comparison
            {
                RuntimeError("Dynamic axis layout '%ls' is shared between inputs '%ls' and '%ls', but layouts generated "
                    "from the input data are incompatible on this axis. Are you using different sequence lengths? "
                    "Did you consider adding a DynamicAxis() to the Input nodes?",
                    layout->GetAxisName(), layoutToInputMap[layout->GetAxisName()].c_str(), inputName.c_str());
            }

            const auto& description = m_streams[streamId];
            size_t sampleSize = description->m_sampleLayout->GetNumElements();
            auto& matrix = matrices.GetInputMatrix<ElemType>(inputName);
            FillMatrixFromStream(description->m_storageType, &matrix, sampleSize, stream);
        }
    }

    if (!m_endOfEpoch)
    {
        // There is always a single async read in flight: each call waits for
        // the current one, hands out its result and kicks off the next.
        m_prefetchTask = std::async(m_launchType, [this]()
        {
            return m_reader->ReadMinibatch();
        });
    }

    return hasData;
}
Esempio n. 6
0
// Runs the reader of another config section once over its whole data set with
// "allowMapCreation" enabled, so that missing label mapping files get created.
//
// config - configuration of this action. "section" names the config section
//          whose "reader" is used; "minibatchSize" (default "2048") and
//          "traceLevel" (default "0") are optional.
//
// Raises RuntimeError when the reader declares no features or no labels, or
// when no "labelMappingFile" is defined for a label stream.
// Returns early, without processing further labels, as soon as one label
// mapping file already exists.
void DoCreateLabelMap(const ConfigParameters& config)
{
    // this gets the section name we are interested in
    std::string section = config(L"section");
    // get that section (probably a peer config section, which works thanks to hierarchical symbol resolution)
    ConfigParameters configSection(config(section));
    ConfigParameters readerConfig(configSection("reader"));
    readerConfig.Insert("allowMapCreation", "true");
    size_t minibatchSize = config(L"minibatchSize", "2048");
    int traceLevel = config(L"traceLevel", "0");
    std::vector<std::wstring> featureNames;
    std::vector<std::wstring> labelNames;
    GetFileConfigNames(readerConfig, featureNames, labelNames);

    // Validate before indexing: featureNames[0] is used below.
    if (featureNames.empty())
        RuntimeError("CreateLabelMap: no features found to process");
    if (labelNames.empty())
        RuntimeError("CreateLabelMap: no labels found to process");

    // setup minibatch matrices
    auto featuresMatrix = make_shared<Matrix<ElemType>>(CPUDEVICE);
    auto labelsMatrix   = make_shared<Matrix<ElemType>>(CPUDEVICE);
    StreamMinibatchInputs matrices;
    matrices.AddInputMatrix(featureNames[0], featuresMatrix);

    // now create the reader and loop through the entire dataset to get all the labels
    auto start = std::chrono::system_clock::now();
    for (const std::wstring& labelsName : labelNames)
    {
        // take the last label file defined (the other one might be input)
        matrices.AddInputMatrix(labelsName, labelsMatrix);

        // get the label mapping file name: prefer the label's own section,
        // fall back to the enclosing reader section
        ConfigParameters labelConfig(readerConfig(labelsName));
        std::string labelMappingFile;
        if (labelConfig.ExistsCurrent(L"labelMappingFile"))
            labelMappingFile = labelConfig(L"labelMappingFile");
        else if (readerConfig.ExistsCurrent(L"labelMappingFile"))
            labelMappingFile = readerConfig(L"labelMappingFile"); // read from the section that was just checked
        else
            RuntimeError("CreateLabelMap: No labelMappingFile defined");

        if (fexists(labelMappingFile))
        {
            fprintf(stderr, "CreateLabelMap: the label mapping file '%s' already exists, no work to do.\n", labelMappingFile.c_str());
            return;
        }
        fprintf(stderr, "CreateLabelMap: Creating the mapping file '%s' \n", labelMappingFile.c_str());

        DataReader dataReader(readerConfig);
        dataReader.StartMinibatchLoop(minibatchSize, 0, requestDataSize);
        int count = 0; // number of feature columns seen, reported as the label count
        while (dataReader.GetMinibatch(matrices))
        {
            Matrix<ElemType>& features = matrices.GetInputMatrix<ElemType>(featureNames[0]);
            count += static_cast<int>(features.GetNumCols());
            if (traceLevel > 1)
                fprintf(stderr, "."); // progress meter
        }
        // NOTE(review): presumably starting a second epoch is what makes the
        // reader write out the mapping file - confirm against the reader.
        dataReader.StartMinibatchLoop(minibatchSize, 1, requestDataSize);

        // print the results
        if (traceLevel > 0)
            fprintf(stderr, "\nread %d labels and produced %s\n", count, labelMappingFile.c_str());
    }
    auto end = std::chrono::system_clock::now();
    if (traceLevel > 1)
    {
        // Report with millisecond resolution, expressed in seconds.
        const auto elapsedMs = std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count();
        fprintf(stderr, "%f seconds elapsed\n", static_cast<float>(elapsedMs) / 1000);
    }
}