示例#1
0
void LibSVMBinaryReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{

    std::map<std::wstring, std::wstring> rename;
    RenamedMatrices(readerConfig, rename);

    if (readerConfig.Exists(L"randomize"))
    {
        string randomizeString = readerConfig(L"randomize");
        if (randomizeString == "None")
        {
            m_randomize = 0L;
        }
        else if (randomizeString == "Auto")
        {
            time_t rawtime;
            struct tm* timeinfo;
            time(&rawtime);
            timeinfo = localtime(&rawtime);
            m_randomize = (unsigned long) (timeinfo->tm_sec + timeinfo->tm_min * 60 + timeinfo->tm_hour * 60 * 60);
        }
        else
        {
            m_randomize = readerConfig(L"randomize", 0);
        }
    }
    else
    {
        m_randomize = 0L;
    }

    m_partialMinibatch = true;
    std::string minibatchMode(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(minibatchMode, "Partial");

    std::wstring file = readerConfig(L"file", L"");

    m_dataInput = make_shared<SparseBinaryInput<ElemType>>(file);
    m_dataInput->Init(rename);

    m_mbSize = (size_t) readerConfig(L"minibatch", 0);
    if (m_mbSize > 0)
    {
        if (m_dataInput->GetMBSize() != m_mbSize)
        {
            RuntimeError("Data file and config file have mismatched minibatch sizes.\n");
            return;
        }
    }
    else
    {
        m_mbSize = m_dataInput->GetMBSize();
    }

    m_prefetchEnabled = true;
}
示例#2
0
void DSSMReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{
    std::vector<std::wstring> features;
    std::vector<std::wstring> labels;

    // Determine the names of the features and lables sections in the config file.
    // features - [in,out] a vector of feature name strings
    // labels - [in,out] a vector of label name strings
    // For DSSM dataset, we only need features. No label is necessary. The following "labels" just serves as a place holder
    GetFileConfigNames(readerConfig, features, labels);

    // For DSSM dataset, it must have exactly two features
    // In the config file, we must specify query features first, then document features. The sequence is different here. Pay attention
    if (features.size() == 2 && labels.size() == 1)
    {
        m_featuresNameQuery = features[1];
        m_featuresNameDoc = features[0];
        m_labelsName = labels[0];
    }
    else
    {
        RuntimeError("DSSM requires exactly two features and one label. Their names should match those in NDL definition");
        return;
    }

    m_mbStartSample = m_epoch = m_totalSamples = m_epochStartSample = 0;
    m_labelIdMax = m_labelDim = 0;
    m_partialMinibatch = m_endReached = false;
    m_labelType = labelCategory;
    m_readNextSample = 0;
    m_traceLevel = readerConfig(L"traceLevel", 0);

    if (readerConfig.Exists(L"randomize"))
    {
        // BUGBUG: reading out string and number... ugh
        wstring randomizeString = readerConfig(L"randomize");
        if (randomizeString == L"None")
        {
            m_randomizeRange = randomizeNone;
        }
        else if (randomizeString == L"Auto")
        {
            m_randomizeRange = randomizeAuto;
        }
        else
        {
            m_randomizeRange = readerConfig(L"randomize");
        }
    }
    else
    {
        m_randomizeRange = randomizeNone;
    }

    std::string minibatchMode(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(minibatchMode, "Partial");

    // Get the config parameters for query feature and doc feature
    ConfigParameters configFeaturesQuery = readerConfig(m_featuresNameQuery, "");
    ConfigParameters configFeaturesDoc   = readerConfig(m_featuresNameDoc, "");

    if (configFeaturesQuery.size() == 0)
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    if (configFeaturesDoc.size() == 0)
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");

    // Read in feature size information
    // This information will be used to handle OOVs
    m_featuresDimQuery = configFeaturesQuery(L"dim");
    m_featuresDimDoc   = configFeaturesDoc(L"dim");

    std::wstring fileQ = configFeaturesQuery("file");
    std::wstring fileD = configFeaturesDoc("file");

    dssm_queryInput.Init(fileQ, m_featuresDimQuery);
    dssm_docInput.Init(fileD, m_featuresDimDoc);

    m_totalSamples = dssm_queryInput.numRows;
    if (read_order == NULL)
    {
        read_order = new int[m_totalSamples];
        for (int c = 0; c < m_totalSamples; c++)
        {
            read_order[c] = c;
        }
    }
    m_mbSize = 0;
}