// Initializes the reader from its configuration record.
//
// Keys read from readerConfig:
//   randomize     - absent or "None": m_randomize = 0; "Auto": a value derived
//                   from the local wall-clock time; anything else is read as a
//                   number (default 0).
//   minibatchMode - compared case-insensitively against "Partial"
//                   (default "Partial").
//   file          - path handed to SparseBinaryInput (default empty).
//   minibatch     - requested minibatch size; 0 (the default) means "use the
//                   size reported by the data input".
//
// Calls RuntimeError when a non-zero configured minibatch size differs from
// the size reported by the data input.
void LibSVMBinaryReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{
    // Name mapping filled in by RenamedMatrices and later passed to the data input.
    std::map<std::wstring, std::wstring> renamedInputs;
    RenamedMatrices(readerConfig, renamedInputs);

    if (!readerConfig.Exists(L"randomize"))
    {
        m_randomize = 0L;
    }
    else
    {
        string randomizeSetting = readerConfig(L"randomize");
        if (randomizeSetting == "None")
        {
            m_randomize = 0L;
        }
        else if (randomizeSetting == "Auto")
        {
            // Use the number of seconds elapsed since local midnight.
            time_t now;
            time(&now);
            struct tm* localNow = localtime(&now);
            const int secondsOfDay = localNow->tm_sec + localNow->tm_min * 60 + localNow->tm_hour * 60 * 60;
            m_randomize = static_cast<unsigned long>(secondsOfDay);
        }
        else
        {
            // Neither keyword matched: interpret the setting as a number.
            m_randomize = readerConfig(L"randomize", 0);
        }
    }

    // Provisional value; replaced right below by the configured minibatch mode.
    m_partialMinibatch = true;
    std::string mbModeName(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(mbModeName, "Partial");

    std::wstring dataPath = readerConfig(L"file", L"");
    m_dataInput = make_shared<SparseBinaryInput<ElemType>>(dataPath);
    m_dataInput->Init(renamedInputs);

    m_mbSize = static_cast<size_t>(readerConfig(L"minibatch", 0));
    const bool sizeGivenInConfig = m_mbSize > 0;
    if (!sizeGivenInConfig)
    {
        // Nothing configured: adopt the size reported by the data input.
        m_mbSize = m_dataInput->GetMBSize();
    }
    else if (m_dataInput->GetMBSize() != m_mbSize)
    {
        RuntimeError("Data file and config file have mismatched minibatch sizes.\n");
        return; // kept from the original in case RuntimeError ever returns
    }

    m_prefetchEnabled = true;
}
// Initializes the DSSM reader from its configuration record.
//
// The configuration must name exactly two feature sections and one label
// section (as reported by GetFileConfigNames); otherwise RuntimeError is
// called. Each feature section must be non-empty and supplies "dim" and
// "file", which are used to initialize the query and document inputs.
//
// Other keys read from readerConfig:
//   traceLevel    - default 0.
//   randomize     - absent or "None": randomizeNone; "Auto": randomizeAuto;
//                   anything else is read as a number.
//   minibatchMode - compared case-insensitively against "Partial"
//                   (default "Partial").
void DSSMReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{
    // Names of the feature and label sections found in the config.
    std::vector<std::wstring> featureNames;
    std::vector<std::wstring> labelNames;
    GetFileConfigNames(readerConfig, featureNames, labelNames);

    if (featureNames.size() != 2 || labelNames.size() != 1)
    {
        RuntimeError("DSSM requires exactly two features and one label. Their names should match those in NDL definition");
        return; // kept from the original in case RuntimeError ever returns
    }

    // Note the swap: the second reported name is the query section and the
    // first is the document section. The original author notes that the config
    // lists query features first, so GetFileConfigNames presumably reports them
    // in the opposite order - TODO confirm.
    m_featuresNameQuery = featureNames[1];
    m_featuresNameDoc = featureNames[0];
    m_labelsName = labelNames[0];

    // Reset counters and flags.
    m_epochStartSample = 0;
    m_totalSamples = 0;
    m_epoch = 0;
    m_mbStartSample = 0;
    m_labelDim = 0;
    m_labelIdMax = 0;
    m_endReached = false;
    m_partialMinibatch = false;
    m_labelType = labelCategory;
    m_readNextSample = 0;
    m_traceLevel = readerConfig(L"traceLevel", 0);

    if (!readerConfig.Exists(L"randomize"))
    {
        m_randomizeRange = randomizeNone;
    }
    else
    {
        // BUGBUG (carried over from the original): the same key is read first
        // as a string and then, if no keyword matches, again as a number.
        wstring randomizeSetting = readerConfig(L"randomize");
        if (randomizeSetting == L"None")
        {
            m_randomizeRange = randomizeNone;
        }
        else if (randomizeSetting == L"Auto")
        {
            m_randomizeRange = randomizeAuto;
        }
        else
        {
            m_randomizeRange = readerConfig(L"randomize");
        }
    }

    std::string mbModeName(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(mbModeName, "Partial");

    // Look up the query and document feature sections; both are mandatory.
    ConfigParameters querySection = readerConfig(m_featuresNameQuery, "");
    ConfigParameters docSection = readerConfig(m_featuresNameDoc, "");
    if (querySection.size() == 0)
    {
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    }
    if (docSection.size() == 0)
    {
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    }

    // Feature dimensions; the original author notes these are used to handle OOVs.
    m_featuresDimQuery = querySection(L"dim");
    m_featuresDimDoc = docSection(L"dim");

    std::wstring queryPath = querySection("file");
    std::wstring docPath = docSection("file");
    dssm_queryInput.Init(queryPath, m_featuresDimQuery);
    dssm_docInput.Init(docPath, m_featuresDimDoc);

    // The sample count is taken from the query input's row count.
    m_totalSamples = dssm_queryInput.numRows;

    // Allocate the read order once and fill it with the identity order 0, 1, 2, ...
    if (!read_order)
    {
        read_order = new int[m_totalSamples];
        for (int idx = 0; idx < m_totalSamples; ++idx)
        {
            read_order[idx] = idx;
        }
    }

    m_mbSize = 0;
}