// Sets up the reader from its configuration record.
//
// Keys read from readerConfig:
//   randomize     - "None", "Auto", or a numeric value; a missing key means 0
//   minibatchMode - matched against "Partial" via EqualCI (default "Partial")
//   file          - path handed to the SparseBinaryInput (default empty)
//   minibatch     - requested minibatch size; 0 or missing defers to the data file
//
// Raises a RuntimeError when a non-zero configured minibatch size disagrees
// with the size reported by the data file.
void LibSVMBinaryReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{
    std::map<std::wstring, std::wstring> rename;
    RenamedMatrices(readerConfig, rename);

    // Work out the randomization value.
    if (!readerConfig.Exists(L"randomize"))
    {
        m_randomize = 0L;
    }
    else
    {
        string randomizeString = readerConfig(L"randomize");
        if (randomizeString == "None")
        {
            m_randomize = 0L;
        }
        else if (randomizeString == "Auto")
        {
            // "Auto" uses the number of seconds elapsed since local midnight.
            time_t now = time(nullptr);
            struct tm* local = localtime(&now);
            m_randomize = (unsigned long) (local->tm_hour * 60 * 60 + local->tm_min * 60 + local->tm_sec);
        }
        else
        {
            // Anything else is read back as a number.
            m_randomize = readerConfig(L"randomize", 0);
        }
    }

    // Partial minibatches are on by default; the config can override that.
    m_partialMinibatch = true;
    std::string minibatchMode(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(minibatchMode, "Partial");

    // Open the binary input and hand it the matrix rename table.
    std::wstring file = readerConfig(L"file", L"");
    m_dataInput = make_shared<SparseBinaryInput<ElemType>>(file);
    m_dataInput->Init(rename);

    // Settle the minibatch size: an explicit value must agree with the data file,
    // otherwise the data file's value is adopted.
    m_mbSize = static_cast<size_t>(readerConfig(L"minibatch", 0));
    const bool sizeGivenInConfig = m_mbSize > 0;
    if (!sizeGivenInConfig)
    {
        m_mbSize = m_dataInput->GetMBSize();
    }
    else if (m_dataInput->GetMBSize() != m_mbSize)
    {
        RuntimeError("Data file and config file have mismatched minibatch sizes.\n");
        return;
    }

    m_prefetchEnabled = true;
}
// Creates the section described by `config` (and, recursively, every child
// section that declares a "sectionType") and registers it with this writer.
//
// Parameters:
//   config        - configuration for this section level
//   parentSection - section the new one is attached to; replaced by the file's
//                   own section when this level opens a new "wfile"
//   p_records     - record count inherited from the enclosing level, used when
//                   "wrecords" is not set at this level
//   p_windowSize  - window size (in records) inherited from the enclosing level,
//                   used when "windowSize" is not set at this level
//
// Returns the newly created section, or `parentSection` when this level does
// not create one (section type null or file).
//
// Errors are reported through InvalidArgument / RuntimeError for: a record
// count of zero, an unrecognised "sectionType", a typed section with no file,
// an invalid or missing "labelType" on a label section, a duplicate section
// name, and a header that fails validation.
Section* BinaryWriter<ElemType>::CreateSection(const ConfigParameters& config, Section* parentSection, size_t p_records, size_t p_windowSize)
{
    // determine the element size, default to ElemType size
    size_t elementSize = sizeof(ElemType);
    if (config.ExistsCurrent(L"elementSize"))
    {
        elementSize = config(L"elementSize");
    }

    // get the number of records we should expect (max)
    // if defined in previous levels the same number will be used
    size_t records = p_records;
    if (config.ExistsCurrent(L"wrecords"))
    {
        records = config(L"wrecords");
    }
    if (records == 0)
    {
        InvalidArgument("Required config variable 'wrecords' missing from BinaryWriter configuration.");
    }

    size_t dim = 1; // default dimension (single item)
    if (config.ExistsCurrent(L"dim"))
    {
        dim = config(L"dim");
    }

    // get the section type (used for caching)
    SectionType sectionType = sectionTypeNull;
    if (config.ExistsCurrent(L"sectionType"))
    {
        SectionType foundType = sectionTypeNull;
        std::wstring type = config(L"sectionType");
        for (int i = 0; i < sectionTypeMax; i++)
        {
            if (EqualCI(type, SectionTypeStrings[i]))
            {
                foundType = SectionType(i);
                break;
            }
        }

        // check to make sure it matched something
        if (foundType == sectionTypeNull)
        {
            InvalidArgument("Invalid value for 'sectionType' in BinaryWriter configuration: %ls", type.c_str());
        }
        sectionType = foundType;
    }

    // calculate number of bytes = dim*elementSize*records
    size_t dataOnlySize = records * elementSize * dim;
    size_t dataSize = dataOnlySize + sectionHeaderMin;

    // use the file defined at this level; if there is none, use the parent's file
    SectionFile* file = nullptr;
    if (config.ExistsCurrent(L"wfile"))
    {
        std::wstring wfile = config(L"wfile");
        auto secFile = m_secFiles.find(wfile);
        if (secFile != m_secFiles.end())
        {
            file = secFile->second;
        }
        else
        {
            // TODO: sanity check and use records as a clue of how big to make it
            size_t initialSize = config(L"wsize", (size_t) 256); // default to 256MB if not provided
            initialSize *= 1024 * 1024;                          // convert MB to bytes
            if (initialSize < dataSize)
                initialSize = dataSize * 5 / 4; // make the initial size slightly larger than needed for the data
            file = new SectionFile(wfile, fileOptionsReadWrite, initialSize);
            m_secFiles[wfile] = file;
            parentSection = file->FileSection();
            parentSection->SetElementCount(records);
            parentSection->SetFileUniqueId(this->m_uniqueID);
        }
    }
    else
    {
        // no file defined at this config level, use parent file
        if (parentSection != nullptr && parentSection->GetSectionFile() != nullptr)
        {
            file = parentSection->GetSectionFile();
        }
        else if (sectionType != sectionTypeNull)
        {
            InvalidArgument("No filename (wfile) defined in BinaryWriter configuration.");
        }
    }

    // determine file position if needed
    size_t filePositionLast = 0;
    size_t filePositionNext = 0;
    if (file != nullptr)
    {
        // get the next available position in the file (always on the end)
        filePositionLast = file->GetFilePositionMax();
        filePositionNext = file->RoundUp(filePositionLast);

        // we have a gap, zero it out to keep the file clean
        if (filePositionLast != filePositionNext)
        {
            const size_t gapSize = filePositionNext - filePositionLast;
            const size_t viewAlignment = file->GetViewAlignment();

            // Map the aligned window that contains filePositionLast. The window start is
            // derived from the same modulus that is used for the pointer offset below, so
            // the two always agree. The previous expression,
            // RoundUp(filePositionLast - alignment - 1), wrapped around for positions
            // smaller than the alignment and selected the preceding window when the
            // position was exactly one byte past a boundary.
            // NOTE(review): assumes RoundUp() rounds to a multiple of GetViewAlignment(),
            // so the gap ends inside this window - confirm against SectionFile.
            const size_t offsetInView = filePositionLast % viewAlignment;
            const size_t viewStart = filePositionLast - offsetInView;

            // need to get a view to zero out the unused bytes
            void* view = file->GetView(viewStart, viewAlignment);
            char* ptr = static_cast<char*>(view) + offsetInView;
            memset(ptr, 0, gapSize);
            file->ReleaseView(view);
        }
    }

    // get the new section name
    std::string sectionName = config.ConfigName();

    // get the window size, to see if we want to do separate element mapping
    size_t windowSize = p_windowSize;
    if (config.ExistsCurrent(L"windowSize"))
    {
        windowSize = config(L"windowSize");
    }
    MappingType mappingMain = windowSize ? mappingElementWindow : mappingParent;
    MappingType mappingAux = windowSize ? mappingSection : mappingParent;

    // now create the new section
    Section* section = nullptr;
    switch (sectionType)
    {
    case sectionTypeNull:
        // this happens for the original file header, nothing to do
        // also used when multiple files are defined, but none at the base level
        break;
    case sectionTypeFile: // file header
        // shouldn't occur, but same case as above
        break;
    case sectionTypeData: // data section
        section = new Section(file, parentSection, filePositionNext, mappingMain, dataSize);
        section->InitHeader(sectionTypeData, sectionName + ":Data Section", sectionDataFloat, sizeof(ElemType));
        break;
    case sectionTypeLabel: // label data
    {
        size_t elementSize2 = sizeof(LabelIdType);
        dataSize = records * elementSize2 + sectionHeaderMin;

        // Resolve and validate the label kind before allocating the section, so an
        // invalid configuration does not leave a half-initialized section behind.
        SectionData dataType = sectionDataInt;
        LabelKind labelKind = labelCategory; // default
        if (config.Match(L"labelType", L"Regression"))
        {
            labelKind = labelRegression;
            dataType = sectionDataFloat;
            elementSize2 = sizeof(ElemType);
        }
        else if (config.Match(L"labelType", L"Category"))
        {
            // everything set already, default value
        }
        else
        {
            RuntimeError("Invalid type 'labelType' or missing in BinaryWriter configuration.");
        }

        // NOTE(review): dataSize above is always computed from sizeof(LabelIdType), even
        // for regression labels whose header element size is sizeof(ElemType) - confirm intended.
        auto sectionLabel = new SectionLabel(file, parentSection, filePositionNext, mappingMain, dataSize);

        // initialize the section header
        sectionLabel->InitHeader(sectionTypeLabel, sectionName + ":Labels", dataType, (WORD) elementSize2);

        // initialize the special label header items
        sectionLabel->SetLabelKind(labelKind);
        sectionLabel->SetLabelDim(config(L"labelDim"));
        section = sectionLabel;
        break;
    }
    case sectionTypeLabelMapping: // label mapping table (array of strings)
        section = new SectionString(file, parentSection, filePositionNext, mappingAux, dataSize);
        // declare variable length strings
        section->InitHeader(sectionTypeLabelMapping, sectionName + ":Label Map", sectionDataStrings, 0);
        section->SetFlags(flagAuxilarySection);
        section->SetFlags(flagVariableSized);
        break;
    case sectionTypeStats: // data statistics
    {
        // one NumericStatistics record per entry in the "compute" array
        ConfigArray calcStats = config(L"compute");
        records = calcStats.size();
        elementSize = sizeof(NumericStatistics);
        dataOnlySize = records * elementSize;
        dataSize = dataOnlySize + sectionHeaderMin;
        auto sectionStats = new SectionStats(file, parentSection, filePositionNext, mappingAux, dataSize);
        sectionStats->InitHeader(sectionTypeStats, sectionName + ":Data Statistics", sectionDataStruct, sizeof(NumericStatistics));
        sectionStats->SetFlags(flagAuxilarySection);
        section = sectionStats;
        break;
    }
    case sectionTypeCategoryLabel:
        section = new Section(file, parentSection, filePositionNext, mappingMain, dataSize);
        section->InitHeader(sectionTypeCategoryLabel, sectionName + ":Category Labels", sectionDataFloat, sizeof(ElemType));
        break;
    }

    // set the rest of the header variables necessary
    if (section == nullptr)
    {
        // NULL or file section/already created
        section = parentSection;
    }
    else
    {
        section->SetElementSize(elementSize);
        section->SetElementsPerRecord(dim);
        section->SetElementCount(records * dim);
        section->SetSize(dataSize);
        section->SetSizeAll(dataSize);

        // windowSize is in records, convert to bytes
        size_t dataWindowSize = windowSize ? windowSize * elementSize * dim : dataOnlySize;
        // clamp it down to actual data size
        dataWindowSize = min(dataOnlySize, dataWindowSize);

        // now get the data pointer setup and allocate the view as necessary
        bool auxSection = !!(section->GetFlags() & flagAuxilarySection);
        section->EnsureElements(0, auxSection ? dataOnlySize : dataWindowSize);

        // update the max file position for the next section
        file->SetFilePositionMax(section->GetFilePosition() + dataSize);

        // add new section to parent
        parentSection->AddSection(section);
    }

    // From here on down we have a fully usable section object

    // now find the subsections and repeat
    std::vector<std::wstring> subsections;
    FindConfigNames(config, "sectionType", subsections);

    // look for any children and create them as well; they inherit this level's
    // record count and window size
    for (const std::wstring& subsection : subsections)
    {
        CreateSection(config(subsection), section, records, windowSize);
    }

    // wait until here so everything is mapped and valid in the object
    if (sectionType == sectionTypeStats)
    {
        ConfigArray calcStats = config(L"compute");
        static_cast<SectionStats*>(section)->InitCompute(calcStats);
    }

    // add to section map
    if (sectionType != sectionTypeFile && sectionType != sectionTypeNull)
    {
        std::wstring wsectionName = msra::strfun::utf16(sectionName);
        // can't have identical names in a write configuration
        if (m_sections.find(wsectionName) != m_sections.end())
        {
            RuntimeError("Identical section name appears twice:%s", sectionName.c_str());
        }
        m_sections[wsectionName] = section;
    }

    // validate the header (make sure it's sane)
    if (section && file && !section->ValidateHeader(file->Writing()))
    {
        RuntimeError("Invalid header in file %ls, in header %ls\n", file->GetName().c_str(), section->GetName().c_str());
    }

    // return the now complete section
    return section;
}
// Sets up the DSSM reader from its configuration record.
//
// The config must name exactly two feature sections and one label section
// (the label only serves as a placeholder). For each feature section the
// "dim" and "file" entries are read and used to initialize the query and
// document inputs. Also resets the epoch/minibatch bookkeeping, reads
// "traceLevel", "randomize" and "minibatchMode", and builds the identity
// read order on first use.
void DSSMReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{
    // Collect the feature and label section names declared in the config.
    std::vector<std::wstring> features;
    std::vector<std::wstring> labels;
    GetFileConfigNames(readerConfig, features, labels);

    const bool haveExpectedSections = (features.size() == 2) && (labels.size() == 1);
    if (!haveExpectedSections)
    {
        RuntimeError("DSSM requires exactly two features and one label. Their names should match those in NDL definition");
        return;
    }

    // Careful with the ordering: the query section is taken from index 1 and
    // the document section from index 0 of the returned names.
    m_featuresNameQuery = features[1];
    m_featuresNameDoc = features[0];
    m_labelsName = labels[0];

    // Reset all progress bookkeeping.
    m_epochStartSample = 0;
    m_totalSamples = 0;
    m_epoch = 0;
    m_mbStartSample = 0;
    m_labelDim = 0;
    m_labelIdMax = 0;
    m_endReached = false;
    m_partialMinibatch = false;
    m_labelType = labelCategory;
    m_readNextSample = 0;

    m_traceLevel = readerConfig(L"traceLevel", 0);

    // "randomize" may be the word None, the word Auto, or a number, so it is
    // read once as text and, if it is neither keyword, once more as a value.
    if (!readerConfig.Exists(L"randomize"))
    {
        m_randomizeRange = randomizeNone;
    }
    else
    {
        wstring randomizeString = readerConfig(L"randomize");
        if (randomizeString == L"None")
        {
            m_randomizeRange = randomizeNone;
        }
        else if (randomizeString == L"Auto")
        {
            m_randomizeRange = randomizeAuto;
        }
        else
        {
            m_randomizeRange = readerConfig(L"randomize");
        }
    }

    std::string minibatchMode(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(minibatchMode, "Partial");

    // Fetch the per-section configuration for the query and document features.
    ConfigParameters configFeaturesQuery = readerConfig(m_featuresNameQuery, "");
    ConfigParameters configFeaturesDoc = readerConfig(m_featuresNameDoc, "");
    if (configFeaturesQuery.size() == 0)
    {
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    }
    if (configFeaturesDoc.size() == 0)
    {
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    }

    // Feature dimensions; per the original note these are used to handle OOVs.
    m_featuresDimQuery = configFeaturesQuery(L"dim");
    m_featuresDimDoc = configFeaturesDoc(L"dim");

    // Open both inputs.
    std::wstring fileQ = configFeaturesQuery("file");
    std::wstring fileD = configFeaturesDoc("file");
    dssm_queryInput.Init(fileQ, m_featuresDimQuery);
    dssm_docInput.Init(fileD, m_featuresDimDoc);

    // The query input determines how many samples there are.
    m_totalSamples = dssm_queryInput.numRows;

    // Build the identity read order the first time through.
    if (read_order == NULL)
    {
        read_order = new int[m_totalSamples];
        for (int idx = 0; idx < m_totalSamples; idx++)
        {
            read_order[idx] = idx;
        }
    }

    m_mbSize = 0;
}