Esempio n. 1
0
// Configures the reader from `readerConfig`.
//
// Settings consumed, in order:
//   - matrix renames, collected through RenamedMatrices,
//   - "randomize": absent or "None" yields 0, "Auto" yields the local time of
//     day expressed in seconds, any other value is read as a number,
//   - "minibatchMode" (default "Partial"), compared case-insensitively,
//   - "file": passed to the SparseBinaryInput constructor,
//   - "minibatch": when greater than zero it must equal the minibatch size
//     reported by the data input; otherwise the data input's size is adopted.
// Prefetching is switched on once everything above succeeded.
void LibSVMBinaryReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{
    std::map<std::wstring, std::wstring> renamedMatrices;
    RenamedMatrices(readerConfig, renamedMatrices);

    // Work out the randomization value.
    if (!readerConfig.Exists(L"randomize"))
    {
        m_randomize = 0L;
    }
    else
    {
        string mode = readerConfig(L"randomize");
        if (mode == "Auto")
        {
            // derive the value from the current local time of day, in seconds
            time_t now;
            time(&now);
            struct tm* local = localtime(&now);
            m_randomize = (unsigned long) (local->tm_hour * 60 * 60 + local->tm_min * 60 + local->tm_sec);
        }
        else if (mode == "None")
        {
            m_randomize = 0L;
        }
        else
        {
            // neither keyword matched: read the setting as a number
            m_randomize = readerConfig(L"randomize", 0);
        }
    }

    // Partial minibatches are allowed unless the mode says otherwise.
    m_partialMinibatch = true;
    std::string mbMode(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(mbMode, "Partial");

    // Open the data input and apply the renames.
    std::wstring dataPath = readerConfig(L"file", L"");
    m_dataInput = make_shared<SparseBinaryInput<ElemType>>(dataPath);
    m_dataInput->Init(renamedMatrices);

    // A configured minibatch size has to agree with the data input;
    // without one, fall back to whatever the data input reports.
    m_mbSize = (size_t) readerConfig(L"minibatch", 0);
    const bool sizeConfigured = m_mbSize > 0;
    if (sizeConfigured && m_dataInput->GetMBSize() != m_mbSize)
    {
        RuntimeError("Data file and config file have mismatched minibatch sizes.\n");
        return;
    }
    if (!sizeConfigured)
    {
        m_mbSize = m_dataInput->GetMBSize();
    }

    m_prefetchEnabled = true;
}
Esempio n. 2
0
// Creates the section described by `config` and, recursively, every
// subsection nested inside it.
//
// config        - configuration for this section level; the keys read here are
//                 elementSize, wrecords, dim, sectionType, wfile, wsize,
//                 windowSize, labelType, labelDim and compute.
// parentSection - section the new one is added to; replaced by the file's own
//                 section when this level opens a new section file.
// p_records     - record count inherited from the enclosing level, used when
//                 this level does not define 'wrecords'.
// p_windowSize  - window size (in records) inherited from the enclosing level,
//                 used when this level does not define 'windowSize'.
//
// Returns the newly created section, or parentSection (possibly NULL) when
// this level has no section of its own (sectionTypeNull / sectionTypeFile).
//
// Reports InvalidArgument when 'wrecords' resolves to zero, when
// 'sectionType' matches no known type, or when a typed section has no file
// to live in; reports RuntimeError for a bad 'labelType', a duplicate section
// name, or a header that fails validation.
Section* BinaryWriter<ElemType>::CreateSection(const ConfigParameters& config, Section* parentSection, size_t p_records, size_t p_windowSize)
{
    // determine the element size, default to ElemType size
    size_t elementSize = sizeof(ElemType);
    if (config.ExistsCurrent(L"elementSize"))
    {
        elementSize = config(L"elementSize");
    }

    // get the number of records we should expect (max)
    // if defined in previous levels same number will be used
    size_t records = p_records;
    if (config.ExistsCurrent(L"wrecords"))
    {
        records = config(L"wrecords");
    }
    if (records == 0)
    {
        InvalidArgument("Required config variable 'wrecords' missing from BinaryWriter configuration.");
    }

    size_t dim = 1; // default dimension (single item)
    if (config.ExistsCurrent(L"dim"))
    {
        dim = config(L"dim");
    }

    // get the section type (used for caching)
    SectionType sectionType = sectionTypeNull;
    if (config.ExistsCurrent(L"sectionType"))
    {
        SectionType foundType = sectionTypeNull;
        wstring type = config(L"sectionType");
        for (int i = 0; i < sectionTypeMax; i++)
        {
            if (EqualCI(type, SectionTypeStrings[i]))
            {
                foundType = SectionType(i);
                break;
            }
        }

        // check to make sure it matched something
        if (foundType == sectionTypeNull)
        {
            InvalidArgument("Invalid value for 'sectionType' in BinaryWriter configuration: %ls", type.c_str());
        }
        sectionType = foundType;
    }

    // calculate number of bytes = dim*elementSize*records
    size_t dataOnlySize = records * elementSize * dim;
    size_t dataSize = dataOnlySize + sectionHeaderMin;

    // filename to use the one defined at this level, if there is none use the parent file
    SectionFile* file = NULL;
    if (config.ExistsCurrent(L"wfile"))
    {
        std::wstring wfile = config(L"wfile");
        auto secFile = m_secFiles.find(wfile);
        if (secFile != m_secFiles.end())
        {
            // file already opened by an earlier section, reuse it
            file = secFile->second;
        }
        else
        {
            // TODO: sanity check and use records as a clue of how big to make it
            size_t initialSize = config(L"wsize", (size_t) 256); // default to 256MB if not provided
            initialSize *= 1024 * 1024;                          // convert MB to bytes
            if (initialSize < dataSize)
                initialSize = dataSize * 5 / 4; // make the initialSize slightly larger than needed for data
            file = new SectionFile(wfile, fileOptionsReadWrite, initialSize);
            m_secFiles[wfile] = file;
            // the file's own section becomes the parent of whatever is created at this level
            parentSection = file->FileSection();
            parentSection->SetElementCount(records);
            parentSection->SetFileUniqueId(this->m_uniqueID);
        }
    }
    else
    { // no file defined at this config level, use parent file
        if (parentSection != NULL && parentSection->GetSectionFile() != NULL)
        {
            file = parentSection->GetSectionFile();
        }
        else if (sectionType != sectionTypeNull)
        {
            InvalidArgument("No filename (wfile) defined in BinaryWriter configuration.");
        }
    }

    // determine file position if needed
    size_t filePositionLast = 0;
    size_t filePositionNext = 0;

    if (file != NULL)
    {
        // get the next available position in the file (always on the end)
        filePositionLast = file->GetFilePositionMax();
        filePositionNext = file->RoundUp(filePositionLast);

        // we have a gap, zero it out to keep the file clean
        if (filePositionLast != filePositionNext)
        {
            size_t size = filePositionNext - filePositionLast;
            // NOTE(review): this subtraction wraps around if filePositionLast is smaller
            // than the view alignment + 1 - confirm that cannot happen here.
            size_t roundDown = file->RoundUp(filePositionLast - file->GetViewAlignment() - 1);
            // need to get a view to zero out non-used bytes
            void* view = file->GetView(roundDown, file->GetViewAlignment());
            char* ptr = (char*) view + filePositionLast % file->GetViewAlignment();
            memset(ptr, 0, size);
            file->ReleaseView(view);
        }
    }

    // get the new section name
    std::string sectionName = config.ConfigName();

    // get the window size, to see if we want to do separate element mapping
    size_t windowSize = p_windowSize;
    if (config.ExistsCurrent(L"windowSize"))
    {
        windowSize = config(L"windowSize");
    }
    MappingType mappingMain = windowSize ? mappingElementWindow : mappingParent;
    MappingType mappingAux = windowSize ? mappingSection : mappingParent;

    // now create the new section
    Section* section = NULL;
    switch (sectionType)
    {
    case sectionTypeNull:
        // this happens for the original file header, nothing to do
        // also used when multiple files are defined, but none at the base level
        break;
    case sectionTypeFile: // file header
        // shouldn't occur, but same case as above
        break;
    case sectionTypeData: // data section
        section = new Section(file, parentSection, filePositionNext, mappingMain, dataSize);
        section->InitHeader(sectionTypeData, sectionName + ":Data Section", sectionDataFloat, sizeof(ElemType));
        break;
    case sectionTypeLabel: // label data
    {
        size_t elementSize2 = sizeof(LabelIdType);
        dataSize = records * elementSize2 + sectionHeaderMin;
        auto sectionLabel = new SectionLabel(file, parentSection, filePositionNext, mappingMain, dataSize);
        SectionData dataType = sectionDataInt;
        LabelKind labelKind = labelCategory; // default
        if (config.Match(L"labelType", L"Regression"))
        {
            labelKind = labelRegression;
            dataType = sectionDataFloat;
            // NOTE(review): dataSize above was computed with sizeof(LabelIdType) and is not
            // recomputed after this change; the common header setup further down also stores
            // the outer elementSize rather than elementSize2 - confirm this is intended.
            elementSize2 = sizeof(ElemType);
        }
        else if (config.Match(L"labelType", L"Category"))
        {
            // everything set already, default value
        }
        else
        {
            RuntimeError("Invalid type 'labelType' or missing in BinaryWriter configuration.");
        }

        // initialize the section header
        sectionLabel->InitHeader(sectionTypeLabel, sectionName + ":Labels", dataType, (WORD) elementSize2);

        // initialize the special label header items
        sectionLabel->SetLabelKind(labelKind);
        sectionLabel->SetLabelDim(config(L"labelDim"));
        section = sectionLabel;
        break;
    }
    case sectionTypeLabelMapping: // label mapping table (array of strings)
        section = new SectionString(file, parentSection, filePositionNext, mappingAux, dataSize);
        section->InitHeader(sectionTypeLabelMapping, sectionName + ":Label Map", sectionDataStrings, 0); // declare variable length strings
        section->SetFlags(flagAuxilarySection);
        section->SetFlags(flagVariableSized);
        break;
    case sectionTypeStats: // data statistics
    {
        // one NumericStatistics record per entry of the 'compute' array
        ConfigArray calcStats = config(L"compute");
        records = calcStats.size();
        elementSize = sizeof(NumericStatistics);
        dataOnlySize = records * elementSize;
        dataSize = dataOnlySize + sectionHeaderMin;
        auto sectionStats = new SectionStats(file, parentSection, filePositionNext, mappingAux, dataSize);
        sectionStats->InitHeader(sectionTypeStats, sectionName + ":Data Statistics", sectionDataStruct, sizeof(NumericStatistics));
        sectionStats->SetFlags(flagAuxilarySection);
        section = sectionStats;
        break;
    }
    case sectionTypeCategoryLabel:
        section = new Section(file, parentSection, filePositionNext, mappingMain, dataSize);
        section->InitHeader(sectionTypeCategoryLabel, sectionName + ":Category Labels", sectionDataFloat, sizeof(ElemType));
        break;
    }

    // set the rest of the header variables necessary
    if (section == NULL)
    {
        // NULL or file section/already created
        section = parentSection;
    }
    else
    {
        section->SetElementSize(elementSize);
        section->SetElementsPerRecord(dim);
        section->SetElementCount(records * dim);
        section->SetSize(dataSize);
        section->SetSizeAll(dataSize);

        // windowSize is in records, convert to bytes
        size_t dataWindowSize = windowSize ? windowSize * elementSize * dim : dataOnlySize;
        // clamp it down to actual data size
        dataWindowSize = min(dataOnlySize, dataWindowSize);

        // now get the data pointer setup and allocate the view as necessary
        bool auxSection = !!(section->GetFlags() & flagAuxilarySection);
        section->EnsureElements(0, auxSection ? dataOnlySize : dataWindowSize);

        // update the max file position for the next section
        file->SetFilePositionMax(section->GetFilePosition() + dataSize);

        // Add new section to parent
        parentSection->AddSection(section);
    }

    // From here on down we have a fully usable section object

    // now find the subsections and repeat
    vector<std::wstring> subsections;
    FindConfigNames(config, "sectionType", subsections);

    // look for any children and create them as well
    // (iterate by reference: the names are only read, no need to copy each string)
    for (const std::wstring& subsection : subsections)
    {
        CreateSection(config(subsection), section, records, windowSize);
    }

    // wait until here so everything is mapped and valid in the object
    if (sectionType == sectionTypeStats)
    {
        ConfigArray calcStats = config(L"compute");
        // for sectionTypeStats the section was created as a SectionStats above
        static_cast<SectionStats*>(section)->InitCompute(calcStats);
    }

    // add to section map
    if (sectionType != sectionTypeFile && sectionType != sectionTypeNull)
    {
        std::wstring wsectionName = msra::strfun::utf16(sectionName);
        // can't have identical names in a write configuration
        if (m_sections.find(wsectionName) != m_sections.end())
        {
            RuntimeError("Identical section name appears twice:%s", sectionName.c_str());
        }
        m_sections[wsectionName] = section;
    }

    // validate the header (make sure it's sane)
    if (section && file && !section->ValidateHeader(file->Writing()))
    {
        RuntimeError("Invalid header in file %ls, in header %ls\n", file->GetName().c_str(), section->GetName().c_str());
    }

    // return the now complete section
    return section;
}
Esempio n. 3
0
// Configures the DSSM reader from `readerConfig`.
//
// The configuration must yield exactly two feature sections and one label
// section (the label only acts as a placeholder for DSSM data). Of the two
// feature names returned by GetFileConfigNames, index 1 is taken as the query
// features and index 0 as the document features.
//
// After resetting the reader state, the function reads "traceLevel",
// "randomize" and "minibatchMode", initializes the query and document inputs
// from each feature section's "file" and "dim" entries, and builds the
// identity read order when none has been allocated yet.
void DSSMReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig)
{
    // Names of the feature and label sections found in the config.
    std::vector<std::wstring> featureNames;
    std::vector<std::wstring> labelNames;
    GetFileConfigNames(readerConfig, featureNames, labelNames);

    // Anything other than two features plus one label is rejected up front.
    if (featureNames.size() != 2 || labelNames.size() != 1)
    {
        RuntimeError("DSSM requires exactly two features and one label. Their names should match those in NDL definition");
        return;
    }

    // Mind the order: the query name sits at index 1, the document name at index 0.
    m_featuresNameQuery = featureNames[1];
    m_featuresNameDoc = featureNames[0];
    m_labelsName = labelNames[0];

    // Reset counters and flags to their starting values.
    m_epochStartSample = 0;
    m_totalSamples = 0;
    m_epoch = 0;
    m_mbStartSample = 0;
    m_labelDim = 0;
    m_labelIdMax = 0;
    m_endReached = false;
    m_partialMinibatch = false;
    m_labelType = labelCategory;
    m_readNextSample = 0;
    m_traceLevel = readerConfig(L"traceLevel", 0);

    // "randomize" is either one of two keywords or a number, so it is read
    // as a string first and re-read as a number when no keyword matches.
    if (!readerConfig.Exists(L"randomize"))
    {
        m_randomizeRange = randomizeNone;
    }
    else
    {
        wstring mode = readerConfig(L"randomize");
        if (mode == L"Auto")
        {
            m_randomizeRange = randomizeAuto;
        }
        else if (mode == L"None")
        {
            m_randomizeRange = randomizeNone;
        }
        else
        {
            m_randomizeRange = readerConfig(L"randomize");
        }
    }

    std::string mbMode(readerConfig(L"minibatchMode", "Partial"));
    m_partialMinibatch = EqualCI(mbMode, "Partial");

    // Fetch the config sub-sections for the query and the document features.
    ConfigParameters queryConfig = readerConfig(m_featuresNameQuery, "");
    ConfigParameters docConfig = readerConfig(m_featuresNameDoc, "");

    if (queryConfig.size() == 0)
    {
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    }
    if (docConfig.size() == 0)
    {
        RuntimeError("features file not found, required in configuration: i.e. 'features=[file=c:\\myfile.txt;start=1;dim=123]'");
    }

    // Feature dimensions; per the original note these are used to handle OOVs.
    m_featuresDimQuery = queryConfig(L"dim");
    m_featuresDimDoc = docConfig(L"dim");

    std::wstring queryPath = queryConfig("file");
    std::wstring docPath = docConfig("file");

    dssm_queryInput.Init(queryPath, m_featuresDimQuery);
    dssm_docInput.Init(docPath, m_featuresDimDoc);

    // The sample count comes from the query input's row count.
    m_totalSamples = dssm_queryInput.numRows;

    // Build the identity ordering once; an existing ordering is left untouched.
    if (read_order == NULL)
    {
        read_order = new int[m_totalSamples];
        int idx = 0;
        while (idx < m_totalSamples)
        {
            read_order[idx] = idx;
            idx++;
        }
    }

    m_mbSize = 0;
}