コード例 #1
0
ファイル: Indexer.cpp プロジェクト: 6779660/CNTK
void Indexer::BuildFromLines()
{
    assert(m_pos == m_bufferStart);
    m_hasSequenceIds = false;
    size_t lines = 0;
    int64_t offset = GetFileOffset();
    while (!m_done)
    {
        m_pos = (char*)memchr(m_pos, ROW_DELIMITER, m_bufferEnd - m_pos);
        if (m_pos)
        {
            SequenceDescriptor sd = {};
            sd.m_id = lines;
            sd.m_numberOfSamples = 1;
            sd.m_isValid = true;
            sd.m_fileOffsetBytes = offset;
            offset = GetFileOffset() + 1;
            sd.m_byteSize = offset - sd.m_fileOffsetBytes;
            // TODO: ignore empty lines.
            AddSequence(sd);
            ++m_pos;
            ++lines;
        }
        else
        {
            RefillBuffer();
        }
    }
}
コード例 #2
0
ファイル: Indexer.cpp プロジェクト: 6779660/CNTK
// Parses a decimal sequence id starting at the current read position.
//
// Digits are consumed (refilling the buffer as needed) until a COLUMN_DELIMITER
// or a NAME_PREFIX is reached; the terminating character itself is not consumed.
//
// id: receives the parsed value (reset to 0 on entry).
// Returns true if at least one digit was read before the terminator, false if
// the terminator was the very first character encountered.
// Calls RuntimeError on a non-digit character, on a value that does not fit
// into size_t, or when the input ends before a terminator is seen.
bool Indexer::GetNextSequenceId(size_t& id)
{
    // largest value representable by size_t, used for the overflow check.
    const size_t maxId = static_cast<size_t>(-1);

    bool found = false;
    id = 0;
    while (!m_done)
    {
        while (m_pos != m_bufferEnd)
        {
            char c = *m_pos;
            // a well-formed sequence id must end in either a column delimiter 
            // or a name prefix
            if (c == COLUMN_DELIMITER || c == NAME_PREFIX)
            {
                return found;
            }

            // isdigit has undefined behavior for negative arguments, so the
            // character must be converted to unsigned char first.
            if (!isdigit(static_cast<unsigned char>(c)))
            {
                // TODO: ignore malformed sequences
                RuntimeError("Unexpected character('%c')"
                    " while reading a sequence id"
                    " at the offset = %" PRIi64 "\n", c, GetFileOffset());
            }

            found = true;
            const size_t digit = static_cast<size_t>(c - '0');
            // id * 10 + digit fits into size_t iff id <= (maxId - digit) / 10.
            // Comparing the result against the previous value is not enough:
            // the product can wrap around and still land above the old value.
            if (id > (maxId - digit) / 10)
            {
                // TODO: ignore malformed sequences
                RuntimeError("Size_t overflow while reading a sequence id"
                    " at the offset = %" PRIi64 "\n", GetFileOffset());
            }
            id = id * 10 + digit;
            ++m_pos;
        }
        RefillBuffer();
    }

    // TODO: ignore malformed sequences
    // reached EOF without hitting the pipe character.
    RuntimeError("Reached the end of file "
        " while reading a sequence id"
        " at the offset = %" PRIi64 "\n", GetFileOffset());
}
コード例 #3
0
ファイル: Indexer.cpp プロジェクト: Microsoft/CNTK
// Indexes the input line by line: every line becomes a single-sample sequence
// keyed by its zero-based line number and is handed to AddSequenceIfIncluded
// together with the corpus descriptor.
void Indexer::BuildFromLines(CorpusDescriptorPtr corpus)
{
    assert(m_pos == m_bufferStart);
    m_hasSequenceIds = false;

    size_t lineIndex = 0;
    // file offset of the first byte of the line currently being scanned.
    int64_t lineStart = GetFileOffset();

    while (!m_done)
    {
        m_pos = (char*)memchr(m_pos, ROW_DELIMITER, m_bufferEnd - m_pos);
        if (!m_pos)
        {
            // the rest of the buffer holds no delimiter, load more data.
            RefillBuffer();
            continue;
        }

        SequenceDescriptor line = {};
        line.m_numberOfSamples = 1;
        line.m_fileOffsetBytes = lineStart;
        // the following line begins one byte past the delimiter.
        lineStart = GetFileOffset() + 1;
        line.m_byteSize = lineStart - line.m_fileOffsetBytes;
        AddSequenceIfIncluded(corpus, lineIndex, line);

        ++m_pos;
        ++lineIndex;
    }

    if (lineStart >= m_fileOffsetEnd)
    {
        // the last delimiter was the last byte of the input, nothing is left over.
        return;
    }

    // Some characters remain that are not terminated by a newline;
    // index them as a sequence anyway and let the parser deal with it.
    SequenceDescriptor tail = {};
    tail.m_numberOfSamples = 1;
    tail.m_fileOffsetBytes = lineStart;
    tail.m_byteSize = m_fileOffsetEnd - tail.m_fileOffsetBytes;
    AddSequenceIfIncluded(corpus, lineIndex, tail);
}
コード例 #4
0
ファイル: Indexer.cpp プロジェクト: 6779660/CNTK
void Indexer::Build()
{
    if (!m_chunks.empty())
    {
        return;
    }
    
    if (m_maxChunkSize > 0)
    {
        auto fileSize = filesize(m_file);
        m_chunks.reserve((fileSize + m_maxChunkSize - 1) / m_maxChunkSize);
    }

    m_chunks.push_back({});

    RefillBuffer(); // read the first block of data
    if (m_done)
    {
        RuntimeError("Input file is empty");
    }

    if ((m_bufferEnd - m_bufferStart > 3) &&
        (m_bufferStart[0] == '\xEF' && m_bufferStart[1] == '\xBB' && m_bufferStart[2] == '\xBF'))
    {
        // input file contains UTF-8 BOM value, skip it.
        m_pos += 3;
        m_fileOffsetStart += 3;
        m_bufferStart += 3;
    }

    // check the first byte and decide what to do next
    if (!m_hasSequenceIds || m_bufferStart[0] == NAME_PREFIX)
    {
        // skip sequence id parsing, treat lines as individual sequences
        BuildFromLines();
        return;
    }

    size_t id = 0;
    int64_t offset = GetFileOffset();
    // read the very first sequence id
    if (!GetNextSequenceId(id))
    {
        RuntimeError("Expected a sequence id at the offset %" PRIi64 ", none was found.", offset);
    }

    SequenceDescriptor sd = {};
    sd.m_id = id;
    sd.m_fileOffsetBytes = offset;
    sd.m_isValid = true;

    while (!m_done)
    {
        SkipLine(); // ignore whatever is left on this line.
        offset = GetFileOffset(); // a new line starts at this offset;
        sd.m_numberOfSamples++;

        if (!m_done && GetNextSequenceId(id) && id != sd.m_id)
        {
            // found a new sequence, which starts at the [offset] bytes into the file
            sd.m_byteSize = offset - sd.m_fileOffsetBytes;
            AddSequence(sd);
            sd = {};
            sd.m_id = id;
            sd.m_fileOffsetBytes = offset;
            sd.m_isValid = true;
        }
    }

    // calculate the byte size for the last sequence
    sd.m_byteSize = m_fileOffsetEnd - sd.m_fileOffsetBytes;
    AddSequence(sd);
}
コード例 #5
0
ファイル: Indexer.cpp プロジェクト: Microsoft/CNTK
// Builds the index for the input file; does nothing if the index is non-empty.
//
// After reading the first block (and skipping a UTF-8 BOM if present), the
// input is either indexed line by line via BuildFromLines(corpus) -- when
// sequence ids are disabled or the data starts with NAME_PREFIX -- or grouped
// into sequences of consecutive lines that share the same leading sequence id.
// Every finished sequence is passed to AddSequenceIfIncluded with its id.
// Calls RuntimeError when the input is empty or the first line has no id.
void Indexer::Build(CorpusDescriptorPtr corpus)
{
    const bool alreadyBuilt = !m_index.IsEmpty();
    if (alreadyBuilt)
    {
        return;
    }

    m_index.Reserve(filesize(m_file));

    // fetch the first block of data
    RefillBuffer();
    if (m_done)
    {
        RuntimeError("Input file is empty");
    }

    const bool startsWithUtf8Bom =
        (m_bufferEnd - m_bufferStart > 3) &&
        (m_bufferStart[0] == '\xEF' && m_bufferStart[1] == '\xBB' && m_bufferStart[2] == '\xBF');
    if (startsWithUtf8Bom)
    {
        // step over the three BOM bytes.
        m_pos += 3;
        m_fileOffsetStart += 3;
        m_bufferStart += 3;
    }

    // the first data byte decides how the rest of the file is indexed
    const bool oneSequencePerLine = !m_hasSequenceIds || m_bufferStart[0] == NAME_PREFIX;
    if (oneSequencePerLine)
    {
        // no sequence ids to parse, every line is a sequence of its own
        BuildFromLines(corpus);
        return;
    }

    size_t parsedId = 0;
    int64_t lineStart = GetFileOffset();
    // the first line must begin with a sequence id
    if (!TryGetSequenceId(parsedId))
    {
        RuntimeError("Expected a sequence id at the offset %" PRIi64 ", none was found.", lineStart);
    }

    SequenceDescriptor current = {};
    current.m_fileOffsetBytes = lineStart;
    size_t currentKey = parsedId;

    while (!m_done)
    {
        SkipLine(); // the remainder of the line is not needed for indexing.
        lineStart = GetFileOffset(); // offset at which the next line begins.
        current.m_numberOfSamples++;

        if (m_done)
        {
            break;
        }

        // a line without an id, or with the same id, extends the current sequence.
        if (!TryGetSequenceId(parsedId) || parsedId == currentKey)
        {
            continue;
        }

        // a different id: the current sequence ends where this line starts.
        current.m_byteSize = lineStart - current.m_fileOffsetBytes;
        AddSequenceIfIncluded(corpus, currentKey, current);

        current = {};
        current.m_fileOffsetBytes = lineStart;
        currentKey = parsedId;
    }

    // the last sequence extends to the end of the file
    current.m_byteSize = m_fileOffsetEnd - current.m_fileOffsetBytes;
    AddSequenceIfIncluded(corpus, currentKey, current);
}