void Indexer::Build() { if (!m_chunks.empty()) { return; } if (m_maxChunkSize > 0) { auto fileSize = filesize(m_file); m_chunks.reserve((fileSize + m_maxChunkSize - 1) / m_maxChunkSize); } m_chunks.push_back({}); RefillBuffer(); // read the first block of data if (m_done) { RuntimeError("Input file is empty"); } if ((m_bufferEnd - m_bufferStart > 3) && (m_bufferStart[0] == '\xEF' && m_bufferStart[1] == '\xBB' && m_bufferStart[2] == '\xBF')) { // input file contains UTF-8 BOM value, skip it. m_pos += 3; m_fileOffsetStart += 3; m_bufferStart += 3; } // check the first byte and decide what to do next if (!m_hasSequenceIds || m_bufferStart[0] == NAME_PREFIX) { // skip sequence id parsing, treat lines as individual sequences BuildFromLines(); return; } size_t id = 0; int64_t offset = GetFileOffset(); // read the very first sequence id if (!GetNextSequenceId(id)) { RuntimeError("Expected a sequence id at the offset %" PRIi64 ", none was found.", offset); } SequenceDescriptor sd = {}; sd.m_id = id; sd.m_fileOffsetBytes = offset; sd.m_isValid = true; while (!m_done) { SkipLine(); // ignore whatever is left on this line. offset = GetFileOffset(); // a new line starts at this offset; sd.m_numberOfSamples++; if (!m_done && GetNextSequenceId(id) && id != sd.m_id) { // found a new sequence, which starts at the [offset] bytes into the file sd.m_byteSize = offset - sd.m_fileOffsetBytes; AddSequence(sd); sd = {}; sd.m_id = id; sd.m_fileOffsetBytes = offset; sd.m_isValid = true; } } // calculate the byte size for the last sequence sd.m_byteSize = m_fileOffsetEnd - sd.m_fileOffsetBytes; AddSequence(sd); }
// Walks the input and passes one SequenceDescriptor, together with its key, to
// AddSequenceIfIncluded() for every stretch of lines that shares a sequence id.
//
// corpus is not inspected here; it is forwarded unchanged to BuildFromLines()
// and AddSequenceIfIncluded().
//
// - Returns immediately when m_index is not empty.
// - Reports "Input file is empty" through RuntimeError when the first
//   RefillBuffer() call leaves m_done set.
// - Delegates to BuildFromLines(corpus) when sequence ids are disabled
//   (m_hasSequenceIds is false) or the first data byte is NAME_PREFIX.
// - Reports an error through RuntimeError when no sequence id can be read at
//   the very start of the data.
void Indexer::Build(CorpusDescriptorPtr corpus)
{
    if (!m_index.IsEmpty())
    {
        // The index already has content, nothing to do.
        return;
    }

    m_index.Reserve(filesize(m_file));

    // Pull in the first block of data.
    RefillBuffer();
    if (m_done)
    {
        RuntimeError("Input file is empty");
    }

    // Skip a leading UTF-8 byte order mark (EF BB BF). It is only looked for
    // when more than three bytes are buffered, so a data byte follows it.
    if (m_bufferEnd - m_bufferStart > 3)
    {
        if (m_bufferStart[0] == '\xEF' && m_bufferStart[1] == '\xBB' && m_bufferStart[2] == '\xBF')
        {
            m_pos += 3;
            m_fileOffsetStart += 3;
            m_bufferStart += 3;
        }
    }

    // Inspect the first byte to pick the indexing strategy.
    if (!m_hasSequenceIds || m_bufferStart[0] == NAME_PREFIX)
    {
        // Skip sequence id parsing, treat lines as individual sequences.
        BuildFromLines(corpus);
        return;
    }

    // The data must begin with a sequence id.
    size_t parsedKey = 0;
    const int64_t firstOffset = GetFileOffset();
    if (!TryGetSequenceId(parsedKey))
    {
        RuntimeError("Expected a sequence id at the offset %" PRIi64 ", none was found.", firstOffset);
    }

    // Key of the sequence being accumulated; kept next to the descriptor
    // rather than stored inside it.
    size_t activeKey = parsedKey;
    SequenceDescriptor pending = {};
    pending.m_fileOffsetBytes = firstOffset;

    while (!m_done)
    {
        // Discard the remainder of the current line.
        SkipLine();

        // The next line (if any) begins here.
        const int64_t lineOffset = GetFileOffset();
        pending.m_numberOfSamples++;

        // A boundary exists only if input remains, an id was read, and that id
        // differs from the active key (evaluated in exactly this order).
        const bool atBoundary = !m_done && TryGetSequenceId(parsedKey) && parsedKey != activeKey;
        if (atBoundary)
        {
            // Close the pending sequence at lineOffset and open a new one there.
            pending.m_byteSize = lineOffset - pending.m_fileOffsetBytes;
            AddSequenceIfIncluded(corpus, activeKey, pending);

            pending = {};
            pending.m_fileOffsetBytes = lineOffset;
            activeKey = parsedKey;
        }
    }

    // The last sequence extends to m_fileOffsetEnd.
    pending.m_byteSize = m_fileOffsetEnd - pending.m_fileOffsetBytes;
    AddSequenceIfIncluded(corpus, activeKey, pending);
}