void FMIndexBuilder::buildMarkers() { size_t starting_byte = m_str_bytes; // Do we need to place new large markers? while((m_str_symbols / m_large_sample_rate) + 1 > m_num_large_markers_wrote) { // Build a new large marker with the accumulated counts up to this point LargeMarker marker; marker.byteIndex = starting_byte; marker.counts = m_runningAC; m_prevLargeMarker = marker; // Write the marker to the temp file mp_lm_tmp->write(reinterpret_cast<const char*>(&m_prevLargeMarker), sizeof(LargeMarker)); m_num_large_markers_wrote += 1; } // We place a new SmallMarkers for every segment. AlphaCount16 smallAC; for(size_t j = 0; j < BWT_ALPHABET::size; ++j) { size_t v = m_runningAC.getByIdx(j) - m_prevLargeMarker.counts.getByIdx(j); if(v > smallAC.getMaxValue()) { std::cerr << "Error: Number of symbols exceeds the maximum value (" << v << " > " << smallAC.getMaxValue() << ")\n"; std::cerr << "RunningAC: " << m_runningAC << "\n"; std::cerr << "PrevAC: " << m_prevLargeMarker.counts << "\n"; std::cerr << "SmallAC:" << smallAC << "\n"; exit(EXIT_FAILURE); } smallAC.setByIdx(j, v); } // Construct the small marker SmallMarker smallMarker; smallMarker.byteCount = starting_byte - m_prevLargeMarker.byteIndex; smallMarker.counts = smallAC; m_prevSmallMarker = smallMarker; // write it to disk mp_sm_tmp->write(reinterpret_cast<const char*>(&m_prevSmallMarker), sizeof(SmallMarker)); m_num_small_markers_wrote += 1; }
// Fill in the FM-index data structures void RLBWT::initializeFMIndex() { m_smallShiftValue = Occurrence::calculateShiftValue(m_smallSampleRate); m_largeShiftValue = Occurrence::calculateShiftValue(m_largeSampleRate); // initialize the marker vectors, // LargeMarkers are placed every 2048 bases (by default) containing the absolute count // of symbols seen up to that point. SmallMarkers are placed every 128 bases with the // count over the last 128 symbols. From these relative counts the absolute count // every 128 symbols can be interpolated. size_t num_large_markers = getNumRequiredMarkers(m_numSymbols, m_largeSampleRate); size_t num_small_markers = getNumRequiredMarkers(m_numSymbols, m_smallSampleRate); m_largeMarkers.resize(num_large_markers); m_smallMarkers.resize(num_small_markers); // Fill in the marker values // We wish to place markers every sampleRate symbols however since a run may // not end exactly on sampleRate boundaries, we place the markers AFTER // the run crossing the boundary ends // Place a blank markers at the start of the data m_largeMarkers[0].unitIndex = 0; m_smallMarkers[0].unitCount = 0; // State variables for the number of markers placed, // the next marker to place, etc size_t curr_large_marker_index = 1; size_t curr_small_marker_index = 1; size_t next_small_marker = m_smallSampleRate; size_t next_large_marker = m_largeSampleRate; size_t prev_small_marker_unit_index = 0; size_t running_total = 0; AlphaCount64 running_ac; for(size_t i = 0; i < m_rlString.size(); ++i) { // Update the count and advance the running total RLUnit& unit = m_rlString[i]; char symbol = unit.getChar(); uint8_t run_len = unit.getCount(); running_ac.add(symbol, run_len); running_total += run_len; size_t curr_unit_index = i + 1; bool last_symbol = i == m_rlString.size() - 1; // Check whether to place a new large marker bool place_last_large_marker = last_symbol && curr_large_marker_index < num_large_markers; while(running_total >= next_large_marker || place_last_large_marker) { size_t expected_marker_pos = curr_large_marker_index * m_largeSampleRate; // Sanity checks // The marker position should always be less than the running total unless // the number of symbols is smaller than the sample rate assert(expected_marker_pos <= running_total || place_last_large_marker); assert((running_total - expected_marker_pos) <= RL_FULL_COUNT || place_last_large_marker); assert(curr_large_marker_index < num_large_markers); assert(running_ac.getSum() == running_total); LargeMarker& marker = m_largeMarkers[curr_large_marker_index]; marker.unitIndex = i + 1; marker.counts = running_ac; next_large_marker += m_largeSampleRate; curr_large_marker_index += 1; place_last_large_marker = last_symbol && curr_large_marker_index < num_large_markers; } // Check whether to place a new small marker bool place_last_small_marker = last_symbol && curr_small_marker_index < num_small_markers; while(running_total >= next_small_marker || place_last_small_marker) { // Place markers size_t expected_marker_pos = curr_small_marker_index * m_smallSampleRate; // Sanity checks // The marker position should always be less than the running total unless // the number of symbols is smaller than the sample rate assert(expected_marker_pos <= running_total || place_last_small_marker); assert((running_total - expected_marker_pos) <= RL_FULL_COUNT || place_last_small_marker); assert(curr_small_marker_index < num_small_markers); assert(running_ac.getSum() == running_total); // Calculate the number of rl units that are contained in this block if(curr_unit_index - prev_small_marker_unit_index > std::numeric_limits<uint16_t>::max()) { std::cerr << "Error: Number of units in occurrence array block " << curr_small_marker_index << " exceeds the maximum value.\n"; exit(EXIT_FAILURE); } // Calculate the large marker to set the relative count from // This is generally the most previously placed large block except it might // be the second-previous in the case that we placed the last large marker. size_t large_marker_index = expected_marker_pos >> m_largeShiftValue; assert(large_marker_index < curr_large_marker_index); // ensure the last has ben placed LargeMarker& prev_large_marker = m_largeMarkers[large_marker_index]; // Set the 8bit AlphaCounts as the sum since the last large (superblock) marker AlphaCount16 smallAC; for(size_t j = 0; j < ALPHABET_SIZE; ++j) { size_t v = running_ac.getByIdx(j) - prev_large_marker.counts.getByIdx(j); if(v > smallAC.getMaxValue()) { std::cerr << "Error: Number of symbols in occurrence array block " << curr_small_marker_index << " exceeds the maximum value (" << v << " > " << smallAC.getMaxValue() << ")\n"; exit(EXIT_FAILURE); } smallAC.setByIdx(j, v); } // Set the small marker SmallMarker& small_marker = m_smallMarkers[curr_small_marker_index]; small_marker.unitCount = curr_unit_index - prev_large_marker.unitIndex; small_marker.counts = smallAC; // Update state variables next_small_marker += m_smallSampleRate; curr_small_marker_index += 1; prev_small_marker_unit_index = curr_unit_index; place_last_small_marker = last_symbol && curr_small_marker_index < num_small_markers; } } assert(curr_small_marker_index == num_small_markers); assert(curr_large_marker_index == num_large_markers); // Initialize C(a) m_predCount.set('$', 0); m_predCount.set('A', running_ac.get('$')); m_predCount.set('C', m_predCount.get('A') + running_ac.get('A')); m_predCount.set('G', m_predCount.get('C') + running_ac.get('C')); m_predCount.set('T', m_predCount.get('G') + running_ac.get('G')); }