// ******************************************************************************************** void Block::StoreDNAHuf(BitStream &bit_stream, LzMatcher &lz_matcher, uchar *sym_code, HuffmanEncoder::Code *sym_huf_codes, bool try_lz) { // Info about LZ matches if (try_lz) { FindLzMatches(lz_matcher); StoreLzMatches(bit_stream, lz_matcher); for (uint32 i = 0; i < rec_count; ++i) { uint32 cur_sequence_len = records[i].sequence_len; uchar *cur_sequence = records[i].sequence; for (uint32 j = lz_matches[i].length; j < cur_sequence_len; ++j) { bit_stream.PutBits(sym_huf_codes[sym_code[cur_sequence[j]]].code, sym_huf_codes[sym_code[cur_sequence[j]]].len); } } } else { for (uint32 i = 0; i < rec_count; ++i) { uint32 cur_sequence_len = records[i].sequence_len; uchar *cur_sequence = records[i].sequence; for (uint32 j = 0; j < cur_sequence_len; ++j) { bit_stream.PutBits(sym_huf_codes[sym_code[cur_sequence[j]]].code, sym_huf_codes[sym_code[cur_sequence[j]]].len); } } } bit_stream.FlushPartialWordBuffer(); }
// ******************************************************************************************** void Block::StoreQualityRLE(BitStream &bit_stream, uchar *qua_code, HuffmanEncoder::Code **qua_huf_codes, HuffmanEncoder::Code **run_huf_codes, int32 /*n_qualities*/) { // Quality data uchar prev = 0; uint32 pos = 0; for (uint32 i = 0; i < qua_stream_len; ++i) { uchar qua = qua_code[qua_stream[i]]; uchar len = run_stream[i]; bit_stream.PutBits(qua_huf_codes[prev][qua].code, qua_huf_codes[prev][qua].len); bit_stream.PutBits(run_huf_codes[qua][len].code, run_huf_codes[qua][len].len); prev = qua; pos += len+1; } bit_stream.FlushPartialWordBuffer(); }
// ******************************************************************************************** void Block::StoreQualityPlain(BitStream &bit_stream, uchar *qua_code, HuffmanEncoder::Code **qua_huf_codes, int32 /*n_qualities*/, bool use_truc_hash) { // Quality data if (!use_truc_hash) { for (uint32 i = 0; i < rec_count; ++i) { uchar *cur_quality = records[i].quality; uint32 cur_quality_len = records[i].quality_len; for (uint32 j = 0; j < cur_quality_len; ++j) { int32 qua = qua_code[cur_quality[j]]; bit_stream.PutBits(qua_huf_codes[j+1][qua].code, qua_huf_codes[j+1][qua].len); } } } else { for (uint32 i = 0; i < rec_count; ++i) { uchar *cur_quality = records[i].quality; uint32 cur_quality_len = records[i].quality_len; uint32 cur_quality_len_th = records[i].rec_th_len; bit_stream.PutBits(cur_quality_len_th != cur_quality_len, 1); if (cur_quality_len_th != cur_quality_len) { bit_stream.PutBits(cur_quality_len - cur_quality_len_th, BitStream::BitLength(cur_quality_len)); } for (uint32 j = 0; j < cur_quality_len_th; ++j) { int32 qua = qua_code[cur_quality[j]]; bit_stream.PutBits(qua_huf_codes[j+1][qua].code, qua_huf_codes[j+1][qua].len); } } } bit_stream.FlushPartialWordBuffer(); }
// ******************************************************************************************** void Block::StoreLzMatches(BitStream &bit_stream, LzMatcher &lz_matcher) { for (uint32 i = 0; i < rec_count; ++i) { bit_stream.PutBit(lz_matches[i].length != 0); if (lz_matches[i].length == 0) { bit_stream.PutBit(records[i].lz_inserted); } } bit_stream.FlushPartialWordBuffer(); // DNA data uint32 rec_no_bits = BitStream::BitLength(b_start_rec_no + rec_count - 1); uint32 length_bits; uint32 offset_bits = (uint32)MAX(0, (int32)global_max_sequence_length - (int32)lz_matcher.GetMinMatchLen()); if (offset_bits) { offset_bits = BitStream::BitLength(global_max_sequence_length - lz_matcher.GetMinMatchLen()); } for (uint32 i = 0; i < rec_count; ++i) { if (lz_matches[i].length > 0) { length_bits = BitStream::BitLength(MAX(0, MIN((int32)records[i].sequence_len - (int32)lz_matcher.GetMinMatchLen(), 255))); bit_stream.PutBits(lz_matches[i].rec_no, rec_no_bits); if (length_bits > 0) bit_stream.PutBits(lz_matches[i].length - lz_matcher.GetMinMatchLen(), length_bits); if (offset_bits) bit_stream.PutBits(lz_matches[i].rec_offset, offset_bits); } } bit_stream.FlushPartialWordBuffer(); }
// ******************************************************************************************** void Block::StoreTitle(BitStream &bit_stream, std::vector<Field> &fields, int32 block_no, bool is_num_fields_constant) { uint32 n_fields = (uint32) fields.size(); uint32 n_fields_bits = BitStream::BitLength(n_fields); prev_value.resize(n_fields); for (uint32 i = 0; i < n_fields; ++i) { if (fields[i].is_constant) continue; Field::BlockDesc& block_desc = fields[i].block_desc[block_no]; prev_value[i] = 0; if (!fields[i].is_numeric) { bit_stream.PutBit(block_desc.is_block_constant); } if (fields[i].is_numeric) { block_desc.is_block_delta_constant &= (int32)block_desc.block_delta_constant == fields[i].min_delta; if (fields[i].is_delta_coding) { bit_stream.PutBit(block_desc.is_block_delta_constant); } else { bit_stream.PutBit(block_desc.is_block_value_constant); } } } for (uint32 i = 0; i < rec_count; ++i) { uint32 c_field = 0; uint32 start_pos = 0; FastqRecord &rec = records[i]; if (!is_num_fields_constant) { bit_stream.PutBits(rec.no_of_fields, n_fields_bits); } for (uint32 k = 0; k <= rec.title_len; ++k) { Field &cur_field = fields[c_field]; if (rec.title[k] != cur_field.sep && k < rec.title_len) continue; if (cur_field.is_constant) { start_pos = k+1; c_field++; continue; } if (cur_field.is_numeric) { int32 value = utils::to_num(rec.title+start_pos, k-start_pos); if (i == 0) { bit_stream.PutBits(value-cur_field.min_value, cur_field.no_of_bits_per_value); } else if ((cur_field.is_delta_coding && !cur_field.block_desc[block_no].is_block_delta_constant) || (!cur_field.is_delta_coding && !cur_field.block_desc[block_no].is_block_value_constant)) { int32 to_store; if (cur_field.is_delta_coding) { to_store = value - prev_value[c_field] - cur_field.min_delta; } else { to_store = value - cur_field.min_value; } if (cur_field.Huffman_global) { const HuffmanEncoder::Code* codes = cur_field.Huffman_global->GetCodes(); bit_stream.PutBits(codes[to_store].code, codes[to_store].len); } else { bit_stream.PutBits(to_store, cur_field.no_of_bits_per_num); } } prev_value[c_field] = value; start_pos = k+1; c_field++; continue; } if (i > 0 && cur_field.block_desc[block_no].is_block_constant) { start_pos = k+1; c_field++; continue; } if (!cur_field.is_len_constant) { bit_stream.PutBits(k-start_pos - cur_field.min_len, cur_field.no_of_bits_per_len); } for (uint32 j = 0; j < k-start_pos; ++j) { if (j >= cur_field.len || !cur_field.Ham_mask[j]) { uchar c = rec.title[start_pos+j]; const HuffmanEncoder::Code* codes = cur_field.Huffman_local[MIN(j, Superblock::MAX_FIELD_STAT_LEN)]->GetCodes(); bit_stream.PutBits(codes[c].code, codes[c].len); } } start_pos = k+1; c_field++; } } bit_stream.FlushPartialWordBuffer(); }
// ******************************************************************************************** void Block::Process(BitStream &bit_stream, LzMatcher &lz_matcher, std::vector<Field> &fields, uint32 /*n_fields*/, uint32 fastq_flags, uchar *sym_code, HuffmanEncoder::Code *sym_huf_codes, uchar *qua_code, HuffmanEncoder::Code **qua_huf_codes, uint32 /*max_run_len*/, HuffmanEncoder::Code **run_huf_codes, uint32 n_qualities, uint32 _global_max_sequence_length, uint32 max_quality_length, uint32 block_no, uint32 _quality_stats_mode) { global_max_sequence_length = _global_max_sequence_length; #if (D_RESERVE_BYTES_PER_BLOCK) { uchar bytes[Block::RESERVED_BYTES]; std::fill(bytes, bytes+Block::RESERVED_BYTES, INVALID_BYTE); bit_stream.PutBytes(bytes, Block::RESERVED_BYTES); } #endif if ((fastq_flags & FLAG_PLUS_ONLY) == 0) { for (uint32 i = 0; i < rec_count; ++i) { bit_stream.PutBit(records[i].plus_len == 1); } } uint32 quality_len_bits = BitStream::BitLength(max_quality_length); if ((fastq_flags & FLAG_VARIABLE_LENGTH) != 0) { for (uint32 i = 0; i < rec_count; ++i) { bit_stream.PutBits(records[i].quality_len, quality_len_bits); } } if ((fastq_flags & FLAG_LINE_BREAKS) != 0) { uint32 max_line_break_len = 0; for (uint32 i = 0; i < rec_count; ++i) { if (records[i].sequence_breaks) { for (uint32 j = 0; j < records[i].sequence_breaks->size(); ++j) { if ((*records[i].sequence_breaks)[j] > (int32) max_line_break_len) { max_line_break_len = (*records[i].sequence_breaks)[j]; } } } if (records[i].quality_breaks) { for (uint32 j = 0; j < records[i].quality_breaks->size(); ++j) { if ((*records[i].quality_breaks)[j] > (int32) max_line_break_len) { max_line_break_len = (*records[i].quality_breaks)[j]; } } } } uint32 line_breaks_bits = BitStream::BitLength(max_line_break_len); bit_stream.PutBits(line_breaks_bits, 5); for (uint32 i = 0; i < rec_count; ++i) { if (records[i].sequence_breaks) { for (uint32 j = 0; j < records[i].sequence_breaks->size(); ++j) { bit_stream.PutBits((*records[i].sequence_breaks)[j], line_breaks_bits); } } bit_stream.PutBits(0, line_breaks_bits); if (records[i].quality_breaks) { for (uint32 j = 0; j < records[i].quality_breaks->size(); ++j) { bit_stream.PutBits((*records[i].quality_breaks)[j], line_breaks_bits); } } bit_stream.PutBits(0, line_breaks_bits); } } bit_stream.FlushPartialWordBuffer(); bool is_num_field_constant = (fastq_flags & FLAG_CONST_NUM_FIELDS) != 0; StoreTitle(bit_stream, fields, block_no, is_num_field_constant); if (_quality_stats_mode == QUALITY_RLE) { StoreQualityRLE(bit_stream, qua_code, qua_huf_codes, run_huf_codes, n_qualities); } else { bool use_trunc_hash = _quality_stats_mode == QUALITY_PLAIN_TRUNC; StoreQualityPlain(bit_stream, qua_code, qua_huf_codes, n_qualities, use_trunc_hash); } bool try_lz = (fastq_flags & FLAG_TRY_LZ) != 0; if ((fastq_flags & FLAG_DNA_PLAIN) != 0) { StoreDNAPlain(bit_stream, lz_matcher, sym_code, try_lz); } else { StoreDNAHuf(bit_stream, lz_matcher, sym_code, sym_huf_codes, try_lz); } #if (D_COMPUTE_RECORDS_CRC_PER_BLOCK) uint32 hash = ComputeRecordsCrc32(); bit_stream.PutWord(hash); #endif }