// ******************************************************************************************** void Block::ReadLzMatches(BitStream &bit_stream, LzMatcher &lz_matcher) { for (uint32 i = 0; i < rec_count; ++i) { uint32 tmp(0); // remove warning... bit_stream.GetBit(tmp); lz_matches[i].length = tmp; if (!lz_matches[i].length) { bit_stream.GetBit(tmp); records[i].lz_inserted = tmp != 0; } } bit_stream.FlushInputWordBuffer(); // DNA data uint32 rec_no_bits = BitStream::BitLength(b_start_rec_no + rec_count - 1); uint32 length_bits; uint32 offset_bits = (uint32)MAX(0, (int32)global_max_sequence_length - (int32)lz_matcher.GetMinMatchLen()); if (offset_bits) { offset_bits = BitStream::BitLength(offset_bits); } for (uint32 i = 0; i < rec_count; ++i) { if (lz_matches[i].length > 0) { uint32 tmp; bit_stream.GetBits(tmp, rec_no_bits); lz_matches[i].rec_no = tmp; length_bits = (uint32)MAX(0, MIN((int32)records[i].sequence_len - (int32)lz_matcher.GetMinMatchLen(), 255)); if (length_bits) { length_bits = BitStream::BitLength(length_bits); bit_stream.GetBits(tmp, length_bits); } else { tmp = 0; } lz_matches[i].length = tmp + lz_matcher.GetMinMatchLen(); if (offset_bits) { bit_stream.GetBits(tmp, offset_bits); } else { tmp = 0; } lz_matches[i].rec_offset = tmp; } } bit_stream.FlushInputWordBuffer(); }
// ******************************************************************************************** void Block::ReadDNAPlain(BitStream &bit_stream, LzMatcher &lz_matcher, std::vector<uchar> &symbols, bool try_lz, bool extracting) { // Info about LZ matches lz_matches.resize(rec_count); for (uint32 i = 0; i < rec_count; ++i) { records[i].sequence_len = records[i].quality_len - no_of_amb[i]; } if (try_lz) { ReadLzMatches(bit_stream, lz_matcher); for (uint32 i = 0; i < rec_count; ++i) { if (!extracting) DecodeLzMatches(lz_matcher, i); uint32 cur_sequence_len = records[i].sequence_len; uchar *cur_sequence = records[i].sequence; my_assert(lz_matches[i].length <= cur_sequence_len); for (uint32 j = lz_matches[i].length; j < cur_sequence_len; ++j) { uint32 tmp(0); // remove warning... bit_stream.Get2Bits(tmp); cur_sequence[j] = symbols[tmp]; } cur_sequence[cur_sequence_len] = '\0'; if (!extracting) DecodeLzMatches_Insert(lz_matcher, i); } } else { for (uint32 i = 0; i < rec_count; ++i) { lz_matches[i].length = 0; uint32 cur_sequence_len = records[i].sequence_len; uchar *cur_sequence = records[i].sequence; for (uint32 j = 0; j < cur_sequence_len; ++j) { uint32 tmp(0); // reduce warning... bit_stream.Get2Bits(tmp); cur_sequence[j] = symbols[tmp]; } cur_sequence[cur_sequence_len] = '\0'; } } bit_stream.FlushInputWordBuffer(); }
// ******************************************************************************************** void Block::ReadQualityRLE(BitStream &bit_stream, std::vector<uchar> &qualities, std::vector<HuffmanEncoder*> &Huffman_qua, int32 /*n_qualities*/, std::vector<HuffmanEncoder*> &Huffman_run, int32 /*max_run_len*/) { int32 max_len = 0; uint32 i = 0; for (; i < rec_count; ++i) { max_len += records[i].quality_len; } qua_stream_len = run_stream_len = max_len; qua_stream = new uchar[max_len]; run_stream = new uchar[max_len]; uint32 prev = 0; i = 0; for (uint32 j = 0; j < qua_stream_len; ++i) { int32 h_tmp; uint32 bit; // Quality data bit_stream.GetBits(bit, Huffman_qua[prev]->GetMinLen()); h_tmp = Huffman_qua[prev]->DecodeFast(bit); while (h_tmp < 0) { bit_stream.GetBit(bit); h_tmp = Huffman_qua[prev]->Decode(bit); }; my_assert(h_tmp < (int32)qualities.size()); qua_stream[i] = qualities[h_tmp]; prev = h_tmp; // Run data bit_stream.GetBits(bit, Huffman_run[prev]->GetMinLen()); h_tmp = Huffman_run[prev]->DecodeFast(bit); while (h_tmp < 0) { bit_stream.GetBit(bit); h_tmp = Huffman_run[prev]->Decode(bit); }; run_stream[i] = (uchar) h_tmp; j += h_tmp+1; } bit_stream.FlushInputWordBuffer(); }
// ******************************************************************************************** void HuffmanEncoder::LoadTree(BitStream& bit_stream, HuffmanEncoder& tree) { static uchar* mem_buf = NULL; static uint32 mem_size; bit_stream.FlushInputWordBuffer(); bit_stream.GetWord(mem_size); my_assert(mem_size > 0); mem_buf = new uchar[mem_size]; bit_stream.GetBytes(mem_buf, mem_size); tree.LoadTree(mem_buf, mem_size); delete [] mem_buf; }
// ******************************************************************************************** void Block::ReadQualityPlain(BitStream &bit_stream, std::vector<uchar> &qualities, std::vector<HuffmanEncoder*> &Huffman_qua, int32 /*n_qualities*/, bool trucated_hashes, bool /*uses_const_delta*/) { // Quality data for (uint32 i = 0; i < rec_count; ++i) { no_of_amb[i] = 0; uchar *cur_quality = records[i].quality; uint32 cur_quality_len = records[i].quality_len; uint32 trunc_len = 0; if (trucated_hashes) // Truncate # { uint32 is_truncated(0); // skip warning... bit_stream.GetBit(is_truncated); if (is_truncated) { bit_stream.GetBits(trunc_len, BitStream::BitLength(cur_quality_len)); } } for (uint32 j = 0; j < cur_quality_len-trunc_len; ++j) { uint32 bit; int32 h_tmp; bit_stream.GetBits(bit, Huffman_qua[j+1]->GetMinLen()); h_tmp = Huffman_qua[j+1]->DecodeFast(bit); while (h_tmp < 0) { bit_stream.GetBit(bit); h_tmp = Huffman_qua[j+1]->Decode(bit); }; if ((cur_quality[j] = qualities[h_tmp]) >= 128) { no_of_amb[i]++; } } for (uint32 j = cur_quality_len-trunc_len; j < cur_quality_len; ++j) { cur_quality[j] = '#'; } } bit_stream.FlushInputWordBuffer(); }
// ******************************************************************************************** void Block::ReadDNAHuf(BitStream &bit_stream, LzMatcher &lz_matcher, std::vector<uchar> &symbols, HuffmanEncoder *Huffman_sym, bool try_lz, bool extracting) { // Info about LZ matches lz_matches.resize(rec_count); for (uint32 i = 0; i < rec_count; ++i) { records[i].sequence_len = records[i].quality_len - no_of_amb[i]; } if (try_lz) { ReadLzMatches(bit_stream, lz_matcher); for (uint32 i = 0; i < rec_count; ++i) { if (!extracting) DecodeLzMatches(lz_matcher, i); uint32 cur_sequence_len = records[i].sequence_len; uchar *cur_sequence = records[i].sequence; for (uint32 j = lz_matches[i].length; j < cur_sequence_len; ++j) { // Symbols uint32 bit; bit_stream.GetBits(bit, Huffman_sym->GetMinLen()); int32 h_tmp = Huffman_sym->DecodeFast(bit); while (h_tmp < 0) { bit_stream.GetBit(bit); h_tmp = Huffman_sym->Decode(bit); }; cur_sequence[j] = symbols[h_tmp]; } cur_sequence[cur_sequence_len] = '\0'; if (!extracting) DecodeLzMatches_Insert(lz_matcher, i); } } else { for (uint32 i = 0; i < rec_count; ++i) { lz_matches[i].length = 0; uint32 cur_sequence_len = records[i].sequence_len; uchar *cur_sequence = records[i].sequence; for (uint32 j = lz_matches[i].length; j < cur_sequence_len; ++j) { // Symbols uint32 bit; bit_stream.GetBits(bit, Huffman_sym->GetMinLen()); int32 h_tmp = Huffman_sym->DecodeFast(bit); while (h_tmp < 0) { bit_stream.GetBit(bit); h_tmp = Huffman_sym->Decode(bit); }; cur_sequence[j] = symbols[h_tmp]; } cur_sequence[cur_sequence_len] = '\0'; } } bit_stream.FlushInputWordBuffer(); }
// ********************************************************************************************
// Decodes the title line of every record of the block and derives the "plus" line from it.
//
// Step 1 reads one flag bit for each non-constant field and stores it in that field's
//        block_desc[block_no] entry.
// Step 2 rebuilds, record by record, the title from its fields (constant fields, numeric
//        fields, block-constant text fields and individually coded text fields).
// Step 3 fills cur_rec.plus for the record.
//
// bit_stream             - input stream positioned at the title data
// fields                 - field descriptors; block_desc[block_no], block_str_start and
//                          block_str_len of these descriptors are written here
// n_fields               - number of entries of `fields` that are used
// block_no               - index into each field's block_desc
// is_num_fields_constant - when false, every record stores its own field count
void Block::ReadTitle(BitStream &bit_stream, std::vector<Field> &fields, uint32 n_fields, int32 block_no, bool is_num_fields_constant)
{
	prev_value.resize(n_fields);
	uint32 n_fields_bits = BitStream::BitLength(n_fields);
	uint32 tmp = 0;

	// Step 1: per-field flags valid for this block (constant fields store nothing)
	for (uint32 i = 0; i < n_fields; ++i)
	{
		if (fields[i].is_constant)
			continue;

		prev_value[i] = 0;
		if (!fields[i].is_numeric)
		{
			// text field: flag goes to is_block_constant
			bit_stream.GetBit(tmp);
			fields[i].block_desc[block_no].is_block_constant = tmp != 0;
		}
		else
		{
			// numeric field: the same bit is stored as "delta constant" or
			// "value constant", depending on the field's coding
			bit_stream.GetBit(tmp);
			if (fields[i].is_delta_coding)
			{
				fields[i].block_desc[block_no].is_block_delta_constant = tmp != 0;
			}
			else
			{
				fields[i].block_desc[block_no].is_block_value_constant = tmp != 0;
			}
		}
	}

	// Step 2 + 3: one title (and plus line) per record
	for (uint32 i = 0; i < rec_count; ++i)
	{
		FastqRecord& cur_rec = records[i];

		// Field count of this record: fixed, or read from the stream
		uint32 cn_fields = n_fields;
		if (!is_num_fields_constant)
		{
			bit_stream.GetBits(tmp, n_fields_bits);
			cn_fields = tmp;
		}

		for (uint32 j = 0; j < cn_fields; ++j)
		{
			Field &cur_field = fields[j];

			// Constant field: the stored text and separator are appended, nothing is read
			if (cur_field.is_constant)
			{
				cur_rec.AppendTitle(cur_field.data, cur_field.len);
				cur_rec.AppendTitle(cur_field.sep);
				continue;
			}

			// Numeric field
			if (cur_field.is_numeric)
			{
				uint32 num_val = 0;
				// NOTE(review): presumably makes room for the digits written by
				// utils::to_string directly into the title buffer - confirm
				if (cur_rec.title_len + 10 >= cur_rec.title_size)
				{
					cur_rec.Extend(cur_rec.title, cur_rec.title_size);
				}

				if (i == 0)
				{
					// First record of the block: value is stored directly, relative to min_value
					bit_stream.GetBits(num_val, cur_field.no_of_bits_per_value);
					num_val += cur_field.min_value;
					cur_rec.title_len += utils::to_string(cur_rec.title+cur_rec.title_len, num_val);
					prev_value[j] = num_val;
				}
				else
				{
					if ((cur_field.is_delta_coding && !cur_field.block_desc[block_no].is_block_delta_constant) ||
						(!cur_field.is_delta_coding && !cur_field.block_desc[block_no].is_block_value_constant))
					{
						// Delta/value is not constant in this block: a code is stored,
						// unless the field needs zero bits per number
						if (cur_field.no_of_bits_per_num > 0)
						{
							if (cur_field.Huffman_global)
							{
								// Huffman coded: fast lookup first, then bit by bit
								uint32 bit;
								int32 h_tmp;
								bit_stream.GetBits(bit, cur_field.Huffman_global->GetMinLen());
								h_tmp = cur_field.Huffman_global->DecodeFast(bit);
								while (h_tmp < 0)
								{
									bit_stream.GetBit(bit);
									h_tmp = cur_field.Huffman_global->Decode(bit);
								};
								num_val = h_tmp;
							}
							else
							{
								// Fixed-width code
								bit_stream.GetBits(num_val, cur_field.no_of_bits_per_num);
							}
						}
						else
						{
							num_val = 0;
						}
					}
					else
					{
						// Constant within the block: nothing is read
						if (cur_field.is_delta_coding)
						{
							num_val = 0;
						}
						else
						{
							// chosen so that adding min_value below yields prev_value[j] again
							num_val = prev_value[j] - cur_field.min_value;
						}
					}

					// Turn the stored code into the actual value
					if (cur_field.is_delta_coding)
					{
						num_val += prev_value[j] + cur_field.min_delta;
					}
					else
					{
						num_val += cur_field.min_value;
					}
					cur_rec.title_len += utils::to_string(cur_rec.title+cur_rec.title_len, num_val);
					prev_value[j] = num_val;
				}
				cur_rec.AppendTitle(cur_field.sep);
				continue;
			}

			// Text field that is constant within the block: for all records but the first
			// the text is copied from the title of record 0 (position remembered below)
			if (i > 0 && cur_field.block_desc[block_no].is_block_constant)
			{
				cur_rec.AppendTitle(records[0].title+cur_field.block_str_start, cur_field.block_str_len);
				cur_rec.AppendTitle(cur_field.sep);
				continue;
			}

			// Individually coded text field: length first (stored relative to min_len
			// unless the field has constant length)
			uint32 field_len;
			if (!cur_field.is_len_constant)
			{
				bit_stream.GetBits(field_len, cur_field.no_of_bits_per_len);
				field_len += cur_field.min_len;
			}
			else
			{
				field_len = cur_field.len;
			}

			for (uint32 k = 0; k < field_len; ++k)
			{
				if (k < cur_field.len && cur_field.Ham_mask[k])
				{
					// Position flagged in Ham_mask: character is taken from the field's stored data
					cur_rec.AppendTitle(cur_field.data[k]);
				}
				else
				{
					// Otherwise the character is Huffman coded with a per-position decoder;
					// the decoder index is limited to Superblock::MAX_FIELD_STAT_LEN
					uint32 bit;
					int32 h_tmp;
					HuffmanEncoder *cur_huf = cur_field.Huffman_local[MIN(k, Superblock::MAX_FIELD_STAT_LEN)];
					bit_stream.GetBits(bit, cur_huf->GetMinLen());
					h_tmp = cur_huf->DecodeFast(bit);
					while (h_tmp < 0)
					{
						bit_stream.GetBit(bit);
						h_tmp = cur_huf->Decode(bit);
					};
					tmp = h_tmp;
					cur_rec.AppendTitle((uchar) tmp);
				}
			}

			// First record of a block-constant text field: remember where the text sits
			// in this title so that the following records can copy it
			if (i == 0 && cur_field.block_desc[block_no].is_block_constant)
			{
				cur_field.block_str_start = cur_rec.title_len - field_len;
				cur_field.block_str_len = field_len;
			}
			cur_rec.AppendTitle(cur_field.sep);
		}
		// Do not count the separator appended after the last field
		// (presumably the terminating '\0')
		cur_rec.title_len--;

		// Step 3: plus line - either a lone '+' or '+' followed by the title text
		// (title without its first character)
		cur_rec.plus[0] = '+';
		if (cur_rec.plus_len == 1)
		{
			cur_rec.plus[1] = '\0';
		}
		else
		{
			cur_rec.plus_len = cur_rec.title_len;
			cur_rec.ExtendTo(cur_rec.plus, cur_rec.plus_size, cur_rec.title_len+2);
			// NOTE(review): no '\0' is written behind the copied text here - confirm
			// that termination is handled elsewhere
			std::copy(cur_rec.title+1, cur_rec.title+cur_rec.title_len, cur_rec.plus+1);
		}
	}
	bit_stream.FlushInputWordBuffer();
}
// ******************************************************************************************** void Block::Read(BitStream &bit_stream, LzMatcher &lz_matcher, std::vector<Field> &fields, uint32 n_fields, uint32 fastq_flags, std::vector<uchar> &symbols, HuffmanEncoder *Huffman_sym, std::vector<uchar> &qualities, std::vector<HuffmanEncoder*> &Huffman_qua, uint32 max_run_len, std::vector<HuffmanEncoder*> &Huffman_run, uint32 n_qualities, uint32 _global_max_sequence_length, uint32 max_quality_length, uint32 block_no, uint32 quality_stats_mode, bool extracting) { global_max_sequence_length = _global_max_sequence_length; #if (D_RESERVE_BYTES_PER_BLOCK) { uchar bytes[Block::RESERVED_BYTES]; bit_stream.GetBytes(bytes, Block::RESERVED_BYTES); for (uint32 i = 0; i < Block::RESERVED_BYTES; ++i) { my_assert(bytes[i] == INVALID_BYTE); } } #endif no_of_amb.resize(rec_count); for (uint32 i = 0; i < rec_count; ++i) { records[i].Reset(); no_of_amb[i] = 0; } int32 quality_len_bits = BitStream::BitLength(max_quality_length); if ((fastq_flags & FLAG_PLUS_ONLY) == 0) { for (uint32 i = 0; i < rec_count; ++i) { bit_stream.GetBit(records[i].plus_len); } } else { for (uint32 i = 0; i < rec_count; ++i) { records[i].plus_len = 1; } } if ((fastq_flags & FLAG_VARIABLE_LENGTH) != 0) { uint32 tmp; for (uint32 i = 0; i < rec_count; ++i) { FastqRecord& rec = records[i]; bit_stream.GetBits(tmp, quality_len_bits); rec.quality_len = tmp; rec.ExtendTo(rec.quality, rec.quality_size, tmp+2); rec.sequence_len = tmp; rec.ExtendTo(rec.sequence, rec.sequence_size, tmp+2); } } else { for (uint32 i = 0; i < rec_count; ++i) { FastqRecord& rec = records[i]; rec.quality_len = max_quality_length; rec.ExtendTo(rec.quality, rec.quality_size, max_quality_length+2); rec.sequence_len = global_max_sequence_length; rec.ExtendTo(rec.sequence, rec.sequence_size, global_max_sequence_length+2); } } if ((fastq_flags & FLAG_LINE_BREAKS) != 0) { uint32 line_breaks_bits; bit_stream.GetBits(line_breaks_bits, 5); uint32 tmp; for 
(uint32 i = 0; i < rec_count; ++i) { FastqRecord& rec = records[i]; if (rec.sequence_breaks) { delete rec.sequence_breaks; rec.sequence_breaks = NULL; } bit_stream.GetBits(tmp, line_breaks_bits); while (tmp != 0) { if (!rec.sequence_breaks) { rec.sequence_breaks = new std::vector<int>; } rec.sequence_breaks->push_back(tmp); bit_stream.GetBits(tmp, line_breaks_bits); } if (rec.quality_breaks) { delete rec.quality_breaks; rec.quality_breaks = NULL; } bit_stream.GetBits(tmp, line_breaks_bits); while (tmp != 0) { if (!rec.quality_breaks) { rec.quality_breaks = new std::vector<int>; } rec.quality_breaks->push_back(tmp); bit_stream.GetBits(tmp, line_breaks_bits); } } } bit_stream.FlushInputWordBuffer(); bool is_num_fields_constant = (fastq_flags & FLAG_CONST_NUM_FIELDS) != 0; ReadTitle(bit_stream, fields, n_fields, block_no, is_num_fields_constant); if (quality_stats_mode == QUALITY_RLE) { ReadQualityRLE(bit_stream, qualities, Huffman_qua, n_qualities, Huffman_run, max_run_len); MakeUnRLE(); } else { bool use_trunc_h = quality_stats_mode == QUALITY_PLAIN_TRUNC; bool uses_const_delta = (fastq_flags & (FLAG_USE_DELTA | FLAG_DELTA_CONSTANT)) == (FLAG_USE_DELTA | FLAG_DELTA_CONSTANT); ReadQualityPlain(bit_stream, qualities, Huffman_qua, n_qualities, use_trunc_h, uses_const_delta); } bool try_lz = (fastq_flags & FLAG_TRY_LZ) != 0; if ((fastq_flags & FLAG_DNA_PLAIN) != 0) { ReadDNAPlain(bit_stream, lz_matcher, symbols, try_lz, extracting); } else { ReadDNAHuf(bit_stream, lz_matcher, symbols, Huffman_sym, try_lz, extracting); // lz_matches not supported when Huffman encoding } #if (D_COMPUTE_RECORDS_CRC_PER_BLOCK) uint32 hash; bit_stream.GetWord(hash); my_assert(hash == ComputeRecordsCrc32()); #endif }