Exemple #1
0
// ********************************************************************************************
// Reads the LZ-match description of every record in this block.
//
// The data comes in two passes, each followed by FlushInputWordBuffer():
//   1. one flag bit per record ("has a match"); records without a match
//      carry a second bit that is stored in records[i].lz_inserted;
//   2. for every record flagged in pass 1: the referenced record number,
//      the match length (stored relative to lz_matcher.GetMinMatchLen())
//      and the offset inside the referenced record.
void Block::ReadLzMatches(BitStream &bit_stream, LzMatcher &lz_matcher)
{
	// Pass 1: per-record flags.
	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		uint32 flag(0);
		bit_stream.GetBit(flag);
		lz_matches[rec].length = flag;
		if (lz_matches[rec].length)
		{
			continue;
		}

		// No match for this record: the next bit is its "inserted" flag.
		bit_stream.GetBit(flag);
		records[rec].lz_inserted = (flag != 0);
	}
	bit_stream.FlushInputWordBuffer();

	// Pass 2: match details.
	// Field widths shared by all records of the block.
	const uint32 rec_no_bits = BitStream::BitLength(b_start_rec_no + rec_count - 1);
	const uint32 offset_range = (uint32)MAX(0, (int32)global_max_sequence_length - (int32)lz_matcher.GetMinMatchLen());

	uint32 offset_bits = 0;
	if (offset_range != 0)
	{
		offset_bits = BitStream::BitLength(offset_range);
	}

	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		const bool has_match = lz_matches[rec].length > 0;
		if (!has_match)
		{
			continue;
		}

		// Number of the record the match points to.
		uint32 ref_rec_no;
		bit_stream.GetBits(ref_rec_no, rec_no_bits);
		lz_matches[rec].rec_no = ref_rec_no;

		// Match length above the minimum; the field width depends on how much
		// longer than the minimum a match in this record can be (capped at 255).
		const uint32 length_range = (uint32)MAX(0, MIN((int32)records[rec].sequence_len - (int32)lz_matcher.GetMinMatchLen(), 255));
		uint32 extra_length = 0;
		if (length_range != 0)
		{
			bit_stream.GetBits(extra_length, BitStream::BitLength(length_range));
		}
		lz_matches[rec].length = extra_length + lz_matcher.GetMinMatchLen();

		// Offset inside the referenced record (zero-width field => offset 0).
		uint32 ref_offset = 0;
		if (offset_bits != 0)
		{
			bit_stream.GetBits(ref_offset, offset_bits);
		}
		lz_matches[rec].rec_offset = ref_offset;
	}
	bit_stream.FlushInputWordBuffer();
}
Exemple #2
0
// ********************************************************************************************
// Decodes the DNA sequences of all records when each symbol is stored as a
// plain 2-bit index into `symbols`.
//
// bit_stream  - input stream
// lz_matcher  - LZ matcher passed on to ReadLzMatches / DecodeLzMatches*
// symbols     - lookup table indexed by the 2-bit code
// try_lz      - when true, LZ match information is read first and only the
//               part of each sequence after the match is read from the stream
// extracting  - when true, DecodeLzMatches / DecodeLzMatches_Insert are skipped
void Block::ReadDNAPlain(BitStream &bit_stream, LzMatcher &lz_matcher, std::vector<uchar> &symbols, bool try_lz, bool extracting)
{
	lz_matches.resize(rec_count);

	// The sequence is shorter than the quality string by the number of
	// ambiguous positions counted for the record.
	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		records[rec].sequence_len = records[rec].quality_len - no_of_amb[rec];
	}

	if (try_lz)
	{
		ReadLzMatches(bit_stream, lz_matcher);
	}

	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		if (try_lz)
		{
			if (!extracting)
			{
				DecodeLzMatches(lz_matcher, rec);
			}
		}
		else
		{
			lz_matches[rec].length = 0;
		}

		// Fetched only after DecodeLzMatches has run for this record.
		const uint32 seq_len = records[rec].sequence_len;
		uchar *seq = records[rec].sequence;

		if (try_lz)
		{
			my_assert(lz_matches[rec].length <= seq_len);
		}

		// Everything past the matched prefix is stored explicitly.
		uint32 pos = lz_matches[rec].length;
		while (pos < seq_len)
		{
			uint32 code(0);
			bit_stream.Get2Bits(code);
			seq[pos] = symbols[code];
			++pos;
		}
		seq[seq_len] = '\0';

		if (try_lz && !extracting)
		{
			DecodeLzMatches_Insert(lz_matcher, rec);
		}
	}

	bit_stream.FlushInputWordBuffer();
}
Exemple #3
0
// ********************************************************************************************
// Reads run-length-encoded quality data into qua_stream / run_stream.
//
// Each stream entry is a (quality symbol, run value) pair. The quality symbol
// is decoded with the Huffman tree selected by the previously decoded quality
// index; the run value is decoded with the tree selected by the quality index
// just decoded. Reading stops once the runs cover the summed quality length
// of all records in the block.
void Block::ReadQualityRLE(BitStream &bit_stream, std::vector<uchar> &qualities, 
std::vector<HuffmanEncoder*> &Huffman_qua, int32 /*n_qualities*/, std::vector<HuffmanEncoder*> &Huffman_run, 
int32 /*max_run_len*/)
{
	// Total number of quality values in the block; used as the size of both buffers.
	int32 total_len = 0;
	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		total_len += records[rec].quality_len;
	}

	run_stream_len = total_len;
	qua_stream_len = run_stream_len;
	qua_stream = new uchar[total_len];
	run_stream = new uchar[total_len];

	uint32 context = 0;		// index of the previously decoded quality
	uint32 entry = 0;		// next free slot in qua_stream / run_stream
	uint32 covered = 0;		// number of quality values covered by the runs so far

	while (covered < qua_stream_len)
	{
		uint32 bits;

		// Quality symbol
		HuffmanEncoder *qua_tree = Huffman_qua[context];
		bit_stream.GetBits(bits, qua_tree->GetMinLen());
		int32 qua_idx = qua_tree->DecodeFast(bits);
		while (qua_idx < 0)
		{
			// Fast lookup did not resolve the code: feed one more bit at a time.
			bit_stream.GetBit(bits);
			qua_idx = qua_tree->Decode(bits);
		}

		my_assert(qua_idx < (int32)qualities.size());
		qua_stream[entry] = qualities[qua_idx];
		context = qua_idx;

		// Run value
		HuffmanEncoder *run_tree = Huffman_run[context];
		bit_stream.GetBits(bits, run_tree->GetMinLen());
		int32 run = run_tree->DecodeFast(bits);
		while (run < 0)
		{
			bit_stream.GetBit(bits);
			run = run_tree->Decode(bits);
		}

		run_stream[entry] = (uchar) run;

		// A stored run value of r accounts for r+1 quality values.
		covered += run+1;
		++entry;
	}

	bit_stream.FlushInputWordBuffer();
}
Exemple #4
0
// ********************************************************************************************
// Reads a serialized Huffman tree from bit_stream and loads it into `tree`.
//
// Stream layout consumed here: one size word, followed by that many bytes,
// which are handed to tree.LoadTree(buffer, size).
//
// bit_stream - input stream; FlushInputWordBuffer() is called before reading
// tree       - encoder that receives the deserialized tree
//
// The temporary buffer and its size are ordinary locals. They used to be
// function-local statics, which made the function non-reentrant and left a
// dangling static pointer behind after the buffer was freed.
void HuffmanEncoder::LoadTree(BitStream& bit_stream, HuffmanEncoder& tree)
{
	bit_stream.FlushInputWordBuffer();

	uint32 mem_size = 0;
	bit_stream.GetWord(mem_size);
	my_assert(mem_size > 0);

	uchar* mem_buf = new uchar[mem_size];
	bit_stream.GetBytes(mem_buf, mem_size);
	tree.LoadTree(mem_buf, mem_size);

	delete [] mem_buf;
}
Exemple #5
0
// ********************************************************************************************
// Reads the quality string of every record, one Huffman-coded symbol per
// position. The tree for position j is Huffman_qua[j+1].
//
// When trucated_hashes is set, each record starts with a flag bit; if the flag
// is set, the length of a trailing run of '#' follows and that tail is filled
// in here instead of being read symbol by symbol.
//
// Decoded quality values >= 128 are counted per record in no_of_amb.
void Block::ReadQualityPlain(BitStream &bit_stream, std::vector<uchar> &qualities, 
	std::vector<HuffmanEncoder*> &Huffman_qua, int32 /*n_qualities*/, bool trucated_hashes, bool /*uses_const_delta*/)
{
	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		no_of_amb[rec] = 0;

		uchar *quality = records[rec].quality;
		const uint32 quality_len = records[rec].quality_len;

		// Length of the truncated '#' tail (0 when nothing was truncated).
		uint32 trunc_len = 0;
		if (trucated_hashes)
		{
			uint32 has_tail(0);
			bit_stream.GetBit(has_tail);
			if (has_tail)
			{
				bit_stream.GetBits(trunc_len, BitStream::BitLength(quality_len));
			}
		}

		const uint32 coded_len = quality_len-trunc_len;

		// Explicitly coded part of the quality string.
		for (uint32 pos = 0; pos < coded_len; ++pos)
		{
			HuffmanEncoder *tree = Huffman_qua[pos+1];

			uint32 bits;
			bit_stream.GetBits(bits, tree->GetMinLen());
			int32 idx = tree->DecodeFast(bits);
			while (idx < 0)
			{
				// Fast lookup did not resolve the code: feed one more bit at a time.
				bit_stream.GetBit(bits);
				idx = tree->Decode(bits);
			}

			const uchar value = qualities[idx];
			quality[pos] = value;
			if (value >= 128)
			{
				++no_of_amb[rec];
			}
		}

		// Restore the truncated tail.
		for (uint32 pos = coded_len; pos < quality_len; ++pos)
		{
			quality[pos] = '#';
		}
	}
	bit_stream.FlushInputWordBuffer();
}
Exemple #6
0
// ********************************************************************************************
// Decodes the DNA sequences of all records when symbols are Huffman-coded
// with the single tree Huffman_sym.
//
// bit_stream  - input stream
// lz_matcher  - LZ matcher passed on to ReadLzMatches / DecodeLzMatches*
// symbols     - lookup table indexed by the decoded Huffman value
// Huffman_sym - tree used for every symbol
// try_lz      - when true, LZ match information is read first and only the
//               part of each sequence after the match is read from the stream
// extracting  - when true, DecodeLzMatches / DecodeLzMatches_Insert are skipped
void Block::ReadDNAHuf(BitStream &bit_stream, LzMatcher &lz_matcher, std::vector<uchar> &symbols, HuffmanEncoder *Huffman_sym, bool try_lz, bool extracting)
{
	lz_matches.resize(rec_count);

	// The sequence is shorter than the quality string by the number of
	// ambiguous positions counted for the record.
	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		records[rec].sequence_len = records[rec].quality_len - no_of_amb[rec];
	}

	if (try_lz)
	{
		ReadLzMatches(bit_stream, lz_matcher);
	}

	for (uint32 rec = 0; rec < rec_count; ++rec)
	{
		if (try_lz)
		{
			if (!extracting)
			{
				DecodeLzMatches(lz_matcher, rec);
			}
		}
		else
		{
			lz_matches[rec].length = 0;
		}

		// Fetched only after DecodeLzMatches has run for this record.
		const uint32 seq_len = records[rec].sequence_len;
		uchar *seq = records[rec].sequence;

		// Everything past the matched prefix is stored explicitly.
		uint32 pos = lz_matches[rec].length;
		while (pos < seq_len)
		{
			uint32 bits;
			bit_stream.GetBits(bits, Huffman_sym->GetMinLen());
			int32 idx = Huffman_sym->DecodeFast(bits);
			while (idx < 0)
			{
				// Fast lookup did not resolve the code: feed one more bit at a time.
				bit_stream.GetBit(bits);
				idx = Huffman_sym->Decode(bits);
			}

			seq[pos] = symbols[idx];
			++pos;
		}
		seq[seq_len] = '\0';

		if (try_lz && !extracting)
		{
			DecodeLzMatches_Insert(lz_matcher, rec);
		}
	}
	bit_stream.FlushInputWordBuffer();
}
Exemple #7
0
// ********************************************************************************************
// Rebuilds the title line and the '+' line of every record in this block.
//
// The stream is consumed in two stages:
//   1. For every non-constant field, one flag bit describing this block:
//      text fields    -> block_desc[block_no].is_block_constant
//      numeric fields -> is_block_delta_constant (delta-coded fields) or
//                        is_block_value_constant (all other numeric fields)
//   2. For every record, the fields are decoded in order and appended to the
//      record's title, each followed by the field's separator.
//
// bit_stream             - input stream
// fields                 - field descriptors; block_desc[block_no],
//                          block_str_start and block_str_len are written here
// n_fields               - number of field descriptors in use
// block_no               - index into each field's block_desc
// is_num_fields_constant - when false, a per-record field count is read from
//                          the stream; otherwise every record has n_fields fields
void Block::ReadTitle(BitStream &bit_stream, std::vector<Field> &fields, uint32 n_fields, int32 block_no, bool is_num_fields_constant)
{
	// prev_value[j] keeps the last numeric value decoded for field j.
	prev_value.resize(n_fields);
	// Width of the per-record field count (used only when the count varies).
	uint32 n_fields_bits = BitStream::BitLength(n_fields);

	// Stage 1: per-block flags of all non-constant fields.
	uint32 tmp = 0;
	for (uint32 i = 0; i < n_fields; ++i)
	{
		if (fields[i].is_constant)
			continue;

		prev_value[i] = 0;
		if (!fields[i].is_numeric)
		{
			bit_stream.GetBit(tmp);
			fields[i].block_desc[block_no].is_block_constant = tmp != 0;
		}
		else
		{
			// The same bit means "constant delta" or "constant value",
			// depending on how the field is coded.
			bit_stream.GetBit(tmp);
			if (fields[i].is_delta_coding)
			{
				fields[i].block_desc[block_no].is_block_delta_constant = tmp != 0;
			}
			else
			{
				fields[i].block_desc[block_no].is_block_value_constant = tmp != 0;
			}
		}		
	}

	// Stage 2: decode the title of every record.
	for (uint32 i = 0; i < rec_count; ++i)
	{
		FastqRecord& cur_rec = records[i];

		// Number of fields in this record.
		uint32 cn_fields = n_fields;
		if (!is_num_fields_constant)
		{
			bit_stream.GetBits(tmp, n_fields_bits);
			cn_fields = tmp;
		}

		for (uint32 j = 0; j < cn_fields; ++j)
		{
			Field &cur_field = fields[j];

			// Constant field: nothing is stored in the stream.
			if (cur_field.is_constant)
			{
				cur_rec.AppendTitle(cur_field.data, cur_field.len);
				cur_rec.AppendTitle(cur_field.sep);
				continue;
			}

			// Numeric field.
			if (cur_field.is_numeric)
			{
				uint32 num_val = 0;

				// Grow the title buffer when 10 or fewer bytes remain; presumably
				// this leaves room for the digits utils::to_string writes directly
				// into the buffer below -- TODO confirm.
				if (cur_rec.title_len + 10 >= cur_rec.title_size)
				{
					cur_rec.Extend(cur_rec.title, cur_rec.title_size);
				}
				if (i == 0)
				{
					// First record of the block: the value is stored in full,
					// relative to min_value.
					bit_stream.GetBits(num_val, cur_field.no_of_bits_per_value);
					num_val += cur_field.min_value;
					cur_rec.title_len += utils::to_string(cur_rec.title+cur_rec.title_len, num_val);
					prev_value[j] = num_val;
				}
				else
				{
					// The field is not constant within this block: an encoded
					// number is present in the stream.
					if ((cur_field.is_delta_coding && !cur_field.block_desc[block_no].is_block_delta_constant) ||
						(!cur_field.is_delta_coding && !cur_field.block_desc[block_no].is_block_value_constant))
					{
						if (cur_field.no_of_bits_per_num > 0)
						{
							// Huffman-coded when a global tree exists, raw bits otherwise.
							if (cur_field.Huffman_global)
							{
								uint32 bit;
								int32 h_tmp;

								bit_stream.GetBits(bit, cur_field.Huffman_global->GetMinLen());
								h_tmp = cur_field.Huffman_global->DecodeFast(bit);

								// Fast lookup did not resolve the code: feed one more bit at a time.
								while (h_tmp < 0)
								{
									bit_stream.GetBit(bit);
									h_tmp = cur_field.Huffman_global->Decode(bit);
								};

								num_val = h_tmp;
							}
							else
							{
								bit_stream.GetBits(num_val, cur_field.no_of_bits_per_num);
							}
						}
						else
						{
							num_val = 0;
						}
					}
					else
					{
						// Constant within the block: nothing is read from the stream.
						if (cur_field.is_delta_coding)
						{
							num_val = 0;
						}
						else
						{
							// min_value is added back below, so this reproduces prev_value[j].
							num_val = prev_value[j] - cur_field.min_value;
						}
					}

					// Turn the encoded number into the actual field value.
					if (cur_field.is_delta_coding)
					{
						num_val += prev_value[j] + cur_field.min_delta;
					}
					else
					{
						num_val += cur_field.min_value;
					}

					cur_rec.title_len += utils::to_string(cur_rec.title+cur_rec.title_len, num_val);
					prev_value[j] = num_val;
				}
				cur_rec.AppendTitle(cur_field.sep);

				continue;
			}

			// Text field that is constant within this block: copy the string that
			// was decoded for the first record (position saved further down).
			if (i > 0 && cur_field.block_desc[block_no].is_block_constant)
			{
				cur_rec.AppendTitle(records[0].title+cur_field.block_str_start, cur_field.block_str_len);
				cur_rec.AppendTitle(cur_field.sep);
				continue;
			}

			// Text field stored in the stream: its length first...
			uint32 field_len;
			if (!cur_field.is_len_constant)
			{
				bit_stream.GetBits(field_len, cur_field.no_of_bits_per_len);			
				field_len += cur_field.min_len;
			}
			else
			{
				field_len = cur_field.len;
			}
			
			// ...then its characters.
			for (uint32 k = 0; k < field_len; ++k)
			{
				// Positions flagged in Ham_mask take their character from the
				// field descriptor instead of the stream.
				if (k < cur_field.len && cur_field.Ham_mask[k])
				{
					cur_rec.AppendTitle(cur_field.data[k]);
				}
				else
				{
					uint32 bit;
					int32 h_tmp;
					// One tree per character position; every position at or beyond
					// MAX_FIELD_STAT_LEN uses the tree stored at that index.
					HuffmanEncoder *cur_huf = cur_field.Huffman_local[MIN(k, Superblock::MAX_FIELD_STAT_LEN)]; 

					bit_stream.GetBits(bit, cur_huf->GetMinLen());
					h_tmp = cur_huf->DecodeFast(bit);

					// Fast lookup did not resolve the code: feed one more bit at a time.
					while (h_tmp < 0)
					{
						bit_stream.GetBit(bit);
						h_tmp = cur_huf->Decode(bit);
					};

					tmp = h_tmp;
					cur_rec.AppendTitle((uchar) tmp);
				}
			}
			// First record of a block-constant text field: remember where the
			// string sits in this title so later records can copy it.
			if (i == 0 && cur_field.block_desc[block_no].is_block_constant)
			{
				cur_field.block_str_start = cur_rec.title_len - field_len;
				cur_field.block_str_len = field_len;
			}

			cur_rec.AppendTitle(cur_field.sep);
		}

		// NOTE(review): this presumably undoes the length increment of the last
		// AppendTitle(sep) above, i.e. the final separator is expected to be '\0' -- confirm.
		cur_rec.title_len--;			// do not count last '\0' symbols
		// Build the '+' line: either a bare "+" or "+" followed by the title
		// without its first character.
		cur_rec.plus[0] = '+';
		if (cur_rec.plus_len == 1)
		{
			cur_rec.plus[1] = '\0';
		}
		else
		{
			cur_rec.plus_len = cur_rec.title_len;
			cur_rec.ExtendTo(cur_rec.plus, cur_rec.plus_size, cur_rec.title_len+2);
			std::copy(cur_rec.title+1, cur_rec.title+cur_rec.title_len, cur_rec.plus+1);
		}
	}
	bit_stream.FlushInputWordBuffer();
}
Exemple #8
0
// ********************************************************************************************
// Decodes one complete block of records from bit_stream.
//
// Order of the data in the stream (and of the work done here):
//   1. optional reserved bytes (D_RESERVE_BYTES_PER_BLOCK builds only)
//   2. per-record '+' line flag, unless FLAG_PLUS_ONLY is set
//   3. per-record quality length when FLAG_VARIABLE_LENGTH is set
//   4. per-record line-break lists when FLAG_LINE_BREAKS is set
//   5. titles                    (ReadTitle)
//   6. qualities                 (ReadQualityRLE + MakeUnRLE, or ReadQualityPlain)
//   7. DNA sequences             (ReadDNAPlain or ReadDNAHuf)
//   8. optional CRC32 of the records (D_COMPUTE_RECORDS_CRC_PER_BLOCK builds only)
void Block::Read(BitStream &bit_stream, LzMatcher &lz_matcher, std::vector<Field> &fields, uint32 n_fields,
	uint32 fastq_flags, std::vector<uchar> &symbols, HuffmanEncoder *Huffman_sym,
	std::vector<uchar> &qualities, std::vector<HuffmanEncoder*> &Huffman_qua, 
	uint32 max_run_len, std::vector<HuffmanEncoder*> &Huffman_run, uint32 n_qualities, 
	uint32 _global_max_sequence_length, uint32 max_quality_length, uint32 block_no, 
	uint32 quality_stats_mode, bool extracting)
{
	global_max_sequence_length = _global_max_sequence_length;

#if (D_RESERVE_BYTES_PER_BLOCK)
	{
		// Every reserved byte is expected to hold INVALID_BYTE.
		uchar reserved[Block::RESERVED_BYTES];
		bit_stream.GetBytes(reserved, Block::RESERVED_BYTES);
		for (uint32 b = 0; b < Block::RESERVED_BYTES; ++b)
		{
			my_assert(reserved[b] == INVALID_BYTE);
		}
	}
#endif

	// Start from reset records and zeroed ambiguity counters.
	no_of_amb.resize(rec_count);
	for (uint32 r = 0; r < rec_count; ++r)
	{
		records[r].Reset();
		no_of_amb[r] = 0;
	}

	int32 quality_len_bits = BitStream::BitLength(max_quality_length);

	// '+' line flag: fixed to 1 with FLAG_PLUS_ONLY, otherwise one bit per record.
	const bool plus_only = (fastq_flags & FLAG_PLUS_ONLY) != 0;
	for (uint32 r = 0; r < rec_count; ++r)
	{
		if (plus_only)
		{
			records[r].plus_len = 1;
		}
		else
		{
			bit_stream.GetBit(records[r].plus_len);
		}
	}

	// Record lengths; the quality and sequence buffers get room for len+2 bytes.
	if ((fastq_flags & FLAG_VARIABLE_LENGTH) == 0)
	{
		for (uint32 r = 0; r < rec_count; ++r)
		{
			FastqRecord& rec = records[r];
			rec.quality_len = max_quality_length;
			rec.ExtendTo(rec.quality, rec.quality_size, max_quality_length+2);
			rec.sequence_len = global_max_sequence_length;
			rec.ExtendTo(rec.sequence, rec.sequence_size, global_max_sequence_length+2);
		}
	}
	else
	{
		uint32 len;
		for (uint32 r = 0; r < rec_count; ++r)
		{
			FastqRecord& rec = records[r];
			bit_stream.GetBits(len, quality_len_bits);
			rec.quality_len = len;
			rec.ExtendTo(rec.quality, rec.quality_size, len+2);
			rec.sequence_len = len;
			rec.ExtendTo(rec.sequence, rec.sequence_size, len+2);
		}
	}

	// Line breaks: for each record a zero-terminated list of values for the
	// sequence, then another one for the quality.
	if ((fastq_flags & FLAG_LINE_BREAKS) != 0)
	{
		uint32 break_bits;
		bit_stream.GetBits(break_bits, 5);
		uint32 value;

		for (uint32 r = 0; r < rec_count; ++r)
		{
			FastqRecord& rec = records[r];

			// Drop the previous list (deleting a null pointer is a no-op).
			delete rec.sequence_breaks;
			rec.sequence_breaks = NULL;

			for (bit_stream.GetBits(value, break_bits); value != 0; bit_stream.GetBits(value, break_bits))
			{
				// The list is allocated only when there is at least one entry.
				if (!rec.sequence_breaks)
				{
					rec.sequence_breaks = new std::vector<int>;
				}
				rec.sequence_breaks->push_back(value);
			}

			delete rec.quality_breaks;
			rec.quality_breaks = NULL;

			for (bit_stream.GetBits(value, break_bits); value != 0; bit_stream.GetBits(value, break_bits))
			{
				if (!rec.quality_breaks)
				{
					rec.quality_breaks = new std::vector<int>;
				}
				rec.quality_breaks->push_back(value);
			}
		}
	}
	bit_stream.FlushInputWordBuffer();

	// Titles
	const bool is_num_fields_constant = (fastq_flags & FLAG_CONST_NUM_FIELDS) != 0;
	ReadTitle(bit_stream, fields, n_fields, block_no, is_num_fields_constant);

	// Qualities
	if (quality_stats_mode != QUALITY_RLE)
	{
		const bool use_trunc_h = (quality_stats_mode == QUALITY_PLAIN_TRUNC);
		const uint32 const_delta_mask = FLAG_USE_DELTA | FLAG_DELTA_CONSTANT;
		const bool uses_const_delta = (fastq_flags & const_delta_mask) == const_delta_mask;
		ReadQualityPlain(bit_stream, qualities, Huffman_qua, n_qualities, use_trunc_h, uses_const_delta);
	}
	else
	{
		ReadQualityRLE(bit_stream, qualities, Huffman_qua, n_qualities, Huffman_run, max_run_len);
		MakeUnRLE();
	}

	// DNA sequences; try_lz is forwarded to whichever decoder is used.
	const bool try_lz = (fastq_flags & FLAG_TRY_LZ) != 0;
	const bool dna_plain = (fastq_flags & FLAG_DNA_PLAIN) != 0;
	if (dna_plain)
	{
		ReadDNAPlain(bit_stream, lz_matcher, symbols, try_lz, extracting);
	}
	else
	{
		ReadDNAHuf(bit_stream, lz_matcher, symbols, Huffman_sym, try_lz, extracting);
	}

#if (D_COMPUTE_RECORDS_CRC_PER_BLOCK)
	// Stored checksum of the records, compared with the recomputed one.
	uint32 stored_crc;
	bit_stream.GetWord(stored_crc);
	my_assert(stored_crc == ComputeRecordsCrc32());
#endif
}