Exemplo n.º 1
0
/**
 * Check the CIGAR string of an alignment against the read and the reference text.
 *
 * Unmapped alignments are accepted without further checks. For mapped alignments
 * the function verifies that
 *  - the CIGAR length is consistent (alignment.isCigarLengthConsistent()),
 *  - the reference id is valid for bamheader.chromosomes and for text,
 *  - every CIGAR operation stays inside the reference text and the read,
 *  - '=' operations cover equal bases and 'X' operations cover differing bases
 *    (compared case insensitively).
 *
 * The first violation found is reported on std::cerr and terminates the check.
 * CIGAR operation characters not handled in the switch below are ignored.
 *
 * @param alignment alignment to be checked
 * @param bamheader header providing the list of reference sequences
 * @param text reference sequence texts, indexed by reference id
 * @return true if the alignment is unmapped or no violation was found, false otherwise
 **/
bool checkCigarValid(
	::libmaus::bambam::BamAlignment const & alignment,
	::libmaus::bambam::BamHeader const & bamheader,
	::libmaus::autoarray::AutoArray < ::libmaus::autoarray::AutoArray<uint8_t>::unique_ptr_type > const & text
)
{
	if ( alignment.isUnmap() )
		return true;

	if ( ! alignment.isCigarLengthConsistent() )
	{
		std::cerr << "[E] inconsistent cigar " << alignment.getCigarString() << " for " << alignment.getName() << std::endl;
		return false;
	}

	int64_t const refid = alignment.getRefID();

	if ( refid < 0 || refid >= static_cast<int64_t>(bamheader.chromosomes.size()) )
	{
		std::cerr << "[E] reference id " << refid << " out of range for " << alignment.getName() << std::endl;
		return false;
	}

	// the header check above does not guarantee that a text was supplied for this id
	if ( refid >= static_cast<int64_t>(text.size()) || ! text[refid].get() )
	{
		std::cerr << "[E] no reference text available for reference id " << refid << " for " << alignment.getName() << std::endl;
		return false;
	}

	::libmaus::autoarray::AutoArray<uint8_t> const & ctext = *(text[refid]);
	// loop invariants, hoisted out of the per base checks
	int64_t const reflen = static_cast<int64_t>(ctext.size());
	int64_t const seqlen = static_cast<int64_t>(alignment.getLseq());
	std::string const read = alignment.getRead();
	// current position on the reference and in the read
	int64_t refpos = alignment.getPos();
	int64_t seqpos = 0;
	bool alok = true;

	for ( uint64_t i = 0; alok && i < alignment.getNCigar(); ++i )
	{
		char const cop = alignment.getCigarFieldOpAsChar(i);
		int64_t const clen = alignment.getCigarFieldLength(i);

		switch ( cop )
		{
			// match/mismatch, increment both
			case '=':
			case 'X':
			case 'M':
			{
				// checked base by base so that the first violation along the
				// alignment is the one which is reported
				for ( int64_t j = 0; alok && j < clen; ++j, ++refpos, ++seqpos )
				{
					if ( refpos < 0 || refpos >= reflen )
					{
						std::cerr << "[E] " << cop << " operation outside of chromosome coordinate range " << " for " << alignment.getName() << std::endl;
						alok = false;
					}
					else if ( seqpos >= seqlen )
					{
						std::cerr << "[E] " << cop << " operation outside of sequence coordinate range " << " for " << alignment.getName() << std::endl;
						alok = false;
					}
					else if ( cop != 'M' )
					{
						// toupper is only defined for values representable as unsigned char
						bool const same =
							toupper(static_cast<unsigned char>(ctext[refpos]))
							==
							toupper(static_cast<unsigned char>(read[seqpos]));

						if ( cop == '=' && ! same )
						{
							std::cerr << "[E] " << cop << " operation but mismatch between reference and query." << std::endl;
							alok = false;
						}
						else if ( cop == 'X' && same )
						{
							std::cerr << "[E] " << cop << " operation but match between reference and query." << std::endl;
							alok = false;
						}
					}
				}
				break;
			}
			// insertion or soft clipping, increment seq
			case 'I':
			case 'S':
			{
				if ( clen > 0 )
				{
					// a single range check replaces the per base loop
					if ( seqpos + clen > seqlen )
					{
						std::cerr << "[E] " << cop << " operation outside of sequence coordinate range " << " for " << alignment.getName() << std::endl;
						alok = false;
					}
					else
					{
						seqpos += clen;
					}
				}
				break;
			}
			// deletion or skipped region on reference, increment ref
			case 'D':
			case 'N':
			{
				if ( clen > 0 )
				{
					// a single range check replaces the per base loop
					if ( refpos < 0 || refpos + clen > reflen )
					{
						std::cerr << "[E] " << cop << " operation outside of reference coordinate range " << " for " << alignment.getName() << std::endl;
						alok = false;
					}
					else
					{
						refpos += clen;
					}
				}
				break;
			}
			// padding and hard clipping consume neither read nor reference, do nothing
			case 'P':
			case 'H':
			{
				break;
			}
		}
	}

	return alok;
}