Ejemplo n.º 1
0
// Converts a space separated quality string into a compressed quality string.
//
//   qualities     - space separated quality values; trailing spaces are removed
//                   from this string in place, and its buffer is handed to Chomp
//   compQualities - receives one byte per column produced by SplitString
//
// An empty or all-space input produces an empty compQualities.
//
// NOTE: this function has a lot of overhead (it splits into a vector of
//       strings), but an earlier leaner implementation failed some of the
//       unit tests.
void CRegexUtilities::ConvertQualities(string& qualities, CMosaikString& compQualities) {

	// strip the trailing spaces. find_last_not_of returns npos when the string
	// is empty or consists only of spaces; npos + 1 wraps to 0, so the whole
	// string is erased in that case instead of walking off the front.
	qualities.erase(qualities.find_last_not_of(' ') + 1);

	// nothing left to convert
	if(qualities.empty()) {
		compQualities.SetLength(0);
		return;
	}

	char* pQualities = const_cast<char*>(qualities.c_str());
	Chomp(pQualities);

	// split the string into one column per quality value
	vector<string> columns;
	back_insert_iterator<vector<string> > backiter(columns);
	SplitString(backiter, " ", pQualities);
	const unsigned int numQualities = (unsigned int)columns.size();

	compQualities.Reserve(numQualities);
	compQualities.SetLength(numQualities);

	unsigned char* pCompQualities = (unsigned char*)compQualities.Data();

	// an empty column still occupies a slot in the output; its byte is left
	// as it was after Reserve/SetLength
	for(vector<string>::const_iterator sIter = columns.begin(); sIter != columns.end(); ++sIter, ++pCompQualities) {
		if(sIter->empty()) continue;
		*pCompQualities = GetUnsignedChar(const_cast<char*>(sIter->c_str()));
	}
}
Ejemplo n.º 2
0
// Converts the supplied read from pseudo-colorspace to colorspace, in place.
// The function is used by the unaligned-read writer.
//
// Mapping: A -> 0, C -> 1, G -> 2, T -> 3. 'X' and 'N' are left unchanged.
// Any other character prints an error and terminates the program.
void CColorspaceUtilities::ConvertReadPseudoColorspaceToColorspace(CMosaikString& s) {
    char* pBases = s.Data();
    for(unsigned int i = 0; i < s.Length(); ++i, ++pBases) {
        switch(*pBases) {
        case 'A':
            *pBases = '0';
            break;
        case 'C':
            *pBases = '1';
            break;
        case 'G':
            *pBases = '2';
            break;
        case 'T':
            *pBases = '3';
            break;
        case 'X':
        case 'N':
            break;
        default:
            // pBases already points at the current base, so report *pBases;
            // indexing it with i again would read position 2*i
            printf("ERROR: Unrecognized nucleotide (%c) when converting read to colorspace.\n", *pBases);
            exit(1);
        }
    }
}
Ejemplo n.º 3
0
// Converts the supplied read from colorspace to pseudo-colorspace, in place.
//
// Mapping: 0 -> A, 1 -> C, 2 -> G, 3 -> T, '-' -> N, '.' -> A.
// 'X' is left unchanged. Any other character prints an error and terminates
// the program.
void CColorspaceUtilities::ConvertReadColorspaceToPseudoColorspace(CMosaikString& s) {
    char* pBases = s.Data();
    for(unsigned int i = 0; i < s.Length(); ++i, ++pBases) {
        switch(*pBases) {
        case '0':
            *pBases = 'A';
            break;
        case '1':
            *pBases = 'C';
            break;
        case '2':
            *pBases = 'G';
            break;
        case '3':
            *pBases = 'T';
            break;
        case 'X':
            break;
        case '-':
            *pBases = 'N';
            break;
        case '.':
            // here we pick an arbitrary colorspace transition; this has at
            // least a 25 % chance of being correct, as opposed to specifying
            // an 'N'.
            *pBases = 'A';
            break;
        default:
            // pBases already points at the current base, so report *pBases;
            // indexing it with i again would read position 2*i
            printf("ERROR: Unrecognized nucleotide (%c) when converting read to pseudo-colorspace.\n", *pBases);
            exit(1);
        }
    }
}
Ejemplo n.º 4
0
// Converts the supplied read from basespace to pseudo-colorspace, in place.
//
// Every base is first passed through GetSimplifiedBase. Each adjacent pair of
// bases is then looked up in mCSMap and the result is stored at the position
// of the first base of the pair, so the converted read is one character
// shorter than the input. An unknown pair prints an error and terminates the
// program. An empty read is left untouched.
void CColorspaceUtilities::ConvertReadBasespaceToPseudoColorspace(CMosaikString& s) {

    // nothing to convert; also avoids dereferencing the buffer of an empty read
    const unsigned int numBases = s.Length();
    if(numBases == 0) return;

    char* pPrev   = s.Data();
    char* pString = pPrev + 1;

    // simplify various ambiguity codes
    *pPrev = GetSimplifiedBase(*pPrev);

    CS_MAP_t::const_iterator csIter;
    for(unsigned int i = 1; i < numBases; ++i, ++pString, ++pPrev) {

        // simplify various ambiguity codes
        *pString = GetSimplifiedBase(*pString);

        csIter = mCSMap.find(PACK_SHORT(*pPrev, *pString));
        if(csIter == mCSMap.end()) {
            printf("ERROR: Unknown combination found when converting to colorspace: [%c] & [%c]\n", *pPrev, *pString);
            exit(1);
        }

        // the transition replaces the first base of the pair; the second base
        // is still intact for the next iteration
        *pPrev = csIter->second;
    }

    // n bases yield n - 1 transitions: drop the now unused last character
    s.TrimEnd(1);
}
Ejemplo n.º 5
0
// Encodes the supplied query sequence into 4-bit notation.
//
//   query        - the bases to encode; only {=, A, C, G, T, N} are accepted
//   encodedQuery - resized to ceil(length / 2) bytes; the first base of each
//                  pair goes into the high nibble, the second into the low one
//
// Any other base prints an error and terminates the program.
void CBamWriter::EncodeQuerySequence(const CMosaikString& query, string& encodedQuery) {

	// prepare the encoded query string: two bases per byte, rounded up
	const unsigned int queryLen        = query.Length();
	const unsigned int encodedQueryLen = (queryLen / 2) + (queryLen % 2);
	encodedQuery.resize(encodedQueryLen);

	const char* pQuery = (const char*)query.CData();

	// the loop is bounded by the length used to size encodedQuery, so it can
	// never write past the output buffer; it still stops at a terminator
	for(unsigned int i = 0; (i < queryLen) && (pQuery[i] != 0); ++i) {

		unsigned char nucleotideCode = 0;

		switch(pQuery[i]) {
			case '=':
				nucleotideCode = 0;
				break;
			case 'A':
				nucleotideCode = 1;
				break;
			case 'C':
				nucleotideCode = 2;
				break;
			case 'G':
				nucleotideCode = 4;
				break;
			case 'T':
				nucleotideCode = 8;
				break;
			case 'N':
				nucleotideCode = 15;
				break;
			default:
				printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", pQuery[i]);
				exit(1);
		}

		// pack the nucleotide code: even positions start a new byte (which
		// also clears its low nibble), odd positions fill the low nibble
		if((i % 2) == 0) encodedQuery[i / 2] = (char)(nucleotideCode << 4);
		else encodedQuery[i / 2] |= (char)nucleotideCode;
	}
}
Ejemplo n.º 6
0
// Extracts the genome assembly ID from a FASTA/FASTQ header.
//
//   line             - the header line to search
//   genomeAssemblyID - receives the ID, or is set to length 0 when the header
//                      carries no genome assembly tag
void CRegexUtilities::ExtractGenomeAssemblyID(const string& line, CMosaikString& genomeAssemblyID) {
#ifdef WIN32

	// the first capture group of the regex holds the ID
	cmatch match;
	if(regex_search(line.c_str(), match, mGenomeAssemblyIDRegex)) {
		genomeAssemblyID = match[1].str().c_str();
	} else {
		genomeAssemblyID.SetLength(0);
	}

#else

	// TODO: replace this with the TR1 regex above when it finally works in gcc. It doesn't work in gcc 4.3.3

	// locate the opening "GA(" tag
	const string gaTag = "GA(";
	const string::size_type tagPos = line.find(gaTag.c_str());

	if(tagPos == string::npos) {
		genomeAssemblyID.SetLength(0);
		return;
	}

	// the ID runs from just after the tag up to the closing parenthesis, or
	// to the end of the line when there is no closing parenthesis
	const unsigned int idBegin = tagPos + gaTag.size();
	const unsigned int lineLen = line.size();

	const string::size_type closePos = line.find(')', idBegin);
	const unsigned int idEnd = (closePos == string::npos ? lineLen : (unsigned int)closePos);

	// an empty ID is treated as a parse failure
	if(idBegin == idEnd) {
		cout << "ERROR: could not parse genome assembly ID from FASTA header." << endl;
		cout << "       " << line << endl;
		exit(1);
	}

	genomeAssemblyID = line.substr(idBegin, idEnd - idBegin).c_str();

#endif
}
Ejemplo n.º 7
0
	// Loads the read header found at mBufferPtr and advances mBufferPtr past it.
	//
	// Layout as consumed here: a one-byte name length, the name characters,
	// the read group code (SIZEOF_INT bytes), a one-byte read status, then for
	// each mate flagged in the status three SIZEOF_INT-byte counters
	// (alignments, original alignments, hashes).
	//
	// The counters of a mate whose flag is not set in readStatus are left
	// untouched.
	void CAlignmentReader::LoadReadHeader(
		CMosaikString& readName, 
		unsigned int&  readGroupCode, 
		unsigned char& readStatus, 
		int&  numMate1Alignments, 
		int&  numMate2Alignments,
		int&  numMate1OriginalAlignments,
		int&  numMate2OriginalAlignments,
		int&  numMate1Hashes,
		int&  numMate2Hashes) {

		// read name: length byte followed by the characters
		const unsigned char nameLen = (unsigned char)*mBufferPtr;
		++mBufferPtr;

		readName.Copy((const char*)mBufferPtr, nameLen);
		mBufferPtr += nameLen;

		// read group code
		memcpy((char*)&readGroupCode, mBufferPtr, SIZEOF_INT);
		mBufferPtr += SIZEOF_INT;

		// read status
		readStatus = (unsigned char)*mBufferPtr;
		++mBufferPtr;

		// mate 1 counters, in the order they are stored
		if((readStatus & RF_HAVE_MATE1) != 0) {
			int* const mate1Fields[3] = { &numMate1Alignments, &numMate1OriginalAlignments, &numMate1Hashes };
			for(unsigned int i = 0; i < 3; ++i) {
				memcpy((char*)mate1Fields[i], mBufferPtr, SIZEOF_INT);
				mBufferPtr += SIZEOF_INT;
			}
		}

		// mate 2 counters, in the order they are stored
		if((readStatus & RF_HAVE_MATE2) != 0) {
			int* const mate2Fields[3] = { &numMate2Alignments, &numMate2OriginalAlignments, &numMate2Hashes };
			for(unsigned int i = 0; i < 3; ++i) {
				memcpy((char*)mate2Fields[i], mBufferPtr, SIZEOF_INT);
				mBufferPtr += SIZEOF_INT;
			}
		}
	}
Ejemplo n.º 8
0
// saves the alignment to the alignment archive
void CBamWriter::SaveAlignment(
    const Alignment& al, 
    const char* zaString, 
    const bool& noCigarMdNm, 
    const bool& notShowRnamePos, 
    const bool& isSolid, 
    const bool& processedBamData,
    const bool& report_zn) {

	// =================
	// set the BAM flags
	// =================

	// define our flags
	unsigned int flag                = 0;
	int insertSize                   = 0;
	if(al.IsPairedEnd) {
		flag |= BAM_SEQUENCED_AS_PAIRS;
		// first or second mate?
		flag |= (al.IsFirstMate ? BAM_QUERY_FIRST_MATE : BAM_QUERY_SECOND_MATE);
		if(al.IsResolvedAsPair) {
			if ( al.IsResolvedAsProperPair ) 
				flag |= BAM_PROPER_PAIR;
			if(al.IsMateMapped && al.IsMateReverseStrand) flag |= BAM_MATE_REVERSE_COMPLEMENT;
			insertSize = al.FragmentLength;
		}
		if ( !al.IsMapped )
			flag |= BAM_QUERY_UNMAPPED;
		if ( !al.IsMateMapped )
			flag |= BAM_MATE_UNMAPPED;
	} else {
		if ( !al.IsMapped )
			flag |= BAM_QUERY_UNMAPPED;
	}

	if(al.IsMapped && al.IsReverseStrand) flag |= BAM_QUERY_REVERSE_COMPLEMENT;

	// ==========================
	// construct the cigar string
	// ==========================

	string packedCigar;
	unsigned short numCigarOperations = 0;
	if ( !noCigarMdNm ) {
		if ( !processedBamData )
			CreatePackedCigar( al, packedCigar, numCigarOperations, isSolid );
		else {
			packedCigar = al.PackedCigar;
			numCigarOperations = al.NumCigarOperation;
		}
	}
	else
		packedCigar = "\0";
	const unsigned int packedCigarLen = !noCigarMdNm ? packedCigar.size() : 0;

	// ===================
	// write the alignment
	// ===================

	// remove the gaps from the read
	CMosaikString query;
	if ( !processedBamData ) {
		query = al.Query.CData();
		query.Remove('-');
	}

	// initialize
	const unsigned int nameLen  = al.Name.Length() + 1;
	const unsigned int queryLen = processedBamData ? al.QueryLength : query.Length();

	// sanity check
	//al.BaseQualities.CheckQuality();
	//if ( queryLen != alIter->BaseQualities.Length() ) {
	//        printf("ERROR: The lengths of bases(%u) and qualities(%u) of Read (%s) didn't match.\n", queryLen, alIter->BaseQualities.Length(), readName.CData());
        //        exit(1);
        //}
	
	// encode the query
	string encodedQuery;
	if (!processedBamData && (query.Length() != 0))
		EncodeQuerySequence(query, encodedQuery);
	else
		encodedQuery = al.EncodedQuery;
	const unsigned int encodedQueryLen = encodedQuery.size();

	// create our read group tag
	string readGroupTag;
	const unsigned int readGroupTagLen = 3 + al.ReadGroup.size() + 1;
	readGroupTag.resize(readGroupTagLen);
	char* pReadGroupTag = (char*)readGroupTag.data();
	sprintf(pReadGroupTag, "RGZ%s", al.ReadGroup.c_str());

	// create our mismatch tag
	string mismatchTag;
	unsigned int numMismatches = 0;
	unsigned int nmTagLen = 0;
	if ( !noCigarMdNm ) {
		mismatchTag = "NMi";
		mismatchTag.resize(MISMATCH_TAG_LEN);
		nmTagLen = MISMATCH_TAG_LEN;
		numMismatches = al.NumMismatches;
		memcpy((char*)mismatchTag.data() + 3, (char*)&numMismatches, SIZEOF_INT);
	}

	// create our MD tag
	string mdTag;
	char* pMd = 0;
	unsigned int mdTagLen = 0;
	char* pMdTag;
	if ( !noCigarMdNm ) {
		if ( !processedBamData ) 
			pMd = (char*) mdTager.GetMdTag( al.Reference.CData(), al.Query.CData(), al.Reference.Length() );
		else
			pMd = (char*) al.MdString.c_str();

		mdTagLen = 3 + strlen( pMd ) + 1;
		mdTag.resize( mdTagLen );
		pMdTag = (char*)mdTag.data();
		sprintf(pMdTag, "MDZ%s", pMd);
		#ifdef VERBOSE_DEBUG
		fprintf(stderr, "=== MD ===\n");
		fprintf(stderr, "%s\n%s\n%s\n", al.Reference.CData(), al.Query.CData(), mdTag.c_str());
		#endif
	}

	// create our za tag
	unsigned int zaTagLen = 0;
	string zaTag;
	char* pZaTag;
	if ((zaString != 0) && (zaString != (char)0)) {
		zaTagLen = 3 + strlen( zaString ) + 1;
		zaTag.resize( zaTagLen );
		pZaTag = (char*)zaTag.data();
		sprintf(pZaTag, "ZAZ%s",zaString);
	}

	// create our zn tag
	unsigned int znTagLen = 0;
	string znTag;
	if (report_zn){
	  ostringstream zn_buffer;
	  zn_buffer << "ZNZ" 
	            << al.SwScore << ";"
		    << al.NextSwScore << ";"
		    << al.NumLongestMatchs << ";"
		    << al.Entropy << ";"
		    << al.NumMapped << ";"
		    << al.NumHash;
	  znTag = zn_buffer.str();
	  znTagLen = znTag.size() + 1;
	  //cerr << znTag.data() << "\t" << znTag << "\t" << znTagLen << endl;
	}
	
	// create our cs tag
	unsigned int csTagLen = 0;
	string csTag;
	char* pCsTag;
	if (isSolid) {
		csTagLen = 3 + strlen ( al.CsQuery.c_str() ) + 1;
		csTag.resize( csTagLen );
		pCsTag = (char*)csTag.data();
		sprintf( pCsTag, "CSZ%s", al.CsQuery.c_str() );
	}

	// create our cq tag
	unsigned int cqTagLen = 0;
	string cqTag;
	char* pCqTag;
	if (isSolid) {
		cqTagLen = 3 + strlen ( al.CsBaseQualities.c_str() ) + 1;
		cqTag.resize( cqTagLen );
		pCqTag = (char*)cqTag.data();
		sprintf( pCqTag, "CQZ%s", al.CsBaseQualities.c_str() );
	}

	// retrieve our bin
	unsigned int bin = CalculateMinimumBin(al.ReferenceBegin, al.ReferenceEnd);

	// assign the BAM core data
	unsigned int buffer[8] = {0};
	unsigned int reference_index, reference_pos;
	if (!al.IsMapped) {
	  if (al.IsMateMapped) reference_index = al.MateReferenceIndex;
	  else reference_index = 0xffffffff;
	} else {
	  reference_index = al.ReferenceIndex;
	}

	if (!al.IsMapped) {
	  if (al.IsMateMapped) reference_pos = al.MateReferenceBegin;
	  else reference_pos = 0xffffffff;
	} else {
	  reference_pos = al.ReferenceBegin;
	}

	buffer[0] = (notShowRnamePos) ? 0xffffffff : reference_index;
	buffer[1] = (notShowRnamePos) ? 0xffffffff : reference_pos;
	buffer[2] = (bin << 16) | (al.RecalibratedQuality << 8) | nameLen;
	buffer[3] = (flag << 16) | numCigarOperations;
	buffer[4] = queryLen;

	if(al.IsPairedEnd) {
		if (notShowRnamePos) {
		  buffer[5] = 0xffffffff;
		  buffer[6] = 0xffffffff;
		} else {
		  if (!al.IsMateMapped) {//unmapped mate
		    buffer[5] = reference_index;
		    buffer[6] = reference_pos;
		  } else {
		    buffer[5] = al.MateReferenceIndex;
		    buffer[6] = al.MateReferenceBegin;
		  }
		}
		buffer[7] = insertSize;
	} else {
		buffer[5] = 0xffffffff;
		buffer[6] = 0xffffffff;
		buffer[7] = 0;
	}

	// write the block size
	const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + readGroupTagLen + nmTagLen + mdTagLen + zaTagLen + znTagLen + csTagLen + cqTagLen;
	const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
	BgzfWrite((char*)&blockSize, SIZEOF_INT);

	// write the BAM core
	BgzfWrite((char*)&buffer, BAM_CORE_SIZE);

	// write the query name
	BgzfWrite(al.Name.CData(), nameLen);

	// write the packed cigar
	BgzfWrite(packedCigar.data(), packedCigarLen);

	// write the encoded query sequence
	BgzfWrite(encodedQuery.data(), encodedQueryLen);

	// write the base qualities
	BgzfWrite(al.BaseQualities.CData(), queryLen);

	// write the read group tag
	BgzfWrite(readGroupTag.data(), readGroupTagLen);

	// write the mismatch tag
	if ( !noCigarMdNm )
		BgzfWrite(mismatchTag.data(), MISMATCH_TAG_LEN);

	// write the MD tag
	if ( !noCigarMdNm )
	BgzfWrite(mdTag.data(), mdTagLen);

	// write the ZA tag
	if ( zaString != 0 && (zaString != (char)0))
		BgzfWrite(zaTag.data(), zaTagLen);
	
	// write the ZN tag
	if (report_zn && (znTagLen > 0))
		BgzfWrite(znTag.data(), znTagLen);

	// write the cs tag
	if (isSolid)
		BgzfWrite(csTag.data(), csTagLen);

	// write the cq tag
	if (isSolid)
		BgzfWrite(cqTag.data(), cqTagLen);
}
Ejemplo n.º 9
0
// Converts the supplied alignment from colorspace to basespace.
//
// The colorspace reference and query are copied into the scratch buffers of
// mCsAl, decoded with ConvertCs2Bs starting from the basespace reference base
// at the alignment start, and then scanned to record identical regions, dash
// (gap) regions and mismatch positions. Unless the last identical region
// starts at position 0, sequencing errors and dash regions are adjusted and
// the identical regions of the query are decoded again.
//
// On return al.Reference and al.Query hold the basespace sequences,
// al.ReferenceEnd / al.QueryEnd are incremented by one, al.QueryLength is
// recomputed, al.BaseQualities holds one more quality than before (each the
// minimum of the two transition qualities overlapping the base) and
// al.NumMismatches counts the differing basespace positions.
//
// The program terminates if the scratch buffers cannot be allocated or if the
// first reference base is N or X.
void CColorspaceUtilities::ConvertAlignmentToBasespace(Alignment& al) {

    const unsigned int pairwiseLen = al.Reference.Length();

    // re-allocate the scratch buffers if the reserved space is insufficient
    if(  pairwiseLen > mCsAl.csAlignmentLength ) {

        // delete [] on a null pointer is a no-op, so no null checks are needed
        delete [] mCsAl.csReference;
        delete [] mCsAl.csQuery;
        delete [] mCsAl.bsReference;
        delete [] mCsAl.bsQuery;
        delete [] mCsAl.type;

        delete [] mCsAl.dashReference;
        delete [] mCsAl.dashQuery;

        delete [] mCsAl.mismatch;

        delete [] mCsAl.identical;

        try {
            mCsAl.csAlignmentLength = pairwiseLen;
            mCsAl.csReference = new char [ pairwiseLen ];
            mCsAl.csQuery     = new char [ pairwiseLen ];
            // basespace is one base longer than colorspace, plus a terminator
            mCsAl.bsReference = new char [ pairwiseLen + 2 ];
            mCsAl.bsQuery     = new char [ pairwiseLen + 2 ];
            mCsAl.type        = new unsigned short [pairwiseLen];

            mCsAl.dashReference = new RegionT [ pairwiseLen ];
            mCsAl.dashQuery     = new RegionT [ pairwiseLen ];

            mCsAl.mismatch      = new unsigned int [ pairwiseLen ];

            mCsAl.identical     = new RegionT [ pairwiseLen ];

        }
        catch( const bad_alloc& ) {
            cout << "ERROR: Unable to allocate enough memory for converting colorspace ." << endl;
            exit(1);
        }

    }

    // initialize the counters
    mCsAl.nDashReference = 0;
    mCsAl.nDashQuery     = 0;
    mCsAl.nMismatch      = 0;
    mCsAl.nIdentical     = 0;


    // convert cs to bs
    // the first BS base is taken from the basespace reference
    char bsBase = mpBsRefSeqs[al.ReferenceIndex][al.ReferenceBegin];
    if ( bsBase == 'N' || bsBase == 'X' ) {
        cout << "ERROR: The first base of the colorspace-basespace converter is N or X." << endl;
        exit(1);
    }
    // copy CS alignments
    memcpy ( mCsAl.csReference, al.Reference.Data(), pairwiseLen );
    memcpy ( mCsAl.csQuery,     al.Query.Data(),     pairwiseLen );


    mCsAl.bsReference[0] = bsBase;
    mCsAl.bsQuery[0]     = bsBase;
    ConvertCs2Bs(mCsAl.csReference, mCsAl.bsReference, 0, pairwiseLen-1, bsBase);
    ConvertCs2Bs(mCsAl.csQuery, mCsAl.bsQuery, 0, pairwiseLen-1, bsBase);


    // search the dash regions & mismatches
    // NOTE(review): a single position can append more than one entry to
    //               mCsAl.mismatch (e.g. a mismatch that also ends a dash
    //               region) while the array holds pairwiseLen entries;
    //               confirm that this cannot overflow.
    bool continuedDReference = false;
    bool continuedDQuery     = false;
    unsigned int nMatch      = 0;
    for ( unsigned int i = 0; i < pairwiseLen; i++ ) {

        // determine identical region: a run of matches is recorded when it is
        // broken after at least mNAllowedMismatch matching positions
        const bool isEndIdentity = ( mCsAl.csQuery[i] != mCsAl.csReference[i] ) && ( nMatch >= mNAllowedMismatch );
        if ( isEndIdentity ) {
            mCsAl.identical[ mCsAl.nIdentical ].Begin  = i - nMatch;
            mCsAl.identical[ mCsAl.nIdentical ].Length = nMatch;
            mCsAl.nIdentical++;
        }

        if ( mCsAl.csQuery[i] == mCsAl.csReference[i])
            nMatch++;
        else
            nMatch = 0;

        // determine mismatches
        const bool isN = (mCsAl.csReference[i] != 'A') && (mCsAl.csReference[i] != 'C') && (mCsAl.csReference[i] != 'G') && (mCsAl.csReference[i] != 'T');
        const bool isMismatch = (mCsAl.csReference[i] != '-') && (mCsAl.csQuery[i] != '-') && !isN && (mCsAl.csReference[i] != mCsAl.csQuery[i]);
        if ( isMismatch ) {
            // set the position
            mCsAl.mismatch[mCsAl.nMismatch] = i;
            mCsAl.nMismatch++;
            // set the mismatch flag
            mCsAl.type[i] = 0;
        }


        // for reference conversion
        if ( mCsAl.csReference[i] != '-' ) {
            // end the current dash region
            if ( continuedDReference ) {
                mCsAl.nDashReference++;
                // the current position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 1;
            }
            continuedDReference = false;
        }
        else {
            // start a dash region
            if ( !continuedDReference ) {
                mCsAl.dashReference[mCsAl.nDashReference].Begin  = i;
                mCsAl.dashReference[mCsAl.nDashReference].Length = 0;
                // the preceding position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 3;
            }
            mCsAl.dashReference[mCsAl.nDashReference].Length++;
            continuedDReference = true;
        }

        // for query conversion
        if ( mCsAl.csQuery[i] != '-' ) {
            // end the current dash region
            if ( continuedDQuery ) {
                mCsAl.nDashQuery++;
                // the current position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 2;
            }
            continuedDQuery = false;
        }
        else {
            // start a dash region
            if ( !continuedDQuery ) {
                mCsAl.dashQuery[mCsAl.nDashQuery].Begin  = i;
                mCsAl.dashQuery[mCsAl.nDashQuery].Length = 0;
                // the preceding position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 4;
            }
            mCsAl.dashQuery[mCsAl.nDashQuery].Length++;
            continuedDQuery = true;
        }


    }

    // record the run of matches that reaches the end of the alignment
    if ( nMatch > 0 ) {
        mCsAl.identical[ mCsAl.nIdentical ].Begin  = pairwiseLen - nMatch;
        mCsAl.identical[ mCsAl.nIdentical ].Length = nMatch;
        mCsAl.nIdentical++;
    }


    // the corrections below are skipped only when the last identical region
    // starts at position 0. When no identical region was recorded at all there
    // is no last region to inspect, so the corrections are applied.
    const bool lastRegionStartsAtZero = ( mCsAl.nIdentical > 0 ) && ( mCsAl.identical[mCsAl.nIdentical - 1].Begin == 0 );
    if ( !lastRegionStartsAtZero ) {
        // find sequencing errors
        if ( mCsAl.nMismatch > 0 )
            FindSequencingError(pairwiseLen);

        if ( mCsAl.nDashReference > 0 ) {
            for ( unsigned int i = 0; i < mCsAl.nDashReference; i++ ) {
                // if the bases just after the dash region disagree, mask the
                // query bases covering the region with 'N'
                unsigned int curPosition = mCsAl.dashReference[i].Begin + mCsAl.dashReference[i].Length;
                if ( mCsAl.bsReference[ curPosition ] != mCsAl.bsQuery[ curPosition ] ) {
                    curPosition = mCsAl.dashReference[i].Begin;
                    for ( unsigned int j = 0; j < mCsAl.dashReference[i].Length; j++ )
                        mCsAl.bsQuery[ curPosition + j + 1 ] = 'N';
                }
            }

            AdjustDash(mCsAl.csReference, mCsAl.csQuery, mCsAl.dashReference, mCsAl.nDashReference, mCsAl.bsReference);
        }

        if ( mCsAl.nDashQuery > 0 )
            AdjustDash(mCsAl.csQuery, mCsAl.csReference, mCsAl.dashQuery, mCsAl.nDashQuery, mCsAl.bsQuery);


        // deal with the identical regions
        for ( unsigned int i = 0; i < mCsAl.nIdentical; i++ ) {
            unsigned int csEnd = mCsAl.identical[i].Begin + mCsAl.identical[i].Length - 1;
            unsigned int curPosition = mCsAl.identical[i].Begin;
            bsBase = mCsAl.bsReference[ curPosition ];
            // walk backwards to the nearest usable reference base, stopping
            // at position 0 so that the index never wraps around
            while ( ( ( bsBase == '-' ) || ( bsBase == 'N' ) ) && ( curPosition > 0 ) ) {
                curPosition--;
                bsBase = mCsAl.bsReference[ curPosition ];
            }

            const bool isGoodBsBase = ( bsBase != '-' ) && ( bsBase != 'N' );

            if ( isGoodBsBase )
                ConvertCs2Bs(mCsAl.csQuery, mCsAl.bsQuery, mCsAl.identical[i].Begin, csEnd, bsBase);
        }


    }

    // terminate the sequences
    mCsAl.bsReference[ pairwiseLen + 1 ] = 0;
    mCsAl.bsQuery[ pairwiseLen + 1 ]     = 0;

    al.Reference = mCsAl.bsReference;
    al.Query     = mCsAl.bsQuery;

    ++al.ReferenceEnd;
    ++al.QueryEnd;
    al.QueryLength = al.QueryEnd - al.QueryBegin + 1;


    // ------------------------------------------------------------------------------------------
    // convert the colorspace transition qualities to base qualities
    // NOTE: this algorithm will simply take the minimum of the two qualities that overlap a base
    // ------------------------------------------------------------------------------------------
    // full-width counters are used so that long reads are not truncated
    const unsigned int numColorspaceQualities = al.BaseQualities.Length();

    // without any transition quality there is nothing to derive the base
    // qualities from, so the qualities are left untouched
    if ( numColorspaceQualities > 0 ) {
        const unsigned int lastCSQIndex = numColorspaceQualities - 1;

        CMosaikString csQualities = al.BaseQualities;
        al.BaseQualities.Reserve(numColorspaceQualities + 1);
        al.BaseQualities.SetLength(numColorspaceQualities + 1);

        const char* pCSQual = csQualities.CData();
        char* pBSQual       = al.BaseQualities.Data();

        // handle the first base quality
        *pBSQual = *pCSQual;
        ++pBSQual;

        // handle the internal base qualities
        for(unsigned int i = 1; i < numColorspaceQualities; ++i, ++pBSQual)
            *pBSQual = min(pCSQual[i - 1], pCSQual[i]);

        // handle the final base quality
        *pBSQual = pCSQual[lastCSQIndex];
    }



    // update the number of mismatches
    // TODO: This should be augmented to support IUPAC ambiguity codes
    const unsigned int bsPairwiseLen = al.Reference.Length();

    al.NumMismatches = 0;
    for(unsigned int i = 0; i < bsPairwiseLen; ++i) {
        if(mCsAl.bsReference[i] != mCsAl.bsQuery[i]) al.NumMismatches++;
    }

}