// encodes the supplied query sequence into 4-bit notation void CBamWriter::EncodeQuerySequence(const CMosaikString& query, string& encodedQuery) { // prepare the encoded query string const unsigned int queryLen = query.Length(); const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); encodedQuery.resize(encodedQueryLen); char* pEncodedQuery = (char*)encodedQuery.data(); const char* pQuery = (const char*)query.CData(); unsigned char nucleotideCode; bool useHighWord = true; while(*pQuery) { switch(*pQuery) { case '=': nucleotideCode = 0; break; case 'A': nucleotideCode = 1; break; case 'C': nucleotideCode = 2; break; case 'G': nucleotideCode = 4; break; case 'T': nucleotideCode = 8; break; case 'N': nucleotideCode = 15; break; default: printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); exit(1); } // pack the nucleotide code if(useHighWord) { *pEncodedQuery = nucleotideCode << 4; useHighWord = false; } else { *pEncodedQuery |= nucleotideCode; ++pEncodedQuery; useHighWord = true; } // increment the query position ++pQuery; } }
// converts the supplied alignment from colorspace to basespace void CColorspaceUtilities::ConvertAlignmentToBasespace(Alignment& al) { // convert the alignment to character arrays const unsigned int pairwiseLen = al.Reference.Length(); //char* pReference = al.Reference.Data(); //char* pQuery = al.Query.Data(); // re-allocate mBsRef & mBsQuery if the reversed space is insufficient if( pairwiseLen > mCsAl.csAlignmentLength ) { //if ( mCsAl.csReference ) delete [] mCsAl.csReference; //if ( mCsAl.csQuery ) delete [] mCsAl.csQuery; //if ( mCsAl.bsReference ) delete [] mCsAl.bsReference; //if ( mCsAl.bsQuery ) delete [] mCsAl.bsQuery; //if ( mCsAl.type ) delete [] mCsAl.type; //if ( mCsAl.dashReference ) delete [] mCsAl.dashReference; //if ( mCsAl.dashQuery ) delete [] mCsAl.dashQuery; //if ( mCsAl.mismatch ) delete [] mCsAl.mismatch; //if ( mCsAl.identical ) delete [] mCsAl.identical; delete [] mCsAl.csReference; delete [] mCsAl.csQuery; delete [] mCsAl.bsReference; delete [] mCsAl.bsQuery; delete [] mCsAl.type; delete [] mCsAl.dashReference; delete [] mCsAl.dashQuery; delete [] mCsAl.mismatch; delete [] mCsAl.identical; try { mCsAl.csAlignmentLength = pairwiseLen; mCsAl.csReference = new char [ pairwiseLen ]; mCsAl.csQuery = new char [ pairwiseLen ]; mCsAl.bsReference = new char [ pairwiseLen + 2 ]; mCsAl.bsQuery = new char [ pairwiseLen + 2 ]; mCsAl.type = new unsigned short [pairwiseLen]; mCsAl.dashReference = new RegionT [ pairwiseLen ]; mCsAl.dashQuery = new RegionT [ pairwiseLen ]; mCsAl.mismatch = new unsigned int [ pairwiseLen ]; mCsAl.identical = new RegionT [ pairwiseLen ]; } catch( bad_alloc ) { cout << "ERROR: Unable to allocate enough memory for converting colorspace ." << endl; exit(1); } } // initialize the counters mCsAl.nDashReference = 0; mCsAl.nDashQuery = 0; mCsAl.nMismatch = 0; mCsAl.nIdentical = 0; // convert cs to bs // initial the first BS base char bsBase = mpBsRefSeqs[al.ReferenceIndex][al.ReferenceBegin]; if ( bsBase == 'N' || bsBase == 'X' ) { cout << "ERROR: The first base of the colorspace-basespace converter is N or X." << endl; exit(1); } // copy CS alignments memcpy ( mCsAl.csReference, al.Reference.Data(), pairwiseLen ); memcpy ( mCsAl.csQuery, al.Query.Data(), pairwiseLen ); mCsAl.bsReference[0] = bsBase; mCsAl.bsQuery[0] = bsBase; ConvertCs2Bs(mCsAl.csReference, mCsAl.bsReference, 0, pairwiseLen-1, bsBase); ConvertCs2Bs(mCsAl.csQuery, mCsAl.bsQuery, 0, pairwiseLen-1, bsBase); // search the dash regions & mismatches bool continuedDReference = false; bool continuedDQuery = false; unsigned int nMatch = 0; BS_MAP_t::const_iterator bsIter; for ( unsigned int i = 0; i < pairwiseLen; i++ ) { // determine identical region const bool isEndIdentity = ( mCsAl.csQuery[i] != mCsAl.csReference[i] ) && ( nMatch >= mNAllowedMismatch ); if ( isEndIdentity ) { mCsAl.identical[ mCsAl.nIdentical ].Begin = i - nMatch; mCsAl.identical[ mCsAl.nIdentical ].Length = nMatch; mCsAl.nIdentical++; } if ( mCsAl.csQuery[i] == mCsAl.csReference[i]) nMatch++; else nMatch = 0; // determine mismatches bool isN = (mCsAl.csReference[i] != 'A') && (mCsAl.csReference[i] != 'C') && (mCsAl.csReference[i] != 'G') && (mCsAl.csReference[i] != 'T'); const bool isMismatch = (mCsAl.csReference[i] != '-') && (mCsAl.csQuery[i] != '-') && !isN && (mCsAl.csReference[i] != mCsAl.csQuery[i]); if ( isMismatch ) { // set the position mCsAl.mismatch[mCsAl.nMismatch] = i; mCsAl.nMismatch++; // set the mismatch flag mCsAl.type[i] = 0; } // for reference convertion if ( mCsAl.csReference[i] != '-' ) { // end the current dash region if ( continuedDReference ) { mCsAl.nDashReference++; // the current position could be a mismatch mCsAl.mismatch[mCsAl.nMismatch] = i; mCsAl.nMismatch++; mCsAl.type[i] = 1; } continuedDReference = false; } else { // start a dash region if ( !continuedDReference ) { mCsAl.dashReference[mCsAl.nDashReference].Begin = i; mCsAl.dashReference[mCsAl.nDashReference].Length = 0; // the preceding position could be a mismatch mCsAl.mismatch[mCsAl.nMismatch] = i; mCsAl.nMismatch++; mCsAl.type[i] = 3; } mCsAl.dashReference[mCsAl.nDashReference].Length++; continuedDReference = true; } // for query convertion if ( mCsAl.csQuery[i] != '-' ) { // end the current dash region if ( continuedDQuery ) { mCsAl.nDashQuery++; // the current position could be a mismatch mCsAl.mismatch[mCsAl.nMismatch] = i; mCsAl.nMismatch++; mCsAl.type[i] = 2; } continuedDQuery = false; } else { // start a dash region if ( !continuedDQuery ) { mCsAl.dashQuery[mCsAl.nDashQuery].Begin = i; mCsAl.dashQuery[mCsAl.nDashQuery].Length = 0; // the preceding position could be a mismatch mCsAl.mismatch[mCsAl.nMismatch] = i; mCsAl.nMismatch++; mCsAl.type[i] = 4; } mCsAl.dashQuery[mCsAl.nDashQuery].Length++; continuedDQuery = true; } } if ( nMatch > 0 ) { mCsAl.identical[ mCsAl.nIdentical ].Begin = pairwiseLen - nMatch; mCsAl.identical[ mCsAl.nIdentical ].Length = nMatch; mCsAl.nIdentical++; } if ( mCsAl.identical[mCsAl.nIdentical - 1].Begin != 0 ) { // find sequencing errors if ( mCsAl.nMismatch > 0 ) FindSequencingError(pairwiseLen); if ( mCsAl.nDashReference > 0 ) { for ( unsigned int i = 0; i < mCsAl.nDashReference; i++ ) { unsigned int curPosition = mCsAl.dashReference[i].Begin + mCsAl.dashReference[i].Length; if ( mCsAl.bsReference[ curPosition ] != mCsAl.bsQuery[ curPosition ] ) { curPosition = mCsAl.dashReference[i].Begin; for ( unsigned int j = 0; j < mCsAl.dashReference[i].Length; j++ ) mCsAl.bsQuery[ curPosition + j + 1 ] = 'N'; } } AdjustDash(mCsAl.csReference, mCsAl.csQuery, mCsAl.dashReference, mCsAl.nDashReference, mCsAl.bsReference); } if ( mCsAl.nDashQuery > 0 ) AdjustDash(mCsAl.csQuery, mCsAl.csReference, mCsAl.dashQuery, mCsAl.nDashQuery, mCsAl.bsQuery); // deal with the indentical region for ( unsigned int i = 0; i < mCsAl.nIdentical; i++ ) { unsigned int csEnd = mCsAl.identical[i].Begin + mCsAl.identical[i].Length - 1; unsigned int curPosition = mCsAl.identical[i].Begin; bsBase = mCsAl.bsReference[ curPosition ]; while ( ( bsBase == '-' ) || ( bsBase == 'N' ) ) { curPosition--; bsBase = mCsAl.bsReference[ curPosition ]; if ( curPosition == 0 ) break; } bool isGoodBsBase = false; if ( ( bsBase != '-' ) && ( bsBase != 'N' ) ) isGoodBsBase = true; if ( isGoodBsBase ) ConvertCs2Bs(mCsAl.csQuery, mCsAl.bsQuery, mCsAl.identical[i].Begin, csEnd, bsBase); } } // end up the sequences mCsAl.bsReference[ pairwiseLen + 1 ] = 0; mCsAl.bsQuery[ pairwiseLen + 1 ] = 0; al.Reference = mCsAl.bsReference; al.Query = mCsAl.bsQuery; ++al.ReferenceEnd; ++al.QueryEnd; al.QueryLength = al.QueryEnd - al.QueryBegin + 1; // ------------------------------------------------------------------------------------------ // convert the colorspace transition qualities to base qualities // NOTE: this algorithm will simply take the minimum of the two qualities that overlap a base // ------------------------------------------------------------------------------------------ const unsigned short numColorspaceQualities = al.BaseQualities.Length(); const unsigned short lastCSQIndex = numColorspaceQualities - 1; CMosaikString csQualities = al.BaseQualities; al.BaseQualities.Reserve(numColorspaceQualities + 1); al.BaseQualities.SetLength(numColorspaceQualities + 1); const char* pCSQual = csQualities.CData(); char* pBSQual = al.BaseQualities.Data(); // handle the first base quality *pBSQual = *pCSQual; ++pBSQual; // handle the internal base qualities for(unsigned short i = 1; i < numColorspaceQualities; ++i, ++pBSQual) *pBSQual = min(pCSQual[i - 1], pCSQual[i]); // handle the final base quality *pBSQual = pCSQual[lastCSQIndex]; // update the number of mismatches // TODO: This should be augmented to support IUPAC ambiguity codes const unsigned int bsPairwiseLen = al.Reference.Length(); al.NumMismatches = 0; for(unsigned short i = 0; i < bsPairwiseLen; ++i) { if(mCsAl.bsReference[i] != mCsAl.bsQuery[i]) al.NumMismatches++; } }