Exemplo n.º 1
0
// encodes the supplied query sequence into 4-bit notation
void CBamWriter::EncodeQuerySequence(const CMosaikString& query, string& encodedQuery) {

	// prepare the encoded query string
	const unsigned int queryLen = query.Length();
	const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
	encodedQuery.resize(encodedQueryLen);
	char* pEncodedQuery = (char*)encodedQuery.data();
	const char* pQuery = (const char*)query.CData();

	unsigned char nucleotideCode;
	bool useHighWord = true;

	while(*pQuery) {

		switch(*pQuery) {
			case '=':
				nucleotideCode = 0;
				break;
			case 'A':
				nucleotideCode = 1;
				break;
			case 'C':
				nucleotideCode = 2;
				break;
			case 'G':
				nucleotideCode = 4;
				break;
			case 'T':
				nucleotideCode = 8;
				break;
			case 'N':
				nucleotideCode = 15;
				break;
			default:
				printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
				exit(1);
		}

		// pack the nucleotide code
		if(useHighWord) {
			*pEncodedQuery = nucleotideCode << 4;
			useHighWord = false;
		} else {
			*pEncodedQuery |= nucleotideCode;
			++pEncodedQuery;
			useHighWord = true;
		}

		// increment the query position
		++pQuery;
	}
}
Exemplo n.º 2
0
// Converts the supplied alignment from colorspace to basespace.
//
// On return al.Reference and al.Query hold the basespace pairwise alignment,
// al.ReferenceEnd and al.QueryEnd are advanced by one, al.QueryLength is
// recomputed from the query coordinates, al.BaseQualities is expanded from
// colorspace transition qualities to base qualities, and al.NumMismatches is
// recounted on the basespace sequences.
//
// The scratch buffers in mCsAl are reused between calls and are reallocated
// here only when the alignment is longer than mCsAl.csAlignmentLength.
//
// Terminates the process via exit(1) if the scratch buffers cannot be
// allocated or if the reference base that seeds the conversion is N or X.
void CColorspaceUtilities::ConvertAlignmentToBasespace(Alignment& al) {

    // the pairwise alignment length; the same length is used for the query
    const unsigned int pairwiseLen = al.Reference.Length();

    // NOTE(review): an empty alignment (pairwiseLen == 0) is not handled; the
    // 'pairwiseLen - 1' passed to ConvertCs2Bs below would wrap around.

    // re-allocate the scratch buffers if the reserved space is insufficient
    if ( pairwiseLen > mCsAl.csAlignmentLength ) {

        // delete [] on a null pointer is a no-op, so no null checks are needed
        delete [] mCsAl.csReference;
        delete [] mCsAl.csQuery;
        delete [] mCsAl.bsReference;
        delete [] mCsAl.bsQuery;
        delete [] mCsAl.type;

        delete [] mCsAl.dashReference;
        delete [] mCsAl.dashQuery;

        delete [] mCsAl.mismatch;

        delete [] mCsAl.identical;

        try {
            mCsAl.csAlignmentLength = pairwiseLen;
            mCsAl.csReference = new char [ pairwiseLen ];
            mCsAl.csQuery     = new char [ pairwiseLen ];
            // the basespace sequences hold one extra leading base plus the terminator
            mCsAl.bsReference = new char [ pairwiseLen + 2 ];
            mCsAl.bsQuery     = new char [ pairwiseLen + 2 ];
            mCsAl.type        = new unsigned short [pairwiseLen];

            mCsAl.dashReference = new RegionT [ pairwiseLen ];
            mCsAl.dashQuery     = new RegionT [ pairwiseLen ];

            mCsAl.mismatch      = new unsigned int [ pairwiseLen ];

            mCsAl.identical     = new RegionT [ pairwiseLen ];

        }
        catch( const bad_alloc& ) {
            cout << "ERROR: Unable to allocate enough memory for converting colorspace ." << endl;
            exit(1);
        }

    }

    // initialize the counters
    mCsAl.nDashReference = 0;
    mCsAl.nDashQuery     = 0;
    mCsAl.nMismatch      = 0;
    mCsAl.nIdentical     = 0;


    // convert cs to bs
    // seed the conversion with the basespace reference base at the alignment start
    char bsBase = mpBsRefSeqs[al.ReferenceIndex][al.ReferenceBegin];
    if ( bsBase == 'N' || bsBase == 'X' ) {
        cout << "ERROR: The first base of the colorspace-basespace converter is N or X." << endl;
        exit(1);
    }
    // copy CS alignments
    memcpy ( mCsAl.csReference, al.Reference.Data(), pairwiseLen );
    memcpy ( mCsAl.csQuery,     al.Query.Data(),     pairwiseLen );


    mCsAl.bsReference[0] = bsBase;
    mCsAl.bsQuery[0]     = bsBase;
    ConvertCs2Bs(mCsAl.csReference, mCsAl.bsReference, 0, pairwiseLen-1, bsBase);
    ConvertCs2Bs(mCsAl.csQuery, mCsAl.bsQuery, 0, pairwiseLen-1, bsBase);


    // search the dash regions & mismatches
    // NOTE(review): a single column can append up to three entries to
    // mCsAl.mismatch (a mismatch plus the end of a reference and of a query
    // dash region), while the buffer allocated above holds pairwiseLen
    // entries - confirm this cannot overflow for gap-dense alignments.
    // NOTE(review): mCsAl.type[i] is only written for the flagged columns;
    // the remaining entries keep whatever value they had before.
    bool continuedDReference = false;
    bool continuedDQuery     = false;
    unsigned int nMatch      = 0;
    for ( unsigned int i = 0; i < pairwiseLen; i++ ) {

        // determine identical region: a differing column closes the current
        // run of matches if that run is at least mNAllowedMismatch long
        const bool isEndIdentity = ( mCsAl.csQuery[i] != mCsAl.csReference[i] ) && ( nMatch >= mNAllowedMismatch );
        if ( isEndIdentity ) {
            mCsAl.identical[ mCsAl.nIdentical ].Begin  = i - nMatch;
            mCsAl.identical[ mCsAl.nIdentical ].Length = nMatch;
            mCsAl.nIdentical++;
        }

        if ( mCsAl.csQuery[i] == mCsAl.csReference[i])
            nMatch++;
        else
            nMatch = 0;

        // determine mismatches (columns with a dash or a non-ACGT reference symbol are excluded)
        bool isN = (mCsAl.csReference[i] != 'A') && (mCsAl.csReference[i] != 'C') && (mCsAl.csReference[i] != 'G') && (mCsAl.csReference[i] != 'T');
        const bool isMismatch = (mCsAl.csReference[i] != '-') && (mCsAl.csQuery[i] != '-') && !isN && (mCsAl.csReference[i] != mCsAl.csQuery[i]);
        if ( isMismatch ) {
            // set the position
            mCsAl.mismatch[mCsAl.nMismatch] = i;
            mCsAl.nMismatch++;
            // set the mismatch flag
            mCsAl.type[i] = 0;
        }


        // for reference conversion
        if ( mCsAl.csReference[i] != '-' ) {
            // end the current dash region
            if ( continuedDReference ) {
                mCsAl.nDashReference++;
                // the current position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 1;
            }
            continuedDReference = false;
        }
        else {
            // start a dash region
            if ( !continuedDReference ) {
                mCsAl.dashReference[mCsAl.nDashReference].Begin  = i;
                mCsAl.dashReference[mCsAl.nDashReference].Length = 0;
                // the preceding position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 3;
            }
            mCsAl.dashReference[mCsAl.nDashReference].Length++;
            continuedDReference = true;
        }

        // for query conversion
        if ( mCsAl.csQuery[i] != '-' ) {
            // end the current dash region
            if ( continuedDQuery ) {
                mCsAl.nDashQuery++;
                // the current position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 2;
            }
            continuedDQuery = false;
        }
        else {
            // start a dash region
            if ( !continuedDQuery ) {
                mCsAl.dashQuery[mCsAl.nDashQuery].Begin  = i;
                mCsAl.dashQuery[mCsAl.nDashQuery].Length = 0;
                // the preceding position could be a mismatch
                mCsAl.mismatch[mCsAl.nMismatch] = i;
                mCsAl.nMismatch++;
                mCsAl.type[i] = 4;
            }
            mCsAl.dashQuery[mCsAl.nDashQuery].Length++;
            continuedDQuery = true;
        }


    }

    // record the run of matches that reaches the end of the alignment
    if ( nMatch > 0 ) {
        mCsAl.identical[ mCsAl.nIdentical ].Begin  = pairwiseLen - nMatch;
        mCsAl.identical[ mCsAl.nIdentical ].Length = nMatch;
        mCsAl.nIdentical++;
    }


    // The corrections are skipped only when the last identical region begins
    // at position 0. When no identical region was recorded there is no last
    // entry to inspect (indexing nIdentical - 1 would wrap around), so the
    // corrections are applied in that case as well.
    if ( ( mCsAl.nIdentical == 0 ) || ( mCsAl.identical[mCsAl.nIdentical - 1].Begin != 0 ) ) {
        // find sequencing errors
        if ( mCsAl.nMismatch > 0 )
            FindSequencingError(pairwiseLen);

        if ( mCsAl.nDashReference > 0 ) {
            for ( unsigned int i = 0; i < mCsAl.nDashReference; i++ ) {
                // compare the bases right after the dash region; if they
                // disagree, mask the query bases spanning the region with N
                unsigned int curPosition = mCsAl.dashReference[i].Begin + mCsAl.dashReference[i].Length;
                if ( mCsAl.bsReference[ curPosition ] != mCsAl.bsQuery[ curPosition ] ) {
                    curPosition = mCsAl.dashReference[i].Begin;
                    for ( unsigned int j = 0; j < mCsAl.dashReference[i].Length; j++ )
                        mCsAl.bsQuery[ curPosition + j + 1 ] = 'N';
                }
            }

            AdjustDash(mCsAl.csReference, mCsAl.csQuery, mCsAl.dashReference, mCsAl.nDashReference, mCsAl.bsReference);
        }

        if ( mCsAl.nDashQuery > 0 )
            AdjustDash(mCsAl.csQuery, mCsAl.csReference, mCsAl.dashQuery, mCsAl.nDashQuery, mCsAl.bsQuery);


        // deal with the identical regions
        for ( unsigned int i = 0; i < mCsAl.nIdentical; i++ ) {
            unsigned int csEnd = mCsAl.identical[i].Begin + mCsAl.identical[i].Length - 1;
            unsigned int curPosition = mCsAl.identical[i].Begin;
            bsBase = mCsAl.bsReference[ curPosition ];

            // walk backwards to the closest usable reference base; stop at
            // position 0 so that the unsigned index can never wrap around
            while ( ( ( bsBase == '-' ) || ( bsBase == 'N' ) ) && ( curPosition > 0 ) ) {
                curPosition--;
                bsBase = mCsAl.bsReference[ curPosition ];
            }

            const bool isGoodBsBase = ( bsBase != '-' ) && ( bsBase != 'N' );

            // re-convert the query over the identical region from the good base
            if ( isGoodBsBase )
                ConvertCs2Bs(mCsAl.csQuery, mCsAl.bsQuery, mCsAl.identical[i].Begin, csEnd, bsBase);
        }


    }

    // terminate the basespace sequences
    mCsAl.bsReference[ pairwiseLen + 1 ] = 0;
    mCsAl.bsQuery[ pairwiseLen + 1 ]     = 0;

    al.Reference = mCsAl.bsReference;
    al.Query     = mCsAl.bsQuery;

    ++al.ReferenceEnd;
    ++al.QueryEnd;
    al.QueryLength = al.QueryEnd - al.QueryBegin + 1;


    // ------------------------------------------------------------------------------------------
    // convert the colorspace transition qualities to base qualities
    // NOTE: this algorithm will simply take the minimum of the two qualities that overlap a base
    // ------------------------------------------------------------------------------------------
    const unsigned int numColorspaceQualities = al.BaseQualities.Length();

    // with no qualities there is nothing to convert (and no first/last entry to read)
    if ( numColorspaceQualities > 0 ) {
        const unsigned int lastCSQIndex = numColorspaceQualities - 1;

        // work from a copy because al.BaseQualities is resized and overwritten in place
        CMosaikString csQualities = al.BaseQualities;
        al.BaseQualities.Reserve(numColorspaceQualities + 1);
        al.BaseQualities.SetLength(numColorspaceQualities + 1);

        const char* pCSQual = csQualities.CData();
        char* pBSQual       = al.BaseQualities.Data();

        // handle the first base quality
        *pBSQual = *pCSQual;
        ++pBSQual;

        // handle the internal base qualities
        for(unsigned int i = 1; i < numColorspaceQualities; ++i, ++pBSQual)
            *pBSQual = min(pCSQual[i - 1], pCSQual[i]);

        // handle the final base quality
        *pBSQual = pCSQual[lastCSQIndex];
    }



    // update the number of mismatches
    // TODO: This should be augmented to support IUPAC ambiguity codes
    const unsigned int bsPairwiseLen = al.Reference.Length();

    al.NumMismatches = 0;
    // the counter has the same width as the bound so it cannot wrap before reaching it
    for(unsigned int i = 0; i < bsPairwiseLen; ++i) {
        if(mCsAl.bsReference[i] != mCsAl.bsQuery[i]) al.NumMismatches++;
    }

}