// converts the supplied read from colorspace to pseudo-colorspace void CColorspaceUtilities::ConvertReadColorspaceToPseudoColorspace(CMosaikString& s) { char* pBases = s.Data(); for(unsigned int i = 0; i < s.Length(); ++i, ++pBases) { switch(*pBases) { case '0': *pBases = 'A'; break; case '1': *pBases = 'C'; break; case '2': *pBases = 'G'; break; case '3': *pBases = 'T'; break; case 'X': break; case '-': *pBases = 'N'; break; case '.': // here we pick an arbitrary colorspace transition, this will have at // least 25 % of being correct as opposed to specifying an 'N'. *pBases = 'A'; break; default: printf("ERROR: Unrecognized nucleotide (%c) when converting read to pseudo-colorspace.\n", pBases[i]); exit(1); break; } } }
// converts the supplied read from pseudo-colorspace to colorspace // The function is used by the unaligned-read writer. void CColorspaceUtilities::ConvertReadPseudoColorspaceToColorspace(CMosaikString& s) { char* pBases = s.Data(); for(unsigned int i = 0; i < s.Length(); ++i, ++pBases) { switch(*pBases) { case 'A': *pBases = '0'; break; case 'C': *pBases = '1'; break; case 'G': *pBases = '2'; break; case 'T': *pBases = '3'; break; case 'X': case 'N': break; default: printf("ERROR: Unrecognized nucleotide (%c) when converting read to colorspace.\n", pBases[i]); exit(1); break; } } }
// converts the supplied read from basespace to pseudo-colorspace void CColorspaceUtilities::ConvertReadBasespaceToPseudoColorspace(CMosaikString& s) { char* pPrev = s.Data(); char* pString = pPrev + 1; // simplify various ambiguity codes *pPrev = GetSimplifiedBase(*pPrev); CS_MAP_t::const_iterator csIter; for(unsigned int i = 1; i < s.Length(); ++i, ++pString, ++pPrev) { // simplify various ambiguity codes *pString = GetSimplifiedBase(*pString); csIter = mCSMap.find(PACK_SHORT(*pPrev, *pString)); if(csIter == mCSMap.end()) { printf("ERROR: Unknown combination found when converting to colorspace: [%c] & [%c]\n", *pPrev, *pString); exit(1); } *pPrev = csIter->second; } // adjust the read s.TrimEnd(1); }
// encodes the supplied query sequence into 4-bit notation void CBamWriter::EncodeQuerySequence(const CMosaikString& query, string& encodedQuery) { // prepare the encoded query string const unsigned int queryLen = query.Length(); const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5); encodedQuery.resize(encodedQueryLen); char* pEncodedQuery = (char*)encodedQuery.data(); const char* pQuery = (const char*)query.CData(); unsigned char nucleotideCode; bool useHighWord = true; while(*pQuery) { switch(*pQuery) { case '=': nucleotideCode = 0; break; case 'A': nucleotideCode = 1; break; case 'C': nucleotideCode = 2; break; case 'G': nucleotideCode = 4; break; case 'T': nucleotideCode = 8; break; case 'N': nucleotideCode = 15; break; default: printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery); exit(1); } // pack the nucleotide code if(useHighWord) { *pEncodedQuery = nucleotideCode << 4; useHighWord = false; } else { *pEncodedQuery |= nucleotideCode; ++pEncodedQuery; useHighWord = true; } // increment the query position ++pQuery; } }
// saves the alignment to the alignment archive void CBamWriter::SaveAlignment( const Alignment& al, const char* zaString, const bool& noCigarMdNm, const bool& notShowRnamePos, const bool& isSolid, const bool& processedBamData, const bool& report_zn) { // ================= // set the BAM flags // ================= // define our flags unsigned int flag = 0; int insertSize = 0; if(al.IsPairedEnd) { flag |= BAM_SEQUENCED_AS_PAIRS; // first or second mate? flag |= (al.IsFirstMate ? BAM_QUERY_FIRST_MATE : BAM_QUERY_SECOND_MATE); if(al.IsResolvedAsPair) { if ( al.IsResolvedAsProperPair ) flag |= BAM_PROPER_PAIR; if(al.IsMateMapped && al.IsMateReverseStrand) flag |= BAM_MATE_REVERSE_COMPLEMENT; insertSize = al.FragmentLength; } if ( !al.IsMapped ) flag |= BAM_QUERY_UNMAPPED; if ( !al.IsMateMapped ) flag |= BAM_MATE_UNMAPPED; } else { if ( !al.IsMapped ) flag |= BAM_QUERY_UNMAPPED; } if(al.IsMapped && al.IsReverseStrand) flag |= BAM_QUERY_REVERSE_COMPLEMENT; // ========================== // construct the cigar string // ========================== string packedCigar; unsigned short numCigarOperations = 0; if ( !noCigarMdNm ) { if ( !processedBamData ) CreatePackedCigar( al, packedCigar, numCigarOperations, isSolid ); else { packedCigar = al.PackedCigar; numCigarOperations = al.NumCigarOperation; } } else packedCigar = "\0"; const unsigned int packedCigarLen = !noCigarMdNm ? packedCigar.size() : 0; // =================== // write the alignment // =================== // remove the gaps from the read CMosaikString query; if ( !processedBamData ) { query = al.Query.CData(); query.Remove('-'); } // initialize const unsigned int nameLen = al.Name.Length() + 1; const unsigned int queryLen = processedBamData ? al.QueryLength : query.Length(); // sanity check //al.BaseQualities.CheckQuality(); //if ( queryLen != alIter->BaseQualities.Length() ) { // printf("ERROR: The lengths of bases(%u) and qualities(%u) of Read (%s) didn't match.\n", queryLen, alIter->BaseQualities.Length(), readName.CData()); // exit(1); //} // encode the query string encodedQuery; if (!processedBamData && (query.Length() != 0)) EncodeQuerySequence(query, encodedQuery); else encodedQuery = al.EncodedQuery; const unsigned int encodedQueryLen = encodedQuery.size(); // create our read group tag string readGroupTag; const unsigned int readGroupTagLen = 3 + al.ReadGroup.size() + 1; readGroupTag.resize(readGroupTagLen); char* pReadGroupTag = (char*)readGroupTag.data(); sprintf(pReadGroupTag, "RGZ%s", al.ReadGroup.c_str()); // create our mismatch tag string mismatchTag; unsigned int numMismatches = 0; unsigned int nmTagLen = 0; if ( !noCigarMdNm ) { mismatchTag = "NMi"; mismatchTag.resize(MISMATCH_TAG_LEN); nmTagLen = MISMATCH_TAG_LEN; numMismatches = al.NumMismatches; memcpy((char*)mismatchTag.data() + 3, (char*)&numMismatches, SIZEOF_INT); } // create our MD tag string mdTag; char* pMd = 0; unsigned int mdTagLen = 0; char* pMdTag; if ( !noCigarMdNm ) { if ( !processedBamData ) pMd = (char*) mdTager.GetMdTag( al.Reference.CData(), al.Query.CData(), al.Reference.Length() ); else pMd = (char*) al.MdString.c_str(); mdTagLen = 3 + strlen( pMd ) + 1; mdTag.resize( mdTagLen ); pMdTag = (char*)mdTag.data(); sprintf(pMdTag, "MDZ%s", pMd); #ifdef VERBOSE_DEBUG fprintf(stderr, "=== MD ===\n"); fprintf(stderr, "%s\n%s\n%s\n", al.Reference.CData(), al.Query.CData(), mdTag.c_str()); #endif } // create our za tag unsigned int zaTagLen = 0; string zaTag; char* pZaTag; if ((zaString != 0) && (zaString != (char)0)) { zaTagLen = 3 + strlen( zaString ) + 1; zaTag.resize( zaTagLen ); pZaTag = (char*)zaTag.data(); sprintf(pZaTag, "ZAZ%s",zaString); } // create our zn tag unsigned int znTagLen = 0; string znTag; if (report_zn){ ostringstream zn_buffer; zn_buffer << "ZNZ" << al.SwScore << ";" << al.NextSwScore << ";" << al.NumLongestMatchs << ";" << al.Entropy << ";" << al.NumMapped << ";" << al.NumHash; znTag = zn_buffer.str(); znTagLen = znTag.size() + 1; //cerr << znTag.data() << "\t" << znTag << "\t" << znTagLen << endl; } // create our cs tag unsigned int csTagLen = 0; string csTag; char* pCsTag; if (isSolid) { csTagLen = 3 + strlen ( al.CsQuery.c_str() ) + 1; csTag.resize( csTagLen ); pCsTag = (char*)csTag.data(); sprintf( pCsTag, "CSZ%s", al.CsQuery.c_str() ); } // create our cq tag unsigned int cqTagLen = 0; string cqTag; char* pCqTag; if (isSolid) { cqTagLen = 3 + strlen ( al.CsBaseQualities.c_str() ) + 1; cqTag.resize( cqTagLen ); pCqTag = (char*)cqTag.data(); sprintf( pCqTag, "CQZ%s", al.CsBaseQualities.c_str() ); } // retrieve our bin unsigned int bin = CalculateMinimumBin(al.ReferenceBegin, al.ReferenceEnd); // assign the BAM core data unsigned int buffer[8] = {0}; unsigned int reference_index, reference_pos; if (!al.IsMapped) { if (al.IsMateMapped) reference_index = al.MateReferenceIndex; else reference_index = 0xffffffff; } else { reference_index = al.ReferenceIndex; } if (!al.IsMapped) { if (al.IsMateMapped) reference_pos = al.MateReferenceBegin; else reference_pos = 0xffffffff; } else { reference_pos = al.ReferenceBegin; } buffer[0] = (notShowRnamePos) ? 0xffffffff : reference_index; buffer[1] = (notShowRnamePos) ? 0xffffffff : reference_pos; buffer[2] = (bin << 16) | (al.RecalibratedQuality << 8) | nameLen; buffer[3] = (flag << 16) | numCigarOperations; buffer[4] = queryLen; if(al.IsPairedEnd) { if (notShowRnamePos) { buffer[5] = 0xffffffff; buffer[6] = 0xffffffff; } else { if (!al.IsMateMapped) {//unmapped mate buffer[5] = reference_index; buffer[6] = reference_pos; } else { buffer[5] = al.MateReferenceIndex; buffer[6] = al.MateReferenceBegin; } } buffer[7] = insertSize; } else { buffer[5] = 0xffffffff; buffer[6] = 0xffffffff; buffer[7] = 0; } // write the block size const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + readGroupTagLen + nmTagLen + mdTagLen + zaTagLen + znTagLen + csTagLen + cqTagLen; const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize; BgzfWrite((char*)&blockSize, SIZEOF_INT); // write the BAM core BgzfWrite((char*)&buffer, BAM_CORE_SIZE); // write the query name BgzfWrite(al.Name.CData(), nameLen); // write the packed cigar BgzfWrite(packedCigar.data(), packedCigarLen); // write the encoded query sequence BgzfWrite(encodedQuery.data(), encodedQueryLen); // write the base qualities BgzfWrite(al.BaseQualities.CData(), queryLen); // write the read group tag BgzfWrite(readGroupTag.data(), readGroupTagLen); // write the mismatch tag if ( !noCigarMdNm ) BgzfWrite(mismatchTag.data(), MISMATCH_TAG_LEN); // write the MD tag if ( !noCigarMdNm ) BgzfWrite(mdTag.data(), mdTagLen); // write the ZA tag if ( zaString != 0 && (zaString != (char)0)) BgzfWrite(zaTag.data(), zaTagLen); // write the ZN tag if (report_zn && (znTagLen > 0)) BgzfWrite(znTag.data(), znTagLen); // write the cs tag if (isSolid) BgzfWrite(csTag.data(), csTagLen); // write the cq tag if (isSolid) BgzfWrite(cqTag.data(), cqTagLen); }