// Converts the supplied read from colorspace to pseudo-colorspace, in place.
//
// Every character of s is rewritten as follows:
//   '0' -> 'A', '1' -> 'C', '2' -> 'G', '3' -> 'T'
//   'X' is left untouched, '-' becomes 'N', '.' becomes 'A'
// Any other character is reported on stdout and the process exits with
// status 1.
void CColorspaceUtilities::ConvertReadColorspaceToPseudoColorspace(CMosaikString& s) {
    char* pBases = s.Data();
    const unsigned int numBases = s.Length();

    for(unsigned int i = 0; i < numBases; ++i, ++pBases) {
        switch(*pBases) {
        case '0':
            *pBases = 'A';
            break;
        case '1':
            *pBases = 'C';
            break;
        case '2':
            *pBases = 'G';
            break;
        case '3':
            *pBases = 'T';
            break;
        case 'X':
            break;
        case '-':
            *pBases = 'N';
            break;
        case '.':
            // pick an arbitrary colorspace transition: it has at least a 25 %
            // chance of being correct, whereas an 'N' would never match.
            *pBases = 'A';
            break;
        default:
            // pBases already points at the offending character; indexing it
            // with i again would read the wrong (possibly out-of-range) byte.
            printf("ERROR: Unrecognized nucleotide (%c) when converting read to pseudo-colorspace.\n", *pBases);
            exit(1);
            break;
        }
    }
}
// Converts the supplied read from pseudo-colorspace to colorspace, in place.
// The function is used by the unaligned-read writer.
//
// Every character of s is rewritten as follows:
//   'A' -> '0', 'C' -> '1', 'G' -> '2', 'T' -> '3'
//   'X' and 'N' are left untouched
// Any other character is reported on stdout and the process exits with
// status 1.
void CColorspaceUtilities::ConvertReadPseudoColorspaceToColorspace(CMosaikString& s) {
    char* pBases = s.Data();
    const unsigned int numBases = s.Length();

    for(unsigned int i = 0; i < numBases; ++i, ++pBases) {
        switch(*pBases) {
        case 'A':
            *pBases = '0';
            break;
        case 'C':
            *pBases = '1';
            break;
        case 'G':
            *pBases = '2';
            break;
        case 'T':
            *pBases = '3';
            break;
        case 'X':
        case 'N':
            break;
        default:
            // pBases already points at the offending character; indexing it
            // with i again would read the wrong (possibly out-of-range) byte.
            printf("ERROR: Unrecognized nucleotide (%c) when converting read to colorspace.\n", *pBases);
            exit(1);
            break;
        }
    }
}
// Rewrites a basespace read as pseudo-colorspace, in place.
//
// Each base is first passed through GetSimplifiedBase(); every adjacent pair
// of simplified bases is then looked up in mCSMap (keyed by PACK_SHORT of the
// pair) and the result is stored in the position of the left-hand base. A
// read of N bases yields N-1 transitions, so the final character is trimmed.
// A pair that is missing from mCSMap is reported on stdout and the process
// exits with status 1.
void CColorspaceUtilities::ConvertReadBasespaceToPseudoColorspace(CMosaikString& s) {

    char* bases = s.Data();

    // collapse any ambiguity code in the leading base before it is paired up
    bases[0] = GetSimplifiedBase(bases[0]);

    for(unsigned int pos = 1; pos < s.Length(); ++pos) {

        char& left  = bases[pos - 1];
        char& right = bases[pos];

        // collapse any ambiguity code in the right-hand base of the pair
        right = GetSimplifiedBase(right);

        const CS_MAP_t::const_iterator transition = mCSMap.find(PACK_SHORT(left, right));
        if(transition == mCSMap.end()) {
            printf("ERROR: Unknown combination found when converting to colorspace: [%c] & [%c]\n", left, right);
            exit(1);
        }

        // the left slot is no longer needed as a base, so reuse it for the result
        left = transition->second;
    }

    // drop the last character: it never received a transition
    s.TrimEnd(1);
}
// Example #4
// encodes the supplied query sequence into 4-bit notation
void CBamWriter::EncodeQuerySequence(const CMosaikString& query, string& encodedQuery) {

	// prepare the encoded query string
	const unsigned int queryLen = query.Length();
	const unsigned int encodedQueryLen = (unsigned int)((queryLen / 2.0) + 0.5);
	encodedQuery.resize(encodedQueryLen);
	char* pEncodedQuery = (char*)encodedQuery.data();
	const char* pQuery = (const char*)query.CData();

	unsigned char nucleotideCode;
	bool useHighWord = true;

	while(*pQuery) {

		switch(*pQuery) {
			case '=':
				nucleotideCode = 0;
				break;
			case 'A':
				nucleotideCode = 1;
				break;
			case 'C':
				nucleotideCode = 2;
				break;
			case 'G':
				nucleotideCode = 4;
				break;
			case 'T':
				nucleotideCode = 8;
				break;
			case 'N':
				nucleotideCode = 15;
				break;
			default:
				printf("ERROR: Only the following bases are supported in the BAM format: {=, A, C, G, T, N}. Found [%c]\n", *pQuery);
				exit(1);
		}

		// pack the nucleotide code
		if(useHighWord) {
			*pEncodedQuery = nucleotideCode << 4;
			useHighWord = false;
		} else {
			*pEncodedQuery |= nucleotideCode;
			++pEncodedQuery;
			useHighWord = true;
		}

		// increment the query position
		++pQuery;
	}
}
// Example #5
// saves the alignment to the alignment archive
void CBamWriter::SaveAlignment(
    const Alignment& al, 
    const char* zaString, 
    const bool& noCigarMdNm, 
    const bool& notShowRnamePos, 
    const bool& isSolid, 
    const bool& processedBamData,
    const bool& report_zn) {

	// =================
	// set the BAM flags
	// =================

	// define our flags
	unsigned int flag                = 0;
	int insertSize                   = 0;
	if(al.IsPairedEnd) {
		flag |= BAM_SEQUENCED_AS_PAIRS;
		// first or second mate?
		flag |= (al.IsFirstMate ? BAM_QUERY_FIRST_MATE : BAM_QUERY_SECOND_MATE);
		if(al.IsResolvedAsPair) {
			if ( al.IsResolvedAsProperPair ) 
				flag |= BAM_PROPER_PAIR;
			if(al.IsMateMapped && al.IsMateReverseStrand) flag |= BAM_MATE_REVERSE_COMPLEMENT;
			insertSize = al.FragmentLength;
		}
		if ( !al.IsMapped )
			flag |= BAM_QUERY_UNMAPPED;
		if ( !al.IsMateMapped )
			flag |= BAM_MATE_UNMAPPED;
	} else {
		if ( !al.IsMapped )
			flag |= BAM_QUERY_UNMAPPED;
	}

	if(al.IsMapped && al.IsReverseStrand) flag |= BAM_QUERY_REVERSE_COMPLEMENT;

	// ==========================
	// construct the cigar string
	// ==========================

	string packedCigar;
	unsigned short numCigarOperations = 0;
	if ( !noCigarMdNm ) {
		if ( !processedBamData )
			CreatePackedCigar( al, packedCigar, numCigarOperations, isSolid );
		else {
			packedCigar = al.PackedCigar;
			numCigarOperations = al.NumCigarOperation;
		}
	}
	else
		packedCigar = "\0";
	const unsigned int packedCigarLen = !noCigarMdNm ? packedCigar.size() : 0;

	// ===================
	// write the alignment
	// ===================

	// remove the gaps from the read
	CMosaikString query;
	if ( !processedBamData ) {
		query = al.Query.CData();
		query.Remove('-');
	}

	// initialize
	const unsigned int nameLen  = al.Name.Length() + 1;
	const unsigned int queryLen = processedBamData ? al.QueryLength : query.Length();

	// sanity check
	//al.BaseQualities.CheckQuality();
	//if ( queryLen != alIter->BaseQualities.Length() ) {
	//        printf("ERROR: The lengths of bases(%u) and qualities(%u) of Read (%s) didn't match.\n", queryLen, alIter->BaseQualities.Length(), readName.CData());
        //        exit(1);
        //}
	
	// encode the query
	string encodedQuery;
	if (!processedBamData && (query.Length() != 0))
		EncodeQuerySequence(query, encodedQuery);
	else
		encodedQuery = al.EncodedQuery;
	const unsigned int encodedQueryLen = encodedQuery.size();

	// create our read group tag
	string readGroupTag;
	const unsigned int readGroupTagLen = 3 + al.ReadGroup.size() + 1;
	readGroupTag.resize(readGroupTagLen);
	char* pReadGroupTag = (char*)readGroupTag.data();
	sprintf(pReadGroupTag, "RGZ%s", al.ReadGroup.c_str());

	// create our mismatch tag
	string mismatchTag;
	unsigned int numMismatches = 0;
	unsigned int nmTagLen = 0;
	if ( !noCigarMdNm ) {
		mismatchTag = "NMi";
		mismatchTag.resize(MISMATCH_TAG_LEN);
		nmTagLen = MISMATCH_TAG_LEN;
		numMismatches = al.NumMismatches;
		memcpy((char*)mismatchTag.data() + 3, (char*)&numMismatches, SIZEOF_INT);
	}

	// create our MD tag
	string mdTag;
	char* pMd = 0;
	unsigned int mdTagLen = 0;
	char* pMdTag;
	if ( !noCigarMdNm ) {
		if ( !processedBamData ) 
			pMd = (char*) mdTager.GetMdTag( al.Reference.CData(), al.Query.CData(), al.Reference.Length() );
		else
			pMd = (char*) al.MdString.c_str();

		mdTagLen = 3 + strlen( pMd ) + 1;
		mdTag.resize( mdTagLen );
		pMdTag = (char*)mdTag.data();
		sprintf(pMdTag, "MDZ%s", pMd);
		#ifdef VERBOSE_DEBUG
		fprintf(stderr, "=== MD ===\n");
		fprintf(stderr, "%s\n%s\n%s\n", al.Reference.CData(), al.Query.CData(), mdTag.c_str());
		#endif
	}

	// create our za tag
	unsigned int zaTagLen = 0;
	string zaTag;
	char* pZaTag;
	if ((zaString != 0) && (zaString != (char)0)) {
		zaTagLen = 3 + strlen( zaString ) + 1;
		zaTag.resize( zaTagLen );
		pZaTag = (char*)zaTag.data();
		sprintf(pZaTag, "ZAZ%s",zaString);
	}

	// create our zn tag
	unsigned int znTagLen = 0;
	string znTag;
	if (report_zn){
	  ostringstream zn_buffer;
	  zn_buffer << "ZNZ" 
	            << al.SwScore << ";"
		    << al.NextSwScore << ";"
		    << al.NumLongestMatchs << ";"
		    << al.Entropy << ";"
		    << al.NumMapped << ";"
		    << al.NumHash;
	  znTag = zn_buffer.str();
	  znTagLen = znTag.size() + 1;
	  //cerr << znTag.data() << "\t" << znTag << "\t" << znTagLen << endl;
	}
	
	// create our cs tag
	unsigned int csTagLen = 0;
	string csTag;
	char* pCsTag;
	if (isSolid) {
		csTagLen = 3 + strlen ( al.CsQuery.c_str() ) + 1;
		csTag.resize( csTagLen );
		pCsTag = (char*)csTag.data();
		sprintf( pCsTag, "CSZ%s", al.CsQuery.c_str() );
	}

	// create our cq tag
	unsigned int cqTagLen = 0;
	string cqTag;
	char* pCqTag;
	if (isSolid) {
		cqTagLen = 3 + strlen ( al.CsBaseQualities.c_str() ) + 1;
		cqTag.resize( cqTagLen );
		pCqTag = (char*)cqTag.data();
		sprintf( pCqTag, "CQZ%s", al.CsBaseQualities.c_str() );
	}

	// retrieve our bin
	unsigned int bin = CalculateMinimumBin(al.ReferenceBegin, al.ReferenceEnd);

	// assign the BAM core data
	unsigned int buffer[8] = {0};
	unsigned int reference_index, reference_pos;
	if (!al.IsMapped) {
	  if (al.IsMateMapped) reference_index = al.MateReferenceIndex;
	  else reference_index = 0xffffffff;
	} else {
	  reference_index = al.ReferenceIndex;
	}

	if (!al.IsMapped) {
	  if (al.IsMateMapped) reference_pos = al.MateReferenceBegin;
	  else reference_pos = 0xffffffff;
	} else {
	  reference_pos = al.ReferenceBegin;
	}

	buffer[0] = (notShowRnamePos) ? 0xffffffff : reference_index;
	buffer[1] = (notShowRnamePos) ? 0xffffffff : reference_pos;
	buffer[2] = (bin << 16) | (al.RecalibratedQuality << 8) | nameLen;
	buffer[3] = (flag << 16) | numCigarOperations;
	buffer[4] = queryLen;

	if(al.IsPairedEnd) {
		if (notShowRnamePos) {
		  buffer[5] = 0xffffffff;
		  buffer[6] = 0xffffffff;
		} else {
		  if (!al.IsMateMapped) {//unmapped mate
		    buffer[5] = reference_index;
		    buffer[6] = reference_pos;
		  } else {
		    buffer[5] = al.MateReferenceIndex;
		    buffer[6] = al.MateReferenceBegin;
		  }
		}
		buffer[7] = insertSize;
	} else {
		buffer[5] = 0xffffffff;
		buffer[6] = 0xffffffff;
		buffer[7] = 0;
	}

	// write the block size
	const unsigned int dataBlockSize = nameLen + packedCigarLen + encodedQueryLen + queryLen + readGroupTagLen + nmTagLen + mdTagLen + zaTagLen + znTagLen + csTagLen + cqTagLen;
	const unsigned int blockSize = BAM_CORE_SIZE + dataBlockSize;
	BgzfWrite((char*)&blockSize, SIZEOF_INT);

	// write the BAM core
	BgzfWrite((char*)&buffer, BAM_CORE_SIZE);

	// write the query name
	BgzfWrite(al.Name.CData(), nameLen);

	// write the packed cigar
	BgzfWrite(packedCigar.data(), packedCigarLen);

	// write the encoded query sequence
	BgzfWrite(encodedQuery.data(), encodedQueryLen);

	// write the base qualities
	BgzfWrite(al.BaseQualities.CData(), queryLen);

	// write the read group tag
	BgzfWrite(readGroupTag.data(), readGroupTagLen);

	// write the mismatch tag
	if ( !noCigarMdNm )
		BgzfWrite(mismatchTag.data(), MISMATCH_TAG_LEN);

	// write the MD tag
	if ( !noCigarMdNm )
	BgzfWrite(mdTag.data(), mdTagLen);

	// write the ZA tag
	if ( zaString != 0 && (zaString != (char)0))
		BgzfWrite(zaTag.data(), zaTagLen);
	
	// write the ZN tag
	if (report_zn && (znTagLen > 0))
		BgzfWrite(znTag.data(), znTagLen);

	// write the cs tag
	if (isSolid)
		BgzfWrite(csTag.data(), csTagLen);

	// write the cq tag
	if (isSolid)
		BgzfWrite(cqTag.data(), cqTagLen);
}