Ejemplo n.º 1
0
// print BamAlignment in FASTQ format
// N.B. - uses QueryBases NOT AlignedBases
void ConvertTool::ConvertToolPrivate::PrintFastq(const BamAlignment& a) { 
  
    // @BamAlignment.Name
    // BamAlignment.QueryBases
    // +
    // BamAlignment.Qualities
    //
    // N.B. - QueryBases are reverse-complemented (& Qualities reversed) if aligned to reverse strand .
    //        Name is appended "/1" or "/2" if paired-end, to reflect which mate this entry is.
  
    // handle paired-end alignments
    string name = a.Name;
    if ( a.IsPaired() )
        name.append( (a.IsFirstMate() ? "/1" : "/2") );
  
    // handle reverse strand alignment - bases & qualities
    string qualities = a.Qualities;
    string sequence  = a.QueryBases;
    if ( a.IsReverseStrand() ) {
        Utilities::Reverse(qualities);
        Utilities::ReverseComplement(sequence);
    }
  
    // write to output stream
    m_out << "@" << name << endl
          << sequence    << endl
          << "+"         << endl
          << qualities   << endl;
}
Ejemplo n.º 2
0
int DataStatisticsTool::Execute()
{
    // iterate over reads in BAM file(s)
    BamAlignment alignObj;
    while(bamReader.GetNextAlignment(alignObj))
    {
        if (alignObj.IsDuplicate()) continue;
        if (alignObj.IsFailedQC()) continue;
        if (!alignObj.IsMapped()) continue;
        if (!alignObj.IsPrimaryAlignment()) continue;
        if (alignObj.IsPaired() && !alignObj.IsProperPair()) continue;
        if (alignObj.IsPaired() && !alignObj.IsMateMapped()) continue;
        if (!alignObj.HasTag("MD")) continue;

//        // debug
//        GenericBamAlignmentTools::printBamAlignmentCigar(alignObj);
//        GenericBamAlignmentTools::printBamAlignmentMD(alignObj);

        // shift InDel
        GenericBamAlignmentTools::leftShiftInDel(alignObj);

//        // debug
//        GenericBamAlignmentTools::printBamAlignmentCigar(alignObj);
//        GenericBamAlignmentTools::printBamAlignmentMD(alignObj);

        // get the alignment sequences
        string alignRead;
        string alignGenome;
        GenericBamAlignmentTools::getAlignmentSequences(alignObj, alignRead, alignGenome);

        // update the statistics
        statistics.update(alignRead, alignGenome);
    }


    // print to screen
    cout << statistics << endl;
//    statistics.printMatchMismatch();

    // close BAM reader
    bamReader.Close();

    // close Fasta
    genomeFasta.Close();

    return 1;
}
Ejemplo n.º 3
0
// use current input alignment to update BAM file alignment stats
void StatsTool::StatsToolPrivate::ProcessAlignment(const BamAlignment& al) {
  
    // increment total alignment counter
    ++numReads;
    
    // check the paired-independent flags
    if ( al.IsDuplicate() ) ++numDuplicates;
    if ( al.IsFailedQC()  ) ++numFailedQC;
    if ( al.IsMapped()    ) ++numMapped;
    
    // check forward/reverse strand
    if ( al.IsReverseStrand() ) 
        ++numReverseStrand; 
    else 
        ++numForwardStrand;
    
    // if alignment is paired-end
    if ( al.IsPaired() ) {
      
        // increment PE counter
        ++numPaired;
      
        // increment first mate/second mate counters
        if ( al.IsFirstMate()  ) ++numFirstMate;
        if ( al.IsSecondMate() ) ++numSecondMate;
        
        // if alignment is mapped, check mate status
        if ( al.IsMapped() ) {
            // if mate mapped
            if ( al.IsMateMapped() ) 
                ++numBothMatesMapped;
            // else singleton
            else 
                ++numSingletons;
        }
        
        // check for explicit proper pair flag
        if ( al.IsProperPair() ) ++numProperPair;
        
        // store insert size for first mate 
        if ( settings->IsShowingInsertSizeSummary && al.IsFirstMate() && (al.InsertSize != 0) ) {
            int insertSize = abs(al.InsertSize);
            insertSizes.push_back( insertSize );
        }
    }
}
Ejemplo n.º 4
0
// print BamAlignment in SAM format
void ConvertTool::ConvertToolPrivate::PrintSam(const BamAlignment& a) {
  
    // tab-delimited
    // <QNAME> <FLAG> <RNAME> <POS> <MAPQ> <CIGAR> <MRNM> <MPOS> <ISIZE> <SEQ> <QUAL> [ <TAG>:<VTYPE>:<VALUE> [...] ]
  
    // write name & alignment flag
    m_out << a.Name << "\t" << a.AlignmentFlag << "\t";
    
    // write reference name
    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) 
        m_out << m_references[a.RefID].RefName << "\t";
    else 
        m_out << "*\t";
    
    // write position & map quality
    m_out << a.Position+1 << "\t" << a.MapQuality << "\t";
    
    // write CIGAR
    const vector<CigarOp>& cigarData = a.CigarData;
    if ( cigarData.empty() ) m_out << "*\t";
    else {
        vector<CigarOp>::const_iterator cigarIter = cigarData.begin();
        vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
            const CigarOp& op = (*cigarIter);
            m_out << op.Length << op.Type;
        }
        m_out << "\t";
    }
    
    // write mate reference name, mate position, & insert size
    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
        if ( a.MateRefID == a.RefID )
            m_out << "=\t";
        else
            m_out << m_references[a.MateRefID].RefName << "\t";
        m_out << a.MatePosition+1 << "\t" << a.InsertSize << "\t";
    } 
    else
        m_out << "*\t0\t0\t";
    
    // write sequence
    if ( a.QueryBases.empty() )
        m_out << "*\t";
    else
        m_out << a.QueryBases << "\t";
    
    // write qualities
    if ( a.Qualities.empty() || (a.Qualities.at(0) == (char)0xFF) )
        m_out << "*";
    else
        m_out << a.Qualities;
    
    // write tag data
    const char* tagData = a.TagData.c_str();
    const size_t tagDataLength = a.TagData.length();
    
    size_t index = 0;
    while ( index < tagDataLength ) {

        // write tag name   
        string tagName = a.TagData.substr(index, 2);
        m_out << "\t" << tagName << ":";
        index += 2;
        
        // get data type
        char type = a.TagData.at(index);
        ++index;
        switch ( type ) {
            case (Constants::BAM_TAG_TYPE_ASCII) :
                m_out << "A:" << tagData[index];
                ++index;
                break;

            case (Constants::BAM_TAG_TYPE_INT8)  :
            case (Constants::BAM_TAG_TYPE_UINT8) :
                m_out << "i:" << (int)tagData[index];
                ++index;
                break;

            case (Constants::BAM_TAG_TYPE_INT16) :
                m_out << "i:" << BamTools::UnpackSignedShort(&tagData[index]);
                index += sizeof(int16_t);
                break;

            case (Constants::BAM_TAG_TYPE_UINT16) :
                m_out << "i:" << BamTools::UnpackUnsignedShort(&tagData[index]);
                index += sizeof(uint16_t);
                break;

            case (Constants::BAM_TAG_TYPE_INT32) :
                m_out << "i:" << BamTools::UnpackSignedInt(&tagData[index]);
                index += sizeof(int32_t);
                break;

            case (Constants::BAM_TAG_TYPE_UINT32) :
                m_out << "i:" << BamTools::UnpackUnsignedInt(&tagData[index]);
                index += sizeof(uint32_t);
                break;

            case (Constants::BAM_TAG_TYPE_FLOAT) :
                m_out << "f:" << BamTools::UnpackFloat(&tagData[index]);
                index += sizeof(float);
                break;

            case (Constants::BAM_TAG_TYPE_HEX)    :
            case (Constants::BAM_TAG_TYPE_STRING) :
                m_out << type << ":";
                while (tagData[index]) {
                    m_out << tagData[index];
                    ++index;
                }
                ++index;
                break;
        }

        if ( tagData[index] == '\0') 
            break;
    }

    m_out << endl;
}
Ejemplo n.º 5
0
// print BamAlignment in JSON format
void ConvertTool::ConvertToolPrivate::PrintJson(const BamAlignment& a) {
  
    // write name & alignment flag
    m_out << "{\"name\":\"" << a.Name << "\",\"alignmentFlag\":\"" << a.AlignmentFlag << "\",";
    
    // write reference name
    if ( (a.RefID >= 0) && (a.RefID < (int)m_references.size()) ) 
        m_out << "\"reference\":\"" << m_references[a.RefID].RefName << "\",";
    
    // write position & map quality
    m_out << "\"position\":" << a.Position+1 << ",\"mapQuality\":" << a.MapQuality << ",";
    
    // write CIGAR
    const vector<CigarOp>& cigarData = a.CigarData;
    if ( !cigarData.empty() ) {
        m_out << "\"cigar\":[";
        vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
        vector<CigarOp>::const_iterator cigarIter  = cigarBegin;
        vector<CigarOp>::const_iterator cigarEnd   = cigarData.end();
        for ( ; cigarIter != cigarEnd; ++cigarIter ) {
            const CigarOp& op = (*cigarIter);
            if (cigarIter != cigarBegin)
                m_out << ",";
            m_out << "\"" << op.Length << op.Type << "\"";
        }
        m_out << "],";
    }
    
    // write mate reference name, mate position, & insert size
    if ( a.IsPaired() && (a.MateRefID >= 0) && (a.MateRefID < (int)m_references.size()) ) {
        m_out << "\"mate\":{"
              << "\"reference\":\"" << m_references[a.MateRefID].RefName << "\","
              << "\"position\":" << a.MatePosition+1
              << ",\"insertSize\":" << a.InsertSize << "},";
    }
    
    // write sequence
    if ( !a.QueryBases.empty() ) 
        m_out << "\"queryBases\":\"" << a.QueryBases << "\",";
    
    // write qualities
    if ( !a.Qualities.empty() && a.Qualities.at(0) != (char)0xFF ) {
        string::const_iterator s = a.Qualities.begin();
        m_out << "\"qualities\":[" << static_cast<short>(*s) - 33;
        ++s;
        for ( ; s != a.Qualities.end(); ++s )
            m_out << "," << static_cast<short>(*s) - 33;
        m_out << "],";
    }
    
    // write alignment's source BAM file
    m_out << "\"filename\":" << a.Filename << ",";

    // write tag data
    const char* tagData = a.TagData.c_str();
    const size_t tagDataLength = a.TagData.length();
    size_t index = 0;
    if ( index < tagDataLength ) {

        m_out << "\"tags\":{";
        
        while ( index < tagDataLength ) {

            if ( index > 0 )
                m_out << ",";
            
            // write tag name
            m_out << "\"" << a.TagData.substr(index, 2) << "\":";
            index += 2;
            
            // get data type
            char type = a.TagData.at(index);
            ++index;
            switch ( type ) {
                case (Constants::BAM_TAG_TYPE_ASCII) :
                    m_out << "\"" << tagData[index] << "\"";
                    ++index; 
                    break;
                
                case (Constants::BAM_TAG_TYPE_INT8)  :
                case (Constants::BAM_TAG_TYPE_UINT8) :
                    m_out << (int)tagData[index]; 
                    ++index; 
                    break;
                
                case (Constants::BAM_TAG_TYPE_INT16) :
                    m_out << BamTools::UnpackSignedShort(&tagData[index]);
                    index += sizeof(int16_t);
                    break;

                case (Constants::BAM_TAG_TYPE_UINT16) :
                    m_out << BamTools::UnpackUnsignedShort(&tagData[index]);
                    index += sizeof(uint16_t);
                    break;
                    
                case (Constants::BAM_TAG_TYPE_INT32) :
                    m_out << BamTools::UnpackSignedInt(&tagData[index]);
                    index += sizeof(int32_t);
                    break;

                case (Constants::BAM_TAG_TYPE_UINT32) :
                    m_out << BamTools::UnpackUnsignedInt(&tagData[index]);
                    index += sizeof(uint32_t);
                    break;

                case (Constants::BAM_TAG_TYPE_FLOAT) :
                    m_out << BamTools::UnpackFloat(&tagData[index]);
                    index += sizeof(float);
                    break;
                
                case (Constants::BAM_TAG_TYPE_HEX)    :
                case (Constants::BAM_TAG_TYPE_STRING) :
                    m_out << "\""; 
                    while (tagData[index]) {
                        if (tagData[index] == '\"')
                            m_out << "\\\""; // escape for json
                        else
                            m_out << tagData[index];
                        ++index;
                    }
                    m_out << "\""; 
                    ++index; 
                    break;      
            }
            
            if ( tagData[index] == '\0') 
                break;
        }

        m_out << "}";
    }

    m_out << "}" << endl;
}
Ejemplo n.º 6
0
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"arguments:\n"+
			"\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    //dummy reader, will need to reposition anyway
    VCFreader vcfr (vcffiletopen,
 		    vcffiletopen+".tbi",
 		    chrname,
 		    1,
 		    1,
 		    0);

    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex()  ) {
    // 	cerr << "The index for the BAM file cannot be located" << endl;
    // 	return 1;
    // }

    // if ( !reader.HasIndex()  ) {
    // 	cerr << "The BAM file has not been indexed." << endl;
    // 	return 1;
    // }

    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;			
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }
		    
		    //if the VCF has a at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one G but no A 
		    // if(toprint->hasAtLeastOneG() &&
		    //    toprint->getAlt().find("A") == string::npos){
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();

		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }


		    //if the VCF has at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one C but no T
		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }

		}

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Ejemplo n.º 7
0
 bool check(const PropertyFilter& filter, const BamAlignment& al) {
   
     bool keepAlignment = true;
     const PropertyMap& properties = filter.Properties;
     PropertyMap::const_iterator propertyIter = properties.begin();
     PropertyMap::const_iterator propertyEnd  = properties.end();
     for ( ; propertyIter != propertyEnd; ++propertyIter ) {
       
         // check alignment data field depending on propertyName
         const string& propertyName = (*propertyIter).first;
         const PropertyFilterValue& valueFilter = (*propertyIter).second;
         
         if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
         else if ( propertyName == CIGAR_PROPERTY ) {
             stringstream cigarSs;
             const vector<CigarOp>& cigarData = al.CigarData;
             if ( !cigarData.empty() ) {
                 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
                 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
                 vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
                 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
                     const CigarOp& op = (*cigarIter);
                     cigarSs << op.Length << op.Type;
                 }
                 keepAlignment &= valueFilter.check(cigarSs.str());
             }
         }
         else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
         else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
         else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
         else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
         else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
         else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
         else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
         else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
         else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
         else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
         else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
         else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
         else if ( propertyName == ISSINGLETON_PROPERTY ) {
             const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
             keepAlignment &= valueFilter.check(isSingleton);
         }
         else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
         else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
         else if ( propertyName == MATEREFERENCE_PROPERTY ) {
             if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
             BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
             const string& refName = filterToolReferences.at(al.MateRefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
         else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
         else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
         else if ( propertyName == REFERENCE_PROPERTY ) {
             BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
             const string& refName = filterToolReferences.at(al.RefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
         else BAMTOOLS_ASSERT_UNREACHABLE;
         
         // if alignment fails at ANY point, just quit and return false
         if ( !keepAlignment ) return false;
     }
   
     BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
     return keepAlignment;
 }
Ejemplo n.º 8
0
int main (int argc, char *argv[]) {
    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<""<<endl;
	cout<<"plotQualScore input.bam"<<endl;
	return 1;
    }

    string bamfiletopen = string(argv[1]);
    BamReader reader;

    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex() ){
    // 	cerr << "warning: cannot locate index for file " << bamfiletopen<<endl;
    // 	//return 1;
    // }

    BamAlignment al;
    BamAlignment al2;
    
    bool unsurePEorSE=true;
    bool pe=true;
    int strLength=-1;
    int vecLengthToUse=-1;

    map<short,unsigned long>  ** counterA = 0;
    map<short,unsigned long>  ** counterC = 0;
    map<short,unsigned long>  ** counterG = 0;
    map<short,unsigned long>  ** counterT = 0;
    
    int lengthIndex1=0;
    int lengthIndex2=0;
    string seqInd1;
    string seqInd2;
    string qualInd1;
    string qualInd2;
    int offsetInd2;

    while ( reader.GetNextAlignment(al) ) {
	if(unsurePEorSE){
	    strLength=al.QueryBases.length();
	    if(al.IsPaired()){
		pe=true;
		vecLengthToUse=2*strLength;		
	    }else{
		pe=false;
		vecLengthToUse=strLength;
	    }
	    string index1;
	    string index2;
	
	    if(al.HasTag("XI")){
		al.GetTag("XI",index1);
		vecLengthToUse+=index1.length();
		lengthIndex1=index1.length();
	    }

	    if(al.HasTag("XJ")){
		al.GetTag("XJ",index2);
		vecLengthToUse+=index2.length();
		lengthIndex2=index2.length();
	    }

	    counterA     = new map<short,unsigned long>  * [vecLengthToUse];
	    counterC     = new map<short,unsigned long>  * [vecLengthToUse];
	    counterG     = new map<short,unsigned long>  * [vecLengthToUse];
	    counterT     = new map<short,unsigned long>  * [vecLengthToUse];
	    for(int i=0;i<vecLengthToUse;i++){
		counterA[i]=new map<short,unsigned long>  ();
		counterC[i]=new map<short,unsigned long>  ();
		counterG[i]=new map<short,unsigned long>  ();
		counterT[i]=new map<short,unsigned long>  ();
		for(short k=minQualScore;k<=maxQualScore;k++){
		    (*counterA[i])[k]=0;
		    (*counterC[i])[k]=0;
		    (*counterG[i])[k]=0;
		    (*counterT[i])[k]=0;
		}
	    }
	    unsurePEorSE=false;
	}else{
	    if(pe  &&
	       !al.IsPaired()){
		cerr << "Cannot have unpaired reads in PE mode" << endl;
		return 1;
	    }

	    if(!pe  &&
	       al.IsPaired()){
		cerr << "Cannot have unpaired reads in SE mode" << endl;
		return 1;
	    }
	}
	
	if(al.QueryBases.length() !=  al.Qualities.length()){
	    cerr << "Cannot have different lengths for sequence and quality" << endl;
	    return 1;
	}
	if(int(al.QueryBases.length()) !=  strLength){
	    cerr << "Cannot have different lengths for sequence and quality" << endl;
	    return 1;
	}

	if(pe){
	    if(al.IsFirstMate()){
		reader.GetNextAlignment(al2);
		if(al2.QueryBases.length() !=  al2.Qualities.length()){
		    cerr << "Cannot have different lengths for sequence and quality" << endl;
		    return 1;
		}

	    }else{
		cerr << "First read should be the first mate" << endl;
		return 1;	    
	    }
	}



	//cycle
	for(unsigned int i=0;i<al.QueryBases.length();i++){
	    short x=(short(al.Qualities[i])-qualOffset);
	    if(al.QueryBases[i] == 'A'){
	    	(*counterA[i])[x]++;
	    }
	    if(al.QueryBases[i] == 'C'){
	    	(*counterC[i])[x]++;
	    }
	    if(al.QueryBases[i] == 'G'){
	    	(*counterG[i])[x]++;
	    }
	    if(al.QueryBases[i] == 'T'){
	    	(*counterT[i])[x]++;
	    }
	}

	//The indices for al and al2 should hopefully be the same 
	if(lengthIndex1>0){
	    al.GetTag("XI",seqInd1);
	    al.GetTag("YI",qualInd1);
	    int j;

	    for(int i=0;i<lengthIndex1;i++){
		j=i+al.QueryBases.length();
		short x=(short(qualInd1[i])-qualOffset);
		if(seqInd1[i] == 'A'){
		    (*counterA[j])[x]++;
		}
		if(seqInd1[i] == 'C'){
		    (*counterC[j])[x]++;
		}
		if(seqInd1[i] == 'G'){
		    (*counterG[j])[x]++;
		}
		if(seqInd1[i] == 'T'){
		    (*counterT[j])[x]++;
		}
	    }
	}

	if(pe){
	    offsetInd2=al.QueryBases.length()+lengthIndex1+al2.QueryBases.length();
	    int j;
	    for(unsigned int i=0;i<al2.QueryBases.length();i++){
		j=i+al.QueryBases.length()+lengthIndex1;
		short x=(short(al2.Qualities[i])-qualOffset);
		if(al2.QueryBases[i] == 'A'){
		    (*counterA[j])[x]++;
		}
		if(al2.QueryBases[i] == 'C'){
		    (*counterC[j])[x]++;
		}
		if(al2.QueryBases[i] == 'G'){
		    (*counterG[j])[x]++;
		}
		if(al2.QueryBases[i] == 'T'){
		    (*counterT[j])[x]++;
		}
	    }
	}else{
	    offsetInd2=al.QueryBases.length()+lengthIndex1;
	}

	//The indices for al and al2 should hopefully be the same 
	if(lengthIndex2>0){
	    al.GetTag("XJ",seqInd2);
	    al.GetTag("YJ",qualInd2);
	    int j;

	    for(int i=0;i<lengthIndex2;i++){
		j=offsetInd2+i;
		short x=(short(qualInd2[i])-qualOffset);
		if(seqInd2[i] == 'A'){
		    (*counterA[j])[x]++;
		}
		if(seqInd2[i] == 'C'){
		    (*counterC[j])[x]++;
		}
		if(seqInd2[i] == 'G'){
		    (*counterG[j])[x]++;
		}
		if(seqInd2[i] == 'T'){
		    (*counterT[j])[x]++;
		}
	    }
	}

    }
    reader.Close();

    cout<<"cycle\t"<<"nuc\t";
    for(short k=minQualScore;k<maxQualScore;k++){
	cout<<k<<"\t";
    }
    cout<<maxQualScore<<endl;

    for(int i=0;i<vecLengthToUse;i++){
	cout<<(i+1)<<"\t";
	cout<<"A\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterA[i])[k]<<"\t";
	}
	cout<<(*counterA[i])[maxQualScore]<<endl;
	cout<<(i+1)<<"\t";
	cout<<"C\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterC[i])[k]<<"\t";
	}
	cout<<(*counterC[i])[maxQualScore]<<endl;
	cout<<(i+1)<<"\t";
	cout<<"G\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterG[i])[k]<<"\t";
	}
	cout<<(*counterG[i])[maxQualScore]<<endl;
	cout<<(i+1)<<"\t";
	cout<<"T\t";
	for(short k=minQualScore;k<maxQualScore;k++){
	    cout<<(*counterT[i])[k]<<"\t";
	}
	cout<<(*counterT[i])[maxQualScore]<<endl;	
    }



    
    return 0;
}
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+
			"\nwhich is good if you have many reads in the bam file.\n"+			
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"\narguments:\n"+
			"\t"+"--bq    [base qual]  : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\t"+"--1000g [vcf file]   : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

        if(string(argv[i]) == "--1000g"){
	    vcf1000g=string(argv[i+1]);
            i++;
            continue;
	}



    }

    unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19
    bool * hasCnoT;
    bool * hasGnoA;
    bool * thousandGenomesHasA;
    bool * thousandGenomesHasT;

    cerr<<"Trying to allocating memory"<<endl;
    try{
	hasCnoT              = new bool[ maxSizeChromosome ];
	hasGnoA              = new bool[ maxSizeChromosome ];
	thousandGenomesHasA  = new bool[ maxSizeChromosome ];
	thousandGenomesHasT  = new bool[ maxSizeChromosome ];
    }catch(bad_alloc& exc){
	cerr<<"ERROR: allocating memory failed"<<endl;
	return 1;
    }
    cerr<<"Success in allocating memory"<<endl;

    for(unsigned int i = 0;i<maxSizeChromosome;i++){
	hasCnoT[i]=false;
	hasGnoA[i]=false;
	thousandGenomesHasA[i]=false;
	thousandGenomesHasT[i]=false;
    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    cerr<<"Reading consensus VCF "<<vcffiletopen<<"  ... "<<endl;

    VCFreader vcfr (vcffiletopen,
 		    // vcffiletopen+".tbi",
 		    // chrname,
 		    // 1,
 		    // maxSizeChromosome,
 		    0);

    
    while(vcfr.hasData()){
	SimpleVCF * toprint=vcfr.getData();

	if(toprint->getRef().length() != 1 )
	    continue;
	

	//if the VCF has a at least one G but no A
	if(  toprint->hasAtLeastOneG() && 
	     !toprint->hasAtLeastOneA() ){
	    hasGnoA[ toprint->getPosition() ] =true;
	}

	if(  toprint->hasAtLeastOneC() && 
	     !toprint->hasAtLeastOneT() ){
	    hasCnoT[ toprint->getPosition() ] =true;    
	}

	   
    }
    cerr<<"done reading VCF"<<endl;







    cerr<<"Reading 1000g VCF :"<<vcf1000g<<"  ..."<<endl;
    string line1000g;
    ifstream myFile1000g;
    
    myFile1000g.open(vcf1000g.c_str(), ios::in);

    if (myFile1000g.is_open()){
	while ( getline (myFile1000g,line1000g)){
	    vector<string> fields=allTokens(line1000g,'\t');
	    //0 chr
	    //1 pos
	    //2 id
	    //3 ref
	    //4 alt

	    //check if same chr
	    if(fields[0] != chrname){
		cerr <<"Error, wrong chromosome in 1000g file for line=  "<<line1000g<<endl;
		return 1;
	    }

	    //skip indels
	    if(fields[3].size() != 1 ||
	       fields[4].size() != 1 )
		continue;

	    char         ref=toupper(fields[3][0]);
	    char         alt=toupper(fields[4][0]);
	    unsigned int pos=destringify<unsigned int>( fields[1] );
	    
	    thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') );
	    thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') );
	    
	}
	myFile1000g.close();
    }else{
	cerr <<"Unable to open file "<<vcf1000g<<endl;
	return 1;
    }



    cerr<<"done reading 1000g VCF"<<endl;











    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }


    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+1] && 
		    !thousandGenomesHasA[al.Position+1] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;			
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }
		    
		//     //if the VCF has a at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+2] && 
		   !thousandGenomesHasA[al.Position+2] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);


		// vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one G but no A 
		//     // if(toprint->hasAtLeastOneG() &&
		//     //    toprint->getAlt().find("A") == string::npos){
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasGnoA[positionJump] && 
		   !thousandGenomesHasA[positionJump] )
		    isDeaminated=true; 


		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();

		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }


		//     //if the VCF has at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		// transformRef(&refeBase,&readBase);

		if(hasCnoT[al.Position+1]  && 
		   !thousandGenomesHasT[al.Position+1] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one C but no T
		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }

		// }

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		//transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();
    delete(hasCnoT);
    delete(hasGnoA);

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Ejemplo n.º 10
0
int main (int argc, char *argv[]) {

    bool mapped  =false;
    bool unmapped=false;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "This program takes a BAM file as input and produces\n"+
			      "another where the putative deaminated bases have\n"+
			      "have been cut\n"+
			      "\n"+
			      "Options:\n");
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    // for(int i=1;i<(argc-1);i++){ //all but the last arg

    // 	if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){
    // 	    mapped=true;
    // 	    continue;
    // 	}

    // 	if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){
    // 	    unmapped=true;
    // 	    continue;
    // 	}
       
    // 	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    // 	return 1;
    // }

    if(argc != 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;
			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);			 
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			}
		    }else{
			int indexToCheck;
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			}
		    }
		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //second to last
			    indexToCheck=al.QueryBases.length()-2;
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    }else{
				//last
				indexToCheck=al.QueryBases.length()-1;
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				    al.Qualities  = al.Qualities.substr(0, indexToCheck);
				}
			    }
			}else{
			    int indexToCheck;
			    //second base
			    indexToCheck=1;
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    }else{
				//first base
				indexToCheck=0;
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				    al.Qualities  = al.Qualities.substr(indexToCheck+1);
				}
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;



		    //second base
		    indexToCheck=1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"51 "<<al.QueryBases<<endl;
			// cout<<"51 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"52 "<<al.QueryBases<<endl;
			// cout<<"52 "<<al.Qualities<<endl;
		    }else{
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"61 "<<al.QueryBases<<endl;
			    // cout<<"61 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    // cout<<"62 "<<al.QueryBases<<endl;
			    // cout<<"62 "<<al.Qualities<<endl;
			}

		    }


		    //last
		    indexToCheck=al.QueryBases.length()-1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"21 "<<al.QueryBases<<endl;
			// cout<<"21 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"22 "<<al.QueryBases<<endl;
			// cout<<"22 "<<al.Qualities<<endl;
		    }
		}else{

		    int indexToCheck;
		    //first base
		    indexToCheck=0;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);

			// cout<<"11 "<<al.QueryBases<<endl;
			// cout<<"11 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"12 "<<al.QueryBases<<endl;
			// cout<<"12 "<<al.Qualities<<endl;

		    }

		    //second to last
		    indexToCheck=al.QueryBases.length()-2;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"31 "<<al.QueryBases<<endl;
			// cout<<"31 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"32 "<<al.QueryBases<<endl;
			// cout<<"32 "<<al.Qualities<<endl;
		    }else{

			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"41 "<<al.QueryBases<<endl;
			    // cout<<"41 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    // cout<<"42 "<<al.QueryBases<<endl;
			    // cout<<"42 "<<al.Qualities<<endl;
			}

		    }
		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();
   
    return 0;
}
Ejemplo n.º 11
0
int main (int argc, char *argv[]) {


    bool produceUnCompressedBAM=false;
    bool verbose=false;
    bool ancientDNA=false;
    bool keepOrig=false;

    string adapter_F=options_adapter_F_BAM;
    string adapter_S=options_adapter_S_BAM;
    string adapter_chimera=options_adapter_chimera_BAM;
    string key="";
    bool allowMissing=false;
    int trimCutoff=1;

    bool allowAligned=false;
    bool printLog=false;
    string logFileName;

    BamReader reader;
    BamWriter writer;

    string bamFile;
    string bamFileOUT="";

    string key1;
    string key2;
    
    bool useDist=false;
    double location=-1.0;
    double scale   =-1.0;

    bool fastqFormat=false;
    string fastqfile1   = "";
    string fastqfile2   = "";
    string fastqoutfile = "";
    bool singleEndModeFQ=true;

    const string usage=string(string(argv[0])+
			      
			      " [options] BAMfile"+"\n"+
			      "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+

			      "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+			      
			      "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+
			      "\t\t"+"-fq2" +"\t\t"+"Second  fastq file (for paired-end)"+"\n"+
			      "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+
			      //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+
			      "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+


			      "\t"+"-u            " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+

			      //	"\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+
			      //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+
			      "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+
			      "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+
			      "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+
			      
			      "\n\t"+"Paired End merging/Single Read trimming  options"+"\n"+
			      "\t\t"+"You can specify either:"+"\n"+
			      "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+
			      "\t\t"+"            "+"\t\t\t\t"+"this allows for partial overlap"+"\n"+
			      "\n\t\t"+"or if you know your size length distribution:"+"\n"+
			      "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+
			      "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+
			      //			      "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n"
			      "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged  (default "+boolStringify(keepOrig)+")"+"\n"+
			      "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n"
			      "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+
			      "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. (default "+boolStringify(allowMissing)+")"+"\n"+
			      "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")");

    if( (argc== 1) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
    	cout<<"Usage:"<<endl;
    	cout<<""<<endl;
    	cout<<usage<<endl;
    	return 1;
    }

    

    for(int i=1;i<(argc-1);i++){ //all but the last arg

	if(strcmp(argv[i],"-fq1") == 0 ){
	    fastqfile1=string(argv[i+1]);
	    fastqFormat=true;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-fq2") == 0 ){
	    fastqfile2=string(argv[i+1]);
	    fastqFormat=true;
	    singleEndModeFQ=false;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-fqo") == 0 ){
	    fastqoutfile=string(argv[i+1]);
	    fastqFormat=true;
	    i++;
	    continue;
	}




	if(strcmp(argv[i],"--log") == 0 ){
	    logFileName =string(argv[i+1]);
	    printLog=true;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){
	    cerr<<"This version no longer works with pipe, exiting"<<endl;
	    return 1;	    
	}

	if(strcmp(argv[i],"-u") == 0  ){
	    produceUnCompressedBAM=true;
	    continue;
	}

	if(strcmp(argv[i],"--aligned") == 0  ){
	    allowAligned=true;
	    continue;
	}



	if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){
	    bamFileOUT =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){
	    verbose=true;
	    continue;
	}

	if(strcmp(argv[i],"--ancientdna") == 0 ){
	    ancientDNA=true;
	    continue;
	}

	if(strcmp(argv[i],"--keepOrig") == 0 ){
	    keepOrig=true;
	    continue;
	}

	if(strcmp(argv[i],"--loc") == 0 ){
	    location =destringify<double>(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"--scale") == 0 ){
	    scale =destringify<double>(argv[i+1]);
	    i++;
	    continue;
	}



	if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){
	    adapter_F =string(argv[i+1]);
	    i++;
	    continue;
	}


	if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){
	    adapter_S =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-c") == 0 || strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){
	    adapter_chimera =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){
	    key =string(argv[i+1]);
	    i++;
	    continue;
	}
	

	if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){
	    allowMissing=true;
	    continue;
	}

	if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){
	    trimCutoff=atoi(argv[i+1]);
	    i++;
	    continue;
	}
	
	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
	return 1;	    
    }

    bamFile=argv[argc-1];

    if( (location != -1.0 && scale == -1.0) ||
	(location == -1.0 && scale != -1.0) ){
	cerr<<"Cannot specify --location without specifying --scale"<<endl;
	return 1;	    
    }
	
    if( (location != -1.0 && scale != -1.0) ){
	useDist=true;
	    
	if(ancientDNA){
	    cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl;
	    return 1;	    
	}
    }
    
    MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera,
			key1,key2,
			trimCutoff,allowMissing,ancientDNA,location,scale,useDist);

    fqwriters onereadgroup;

    if(fastqFormat){
	
	if( bamFileOUT != ""  || produceUnCompressedBAM || allowAligned){
	    cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl;
	    return 1;
	}

	if(fastqfile1 == ""){
	    cerr<<"ERROR : Must specify as least the first file for fastq"<<endl;
	    return 1;	    
	}



	FastQParser * fqp1;
	FastQParser * fqp2;

	if(singleEndModeFQ){
	    fqp1 = new FastQParser (fastqfile1);

	    string outdirs   = fastqoutfile+".fq.gz";
	    string outdirsf  = fastqoutfile+".fail.fq.gz";

	    onereadgroup.single.open(outdirs.c_str(), ios::out);
	    onereadgroup.singlef.open(outdirsf.c_str(), ios::out);

	    if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
	    if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }

	    
	}else{
	    fqp1 = new FastQParser (fastqfile1);
	    fqp2 = new FastQParser (fastqfile2);

	    string outdirs   = fastqoutfile+".fq.gz";
	    string outdir1   = fastqoutfile+"_r1.fq.gz";
	    string outdir2   = fastqoutfile+"_r2.fq.gz";

	    string outdirsf  = fastqoutfile+".fail.fq.gz";
	    string outdir1f  = fastqoutfile+"_r1.fail.fq.gz";
	    string outdir2f  = fastqoutfile+"_r2.fail.fq.gz";

	    onereadgroup.single.open(outdirs.c_str(), ios::out);
	    onereadgroup.pairr1.open(outdir1.c_str(), ios::out);
	    onereadgroup.pairr2.open(outdir2.c_str(), ios::out);

	    onereadgroup.singlef.open(outdirsf.c_str(), ios::out);
	    onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out);
	    onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out);

	    if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
	    if(!onereadgroup.pairr1.good()){       cerr<<"Cannot write to file "<<outdir1<<endl; return 1; }
	    if(!onereadgroup.pairr2.good()){       cerr<<"Cannot write to file "<<outdir2<<endl; return 1; }
	    
	    if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }
	    if(!onereadgroup.pairr1f.good()){      cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; }
	    if(!onereadgroup.pairr2f.good()){      cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; }
	    
	}


	unsigned int totalSeqs=0;
	while(fqp1->hasData()){

	    FastQObj * fo1=fqp1->getData();
	    vector<string> def1=allTokens( *(fo1->getID()), ' '  );
	    string def1s=def1[0];
	

	    FastQObj * fo2;
	    string def2s;
	    string ext2s;

	    if(!singleEndModeFQ){
		if(!fqp2->hasData()){
		    cerr << "ERROR: Discrepency between fastq files at record " <<  *(fo1->getID()) <<endl;
		    return 1;
		}

		fo2=fqp2->getData();
		vector<string> def2=allTokens( *(fo2->getID()), ' ' );
		def2s=def2[0];




		if(strEndsWith(def1s,"/1")){
		    def1s=def1s.substr(0,def1s.size()-2);
		}
		if(strEndsWith(def2s,"/2")){
		    def2s=def2s.substr(0,def2s.size()-2);
		}

		if(strBeginsWith(def1s,"@")){
		    def1s=def1s.substr(1,def1s.size()-1);
		}
		if(strBeginsWith(def2s,"@")){
		    def2s=def2s.substr(1,def2s.size()-1);
		}


		if(def1s != def2s){
		    cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl;
		    return 1;
		}

		merged result=	mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()),
					       *(fo2->getSeq()),*(fo2->getQual()));

		mtr.incrementCountall();

		if(result.code != ' '){ //keys or chimeras

		    if(result.code == 'K'){
			mtr.incrementCountfkey();
		    }else{
			if(result.code == 'D'){
			    mtr.incrementCountchimera();
			}else{
			    cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
			    exit(1);
			}
		    }
			
		    onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
		    onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		    continue;

		}else{
		        if(result.sequence != ""){ //new sequence			    
			    onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;    	    

			    if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){
				mtr.incrementCountmergedoverlap();
			    }else{
				mtr.incrementCountmerged();			  
			    }

			}else{ //keep as is
			    mtr.incrementCountnothing();

			    onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
			    onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
			    

			}
		}

	    }else{
		
		
		merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual()));
		mtr.incrementCountall();

		if(result.code != ' '){ //either chimera or missing key

		    if(result.code == 'K'){
			mtr.incrementCountfkey();
		    }else{
			if(result.code == 'D'){
			    mtr.incrementCountchimera();
			}else{
			    cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
			    exit(1);
			}
		    }

		    onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		    continue;
		}

		if(result.sequence != ""){ //new sequence
		    mtr.incrementCounttrimmed();
		    onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;
		}else{
		    mtr.incrementCountnothing();
		    onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		}

	    }
	


	    totalSeqs++;
	}
    
	delete fqp1;
	if(!singleEndModeFQ){
	    delete fqp2;
	}

	if(singleEndModeFQ){

	    onereadgroup.single.close();
	    onereadgroup.singlef.close();
	    
	}else{
	    onereadgroup.single.close();
	    onereadgroup.pairr1.close();
	    onereadgroup.pairr2.close();
	    
	    onereadgroup.singlef.close();
	    onereadgroup.pairr1f.close();
	    onereadgroup.pairr2f.close();
	}
    
	//fastq
    }else{
	//else BAM


	//  initMerge();
	//     set_adapter_sequences(adapter_F,
	// 			  adapter_S,
	// 			  adapter_chimera);
	//     set_options(trimCutoff,allowMissing,mergeoverlap);
	if(key != ""){
	    size_t found=key.find(",");
	    if (found == string::npos){ //single end reads
		key1=key;
		key2="";
	    } else{                     //paired-end
		key1=key.substr(0,found);
		key2=key.substr(found+1,key.length()-found+1);
	    }
	}






	if( bamFileOUT == ""  ){
	    cerr<<"The output must be a be specified, exiting"<<endl;
	    return 1;
	}

	if ( !reader.Open(bamFile) ) {
	    cerr << "Could not open input BAM file  "<<bamFile << endl;
	    return 1;
	}
	SamHeader header = reader.GetHeader();

    

	string pID          = "mergeTrimReadsBAM";   
	string pName        = "mergeTrimReadsBAM";   
	string pCommandLine = "";
	for(int i=0;i<(argc);i++){
	    pCommandLine += (string(argv[i])+" ");
	}
	putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".."));

	const RefVector references = reader.GetReferenceData();
	//we will not call bgzip with full compression, good for piping into another program to 
	//lessen the load on the CPU
	if(produceUnCompressedBAM) 
	    writer.SetCompressionMode(BamWriter::Uncompressed);

	if ( !writer.Open(bamFileOUT,header,references) ) {
	    cerr << "Could not open output BAM file "<<bamFileOUT << endl;
	    return 1;
	}



	SamHeader sh=reader.GetHeader();
	//Up to the user to be sure that a sequence is followed by his mate
	// if(!sh.HasSortOrder() || 
	//    sh.SortOrder != "queryname"){
	// 	cerr << "Bamfile must be sorted by queryname" << endl;
	// 	return 1;
	// }
    

	BamAlignment al;
	BamAlignment al2;
	bool al2Null=true;
    
	while ( reader.GetNextAlignment(al) ) {

	
	    if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD")  ){
		if(!allowAligned){
		    cerr << "Reads should not be aligned" << endl;
		    return 1;
		}else{
		    //should we remove tags ?
		}
	    }


	    if(al.IsPaired() && 
	       al2Null ){
		al2=al;
		al2Null=false;
		continue;
	    }else{
		if(al.IsPaired() && 
		   !al2Null){

		    bool  result =  mtr.processPair(al,al2);
		
		    if( result ){//was merged
			BamAlignment orig;
			BamAlignment orig2;

			if(keepOrig){
			    orig2 = al2;
			    orig  = al;
			}

			writer.SaveAlignment(al);

			if(keepOrig){
			    orig.SetIsDuplicate(true);
			    orig2.SetIsDuplicate(true);
			    writer.SaveAlignment(orig2);
			    writer.SaveAlignment(orig);
			}

			//the second record is empty
		    }else{
			//keep the sequences as pairs

			writer.SaveAlignment(al2);		    
			writer.SaveAlignment(al);
		    }

		    //
		    //  SINGLE END
		    //
		}else{ 
		    BamAlignment orig;
		    if(keepOrig){
			orig =al;
		    }
		    mtr.processSingle(al);

		    if(keepOrig){
			//write duplicate
			if(orig.QueryBases.length()  != al.QueryBases.length()){
			    orig.SetIsDuplicate(true);
			    writer.SaveAlignment(orig);
			}
		    }
		    writer.SaveAlignment(al);



		} //end single end
		al2Null=true;
	    }//second pair
		    

	} //while al
	reader.Close();
	writer.Close();


    } //else BAM


    cerr <<mtr.reportSingleLine()<<endl;

    if(printLog){
	ofstream fileLog;
	fileLog.open(logFileName.c_str());

	if (fileLog.is_open()){
	    fileLog <<mtr.reportMultipleLines() <<endl;

	}else{
	    cerr << "Unable to print to file "<<logFileName<<endl;
	}
	fileLog.close();
    }
    return 0;
}
Ejemplo n.º 12
0
int main (int argc, char *argv[]) {

    string usage=string(""+string(argv[0])+"  [in BAM file]"+
			"\nThis program reads a BAM file and computes the error rate for each cycle\n"+
			// "\nreads and the puts the rest into another bam file.\n"+
			// "\nTip: if you do not need one of them, use /dev/null as your output\n"+
			// "arguments:\n"+
			// "\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    // for(int i=1;i<(argc-2);i++){ 

	
    //     if(string(argv[i]) == "--bq"){
    // 	    minBaseQuality=destringify<int>(argv[i+1]);
    //         i++;
    //         continue;
    // 	}

    // }

    string bamfiletopen = string( argv[ argc-1 ] );
    // string deambam      = string( argv[ argc-2 ] );
    // string nondeambam   = string( argv[ argc-1 ] );


    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }








    //iterating over the alignments for these regions
    BamAlignment al;
    bool pairedEnd=false;
    bool firstRead=true;

    while ( reader.GetNextAlignment(al) ) {

	if(firstRead){ //reads are either all paired end or single end, I don't allow a mix
	    numberOfCycles=al.QueryBases.size();
	    // cout<<"numberOfCycles "<<numberOfCycles<<endl;
	    if(al.IsPaired() ){  
		pairedEnd=true;		
		matches    = vector<unsigned int> (2*numberOfCycles,0);
		mismatches = vector<unsigned int> (2*numberOfCycles,0);
		typesOfMismatches = vector< vector<unsigned int> >();
		for(int i=0;i<12;i++)
		    typesOfMismatches.push_back( vector<unsigned int> (2*numberOfCycles,0) );
	    }else{
		matches    = vector<unsigned int> (  numberOfCycles,0);
		mismatches = vector<unsigned int> (  numberOfCycles,0);
		typesOfMismatches = vector< vector<unsigned int> >();
		for(int i=0;i<12;i++)
		    typesOfMismatches.push_back( vector<unsigned int> (  numberOfCycles,0) );
	    }
	    firstRead=false;
	}

	if( (  pairedEnd && !al.IsPaired()) ||  
	    ( !pairedEnd &&  al.IsPaired())   ){
	    cerr<<"Read "<<al.Name<<" is wrong, cannot have a mixture of paired and unpaired read for this program"<<endl;
	    return 1;
	}


	//skip unmapped
	if(!al.IsMapped())
	    continue;

	if(numberOfCycles!=int(al.QueryBases.size())){
	    cerr<<"The length of read "<<al.Name<<" is wrong, should be "<<numberOfCycles<<"bp"<<endl;
	    return 1;
	}


	string reconstructedReference = reconstructRef(&al);
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}


	if( pairedEnd ){
	    if( al.IsFirstMate() ){ //start cycle 0

		if( al.IsReverseStrand()  ){ 
		    increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1
		}else{
		    increaseCounters(al,reconstructedReference,0               , 1); //start cycle 0
		}
       
	    }else{

		if( al.IsSecondMate()  ){ 
		    if( al.IsReverseStrand()  ){ 
			increaseCounters(al,reconstructedReference,2*numberOfCycles-1,-1); //start cycle 2*numberOfCycles-1
		    }else{
			increaseCounters(al,reconstructedReference,numberOfCycles    , 1); //start cycle numberOfCycles
		    }
		}else{
	    	    cerr<<"Reads "<<al.Name<<" must be either first or second mate"<<endl;
	    	    return 1;
	    	}
	    }	
	}else{ //single end 

	    if( al.IsReverseStrand()  ){ 
		increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1
	    }else{
		increaseCounters(al,reconstructedReference,0               , 1); //start cycle 0
	    }
	    
	}


    }//end while  each read
	


    reader.Close();

    cout<<"cycle\tmatches\tmismatches\tmismatches%\tA>C\tA>C%\tA>G\tA>G%\tA>T\tA>T%\tC>A\tC>A%\tC>G\tC>G%\tC>T\tC>T%\tG>A\tG>A%\tG>C\tG>C%\tG>T\tG>T%\tT>A\tT>A%\tT>C\tT>C%\tT>G\tT>G%"<<endl;



    for(unsigned int i=0;i<matches.size();i++){
cout<<(i+1);
	if( (matches[i]+mismatches[i]!=0) )
	    cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\t"<< 100.0*(double(mismatches[i])/double(matches[i]+mismatches[i])) ;
	else
	    cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\tNA";

	for(int j=0;j<12;j++){   
	    cout<<"\t"<<typesOfMismatches[j][i];
	    if( (matches[i]+mismatches[i]!=0) )
		cout<<"\t"<<100.0*double(typesOfMismatches[j][i])/double(matches[i]+mismatches[i]);
	    else
		cout<<"\tNA";
	}

	cout<<endl;
    }




   
    return 0;
}
Ejemplo n.º 13
0
	virtual void main()
	{
		//init
		QTextStream out(stdout);
		BamReader reader;
		NGSHelper::openBAM(reader, getInfile("in"));

		FastqOutfileStream out1(getOutfile("out1"), false);
		FastqOutfileStream out2(getOutfile("out2"), false);

		long long c_unpaired = 0;
		long long c_paired = 0;
		int max_cached = 0;

		//iterate through reads
		BamAlignment al;
		QHash<QByteArray, BamAlignment> al_cache;
		while (reader.GetNextAlignment(al))
		{
			//skip secondary alinments
			if(!al.IsPrimaryAlignment()) continue;

			//skip unpaired
			if(!al.IsPaired())
			{
				++c_unpaired;
				continue;
			}

			QByteArray name(al.Name.data()); //TODO use QByteArray::fromStdString (when upgraded to Qt5.4)

			//store cached read when we encounter the mate
			if (al_cache.contains(name))
			{
				BamAlignment mate = al_cache.take(name);
				//out << name << " [AL] First: " << al.IsFirstMate() << " Reverse: " << al.IsReverseStrand() << " Seq: " << al.QueryBases.data() << endl;
				//out << name << " [MA] First: " << mate.IsFirstMate() << " Reverse: " << mate.IsReverseStrand() << " Seq: " << mate.QueryBases.data() << endl;
				if (al.IsFirstMate())
				{
					write(out1, al, al.IsReverseStrand());
					write(out2, mate, mate.IsReverseStrand());
				}
				else
				{
					write(out1, mate, mate.IsReverseStrand());
					write(out2, al, al.IsReverseStrand());
				}
				++c_paired;
			}
			//cache read for later retrieval
			else
			{
				al_cache.insert(name, al);
			}

			max_cached = std::max(max_cached, al_cache.size());
		}
		reader.Close();
		out1.close();
		out2.close();

		//write debug output
		out << "Pair reads (written)            : " << c_paired << endl;
		out << "Unpaired reads (skipped)        : " << c_unpaired << endl;
		out << "Unmatched paired reads (skipped): " << al_cache.size() << endl;
		out << endl;
		out << "Maximum cached reads            : " << max_cached << endl;
	}
Ejemplo n.º 14
0
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // load the BAM header references into a BEDTools "genome file"
    _genome = new GenomeFile(refs);
    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // skip if the read is unaligned
        if (bam.IsMapped() == false)
            continue;

        bool _isReverseStrand = bam.IsReverseStrand();

        //changing second mate's strand to opposite
        if( _dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate())
            _isReverseStrand = !bam.IsReverseStrand();

        // skip if we care about strands and the strand isn't what
        // the user wanted
        if ( (_filterByStrand == true) &&
             ((_requestedStrand == "-") != _isReverseStrand) )
            continue;

        // extract the chrom, start and end from the BAM alignment
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end = bam.GetEndPosition(false, false) - 1;

        // are we on a new chromosome?
        if ( chrom != _currChromName )
            StartNewChrom(chrom);
        if(_pair_chip_) {
            // Skip if not a proper pair
            if (bam.IsPaired() && (!bam.IsProperPair() or !bam.IsMateMapped()) )
                continue;
            // Skip if wrong coordinates
            if( ( (bam.Position<bam.MatePosition) && bam.IsReverseStrand() ) ||
                ( (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand() ) ) {
                    //chemically designed: left on positive strand, right on reverse one
                    continue;
            }

            /*if(_haveSize) {
                if (bam.IsFirstMate() && bam.IsReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = bam.MatePosition+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
                else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = start+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
            } else */

            if (bam.IsFirstMate() && bam.IsReverseStrand()) { //prolong to the mate to the left
                AddCoverage(bam.MatePosition, end);
            }
            else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //prolong to the mate to the right
                AddCoverage(start, start + abs(bam.InsertSize) - 1);
            }
        } else if (_haveSize) {
            if(bam.IsReverseStrand()) {
                if(end<_fragmentSize) { //sometimes fragmentSize is bigger :(
                    AddCoverage(0, end);
                } else {
                    AddCoverage(end + 1 - _fragmentSize, end );
                }
            } else {
                AddCoverage(start,start+_fragmentSize - 1);
            }
        } else
        // add coverage accordingly.
        if (!_only_5p_end && !_only_3p_end) {
            bedVector bedBlocks;
            // we always want to split blocks when a D CIGAR op is found.
            // if the user invokes -split, we want to also split on N ops.
            if (_obeySplits) { // "D" true, "N" true
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, true);
            }
            else { // "D" true, "N" false
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false);
            }
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            CHRPOS pos = ( !bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
        else if (_only_3p_end) {
            CHRPOS pos = ( bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
    }
    // close the BAM
    reader.Close();

    // process the results of the last chromosome.
    ReportChromCoverage(_currChromCoverage, _currChromSize,
            _currChromName, _currChromDepthHist);

    // report all empty chromsomes
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
Ejemplo n.º 15
0
int main (int argc, char *argv[]) {

    // bool mapped  =false;
    // bool unmapped=false;
    int bpToDecrease5=1;
    int bpToDecrease3=2;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "\tThis program takes a BAM file as input and produces\n"+
			      "\tanother where the putative deaminated bases have\n"+
			      "\ta base quality score of "+intStringify(baseQualForDeam)+"\n"+
			      "\tgiven an "+intStringify(offset)+" offset \n"+
			      "\n"+
			      "\tOptions:\n"+
			      "\t\t"+"-n5" +"\t\t\t"+"Decrease the nth bases surrounding the 5' ends (Default:"+stringify(bpToDecrease5)+") "+"\n"+
			      "\t\t"+"-n3" +"\t\t\t"+"Decrease the nth bases surrounding the 3' ends (Default:"+stringify(bpToDecrease3)+") "+"\n"
			      );
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    for(int i=1;i<(argc-2);i++){ //all but the last arg

    	if( string(argv[i]) == "-n5" ){
	    bpToDecrease5 = destringify<int>(argv[i+1]);
            i++;
    	    continue;
    	}

    	if( string(argv[i]) == "-n3" ){
	    bpToDecrease3 = destringify<int>(argv[i+1]);
            i++;
    	    continue;
    	}
       
    	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    	return 1;
    }

    if(argc < 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    if(inbamFile == outbamFile){
	cerr<<"Input and output files are the same"<<endl;
	return 1;    
    }
    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    SamHeader header = reader.GetHeader();
    string pID          = "decrQualDeaminated";   
    string pName        = "decrQualDeaminated";   
    string pCommandLine = "";
    for(int i=0;i<(argc);i++){
        pCommandLine += (string(argv[i])+" ");
    }
    putProgramInHeader(&header,pID,pName,pCommandLine);


    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;

			//5' of first mate reversed
			indexToCheck=al.QueryBases.length()-1;
			for(int i=0;i<bpToDecrease5;i++){			
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    }
			    indexToCheck=max(indexToCheck-1,0);
			}

			
		    }else{
			int indexToCheck;
			//5' of first mate
			indexToCheck=0;
			for(int i=0;i<bpToDecrease5;i++){ //first base			
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    }
			    indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
			}

		    }


		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //3' of second mate reversed
			    indexToCheck=al.QueryBases.length()-1;
			    for(int i=0;i<bpToDecrease3;i++){			
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				}
				indexToCheck=max(indexToCheck-1,0);
			    }
			    

			}else{
			    int indexToCheck;
			    
			    //3' of second mate forward
			    indexToCheck=0;
			    for(int i=0;i<bpToDecrease3;i++){ //first base			
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				}
				indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;

		    //5' of single read reversed
		    indexToCheck=al.QueryBases.length()-1;
		    for(int i=0;i<bpToDecrease5;i++){			
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=max(indexToCheck-1,0);
		    }

		    //3' of single read reversed
		    indexToCheck=0;
		    for(int i=0;i<bpToDecrease3;i++){ //first base			
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
		    }

		    

		}else{

		    int indexToCheck;
		    
		    //5' of single read
		    indexToCheck=0;
		    for(int i=0;i<bpToDecrease5;i++){ //first base			
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
		    }


		    //3' of single read
		    indexToCheck=al.QueryBases.length()-1;
		    for(int i=0;i<bpToDecrease3;i++){			
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=max(indexToCheck-1,0);
		    }



		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();

    cerr<<"Program terminated gracefully"<<endl;   
    return 0;
}
Ejemplo n.º 16
0
int main (int argc, char *argv[]) {

    if(argc != 4){
	cerr<<"This program strips the mapping information and cuts sequences"<<endl;
	cerr<<"Usage "<<argv[0]<<" [bam file in] [bam file out] [distribution]"<<endl;
	cerr<<"The distribution is one per line"<<endl;
	return 1;
    }

    string bamfiletopen  = string(argv[1]);
    string bamfiletwrite = string(argv[2]);
    string fileDist       = string(argv[3]);

    igzstream myFile;
    string line;
    vector<int> distToUse;
    myFile.open(fileDist.c_str(), ios::in);
    
    if (myFile.good()){
	while ( getline (myFile,line)){
	    distToUse.push_back(   destringify<int>(line) );
	}
	myFile.close();
    }else{
	cerr << "Unable to open file "<<fileDist<<endl;
	return 1;
    }
    cerr<<"Read "<<distToUse.size()<<" data points "<<endl;


    cerr<<"Reading "<<bamfiletopen<<" writing to "<<bamfiletwrite<<endl;

    BamReader reader;
    BamWriter writer;

    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }

    SamHeader  myHeader=reader.GetHeader();
    SamProgram sp;

    string pID          = "removeTagsMapping";   
    string pName        = "removeTagsMapping";   
    string pCommandLine = "";
    for(int i=0;i<(argc);i++){
	pCommandLine += (string(argv[i])+" ");
    }
    putProgramInHeader(&myHeader,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"."));

    //no @SQ
    myHeader.Sequences.Clear();
    vector< RefData > 	emptyRefVector;

    if( !writer.Open(bamfiletwrite,myHeader,emptyRefVector ) ) {
    	cerr << "Could not open output BAM file  "<<bamfiletwrite << endl;
    	return 1;	
    }

    unsigned int readsTotal=0;

    BamAlignment al;
    while ( reader.GetNextAlignment(al) ) {
	//deleting tag data
	al.TagData="";

	//reset the flag
	// if(al.IsPaired()){
	//     if(al.IsFirstMate()){
	// 	al.AlignmentFlag =  flagFirstPair;
	//     }else{
	// 	al.AlignmentFlag =  flagSecondPair;
	//     }
	// }else{
	// }

	if(al.IsPaired()){
	    if(al.IsFirstMate()){
		al.Name =  al.Name+"/1";
	    }else{
		al.Name =  al.Name+"/2";
	    }
	}

	al.AlignmentFlag =  flagSingleReads;


	//no ref or positon
	al.RefID=-1;
	al.MateRefID=-1;
	al.Position=-1;
	al.MatePosition=-1;
	//no insert size
	al.InsertSize=0;
	//no cigar
	al.CigarData.clear();
	//no mapping quality 
	al.MapQuality=0;
	int length = distToUse[ randomInt(0,distToUse.size()-1) ];
	length = MIN( length, al.Length);
	al.QueryBases = al.QueryBases.substr(0,length);
	al.Qualities  = al.Qualities.substr( 0,length);


	writer.SaveAlignment(al);
	readsTotal++;
    }

    reader.Close();
    writer.Close();

    cerr<<"Program "<<argv[0]<<" terminated gracefully, looked at "<<readsTotal<<endl;

    return 0;
}