Esempio n. 1
0
int main ( int argc, char *argv[] ) { 

  struct parameters *param = 0;
  param = interface(param, argc, argv);

  //region file input (the region file should be sorted as the same way as the bam file)
  ifstream region_f;
  region_f.open(param->region_f, ios_base::in);  // the region file is opened

  //bam input and generate index if not yet 
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  } else {
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';
          }
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {     // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          } else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }  //file or file name decided and stored in vector "fnames"

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }

  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  reader.Open(fnames);

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();

  if ( ! reader.LocateIndexes() )     // opens any existing index files that match our BAM files
    reader.CreateIndexes();         // creates index files for BAM files that still lack one


  // locus bias
  struct lb empty_profile = {0,0,0,0};
  vector <struct lb> locus_b(1000, empty_profile);
  // output locus bias file
  string locus_bias_set = param->lbias;
  ofstream locus_bias;
  if ( locus_bias_set != "" ) {
    locus_bias.open(param->lbias);
    if ( !locus_bias ) {
      cerr << "can not open locus_bias file.\n";
      exit(0);
    }
  }

  //should decide which chromosome
  string line;
  string old_chr = "SRP";
  string type = param->type;

  //whether do some position-level pile-up stuff
  bool posc = false;
  ofstream posc_f;
  ofstream chrmap_f;
  string poscset = param->posc;
  if ( poscset != "" ) {
    posc = true;
    posc_f.open(param->posc);
    chrmap_f.open(param->chrmap);
  }

  bool noChr;
  if ( param->nochr == 1 ){
    noChr = true;
  } else {
    noChr = false;
  }

  //regions for the input of region file
  deque <struct region> regions;

  getline(region_f, line); //get the first line
  eatline(line,regions,noChr);
  
  deque <struct region>::iterator it = regions.begin();

  while ( it->chr != old_chr ) {

    old_chr = it->chr;  // set the current chr as old chr

    int chr_id  = reader.GetReferenceID(it->chr);

    if ( chr_id == -1 ) {  //reference not found

      for (; it != regions.end() && it->chr == old_chr; ) {
        gene_processing(*it,locus_b);           // print the old region info
        it = regions.erase(it);         // erase the current region
      }
  
      while ( regions.empty() ) {    
        getline(region_f, line);
        if ( region_f.eof() ){
          cerr << "finished: end of region file, zone 0" << endl;
          break;
        }
        eatline(line, regions,noChr);
        it = regions.begin();
        if (it->chr == old_chr){  
          gene_processing(*it,locus_b);      
          regions.clear();
          continue;
        }
      }
      continue;
    }

    int chr_len = refs.at(chr_id).RefLength;

    if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region
      {
        cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl;
        reader.Close();
        exit(1);
      }

    //pile-up pos stats
    set <string> fragment;
    map <string, unsigned int> pileup;
    bool isposPileup = false;
    unsigned int old_start   = 0;
    unsigned int total_tags  = 0;
    unsigned int total_pos   = 0;
    unsigned int pileup_pos  = 0;


    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

      if ( bam.IsMapped() == false ) continue;   // skip unaligned reads

      unsigned int unique;
      bam.GetTag("NH", unique);
      if (param->unique == 1) {
        if (unique != 1) {                       // skipe uniquelly mapped reads
          continue;
        }
      }

      if (read_length == 0){
        read_length = bam.Length;
      }

      //cout << bam.Name << endl;
      string chrom = refs.at(bam.RefID).RefName;
      string strand = "+";
      if (bam.IsReverseStrand()) strand = "-";

      unsigned int alignmentStart =  bam.Position+1;
      unsigned int mateStart;
      if (type == "p") mateStart = bam.MatePosition+1;
      unsigned int alignmentEnd = bam.GetEndPosition();
      unsigned int cigarEnd;
      vector <int> blockLengths;
      vector <int> blockStarts;
      blockStarts.push_back(0);
      ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd);


      // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads)
      if (posc == true && unique == 1) {

        if (type == "p" && fragment.count(bam.Name) > 0) 
          fragment.erase(bam.Name);

        else {

          total_tags++;
          if (type == "p"){
            fragment.insert(bam.Name);
          }
          string alignSum;
          if (type == "p") {
             alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand;
          } else {
             alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand;
          }

          if ( alignmentStart != old_start ) {
            isposPileup = false;
            map <string, unsigned int>::iterator pit = pileup.begin();            
            for (; pit != pileup.end(); pit++) {
              posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl;     //print pileup
            }
            pileup.clear();           //clear pileup set
            pileup.insert( pair <string, unsigned int> (alignSum, 1) );  //insert the new read
            total_pos++;
          }

          else if ( alignmentStart == old_start ) { // same starts
            if ( pileup.count(alignSum) > 0 ) {  // pileup
              if ( pileup[alignSum] == 1 && isposPileup == false ) { 
                pileup_pos++; isposPileup = true;
              }
              pileup[alignSum]++;
            }
            else {
              pileup.insert( pair <string, unsigned int> (alignSum, 1) );
            }
          } //same starts

        }   //new fragment

        old_start = alignmentStart;
      } // do pos check



      float incre = 1.;
      if (blockStarts.size() > 1) incre = 0.5;     // incre half for junction reads
      incre /= static_cast < float >(unique);        // for multi aligned reads

      deque <struct region>::iterator iter = regions.begin();

      if ( iter->start > alignmentEnd ) continue;  // skip reads not overlapping with the first region

      while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) {

        if (iter->end < alignmentStart) {            // the region end is beyond the alignmentStart

          gene_processing(*iter,locus_b);            // processing
          iter = regions.erase(iter);                // this region should be removed
          if ( regions.empty() ) { 
            getline(region_f, line);                        // get a line of region file
            if ( ! region_f.eof() ) {
              eatline(line, regions, noChr);                         // eat a line and put it into the duque
              iter = regions.begin();
            }
            else {  // it's reaching the end of the region file
              cerr << "finished: end of region file, zone 3" << endl;
              break;
            }
          }
          continue;
        }

        if (iter->end >= alignmentStart && iter->start <= alignmentEnd) {  //overlapping, should take action

          vector <int>::iterator cigit = blockStarts.begin();
          for (; cigit != blockStarts.end(); cigit++) {
            unsigned int current_start = *cigit + alignmentStart;
            int current_pos = current_start - (iter->start);
            //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl;
            if ( (iter->tags).count(current_pos) > 0 ) {
              (iter->tags)[current_pos] += incre;
            }
            else
              (iter->tags).insert( pair<int, float>(current_pos, incre) );  
          }

        }  // overlapping take action!

        if ( (iter+1) != regions.end() )
          iter++;                                           // if this region is not the last element in the deque
        else {                                              // the last element
          getline(region_f, line);                          // get a line of region file
          if ( ! region_f.eof() ){
            eatline(line, regions, noChr);                         // eat a line and put it into the duque
            iter = regions.end();
            iter--;
          }
          else {  //it's reaching the end of the region file
            cerr << "finished: end of region file, zone 4" << endl;
            break;
          }
        }

      } //while

    }  // read a bam


    // print chr map
    if (posc == true) {
      chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl;
    } 
 
    //somehow to loop back
    it = regions.begin();                   //reset to begin
    for (; it != regions.end() && it->chr == old_chr; ) {
      gene_processing(*it,locus_b);              // print the old region info
      it = regions.erase(it);             // erase the current region
    }
  
    while ( regions.empty() ) {    

      getline(region_f, line);
      if ( region_f.eof() ){
        cerr << "finished: end of region file, zone 5" << endl;
        //print locus bias
        for (unsigned int l = 0; l < 1000; l++){
	  locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl;
	}
        exit(0);
      }
      eatline(line, regions, noChr);
      it = regions.begin();
      if (it->chr == old_chr){
        gene_processing(*it, locus_b);      
        regions.clear();
        continue;
      }
    }

  } // region chr != old chr
      
  regions.clear();
  reader.Close();
  region_f.close();
  return 0;

} //main
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nThis is like filterDeaminatedVCF but it loads the VCF before then labels the reads instead of doing it on the fly\n"+
			"\nwhich is good if you have many reads in the bam file.\n"+			
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"\narguments:\n"+
			"\t"+"--bq    [base qual]  : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\t"+"--1000g [vcf file]   : VCF file from 1000g to get the putative A and T positions in modern humans (Default: "+vcf1000g+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

        if(string(argv[i]) == "--1000g"){
	    vcf1000g=string(argv[i+1]);
            i++;
            continue;
	}



    }

    unsigned int maxSizeChromosome=250000000;//larger than chr1 hg19
    bool * hasCnoT;
    bool * hasGnoA;
    bool * thousandGenomesHasA;
    bool * thousandGenomesHasT;

    cerr<<"Trying to allocating memory"<<endl;
    try{
	hasCnoT              = new bool[ maxSizeChromosome ];
	hasGnoA              = new bool[ maxSizeChromosome ];
	thousandGenomesHasA  = new bool[ maxSizeChromosome ];
	thousandGenomesHasT  = new bool[ maxSizeChromosome ];
    }catch(bad_alloc& exc){
	cerr<<"ERROR: allocating memory failed"<<endl;
	return 1;
    }
    cerr<<"Success in allocating memory"<<endl;

    for(unsigned int i = 0;i<maxSizeChromosome;i++){
	hasCnoT[i]=false;
	hasGnoA[i]=false;
	thousandGenomesHasA[i]=false;
	thousandGenomesHasT[i]=false;
    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    cerr<<"Reading consensus VCF "<<vcffiletopen<<"  ... "<<endl;

    VCFreader vcfr (vcffiletopen,
 		    // vcffiletopen+".tbi",
 		    // chrname,
 		    // 1,
 		    // maxSizeChromosome,
 		    0);

    
    while(vcfr.hasData()){
	SimpleVCF * toprint=vcfr.getData();

	if(toprint->getRef().length() != 1 )
	    continue;
	

	//if the VCF has a at least one G but no A
	if(  toprint->hasAtLeastOneG() && 
	     !toprint->hasAtLeastOneA() ){
	    hasGnoA[ toprint->getPosition() ] =true;
	}

	if(  toprint->hasAtLeastOneC() && 
	     !toprint->hasAtLeastOneT() ){
	    hasCnoT[ toprint->getPosition() ] =true;    
	}

	   
    }
    cerr<<"done reading VCF"<<endl;







    cerr<<"Reading 1000g VCF :"<<vcf1000g<<"  ..."<<endl;
    string line1000g;
    ifstream myFile1000g;
    
    myFile1000g.open(vcf1000g.c_str(), ios::in);

    if (myFile1000g.is_open()){
	while ( getline (myFile1000g,line1000g)){
	    vector<string> fields=allTokens(line1000g,'\t');
	    //0 chr
	    //1 pos
	    //2 id
	    //3 ref
	    //4 alt

	    //check if same chr
	    if(fields[0] != chrname){
		cerr <<"Error, wrong chromosome in 1000g file for line=  "<<line1000g<<endl;
		return 1;
	    }

	    //skip indels
	    if(fields[3].size() != 1 ||
	       fields[4].size() != 1 )
		continue;

	    char         ref=toupper(fields[3][0]);
	    char         alt=toupper(fields[4][0]);
	    unsigned int pos=destringify<unsigned int>( fields[1] );
	    
	    thousandGenomesHasA[ pos ] = ( (ref=='A') || (alt=='A') );
	    thousandGenomesHasT[ pos ] = ( (ref=='T') || (alt=='T') );
	    
	}
	myFile1000g.close();
    }else{
	cerr <<"Unable to open file "<<vcf1000g<<endl;
	return 1;
    }



    cerr<<"done reading 1000g VCF"<<endl;











    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }


    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+1] && 
		    !thousandGenomesHasA[al.Position+1] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;			
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }
		    
		//     //if the VCF has a at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		if( hasGnoA[al.Position+2] && 
		   !thousandGenomesHasA[al.Position+2] )
		    isDeaminated=true; 


		// transformRef(&refeBase,&readBase);


		// vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     // cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one G but no A 
		//     // if(toprint->hasAtLeastOneG() &&
		//     //    toprint->getAlt().find("A") == string::npos){
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasGnoA[positionJump] && 
		   !thousandGenomesHasA[positionJump] )
		    isDeaminated=true; 


		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();

		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }


		//     //if the VCF has at least one G but no A
		//     if(  toprint->hasAtLeastOneG() && 
		// 	!toprint->hasAtLeastOneA() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		// transformRef(&refeBase,&readBase);

		if(hasCnoT[al.Position+1]  && 
		   !thousandGenomesHasT[al.Position+1] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //cout<<*toprint<<endl;
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;
		    
		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;			
		// 	cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     //if the VCF has at least one C but no T
		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }

		// }

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		//transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		if(hasCnoT[positionJump] && 
		   !thousandGenomesHasT[positionJump] )
		    isDeaminated=true; 

		// vcfr.repositionIterator(chrname,positionJump,positionJump);
		// while(vcfr.hasData()){
		//     SimpleVCF * toprint=vcfr.getData();
		//     //skip deletions in the alt
		//     if(toprint->getRef().length() != 1 )
		// 	continue;

		//     if(toprint->getRef()[0] != refeBase){
		// 	cerr<<reconstructedReference<<endl;
		// 	cerr<<al.Position<<endl;
		// 	cerr<<lengthMatches<<endl;
		// 	cerr<<numberOfDeletions(&al)<<endl;
		// 	cerr<<positionJump<<endl;
		// 	cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
		// 	exit(1);
		//     }

		//     if(  toprint->hasAtLeastOneC() && 
		// 	!toprint->hasAtLeastOneT() ){
		// 	isDeaminated=true; 
		//     }
		// }

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();
    delete(hasCnoT);
    delete(hasGnoA);

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
void StructuralVariations::findTranslocationsOnTheFly(string bamFileName, bool outtie, float meanCoverage, string outputFileHeader, map<string,int> SV_options) {
	size_t start = time(NULL);
	//open the bam file
	BamReader bamFile;
	bamFile.Open(bamFileName);
	//Information from the header is needed to initialize the data structure
	SamHeader head = bamFile.GetHeader();
	// now create Translocation on the fly
	Window *window;

	window = new Window(bamFileName,outtie,meanCoverage,outputFileHeader,SV_options);
	window->initTrans(head);
	//expands a vector so that it is large enough to hold reads from each contig in separate elements
	window->eventReads.resize(SV_options["contigsNumber"]);
	window->eventSplitReads.resize(SV_options["contigsNumber"]);

	window-> binnedCoverage.resize(SV_options["contigsNumber"]);
	window-> linksFromWin.resize(SV_options["contigsNumber"]);
	
	window -> numberOfEvents = 0;

	string line;
	string coverageFile=outputFileHeader+".tab";
	ifstream inputFile( coverageFile.c_str() );
	int line_number=0;
	while (std::getline( inputFile, line )){
		if(line_number > 0){
			vector<string> splitline;
    		std::stringstream ss(line);
    		std::string item;
    		while (std::getline(ss, item, '\t')) {
        		splitline.push_back(item);
    		}
			window -> binnedCoverage[window -> contig2position[splitline[0]]].push_back(atof(splitline[3].c_str()));
		}
		line_number += 1;
	}
	inputFile.close();


	//Initialize bam entity
	BamAlignment currentRead;
	//now start to iterate over the bam file
	int counter = 0;
	while ( bamFile.GetNextAlignmentCore(currentRead) ) {
	  if(currentRead.IsMapped()) {
	    window->insertRead(currentRead);
	  }
	}
	for(int i=0;i< window-> eventReads.size();i++){
	  if(window -> eventReads[i].size() >= window -> minimumPairs){
	    window->computeVariations(i);
	  }
	  window->eventReads[i]=queue<BamAlignment>();
	  window->eventSplitReads[i] = vector<BamAlignment>();
	}
	  
	window->interChrVariationsVCF.close();
	window->intraChrVariationsVCF.close();
	printf ("variant calling time consumption= %lds\n", time(NULL) - start);
}
Esempio n. 4
0
int IonstatsTestFragments(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bam_filename   = opts.GetFirstString('i', "input", "");
  string fasta_filename       = opts.GetFirstString('r', "ref", "");
  string output_json_filename = opts.GetFirstString('o', "output", "ionstats_tf.json");
  int histogram_length        = opts.GetFirstInt   ('h', "histogram-length", 400);

  if(argc < 2 or input_bam_filename.empty() or fasta_filename.empty()) {
    IonstatsTestFragmentsHelp();
    return 1;
  }

  //
  // Prepare for metric calculation
  //

  map<string,string> tf_sequences;
  PopulateReferenceSequences(tf_sequences, fasta_filename);


  BamReader input_bam;
  if (!input_bam.Open(input_bam_filename)) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str());
    return 1;
  }

  int num_tfs = input_bam.GetReferenceCount();


  SamHeader sam_header = input_bam.GetHeader();
  if(!sam_header.HasReadGroups()) {
    fprintf(stderr, "[ionstats] ERROR: no read groups in %s\n", input_bam_filename.c_str());
    return 1;
  }

  string flow_order;
  string key;
  for (SamReadGroupIterator rg = sam_header.ReadGroups.Begin(); rg != sam_header.ReadGroups.End(); ++rg) {
    if(rg->HasFlowOrder())
      flow_order = rg->FlowOrder;
    if(rg->HasKeySequence())
      key = rg->KeySequence;
  }


  // Need these metrics stratified by TF.

  vector<ReadLengthHistogram> called_histogram(num_tfs);
  vector<ReadLengthHistogram> aligned_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ10_histogram(num_tfs);
  vector<ReadLengthHistogram> AQ17_histogram(num_tfs);
  vector<SimpleHistogram> error_by_position(num_tfs);
  vector<MetricGeneratorSNR> system_snr(num_tfs);
  vector<MetricGeneratorHPAccuracy> hp_accuracy(num_tfs);

  for (int tf = 0; tf < num_tfs; ++tf) {
    called_histogram[tf].Initialize(histogram_length);
    aligned_histogram[tf].Initialize(histogram_length);
    AQ10_histogram[tf].Initialize(histogram_length);
    AQ17_histogram[tf].Initialize(histogram_length);
    error_by_position[tf].Initialize(histogram_length);
  }

  vector<uint16_t> flow_signal_fz(flow_order.length());
  vector<int16_t> flow_signal_zm(flow_order.length());

  const RefVector& refs = input_bam.GetReferenceData();

  // Missing:
  //  - hp accuracy - tough, copy verbatim from TFMapper?


  BamAlignment alignment;
  vector<char>  MD_op;
  vector<int>   MD_len;
  MD_op.reserve(1024);
  MD_len.reserve(1024);
  string MD_tag;

  //
  // Main loop over mapped reads in the input BAM
  //

  while(input_bam.GetNextAlignment(alignment)) {


    if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag))
      continue;

    // The check below eliminates unexpected alignments
    if (alignment.IsReverseStrand() or alignment.Position > 5)
      continue;

    int current_tf = alignment.RefID;

    //
    // Step 1. Parse MD tag
    //

    MD_op.clear();
    MD_len.clear();

    for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) {

      int item_length = 0;
      if (*MD_ptr >= '0' and *MD_ptr <= '9') {    // Its a match
        MD_op.push_back('M');
        for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr)
          item_length = 10*item_length + *MD_ptr - '0';
      } else {
        if (*MD_ptr == '^') {                     // Its a deletion
          MD_ptr++;
          MD_op.push_back('D');
        } else                                    // Its a substitution
          MD_op.push_back('X');
        for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr)
          item_length++;
      }
      MD_len.push_back(item_length);
    }

    //
    // Step 2. Synchronously scan through Cigar and MD, doing error accounting
    //

    int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0;
    int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0;
    int increment = alignment.IsReverseStrand() ? -1 : 1;

    int AQ10_bases = 0;
    int AQ17_bases = 0;
    int num_bases = 0;
    int num_errors = 0;

    while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) {

      if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar
        cigar_idx += increment;
        continue;
      }
      if (MD_len[MD_idx] == 0) { // Try advancing MD
        MD_idx += increment;
        continue;
      }

      // Match
      if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        num_bases += advance;
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Insertion (read has a base, reference doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'I') {
        int advance = alignment.CigarData[cigar_idx].Length;
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;

      // Deletion (reference has a base, read doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Substitution
      } else if (MD_op[MD_idx] == 'X') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position[current_tf].Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      } else {
        printf("ionstats tf: Unexpected OP combination: %s Cigar=%c, MD=%c !\n",
            alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]);
        break;
      }

      if (num_errors*10 <= num_bases)   AQ10_bases = num_bases;
      if (num_errors*50 <= num_bases)   AQ17_bases = num_bases;
    }

    //
    // Step 3. Profit
    //

    called_histogram[current_tf].Add(alignment.Length);
    aligned_histogram[current_tf].Add(num_bases);
    AQ10_histogram[current_tf].Add(AQ10_bases);
    AQ17_histogram[current_tf].Add(AQ17_bases);

    if(alignment.GetTag("ZM", flow_signal_zm))
      system_snr[current_tf].Add(flow_signal_zm, key.c_str(), flow_order);
    else if(alignment.GetTag("FZ", flow_signal_fz))
      system_snr[current_tf].Add(flow_signal_fz, key.c_str(), flow_order);


    // HP accuracy - keeping it simple

    if (!alignment.IsReverseStrand()) {

      string genome = key + tf_sequences[refs[current_tf].RefName];
      string calls = key + alignment.QueryBases;
      const char *genome_ptr = genome.c_str();
      const char *calls_ptr = calls.c_str();

      for (int flow = 0; flow < (int)flow_order.length() and *genome_ptr and *calls_ptr; ++flow) {
        int genome_hp = 0;
        int calls_hp = 0;
        while (*genome_ptr == flow_order[flow]) {
          genome_hp++;
          genome_ptr++;
        }
        while (*calls_ptr == flow_order[flow]) {
          calls_hp++;
          calls_ptr++;
        }
        hp_accuracy[current_tf].Add(genome_hp, calls_hp);
      }
    }
  }



  //
  // Processing complete, generate ionstats_tf.json
  //

  Json::Value output_json(Json::objectValue);
  output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL));
  output_json["meta"]["format_name"] = "ionstats_tf";
  output_json["meta"]["format_version"] = "1.0";

  output_json["results_by_tf"] = Json::objectValue;

  for (int tf = 0; tf < num_tfs; ++tf) {

    if (aligned_histogram[tf].num_reads() < 1000)
      continue;

    called_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["full"]);
    aligned_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["aligned"]);
    AQ10_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ10"]);
    AQ17_histogram[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["AQ17"]);
    error_by_position[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]["error_by_position"]);
    system_snr[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);
    hp_accuracy[tf].SaveToJson(output_json["results_by_tf"][refs[tf].RefName]);

    output_json["results_by_tf"][refs[tf].RefName]["sequence"] = tf_sequences[refs[tf].RefName];
  }

  input_bam.Close();

  ofstream out(output_json_filename.c_str(), ios::out);
  if (out.good()) {
    out << output_json.toStyledString();
    return 0;
  } else {
    fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str());
    return 1;
  }
}
Esempio n. 5
0
// generates mutiple sorted temp BAM files from single unsorted BAM file
bool SortTool::SortToolPrivate::GenerateSortedRuns(void) {
    
    // open input BAM file
    BamReader reader;
    if ( !reader.Open(m_settings->InputBamFilename) ) {
        cerr << "bamtools sort ERROR: could not open " << m_settings->InputBamFilename
             << " for reading... Aborting." << endl;
        return false;
    }
    
    // get basic data that will be shared by all temp/output files 
    SamHeader header = reader.GetHeader();
    header.SortOrder = ( m_settings->IsSortingByName
                       ? Constants::SAM_HD_SORTORDER_QUERYNAME
                       : Constants::SAM_HD_SORTORDER_COORDINATE );
    m_headerText = header.ToString();
    m_references = reader.GetReferenceData();
    
    // set up alignments buffer
    BamAlignment al;
    vector<BamAlignment> buffer;
    buffer.reserve( (size_t)(m_settings->MaxBufferCount*1.1) );
    bool bufferFull = false;
    
    // if sorting by name, we need to generate full char data
    // so can't use GetNextAlignmentCore()
    if ( m_settings->IsSortingByName ) {

        // iterate through file
        while ( reader.GetNextAlignment(al)) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {

                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // sorting by position, can take advantage of GNACore() speedup
    else {

        // iterate through file
        while ( reader.GetNextAlignmentCore(al) ) {

            // check buffer's usage
            bufferFull = ( buffer.size() >= m_settings->MaxBufferCount );

            // store alignments until buffer is "full"
            if ( !bufferFull )
                buffer.push_back(al);

            // if buffer is "full"
            else {

                // push any unmapped reads into buffer,
                // don't want to split these into a separate temp file
                if ( !al.IsMapped() )
                    buffer.push_back(al);

                // "al" is mapped, so create a sorted temp file with current buffer contents
                // then push "al" into fresh buffer
                else {
                    CreateSortedTempFile(buffer);
                    buffer.push_back(al);
                }
            }
        }
    }

    // handle any leftover buffer contents
    if ( !buffer.empty() )
        CreateSortedTempFile(buffer);
    
    // close reader & return success
    reader.Close();
    return true;
}
Esempio n. 6
0
int main (int argc, char** argv)
{

    // Print Commandline
    string ss(argv[0]);   // convert Char to String
    string commandline = "##Print Command line " + ss;

    int c;

    FastaReference* reference = NULL;
    int minbaseQ        = 10;   //default
    int windowlen       = 40;  //by default
    string regionstr;
    string RegionFile;
    string bamfile;
    bool STdin          = false;
    bool has_region     = false;
    bool has_regionFile = false;
    bool has_bamfile    = false;
    bool has_ref        = false;
    int ploidy         = 2;
    bool SetLowComplexityRegionSWGapExt = false;
    bool SetLowComplexityRegion = false;
   

    if (argc < 2)
    {
        printSummary(argv);
        exit(1);
    }

    while (true)
    {
        static struct option long_options[] =
        {
            {"help", no_argument, 0, 'h'},
            {"ploidy", required_argument, 0, 'p'},
            {"window-size", required_argument, 0, 'w'},
            {"reference", required_argument, 0, 'f'},
            {"min-base-quality", required_argument, 0,'q'},
            {"Region", required_argument, 0, 'R'},
            {"STdin", no_argument, 0, 's'},
            {"bam", required_argument, 0, 'b'},
            {"Repeat-Extgap", no_argument, 0, 'E'},
            {"LowCompex", no_argument, 0, 'l'},
            {0, 0, 0, 0}
        };

        int option_index = 0;

        c = getopt_long (argc, argv, "hslEf:q:w:s:r:R:p:b:", long_options, &option_index);

        /* Detect the end of the options. */
        if (c == -1)
            break;

        switch (c)
        {
            case 'f':
                reference = new FastaReference(optarg); // will exit on open failure
                commandline = commandline + " -f " + optarg;
                has_ref = true;
                break;

            case 'b':
                has_bamfile = true;
                bamfile = optarg;
                commandline = commandline + " -b " + optarg;
                break;

            case 'r':
                regionstr = optarg;
                has_region = true;
                commandline = commandline + " -r " + optarg;
                break;

             case 'R':
                RegionFile = optarg;
                has_regionFile = true;
                commandline = commandline + " -R " + optarg;
                break;

            case 's':
                STdin = true;
                commandline = commandline + " -s ";
                break;
                
            case 'q':
                minbaseQ = atoi(optarg);
                commandline = commandline + " -q " + optarg;
                break;

            case 'w':
                windowlen = atoi(optarg);
                commandline = commandline + " -w " + optarg;
                break;

            case 'p':
                ploidy = atoi(optarg);
                commandline = commandline + " -p " + optarg;
                break;

            case 'E':
                SetLowComplexityRegionSWGapExt = true;
                commandline = commandline + " -E ";
                break;

            case 'l':
                SetLowComplexityRegion = true;
                commandline = commandline + " -l ";
                break;

            case 'h':
                printSummary(argv);
                commandline = commandline + " -h ";
                exit(0);
                break;

            case '?':
                printSummary(argv);
                exit(1);
                break;

              default:
                abort();
                break;
        }
    }

    //// Open Error log files
    ofstream cerrlog("bonsaiReport.txt");
    streambuf *cerrsave = std::cerr.rdbuf();

    // Redirect stream buffers
    if (cerrlog.is_open())
        cerr.rdbuf(cerrlog.rdbuf());

    cerr << commandline << endl;
    

    //Check for Reference Fasta sequence
    if (!has_ref)
    {
        cerr << "no FASTA reference provided, cannot realign" << endl;
        exit(1);
    }

    ////Check for reader
    BamReader reader;
    if (STdin == true)
    {
        if (!reader.Open("stdin"))
        {
            cerr << "could not open stdin bam for reading" << endl;
            cerr << reader.GetErrorString() << endl;
            reader.Close();
            printSummary(argv);
        }
    }
    else
    {
        if (has_bamfile == true)
        {
            if (!reader.Open(bamfile))
            {
                cerr << "ERROR: could not open bam files from stdin ... Aborting" << endl;
                cerr << reader.GetErrorString() << endl;
                reader.Close();
                printSummary(argv);
            }

            if ( !reader.LocateIndex() )
                reader.CreateIndex();
        }
        else
        {
            cerr << "--bam flag is set but no bamfile is provided... Aborting" << endl;
            reader.Close();
            printSummary(argv);
        }
    }

    //// Check Region Tags
    if ( (has_regionFile == true) && (has_region == true) )
    {
        cerr << "ERROR: You provide both region and has provide a Set Region List... Aborting" << endl;
        exit(1);
    }

    //// store the names of all the reference sequences in the BAM file
    vector<RefData> referencedata = reader.GetReferenceData();
   
    //// Store Region LIST
    vector<BamRegion> regionlist;
    if (has_region == true)
    {
        BamRegion region;
        ParseRegionString(regionstr, reader, region);
        regionlist.push_back(region);
    }
    else if (has_regionFile == true)
    {
        ifstream RG(RegionFile.c_str(), ios_base::in);
        string line;
        while(getline(RG,line))
        {
            BamRegion region;
            ParseRegionString(line, reader, region);
            regionlist.push_back(region);
        }
        RG.close();
    }
    else if ( (has_regionFile == false) && (has_region == false) )
    {
        for (int i= 0; i < (int)referencedata.size(); i++)
        {
            string regionstr = referencedata.at(i).RefName;
            BamRegion region;
            ParseRegionString(regionstr, reader, region);
            if (!reader.SetRegion(region)) // Bam region will get [0,101) = 0 to 100 => [closed, half-opened)
            {
                cerr << "ERROR: set region " << regionstr << " failed. Check that REGION describes a valid range... Aborting" << endl;
                reader.Close();
                exit(1);
            }
            else
                regionlist.push_back(region);
        }
    }

    //// 
    BamWriter writer;
    if (!writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData()))
    {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    //// Smallest start position and Largest end position for Req Seq
    vector<RefData>::iterator refdataIter = referencedata.begin();
    vector<BamRegion>::iterator regionListIter = regionlist.begin();
   

    // CLASS
    RealignFunctionsClass RealignFunction;

    map<int, string> RefIDRedName;
    vector<SalRealignInfo> AlGroups;
    multimap<int, BamAlignment> SortRealignedAlignmentsMultimap;

    int refid               = 0;
    BamAlignment alignment;
    bool IsNextAlignment = reader.GetNextAlignment(alignment);
    //cerr << "   " << alignment.Name << " Chr  " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;

    int windowrealigned     = 0;
    int TotalWindowDetected = 0;
    int TotalReadsAligned   = 0;
    int TotalWindow         = 0;
    int TotalReads          = 0;

    while (refdataIter != referencedata.end() )
    {
        string refname = refdataIter->RefName;
        RefIDRedName[refid] = refname;
        int reflength = refdataIter->RefLength;
        int winstartpos, winendpos;
        int AllowableBasesInWindow = 1;
        bool nextChrName = false;

        cerr << "##HeaderINFO: RefID = " << refdataIter->RefName << "\t" << "RefLen = " << reflength << endl;
        
        while (nextChrName == false )
        {
            vector<int> minmaxRefSeqPos;
            bool IsPassDetectorNoRealignment = false;
            minmaxRefSeqPos.push_back(-1);
            minmaxRefSeqPos.push_back(0);
            //cerr << " region: " << (*regionListIter).LeftRefID << " : " << (*regionListIter).LeftPosition << " .. " << (*regionListIter).RightPosition << endl;
            if ((refid == (int)referencedata.size() - 1) && ((*regionListIter).LeftRefID == refid) && ((has_region==true) || (has_regionFile==true)) )
            {
                ////
                if ( (has_region == true) || (has_regionFile == true) )
                {                    
                    winstartpos = (*regionListIter).LeftPosition;
                    winendpos   = winstartpos + windowlen - 1;
                    reflength = (*regionListIter).RightPosition;
                    if (reflength < winendpos)
                        reflength = winendpos;
                                       
                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == (*regionListIter).LeftPosition) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                }
                else if (has_region == false)
                {
                    winstartpos = 0;
                    winendpos   = winstartpos + windowlen - 1;

                    // Get Next Alignment First
                    if ( (refid == alignment.RefID) && (winstartpos == 0) && (IsNextAlignment == false) )
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                }
                //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                //cerr << "   " << alignment.Name << " Chr  " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;

                ////
                while ((winstartpos < reflength))
                {
                    //// Check window end position
                    if (winendpos > reflength)
                        winendpos = reflength;

                    // Reinitialized
                    unsigned int NewReadMappedcount = 0;
                
                    //// Save and Erase alignments that are outside of window (Deque?)
                    if (!AlGroups.empty())
                    {
                        minmaxRefSeqPos.at(0) = -1;
                        minmaxRefSeqPos.at(1) = 0;

                        //cerr << "#Start: Keep alignments with start position exceed the right end of the window/Region " << endl;
                        vector<SalRealignInfo>::iterator Iter = AlGroups.begin();

                        while (Iter != AlGroups.end())
                        {
                            // Erase alignment s
                            if ((*Iter).al.GetEndPosition() < winstartpos)
                            {
                                //cerr << "  ToWrite: " << (*Iter).second.size() << " ; " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl;
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                                AlGroups.erase(Iter);

                                //cerr << "  ToWrite: DONE " << endl;
                            } 
                            else
                            {
                                string referenceSequence = reference->getSubSequence(RefIDRedName[(*Iter).al.RefID], (*Iter).al.Position, 2*(*Iter).al.Length);
                            
                                if ((*Iter).HasRealign == true )
                                {
                                    (*Iter).currentReadPosition = 0;
                                    (*Iter).currentGenomeSeqPosition = 0;
                                    (*Iter).currentAlPosition = (*Iter).al.Position;
                                    (*Iter).cigarindex = 0;
                                }

                                (*Iter).CigarSoftclippingLength = 0;
                                SalRealignInfo talr = (*Iter);
                                //cerr << "  ToKEEP: " << (*Iter).al.Name << " ; " << (*Iter).al.Position << " < " << winstartpos << " : " << (*Iter).al.GetEndPosition() << endl;
                                RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, talr, Iter, (*Iter).al, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, false);
                           
                                ++Iter; //Increment iterator
                            }
                        }
                    }
                

                    // Write Sorted Alignments that are outside of window
                    //cerr << "SortRealignedAlignmentsMultimap: " << SortRealignedAlignmentsMultimap.size() << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl;
                    if (!SortRealignedAlignmentsMultimap.empty()) // && (winWrite < winstartpos ) )
                    {
                        //cerr << "#Start: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl;
                        multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin();

                        while (sraIter != SortRealignedAlignmentsMultimap.end()) 
                        {
                            //cerr << " (*sraIter).first= " <<  (*sraIter).first << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << " winstartpos - ((windowlen - 1)*0.9)= " << winstartpos - ((windowlen - 1)*0.9) << endl;
                            if (((float) (*sraIter).first < floor((float) (winstartpos - ((windowlen - 1)*0.9)))) && ((minmaxRefSeqPos.at(0) > 0) && ((*sraIter).first < minmaxRefSeqPos.at(0)))) {
                                //writer.SaveAlignment((*sraIter).second);  // Why sometimes, it doesn't work ?????
                                if (!writer.SaveAlignment((*sraIter).second))
                                    cerr << writer.GetErrorString() << endl;

                                SortRealignedAlignmentsMultimap.erase(sraIter++);
                            } else {
                                ++sraIter;
                            }
                    }
                    //cerr << "#Done: Write alignments and delete alignments with start position exceed the right end of the window/Region " << endl;
                    }

                    //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                    //cerr << "   " << alignment.Name << " Chr  " << alignment.RefID << " Startpos: " << alignment.Position << " Endpos: " << alignment.GetEndPosition() << " Length: " << alignment.Length << endl;
                    //cerr <<  ": " << alignment.RefID << " :" << RefIDRedName[alignment.RefID] << " : " << RefIDRedName[alignment.RefID] << endl;

                    //cerr << "Start: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl;
                    // Gather Reads within a window frame
                  
                    while ((IsNextAlignment) && (refid == alignment.RefID)) // Neeed more conditions
                    {
                        if (SetLowComplexityRegion == true) 
                        {
                            string sequenceInWindow = reference->getSubSequence(RefIDRedName[alignment.RefID], winstartpos, (winendpos-winstartpos+1) );

                            if (IsWindowInRepeatRegion(sequenceInWindow) == true)
                            {
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0)
                                {
                                    TotalReads++;
                                    if (alignment.IsMapped())
                                    {
                                        string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2*alignment.Length);
 
                                        vector<SalRealignInfo>::iterator tIter;
                                        SalRealignInfo alr;
                                        alr.al = alignment;
                                        alr.currentReadPosition = 0;
                                        alr.currentGenomeSeqPosition = 0;
                                        alr.currentAlPosition = alignment.Position;
                                        alr.cigarindex = 0;
                                        alr.HasRealign = false;
                                        alr.CigarSoftclippingLength = 0;

                                        string str = "ZZZZZZZZZZZZZZZZZ";
                                        if (alignment.Name.find(str) != string::npos) {
                                            stringstream cigar;
                                            for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                                cigar << cigarIter->Length << cigarIter->Type;

                                            string cigarstr = cigar.str();
                                            cerr << "   TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                        }

                                        RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);
                                        NewReadMappedcount++;
                                    } 
                                    else
                                    {
                                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                        cerr << "UNmapped : " << alignment.Name << endl;
                                    }
                                } 
                                else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1)
                                {
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                }
                                else
                                    break;
                            } else {
                                if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) < 2)
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                else
                                    break;
                            }
                        }
                        else // (SetLowComplexityRegion == false)
                        {
                            if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 0)
                            {
                                TotalReads++;
                                if (alignment.IsMapped())
                                {
                                    string referenceSequence = reference->getSubSequence(RefIDRedName[alignment.RefID], alignment.Position, 2 * alignment.Length);

                                    vector<SalRealignInfo>::iterator tIter;
                                    SalRealignInfo alr;
                                    alr.al = alignment;
                                    alr.currentReadPosition = 0;
                                    alr.currentGenomeSeqPosition = 0;
                                    alr.currentAlPosition = alignment.Position;
                                    alr.cigarindex = 0;
                                    alr.HasRealign = false;
                                    alr.CigarSoftclippingLength = 0;

                                    string str = "ZZZZZZZZZZZZZZZZZ";
                                    if (alignment.Name.find(str) != string::npos)
                                    {
                                        stringstream cigar;
                                        for (vector<CigarOp>::const_iterator cigarIter = alignment.CigarData.begin(); cigarIter != alignment.CigarData.end(); ++cigarIter)
                                            cigar << cigarIter->Length << cigarIter->Type;

                                        string cigarstr = cigar.str();
                                        cerr << "   TRACKING: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " cigar: " << cigarstr << endl;
                                    }

                                    RealignFunction.ParseAlignmentsAndExtractVariantsByBaseQualv(AlGroups, alr, tIter, alignment, referenceSequence, minmaxRefSeqPos, winstartpos, winendpos, (float) minbaseQ, true);

                                    //cerr << " winstart: " << winstartpos << " ; winend: " << winendpos;
                                    //cerr << "   INDEL: " << alignment.RefID << " " << alignment.Name << " pos: " << alignment.Position << " Length: " << alignment.Length << " CIGARstr: " << cigarstr << endl;
                                    NewReadMappedcount++;
                                } 
                                else
                                {
                                    SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                                    cerr << "UNmapped : " << alignment.Name << endl;
                                }
                            }
                            else if ((IsWithinWindow(alignment, winstartpos, winendpos, AllowableBasesInWindow)) == 1) {
                                SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > (alignment.Position, alignment));
                            }
                            else
                                break;
                        }

                        ////Get next alignment
                        IsNextAlignment = reader.GetNextAlignment(alignment);
                    }

                   //cerr << "Done: Gather aligmenets that lie (fully or partially) within the window frame and group INDELs if there are ... " << endl;

                    //// Detector Corner
                    bool ToRealign = MeetIndelDetectorThresholdv(AlGroups);
                    cerr << "MeetIndelDetectorThresholdv(AlGroups).size()= " << AlGroups.size() << endl;
                    
                    // **************
                    if (ToRealign)
                    {
                        //cerr << "  ToRealign: " << refdataIter->RefName << "\t" << reflength << "\t" << winstartpos << "\t" << winendpos << "\t" << AlGroups.size() << endl;
                        //cerr << "             minmaxRefSeqPos.at(1)= " << minmaxRefSeqPos.at(1) << " minmaxRefSeqPos.at(0)= " << minmaxRefSeqPos.at(0) << endl;

                        ////// Perform Realign routines
                        int TotalAlR = 0; // Total number of alignments to be realigned
                        int NumAlR = 0; // Now many alignments are aligned
                        TotalWindowDetected++;

                        cerr << "#Start: Meet Threshold, Realigning ... " << endl;

                        if (minmaxRefSeqPos.at(1) < winendpos)
                            minmaxRefSeqPos.at(1) = winendpos;

                        if (minmaxRefSeqPos.at(0) > winstartpos)
                            minmaxRefSeqPos.at(0) = winstartpos;

                        bool IsToRealign = RealignFunction.PruningByNaiveSelectionProcedureAndConstructHaplotypes2(winstartpos, winendpos, refid, refname, minmaxRefSeqPos, reference);

                        if (IsToRealign == true)
                        {
                            RealignFunction.SelectHaplotypeCandidates_SmithWatermanBSv(AlGroups, minmaxRefSeqPos, SetLowComplexityRegionSWGapExt);

                            minmaxRefSeqPos.at(0) = -1;
                            minmaxRefSeqPos.at(1) = 0;

                            int nextwinstartpos = winendpos + 1;
                            int nextwinendpos = winstartpos + windowlen - 1;
                            if (nextwinendpos > reflength)
                                nextwinendpos = reflength;

                            //cerr <<  "   Before Realign : " << SortRealignedAlignmentsMultimap.size() << endl;
                            RealignFunction.AdjustCigarsWRTChosenMultipleHaplotypesAndPrepareAlignmentsTobeWrittenOut(AlGroups, SortRealignedAlignmentsMultimap, reference, RefIDRedName, minmaxRefSeqPos, nextwinstartpos, nextwinendpos, minbaseQ, TotalAlR, NumAlR, ploidy);
                            IsPassDetectorNoRealignment = false; // Set flag to false to deactivate write functions

                            //cerr <<  "   After Realign : " << SortRealignedAlignmentsMultimap.size() << endl;

                            TotalReadsAligned += NumAlR;

                            if (NumAlR > 0) // Realignment done
                                windowrealigned++;
                        } else


                        cerr << "#Done: Meet Threshold, Realigning ... " << endl;
                    }


                    if (NewReadMappedcount > 0)
                        TotalWindow++;

                    RealignFunction.Clear();

                    //// Move the window frame
                    winstartpos = winendpos + 1;
                    winendpos = winstartpos + windowlen - 1;
                }

                //// Save and Erase remaining alignments that are outside of window (Deque?)
                if ((!AlGroups.empty())) {
                    cerr << "#Start: Write Remaining alignments and delete all alignments" << endl;

                    for (vector<SalRealignInfo>::iterator Iter = AlGroups.begin(); Iter != AlGroups.end(); ++Iter) {
                        //cerr << "    Remain alignment start: " << (*Iter).al.Name << " " << Iter->al.Position  << " < " << winstartpos << "  " << winendpos << endl;
                        SortRealignedAlignmentsMultimap.insert(pair<int, BamAlignment > ((*Iter).al.Position, (*Iter).al));
                    }

                    cerr << "#Done: Write Remaining alignments and delete all alignments" << endl;
                }

                AlGroups.clear();


                // Write Sorted remaining Alignments that are outside of window
                if (!SortRealignedAlignmentsMultimap.empty())
                {
                    for (multimap<int, BamAlignment>::iterator sraIter = SortRealignedAlignmentsMultimap.begin(); sraIter != SortRealignedAlignmentsMultimap.end(); ++sraIter)
                    {
                        //writer.SaveAlignment((*sraIter).second);
                        if (!writer.SaveAlignment((*sraIter).second))
                            cerr << writer.GetErrorString() << endl;
                    }
                    SortRealignedAlignmentsMultimap.clear();
                }

            }

            ++regionListIter;
            if ((*regionListIter).LeftRefID > refid)
                nextChrName = true;
        }

        //// If End of the chromosome position
        //// increament iterator
        ++refdataIter;
        ++refid;
    }


    reader.Close();
    writer.Close();

    cerr << "##-Completed- " << endl;
    cerr << " Total Reads processed =  " << TotalReads << endl;
    cerr << " Total Reads Aligned =    " << TotalReadsAligned << endl;
    cerr << " Total Window processed = " << TotalWindow << endl;
    cerr << " Total Window Detected =  " << TotalWindowDetected << endl;
    cerr << " Total Windows Aligned =  " << windowrealigned << endl;


    // Restore cerr's stream buffer before terminating
    if (cerrlog.is_open())
        cerr.rdbuf(cerrsave);

    commandline.clear();
    return 0;
}
Esempio n. 7
0
int main (int argc, char *argv[]) {

     if( (argc!= 4 && argc !=5 && argc !=6) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
	 cerr<<"Usage:splitByRG [in bam] [rg Tally] [out prefix] (optional target)"<<endl<<"this program will subsample a BAM file per read group for a certain target\nFor example splitByRG in.bam tally.txt out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl;
    	return 1;
    }


     string bamfiletopen      = string(argv[1]);
     string rgTally           = string(argv[2]);
     string bamDirOutPrefix   = string(argv[3]);
     
     int target            =  200000;
     int maxTarget         = 1000000;

     if(argc==5){
	 target    = destringify<int> ( string(argv[4]) );	 
     }

     if(argc==6){
	 target    = destringify<int> ( string(argv[4]) );	 
	 maxTarget = destringify<int> ( string(argv[5]) );	 
     }


     cerr<<"minimum fragments:\t"<<target<<endl;
     cerr<<"target  fragments:\t"<<maxTarget<<endl;

     string line;
     ifstream myFileTally;
     map<string,double> rg2Fraction;

     myFileTally.open(rgTally.c_str(), ios::in);
     cerr<<"Retained groups:\n"<<endl;
     cerr<<"RG\t#mapped\tfraction retained"<<endl;
     cerr<<"-----------------------------------"<<endl;

     if (myFileTally.is_open()){
	 while ( getline (myFileTally,line)){
	     vector<string> tokens = allTokens(line,'\t');
	     if(tokens.size() > 6)
		 if( tokens[1] == "pass" && 
		    (tokens[0] != "\"\""    && 
		     tokens[0] != "control" && 
		     tokens[0] != "TOTAL") ){
		     //cout<<tokens[0]<<"\t"<<tokens[5]<<endl;
		     int count = destringify<int>(tokens[5]);

		     if(count>target){

			 if(count>=maxTarget){
			     rg2Fraction[  tokens[0] ] = double(maxTarget)/double(count);
			     cout<<tokens[0]<<"\t"<<count<<"\t"<<double(maxTarget)/double(count)<<endl;
			 }else{
			     cout<<tokens[0]<<"\t"<<count<<"\t"<<1.0<<endl;
			     rg2Fraction[  tokens[0] ] = 1.0;
			 }
		     }
		 }
	 }
	 myFileTally.close();
     }else{
	 cerr << "Unable to open file "<<rgTally<<endl;
	 return 1;
     }



     map<string,BamWriter *> rg2BamWriter;
     
     // if(!isDirectory(bamDirOut)){
     // 	 cerr<<"ERROR: the out directory does not exist"<<endl;
     // 	return 1;
     // }

     BamReader reader;

     if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
     }
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();
    vector<RefData>  refData=reader.GetReferenceData();

    SamReadGroupDictionary 	srgd=header.ReadGroups;
    for(SamReadGroupConstIterator srgci=srgd.ConstBegin();
	srgci<srgd.ConstEnd();
	srgci++){
	//cout<<*srgci<<endl;
	const SamReadGroup rg = (*srgci);
	//cout<<rg.ID<<endl;
	if( rg2Fraction.find(rg.ID) != rg2Fraction.end() ){
	    rg2BamWriter[rg.ID] = new  BamWriter();
	    rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); 
	}
	//cout<<bamDirOutPrefix+"."+rg.ID+".bam"<<endl;
    }
    //    return 1;

    //    BamWriter unmapped;

    // cout<<header.ToString()<<endl;
    // return 1;

    // if ( !unmapped.Open(bamDirOutPrefix+".unmapped.bam",header,references) ) {
    // 	cerr << "Could not open output BAM file "<< bamDirOutPrefix+".unmapped.bam" << endl;
    // 	return 1;
    // }

    //    cout<<"reading"<<endl;

    BamAlignment al;
    unsigned int total=0;
    while ( reader.GetNextAlignment(al) ) {


	if(al.HasTag("RG") &&
	   al.IsMapped() ){
	    string rgTag;
	    al.GetTag("RG",rgTag);
	    //cout<<rgTag<<endl;
	    if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new: ignore completely
	
		
	    }else{
		if( randomProb() <= rg2Fraction[  rgTag ] ){
		    rg2BamWriter[rgTag]->SaveAlignment(al);	 
		    //cout<<"wrote "<<rgTag<<endl;
		}   else{
		    //cout<<"skipped "<<rgTag<<endl;
		}	   
	    }
	}// else{
	//     string rgTag="unknown";	    
	//     //cout<<rgTag<<endl;
	//     if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new
	// 	cerr<<"Found new RG "<<rgTag<<endl;
	// 	rg2BamWriter[rgTag] = new  BamWriter();
	//  	if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) {
	//  	    cerr     << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl;
	//  	    return 1;
	//  	}
	// 	rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	//     }else{
	// 	rg2BamWriter[rgTag]->SaveAlignment(al);	    	   
	//     }

	//     // cerr << "Cannot get RG tag for " << al.Name<<endl;
	//     // return 1;
	// }

	total++;
    } //while al

    reader.Close();
    // writer.Close();
    
    // unmapped.Close();

    map<string,BamWriter *>::iterator rg2BamWriterIt;
    for (rg2BamWriterIt =rg2BamWriter.begin(); 
	 rg2BamWriterIt!=rg2BamWriter.end(); 
	 rg2BamWriterIt++){
	rg2BamWriterIt->second->Close();
    }
    cerr<<"Wrote succesfully "<<total<<" reads"<<endl;


    return 0;
}
Esempio n. 8
0
void BedIntersect::IntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB = new BedFile(_bedBFile);
    _bedB->loadBedFileIntoMap();

    // create a dummy BED A file for printing purposes if not
    // using BAM output.
    if (_bamOutput == false) {
        _bedA = new BedFile(_bedAFile);
        _bedA->bedType = 12;
    }
    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();
    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }
    vector<BED> hits;
    // reserve some space
    hits.reserve(100);
    BamAlignment bam;    
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        // save an unaligned read if -v
        if (!bam.IsMapped()) {
            if (_noHit == true)
                writer.SaveAlignment(bam);
            continue;
        }   
        // break alignment into discrete blocks,
        bedVector bed_blocks;
        string chrom = refs.at(bam.RefID).RefName;
        GetBamBlocks(bam, chrom, bed_blocks, false, true);
        // create a basic BED entry from the BAM alignment
        BED bed;
        MakeBedFromBam(bam, chrom, bed_blocks, bed);
        bool overlapsFound = false;
        if ((_bamOutput == true) && (_obeySplits == false))
        {
            overlapsFound = _bedB->anyHits(bed.chrom, bed.start, bed.end, 
                                           bed.strand, _sameStrand, _diffStrand,
                                           _overlapFraction, _reciprocal);
        }
        else if ( ((_bamOutput == true)  && (_obeySplits == true)) ||
                  ((_bamOutput == false) && (_obeySplits == true)) )
        {
            // find the hits that overlap with the full span of the blocked BED
            _bedB->allHits(bed.chrom, bed.start, bed.end, bed.strand,
                           hits, _sameStrand, _diffStrand,
                           _overlapFraction, _reciprocal);
            // find the overlaps between the block in A and B
            overlapsFound = FindBlockedOverlaps(bed, bed_blocks, hits, _bamOutput);
        }
        else if ((_bamOutput == false) && (_obeySplits == false))
        {
            FindOverlaps(bed, hits);
        }
        // save the BAM alignment if overlap reqs. were met
        if (_bamOutput == true) {
            if ((overlapsFound == true) && (_noHit == false))
                writer.SaveAlignment(bam);
            else if ((overlapsFound == false) && (_noHit == true))
                writer.SaveAlignment(bam);
        }
        hits.clear();
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Esempio n. 9
0
void BedIntersect::IntersectBam(string bamFile) {

	// load the "B" bed file into a map so
	// that we can easily compare "A" to it for overlaps
	_bedB->loadBedFileIntoMap();
	
	// open the BAM file
	BamReader reader;
	BamWriter writer;
	reader.Open(bamFile);

	// get header & reference information
	string header  = reader.GetHeaderText();
	RefVector refs = reader.GetReferenceData();

	// open a BAM output to stdout if we are writing BAM
	if (_bamOutput == true) {
		// open our BAM writer
        writer.Open("stdout", header, refs, _isUncompressedBam);
	}

	vector<BED> hits;
	// reserve some space
	hits.reserve(100);
	
	_bedA->bedType = 6;
	BamAlignment bam;	
	// get each set of alignments for each pair.
	while (reader.GetNextAlignment(bam)) {
		
		if (bam.IsMapped()) {	
			BED a;
			a.chrom = refs.at(bam.RefID).RefName;
			a.start = bam.Position;
			a.end   = bam.GetEndPosition(false);

			// build the name field from the BAM alignment.
			a.name = bam.Name;
			if (bam.IsFirstMate()) a.name += "/1";
			if (bam.IsSecondMate()) a.name += "/2";

			a.score  = ToString(bam.MapQuality);
			
			a.strand = "+"; 
			if (bam.IsReverseStrand()) a.strand = "-"; 
	
			if (_bamOutput == true) {
			    bool overlapsFound = false;
			    // treat the BAM alignment as a single "block"
			    if (_obeySplits == false) {
				    overlapsFound = FindOneOrMoreOverlap(a);
				}
				// split the BAM alignment into discrete blocks and
				// look for overlaps only within each block.
				else {
                    bool overlapFoundForBlock;
				    bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
				    // we don't want to split on "D" ops, hence the "false"
                    getBamBlocks(bam, refs, bedBlocks, false);
                    
                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
                	vector<BED>::const_iterator bedEnd  = bedBlocks.end();
                	for (; bedItr != bedEnd; ++bedItr) {
            	        overlapFoundForBlock = FindOneOrMoreOverlap(a);
            	        if (overlapFoundForBlock == true)
                            overlapsFound = true;
            	    }
				}
				if (overlapsFound == true) {
					if (_noHit == false)
						writer.SaveAlignment(bam);
				}
				else {
					if (_noHit == true) {
						writer.SaveAlignment(bam);
					}	
				}
			}
			else {
			    // treat the BAM alignment as a single BED "block"
			    if (_obeySplits == false) {
				    FindOverlaps(a, hits);
				    hits.clear();
			    }
			    // split the BAM alignment into discrete BED blocks and
				// look for overlaps only within each block.
			    else {
			        bedVector bedBlocks;  // vec to store the discrete BED "blocks" from a
                    getBamBlocks(bam, refs, bedBlocks, false);

                    vector<BED>::const_iterator bedItr  = bedBlocks.begin();
                	vector<BED>::const_iterator bedEnd  = bedBlocks.end();
                	for (; bedItr != bedEnd; ++bedItr) {
            	        FindOverlaps(*bedItr, hits);
                        hits.clear();
            	    }
			    }
			}
		}
	}
	
	// close the relevant BAM files.
	reader.Close();
	if (_bamOutput == true) {
		writer.Close();
	}
}
Esempio n. 10
0
void BedGenomeCoverage::CoverageBam(string bamFile) {

    ResetChromCoverage();

    // open the BAM file
    BamReader reader;
    if (!reader.Open(bamFile)) {
        cerr << "Failed to open BAM file " << bamFile << endl;
        exit(1);
    }

    // get header & reference information
    string header = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // load the BAM header references into a BEDTools "genome file"
    _genome = new GenomeFile(refs);
    // convert each aligned BAM entry to BED
    // and compute coverage on B
    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {
        // skip if the read is unaligned
        if (bam.IsMapped() == false)
            continue;

        bool _isReverseStrand = bam.IsReverseStrand();

        //changing second mate's strand to opposite
        if( _dUTP && bam.IsPaired() && bam.IsMateMapped() && bam.IsSecondMate())
            _isReverseStrand = !bam.IsReverseStrand();

        // skip if we care about strands and the strand isn't what
        // the user wanted
        if ( (_filterByStrand == true) &&
             ((_requestedStrand == "-") != _isReverseStrand) )
            continue;

        // extract the chrom, start and end from the BAM alignment
        string chrom(refs.at(bam.RefID).RefName);
        CHRPOS start = bam.Position;
        CHRPOS end = bam.GetEndPosition(false, false) - 1;

        // are we on a new chromosome?
        if ( chrom != _currChromName )
            StartNewChrom(chrom);
        if(_pair_chip_) {
            // Skip if not a proper pair
            if (bam.IsPaired() && (!bam.IsProperPair() or !bam.IsMateMapped()) )
                continue;
            // Skip if wrong coordinates
            if( ( (bam.Position<bam.MatePosition) && bam.IsReverseStrand() ) ||
                ( (bam.MatePosition < bam.Position) && bam.IsMateReverseStrand() ) ) {
                    //chemically designed: left on positive strand, right on reverse one
                    continue;
            }

            /*if(_haveSize) {
                if (bam.IsFirstMate() && bam.IsReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = bam.MatePosition+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
                else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //put fragmentSize in to the middle of pair end_fragment
                    int mid = start+abs(bam.InsertSize)/2;
                    if(mid<_fragmentSize/2)
                        AddCoverage(0, mid+_fragmentSize/2);
                    else
                        AddCoverage(mid-_fragmentSize/2, mid+_fragmentSize/2);
                }
            } else */

            if (bam.IsFirstMate() && bam.IsReverseStrand()) { //prolong to the mate to the left
                AddCoverage(bam.MatePosition, end);
            }
            else if (bam.IsFirstMate() && bam.IsMateReverseStrand()) { //prolong to the mate to the right
                AddCoverage(start, start + abs(bam.InsertSize) - 1);
            }
        } else if (_haveSize) {
            if(bam.IsReverseStrand()) {
                if(end<_fragmentSize) { //sometimes fragmentSize is bigger :(
                    AddCoverage(0, end);
                } else {
                    AddCoverage(end + 1 - _fragmentSize, end );
                }
            } else {
                AddCoverage(start,start+_fragmentSize - 1);
            }
        } else
        // add coverage accordingly.
        if (!_only_5p_end && !_only_3p_end) {
            bedVector bedBlocks;
            // we always want to split blocks when a D CIGAR op is found.
            // if the user invokes -split, we want to also split on N ops.
            if (_obeySplits) { // "D" true, "N" true
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, true);
            }
            else { // "D" true, "N" false
                GetBamBlocks(bam, refs.at(bam.RefID).RefName, bedBlocks, true, false);
            }
            AddBlockedCoverage(bedBlocks);
        }
        else if (_only_5p_end) {
            CHRPOS pos = ( !bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
        else if (_only_3p_end) {
            CHRPOS pos = ( bam.IsReverseStrand() ) ? start : end;
            AddCoverage(pos,pos);
        }
    }
    // close the BAM
    reader.Close();

    // process the results of the last chromosome.
    ReportChromCoverage(_currChromCoverage, _currChromSize,
            _currChromName, _currChromDepthHist);

    // report all empty chromsomes
    PrintEmptyChromosomes();

    // report the overall coverage if asked.
    PrintFinalCoverage();
}
Esempio n. 11
0
/*
Input file format: BAM file
*/
void Distinguish( string mappingFile, map< string, bool > & hapSNP, map< string, string > & refSeq, string outfilePrefix ) {
// [WARNING] If refSeq is not empty than it's BS data, otherwirse is resequencing data.
	string outfile1 = outfilePrefix + ".1.bam";
	string outfile2 = outfilePrefix + ".2.bam";
	string outHomoF = outfilePrefix + ".h**o.bam";
	string outUdeci = outfilePrefix + ".ambiguity.bam"; // The ambiguity reads or the reads which has't any SNP.

	BamReader h_I; // bam input file handle
	if ( !h_I.Open( mappingFile ) ) cerr << "[ERROR]: " << h_I.GetErrorString() << endl;

	// "header" and "references" from BAM files, these are required by BamWriter
	const SamHeader header     = h_I.GetHeader();
	const RefVector references = h_I.GetReferenceData();
	
	BamWriter h_O1, h_O2, h_U, h_H;
	if ( !h_O1.Open( outfile1, header, references ) ) { cerr << "Cannot open output BAM file: " << outfile1 << endl; exit(1); }
	if ( !h_O2.Open( outfile2, header, references ) ) { cerr << "Cannot open output BAM file: " << outfile2 << endl; exit(1); }
	if ( !h_U.Open ( outUdeci, header, references ) ) { cerr << "Cannot open output BAM file: " << outUdeci << endl; exit(1); }
	if ( !h_H.Open ( outHomoF, header, references ) ) { cerr << "Cannot open output BAM file: " << outHomoF << endl; exit(1); }

	int readsNumberRecord(0);

	SamLine samline;                                       // Samline class
	SamExt      sam;
	BamAlignment al;
	map< string, pair<BamAlignment, SamExt> > firstMateAl; // record the first mate reads alignment, HIstory problem to be like this struct!!

	string refstr; 					                       // Just For BS data.
	bool isC2T ( false );			                       // Just For BS data.
	while ( h_I.GetNextAlignment( al ) ) {

		++readsNumberRecord;
		if ( readsNumberRecord % 1000000 == 0 ) 
			cerr << "Have been dealed " << readsNumberRecord << " lines. " << local_time ();

		if ( !al.IsMapped() ) continue;
        //if ( al.InsertSize == 0 || al.RefID != al.MateRefID ) continue;

		samline._RID      = al.Name; samline._Flag= al.AlignmentFlag;
        samline._ref_id   = h_I.GetReferenceData()[al.RefID].RefName;
        samline._position = al.Position + 1;     // Position (0-base starts in BamTools), but I need 1-base starts
        samline._mapQ     = al.MapQuality;
		// MateRefID == -1 means mate read is unmapping
        samline._XorD     = ( al.MateRefID > -1 ) ? h_I.GetReferenceData()[al.MateRefID].RefName : "*"; 
        samline._coor     = al.MatePosition + 1; // Position (0-base starts in BamTools), but I need 1-base starts
        samline._seq      = al.QueryBases;
        samline._insert_size = abs (al.InsertSize);
		if ( samline._ref_id.compare( "BIG_ID_CAT" ) == 0 ) continue; // Ignore "BIG_ID_CAT"

		// get cigar;
        samline._cigar = itoa(al.CigarData[0].Length); samline._cigar.append( 1, al.CigarData[0].Type );
		for ( size_t i(1); i < al.CigarData.size(); ++i ) {
            samline._cigar += itoa(al.CigarData[i].Length);
            samline._cigar.append( 1, al.CigarData[i].Type );
        }

        sam.assign( &samline );
		/*********************************** For BS Data *********************************************/
		if ( !refSeq.empty() ) { // If the data is BS data, we should modify the QueryBases.
			if ( !refSeq.count( samline._ref_id ) ) { 
				cerr << "[ERROR]There's no such reference in the reference file. " << samline._ref_id << endl;
				exit(1);
			}
			if ( al.IsFirstMate() && !al.IsReverseStrand() ) { 
				isC2T = true; 
			} 
			else if ( al.IsFirstMate() && al.IsReverseStrand() ) { 
				isC2T = false; 
			} 
			else if ( al.IsSecondMate() && !al.IsReverseStrand() ) {
				isC2T = false;
			}
			else if ( al.IsSecondMate() && al.IsReverseStrand() ) {
				isC2T = true;
			} else {
				cerr << "[ERROR MATCH] " << endl; exit(1);
			}
			refstr.assign( refSeq[samline._ref_id], sam.ref_start() - 1, sam.ref_end() - sam.ref_start() + 1 );
			modifyBSreadBases( samline._ref_id, sam.ref_start (), sam.read_start(), sam.cigar_seq(), refstr, sam._seq, hapSNP, isC2T );
		}
		/********************************** End For BS Data *******************************************/

		// Consider the mate pair reads
		if ( !firstMateAl.count(al.Name) && (al.MateRefID > -1) ) { 

			firstMateAl[al.Name] = std::make_pair( al, sam );
		} else { // Consider the mate pair reads

			if ( !firstMateAl.count(al.Name) ) {

				switch ( Decide( sam, hapSNP ) ) { 
					case 1 : h_O1.SaveAlignment( al ); break; // Hap1
					case 2 : h_O2.SaveAlignment( al ); break; // Hap2
					case 0 : h_U.SaveAlignment ( al ); break; // Ambiguity
					default: // This alignment didn't contain any hete SNP.
							 h_H.SaveAlignment ( al );        // Homozygous reads
				}
			} else {
				
				int mark1 = Decide( firstMateAl[al.Name].second, hapSNP );
				int mark2 = Decide( sam, hapSNP );

				if ( mark1 == 1 && mark2 == 1 ) {
					
					h_O1.SaveAlignment( firstMateAl[al.Name].first );
					h_O1.SaveAlignment( al );
				} else if ( (mark1 == 1 && mark2 ==  0) || (mark1 == 0  && mark2 == 1) ) {
					
					h_O1.SaveAlignment( firstMateAl[al.Name].first );
					h_O1.SaveAlignment( al );
				} else if ( (mark1 == 1 && mark2 == -1) || (mark1 == -1 && mark2 == 1) ) {

					h_O1.SaveAlignment( firstMateAl[al.Name].first );
					h_O1.SaveAlignment( al );
				} else if ( mark1 == 2 && mark2 == 2 ) {

					h_O2.SaveAlignment( firstMateAl[al.Name].first );
					h_O2.SaveAlignment( al );
				} else if ( (mark1 == 2 && mark2 == 0 ) || (mark1 == 0  && mark2 == 2) ) {
					
					h_O2.SaveAlignment( firstMateAl[al.Name].first );
					h_O2.SaveAlignment( al );
				} else if ( (mark1 == 2 && mark2 == -1) || (mark1 == -1 && mark2 == 2) ) {

					h_O2.SaveAlignment( firstMateAl[al.Name].first );
					h_O2.SaveAlignment( al );
				} else if ( mark1 == -1 && mark2 == -1 ) {

					h_H.SaveAlignment ( firstMateAl[al.Name].first );
					h_H.SaveAlignment ( al );
				} else {
					
					h_U.SaveAlignment ( firstMateAl[al.Name].first );
					h_U.SaveAlignment ( al );
				}

				firstMateAl.erase( al.Name );
			}
		}
	}

	cerr << "------------  Remaind size: " << firstMateAl.size() << endl;
	for (map< string, pair<BamAlignment, SamExt> >::iterator it( firstMateAl.begin() ); it != firstMateAl.end(); ++it ) {

		switch ( Decide( it->second.second, hapSNP ) ) {
			case 1 : h_O1.SaveAlignment( it->second.first ); break; // Hap1
			case 2 : h_O2.SaveAlignment( it->second.first ); break; // Hap2
			case 0 : h_U.SaveAlignment ( it->second.first ); break; // Ambiguity
			default: // This alignment didn't contain any hete SNP.
					 h_H.SaveAlignment ( it->second.first );        // Homozygous reads
		}
	}

	h_I.Close();
	h_U.Close();
	h_H.Close();
	h_O1.Close();
	h_O2.Close();

	cerr << ">>>>>>>>>>>>> All Done <<<<<<<<<<<<<<" << endl;
	cerr << "Write to output file: " << outfile1    << endl;
	cerr << "Write to output file: " << outfile2    << endl;
	cerr << "Write to output file: " << outHomoF    << endl;
	cerr << "Write to output file: " << outUdeci    << endl;

	return;
}
void setMateInfo( BamAlignment & rec1, BamAlignment & rec2, SamHeader & header) {
    const int NO_ALIGNMENT_REFERENCE_INDEX = -1;
    const int NO_ALIGNMENT_START = -1;
    // If neither read is unmapped just set their mate info
    if (rec1.IsMapped() && rec2.IsMapped()) {
        
        rec1.MateRefID = rec2.MateRefID;
        rec1.MatePosition = rec2.Position;
        rec1.SetIsReverseStrand(rec2.IsReverseStrand());
        rec1.SetIsMapped(true);
        rec1.AddTag("MQ", "i", rec2.MapQuality);
        
        rec2.MateRefID = rec1.RefID;
        rec2.MatePosition = rec1.Position;
        rec2.SetIsReverseStrand( rec1.IsReverseStrand() );
        rec2.SetIsMapped(true);
        rec2.AddTag("MQ", "i", rec1.MapQuality);
    }
    // Else if they're both unmapped set that straight
    else if (!rec1.IsMapped() && !rec2.IsMapped()) {
        rec1.RefID = NO_ALIGNMENT_REFERENCE_INDEX;
        rec1.Position = NO_ALIGNMENT_START;
        rec1.MateRefID = NO_ALIGNMENT_REFERENCE_INDEX;
        rec1.MatePosition = NO_ALIGNMENT_START;
        rec1.SetIsReverseStrand(rec2.IsReverseStrand());
        rec1.SetIsMapped(false);
        rec2.RemoveTag("MQ");
        rec1.Length = 0;
        
        rec2.RefID = NO_ALIGNMENT_REFERENCE_INDEX;
        rec2.Position = NO_ALIGNMENT_START;
        rec2.MateRefID = NO_ALIGNMENT_REFERENCE_INDEX;
        rec2.MatePosition = NO_ALIGNMENT_START;
        rec2.SetIsReverseStrand(rec1.IsReverseStrand());
        rec2.SetIsMapped(false);
        rec2.RemoveTag("MQ");
        rec2.Length = 0;
    }
    // And if only one is mapped copy it's coordinate information to the mate
    else {
        BamAlignment & mapped   = rec1.IsMapped() ? rec1 : rec2;
        BamAlignment & unmapped = rec1.IsMapped() ? rec2 : rec1;
        unmapped.RefID = mapped.RefID;
        unmapped.Position = mapped.Position;
        
        mapped.MateRefID = unmapped.RefID;
        mapped.MatePosition = unmapped.Position;
        mapped.SetIsMateReverseStrand(unmapped.IsReverseStrand());
        mapped.SetIsMateMapped(false);
        mapped.Length = 0;
        
        unmapped.MateRefID = mapped.RefID;
        unmapped.MatePosition = mapped.Position;
        unmapped.SetIsMateReverseStrand(mapped.IsReverseStrand());
        unmapped.SetIsMateMapped(true);
        unmapped.Length = 0;
    }
    
    const int insertSize = computeInsertSize(rec1, rec2);
    rec1.Length = insertSize;
    rec2.Length = -insertSize;
}
Esempio n. 13
0
int main (int argc, char *argv[]) {

    // bool mapped  =false;
    // bool unmapped=false;
    int bpToDecrease5=1;
    int bpToDecrease3=2;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "\tThis program takes a BAM file as input and produces\n"+
			      "\tanother where the putative deaminated bases have\n"+
			      "\ta base quality score of "+intStringify(baseQualForDeam)+"\n"+
			      "\tgiven an "+intStringify(offset)+" offset \n"+
			      "\n"+
			      "\tOptions:\n"+
			      "\t\t"+"-n5" +"\t\t\t"+"Decrease the nth bases surrounding the 5' ends (Default:"+stringify(bpToDecrease5)+") "+"\n"+
			      "\t\t"+"-n3" +"\t\t\t"+"Decrease the nth bases surrounding the 3' ends (Default:"+stringify(bpToDecrease3)+") "+"\n"
			      );
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    for(int i=1;i<(argc-2);i++){ //all but the last arg

    	if( string(argv[i]) == "-n5" ){
	    bpToDecrease5 = destringify<int>(argv[i+1]);
            i++;
    	    continue;
    	}

    	if( string(argv[i]) == "-n3" ){
	    bpToDecrease3 = destringify<int>(argv[i+1]);
            i++;
    	    continue;
    	}
       
    	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    	return 1;
    }

    if(argc < 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    if(inbamFile == outbamFile){
	cerr<<"Input and output files are the same"<<endl;
	return 1;    
    }
    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    SamHeader header = reader.GetHeader();
    string pID          = "decrQualDeaminated";   
    string pName        = "decrQualDeaminated";   
    string pCommandLine = "";
    for(int i=0;i<(argc);i++){
        pCommandLine += (string(argv[i])+" ");
    }
    putProgramInHeader(&header,pID,pName,pCommandLine);


    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;

			//5' of first mate reversed
			indexToCheck=al.QueryBases.length()-1;
			for(int i=0;i<bpToDecrease5;i++){			
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    }
			    indexToCheck=max(indexToCheck-1,0);
			}

			
		    }else{
			int indexToCheck;
			//5' of first mate
			indexToCheck=0;
			for(int i=0;i<bpToDecrease5;i++){ //first base			
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    }
			    indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
			}

		    }


		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //3' of second mate reversed
			    indexToCheck=al.QueryBases.length()-1;
			    for(int i=0;i<bpToDecrease3;i++){			
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				}
				indexToCheck=max(indexToCheck-1,0);
			    }
			    

			}else{
			    int indexToCheck;
			    
			    //3' of second mate forward
			    indexToCheck=0;
			    for(int i=0;i<bpToDecrease3;i++){ //first base			
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				}
				indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads :" <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;

		    //5' of single read reversed
		    indexToCheck=al.QueryBases.length()-1;
		    for(int i=0;i<bpToDecrease5;i++){			
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=max(indexToCheck-1,0);
		    }

		    //3' of single read reversed
		    indexToCheck=0;
		    for(int i=0;i<bpToDecrease3;i++){ //first base			
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
		    }

		    

		}else{

		    int indexToCheck;
		    
		    //5' of single read
		    indexToCheck=0;
		    for(int i=0;i<bpToDecrease5;i++){ //first base			
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=min(indexToCheck+1,int(al.Qualities.size()));
		    }


		    //3' of single read
		    indexToCheck=al.QueryBases.length()-1;
		    for(int i=0;i<bpToDecrease3;i++){			
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			}
			indexToCheck=max(indexToCheck-1,0);
		    }



		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();

    cerr<<"Program terminated gracefully"<<endl;   
    return 0;
}
Esempio n. 14
0
 bool check(const PropertyFilter& filter, const BamAlignment& al) {
   
     bool keepAlignment = true;
     const PropertyMap& properties = filter.Properties;
     PropertyMap::const_iterator propertyIter = properties.begin();
     PropertyMap::const_iterator propertyEnd  = properties.end();
     for ( ; propertyIter != propertyEnd; ++propertyIter ) {
       
         // check alignment data field depending on propertyName
         const string& propertyName = (*propertyIter).first;
         const PropertyFilterValue& valueFilter = (*propertyIter).second;
         
         if      ( propertyName == ALIGNMENTFLAG_PROPERTY )  keepAlignment &= valueFilter.check(al.AlignmentFlag);
         else if ( propertyName == CIGAR_PROPERTY ) {
             stringstream cigarSs;
             const vector<CigarOp>& cigarData = al.CigarData;
             if ( !cigarData.empty() ) {
                 vector<CigarOp>::const_iterator cigarBegin = cigarData.begin();
                 vector<CigarOp>::const_iterator cigarIter = cigarBegin;
                 vector<CigarOp>::const_iterator cigarEnd  = cigarData.end();
                 for ( ; cigarIter != cigarEnd; ++cigarIter ) {
                     const CigarOp& op = (*cigarIter);
                     cigarSs << op.Length << op.Type;
                 }
                 keepAlignment &= valueFilter.check(cigarSs.str());
             }
         }
         else if ( propertyName == INSERTSIZE_PROPERTY )           keepAlignment &= valueFilter.check(al.InsertSize);
         else if ( propertyName == ISDUPLICATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsDuplicate());
         else if ( propertyName == ISFAILEDQC_PROPERTY )           keepAlignment &= valueFilter.check(al.IsFailedQC());
         else if ( propertyName == ISFIRSTMATE_PROPERTY )          keepAlignment &= valueFilter.check(al.IsFirstMate());
         else if ( propertyName == ISMAPPED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsMapped());
         else if ( propertyName == ISMATEMAPPED_PROPERTY )         keepAlignment &= valueFilter.check(al.IsMateMapped());
         else if ( propertyName == ISMATEREVERSESTRAND_PROPERTY )  keepAlignment &= valueFilter.check(al.IsMateReverseStrand());
         else if ( propertyName == ISPAIRED_PROPERTY )             keepAlignment &= valueFilter.check(al.IsPaired());
         else if ( propertyName == ISPRIMARYALIGNMENT_PROPERTY )   keepAlignment &= valueFilter.check(al.IsPrimaryAlignment());
         else if ( propertyName == ISPROPERPAIR_PROPERTY )         keepAlignment &= valueFilter.check(al.IsProperPair());
         else if ( propertyName == ISREVERSESTRAND_PROPERTY )      keepAlignment &= valueFilter.check(al.IsReverseStrand());
         else if ( propertyName == ISSECONDMATE_PROPERTY )         keepAlignment &= valueFilter.check(al.IsSecondMate());
         else if ( propertyName == ISSINGLETON_PROPERTY ) {
             const bool isSingleton = al.IsPaired() && al.IsMapped() && !al.IsMateMapped();
             keepAlignment &= valueFilter.check(isSingleton);
         }
         else if ( propertyName == MAPQUALITY_PROPERTY )           keepAlignment &= valueFilter.check(al.MapQuality);
         else if ( propertyName == MATEPOSITION_PROPERTY )         keepAlignment &= ( al.IsPaired() && al.IsMateMapped() && valueFilter.check(al.MateRefID) );
         else if ( propertyName == MATEREFERENCE_PROPERTY ) {
             if ( !al.IsPaired() || !al.IsMateMapped() ) return false;
             BAMTOOLS_ASSERT_MESSAGE( (al.MateRefID>=0 && (al.MateRefID<(int)filterToolReferences.size())), "Invalid MateRefID");
             const string& refName = filterToolReferences.at(al.MateRefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == NAME_PROPERTY )                 keepAlignment &= valueFilter.check(al.Name);
         else if ( propertyName == POSITION_PROPERTY )             keepAlignment &= valueFilter.check(al.Position);
         else if ( propertyName == QUERYBASES_PROPERTY )           keepAlignment &= valueFilter.check(al.QueryBases);
         else if ( propertyName == REFERENCE_PROPERTY ) {
             BAMTOOLS_ASSERT_MESSAGE( (al.RefID>=0 && (al.RefID<(int)filterToolReferences.size())), "Invalid RefID");
             const string& refName = filterToolReferences.at(al.RefID).RefName;
             keepAlignment &= valueFilter.check(refName);
         }
         else if ( propertyName == TAG_PROPERTY ) keepAlignment &= checkAlignmentTag(valueFilter, al);
         else BAMTOOLS_ASSERT_UNREACHABLE;
         
         // if alignment fails at ANY point, just quit and return false
         if ( !keepAlignment ) return false;
     }
   
     BAMTOOLS_ASSERT_MESSAGE( keepAlignment, "Error in BamAlignmentChecker... keepAlignment should be true here");
     return keepAlignment;
 }
Esempio n. 15
0
void BedWindow::WindowIntersectBam(string bamFile) {

    // load the "B" bed file into a map so
    // that we can easily compare "A" to it for overlaps
    _bedB->loadBedFileIntoMap();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
    reader.Open(bamFile);

    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs    = reader.GetReferenceData();

    // open a BAM output to stdout if we are writing BAM
    if (_bamOutput == true) {
        // set compression mode
        BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
        if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
        writer.SetCompressionMode(compressionMode);
        // open our BAM writer
        writer.Open("stdout", bamHeader, refs);
    }

    vector<BED> hits;                   // vector of potential hits
    // reserve some space
    hits.reserve(100);

    _bedA->bedType = 6;
    BamAlignment bam;
    bool overlapsFound;
    // get each set of alignments for each pair.
    while (reader.GetNextAlignment(bam)) {

        if (bam.IsMapped()) {
            BED a;
            a.chrom = refs.at(bam.RefID).RefName;
            a.start = bam.Position;
            a.end   = bam.GetEndPosition(false, false);

            // build the name field from the BAM alignment.
            a.name = bam.Name;
            if (bam.IsFirstMate()) a.name += "/1";
            if (bam.IsSecondMate()) a.name += "/2";

            a.score  = ToString(bam.MapQuality);
            a.strand = "+"; if (bam.IsReverseStrand()) a.strand = "-";

            if (_bamOutput == true) {
                overlapsFound = FindOneOrMoreWindowOverlaps(a);
                if (overlapsFound == true) {
                    if (_noHit == false)
                        writer.SaveAlignment(bam);
                }
                else {
                    if (_noHit == true)
                        writer.SaveAlignment(bam);
                }
            }
            else {
                FindWindowOverlaps(a, hits);
                hits.clear();
            }
        }
        // BAM IsMapped() is false
        else if (_noHit == true) {
            writer.SaveAlignment(bam);
        }
    }

    // close the relevant BAM files.
    reader.Close();
    if (_bamOutput == true) {
        writer.Close();
    }
}
Esempio n. 16
0
void TagBam::Tag() {

    // open the annotations files for processing;
    OpenAnnoFiles();

    // open the BAM file
    BamReader reader;
    BamWriter writer;
	if (!reader.Open(_bamFile)) {
        cerr << "Failed to open BAM file " << _bamFile << endl;
        exit(1);
    }
    
    // get header & reference information
    string bamHeader  = reader.GetHeaderText();
    RefVector refs = reader.GetReferenceData();

    // set compression mode
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
//    if ( _isUncompressedBam ) compressionMode = BamWriter::Uncompressed;
    writer.SetCompressionMode(compressionMode);
    // open our BAM writer
    writer.Open("stdout", bamHeader, refs);

    // rip through the BAM file and test for overlaps with each annotation file.
    BamAlignment al;
    vector<BED> hits;

    while (reader.GetNextAlignment(al)) {
        if (al.IsMapped() == true) {
            BED a;
            a.chrom = refs.at(al.RefID).RefName;
            a.start = al.Position;
            a.end   = al.GetEndPosition(false, false);
            a.strand = "+";
            if (al.IsReverseStrand()) a.strand = "-";
            
            ostringstream annotations;
            // annotate the BAM file based on overlaps with the annotation files.
            for (size_t i = 0; i < _annoFiles.size(); ++i) 
            {
                // grab the current annotation file.
                BedFile *anno = _annoFiles[i];
                
                if (!_useNames && !_useScores && !_useIntervals) {
                    // add the label for this annotation file to tag if there is overlap
                    if (anno->anyHits(a.chrom, a.start, a.end, a.strand, 
                                      _sameStrand, _diffStrand, _overlapFraction, false))
                    {
                        annotations << _annoLabels[i] << ";";
                    }
                }
                // use the score field
                else if (!_useNames && _useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t i = 0; i < hits.size(); ++i) {
                        annotations << hits[i].score;
                        if (i < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the name field from the annotation files to populate tag
                else if (_useNames && !_useScores && !_useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand, 0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << hits[j].name;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
                // use the full interval information annotation files to populate tag
                else if (!_useNames && !_useScores && _useIntervals) {
                    anno->allHits(a.chrom, a.start, a.end, a.strand, 
                                  hits, _sameStrand, _diffStrand,  0.0, false);
                    for (size_t j = 0; j < hits.size(); ++j) {
                        annotations << _annoLabels[i]  << ":" << 
                                        hits[j].chrom  << ":" <<
                                        hits[j].start  << "-" <<
                                        hits[j].end    << "," <<
                                        hits[j].name   << "," <<
                                        hits[j].score  << "," <<
                                        hits[j].strand;
                        if (j < hits.size() - 1) annotations << ",";
                    }
                    if (hits.size() > 0) annotations << ";";
                    hits.clear();
                }
            }
            // were there any overlaps with which to make a tag?
            if (annotations.str().size() > 0) {
                al.AddTag(_tag, "Z", annotations.str().substr(0, annotations.str().size() - 1)); // get rid of the last ";"
            }
        }
        writer.SaveAlignment(al);
    }
    reader.Close();
    writer.Close();
    // close the annotations files;
    CloseAnnoFiles();
}
Esempio n. 17
0
int main (int argc, char *argv[]) {

    string usage=string(""+string(argv[0])+"  [in BAM file]"+
			"\nThis program reads a BAM file and computes the error rate for each cycle\n"+
			// "\nreads and the puts the rest into another bam file.\n"+
			// "\nTip: if you do not need one of them, use /dev/null as your output\n"+
			// "arguments:\n"+
			// "\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    // for(int i=1;i<(argc-2);i++){ 

	
    //     if(string(argv[i]) == "--bq"){
    // 	    minBaseQuality=destringify<int>(argv[i+1]);
    //         i++;
    //         continue;
    // 	}

    // }

    string bamfiletopen = string( argv[ argc-1 ] );
    // string deambam      = string( argv[ argc-2 ] );
    // string nondeambam   = string( argv[ argc-1 ] );


    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }








    //iterating over the alignments for these regions
    BamAlignment al;
    bool pairedEnd=false;
    bool firstRead=true;

    while ( reader.GetNextAlignment(al) ) {

	if(firstRead){ //reads are either all paired end or single end, I don't allow a mix
	    numberOfCycles=al.QueryBases.size();
	    // cout<<"numberOfCycles "<<numberOfCycles<<endl;
	    if(al.IsPaired() ){  
		pairedEnd=true;		
		matches    = vector<unsigned int> (2*numberOfCycles,0);
		mismatches = vector<unsigned int> (2*numberOfCycles,0);
		typesOfMismatches = vector< vector<unsigned int> >();
		for(int i=0;i<12;i++)
		    typesOfMismatches.push_back( vector<unsigned int> (2*numberOfCycles,0) );
	    }else{
		matches    = vector<unsigned int> (  numberOfCycles,0);
		mismatches = vector<unsigned int> (  numberOfCycles,0);
		typesOfMismatches = vector< vector<unsigned int> >();
		for(int i=0;i<12;i++)
		    typesOfMismatches.push_back( vector<unsigned int> (  numberOfCycles,0) );
	    }
	    firstRead=false;
	}

	if( (  pairedEnd && !al.IsPaired()) ||  
	    ( !pairedEnd &&  al.IsPaired())   ){
	    cerr<<"Read "<<al.Name<<" is wrong, cannot have a mixture of paired and unpaired read for this program"<<endl;
	    return 1;
	}


	//skip unmapped
	if(!al.IsMapped())
	    continue;

	if(numberOfCycles!=int(al.QueryBases.size())){
	    cerr<<"The length of read "<<al.Name<<" is wrong, should be "<<numberOfCycles<<"bp"<<endl;
	    return 1;
	}


	string reconstructedReference = reconstructRef(&al);
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}


	if( pairedEnd ){
	    if( al.IsFirstMate() ){ //start cycle 0

		if( al.IsReverseStrand()  ){ 
		    increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1
		}else{
		    increaseCounters(al,reconstructedReference,0               , 1); //start cycle 0
		}
       
	    }else{

		if( al.IsSecondMate()  ){ 
		    if( al.IsReverseStrand()  ){ 
			increaseCounters(al,reconstructedReference,2*numberOfCycles-1,-1); //start cycle 2*numberOfCycles-1
		    }else{
			increaseCounters(al,reconstructedReference,numberOfCycles    , 1); //start cycle numberOfCycles
		    }
		}else{
	    	    cerr<<"Reads "<<al.Name<<" must be either first or second mate"<<endl;
	    	    return 1;
	    	}
	    }	
	}else{ //single end 

	    if( al.IsReverseStrand()  ){ 
		increaseCounters(al,reconstructedReference,numberOfCycles-1,-1); //start cycle numberOfCycles-1
	    }else{
		increaseCounters(al,reconstructedReference,0               , 1); //start cycle 0
	    }
	    
	}


    }//end while  each read
	


    reader.Close();

    cout<<"cycle\tmatches\tmismatches\tmismatches%\tA>C\tA>C%\tA>G\tA>G%\tA>T\tA>T%\tC>A\tC>A%\tC>G\tC>G%\tC>T\tC>T%\tG>A\tG>A%\tG>C\tG>C%\tG>T\tG>T%\tT>A\tT>A%\tT>C\tT>C%\tT>G\tT>G%"<<endl;



    for(unsigned int i=0;i<matches.size();i++){
cout<<(i+1);
	if( (matches[i]+mismatches[i]!=0) )
	    cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\t"<< 100.0*(double(mismatches[i])/double(matches[i]+mismatches[i])) ;
	else
	    cout<<"\t"<<matches[i]<<"\t"<<mismatches[i]<<"\tNA";

	for(int j=0;j<12;j++){   
	    cout<<"\t"<<typesOfMismatches[j][i];
	    if( (matches[i]+mismatches[i]!=0) )
		cout<<"\t"<<100.0*double(typesOfMismatches[j][i])/double(matches[i]+mismatches[i]);
	    else
		cout<<"\tNA";
	}

	cout<<endl;
    }




   
    return 0;
}
Esempio n. 18
0
int main (int argc, const char *argv[])
{
  printf ("------------- bamrealignment --------------\n");

  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  vector<int> score_vals(4);

  string input_bam  = opts.GetFirstString  ('i', "input", "");
  string output_bam = opts.GetFirstString  ('o', "output", "");
  opts.GetOption(score_vals, "4,-6,-5,-2", 's', "scores");
  int    clipping   = opts.GetFirstInt     ('c', "clipping", 2);
  bool   anchors    = opts.GetFirstBoolean ('a', "anchors", true);
  int    bandwidth  = opts.GetFirstInt     ('b', "bandwidth", 10);
  bool   verbose    = opts.GetFirstBoolean ('v', "verbose", false);
  bool   debug      = opts.GetFirstBoolean ('d', "debug", false);
  int    format     = opts.GetFirstInt     ('f', "format", 1);
  int  num_threads  = opts.GetFirstInt     ('t', "threads", 8);
  string log_fname  = opts.GetFirstString  ('l', "log", "");
  

  if (input_bam.empty() or output_bam.empty())
    return PrintHelp();

  opts.CheckNoLeftovers();

  std::ofstream logf;
  if (log_fname.size ())
  {
    logf.open (log_fname.c_str ());
    if (!logf.is_open ())
    {
      fprintf (stderr, "bamrealignment: Failed to open log file %s\n", log_fname.c_str());
      return 1;
    }
  }

  BamReader reader;
  if (!reader.Open(input_bam)) {
    fprintf(stderr, "bamrealignment: Failed to open input file %s\n", input_bam.c_str());
    return 1;
  }

  SamHeader header = reader.GetHeader();
  RefVector refs   = reader.GetReferenceData();

  BamWriter writer;
  writer.SetNumThreads(num_threads);
  if (format == 1)
    writer.SetCompressionMode(BamWriter::Uncompressed);
  else
    writer.SetCompressionMode(BamWriter::Compressed);

  if (!writer.Open(output_bam, header, refs)) {
    fprintf(stderr, "bamrealignment: Failed to open output file %s\n", output_bam.c_str());
    return 1;
  }


  // The meat starts here ------------------------------------

  if (verbose)
    cout << "Verbose option is activated, each alignment will print to screen." << endl
         << "  After a read hit RETURN to continue to the next one," << endl
         << "  or press q RETURN to quit the program," << endl
         << "  or press s Return to silence verbose," << endl
         << "  or press c RETURN to continue printing without further prompt." << endl << endl;

  unsigned int readcounter = 0;
  unsigned int mapped_readcounter = 0;
  unsigned int realigned_readcounter = 0;
  unsigned int modified_alignment_readcounter = 0;
  unsigned int pos_update_readcounter = 0;
  unsigned int failed_clip_realigned_readcount = 0;
  
  unsigned int already_perfect_readcount = 0;
  
  unsigned int bad_md_tag_readcount = 0;
  unsigned int error_recreate_ref_readcount = 0;
  unsigned int error_clip_anchor_readcount = 0;
  unsigned int error_sw_readcount = 0;
  unsigned int error_unclip_readcount = 0;
  
  unsigned int start_position_shift;
  int orig_position;
  int new_position;

  string  md_tag, new_md_tag, input = "x";
  vector<CigarOp>    new_cigar_data;
  vector<MDelement>  new_md_data;
  bool position_shift = false;
  time_t start_time = time(NULL);

  Realigner aligner;
  aligner.verbose_ = verbose;
  aligner.debug_   = debug;
  if (!aligner.SetScores(score_vals))
    cout << "bamrealignment: Four scores need to be provided: match, mismatch, gap open, gap extend score!" << endl;

  aligner.SetAlignmentBandwidth(bandwidth);

  BamAlignment alignment;
  while(reader.GetNextAlignment(alignment)){
    readcounter ++;
    position_shift = false;
    
    if ( (readcounter % 100000) == 0 )
       cout << "Processed " << readcounter << " reads. Elapsed time: " << (time(NULL) - start_time) << endl;

    if (alignment.IsMapped()) {
      
      
      
      orig_position = alignment.Position;
      mapped_readcounter++;
      aligner.SetClipping(clipping, !alignment.IsReverseStrand());
      if (aligner.verbose_) {
    	cout << endl;
        if (alignment.IsReverseStrand())
          cout << "The read is from the reverse strand." << endl;
        else
          cout << "The read is from the forward strand." << endl;
      }

      if (!alignment.GetTag("MD", md_tag)) {
    	if (aligner.verbose_)
          cout << "Warning: Skipping read " << alignment.Name << ". It is mapped but missing MD tag." << endl;
	if (logf.is_open ())
	  logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MISSMD" << '\n';
	bad_md_tag_readcount++;
      } else if (aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors)) {
	bool clipfail = false;
	if (Realigner::CR_ERR_CLIP_ANCHOR == aligner.GetCreateRefError ())
	{
	  clipfail = true;
	  failed_clip_realigned_readcount ++;
	}

        if (!aligner.computeSWalignment(new_cigar_data, new_md_data, start_position_shift)) {
          if (aligner.verbose_)
            cout << "Error in the alignment! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "SWERR" << '\n';
	  error_sw_readcount++;
          writer.SaveAlignment(alignment);  // Write alignment unchanged
          continue;
        }

        if (!aligner.addClippedBasesToTags(new_cigar_data, new_md_data, alignment.QueryBases.size())) {
          if (aligner.verbose_)
            cout << "Error when adding clipped anchors back to tags! Not updating read information." << endl;
	  if (logf.is_open ())
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNCLIPERR" << '\n';
          writer.SaveAlignment(alignment);  // Write alignment unchanged
	  error_unclip_readcount ++;
          continue;
        }
        new_md_tag = aligner.GetMDstring(new_md_data);
        realigned_readcounter++;

        // adjust start position of read
        if (!aligner.LeftAnchorClipped() and start_position_shift != 0) {
          new_position = aligner.updateReadPosition(alignment.CigarData, (int)start_position_shift, alignment.Position);
          if (new_position != alignment.Position) {
            pos_update_readcounter++;
            position_shift = true;
            alignment.Position = new_position;
          }
        }
        
        if (position_shift || alignment.CigarData.size () != new_cigar_data.size () || md_tag != new_md_tag)
	{
	  if (logf.is_open ())
	  {
	    logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "MOD";
	    if (position_shift)
	      logf << "-SHIFT";
	    if (clipfail)
	      logf << " NOCLIP";
	    logf << '\n';
	  }
	  modified_alignment_readcounter++;
	}
	else
	{
            if (logf.is_open ())
	    {
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "UNMOD";
              if (clipfail)
	        logf << " NOCLIP";
	      logf << '\n';
	    }
	}

        if (aligner.verbose_){
          cout << alignment.Name << endl;
          cout << "------------------------------------------" << endl;
          // Wait for input to continue or quit program
          if (input.size() == 0)
            input = 'x';
          else if (input[0] != 'c' and input[0] != 'C')
            getline(cin, input);
          if (input.size()>0){
            if (input[0] == 'q' or input[0] == 'Q')
              return 1;
            else if (input[0] == 's' or input[0] == 'S')
              aligner.verbose_ = false;
          }
        }

        // Finally update alignment information
        alignment.CigarData = new_cigar_data;
        alignment.EditTag("MD", "Z" , new_md_tag);

      } // end of CreateRef else if
      else {
	switch (aligner.GetCreateRefError ())
	{
	  case Realigner::CR_ERR_RECREATE_REF:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "RECRERR" << '\n';
	    error_recreate_ref_readcount++;
	    break;
	  case Realigner::CR_ERR_CLIP_ANCHOR:
            if (logf.is_open ())
	      logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "CLIPERR" << '\n';
	    error_clip_anchor_readcount++;
	    break;
	  default:
		  //  On a good run this writes way too many reads to the log file - don't want to create a too large txt file
          //  if (logf.is_open ())
	      //logf << alignment.Name << '\t' << alignment.IsReverseStrand() << '\t' << alignment.RefID << '\t' << setfill ('0') << setw (8) << orig_position << '\t' << "PERFECT" << '\n';
	    already_perfect_readcount++;
	    break;
	}
	
	if (aligner.verbose_) {
	  cout << alignment.Name << endl;
	  cout << "------------------------------------------" << endl;
	  // Wait for input to continue or quit program
	  if (input.size() == 0)
	    input = 'x';
	  else if (input[0] != 'c' and input[0] != 'C')
	    getline(cin, input);
	  if (input.size()>0){
	    if (input[0] == 'q' or input[0] == 'Q')
	      return 1;
	    else if (input[0] == 's' or input[0] == 'S')
	      aligner.verbose_ = false;
	  }
	}
      }

      // --- Debug output for Rajesh ---
      if (debug && aligner.invalid_cigar_in_input) {
        aligner.verbose_ = true;
        cout << "Invalid cigar string / md tag pair in read " << alignment.Name << endl;
        // Rerun reference generation to display error
        aligner.CreateRefFromQueryBases(alignment.QueryBases, alignment.CigarData, md_tag, anchors);

        aligner.verbose_ = verbose;
        aligner.invalid_cigar_in_input = false;
      }
      // --- --- ---


    } // end of if isMapped

    writer.SaveAlignment(alignment);

  } // end while loop over reads

  if (aligner.invalid_cigar_in_input)
    cerr << "WARNING bamrealignment: There were invalid cigar string / md tag pairs in the input bam file." << endl;

  // ----------------------------------------------------------------
  // program end -- output summary information
  cout   << "                            File: " << input_bam    << endl
         << "                     Total reads: " << readcounter  << endl
         << "                    Mapped reads: " << mapped_readcounter << endl;
  if (bad_md_tag_readcount)
    cout << "            Skipped: bad MD tags: " << bad_md_tag_readcount << endl;
  if (error_recreate_ref_readcount)
    cout << " Skipped: unable to recreate ref: " << error_recreate_ref_readcount << endl;
  if (error_clip_anchor_readcount)
    cout << "  Skipped: error clipping anchor: " << error_clip_anchor_readcount << endl;
  cout  <<  "       Skipped:  already perfect: " << already_perfect_readcount << endl
        <<  "           Total reads realigned: " << mapped_readcounter - already_perfect_readcount - bad_md_tag_readcount - error_recreate_ref_readcount - error_clip_anchor_readcount << endl;
  if (failed_clip_realigned_readcount)
    cout << "                      (including  " << failed_clip_realigned_readcount << " that failed to clip)" << endl;
  if (error_sw_readcount)
    cout << " Failed to complete SW alignment: " << error_sw_readcount << endl;
  if (error_unclip_readcount)
    cout << "         Failed to unclip anchor: " << error_unclip_readcount << endl;
  cout   << "           Succesfully realigned: " << realigned_readcounter << endl
         << "             Modified alignments: " << modified_alignment_readcounter << endl
         << "                Shifted position: " << pos_update_readcounter << endl;
  
  cout << "Processing time: " << (time(NULL)-start_time) << " seconds." << endl;
  cout << "INFO: The output BAM file may be unsorted." << endl;
  cout << "------------------------------------------" << endl;
  return 0;
}
Esempio n. 19
0
int 
main(int argc, char* argv[]) {

	// validate argument count
	if( argc != 2 ) {
		cerr << "USAGE: " << argv[0] << " <input BAM file> " << endl;
		return EXIT_FAILURE;
	}

	string filename = argv[1];
	//cerr << "Printing alignments from file: " << filename << endl;
	
	BamReader reader;
	if (!reader.Open(filename)) {
        cerr << "could not open filename " << filename << endl;
        return EXIT_FAILURE;
    }
    cerr << filename << ": Done opening" << endl;

    // Header can't be used to accurately determine sort order because samtools never
    // changes it; instead, check after loading each read as is done with "samtools index"

    // We don't need to load an index (right?)
	// if (!reader.LocateIndex()) {
    //     const string index_filename = filename + ".bai";
	//     if (!reader.OpenIndex(index_filename)) {
    //         cerr << "could not open index" << endl;
    //     }
    // }


    const SamHeader header = reader.GetHeader();
    cerr << filename << ": Done getting header" << endl;
    const RefVector refs = reader.GetReferenceData();
    cerr << filename << ": Done getting reference data" << endl;
	
    BamWriter writer;
    if (! output_bam_filename.empty()) {
        if (! writer.Open(output_bam_filename, header, refs)) {
            cerr << "Could not open BAM output file " << output_bam_filename << endl;
            return EXIT_FAILURE;
        }
        cerr << filename << ": Done opening BAM output file " << output_bam_filename << endl;
    }

    alignmentMap read1Map;  // a single map, for all reads awaiting their mate
    typedef map<string,int32_t> stringMap;
    typedef stringMap::iterator stringMapI;
    stringMap ref_mates;
    // alignmentMap read1Map, read2Map;

	BamAlignment full_al;
    int32_t count = 0;
    uint32_t max_reads_in_map = 0;
    int32_t n_reads_skipped_unmapped = 0;
    int32_t n_reads_skipped_mate_unmapped = 0;
    int32_t n_reads_skipped_wont_see_mate = 0;
    int32_t n_reads_skipped_mate_tail_est = 0;
    int32_t n_reads_skipped_ref_mate = 0;
    int32_t n_reads = 0;
    int32_t n_singleton_reads = 0;
    int32_t last_RefID = -1;
    int32_t last_Position = -1;

    cerr << filename << ": Looking for up to " << pairs_to_process << " link pairs,"
        << " total tail = " << link_pair_total_tail 
        << " critical tail = " << link_pair_crit_tail 
        << ", must be on diff chromosome = " << link_pair_diff_chrom << endl;

	while (reader.GetNextAlignment(full_al) 
           && (! pairs_to_process || count < pairs_to_process)) {

        BamAlignment al = full_al;

        //printAlignmentInfo(al, refs);
        //++count;
        ++n_reads;

        if (last_RefID < 0) last_RefID = al.RefID;
        if (last_Position < 0) last_Position = al.Position;
        if (al.RefID > last_RefID) {
            // We've moved to the next reference sequence
            // Clean up reads with mates expected here that haven't been seen
            if (debug_ref_mate) {
                cerr << "MISSED " << ref_mates.size() << " ref_mates on this reference "
                    << last_RefID << " " << refs[last_RefID].RefName << endl;
            }
            for (stringMapI rmI = ref_mates.begin(); rmI != ref_mates.end(); ++rmI) {
                ++n_reads_skipped_ref_mate;
                read1Map.erase(read1Map.find(rmI->first));
                ref_mates.erase(ref_mates.find(rmI->first));
            }
            last_RefID = al.RefID;
            last_Position = al.Position;
        } else if (! isCoordinateSorted(al.RefID, al.Position, last_RefID, last_Position)) {
            cerr << filename << " is not sorted, " << al.Name << " out of position" << endl;
            return EXIT_FAILURE;
        }

        if (! al.IsMapped()) { ++n_reads_skipped_unmapped; continue; }

        if (! al.IsMateMapped()) { ++n_reads_skipped_mate_unmapped; continue; }

        alignmentMapI mI = read1Map.find(al.Name);

        if (mI == read1Map.end()) {
            // the read name has not been seen before

            if (al.MateRefID < al.RefID
                || (al.MateRefID == al.RefID && al.MatePosition < al.Position)) {
                // we should have seen its mate earlier, so skip it
                ++n_reads_skipped_wont_see_mate;
                continue;
            }

            // If the mate likely to also be a link pair candidate, add the read
            int32_t mate_tail_est = readTailS(al.IsMateMapped(), al.IsMateReverseStrand(),
                            al.MatePosition, refs[al.MateRefID].RefLength, max_read_length);
            if (mate_tail_est <= mate_tail_est_crit) {
                // the mate tail estimate suggests it might be a link pair candidate
                read1Map[al.Name] = al;  // add the read to the map
            } else {
                // the mate tail estimate appears too long for the mate to be a candidate
                ++n_reads_skipped_mate_tail_est;
                continue;
            }

            if (read1Map.size() > max_reads_in_map) max_reads_in_map = read1Map.size();
            if (al.MateRefID == al.RefID && al.MatePosition >= al.Position) {
                // the mate is expected later on this contig
                ref_mates[al.Name] = al.MateRefID;
            }

        } else {
            // get the mate's alignment, and process the pair

            const BamAlignment& al_mate = mI->second;

            if (processReadPair(al, al_mate, refs, link_pair_total_tail, 
                                link_pair_crit_tail, link_pair_diff_chrom)) {
                ++count;

                // write to the new BAM file, if the string is not empty
                if (! output_bam_filename.empty()) {
                    writer.SaveAlignment(al_mate);  // the first one seen
                    writer.SaveAlignment(al);  // the second one seen
                }
            }

            read1Map.erase(mI);

            if (al.MateRefID == al.RefID) {
                stringMapI rmI = ref_mates.find(al.Name);
                if (rmI == ref_mates.end()) {
                    cerr << "expected a ref_mate, couldn't find its name: " << al.Name << endl;
                    return EXIT_FAILURE;
                }
                ref_mates.erase(rmI);
            }

        }

	}

	cerr << "===============================" << endl;
    cerr << read1Map.size() << " alignments left in read1Map" << endl;
    cerr << max_reads_in_map << " maximum number of reads in read1Map" << endl;
    cerr << count << " pairs processed" << endl;
	cerr << "===============================" << endl;
    cerr << n_reads << " total reads" << endl;
    cerr << n_singleton_reads << " singleton reads" << endl;
    cerr << n_reads_skipped_unmapped << " reads skipped because unmapped" << endl;
    cerr << n_reads_skipped_mate_unmapped << " reads skipped because mate unmapped" << endl;
    cerr << n_reads_skipped_wont_see_mate << " reads skipped because mate won't be seen" << endl;
    cerr << n_reads_skipped_mate_tail_est << " reads skipped because mate tail appears too long" << endl;
    cerr << n_reads_skipped_ref_mate << " reads skipped because mate not on reference" << endl;

	reader.Close();
    if (! output_bam_filename.empty()) {
	    writer.Close();
    }
	return EXIT_SUCCESS;
}
Esempio n. 20
0
void BedIntersectPE::ProcessBamBlock (const BamAlignment &bam1, const BamAlignment &bam2,
                                      const RefVector &refs, BamWriter &writer) {

    vector<BED> hits, hits1, hits2;         // vector of potential hits
    hits.reserve(1000);                     // reserve some space
    hits1.reserve(1000);
    hits2.reserve(1000);

    bool overlapsFound;                     // flag to indicate if overlaps were found

    if ( (_searchType == "either") || (_searchType == "xor") ||
              (_searchType == "both") || (_searchType == "notboth") ||
              (_searchType == "neither") ) {

        // create a new BEDPE feature from the BAM alignments.
        BEDPE a;
        ConvertBamToBedPE(bam1, bam2, refs, a);
        if (_bamOutput == true) {   // BAM output
            // write to BAM if correct hits found
            overlapsFound = FindOneOrMoreOverlaps(a, _searchType);
            if (overlapsFound == true) {
                writer.SaveAlignment(bam1);
                writer.SaveAlignment(bam2);
            }
        }
        else {  // BEDPE output
            FindOverlaps(a, hits1, hits2, _searchType);
            hits1.clear();
            hits2.clear();
        }
    }
    else if ( (_searchType == "ispan") || (_searchType == "ospan") ) {
        // only look for ispan and ospan when both ends are mapped.
        if (bam1.IsMapped() && bam2.IsMapped()) {
            // only do an inspan or outspan check if the alignment is intrachromosomal
            if (bam1.RefID == bam2.RefID) {
                // create a new BEDPE feature from the BAM alignments.
                BEDPE a;
                ConvertBamToBedPE(bam1, bam2, refs, a);
                if (_bamOutput == true) {   // BAM output
                    // look for overlaps, and write to BAM if >=1 were found
                    overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
                    if (overlapsFound == true) {
                        writer.SaveAlignment(bam1);
                        writer.SaveAlignment(bam2);
                    }
                }
                else {  // BEDPE output
                    FindSpanningOverlaps(a, hits, _searchType);
                    hits.clear();
                }
            }
        }
    }
    else if ( (_searchType == "notispan") || (_searchType == "notospan") ) {
        // only look for notispan and notospan when both ends are mapped.
        if (bam1.IsMapped() && bam2.IsMapped()) {
            // only do an inspan or outspan check if the alignment is intrachromosomal
            if (bam1.RefID == bam2.RefID) {
                // create a new BEDPE feature from the BAM alignments.
                BEDPE a;
                ConvertBamToBedPE(bam1, bam2, refs, a);
                if (_bamOutput == true) {   // BAM output
                    // write to BAM if there were no overlaps
                    overlapsFound = FindOneOrMoreSpanningOverlaps(a, _searchType);
                    if (overlapsFound == false) {
                        writer.SaveAlignment(bam1);
                        writer.SaveAlignment(bam2);
                    }
                }
                else {  // BEDPE output
                    FindSpanningOverlaps(a, hits, _searchType);
                    hits.clear();
                }
            }
            // if inter-chromosomal or orphaned, we know it's not ispan and not ospan
            else if (_bamOutput == true) {
                writer.SaveAlignment(bam1);
                writer.SaveAlignment(bam2);
            }
        }
        // if both ends aren't mapped, we know that it's notispan and not ospan
        else if (_bamOutput == true) {
            writer.SaveAlignment(bam1);
            writer.SaveAlignment(bam2);
        }
    }
}
Esempio n. 21
0
int main (int argc, char *argv[]) {


    bool produceUnCompressedBAM=false;
    bool verbose=false;
    bool ancientDNA=false;
    bool keepOrig=false;

    string adapter_F=options_adapter_F_BAM;
    string adapter_S=options_adapter_S_BAM;
    string adapter_chimera=options_adapter_chimera_BAM;
    string key="";
    bool allowMissing=false;
    int trimCutoff=1;

    bool allowAligned=false;
    bool printLog=false;
    string logFileName;

    BamReader reader;
    BamWriter writer;

    string bamFile;
    string bamFileOUT="";

    string key1;
    string key2;
    
    bool useDist=false;
    double location=-1.0;
    double scale   =-1.0;

    bool fastqFormat=false;
    string fastqfile1   = "";
    string fastqfile2   = "";
    string fastqoutfile = "";
    bool singleEndModeFQ=true;

    const string usage=string(string(argv[0])+
			      
			      " [options] BAMfile"+"\n"+
			      "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+

			      "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+			      
			      "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+
			      "\t\t"+"-fq2" +"\t\t"+"Second  fastq file (for paired-end)"+"\n"+
			      "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+
			      //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+
			      "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+


			      "\t"+"-u            " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+

			      //	"\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+
			      //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+
			      "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+
			      "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+
			      "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+
			      
			      "\n\t"+"Paired End merging/Single Read trimming  options"+"\n"+
			      "\t\t"+"You can specify either:"+"\n"+
			      "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+
			      "\t\t"+"            "+"\t\t\t\t"+"this allows for partial overlap"+"\n"+
			      "\n\t\t"+"or if you know your size length distribution:"+"\n"+
			      "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+
			      "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+
			      //			      "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n"
			      "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged  (default "+boolStringify(keepOrig)+")"+"\n"+
			      "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n"
			      "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+
			      "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+
			      "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. (default "+boolStringify(allowMissing)+")"+"\n"+
			      "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")");

    if( (argc== 1) ||
    	(argc== 2 && string(argv[1]) == "-h") ||
    	(argc== 2 && string(argv[1]) == "-help") ||
    	(argc== 2 && string(argv[1]) == "--help") ){
    	cout<<"Usage:"<<endl;
    	cout<<""<<endl;
    	cout<<usage<<endl;
    	return 1;
    }

    

    for(int i=1;i<(argc-1);i++){ //all but the last arg

	if(strcmp(argv[i],"-fq1") == 0 ){
	    fastqfile1=string(argv[i+1]);
	    fastqFormat=true;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-fq2") == 0 ){
	    fastqfile2=string(argv[i+1]);
	    fastqFormat=true;
	    singleEndModeFQ=false;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-fqo") == 0 ){
	    fastqoutfile=string(argv[i+1]);
	    fastqFormat=true;
	    i++;
	    continue;
	}




	if(strcmp(argv[i],"--log") == 0 ){
	    logFileName =string(argv[i+1]);
	    printLog=true;
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){
	    cerr<<"This version no longer works with pipe, exiting"<<endl;
	    return 1;	    
	}

	if(strcmp(argv[i],"-u") == 0  ){
	    produceUnCompressedBAM=true;
	    continue;
	}

	if(strcmp(argv[i],"--aligned") == 0  ){
	    allowAligned=true;
	    continue;
	}



	if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){
	    bamFileOUT =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){
	    verbose=true;
	    continue;
	}

	if(strcmp(argv[i],"--ancientdna") == 0 ){
	    ancientDNA=true;
	    continue;
	}

	if(strcmp(argv[i],"--keepOrig") == 0 ){
	    keepOrig=true;
	    continue;
	}

	if(strcmp(argv[i],"--loc") == 0 ){
	    location =destringify<double>(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"--scale") == 0 ){
	    scale =destringify<double>(argv[i+1]);
	    i++;
	    continue;
	}



	if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){
	    adapter_F =string(argv[i+1]);
	    i++;
	    continue;
	}


	if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){
	    adapter_S =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-c") == 0 || strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){
	    adapter_chimera =string(argv[i+1]);
	    i++;
	    continue;
	}

	if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){
	    key =string(argv[i+1]);
	    i++;
	    continue;
	}
	

	if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){
	    allowMissing=true;
	    continue;
	}

	if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){
	    trimCutoff=atoi(argv[i+1]);
	    i++;
	    continue;
	}
	
	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
	return 1;	    
    }

    bamFile=argv[argc-1];

    if( (location != -1.0 && scale == -1.0) ||
	(location == -1.0 && scale != -1.0) ){
	cerr<<"Cannot specify --location without specifying --scale"<<endl;
	return 1;	    
    }
	
    if( (location != -1.0 && scale != -1.0) ){
	useDist=true;
	    
	if(ancientDNA){
	    cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl;
	    return 1;	    
	}
    }
    
    MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera,
			key1,key2,
			trimCutoff,allowMissing,ancientDNA,location,scale,useDist);

    fqwriters onereadgroup;

    if(fastqFormat){
	
	if( bamFileOUT != ""  || produceUnCompressedBAM || allowAligned){
	    cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl;
	    return 1;
	}

	if(fastqfile1 == ""){
	    cerr<<"ERROR : Must specify as least the first file for fastq"<<endl;
	    return 1;	    
	}



	FastQParser * fqp1;
	FastQParser * fqp2;

	if(singleEndModeFQ){
	    fqp1 = new FastQParser (fastqfile1);

	    string outdirs   = fastqoutfile+".fq.gz";
	    string outdirsf  = fastqoutfile+".fail.fq.gz";

	    onereadgroup.single.open(outdirs.c_str(), ios::out);
	    onereadgroup.singlef.open(outdirsf.c_str(), ios::out);

	    if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
	    if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }

	    
	}else{
	    fqp1 = new FastQParser (fastqfile1);
	    fqp2 = new FastQParser (fastqfile2);

	    string outdirs   = fastqoutfile+".fq.gz";
	    string outdir1   = fastqoutfile+"_r1.fq.gz";
	    string outdir2   = fastqoutfile+"_r2.fq.gz";

	    string outdirsf  = fastqoutfile+".fail.fq.gz";
	    string outdir1f  = fastqoutfile+"_r1.fail.fq.gz";
	    string outdir2f  = fastqoutfile+"_r2.fail.fq.gz";

	    onereadgroup.single.open(outdirs.c_str(), ios::out);
	    onereadgroup.pairr1.open(outdir1.c_str(), ios::out);
	    onereadgroup.pairr2.open(outdir2.c_str(), ios::out);

	    onereadgroup.singlef.open(outdirsf.c_str(), ios::out);
	    onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out);
	    onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out);

	    if(!onereadgroup.single.good()){       cerr<<"Cannot write to file "<<outdirs<<endl; return 1; }
	    if(!onereadgroup.pairr1.good()){       cerr<<"Cannot write to file "<<outdir1<<endl; return 1; }
	    if(!onereadgroup.pairr2.good()){       cerr<<"Cannot write to file "<<outdir2<<endl; return 1; }
	    
	    if(!onereadgroup.singlef.good()){      cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; }
	    if(!onereadgroup.pairr1f.good()){      cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; }
	    if(!onereadgroup.pairr2f.good()){      cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; }
	    
	}


	unsigned int totalSeqs=0;
	while(fqp1->hasData()){

	    FastQObj * fo1=fqp1->getData();
	    vector<string> def1=allTokens( *(fo1->getID()), ' '  );
	    string def1s=def1[0];
	

	    FastQObj * fo2;
	    string def2s;
	    string ext2s;

	    if(!singleEndModeFQ){
		if(!fqp2->hasData()){
		    cerr << "ERROR: Discrepency between fastq files at record " <<  *(fo1->getID()) <<endl;
		    return 1;
		}

		fo2=fqp2->getData();
		vector<string> def2=allTokens( *(fo2->getID()), ' ' );
		def2s=def2[0];




		if(strEndsWith(def1s,"/1")){
		    def1s=def1s.substr(0,def1s.size()-2);
		}
		if(strEndsWith(def2s,"/2")){
		    def2s=def2s.substr(0,def2s.size()-2);
		}

		if(strBeginsWith(def1s,"@")){
		    def1s=def1s.substr(1,def1s.size()-1);
		}
		if(strBeginsWith(def2s,"@")){
		    def2s=def2s.substr(1,def2s.size()-1);
		}


		if(def1s != def2s){
		    cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl;
		    return 1;
		}

		merged result=	mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()),
					       *(fo2->getSeq()),*(fo2->getQual()));

		mtr.incrementCountall();

		if(result.code != ' '){ //keys or chimeras

		    if(result.code == 'K'){
			mtr.incrementCountfkey();
		    }else{
			if(result.code == 'D'){
			    mtr.incrementCountchimera();
			}else{
			    cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
			    exit(1);
			}
		    }
			
		    onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
		    onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		    continue;

		}else{
		        if(result.sequence != ""){ //new sequence			    
			    onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;    	    

			    if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){
				mtr.incrementCountmergedoverlap();
			    }else{
				mtr.incrementCountmerged();			  
			    }

			}else{ //keep as is
			    mtr.incrementCountnothing();

			    onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl;
			    onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
			    

			}
		}

	    }else{
		
		
		merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual()));
		mtr.incrementCountall();

		if(result.code != ' '){ //either chimera or missing key

		    if(result.code == 'K'){
			mtr.incrementCountfkey();
		    }else{
			if(result.code == 'D'){
			    mtr.incrementCountchimera();
			}else{
			    cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl;
			    exit(1);
			}
		    }

		    onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		    continue;
		}

		if(result.sequence != ""){ //new sequence
		    mtr.incrementCounttrimmed();
		    onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl;
		}else{
		    mtr.incrementCountnothing();
		    onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl;
		}

	    }
	


	    totalSeqs++;
	}
    
	delete fqp1;
	if(!singleEndModeFQ){
	    delete fqp2;
	}

	if(singleEndModeFQ){

	    onereadgroup.single.close();
	    onereadgroup.singlef.close();
	    
	}else{
	    onereadgroup.single.close();
	    onereadgroup.pairr1.close();
	    onereadgroup.pairr2.close();
	    
	    onereadgroup.singlef.close();
	    onereadgroup.pairr1f.close();
	    onereadgroup.pairr2f.close();
	}
    
	//fastq
    }else{
	//else BAM


	//  initMerge();
	//     set_adapter_sequences(adapter_F,
	// 			  adapter_S,
	// 			  adapter_chimera);
	//     set_options(trimCutoff,allowMissing,mergeoverlap);
	if(key != ""){
	    size_t found=key.find(",");
	    if (found == string::npos){ //single end reads
		key1=key;
		key2="";
	    } else{                     //paired-end
		key1=key.substr(0,found);
		key2=key.substr(found+1,key.length()-found+1);
	    }
	}






	if( bamFileOUT == ""  ){
	    cerr<<"The output must be a be specified, exiting"<<endl;
	    return 1;
	}

	if ( !reader.Open(bamFile) ) {
	    cerr << "Could not open input BAM file  "<<bamFile << endl;
	    return 1;
	}
	SamHeader header = reader.GetHeader();

    

	string pID          = "mergeTrimReadsBAM";   
	string pName        = "mergeTrimReadsBAM";   
	string pCommandLine = "";
	for(int i=0;i<(argc);i++){
	    pCommandLine += (string(argv[i])+" ");
	}
	putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),".."));

	const RefVector references = reader.GetReferenceData();
	//we will not call bgzip with full compression, good for piping into another program to 
	//lessen the load on the CPU
	if(produceUnCompressedBAM) 
	    writer.SetCompressionMode(BamWriter::Uncompressed);

	if ( !writer.Open(bamFileOUT,header,references) ) {
	    cerr << "Could not open output BAM file "<<bamFileOUT << endl;
	    return 1;
	}



	SamHeader sh=reader.GetHeader();
	//Up to the user to be sure that a sequence is followed by his mate
	// if(!sh.HasSortOrder() || 
	//    sh.SortOrder != "queryname"){
	// 	cerr << "Bamfile must be sorted by queryname" << endl;
	// 	return 1;
	// }
    

	BamAlignment al;
	BamAlignment al2;
	bool al2Null=true;
    
	while ( reader.GetNextAlignment(al) ) {

	
	    if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD")  ){
		if(!allowAligned){
		    cerr << "Reads should not be aligned" << endl;
		    return 1;
		}else{
		    //should we remove tags ?
		}
	    }


	    if(al.IsPaired() && 
	       al2Null ){
		al2=al;
		al2Null=false;
		continue;
	    }else{
		if(al.IsPaired() && 
		   !al2Null){

		    bool  result =  mtr.processPair(al,al2);
		
		    if( result ){//was merged
			BamAlignment orig;
			BamAlignment orig2;

			if(keepOrig){
			    orig2 = al2;
			    orig  = al;
			}

			writer.SaveAlignment(al);

			if(keepOrig){
			    orig.SetIsDuplicate(true);
			    orig2.SetIsDuplicate(true);
			    writer.SaveAlignment(orig2);
			    writer.SaveAlignment(orig);
			}

			//the second record is empty
		    }else{
			//keep the sequences as pairs

			writer.SaveAlignment(al2);		    
			writer.SaveAlignment(al);
		    }

		    //
		    //  SINGLE END
		    //
		}else{ 
		    BamAlignment orig;
		    if(keepOrig){
			orig =al;
		    }
		    mtr.processSingle(al);

		    if(keepOrig){
			//write duplicate
			if(orig.QueryBases.length()  != al.QueryBases.length()){
			    orig.SetIsDuplicate(true);
			    writer.SaveAlignment(orig);
			}
		    }
		    writer.SaveAlignment(al);



		} //end single end
		al2Null=true;
	    }//second pair
		    

	} //while al
	reader.Close();
	writer.Close();


    } //else BAM


    cerr <<mtr.reportSingleLine()<<endl;

    if(printLog){
	ofstream fileLog;
	fileLog.open(logFileName.c_str());

	if (fileLog.is_open()){
	    fileLog <<mtr.reportMultipleLines() <<endl;

	}else{
	    cerr << "Unable to print to file "<<logFileName<<endl;
	}
	fileLog.close();
    }
    return 0;
}
Esempio n. 22
0
int main (int argc, char *argv[]) {

    int  minBaseQuality = 0;

    string usage=string(""+string(argv[0])+"  [in BAM file] [in VCF file] [chr name] [deam out BAM] [not deam out BAM]"+
			"\nThis program divides aligned single end reads into potentially deaminated\n"+
			"\nreads and the puts the rest into another bam file if the deaminated positions are not called as the alternative base in the VCF.\n"+
			"\nTip: if you do not need one of them, use /dev/null as your output\n"+
			"arguments:\n"+
			"\t"+"--bq  [base qual]   : Minimum base quality to flag a deaminated site (Default: "+stringify(minBaseQuality)+")\n"+
			"\n");

    if(argc == 1 ||
       argc < 4  ||
       (argc == 2 && (string(argv[0]) == "-h" || string(argv[0]) == "--help") )
       ){
	cerr << "Usage "<<usage<<endl;
	return 1;       
    }


    for(int i=1;i<(argc-2);i++){ 

	
        if(string(argv[i]) == "--bq"){
	    minBaseQuality=destringify<int>(argv[i+1]);
            i++;
            continue;
	}

    }

    string bamfiletopen = string( argv[ argc-5 ] );
    string vcffiletopen = string( argv[ argc-4 ] );
    string chrname      = string( argv[ argc-3 ] );
    string deambam      = string( argv[ argc-2 ] );
    string nondeambam   = string( argv[ argc-1 ] );

    //dummy reader, will need to reposition anyway
    VCFreader vcfr (vcffiletopen,
 		    vcffiletopen+".tbi",
 		    chrname,
 		    1,
 		    1,
 		    0);

    BamReader reader;
    
    if ( !reader.Open(bamfiletopen) ) {
    	cerr << "Could not open input BAM file"<< bamfiletopen << endl;
    	return 1;
    }

    // if ( !reader.LocateIndex()  ) {
    // 	cerr << "The index for the BAM file cannot be located" << endl;
    // 	return 1;
    // }

    // if ( !reader.HasIndex()  ) {
    // 	cerr << "The BAM file has not been indexed." << endl;
    // 	return 1;
    // }

    //positioning the bam file
    int refid=reader.GetReferenceID(chrname);
    if(refid < 0){
	cerr << "Cannot retrieve the reference ID for "<< chrname << endl;
	return 1;
    }
    //cout<<"redif "<<refid<<endl;	    

    //setting the BAM reader at that position
    reader.SetRegion(refid,
		     0,
		     refid,
		     -1); 	



    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writerDeam;
    if ( !writerDeam.Open(deambam,      header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamWriter writerNoDeam;
    if ( !writerNoDeam.Open(nondeambam, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }



    unsigned int totalReads      =0;
    unsigned int deaminatedReads =0;
    unsigned int ndeaminatedReads =0;
    unsigned int skipped      =0;



    //iterating over the alignments for these regions
    BamAlignment al;
    int i;

    while ( reader.GetNextAlignment(al) ) {
	// cerr<<al.Name<<endl;

	//skip unmapped
	if(!al.IsMapped()){
	    skipped++;
	    continue;
	}

	//skip paired end !
	if(al.IsPaired() ){  
	    continue;
	    // cerr<<"Paired end not yet coded"<<endl;
	    // return 1;
	}


	string reconstructedReference = reconstructRef(&al);



	char refeBase;
	char readBase;
	bool isDeaminated;
	if(al.Qualities.size() != reconstructedReference.size()){
	    cerr<<"Quality line is not the same size as the reconstructed reference"<<endl;
	    return 1;
	}

	isDeaminated=false;

	if(al.IsReverseStrand()){

	    //first base next to 3'
	    i = 0 ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    if(  readBase  == 'A' && int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;			
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem1 position "<<*toprint<<" does not have a "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }
		    
		    //if the VCF has a at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }


	    //second base next to 3'
	    i = 1;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);

	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+2,al.Position+2);

		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    // cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<"Problem2 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one G but no A 
		    // if(toprint->hasAtLeastOneG() &&
		    //    toprint->getAlt().find("A") == string::npos){
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}
	    }

	    //last  base next to 5'
	    i = (al.QueryBases.length()-1) ;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'G'  &&
	    if( readBase  == 'A' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  //isDeaminated=true; }

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);


		int lengthMatches=countMatchesRecons(reconstructedReference,0);		
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);
		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();

		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem3 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }


		    //if the VCF has at least one G but no A
		    if(  toprint->hasAtLeastOneG() && 
			!toprint->hasAtLeastOneA() ){
			isDeaminated=true; 
		    }
		}

	    }

	}else{

		
	    //first base next to 5'
	    i = 0;
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C' 
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){ 

		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);

		vcfr.repositionIterator(chrname,al.Position+1,al.Position+1);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //cout<<*toprint<<endl;
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;
		    
		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<numberOfDeletions(&al)<<endl;			
			cerr<<"Problem4 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    //if the VCF has at least one C but no T
		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }

		}

		//cout<<al.Position+
		 
	    }

	    //second last base next to 3'
	    i = (al.QueryBases.length()-2);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //refeBase  == 'C'  &&
	    if( readBase  == 'T' &&  int(al.Qualities[i]-offset) >= minBaseQuality){  



		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		
		int lengthMatches=countMatchesRecons(reconstructedReference,1);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem5 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

		 

	    }

	    //last base next to 3'
	    i = (al.QueryBases.length()-1);
	    refeBase=toupper(reconstructedReference[i]);
	    readBase=toupper(         al.QueryBases[i]);
	    //&& refeBase  == 'C' 
	    if( readBase  == 'T'  && int(al.Qualities[i]-offset) >= minBaseQuality){  
		if( skipAlign(reconstructedReference,&al,&skipped) ){ continue; }
		transformRef(&refeBase,&readBase);		

		int lengthMatches=countMatchesRecons(reconstructedReference,0);	
		int positionJump = al.Position+lengthMatches+numberOfDeletions(&al);

		vcfr.repositionIterator(chrname,positionJump,positionJump);
		while(vcfr.hasData()){
		    SimpleVCF * toprint=vcfr.getData();
		    //skip deletions in the alt
		    if(toprint->getRef().length() != 1 )
			continue;

		    if(toprint->getRef()[0] != refeBase){
			cerr<<reconstructedReference<<endl;
			cerr<<al.Position<<endl;
			cerr<<lengthMatches<<endl;
			cerr<<numberOfDeletions(&al)<<endl;
			cerr<<positionJump<<endl;
			cerr<<"Problem6 position "<<*toprint<<" does not have a  "<<refeBase<<" as reference allele for read "<<al.Name<<endl;
			exit(1);
		    }

		    if(  toprint->hasAtLeastOneC() && 
			!toprint->hasAtLeastOneT() ){
			isDeaminated=true; 
		    }
		}

	    }	

	   
	    
	}
		  



	totalReads++;

	if(isDeaminated){
	    deaminatedReads++;
	    writerDeam.SaveAlignment(al);		
	}else{
	    ndeaminatedReads++;
	    writerNoDeam.SaveAlignment(al);		
	}


    
    }//end for each read









    reader.Close();
    writerDeam.Close();
    writerNoDeam.Close();

    cerr<<"Program finished sucessfully, out of "<<totalReads<<" mapped reads (skipped: "<<skipped<<" reads) we flagged "<<deaminatedReads<<" as deaminated and "<<ndeaminatedReads<<" as not deaminated"<<endl;

   
    return 0;
}
Esempio n. 23
0
int IonstatsAlignment(int argc, const char *argv[])
{
  OptArgs opts;
  opts.ParseCmdLine(argc, argv);
  string input_bam_filename   = opts.GetFirstString('i', "input", "");
  string output_json_filename = opts.GetFirstString('o', "output", "ionstats_alignment.json");
  int histogram_length        = opts.GetFirstInt   ('h', "histogram-length", 400);

  if(argc < 2 or input_bam_filename.empty()) {
    IonstatsAlignmentHelp();
    return 1;
  }

  //
  // Prepare for metric calculation
  //

  BamReader input_bam;
  if (!input_bam.Open(input_bam_filename)) {
    fprintf(stderr, "[ionstats] ERROR: cannot open %s\n", input_bam_filename.c_str());
    return 1;
  }

  ReadLengthHistogram called_histogram;
  ReadLengthHistogram aligned_histogram;
  ReadLengthHistogram AQ7_histogram;
  ReadLengthHistogram AQ10_histogram;
  ReadLengthHistogram AQ17_histogram;
  ReadLengthHistogram AQ20_histogram;
  ReadLengthHistogram AQ47_histogram;
  SimpleHistogram error_by_position;

  called_histogram.Initialize(histogram_length);
  aligned_histogram.Initialize(histogram_length);
  AQ7_histogram.Initialize(histogram_length);
  AQ10_histogram.Initialize(histogram_length);
  AQ17_histogram.Initialize(histogram_length);
  AQ20_histogram.Initialize(histogram_length);
  AQ47_histogram.Initialize(histogram_length);
  error_by_position.Initialize(histogram_length);

  BamAlignment alignment;
  vector<char>  MD_op;
  vector<int>   MD_len;
  MD_op.reserve(1024);
  MD_len.reserve(1024);
  string MD_tag;

  //
  // Main loop over mapped reads in the input BAM
  //

  while(input_bam.GetNextAlignment(alignment)) {

    // Record read length
    called_histogram.Add(alignment.Length);

    if (!alignment.IsMapped() or !alignment.GetTag("MD",MD_tag))
      continue;

    //
    // Step 1. Parse MD tag
    //

    MD_op.clear();
    MD_len.clear();

    for (const char *MD_ptr = MD_tag.c_str(); *MD_ptr;) {

      int item_length = 0;
      if (*MD_ptr >= '0' and *MD_ptr <= '9') {    // Its a match
        MD_op.push_back('M');
        for (; *MD_ptr and *MD_ptr >= '0' and *MD_ptr <= '9'; ++MD_ptr)
          item_length = 10*item_length + *MD_ptr - '0';
      } else {
        if (*MD_ptr == '^') {                     // Its a deletion
          MD_ptr++;
          MD_op.push_back('D');
        } else                                    // Its a substitution
          MD_op.push_back('X');
        for (; *MD_ptr and *MD_ptr >= 'A' and *MD_ptr <= 'Z'; ++MD_ptr)
          item_length++;
      }
      MD_len.push_back(item_length);
    }

    //
    // Step 2. Synchronously scan through Cigar and MD, doing error accounting
    //

    int MD_idx = alignment.IsReverseStrand() ? MD_op.size()-1 : 0;
    int cigar_idx = alignment.IsReverseStrand() ? alignment.CigarData.size()-1 : 0;
    int increment = alignment.IsReverseStrand() ? -1 : 1;

    int AQ7_bases = 0;
    int AQ10_bases = 0;
    int AQ17_bases = 0;
    int AQ20_bases = 0;
    int AQ47_bases = 0;
    int num_bases = 0;
    int num_errors = 0;

    while (cigar_idx < (int)alignment.CigarData.size() and MD_idx < (int) MD_op.size() and cigar_idx >= 0 and MD_idx >= 0) {

      if (alignment.CigarData[cigar_idx].Length == 0) { // Try advancing cigar
        cigar_idx += increment;
        continue;
      }
      if (MD_len[MD_idx] == 0) { // Try advancing MD
        MD_idx += increment;
        continue;
      }

      // Match
      if (alignment.CigarData[cigar_idx].Type == 'M' and MD_op[MD_idx] == 'M') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        num_bases += advance;
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Insertion (read has a base, reference doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'I') {
        int advance = alignment.CigarData[cigar_idx].Length;
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position.Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;

      // Deletion (reference has a base, read doesn't)
      } else if (alignment.CigarData[cigar_idx].Type == 'D' and MD_op[MD_idx] == 'D') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position.Add(num_bases);
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      // Substitution
      } else if (MD_op[MD_idx] == 'X') {
        int advance = min((int)alignment.CigarData[cigar_idx].Length, MD_len[MD_idx]);
        for (int cnt = 0; cnt < advance; ++cnt) {
          error_by_position.Add(num_bases);
          num_bases++;
          num_errors++;
        }
        alignment.CigarData[cigar_idx].Length -= advance;
        MD_len[MD_idx] -= advance;

      } else {
        printf("ionstats alignment: Unexpected OP combination: %s Cigar=%c, MD=%c !\n",
            alignment.Name.c_str(), alignment.CigarData[cigar_idx].Type, MD_op[MD_idx]);
        break;
      }

      if (num_errors*5 <= num_bases)    AQ7_bases = num_bases;
      if (num_errors*10 <= num_bases)   AQ10_bases = num_bases;
      if (num_errors*50 <= num_bases)   AQ17_bases = num_bases;
      if (num_errors*100 <= num_bases)  AQ20_bases = num_bases;
      if (num_errors == 0)              AQ47_bases = num_bases;
    }

    //
    // Step 3. Profit
    //

    if (num_bases >= 20)    aligned_histogram.Add(num_bases);
    if (AQ7_bases >= 20)    AQ7_histogram.Add(AQ7_bases);
    if (AQ10_bases >= 20)   AQ10_histogram.Add(AQ10_bases);
    if (AQ17_bases >= 20)   AQ17_histogram.Add(AQ17_bases);
    if (AQ20_bases >= 20)   AQ20_histogram.Add(AQ20_bases);
    if (AQ47_bases >= 20)   AQ47_histogram.Add(AQ47_bases);
  }

  input_bam.Close();


  //
  // Processing complete, generate ionstats_alignment.json
  //

  Json::Value output_json(Json::objectValue);
  output_json["meta"]["creation_date"] = get_time_iso_string(time(NULL));
  output_json["meta"]["format_name"] = "ionstats_alignment";
  output_json["meta"]["format_version"] = "1.0";

  called_histogram.SaveToJson(output_json["full"]);
  aligned_histogram.SaveToJson(output_json["aligned"]);
  AQ7_histogram.SaveToJson(output_json["AQ7"]);
  AQ10_histogram.SaveToJson(output_json["AQ10"]);
  AQ17_histogram.SaveToJson(output_json["AQ17"]);
  AQ20_histogram.SaveToJson(output_json["AQ20"]);
  AQ47_histogram.SaveToJson(output_json["AQ47"]);
  error_by_position.SaveToJson(output_json["error_by_position"]);

  ofstream out(output_json_filename.c_str(), ios::out);
  if (out.good()) {
    out << output_json.toStyledString();
    return 0;
  } else {
    fprintf(stderr, "ERROR: unable to write to '%s'\n", output_json_filename.c_str());
    return 1;
  }

  return 0;
}
void PileupEngine::PileupEnginePrivate::ParseAlignmentCigar(const BamAlignment& al) {
  
    // skip if unmapped
    if ( !al.IsMapped() ) return;
    
    // intialize local variables
    int  genomePosition      = al.Position;
    int  positionInAlignment = 0;
    bool isNewReadSegment    = true;
    bool saveAlignment       = true;    
    PileupAlignment pileupAlignment(al);
    
    // iterate over CIGAR operations
    const int numCigarOps = (const int)al.CigarData.size();
    for (int i = 0; i < numCigarOps; ++i ) { 
        const CigarOp& op = al.CigarData.at(i);
      
        // if op is MATCH
        if ( op.Type == 'M' ) {
          
            // if match op overlaps current position
            if ( genomePosition + (int)op.Length > CurrentPosition ) {
              
                // set pileup data
                pileupAlignment.IsCurrentDeletion   = false;
                pileupAlignment.IsNextDeletion      = false;
                pileupAlignment.IsNextInsertion     = false;
                pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition);
                
                // check for beginning of read segment
                if ( genomePosition == CurrentPosition && isNewReadSegment ) 
                    pileupAlignment.IsSegmentBegin = true;
                
                // if we're at the end of a match operation
                if ( genomePosition + (int)op.Length - 1 == CurrentPosition ) {
                    
                    // if not last operation
                    if ( i < numCigarOps - 1 ) {
                        
                        // check next CIGAR op
                        const CigarOp& nextOp = al.CigarData.at(i+1);
                        
                        // if next CIGAR op is DELETION
                        if ( nextOp.Type == 'D') {
                            pileupAlignment.IsNextDeletion = true;
                            pileupAlignment.DeletionLength = nextOp.Length;
                        }
                        
                        // if next CIGAR op is INSERTION
                        else if ( nextOp.Type == 'I' ) {
                            pileupAlignment.IsNextInsertion = true;
                            pileupAlignment.InsertionLength = nextOp.Length;
                        }
                            
                        // if next CIGAR op is either DELETION or INSERTION
                        if ( nextOp.Type == 'D' || nextOp.Type == 'I' ) {

                            // if there is a CIGAR op after the DEL/INS
                            if ( i < numCigarOps - 2 ) {
                                const CigarOp& nextNextOp = al.CigarData.at(i+2);
                                
                                // if next CIGAR op is clipping or ref_skip
                                if ( nextNextOp.Type == 'S' || 
                                     nextNextOp.Type == 'N' ||
                                     nextNextOp.Type == 'H' )
                                    pileupAlignment.IsSegmentEnd = true;
                            } 
                            else {
                                pileupAlignment.IsSegmentEnd = true;
                                
                                // if next CIGAR op is clipping or ref_skip
                                if ( nextOp.Type == 'S' || 
                                     nextOp.Type == 'N' ||
                                     nextOp.Type == 'H' )
                                    pileupAlignment.IsSegmentEnd = true;
                            }
                        }
                        
                        // otherwise
                        else { 
                        
                            // if next CIGAR op is clipping or ref_skip
                            if ( nextOp.Type == 'S' || 
                                 nextOp.Type == 'N' ||
                                 nextOp.Type == 'H' )
                                pileupAlignment.IsSegmentEnd = true;
                        }
                    }
                    
                    // else this is last operation
                    else pileupAlignment.IsSegmentEnd = true;
                }
            }
          
            // increment markers
            genomePosition      += op.Length;
            positionInAlignment += op.Length;
        } 
        
        // if op is DELETION
        else if ( op.Type == 'D' ) {
          
            // if deletion op overlaps current position
            if ( genomePosition + (int)op.Length > CurrentPosition ) {
              
                // set pileup data
                pileupAlignment.IsCurrentDeletion   = true;
                pileupAlignment.IsNextDeletion      = false;
                pileupAlignment.IsNextInsertion     = true;
                pileupAlignment.PositionInAlignment = positionInAlignment + (CurrentPosition - genomePosition);
            }
            
            // increment marker
            genomePosition += op.Length;
        }

        // if op is REF_SKIP
        else if ( op.Type == 'N' ) {
            genomePosition += op.Length;
        }
        
        // if op is INSERTION or SOFT_CLIP
        else if ( op.Type == 'I' || op.Type == 'S' ) {
            positionInAlignment += op.Length;
        }
        
        // checl for beginning of new read segment
        if ( op.Type == 'N' ||
             op.Type == 'S' ||
             op.Type == 'H' )
            isNewReadSegment = true;
        else 
            isNewReadSegment = false;
      
        // if we've moved beyond current position
        if ( genomePosition > CurrentPosition ) {
            if ( op.Type == 'N' ) saveAlignment = false; // ignore alignment if REF_SKIP
            break;
        }
    }

    // save pileup position if flag is true
    if ( saveAlignment )
        CurrentPileupData.PileupAlignments.push_back( pileupAlignment );
}
Esempio n. 25
0
int main (int argc, char *argv[]) {

    bool mapped  =false;
    bool unmapped=false;

    const string usage=string(string(argv[0])+" [options] input.bam out.bam"+"\n\n"+
			      "This program takes a BAM file as input and produces\n"+
			      "another where the putative deaminated bases have\n"+
			      "have been cut\n"+
			      "\n"+
			      "Options:\n");
			      // "\t"+"-u , --unmapped" +"\n\t\t"+"For an unmapped bam file"+"\n"+
			      // "\t"+"-m , --mapped"   +"\n\t\t"+"For an mapped bam file"+"\n");
			      
			      

    if( (argc== 1) ||
	(argc== 2 && string(argv[1]) == "-h") ||
	(argc== 2 && string(argv[1]) == "-help") ||
	(argc== 2 && string(argv[1]) == "--help") ){
	cout<<"Usage:"<<endl;
	cout<<usage<<endl;
	cout<<""<<endl;
	return 1;
    }

    // for(int i=1;i<(argc-1);i++){ //all but the last arg

    // 	if(strcmp(argv[i],"-m") == 0 || strcmp(argv[i],"--mapped") == 0 ){
    // 	    mapped=true;
    // 	    continue;
    // 	}

    // 	if(strcmp(argv[i],"-u") == 0 || strcmp(argv[i],"--unmapped") == 0 ){
    // 	    unmapped=true;
    // 	    continue;
    // 	}
       
    // 	cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl;
    // 	return 1;
    // }

    if(argc != 3){
	cerr<<"Error: Must specify the input and output BAM files";
	return 1;
    }

    string inbamFile =argv[argc-2];
    string outbamFile=argv[argc-1];

    // if(!mapped && !unmapped){
    // 	cerr << "Please specify whether you reads are mapped or unmapped" << endl;
    // 	return 1;
    // }
    // if(mapped && unmapped){
    // 	cerr << "Please specify either mapped or unmapped but not both" << endl;
    // 	return 1;
    // }

    BamReader reader;

    if ( !reader.Open(inbamFile) ) {
    	cerr << "Could not open input BAM files." << endl;
    	return 1;
    }


    vector<RefData>  testRefData=reader.GetReferenceData();
    const SamHeader header = reader.GetHeader();
    const RefVector references = reader.GetReferenceData();

    BamWriter writer;
    if ( !writer.Open(outbamFile, header, references) ) {
	cerr << "Could not open output BAM file" << endl;
	return 1;
    }

    BamAlignment al;
    // BamAlignment al2;
    // bool al2Null=true;
    
    while ( reader.GetNextAlignment(al) ) {

	    if(al.IsPaired() ){  

		if(al.IsFirstMate() ){ //5' end, need to check first base only
		    if(al.IsReverseStrand()){ //
			if(!al.IsMapped()){
			    cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			    //return 1;
			}
			int indexToCheck;
			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);			 
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			}
		    }else{
			int indexToCheck;
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			}
		    }
		}else{ //3' end, need to check last two bases only
		    if( al.IsSecondMate() ){
			if(al.IsReverseStrand()){ //
			    if(!al.IsMapped()){
				cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
				//return 1;
			    }
			    int indexToCheck;

			    //second to last
			    indexToCheck=al.QueryBases.length()-2;
			    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    }else{
				//last
				indexToCheck=al.QueryBases.length()-1;
				if(toupper(al.QueryBases[indexToCheck]) == 'T'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
				    al.Qualities  = al.Qualities.substr(0, indexToCheck);
				}
			    }
			}else{
			    int indexToCheck;
			    //second base
			    indexToCheck=1;
			    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				//al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    }else{
				//first base
				indexToCheck=0;
				if(toupper(al.QueryBases[indexToCheck]) == 'A'){
				    //al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
				    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
				    al.Qualities  = al.Qualities.substr(indexToCheck+1);
				}
			    }

			}
		    }else{
			cerr << "Wrong state" << endl;
			return 1;
		    }
		}

	    }//end of paired end
	    else{//we consider single reads to have been sequenced from 5' to 3'

		if(al.IsReverseStrand()){ //need to consider 
		    if(!al.IsMapped()){
			cerr << "Cannot have reverse complemented unmapped reads: " <<al.Name<< endl;
			//return 1;
		    }

		    int indexToCheck;



		    //second base
		    indexToCheck=1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"51 "<<al.QueryBases<<endl;
			// cout<<"51 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"52 "<<al.QueryBases<<endl;
			// cout<<"52 "<<al.Qualities<<endl;
		    }else{
			//first base
			indexToCheck=0;
			if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"61 "<<al.QueryBases<<endl;
			    // cout<<"61 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			    al.Qualities  = al.Qualities.substr(indexToCheck+1);
			    // cout<<"62 "<<al.QueryBases<<endl;
			    // cout<<"62 "<<al.Qualities<<endl;
			}

		    }


		    //last
		    indexToCheck=al.QueryBases.length()-1;
		    if(toupper(al.QueryBases[indexToCheck]) == 'A'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"21 "<<al.QueryBases<<endl;
			// cout<<"21 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"22 "<<al.QueryBases<<endl;
			// cout<<"22 "<<al.Qualities<<endl;
		    }
		}else{

		    int indexToCheck;
		    //first base
		    indexToCheck=0;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);

			// cout<<"11 "<<al.QueryBases<<endl;
			// cout<<"11 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(indexToCheck+1);
			al.Qualities  = al.Qualities.substr(indexToCheck+1);
			// cout<<"12 "<<al.QueryBases<<endl;
			// cout<<"12 "<<al.Qualities<<endl;

		    }

		    //second to last
		    indexToCheck=al.QueryBases.length()-2;
		    if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			// al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			// cout<<"31 "<<al.QueryBases<<endl;
			// cout<<"31 "<<al.Qualities<<endl;
			al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			al.Qualities  = al.Qualities.substr(0, indexToCheck);
			// cout<<"32 "<<al.QueryBases<<endl;
			// cout<<"32 "<<al.Qualities<<endl;
		    }else{

			//last
			indexToCheck=al.QueryBases.length()-1;
			if(toupper(al.QueryBases[indexToCheck]) == 'T'){
			    // al.Qualities[indexToCheck]=char(offset+baseQualForDeam);
			    // cout<<"41 "<<al.QueryBases<<endl;
			    // cout<<"41 "<<al.Qualities<<endl;
			    al.QueryBases = al.QueryBases.substr(0,indexToCheck);
			    al.Qualities  = al.Qualities.substr(0, indexToCheck);
			    // cout<<"42 "<<al.QueryBases<<endl;
			    // cout<<"42 "<<al.Qualities<<endl;
			}

		    }
		}
	    }//end of single end

	    writer.SaveAlignment(al);		


    }//    while ( reader.GetNextAlignment(al) ) {

    reader.Close();
    writer.Close();
   
    return 0;
}
int main_asequantmultirg(const vector<string> &all_args)
{
    Init(all_args);
    
    cerr << "* Reading bam file " << endl;
    OpenBam(bam_reader, bam_file);
    bam_reader.OpenIndex(bam_file + ".bai");
    
    vector<string> readGroupVector;
    SamHeader header = bam_reader.GetHeader();
    SamReadGroupDictionary headerRG = header.ReadGroups;
    for (SamReadGroupIterator it = headerRG.Begin(); it != headerRG.End(); it ++)
    {
        readGroupVector.push_back(it -> ID);
    }
    
    
    vector<RefData> chroms = bam_reader.GetReferenceData();
    
    cout << "#CHROM" << "\t" << "POS" << "\t" << "REF" << "\t" << "ALT";
    for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
    {
        cout << "\t" << *it;
    }
    cout << endl;
    
    StlFor(chrom_idx, chroms)
    {
        string &chrom = chroms[chrom_idx].RefName;
        vector<Snp> snps = snps_by_chrom[chrom];
        
        int s = 0; // Index into snp array
        
        BamAlignment bam;
        bam_reader.Jump(chrom_idx);
        
        string align;
        string qualities;
        
        cerr << "* On chrom " << chrom << endl;

        while (bam_reader.GetNextAlignment(bam) && bam.RefID == chrom_idx) 
        {
	  if (bam.MapQuality < min_map_qual || !bam.IsMapped())
                continue;
       
            string currentRG;
            Assert(bam.GetReadGroup(currentRG));
            
            int start = AlignStart(bam);
            int end = AlignEnd(bam);
            
            // Move the current SNP pointer so that it is ahead of the read's start (since bam alignments are in sorted order)
            while (s < snps.size() && snps[s].pos < start)
                ++s;
            
            // Stop everything if we have visited all SNPs on this chrom
            if (s >= snps.size())
                break;
            
            // Find any/all SNPs that are within the bam alignment
            int n = 0; // Number of SNPs overlapped
            while ((s + n) < snps.size() && snps[s + n].pos < end) // Then it overlaps!
                ++n;
            
            // Now, look at each SNP and see which way it votes
            AlignedString(bam, align);
            AlignedQualities(bam, qualities);
            Assert(align.size() == qualities.size());

            // Now, tally votes
            for (int i = 0; i < n; ++i)
            {
                Snp &snp = snps[s + i];
                char base = align[snp.pos - start]; // Base from the read
                int qual = int(qualities[snp.pos - start]) - ascii_offset; // Base from the read
                
                //AssertMsg(qual >= 0 && qual <= 100, ToStr(qual) + "\n" + bam.Name + "\n" + CigarToStr(bam.CigarData) + "\n" + bam.QueryBases + "\n" + bam.Qualities);
                
                if (base == '-' || qual < min_base_qual)
                    continue;
                
                map<string, Counts> &RG_counts = bam.IsReverseStrand() ? snp.rev : snp.fwd;
                
                map<string, Counts>::iterator searchIt = RG_counts.find(currentRG);
                
                if (searchIt == RG_counts.end())
                {
                    if (base == snp.ref)
                    {
                        RG_counts[currentRG].num_ref = 1;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 0;
                    }
                    else if (base == snp.alt)
                    {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 1;
                        RG_counts[currentRG].num_other = 0;
                    }
                    else
                    {
                        RG_counts[currentRG].num_ref = 0;
                        RG_counts[currentRG].num_alt = 0;
                        RG_counts[currentRG].num_other = 1;
                    }
                }
                else
                {
                    if (base == snp.ref)
                    {
                        searchIt -> second.num_ref += 1;
                    }
                    else if (base == snp.alt)
                    {
                        searchIt -> second.num_alt += 1;
                    }
                    else
                    {
                        searchIt -> second.num_other += 1;
                    }
                }
            }
        }
        
        // Output counts
        for (int s = 0; s < snps.size(); ++s)
        {
            cout << chrom << "\t" << snps[s].pos + 1 << "\t" << snps[s].ref << "\t" << snps[s].alt;
            for (vector<string>::iterator it = readGroupVector.begin(); it != readGroupVector.end(); it ++)
            {
                map<string, Counts>::iterator searchIt = snps[s].fwd.find(*it);
                if (searchIt != snps[s].fwd.end())
                {
                    cout << "\t" << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other << ",";
                }
                else
                {
                    cout << "\t" << "0,0,0,";
                }
                searchIt = snps[s].rev.find(*it);
                if (searchIt != snps[s].rev.end())
                {
                    cout << searchIt -> second.num_ref << "," << searchIt -> second.num_alt << "," << searchIt -> second.num_other;
                }
                else
                {
                    cout << "0,0,0";
                }
            }
            cout << endl;
        }
    }
Esempio n. 27
0
File: main.cpp Progetto: egafni/glia
void realign_bam(Parameters& params) {

    FastaReference reference;
    reference.open(params.fasta_reference);

    bool suppress_output = false;

    int dag_window_size = params.dag_window_size;
    
    // open BAM file
    BamReader reader;
    if (!reader.Open("stdin")) {
        cerr << "could not open stdin for reading" << endl;
        exit(1);
    }

    BamWriter writer;
    if (!params.dry_run && !writer.Open("stdout", reader.GetHeaderText(), reader.GetReferenceData())) {
        cerr << "could not open stdout for writing" << endl;
        exit(1);
    }

    // store the names of all the reference sequences in the BAM file
    map<int, string> referenceIDToName;
    vector<RefData> referenceSequences = reader.GetReferenceData();
    int i = 0;
    for (RefVector::iterator r = referenceSequences.begin(); r != referenceSequences.end(); ++r) {
        referenceIDToName[i] = r->RefName;
        ++i;
    }

    vcf::VariantCallFile vcffile;
    if (!params.vcf_file.empty()) {
        if (!vcffile.open(params.vcf_file)) {
            cerr << "could not open VCF file " << params.vcf_file << endl;
            exit(1);
        }
    } else {
        cerr << "realignment requires VCF file" << endl;
        exit(1);
    }
    vcf::Variant var(vcffile);

    BamAlignment alignment;
    map<long int, vector<BamAlignment> > alignmentSortQueue;

    // get alignment
    // assemble DAG in region around alignment
    // loop for each alignment in BAM:
    //     update DAG when current alignment gets close to edge of assembled DAG
    //     attempt to realign if read has a certain number of mismatches + gaps or softclips, weighted by basequal
    //     if alignment to DAG has fewer mismatches and gaps than original alignment, use it
    //         flatten read into reference space (for now just output alleles from VCF un-spanned insertions)
    //     write read to queue for streaming re-sorting (some positional change will occur)

    long int dag_start_position = 0;
    string currentSeqname;
    string ref;
    //vector<Cigar> cigars; // contains the Cigar strings of nodes in the graph
    //vector<long int> refpositions; // contains the reference start coords of nodes in the graph
    ReferenceMappings ref_map;
    gssw_graph* graph = gssw_graph_create(0);
    int8_t* nt_table = gssw_create_nt_table();
    int8_t* mat = gssw_create_score_matrix(params.match, params.mism);

    int total_reads = 0;
    int total_realigned = 0;
    int total_improved = 0;
    bool emptyDAG = false; // if the dag is constructed over empty sequence
                           // such as when realigning reads mapped to all-N sequence
    if (params.debug) {
        cerr << "about to start processing alignments" << endl;
    }

    while (reader.GetNextAlignment(alignment)) {

        string& seqname = referenceIDToName[alignment.RefID];

        if (params.debug) {
            cerr << "--------------------------------------------" << endl
                 << "processing alignment " << alignment.Name << " at "
                 << seqname << ":" << alignment.Position << endl;
        }

        /*
        if (!alignment.IsMapped() && graph->size == 0) {
            if (params.debug) {
                cerr << "unable to build DAG using unmapped read "
                     << alignment.Name << " @ "
                     << seqname << ":" << alignment.Position
                     << " no previous mapped read found and DAG currently empty" << endl;
            }
            alignmentSortQueue[dag_start_position+dag_window_size].push_back(alignment);
            continue;
        }
        */

        ++total_reads;

        BamAlignment originalAlignment = alignment;
        long unsigned int initialAlignmentPosition = alignment.Position;
        //if (dag_start_position == 1) {
        //    dag_start_position = max(1, (int)initialAlignmentPosition - dag_window_size/2);
        //}

        // should we construct a new DAG?  do so when 3/4 of the way through the current one
        // center on current position + 1/2 dag window
        // TODO check this scheme using some scribbles on paper
        // alignment.IsMapped()
        if ((seqname != currentSeqname
             || ((alignment.Position + (alignment.QueryBases.size()/2)
                  > (3*dag_window_size/4) + dag_start_position)))
            && alignment.Position < reference.sequenceLength(seqname)) {

            if (seqname != currentSeqname) {
                if (params.debug) {
                    cerr << "switched ref seqs" << endl;
                }
                dag_start_position = max((long int) 0,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            // recenter DAG
            } else if (!ref_map.empty()) {
                dag_start_position = dag_start_position + dag_window_size/2;
                dag_start_position = max(dag_start_position,
                                         (long int) (alignment.GetEndPosition() - dag_window_size/2));
            } else {
                dag_start_position = alignment.Position - dag_window_size/2;
            }
            dag_start_position = max((long int)0, dag_start_position);

            // TODO get sequence length and use to bound noted window size (edge case)
            //cerr << "getting ref " << seqname << " " << max((long int) 0, dag_start_position) << " " << dag_window_size << endl;

            // get variants for new DAG
            vector<vcf::Variant> variants;
            if (!vcffile.setRegion(seqname,
                                   dag_start_position + 1,
                                   dag_start_position + dag_window_size)) {
                // this is not necessarily an error; there should be a better way to check for VCF file validity
                /*
                cerr << "could not set region on VCF file to " << currentSeqname << ":"
                     << dag_start_position << "-" << dag_start_position + ref.size()
                     << endl;
                */
                //exit(1);
            } else {

                // check first variant
                if (vcffile.getNextVariant(var)) {
                    while (var.position <= dag_start_position + 1) {
                        //cerr << "var position == dag_start_position " << endl;
                        dag_start_position -= 1;
                        vcffile.setRegion(seqname,
                                          dag_start_position + 1,
                                          dag_start_position + dag_window_size);
                        if (!vcffile.getNextVariant(var)) { break; }
                    }
                }

                vcffile.setRegion(seqname,
                                  dag_start_position + 1,
                                  dag_start_position + dag_window_size);

                while (vcffile.getNextVariant(var)) {
                    if (params.debug) cerr << "getting variant at " << var.sequenceName << ":" << var.position << endl;
                    //cerr << var.position << " + " << var.ref.length() << " <= " << dag_start_position << " + " << dag_window_size << endl;
                    //cerr << var.position << " >= " << dag_start_position << endl;
                    if (var.position + var.ref.length() <= dag_start_position + dag_window_size
                        && var.position >= dag_start_position) {
                        variants.push_back(var);
                    }
                }

            }

            //cerr << "dag_start_position " << dag_start_position << endl;
            ref = reference.getSubSequence(seqname,
                                           max((long int) 0, dag_start_position),
                                           dag_window_size); // 0/1 conversion

            // clear graph and metadata
            ref_map.clear();
            //cigars.clear();
            //refpositions.clear();
            gssw_graph_destroy(graph);

            if (params.debug) { cerr << "constructing DAG" << endl; }
            // and build the DAG
            graph = gssw_graph_create(0);
            constructDAGProgressive(graph,
                                    ref_map,
                                    ref,
                                    seqname,
                                    variants,
                                    dag_start_position,
                                    nt_table,
                                    mat,
                                    params.flat_input_vcf);

            if (params.debug) {
                cerr << "graph has " << graph->size << " nodes" << endl;
                cerr << "DAG generated from input variants over "
                     << seqname << ":" << dag_start_position << "-" << dag_start_position + dag_window_size
                     << endl;
            }
            if (params.display_dag) {
                gssw_graph_print(graph);
                /*
                for (Backbone::iterator b = backbone.begin(); b != backbone.end(); ++b) {
                    cout << b->first << " "
                         << b->first->id << " "
                         << b->second.ref_position << " "
                         << b->second.cigar << endl
                         << b->first->seq << endl;
                }
                */
            }

            if (graph->size == 1 && allN(ref) || graph->size == 0) {
                if (params.debug) {
                    cerr << "DAG is empty (1 node, all N).  Alignment is irrelevant." << endl;
                }
                emptyDAG = true;
            } else {
                emptyDAG = false;
            }

        }

        AlignmentStats stats_before;
        bool was_mapped = alignment.IsMapped();
        bool has_realigned = false;
        if (was_mapped) {
            if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                ref = reference.getSubSequence(seqname,
                                               max((long int) 0, dag_start_position),
                                               alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
            }
        }

        if (params.debug) {
            if (emptyDAG) {
                cerr << "cannot realign against empty (all-N single node) graph" << endl;
            }
        }

        if (!emptyDAG && shouldRealign(alignment, ref, dag_start_position, params, stats_before)) {

            ++total_realigned;

            if (params.debug) {
                cerr << "realigning: " << alignment.Name
                     << " " << alignment.QueryBases << endl
                     << " aligned @ " << alignment.Position
                     << " to variant graph over "
                     << seqname
                     << ":" << dag_start_position
                     << "-" << dag_start_position + dag_window_size << endl;
            }

            //{
            try {

                Cigar flat_cigar;
                string read = alignment.QueryBases;
                string qualities = alignment.Qualities;
                int score;
                long int position;
                string strand;
                gssw_graph_mapping* gm =
                    gswalign(graph,
                             ref_map,
                             read,
                             qualities,
                             params,
                             position,
                             score,
                             flat_cigar,
                             strand,
                             nt_table,
                             mat);
                //
                gssw_graph_mapping_destroy(gm);

                if (params.dry_run) {

                    if (strand == "-" && !alignment.IsMapped()) {
                        read = reverseComplement(read);
                    }
                    cout << read << endl;
                    cout << graph_mapping_to_string(gm) << endl;
                    cout << score << " " << strand << " "
                         << position << " "
                         << flat_cigar << endl;

                } else {

                    /*
                    if (strand == "-") {
                        read = reverseComplement(trace_report.read);
                    }
                   */
 
                    // TODO the qualities are not on the right side of the read
                    if (strand == "-" && alignment.IsMapped()) {
                        // if we're realigning, this is always true unless we swapped strands
                        alignment.SetIsReverseStrand(true);
                        //reverse(alignment.Qualities.begin(), alignment.Qualities.end()); // reverse qualities
                    }
                    //alignment.QueryBases = reverseComplement(trace_report.read);
                    alignment.QueryBases = read;
                    alignment.Qualities = qualities;

                    alignment.Position = position;// + 1;// + 1;//(trace_report.node->position - 1) + trace_report.x;
                    alignment.SetIsMapped(true);
                    if (!alignment.MapQuality) {
                        alignment.MapQuality = 20; // horrible hack...  at least approximate with alignment mismatches against graph
                    }

                    // check if somehow we've ended up with an indel at the ends
                    // if so, grab the reference sequence right beyond it and add
                    // a single match to the cigar, allowing variant detection methods
                    // to run on the results without internal modification
                    Cigar& cigar = flat_cigar;
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;
                    int flankSize = params.flatten_flank;
                    if (cigar.front().isIndel() ||
                        (cigar.front().isSoftclip() && cigar.at(1).isIndel())) {
                        alignment.Position -= flankSize;
                        string refBase = reference.getSubSequence(seqname, alignment.Position, flankSize);
                        if (cigar.front().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.begin(),
                                                       alignment.QueryBases.begin()+cigar.front().length);
                            alignment.Qualities.erase(alignment.Qualities.begin(),
                                                       alignment.Qualities.begin()+cigar.front().length);
                            cigar.erase(cigar.begin());
                        }
                        alignment.QueryBases.insert(0, refBase);
                        alignment.Qualities.insert(0, string(flankSize, shortInt2QualityChar(30)));
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        newCigar.append(flat_cigar);
                        flat_cigar = newCigar;
                    }
                    if (cigar.back().isIndel() ||
                        (cigar.back().isSoftclip() && cigar.at(cigar.size()-2).isIndel())) {
                        string refBase = reference.getSubSequence(seqname,
                                                                  alignment.Position
                                                                  + flat_cigar.refLen(),
                                                                  flankSize);
                        if (cigar.back().isSoftclip()) {
                            alignment.QueryBases.erase(alignment.QueryBases.end()-cigar.back().length,
                                                       alignment.QueryBases.end());
                            alignment.Qualities.erase(alignment.Qualities.end()-cigar.back().length,
                                                      alignment.Qualities.end());
                            cigar.pop_back();
                        }
                        Cigar newCigar; newCigar.push_back(CigarElement(flankSize, 'M'));
                        flat_cigar.append(newCigar);
                        //flat_cigar.append(newCigar);
                        alignment.QueryBases.append(refBase);
                        alignment.Qualities.append(string(flankSize, shortInt2QualityChar(30)));
                    }

                    flat_cigar.toCigarData(alignment.CigarData);
                    //cerr << flat_cigar << " " << flat_cigar.readLen() << " " << flat_cigar.refLen() << endl;

                    if (dag_start_position + dag_window_size < alignment.GetEndPosition()) {
                        ref = reference.getSubSequence(seqname,
                                                       max((long int) 0, dag_start_position),
                                                       alignment.GetEndPosition() - dag_start_position); // 0/1 conversion
                    }

                    AlignmentStats stats_after;
                    countMismatchesAndGaps(alignment, flat_cigar, ref, dag_start_position, stats_after, params.debug);
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum >= stats_after.softclip_qsum
                                         && stats_before.mismatch_qsum >= stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */
                    /*
                    if ((!was_mapped || (stats_before.softclip_qsum + stats_before.mismatch_qsum
                                         >= stats_after.softclip_qsum + stats_after.mismatch_qsum))
                         && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {
                    */

                    // we accept the new alignment if...
                    if (!was_mapped  // it wasn't mapped previously
                        // or if we have removed soft clips or mismatches (per quality) from the alignment
                        //|| ((stats_before.softclip_qsum >= stats_after.softclip_qsum
                        //     && stats_before.mismatch_qsum >= stats_after.mismatch_qsum)
                        || ((stats_before.softclip_qsum + stats_before.mismatch_qsum
                             >= stats_after.softclip_qsum + stats_after.mismatch_qsum)
                            // and if we have added gaps, we have added them to remove mismatches or softclips
                            && (stats_before.gaps >= stats_after.gaps // accept any time we reduce gaps while not increasing softclips/mismatches
                                || (stats_before.gaps < stats_after.gaps // and allow gap increases when they improve the alignment
                                    && (stats_before.softclip_qsum 
                                        + stats_before.mismatch_qsum
                                        >
                                        stats_after.softclip_qsum
                                        + stats_after.mismatch_qsum))))
                            // and the alignment must not have more than the acceptable number of gaps, softclips, or mismatches
                            // as provided in input parameters
                        && acceptRealignment(alignment, ref, dag_start_position, params, stats_after)) {

                        // keep the alignment
                        // TODO require threshold of softclips to keep alignment (or count of gaps, mismatches,...)
                        if (params.debug) {
                            cerr << "realigned " << alignment.Name << " to graph, which it maps to with "
                                 << stats_after.mismatch_qsum << "q in mismatches and "
                                 << stats_after.softclip_qsum << "q in soft clips" << endl;
                        }
                        ++total_improved;
                        has_realigned = true;
                    } else {
                        // reset to old version of alignment
                        if (params.debug) {
                            cerr << "failed realignment of " << alignment.Name << " to graph, which it maps to with: " 
                                 << stats_after.mismatch_qsum << "q in mismatches " << "(vs " << stats_before.mismatch_qsum << "q before), and "
                                 << stats_after.softclip_qsum << "q in soft clips " << "(vs " << stats_before.softclip_qsum << "q before) " << endl;
                        }
                        has_realigned = false;
                        alignment = originalAlignment;
                    }
                }
                //} // try block

            } catch (...) {
                cerr << "exception when realigning " << alignment.Name
                     << " at position " << referenceIDToName[alignment.RefID]
                     << ":" << alignment.Position
                     << " " << alignment.QueryBases << endl;
                // reset to original alignment
                has_realigned = false;
                alignment = originalAlignment;

            }
        }

        // ensure correct order if alignments move
        long int maxOutputPos = initialAlignmentPosition - dag_window_size;
        // if we switched sequences we need to flush out all the reads from the previous one
        string lastSeqname = currentSeqname;
        if (seqname != currentSeqname) {
            // so the max output position is set past the end of the last chromosome
            if (!currentSeqname.empty()) {
                maxOutputPos = reference.sequenceLength(currentSeqname) + dag_window_size;
            }
            currentSeqname = seqname;
        }

        if (!params.dry_run) {
            map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
            for ( ; p != alignmentSortQueue.end(); ++p) {
                // except if we are running in unsorted mode, stop when we are at the window size
                if (!params.unsorted_output && p->first > maxOutputPos) {
                    break; // no more to do
                } else {
                    for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a) {
                        writer.SaveAlignment(*a);
                    }
                }
            }
            if (p != alignmentSortQueue.begin()) {
                alignmentSortQueue.erase(alignmentSortQueue.begin(), p);
            }
            if (!params.only_realigned || has_realigned) {
                alignmentSortQueue[alignment.Position].push_back(alignment);
            }
        }
    } // end GetNextAlignment loop

    if (!params.dry_run) {
        map<long int, vector<BamAlignment> >::iterator p = alignmentSortQueue.begin();
        for ( ; p != alignmentSortQueue.end(); ++p) {
            for (vector<BamAlignment>::iterator a = p->second.begin(); a != p->second.end(); ++a)
                writer.SaveAlignment(*a);
        }
    }

    gssw_graph_destroy(graph);
    free(nt_table);
	free(mat);

    reader.Close();
    writer.Close();

    if (params.debug) {
        cerr << "total reads:\t" << total_reads << endl;
        cerr << "realigned:\t" << total_realigned << endl;
        cerr << "improved:\t" << total_improved << endl;
    }

}