Example #1
0
// Converts the alignments from the input BAM file(s) into the text format
// selected by m_settings->Format, writing the result through m_out.
//
// Input falls back to Options::StandardIn() when no input file was given.
// When an output filename is set, m_out is temporarily redirected to that file
// and restored before returning.
//
// Returns false if the input, index, region or output could not be set up, or
// if the format is not recognized. For FORMAT_PILEUP the result of
// RunPileupConversion() is returned; otherwise true.
bool ConvertTool::ConvertToolPrivate::Run(void) {

    // ------------------------------------
    // initialize conversion input/output

    // set to default input if none provided
    if ( !m_settings->HasInput )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // open input files
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles) ) {
        cerr << "bamtools convert ERROR: could not open input BAM file(s)... Aborting." << endl;
        return false;
    }

    // if input is not stdin & a region is provided, look for index files
    if ( m_settings->HasInput && m_settings->HasRegion ) {
        if ( !reader.LocateIndexes() ) {
            cerr << "bamtools convert ERROR: could not locate index file(s)... Aborting." << endl;
            reader.Close();
            return false;
        }
    }

    // retrieve reference data
    m_references = reader.GetReferenceData();

    // set region if specified
    BamRegion region;
    if ( m_settings->HasRegion ) {

        if ( !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
            cerr << "bamtools convert ERROR: could not parse REGION: " << m_settings->Region << endl;
            cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
                 << endl;
            reader.Close();
            return false;
        }

        // the region is only applied when the reader has index data
        if ( reader.HasIndexes() && !reader.SetRegion(region) ) {
            cerr << "bamtools convert ERROR: set region failed. Check that REGION describes a valid range" << endl;
            reader.Close();
            return false;
        }
    }

    // if output file given, redirect m_out to it
    ofstream outFile;
    std::streambuf* previousBuffer = 0;
    if ( m_settings->HasOutput ) {

        // open output file stream
        outFile.open(m_settings->OutputFilename.c_str());
        if ( !outFile ) {
            cerr << "bamtools convert ERROR: could not open " << m_settings->OutputFilename
                 << " for output" << endl;
            reader.Close();
            return false;
        }

        // set m_out to file's streambuf, remembering the one it had so that
        // m_out does not keep pointing at outFile after this function returns
        previousBuffer = m_out.rdbuf(outFile.rdbuf());
    }

    // -------------------------------------
    // do conversion based on format

    bool convertedOk = true;

    // pileup is special case
    // conversion not done per alignment, like the other formats
    if ( m_settings->Format == FORMAT_PILEUP )
        convertedOk = RunPileupConversion(&reader);

    // all other formats
    else {

        // set function pointer to proper conversion method
        void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
        if      ( m_settings->Format == FORMAT_BED )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
        else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
        else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
        else if ( m_settings->Format == FORMAT_JSON )  pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
        else if ( m_settings->Format == FORMAT_SAM )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
        else if ( m_settings->Format == FORMAT_YAML )  pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml;
        else {
            cerr << "bamtools convert ERROR: unrecognized format: " << m_settings->Format << endl;
            cerr << "Please see documentation for list of supported formats " << endl;
            convertedOk = false;
        }

        // if format selected ok
        if ( pFunction != 0 ) {

            // if SAM format & not omitting header, print SAM header first
            if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader )
                m_out << reader.GetHeaderText();

            // iterate through file, doing conversion
            BamAlignment a;
            while ( reader.GetNextAlignment(a) )
                (this->*pFunction)(a);
        }
    }

    // ------------------------
    // clean up & exit
    reader.Close();
    if ( m_settings->HasOutput ) {
        // push pending output into the file, then detach m_out from it
        m_out.flush();
        m_out.rdbuf(previousBuffer);
        outFile.close();
    }
    return convertedOk;
}
Example #2
0
// Counts alignments in the input BAM file(s) and prints the total to stdout.
//
// Without a region every alignment is counted. With a region the reader is
// re-opened asking for index files; if index data is loaded the region is
// applied through SetRegion(), otherwise every alignment is read and tested
// for overlap with the region by hand.
//
// Returns 0 on success and 1 if the input could not be opened or the region
// could not be parsed or set.
int CountTool::Run(int argc, char* argv[]) {

    // parse command line arguments
    Options::Parse(argc, argv, 1);

    // if no '-in' args supplied, default to stdin
    if ( !m_settings->HasInput )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // open reader without index
    BamMultiReader reader;
    if (!reader.Open(m_settings->InputFiles, false, true)) {
        cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl;
        return 1;
    }

    // alignment counter
    BamAlignment al;
    int alignmentCount(0);

    // if no region specified, count entire file
    if ( !m_settings->HasRegion ) {
        while ( reader.GetNextAlignmentCore(al) )
            ++alignmentCount;
    }

    // otherwise attempt to use region as constraint
    else {

        // bail out early if the REGION string cannot be parsed
        BamRegion region;
        if ( !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
            cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl;
            cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
            reader.Close();
            return 1;
        }

        // attempt to re-open reader with index files
        reader.Close();
        if ( !reader.Open(m_settings->InputFiles, true, true ) ) {
            cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl;
            return 1;
        }

        // if index data available, we can use SetRegion
        if ( reader.IsIndexLoaded() ) {

            // attempt to use SetRegion(), if failed report error
            if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
                cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl;
                reader.Close();
                return 1;
            }

            // everything checks out, just iterate through specified region, counting alignments
            while ( reader.GetNextAlignmentCore(al) )
                ++alignmentCount;
        }

        // no index data available, we have to iterate through until we
        // find overlapping alignments
        else {
            while ( reader.GetNextAlignmentCore(al) ) {

                // the reference must lie within the region's reference range
                const bool onRegionReference = (al.RefID >= region.LeftRefID) &&
                                               (al.RefID <= region.RightRefID);

                // positions are only comparable on the boundary references:
                // the left bound applies on LeftRefID, the right bound on
                // RightRefID. References strictly in between are fully covered.
                const bool endsBeforeRegion  = (al.RefID == region.LeftRefID) &&
                                               ((al.Position + al.Length) < region.LeftPosition);
                const bool startsAfterRegion = (al.RefID == region.RightRefID) &&
                                               (al.Position > region.RightPosition);

                if ( onRegionReference && !endsBeforeRegion && !startsAfterRegion )
                    ++alignmentCount;
            }
        }
    }

    // print results
    cout << alignmentCount << endl;

    // clean & exit
    reader.Close();
    return 0;
}
Example #3
0
int main (int argc, char * argv[])
{
    vector<string> inputFilenames;
    string combinedOutFilename, alignmentsOutFilename;

    try
    {
        TCLAP::CmdLine cmd("Program description", ' ', VERSION);

        TCLAP::ValueArg<string> combinedOutputArg("o", "out", 
            "Combined output filename (BAM format)", true, "", "combined.bam", cmd);

        TCLAP::ValueArg<int> minInsertArg("n", "min-insert", 
            "Minimum insert size", false, DEFAULT_MIN_GAP, "min insert size", cmd);

        TCLAP::ValueArg<int> maxInsertArg("x", "max-insert", 
            "Maximum insert size", false, DEFAULT_MAX_GAP, "max insert size", cmd);

        TCLAP::MultiArg<string> inputArgs("b", "bam",
            "Input BAM file", true,
            "input.bam", cmd);

        cmd.parse(argc, argv);

        combinedOutFilename = combinedOutputArg.getValue();

        MIN_GAP = minInsertArg.getValue();
        MAX_GAP = maxInsertArg.getValue();
        inputFilenames = inputArgs.getValue();

    } catch (TCLAP::ArgException &e) {
        cerr << "Error: " << e.error() << " " << e.argId() << endl;
    }

    // TODO require that alignments are sorted by name

    BamMultiReader reader;
    reader.Open(inputFilenames);

    if (!ValidOut.Open(combinedOutFilename, reader.GetHeader(),
                       reader.GetReferenceData()))
    {
        cerr << ValidOut.GetErrorString() << endl;
        return 1;
    }

    string current, prev;
    char mateID;
    Group group;
    set<string> references;

    Alignment a;
    while (reader.GetNextAlignment(a))
    {
        parseID(a.Name, current, mateID);

        if (current.compare(prev) && prev.size() > 0)
        {
            processGroup(group, references);
            group.clear();
            references.clear();
        }

        references.insert(a.RefName);

        GroupKey key;
        key.refID = a.RefName;
        key.mateID = mateID;
        key.rev = a.IsReverseStrand();

        group.insert( std::make_pair( key, a ) );

        prev = current;
    }
    processGroup(group, references);
}
Example #4
0
// Converts the alignments from the input BAM file(s) into the text format
// selected by m_settings->Format, writing the result through m_out.
//
// Input falls back to Options::StandardIn() when no input file was given. For
// named input files the reader is first opened with the index flag set and,
// if that fails, retried without it. When an output filename is set, m_out is
// temporarily redirected to that file and restored before returning.
//
// Returns false if the input, region or output could not be set up, or if the
// format is not recognized. For FORMAT_PILEUP the result of
// RunPileupConversion() is returned; otherwise true.
bool ConvertTool::ConvertToolPrivate::Run(void) {

    // ------------------------------------
    // initialize conversion input/output

    // set to default input if none provided
    if ( !m_settings->HasInput )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // open input files
    BamMultiReader reader;
    if ( !m_settings->HasInput ) { // don't attempt to open index for stdin
        if ( !reader.Open(m_settings->InputFiles, false) ) {
            cerr << "Could not open input files" << endl;
            return false;
        }
    } else {
        if ( !reader.Open(m_settings->InputFiles, true) ) {
            // retry without index before giving up
            if ( !reader.Open(m_settings->InputFiles, false) ) {
                cerr << "Could not open input files" << endl;
                return false;
            } else {
                cerr << "Opened reader without index file, jumping is disabled." << endl;
            }
        }
    }
    m_references = reader.GetReferenceData();

    // set region if specified
    BamRegion region;
    if ( m_settings->HasRegion ) {

        if ( !Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
            cerr << "Could not parse REGION: " << m_settings->Region << endl;
            reader.Close();
            return false;
        }

        if ( !reader.SetRegion(region) ) {
            cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
            reader.Close();
            return false;
        }
    }

    // if output file given, redirect m_out to it
    ofstream outFile;
    std::streambuf* previousBuffer = 0;
    if ( m_settings->HasOutput ) {

        // open output file stream
        outFile.open(m_settings->OutputFilename.c_str());
        if ( !outFile ) {
            cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; 
            reader.Close();
            return false;
        }

        // set m_out to file's streambuf, remembering the one it had so that
        // m_out does not keep pointing at outFile after this function returns
        previousBuffer = m_out.rdbuf(outFile.rdbuf());
    }

    // -------------------------------------
    // do conversion based on format

    bool convertedOk = true;

    // pileup is special case
    // conversion not done per alignment, like the other formats
    if ( m_settings->Format == FORMAT_PILEUP )
        convertedOk = RunPileupConversion(&reader);

    // all other formats
    else {

        // set function pointer to proper conversion method
        void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
        if      ( m_settings->Format == FORMAT_BED )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
        else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
        else if ( m_settings->Format == FORMAT_FASTA )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
        else if ( m_settings->Format == FORMAT_FASTQ )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
        else if ( m_settings->Format == FORMAT_JSON )     pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
        else if ( m_settings->Format == FORMAT_SAM )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
        else if ( m_settings->Format == FORMAT_WIGGLE )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
        else if ( m_settings->Format == FORMAT_YAML )     pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml;
        else {
            cerr << "Unrecognized format: " << m_settings->Format << endl;
            cerr << "Please see help|README (?) for details on supported formats " << endl;
            convertedOk = false;
        }

        // if format selected ok
        if ( pFunction != 0 ) {

            // if SAM format & not omitting header, print SAM header first
            if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader )
                m_out << reader.GetHeaderText();

            // iterate through file, doing conversion
            BamAlignment a;
            while ( reader.GetNextAlignment(a) )
                (this->*pFunction)(a);
        }
    }

    // ------------------------
    // clean up & exit

    reader.Close();
    if ( m_settings->HasOutput ) {
        // push pending output into the file, then detach m_out from it
        m_out.flush();
        m_out.rdbuf(previousBuffer);
        outFile.close();
    }
    return convertedOk;
}
Example #5
0
int main ( int argc, char *argv[] ) { 

  struct parameters *param = 0;
  param = interface(param, argc, argv);


  //bam input and generate index if not yet 
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  } else {
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';
          }
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {     // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          } else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }  //file or file name decided and stored in vector "fnames"

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }

  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  reader.Open(fnames);

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();


  // attempt to open BamWriter
  BamWriter writer;
  string outputBam = param->writer;
  if ( outputBam != "" ) {
    if ( !writer.Open(param->writer, header, refs) ) {
      cerr << "Could not open output BAM file" << endl;
      exit(0);
    }
  }


  BamAlignment bam;
  while (reader.GetNextAlignment(bam)) { //change RG 

    string rg = "RG";
    string rgType = "Z";
    string rgValue = "1";
    bam.EditTag(rg,rgType,rgValue);

    writer.SaveAlignment(bam);
    
  }  // read a bam

  return 0;

} //main
Example #6
0
bool FilterTool::FilterToolPrivate::Run(void) {
    
    // set to default input if none provided
    if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // add files in the filelist to the input file list
    if ( m_settings->HasInputFilelist ) {

        ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
        if ( !filelist.is_open() ) {
            cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl;
            return false;
        }

        string line;
        while ( getline(filelist, line) )
            m_settings->InputFiles.push_back(line);
    }

    // initialize defined properties & user-specified filters
    // quit if failed
    if ( !SetupFilters() )
        return false;

    // open reader without index
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles) ) {
        cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
        return false;
    }

    // retrieve reader header & reference data
    const string headerText = reader.GetHeaderText();
    filterToolReferences = reader.GetReferenceData();
    
    // determine compression mode for BamWriter
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
                              !m_settings->IsForceCompression );
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
    if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;

    // open BamWriter
    BamWriter writer;
    writer.SetCompressionMode(compressionMode);
    if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
        cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
        reader.Close();
        return false;
    }

    // if no region specified, filter entire file 
    BamAlignment al;
    if ( !m_settings->HasRegion ) {
        while ( reader.GetNextAlignment(al) ) {
            if ( CheckAlignment(al) ) 
                writer.SaveAlignment(al);
        }
    }
    
    // otherwise attempt to use region as constraint
    else {
        
        // if region string parses OK
        BamRegion region;
        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {

            // attempt to find index files
            reader.LocateIndexes();

            // if index data available for all BAM files, we can use SetRegion
            if ( reader.HasIndexes() ) {

                // attempt to use SetRegion(), if failed report error
                if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
                    cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
                    reader.Close();
                    return false;
                } 
              
                // everything checks out, just iterate through specified region, filtering alignments
                while ( reader.GetNextAlignment(al) )
                    if ( CheckAlignment(al) ) 
                        writer.SaveAlignment(al);
                }
            
            // no index data available, we have to iterate through until we
            // find overlapping alignments
            else {
                while ( reader.GetNextAlignment(al) ) {
                    if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
                         (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
                    {
                        if ( CheckAlignment(al) ) 
                            writer.SaveAlignment(al);
                    }
                }
            }
        } 
        
        // error parsing REGION string
        else {
            cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
            cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
                 << endl;
            reader.Close();
            return false;
        }
    }

    // clean up & exit
    reader.Close();
    writer.Close();
    return true;
}
Example #7
0
int CropBamTool::CropBam()
{
    // open bam files
    BamMultiReader bamReader;
    bamReader.Open(bamFiles);

    // the dictionary of chromosomes
    RefVector genome = bamReader.GetReferenceData();

    // get the scanning window
    vector<tuple<int,int,int>> windows;
    int numWindows = GenericRegionTools::toScanWindow(genome, regionStrings, windows);

    unordered_set<string> readpool;

    // temporary struct for sequence object
    typedef struct {
        string name;
        int head_soft_clip;
        int tail_soft_clip;
        string seq;
        string qual;
    }cropbam_seq_t;

    // temporary struct for unique seqs
    map<string,list<cropbam_seq_t>> uniqueSeqPool;

    // lambda expression for output
    auto Output = [this](cropbam_seq_t &a){
          if (this->outFormat=="fasta"){
              cout << ">" << a.name << "\t"
                   << "head_soft_clip=" << a.head_soft_clip << "\t"
                   << "tail_soft_clip=" << a.tail_soft_clip << "\t"
                   << endl
                   << a.seq << endl;
          }
          if (this->outFormat=="fastq"){
              cout << "@" << a.name << "\t"
                   << "head_soft_clip=" << a.head_soft_clip << "\t"
                   << "tail_soft_clip=" << a.tail_soft_clip << "\t"
                   << endl
                   << a.seq << endl;
              cout << "+" << endl
                   << a.qual << endl;
          }
    };

    // loop over windows
    omp_set_dynamic(0);
    omp_set_num_threads(numThreads);
    #pragma omp parallel for shared(genome)
    for (int i=0; i<numWindows; i++)
    {
        clock_t tStart = clock();

        bamReader.Open(bamFiles);

        int wId = get<0>(windows[i]);
        int wLp = get<1>(windows[i]);
        int wRp = get<2>(windows[i]);

        if (verbose>=1) Verbose("process the window " + genome[wId].RefName + ":" + to_string(wLp+1) + "-" + to_string(wRp));

        // rewind the bam reader
        bamReader.Rewind();
        // set the region
        bamReader.SetRegion(wId, wLp, wId, wRp);

        int numReads = 0;
        // retrieve the alignment
        BamAlignment aln;
        while (bamReader.GetNextAlignment(aln))
        {
            // skip the alignment if it doesn't overlap the window
            if (aln.Position>=wRp || aln.GetEndPosition()<=wLp)
                continue;

            // skip the invalid alignment
            if (!isValidAlignment(aln, readLenThres, mapQualThres, alnFlagMarker))
                continue;

            // skip the alignment harboring too many mismatches
            if (!GenericBamAlignmentTools::validReadIdentity(aln, 1-alnIdenThres))
                continue;

            stringstream keyss;
            keyss << GenericBamAlignmentTools::getBamAlignmentName(aln) << "-"
                  << wId << "-" << wLp << "-" << wRp;
            string key = keyss.str();
            auto ptr = readpool.find(key);
            if (ptr!=readpool.end())
                continue;
            readpool.emplace(key);

            // get the partial read
            string readSegment, readQualSegment, genomeSegment;
            GenericBamAlignmentTools::getLocalAlignment(aln, wLp, wRp-wLp, readSegment, readQualSegment, genomeSegment);

            // add soft clip
            int hsc=0;
            auto ptr0 = aln.CigarData.begin();
            if (aln.Position>=wLp && (ptr0->Type=='S' || ptr0->Type=='H'))
            {
                stringstream headClipSeq, headClipQual;
                for (int i=0; i<ptr0->Length; i++)
                {
                    headClipSeq << aln.QueryBases[i];
                    headClipQual << aln.Qualities[i];
                }

                if (keepClip)
                {
                    readSegment=headClipSeq.str()+readSegment;
                    readQualSegment=headClipQual.str()+readQualSegment;
                }

                hsc += ptr0->Length;
            }
            int tsc=0;
            auto ptr1 = aln.CigarData.rbegin();
            if (aln.GetEndPosition()<wRp && (ptr1->Type=='S' || ptr1->Type=='H'))
            {
                string ss="", qs="";
                auto str=aln.QueryBases.rbegin();
                auto qtr=aln.Qualities.rbegin();
                for (int i=0; i<ptr1->Length; i++,str++,qtr++)
                {
                    ss=(*str)+ss;
                    qs=(*qtr)+qs;
                }
                if (keepClip)
                {
                    readSegment=readSegment+ss;
                    readQualSegment=readQualSegment+qs;
                }
                tsc += ptr1->Length;
            }

            if (readSegment.length()>=segmentLenThres)
            {
                cropbam_seq_t a;
                a.name = GenericBamAlignmentTools::getBamAlignmentName(aln);
                a.head_soft_clip = hsc;
                a.tail_soft_clip = tsc;
                a.seq = readSegment;
                a.qual = readQualSegment;
                if (uniqueSeqPool.count(a.seq)==0)
                    uniqueSeqPool[a.seq] = list<cropbam_seq_t>(1,a);
                else
                    uniqueSeqPool[a.seq].emplace_back(a);
//                if (outFormat=="fasta"){
//                    cout << ">" << GenericBamAlignmentTools::getBamAlignmentName(aln) << "\t"
//                         << "head_soft_clip=" << hsc << "\t"
//                         << "tail_soft_clip=" << tsc << "\t"
//                         << endl
//                         << readSegment << endl;
//                }

//                if (outFormat=="fastq"){
//                    cout << "@" << GenericBamAlignmentTools::getBamAlignmentName(aln) << "\t"
//                         << "head_soft_clip=" << hsc << "\t"
//                         << "tail_soft_clip=" << tsc << "\t"
//                         << endl
//                         << readSegment << endl;
//                    cout << "+" << endl
//                         << readQualSegment << endl;
//                }

                numReads++;
            }
        }

        numReads = 0;
        if (useUnique){
            ofstream of;
            of.open(outFreq);
            for (auto a : uniqueSeqPool){
                if (a.second.size()>=thresFreq){
                    Output(*a.second.begin());
                    of << a.second.begin()->name << "\t" << a.second.size() << endl;
                    numReads ++;
                }
            }
            of.close();
        }else{
            for (auto a : uniqueSeqPool){
                for (auto b : a.second){
                    Output(b);
                    numReads ++;
                }
            }
        }

        clock_t tEnd = clock();

        if (verbose>=1) Verbose("retrieve " + to_string(numReads) + " reads");
        if (verbose>=1) Verbose("time elapsed " + to_string((double)(tEnd-tStart)/CLOCKS_PER_SEC) + " seconds");
    }

    return 0;
}
Example #8
0
/**
 * Implements the "bbctools create" command.
 *
 * Takes base coverage from either a single BBC file or from one or more BAM
 * files (the positional arguments) and produces whichever outputs were
 * requested on the command line: a BBC file (--bbc), its index (--index) and
 * coarse companion file (--coarse), per-region coverage statistics
 * (--covStats), a summary (--sumStats) and a read origin file (--readOrigin).
 * If no output option is given at all, the BBC root defaults to "-".
 *
 * @param optParser  Parsed command line holding the options and positional arguments.
 * @return 0 on success, -1 for invalid option values or combinations,
 *         1 when an input could not be read or an output could not be created.
 *
 * NOTE(review): the early error returns below leave `regions`, `bbcCreate` and
 * `readTracker` unreleased; only the normal completion path frees them.
 */
int bbctools_create( BbcUtils::OptParser &optParser ) {
	const vector<string> cmdArgs = optParser.getArgs();

	// remove .bbc extension from bbc file root, if present
	string bbcfileRoot = optParser.getOptValue( "bbc" );
	int i = bbcfileRoot.size() - 4;
	if( i > 0 && bbcfileRoot.substr(i,4) == ".bbc" ) {
		bbcfileRoot = bbcfileRoot.substr(0,i);
	}
	bool f_bci = optParser.getOptBoolean("index");
	bool f_cbc = optParser.getOptBoolean("coarse");

	string targetRegions   = optParser.getOptValue("regions");
	string annotationFields = optParser.getOptValue( "annotationFields");
	vector<string> auxRegionSplit = BbcUtils::mapToPairList(annotationFields);

	string  sumstatsFile = optParser.getOptValue("sumStats");
	string  covstatsFile = optParser.getOptValue("covStats");
	string  readOrigFile = optParser.getOptValue("readOrigin");
	string  readType     = optParser.getOptValue("readType");
	string  covDepths    = optParser.getOptValue("covDepths","-");
	double  minPcCov     = optParser.getOptNumber("minPcCov");
	// AmpliSeq reads get non-zero defaults for primer length and end-to-end gap
	int32_t primerLength = optParser.getOptInteger( "primerLength", (readType == "AmpliSeq" ? 30 : 0) );
	int32_t maxE2eEndGap = optParser.getOptInteger( "e2eGap", (readType == "AmpliSeq" ? 2 : 0) );

	bool   autoCreateBamIndex = optParser.getOptBoolean("autoCreateBamIndex");
	bool     samdepth         = optParser.getOptBoolean("samdepth");
	int32_t  filterQuality    = optParser.getOptInteger("minMAPQ");
	int32_t  minAlignLength   = optParser.getOptInteger("minAlignLength");
	bool     filterDuplicates = optParser.getOptBoolean("noDups");
	bool     filterUnique     = optParser.getOptBoolean("unique");
	// reads with any of these flag bits set are skipped; --noDups adds bit 0x400
	uint32_t skipFlag         = filterDuplicates ? 0x704 : 0x304;
	// --unique overrides --minMAPQ with a fixed threshold of 1
	uint16_t minMapQuality    = filterUnique ? 1 : filterQuality;

	bool onlyOnTargetReads = optParser.getOptBoolean("onTargetReads");
	bool onlyOnTargetBases = optParser.getOptBoolean("onTargetBases");

	// possible future options
	bool invertOnTarget = false;

	// check basic valid argument values and combinations
	int numOuts  = !bbcfileRoot.empty() + !covstatsFile.empty() + !sumstatsFile.empty() + !readOrigFile.empty();
	int numPipes = (bbcfileRoot == "-") + (covstatsFile == "-") + (sumstatsFile == "-") + (readOrigFile == "-");
	if( numOuts == 0 && !f_bci && !f_cbc ) {
		bbcfileRoot = "-";	// default if no other output specified
	} else if( numPipes > 1 ) {
		cerr << "Error: bbctools create: Only one file output (--covStats, --sumStats, --readOrigin or --bbc) may be piped to STDOUT." << endl;
		return -1;
	} else if( samdepth && numOuts ) {
		cerr << "Error: bbctools create: --samdepth (-s) option may only be used without other output options." << endl;
		return -1;
	}
	// check if single argument is a BBC file and leave open for reading if so
	BbcView bbcView;
	bool haveBbcFile = cmdArgs.size() == 1 && bbcView.Open( cmdArgs[0], true );
	bbcView.SelectPrintStream( samdepth ? "SAMDEPTH" : "BBCVIEW" );

	// check distinction between default and explicit no target regions - only for BBC input
	bool explicitNoTargetRegions = false;
	if( targetRegions == "-" ) {
		explicitNoTargetRegions = haveBbcFile;
		targetRegions = "";
	}
	if( targetRegions.empty() ) {
		if( onlyOnTargetBases && explicitNoTargetRegions && !invertOnTarget ) {
			cerr << "Warning: bbctools create --onTargetBases (-b) option with --regions '-' produces no coverage." << endl;
		} else if( onlyOnTargetReads ) {
			cerr << "Error: bbctools create --onTargetReads (-r) option requires a --regions file." << endl;
			return -1;
		}
	}
	// check for legal BBC create options
	if( f_bci || f_cbc ) {
		if( (bbcfileRoot.empty() || bbcfileRoot == "-") && !haveBbcFile ) {
			string opt = f_bci ? "--index (-i)" : "--coarse (-c)";
			cerr << "Error: bbctools create "+opt+" option requires the --bbc (-B) option or a BBC source file." << endl;
			return -1;
		}
	}
	BamMultiReader bamReader;
	if( haveBbcFile ) {
		// warn for options that do not work with BBC input
		if( filterQuality > 0 || filterDuplicates || filterUnique || minAlignLength ) {
			cerr << "Warning: SAM flag, alignment length and MAPQ filters ignored for BBC source file." << endl;
		}
		if( samdepth ) {
			cerr << "Error: --samdepth option is not supported for BBC source files." << endl;
			return -1;
		}
		if( !readOrigFile.empty() ) {
			cerr << "Error: --readOrigin option is not supported for BBC source files." << endl;
			return -1;
		}
	} else {
		// check / open for multiple BAM file inputs
		if ( !bamReader.Open(cmdArgs) ) {
			if( cmdArgs.size() == 1 ) cerr << "ERROR: Could not read input BAM file:";
			else cerr << "ERROR: Could not read all input BAM files:";
			// get and clean up bamtools error msg: drop the first line and any
			// "Class::method: " style prefix, then capitalize what is left
			string errMsg = bamReader.GetErrorString();
			size_t i = errMsg.find_first_of('\n');
			if( i != string::npos ) errMsg = errMsg.substr(i+1);
			i = errMsg.find("::");
			if( i != string::npos ) {
				i = errMsg.find(": ");
				if( i != string::npos ) errMsg = errMsg.substr(i+1);
			}
			errMsg = BbcUtils::stringTrim(errMsg);
			// nothing to capitalize if the cleaned-up message is empty
			if( !errMsg.empty() ) errMsg[0] = toupper(errMsg[0]);
			cerr << endl << errMsg << "." << endl;
			return 1;
		}
	}
	// grab reference list from either input source
	const RefVector &references = haveBbcFile ? bbcView.GetReferenceData() : bamReader.GetReferenceData();
	if( !references.size() ) {
		// Issue would already been detected if input was BBC file
		cerr << "ERROR: " << (cmdArgs.size() > 1 ? "One or more " : "");
		cerr << "BAM file contains unaligned reads (no references).\n";
		return 1;
	}
	// check/set up target regions input regions/region statistics output
	// the concrete RegionCoverage (sub)class is selected by --covStats and --readType
	RegionCoverage *regions = NULL;
	string covstatsStaticFields;
	bool trackRegionBaseCov = !covDepths.empty();
	if( covstatsFile.empty() ) {
		trackRegionBaseCov = false;
		if( !annotationFields.empty() ) {
			cerr << "Warning: --annotationFields (A) option ignored without --covStats (-C) option." << endl;
		}
		if( !covDepths.empty() && covDepths != "-" ) {
			cerr << "Warning: --covDepths (-D) option ignored without --covStats (-C) option." << endl;
		}
		if( !readType.empty() ) {
			cerr << "Warning: --readType (-T) option ignored without --covStats (-C) option." << endl;
		}
		// read regions for input only and/or creating sumStats
		if( !targetRegions.empty() || explicitNoTargetRegions || !sumstatsFile.empty() ) {
			regions = new RegionCoverage(references);
		}
	} else if( readType == "trgreads" || readType == "amplicon" || readType == "AmpliSeq" ) {
		if( haveBbcFile ) {
			cerr << "Creation of read coverage requires BAM file input." << endl;
			return -1;
		}
		AmpliconRegionStatistics *ampRegionStats = new AmpliconRegionStatistics(references);
		ampRegionStats->SetGenericReads( readType == "trgreads" );
		ampRegionStats->SetSigFacCoverage( minPcCov/100 );
		ampRegionStats->SetMaxUpstreamPrimerStart( primerLength );
		ampRegionStats->SetMaxE2eEndDistance( maxE2eEndGap );
		covstatsStaticFields = "overlaps,";
		covstatsStaticFields += (minPcCov > 0) ? "fwd_cov,rev_cov" : "fwd_e2e,rev_e2e";
		covstatsStaticFields += ",total_reads,fwd_reads,rev_reads";
		regions = ampRegionStats;
	} else if( readType == "trgbases" ) {
		if( haveBbcFile && targetRegions.empty() && !explicitNoTargetRegions ) {
			cerr << "Warning: Assuming reference contigs for base coverage targets (=> option --regions -)" << endl;
		}
		RegionStatistics *regionStats = new RegionStatistics(references);
		covstatsStaticFields = "covered,uncov_5p,uncov_3p,ave_basereads,fwd_basereads,rev_basereads";
		trackRegionBaseCov = true;
		regions = regionStats;
	} else if( readType == "covdepth" || readType.empty() ) {
		// output (sorted) targets file with only covDepth stats (if any)
		regions = new RegionCoverage(references);
	} else {
		cerr << "Unknown read type '" << readType << "'" << endl;
		return -1;
	}
	// Load the input regions or default to whole reference contig targets
	if( regions ) {
		regions->SetCovAtDepths( covDepths == "-" ? "20,100,500" : covDepths );
		if( targetRegions.empty() ) {
			regions->SetWholeContigTargets();
			// set contigs as explicit regions means all reads will seen as on-target
			// for consistency these are inverted (for input from BBC)
			invertOnTarget = true;
		} else {
			string auxFieldIdx = auxRegionSplit.size() ? auxRegionSplit[0] : "";
			string errMsg = regions->Load( targetRegions, "BED", auxFieldIdx );
			if( !errMsg.empty() ) {
				cerr << "ERROR: " + errMsg + "\n";
				return 1;
			}
		}
		if( onlyOnTargetReads && haveBbcFile ) {
			cerr << "Error: bbctools create --onTargetReads option is not supported for BBC source file." << endl;
			return -1;
		}
	}
	//
	// Perform all bbctools create utilities
	//
	// a BbcCreate is needed unless the BBC root is empty, or is "-" with BBC input
	BbcCreate *bbcCreate = NULL;
	if( !bbcfileRoot.empty() && (bbcfileRoot != "-" || !haveBbcFile) ) {
		bbcCreate = new BbcCreate(references);
		if( bbcfileRoot != "-" && !bbcCreate->Open(bbcfileRoot+".bbc") ) {
			return 1;
		}
		bbcCreate->SetNoOffTargetPositions(onlyOnTargetBases);
	}
	bbcView.SetNoOffTargetPositions(onlyOnTargetBases);
	// Stream input to output creators
	if( haveBbcFile ) {
		// BBC reader and driver via BbcView object
		if( bbcfileRoot != "-" || !covstatsFile.empty() ) {
			// disable BbcView text stream if using for file creation
			bbcView.SelectPrintStream("NONE");
		}
		// process input BBC for just new BBC and target coverage (defer BCI/CBC)
		bbcView.SetBbcCreate(bbcCreate);
		bbcView.SetRegionCoverage(regions);
		// explicitNoTargetRegions intended for explicitly removing on-target coverage
		bbcView.SetInvertOnTarget(explicitNoTargetRegions ^ invertOnTarget);
		if( bbcCreate || regions || bbcfileRoot == "-" ) {
			bbcView.ReadAll();
		}
	} else {
		// Test read tracking option for file write
		TrackReads *readTracker = NULL;
		try {
			if( !readOrigFile.empty() )
				readTracker = new TrackReads( readOrigFile, regions );
		} catch( std::runtime_error & ) {
			cerr << "ERROR: Unable to write to read tracking file " << readOrigFile << endl;
			return 1;
		}
		// BAM reader, BaseCoverage driver, dispatching to BbcCreate and BbcView objects
		BaseCoverage baseCov(references);
		baseCov.SetRegionCoverage(regions);
		baseCov.SetBbcCreate(bbcCreate);
		baseCov.SetInvertOnTarget(invertOnTarget);
		if( bbcfileRoot == "-" ) {
			baseCov.SetBbcView(&bbcView);
		}
		// Certain options require that all reads are processed, invalidating other performance options
		bool trackAllReads = !sumstatsFile.empty() || readTracker;
		// Implicit set of onlyOnTargetReads for performance when only these reads are required
		bool useBaseCov = (bbcfileRoot == "-" || bbcCreate);
		if( !targetRegions.empty() && !trackAllReads ) {
			onlyOnTargetReads |= onlyOnTargetBases;
			if( samdepth || !useBaseCov ) onlyOnTargetReads = true;
		}
		useBaseCov |= trackRegionBaseCov;
		// do not allow jumping if sumStats option is used - need to count all reads
		bool bamReaderSetRegions = (s_useBamReaderJump && !trackAllReads);
		int trgContig = 0, trgSrtPos = 0, trgEndPos = 0;
		int minJumpLen = s_initialMinJumpLen;
		int maxReadLen = s_initialMaxReadLen;
		if( onlyOnTargetReads ) {
			// load/create BAM index files for targeted reading
			// Note: BamIndex::BAMTOOLS format performed very badly and cannot use mixed with BTI/BAI files
			if( bamReaderSetRegions && !bamReader.LocateIndexes() ) {
				string plural( cmdArgs.size() > 1 ? "s" : "" );
				if( autoCreateBamIndex ) {
					cerr << "Warning: Did not locate BAM index (BAI) file" << plural << ", creating bamtools version..." << endl;
					// to avoid bug use new instance of BamMultiReader
					BamMultiReader bamReader2;
					if( !bamReader2.Open(cmdArgs) || !bamReader2.CreateIndexes() ) {
						cerr << "WARNING: Failed to create BAM index file" << plural << "." << endl;
						bamReaderSetRegions = false;
					} else {
						if( cmdArgs.size() == 1 ) {
							cerr << "Successfully created BAM index file: " << BbcUtils::fileName(cmdArgs[0]) << ".bai" << endl;
						} else {
							cerr << "Successfully created BAM index files." << endl;
						}
						// re-locate indexes with first reader - could not seem to locate BTI files created!
						if( !bamReader.LocateIndexes() ) {
							cerr << "WARNING: Failed to locate BAM index file" << plural << " just created!" << endl;
							bamReaderSetRegions = false;
						}
					}
				} else {
					cerr << "Warning: BAM index file" << plural << " not located for targeted BAM access." << endl;
					bamReaderSetRegions = false;
				}
			}
			// cancel region filtering if there are no regions to iterate (unexpected)
			if( !regions->GetNextRegion( trgContig, trgSrtPos, trgEndPos ) ) {
				onlyOnTargetReads = bamReaderSetRegions = false;
			}
			if( bamReaderSetRegions ) {
				bamReader.Jump( trgContig, trgSrtPos-maxReadLen );
			}
		}
		BamAlignment aln;
		while( bamReader.GetNextAlignmentCore(aln) ) {
			// appears to be an undocumented behavior here
			if( aln.RefID < 0 ) continue;
			// skip filtered reads by flag, length or mapping quality
			if( aln.AlignmentFlag & skipFlag ) continue;
			if( aln.MapQuality < minMapQuality ) continue;
			int32_t endPos = aln.GetEndPosition();
			if( minAlignLength > 0 ) {
				if( endPos - aln.Position < minAlignLength ) continue;
			}
			// screen for on-target reads
			if( onlyOnTargetReads ) {
				// find next region overlapping or beyond of current read
				bool moreRegions = true;
				bool setRegion = false;
				while( aln.RefID > trgContig || (aln.RefID == trgContig && aln.Position > trgEndPos) ) {
					if( !regions->GetNextRegion( trgContig, trgSrtPos, trgEndPos ) ) {
						moreRegions = false;
						break;
					}
					setRegion = bamReaderSetRegions;
				}
				if( !moreRegions ) {
					// prevent further on-target checks and exit early if not using sumStats
					onlyOnTargetReads = false;
					if( trackAllReads ) {
						// force tracking of off-target reads
						regions->TrackReadsOnRegion(aln,endPos);
						if( readTracker ) readTracker->Write(aln,endPos);
						continue;
					}
					break;
				}
				if( setRegion ) {
					// track max read length for future index jumps - just in case long reads ever used
					if( endPos - aln.Position > maxReadLen ) {
						maxReadLen = endPos - aln.Position;
						if( maxReadLen > minJumpLen ) minJumpLen = maxReadLen;
					}
					// only jump when the new target is on another contig or far enough ahead
					if( aln.RefID != trgContig || trgSrtPos - aln.Position > minJumpLen ) {
						bamReader.Jump( trgContig, trgSrtPos-maxReadLen );
					}
				}
				if( aln.RefID < trgContig || endPos < trgSrtPos ) {
					// force tracking of off-target reads
					if( trackAllReads ) {
						regions->TrackReadsOnRegion(aln,endPos);
						if( readTracker ) readTracker->Write(aln,endPos);
					}
					continue;	// current is before next target region - fetch the next within bounds
				}
			}
			// record base coverage and region coverage statistics
			if( useBaseCov ) {
				endPos = baseCov.AddAlignment(aln,endPos);
				if( endPos <= 0 ) {
					if( endPos == 0 ) continue;	// read was silently ignored
					cerr << "ERROR: BAM file is not correctly sorted vs. reference." << endl;
					return 1;
				}
			}
			// record read coverage and region coverage statistics
			if( regions ) {
				regions->TrackReadsOnRegion(aln,endPos);
			}
			if( readTracker ) {
				readTracker->Write(aln,endPos);
			}
		}
		// flush and close objects associated with output
		baseCov.Flush();
		// release the read tracker while `regions` (which it was constructed with) is still alive
		delete readTracker;
	}
	// Output in-memory region stats file and ensure BBC file is closed
	if( regions ) {
		// build output fields title string
		string outFields = "contig_id,contig_srt,contig_end";
		// the annotation field titles are the second element of the pair list
		if( auxRegionSplit.size() > 1 ) outFields += "," + auxRegionSplit[1];
		if( !covstatsStaticFields.empty() ) outFields += "," + covstatsStaticFields;
		regions->Write( covstatsFile, outFields );
		if( !sumstatsFile.empty() ) {
			regions->WriteSummary( sumstatsFile, invertOnTarget );
		}
		delete regions;
	}
	delete bbcCreate;

	// Complete remaining file creation options using a BBC file input
	// NOTE: Using BbbCreate for this would require code duplication and concurrent file output streaming
	if( f_bci || f_cbc ) {
		// Check BBC file source
		if( haveBbcFile ) {
			// index/coarse files are named after the BBC source file, minus its extension
			bbcfileRoot = cmdArgs[0];
			int i = bbcfileRoot.size() - 4;
			if( i > 0 && bbcfileRoot.substr(i,4) == ".bbc" ) {
				bbcfileRoot = bbcfileRoot.substr(0,i);
			}
		} else if( !bbcView.Open( bbcfileRoot+".bbc", true ) ) {
			// report the file that was actually opened (.bbc, not .bam)
			cerr << "ERROR: Unexpected failure to read new BBC file '"+bbcfileRoot+".bbc'" << endl;
			return 1;
		}
		if( f_bci ) {
			BbcIndex indexer( bbcfileRoot+".bci" );
			if( !bbcView.CreateIndex(indexer) ) {
				cerr << "ERROR: Failed to create index file '" << bbcfileRoot << ".bci'" << endl;
				return 1;
			}
		}
		if( f_cbc ) {
			// CBC generation can use BCI file but is no faster since whole BBC file is read
			BbcCoarse cbcWriter( bbcfileRoot+".cbc" );
			if( !bbcView.CreateCbc(cbcWriter) ) {
				cerr << "ERROR: Failed to create coarse base coverage file '" << bbcfileRoot << ".cbc'" << endl;
				return 1;
			}
		}
	}
	return 0;
}
Example #9
0
bool MergeTool::MergeToolPrivate::Run(void) {

    // set to default input if none provided
    if ( !m_settings->HasInputBamFilename )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // opens the BAM files (by default without checking for indexes)
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles) ) {
        cerr << "bamtools merge ERROR: could not open input BAM file(s)... Aborting." << endl;
        return false;
    }

    // retrieve header & reference dictionary info
    std::string mergedHeader = reader.GetHeaderText();
    RefVector references = reader.GetReferenceData();

    // determine compression mode for BamWriter
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
                               !m_settings->IsForceCompression );
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
    if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;

    // open BamWriter
    BamWriter writer;
    writer.SetCompressionMode(compressionMode);
    if ( !writer.Open(m_settings->OutputFilename, mergedHeader, references) ) {
        cerr << "bamtools merge ERROR: could not open "
             << m_settings->OutputFilename << " for writing." << endl;
        reader.Close();
        return false;
    }

    // if no region specified, store entire contents of file(s)
    if ( !m_settings->HasRegion ) {
        BamAlignment al;
        while ( reader.GetNextAlignmentCore(al) )
            writer.SaveAlignment(al);
    }

    // otherwise attempt to use region as constraint
    else {

        // if region string parses OK
        BamRegion region;
        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {

            // attempt to find index files
            reader.LocateIndexes();

            // if index data available for all BAM files, we can use SetRegion
            if ( reader.HasIndexes() ) {

                // attempt to use SetRegion(), if failed report error
                if ( !reader.SetRegion(region.LeftRefID,
                                       region.LeftPosition,
                                       region.RightRefID,
                                       region.RightPosition) )
                {
                    cerr << "bamtools merge ERROR: set region failed. Check that REGION describes a valid range"
                         << endl;
                    reader.Close();
                    return false;
                }

                // everything checks out, just iterate through specified region, storing alignments
                BamAlignment al;
                while ( reader.GetNextAlignmentCore(al) )
                    writer.SaveAlignment(al);
            }

            // no index data available, we have to iterate through until we
            // find overlapping alignments
            else {
                BamAlignment al;
                while ( reader.GetNextAlignmentCore(al) ) {
                    if ( (al.RefID >= region.LeftRefID)  && ( (al.Position + al.Length) >= region.LeftPosition ) &&
                         (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) )
                    {
                        writer.SaveAlignment(al);
                    }
                }
            }
        }

        // error parsing REGION string
        else {
            cerr << "bamtools merge ERROR: could not parse REGION - " << m_settings->Region << endl;
            cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
                 << endl;
            reader.Close();
            writer.Close();
            return false;
        }
    }

    // clean & exit
    reader.Close();
    writer.Close();
    return true;
}
Example #10
0
int main ( int argc, char *argv[] ) { 

  struct parameters *param = 0;
  param = interface(param, argc, argv);

  //region file input (the region file should be sorted as the same way as the bam file)
  ifstream region_f;
  region_f.open(param->region_f, ios_base::in);  // the region file is opened

  //bam input and generate index if not yet 
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  } else {
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';
          }
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {     // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          } else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }  //file or file name decided and stored in vector "fnames"

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }

  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  reader.Open(fnames);

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();

  if ( ! reader.LocateIndexes() )     // opens any existing index files that match our BAM files
    reader.CreateIndexes();         // creates index files for BAM files that still lack one


  // locus bias
  struct lb empty_profile = {0,0,0,0};
  vector <struct lb> locus_b(1000, empty_profile);
  // output locus bias file
  string locus_bias_set = param->lbias;
  ofstream locus_bias;
  if ( locus_bias_set != "" ) {
    locus_bias.open(param->lbias);
    if ( !locus_bias ) {
      cerr << "can not open locus_bias file.\n";
      exit(0);
    }
  }

  //should decide which chromosome
  string line;
  string old_chr = "SRP";
  string type = param->type;

  //whether do some position-level pile-up stuff
  bool posc = false;
  ofstream posc_f;
  ofstream chrmap_f;
  string poscset = param->posc;
  if ( poscset != "" ) {
    posc = true;
    posc_f.open(param->posc);
    chrmap_f.open(param->chrmap);
  }

  bool noChr;
  if ( param->nochr == 1 ){
    noChr = true;
  } else {
    noChr = false;
  }

  //regions for the input of region file
  deque <struct region> regions;

  getline(region_f, line); //get the first line
  eatline(line,regions,noChr);
  
  deque <struct region>::iterator it = regions.begin();

  while ( it->chr != old_chr ) {

    old_chr = it->chr;  // set the current chr as old chr

    int chr_id  = reader.GetReferenceID(it->chr);

    if ( chr_id == -1 ) {  //reference not found

      for (; it != regions.end() && it->chr == old_chr; ) {
        gene_processing(*it,locus_b);           // print the old region info
        it = regions.erase(it);         // erase the current region
      }
  
      while ( regions.empty() ) {    
        getline(region_f, line);
        if ( region_f.eof() ){
          cerr << "finished: end of region file, zone 0" << endl;
          break;
        }
        eatline(line, regions,noChr);
        it = regions.begin();
        if (it->chr == old_chr){  
          gene_processing(*it,locus_b);      
          regions.clear();
          continue;
        }
      }
      continue;
    }

    int chr_len = refs.at(chr_id).RefLength;

    if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region
      {
        cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl;
        reader.Close();
        exit(1);
      }

    //pile-up pos stats
    set <string> fragment;
    map <string, unsigned int> pileup;
    bool isposPileup = false;
    unsigned int old_start   = 0;
    unsigned int total_tags  = 0;
    unsigned int total_pos   = 0;
    unsigned int pileup_pos  = 0;


    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

      if ( bam.IsMapped() == false ) continue;   // skip unaligned reads

      unsigned int unique;
      bam.GetTag("NH", unique);
      if (param->unique == 1) {
        if (unique != 1) {                       // skipe uniquelly mapped reads
          continue;
        }
      }

      if (read_length == 0){
        read_length = bam.Length;
      }

      //cout << bam.Name << endl;
      string chrom = refs.at(bam.RefID).RefName;
      string strand = "+";
      if (bam.IsReverseStrand()) strand = "-";

      unsigned int alignmentStart =  bam.Position+1;
      unsigned int mateStart;
      if (type == "p") mateStart = bam.MatePosition+1;
      unsigned int alignmentEnd = bam.GetEndPosition();
      unsigned int cigarEnd;
      vector <int> blockLengths;
      vector <int> blockStarts;
      blockStarts.push_back(0);
      ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd);


      // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads)
      if (posc == true && unique == 1) {

        if (type == "p" && fragment.count(bam.Name) > 0) 
          fragment.erase(bam.Name);

        else {

          total_tags++;
          if (type == "p"){
            fragment.insert(bam.Name);
          }
          string alignSum;
          if (type == "p") {
             alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand;
          } else {
             alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand;
          }

          if ( alignmentStart != old_start ) {
            isposPileup = false;
            map <string, unsigned int>::iterator pit = pileup.begin();            
            for (; pit != pileup.end(); pit++) {
              posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl;     //print pileup
            }
            pileup.clear();           //clear pileup set
            pileup.insert( pair <string, unsigned int> (alignSum, 1) );  //insert the new read
            total_pos++;
          }

          else if ( alignmentStart == old_start ) { // same starts
            if ( pileup.count(alignSum) > 0 ) {  // pileup
              if ( pileup[alignSum] == 1 && isposPileup == false ) { 
                pileup_pos++; isposPileup = true;
              }
              pileup[alignSum]++;
            }
            else {
              pileup.insert( pair <string, unsigned int> (alignSum, 1) );
            }
          } //same starts

        }   //new fragment

        old_start = alignmentStart;
      } // do pos check



      float incre = 1.;
      if (blockStarts.size() > 1) incre = 0.5;     // incre half for junction reads
      incre /= static_cast < float >(unique);        // for multi aligned reads

      deque <struct region>::iterator iter = regions.begin();

      if ( iter->start > alignmentEnd ) continue;  // skip reads not overlapping with the first region

      while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) {

        if (iter->end < alignmentStart) {            // the region end is beyond the alignmentStart

          gene_processing(*iter,locus_b);            // processing
          iter = regions.erase(iter);                // this region should be removed
          if ( regions.empty() ) { 
            getline(region_f, line);                        // get a line of region file
            if ( ! region_f.eof() ) {
              eatline(line, regions, noChr);                         // eat a line and put it into the duque
              iter = regions.begin();
            }
            else {  // it's reaching the end of the region file
              cerr << "finished: end of region file, zone 3" << endl;
              break;
            }
          }
          continue;
        }

        if (iter->end >= alignmentStart && iter->start <= alignmentEnd) {  //overlapping, should take action

          vector <int>::iterator cigit = blockStarts.begin();
          for (; cigit != blockStarts.end(); cigit++) {
            unsigned int current_start = *cigit + alignmentStart;
            int current_pos = current_start - (iter->start);
            //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl;
            if ( (iter->tags).count(current_pos) > 0 ) {
              (iter->tags)[current_pos] += incre;
            }
            else
              (iter->tags).insert( pair<int, float>(current_pos, incre) );  
          }

        }  // overlapping take action!

        if ( (iter+1) != regions.end() )
          iter++;                                           // if this region is not the last element in the deque
        else {                                              // the last element
          getline(region_f, line);                          // get a line of region file
          if ( ! region_f.eof() ){
            eatline(line, regions, noChr);                         // eat a line and put it into the duque
            iter = regions.end();
            iter--;
          }
          else {  //it's reaching the end of the region file
            cerr << "finished: end of region file, zone 4" << endl;
            break;
          }
        }

      } //while

    }  // read a bam


    // print chr map
    if (posc == true) {
      chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl;
    } 
 
    //somehow to loop back
    it = regions.begin();                   //reset to begin
    for (; it != regions.end() && it->chr == old_chr; ) {
      gene_processing(*it,locus_b);              // print the old region info
      it = regions.erase(it);             // erase the current region
    }
  
    while ( regions.empty() ) {    

      getline(region_f, line);
      if ( region_f.eof() ){
        cerr << "finished: end of region file, zone 5" << endl;
        //print locus bias
        for (unsigned int l = 0; l < 1000; l++){
	  locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl;
	}
        exit(0);
      }
      eatline(line, regions, noChr);
      it = regions.begin();
      if (it->chr == old_chr){
        gene_processing(*it, locus_b);      
        regions.clear();
        continue;
      }
    }

  } // region chr != old chr
      
  regions.clear();
  reader.Close();
  region_f.close();
  return 0;

} //main
Example #11
0
int FileReader::runInternal()
{
    ogeNameThread("am_FileReader");

    if(!format_specified)
        format = deduceFileFormat();

    if(format == FORMAT_BAM)
    {
        BamMultiReader reader;
        
        if(!reader.Open(filenames)) {
            cerr << "Error opening BAM files." << endl;
            reader.Close();
            return -1;
        }
        
        header = reader.GetHeader();
        references = reader.GetReferenceData();
        open = true;
        
        BamAlignment * al;
        
        while(true)
        {
            if(load_string_data)
                al = reader.GetNextAlignment();
            else
                al = reader.GetNextAlignmentCore();

            if(!al)
                break;
            
            putOutputAlignment(al);
        }
        
        reader.Close();
    } else if(format == FORMAT_SAM) {
        
        vector<SamReader> readers;
        
        SamHeader first_header;

        // before doing any reading, open the files to
        // verify they are the right format, etc.
        for(int i = 0; i < filenames.size(); i++) {
            SamReader reader;
            
            if(!reader.Open(filenames[i])) {
                cerr << "Error opening SAM file: " << filenames[i] << endl;
                return -1;
            }

            if(filenames.size() > 1 && i == 0)
                first_header = header;
            
            // TODO: We can probably find a better way to deal with multiple SAM file headers,
            // but for now we should disallow different headers to avoid issues.
            if(i > 0 && header.ToString() != first_header.ToString())
                cerr << "Warning! SAM input files have different headers." << endl;
            
            reader.Close();
        }

        for(int i = 0; i < filenames.size(); i++) {
            SamReader reader;
            
            if(!reader.Open(filenames[i])) {
                cerr << "Error opening SAM file: " << filenames[i] << endl;
                return -1;
            }
            
            header = reader.GetHeader();
            references = reader.GetReferenceData();
            open = true;
            
            if(filenames.size() > 1 && i == 0)
                first_header = header;

            BamAlignment * al = NULL;
            while(true)
            {
                al = reader.GetNextAlignment();
                
                if(NULL == al)
                    break;
                
                putOutputAlignment(al);
            }

            reader.Close();
        }
    } else {
        cerr << "FileReader couldn't detect file format. Aborting." << endl;
        exit(-1);
        return -1;
    }

    return 0;
}
Example #12
0
// Merges the input BAM file(s) into a single BAM output, optionally restricted
// to a region.
//
// argc/argv are forwarded to Options::Parse(); the parsed values are read from
// m_settings. When no input filename was given, Options::StandardIn() is used
// as input.
//
// Returns 0 on success and 1 on any failure (input cannot be opened, output
// cannot be opened, region cannot be parsed or applied). The writer is closed
// on every exit path taken after it was successfully opened.
int MergeTool::Run(int argc, char* argv[]) {

    // parse command line arguments
    Options::Parse(argc, argv, 1);

    // set to default input if none provided
    if ( !m_settings->HasInputBamFilename )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // opens the BAM files (by default without checking for indexes)
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles, false, true) ) {
        cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl;
        return 1;
    }

    // retrieve header & reference dictionary info
    std::string mergedHeader = reader.GetHeaderText();
    RefVector references = reader.GetReferenceData();

    // open writer; output is left uncompressed only when writing to stdout
    // without forced compression
    BamWriter writer;
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() && !m_settings->IsForceCompression );
    if ( !writer.Open(m_settings->OutputFilename, mergedHeader, references, writeUncompressed) ) {
        cerr << "ERROR: Could not open BAM file " << m_settings->OutputFilename << " for writing... Aborting." << endl;
        reader.Close();
        return 1;
    }

    // if no region specified, store entire contents of file(s)
    if ( !m_settings->HasRegion ) {
        BamAlignment al;
        while ( reader.GetNextAlignmentCore(al) )
            writer.SaveAlignment(al);
    }

    // otherwise attempt to use region as constraint
    else {

        // if region string parses OK
        BamRegion region;
        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {

            // attempt to re-open reader with index files
            reader.Close();
            bool openedOK = reader.Open(m_settings->InputFiles, true, true );

            // if error
            if ( !openedOK ) {
                cerr << "ERROR: Could not open input BAM file(s)... Aborting." << endl;
                // the writer is already open at this point; close it like the
                // other post-open exit paths do
                writer.Close();
                return 1;
            }

            // if index data available, we can use SetRegion
            if ( reader.IsIndexLoaded() ) {

                // attempt to use SetRegion(), if failed report error
                if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
                    cerr << "ERROR: Region requested, but could not set BamReader region to REGION: " << m_settings->Region << " Aborting." << endl;
                    reader.Close();
                    writer.Close();
                    return 1;
                }

                // everything checks out, just iterate through specified region, storing alignments
                BamAlignment al;
                while ( reader.GetNextAlignmentCore(al) )
                    writer.SaveAlignment(al);
            }

            // no index data available, we have to iterate through until we
            // find overlapping alignments
            else {
                // NOTE(review): the position bounds are applied to every
                // alignment regardless of its RefID, so for a region spanning
                // several references this looks like it drops reads on the
                // references in between -- confirm intended semantics.
                BamAlignment al;
                while ( reader.GetNextAlignmentCore(al) ) {
                    if ( (al.RefID >= region.LeftRefID)  && ( (al.Position + al.Length) >= region.LeftPosition ) &&
                            (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) )
                    {
                        writer.SaveAlignment(al);
                    }
                }
            }
        }

        // error parsing REGION string
        else {
            cerr << "ERROR: Could not parse REGION - " << m_settings->Region << endl;
            cerr << "Be sure REGION is in valid format (see README) and that coordinates are valid for selected references" << endl;
            reader.Close();
            writer.Close();
            return 1;
        }
    }

    // clean & exit
    reader.Close();
    writer.Close();
    return 0;
}