Esempio n. 1
0
int main (int argc, char * argv[])
{
    vector<string> inputFilenames;
    string combinedOutFilename, alignmentsOutFilename;

    try
    {
        TCLAP::CmdLine cmd("Program description", ' ', VERSION);

        TCLAP::ValueArg<string> combinedOutputArg("o", "out", 
            "Combined output filename (BAM format)", true, "", "combined.bam", cmd);

        TCLAP::ValueArg<int> minInsertArg("n", "min-insert", 
            "Minimum insert size", false, DEFAULT_MIN_GAP, "min insert size", cmd);

        TCLAP::ValueArg<int> maxInsertArg("x", "max-insert", 
            "Maximum insert size", false, DEFAULT_MAX_GAP, "max insert size", cmd);

        TCLAP::MultiArg<string> inputArgs("b", "bam",
            "Input BAM file", true,
            "input.bam", cmd);

        cmd.parse(argc, argv);

        combinedOutFilename = combinedOutputArg.getValue();

        MIN_GAP = minInsertArg.getValue();
        MAX_GAP = maxInsertArg.getValue();
        inputFilenames = inputArgs.getValue();

    } catch (TCLAP::ArgException &e) {
        cerr << "Error: " << e.error() << " " << e.argId() << endl;
    }

    // TODO require that alignments are sorted by name

    BamMultiReader reader;
    reader.Open(inputFilenames);

    if (!ValidOut.Open(combinedOutFilename, reader.GetHeader(),
                       reader.GetReferenceData()))
    {
        cerr << ValidOut.GetErrorString() << endl;
        return 1;
    }

    string current, prev;
    char mateID;
    Group group;
    set<string> references;

    Alignment a;
    while (reader.GetNextAlignment(a))
    {
        parseID(a.Name, current, mateID);

        if (current.compare(prev) && prev.size() > 0)
        {
            processGroup(group, references);
            group.clear();
            references.clear();
        }

        references.insert(a.RefName);

        GroupKey key;
        key.refID = a.RefName;
        key.mateID = mateID;
        key.rev = a.IsReverseStrand();

        group.insert( std::make_pair( key, a ) );

        prev = current;
    }
    processGroup(group, references);
}
Esempio n. 2
0
bool ConvertTool::ConvertToolPrivate::Run(void) {
 
    // ------------------------------------
    // initialize conversion input/output
        
    // set to default input if none provided
    if ( !m_settings->HasInput ) 
        m_settings->InputFiles.push_back(Options::StandardIn());
    
    // open input files
    BamMultiReader reader;
    if ( !m_settings->HasInput ) { // don't attempt to open index for stdin
        if ( !reader.Open(m_settings->InputFiles, false) ) {
            cerr << "Could not open input files" << endl;
            return false;
        }
    } else {
        if ( !reader.Open(m_settings->InputFiles, true) ) {
            if ( !reader.Open(m_settings->InputFiles, false) ) {
                cerr << "Could not open input files" << endl;
                return false;
            } else {
                cerr << "Opened reader without index file, jumping is disabled." << endl;
            }
        }
    }
    m_references = reader.GetReferenceData();

    // set region if specified
    BamRegion region;
    if ( m_settings->HasRegion ) {
        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {
            if ( !reader.SetRegion(region) ) {
                cerr << "Could not set BamReader region to REGION: " << m_settings->Region << endl;
                return false;
            }
        } else {
            cerr << "Could not parse REGION: " << m_settings->Region << endl;
            return false;
        }
    }
        
    // if output file given
    ofstream outFile;
    if ( m_settings->HasOutput ) {
      
        // open output file stream
        outFile.open(m_settings->OutputFilename.c_str());
        if ( !outFile ) {
            cerr << "Could not open " << m_settings->OutputFilename << " for output" << endl; 
            return false; 
        }
        
        // set m_out to file's streambuf
        m_out.rdbuf(outFile.rdbuf()); 
    }
    
    // -------------------------------------
    // do conversion based on format
    
     bool convertedOk = true;
    
    // pileup is special case
    // conversion not done per alignment, like the other formats
    if ( m_settings->Format == FORMAT_PILEUP )
        convertedOk = RunPileupConversion(&reader);
    
    // all other formats
    else {
    
        bool formatError = false;
        
        // set function pointer to proper conversion method
        void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
        if      ( m_settings->Format == FORMAT_BED )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
        else if ( m_settings->Format == FORMAT_BEDGRAPH ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBedGraph;
        else if ( m_settings->Format == FORMAT_FASTA )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
        else if ( m_settings->Format == FORMAT_FASTQ )    pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
        else if ( m_settings->Format == FORMAT_JSON )     pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
        else if ( m_settings->Format == FORMAT_SAM )      pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
        else if ( m_settings->Format == FORMAT_WIGGLE )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintWiggle;
        else if ( m_settings->Format == FORMAT_YAML )     pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml;
        else { 
            cerr << "Unrecognized format: " << m_settings->Format << endl;
            cerr << "Please see help|README (?) for details on supported formats " << endl;
            formatError = true;
            convertedOk = false;
        }
        
        // if format selected ok
        if ( !formatError ) {
        
            // if SAM format & not omitting header, print SAM header first
            if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) 
                m_out << reader.GetHeaderText();
            
            // iterate through file, doing conversion
            BamAlignment a;
            while ( reader.GetNextAlignment(a) )
              (this->*pFunction)(a);
            
            // set flag for successful conversion
            convertedOk = true;
        }
    }
    
    // ------------------------
    // clean up & exit
    
    reader.Close();
    if ( m_settings->HasOutput ) outFile.close();
    return convertedOk;   
}
Esempio n. 3
0
int main ( int argc, char *argv[] ) { 

  struct parameters *param = 0;
  param = interface(param, argc, argv);


  //bam input and generate index if not yet 
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  } else {
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';
          }
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {     // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          } else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }  //file or file name decided and stored in vector "fnames"

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }

  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  reader.Open(fnames);

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();


  // attempt to open BamWriter
  BamWriter writer;
  string outputBam = param->writer;
  if ( outputBam != "" ) {
    if ( !writer.Open(param->writer, header, refs) ) {
      cerr << "Could not open output BAM file" << endl;
      exit(0);
    }
  }


  BamAlignment bam;
  while (reader.GetNextAlignment(bam)) { //change RG 

    string rg = "RG";
    string rgType = "Z";
    string rgValue = "1";
    bam.EditTag(rg,rgType,rgValue);

    writer.SaveAlignment(bam);
    
  }  // read a bam

  return 0;

} //main
Esempio n. 4
0
int CropBamTool::CropBam()
{
    // open bam files
    BamMultiReader bamReader;
    bamReader.Open(bamFiles);

    // the dictionary of chromosomes
    RefVector genome = bamReader.GetReferenceData();

    // get the scanning window
    vector<tuple<int,int,int>> windows;
    int numWindows = GenericRegionTools::toScanWindow(genome, regionStrings, windows);

    unordered_set<string> readpool;

    // temporary struct for sequence object
    typedef struct {
        string name;
        int head_soft_clip;
        int tail_soft_clip;
        string seq;
        string qual;
    }cropbam_seq_t;

    // temporary struct for unique seqs
    map<string,list<cropbam_seq_t>> uniqueSeqPool;

    // lambda expression for output
    auto Output = [this](cropbam_seq_t &a){
          if (this->outFormat=="fasta"){
              cout << ">" << a.name << "\t"
                   << "head_soft_clip=" << a.head_soft_clip << "\t"
                   << "tail_soft_clip=" << a.tail_soft_clip << "\t"
                   << endl
                   << a.seq << endl;
          }
          if (this->outFormat=="fastq"){
              cout << "@" << a.name << "\t"
                   << "head_soft_clip=" << a.head_soft_clip << "\t"
                   << "tail_soft_clip=" << a.tail_soft_clip << "\t"
                   << endl
                   << a.seq << endl;
              cout << "+" << endl
                   << a.qual << endl;
          }
    };

    // loop over windows
    omp_set_dynamic(0);
    omp_set_num_threads(numThreads);
    #pragma omp parallel for shared(genome)
    for (int i=0; i<numWindows; i++)
    {
        clock_t tStart = clock();

        bamReader.Open(bamFiles);

        int wId = get<0>(windows[i]);
        int wLp = get<1>(windows[i]);
        int wRp = get<2>(windows[i]);

        if (verbose>=1) Verbose("process the window " + genome[wId].RefName + ":" + to_string(wLp+1) + "-" + to_string(wRp));

        // rewind the bam reader
        bamReader.Rewind();
        // set the region
        bamReader.SetRegion(wId, wLp, wId, wRp);

        int numReads = 0;
        // retrieve the alignment
        BamAlignment aln;
        while (bamReader.GetNextAlignment(aln))
        {
            // skip the alignment if it doesn't overlap the window
            if (aln.Position>=wRp || aln.GetEndPosition()<=wLp)
                continue;

            // skip the invalid alignment
            if (!isValidAlignment(aln, readLenThres, mapQualThres, alnFlagMarker))
                continue;

            // skip the alignment harboring too many mismatches
            if (!GenericBamAlignmentTools::validReadIdentity(aln, 1-alnIdenThres))
                continue;

            stringstream keyss;
            keyss << GenericBamAlignmentTools::getBamAlignmentName(aln) << "-"
                  << wId << "-" << wLp << "-" << wRp;
            string key = keyss.str();
            auto ptr = readpool.find(key);
            if (ptr!=readpool.end())
                continue;
            readpool.emplace(key);

            // get the partial read
            string readSegment, readQualSegment, genomeSegment;
            GenericBamAlignmentTools::getLocalAlignment(aln, wLp, wRp-wLp, readSegment, readQualSegment, genomeSegment);

            // add soft clip
            int hsc=0;
            auto ptr0 = aln.CigarData.begin();
            if (aln.Position>=wLp && (ptr0->Type=='S' || ptr0->Type=='H'))
            {
                stringstream headClipSeq, headClipQual;
                for (int i=0; i<ptr0->Length; i++)
                {
                    headClipSeq << aln.QueryBases[i];
                    headClipQual << aln.Qualities[i];
                }

                if (keepClip)
                {
                    readSegment=headClipSeq.str()+readSegment;
                    readQualSegment=headClipQual.str()+readQualSegment;
                }

                hsc += ptr0->Length;
            }
            int tsc=0;
            auto ptr1 = aln.CigarData.rbegin();
            if (aln.GetEndPosition()<wRp && (ptr1->Type=='S' || ptr1->Type=='H'))
            {
                string ss="", qs="";
                auto str=aln.QueryBases.rbegin();
                auto qtr=aln.Qualities.rbegin();
                for (int i=0; i<ptr1->Length; i++,str++,qtr++)
                {
                    ss=(*str)+ss;
                    qs=(*qtr)+qs;
                }
                if (keepClip)
                {
                    readSegment=readSegment+ss;
                    readQualSegment=readQualSegment+qs;
                }
                tsc += ptr1->Length;
            }

            if (readSegment.length()>=segmentLenThres)
            {
                cropbam_seq_t a;
                a.name = GenericBamAlignmentTools::getBamAlignmentName(aln);
                a.head_soft_clip = hsc;
                a.tail_soft_clip = tsc;
                a.seq = readSegment;
                a.qual = readQualSegment;
                if (uniqueSeqPool.count(a.seq)==0)
                    uniqueSeqPool[a.seq] = list<cropbam_seq_t>(1,a);
                else
                    uniqueSeqPool[a.seq].emplace_back(a);
//                if (outFormat=="fasta"){
//                    cout << ">" << GenericBamAlignmentTools::getBamAlignmentName(aln) << "\t"
//                         << "head_soft_clip=" << hsc << "\t"
//                         << "tail_soft_clip=" << tsc << "\t"
//                         << endl
//                         << readSegment << endl;
//                }

//                if (outFormat=="fastq"){
//                    cout << "@" << GenericBamAlignmentTools::getBamAlignmentName(aln) << "\t"
//                         << "head_soft_clip=" << hsc << "\t"
//                         << "tail_soft_clip=" << tsc << "\t"
//                         << endl
//                         << readSegment << endl;
//                    cout << "+" << endl
//                         << readQualSegment << endl;
//                }

                numReads++;
            }
        }

        numReads = 0;
        if (useUnique){
            ofstream of;
            of.open(outFreq);
            for (auto a : uniqueSeqPool){
                if (a.second.size()>=thresFreq){
                    Output(*a.second.begin());
                    of << a.second.begin()->name << "\t" << a.second.size() << endl;
                    numReads ++;
                }
            }
            of.close();
        }else{
            for (auto a : uniqueSeqPool){
                for (auto b : a.second){
                    Output(b);
                    numReads ++;
                }
            }
        }

        clock_t tEnd = clock();

        if (verbose>=1) Verbose("retrieve " + to_string(numReads) + " reads");
        if (verbose>=1) Verbose("time elapsed " + to_string((double)(tEnd-tStart)/CLOCKS_PER_SEC) + " seconds");
    }

    return 0;
}
Esempio n. 5
0
bool FilterTool::FilterToolPrivate::Run(void) {
    
    // set to default input if none provided
    if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
        m_settings->InputFiles.push_back(Options::StandardIn());

    // add files in the filelist to the input file list
    if ( m_settings->HasInputFilelist ) {

        ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
        if ( !filelist.is_open() ) {
            cerr << "bamtools filter ERROR: could not open input BAM file list... Aborting." << endl;
            return false;
        }

        string line;
        while ( getline(filelist, line) )
            m_settings->InputFiles.push_back(line);
    }

    // initialize defined properties & user-specified filters
    // quit if failed
    if ( !SetupFilters() )
        return false;

    // open reader without index
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles) ) {
        cerr << "bamtools filter ERROR: could not open input files for reading." << endl;
        return false;
    }

    // retrieve reader header & reference data
    const string headerText = reader.GetHeaderText();
    filterToolReferences = reader.GetReferenceData();
    
    // determine compression mode for BamWriter
    bool writeUncompressed = ( m_settings->OutputFilename == Options::StandardOut() &&
                              !m_settings->IsForceCompression );
    BamWriter::CompressionMode compressionMode = BamWriter::Compressed;
    if ( writeUncompressed ) compressionMode = BamWriter::Uncompressed;

    // open BamWriter
    BamWriter writer;
    writer.SetCompressionMode(compressionMode);
    if ( !writer.Open(m_settings->OutputFilename, headerText, filterToolReferences) ) {
        cerr << "bamtools filter ERROR: could not open " << m_settings->OutputFilename << " for writing." << endl;
        reader.Close();
        return false;
    }

    // if no region specified, filter entire file 
    BamAlignment al;
    if ( !m_settings->HasRegion ) {
        while ( reader.GetNextAlignment(al) ) {
            if ( CheckAlignment(al) ) 
                writer.SaveAlignment(al);
        }
    }
    
    // otherwise attempt to use region as constraint
    else {
        
        // if region string parses OK
        BamRegion region;
        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {

            // attempt to find index files
            reader.LocateIndexes();

            // if index data available for all BAM files, we can use SetRegion
            if ( reader.HasIndexes() ) {

                // attempt to use SetRegion(), if failed report error
                if ( !reader.SetRegion(region.LeftRefID, region.LeftPosition, region.RightRefID, region.RightPosition) ) {
                    cerr << "bamtools filter ERROR: set region failed. Check that REGION describes a valid range" << endl;
                    reader.Close();
                    return false;
                } 
              
                // everything checks out, just iterate through specified region, filtering alignments
                while ( reader.GetNextAlignment(al) )
                    if ( CheckAlignment(al) ) 
                        writer.SaveAlignment(al);
                }
            
            // no index data available, we have to iterate through until we
            // find overlapping alignments
            else {
                while ( reader.GetNextAlignment(al) ) {
                    if ( (al.RefID >= region.LeftRefID)  && ((al.Position + al.Length) >= region.LeftPosition) &&
                         (al.RefID <= region.RightRefID) && ( al.Position <= region.RightPosition) ) 
                    {
                        if ( CheckAlignment(al) ) 
                            writer.SaveAlignment(al);
                    }
                }
            }
        } 
        
        // error parsing REGION string
        else {
            cerr << "bamtools filter ERROR: could not parse REGION: " << m_settings->Region << endl;
            cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
                 << endl;
            reader.Close();
            return false;
        }
    }

    // clean up & exit
    reader.Close();
    writer.Close();
    return true;
}
Esempio n. 6
0
bool ConvertTool::ConvertToolPrivate::Run(void) {
 
    // ------------------------------------
    // initialize conversion input/output
        
    // set to default input if none provided
    if ( !m_settings->HasInput && !m_settings->HasInputFilelist )
        m_settings->InputFiles.push_back(Options::StandardIn());
    
    // add files in the filelist to the input file list
    if ( m_settings->HasInputFilelist ) {

        ifstream filelist(m_settings->InputFilelist.c_str(), ios::in);
        if ( !filelist.is_open() ) {
            cerr << "bamtools convert ERROR: could not open input BAM file list... Aborting." << endl;
            return false;
        }

        string line;
        while ( getline(filelist, line) )
            m_settings->InputFiles.push_back(line);
    }

    // open input files
    BamMultiReader reader;
    if ( !reader.Open(m_settings->InputFiles) ) {
        cerr << "bamtools convert ERROR: could not open input BAM file(s)... Aborting." << endl;
        return false;
    }

    // if input is not stdin & a region is provided, look for index files
    if ( m_settings->HasInput && m_settings->HasRegion ) {
        if ( !reader.LocateIndexes() ) {
            cerr << "bamtools convert ERROR: could not locate index file(s)... Aborting." << endl;
            return false;
        }
    }

    // retrieve reference data
    m_references = reader.GetReferenceData();

    // set region if specified
    BamRegion region;
    if ( m_settings->HasRegion ) {
        if ( Utilities::ParseRegionString(m_settings->Region, reader, region) ) {

            if ( reader.HasIndexes() ) {
                if ( !reader.SetRegion(region) ) {
                    cerr << "bamtools convert ERROR: set region failed. Check that REGION describes a valid range" << endl;
                    reader.Close();
                    return false;
                }
            }

        } else {
            cerr << "bamtools convert ERROR: could not parse REGION: " << m_settings->Region << endl;
            cerr << "Check that REGION is in valid format (see documentation) and that the coordinates are valid"
                 << endl;
            reader.Close();
            return false;
        }
    }
        
    // if output file given
    ofstream outFile;
    if ( m_settings->HasOutput ) {
      
        // open output file stream
        outFile.open(m_settings->OutputFilename.c_str());
        if ( !outFile ) {
            cerr << "bamtools convert ERROR: could not open " << m_settings->OutputFilename
                 << " for output" << endl;
            return false; 
        }
        
        // set m_out to file's streambuf
        m_out.rdbuf(outFile.rdbuf()); 
    }
    
    // -------------------------------------
    // do conversion based on format
    
     bool convertedOk = true;
    
    // pileup is special case
    // conversion not done per alignment, like the other formats
    if ( m_settings->Format == FORMAT_PILEUP )
        convertedOk = RunPileupConversion(&reader);
    
    // all other formats
    else {
    
        bool formatError = false;
        
        // set function pointer to proper conversion method
        void (BamTools::ConvertTool::ConvertToolPrivate::*pFunction)(const BamAlignment&) = 0;
        if      ( m_settings->Format == FORMAT_BED )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintBed;
        else if ( m_settings->Format == FORMAT_FASTA ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFasta;
        else if ( m_settings->Format == FORMAT_FASTQ ) pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintFastq;
        else if ( m_settings->Format == FORMAT_JSON )  pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintJson;
        else if ( m_settings->Format == FORMAT_SAM )   pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintSam;
        else if ( m_settings->Format == FORMAT_YAML )  pFunction = &BamTools::ConvertTool::ConvertToolPrivate::PrintYaml;
        else { 
            cerr << "bamtools convert ERROR: unrecognized format: " << m_settings->Format << endl;
            cerr << "Please see documentation for list of supported formats " << endl;
            formatError = true;
            convertedOk = false;
        }
        
        // if format selected ok
        if ( !formatError ) {
        
            // if SAM format & not omitting header, print SAM header first
            if ( (m_settings->Format == FORMAT_SAM) && !m_settings->IsOmittingSamHeader ) 
                m_out << reader.GetHeaderText();
            
            // iterate through file, doing conversion
            BamAlignment a;
            while ( reader.GetNextAlignment(a) )
                (this->*pFunction)(a);
            
            // set flag for successful conversion
            convertedOk = true;
        }
    }
    
    // ------------------------
    // clean up & exit
    reader.Close();
    if ( m_settings->HasOutput )
        outFile.close();
    return convertedOk;   
}
Esempio n. 7
0
int main ( int argc, char *argv[] ) { 

  struct parameters *param = 0;
  param = interface(param, argc, argv);

  //region file input (the region file should be sorted as the same way as the bam file)
  ifstream region_f;
  region_f.open(param->region_f, ios_base::in);  // the region file is opened

  //bam input and generate index if not yet 
  //-------------------------------------------------------------------------------------------------------+
  // BAM input (file or filenames?)                                                                        |
  //-------------------------------------------------------------------------------------------------------+
  char *fof = param->mapping_f;
  FILE *IN=NULL;
  char linefof[5000];
  int filecount=0;
  vector <string> fnames;

  if (strchr(fof,' ')!=NULL) {
    char *ptr;
    ptr=strtok(fof," ");
    while (ptr!=NULL) {
      fnames.push_back(ptr);
      filecount++;
      ptr=strtok(NULL," ");
    }
  } else {
    IN=fopen(fof,"rt");
    if (IN!=NULL) {
      long linecount=0;
      while (fgets(linefof,5000-1,IN)!=NULL) {
        linecount++;
        if (linefof[0]!='#' && linefof[0]!='\n') {
          char *ptr=strchr(linefof,'\n');
          if (ptr!=NULL && ptr[0]=='\n') {
            ptr[0]='\0';
          }
          FILE *dummy=NULL;
          dummy=fopen(linefof,"rt");
          if (dummy!=NULL) {     // seems to be a file of filenames...
            fclose(dummy);
            fnames.push_back(linefof);
            filecount++;
          } else if (filecount==0 || linecount>=1000-1) {  // seems to be a single file
            fnames.push_back(fof);
            filecount++;
            break;
          }
        }
      }
      fclose(IN);
    }
  }  //file or file name decided and stored in vector "fnames"

  cerr << "the input mapping files are:" << endl;
  vector <string>::iterator fit = fnames.begin();
  for(; fit != fnames.end(); fit++) {
    cerr << *fit << endl;
  }

  //-------------------------------------------------------------------------------------------------------+
  // end of file or filenames                                                                              |
  //-------------------------------------------------------------------------------------------------------+

  // open the BAM file(s)
  BamMultiReader reader;
  reader.Open(fnames);

  // get header & reference information
  string header = reader.GetHeaderText();
  RefVector refs = reader.GetReferenceData();

  if ( ! reader.LocateIndexes() )     // opens any existing index files that match our BAM files
    reader.CreateIndexes();         // creates index files for BAM files that still lack one


  // locus bias
  struct lb empty_profile = {0,0,0,0};
  vector <struct lb> locus_b(1000, empty_profile);
  // output locus bias file
  string locus_bias_set = param->lbias;
  ofstream locus_bias;
  if ( locus_bias_set != "" ) {
    locus_bias.open(param->lbias);
    if ( !locus_bias ) {
      cerr << "can not open locus_bias file.\n";
      exit(0);
    }
  }

  //should decide which chromosome
  string line;
  string old_chr = "SRP";
  string type = param->type;

  //whether do some position-level pile-up stuff
  bool posc = false;
  ofstream posc_f;
  ofstream chrmap_f;
  string poscset = param->posc;
  if ( poscset != "" ) {
    posc = true;
    posc_f.open(param->posc);
    chrmap_f.open(param->chrmap);
  }

  bool noChr;
  if ( param->nochr == 1 ){
    noChr = true;
  } else {
    noChr = false;
  }

  //regions for the input of region file
  deque <struct region> regions;

  getline(region_f, line); //get the first line
  eatline(line,regions,noChr);
  
  deque <struct region>::iterator it = regions.begin();

  while ( it->chr != old_chr ) {

    old_chr = it->chr;  // set the current chr as old chr

    int chr_id  = reader.GetReferenceID(it->chr);

    if ( chr_id == -1 ) {  //reference not found

      for (; it != regions.end() && it->chr == old_chr; ) {
        gene_processing(*it,locus_b);           // print the old region info
        it = regions.erase(it);         // erase the current region
      }
  
      while ( regions.empty() ) {    
        getline(region_f, line);
        if ( region_f.eof() ){
          cerr << "finished: end of region file, zone 0" << endl;
          break;
        }
        eatline(line, regions,noChr);
        it = regions.begin();
        if (it->chr == old_chr){  
          gene_processing(*it,locus_b);      
          regions.clear();
          continue;
        }
      }
      continue;
    }

    int chr_len = refs.at(chr_id).RefLength;

    if ( !reader.SetRegion(chr_id, 1, chr_id, chr_len) ) // here set region
      {
        cerr << "bamtools count ERROR: Jump region failed " << it->chr << endl;
        reader.Close();
        exit(1);
      }

    //pile-up pos stats
    set <string> fragment;
    map <string, unsigned int> pileup;
    bool isposPileup = false;
    unsigned int old_start   = 0;
    unsigned int total_tags  = 0;
    unsigned int total_pos   = 0;
    unsigned int pileup_pos  = 0;


    BamAlignment bam;
    while (reader.GetNextAlignment(bam)) {

      if ( bam.IsMapped() == false ) continue;   // skip unaligned reads

      unsigned int unique;
      bam.GetTag("NH", unique);
      if (param->unique == 1) {
        if (unique != 1) {                       // skipe uniquelly mapped reads
          continue;
        }
      }

      if (read_length == 0){
        read_length = bam.Length;
      }

      //cout << bam.Name << endl;
      string chrom = refs.at(bam.RefID).RefName;
      string strand = "+";
      if (bam.IsReverseStrand()) strand = "-";

      unsigned int alignmentStart =  bam.Position+1;
      unsigned int mateStart;
      if (type == "p") mateStart = bam.MatePosition+1;
      unsigned int alignmentEnd = bam.GetEndPosition();
      unsigned int cigarEnd;
      vector <int> blockLengths;
      vector <int> blockStarts;
      blockStarts.push_back(0);
      ParseCigar(bam.CigarData, blockStarts, blockLengths, cigarEnd);


      // position check for unique mapped reads (because is paired-end reads, shoule base on fragment level for paired end reads)
      if (posc == true && unique == 1) {

        if (type == "p" && fragment.count(bam.Name) > 0) 
          fragment.erase(bam.Name);

        else {

          total_tags++;
          if (type == "p"){
            fragment.insert(bam.Name);
          }
          string alignSum;
          if (type == "p") {
             alignSum = int2str(alignmentStart) + "\t" + int2str(mateStart) + "\t.\t" + strand;
          } else {
             alignSum = int2str(alignmentStart) + "\t" + int2str(alignmentEnd) + "\t.\t" + strand;
          }

          if ( alignmentStart != old_start ) {
            isposPileup = false;
            map <string, unsigned int>::iterator pit = pileup.begin();            
            for (; pit != pileup.end(); pit++) {
              posc_f << chrom << "\truping\tpileup\t" << pit->first << "\t.\t" << "Pileup=" << pit->second << endl;     //print pileup
            }
            pileup.clear();           //clear pileup set
            pileup.insert( pair <string, unsigned int> (alignSum, 1) );  //insert the new read
            total_pos++;
          }

          else if ( alignmentStart == old_start ) { // same starts
            if ( pileup.count(alignSum) > 0 ) {  // pileup
              if ( pileup[alignSum] == 1 && isposPileup == false ) { 
                pileup_pos++; isposPileup = true;
              }
              pileup[alignSum]++;
            }
            else {
              pileup.insert( pair <string, unsigned int> (alignSum, 1) );
            }
          } //same starts

        }   //new fragment

        old_start = alignmentStart;
      } // do pos check



      float incre = 1.;
      if (blockStarts.size() > 1) incre = 0.5;     // incre half for junction reads
      incre /= static_cast < float >(unique);        // for multi aligned reads

      deque <struct region>::iterator iter = regions.begin();

      if ( iter->start > alignmentEnd ) continue;  // skip reads not overlapping with the first region

      while ( iter->chr == old_chr && iter->start <= alignmentEnd && iter != regions.end() ) {

        if (iter->end < alignmentStart) {            // the region end is beyond the alignmentStart

          gene_processing(*iter,locus_b);            // processing
          iter = regions.erase(iter);                // this region should be removed
          if ( regions.empty() ) { 
            getline(region_f, line);                        // get a line of region file
            if ( ! region_f.eof() ) {
              eatline(line, regions, noChr);                         // eat a line and put it into the duque
              iter = regions.begin();
            }
            else {  // it's reaching the end of the region file
              cerr << "finished: end of region file, zone 3" << endl;
              break;
            }
          }
          continue;
        }

        if (iter->end >= alignmentStart && iter->start <= alignmentEnd) {  //overlapping, should take action

          vector <int>::iterator cigit = blockStarts.begin();
          for (; cigit != blockStarts.end(); cigit++) {
            unsigned int current_start = *cigit + alignmentStart;
            int current_pos = current_start - (iter->start);
            //cout << iter->chr << "\t" << iter->start << "\t" << iter->end << "\t" << current_start << endl;
            if ( (iter->tags).count(current_pos) > 0 ) {
              (iter->tags)[current_pos] += incre;
            }
            else
              (iter->tags).insert( pair<int, float>(current_pos, incre) );  
          }

        }  // overlapping take action!

        if ( (iter+1) != regions.end() )
          iter++;                                           // if this region is not the last element in the deque
        else {                                              // the last element
          getline(region_f, line);                          // get a line of region file
          if ( ! region_f.eof() ){
            eatline(line, regions, noChr);                         // eat a line and put it into the duque
            iter = regions.end();
            iter--;
          }
          else {  //it's reaching the end of the region file
            cerr << "finished: end of region file, zone 4" << endl;
            break;
          }
        }

      } //while

    }  // read a bam


    // print chr map
    if (posc == true) {
      chrmap_f << old_chr << "\t" << total_tags << "\t" << total_pos << "\t" << pileup_pos << endl;
    } 
 
    //somehow to loop back
    it = regions.begin();                   //reset to begin
    for (; it != regions.end() && it->chr == old_chr; ) {
      gene_processing(*it,locus_b);              // print the old region info
      it = regions.erase(it);             // erase the current region
    }
  
    while ( regions.empty() ) {    

      getline(region_f, line);
      if ( region_f.eof() ){
        cerr << "finished: end of region file, zone 5" << endl;
        //print locus bias
        for (unsigned int l = 0; l < 1000; l++){
	  locus_bias << l << "\t" << locus_b[l].ps << "\t" << locus_b[l].hs << "\t" << locus_b[l].pe << "\t" << locus_b[l].he << endl;
	}
        exit(0);
      }
      eatline(line, regions, noChr);
      it = regions.begin();
      if (it->chr == old_chr){
        gene_processing(*it, locus_b);      
        regions.clear();
        continue;
      }
    }

  } // region chr != old chr
      
  regions.clear();
  reader.Close();
  region_f.close();
  return 0;

} //main
Esempio n. 8
0
int FileReader::runInternal()
{
    ogeNameThread("am_FileReader");

    if(!format_specified)
        format = deduceFileFormat();

    if(format == FORMAT_BAM)
    {
        BamMultiReader reader;
        
        if(!reader.Open(filenames)) {
            cerr << "Error opening BAM files." << endl;
            reader.Close();
            return -1;
        }
        
        header = reader.GetHeader();
        references = reader.GetReferenceData();
        open = true;
        
        BamAlignment * al;
        
        while(true)
        {
            if(load_string_data)
                al = reader.GetNextAlignment();
            else
                al = reader.GetNextAlignmentCore();

            if(!al)
                break;
            
            putOutputAlignment(al);
        }
        
        reader.Close();
    } else if(format == FORMAT_SAM) {
        
        vector<SamReader> readers;
        
        SamHeader first_header;

        // before doing any reading, open the files to
        // verify they are the right format, etc.
        for(int i = 0; i < filenames.size(); i++) {
            SamReader reader;
            
            if(!reader.Open(filenames[i])) {
                cerr << "Error opening SAM file: " << filenames[i] << endl;
                return -1;
            }

            if(filenames.size() > 1 && i == 0)
                first_header = header;
            
            // TODO: We can probably find a better way to deal with multiple SAM file headers,
            // but for now we should disallow different headers to avoid issues.
            if(i > 0 && header.ToString() != first_header.ToString())
                cerr << "Warning! SAM input files have different headers." << endl;
            
            reader.Close();
        }

        for(int i = 0; i < filenames.size(); i++) {
            SamReader reader;
            
            if(!reader.Open(filenames[i])) {
                cerr << "Error opening SAM file: " << filenames[i] << endl;
                return -1;
            }
            
            header = reader.GetHeader();
            references = reader.GetReferenceData();
            open = true;
            
            if(filenames.size() > 1 && i == 0)
                first_header = header;

            BamAlignment * al = NULL;
            while(true)
            {
                al = reader.GetNextAlignment();
                
                if(NULL == al)
                    break;
                
                putOutputAlignment(al);
            }

            reader.Close();
        }
    } else {
        cerr << "FileReader couldn't detect file format. Aborting." << endl;
        exit(-1);
        return -1;
    }

    return 0;
}