/*
 * Scan a single alignment for variants.
 *
 * The reference sequence under the read is rebuilt by
 * createReferenceSequence(), then the CIGAR operations are walked while
 * tracking the current offset into that reference (refpos) and into the read
 * (readpos). Every M, I and D operation is handed to the matching process*()
 * helper together with the local read_variants vector. Afterwards the
 * alignment is appended to block_alignments so the whole block can be
 * processed again later.
 *
 * It is a bit roundabout to reconstruct the reference from MD and then use it
 * to find variants that are already described by MD, but keeping state
 * between CIGAR and MD is tricky.
 *
 * A missing NM or MD tag is reported on stderr; processing still continues.
 * An unsupported CIGAR operation terminates the program with exit(1).
 *
 * Returns 0 for now. The intended return value (start position of the
 * alignment) is still an open TODO.
 *
 * NOTE(review): read_variants is filled by the helpers but is not consumed
 * after the loop in this function -- confirm where the variants are meant to
 * be stored.
 */
pos_t VariantProcessor::processAlignment(const BamAlignment& alignment) {
  if (!alignment.HasTag("NM") || !alignment.HasTag("MD")) {
    std::cerr << "error: BamAlignment '" << alignment.Name
              << "' does not have either NM or MD tags" << std::endl;
  }

  // Reconstruct reference sequence using MD tags
  string refseq = createReferenceSequence(alignment);

  vector<VariantPtr> read_variants;
  const vector<CigarOp>& cigar = alignment.CigarData;
  int refpos = 0, readpos = 0;

  for (vector<CigarOp>::const_iterator op = cigar.begin(); op != cigar.end(); ++op) {
    if (op->Type == 'S') {
      // soft clip: bases are present in the read but not aligned
      readpos += op->Length;
    } else if (op->Type == 'H' || op->Type == 'P') {
      // hard clip / padding: consumes neither read nor reference, so there
      // is nothing to advance and nothing to report
    } else if (op->Type == 'M') {
      // match or SNP
      processMatchOrMismatch(alignment, read_variants, op->Length, refseq, refpos, readpos);
      readpos += op->Length;
      refpos += op->Length;
    } else if (op->Type == 'I') {
      processInsertion(alignment, read_variants, op->Length, refseq, refpos, readpos);
      readpos += op->Length;
    } else if (op->Type == 'D') {
      processDeletion(alignment, read_variants, op->Length, refseq, refpos, readpos);
      refpos += op->Length; // deletion w.r.t reference; skip ref length
    } else {
      cerr << "error: unidentified CIGAR type: " << op->Type << endl;
      exit(1);
    }
  }

  // Add to alignments list
  block_alignments.push_back(alignment);
  return 0; // TODO(vsbuffalo)
}
int DataStatisticsTool::Execute() { // iterate over reads in BAM file(s) BamAlignment alignObj; while(bamReader.GetNextAlignment(alignObj)) { if (alignObj.IsDuplicate()) continue; if (alignObj.IsFailedQC()) continue; if (!alignObj.IsMapped()) continue; if (!alignObj.IsPrimaryAlignment()) continue; if (alignObj.IsPaired() && !alignObj.IsProperPair()) continue; if (alignObj.IsPaired() && !alignObj.IsMateMapped()) continue; if (!alignObj.HasTag("MD")) continue; // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // shift InDel GenericBamAlignmentTools::leftShiftInDel(alignObj); // // debug // GenericBamAlignmentTools::printBamAlignmentCigar(alignObj); // GenericBamAlignmentTools::printBamAlignmentMD(alignObj); // get the alignment sequences string alignRead; string alignGenome; GenericBamAlignmentTools::getAlignmentSequences(alignObj, alignRead, alignGenome); // update the statistics statistics.update(alignRead, alignGenome); } // print to screen cout << statistics << endl; // statistics.printMatchMismatch(); // close BAM reader bamReader.Close(); // close Fasta genomeFasta.Close(); return 1; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:editDist [in bam]"<<endl<<"this program returns the NM field of all aligned reads"<<endl; return 1; } string bamfiletopen = string(argv[1]); // cout<<bamfiletopen<<endl; BamReader reader; // cout<<"ok"<<endl; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } BamAlignment al; // cout<<"ok"<<endl; while ( reader.GetNextAlignment(al) ) { // cout<<al.Name<<endl; if(!al.IsMapped()) continue; if(al.HasTag("NM") ){ int editDist; if(al.GetTag("NM",editDist) ){ cout<<editDist<<endl; }else{ cerr<<"Cannot retrieve NM field for "<<al.Name<<endl; return 1; } }else{ cerr<<"Warning: read "<<al.Name<<" is aligned but has no NM field"<<endl; } } //while al reader.Close(); return 0; }
//{{{ void process_intra_chrom_split(const BamAlignment &curr, void SV_SplitRead:: process_intra_chrom_split(const BamAlignment &curr, const RefVector refs, BamWriter &inter_chrom_reads, map<string, BamAlignment> &mapped_splits, UCSCBins<SV_BreakPoint*> &r_bin, int weight, int id, int sample_id, SV_SplitReadReader *_reader) { if (mapped_splits.find(curr.Name) == mapped_splits.end()) { uint32_t clipped = count_clipped(curr.CigarData); if ( curr.HasTag("YP") == true) { uint32_t t; curr.GetTag("YP", t); if (t == 2) mapped_splits[curr.Name] = curr; } else if (clipped >= _reader->min_clip) mapped_splits[curr.Name] = curr; } else { if ( mapped_splits[curr.Name].RefID == curr.RefID ) { try { SV_SplitRead *new_split_read = new SV_SplitRead(mapped_splits[curr.Name], curr, refs, weight, id, sample_id, _reader); SV_BreakPoint *new_bp = NULL; if (new_split_read->is_sane()) { new_bp = new_split_read->get_bp(); if (new_bp != NULL) { new_bp->cluster(r_bin); } else { cerr << "Alignment name:" << curr.Name << endl; free(new_split_read); } } else free(new_split_read); } catch (int) { cerr << "Error creating split read: " << endl; } } else { BamAlignment al1 = curr; BamAlignment al2 = mapped_splits[curr.Name]; al1.MateRefID = al2.RefID; al2.MateRefID = al1.RefID; al1.MatePosition = al2.Position; al2.MatePosition = al1.Position; string x = _reader->get_source_file_name(); al1.AddTag("LS","Z",x); al2.AddTag("LS","Z",x); inter_chrom_reads.SaveAlignment(al1); inter_chrom_reads.SaveAlignment(al2); } mapped_splits.erase(curr.Name); } }
int main (int argc, char *argv[]) { if( (argc!= 3) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cerr<<"Usage:splitByRG [in bam] [out prefix]"<<endl<<"this program creates one bam file per RG in the with the outprefix\nFor example splitByRG in.bam out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl; return 1; } string bamfiletopen = string(argv[1]); // if(!strEndsWith(bamfiletopen,".bam")){ // } string bamDirOutPrefix = string(argv[2]); map<string,BamWriter *> rg2BamWriter; // if(!isDirectory(bamDirOut)){ // cerr<<"ERROR: the out directory does not exist"<<endl; // return 1; // } BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); vector<RefData> refData=reader.GetReferenceData(); string pID = "splitByRG"; string pName = "splitByRG"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); SamReadGroupDictionary srgd=header.ReadGroups; for(SamReadGroupConstIterator srgci=srgd.ConstBegin(); srgci<srgd.ConstEnd(); srgci++){ //cout<<*srgci<<endl; const SamReadGroup rg = (*srgci); //cout<<rg.ID<<endl; rg2BamWriter[rg.ID] = new BamWriter(); rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); } BamAlignment al; unsigned int total=0; while ( reader.GetNextAlignment(al) ) { // al.SetIsFailedQC(false); // writer.SaveAlignment(al); // if(al.IsMapped () ){ // if(rg2BamWriter.find(refData[al.RefID].RefName) == rg2BamWriter.end()){ //new // rg2BamWriter[refData[al.RefID].RefName] = new BamWriter(); // if ( !rg2BamWriter[refData[al.RefID].RefName]->Open(bamDirOutPrefix+"."+refData[al.RefID].RefName+".bam",header,references) ) { // cerr << "Could not open output BAM file "<< 
bamDirOutPrefix<<"."<<refData[al.RefID].RefName<<".bam" << endl; // return 1; // } // }else{ // rg2BamWriter[refData[al.RefID].RefName]->SaveAlignment(al); // } // }else{ // unmapped.SaveAlignment(al); // } if(al.HasTag("RG")){ string rgTag; al.GetTag("RG",rgTag); //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new cerr<<"Found new RG "<<rgTag<<endl; rg2BamWriter[rgTag] = new BamWriter(); if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; return 1; } rg2BamWriter[rgTag]->SaveAlignment(al); }else{ rg2BamWriter[rgTag]->SaveAlignment(al); } }else{ string rgTag="unknown"; //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new cerr<<"Found new RG "<<rgTag<<endl; rg2BamWriter[rgTag] = new BamWriter(); if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; return 1; } rg2BamWriter[rgTag]->SaveAlignment(al); }else{ rg2BamWriter[rgTag]->SaveAlignment(al); } // cerr << "Cannot get RG tag for " << al.Name<<endl; // return 1; } total++; } //while al reader.Close(); // writer.Close(); // unmapped.Close(); map<string,BamWriter *>::iterator rg2BamWriterIt; for (rg2BamWriterIt =rg2BamWriter.begin(); rg2BamWriterIt!=rg2BamWriter.end(); rg2BamWriterIt++){ rg2BamWriterIt->second->Close(); } cerr<<"Wrote succesfully "<<total<<" reads"<<endl; return 0; }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<"plotQualScore input.bam"<<endl; return 1; } string bamfiletopen = string(argv[1]); BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } // if ( !reader.LocateIndex() ){ // cerr << "warning: cannot locate index for file " << bamfiletopen<<endl; // //return 1; // } BamAlignment al; BamAlignment al2; bool unsurePEorSE=true; bool pe=true; int strLength=-1; int vecLengthToUse=-1; map<short,unsigned long> ** counterA = 0; map<short,unsigned long> ** counterC = 0; map<short,unsigned long> ** counterG = 0; map<short,unsigned long> ** counterT = 0; int lengthIndex1=0; int lengthIndex2=0; string seqInd1; string seqInd2; string qualInd1; string qualInd2; int offsetInd2; while ( reader.GetNextAlignment(al) ) { if(unsurePEorSE){ strLength=al.QueryBases.length(); if(al.IsPaired()){ pe=true; vecLengthToUse=2*strLength; }else{ pe=false; vecLengthToUse=strLength; } string index1; string index2; if(al.HasTag("XI")){ al.GetTag("XI",index1); vecLengthToUse+=index1.length(); lengthIndex1=index1.length(); } if(al.HasTag("XJ")){ al.GetTag("XJ",index2); vecLengthToUse+=index2.length(); lengthIndex2=index2.length(); } counterA = new map<short,unsigned long> * [vecLengthToUse]; counterC = new map<short,unsigned long> * [vecLengthToUse]; counterG = new map<short,unsigned long> * [vecLengthToUse]; counterT = new map<short,unsigned long> * [vecLengthToUse]; for(int i=0;i<vecLengthToUse;i++){ counterA[i]=new map<short,unsigned long> (); counterC[i]=new map<short,unsigned long> (); counterG[i]=new map<short,unsigned long> (); counterT[i]=new map<short,unsigned long> (); for(short k=minQualScore;k<=maxQualScore;k++){ (*counterA[i])[k]=0; (*counterC[i])[k]=0; (*counterG[i])[k]=0; (*counterT[i])[k]=0; } } 
unsurePEorSE=false; }else{ if(pe && !al.IsPaired()){ cerr << "Cannot have unpaired reads in PE mode" << endl; return 1; } if(!pe && al.IsPaired()){ cerr << "Cannot have unpaired reads in SE mode" << endl; return 1; } } if(al.QueryBases.length() != al.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(int(al.QueryBases.length()) != strLength){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } if(pe){ if(al.IsFirstMate()){ reader.GetNextAlignment(al2); if(al2.QueryBases.length() != al2.Qualities.length()){ cerr << "Cannot have different lengths for sequence and quality" << endl; return 1; } }else{ cerr << "First read should be the first mate" << endl; return 1; } } //cycle for(unsigned int i=0;i<al.QueryBases.length();i++){ short x=(short(al.Qualities[i])-qualOffset); if(al.QueryBases[i] == 'A'){ (*counterA[i])[x]++; } if(al.QueryBases[i] == 'C'){ (*counterC[i])[x]++; } if(al.QueryBases[i] == 'G'){ (*counterG[i])[x]++; } if(al.QueryBases[i] == 'T'){ (*counterT[i])[x]++; } } //The indices for al and al2 should hopefully be the same if(lengthIndex1>0){ al.GetTag("XI",seqInd1); al.GetTag("YI",qualInd1); int j; for(int i=0;i<lengthIndex1;i++){ j=i+al.QueryBases.length(); short x=(short(qualInd1[i])-qualOffset); if(seqInd1[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd1[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd1[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd1[i] == 'T'){ (*counterT[j])[x]++; } } } if(pe){ offsetInd2=al.QueryBases.length()+lengthIndex1+al2.QueryBases.length(); int j; for(unsigned int i=0;i<al2.QueryBases.length();i++){ j=i+al.QueryBases.length()+lengthIndex1; short x=(short(al2.Qualities[i])-qualOffset); if(al2.QueryBases[i] == 'A'){ (*counterA[j])[x]++; } if(al2.QueryBases[i] == 'C'){ (*counterC[j])[x]++; } if(al2.QueryBases[i] == 'G'){ (*counterG[j])[x]++; } if(al2.QueryBases[i] == 'T'){ (*counterT[j])[x]++; } } }else{ 
offsetInd2=al.QueryBases.length()+lengthIndex1; } //The indices for al and al2 should hopefully be the same if(lengthIndex2>0){ al.GetTag("XJ",seqInd2); al.GetTag("YJ",qualInd2); int j; for(int i=0;i<lengthIndex2;i++){ j=offsetInd2+i; short x=(short(qualInd2[i])-qualOffset); if(seqInd2[i] == 'A'){ (*counterA[j])[x]++; } if(seqInd2[i] == 'C'){ (*counterC[j])[x]++; } if(seqInd2[i] == 'G'){ (*counterG[j])[x]++; } if(seqInd2[i] == 'T'){ (*counterT[j])[x]++; } } } } reader.Close(); cout<<"cycle\t"<<"nuc\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<k<<"\t"; } cout<<maxQualScore<<endl; for(int i=0;i<vecLengthToUse;i++){ cout<<(i+1)<<"\t"; cout<<"A\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterA[i])[k]<<"\t"; } cout<<(*counterA[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"C\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterC[i])[k]<<"\t"; } cout<<(*counterC[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"G\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterG[i])[k]<<"\t"; } cout<<(*counterG[i])[maxQualScore]<<endl; cout<<(i+1)<<"\t"; cout<<"T\t"; for(short k=minQualScore;k<maxQualScore;k++){ cout<<(*counterT[i])[k]<<"\t"; } cout<<(*counterT[i])[maxQualScore]<<endl; } return 0; }
int main (int argc, char *argv[]) { bool produceUnCompressedBAM=false; bool verbose=false; bool ancientDNA=false; bool keepOrig=false; string adapter_F=options_adapter_F_BAM; string adapter_S=options_adapter_S_BAM; string adapter_chimera=options_adapter_chimera_BAM; string key=""; bool allowMissing=false; int trimCutoff=1; bool allowAligned=false; bool printLog=false; string logFileName; BamReader reader; BamWriter writer; string bamFile; string bamFileOUT=""; string key1; string key2; bool useDist=false; double location=-1.0; double scale =-1.0; bool fastqFormat=false; string fastqfile1 = ""; string fastqfile2 = ""; string fastqoutfile = ""; bool singleEndModeFQ=true; const string usage=string(string(argv[0])+ " [options] BAMfile"+"\n"+ "\nThis program takes an unaligned BAM where mates are consecutive\nor fastq files and trims and merges reads\n"+ "\n\tYou can specify a unaligned bam file or one or two fastq :\n"+ "\t\t"+"-fq1" +"\t\t"+"First fastq"+"\n"+ "\t\t"+"-fq2" +"\t\t"+"Second fastq file (for paired-end)"+"\n"+ "\t\t"+"-fqo" +"\t\t"+"Output fastq prefix"+"\n\n"+ //"\t"+"-p , --PIPE"+"\n\t\t"+"Read BAM from and write it to PIPE"+"\n"+ "\t"+"-o , --outfile" +"\t\t"+"Output (BAM format)."+"\n"+ "\t"+"-u " +"\t\t"+"Produce uncompressed bam (good for pipe)"+"\n"+ // "\t"+" , --outprefix" +"\n\t\t"+"Prefix for output files (default '"+outprefix+"')."+"\n"+ //"\t"+" , --SAM" +"\n\t\t"+"Output SAM not BAM."+"\n"+ "\t"+"--aligned" +"\t\t"+"Allow reads to be aligned (default "+boolStringify(allowAligned)+")"+"\n"+ "\t"+"-v , --verbose" +"\t\t"+"Turn all messages on (default "+boolStringify(verbose)+")"+"\n"+ "\t"+"--log [log file]" +"\t"+"Print a tally of merged reads to this log file (default only to stderr)"+"\n"+ "\n\t"+"Paired End merging/Single Read trimming options"+"\n"+ "\t\t"+"You can specify either:"+"\n"+ "\t\t\t"+"--ancientdna"+"\t\t\t"+"ancient DNA (default "+boolStringify(ancientDNA)+")"+"\n"+ "\t\t"+" "+"\t\t\t\t"+"this allows for partial 
overlap"+"\n"+ "\n\t\t"+"or if you know your size length distribution:"+"\n"+ "\t\t\t"+"--loc"+"\t\t\t\t"+"Location for lognormal dist. (default none)"+"\n"+ "\t\t\t"+"--scale"+"\t\t\t\t"+"Scale for lognormal dist. (default none)"+"\n"+ // "\t\t\t\t\t\t\tGood for merging ancient DNA reads into a single sequence\n\n" "\n\t\t"+"--keepOrig"+"\t\t\t\t"+"Write original reads if they are trimmed or merged (default "+boolStringify(keepOrig)+")"+"\n"+ "\t\t\t\t\t\t\tSuch reads will be marked as PCR duplicates\n\n" "\t\t"+"-f , --adapterFirstRead" +"\t\t\t"+"Adapter that is observed after the forward read (def. Multiplex: "+options_adapter_F_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-s , --adapterSecondRead" +"\t\t"+"Adapter that is observed after the reverse read (def. Multiplex: "+options_adapter_S_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-c , --FirstReadChimeraFilter" +"\t\t"+"If the forward read looks like this sequence, the cluster is filtered out.\n\t\t\t\t\t\t\tProvide several sequences separated by comma (def. Multiplex: "+options_adapter_chimera_BAM.substr(0,30)+")"+"\n"+ "\t\t"+"-k , --key"+"\t\t\t\t"+"Key sequence with which each sequence starts. Comma separate for forward and reverse reads. (default '"+key+"')"+"\n"+ "\t\t"+"-i , --allowMissing"+"\t\t\t"+"Allow one base in one key to be missing or wrong. 
(default "+boolStringify(allowMissing)+")"+"\n"+ "\t\t"+"-t , --trimCutoff"+"\t\t\t"+"Lowest number of adapter bases to be observed for single Read trimming (default "+stringify(trimCutoff)+")"); if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<endl; cout<<""<<endl; cout<<usage<<endl; return 1; } for(int i=1;i<(argc-1);i++){ //all but the last arg if(strcmp(argv[i],"-fq1") == 0 ){ fastqfile1=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"-fq2") == 0 ){ fastqfile2=string(argv[i+1]); fastqFormat=true; singleEndModeFQ=false; i++; continue; } if(strcmp(argv[i],"-fqo") == 0 ){ fastqoutfile=string(argv[i+1]); fastqFormat=true; i++; continue; } if(strcmp(argv[i],"--log") == 0 ){ logFileName =string(argv[i+1]); printLog=true; i++; continue; } if(strcmp(argv[i],"-p") == 0 || strcmp(argv[i],"--PIPE") == 0 ){ cerr<<"This version no longer works with pipe, exiting"<<endl; return 1; } if(strcmp(argv[i],"-u") == 0 ){ produceUnCompressedBAM=true; continue; } if(strcmp(argv[i],"--aligned") == 0 ){ allowAligned=true; continue; } if(strcmp(argv[i],"-o") == 0 || strcmp(argv[i],"--outfile") == 0 ){ bamFileOUT =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-v") == 0 || strcmp(argv[i],"--verbose") == 0 ){ verbose=true; continue; } if(strcmp(argv[i],"--ancientdna") == 0 ){ ancientDNA=true; continue; } if(strcmp(argv[i],"--keepOrig") == 0 ){ keepOrig=true; continue; } if(strcmp(argv[i],"--loc") == 0 ){ location =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"--scale") == 0 ){ scale =destringify<double>(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-f") == 0 || strcmp(argv[i],"--adapterFirstRead") == 0 ){ adapter_F =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-s") == 0 || strcmp(argv[i],"--adapterSecondRead") == 0 ){ adapter_S =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-c") == 0 || 
strcmp(argv[i],"--FirstReadChimeraFilter") == 0 ){ adapter_chimera =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-k") == 0 || strcmp(argv[i],"--keys") == 0 ){ key =string(argv[i+1]); i++; continue; } if(strcmp(argv[i],"-i") == 0 || strcmp(argv[i],"--allowMissing") == 0 ){ allowMissing=true; continue; } if(strcmp(argv[i],"-t") == 0 || strcmp(argv[i],"--trimCutoff") == 0 ){ trimCutoff=atoi(argv[i+1]); i++; continue; } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } bamFile=argv[argc-1]; if( (location != -1.0 && scale == -1.0) || (location == -1.0 && scale != -1.0) ){ cerr<<"Cannot specify --location without specifying --scale"<<endl; return 1; } if( (location != -1.0 && scale != -1.0) ){ useDist=true; if(ancientDNA){ cerr<<"Cannot specify --location/--scale and --ancientDNA"<<endl; return 1; } } MergeTrimReads mtr (adapter_F,adapter_S,adapter_chimera, key1,key2, trimCutoff,allowMissing,ancientDNA,location,scale,useDist); fqwriters onereadgroup; if(fastqFormat){ if( bamFileOUT != "" || produceUnCompressedBAM || allowAligned){ cerr<<"ERROR : Cannot specify options like -o, -u or --allowAligned for fastq"<<endl; return 1; } if(fastqfile1 == ""){ cerr<<"ERROR : Must specify as least the first file for fastq"<<endl; return 1; } FastQParser * fqp1; FastQParser * fqp2; if(singleEndModeFQ){ fqp1 = new FastQParser (fastqfile1); string outdirs = fastqoutfile+".fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } }else{ fqp1 = new FastQParser (fastqfile1); fqp2 = new FastQParser (fastqfile2); string outdirs = fastqoutfile+".fq.gz"; string outdir1 = fastqoutfile+"_r1.fq.gz"; string outdir2 = fastqoutfile+"_r2.fq.gz"; string outdirsf = fastqoutfile+".fail.fq.gz"; 
string outdir1f = fastqoutfile+"_r1.fail.fq.gz"; string outdir2f = fastqoutfile+"_r2.fail.fq.gz"; onereadgroup.single.open(outdirs.c_str(), ios::out); onereadgroup.pairr1.open(outdir1.c_str(), ios::out); onereadgroup.pairr2.open(outdir2.c_str(), ios::out); onereadgroup.singlef.open(outdirsf.c_str(), ios::out); onereadgroup.pairr1f.open(outdir1f.c_str(), ios::out); onereadgroup.pairr2f.open(outdir2f.c_str(), ios::out); if(!onereadgroup.single.good()){ cerr<<"Cannot write to file "<<outdirs<<endl; return 1; } if(!onereadgroup.pairr1.good()){ cerr<<"Cannot write to file "<<outdir1<<endl; return 1; } if(!onereadgroup.pairr2.good()){ cerr<<"Cannot write to file "<<outdir2<<endl; return 1; } if(!onereadgroup.singlef.good()){ cerr<<"Cannot write to file "<<outdirsf<<endl; return 1; } if(!onereadgroup.pairr1f.good()){ cerr<<"Cannot write to file "<<outdir1f<<endl; return 1; } if(!onereadgroup.pairr2f.good()){ cerr<<"Cannot write to file "<<outdir2f<<endl; return 1; } } unsigned int totalSeqs=0; while(fqp1->hasData()){ FastQObj * fo1=fqp1->getData(); vector<string> def1=allTokens( *(fo1->getID()), ' ' ); string def1s=def1[0]; FastQObj * fo2; string def2s; string ext2s; if(!singleEndModeFQ){ if(!fqp2->hasData()){ cerr << "ERROR: Discrepency between fastq files at record " << *(fo1->getID()) <<endl; return 1; } fo2=fqp2->getData(); vector<string> def2=allTokens( *(fo2->getID()), ' ' ); def2s=def2[0]; if(strEndsWith(def1s,"/1")){ def1s=def1s.substr(0,def1s.size()-2); } if(strEndsWith(def2s,"/2")){ def2s=def2s.substr(0,def2s.size()-2); } if(strBeginsWith(def1s,"@")){ def1s=def1s.substr(1,def1s.size()-1); } if(strBeginsWith(def2s,"@")){ def2s=def2s.substr(1,def2s.size()-1); } if(def1s != def2s){ cerr << "ERROR: Discrepency between fastq files, different names " << *(fo1->getID()) <<" and "<< *(fo2->getID()) <<endl; return 1; } merged result= mtr.process_PE(*(fo1->getSeq()),*(fo1->getQual()), *(fo2->getSeq()),*(fo2->getQual())); mtr.incrementCountall(); if(result.code != ' '){ 
//keys or chimeras if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.pairr2f<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1f<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; }else{ if(result.sequence != ""){ //new sequence onereadgroup.single<<"@"<<def1s<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; if( result.sequence.length() > max(fo1->getSeq()->length(),fo2->getSeq()->length()) ){ mtr.incrementCountmergedoverlap(); }else{ mtr.incrementCountmerged(); } }else{ //keep as is mtr.incrementCountnothing(); onereadgroup.pairr2<<"@"<<def2s<<"/2" <<endl <<*(fo2->getSeq())<<endl<<"+"<<endl <<*(fo2->getQual())<<endl; onereadgroup.pairr1<<"@"<<def1s<<"/1" <<endl <<*(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } }else{ merged result=mtr.process_SR(*(fo1->getSeq()),*(fo1->getQual())); mtr.incrementCountall(); if(result.code != ' '){ //either chimera or missing key if(result.code == 'K'){ mtr.incrementCountfkey(); }else{ if(result.code == 'D'){ mtr.incrementCountchimera(); }else{ cerr << "leehom: Wrong return code =\""<<result.code<<"\""<<endl; exit(1); } } onereadgroup.singlef<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; continue; } if(result.sequence != ""){ //new sequence mtr.incrementCounttrimmed(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << result.sequence<<endl<<"+"<<endl <<result.quality<<endl; }else{ mtr.incrementCountnothing(); onereadgroup.single<<""<<*(fo1->getID())<<"" <<endl << *(fo1->getSeq())<<endl<<"+"<<endl <<*(fo1->getQual())<<endl; } } totalSeqs++; } delete fqp1; if(!singleEndModeFQ){ delete fqp2; } if(singleEndModeFQ){ onereadgroup.single.close(); onereadgroup.singlef.close(); }else{ 
onereadgroup.single.close(); onereadgroup.pairr1.close(); onereadgroup.pairr2.close(); onereadgroup.singlef.close(); onereadgroup.pairr1f.close(); onereadgroup.pairr2f.close(); } //fastq }else{ //else BAM // initMerge(); // set_adapter_sequences(adapter_F, // adapter_S, // adapter_chimera); // set_options(trimCutoff,allowMissing,mergeoverlap); if(key != ""){ size_t found=key.find(","); if (found == string::npos){ //single end reads key1=key; key2=""; } else{ //paired-end key1=key.substr(0,found); key2=key.substr(found+1,key.length()-found+1); } } if( bamFileOUT == "" ){ cerr<<"The output must be a be specified, exiting"<<endl; return 1; } if ( !reader.Open(bamFile) ) { cerr << "Could not open input BAM file "<<bamFile << endl; return 1; } SamHeader header = reader.GetHeader(); string pID = "mergeTrimReadsBAM"; string pName = "mergeTrimReadsBAM"; string pCommandLine = ""; for(int i=0;i<(argc);i++){ pCommandLine += (string(argv[i])+" "); } putProgramInHeader(&header,pID,pName,pCommandLine,returnGitHubVersion(string(argv[0]),"..")); const RefVector references = reader.GetReferenceData(); //we will not call bgzip with full compression, good for piping into another program to //lessen the load on the CPU if(produceUnCompressedBAM) writer.SetCompressionMode(BamWriter::Uncompressed); if ( !writer.Open(bamFileOUT,header,references) ) { cerr << "Could not open output BAM file "<<bamFileOUT << endl; return 1; } SamHeader sh=reader.GetHeader(); //Up to the user to be sure that a sequence is followed by his mate // if(!sh.HasSortOrder() || // sh.SortOrder != "queryname"){ // cerr << "Bamfile must be sorted by queryname" << endl; // return 1; // } BamAlignment al; BamAlignment al2; bool al2Null=true; while ( reader.GetNextAlignment(al) ) { if(al.IsMapped() || al.HasTag("NM") || al.HasTag("MD") ){ if(!allowAligned){ cerr << "Reads should not be aligned" << endl; return 1; }else{ //should we remove tags ? 
} } if(al.IsPaired() && al2Null ){ al2=al; al2Null=false; continue; }else{ if(al.IsPaired() && !al2Null){ bool result = mtr.processPair(al,al2); if( result ){//was merged BamAlignment orig; BamAlignment orig2; if(keepOrig){ orig2 = al2; orig = al; } writer.SaveAlignment(al); if(keepOrig){ orig.SetIsDuplicate(true); orig2.SetIsDuplicate(true); writer.SaveAlignment(orig2); writer.SaveAlignment(orig); } //the second record is empty }else{ //keep the sequences as pairs writer.SaveAlignment(al2); writer.SaveAlignment(al); } // // SINGLE END // }else{ BamAlignment orig; if(keepOrig){ orig =al; } mtr.processSingle(al); if(keepOrig){ //write duplicate if(orig.QueryBases.length() != al.QueryBases.length()){ orig.SetIsDuplicate(true); writer.SaveAlignment(orig); } } writer.SaveAlignment(al); } //end single end al2Null=true; }//second pair } //while al reader.Close(); writer.Close(); } //else BAM cerr <<mtr.reportSingleLine()<<endl; if(printLog){ ofstream fileLog; fileLog.open(logFileName.c_str()); if (fileLog.is_open()){ fileLog <<mtr.reportMultipleLines() <<endl; }else{ cerr << "Unable to print to file "<<logFileName<<endl; } fileLog.close(); } return 0; }
int main (int argc, char *argv[]) { if( (argc!= 4 && argc !=5 && argc !=6) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cerr<<"Usage:splitByRG [in bam] [rg Tally] [out prefix] (optional target)"<<endl<<"this program will subsample a BAM file per read group for a certain target\nFor example splitByRG in.bam tally.txt out will create\nout.rg1.bam\nout.rg2.bam\n"<<endl; return 1; } string bamfiletopen = string(argv[1]); string rgTally = string(argv[2]); string bamDirOutPrefix = string(argv[3]); int target = 200000; int maxTarget = 1000000; if(argc==5){ target = destringify<int> ( string(argv[4]) ); } if(argc==6){ target = destringify<int> ( string(argv[4]) ); maxTarget = destringify<int> ( string(argv[5]) ); } cerr<<"minimum fragments:\t"<<target<<endl; cerr<<"target fragments:\t"<<maxTarget<<endl; string line; ifstream myFileTally; map<string,double> rg2Fraction; myFileTally.open(rgTally.c_str(), ios::in); cerr<<"Retained groups:\n"<<endl; cerr<<"RG\t#mapped\tfraction retained"<<endl; cerr<<"-----------------------------------"<<endl; if (myFileTally.is_open()){ while ( getline (myFileTally,line)){ vector<string> tokens = allTokens(line,'\t'); if(tokens.size() > 6) if( tokens[1] == "pass" && (tokens[0] != "\"\"" && tokens[0] != "control" && tokens[0] != "TOTAL") ){ //cout<<tokens[0]<<"\t"<<tokens[5]<<endl; int count = destringify<int>(tokens[5]); if(count>target){ if(count>=maxTarget){ rg2Fraction[ tokens[0] ] = double(maxTarget)/double(count); cout<<tokens[0]<<"\t"<<count<<"\t"<<double(maxTarget)/double(count)<<endl; }else{ cout<<tokens[0]<<"\t"<<count<<"\t"<<1.0<<endl; rg2Fraction[ tokens[0] ] = 1.0; } } } } myFileTally.close(); }else{ cerr << "Unable to open file "<<rgTally<<endl; return 1; } map<string,BamWriter *> rg2BamWriter; // if(!isDirectory(bamDirOut)){ // cerr<<"ERROR: the out directory does not exist"<<endl; // return 1; // } BamReader reader; if ( 
!reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); vector<RefData> refData=reader.GetReferenceData(); SamReadGroupDictionary srgd=header.ReadGroups; for(SamReadGroupConstIterator srgci=srgd.ConstBegin(); srgci<srgd.ConstEnd(); srgci++){ //cout<<*srgci<<endl; const SamReadGroup rg = (*srgci); //cout<<rg.ID<<endl; if( rg2Fraction.find(rg.ID) != rg2Fraction.end() ){ rg2BamWriter[rg.ID] = new BamWriter(); rg2BamWriter[rg.ID]->Open(bamDirOutPrefix+"."+rg.ID+".bam",header,references); } //cout<<bamDirOutPrefix+"."+rg.ID+".bam"<<endl; } // return 1; // BamWriter unmapped; // cout<<header.ToString()<<endl; // return 1; // if ( !unmapped.Open(bamDirOutPrefix+".unmapped.bam",header,references) ) { // cerr << "Could not open output BAM file "<< bamDirOutPrefix+".unmapped.bam" << endl; // return 1; // } // cout<<"reading"<<endl; BamAlignment al; unsigned int total=0; while ( reader.GetNextAlignment(al) ) { if(al.HasTag("RG") && al.IsMapped() ){ string rgTag; al.GetTag("RG",rgTag); //cout<<rgTag<<endl; if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new: ignore completely }else{ if( randomProb() <= rg2Fraction[ rgTag ] ){ rg2BamWriter[rgTag]->SaveAlignment(al); //cout<<"wrote "<<rgTag<<endl; } else{ //cout<<"skipped "<<rgTag<<endl; } } }// else{ // string rgTag="unknown"; // //cout<<rgTag<<endl; // if(rg2BamWriter.find(rgTag) == rg2BamWriter.end()){ //new // cerr<<"Found new RG "<<rgTag<<endl; // rg2BamWriter[rgTag] = new BamWriter(); // if ( !rg2BamWriter[rgTag]->Open(bamDirOutPrefix+"."+rgTag+".bam",header,references) ) { // cerr << "Could not open output BAM file "<< bamDirOutPrefix<<"."<<rgTag<<".bam" << endl; // return 1; // } // rg2BamWriter[rgTag]->SaveAlignment(al); // }else{ // rg2BamWriter[rgTag]->SaveAlignment(al); // } // // cerr << "Cannot get RG tag for " << al.Name<<endl; // // return 1; // } total++; } //while al 
reader.Close(); // writer.Close(); // unmapped.Close(); map<string,BamWriter *>::iterator rg2BamWriterIt; for (rg2BamWriterIt =rg2BamWriter.begin(); rg2BamWriterIt!=rg2BamWriter.end(); rg2BamWriterIt++){ rg2BamWriterIt->second->Close(); } cerr<<"Wrote succesfully "<<total<<" reads"<<endl; return 0; }
void GenericIndividualSnpCall::simpleSnpCall(string &fastaObj, BamReader &bamObj, int chrID, int leftPosition, int rightPosition, vector<Allele> &variantCandidates, map<int,list<tuple<char,int,int,double>>> &bamData) { set<int> BlockSnpPositions; vector<Allele> BlockSnpAlleles; // rewind bamObj.Rewind(); // set region bamObj.SetRegion(chrID, leftPosition, chrID, rightPosition); BamAlignment al; // search SNP positions in the region while (bamObj.GetNextAlignment(al)) { if (!GenericBamAlignmentTools::goodAlignment(al)) continue; if (!al.HasTag("MD")) continue; vector<long> SnpInAlignment; GenericBamAlignmentTools::getBamAlignmentMismatches(al, SnpInAlignment); for (int i=0; i<SnpInAlignment.size(); i++) { BlockSnpPositions.insert(SnpInAlignment[i]); } } // pileup visitor SimpleSnpCallPileupVisitor visitor(&fastaObj, chrID, leftPosition, rightPosition, m_downSample, &BlockSnpPositions, &BlockSnpAlleles, &bamData); PileupEngine SimpleSnpCallPileupEngine; SimpleSnpCallPileupEngine.AddVisitor(&visitor); // rewind bamObj.Rewind(); // set region bamObj.SetRegion(chrID, leftPosition, chrID, rightPosition); // load data while(bamObj.GetNextAlignment(al)) { if (!GenericBamAlignmentTools::goodAlignment(al)) continue; if (!GenericBamAlignmentTools::validMapQuality(al, m_minMapQuality)) continue; if (!GenericBamAlignmentTools::validReadIdentity(al, m_maxMismatchFrac)) continue; if (!GenericBamAlignmentTools::validReadLength(al, m_minReadLength)) continue; if (!al.HasTag("MD")) continue; SimpleSnpCallPileupEngine.AddAlignment(al); } SimpleSnpCallPileupEngine.Flush(); // Filter SNP candidiate for (int i=0; i<BlockSnpAlleles.size(); i++) { Allele allele = BlockSnpAlleles[i]; if (allele.m_alleleDepth < m_minSnpRead) continue; if (allele.m_alleleDepth < m_minSnpFrac*allele.m_globalDepth) continue; variantCandidates.push_back(allele); } }
int main (int argc, char *argv[]) { if( (argc== 1) || (argc== 2 && string(argv[1]) == "-h") || (argc== 2 && string(argv[1]) == "-help") || (argc== 2 && string(argv[1]) == "--help") ){ cout<<"Usage:"<<argv[0]<<"<options> [in bam] [ref original length] [extension length]"<<endl; cout<<"This program returns the same BAM file except with the XA flag for circular references"<<endl; cout<<"Options:"<<endl; //cout<<"\t-m\t\t\t\tUse mapped reads only"<<endl; return 1; } for(int i=1;i<(argc-3);i++){ //all but the last 3 args // if(string(argv[i]) == "-m" ){ // onlyMapped=true; // continue; // } cerr<<"Unknown option "<<argv[i] <<" exiting"<<endl; return 1; } string bamfiletopen = string( argv[argc-3]); int origLength = destringify<int>(argv[argc-2]); int extLength = destringify<int>(argv[argc-1]); string outputFilename = "/dev/stdout"; BamReader reader; if ( !reader.Open(bamfiletopen) ) { cerr << "Could not open input BAM files." << endl; return 1; } const SamHeader header = reader.GetHeader(); const RefVector references = reader.GetReferenceData(); BamWriter writer; if ( !writer.Open(outputFilename, header, references) ) { cerr << "Could not open output BAM file" << endl; return 1; } BamAlignment al; string nameTAG="XA"; while ( reader.GetNextAlignment(al) ) { if(al.HasTag(nameTAG)) { cerr << "ERROR: Read "<<al.Name<<" already has XA tags" << endl; return 1; } writer.SaveAlignment(al); } //while al reader.Close(); writer.Close(); return 0; }