void setReadCountInSection( vector<int> & raw_counts, string & chr, int center, SamFile & samIn, SamFileHeader & samHeader, vector<RefSeq*> & REF_SEQ ) { int st = center - WIN/2; int ed = center + WIN/2; bool section_status = samIn.SetReadSection( chr.c_str(), st, ed ); if (!section_status) { std::cerr << "Warning: Unable to set read section: " << chr << ": " << st << "-" << ed << ". Set section depth = 0!" << std::endl; return; } // proper reads map<string, vector< DiscPair > > disc_recs; // record where the disc come from ProperDeck pDeck( REF_SEQ ); SamRecord sam_rec; while( samIn.ReadRecord(samHeader, sam_rec) ) { bool pass_qc = PassQC( sam_rec ); if ( !pass_qc ) continue; if ( sam_rec.getFlag() & 0x2 ) { if ( sam_rec.getInsertSize() > 0 ) pDeck.Add( sam_rec ); else { RetrievedIndex rv = pDeck.RetrieveIndex( sam_rec ); int index = getRetrievedIndexToRawCounts( rv ); if (index >= 0) { // for read partially in window, only add clip if ( sam_rec.get1BasedPosition() < st || sam_rec.get1BasedAlignmentEnd() > ed ) { if ( index >= 2 ) raw_counts[ index ]++; } else raw_counts[ index ]++; } } } else { // disc: rec info and wait to reset section later if ( sam_rec.getReadLength() < 30 ) continue; string mate_chr = sam_rec.getMateReferenceName(); // check if this one is valid as anchor DiscPair new_pair( 1, 0, sam_rec, REF_SEQ ); disc_recs[mate_chr].push_back( new_pair ); } } // disc reads for( map<string, vector< DiscPair > >::iterator chr_it = disc_recs.begin(); chr_it != disc_recs.end(); chr_it++ ) { for( vector< DiscPair >::iterator dp_it = chr_it->second.begin(); dp_it != chr_it->second.end(); dp_it++ ) { bool section_status = samIn.SetReadSection( chr_it->first.c_str(), dp_it->GetSecondAlignPosition(), dp_it->GetSecondAlignPosition() + WIN ); if (!section_status) { std::cerr << "ERROR: Unable to set read section: " << chr << ": " << st << "-" << ed << std::endl; exit(1); } SamRecord sam_rec; while( samIn.ReadRecord(samHeader, sam_rec) ) { bool pass_qc = PassQC( 
sam_rec ); if ( !pass_qc ) continue; if ( !DiscSamPass(sam_rec) ) continue; int position = sam_rec.get1BasedPosition(); if ( position > dp_it->GetFirstAlignPosition() ) break; if ( sam_rec.getFlag() & 0x2 ) continue; bool same_pair = dp_it->IsSamePair( sam_rec ); if ( !same_pair ) continue; // now add ro raw stats: always use first as anchor dp_it->AddSecondToPair( sam_rec ); Loci loci = dp_it->GetFirstLoci(); if ( dp_it->FirstValid() ) { int index = getLociToRawCounts( loci ); raw_counts[ index ]++; // clear & break break; } } } } }
void testIndex(BamIndex& bamIndex) { assert(bamIndex.getNumMappedReads(1) == 2); assert(bamIndex.getNumUnMappedReads(1) == 0); assert(bamIndex.getNumMappedReads(0) == 4); assert(bamIndex.getNumUnMappedReads(0) == 1); assert(bamIndex.getNumMappedReads(23) == -1); assert(bamIndex.getNumUnMappedReads(23) == -1); assert(bamIndex.getNumMappedReads(-1) == 0); assert(bamIndex.getNumUnMappedReads(-1) == 2); assert(bamIndex.getNumMappedReads(-2) == -1); assert(bamIndex.getNumUnMappedReads(-2) == -1); assert(bamIndex.getNumMappedReads(22) == 0); assert(bamIndex.getNumUnMappedReads(22) == 0); // Get the chunks for reference id 1. Chunk testChunk; SortedChunkList chunkList; assert(bamIndex.getChunksForRegion(1, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x4e7); assert(testChunk.chunk_end == 0x599); // Get the chunks for reference id 0. assert(bamIndex.getChunksForRegion(0, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x360); assert(testChunk.chunk_end == 0x4e7); // Get the chunks for reference id 2. assert(bamIndex.getChunksForRegion(2, -1, -1, chunkList) == true); assert(!chunkList.empty()); testChunk = chunkList.pop(); assert(chunkList.empty()); assert(testChunk.chunk_beg == 0x599); assert(testChunk.chunk_end == 0x5ea); // Get the chunks for reference id 3. // There isn't one for this ref id, but still successfully read the file, // so it should return true, but the list should be empty. assert(bamIndex.getChunksForRegion(3, -1, -1, chunkList) == true); assert(chunkList.empty()); // Test reading an indexed bam file. 
SamFile inFile; assert(inFile.OpenForRead("testFiles/sortedBam.bam")); inFile.setSortedValidation(SamFile::COORDINATE); assert(inFile.ReadBamIndex("testFiles/sortedBam.bam.bai")); SamFileHeader samHeader; assert(inFile.ReadHeader(samHeader)); SamRecord samRecord; // Test getting num mapped/unmapped reads. assert(inFile.getNumMappedReadsFromIndex(1) == 2); assert(inFile.getNumUnMappedReadsFromIndex(1) == 0); assert(inFile.getNumMappedReadsFromIndex(0) == 4); assert(inFile.getNumUnMappedReadsFromIndex(0) == 1); assert(inFile.getNumMappedReadsFromIndex(23) == -1); assert(inFile.getNumUnMappedReadsFromIndex(23) == -1); assert(inFile.getNumMappedReadsFromIndex(-1) == 0); assert(inFile.getNumUnMappedReadsFromIndex(-1) == 2); assert(inFile.getNumMappedReadsFromIndex(-2) == -1); assert(inFile.getNumUnMappedReadsFromIndex(-2) == -1); assert(inFile.getNumMappedReadsFromIndex(22) == 0); assert(inFile.getNumUnMappedReadsFromIndex(22) == 0); assert(inFile.getNumMappedReadsFromIndex("2", samHeader) == 2); assert(inFile.getNumUnMappedReadsFromIndex("2", samHeader) == 0); assert(inFile.getNumMappedReadsFromIndex("1", samHeader) == 4); assert(inFile.getNumUnMappedReadsFromIndex("1", samHeader) == 1); assert(inFile.getNumMappedReadsFromIndex("22", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("22", samHeader) == 0); assert(inFile.getNumMappedReadsFromIndex("", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("*", samHeader) == 2); assert(inFile.getNumMappedReadsFromIndex("unknown", samHeader) == -1); assert(inFile.getNumUnMappedReadsFromIndex("unknown", samHeader) == -1); assert(inFile.getNumMappedReadsFromIndex("X", samHeader) == 0); assert(inFile.getNumUnMappedReadsFromIndex("X", samHeader) == 0); // Section -1 = Ref *: 2 records (8 & 10 from testSam.sam that is reflected // in the validation. 
assert(inFile.SetReadSection(-1)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead8(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead10(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 2 = Ref 3: 1 records (9 from testSam.sam that is reflected // in the validation. assert(inFile.SetReadSection(2)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead9(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 0 = Ref 1: 5 records (3, 4, 1, 2, & 6 from testSam.sam that is // reflected in the validation. assert(inFile.SetReadSection(0)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead3(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead4(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead6(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 1 = Ref 2: 2 records (5 & 7 from testSam.sam that is reflected // in the validation. assert(inFile.SetReadSection(1)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead5(samRecord); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead7(samRecord); assert(inFile.ReadRecord(samHeader, samRecord) == false); // Section 3 to 22 (ref 4 - 23): 0 records. for(int i = 3; i < 23; i++) { assert(inFile.SetReadSection(i)); assert(inFile.ReadRecord(samHeader, samRecord) == false); } // Set the read section. 
assert(inFile.SetReadSection("1", 1010, 1012)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 2); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1010, 1020)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 5); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1010, 1011)); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 1); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord) == false); assert(inFile.SetReadSection("1", 1011, 1012)); 
assert(inFile.ReadRecord(samHeader, samRecord)); validateRead1(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 1); assert(samRecord.getNumOverlaps(1010, 1012) == 2); assert(samRecord.getNumOverlaps(1010, 1020) == 5); assert(samRecord.getNumOverlaps(1010, 1011) == 1); assert(samRecord.getNumOverlaps(1011, 1012) == 1); assert(inFile.ReadRecord(samHeader, samRecord)); validateRead2(samRecord); assert(inFile.GetNumOverlaps(samRecord) == 0); assert(samRecord.getNumOverlaps(1010, 1012) == 0); assert(samRecord.getNumOverlaps(1010, 1020) == 0); assert(samRecord.getNumOverlaps(1010, 1011) == 0); assert(samRecord.getNumOverlaps(1011, 1012) == 0); assert(inFile.ReadRecord(samHeader, samRecord) == false); }
bool BamProcessor::init (const ContalignParams& p) { read_cnt_ = proc_cnt_ = toolongs_ = unaligned_cnt_ = fail_cnt_ = nomd_cnt_ = realigned_cnt_ = modified_cnt_ = pos_adjusted_cnt_ = 0; log_diff_ = log_matr_ = log_base_ = false; p_ = &p; if (!*p.inbam ()) ers << "Input file name not specified" << Throw; limit_ = p.limit (); skip_ = p.skip (); infile_.OpenForRead (p.inbam ()); if (!infile_.IsOpen ()) ers << p.inbam () << ThrowEx (FileNotFoundRerror); bool index_ok = false; if (*p.bamidx ()) { index_ok = infile_.ReadBamIndex (p.bamidx ()); if (!index_ok) warn << "Unable to open specified BAM index: " << p.bamidx () << ". Default index will be attempted" << std::endl; } if (!index_ok) { try { index_ok = infile_.ReadBamIndex (); } catch (std::exception& e) { // for some reason not converted into return status by libStatGen } if (!index_ok) warn << "Unable to open default BAM index for " << p.inbam () << std::endl; } if (*p.refname () || p.refno () != -1) { if (!index_ok) ers << "Reference section specified, but the BAM index could not be open." << Throw; if (*p.refname ()) { if (p.endpos () != 0) { infile_.SetReadSection (p.refname (), p.begpos (), p.endpos ()); info << "Read section set : " << p.refname () << ": " << p.begpos () << "-" << p.endpos () << std::endl; } else { infile_.SetReadSection (p.refname ()); info << "Read section set : " << p.refname () << std::endl; } } else { if (p.endpos () != 0) { info << "Read section set : ref# " << p.refno () << ": " << p.begpos () << "-" << p.endpos () << std::endl; infile_.SetReadSection (p.refno (), p.begpos (), p.endpos ()); } else { info << "Read section set : ref# " << p.refno () << std::endl; infile_.SetReadSection (p.refno ()); } } } if (*p.outbam ()) { if (!p.overwrite () && file_exists (p.outbam ())) ers << "Output file " << p.outbam () << " exists. 
Use --ov key to allow overwriting" << Throw; outfile_.OpenForWrite (p.outbam ()); if (!outfile_.IsOpen ()) ers << "Unable to open output file " << p.outbam () << std::endl; } if (*p.logfname ()) { if (!p.overwrite () && file_exists (p.logfname ())) ers << "Log file " << p.logfname () << " exists. Use --ov key to allow overwriting" << Throw; logfile_.open (p.logfname (), std::fstream::out); if (!logfile_.is_open ()) ers << "Unable to open log file " << p.logfname () << std::endl; time_t t = time (NULL); logfile_ << "Context-aware realigner log\nStarted at " << asctime (localtime (&t)) << "\nParameters:\n"; logfile_ << *(p.parameters_); logfile_ << std::endl; log_base_ = p.logging ("base"); log_diff_ = p.logging ("diff"); log_matr_ = p.logging ("matr"); } band_width_ = p.bwid (); switch (p.algo ()) { case ContalignParams::TEMPL: { matrix_.configure (genstr::nucleotides.symbols (), genstr::nucleotides.size (), genstr::NegUnitaryMatrix <int, 4>().values ()); gap_cost_.configure (p.gip (), p.gep ()); taligner_.configure (&matrix_, &gap_cost_, &gap_cost_, &genstr::nn2num, &genstr::nn2num); } break; case ContalignParams::PLAIN: { batches_.reset (max_batch_no_); aligner_.init (MAX_SEQ_LEN, MAX_SEQ_LEN*MAX_BAND_WIDTH, p.gip (), p.gep (), p.mat (), -p.mis ()); if (log_matr_) aligner_.set_log (logfile_); if (p.debug () > 5) aligner_.set_trace (true); } break; case ContalignParams::POLY: { batches_.reset (max_batch_no_); contalign_.init (MAX_SEQ_LEN, MAX_RSEQ_LEN, MAX_SEQ_LEN*MAX_BAND_WIDTH, p.gip (), p.gep (), p.mat (), -p.mis ()); if (log_matr_) contalign_.set_log (logfile_); if (p.debug () > 5) contalign_.set_trace (true); } break; default: { ers << "Alignment algorithm " << p.algostr () << " not yet supported" << Throw; } } timer_.reset (DEFAULT_REPORT_IVAL, 1); return true; }
int Stats::execute(int argc, char **argv) { // Extract command line arguments. String inFile = ""; String indexFile = ""; bool basic = false; bool noeof = false; bool params = false; bool qual = false; bool phred = false; int maxNumReads = -1; bool unmapped = false; String pBaseQC = ""; String cBaseQC = ""; String regionList = ""; int excludeFlags = 0; int requiredFlags = 0; bool withinRegion = false; int minMapQual = 0; String dbsnp = ""; PosList *dbsnpListPtr = NULL; bool baseSum = false; int bufferSize = PileupHelper::DEFAULT_WINDOW_SIZE; ParameterList inputParameters; BEGIN_LONG_PARAMETERS(longParameterList) LONG_PARAMETER_GROUP("Required Parameters") LONG_STRINGPARAMETER("in", &inFile) LONG_PARAMETER_GROUP("Types of Statistics") LONG_PARAMETER("basic", &basic) LONG_PARAMETER("qual", &qual) LONG_PARAMETER("phred", &phred) LONG_STRINGPARAMETER("pBaseQC", &pBaseQC) LONG_STRINGPARAMETER("cBaseQC", &cBaseQC) LONG_PARAMETER_GROUP("Optional Parameters") LONG_INTPARAMETER("maxNumReads", &maxNumReads) LONG_PARAMETER("unmapped", &unmapped) LONG_STRINGPARAMETER("bamIndex", &indexFile) LONG_STRINGPARAMETER("regionList", ®ionList) LONG_INTPARAMETER("excludeFlags", &excludeFlags) LONG_INTPARAMETER("requiredFlags", &requiredFlags) LONG_PARAMETER("noeof", &noeof) LONG_PARAMETER("params", ¶ms) LONG_PARAMETER_GROUP("Optional phred/qual Only Parameters") LONG_PARAMETER("withinRegion", &withinRegion) LONG_PARAMETER_GROUP("Optional BaseQC Only Parameters") LONG_PARAMETER("baseSum", &baseSum) LONG_INTPARAMETER("bufferSize", &bufferSize) LONG_INTPARAMETER("minMapQual", &minMapQual) LONG_STRINGPARAMETER("dbsnp", &dbsnp) END_LONG_PARAMETERS(); inputParameters.Add(new LongParameters ("Input Parameters", longParameterList)); inputParameters.Read(argc-1, &(argv[1])); // If no eof block is required for a bgzf file, set the bgzf file type to // not look for it. if(noeof) { // Set that the eof block is not required. 
BgzfFileType::setRequireEofBlock(false); } // Check to see if the in file was specified, if not, report an error. if(inFile == "") { usage(); inputParameters.Status(); // In file was not specified but it is mandatory. std::cerr << "--in is a mandatory argument for stats, " << "but was not specified" << std::endl; return(-1); } // Use the index file if unmapped or regionList is not empty. bool useIndex = (unmapped|| (!regionList.IsEmpty())); // IndexFile is required, so check to see if it has been set. if(useIndex && (indexFile == "")) { // In file was not specified, so set it to the in file // + ".bai" indexFile = inFile + ".bai"; } //////////////////////////////////////// // Setup in case pileup is used. Pileup<PileupElementBaseQCStats> pileup(bufferSize); // Initialize start/end positions. myStartPos = 0; myEndPos = -1; // Open the output qc file if applicable. IFILE baseQCPtr = NULL; if(!pBaseQC.IsEmpty() && !cBaseQC.IsEmpty()) { usage(); inputParameters.Status(); // Cannot specify both types of baseQC. std::cerr << "Cannot specify both --pBaseQC & --cBaseQC." << std::endl; return(-1); } else if(!pBaseQC.IsEmpty()) { baseQCPtr = ifopen(pBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(true); } else if(!cBaseQC.IsEmpty()) { baseQCPtr = ifopen(cBaseQC, "w"); PileupElementBaseQCStats::setPercentStats(false); } if(baseQCPtr != NULL) { PileupElementBaseQCStats::setOutputFile(baseQCPtr); PileupElementBaseQCStats::printHeader(); } if((baseQCPtr != NULL) || baseSum) { PileupElementBaseQCStats::setMapQualFilter(minMapQual); PileupElementBaseQCStats::setBaseSum(baseSum); } if(params) { inputParameters.Status(); } // Open the file for reading. SamFile samIn; if(!samIn.OpenForRead(inFile)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } samIn.SetReadFlags(requiredFlags, excludeFlags); // Set whether or not basic statistics should be generated. samIn.GenerateStatistics(basic); // Read the sam header. 
SamFileHeader samHeader; if(!samIn.ReadHeader(samHeader)) { fprintf(stderr, "%s\n", samIn.GetStatusMessage()); return(samIn.GetStatus()); } // Open the bam index file for reading if we are // doing unmapped reads (also set the read section). if(useIndex) { samIn.ReadBamIndex(indexFile); if(unmapped) { samIn.SetReadSection(-1); } if(!regionList.IsEmpty()) { myRegionList = ifopen(regionList, "r"); } } ////////////////////////// // Read dbsnp if specified and doing baseQC if(((baseQCPtr != NULL) || baseSum) && (!dbsnp.IsEmpty())) { // Read the dbsnp file. IFILE fdbSnp; fdbSnp = ifopen(dbsnp,"r"); // Determine how many entries. const SamReferenceInfo& refInfo = samHeader.getReferenceInfo(); int maxRefLen = 0; for(int i = 0; i < refInfo.getNumEntries(); i++) { int refLen = refInfo.getReferenceLength(i); if(refLen >= maxRefLen) { maxRefLen = refLen + 1; } } dbsnpListPtr = new PosList(refInfo.getNumEntries(),maxRefLen); if(fdbSnp==NULL) { std::cerr << "Open dbSNP file " << dbsnp.c_str() << " failed!\n"; } else if(dbsnpListPtr == NULL) { std::cerr << "Failed to init the memory allocation for the dbsnpList.\n"; } else { // Read the dbsnp file. StringArray tokens; String buffer; int position = 0; int refID = 0; // Loop til the end of the file. while (!ifeof(fdbSnp)) { // Read the next line. buffer.ReadLine(fdbSnp); // If it does not have at least 2 columns, // continue to the next line. if (buffer.IsEmpty() || buffer[0] == '#') continue; tokens.AddTokens(buffer); if(tokens.Length() < 2) continue; if(!tokens[1].AsInteger(position)) { std::cerr << "Improperly formatted region line, start position " << "(2nd column) is not an integer: " << tokens[1] << "; Skipping to the next line.\n"; continue; } // Look up the reference name. 
refID = samHeader.getReferenceID(tokens[0]); if(refID != SamReferenceInfo::NO_REF_ID) { // Reference id was found, so add it to the dbsnp dbsnpListPtr->addPosition(refID, position); } tokens.Clear(); buffer.Clear(); } } ifclose(fdbSnp); } // Read the sam records. SamRecord samRecord; int numReads = 0; ////////////////////// // Setup in case doing a quality count. // Quality histogram. const int MAX_QUAL = 126; const int START_QUAL = 33; uint64_t qualCount[MAX_QUAL+1]; for(int i = 0; i <= MAX_QUAL; i++) { qualCount[i] = 0; } const int START_PHRED = 0; const int PHRED_DIFF = START_QUAL - START_PHRED; const int MAX_PHRED = MAX_QUAL - PHRED_DIFF; uint64_t phredCount[MAX_PHRED+1]; for(int i = 0; i <= MAX_PHRED; i++) { phredCount[i] = 0; } int refPos = 0; Cigar* cigarPtr = NULL; char cigarChar = '?'; // Exclude clips from the qual/phred counts if unmapped reads are excluded. bool qualExcludeClips = excludeFlags & SamFlag::UNMAPPED; ////////////////////////////////// // When not reading by sections, getNextSection returns true // the first time, then false the next time. while(getNextSection(samIn)) { // Keep reading records from the file until SamFile::ReadRecord // indicates to stop (returns false). while(((maxNumReads < 0) || (numReads < maxNumReads)) && samIn.ReadRecord(samHeader, samRecord)) { // Another record was read, so increment the number of reads. ++numReads; // See if the quality histogram should be genereated. if(qual || phred) { // Get the quality. const char* qual = samRecord.getQuality(); // Check for no quality ('*'). if((qual[0] == '*') && (qual[1] == 0)) { // This record does not have a quality string, so no // quality processing is necessary. } else { int index = 0; cigarPtr = samRecord.getCigarInfo(); cigarChar = '?'; refPos = samRecord.get0BasedPosition(); if(!qualExcludeClips && (cigarPtr != NULL)) { // Offset the reference position by any soft clips // by subtracting the queryIndex of this start position. 
// refPos is now the start position of the clips. refPos -= cigarPtr->getQueryIndex(0); } while(qual[index] != 0) { // Skip this quality if it is clipped and we are skipping clips. if(cigarPtr != NULL) { cigarChar = cigarPtr->getCigarCharOpFromQueryIndex(index); } if(qualExcludeClips && Cigar::isClip(cigarChar)) { // Skip a clipped quality. ++index; // Increment the position. continue; } if(withinRegion && (myEndPos != -1) && (refPos >= myEndPos)) { // We have hit the end of the region, stop processing this // quality string. break; } if(withinRegion && (refPos < myStartPos)) { // This position is not in the target. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Check for valid quality. if((qual[index] < START_QUAL) || (qual[index] > MAX_QUAL)) { if(qual) { std::cerr << "Invalid Quality found: " << qual[index] << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } if(phred) { std::cerr << "Invalid Phred Quality found: " << qual[index] - PHRED_DIFF << ". Must be between " << START_QUAL << " and " << MAX_QUAL << ".\n"; } // Skip an invalid quality. ++index; // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } continue; } // Increment the count for this quality. ++(qualCount[(int)(qual[index])]); ++(phredCount[(int)(qual[index]) - PHRED_DIFF]); // Update the position if this is found in the reference or a clip. if(Cigar::foundInReference(cigarChar) || Cigar::isClip(cigarChar)) { ++refPos; } ++index; } } } // Check the next thing to do for the read. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases for this read. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); } } // Done with a section, move on to the next one. // New section, so flush the pileup. 
pileup.flushPileup(); } // Flush the rest of the pileup. if((baseQCPtr != NULL) || baseSum) { // Pileup the bases. pileup.processAlignmentRegion(samRecord, myStartPos, myEndPos, dbsnpListPtr); PileupElementBaseQCStats::printSummary(); ifclose(baseQCPtr); } std::cerr << "Number of records read = " << samIn.GetCurrentRecordCount() << std::endl; if(basic) { std::cerr << std::endl; samIn.PrintStatistics(); } // Print the quality stats. if(qual) { std::cerr << std::endl; std::cerr << "Quality\tCount\n"; for(int i = START_QUAL; i <= MAX_QUAL; i++) { std::cerr << i << "\t" << qualCount[i] << std::endl; } } // Print the phred quality stats. if(phred) { std::cerr << std::endl; std::cerr << "Phred\tCount\n"; for(int i = START_PHRED; i <= MAX_PHRED; i++) { std::cerr << i << "\t" << phredCount[i] << std::endl; } } SamStatus::Status status = samIn.GetStatus(); if(status == SamStatus::NO_MORE_RECS) { // A status of NO_MORE_RECS means that all reads were successful. status = SamStatus::SUCCESS; } return(status); }
/**
 * Advance `samIn` to the next section to be read.
 *
 * Without a region list this returns true exactly once (tracked by a
 * function-local static flag) and false on every later call. With a region
 * list, lines are consumed until one parses as "<ref>\t<start>\t<end>" with
 * start < end (or end == -1); that region is set on `samIn`, stored in
 * myStartPos/myEndPos, and true is returned. Malformed lines are reported on
 * std::cerr and skipped. False is returned once the list is exhausted.
 */
bool Stats::getNextSection(SamFile &samIn)
{
    static bool alreadyRead = false;

    if(myRegionList == NULL)
    {
        // No region list: hand out a single "whole input" section, and
        // remember that it has been handed out.
        const bool firstCall = !alreadyRead;
        alreadyRead = true;
        return(firstCall);
    }

    // A region list is present: scan forward for the next usable line.
    myStartPos = 0;
    myEndPos = 0;

    while(!ifeof(myRegionList))
    {
        myRegBuffer.Clear();
        myRegBuffer.ReadLine(myRegionList);
        if(myRegBuffer.IsEmpty())
        {
            // Blank line, try the next one.
            continue;
        }

        // Split the line into tab-separated columns.
        myRegColumn.ReplaceColumns(myRegBuffer, '\t');
        if(myRegColumn.Length() < 3)
        {
            // Too few columns.
            std::cerr << "Improperly formatted reg line: "
                      << myRegBuffer
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[1].AsInteger(myStartPos))
        {
            // 2nd column (start) is not numeric.
            std::cerr << "Improperly formatted region line, start position "
                      << "(2nd column) is not an integer: "
                      << myRegColumn[1]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if(!myRegColumn[2].AsInteger(myEndPos))
        {
            // 3rd column (end) is not numeric.
            std::cerr << "Improperly formatted region line, end position "
                      << "(3rd column) is not an integer: "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        if((myEndPos != -1) && (myStartPos >= myEndPos))
        {
            // Start must come before end unless end is the -1 marker.
            std::cerr << "Improperly formatted region line, the start position "
                      << "is >= end position: "
                      << myRegColumn[1]
                      << " >= "
                      << myRegColumn[2]
                      << "; Skipping to the next line.\n";
            continue;
        }

        // Usable region: select it and report success.
        samIn.SetReadSection(myRegColumn[0].c_str(), myStartPos, myEndPos);
        return(true);
    }

    // Ran out of lines without finding a usable region.
    return(false);
}
/*
  if a discordant read is mapped to MEI (overlap with MEI coord)
    add center of ( anchor_end + 3*avr_ins_var )
  skip unmap & clip
  check 3 types at the same time

  For every chromosome in pchr_list the BAM of sample `si` is scanned (whole
  chromosome, or range_start-range_end when range_start >= 0). Reads whose
  mate falls inside any meiCoord[] interval are grouped into clusters, and
  each finished cluster is handed to addClusterToMap() for that chromosome's
  entry in siteList.

  If the read section for a chromosome cannot be set, a warning is issued and
  that chromosome is skipped, so reads from some other position of the file
  are never attributed to it.
*/
void Sites::AddDiscoverSiteFromSingleBam( SingleSampleInfo & si )
{
/*for(int i=0;i<NMEI; i++) {
std::cout << "m=" << i << ": ";
for(map<string, map<int, bool> >::iterator t=meiCoord[m].begin(); t!=meiCoord[m].end(); t++)
std::cout << t->first << "->" << t->second.size() << " ";
std::cout << std::endl;
}*/

    avr_read_length = si.avr_read_length;
    avr_ins_size = si.avr_ins_size;
    min_read_length = avr_read_length / 3;
    current_depth = si.depth;
//  total_depth += current_depth;
    resetDepthAddFlag();

    SamFile bam;
    SamFileHeader bam_header;
    OpenBamAndBai( bam,bam_header, si.bam_name );
    for( size_t i=0; i<pchr_list->size(); i++ ) {
        string chr = (*pchr_list)[i];
//      if ( !single_chr.empty() && chr.compare(single_chr)!=0 )
//          continue;

        // operator[] creates the (empty) per-chromosome entry when it is missing.
        auto & chr_sites = siteList[chr];
        pnearest = chr_sites.begin();

        bool section_status;
        if (range_start<0) { // no range
            section_status = bam.SetReadSection( chr.c_str() );
            if (!section_status) {
                string str = "Cannot set read section at chr " + chr;
                morphWarning( str );
            }
        }
        else { // set range
            section_status = bam.SetReadSection( chr.c_str(), range_start, range_end );
            if (!section_status) {
                string str = "Cannot set read section at chr " + chr + " " + std::to_string(range_start) + "-" + std::to_string(range_end);
                morphWarning( str );
            }
        }
        // Without a valid section, ReadRecord would not be limited to this
        // chromosome, so nothing may be clustered for it.
        if (!section_status)
            continue;

        // DO ADDING
        SingleSite new_site; // temporary cluster. will be added to map later.
        new_site.depth = current_depth;
        bool start_new = 1;  // check if need to add new_site to map and start new new_site
        SamRecord rec;
        int between = 0; // count #reads after new_site.end. If end changed, add it to rcount and reset to zero
        while( bam.ReadRecord( bam_header, rec ) ) {
            if (!start_new) {
                if (rec.get1BasedPosition() >= new_site.end)
                    between++;
                else
                    new_site.rcount++;
            }
            if (rec.getFlag() & 0x2)
                continue;
            if ( OneEndUnmap( rec.getFlag() ) )
                continue;
            if ( IsSupplementary(rec.getFlag()) )
                continue;
            if ( rec.getReadLength() < min_read_length )
                continue;
            if ( rec.getMapQuality() < MIN_QUALITY )
                continue;
            // NOTE(review): getInsertSize() is compared unsigned-side only;
            // confirm whether negative insert sizes are meant to be skipped here.
            if (chr.compare(rec.getMateReferenceName())==0 && rec.getInsertSize() < abs(avr_ins_size*2))
                continue;

            bool is_mei = 0;
            // One flag per MEI type; never smaller than the NMEI entries the
            // loop below writes (the size used to be a hard-coded 3).
            vector<bool> is_in_coord;
            is_in_coord.resize( NMEI > 3 ? NMEI : 3, 0 );
            for(int m=0; m<NMEI; m++) {
                auto coord_chr_ptr = meiCoord[m].find(rec.getMateReferenceName());
                if (coord_chr_ptr == meiCoord[m].end())
                    is_in_coord[m] = 0;
                else
                    is_in_coord[m] = isWithinCoord( rec.get1BasedMatePosition(), coord_chr_ptr->second ); // within MEI coord
                if (is_in_coord[m])
                    is_mei = 1;
            }
            if (!is_mei)
                continue;

            if (start_new) {
                setNewCluster( is_in_coord, new_site,rec);
                start_new = 0;
                between = 0;
            }
            else { // add to existing cluster
                if ( rec.get1BasedPosition() > new_site.end + avr_ins_size ) { // start new coord
                    addClusterToMap(new_site, chr_sites);
                    setNewCluster( is_in_coord, new_site, rec);
                    start_new = 0;
                    between = 0;
                }
                else {
                    addToCurrentCluster( is_in_coord, new_site, rec);
                    new_site.rcount += between;
                    between = 0;
                }
            }
        }
        // add last one
        if (!start_new)
            addClusterToMap(new_site, chr_sites);
    }
    bam.Close();
}