// Fills this record from the alignment the given BAM reader currently exposes.
// Copies the alignment itself, then caches chromosome, coordinates (numeric and
// string forms), name, score, strand, mapping flags, CIGAR string, mate
// information, and the query bases/qualities. Always returns true.
bool BamRecord::initFromFile(BamFileReader *bamFileReader)
{
	setFileIdx(bamFileReader->getFileIdx());
	_bamAlignment = bamFileReader->getAlignment();

	// Chromosome name and id as reported by the reader.
	bamFileReader->getChrName(_chrName);
	_bamChromId = bamFileReader->getCurrChromdId();

	// Start/end are stored both as numbers and as pre-rendered strings.
	_startPos = bamFileReader->getStartPos();
	int2str(_startPos, _startPosStr);

	_endPos = bamFileReader->getEndPos();
	int2str(_endPos, _endPosStr);

	bamFileReader->getName(_name);
	bamFileReader->getScore(_score);

	char readerStrand = bamFileReader->getStrand();
	setStrand(readerStrand);

	// Mapping state of this read and of its mate.
	_isUnmapped = !bamFileReader->getAlignment().IsMapped();
	_isMateUnmapped = !bamFileReader->getAlignment().IsMateMapped();

	// Get the cigar data into a string.
	buildCigarStr();

	// Mate chromosome, mate position and insert size (the latter two as strings).
	bamFileReader->getMateChrName(_mateChrName);
	int2str(_bamAlignment.MatePosition, _matePos);
	int2str(_bamAlignment.InsertSize, _insertSize);

	// Raw sequence and quality strings straight from the alignment.
	_queryBases = _bamAlignment.QueryBases;
	_qualities = _bamAlignment.Qualities;

	return true;
}
// Reads a serialized BranchAcceptor from a stream.
// Layout consumed, in order: signal type, cutoff (read as text and converted
// to double), strand, consensus offset, a tag token followed by the WWAM
// branch-point submodel, and a tag token followed by the WAM acceptor
// submodel. The sensor's context window is the sum of both submodel windows.
void BranchAcceptor::load(istream &is)
{
  // Header fields
  SignalType loadedSignalType;
  is >> loadedSignalType;

  BOOM::String cutoffText;
  is >> cutoffText;

  Strand loadedStrand;
  is >> loadedStrand;

  int consensusOffset;
  is >> consensusOffset;

  setSignalType(loadedSignalType);
  setStrand(loadedStrand);
  setCutoff(cutoffText.asDouble());

  // Submodels; each is preceded by a model-type tag that is read and discarded
  BOOM::String modelTag;
  is >> modelTag; // will always be "WWAM"
  branchPoint = new WWAM(getGC(), is);
  is >> modelTag; // will always be "WAM"
  acceptor = new WAM(getGC(), is);

  // Consensus length is fixed at 2; window spans both submodels
  int totalWindowLength = branchPoint->getContextWindowLength();
  totalWindowLength += acceptor->getContextWindowLength();
  setSizes(2, consensusOffset, totalWindowLength);
}
// Fills this record from the line the reader currently holds.
// Fields are read by zero-based column index: 0 chrom, 1 source, 2 name,
// 3 start, 4 end, 5 score, 6 strand, 7 frame, and 8 group when the line
// has nine fields. Always returns true.
bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	fileReader->getField(0, _chrName);
	_chrId = fileReader->getCurrChromdId();

	// GFF start coordinates are one-based. The string form is deliberately left
	// untouched so the print methods can emit the original one-based number,
	// while the integer is decremented to match the 0-based convention used
	// internally by the other record types.
	fileReader->getField(3, _startPosStr);
	_startPos = str2chrPos(_startPosStr);
	--_startPos;

	// The end coordinate is taken from the file as-is (no adjustment).
	fileReader->getField(4, _endPosStr);
	_endPos = str2chrPos(_endPosStr);

	fileReader->getField(2, _name);
	fileReader->getField(1, _source);
	fileReader->getField(5, _score);

	// GFF allows a '.' for the strand, signifying it is not known.
	char strandField = 0;
	fileReader->getField(6, strandField);
	setStrand(strandField);

	fileReader->getField(7, _frame);

	// The ninth (group) column is only read when the line has nine fields.
	_numFields = fileReader->getNumFields();
	if (_numFields == 9) {
		fileReader->getField(8, _group);
	}
	return true;
}
// Training constructor: builds an order-`order` model from the sequences in v.
// If `strand` is EITHER_STRAND, the strand is derived from the content type.
// A forward-strand model additionally trains a reverse-strand companion on
// the reverse-complemented sequences and links the two to each other.
IMM::IMM(BOOM::Vector<TrainingSequence*> &v,int order,
         int minSampleSize,int phase,ContentType contentType,
         Strand strand)
  : N(order),
    alphabetSize(alphabet.getNumElements()),
    phase(phase),
    revComp(NULL),
    models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  setContentType(contentType);

  // Resolve an unspecified strand from the content type
  Strand resolvedStrand=strand;
  if(resolvedStrand==EITHER_STRAND)
    resolvedStrand=::getStrand(contentType);
  setStrand(resolvedStrand);

  buildModels(v,minSampleSize);

  // Only forward-strand models construct a companion
  if(resolvedStrand!=FORWARD_STRAND) return;

  BOOM::Vector<TrainingSequence*> reversedSeqs;
  revCompSeqs(v,reversedSeqs);
  revComp=new IMM(reversedSeqs,order,minSampleSize,phase,
                  ::reverseComplement(contentType),
                  REVERSE_STRAND);
  revComp->revComp=this; // back-link from the companion to this model
}
// Training constructor: records the content type, fixes the strand to
// PLUS_STRAND, and hands the sequences to buildModels() together with the
// minimum sample size and the requested order.
ThreePeriodicMarkovChain::ThreePeriodicMarkovChain(
    BOOM::Vector<TrainingSequence*> &sequences,
    int order,
    int minSampleSize,
    ContentType contentType)
{
  setContentType(contentType);
  setStrand(PLUS_STRAND);

  // Note the argument order expected by buildModels: sample size, then order
  buildModels(sequences,minSampleSize,order);
}
// Reads three MarkovChain submodels from the stream, each preceded by one
// token that is read and discarded. The content type of the first chain
// becomes this model's content type, and the strand is derived from it.
void ThreePeriodicMarkovChain::load(istream &is)
{
  BOOM::String leadingToken;
  int slot=0;
  while(slot<3)
    {
      is >> leadingToken;             // skipped; value is not inspected
      chains[slot]=new MarkovChain(is);
      ++slot;
    }

  ContentType loadedType=chains[0]->getContentType();
  setContentType(loadedType);
  setStrand(::getStrand(loadedType));
}
// Copy constructor: deep-copies the N+1 per-order probability tables of
// `other` and duplicates its content type and strand.
//
// Reverse-complement handling:
//   * INTERGENIC models use themselves as their own reverse complement.
//   * A forward-strand model whose source has a companion gets a fresh copy
//     of that companion. The copy is linked back to this object, mirroring
//     what the training constructor does (revComp->revComp=this), so that
//     the copied companion does not end up with a NULL reverse complement.
//   * Otherwise revComp stays NULL.
IMM::IMM(const IMM &other)
  : N(other.N),
    phase(other.phase),
    alphabetSize(other.alphabetSize),
    revComp(NULL),
    models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  // One table per order 0..N
  for(int i=0 ; i<=N ; ++i)
    models->push_back(new BOOM::StringMap<double>(*(*other.models)[i]));

  setContentType(other.getContentType());
  setStrand(other.getStrand());

  if(getContentType()==INTERGENIC)
    revComp=this;
  else if(other.revComp && getStrand()==FORWARD_STRAND)
    {
      revComp=new IMM(*other.revComp);
      // The nested copy is not a forward-strand model, so it leaves its own
      // revComp NULL; restore the two-way link here.
      revComp->revComp=this;
    }
}
// Training constructor.
//
// Each training sequence is split at branchContextLength: the leading part
// trains the branch-point sensor (a WWAM) and the remainder trains the
// acceptor sensor (a WAM). The overall sensor length is taken from the first
// training sequence.
//
// Parameters:
//   gc                  - garbage collector passed to the base class and to
//                         both submodels
//   seqs                - training sequences; must not be empty
//   branchPointOrder    - order passed to the WWAM
//   acceptorOrder       - order passed to the WAM
//   branchContextLength - number of leading positions given to the WWAM
//   minSampleSize       - passed through to both submodels
//   consensusOffset     - offset of the consensus within the full window
//   signalType          - signal type for this sensor and both submodels
//
// Throws BOOM::String if seqs is empty (previously this dereferenced
// seqs[0] unconditionally).
BranchAcceptor::BranchAcceptor(GarbageCollector &gc,
                               BOOM::Vector<TrainingSequence*> &seqs,
                               int branchPointOrder,
                               int acceptorOrder,
                               int branchContextLength,
                               int minSampleSize,
                               int consensusOffset,
                               SignalType signalType)
  : SignalSensor(gc),
    branchPoint(NULL),
    acceptor(NULL)
{
  // The window length is read from the first sequence, so there must be one
  if(seqs.begin()==seqs.end())
    throw BOOM::String("BranchAcceptor: no training sequences provided");

  // Misc. initialization
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);
  int sensorLength=seqs[0]->getLength();
  int acceptorContextLength=sensorLength-branchContextLength;
  setSizes(2,consensusOffset,sensorLength);

  // Split training sequences into branch point windows and acceptor
  // windows.  The per-sequence pieces get their own names so they do not
  // shadow the branchPoint/acceptor data members.
  BOOM::Vector<TrainingSequence*> branchPoints, acceptors;
  BOOM::Vector<TrainingSequence*>::iterator cur=seqs.begin(),
    end=seqs.end();
  for(; cur!=end ; ++cur)
    {
      TrainingSequence &S=**cur;
      TrainingSequence *branchWindow=new TrainingSequence();
      TrainingSequence *acceptorWindow=new TrainingSequence();
      S.getSubsequence(0,branchContextLength,*branchWindow);
      S.getSubsequence(branchContextLength,acceptorContextLength,
                       *acceptorWindow);
      branchPoints.push_back(branchWindow);
      acceptors.push_back(acceptorWindow);
    }

  // Train the branch point sensor & the acceptor sensor.  The acceptor's
  // consensus offset is expressed relative to the start of its own window.
  acceptor=new WAM(gc,acceptors,acceptorOrder,minSampleSize,
                   signalType,consensusOffset-branchContextLength,2);
  branchPoint=new WWAM(gc,branchPoints,branchPointOrder,minSampleSize,
                       signalType,0,0);

  // Delete the training subsequences
  cur=branchPoints.begin(); end=branchPoints.end();
  for(; cur!=end ; ++cur) delete *cur;
  cur=acceptors.begin(); end=acceptors.end();
  for(; cur!=end ; ++cur) delete *cur;
}
// Sets start, stop and strand in one call.
// In builds where assert is active, the (start, stop) pair is first checked
// with inRange(); the three setters are then applied in order.
void Interval::setInterval(Domain start, Domain stop, int str)
{
  assert(inRange(start, stop));

  setStart(start);
  setStop(stop);
  setStrand(str);
}
// Deserializing constructor: starts with no reverse-complement companion and
// an empty table list, records the requested strand, then delegates to
// load(is).
// NOTE(review): N, phase and alphabetSize are not initialized here;
// presumably load() assigns them -- confirm.
IMM::IMM(istream &is,Strand strand)
  : revComp(NULL),
    models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  setStrand(strand);
  load(is);
}
// Lightweight constructor: only records the strand and the content type.
// No chains are created or loaded here.
ThreePeriodicMarkovChain::ThreePeriodicMarkovChain(Strand strand,
                                                   ContentType contentType)
{
  setStrand(strand);
  setContentType(contentType);
}
// Training constructor for the BWM.
//
// Builds a position-specific matrix over the training sequences. At each
// position a binomial test compares the most deviant symbol count against
// the background expectation; positions that are not significantly
// different (P > alpha) are replaced by the background distribution.
// The matrix is finally normalized and converted to log space.
//
// Parameters:
//   gc              - garbage collector passed to the WMM base class
//   sequences       - training sequences; must not be empty and must all
//                     have the same length
//   signalType      - signal type recorded in the base class
//   consensusOffset - passed to setSizes()
//   consensusLength - passed to setSizes()
//   gcContent       - background G+C fraction; A/T and C/G each get half
//                     of their respective share
//   alpha           - significance threshold for the binomial test
//
// Throws BOOM::String if `sequences` is empty (previously sequences[0] was
// dereferenced unconditionally) or if the sequences differ in length.
BWM::BWM(GarbageCollector &gc,BOOM::Vector<TrainingSequence*> &sequences,
         SignalType signalType,int consensusOffset,int consensusLength,
         float gcContent,float alpha)
  : WMM(gc)
{
  // Set some things in the base class
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);

  // The window length is read from the first sequence, so there must be one
  int n=sequences.size();
  if(n==0)
    throw BOOM::String("BWM: no training sequences provided");

  // Compute background single-nucleotide probabilities
  float atContent=1-gcContent;
  float AT=atContent/2, GC=gcContent/2;
  BOOM::Array1D<float> background(alphabet.getNumElements());
  background.setAllTo(0);
  Symbol A=alphabet.lookup('A'), T=alphabet.lookup('T');
  Symbol C=alphabet.lookup('C'), G=alphabet.lookup('G');
  background[A]=AT;
  background[T]=AT;
  background[C]=GC;
  background[G]=GC;

  // Allocate the underlying WMM matrix
  float pseudocount=0;
  int len=sequences[0]->getLength();
  setSizes(consensusLength,consensusOffset,len);
  int nAlpha=alphabet.getNumElements();
  matrix.resize(len,nAlpha);
  matrix.setAllTo(pseudocount);

  // Iterate through the training sequences & collect counts
  BOOM::FloatArray1D effectiveSize(len);
  effectiveSize.setAllTo(0);
  for(int i=0 ; i<n ; ++i)
    {
      TrainingSequence &seq=*sequences[i];
      int l=seq.getLength();
      if(l!=len)
        throw BOOM::String("length mismatch in BWM: ")+len+" vs "+l;
      // The boost count is a per-sequence weight, so read it once
      int count=seq.getBoostCount();
      for(int pos=0 ; pos<len ; ++pos)
        {
          Symbol s=seq[pos];
          matrix[pos][s]+=count;
          effectiveSize[pos]+=count;
        }
    }

  // Offset added before truncating to int.
  // NOTE(review): this is 5/9.0 (~0.556), not 0.5 -- confirm it is intended.
  const double roundingOffset=5/9.0;

  // Perform a binomial test at each position to see if frequencies
  // are significantly different from the background frequencies
  numSignificantPositions=0;
  for(int pos=0 ; pos<len ; ++pos)
    {
      // First, we have to identify the most extreme count
      int mostExtremeDiff=0;
      Symbol mostExtremeSymbol=0;
      int sampleSize=int(effectiveSize[pos]+roundingOffset);
      for(Symbol s=0 ; s<nAlpha ; ++s)
        {
          int expected=int(background[s]*sampleSize+roundingOffset);
          int observed=(int)matrix[pos][s];
          int diff=abs(expected-observed);
          if(diff>mostExtremeDiff)
            {
              mostExtremeDiff=diff;
              mostExtremeSymbol=s;
            }
        }

      // Apply binomial distribution to get P-value for this count
      float p=background[mostExtremeSymbol];
      int expected=int(p*sampleSize+roundingOffset);
      int observed=expected+mostExtremeDiff; // +/- are symmetric, so use +
      double P=
        BinomialDistribution::rightTailedPValue(observed,sampleSize,p);

      // If not significantly different from background, then the observed
      // frequencies are probably just a statistical fluctuation due to
      // small sample size, so use the background probabilities instead
      if(P>alpha)
        {
          effectiveSize[pos]=0;
          for(Symbol s=0 ; s<nAlpha ; ++s)
            {
              matrix[pos][s]=background[s]*sampleSize;
              effectiveSize[pos]+=matrix[pos][s];
            }
        }
      else
        {
          cout<<pos<<" : using OBSERVED frequencies, p="<<P<<endl;
          ++numSignificantPositions;
        }
    }

  // Normalize counts into probabilities
  for(int pos=0 ; pos<len ; ++pos)
    for(Symbol s=0 ; s<nAlpha ; ++s)
      matrix[pos][s]/=effectiveSize[pos];

  // Convert probabilities to log-probabilities
  convertToLogs();
}