/**
 * Restores this sensor from a stream.
 *
 * Fields are consumed in this order: signal type, cutoff (read as text and
 * converted to a double), strand, consensus offset, a tag token followed by
 * the branch point WWAM, and another tag token followed by the acceptor WAM.
 * The context window length is the sum of the two sub-models' window lengths,
 * and the consensus length is fixed at 2.
 */
void BranchAcceptor::load(istream &is)
{
  // Header fields, in stream order
  SignalType type;
  is >> type;

  BOOM::String cutoffText;
  is >> cutoffText;
  double threshold=cutoffText.asDouble();

  Strand whichStrand;
  is >> whichStrand;

  int offset;
  is >> offset;

  setSignalType(type);
  setStrand(whichStrand);
  setCutoff(threshold);

  // Each sub-model is preceded by a tag token, which is read and discarded
  BOOM::String modelTag;
  is >> modelTag; // will always be "WWAM"
  branchPoint=new WWAM(getGC(),is);
  is >> modelTag; // will always be "WAM"
  acceptor=new WAM(getGC(),is);

  // The combined window spans both sub-models
  int branchWindow=branchPoint->getContextWindowLength();
  int acceptorWindow=acceptor->getContextWindowLength();
  setSizes(2,offset,branchWindow+acceptorWindow);
}
/**
 * Loads this SignalWidget from an XMI element.
 *
 * The base-class loader runs first; if it fails, nothing else is read.
 * Afterwards the "signalname", "documentation" and "signaltype" attributes
 * are applied, each defaulting to an empty string when absent.
 *
 * @param qElement  the XMI element to read from
 * @return false if UMLWidget::loadFromXMI fails, true otherwise
 */
bool SignalWidget::loadFromXMI( QDomElement & qElement )
{
    const bool baseLoaded = UMLWidget::loadFromXMI( qElement );
    if( !baseLoaded )
        return false;

    setName( qElement.attribute( "signalname", "" ) );
    setDocumentation( qElement.attribute( "documentation", "" ) );

    // The signal type is stored as the integer value of the enumerator
    const QString typeText = qElement.attribute( "signaltype", "" );
    setSignalType( static_cast<SignalType>( typeText.toInt() ) );

    return true;
}
/**
 * Trains a BranchAcceptor from a set of training windows.
 *
 * The length of the first training sequence is taken as the sensor's context
 * window length.  Every sequence is cut at branchContextLength: the leading
 * part trains the branch point sensor (a WWAM) and the remainder trains the
 * acceptor sensor (a WAM).  The temporary subsequences are deleted once both
 * sensors have been trained.
 *
 * @param gc                   passed to the base class and both sub-sensors
 * @param seqs                 training sequences; must not be empty
 * @param branchPointOrder     order passed to the branch point WWAM
 * @param acceptorOrder        order passed to the acceptor WAM
 * @param branchContextLength  number of leading positions given to the
 *                             branch point sensor; must lie within
 *                             [0, length of the first sequence]
 * @param minSampleSize        passed to both sub-sensors
 * @param consensusOffset      consensus offset within the full window
 * @param signalType           signal type for this sensor and both sub-sensors
 * @throws BOOM::String if seqs is empty or branchContextLength is out of range
 */
BranchAcceptor::BranchAcceptor(GarbageCollector &gc,
                               BOOM::Vector<TrainingSequence*> &seqs,
                               int branchPointOrder,
                               int acceptorOrder,
                               int branchContextLength,
                               int minSampleSize,
                               int consensusOffset,
                               SignalType signalType)
  : SignalSensor(gc),
    branchPoint(NULL),
    acceptor(NULL)
{
  // ctor

  // The first sequence defines the window length, so there must be one
  if(seqs.size()<1)
    throw BOOM::String("no training sequences given to BranchAcceptor");

  // Misc. initialization
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);
  int sensorLength=seqs[0]->getLength();

  // The split point must fall inside the window; otherwise one of the two
  // sub-windows would have a negative length
  if(branchContextLength<0 || branchContextLength>sensorLength)
    throw BOOM::String("bad branch context length in BranchAcceptor: ")+
      branchContextLength+" vs "+sensorLength;

  int acceptorContextLength=sensorLength-branchContextLength;
  setSizes(2,consensusOffset,sensorLength);

  // Split training sequences into branch point windows and acceptor
  // windows
  BOOM::Vector<TrainingSequence*> branchPoints, acceptors;
  BOOM::Vector<TrainingSequence*>::iterator cur=seqs.begin(),
    end=seqs.end();
  for(; cur!=end ; ++cur)
    {
      TrainingSequence &S=**cur;
      // Locals are named *Window so they do not shadow the member sensors
      TrainingSequence *branchWindow=new TrainingSequence();
      TrainingSequence *acceptorWindow=new TrainingSequence();
      S.getSubsequence(0,branchContextLength,*branchWindow);
      S.getSubsequence(branchContextLength,acceptorContextLength,
                       *acceptorWindow);
      branchPoints.push_back(branchWindow);
      acceptors.push_back(acceptorWindow);
    }

  // Train the branch point sensor & the acceptor sensor.  The acceptor is
  // given the offset shifted by branchContextLength, presumably so that it
  // is relative to the start of the acceptor sub-window.
  acceptor=new WAM(gc,acceptors,acceptorOrder,minSampleSize,
                   signalType,consensusOffset-branchContextLength,2);
  branchPoint=new WWAM(gc,branchPoints,branchPointOrder,minSampleSize,
                       signalType,0,0);

  // Delete the training subsequences
  cur=branchPoints.begin(); end=branchPoints.end();
  for(; cur!=end ; ++cur) delete *cur;
  cur=acceptors.begin(); end=acceptors.end();
  for(; cur!=end ; ++cur) delete *cur;
}
/**
 * Trains a BWM from a set of equal-length training sequences.
 *
 * Per-position symbol counts (weighted by each sequence's boost count) are
 * collected into the underlying WMM matrix.  At every position the symbol
 * whose count deviates most from the background expectation is tested with a
 * right-tailed binomial P-value; if that P-value exceeds alpha the position
 * is filled with the background expectations instead of the observed counts.
 * Counts are then normalized per position and converted to logs.
 *
 * @param gc               passed to the WMM base class
 * @param sequences        training sequences; must not be empty and must all
 *                         have the same length as the first one
 * @param signalType       signal type recorded in the base class
 * @param consensusOffset  passed to setSizes
 * @param consensusLength  passed to setSizes
 * @param gcContent        background G+C fraction: C and G each receive
 *                         gcContent/2, A and T each (1-gcContent)/2
 * @param alpha            threshold on the P-value; positions with P>alpha
 *                         use the background probabilities
 * @throws BOOM::String if sequences is empty or the lengths differ
 */
BWM::BWM(GarbageCollector &gc,BOOM::Vector<TrainingSequence*> &sequences,
         SignalType signalType,int consensusOffset,int consensusLength,
         float gcContent,float alpha)
  : WMM(gc)
{
  /*
    This constructor performs training of the BWM
   */

  // The first sequence defines the matrix length, so there must be one
  int n=sequences.size();
  if(n<1) throw BOOM::String("no training sequences given to BWM");

  // Set some things in the base class
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);

  // Compute background single-nucleotide probabilities
  float atContent=1-gcContent;
  float AT=atContent/2, GC=gcContent/2;
  BOOM::Array1D<float> background(alphabet.getNumElements());
  background.setAllTo(0);
  Symbol A=alphabet.lookup('A'), T=alphabet.lookup('T');
  Symbol C=alphabet.lookup('C'), G=alphabet.lookup('G');
  background[A]=AT;
  background[T]=AT;
  background[C]=GC;
  background[G]=GC;

  // Allocate the underlying WMM matrix
  float pseudocount=0;
  int len=sequences[0]->getLength();
  setSizes(consensusLength,consensusOffset,len);
  int nAlpha=alphabet.getNumElements();
  matrix.resize(len,nAlpha);
  matrix.setAllTo(pseudocount);

  // Iterate through the training sequences & collect counts
  BOOM::FloatArray1D effectiveSize(len);
  effectiveSize.setAllTo(0);
  for(int i=0 ; i<n ; ++i)
    {
      TrainingSequence &seq=*sequences[i];
      int l=seq.getLength();
      if(l!=len) throw BOOM::String("length mismatch in BWM: ")+len+" vs "+l;

      // The boost count belongs to the sequence, not the position, so it is
      // read once per sequence
      int count=seq.getBoostCount();
      for(int pos=0 ; pos<len ; ++pos)
        {
          Symbol s=seq[pos];
          matrix[pos][s]+=count;
          effectiveSize[pos]+=count;
        }
    }

  // Perform a binomial test at each position to see if frequencies
  // are significantly different from the background frequencies
  numSignificantPositions=0;
  for(int pos=0 ; pos<len ; ++pos)
    {
      // First, we have to identify the most extreme count.
      // NOTE(review): values are rounded with an offset of 5/9.0 (~0.556)
      // before truncation rather than 0.5 -- confirm this is intended.
      int mostExtremeDiff=0;
      Symbol mostExtremeSymbol=0;
      int sampleSize=int(effectiveSize[pos]+5/9.0);
      for(Symbol s=0 ; s<nAlpha ; ++s)
        {
          int expected=int(background[s]*sampleSize+5/9.0);
          int observed=(int)matrix[pos][s];
          int diff=abs(expected-observed);
          if(diff>mostExtremeDiff)
            {
              mostExtremeDiff=diff;
              mostExtremeSymbol=s;
            }
        }

      // Apply binomial distribution to get P-value for this count
      float p=background[mostExtremeSymbol];
      int expected=int(p*sampleSize+5/9.0);
      int observed=expected+mostExtremeDiff; // +/- are symmetric, so use +
      double P=
        BinomialDistribution::rightTailedPValue(observed,sampleSize,p);

      // If not significantly different from background, then the observed
      // frequencies are probably just a statistical fluctuation due to
      // small sample size, so use the background probabilities instead
      if(P>alpha)
        {
          // Rebuild the row and its total from the background expectations
          effectiveSize[pos]=0;
          for(Symbol s=0 ; s<nAlpha ; ++s)
            {
              matrix[pos][s]=background[s]*sampleSize;
              effectiveSize[pos]+=matrix[pos][s];
            }
        }
      else
        {
          cout<<pos<<" : using OBSERVED frequencies, p="<<P<<endl;
          ++numSignificantPositions;
        }
    }

  // Normalize counts into probabilities
  for(int pos=0 ; pos<len ; ++pos)
    for(Symbol s=0 ; s<nAlpha ; ++s)
      matrix[pos][s]/=effectiveSize[pos];

  // Convert probabilities to log-probabilities
  convertToLogs();
}