Example #1
0
// Restores this BranchAcceptor from a stream.  Fields are consumed in the
// following order: signal type, cutoff (read as a string and converted to
// a double), strand, consensus offset, a tag token followed by the WWAM
// branch-point submodel, and a tag token followed by the WAM acceptor
// submodel.  The context window length is the sum of the two submodels'
// context window lengths.
void BranchAcceptor::load(istream &is)
{
  // Header fields, in stream order
  SignalType type;
  is >> type;

  BOOM::String cutoffText;
  is >> cutoffText;
  double threshold=cutoffText.asDouble();

  Strand whichStrand;
  is >> whichStrand;

  int offset;
  is >> offset;

  setSignalType(type);
  setStrand(whichStrand);
  setCutoff(threshold);

  // Each submodel is preceded by a tag token that is read and discarded
  BOOM::String tag;
  is >> tag; // will always be "WWAM"
  branchPoint=new WWAM(getGC(),is);
  is >> tag; // will always be "WAM"
  acceptor=new WAM(getGC(),is);

  int windowLength=branchPoint->getContextWindowLength();
  windowLength+=acceptor->getContextWindowLength();
  setSizes(2,offset,windowLength);
}
Example #2
0
/**
 * Reimplemented from UMLWidget::loadFromXMI to load
 * SignalWidget from XMI.
 *
 * After the base class has loaded its own state, the "signalname",
 * "documentation" and "signaltype" attributes are read from the element
 * (each falling back to an empty string when absent) and applied to
 * this widget.
 *
 * @param qElement  the XMI element to read from
 * @return false if the base-class load fails, true otherwise
 */
bool SignalWidget::loadFromXMI( QDomElement & qElement )
{
    const bool baseLoaded = UMLWidget::loadFromXMI( qElement );
    if( !baseLoaded ) {
        return false;
    }

    setName( qElement.attribute( "signalname", "" ) );
    setDocumentation( qElement.attribute( "documentation", "" ) );

    // The signal type is stored as its integer value
    const int typeCode = qElement.attribute( "signaltype", "" ).toInt();
    setSignalType( static_cast<SignalType>( typeCode ) );

    return true;
}
Example #3
0
/*
  Training constructor.  Each training sequence is split into a leading
  branch-point window of branchContextLength symbols and a trailing
  acceptor window covering the remainder; a WWAM is trained on the
  branch-point windows and a WAM on the acceptor windows.

  gc                  - passed to the SignalSensor base and to both submodels
  seqs                - training sequences; must be non-empty (the sensor
                        length is taken from the first one)
  branchPointOrder    - order passed to the WWAM
  acceptorOrder       - order passed to the WAM
  branchContextLength - length of the branch-point window
  minSampleSize       - passed to both submodels
  consensusOffset     - offset of the consensus within the full window
  signalType          - signal type recorded on this sensor and both submodels

  Throws a BOOM::String if seqs is empty.
*/
BranchAcceptor::BranchAcceptor(GarbageCollector &gc,
                               BOOM::Vector<TrainingSequence*> &seqs,
                               int branchPointOrder,
                               int acceptorOrder,
                               int branchContextLength,
                               int minSampleSize,
                               int consensusOffset,
                               SignalType signalType)
  : SignalSensor(gc),
    branchPoint(NULL),
    acceptor(NULL)
{
  // ctor

  // The sensor length comes from the first training sequence, so an
  // empty training set cannot be handled
  if(seqs.size()==0)
    throw BOOM::String("BranchAcceptor: no training sequences provided");

  // Misc. initialization
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);
  int sensorLength=seqs[0]->getLength();
  int acceptorContextLength=sensorLength-branchContextLength;
  setSizes(2,consensusOffset,sensorLength);

  // Split training sequences into branch point windows and acceptor
  // windows
  BOOM::Vector<TrainingSequence*> branchPoints, acceptors;
  BOOM::Vector<TrainingSequence*>::iterator cur=seqs.begin(),
    end=seqs.end();
  for(; cur!=end ; ++cur)
    {
      TrainingSequence &S=**cur;
      // Named *Window so they do not shadow the branchPoint/acceptor
      // data members assigned below
      TrainingSequence *branchWindow=new TrainingSequence();
      TrainingSequence *acceptorWindow=new TrainingSequence();
      S.getSubsequence(0,branchContextLength,*branchWindow);
      S.getSubsequence(branchContextLength,acceptorContextLength,
                       *acceptorWindow);
      branchPoints.push_back(branchWindow);
      acceptors.push_back(acceptorWindow);
    }

  // Train the branch point sensor & the acceptor sensor.  The acceptor's
  // consensus offset is expressed relative to the start of its own window.
  acceptor=new WAM(gc,acceptors,acceptorOrder,minSampleSize,
                   signalType,consensusOffset-branchContextLength,2);
  branchPoint=new WWAM(gc,branchPoints,branchPointOrder,minSampleSize,
                       signalType,0,0);

  // Delete the training subsequences
  cur=branchPoints.begin(); end=branchPoints.end();
  for(; cur!=end ; ++cur) delete *cur;
  cur=acceptors.begin(); end=acceptors.end();
  for(; cur!=end ; ++cur) delete *cur;
}
Example #4
0
File: BWM.C Project: bmajoros/EGGS
/*
  Training constructor for the BWM.

  Per-position symbol counts (weighted by each sequence's boost count)
  are collected from the training sequences.  At every position a
  binomial test compares the most deviant symbol count against the
  background model; positions whose P-value exceeds alpha are replaced
  by the background frequencies, the rest keep their observed
  frequencies.  The matrix is then normalized and converted to logs.

  gc              - passed to the WMM base class
  sequences       - training sequences; must be non-empty and all of the
                    same length
  signalType      - signal type recorded on this sensor
  consensusOffset - passed to setSizes()
  consensusLength - passed to setSizes()
  gcContent       - G+C fraction; G and C each get gcContent/2 and
                    A and T each get (1-gcContent)/2 as background
  alpha           - significance threshold for the binomial test

  Throws a BOOM::String if sequences is empty or if the sequences are not
  all the same length.
*/
BWM::BWM(GarbageCollector &gc,BOOM::Vector<TrainingSequence*> &sequences,
         SignalType signalType,int consensusOffset,int consensusLength,
         float gcContent,float alpha)
  : WMM(gc)
{
  /*
    This constructor performs training of the BWM
   */

  // The window length comes from the first training sequence, so an
  // empty training set cannot be handled
  if(sequences.size()==0)
    throw BOOM::String("no training sequences provided to BWM");

  // Set some things in the base class
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);

  // Compute background single-nucleotide probabilities
  float atContent=1-gcContent;
  float AT=atContent/2, GC=gcContent/2;
  BOOM::Array1D<float> background(alphabet.getNumElements());
  background.setAllTo(0); // symbols other than A,C,G,T keep probability 0
  Symbol A=alphabet.lookup('A'), T=alphabet.lookup('T');
  Symbol C=alphabet.lookup('C'), G=alphabet.lookup('G');
  background[A]=AT;
  background[T]=AT;
  background[C]=GC;
  background[G]=GC;

  // Allocate the underlying WMM matrix
  float pseudocount=0;
  int n=sequences.size();
  int len=sequences[0]->getLength();
  setSizes(consensusLength,consensusOffset,len);
  int nAlpha=alphabet.getNumElements();
  matrix.resize(len,nAlpha);
  matrix.setAllTo(pseudocount);

  // Iterate through the training sequences & collect counts
  BOOM::FloatArray1D effectiveSize(len);
  effectiveSize.setAllTo(0);
  for(int i=0 ; i<n ; ++i)
    {
      TrainingSequence &seq=*sequences[i];
      int l=seq.getLength();
      if(l!=len) throw BOOM::String("length mismatch in BWM: ")+len+" vs "+l;
      for(int pos=0 ; pos<len ; ++pos)
        {
          Symbol s=seq[pos];
          int count=seq.getBoostCount();
          matrix[pos][s]+=count;
          effectiveSize[pos]+=count;
        }
    }

  // Perform a binomial test at each position to see if frequencies
  // are significantly different from the background frequencies
  numSignificantPositions=0;
  for(int pos=0 ; pos<len ; ++pos)
    {
      // First, we have to identify the most extreme count
      int mostExtremeDiff=0;
      Symbol mostExtremeSymbol=0;
      // 5/9.0 is added before truncating to int, i.e. a rounding bias.
      // NOTE(review): conventional rounding would add 0.5 -- confirm that
      // 5/9.0 is intended.
      int sampleSize=int(effectiveSize[pos]+5/9.0);
      for(Symbol s=0 ; s<nAlpha ; ++s)
        {
          int expected=int(background[s]*sampleSize+5/9.0);
          int observed=(int)matrix[pos][s];
          int diff=abs(expected-observed);
          if(diff>mostExtremeDiff)
            {
              mostExtremeDiff=diff;
              mostExtremeSymbol=s;
            }
        }

      // Apply binomial distribution to get P-value for this count
      float p=background[mostExtremeSymbol];
      int expected=int(p*sampleSize+5/9.0);
      int observed=expected+mostExtremeDiff; // +/- are symmetric, so use +
      double P=
        BinomialDistribution::rightTailedPValue(observed,sampleSize,p);

      // If not significantly different from background, then the observed
      // frequencies are probably just a statistical fluctuation due to
      // small sample size, so use the background probabilities instead
      if(P>alpha)
        {
          effectiveSize[pos]=0;
          for(Symbol s=0 ; s<nAlpha ; ++s)
            {
              matrix[pos][s]=background[s]*sampleSize;
              effectiveSize[pos]+=matrix[pos][s];
            }
        }
      else
        {
          cout<<pos<<" : using OBSERVED frequencies, p="<<P<<endl;
          ++numSignificantPositions;
        }
    }

  // Normalize counts into probabilities
  // NOTE(review): effectiveSize[pos] is used as a divisor without a zero
  // check; it would be zero if every boost count were zero -- confirm
  // that boost counts are always positive.
  for(int pos=0 ; pos<len ; ++pos)
    for(Symbol s=0 ; s<nAlpha ; ++s)
      matrix[pos][s]/=effectiveSize[pos];

  // Convert probabilities to log-probabilities
  convertToLogs();
}