Beispiel #1
0
/*
  Computes the entropy of the distribution of single characters in str.

  Every character of str is tallied, each tally is converted to a
  relative frequency p, and -sum(p*lg(p)) is accumulated.  lg() is
  defined outside this function (presumably log base 2 -- confirm).

  str        : the sequence to analyze
  maxEntropy : output; set to -lg(1/k) where k is the number of distinct
               characters in str, or to 0 when str is empty
  returns    : the entropy, capped at maxEntropy; 0 when str is empty
 */
double BOOM::SequenceEntropy::entropy(const BOOM::String &str,
				    double &maxEntropy)
{
  int len=str.length();

  // Empty input: nothing was observed.  Without this guard counts.size()
  // is 0 below, so maxEntropy (and, through the cap, the return value)
  // would be derived from a division by zero.
  if(len<=0)
    {
      maxEntropy=0;
      return 0;
    }

  // Tally each single character; the table is sized by hashTableSize(0),
  // which is defined elsewhere.
  BOOM::StringMap<int> counts(hashTableSize(0));
  const char *p=str.c_str();
  int total=0;
  for(int i=0 ; i<len ; ++i, ++p)
    {
      if(counts.isDefined(p,1)) 
	++counts.lookup(p,1);
      else 
	counts.lookup(p,1)=1;
      ++total;
    }

  // Accumulate -sum(p*lg(p)) over the distinct characters.
  double entropy=0;
  StringMapIterator<int> cur=counts.begin(), end=counts.end();
  for(; cur!=end ; ++cur)
    {
      int count=(*cur).second;
      double p=count/double(total);
      entropy-=p*lg(p);
    }

  // Entropy of a uniform distribution over the observed characters;
  // the result is never allowed to exceed it.
  maxEntropy=-lg(1.0/counts.size());
  if(entropy>maxEntropy) entropy=maxEntropy;
  return entropy;
}
Beispiel #2
0
/*
  Computes the entropy of the distribution of (order+1)-character
  windows of str.

  A window of order+1 characters is slid over str one position at a
  time; each distinct window is tallied, the tallies are converted to
  relative frequencies p, and -sum(p*lg(p)) is accumulated.  lg() is
  defined outside this function.

  str        : the sequence to analyze
  order      : the window size is order+1
  maxEntropy : output; set to -lg(1/k) where k is the number of distinct
               windows observed
  returns    : the entropy, capped at maxEntropy
  throws     : the "Order ... is too large ..." message when order+1 is
               not smaller than the length of str
 */
double BOOM::SequenceEntropy::jointEntropy(const BOOM::String &str,
					   int order,
					   double &maxEntropy)
{
  int seqLen=str.length();
  const int gramSize=order+1;
  if(gramSize>=seqLen) 
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;

  // Tally every window; the table is sized by hashTableSize(order),
  // which is defined elsewhere.
  BOOM::StringMap<int> gramCounts(hashTableSize(order));
  int numGrams=0;
  const char *window=str.c_str();
  for(int remaining=seqLen-gramSize+1 ; remaining>0 ; --remaining, ++window)
    {
      if(!gramCounts.isDefined(window,gramSize)) 
	gramCounts.lookup(window,gramSize)=1;
      else 
	++gramCounts.lookup(window,gramSize);
      ++numGrams;
    }

  // Accumulate -sum(p*lg(p)) over the distinct windows.
  double H=0;
  for(StringMapIterator<int> it=gramCounts.begin(), last=gramCounts.end() ;
      it!=last ; ++it)
    {
      const double freq=(*it).second/double(numGrams);
      H-=freq*lg(freq);
    }

  // Cap the result at the entropy of a uniform distribution over the
  // observed windows.
  maxEntropy=-lg(1.0/gramCounts.size());
  return H>maxEntropy ? maxEntropy : H;
}
Beispiel #3
0
/*
  Appends to `margins` the stretch of seq that starts at featureEnd and
  extends for up to `margin` characters, truncated at the end of seq.

  featureEnd : index in seq at which the stretch begins
  seq        : the sequence the stretch is taken from

  NOTE(review): featureEnd itself is not range-checked here; callers are
  assumed to pass an index within seq -- confirm.
 */
void Application::processForwardFeature(int featureEnd,
					const BOOM::String &seq)
{
  int begin=featureEnd;
  int end=begin+margin;   // exclusive: the stretch is [begin,end)
  int len=seq.length();

  // end is an exclusive bound, so the largest legal value is len.
  // Clamping to len-1 would drop the final character whenever the margin
  // reaches the end of the sequence.
  if(end>len) end=len;

  BOOM::String subseq=seq.substr(begin,end-begin);
  margins.push_back(subseq);
}
/*
  Returns the concatenation of the sequences of all exons, in the order
  they are stored in `exons`.  If the concatenation begins with 'M',
  that first character is removed (presumably the initiator
  methionine -- confirm).
 */
BOOM::String SignalPeptide::getSequence()
{
  // Stitch the per-exon sequences together.
  BOOM::String peptide;
  const int numExons=exons.size();
  int which=0;
  while(which<numExons)
    {
      peptide+=exons[which].getSequence();
      ++which;
    }

  // Nothing to strip unless the first character is 'M'.
  if(peptide[0]!='M') return peptide;

  // Keep everything after the leading 'M'.
  peptide=peptide.substring(1,peptide.length()-1);
  return peptide;
}
Beispiel #5
0
/*
  Registers s as a consensus string of this sensor.

  Once a positive consensusLength has been recorded, every further
  consensus must have exactly that length; otherwise a BOOM::String
  describing the mismatch is thrown.  On success consensusLength is set
  to the length of s and s is stored in `consensuses` with the value
  char(1).
 */
void SignalSensor::addConsensus(const BOOM::String &s)
{
  const int newLength=s.length();

  // A length recorded earlier must be matched exactly.
  const bool lengthAlreadySet=(consensusLength>0);
  if(lengthAlreadySet && newLength!=consensusLength)
    throw BOOM::String(
       "Consensus lengths differ in SignalSensor::addConsensus");

  consensusLength=newLength;
  consensuses.lookup(s.c_str(),newLength)=char(1);
}
/*
  Adds the codons of transcript to the per-amino-acid codon tallies.

  The transcript is read in consecutive, non-overlapping groups of three
  characters starting at index 0.  Each group is mapped to an amino acid
  with BOOM::ProteinTrans::mapCodon() and its tally in codonFreqs[acid]
  is incremented (or created with the value 1).

  Trailing characters that do not form a complete group of three are
  ignored.

  transcript : the sequence whose codons are tallied
 */
void Application::updateCodonFreqs(const BOOM::String &transcript)
{
  const int len=transcript.length();
  const char *p=transcript.c_str();

  // Only visit complete codons: with "i<len" a length that is not a
  // multiple of three would build the last codon from characters past
  // the end of the transcript.
  for(int i=0 ; i+3<=len ; i+=3, p+=3)
    {
      BOOM::String codon(p,3);
      char acid=BOOM::ProteinTrans::mapCodon(codon.c_str());
      BOOM::Map<BOOM::String,float> &counts=codonFreqs[acid];
      if(!counts.isDefined(codon)) counts[codon]=1;
      else ++counts[codon];
    }
}
Beispiel #7
0
/*
  Returns the stored value for the base at position `index` of str,
  using the longest context for which a model entry exists.

  The context order starts at the largest value allowed (at most N, and
  limited by how many characters are available on the context side) and
  is decreased down to 0; the first (*models)[order] that defines the
  corresponding substring supplies the return value.

  On PLUS_STRAND the context lies to the left of `index`, so the looked
  up substring is the order+1 characters ending at `index`.  On
  MINUS_STRAND the context lies to the right, so the substring is the
  order+1 characters starting at `index`.

  seq, s, c : not used by this function
  str       : the text the contexts are taken from
  index     : position of the base being scored
  throws    : a message built from BOOM::String when no order matches on
              the plus strand, a BOOM::Stacktrace when no order matches
              on the minus strand, and file name plus line number for
              any other strand value
 */
double IMM::scoreSingleBase(const Sequence &seq,const BOOM::String &str,
				    int index,Symbol s,char c)
{
  const char *p=str.c_str();
  switch(getStrand())
    {
    case PLUS_STRAND:
      {
	// No more context than there are characters before index.
	int maxOrder=index;
	if(maxOrder>N) maxOrder=N;

	int order=maxOrder;
	while(order>=0)
	  {
	    const int contextBegin=index-order;
	    const int contextLen=order+1;
	    BOOM::StringMap<double> &model=*(*models)[order];
	    if(model.isDefined(p,contextBegin,contextLen))
	      return model.lookup(p,contextBegin,contextLen);
	    --order;
	  }
	throw BOOM::String("IMM::scoreSingleBase('+',")+
	  index+",strlen="+strlen(p)+",str="+
	  str.substring(index,maxOrder)+")";
      }

    case MINUS_STRAND:
      {
	/*
	  On the minus strand the contexts are taken from the right
	  (but only because the model was trained that way)
	 */
	// No more context than there are characters after index.
	const int seqLen=str.length();
	int maxOrder=seqLen-index-1;
	if(maxOrder>N) maxOrder=N;

	int order=maxOrder;
	while(order>=0)
	  {
	    const int contextLen=order+1;
	    BOOM::StringMap<double> &model=*(*models)[order];
	    if(model.isDefined(p,index,contextLen)) 
	      return model.lookup(p,index,contextLen);
	    --order;
	  }
	throw BOOM::Stacktrace(
          BOOM::String("IMM::scoreSingleBase('-',")+
	    index+",strlen="+strlen(p)+",str="+
	  str.substring(index,maxOrder)+")");
      }

    default: throw BOOM::String(__FILE__)+__LINE__;
    }
}
Beispiel #8
0
/*
  Computes the entropy of the last character of an (order+1)-character
  window of str, conditioned on the preceding `order` characters.

  A window of order+1 characters is slid over str one position at a
  time.  Each distinct window and each distinct window prefix (the
  window without its last character) is tallied.  The result is
  -sum(p*lg(condP)) over the distinct windows, where p is the window's
  relative frequency and condP is the window's tally divided by the
  tally of its prefix.  lg() is defined outside this function.

  str     : the sequence to analyze
  order   : number of conditioning characters; must be at least 1
  returns : the conditional entropy
  throws  : a C string when order<1; the "Order ... is too large ..."
            message when order+1 is not smaller than the length of str
 */
double BOOM::SequenceEntropy::conditionalEntropy(const BOOM::String &str,
						 int order)
{
  if(order<1) 
    throw "BOOM::SequenceEntropy::conditionalEntropy() : order<1";

  int seqLen=str.length();
  const int gramSize=order+1;
  if(gramSize>=seqLen) 
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;
  const int prefixSize=gramSize-1;

  // Tally every window and its prefix; the tables are sized by
  // hashTableSize(), which is defined elsewhere.
  BOOM::StringMap<int> gramCounts(hashTableSize(order));
  BOOM::StringMap<int> prefixCounts(hashTableSize(order-1));
  int numGrams=0;
  const char *window=str.c_str();
  for(int remaining=seqLen-gramSize+1 ; remaining>0 ; --remaining, ++window)
    {
      if(!gramCounts.isDefined(window,gramSize)) 
	gramCounts.lookup(window,gramSize)=1;
      else 
	++gramCounts.lookup(window,gramSize);

      if(!prefixCounts.isDefined(window,prefixSize)) 
	prefixCounts.lookup(window,prefixSize)=1;
      else 
	++prefixCounts.lookup(window,prefixSize);

      ++numGrams;
    }

  // Accumulate -sum(p*lg(condP)) over the distinct windows.
  double H=0;
  for(StringMapIterator<int> it=gramCounts.begin(), last=gramCounts.end() ;
      it!=last ; ++it)
    {
      const char *gram=(*it).first;
      const int gramCount=(*it).second;
      const double jointP=gramCount/double(numGrams);
      const double condP=
	gramCount/double(prefixCounts.lookup(gram,prefixSize));
      H-=jointP*lg(condP);
    }
  return H;
}
Beispiel #9
0
/*
  Returns a copy of this string in which every occurrence of `from` has
  been replaced by `to`.

  The string is scanned from left to right.  Where localMatch() reports
  that `from` occurs at the current position, `to` is appended to the
  result and the scan skips past the occurrence; otherwise the current
  character is copied.  Text inserted from `to` is never rescanned.

  from    : the pattern to search for; if empty, the string is returned
            unchanged
  to      : the replacement text
  returns : the string with all replacements applied
 */
BOOM::String BOOM::String::substitute(const BOOM::String &from,
				  const BOOM::String &to) const
{
  const int patternLen=from.length();

  // A match of an empty pattern consumes no input, so the scan below
  // could never advance past it.  There is nothing to replace.
  if(patternLen==0) return *this;

  BOOM::String rval;
  const char *pattern=from.c_str();
  const char *ptr=c_str();
  const char *end=ptr+length();

  // Keep matching while a whole pattern still fits in what is left.
  // Comparing the remaining length avoids computing a pointer before
  // the start of the buffer when the pattern is longer than the string.
  while(end-ptr>=patternLen) {
    if(localMatch(ptr,pattern,patternLen)) {
      ptr+=patternLen;
      rval+=to;
    }
    else {
      rval+=*ptr;
      ptr++;
    }
  }

  // Copy the tail, which is too short to contain the pattern.
  for(; *ptr ; ++ptr) rval+=*ptr;
  return rval;
}
Beispiel #10
0
/*
  Scans s for characters that are not defined in `alphabet`.  Such a
  character is overwritten with 'N' and INTERNAL_ERROR is then invoked
  (a macro defined elsewhere; presumably it raises an error -- confirm).

  s : the sequence to check; modified in place
 */
void BOOM::FastaReader::maskStrangeChars(BOOM::String &s)
{
  const int numChars=s.length();
  int pos=0;
  while(pos<numChars)
    {
      if(!alphabet.isDefined(s[pos]))
	{
	  s[pos]='N';
	  INTERNAL_ERROR;
	}
      ++pos;
    }
}