/**
 * Computes the single-character (order-0) entropy of a sequence.
 *
 * Every character of str is tallied, the tallies are turned into relative
 * frequencies f, and the sum of -f*lg(f) over the distinct characters is
 * returned (lg is defined elsewhere; presumably log base 2 -- confirm).
 *
 * @param str        sequence to analyze
 * @param maxEntropy output: -lg(1/k), k being the number of distinct
 *                   characters found in str; 0 when str is empty
 * @return the entropy, clamped so that it never exceeds maxEntropy;
 *         0 when str is empty
 */
double BOOM::SequenceEntropy::entropy(const BOOM::String &str,
                                      double &maxEntropy)
{
  const int len=str.length();

  // An empty sequence contains no symbols.  Without this guard
  // counts.size() is 0, 1.0/counts.size() divides by zero, and both
  // maxEntropy and the return value come out non-finite.
  if(len==0) {
    maxEntropy=0.0;
    return 0.0;
  }

  // Tally the occurrences of each single character.
  BOOM::StringMap<int> counts(hashTableSize(0));
  const char *p=str.c_str();
  int total=0;
  for(int i=0 ; i<len ; ++i, ++p) {
    if(counts.isDefined(p,1)) ++counts.lookup(p,1);
    else counts.lookup(p,1)=1;
    ++total;
  }

  // Accumulate -f*lg(f) over the distinct characters.
  double entropy=0;
  StringMapIterator<int> cur=counts.begin(), end=counts.end();
  for(; cur!=end ; ++cur) {
    const int count=(*cur).second;
    const double freq=count/double(total);
    entropy-=freq*lg(freq);
  }

  // Upper bound: the entropy of a uniform distribution over the observed
  // characters.  The clamp absorbs floating-point overshoot.
  maxEntropy=-lg(1.0/counts.size());
  if(entropy>maxEntropy) entropy=maxEntropy;
  return entropy;
}
/**
 * Computes the entropy of the (order+1)-character windows of a sequence.
 *
 * A window of order+1 characters is slid over str one position at a time;
 * each distinct window is tallied and the sum of -f*lg(f) over the
 * distinct windows (f = tally / number of windows) is returned.
 *
 * @param str        sequence to analyze
 * @param order      window size minus one
 * @param maxEntropy output: -lg(1/k), k being the number of distinct
 *                   windows observed
 * @return the entropy, never larger than maxEntropy
 * @throws a string message when order+1 is not smaller than the length
 *         of str
 */
double BOOM::SequenceEntropy::jointEntropy(const BOOM::String &str,
                                           int order,
                                           double &maxEntropy)
{
  int seqLen=str.length();
  int windowLen=order+1;
  if(windowLen>=seqLen)
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;

  // Tally every window of windowLen characters.
  const int numWindows=seqLen-windowLen+1;
  BOOM::StringMap<int> tallies(hashTableSize(order));
  const char *start=str.c_str();
  int total=0;
  for(int w=0 ; w<numWindows ; ++w) {
    const char *window=start+w;
    if(!tallies.isDefined(window,windowLen))
      tallies.lookup(window,windowLen)=1;
    else
      ++tallies.lookup(window,windowLen);
    ++total;
  }

  // Sum -f*lg(f) over the distinct windows.
  double entropy=0;
  StringMapIterator<int> it=tallies.begin(), stop=tallies.end();
  while(it!=stop) {
    int tally=(*it).second;
    double freq=tally/double(total);
    entropy-=freq*lg(freq);
    ++it;
  }

  // The bound is the entropy of a uniform distribution over the observed
  // windows; the result is capped at that bound.
  maxEntropy=-lg(1.0/tallies.size());
  return entropy>maxEntropy ? maxEntropy : entropy;
}
/**
 * Extracts the stretch of seq that starts at featureEnd and spans up to
 * `margin` positions, and appends it to `margins`.
 *
 * @param featureEnd index in seq at which the extracted stretch starts
 * @param seq        sequence the stretch is taken from
 *
 * NOTE(review): the stop position is clamped to length-1 but then used to
 * compute a length (stop-start), so a stretch reaching the end of seq
 * comes out one character shorter than `margin` would allow -- confirm
 * whether that is intended.  A featureEnd at or beyond the end of seq
 * yields a negative length argument; that case is not handled here.
 */
void Application::processForwardFeature(int featureEnd,
                                        const BOOM::String &seq)
{
  const int seqLen=seq.length();
  const int start=featureEnd;

  // Desired stop position, pulled back when it runs off the sequence.
  int stop=start+margin;
  if(stop>=seqLen) stop=seqLen-1;

  BOOM::String flank=seq.substr(start,stop-start);
  margins.push_back(flank);
}
/**
 * Concatenates the sequences of all exons, in order, and returns the
 * result.  When the concatenation starts with 'M', the result of
 * substring(1,length-1) is returned instead (presumably the sequence
 * without its leading 'M' -- confirm substring's argument convention).
 */
BOOM::String SignalPeptide::getSequence()
{
  BOOM::String peptide;
  const int numExons=exons.size();
  for(int exonIndex=0 ; exonIndex<numExons ; ++exonIndex)
    peptide+=exons[exonIndex].getSequence();

  if(peptide[0]!='M') return peptide;

  peptide=peptide.substring(1,peptide.length()-1);
  return peptide;
}
/**
 * Registers a consensus string with this sensor.
 *
 * The length of s is stored in consensusLength and s is entered into
 * `consensuses` with the value char(1).
 *
 * @param s consensus string to add
 * @throws BOOM::String when a positive consensusLength is already
 *         recorded and differs from the length of s
 */
void SignalSensor::addConsensus(const BOOM::String &s)
{
  const int newLen=s.length();

  // A consensusLength of zero (or less) means no length has been fixed yet.
  const bool lengthAlreadyFixed=consensusLength>0;
  if(lengthAlreadyFixed && newLen!=consensusLength)
    throw BOOM::String(
      "Consensus lengths differ in SignalSensor::addConsensus");

  consensusLength=newLen;
  consensuses.lookup(s.c_str(),newLen)=char(1);
}
/**
 * Adds the codons of a transcript to the per-amino-acid codon counts.
 *
 * The transcript is read in consecutive, non-overlapping triplets starting
 * at position 0.  Each triplet is mapped to an amino acid with
 * BOOM::ProteinTrans::mapCodon and its count in codonFreqs[acid] is
 * incremented (or set to 1 on first sight).
 *
 * Only complete triplets are counted: when the transcript length is not a
 * multiple of three, the one or two trailing characters are ignored.
 * Building a 3-character codon from them would read the string's
 * terminator and possibly a byte past the buffer.
 *
 * @param transcript sequence whose codons are counted
 */
void Application::updateCodonFreqs(const BOOM::String &transcript)
{
  const char *p=transcript.c_str();
  const int len=transcript.length();

  // i+3<=len guarantees that p[0..2] all lie inside the transcript.
  for(int i=0 ; i+3<=len ; i+=3, p+=3) {
    BOOM::String codon(p,3);
    char acid=BOOM::ProteinTrans::mapCodon(codon.c_str());
    BOOM::Map<BOOM::String,float> &counts=codonFreqs[acid];
    if(!counts.isDefined(codon)) counts[codon]=1;
    else ++counts[codon];
  }
}
/**
 * Looks up the score for position `index` of `str`, backing off from the
 * highest usable model order down to order 0 and returning the value of
 * the first model that has an entry for the context.
 *
 * On the plus strand the context handed to the model starts at
 * index-order; on the minus strand it starts at index.  In both cases
 * order+1 is passed as the third argument (presumably the context
 * length -- confirm against StringMap).
 *
 * The parameters seq, s and c are not read by this function.
 *
 * @param str   text the contexts are taken from
 * @param index position being scored
 * @return the value stored in the first model (highest order first) that
 *         defines the context
 * @throws BOOM::String on the plus strand, BOOM::Stacktrace on the minus
 *         strand, when no model of any order defines the context;
 *         BOOM::String-based file/line message for an unexpected strand
 */
double IMM::scoreSingleBase(const Sequence &seq,const BOOM::String &str,
                            int index,Symbol s,char c)
{
  const char *p=str.c_str();
  switch(getStrand())
    {
    case PLUS_STRAND:
      {
        // The order is limited by N and by the number of positions
        // available to the left of index.
        int maxOrder=index;
        if(maxOrder>N) maxOrder=N;

        int order=maxOrder;
        while(order>=0) {
          BOOM::StringMap<double> &model=*(*models)[order];
          const int contextBegin=index-order;
          const int contextLen=order+1;
          if(model.isDefined(p,contextBegin,contextLen))
            return model.lookup(p,contextBegin,contextLen);
          --order;
        }
        throw BOOM::String("IMM::scoreSingleBase('+',")+
          index+",strlen="+strlen(p)+",str="+
          str.substring(index,maxOrder)+")";
      }

    case MINUS_STRAND:
      {
        // Minus-strand contexts are taken from the right of index, purely
        // because the model was trained that way.
        int seqLen=str.length();
        const int available=seqLen-index-1;
        int maxOrder=(available>N ? N : available);

        int order=maxOrder;
        while(order>=0) {
          BOOM::StringMap<double> &model=*(*models)[order];
          const int contextLen=order+1;
          if(model.isDefined(p,index,contextLen))
            return model.lookup(p,index,contextLen);
          --order;
        }
        throw BOOM::Stacktrace(
          BOOM::String("IMM::scoreSingleBase('-',")+
          index+",strlen="+strlen(p)+",str="+
          str.substring(index,maxOrder)+")");
      }

    default:
      throw BOOM::String(__FILE__)+__LINE__;
    }
}
/**
 * Computes the entropy of a character conditioned on the `order`
 * characters preceding it.
 *
 * A window of order+1 characters is slid over str.  For every window both
 * the full window and its first `order` characters (the prefix) are
 * tallied.  The result is the sum, over the distinct windows, of
 * -P(window)*lg(count(window)/count(prefix)), where P(window) is the
 * window's tally divided by the number of windows.
 *
 * @param str   sequence to analyze
 * @param order number of conditioning characters; must be at least 1
 * @return the conditional entropy
 * @throws const char* when order<1; a string message when order+1 is not
 *         smaller than the length of str
 */
double BOOM::SequenceEntropy::conditionalEntropy(const BOOM::String &str,
                                                 int order)
{
  if(order<1)
    throw "BOOM::SequenceEntropy::conditionalEntropy() : order<1";

  int seqLen=str.length();
  int gramLen=order+1;
  if(gramLen>=seqLen)
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;

  const int prefixLen=gramLen-1;
  const int numWindows=seqLen-gramLen+1;
  BOOM::StringMap<int> gramTallies(hashTableSize(order));
  BOOM::StringMap<int> prefixTallies(hashTableSize(order-1));

  // One pass over the windows, tallying each window and its prefix.
  const char *start=str.c_str();
  int total=0;
  for(int w=0 ; w<numWindows ; ++w) {
    const char *gram=start+w;

    if(!gramTallies.isDefined(gram,gramLen))
      gramTallies.lookup(gram,gramLen)=1;
    else
      ++gramTallies.lookup(gram,gramLen);

    if(!prefixTallies.isDefined(gram,prefixLen))
      prefixTallies.lookup(gram,prefixLen)=1;
    else
      ++prefixTallies.lookup(gram,prefixLen);

    ++total;
  }

  // Weight lg of the conditional probability by the joint probability.
  double entropy=0;
  StringMapIterator<int> it=gramTallies.begin(), stop=gramTallies.end();
  while(it!=stop) {
    int tally=(*it).second;
    const char *gram=(*it).first;
    double jointP=tally/double(total);
    double condP=tally/double(prefixTallies.lookup(gram,prefixLen));
    entropy-=jointP*lg(condP);
    ++it;
  }
  return entropy;
}
/**
 * Returns a copy of this string in which every occurrence of `from` is
 * replaced by `to`.
 *
 * The string is scanned left to right; after a match the scan resumes
 * behind the matched text, so occurrences do not overlap and text taken
 * from `to` is never rescanned.
 *
 * @param from pattern to search for; when empty, an unmodified copy of
 *             this string is returned (an empty pattern would otherwise
 *             never advance the scan)
 * @param to   replacement text
 * @return the string with all substitutions applied
 */
BOOM::String BOOM::String::substitute(const BOOM::String &from,
                                      const BOOM::String &to) const
{
  const int patternLen=from.length();
  if(patternLen==0) return *this;

  const char *pattern=from.c_str();
  const char *ptr=c_str();
  const char *end=ptr+length();

  BOOM::String rval;

  // Scan while a whole pattern still fits into the unread part.  Measuring
  // the remaining distance (rather than computing end-patternLen) avoids
  // forming a pointer in front of the buffer when the pattern is longer
  // than the string.
  while(end-ptr>=patternLen) {
    if(localMatch(ptr,pattern,patternLen)) {
      ptr+=patternLen;
      rval+=to;
    }
    else {
      rval+=*ptr;
      ptr++;
    }
  }

  // Fewer than patternLen characters remain; copy them through verbatim.
  // Bounded by `end`, consistent with the length-based scan above.
  for(; ptr<end ; ++ptr) rval+=*ptr;
  return rval;
}
/**
 * Scans s for characters that `alphabet` does not define.  Such a
 * character is overwritten with 'N' and INTERNAL_ERROR is invoked
 * (a macro defined elsewhere; presumably it aborts or throws -- confirm).
 *
 * @param s sequence to check; modified in place
 */
void BOOM::FastaReader::maskStrangeChars(BOOM::String &s)
{
  const int numChars=s.length();
  for(int pos=0 ; pos<numChars ; ++pos) {
    if(alphabet.isDefined(s[pos])) continue;
    s[pos]='N';
    INTERNAL_ERROR;
  }
}