// Computes the single-character entropy of `str`: the sum over the distinct
// characters observed of -p*lg(p), where p is the character's relative
// frequency in `str`.
//
//   str        : the sequence whose characters are tallied
//   maxEntropy : (out) -lg(1/k), where k is the number of distinct
//                characters observed in `str`
//
// Returns the entropy, capped at maxEntropy.  For an empty sequence both the
// return value and maxEntropy are 0.
double BOOM::SequenceEntropy::entropy(const BOOM::String &str,
                                      double &maxEntropy)
{
  const int len=str.length();
  if(len==0) {
    // Nothing to tally: without this guard counts.size() would be 0 below
    // and 1.0/0 would be evaluated, yielding a non-finite result.
    maxEntropy=0;
    return 0;
  }

  // Tally the occurrences of each character
  BOOM::StringMap<int> counts(hashTableSize(0));
  const char *p=str.c_str();
  for(int i=0 ; i<len ; ++i, ++p) {
    if(counts.isDefined(p,1)) ++counts.lookup(p,1);
    else counts.lookup(p,1)=1;
  }

  // Each of the len characters was tallied exactly once, so the tallies
  // sum to len and count/len is the relative frequency.
  double entropy=0;
  StringMapIterator<int> cur=counts.begin(), end=counts.end();
  for(; cur!=end ; ++cur) {
    const int count=(*cur).second;
    const double prob=count/double(len);
    entropy-=prob*lg(prob);
  }

  maxEntropy=-lg(1.0/counts.size());
  if(entropy>maxEntropy) entropy=maxEntropy;
  return entropy;
}
// Computes the entropy of the distribution of (order+1)-character windows
// found in `str`, sliding the window one character at a time.
//
//   str        : the sequence to scan
//   order      : window length minus one
//   maxEntropy : (out) -lg(1/k), where k is the number of distinct windows
//                observed
//
// Returns the entropy, capped at maxEntropy.
// Throws a BOOM::String if order+1 is not smaller than the sequence length.
double BOOM::SequenceEntropy::jointEntropy(const BOOM::String &str,
                                           int order,
                                           double &maxEntropy)
{
  int seqLen=str.length();
  const int windowSize=order+1;
  if(windowSize>=seqLen)
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;

  // Tally every window; exactly numWindows tallies are made in total
  const int numWindows=seqLen-windowSize+1;
  BOOM::StringMap<int> tallies(hashTableSize(order));
  const char *window=str.c_str();
  const char *const pastLastWindow=window+numWindows;
  while(window!=pastLastWindow) {
    if(!tallies.isDefined(window,windowSize))
      tallies.lookup(window,windowSize)=1;
    else
      ++tallies.lookup(window,windowSize);
    ++window;
  }

  // Accumulate -p*lg(p) over the distinct windows
  double result=0;
  const double denominator=double(numWindows);
  StringMapIterator<int> it=tallies.begin(), stop=tallies.end();
  while(it!=stop) {
    const int tally=(*it).second;
    const double prob=tally/denominator;
    result-=prob*lg(prob);
    ++it;
  }

  maxEntropy=-lg(1.0/tallies.size());
  return (result>maxEntropy) ? maxEntropy : result;
}
// Opens `filename` for output and hands the defline and sequence to
// addToFasta() to be written to it.
void BOOM::FastaWriter::writeFasta(const BOOM::String &defline,
                                   const BOOM::String &sequence,
                                   const BOOM::String &filename)
{
  const char *path=filename.c_str();
  ofstream outFile(path);
  const char *residues=sequence.c_str();
  addToFasta(defline,residues,outFile);
}
// Reads this sensor's parameters and its two submodels from `is`.
//
// The stream is consumed in this order: signal type, cutoff, strand,
// consensus offset, a model-type tag followed by the WWAM branch point
// model, then a model-type tag followed by the WAM acceptor model.
void BranchAcceptor::load(istream &is)
{
  // Header fields
  SignalType type;
  is >> type;
  BOOM::String cutoffText;
  is >> cutoffText;
  double threshold=cutoffText.asDouble();
  Strand whichStrand;
  is >> whichStrand;
  int offset;
  is >> offset;

  setSignalType(type);
  setStrand(whichStrand);
  setCutoff(threshold);

  // Submodels, each preceded by a tag that is read and discarded
  BOOM::String tag;
  is >> tag; // will always be "WWAM"
  branchPoint=new WWAM(getGC(),is);
  is >> tag; // will always be "WAM"
  acceptor=new WAM(getGC(),is);

  // The combined context window is the sum of the two submodels' windows
  int combinedWindow=branchPoint->getContextWindowLength();
  combinedWindow+=acceptor->getContextWindowLength();
  setSizes(2,offset,combinedWindow);
}
// Returns the concatenation of the sequences of all exons, in order, with
// the first character removed when that character is 'M'.
BOOM::String SignalPeptide::getSequence()
{
  BOOM::String joined;
  const int numExons=exons.size();
  int which=0;
  while(which<numExons) {
    joined+=exons[which].getSequence();
    ++which;
  }

  const bool leadingM=(joined[0]=='M');
  if(leadingM) {
    const int remaining=joined.length()-1;
    joined=joined.substring(1,remaining);
  }
  return joined;
}
// Extracts the stretch of `seq` that starts at featureEnd and extends up to
// `margin` positions, and appends it to `margins`.  When featureEnd+margin
// reaches or passes the end of the sequence, the stretch stops at
// length-1 instead.
void Application::processForwardFeature(int featureEnd,
                                        const BOOM::String &seq)
{
  const int start=featureEnd;
  const int seqLen=seq.length();
  const int tentativeStop=start+margin;
  const int stop=(tentativeStop>=seqLen) ? seqLen-1 : tentativeStop;
  BOOM::String flank=seq.substr(start,stop-start);
  margins.push_back(flank);
}
// Registers `s` as a consensus string for this sensor and records its
// length as the consensus length.
// Throws a BOOM::String if a positive consensus length was already recorded
// and `s` has a different length.
void SignalSensor::addConsensus(const BOOM::String &s)
{
  const int newLength=s.length();
  const bool lengthAlreadySet=(consensusLength>0);
  if(lengthAlreadySet && newLength!=consensusLength) {
    throw BOOM::String(
      "Consensus lengths differ in SignalSensor::addConsensus");
  }
  consensusLength=newLength;
  const char *key=s.c_str();
  consensuses.lookup(key,newLength)=char(1);
}
// Writes the signal peptide model to `filename`.
//
// The output consists of the line "SignalPeptide", a line-by-line copy of
// startCodonModelFile, the number of fields, and then for each field: its
// length, the number of codons, and one "codon logP" line per codon, where
// logP = log(amino acid frequency in the field * codon frequency).
//
//   filename            : path of the model file to create
//   startCodonModelFile : path of the file copied into the output
void Application::generateModel(const BOOM::String &filename,
                                const BOOM::String &startCodonModelFile)
{
  // Create output file
  ofstream os(filename.c_str());
  os<<"SignalPeptide"<<endl;

  // Copy the start codon model into the file
  // NOTE(review): a line read in the same call that hits EOF is not copied;
  // presumably the input always ends with a newline -- confirm.
  ifstream is(startCodonModelFile.c_str());
  BOOM::String line;
  while(!is.eof()) {
    line.getline(is);
    if(is.eof()) break;
    os<<line<<endl;
  }

  // Write out each field separately
  os<<numFields<<endl;
  for(int fieldNum=0 ; fieldNum<numFields ; ++fieldNum) {
    Field &field=*fields[fieldNum];

    // Count the number of codons (you just never know...)
    numCodons=0;
    BOOM::Map<char,float>::iterator aCur=field.aminoAcidFreqs.begin(),
      aEnd=field.aminoAcidFreqs.end();
    for(; aCur!=aEnd ; ++aCur) {
      // Bind by reference: copying the whole per-acid map just to ask for
      // its size is wasted work.
      BOOM::Map<BOOM::String,float> &codons=codonFreqs[(*aCur).first];
      numCodons+=codons.size();
    }
    os<<field.fieldLength<<endl;
    os<<numCodons<<endl;

    // Emit one line per codon of every amino acid seen in this field
    aCur=field.aminoAcidFreqs.begin();
    aEnd=field.aminoAcidFreqs.end();
    for(; aCur!=aEnd ; ++aCur) {
      const char acid=(*aCur).first;
      const float acidP=(*aCur).second;
      BOOM::Map<BOOM::String,float> &codons=codonFreqs[acid];
      BOOM::Map<BOOM::String,float>::iterator cur=codons.begin(),
        end=codons.end();
      for(; cur!=end ; ++cur) {
        const BOOM::String &codon=(*cur).first;
        const float codonP=(*cur).second;
        float logP=log(acidP*codonP);
        os<<codon<<" "<<logP<<endl;
      }
    }
  }
}
// Walks `transcript` three characters at a time and, for each complete
// codon, increments that codon's count in codonFreqs under the amino acid
// returned by ProteinTrans::mapCodon().
//
// A trailing partial codon (one or two leftover characters when the length
// is not a multiple of 3) is ignored.
void Application::updateCodonFreqs(const BOOM::String &transcript)
{
  const int len=transcript.length();
  const char *p=transcript.c_str();

  // The bound is i+3<=len rather than i<len: with i<len a leftover of one
  // or two characters would build a 3-character codon that reads past the
  // end of the string's buffer.
  for(int i=0 ; i+3<=len ; i+=3, p+=3) {
    BOOM::String codon(p,3);
    char acid=BOOM::ProteinTrans::mapCodon(codon.c_str());
    BOOM::Map<BOOM::String,float> &counts=codonFreqs[acid];
    if(!counts.isDefined(codon)) counts[codon]=1;
    else ++counts[codon];
  }
}
// Creates `filename` and saves this model to it via save(ostream&).
// Returns whatever save(ostream&) returns.
// Throws a BOOM::String if the file cannot be opened for writing.
bool IMM::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good())
    // Leading space keeps the file name from running into "in"
    throw BOOM::String("Error creating file ")+filename+
      " in IMM::save()";
  return save(os);
}
// Reads the next record from the file.
//
//   defline  : (out) the record's defline; taken from `cache` when a
//              defline was read ahead by the previous call, otherwise read
//              from the file
//   sequence : (out) the whitespace-trimmed sequence lines concatenated,
//              upper-cased, and passed through maskStrangeChars()
//
// Returns false if the file is at EOF on entry or right after the defline
// has been obtained; true otherwise.
bool BOOM::FastaReader::nextSequence(BOOM::String &defline,
                                     BOOM::String &sequence)
{
  if(file.eof()) return false;

  // Obtain the defline, preferring one saved by the previous call
  const bool haveCachedDefline=(cache.length()>0);
  if(!haveCachedDefline) {
    defline=file.readLine();
  }
  else {
    defline=cache;
    cache="";
  }
  if(file.eof()) return false;

  // Gather sequence lines until the next defline or the end of the file
  sequence="";
  for(;;) {
    if(file.eof()) break;
    BOOM::String chunk=file.readLine();
    if(chunk[0]=='>') {
      // This line starts the next record; keep it for the next call
      cache=chunk;
      break;
    }
    chunk.trimWhitespace();
    sequence+=chunk;
  }

  sequence.toupper();
  maskStrangeChars(sequence);
  return true;
}
// Creates `filename` and saves this model to it via save(ostream&).
// Returns whatever save(ostream&) returns.
// Throws a BOOM::String if the file cannot be opened for writing.
bool ThreePeriodicMarkovChain::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good())
    // Leading space keeps the file name from running into "in"
    throw BOOM::String("Error creating file ")+filename+
      " in ThreePeriodicMarkovChain::save()";
  return save(os);
}
// Writes every element of `scores` to `filename`, one value per line, in
// the order they appear in the vector.
void Application::writeHistogramFile(BOOM::Vector<double> &scores,
                                     const BOOM::String &filename)
{
  ofstream out(filename.c_str());
  typedef BOOM::Vector<double>::iterator ScoreIter;
  for(ScoreIter it=scores.begin() ; it!=scores.end() ; ++it) {
    out<<*it<<endl;
  }
}
// Creates `filename` and saves this sensor to it via save(ostream&).
// Returns whatever save(ostream&) returns.
// Throws a BOOM::String if the file cannot be opened for writing.
bool BranchAcceptor::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good())
    // The message names this class (it previously said BranchPoint) and
    // has a leading space so the file name does not run into "in".
    throw BOOM::String("Error creating file ")+filename+
      " in BranchAcceptor::save()";
  return save(os);
}
// Looks up the score for position `index` of `str`, trying the models from
// the highest usable order down to order 0 and returning the value stored
// in the first model that has the context defined.
//
// On the plus strand the context of a given order ends at `index` (it
// starts at index-order); on the minus strand it starts at `index` and
// extends to the right.  The usable order is limited by N and by how many
// characters are available on the relevant side of `index`.
//
// Throws if no model has the context defined (BOOM::String on the plus
// strand, BOOM::Stacktrace on the minus strand) or if the strand is
// neither PLUS_STRAND nor MINUS_STRAND.
double IMM::scoreSingleBase(const Sequence &seq,const BOOM::String &str,
                            int index,Symbol s,char c)
{
  const char *p=str.c_str();
  switch(getStrand())
    {
    case PLUS_STRAND:
      {
        // Cannot look further left than the start of the string
        int highestOrder=index;
        if(highestOrder>N) highestOrder=N;

        int order=highestOrder;
        while(order>=0) {
          BOOM::StringMap<double> &model=*(*models)[order];
          const int contextBegin=index-order;
          const int contextLen=order+1;
          if(model.isDefined(p,contextBegin,contextLen))
            return model.lookup(p,contextBegin,contextLen);
          --order;
        }
        throw BOOM::String("IMM::scoreSingleBase('+',")+
          index+",strlen="+strlen(p)+",str="+
          str.substring(index,highestOrder)+")";
      }

    case MINUS_STRAND:
      {
        /*
          On the minus strand we have to take our contexts from the
          right (but only because we trained the model that way)
        */
        const int seqLen=str.length();
        int highestOrder=seqLen-index-1;
        if(highestOrder>N) highestOrder=N;

        int order=highestOrder;
        while(order>=0) {
          BOOM::StringMap<double> &model=*(*models)[order];
          const int contextLen=order+1;
          if(model.isDefined(p,index,contextLen))
            return model.lookup(p,index,contextLen);
          --order;
        }
        throw BOOM::Stacktrace(
          BOOM::String("IMM::scoreSingleBase('-',")+
          index+",strlen="+strlen(p)+",str="+
          str.substring(index,highestOrder)+")");
      }

    default: throw BOOM::String(__FILE__)+__LINE__;
    }
}
// Extracts the stretch of `seq` of up to `margin` positions that ends just
// before featureBegin (clamped so it does not start before position 0),
// reverse-complements it, and appends the result to `margins`.
void Application::processReverseFeature(int featureBegin,
                                        const BOOM::String &seq)
{
  const int stop=featureBegin;
  const int tentativeStart=featureBegin-margin;
  const int start=(tentativeStart<0) ? 0 : tentativeStart;
  BOOM::String flank=
    BOOM::ProteinTrans::reverseComplement(seq.substr(start,stop-start));
  margins.push_back(flank);
}
// Constructs the model by loading it from `filename`.  The file must begin
// with the model-type tag "3P"; the remainder is handed to load().
// Throws a BOOM::String if the file cannot be opened or the tag differs.
ThreePeriodicMarkovChain::ThreePeriodicMarkovChain(const BOOM::String &
                                                   filename)
{
  ifstream in(filename.c_str());
  if(!in.good()) {
    throw BOOM::String("Error opening file ")+filename
      +" in ThreePeriodicMarkovChain()";
  }

  // Verify the type tag before reading the model itself
  BOOM::String tag;
  in >> tag;
  if(tag!="3P") {
    throw BOOM::String("Attempt to load an object of type ")+tag+
      " into a ThreePeriodicMarkovChain (3P)";
  }

  load(in);
}
// Computes the entropy of the last character of each (order+1)-character
// window of `str`, conditioned on the window's first `order` characters.
//
// For every distinct window the term -p*lg(condP) is accumulated, where p
// is the window's share of all windows and condP is the window's count
// divided by the count of its `order`-character prefix.
//
// Throws a const char* if order<1, and a BOOM::String if order+1 is not
// smaller than the sequence length.
double BOOM::SequenceEntropy::conditionalEntropy(const BOOM::String &str,
                                                 int order)
{
  if(order<1)
    throw "BOOM::SequenceEntropy::conditionalEntropy() : order<1";

  int seqLen=str.length();
  const int windowSize=order+1;
  if(windowSize>=seqLen)
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;
  const int prefixSize=windowSize-1;
  const int numWindows=seqLen-windowSize+1;

  // Tally each window and its prefix; numWindows tallies of each are made
  BOOM::StringMap<int> windowTallies(hashTableSize(order));
  BOOM::StringMap<int> prefixTallies(hashTableSize(order-1));
  const char *window=str.c_str();
  for(int w=0 ; w<numWindows ; ++w) {
    if(!windowTallies.isDefined(window,windowSize))
      windowTallies.lookup(window,windowSize)=1;
    else
      ++windowTallies.lookup(window,windowSize);

    if(!prefixTallies.isDefined(window,prefixSize))
      prefixTallies.lookup(window,prefixSize)=1;
    else
      ++prefixTallies.lookup(window,prefixSize);

    ++window;
  }

  // Accumulate the conditional entropy over the distinct windows
  double result=0;
  StringMapIterator<int> it=windowTallies.begin(),
    stop=windowTallies.end();
  while(it!=stop) {
    const int tally=(*it).second;
    const char *key=(*it).first;
    const double jointP=tally/double(numWindows);
    const double conditionalP=
      tally/double(prefixTallies.lookup(key,prefixSize));
    result-=jointP*lg(conditionalP);
    ++it;
  }
  return result;
}
// Constructs the model by loading it from `filename`.  The file must begin
// with the model-type tag "IMM"; the remainder is handed to load().
// Throws a BOOM::String if the file cannot be opened or the tag differs.
IMM::IMM(const BOOM::String &filename)
  : revComp(NULL),
    models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  ifstream in(filename.c_str());
  if(!in.good()) {
    throw BOOM::String("Error opening file ")+filename
      +" in IMM::IMM()";
  }

  // Verify the type tag before reading the model itself
  BOOM::String tag;
  in >> tag;
  if(tag!="IMM") {
    throw BOOM::String("Attempt to load an object of type ")+tag+
      " into an IMM";
  }

  load(in);
}
// Returns a copy of this string in which every occurrence of `from`, found
// by scanning left to right, is replaced by `to`.  Matching resumes
// immediately after each replaced occurrence, so occurrences do not
// overlap.  An empty `from` matches nothing and yields an unchanged copy.
BOOM::String BOOM::String::substitute(const BOOM::String &from,
                                      const BOOM::String &to) const
{
  const int patternLen=from.length();

  // A zero-length match would never advance ptr below, so the scan loop
  // could not terminate.
  if(patternLen==0) return *this;

  BOOM::String rval;
  const char *pattern=from.c_str();
  const char *ptr=c_str();

  // `last` is the final position at which a full-length match still fits
  const char *last=ptr+length()-patternLen;
  while(ptr<=last) {
    if(localMatch(ptr,pattern,patternLen)) {
      ptr+=patternLen;
      rval+=to;
    }
    else {
      rval+=*ptr;
      ptr++;
    }
  }

  // Copy the tail, which is too short to contain another match
  for(; *ptr ; ++ptr) rval+=*ptr;
  return rval;
}
// Reads whitespace-separated (x,y) pairs from `filename`, appending one
// EmpiricalDistributionElement per pair to `v`, then sets binSize to the
// difference between the x values of the first two elements of `v`.
//
// Reading stops at the end of the file or at the first token that cannot
// be parsed; an x with no following y is discarded.
//
// Throws a BOOM::String if the file cannot be opened or if `v` ends up
// with fewer than two elements (binSize cannot be determined).
void EmpiricalDistribution::load(const BOOM::String &filename)
{
  ifstream is(filename.c_str());
  if(!is.good())
    throw BOOM::String("Error opening file ")+filename+
      " in EmpiricalDistribution::load()";

  // Testing the extraction itself, rather than eof(), also ends the loop
  // when a malformed token sets failbit; an eof()-only test would spin
  // forever on such input.
  unsigned x;
  double y;
  while(is >> x >> y)
    v.push_back(new EmpiricalDistributionElement(x,y));

  if(v.size()<2)
    throw BOOM::String("Fewer than two data points in file ")+filename+
      " in EmpiricalDistribution::load()";
  binSize=v[1]->first-v[0]->first;
}
// Constructs the sensor by loading it from `filename`.  The file must begin
// with the model-type tag "BranchAcceptor"; the remainder is handed to
// load().
// Throws a BOOM::String if the file cannot be opened or the tag differs.
BranchAcceptor::BranchAcceptor(GarbageCollector &gc,BOOM::String &filename)
  : SignalSensor(gc),
    branchPoint(NULL),
    acceptor(NULL)
{
  // ctor

  ifstream is(filename.c_str());
  if(!is.good())
    // Leading space keeps the file name from running into "in"
    throw BOOM::String("Error opening file ")+filename+
      " in BranchAcceptor::BranchAcceptor()";

  BOOM::String modelType;
  is >> modelType;
  if(modelType!="BranchAcceptor")
    throw BOOM::String("Attempt to load an object of type ")+modelType+
      " into a BranchAcceptor";
  load(is);
}
// Parses the comma-separated list of field lengths in `lengthString` and
// allocates one Field per entry.  Each field's begin/end offsets are laid
// out back to back, with every field spanning three times its length.
void Application::initFields(const BOOM::String &lengthString)
{
  // getFields() returns a heap-allocated vector that is released below
  BOOM::Vector<BOOM::String> *lengths=lengthString.getFields(",");
  numFields=lengths->size();
  fields.resize(numFields);

  int offset=0;
  for(int which=0 ; which<numFields ; ++which) {
    const int fieldLength=(*lengths)[which].asInt();
    Field *field=new Field;
    fields[which]=field;
    field->fieldLength=fieldLength;
    field->begin=offset;
    offset+=fieldLength*3;
    field->end=offset;
  }

  delete lengths;
}
// Writes the TATA/CAP model to `outfile`: a "TataCapModel" header with the
// min/max separation, the TATA sensor loaded from `tataFile`, two
// zeroth-order intergenic tables (the second with each symbol's value
// taken from its complement, N mapping to itself), the CAP model, and the
// CAP/intergenic ratio model.
void Application::writeOutput(const BOOM::String &tataFile,
                              const BOOM::String &outfile)
{
  // Load TATA model
  // NOTE(review): tata is not deleted in this function; presumably GC owns
  // it -- confirm.
  SignalSensor *tata=SignalSensor::load(tataFile,GC);

  // Create output file and write header
  ofstream os(outfile.c_str());
  os.precision(8);
  os<<"TataCapModel"<<endl;
  os<<minSeparation<<"\t"<<maxSeparation<<endl;

  // Write out the TATA model
  tata->save(os);

  // Write out the intergenic model, first as-is and then with every
  // symbol's entry taken from its complement
  Alphabet &alphabet=DnaAlphabet::global();
  static const char labels[5]={'A','C','G','N','T'};
  static const char complements[5]={'T','G','C','N','A'};
  for(int pass=0 ; pass<2 ; ++pass) {
    const char *source=(pass==0 ? labels : complements);
    os<<"MC\nINTERGENIC\n0\t0\t1\n5"<<endl;
    for(int i=0 ; i<5 ; ++i) {
      os<<labels[i]<<"\n"
        <<intergenicModel[alphabet.lookup(source[i])]<<endl;
    }
  }

  // Write out the CAP model
  capModel->save(os);

  // Write out the CAP/intergenic ratio model
  capIntergenicRatioModel->save(os);
}
// Reports whether `substring` occurs in this string starting exactly at
// position `pos`.
// Returns false when `pos` is negative, lies beyond the end of the string,
// or leaves fewer characters than `substring` contains.
bool BOOM::String::occursAt(const BOOM::String &substring,int pos) const
{
  const int subLen=substring.size();
  const int myLen=size();

  // Reject positions where a full-length comparison would run off the end
  // of this string's buffer (or start before it).
  if(pos<0 || pos>myLen || subLen>myLen-pos) return false;

  return localMatch(substring.c_str(),c_str()+pos,subLen);
}
// Reports whether find() locates the C-string form of `s` anywhere in this
// string.
bool BOOM::String::contains(const BOOM::String &s) const
{
  const char *needle=s.c_str();
  const bool absent=(find(needle)==npos);
  return !absent;
}
// Convenience overload: forwards to the addToFasta() overload that takes
// the sequence as a C string.
void BOOM::FastaWriter::addToFasta(const BOOM::String &defline,
                                   const BOOM::String &sequence,
                                   ostream &os)
{
  const char *residues=sequence.c_str();
  addToFasta(defline,residues,os);
}
// Case-insensitive comparison against `str` using strcasecmp().
// Returns true when strcasecmp() reports a nonzero result (the strings
// differ, ignoring case) and false when they compare equal.
// NOTE(review): the bool result is true on a MISMATCH, following the C
// convention of testing !stricmp(...) for equality -- confirm callers
// expect that.
bool BOOM::String::stricmp(const BOOM::String &str) const
{
  const int comparison=strcasecmp(c_str(),str.c_str());
  return comparison!=0;
}
// Returns a new string consisting of this string followed by the C-string
// form of `s`.
BOOM::String BOOM::String::operator+(const BOOM::String &s) const
{
  const char *suffix=s.c_str();
  return BOOM::String(*this+suffix);
}
// Opens `filename` in append mode and hands the defline and sequence to
// addToFasta() to be written at its end.
void FastaWriter::appendToFasta(const BOOM::String &defline,
                                const BOOM::String &sequence,
                                const BOOM::String &filename)
{
  const char *path=filename.c_str();
  ofstream appender(path,std::ios::app);
  addToFasta(defline,sequence,appender);
}