// Write a single FASTA record (defline + sequence) to a brand-new file,
// truncating any existing file with the same name.
void BOOM::FastaWriter::writeFasta(const BOOM::String &defline,
                                   const BOOM::String &sequence,
                                   const BOOM::String &filename)
{
  ofstream out(filename.c_str());
  addToFasta(defline,sequence.c_str(),out);
}
/*
  Write the signal-peptide model to `filename`.  The output consists of:
    1) the literal header "SignalPeptide",
    2) a verbatim copy of the start-codon model file,
    3) numFields, then for each field: its length, its codon count, and
       one "<codon> <log(P(acid)*P(codon|acid))>" line per codon.
  Side effect: leaves member `numCodons` set to the codon count of the
  last field processed.
*/
void Application::generateModel(const BOOM::String &filename,
                                const BOOM::String &startCodonModelFile)
{
  // Create output file
  ofstream os(filename.c_str());
  os<<"SignalPeptide"<<endl;

  // Copy the start codon model into the file, line by line
  ifstream is(startCodonModelFile.c_str());
  BOOM::String line;
  while(!is.eof()) {
    line.getline(is);
    if(is.eof()) break;
    os<<line<<endl;
  }

  // Write out each field separately
  os<<numFields<<endl;
  for(int fieldNum=0 ; fieldNum<numFields ; ++fieldNum) {
    Field &field=*fields[fieldNum];

    // Count the number of codons (you just never know...)
    BOOM::Map<char,float>::iterator aCur=field.aminoAcidFreqs.begin(),
      aEnd=field.aminoAcidFreqs.end();
    numCodons=0;
    for(; aCur!=aEnd ; ++aCur) {
      char acid=(*aCur).first;
      // FIX: bind by reference -- the original deep-copied the whole
      // per-acid codon map just to take its size()
      BOOM::Map<BOOM::String,float> &codons=codonFreqs[acid];
      numCodons+=codons.size();
    }
    os<<field.fieldLength<<endl;
    os<<numCodons<<endl;

    // Emit each codon with its joint log-probability
    aCur=field.aminoAcidFreqs.begin();
    aEnd=field.aminoAcidFreqs.end();
    for(; aCur!=aEnd ; ++aCur) {
      char acid=(*aCur).first;
      float &acidP=(*aCur).second;
      BOOM::Map<BOOM::String,float> &codons=codonFreqs[acid]; // by reference, no copy
      BOOM::Map<BOOM::String,float>::iterator cur=codons.begin(),
        end=codons.end();
      for(; cur!=end ; ++cur) {
        BOOM::String codon=(*cur).first;
        float codonP=(*cur).second;
        float logP=log(acidP*codonP); // joint log-prob: P(acid) * P(codon|acid)
        os<<codon<<" "<<logP<<endl;
      }
    }
  }
}
/*
  Compute the empirical zeroth-order (single-character) entropy of str,
  in bits.  maxEntropy receives the entropy of a uniform distribution
  over the distinct characters observed, and the return value is clipped
  to maxEntropy.
*/
double BOOM::SequenceEntropy::entropy(const BOOM::String &str,
                                      double &maxEntropy)
{
  int len=str.length();
  // FIX: the original divided by counts.size()==0 (and lg(1/0)) on an
  // empty string; report zero entropy instead
  if(len==0) {
    maxEntropy=0;
    return 0;
  }

  // Tally each single character
  BOOM::StringMap<int> counts(hashTableSize(0));
  const char *p=str.c_str();
  int total=0;
  for(int i=0 ; i<len ; ++i, ++p) {
    if(counts.isDefined(p,1)) ++counts.lookup(p,1);
    else counts.lookup(p,1)=1;
    ++total;
  }

  // entropy = -sum p*lg(p)
  double entropy=0;
  StringMapIterator<int> cur=counts.begin(), end=counts.end();
  for(; cur!=end ; ++cur) {
    int count=(*cur).second;
    double p=count/double(total);
    entropy-=p*lg(p);
  }

  // Cap at the entropy of a uniform distribution over observed symbols
  maxEntropy=-lg(1.0/counts.size());
  if(entropy>maxEntropy) entropy=maxEntropy;
  return entropy;
}
// Serialize this IMM to `filename`; throws on failure to create the file.
bool IMM::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good())
    // FIX: message previously read "...file <name>in IMM::save()" --
    // missing separator space
    throw BOOM::String("Error creating file ")+filename+
      " in IMM::save()";
  return save(os);
}
// Serialize this model to `filename`; throws on failure to create the file.
bool ThreePeriodicMarkovChain::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good())
    // FIX: message previously read "...file <name>in ..." -- missing
    // separator space
    throw BOOM::String("Error creating file ")+filename+
      " in ThreePeriodicMarkovChain::save()";
  return save(os);
}
/*
  Compute the empirical joint entropy (in bits) of the (order+1)-grams
  of str.  maxEntropy receives the entropy of a uniform distribution
  over the distinct grams observed, and the returned entropy is clipped
  to maxEntropy.  Throws if the sequence is too short for the order.
*/
double BOOM::SequenceEntropy::jointEntropy(const BOOM::String &str,
                                           int order,double &maxEntropy)
{
  const int seqLen=str.length();
  const int wordSize=order+1;
  if(wordSize>=seqLen)
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;

  // Tally every window of wordSize characters
  const int windowCount=seqLen-wordSize+1;
  BOOM::StringMap<int> tallies(hashTableSize(order));
  const char *scan=str.c_str();
  int grandTotal=0;
  for(int i=0 ; i<windowCount ; ++i, ++scan) {
    if(tallies.isDefined(scan,wordSize)) ++tallies.lookup(scan,wordSize);
    else tallies.lookup(scan,wordSize)=1;
    ++grandTotal;
  }

  // H = -sum p*lg(p) over the observed grams
  double H=0;
  StringMapIterator<int> it=tallies.begin(), stop=tallies.end();
  for(; it!=stop ; ++it) {
    double p=(*it).second/double(grandTotal);
    H-=p*lg(p);
  }

  // Cap at the uniform-distribution entropy over observed gram types
  maxEntropy=-lg(1.0/tallies.size());
  return H>maxEntropy ? maxEntropy : H;
}
// Dump one score per line to `filename`, for histogramming by external tools.
void Application::writeHistogramFile(BOOM::Vector<double> &scores,
                                     const BOOM::String &filename)
{
  ofstream out(filename.c_str());
  for(BOOM::Vector<double>::iterator it=scores.begin() ;
      it!=scores.end() ; ++it)
    out<<*it<<endl;
}
// Serialize this BranchAcceptor to `filename`; throws on failure to
// create the file.
bool BranchAcceptor::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good())
    // FIX: message lacked a separator space and named the wrong class
    // ("BranchPoint::save()")
    throw BOOM::String("Error creating file ")+filename+
      " in BranchAcceptor::save()";
  return save(os);
}
// Register s as a valid consensus sequence.  All consensuses must share
// a single common length; the first one registered fixes that length.
void SignalSensor::addConsensus(const BOOM::String &s)
{
  const int newLen=s.length();
  const bool lengthAlreadyFixed=(consensusLength>0);
  if(lengthAlreadyFixed && newLen!=consensusLength)
    throw BOOM::String("Consensus lengths differ in SignalSensor::addConsensus");
  consensusLength=newLen;
  // Membership set: any defined entry marks the string as a consensus
  consensuses.lookup(s.c_str(),newLen)=char(1);
}
// Load a serialized IMM from `filename`.  The file must begin with the
// type tag "IMM"; throws if the file cannot be opened or the tag differs.
IMM::IMM(const BOOM::String &filename)
  : revComp(NULL), models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  ifstream in(filename.c_str());
  if(!in.good())
    throw BOOM::String("Error opening file ")+filename+" in IMM::IMM()";
  BOOM::String typeTag;
  in >> typeTag;
  if(typeTag!="IMM")
    throw BOOM::String("Attempt to load an object of type ")+typeTag+
      " into an IMM";
  load(in);
}
// Load a serialized three-periodic Markov chain from `filename`.  The
// file must begin with the type tag "3P"; throws if the file cannot be
// opened or the tag differs.
ThreePeriodicMarkovChain::ThreePeriodicMarkovChain(const BOOM::String &
						   filename)
{
  ifstream in(filename.c_str());
  if(!in.good())
    throw BOOM::String("Error opening file ")+filename+
      " in ThreePeriodicMarkovChain()";
  BOOM::String typeTag;
  in >> typeTag;
  if(typeTag!="3P")
    throw BOOM::String("Attempt to load an object of type ")+typeTag+
      " into a ThreePeriodicMarkovChain (3P)";
  load(in);
}
/*
  Tally codon usage for one transcript.  The transcript is scanned in
  non-overlapping triplets; each codon's count is accumulated in
  codonFreqs under its translated amino acid.
*/
void Application::updateCodonFreqs(const BOOM::String &transcript)
{
  const char *str=transcript.c_str();
  int len=transcript.length();
  const char *p=str;
  // FIX: condition was i<len, which read a 3-character codon past the
  // end of the string when len is not a multiple of 3; i+2<len only
  // admits complete codons
  for(int i=0 ; i+2<len ; i+=3, p+=3) {
    BOOM::String codon(p,3);
    char acid=BOOM::ProteinTrans::mapCodon(codon.c_str());
    BOOM::Map<BOOM::String,float> &counts=codonFreqs[acid];
    if(!counts.isDefined(codon)) counts[codon]=1;
    else ++counts[codon];
  }
}
/*
  Load an empirical distribution from a whitespace-separated file of
  (x,y) pairs: x unsigned, y double.  binSize is inferred from the
  spacing of the first two x values.  Throws if the file cannot be
  opened or contains fewer than two points.
*/
void EmpiricalDistribution::load(const BOOM::String &filename)
{
  ifstream is(filename.c_str());
  if(!is.good())
    throw BOOM::String("Error opening file ")+filename+
      " in EmpiricalDistribution::load()";
  while(!is.eof()) {
    unsigned x;
    double y;
    is >> x;
    if(is.eof()) break; // trailing whitespace after the last pair
    is >> y;
    v.push_back(new EmpiricalDistributionElement(x,y));
  }
  // FIX: the original accessed v[0] and v[1] unconditionally, which is
  // undefined behavior for files with fewer than two entries
  if(v.size()<2)
    throw BOOM::String("Fewer than two data points in ")+filename+
      " in EmpiricalDistribution::load()";
  binSize=v[1]->first-v[0]->first;
}
// Load a serialized BranchAcceptor from `filename`.  The file must
// begin with the type tag "BranchAcceptor"; throws if the file cannot
// be opened or the tag differs.
BranchAcceptor::BranchAcceptor(GarbageCollector &gc,BOOM::String &filename)
  : SignalSensor(gc), branchPoint(NULL), acceptor(NULL)
{
  // ctor
  ifstream is(filename.c_str());
  if(!is.good())
    // FIX: message lacked a separator space before "in"
    throw BOOM::String("Error opening file ")+filename+
      " in BranchAcceptor::BranchAcceptor()";
  BOOM::String modelType;
  is >> modelType;
  if(modelType!="BranchAcceptor")
    // FIX: message read "<type>in into a BranchAcceptor" -- duplicated
    // preposition and missing space
    throw BOOM::String("Attempt to load an object of type ")+modelType+
      " into a BranchAcceptor";
  load(is);
}
// Score one base of str at position `index` under the interpolated
// Markov model, backing off from the highest available order down to
// order 0 until a defined n-gram is found.
//   PLUS strand:  context is the bases to the LEFT of index (capped by
//                 both N and the distance to the sequence start).
//   MINUS strand: context is taken from the RIGHT of index -- only
//                 because the model was trained that way -- capped by N
//                 and the distance to the sequence end.
// Throws (with a diagnostic string / stack trace) if no order down to
// 0 has the n-gram defined, or if the strand is neither + nor -.
double IMM::scoreSingleBase(const Sequence &seq,const BOOM::String &str,
                            int index,Symbol s,char c)
{
  const char *p=str.c_str();
  switch(getStrand())
    {
    case PLUS_STRAND:
      {
        // Use at most N bases of left context (fewer near the start)
        int maxOrder=(index>N ? N : index);
        // Back off: try the longest context first
        for(int order=maxOrder ; order>=0 ; --order)
          {
            BOOM::StringMap<double> &model=*(*models)[order];
            if(model.isDefined(p,index-order,order+1))
              return model.lookup(p,index-order,order+1);
          }
        throw BOOM::String("IMM::scoreSingleBase('+',")+
          index+",strlen="+strlen(p)+",str="+
          str.substring(index,maxOrder)+")";
      }
    case MINUS_STRAND:
      {
        /* On the minus strand we have to take our contexts from the
           right (but only because we trained the model that way) */
        int seqLen=str.length();
        int maxOrder=seqLen-index-1; // bases available to the right
        if(maxOrder>N) maxOrder=N;
        // Back off: try the longest context first
        for(int order=maxOrder ; order>=0 ; --order)
          {
            BOOM::StringMap<double> &model=*(*models)[order];
            if(model.isDefined(p,index,order+1))
              return model.lookup(p,index,order+1);
          }
        throw BOOM::Stacktrace(
          BOOM::String("IMM::scoreSingleBase('-',")+
          index+",strlen="+strlen(p)+",str="+
          str.substring(index,maxOrder)+")");
      }
    default: throw BOOM::String(__FILE__)+__LINE__;
    }
}
/*
  Compute the empirical conditional entropy H(X_n | X_{n-order}..X_{n-1})
  of str, in bits: grams of length order+1 are tallied along with their
  length-order prefixes, and the entropy accumulates -p(gram) *
  lg(count(gram)/count(prefix)).  Throws if order<1 or the sequence is
  too short for the order.
*/
double BOOM::SequenceEntropy::conditionalEntropy(const BOOM::String &str,
                                                 int order)
{
  if(order<1)
    throw "BOOM::SequenceEntropy::conditionalEntropy() : order<1";
  int len=str.length();
  int gramSize=order+1;
  if(gramSize>=len)
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+len;
  int numWindows=len-gramSize+1;
  // Parallel tallies: full (order+1)-grams and their order-long prefixes
  BOOM::StringMap<int> counts(hashTableSize(order));
  BOOM::StringMap<int> prefixCounts(hashTableSize(order-1));
  const char *p=str.c_str();
  int total=0;
  for(int i=0 ; i<numWindows ; ++i, ++p) {
    if(counts.isDefined(p,gramSize)) ++counts.lookup(p,gramSize);
    else counts.lookup(p,gramSize)=1;
    if(prefixCounts.isDefined(p,gramSize-1)) ++prefixCounts.lookup(p,gramSize-1);
    else prefixCounts.lookup(p,gramSize-1)=1;
    ++total;
  }
  double entropy=0;
  StringMapIterator<int> cur=counts.begin(), end=counts.end();
  for(; cur!=end ; ++cur) {
    int count=(*cur).second;
    // The stored key is reused directly to look up its own prefix count
    const char *s=(*cur).first;
    double p=count/double(total);               // joint probability of the gram
    double condP=count/double(prefixCounts.lookup(s,gramSize-1)); // P(last | prefix)
    entropy-=p*lg(condP);
  }
  return entropy;
}
/*
  Return a copy of this string with every non-overlapping occurrence of
  `from` replaced by `to`.  Matching proceeds left-to-right; after a
  match the scan resumes just past the replaced text.  An empty `from`
  returns the string unchanged.
*/
BOOM::String BOOM::String::substitute(const BOOM::String &from,
                                      const BOOM::String &to) const
{
  const char *pattern=from.c_str();
  int patternLen=from.length();
  // FIX: an empty pattern matched at every position without ever
  // advancing ptr, looping forever; treat it as "nothing to replace"
  if(patternLen==0) return *this;

  BOOM::String rval;
  const char *ptr=c_str();
  const char *last=ptr+length()-patternLen; // last position a match can start
  while(ptr<=last) {
    if(localMatch(ptr,pattern,patternLen)) {
      ptr+=patternLen;
      rval+=to;
    }
    else {
      rval+=*ptr;
      ptr++;
    }
  }
  // Copy the unmatched tail (fewer than patternLen characters remain)
  for(; *ptr ; ++ptr) rval+=*ptr;
  return rval;
}
// Assemble and write the composite TATA/CAP model file.  Output order
// matters to the loader: header line, separation bounds, TATA signal
// model, two single-nucleotide intergenic Markov-chain sections, the
// CAP model, and finally the CAP/intergenic ratio model.
void Application::writeOutput(const BOOM::String &tataFile,
                              const BOOM::String &outfile)
{
  // Load TATA model
  SignalSensor *tata=SignalSensor::load(tataFile,GC);

  // Create output file and write header
  ofstream os(outfile.c_str());
  os.precision(8);
  os<<"TataCapModel"<<endl;
  os<<minSeparation<<"\t"<<maxSeparation<<endl;

  // Write out the TATA model
  tata->save(os);

  // Write out the intergenic model
  Alphabet &alphabet=DnaAlphabet::global();
  os<<"MC\nINTERGENIC\n0\t0\t1\n5"<<endl;
  os<<"A\n"<<intergenicModel[alphabet.lookup('A')]<<endl;
  os<<"C\n"<<intergenicModel[alphabet.lookup('C')]<<endl;
  os<<"G\n"<<intergenicModel[alphabet.lookup('G')]<<endl;
  os<<"N\n"<<intergenicModel[alphabet.lookup('N')]<<endl;
  os<<"T\n"<<intergenicModel[alphabet.lookup('T')]<<endl;
  // Second copy of the intergenic section with A<->T and C<->G lookups
  // swapped (complemented bases) -- presumably the minus-strand model;
  // NOTE(review): confirm against the loader's expectations
  os<<"MC\nINTERGENIC\n0\t0\t1\n5"<<endl;
  os<<"A\n"<<intergenicModel[alphabet.lookup('T')]<<endl;
  os<<"C\n"<<intergenicModel[alphabet.lookup('G')]<<endl;
  os<<"G\n"<<intergenicModel[alphabet.lookup('C')]<<endl;
  os<<"N\n"<<intergenicModel[alphabet.lookup('N')]<<endl;
  os<<"T\n"<<intergenicModel[alphabet.lookup('A')]<<endl;

  // Write out the CAP model
  capModel->save(os);

  // Write out the CAP/intergenic ratio model
  capIntergenicRatioModel->save(os);
}
// Concatenate two BOOM::Strings by delegating to the char* overload of
// operator+ and wrapping the result.
BOOM::String BOOM::String::operator+(const BOOM::String &s) const
{
  BOOM::String joined(*this+s.c_str());
  return joined;
}
// Case-insensitive comparison.  NOTE: mirrors strcasecmp collapsed to
// bool -- returns TRUE when the strings DIFFER, false when they are
// equal (ignoring case).
bool BOOM::String::stricmp(const BOOM::String &str) const
{
  return strcasecmp(c_str(),str.c_str())!=0;
}
// True iff `substring` matches this string starting at offset `pos`.
bool BOOM::String::occursAt(const BOOM::String &substring,int pos) const
{
  const char *here=c_str()+pos;
  return localMatch(substring.c_str(),here,substring.size());
}
// True iff s occurs anywhere within this string.
bool BOOM::String::contains(const BOOM::String &s) const
{
  return npos!=find(s.c_str());
}
// Open `filename` for writing and delegate to the stream-based save().
void NmerRateMatrix::save(const BOOM::String &filename)
{
  ofstream stream(filename.c_str());
  save(stream);
}
// Append a FASTA record to `filename`, preserving any records already
// present (the stream is opened in append mode).
void FastaWriter::appendToFasta(const BOOM::String &defline,
                                const BOOM::String &sequence,
                                const BOOM::String &filename)
{
  ofstream out(filename.c_str(),std::ios::app);
  addToFasta(defline,sequence,out);
}
// Convenience overload: forward the sequence as a raw C string to the
// char* overload of addToFasta.
void BOOM::FastaWriter::addToFasta(const BOOM::String &defline,
                                   const BOOM::String &sequence,
                                   ostream &os)
{
  addToFasta(defline,sequence.c_str(),os);
}
// True iff the window of length consensusLength beginning at `index`
// in str matches one of the registered consensus sequences.
bool SignalSensor::consensusOccursAt(const BOOM::String &str,int index)
{
  const char *raw=str.c_str();
  return consensuses.isDefined(raw,index,consensusLength);
}