Beispiel #1
0
// Computes the entropy of the single-character distribution of a sequence.
//
// Each character of `str` is tallied in a string-keyed map (keys of length
// 1) and the result is -sum(p*lg(p)) over the observed frequencies.
//
// str        : sequence to measure
// maxEntropy : (output) -lg(1/k), where k is the number of distinct
//              characters observed in `str`
// returns    : the entropy, clamped so that it never exceeds maxEntropy
//
// An empty sequence contains no symbols; in that case both the return
// value and maxEntropy are 0 (the unguarded formula would evaluate
// -lg(1.0/0) and yield a non-finite result).
double BOOM::SequenceEntropy::entropy(const BOOM::String &str,
                                      double &maxEntropy)
{
  int len=str.length();
  if(len<=0)
    {
      maxEntropy=0.0;
      return 0.0;
    }

  // Tally every character
  BOOM::StringMap<int> counts(hashTableSize(0));
  const char *p=str.c_str();
  int total=0;
  for(int i=0 ; i<len ; ++i, ++p)
    {
      // isDefined() is consulted first so that a new entry is explicitly
      // initialized to 1 rather than incremented from an unknown value
      if(counts.isDefined(p,1)) 
        ++counts.lookup(p,1);
      else 
        counts.lookup(p,1)=1;
      ++total;
    }

  // Accumulate -p*lg(p) over the observed characters
  double entropy=0;
  StringMapIterator<int> cur=counts.begin(), end=counts.end();
  for(; cur!=end ; ++cur)
    {
      int count=(*cur).second;
      double p=count/double(total);
      entropy-=p*lg(p);
    }

  maxEntropy=-lg(1.0/counts.size());
  if(entropy>maxEntropy) entropy=maxEntropy;
  return entropy;
}
Beispiel #2
0
// Computes the entropy of the distribution of (order+1)-character windows
// taken at every position of a sequence.
//
// str        : sequence to measure
// order      : the windows examined are order+1 characters long
// maxEntropy : (output) -lg(1/k), where k is the number of distinct
//              windows observed
// returns    : -sum(p*lg(p)) over the observed windows, clamped so that
//              it never exceeds maxEntropy
//
// Throws a string message when order+1 is not smaller than the sequence
// length.
double BOOM::SequenceEntropy::jointEntropy(const BOOM::String &str,
                                           int order,
                                           double &maxEntropy)
{
  int seqLen=str.length();
  int windowLen=order+1;
  if(windowLen>=seqLen) 
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;

  // Slide a window over the sequence, tallying each distinct window
  BOOM::StringMap<int> windowCounts(hashTableSize(order));
  const char *window=str.c_str();
  const char *stop=window+(seqLen-windowLen+1);
  int numObserved=0;
  while(window<stop)
    {
      if(!windowCounts.isDefined(window,windowLen))
        windowCounts.lookup(window,windowLen)=1;
      else
        ++windowCounts.lookup(window,windowLen);
      ++numObserved;
      ++window;
    }

  // Sum -p*lg(p) over the distinct windows
  double sum=0;
  StringMapIterator<int> it=windowCounts.begin(), last=windowCounts.end();
  while(it!=last)
    {
      int occurrences=(*it).second;
      double freq=occurrences/double(numObserved);
      sum-=freq*lg(freq);
      ++it;
    }

  maxEntropy=-lg(1.0/windowCounts.size());
  return sum>maxEntropy ? maxEntropy : sum;
}
Beispiel #3
0
// Writes one sequence to a FASTA file, creating the file (or truncating
// an existing one, per the default ofstream open mode).
//
// defline  : definition line for the record
// sequence : sequence text
// filename : path of the file to write
//
// Throws BOOM::String if the file cannot be opened for writing, instead
// of silently writing nothing.
void BOOM::FastaWriter::writeFasta(const BOOM::String &defline,
                                   const BOOM::String &sequence,
                                   const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good()) throw BOOM::String("Error creating file ")+filename+
                   " in BOOM::FastaWriter::writeFasta()";
  addToFasta(defline,sequence.c_str(),os);
}
Beispiel #4
0
// Loads this sensor from a stream.
//
// Fields are consumed in this order: signal type, cutoff (read as text and
// converted with asDouble()), strand, consensus offset, a tag followed by
// the WWAM submodel, and a tag followed by the WAM submodel.
void BranchAcceptor::load(istream &is)
{
  // Header fields
  SignalType type;
  is >> type;

  BOOM::String cutoffText;
  is >> cutoffText;
  double threshold=cutoffText.asDouble();

  Strand whichStrand;
  is >> whichStrand;

  int offset;
  is >> offset;

  setSignalType(type);
  setStrand(whichStrand);
  setCutoff(threshold);

  // Each submodel is preceded by a tag that is read and discarded
  BOOM::String tag;
  is >> tag;   // will always be "WWAM"
  branchPoint=new WWAM(getGC(),is);
  is >> tag;   // will always be "WAM"
  acceptor=new WAM(getGC(),is);

  // The combined context window spans both submodels
  int branchWindow=branchPoint->getContextWindowLength();
  int acceptorWindow=acceptor->getContextWindowLength();
  setSizes(2,offset,branchWindow+acceptorWindow);
}
// Returns the concatenation of all exon sequences, with a leading 'M'
// removed when the concatenated sequence starts with one.
BOOM::String SignalPeptide::getSequence()
{
  BOOM::String joined;
  int numExons=exons.size();
  int which=0;
  while(which<numExons)
    {
      joined+=exons[which].getSequence();
      ++which;
    }

  // Drop the first character only when it is 'M'
  if(joined[0]!='M') return joined;
  joined=joined.substring(1,joined.length()-1);
  return joined;
}
Beispiel #6
0
// Records the stretch of sequence that follows a forward-strand feature.
//
// The half-open interval [featureEnd, featureEnd+margin) is extracted from
// `seq`, truncated at the end of the sequence, and appended to `margins`.
//
// featureEnd : index in `seq` at which the margin starts
// seq        : sequence to extract from
void Application::processForwardFeature(int featureEnd,
                                        const BOOM::String &seq)
{
  int begin=featureEnd;
  int end=begin+margin;
  int len=seq.length();
  // `end` is exclusive, so the largest legal value is len itself; clamping
  // to len-1 would always drop the final base of the sequence
  if(end>len) end=len;
  BOOM::String subseq=seq.substr(begin,end-begin);
  margins.push_back(subseq);
}
Beispiel #7
0
// Registers a consensus string with this sensor.
//
// All consensuses must have the same length: once a positive length has
// been recorded, adding a string of a different length throws a string
// message.
void SignalSensor::addConsensus(const BOOM::String &s)
{
  int newLength=s.length();
  bool lengthAlreadySet=(consensusLength>0);
  if(lengthAlreadySet && newLength!=consensusLength)
    throw BOOM::String(
       "Consensus lengths differ in SignalSensor::addConsensus");

  consensusLength=newLength;

  // The mapped value is only a presence marker
  consensuses.lookup(s.c_str(),newLength)=char(1);
}
// Writes the model file.
//
// Output, in order:
//   the line "SignalPeptide"
//   the lines of startCodonModelFile, copied one by one
//   numFields
//   for each field: its fieldLength, the number of codon entries, then one
//   "<codon> <log(acidP*codonP)>" line per codon, where acidP comes from
//   the field's aminoAcidFreqs and codonP from codonFreqs
//
// filename            : file to create
// startCodonModelFile : file whose lines are copied into the output
//
// Throws BOOM::String if either file cannot be opened.
void Application::generateModel(const BOOM::String &filename,
                                const BOOM::String &startCodonModelFile)
{
  // Open both files before writing anything, so that a missing input does
  // not leave a partially written output behind
  ofstream os(filename.c_str());
  if(!os.good()) throw BOOM::String("Error creating file ")+filename+
                   " in Application::generateModel()";
  ifstream is(startCodonModelFile.c_str());
  if(!is.good()) throw BOOM::String("Error opening file ")+
                   startCodonModelFile+
                   " in Application::generateModel()";

  os<<"SignalPeptide"<<endl;

  // Copy the start codon model into the file.  The loop tests good()
  // rather than !eof() so that a stream stuck in a failed state cannot
  // spin forever.
  BOOM::String line;
  while(is.good())
    {
      line.getline(is);
      if(is.eof()) break;
      os<<line<<endl;
    }

  // Write out each field separately
  os<<numFields<<endl;
  for(int fieldNum=0 ; fieldNum<numFields ; ++fieldNum)
    {
      Field &field=*fields[fieldNum];

      // Count the number of codons (you just never know...)
      BOOM::Map<char,float>::iterator aCur=field.aminoAcidFreqs.begin(),
        aEnd=field.aminoAcidFreqs.end();
      numCodons=0;
      for(; aCur!=aEnd ; ++aCur)
        {
          char acid=(*aCur).first;
          // Bound by reference: copying the whole map just to ask for its
          // size is unnecessary
          BOOM::Map<BOOM::String,float> &codons=codonFreqs[acid];
          numCodons+=codons.size();
        }

      os<<field.fieldLength<<endl;
      os<<numCodons<<endl;

      // Emit one line per (amino acid, codon) pair
      aCur=field.aminoAcidFreqs.begin();  aEnd=field.aminoAcidFreqs.end();
      for(; aCur!=aEnd ; ++aCur)
        {
          char acid=(*aCur).first;
          float &acidP=(*aCur).second;
          BOOM::Map<BOOM::String,float> &codons=codonFreqs[acid];
          BOOM::Map<BOOM::String,float>::iterator cur=codons.begin(),
            end=codons.end();
          for(; cur!=end ; ++cur)
            {
              const BOOM::String &codon=(*cur).first;
              float codonP=(*cur).second;
              float logP=log(acidP*codonP);
              os<<codon<<" "<<logP<<endl;
            }
        }
    }
}
// Adds the codons of a transcript to the per-amino-acid codon counts.
//
// The transcript is read in consecutive 3-character steps starting at
// index 0.  Each codon is mapped to an amino acid with
// BOOM::ProteinTrans::mapCodon() and its counter in codonFreqs[acid] is
// set to 1 on first sight or incremented otherwise.
//
// Trailing characters that do not form a complete codon are ignored.
void Application::updateCodonFreqs(const BOOM::String &transcript)
{
  const char *p=transcript.c_str();
  int len=transcript.length();
  // i+3<=len guarantees that all three characters of the codon lie inside
  // the string; with i<len a length that is not a multiple of 3 would read
  // past the end of the buffer
  for(int i=0 ; i+3<=len ; i+=3, p+=3)
    {
      BOOM::String codon(p,3);
      char acid=BOOM::ProteinTrans::mapCodon(codon.c_str());
      BOOM::Map<BOOM::String,float> &counts=codonFreqs[acid];
      if(!counts.isDefined(codon)) counts[codon]=1;
      else ++counts[codon];
    }
}
Beispiel #10
0
// Saves the model to the named file by delegating to the stream overload
// of save().
//
// returns : whatever save(ostream&) returns
// Throws a string message if the file cannot be created.
bool IMM::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  // Leading space added so the filename is not fused with "in" in the
  // message
  if(!os.good()) throw BOOM::String("Error creating file ")+filename+
                   " in IMM::save()";
  return save(os);
}
Beispiel #11
0
// Reads the next record from the FASTA file.
//
// defline  : (output) the definition line of the record
// sequence : (output) the record's sequence lines, each trimmed of
//            whitespace and concatenated, then upper-cased and passed
//            through maskStrangeChars()
// returns  : false when the file is already at end-of-file, or reaches it
//            while the definition line is being obtained; true otherwise
//
// The definition line of the following record is necessarily read while
// collecting the current sequence; it is kept in `cache` and consumed by
// the next call.
bool BOOM::FastaReader::nextSequence(BOOM::String &defline,
                                     BOOM::String &sequence)
{
  // Obtain the definition line: from the cache if one was read ahead,
  // otherwise from the file
  if(file.eof()) return false;
  if(cache.length()==0)
    defline=file.readLine();
  else
    {
      defline=cache;
      cache="";
    }
  if(file.eof()) return false;

  // Collect sequence lines until the next definition line or end-of-file
  sequence="";
  for(;;)
    {
      if(file.eof()) break;
      BOOM::String line=file.readLine();
      if(line[0]=='>')
        {
          cache=line;   // belongs to the next record
          break;
        }
      line.trimWhitespace();
      sequence+=line;
    }

  sequence.toupper();
  maskStrangeChars(sequence);
  return true;
}
Beispiel #12
0
// Saves the model to the named file by delegating to the stream overload
// of save().
//
// returns : whatever save(ostream&) returns
// Throws a string message if the file cannot be created.
bool ThreePeriodicMarkovChain::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  // Leading space added so the filename is not fused with "in" in the
  // message
  if(!os.good()) throw BOOM::String("Error creating file ")+filename+
                   " in ThreePeriodicMarkovChain::save()";
  return save(os);
}
// Writes the given scores to a file, one value per line, in the order in
// which they appear in `scores`.
//
// scores   : values to write
// filename : file to create
//
// Throws BOOM::String if the file cannot be created, instead of silently
// discarding the output.
void Application::writeHistogramFile(BOOM::Vector<double> &scores,
                                     const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  if(!os.good()) throw BOOM::String("Error creating file ")+filename+
                   " in Application::writeHistogramFile()";
  BOOM::Vector<double>::iterator cur=scores.begin(), end=scores.end();
  for(; cur!=end ; ++cur)
    os<<*cur<<endl;
}
Beispiel #14
0
// Saves the sensor to the named file by delegating to the stream overload
// of save().
//
// returns : whatever save(ostream&) returns
// Throws a string message if the file cannot be created.
bool BranchAcceptor::save(const BOOM::String &filename)
{
  ofstream os(filename.c_str());
  // The message now names this class (it used to say "BranchPoint") and
  // has a space separating it from the filename
  if(!os.good())
    throw BOOM::String("Error creating file ")+filename+
      " in BranchAcceptor::save()";
  return save(os);
}
Beispiel #15
0
// Looks up the score of the base at `index`, using the longest context
// for which a model entry exists.
//
// Orders are tried from the highest usable one down to 0; the value of
// the first defined entry is returned.  On the plus strand the context is
// the `order` characters before `index`; on the minus strand it is the
// `order` characters after it.  The usable order is limited by N and by
// the number of characters available on the relevant side.
//
// Only `str` and `index` are consulted; `seq`, `s` and `c` are not used
// here.
//
// Throws when no order yields a defined entry, or when the strand is
// neither PLUS_STRAND nor MINUS_STRAND.
double IMM::scoreSingleBase(const Sequence &seq,const BOOM::String &str,
                            int index,Symbol s,char c)
{
  const char *bases=str.c_str();
  switch(getStrand())
    {
    case PLUS_STRAND:
      {
        // No more context than the characters preceding index
        int maxOrder=index;
        if(maxOrder>N) maxOrder=N;

        int order=maxOrder;
        while(order>=0)
          {
            BOOM::StringMap<double> &model=*(*models)[order];
            int start=index-order, gramLen=order+1;
            if(model.isDefined(bases,start,gramLen))
              return model.lookup(bases,start,gramLen);
            --order;
          }
        throw BOOM::String("IMM::scoreSingleBase('+',")+
          index+",strlen="+strlen(bases)+",str="+
          str.substring(index,maxOrder)+")";
      }

    case MINUS_STRAND:
      {
        /*
          On the minus strand we have to take our contexts from the
          right (but only because we trained the model that way)
         */
        int seqLen=str.length();
        int maxOrder=seqLen-index-1;
        if(maxOrder>N) maxOrder=N;

        int order=maxOrder;
        while(order>=0)
          {
            BOOM::StringMap<double> &model=*(*models)[order];
            int gramLen=order+1;
            if(model.isDefined(bases,index,gramLen)) 
              return model.lookup(bases,index,gramLen);
            --order;
          }
        throw BOOM::Stacktrace(
          BOOM::String("IMM::scoreSingleBase('-',")+
            index+",strlen="+strlen(bases)+",str="+
          str.substring(index,maxOrder)+")");
      }

    default: throw BOOM::String(__FILE__)+__LINE__;
    }
}
Beispiel #16
0
// Records the stretch of sequence that precedes a reverse-strand feature.
//
// The half-open interval [featureBegin-margin, featureBegin), truncated at
// the start of the sequence, is extracted from `seq`, reverse-complemented
// and appended to `margins`.
void Application::processReverseFeature(int featureBegin,
                                        const BOOM::String &seq)
{
  int marginStart=featureBegin-margin;
  if(marginStart<0) marginStart=0;
  int marginLen=featureBegin-marginStart;

  BOOM::String flipped=
    BOOM::ProteinTrans::reverseComplement(seq.substr(marginStart,marginLen));
  margins.push_back(flipped);
}
Beispiel #17
0
// Constructs the model from a file.
//
// The first token of the file must be "3P"; the rest of the stream is
// handed to load().  Throws a string message if the file cannot be opened
// or carries a different type token.
ThreePeriodicMarkovChain::ThreePeriodicMarkovChain(const BOOM::String &
                                                   filename)
{
  ifstream modelFile(filename.c_str());
  if(!modelFile.good())
    {
      throw BOOM::String("Error opening file ")+filename
        +" in ThreePeriodicMarkovChain()";
    }

  // The first token names the model type stored in the file
  BOOM::String typeTag;
  modelFile >> typeTag;
  if(typeTag!="3P")
    {
      throw BOOM::String("Attempt to load an object of type ")+typeTag+
        " into a ThreePeriodicMarkovChain (3P)";
    }

  load(modelFile);  
}
Beispiel #18
0
// Computes the entropy of the last character of each (order+1)-character
// window, conditioned on the `order` characters that precede it.
//
// Every window in `str` is tallied, together with its prefix of length
// `order`.  The result is -sum(p(window)*lg(count(window)/count(prefix)))
// over the distinct windows.
//
// Throws a C-string message when order<1, and a string message when
// order+1 is not smaller than the sequence length.
double BOOM::SequenceEntropy::conditionalEntropy(const BOOM::String &str,
                                                 int order)
{
  if(order<1) 
    throw "BOOM::SequenceEntropy::conditionalEntropy() : order<1";

  int seqLen=str.length();
  int windowLen=order+1;
  if(windowLen>=seqLen) 
    throw BOOM::String("Order ")+order+
      " is too large for sequence of length "+seqLen;
  int prefixLen=windowLen-1;

  // Tally each window and, separately, each window's prefix
  BOOM::StringMap<int> windowCounts(hashTableSize(order));
  BOOM::StringMap<int> prefixCounts(hashTableSize(order-1));
  const char *window=str.c_str();
  const char *stop=window+(seqLen-windowLen+1);
  int numObserved=0;
  while(window<stop)
    {
      if(!windowCounts.isDefined(window,windowLen))
        windowCounts.lookup(window,windowLen)=1;
      else
        ++windowCounts.lookup(window,windowLen);

      if(!prefixCounts.isDefined(window,prefixLen))
        prefixCounts.lookup(window,prefixLen)=1;
      else
        ++prefixCounts.lookup(window,prefixLen);

      ++numObserved;
      ++window;
    }

  // Weight the log of each conditional probability by the window's own
  // frequency
  double sum=0;
  StringMapIterator<int> it=windowCounts.begin(), last=windowCounts.end();
  while(it!=last)
    {
      int occurrences=(*it).second;
      const char *key=(*it).first;
      double jointP=occurrences/double(numObserved);
      double givenPrefixP=
        occurrences/double(prefixCounts.lookup(key,prefixLen));
      sum-=jointP*lg(givenPrefixP);
      ++it;
    }
  return sum;
}
Beispiel #19
0
// Constructs the model from a file.
//
// The first token of the file must be "IMM"; the rest of the stream is
// handed to load().  Throws a string message if the file cannot be opened
// or carries a different type token.
IMM::IMM(const BOOM::String &filename)
  : revComp(NULL), models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  ifstream modelFile(filename.c_str());
  if(!modelFile.good())
    {
      throw BOOM::String("Error opening file ")+filename
        +" in IMM::IMM()";
    }

  // The first token names the model type stored in the file
  BOOM::String typeTag;
  modelFile >> typeTag;
  if(typeTag!="IMM")
    {
      throw BOOM::String("Attempt to load an object of type ")+typeTag+
        " into an IMM";
    }

  load(modelFile);
}
Beispiel #20
0
// Returns a copy of this string in which every occurrence of `from` has
// been replaced by `to`.
//
// The string is scanned left to right; after a match, scanning resumes
// immediately after the matched text, so occurrences do not overlap and
// inserted text is not rescanned.
//
// from : pattern to search for; when empty, there is nothing to replace
//        and an unmodified copy is returned
// to   : replacement text
BOOM::String BOOM::String::substitute(const BOOM::String &from,
                                      const BOOM::String &to) const
{
  const int patternLen=from.length();
  // An empty pattern would never advance the scan position below
  if(patternLen==0) return *this;

  BOOM::String rval;
  const char *pattern=from.c_str();
  const char *ptr=c_str();
  const char *end=ptr+length();

  // Scan while a whole pattern still fits in what remains.  Comparing
  // distances (rather than against end-patternLen) avoids forming a
  // pointer before the start of the buffer when the pattern is longer
  // than the string.
  while(end-ptr>=patternLen) {
    if(localMatch(ptr,pattern,patternLen)) {
      ptr+=patternLen;
      rval+=to;
    }
    else {
      rval+=*ptr;
      ptr++;
    }
  }

  // Copy the tail, which is too short to contain the pattern
  for(; *ptr ; ++ptr) rval+=*ptr;
  return rval;
}
Beispiel #21
0
// Loads the distribution from a file of whitespace-separated (x, y)
// pairs, x being read as an unsigned integer and y as a double.
//
// One EmpiricalDistributionElement is appended to `v` per pair, and
// binSize is set to the difference between the first two x values.
// Reading stops at the first pair that cannot be read completely.
//
// Throws a string message if the file cannot be opened or if fewer than
// two elements are available to derive binSize from.
void EmpiricalDistribution::load(const BOOM::String &filename)
{
  ifstream is(filename.c_str());
  if(!is.good()) throw BOOM::String("Error opening file ")+filename+
                   " in EmpiricalDistribution::load()";

  // Testing the extraction itself (rather than eof()) ends the loop on
  // malformed input as well as at end-of-file
  unsigned x;
  double y;
  while(is >> x >> y)
    v.push_back(new EmpiricalDistributionElement(x,y));

  // binSize needs two elements; indexing v[1] without this check would be
  // out of range for short or unreadable files
  if(v.size()<2)
    throw BOOM::String("Fewer than two data points in file ")+filename+
      " in EmpiricalDistribution::load()";
  binSize=v[1]->first-v[0]->first;
}
Beispiel #22
0
// Constructs the sensor from a file.
//
// The first token of the file must be "BranchAcceptor"; the rest of the
// stream is handed to load().  Throws a string message if the file cannot
// be opened or carries a different type token.
BranchAcceptor::BranchAcceptor(GarbageCollector &gc,BOOM::String &filename)
  : SignalSensor(gc),
    branchPoint(NULL),
    acceptor(NULL)
{
  // ctor

  ifstream is(filename.c_str());
  // Message fixed: a space now separates the filename from "in"
  if(!is.good()) throw BOOM::String("Error opening file ")+filename+
                   " in BranchAcceptor::BranchAcceptor()";
  BOOM::String modelType;
  is >> modelType;
  // Message fixed: it used to read "<type>in into a BranchAcceptor"
  if(modelType!="BranchAcceptor") 
    throw BOOM::String("Attempt to load an object of type ")+modelType+
      " into a BranchAcceptor";
  load(is);
}
// Builds the field table from a comma-separated list of field lengths.
//
// One Field is allocated per list entry.  Fields are laid out back to
// back: each starts where the previous one ended and spans three times its
// fieldLength.
void Application::initFields(const BOOM::String &lengthString)
{
  // getFields() hands back a heap-allocated vector that is released at the
  // end of this function
  BOOM::Vector<BOOM::String> &tokens=*lengthString.getFields(",");
  numFields=tokens.size();
  fields.resize(numFields);

  int offset=0;
  int i=0;
  while(i<numFields)
    {
      int fieldLen=tokens[i].asInt();
      Field *field=new Field;
      fields[i]=field;
      field->fieldLength=fieldLen;
      field->begin=offset;
      offset+=fieldLen*3;
      field->end=offset;
      ++i;
    }

  delete &tokens;
}
Beispiel #24
0
// Writes the combined model file.
//
// Output, in order:
//   the line "TataCapModel"
//   minSeparation and maxSeparation, tab-separated
//   the TATA model loaded from tataFile
//   an "MC"/"INTERGENIC" block listing intergenicModel for A, C, G, N, T
//   a second such block in which each label is paired with the value of
//   the complementary base (A<->T, C<->G, N unchanged)
//   the CAP model
//   the CAP/intergenic ratio model
//
// tataFile : file from which the TATA signal sensor is loaded
// outfile  : file to create
//
// Throws BOOM::String if the output file cannot be created, instead of
// silently discarding everything written.
void Application::writeOutput(const BOOM::String &tataFile,
                              const BOOM::String &outfile)
{
  // Load TATA model
  SignalSensor *tata=SignalSensor::load(tataFile,GC);

  // Create output file and write header
  ofstream os(outfile.c_str());
  if(!os.good()) throw BOOM::String("Error creating file ")+outfile+
                   " in Application::writeOutput()";
  os.precision(8);
  os<<"TataCapModel"<<endl;
  os<<minSeparation<<"\t"<<maxSeparation<<endl;
  
  // Write out the TATA model
  tata->save(os);

  // Write out the intergenic model
  Alphabet &alphabet=DnaAlphabet::global();
  os<<"MC\nINTERGENIC\n0\t0\t1\n5"<<endl;
  os<<"A\n"<<intergenicModel[alphabet.lookup('A')]<<endl;
  os<<"C\n"<<intergenicModel[alphabet.lookup('C')]<<endl;
  os<<"G\n"<<intergenicModel[alphabet.lookup('G')]<<endl;
  os<<"N\n"<<intergenicModel[alphabet.lookup('N')]<<endl;
  os<<"T\n"<<intergenicModel[alphabet.lookup('T')]<<endl;

  // Same block again, each label paired with its complement's value
  os<<"MC\nINTERGENIC\n0\t0\t1\n5"<<endl;
  os<<"A\n"<<intergenicModel[alphabet.lookup('T')]<<endl;
  os<<"C\n"<<intergenicModel[alphabet.lookup('G')]<<endl;
  os<<"G\n"<<intergenicModel[alphabet.lookup('C')]<<endl;
  os<<"N\n"<<intergenicModel[alphabet.lookup('N')]<<endl;
  os<<"T\n"<<intergenicModel[alphabet.lookup('A')]<<endl;

  // Write out the CAP model
  capModel->save(os);

  // Write out the CAP/intergenic ratio model
  capIntergenicRatioModel->save(os);
}
Beispiel #25
0
// Reports whether `substring` occurs in this string starting exactly at
// index `pos`.
//
// A position outside the string, or one at which `substring` does not fit
// in the remaining characters, yields false; without this check the
// comparison would start (or run) outside this string's characters.
bool BOOM::String::occursAt(const BOOM::String &substring,int pos) const
{
  const int subLen=substring.size();
  const int myLen=size();
  // Written as subLen>myLen-pos so the test cannot overflow
  if(pos<0 || pos>myLen || subLen>myLen-pos) return false;
  return localMatch(substring.c_str(),c_str()+pos,subLen);
}
Beispiel #26
0
// Reports whether `s` occurs anywhere in this string.  The search is made
// with the C-string form of `s`.
bool BOOM::String::contains(const BOOM::String &s) const
{
  const char *needle=s.c_str();
  return npos!=find(needle);
}
Beispiel #27
0
// Convenience overload: writes one record to `os` by forwarding the
// sequence as a C string to the overload that takes a character pointer.
void BOOM::FastaWriter::addToFasta(const BOOM::String &defline,
                                   const BOOM::String &sequence,
                                   ostream &os)
{
  // Passing a const char* (not the String) is what selects the other
  // overload
  const char *bases=sequence.c_str();
  addToFasta(defline,bases,os);
}
Beispiel #28
0
// Compares this string with `str` using strcasecmp().
//
// returns : the strcasecmp() result converted to bool, i.e. true when
//           strcasecmp() reports a difference and false when it reports
//           the strings as equal
bool BOOM::String::stricmp(const BOOM::String &str) const
{
  const int comparison=strcasecmp(c_str(),str.c_str());
  return comparison!=0;
}
Beispiel #29
0
// Returns the concatenation of this string and `s`.  The right-hand side
// is appended through its C-string form.
BOOM::String BOOM::String::operator+(const BOOM::String &s) const
{
  const char *rhs=s.c_str();
  return BOOM::String(*this+rhs);
}
Beispiel #30
0
// Appends one record to a FASTA file; the file is opened in append mode,
// so existing contents are kept.
//
// defline  : definition line for the record
// sequence : sequence text
// filename : path of the file to append to
//
// Throws BOOM::String if the file cannot be opened for writing, instead
// of silently writing nothing.
void FastaWriter::appendToFasta(const BOOM::String &defline,const 
                                BOOM::String &sequence,const BOOM::String &filename)
{
  ofstream os(filename.c_str(),std::ios::app);
  if(!os.good()) throw BOOM::String("Error opening file ")+filename+
                   " in FastaWriter::appendToFasta()";
  addToFasta(defline,sequence,os);
}