Esempio n. 1
0
/**
 * Populate this record from the alignment the reader currently exposes.
 *
 * Copies the alignment object itself, the chromosome name/id, start and end
 * coordinates (kept both numerically and as strings), name, score, strand,
 * mapped/mate-mapped flags, CIGAR string, and the mate/insert/sequence fields.
 *
 * @param bamFileReader reader to pull the current alignment's fields from.
 * @return always true.
 */
bool BamRecord::initFromFile(BamFileReader *bamFileReader)
{
	setFileIdx(bamFileReader->getFileIdx());
	_bamAlignment = bamFileReader->getAlignment();

	// Chromosome identity.
	bamFileReader->getChrName(_chrName);
	_bamChromId = bamFileReader->getCurrChromdId();

	// Coordinates: numeric values first, then their string renderings.
	_startPos = bamFileReader->getStartPos();
	_endPos = bamFileReader->getEndPos();
	int2str(_startPos, _startPosStr);
	int2str(_endPos, _endPosStr);

	bamFileReader->getName(_name);
	bamFileReader->getScore(_score);

	const char readerStrand = bamFileReader->getStrand();
	setStrand(readerStrand);

	// The record stores the negated "mapped" flags of the read and its mate.
	const bool selfMapped = bamFileReader->getAlignment().IsMapped();
	const bool mateMapped = bamFileReader->getAlignment().IsMateMapped();
	_isUnmapped = !selfMapped;
	_isMateUnmapped = !mateMapped;

	// Render the CIGAR data as a string.
	buildCigarStr();

	// Mate / insert information and the raw sequence + quality strings are
	// taken from the alignment copy stored above.
	bamFileReader->getMateChrName(_mateChrName);
	int2str(_bamAlignment.MatePosition, _matePos);
	int2str(_bamAlignment.InsertSize, _insertSize);
	_queryBases = _bamAlignment.QueryBases;
	_qualities = _bamAlignment.Qualities;

	return true;
}
Esempio n. 2
0
/*
  Restore this sensor from a stream.

  Fields are consumed in this order: signal type, cutoff (read as text and
  converted with asDouble()), strand, consensus offset, a label token followed
  by the WWAM branch-point model, and a label token followed by the WAM
  acceptor model.  The context window length passed to setSizes() is the sum
  of the two submodels' context window lengths.
 */
void BranchAcceptor::load(istream &is)
{
  // Header fields, in serialized order
  SignalType signalType;
  is >> signalType;

  BOOM::String cutoffText;
  is >> cutoffText;
  const double cutoff=cutoffText.asDouble();

  Strand strand;
  is >> strand;

  int consensusOffset;
  is >> consensusOffset;

  setSignalType(signalType);
  setStrand(strand);
  setCutoff(cutoff);

  // Each submodel is preceded by a label token, which is read and discarded
  BOOM::String modelLabel;
  is >> modelLabel; // will always be "WWAM"
  branchPoint=new WWAM(getGC(),is);
  is >> modelLabel; // will always be "WAM"
  acceptor=new WAM(getGC(),is);

  const int branchWindow=branchPoint->getContextWindowLength();
  const int acceptorWindow=acceptor->getContextWindowLength();
  setSizes(2,consensusOffset,branchWindow+acceptorWindow);
}
Esempio n. 3
0
/**
 * Populate this record from the line the reader currently holds.
 *
 * Field indices used: 0 chromosome, 1 source, 2 name, 3 start, 4 end,
 * 5 score, 6 strand, 7 frame, and 8 group (only read when the line has
 * exactly nine fields).
 *
 * @param fileReader reader positioned on the line to parse.
 * @return always true.
 */
bool GffRecord::initFromFile(SingleLineDelimTextFileReader *fileReader)
{
	fileReader->getField(0, _chrName);
	_chrId = fileReader->getCurrChromdId();

	// The start coordinate in the file is one-based. Only the integer is
	// decremented, so that it matches the 0-based convention of the other
	// record types; the string keeps the original one-based text because the
	// print methods output it unchanged.
	fileReader->getField(3, _startPosStr);
	_startPos = str2chrPos(_startPosStr);
	--_startPos;

	// The end coordinate is parsed straight from its field and not adjusted.
	fileReader->getField(4, _endPosStr);
	_endPos = str2chrPos(_endPosStr);

	fileReader->getField(2, _name);
	fileReader->getField(1, _source);
	fileReader->getField(5, _score);

	// GFF permits '.' in the strand column, meaning the strand is unknown.
	char strandField = 0;
	fileReader->getField(6, strandField);
	setStrand(strandField);

	fileReader->getField(7, _frame);

	// The group column is only read when the line has exactly nine fields.
	_numFields = fileReader->getNumFields();
	const bool hasGroupField = (_numFields == 9);
	if (hasGroupField) {
		fileReader->getField(8, _group);
	}

	return true;
}
Esempio n. 4
0
File: IMM.C Progetto: bmajoros/EGGS
/*
  Training constructor.

  v             : training sequences handed to buildModels()
  order         : stored as N
  minSampleSize : forwarded to buildModels()
  phase         : stored as-is
  contentType   : content type of this model
  strand        : EITHER_STRAND is replaced by ::getStrand(contentType)

  When the resulting strand is FORWARD_STRAND, a second IMM is trained on the
  sequences produced by revCompSeqs() with the reverse-complemented content
  type on REVERSE_STRAND, and the two models are linked to each other through
  their revComp pointers.
 */
IMM::IMM(BOOM::Vector<TrainingSequence*> &v,int order,
	 int minSampleSize,int phase,ContentType contentType,
	 Strand strand)
  : N(order),
    alphabetSize(alphabet.getNumElements()),
    phase(phase),
    revComp(NULL),
    models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  setContentType(contentType);
  if(strand==EITHER_STRAND)
    {
      strand=::getStrand(contentType);
    }
  setStrand(strand);

  buildModels(v,minSampleSize);

  // Only a forward-strand model gets a companion; the companion is built
  // with REVERSE_STRAND, so it stops here and the recursion ends.
  if(strand!=FORWARD_STRAND) return;

  // NOTE(review): ownership of the elements revCompSeqs() puts into this
  // vector is not visible here -- confirm they are released elsewhere.
  BOOM::Vector<TrainingSequence*> reversedSeqs;
  revCompSeqs(v,reversedSeqs);
  IMM *companion=new IMM(reversedSeqs,order,minSampleSize,phase,
			 ::reverseComplement(contentType),
			 REVERSE_STRAND);
  companion->revComp=this;
  revComp=companion;
}
Esempio n. 5
0
/*
  Training constructor: records the content type, fixes the strand to
  PLUS_STRAND, and builds the models from the training sequences.
 */
ThreePeriodicMarkovChain::ThreePeriodicMarkovChain(
    BOOM::Vector<TrainingSequence*> &sequences,
    int order,
    int minSampleSize,
    ContentType contentType)
{
  setContentType(contentType);
  setStrand(PLUS_STRAND);

  // Note the argument order: buildModels() takes minSampleSize before order.
  buildModels(sequences,minSampleSize,order);
}
Esempio n. 6
0
/*
  Restore the three component chains from a stream.  Each chain is preceded
  by one token, which is read and discarded.  The content type is taken from
  the first chain, and the strand is derived from that content type.
 */
void ThreePeriodicMarkovChain::load(istream &is)
{
  BOOM::String label;
  int chainIdx=0;
  while(chainIdx<3)
    {
      is >> label; // token preceding the chain; value is not used
      chains[chainIdx]=new MarkovChain(is);
      ++chainIdx;
    }

  const ContentType loadedType=chains[0]->getContentType();
  setContentType(loadedType);
  setStrand(::getStrand(loadedType));
}
Esempio n. 7
0
File: IMM.C Progetto: bmajoros/EGGS
/*
  Copy constructor.

  Deep-copies the N+1 per-order models of `other`, then copies its content
  type and strand.  The reverse-complement link is rebuilt rather than
  shared:
    - an INTERGENIC model is its own reverse complement;
    - otherwise a forward-strand model that has a companion gets a fresh
      copy of that companion, and the copy is pointed back at this object,
      mirroring what the training constructor does (revComp->revComp=this).
      Previously the copied companion was left with a NULL revComp.
    - a reverse-strand, non-INTERGENIC copy keeps revComp==NULL; the strand
      test is what stops the two constructors from recursing into each other.
 */
IMM::IMM(const IMM &other)
  : N(other.N), phase(other.phase), alphabetSize(other.alphabetSize),
    revComp(NULL), models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  for(int i=0 ; i<=N ; ++i)
    models->push_back(new BOOM::StringMap<double>(*(*other.models)[i]));
  setContentType(other.getContentType());
  setStrand(other.getStrand());
  if(getContentType()==INTERGENIC)
    revComp=this;
  else if(other.revComp && getStrand()==FORWARD_STRAND)
    {
      revComp=new IMM(*other.revComp);
      // Keep the pair mutually linked, as the training constructor does
      revComp->revComp=this;
    }
}
Esempio n. 8
0
/*
  Training constructor.

  Each training sequence is split into a leading window of
  branchContextLength symbols (used to train the WWAM branch-point sensor)
  and the remaining symbols (used to train the WAM acceptor sensor).  The
  overall sensor length is taken from the first training sequence.

  gc                  : passed to the SignalSensor base and both submodels
  seqs                : training sequences; must not be empty
  branchPointOrder    : order given to the WWAM
  acceptorOrder       : order given to the WAM
  branchContextLength : number of leading symbols given to the WWAM
  minSampleSize       : forwarded to both submodels
  consensusOffset     : consensus offset within the full sensor window
  signalType          : signal type of this sensor and both submodels

  Throws BOOM::String if seqs is empty.
 */
BranchAcceptor::BranchAcceptor(GarbageCollector &gc,
			       BOOM::Vector<TrainingSequence*> &seqs,
			       int branchPointOrder,
			       int acceptorOrder,
			       int branchContextLength,
			       int minSampleSize,
			       int consensusOffset, 
			       SignalType signalType)
  : SignalSensor(gc),
    branchPoint(NULL),
    acceptor(NULL)
{
  // seqs[0] is dereferenced below, so an empty training set must be
  // rejected up front instead of indexing past the end of the vector
  if(seqs.size()==0)
    throw BOOM::String("no training sequences in BranchAcceptor");

  // Misc. initialization
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);
  int sensorLength=seqs[0]->getLength();
  int acceptorContextLength=sensorLength-branchContextLength;
  setSizes(2,consensusOffset,sensorLength);   

  // Split training sequences into branch point windows and acceptor
  // windows.  The loop locals are deliberately NOT named branchPoint /
  // acceptor, so they cannot be confused with the members of that name.
  BOOM::Vector<TrainingSequence*> branchPoints, acceptors;
  BOOM::Vector<TrainingSequence*>::iterator cur=seqs.begin(),
    end=seqs.end();
  for(; cur!=end ; ++cur)
    {
      TrainingSequence &S=**cur;
      TrainingSequence *branchWindow=new TrainingSequence();
      TrainingSequence *acceptorWindow=new TrainingSequence();
      S.getSubsequence(0,branchContextLength,*branchWindow);
      S.getSubsequence(branchContextLength,acceptorContextLength,
		       *acceptorWindow);
      branchPoints.push_back(branchWindow);
      acceptors.push_back(acceptorWindow);
    }

  // Train the branch point sensor & the acceptor sensor.  The acceptor's
  // consensus offset is expressed relative to the start of its own window.
  acceptor=new WAM(gc,acceptors,acceptorOrder,minSampleSize,
		   signalType,consensusOffset-branchContextLength,2);
  branchPoint=new WWAM(gc,branchPoints,branchPointOrder,minSampleSize,
		       signalType,0,0);

  // Delete the training subsequences
  cur=branchPoints.begin(); end=branchPoints.end();
  for(; cur!=end ; ++cur) delete *cur;
  cur=acceptors.begin(); end=acceptors.end();
  for(; cur!=end ; ++cur) delete *cur;
}
Esempio n. 9
0
 // Sets start, stop and strand in one call.  The (start, stop) pair is
 // checked with inRange() via assert, so the check disappears when NDEBUG
 // is defined.
 void Interval::setInterval(Domain start, Domain stop, int str)
 {
   assert(inRange(start, stop));

   setStart(start);
   setStop(stop);
   setStrand(str);
 }
Esempio n. 10
0
File: IMM.C Progetto: bmajoros/EGGS
/*
  Stream constructor: starts with no reverse-complement model and an empty
  model vector, records the requested strand, then delegates to load().
 */
IMM::IMM(istream &is,Strand strand)
  : revComp(NULL),
    models(new BOOM::Vector<BOOM::StringMap<double>*>)
{
  // The strand is set before load() runs
  setStrand(strand);
  load(is);
}
Esempio n. 11
0
/*
  Constructor that only records the strand and content type; nothing else
  is done in this constructor's body.
 */
ThreePeriodicMarkovChain::ThreePeriodicMarkovChain(
    Strand strand,
    ContentType contentType)
{
  setStrand(strand);
  setContentType(contentType);
}
Esempio n. 12
0
File: BWM.C Progetto: bmajoros/EGGS
/*
  Training constructor for the BWM.

  Builds a position-specific weight matrix from the training sequences, but
  at every position whose symbol counts are not significantly different from
  the background composition (binomial test, P > alpha) the background
  probabilities are used instead of the observed frequencies.

  gc              : passed to the WMM base class
  sequences       : training sequences; all must have the same length and
                    the vector must not be empty
  signalType      : signal type recorded in the base class
  consensusOffset : forwarded to setSizes()
  consensusLength : forwarded to setSizes()
  gcContent       : G+C fraction; G and C each get gcContent/2, A and T each
                    get (1-gcContent)/2, every other symbol gets 0
  alpha           : significance threshold for the per-position test

  Throws BOOM::String if there are no training sequences or if their
  lengths differ.

  Side effect: prints one line to cout for every position at which the
  observed frequencies are kept.
 */
BWM::BWM(GarbageCollector &gc,BOOM::Vector<TrainingSequence*> &sequences,
	 SignalType signalType,int consensusOffset,int consensusLength,
	 float gcContent,float alpha)
  : WMM(gc)
{
  // sequences[0] is dereferenced below to obtain the window length, so an
  // empty training set has to be rejected before anything else happens
  int n=sequences.size();
  if(n==0) throw BOOM::String("no training sequences in BWM");

  // Set some things in the base class
  setStrand(FORWARD_STRAND);
  setSignalType(signalType);

  // Compute background single-nucleotide probabilities
  float atContent=1-gcContent;
  float AT=atContent/2, GC=gcContent/2;
  BOOM::Array1D<float> background(alphabet.getNumElements());
  background.setAllTo(0);
  Symbol A=alphabet.lookup('A'), T=alphabet.lookup('T');
  Symbol C=alphabet.lookup('C'), G=alphabet.lookup('G');
  background[A]=AT;
  background[T]=AT;
  background[C]=GC;
  background[G]=GC;

  // Allocate the underlying WMM matrix
  float pseudocount=0;
  int len=sequences[0]->getLength();
  setSizes(consensusLength,consensusOffset,len);
  int nAlpha=alphabet.getNumElements();
  matrix.resize(len,nAlpha);
  matrix.setAllTo(pseudocount);

  // Iterate through the training sequences & collect counts
  BOOM::FloatArray1D effectiveSize(len);
  effectiveSize.setAllTo(0);
  for(int i=0 ; i<n ; ++i)
    {
      TrainingSequence &seq=*sequences[i];
      int l=seq.getLength();
      if(l!=len) throw BOOM::String("length mismatch in BWM: ")+len+" vs "+l;
      // The boost count belongs to the sequence, not to a position, so it
      // is fetched once per sequence
      int count=seq.getBoostCount();
      for(int pos=0 ; pos<len ; ++pos)
	{
	  Symbol s=seq[pos];
	  matrix[pos][s]+=count;
	  effectiveSize[pos]+=count;
	}
    }

  // Perform a binomial test at each position to see if frequencies
  // are significantly different from the background frequencies.
  // NOTE(review): the int(x+5/9.0) conversions round with an offset of 5/9
  // rather than 1/2; kept exactly as found -- confirm this is intentional.
  numSignificantPositions=0;
  for(int pos=0 ; pos<len ; ++pos)
    {
      // First, we have to identify the most extreme count
      int mostExtremeDiff=0;
      Symbol mostExtremeSymbol=0;
      int sampleSize=int(effectiveSize[pos]+5/9.0);
      for(Symbol s=0 ; s<nAlpha ; ++s)
	{
	  int expected=int(background[s]*sampleSize+5/9.0);
	  int observed=(int)matrix[pos][s];
	  int diff=abs(expected-observed);
	  if(diff>mostExtremeDiff)
	    {
	      mostExtremeDiff=diff;
	      mostExtremeSymbol=s;
	    }
	}

      // Apply binomial distribution to get P-value for this count
      float p=background[mostExtremeSymbol];
      int expected=int(p*sampleSize+5/9.0);
      int observed=expected+mostExtremeDiff; // +/- are symmetric, so use +
      double P=
	BinomialDistribution::rightTailedPValue(observed,sampleSize,p);

      // If not significantly different from background, then the observed
      // frequencies are probably just a statistical fluctuation due to
      // small sample size, so use the background probabilities instead
      if(P>alpha)
	{
	  effectiveSize[pos]=0;
	  for(Symbol s=0 ; s<nAlpha ; ++s)
	    {
	      matrix[pos][s]=background[s]*sampleSize;
	      effectiveSize[pos]+=matrix[pos][s];
	    }
	}
      else 
	{
	  cout<<pos<<" : using OBSERVED frequencies, p="<<P<<endl;
	  ++numSignificantPositions;
	}
    }

  // Normalize counts into probabilities.  A position with no counts at all
  // (possible when every boost count is zero) would divide 0 by 0, so such
  // a position falls back to the background distribution instead of NaN.
  for(int pos=0 ; pos<len ; ++pos)
    {
      const bool hasCounts=effectiveSize[pos]>0;
      for(Symbol s=0 ; s<nAlpha ; ++s)
	{
	  if(hasCounts) matrix[pos][s]/=effectiveSize[pos];
	  else matrix[pos][s]=background[s];
	}
    }

  // Convert probabilities to log-probabilities
  convertToLogs();  
}