// Example no. 1
void fragFind(struct seqList *goodSeq, char *badName, int fragSize, int mismatchesAllowed, 
    boolean considerRc, double profile[16][4])
/* Do fast finding of patterns that are in FA file "goodName", but not in FA file
 * "badName."  BadName can be null.  Pass in the size of the pattern (fragSize) and
 * the number of mismatches to pattern you're willing to tolerate (mismatchesAllowed).
 * It returns the pattern in profile. */
{
int *goodTable, *badTable = NULL;
int goodCount, badCount = 0;
int goodIx;
DNA unpacked[17];   /* Big enough for the 10-base maximum fragment plus slack/terminator. */

/* The fuzzy-matching machinery only supports small patterns and few mismatches. */
if (mismatchesAllowed > 3)
    errAbort("Sorry, fragFind can only handle 0-3 mismatches.");
if (fragSize > 10)
    errAbort("Sorry, fragFind can only handle fragments up to 10 bases.");

/* Histogram all fragSize-mers in the good sequences. */
makeOligoHistogram(NULL, goodSeq, fragSize, &goodTable, &goodCount);
if (badName)
    {
    /* With a bad set: histogram it too, normalize both tables so they are
     * comparable, and score oligos by the good-minus-bad difference.  The
     * difference is left in badTable by diffTables, which is why fuzzVal
     * searches badTable here. */
    makeOligoHistogram(badName, NULL, fragSize, &badTable, &badCount);
    normalizeTable(goodTable, fragSize);
    normalizeTable(badTable, fragSize);
    diffTables(goodTable, badTable, fragSize);
    goodIx = fuzzVal(badTable, fragSize, mismatchesAllowed, considerRc);
    }
else
    {
    /* No bad set: pick the best fuzzy-matching oligo from the good table alone. */
    goodIx = fuzzVal(goodTable, fragSize, mismatchesAllowed, considerRc);
    }
freez(&goodTable);
freez(&badTable);
/* Turn the winning packed oligo index back into DNA bases and build the
 * weighted profile from its (fuzzy) occurrences in the good sequences. */
unpackVal(goodIx, fragSize, unpacked);
makeProfile(unpacked, fragSize, mismatchesAllowed, goodSeq, considerRc, profile);
}
// Example no. 2
void tmodel<COUNT, PROB>::normalizeTable(const vcbList&engl, const vcbList&french, int iter)
  // normalize conditional probability P(fj/ei):
  // i.e. make sure that Sum over all j of P(fj/e) = 1  
  // this method reads the counts portion of the table and normalize into
  // the probability portion. Then the counts are cleared (i.e. zeroed)
  // if the resulting probability of an entry is below a threshold, then 
  // remove it .
{
  // Pass 1: accumulate, per English word id e, the total count mass over all
  // of its French partners, plus fan-out statistics (nFrench/nEng).  total2
  // is an extra per-e accumulator kept only on the iter==2 pass.
  if( iter==2 )
    {
      total2.resize(engl.uniqTokens());for(unsigned int i=0;i<total2.size();i++)total2[i]=0.0;
    }
  nFrench.resize(engl.uniqTokens());for(unsigned int i=0;i<nFrench.size();i++)nFrench[i]=0;
  nEng.resize(french.uniqTokens());for(unsigned int i=0;i<nEng.size();i++)nEng[i]=0;
  Vector<double> total(engl.uniqTokens(),0.0);
  //Vector<int> nFrench(engl.uniqTokens(), 0);
  //Vector<int> nEng(french.uniqTokens(), 0);

  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::const_iterator i;
  for(i = ef.begin(); i != ef.end(); i++){ // for all possible source words e
    if( iter==2 )
      total2[((*i).first).first] += (*i).second.count;
    total[((*i).first).first] += (*i).second.count;
    nFrench[((*i).first).first]++;   // one table entry per distinct (e,f) pair
    nEng[((*i).first).second]++;
  }
  // Inflate each row total so that, after division, PROB_SMOOTH worth of mass
  // is implicitly reserved for every French word NOT paired with e in the
  // table.  A negative probMass would mean nFrench[k] exceeds the corpus
  // vocabulary, which is only reported, not repaired.
  for(unsigned int k=0;k<engl.uniqTokens();++k)
    if( nFrench[k] )
      {
	double probMass=(french.uniqTokensInCorpus()-nFrench[k])*PROB_SMOOTH;
	if( probMass<0.0 )
	  cout << k << " french.uniqTokensInCorpus(): " << french.uniqTokensInCorpus() << "  nFrench[k]:"<< nFrench[k] << '\n';
	total[k]+= total[k]*probMass/(1-probMass);
      }
  // Pass 2: walk every entry, divide its count by the row total, and either
  // store the result or prune the entry.  k always holds j's successor
  // before the body runs, so erase()-ing through j cannot invalidate the
  // iteration.
  typename hash_map<wordPairIds, CPPair, hashpair, equal_to<wordPairIds> >::iterator j, k;
  PROB p ;
  int nParams=0;
  int nor = 0;
  for(j = ef.begin(); j != ef.end(); ){
    k = j;
    k++ ;
    if( (total[((*j).first).first])>0.0 )
      p = ((((*j).second).count) /(total[((*j).first).first])) ;
    else
      p= 0.0;
    if (p > PROB_CUTOFF)
      {
	// On intermediate passes (iter>0) the normalized value is parked in
	// .count so the recursive call below can renormalize it; only the
	// final pass (iter==0) writes .prob and zeroes .count.
	if( iter>0 )
	  {
        // NOTE(review): presumably blends the count-based estimate with a
        // word2vec-derived translation probability, weighted by word2vec.L,
        // when Method==2 -- confirm semantics against the word2vec module.
        if(useWord2Vec && word2vec.Method == 2){
            p = word2vec.L * word2vec.getW2VProb(((*j).first).first, ((*j).first).second) + (1. - word2vec.L) * p;
            nor = 1;
        }
	    ((*j).second).prob = 0 ;
        ((*j).second).count = p ;
	  }
	else
	  {
	    ((*j).second).prob = p ;
	    ((*j).second).count = 0 ;
	  }
	nParams++;
      }
    else {
      // Below-threshold entries are dropped from the table entirely.
      erase(((*j).first).first, ((*j).first).second);
    }
    j = k ;
  }
  if(nor)
      cout << "probabilities Normalized in iteration = " << iter << endl;
  // Recurse until iter reaches 0; each pass re-reads the .count slots that
  // the previous pass filled in.  The empty else branch is intentional.
  if( iter>0 )
    return normalizeTable(engl, french, iter-1);
  else
    {
    }
}