Exemplo n.º 1
0
/**
 * \brief Build the table of squash masks for the graph \p g.
 *
 * Every vertex that has a self-loop and is not an init state is considered as
 * a squasher. Its mask is seeded by buildSquashMask() and then narrowed
 * (ANDed) with the masks obtained for qualifying successors and predecessors.
 * A mask is only recorded when at least one of its bits ends up cleared. The
 * resulting table is handed to findDerivedSquashers() and
 * clearMutualSquashers() before it is returned.
 *
 * \param g    Graph to analyse. Vertex indices are asserted to be smaller
 *             than num_vertices(g).
 * \param som  When non-zero, region and SOM-distance information is computed
 *             and predecessors that may carry an earlier start of match are
 *             not used.
 * \return Map from squashing vertex to its mask. Bit k of a mask corresponds
 *         to the vertex whose index is k; the squasher's own bit is always
 *         set.
 */
map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
    // Width of every mask used below. For a triggered graph the tops have
    // already been assigned, so no extra bits are needed for them.
    const u32 num_states = num_vertices(g);

    // Post-dominator tree, consumed by the mask builders.
    PostDomTree pdom_tree;
    buildPDomTree(g, pdom_tree);

    // Lookup table from state index to vertex, and the set of init states.
    vector<NFAVertex> by_index(num_states, NFAGraph::null_vertex());
    NFAStateSet init_states(num_states);
    smgb_cache cache(g);

    // These two stay empty unless we are in SOM mode.
    unordered_map<NFAVertex, u32> region_map;
    vector<DepthMinMax> som_depths;
    if (som) {
        region_map = assignRegions(g);
        som_depths = getDistancesFromSOM(g);
    }

    for (auto u : vertices_range(g)) {
        const u32 idx = g[u].index;
        DEBUG_PRINTF("vertex %u/%u\n", idx, num_states);
        assert(idx < num_states);
        by_index[idx] = u;

        // Start vertices and vertices without in-edges count as init states.
        const bool is_init = is_any_start(u, g) || !in_degree(u, g);
        if (is_init) {
            init_states.set(idx);
        }
    }

    map<NFAVertex, NFAStateSet> squash;

    for (u32 i = 0; i < num_states; i++) {
        NFAVertex v = by_index[i];
        assert(v != NFAGraph::null_vertex());

        // A squasher has to be cyclic and must not be an init state.
        const bool cyclic = hasSelfLoop(v, g);
        if (!cyclic || init_states.test(i)) {
            continue;
        }

        DEBUG_PRINTF("state %u is cyclic\n", i);

        const CharReach &cr = g[v].char_reach;
        const auto &reports = g[v].reports;

        NFAStateSet mask(num_states);
        NFAStateSet succ(num_states);
        NFAStateSet pred(num_states);
        buildSquashMask(mask, g, v, cr, init_states, by_index, pdom_tree, som,
                        som_depths, region_map, cache);
        buildSucc(succ, g, v);
        buildPred(pred, g, v);

        // Pass 1: successors whose predecessor set equals ours contribute
        // their own mask (built with our reach).
        for (size_t s = succ.find_first(); s != succ.npos;
             s = succ.find_next(s)) {
            NFAVertex vs = by_index[s];
            NFAStateSet s_pred(num_states);
            buildPred(s_pred, g, vs);
            if (s_pred == pred) {
                DEBUG_PRINTF("adding the sm from %zu to %u's sm\n", s, i);
                NFAStateSet contrib(num_states);
                buildSquashMask(contrib, g, vs, cr, init_states, by_index,
                                pdom_tree, som, som_depths, region_map, cache);
                mask &= contrib;
            }
        }

        // Pass 2: predecessors that pass every filter below contribute their
        // mask and are squashed themselves.
        for (size_t p = pred.find_first(); p != pred.npos;
             p = pred.find_next(p)) {
            NFAVertex vp = by_index[p];

            // Filter: p's successors have to be a subset of v's successors.
            NFAStateSet p_succ(num_states);
            buildSucc(p_succ, g, vp);
            if ((p_succ & ~succ).any()) {
                continue;
            }

            if (som) {
                // Filter: p must not be able to hold an earlier start of
                // match than v, i.e. maxSomDist(p) <= minSomDist(v) is
                // required. (The original author left a TODO marker here.)
                const depth &max_som_dist_p = som_depths[g[vp].index].max;
                const depth &min_som_dist_v = som_depths[g[v].index].min;
                if (max_som_dist_p > min_som_dist_v ||
                    max_som_dist_p.is_infinite()) {
                    continue;
                }
            }

            // Filter: every special vertex adjacent to p has to be adjacent
            // to v as well.
            bool seen_special = false;
            bool specials_shared = true;
            for (auto w : adjacent_vertices_range(vp, g)) {
                if (!is_special(w, g)) {
                    continue;
                }
                if (!edge(v, w, g).second) {
                    specials_shared = false;
                    break;
                }
                seen_special = true;
            }
            if (!specials_shared) {
                continue;
            }

            // Filter: if p reaches a special vertex, its reports have to
            // equal ours. FIXME (from the original): should this be a subset
            // check?
            if (seen_special && g[vp].reports != reports) {
                continue;
            }

            // p is usable, provided its reach is covered by v's reach.
            const CharReach &cr_p = g[vp].char_reach;
            if ((cr_p & ~cr).none()) {
                NFAStateSet contrib(num_states);
                buildSquashMask(contrib, g, vp, cr, init_states, by_index,
                                pdom_tree, som, som_depths, region_map, cache);
                mask &= contrib;
                mask.reset(p);
            }
        }

        // A state never squashes itself.
        mask.set(i);

        // Record the mask only if it actually clears something.
        if ((~mask).any()) {
            DEBUG_PRINTF("%u squashes %zu other states\n", i, (~mask).count());
            squash.emplace(v, mask);
        }
    }

    findDerivedSquashers(g, by_index, pdom_tree, init_states, &squash, som,
                         som_depths, region_map, cache);

    clearMutualSquashers(g, by_index, squash);

    return squash;
}
Exemplo n.º 2
0
// Fills fstar and lambda for the n-gram ng_ taken at order `size`.
//
//   ng_     n-gram to evaluate; it is translated into this model's dictionary
//           before use.
//   size    order at which the n-gram is looked up. size <= 1 is handled as
//           the unigram case.
//   fstar   out: ((count - cv) - b) / (history count - cv) when the n-gram is
//           found, not pruned, and its count exceeds cv; 0 otherwise. b is
//           taken from beta[k][size] with k chosen by the remaining count
//           (1, 2, or 3 and more).
//   lambda  out: sum over k of beta[k][size] * (successor count of class k),
//           divided by (history count - cv), plus the corrections applied
//           below. Set to 1 when the history cannot be used and to 0 in the
//           unigram case.
//   cv      amount subtracted from the counts (ignored in the unigram case).
//           It is clipped to the n-gram's own count.
//
// Always returns 1. In the unigram case the process is terminated with
// exit(1) if the word cannot be found.
int mshiftbeta::discount(ngram ng_,int size,double& fstar,double& lambda, int cv)
{
  ngram ng(dict);
  ng.trans(ng_);

  // ---- unigram case, no cross-validation ---------------------------------
  if (size <= 1) {
    lambda=0.0;

    const int unigrtotfreq=(size<lmsize()?btotfreq():totfreq());

    if (!get(ng,size,size)) {
      cerr << "Missing probability for word: " << dict->decode(*ng.wordp(1)) << "\n";
      exit(1);
    }

    fstar=(double) mfreq(ng,size)/(double)unigrtotfreq;
    return 1;
  }

  // ---- higher orders -----------------------------------------------------
  ngram history=ng;

  // The history must exist, must have more mass than cv, and (from order 3
  // upwards) must exceed the pruning threshold. Singleton pruning is applied
  // to real counts only: no history pruning with corrected counts.
  const bool history_ok =
    ng.ckhisto(size) && get(history,size,size-1) && (history.freq > cv) &&
    ((size < 3) || ((history.freq-cv) > prunethresh ));

  if (!history_ok) {
    fstar=0;
    lambda=1;
    return 1;
  }

  // Successor statistics of the history, split into three classes; the third
  // class is whatever remains after the first two.
  int nsucc[3];
  nsucc[0]=succ1(history.link);
  nsucc[1]=succ2(history.link);
  nsucc[2]=history.succ-nsucc[0]-nsucc[1];

  // Is the n-gram itself present and not removed by singleton pruning?
  const bool observed =
    get(ng,size,size) &&
    (!prunesingletons() || mfreq(ng,size)>1 || size<3) &&
    (!prunetopsingletons() || mfreq(ng,size)>1 || size<maxlevel());

  if (observed) {
    ng.freq=mfreq(ng,size);
    // cv may not exceed the n-gram's own count.
    if (cv>ng.freq) cv=ng.freq;
  }

  // cv is final from here on.
  const double denom=(double)(history.freq-cv);

  const bool survives=observed && (ng.freq>cv);  // some count left after cv
  const bool emptied=observed && !survives;      // ng.freq==cv

  int slot=0; // successor class temporarily reduced when `emptied`
  if (survives) {
    const double b=(ng.freq-cv>=3?beta[2][size]:beta[ng.freq-cv-1][size]);
    fstar=((double)(ng.freq - cv) - b)/denom;
  } else {
    fstar=0.0;
    if (emptied) {
      // The whole count is removed by cv, so take the n-gram out of its
      // successor class while lambda is computed.
      // NOTE(review): a zero count would give slot == -1 here, exactly as in
      // the original indexing -- confirm mfreq() cannot return 0.
      slot=(ng.freq>=3)?2:static_cast<int>(ng.freq-1);
      nsucc[slot]--; //update successor stat
    }
  }

  lambda=(beta[0][size] * nsucc[0] + beta[1][size] * nsucc[1] + beta[2][size] * nsucc[2])
         /
         denom;

  // correction applied when singleton pruning is active at this order
  if ((size>=3 && prunesingletons()) ||
      (size==maxlevel() && prunetopsingletons()))
    lambda+=(double)(nsucc[0] * (1-beta[0][size])) / denom;

  if (emptied)
    nsucc[slot]++; //resume successor stat

  //cerr << "ngram :" << ng << "\n";

  // ---- OOV handling ------------------------------------------------------
  if (*ng.wordp(1)==dict->oovcode()) {
    // The predicted word is the OOV code: move its mass into lambda.
    lambda+=fstar;
    fstar=0.0;
    return 1;
  }

  // Otherwise look up the same history followed by the OOV code and, if that
  // n-gram is present and not pruned, add its discounted share to lambda.
  *ng.wordp(1)=dict->oovcode();
  if (get(ng,size,size)) {
    ng.freq=mfreq(ng,size);
    if ((!prunesingletons() || mfreq(ng,size)>1 || size<3) &&
        (!prunetopsingletons() || mfreq(ng,size)>1 || size<maxlevel())) {
      const double b=(ng.freq>=3?beta[2][size]:beta[ng.freq-1][size]);
      lambda+=(double)(ng.freq - b)/denom;
    }
  }

  return 1;
}