/**
 * Builds the squash map for graph \p g: each entry maps a squasher vertex to
 * its squash mask. A cleared bit in a mask denotes a state squashed by that
 * vertex; the vertex's own bit is always left set.
 *
 * Only vertices that have a self-loop and are not init states (start vertices
 * or vertices with no in-edges) are considered. A candidate's mask is
 * intersected with the masks of
 *   - every successor whose predecessor set equals the candidate's, and
 *   - every predecessor that passes the successor-subset, SOM-distance,
 *     special-edge/report and character-reach checks below.
 * The map is then post-processed by findDerivedSquashers() and
 * clearMutualSquashers().
 *
 * \param g   graph to analyse.
 * \param som SOM mode; when set, region and SOM-distance data are computed
 *            and used to restrict which predecessors may contribute.
 * \return map from squasher vertex to its squash mask.
 */
map<NFAVertex, NFAStateSet> findSquashers(const NGHolder &g, som_type som) {
    map<NFAVertex, NFAStateSet> result;

    // Width, in bits, of every state mask built here. If we're a triggered
    // graph, tops have already been assigned, so they need no extra bits.
    const u32 stateCount = num_vertices(g);

    // Post-dominator tree.
    PostDomTree pdom_tree;
    buildPDomTree(g, pdom_tree);

    // Vertex lookup by state ID, plus the set of init states.
    vector<NFAVertex> vertexOf(stateCount, NFAGraph::null_vertex());
    NFAStateSet initial(stateCount);

    smgb_cache cache(g);

    // Only populated in SOM mode; left empty otherwise.
    unordered_map<NFAVertex, u32> region_map;
    vector<DepthMinMax> som_depths;
    if (som) {
        region_map = assignRegions(g);
        som_depths = getDistancesFromSOM(g);
    }

    for (auto u : vertices_range(g)) {
        const u32 idx = g[u].index;
        DEBUG_PRINTF("vertex %u/%u\n", idx, stateCount);
        assert(idx < stateCount);
        vertexOf[idx] = u;

        if (is_any_start(u, g) || in_degree(u, g) == 0) {
            initial.set(idx);
        }
    }

    for (u32 i = 0; i < stateCount; i++) {
        NFAVertex v = vertexOf[i];
        assert(v != NFAGraph::null_vertex());

        /* only non-init cyclics can be squashers */
        const bool candidate = hasSelfLoop(v, g) && !initial.test(i);
        if (!candidate) {
            continue;
        }

        DEBUG_PRINTF("state %u is cyclic\n", i);

        const CharReach &cr = g[v].char_reach;

        NFAStateSet mask(stateCount);
        NFAStateSet succ(stateCount);
        NFAStateSet pred(stateCount);
        buildSquashMask(mask, g, v, cr, initial, vertexOf, pdom_tree, som,
                        som_depths, region_map, cache);
        buildSucc(succ, g, v);
        buildPred(pred, g, v);
        const auto &reports = g[v].reports;

        // Successors that share exactly our predecessor set narrow the mask.
        for (size_t s = succ.find_first(); s != succ.npos;
             s = succ.find_next(s)) {
            NFAVertex vs = vertexOf[s];
            NFAStateSet sPred(stateCount);
            buildPred(sPred, g, vs);
            if (!(sPred == pred)) {
                continue;
            }

            DEBUG_PRINTF("adding the sm from %zu to %u's sm\n", s, i);
            NFAStateSet sMask(stateCount);
            buildSquashMask(sMask, g, vs, cr, initial, vertexOf, pdom_tree,
                            som, som_depths, region_map, cache);
            mask &= sMask;
        }

        // Predecessors may also narrow the mask, subject to several checks.
        for (size_t p = pred.find_first(); p != pred.npos;
             p = pred.find_next(p)) {
            NFAVertex vp = vertexOf[p];
            NFAStateSet pSucc(stateCount);
            buildSucc(pSucc, g, vp);

            /* p is only usable as a basis for squashing if its succs are a
             * subset of ours */
            if ((pSucc & ~succ).any()) {
                continue;
            }

            if (som) {
                /* We cannot use p to add to the squash mask of v if it may
                 * have an earlier start of match offset, i.e. to use p as a
                 * basis for the squash mask of v we require:
                 *     maxSomDist(p) <= minSomDist(v)
                 */

                /* ** TODO ** */

                const depth &max_som_dist_p = som_depths[g[vp].index].max;
                const depth &min_som_dist_v = som_depths[g[v].index].min;
                if (max_som_dist_p > min_som_dist_v ||
                    max_som_dist_p.is_infinite()) {
                    /* p can't be used as it may be storing an earlier SOM */
                    continue;
                }
            }

            /* p's report information must also be a subset of ours: every
             * special vertex adjacent to p has to be reachable from v by an
             * edge as well */
            bool specials_covered = true;
            bool seen_special = false;
            for (auto w : adjacent_vertices_range(vp, g)) {
                if (!is_special(w, g)) {
                    continue;
                }
                if (!edge(v, w, g).second) {
                    specials_covered = false;
                    break;
                }
                seen_special = true;
            }
            if (!specials_covered) {
                continue;
            }

            // FIXME: should be subset check?
            if (seen_special && g[vp].reports != reports) {
                continue;
            }

            /* ok we can use p, provided its reach is contained in ours */
            const CharReach &crp = g[vp].char_reach;
            if ((crp & ~cr).none()) {
                NFAStateSet pMask(stateCount);
                buildSquashMask(pMask, g, vp, cr, initial, vertexOf,
                                pdom_tree, som, som_depths, region_map, cache);
                mask &= pMask;
                mask.reset(p);
            }
        }

        mask.set(i); /* never clear ourselves */

        // Only record v if at least one bit of its mask is unset.
        if ((~mask).any()) {
            DEBUG_PRINTF("%u squashes %zu other states\n", i, (~mask).count());
            result.emplace(v, mask);
        }
    }

    findDerivedSquashers(g, vertexOf, pdom_tree, initial, &result, som,
                         som_depths, region_map, cache);

    clearMutualSquashers(g, vertexOf, result);

    return result;
}
int mshiftbeta::discount(ngram ng_,int size,double& fstar,double& lambda, int cv) { ngram ng(dict); ng.trans(ng_); //cout << "size :" << size << " " << ng <<"\n"; if (size > 1) { ngram history=ng; //singleton pruning only on real counts!! if (ng.ckhisto(size) && get(history,size,size-1) && (history.freq > cv) && ((size < 3) || ((history.freq-cv) > prunethresh ))) { // no history pruning with corrected counts! int suc[3]; suc[0]=succ1(history.link); suc[1]=succ2(history.link); suc[2]=history.succ-suc[0]-suc[1]; if (get(ng,size,size) && (!prunesingletons() || mfreq(ng,size)>1 || size<3) && (!prunetopsingletons() || mfreq(ng,size)>1 || size<maxlevel())) { ng.freq=mfreq(ng,size); cv=(cv>ng.freq)?ng.freq:cv; if (ng.freq>cv) { double b=(ng.freq-cv>=3?beta[2][size]:beta[ng.freq-cv-1][size]); fstar=(double)((double)(ng.freq - cv) - b)/(double)(history.freq-cv); lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) //correction lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); } else { // ng.freq==cv ng.freq>=3?suc[2]--:suc[ng.freq-1]--; //update successor stat fstar=0.0; lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) //correction lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); ng.freq>=3?suc[2]++:suc[ng.freq-1]++; //resume successor stat } } else { fstar=0.0; lambda=(beta[0][size] * suc[0] + beta[1][size] * suc[1] + beta[2][size] * suc[2]) / (double)(history.freq-cv); if ((size>=3 && prunesingletons()) || (size==maxlevel() && prunetopsingletons())) //correction lambda+=(double)(suc[0] * (1-beta[0][size])) / (double)(history.freq-cv); } //cerr << "ngram :" << ng << "\n"; if (*ng.wordp(1)==dict->oovcode()) { lambda+=fstar; fstar=0.0; } else { 
*ng.wordp(1)=dict->oovcode(); if (get(ng,size,size)) { ng.freq=mfreq(ng,size); if ((!prunesingletons() || mfreq(ng,size)>1 || size<3) && (!prunetopsingletons() || mfreq(ng,size)>1 || size<maxlevel())) { double b=(ng.freq>=3?beta[2][size]:beta[ng.freq-1][size]); lambda+=(double)(ng.freq - b)/(double)(history.freq-cv); } } } } else { fstar=0; lambda=1; } } else { // unigram case, no cross-validation lambda=0.0; int unigrtotfreq=(size<lmsize()?btotfreq():totfreq()); if (get(ng,size,size)) fstar=(double) mfreq(ng,size)/(double)unigrtotfreq; else { cerr << "Missing probability for word: " << dict->decode(*ng.wordp(1)) << "\n"; exit(1); } } return 1; }