/**
 * \brief Widen the reach of the vertex following a restricted UTF-8 lead
 * byte.
 *
 * \p v is expected to have exactly one predecessor; if it does not, the
 * function asserts (debug builds) and leaves the graph untouched.
 *
 * For each supported \p pred_char the reach of \p v must be a subset of the
 * restricted range listed below. Only when the reach is exactly that range is
 * it extended with the complementary bytes, so that it ends up covering the
 * whole of 0x80-0xbf:
 *
 *   - 0xe0: 0xa0-0xbf gains 0x80-0x9f
 *   - 0xf0: 0x90-0xbf gains 0x80-0x8f
 *   - 0xf4: 0x80-0x8f gains 0x90-0xbf
 *
 * Any other \p pred_char is treated as a programming error (assert).
 */
static void allowIllegal(NGHolder &g, NFAVertex v, u8 pred_char) {
    if (in_degree(v, g) != 1) {
        DEBUG_PRINTF("unexpected pred\n");
        assert(0); /* should be true due to the early stage of this analysis */
        return;
    }

    CharReach &cr = g[v].char_reach;

    /* Adds 'extra' to the reach, but only if the reach is exactly
     * 'restricted'; a narrower reach is left as it is. */
    const auto widen = [&cr](const CharReach &restricted,
                             const CharReach &extra) {
        assert(cr.isSubsetOf(restricted));
        if (cr == restricted) {
            cr |= extra;
        }
    };

    switch (pred_char) {
    case 0xe0:
        widen(CharReach(0xa0, 0xbf), CharReach(0x80, 0x9f));
        break;
    case 0xf0:
        widen(CharReach(0x90, 0xbf), CharReach(0x80, 0x8f));
        break;
    case 0xf4:
        widen(CharReach(0x80, 0x8f), CharReach(0x90, 0xbf));
        break;
    default:
        assert(0); /* unexpected pred */
        break;
    }
}
static bool can_die_early(const NGHolder &g, const vector<StateInfo> &info, const dynamic_bitset<> &s, map<dynamic_bitset<>, u32> &visited, u32 age_limit) { if (contains(visited, s) && visited[s] >= age_limit) { /* we have already (or are in the process) of visiting here with a * looser limit. */ return false; } visited[s] = age_limit; if (s.none()) { DEBUG_PRINTF("dead\n"); return true; } if (age_limit == 0) { return false; } dynamic_bitset<> all_succ(s.size()); step(g, info, s, &all_succ); all_succ.reset(NODE_START_DOTSTAR); for (u32 i = 0; i < N_CHARS; i++) { dynamic_bitset<> next = all_succ; filter_by_reach(info, &next, CharReach(i)); if (can_die_early(g, info, next, visited, age_limit - 1)) { return true; } } return false; }
static bool expandCyclic(NGHolder &h, NFAVertex v) { DEBUG_PRINTF("inspecting %zu\n", h[v].index); bool changes = false; auto v_preds = preds(v, h); auto v_succs = succs(v, h); set<NFAVertex> start_siblings; set<NFAVertex> end_siblings; CharReach &v_cr = h[v].char_reach; /* We need to find start vertices which have all of our preds. * As we have a self loop, it must be one of our succs. */ for (auto a : adjacent_vertices_range(v, h)) { auto a_preds = preds(a, h); if (a_preds == v_preds && isutf8start(h[a].char_reach)) { DEBUG_PRINTF("%zu is a start v\n", h[a].index); start_siblings.insert(a); } } /* We also need to find full cont vertices which have all our own succs; * As we have a self loop, it must be one of our preds. */ for (auto a : inv_adjacent_vertices_range(v, h)) { auto a_succs = succs(a, h); if (a_succs == v_succs && h[a].char_reach == UTF_CONT_CR) { DEBUG_PRINTF("%zu is a full tail cont\n", h[a].index); end_siblings.insert(a); } } for (auto s : start_siblings) { if (out_degree(s, h) != 1) { continue; } const CharReach &cr = h[s].char_reach; if (cr.isSubsetOf(UTF_TWO_START_CR)) { if (end_siblings.find(*adjacent_vertices(s, h).first) == end_siblings.end()) { DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else if (cr.isSubsetOf(UTF_THREE_START_CR)) { NFAVertex m = *adjacent_vertices(s, h).first; if (h[m].char_reach != UTF_CONT_CR || out_degree(m, h) != 1) { continue; } if (end_siblings.find(*adjacent_vertices(m, h).first) == end_siblings.end()) { DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else if (cr.isSubsetOf(UTF_FOUR_START_CR)) { NFAVertex m1 = *adjacent_vertices(s, h).first; if (h[m1].char_reach != UTF_CONT_CR || out_degree(m1, h) != 1) { continue; } NFAVertex m2 = *adjacent_vertices(m1, h).first; if (h[m2].char_reach != UTF_CONT_CR || out_degree(m2, h) != 1) { continue; } if (end_siblings.find(*adjacent_vertices(m2, h).first) == end_siblings.end()) { DEBUG_PRINTF("%zu is odd\n", h[s].index); continue; } } else { DEBUG_PRINTF("%zu 
is bad\n", h[s].index); continue; } v_cr |= cr; clear_vertex(s, h); changes = true; } if (changes) { v_cr |= UTF_CONT_CR; /* we need to add in cont reach */ v_cr.set(0xc0); /* we can also add in the forbidden bytes as we require * valid unicode data */ v_cr.set(0xc1); v_cr |= CharReach(0xf5, 0xff); } return changes; }
static void findForwardAccelScheme(const vector<hwlmLiteral> &lits, hwlm_group_t expected_groups, AccelAux *aux) { DEBUG_PRINTF("building accel expected=%016llx\n", expected_groups); u32 min_len = MAX_ACCEL_OFFSET; vector<const hwlmLiteral *> filtered_lits; filterLits(lits, expected_groups, &filtered_lits, &min_len); if (filtered_lits.empty()) { return; } if (findDVerm(filtered_lits, aux) || findSVerm(filtered_lits, aux)) { return; } vector<CharReach> reach(MAX_ACCEL_OFFSET, CharReach()); for (const auto &lit : lits) { if (!(lit.groups & expected_groups)) { continue; } for (u32 i = 0; i < MAX_ACCEL_OFFSET && i < lit.s.length(); i++) { unsigned char c = lit.s[i]; if (lit.nocase) { DEBUG_PRINTF("adding %02hhx to %u\n", mytoupper(c), i); DEBUG_PRINTF("adding %02hhx to %u\n", mytolower(c), i); reach[i].set(mytoupper(c)); reach[i].set(mytolower(c)); } else { DEBUG_PRINTF("adding %02hhx to %u\n", c, i); reach[i].set(c); } } } u32 min_count = ~0U; u32 min_offset = ~0U; for (u32 i = 0; i < min_len; i++) { size_t count = reach[i].count(); DEBUG_PRINTF("offset %u is %s (reach %zu)\n", i, describeClass(reach[i]).c_str(), count); if (count < min_count) { min_count = (u32)count; min_offset = i; } } assert(min_offset <= min_len); if (min_count > MAX_SHUFTI_WIDTH) { DEBUG_PRINTF("min shufti with %u chars is too wide\n", min_count); return; } const CharReach &cr = reach[min_offset]; if (shuftiBuildMasks(cr, &aux->shufti.lo, &aux->shufti.hi) != -1) { DEBUG_PRINTF("built shufti for %s (%zu chars, offset %u)\n", describeClass(cr).c_str(), cr.count(), min_offset); aux->shufti.accel_type = ACCEL_SHUFTI; aux->shufti.offset = verify_u8(min_offset); return; } DEBUG_PRINTF("fail\n"); }