static void getBackwardReach(const NGHolder &g, ReportID report, u32 lag, map<s32, CharReach> &look) { ue2::flat_set<NFAVertex> curr, next; for (auto v : inv_adjacent_vertices_range(g.accept, g)) { if (contains(g[v].reports, report)) { curr.insert(v); } } for (u32 i = lag + 1; i <= MAX_BACK_LEN; i++) { if (curr.empty() || contains(curr, g.start) || contains(curr, g.startDs)) { break; } next.clear(); CharReach cr; for (auto v : curr) { assert(!is_special(v, g)); cr |= g[v].char_reach; insert(&next, inv_adjacent_vertices(v, g)); } assert(cr.any()); look[0 - i] |= cr; curr.swap(next); } }
// Reverse truffle scan for the class {'a', 'B'}: with the end pointer pulled
// back by 0..15 bytes the hit must be the 'B' at offset 32; once that byte is
// overwritten with 'b', the hit must be the 'a' at offset 31.
TEST(ReverseTruffle, ExecMatch3) {
    m128 mask1, mask2;

    CharReach chars;
    chars.set('a');
    chars.set('B');
    truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2);

    char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaBbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
    u8 *const data = (u8 *)t1;
    const size_t len = strlen(t1);

    for (size_t trim = 0; trim < 16; trim++) {
        const u8 *hit = rtruffleExec(mask1, mask2, data, data + len - trim);
        ASSERT_NE((const u8 *)t1 - 1, hit); // not found
        EXPECT_EQ('B', (char)*hit);
        ASSERT_EQ((const u8 *)t1 + 32, hit);
    }

    // check that we match the 'a' bytes as well.
    ASSERT_EQ('B', t1[32]);
    t1[32] = 'b';

    for (size_t trim = 0; trim < 16; trim++) {
        const u8 *hit = rtruffleExec(mask1, mask2, data, data + len - trim);
        ASSERT_NE((const u8 *)t1 - 1, hit); // not found
        EXPECT_EQ('a', (char)*hit);
        ASSERT_EQ((const u8 *)t1 + 31, hit);
    }
}
void describeEdge(FILE *f, const u16 *t, u16 i) { for (u16 s = 0; s < N_CHARS; s++) { if (!t[s]) { continue; } u16 ss; for (ss = 0; ss < s; ss++) { if (t[s] == t[ss]) { break; } } if (ss != s) { continue; } CharReach reach; for (ss = s; ss < 256; ss++) { if (t[s] == t[ss]) { reach.set(ss); } } fprintf(f, "%u -> %u [ label = \"", i, t[s]); describeClass(f, reach, 5, CC_OUT_DOT); fprintf(f, "\" ];\n"); } }
static void getForwardReach(const raw_dfa &rdfa, map<s32, CharReach> &look) { if (rdfa.states.size() < 2) { return; } ue2::flat_set<dstate_id_t> curr, next; curr.insert(rdfa.start_anchored); for (u32 i = 0; i < MAX_FWD_LEN && !curr.empty(); i++) { next.clear(); CharReach cr; for (const auto state_id : curr) { const dstate &ds = rdfa.states[state_id]; if (!ds.reports.empty() || !ds.reports_eod.empty()) { return; } for (unsigned c = 0; c < N_CHARS; c++) { dstate_id_t succ = ds.next[rdfa.alpha_remap[c]]; if (succ != DEAD_STATE) { cr.set(c); next.insert(succ); } } } assert(cr.any()); look[i] |= cr; curr.swap(next); } }
static void getForwardReach(const NGHolder &g, u32 top, map<s32, CharReach> &look) { ue2::flat_set<NFAVertex> curr, next; // Consider only successors of start with the required top. for (const auto &e : out_edges_range(g.start, g)) { NFAVertex v = target(e, g); if (v == g.startDs) { continue; } if (g[e].top == top) { curr.insert(v); } } for (u32 i = 0; i < MAX_FWD_LEN; i++) { if (curr.empty() || contains(curr, g.accept) || contains(curr, g.acceptEod)) { break; } next.clear(); CharReach cr; for (auto v : curr) { assert(!is_special(v, g)); cr |= g[v].char_reach; insert(&next, adjacent_vertices(v, g)); } assert(cr.any()); look[i] |= cr; curr.swap(next); } }
/**
 * Compute the "stop alphabet" of graph \p g: the characters that do not occur
 * in the reach of any non-special vertex whose maximum depth is at least
 * MAX_STOP_DEPTH.
 *
 * With SOM_NONE the per-vertex reach is taken from reduced_cr(); for any other
 * \p som value the vertex's plain char_reach is used. The collected alphabet
 * is inverted before it is returned.
 */
CharReach findStopAlphabet(const NGHolder &g, som_type som) {
    const depth max_depth(MAX_STOP_DEPTH);
    const InitDepths depths(g);
    const map<NFAVertex, BoundedRepeatSummary> no_vertices;

    CharReach deep_alphabet;

    for (auto v : vertices_range(g)) {
        if (is_special(v, g)) {
            continue;
        }

        const bool deep_enough = depths.maxDist(g, v) >= max_depth;
        if (!deep_enough) {
            continue;
        }

        if (som == SOM_NONE) {
            deep_alphabet |= reduced_cr(v, g, no_vertices);
        } else {
            deep_alphabet |= g[v].char_reach;
        }
    }

    // Everything outside the deep alphabet is a stop character.
    deep_alphabet.flip();
    return deep_alphabet;
}
// Exercises operator<, |=, |, & and &= on small CharReach sets built from
// 'a' and 'z'.
TEST(ng_charreach, bitwise) {
    CharReach only_a;
    CharReach only_z;
    CharReach both;
    CharReach merged;

    only_a.set('a');
    only_z.set('z');
    both.set('a');
    both.set('z');

    ASSERT_TRUE(only_a < both);

    // OR-assigning the two singletons must rebuild the pair.
    merged |= only_a;
    merged |= only_z;
    ASSERT_TRUE(both == merged);

    ASSERT_TRUE(both == (only_a | only_z));
    ASSERT_TRUE(merged == (only_a | only_z));

    // Intersecting with the pair leaves each singleton unchanged.
    ASSERT_TRUE(only_a == (only_a & both));
    ASSERT_TRUE(only_z == (only_z & both));

    both &= only_a;
    ASSERT_FALSE(both.test('z'));
}
static void describeEdge(FILE *f, const mcsheng *m, const u16 *t, u16 i) { for (u16 s = 0; s < N_CHARS; s++) { if (!t[s]) { continue; } u16 ss; for (ss = 0; ss < s; ss++) { if (t[s] == t[ss]) { break; } } if (ss != s) { continue; } CharReach reach; for (ss = s; ss < 256; ss++) { if (t[s] == t[ss]) { reach.set(ss); } } fprintf(f, "%u -> %u [ ", i, t[s]); if (i < m->sheng_end && t[s] < m->sheng_end) { fprintf(f, "color = red, fontcolor = red "); } fprintf(f, "label = \""); describeClass(f, reach, 5, CC_OUT_DOT); fprintf(f, "\" ];\n"); } }
// A default-constructed CharReach has no members but still reports a size of
// 256.
TEST(ng_charreach, init) {
    CharReach fresh;

    ASSERT_EQ(0U, fresh.count());
    ASSERT_TRUE(fresh.none());
    ASSERT_FALSE(fresh.all());
    ASSERT_EQ(256U, fresh.size());
}
// CharReach::dot() must contain every one of the 256 positions.
TEST(ng_charreach, dot) {
    CharReach everything = CharReach::dot();

    ASSERT_EQ(256, everything.count());
    ASSERT_TRUE(everything.all());

    for (size_t c = 0; c < 256; c++) {
        ASSERT_TRUE(everything.test(c));
    }
}
// Copy construction must yield a set equal to its source.
TEST(ng_charreach, copy) {
    CharReach source;
    source.set('a');
    source.set('z');

    CharReach duplicate(source);

    ASSERT_EQ(source.count(), duplicate.count());
    ASSERT_TRUE(source == duplicate);
}
// Ugly but simple. string make_pattern() { std::ostringstream oss; oss << "^["; for (size_t i = reach.find_first(); i != CharReach::npos; i = reach.find_next(i)) { oss << "\\x" << std::hex << std::setw(2) << std::setfill('0') << (unsigned)(i & 0xff) << std::dec; } oss << "]{" << min << "," << max << "}"; return oss.str(); }
// set() adds a single member; setall() fills the whole set.
TEST(ng_charreach, set) {
    CharReach reach;

    // Starts out empty.
    ASSERT_EQ(0U, reach.count());
    ASSERT_TRUE(reach.none());
    ASSERT_FALSE(reach.all());

    reach.set('q');
    ASSERT_EQ(1U, reach.count());

    reach.setall();
    ASSERT_EQ(reach.size(), reach.count());
    ASSERT_TRUE(reach.all());
}
// Copy assignment must yield a set equal to its source.
TEST(ng_charreach, assignment) {
    CharReach source;
    source.set('f');
    source.set('l');
    source.set('y');

    CharReach target;
    target = source;

    ASSERT_EQ(source.count(), target.count());
    ASSERT_TRUE(source == target);
}
// Exhaustive check of setRange(): for every contiguous range [lo, hi] inside
// 0..255, the first member, last member and member count must match.
TEST(ng_charreach, setRange) {
    for (unsigned span = 0; span < 256; span++) {
        // 'span' is the distance between the two ends of the range.
        for (unsigned lo = 0; lo < 256 - span; lo++) {
            unsigned hi = lo + span;

            CharReach reach;
            reach.setRange(lo, hi);

            ASSERT_EQ(lo, reach.find_first());
            ASSERT_EQ(hi, reach.find_last());
            ASSERT_EQ(span + 1, reach.count());
        }
    }
}
// Reverse truffle scan over the one-byte buffer "a" must return the start of
// the buffer.
TEST(ReverseTruffle, ExecMiniMatch0) {
    m128 lo, hi;

    CharReach chars;
    chars.set('a');
    truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi);

    char t1[] = "a";
    u8 *const begin = (u8 *)t1;
    u8 *const end = (u8 *)t1 + strlen(t1);

    const u8 *hit = rtruffleExec(lo, hi, begin, end);

    ASSERT_EQ((size_t)t1, (size_t)hit);
}
// Forward truffle scan must find the single 'a' at offset 7 of an 11-byte
// buffer that is otherwise all NUL bytes.
TEST(Truffle, ExecMiniMatch3) {
    m128 lo, hi;

    CharReach chars;
    chars.set('a');
    truffleBuildMasks(chars, (u8 *)&lo, (u8 *)&hi);

    char t1[] = "\0\0\0\0\0\0\0a\0\0\0";
    u8 *const begin = (u8 *)t1;
    // The length is given explicitly: strlen() would stop at the first NUL.
    u8 *const end = (u8 *)t1 + 11;

    const u8 *hit = truffleExec(lo, hi, begin, end);

    ASSERT_EQ((size_t)t1 + 7, (size_t)hit);
}
// Checks CharReach::find_nth() on single-bit sets, the full set, two-bit sets
// and a hand-picked multi-character set.
TEST(ng_charreach, find_nth) {
    const size_t npos = CharReach::npos;

    // One bit cases.
    for (size_t i = 0; i < 256; i++) {
        CharReach cr((unsigned char)i);
        ASSERT_EQ(i, cr.find_nth(0));
        ASSERT_EQ(npos, cr.find_nth(1));
    }

    // All bits set.
    CharReach dot = CharReach::dot();
    for (size_t i = 0; i < 256; i++) {
        ASSERT_EQ(i, dot.find_nth(i));
    }

    // Trivial two bit cases: pair each low position i (0..127) with its
    // mirror 255 - i (255..128). The mirror must not be computed as 256 - i:
    // for i == 0 that is 256, which is not a valid position in a set whose
    // members are 0..255.
    for (size_t i = 0; i < 128; i++) {
        const size_t mirror = 255 - i;
        CharReach cr;
        cr.set(i);
        cr.set(mirror);
        ASSERT_EQ(i, cr.find_nth(0));
        ASSERT_EQ(mirror, cr.find_nth(1));
        // Only two bits are set, so any higher index must miss.
        ASSERT_EQ(npos, cr.find_nth(2));
        ASSERT_EQ(npos, cr.find_nth(3));
    }

    // More complex case: the string lists its characters in ascending order,
    // so the n-th set bit must be the n-th character.
    const std::string str("\x01\x02\x03\x05\x06\x20!#$%&./0123568:;ABCDEFMNOPUYZbcdefwxyz");
    CharReach cr(str);
    for (size_t i = 0; i < str.length(); i++) {
        ASSERT_EQ(str[i], cr.find_nth(i));
    }
    ASSERT_EQ(npos, cr.find_nth(str.length()));
}
// isCaselessChar() must be true only while the set is exactly {'a', 'A'}.
TEST(ng_charreach, caseless) {
    CharReach reach;

    // {'a'}
    reach.set('a');
    ASSERT_FALSE(reach.isCaselessChar());

    // {'a', 'A'}
    reach.set('A');
    ASSERT_TRUE(reach.isCaselessChar());

    // {'a', 'A', 'b'}
    reach.set('b');
    ASSERT_FALSE(reach.isCaselessChar());

    // {'a', 'A', 'b', 'B'}
    reach.set('B');
    ASSERT_FALSE(reach.isCaselessChar());
}
/**
 * Count the characters on which the anchored start state of \p raw has a
 * transition to something other than DEAD_STATE.
 *
 * Returns 0 for a DFA with fewer than two states.
 */
u32 mcclellanStartReachSize(const raw_dfa *raw) {
    if (raw->states.size() < 2) {
        return 0;
    }

    const dstate &start = raw->states[raw->start_anchored];

    CharReach live;
    for (unsigned c = 0; c < N_CHARS; c++) {
        // Characters are mapped to table columns through alpha_remap.
        const auto succ = start.next[raw->alpha_remap[c]];
        if (succ == DEAD_STATE) {
            continue;
        }
        live.set(c);
    }

    return live.count();
}
// Building truffle masks for the full character set and decoding them with
// truffle2cr() must give back the full set.
TEST(Truffle, CompileDot) {
    m128 mask1, mask2;
    memset(&mask1, 0, sizeof(mask1));
    memset(&mask2, 0, sizeof(mask2));

    CharReach every_char;
    every_char.setall();

    truffleBuildMasks(every_char, (u8 *)&mask1, (u8 *)&mask2);

    CharReach decoded = truffle2cr((u8 *)&mask1, (u8 *)&mask2);

    ASSERT_EQ(decoded, every_char);
}
void nfaExecLbrTruf_dump(const NFA *nfa, const string &base) { assert(nfa); assert(nfa->type == LBR_NFA_TRUF); StdioFile f(base + ".txt", "w"); const lbr_truf *lt = (const lbr_truf *)getImplNfa(nfa); lbrDumpCommon(<->common, f); CharReach cr = truffle2cr((const u8 *)<->mask1, (const u8 *)<->mask2); fprintf(f, "TRUFFLE model, scanning for: %s (%zu chars)\n", describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count()); fprintf(f, "\n"); dumpTextReverse(nfa, f); }
void nfaExecLbrShuf_dump(const NFA *nfa, const string &base) { assert(nfa); assert(nfa->type == LBR_NFA_SHUF); StdioFile f(base + ".txt", "w"); const lbr_shuf *ls = (const lbr_shuf *)getImplNfa(nfa); lbrDumpCommon(&ls->common, f); CharReach cr = shufti2cr((const u8 *)&ls->mask_lo, (const u8 *)&ls->mask_hi); fprintf(f, "SHUF model, scanning for: %s (%zu chars)\n", describeClass(cr, 20, CC_OUT_TEXT).c_str(), cr.count()); fprintf(f, "\n"); dumpTextReverse(nfa, f); }
// Forward truffle scan for 'V' over a buffer of only 'e' bytes must return the
// end pointer for each of 16 starting offsets.
TEST(Truffle, ExecNoMatch3) {
    m128 mask1, mask2;

    CharReach chars;
    chars.set('V'); /* V = 0x56, e = 0x65 */
    truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2);

    char t1[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee";
    const size_t len = strlen(t1);
    u8 *const end = (u8 *)t1 + len;

    for (size_t skip = 0; skip < 16; skip++) {
        const u8 *hit = truffleExec(mask1, mask2, (u8 *)t1 + skip, end);

        // No match is reported as the end of the buffer.
        ASSERT_EQ((size_t)t1 + len, (size_t)hit);
    }
}
// Reverse truffle scan for 'V' over a buffer of only 'e' bytes must return the
// byte just before the scanned region for each of 16 end positions.
TEST(ReverseTruffle, ExecNoMatch3) {
    m128 mask1, mask2;

    CharReach chars;
    chars.set('V'); /* V = 0x56, e = 0x65 */
    truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2);

    char t[] = "eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee";
    // Scan from the second byte so that "one before the start" is still
    // inside the array.
    char *t1 = t + 1;
    const size_t len = strlen(t1);
    u8 *const begin = (u8 *)t1;

    for (size_t trim = 0; trim < 16; trim++) {
        const u8 *hit = rtruffleExec(mask1, mask2, begin, begin + len - trim);
        ASSERT_EQ((const u8 *)t, hit);
    }
}
// Reverse truffle scan for 'a' over "babbbbbabbbb" must return the last 'a',
// at offset 7.
TEST(ReverseTruffle, ExecMiniMatch2) {
    m128 mask1, mask2;

    CharReach chars;
    chars.set('a');
    truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2);

    char t1[] = "babbbbbabbbb";
    u8 *const begin = (u8 *)t1;
    const size_t len = strlen(t1);

    const u8 *hit = rtruffleExec(mask1, mask2, begin, begin + len);

    ASSERT_NE((const u8 *)t1 - 1, hit); // not found
    EXPECT_EQ('a', (char)*hit);
    ASSERT_EQ((const u8 *)t1 + 7, hit);
}
// Forward truffle scan for 'a' must return offset 17 for each of 16 starting
// offsets.
TEST(Truffle, ExecMatch2) {
    m128 mask1, mask2;

    CharReach chars;
    chars.set('a');
    truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2);

    char t1[] = "bbbbbbbbbbbbbbbbbaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbabbbbbbbbbbbb";
    u8 *const end = (u8 *)t1 + strlen(t1);

    for (size_t skip = 0; skip < 16; skip++) {
        const u8 *hit = truffleExec(mask1, mask2, (u8 *)t1 + skip, end);

        ASSERT_EQ((size_t)t1 + 17, (size_t)hit);
    }
}
// Forward truffle scan for 'a': an 'a' is planted at offsets 48, 47, ... 18 in
// turn, and after each step the scan must return the newest (lowest) one.
TEST(Truffle, ExecMatch5) {
    m128 mask1, mask2;

    CharReach chars;
    chars.set('a');
    truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2);

    char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
    u8 *const begin = (u8 *)t1;
    // Writing 'a' never touches the terminator, so the length is stable.
    u8 *const end = (u8 *)t1 + strlen(t1);

    for (size_t step = 0; step < 31; step++) {
        const size_t pos = 48 - step;
        t1[pos] = 'a';

        const u8 *hit = truffleExec(mask1, mask2, begin, end);

        ASSERT_EQ((size_t)&t1[pos], (size_t)hit);
    }
}
// Reverse truffle scan for 'a': the buffer is turned into 'a' one byte at a
// time from the front, and after each step the scan must return the newest
// (highest) 'a'.
TEST(ReverseTruffle, ExecMatch5) {
    m128 mask1, mask2;

    CharReach chars;
    chars.set('a');
    truffleBuildMasks(chars, (u8 *)&mask1, (u8 *)&mask2);

    char t1[] = "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb";
    const size_t len = strlen(t1);
    u8 *const begin = (u8 *)t1;
    u8 *const end = (u8 *)t1 + len;

    for (size_t pos = 0; pos < len; pos++) {
        t1[pos] = 'a';

        const u8 *hit = rtruffleExec(mask1, mask2, begin, end);

        ASSERT_EQ((const u8 *)t1 + pos, hit);
    }
}
// Exhaustive check of isCaselessChar(): no single-character set qualifies, and
// a set built from (i, j) qualifies exactly when the two are the upper- and
// lower-case forms of one ASCII letter.
TEST(ng_charreach, caseless2) {
    for (size_t i = 0; i < 256; i++) {
        ASSERT_FALSE(CharReach((unsigned char)i).isCaselessChar());

        const bool i_is_upper = i >= 'A' && i <= 'Z';
        const bool i_is_lower = i >= 'a' && i <= 'z';

        for (size_t j = 0; j < 256; j++) {
            CharReach pair;
            pair.set(i);
            pair.set(j);

            // ASCII upper and lower case differ by 0x20.
            const bool upper_then_lower = i_is_upper && j == i + 0x20;
            const bool lower_then_upper = i_is_lower && i == j + 0x20;
            bool expected = upper_then_lower || lower_then_upper;

            ASSERT_EQ(expected, pair.isCaselessChar())
                << "Failed for i=" << i << ", j=" << j;
        }
    }
}