#include <algorithm>
#include <iostream>
// lexertl headers (paths assumed) needed for a standalone build
#include "lexertl/generator.hpp"
#include "lexertl/lookup.hpp"
#include "lexertl/memory_file.hpp"

// Scan main.cpp and report any for/while loop whose body is just a ';'
int main(int /*argc*/, char ** /*argv*/)
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;

    rules_.push_state("FW");
    rules_.push_state("SEMI");
    rules_.push_state("NESTED");

    // Skip comments and string literals in every state
    rules_.push("*", "[/][/].*|[/][*](.|\n)*?[*][/]|[\"](.|\\\")*[\"]",
        rules_.skip(), ".");
    // Seeing "for (...;...;" or "while (" starts the loop-header check
    rules_.push("INITIAL", "for\\s*\\([^;]*;[^;]*;|while\\s*\\(",
        rules_.skip(), "FW");
    rules_.push("FW", "\\)", rules_.skip(), "SEMI");
    // Track nested parentheses inside the loop header
    rules_.push("FW,NESTED", "\\(", ">NESTED");
    rules_.push("NESTED", "\\)", rules_.skip(), "<");
    // A ';' straight after the closing ')' is the suspect empty body
    rules_.push("SEMI", "\\s*;", 1, "INITIAL");
    rules_.push("SEMI", ".|\n", rules_.skip(), "INITIAL");
    lexertl::generator::build(rules_, sm_);

    lexertl::memory_file buff_("main.cpp");
    const char *start_ = buff_.data();
    const char *end_ = start_ + buff_.size();
    lexertl::crmatch results_(start_, end_);

    do
    {
        lexertl::lookup(sm_, results_);

        if (results_.id == 1)
        {
            std::cout << "found on line " <<
                std::count(start_, results_.second, '\n') + 1 << '\n';
        }
    } while (results_.id != sm_.eoi());

    return 0;
}
// Lex Unicode's PropList.txt: code point range, property name, then the
// short category name after the '#'
void lex_prop_list()
{
    lexertl::rules rules_;
    lexertl::state_machine state_machine_;
    std::ifstream if_("PropList.txt");
    lexertl::stream_shared_iterator iter_(if_);
    lexertl::stream_shared_iterator end_;
    lexertl::match_results<lexertl::stream_shared_iterator>
        results_(iter_, end_);
    enum {eRange = 1, eName, eShortName};

    rules_.push_state("RANGE");
    rules_.push_state("WS");
    rules_.push_state("NAME");
    rules_.push_state("SHORT_NAME");
    rules_.push_state("FINISH");

    // Skip comment lines and blank lines
    rules_.push("^#.*", rules_.skip());
    rules_.push("\n", rules_.skip());
    rules_.push("INITIAL", "^[0-9A-F]+(\\.\\.[0-9A-F]+)?", eRange, "RANGE");
    rules_.push("RANGE", " *; ", rules_.skip(), "NAME");
    rules_.push("NAME", "[A-Z][a-zA-Z_]+", eName, "WS");
    rules_.push("WS", " # ", rules_.skip(), "SHORT_NAME");
    rules_.push("SHORT_NAME", "[A-Z][a-z&]", eShortName, "FINISH");
    // Skip the rest of the line and start again
    rules_.push("FINISH", ".*\n", rules_.skip(), "INITIAL");
    lexertl::generator::build(rules_, state_machine_);

    do
    {
        lexertl::lookup(state_machine_, results_);
        std::cout << "Id: " << results_.id << ", Token: '" <<
            std::string(results_.start, results_.end) << "'\n";

        if (results_.id > eShortName)
        {
            // int i = 0;
        }
    } while (results_.id != 0);
}
// Time how long it takes to collect code points per general category
// from UnicodeData.txt
void lex_unicode_data()
{
    clock_t started_ = ::clock();
    lexertl::rules rules_;
    lexertl::state_machine state_machine_;
    lexertl::memory_file if_("UnicodeData.txt");
    const char *start_ = if_.data();
    const char *end_ = start_ + if_.size();
    lexertl::cmatch results_(start_, end_);
    enum {eNumber = 1, eName};
    std::size_t num_ = 0;
    std::map<std::string, lexertl::basic_string_token<std::size_t> > map_;

    rules_.push_state("LONG_NAME");
    rules_.push_state("SHORT_NAME");
    rules_.push_state("FINISH");

    rules_.push("INITIAL", "^[A-F0-9]+", eNumber, "LONG_NAME");
    rules_.push("LONG_NAME", ";[^;]+;", rules_.skip(), "SHORT_NAME");
    rules_.push("SHORT_NAME", "[A-Z][a-z]?", eName, "FINISH");
    rules_.push("FINISH", ".*\n", rules_.skip(), "INITIAL");
    lexertl::generator::build(rules_, state_machine_);

    do
    {
        lexertl::lookup(state_machine_, results_);

        if (results_.id == eNumber)
        {
            // Convert the hex code point by hand
            num_ = 0;

            for (;;)
            {
                if (*results_.start >= '0' && *results_.start <= '9')
                {
                    num_ <<= 4;
                    num_ |= *results_.start++ - '0';
                }
                else if (*results_.start >= 'A' && *results_.start <= 'F')
                {
                    num_ <<= 4;
                    num_ |= *results_.start++ - 'A' + 10;
                }
                else
                {
                    break;
                }
            }

            // ::sscanf(&*results_.start, "%x", &num_); // Too slow!
        }
        else if (results_.id == eName)
        {
            // Add the code point to the range set for this category name
            const std::string name_(results_.start, results_.end);

            map_[name_].insert(lexertl::basic_string_token<std::size_t>::range
                (num_, num_));
        }
    } while (results_.id != 0);

    clock_t finished_ = ::clock();
    double seconds_ = static_cast<double>(finished_ - started_) /
        CLOCKS_PER_SEC;

    std::cout << seconds_ << "\n";
}
void test_unicode()
{
    lexertl::cmatch r_;
    const char *i_ = "";

    r_.clear();
    r_.reset(i_, i_);

    const char utf8_[] = "\xf0\x90\x8d\x86\xe6\x97\xa5\xd1\x88\x7f";
    lexertl::basic_utf8_in_iterator<const char *, int>
        u8iter_(utf8_, utf8_ + sizeof(utf8_));
    int i = *u8iter_; // 0x10346

    i = *++u8iter_; // 0x65e5
    i = *u8iter_++; // 0x65e5
    i = *u8iter_; // 0x0448
    i = *++u8iter_; // 0x7f

    const wchar_t utf16_[] = L"\xdbff\xdfff\xd801\xdc01\xd800\xdc00\xd7ff";
    lexertl::basic_utf16_in_iterator<const wchar_t *, int>
        u16iter_(utf16_, utf16_ + sizeof(utf16_) / sizeof(wchar_t));

    i = *u16iter_; // 0x10ffff
    i = *++u16iter_; // 0x10401
    i = *u16iter_++; // 0x10401
    i = *u16iter_; // 0x10000
    i = *++u16iter_; // 0xd7ff

    // Not all compilers have char32_t, so use int for now
    lexertl::basic_rules<char, int> rules_(lexertl::icase);
    lexertl::basic_state_machine<int> sm_;
    const int in_[] = {0x393, ' ', 0x393, 0x398, ' ', 0x398, '1', ' ',
        'i', 'd', 0x41f, 0};
    std::basic_string<int> input_(in_);
    const int *iter_ = input_.c_str();
    const int *end_ = iter_ + input_.size();
    lexertl::match_results<const int *> results_(iter_, end_);

    rules_.push("\\p{LC}[\\p{LC}0-9]*", 1);
    lexertl::basic_generator<lexertl::basic_rules<char, int>,
        lexertl::basic_state_machine<int> >::build(rules_, sm_);

#ifdef WIN32
    HANDLE hStdOut = GetStdHandle(STD_OUTPUT_HANDLE);
    DWORD dwBytesWritten = 0;
#endif

    do
    {
#ifdef WIN32
        std::wstring str_;
#else
        std::string str_;
#endif

        lexertl::lookup(sm_, results_);
#ifdef WIN32
        str_.assign(lexertl::basic_utf16_out_iterator<const int *>
            (results_.start, results_.end),
            lexertl::basic_utf16_out_iterator<const int *>
            (results_.end, results_.end));
        std::wcout << L"Id: " << results_.id << L", Token: '";
        ::WriteConsoleW(hStdOut, str_.c_str(), str_.size(),
            &dwBytesWritten, 0);
        std::wcout << '\'' << std::endl;
#else
        str_.assign(lexertl::basic_utf8_out_iterator<const int *>
            (results_.start, results_.end),
            lexertl::basic_utf8_out_iterator<const int *>
            (results_.end, results_.end));
        std::cout << "Id: " << results_.id << ", Token: '" << str_ <<
            '\'' << std::endl;
#endif
    } while (results_.id != 0);
}
// Print (code point, case mapping) pairs for Ll and Lu entries in
// UnicodeData.txt
void case_mapping()
{
    lexertl::rules rules_;
    lexertl::state_machine sm_;
    std::ifstream if_("UnicodeData.txt");
    lexertl::stream_shared_iterator iter_(if_);
    lexertl::stream_shared_iterator end_;
    lexertl::match_results<lexertl::stream_shared_iterator>
        results_(iter_, end_);
    enum e_Token {eEOF, eCodeValue, eName, eLl, eLu, eNeither, eMapping,
        eEmpty};
    e_Token eToken = eEOF;
    std::string code_;
    std::string mapping_;
    int count_ = 0;

    rules_.push_state("NAME");
    rules_.push_state("TYPE");
    rules_.push_state("Ll");
    rules_.push_state("Lu");
    rules_.push_state("MAPPING");
    rules_.push_state("END");

    rules_.push("INITIAL", "^[0-9A-F]{4,6};", eCodeValue, "NAME");
    rules_.push("NAME", "[^;]*;", sm_.skip(), "TYPE");
    rules_.push("TYPE", "Ll;", eLl, "Ll");
    // Skip ahead to the relevant case mapping field for each category
    rules_.push("Ll", "([^;]*;){9}", sm_.skip(), "MAPPING");
    rules_.push("TYPE", "Lu;", eLu, "Lu");
    rules_.push("Lu", "([^;]*;){10}", sm_.skip(), "MAPPING");
    rules_.push("TYPE", "[^;]*;", eNeither, "END");
    rules_.push("MAPPING", ";", eEmpty, "END");
    rules_.push("MAPPING", "[0-9A-F]{4,6};", eMapping, "END");
    rules_.push("END", "[^\n]*\n", sm_.skip(), "INITIAL");
    lexertl::generator::build(rules_, sm_);

    do
    {
        lexertl::lookup(sm_, results_);
        eToken = static_cast<e_Token>(results_.id);

        if (eToken == eEOF)
        {
            break;
        }
        else if (eToken != eCodeValue)
        {
            throw std::runtime_error("Syntax error");
        }

        code_.assign(results_.start, results_.end);
        lexertl::lookup(sm_, results_);
        eToken = static_cast<e_Token>(results_.id);

        if (eToken != eLl && eToken != eLu && eToken != eNeither)
        {
            throw std::runtime_error("Syntax error");
        }

        if (eToken != eNeither)
        {
            lexertl::lookup(sm_, results_);
            eToken = static_cast<e_Token>(results_.id);

            if (eToken == eMapping)
            {
                mapping_.assign(results_.start, results_.end);
                // Both strings still end in ';', so trim the last character
                std::cout << "(0x" << code_.substr(0, code_.size() - 1) <<
                    ", " << "0x" <<
                    mapping_.substr(0, mapping_.size() - 1) << "), ";
                code_.clear();
                mapping_.clear();
                ++count_;

                // Three pairs per output line
                if (count_ > 2)
                {
                    count_ = 0;
                    std::cout << '\n';
                }
            }
        }
    } while (results_.id != 0);
}
// Assumes the enclosing example's template parameters/typedefs for
// iterator, captures, lsm and sm_type (a parsertl grammar state machine)
bool search(iterator first_, iterator second_, captures &captures_,
    lsm &lsm_, const sm_type &gsm_)
{
    typedef lexertl::iterator<iterator, lsm,
        lexertl::match_results<iterator> > lex_iterator;
    lex_iterator iter_(first_, second_, lsm_);
    lex_iterator end_;
    basic_match_results<sm_type> results_(iter_->id, gsm_);
    typedef parsertl::token<lex_iterator> token;
    typedef typename token::token_vector token_vector;
    typedef std::multimap<typename sm_type::id_type, token_vector> prod_map;
    prod_map prod_map_;
    // Run the grammar-based search over the token stream, recording the
    // matched productions and their tokens
    bool success_ = search(gsm_, iter_, end_, &prod_map_);

    captures_.clear();

    if (success_)
    {
        iterator last_ = iter_->first;
        typename prod_map::const_iterator pi_ = prod_map_.begin();
        typename prod_map::const_iterator pe_ = prod_map_.end();

        // One slot per capture plus slot 0 for the overall match
        captures_.resize((gsm_._captures.empty() ? 0 :
            gsm_._captures.back().first +
            gsm_._captures.back().second.size()) + 1);
        captures_[0].push_back(std::make_pair(iter_->first, iter_->first));

        // Convert each production's capture indexes into iterator ranges
        for (; pi_ != pe_; ++pi_)
        {
            if (gsm_._captures.size() > pi_->first)
            {
                const typename sm_type::capture_vec_pair &row_ =
                    gsm_._captures[pi_->first];

                if (!row_.second.empty())
                {
                    typedef typename sm_type::capture_vector capture_vector;
                    typename capture_vector::const_iterator ti_ =
                        row_.second.begin();
                    typename capture_vector::const_iterator te_ =
                        row_.second.end();
                    std::size_t index_ = 0;

                    for (; ti_ != te_; ++ti_)
                    {
                        const token &token1_ = pi_->second[ti_->first];
                        const token &token2_ = pi_->second[ti_->second];

                        captures_[row_.first + index_ + 1].
                            push_back(std::make_pair(token1_.first,
                                token2_.second));
                        ++index_;
                    }
                }
            }
        }

        // Extend the overall match (slot 0) to the end of the last token
        pi_ = prod_map_.begin();
        pe_ = prod_map_.end();

        for (; pi_ != pe_; ++pi_)
        {
            // Note: shadows the second_ parameter
            typename token::iter_type second_ = pi_->second.back().second;

            if (second_ > last_)
            {
                last_ = second_;
            }
        }

        captures_.front().back().second = last_;
    }

    return success_;
}