bool parseLink(const std::u32string& html, Link& link) { size_t index, tmp; bool ret = false; Position pos; static const std::u32string LINK_START = std::u32string(U"<a"); static const std::u32string LINK_END = std::u32string(U"</a>"); static const std::u32string LINK_HREF = std::u32string(U"href="); static const char32_t GT = U'>'; char32_t quoteChar; // find <a if((index = html.find(LINK_START)) != std::u32string::npos) { pos.start = index; index += LINK_START.length(); // find href= if((index = html.find(LINK_HREF, index)) != std::u32string::npos) { index += LINK_HREF.length(); // advance index by ine (quote char) quoteChar = html[index]; ++index; tmp = index; // find the end of the quotation if((index = html.find(quoteChar, index)) != std::u32string::npos) { link.url = html.substr(tmp, index - tmp); // find > if((index = html.find(GT, index)) != std::u32string::npos) { ++index; tmp = index; // find </a> if((index = html.find(LINK_END, index)) != std::u32string::npos) { pos.len = index + LINK_END.length() - pos.start; link.text = html.substr(tmp, index - tmp); link.pos = pos; ret = true; } } } } } return ret; }
/**
 * Derives the opening/closing tag pair from an opening tag that may carry
 * attributes.
 *
 * tagStart is everything before the first space (the whole argument when it
 * contains no space). tagEnd is tagStart with '>' appended and '/' inserted
 * after its first character, e.g. U"<div class=x" gives tagStart U"<div" and
 * tagEnd U"</div>".
 */
TagItems(const std::u32string& tagStartWithAttrs)
{
    // find() yields npos when there is no space; substr() treats a count of
    // npos as "up to the end", so that case selects the entire argument.
    const size_t nameLength = tagStartWithAttrs.find(' ');
    tagStart = tagStartWithAttrs.substr(0, nameLength);

    // Append '>' before inserting: position 1 must exist even for an empty tagStart.
    tagEnd = tagStart;
    tagEnd += U">";
    tagEnd.insert(1, U"/");
}
/**
 * Parses a grammar description and assembles a grammar object from it.
 *
 * `document` is parsed with the builtin `wirth` recognizer. For every match in
 * the first permutation of the root whose recognizer is `productionDfa`, a
 * behavior tree is built with process_production() and stored under the
 * production's name, which is the document text covered by the first
 * sub-match of that production. A later production with the same name
 * replaces the earlier one.
 *
 * @param nameOfMain       Forwarded unchanged to the grammar constructor.
 * @param document         Grammar description text.
 * @param associativities  Forwarded unchanged to the grammar constructor.
 * @param longestNames     Forwarded unchanged to the grammar constructor.
 * @return The grammar built from the collected behavior trees.
 * @throws std::logic_error when a production name is accepted by
 *         builtins::resolve_builtin().
 */
grammar load_grammar(std::string const & nameOfMain, std::u32string const & document, std::map<std::string, associativity> const & associativities, std::set<std::string> const & longestNames)
{
    (void)dont_care;

    parser wirthParser;
    abstract_syntax_graph graph = wirthParser.parse(builtins::wirth, document);

    // NOTE(review): the string returned by to_dot() is never read below;
    // presumably kept as a debugging aid -- confirm before removing the call.
    std::string dotDump = graph.to_dot();

    permutation const & rootPermutation = *graph.permutations[graph.root].begin();
    std::map<std::string, std::shared_ptr<details::behavior_node>> trees;

    for (match const & candidate : rootPermutation)
    {
        // Only production matches contribute a tree.
        if (&candidate.r != &productionDfa)
        {
            continue;
        }

        std::shared_ptr<details::behavior_node> tree = process_production(document, candidate, graph);

        // The production's name is the text covered by its first sub-match.
        match const & nameMatch = (*graph.permutations[candidate].begin())[0];
        std::string productionName = to_utf8(document.substr(nameMatch.document_position, nameMatch.consumed_character_count));

        recognizer const * unusedRecognizer;
        if (builtins::resolve_builtin(productionName, unusedRecognizer))
        {
            // name is reserved for a builtin
            const std::string message = productionName + " is a reserved name.";
            throw std::logic_error(message.c_str());
        }

        trees[productionName] = tree;
    }

    return grammar(nameOfMain, trees, associativities, longestNames);
}
int wmain(int argc, wchar_t *argv[]) { try { if (argc < 2) { std::cout << "No url" << std::endl; return 1; } //Получение страницы посредством get-запроса const Web::Url l_url(Encoding::utf16to8(reinterpret_cast<char16_t *>(argv[1]))); const std::string l_content(get(l_url)); //Извлечение статьи std::string l_text; if (!recognize(l_content, l_text)) { std::cout << "Couldn't recognize" << std::endl; return 1; } //Формирование путей const std::u32string l_utf32path(Encoding::utf8to32(l_url.path())); std::string l_fixedPath; for (auto j(l_utf32path.begin()); j != l_utf32path.end();) { const auto i(*j == '/' ? l_fixedPath += '/', j + 1 : j); j = std::find(i, l_utf32path.end(), '/'); const size_t l_size = std::min(30, std::distance(i, j)); const std::u32string l_part(i, i + l_size); l_fixedPath += Encoding::utf32to8(l_part); } const std::u32string l_u32fn(Encoding::utf8to32(l_url.file())); const std::u32string l_base(l_u32fn.substr(0, l_u32fn.find('.')).substr(0, 30)); const std::string l_fixedName(l_base.empty() ? 
std::string("a.txt") : Encoding::utf32to8(l_base) + ".txt"); //http://boost.2283326.n4.nabble.com/boost-filesystem-path-as-utf-8-tp4320098p4322460.html boost::filesystem::detail::utf8_codecvt_facet l_utf8; const boost::filesystem::path l_dirPath(l_url.host() + l_fixedPath, l_utf8); const boost::filesystem::path l_filePath(l_url.host() + l_fixedPath + '/' + l_fixedName, l_utf8); //Создание каталогов и сохранения статьи в текстовый файл boost::filesystem::create_directories(l_dirPath); boost::filesystem::ofstream l_of(l_filePath, std::ios::binary | std::ios::out); if (!l_of.is_open()) { std::cout << "Output file open error" << std::endl; return -1; } l_of.write("\xef\xbb\xbf", 3); //BOM l_of.write(l_text.data(), l_text.size()); l_of.close(); if (argc > 2) { const boost::filesystem::path l_xfp(l_url.host() + l_fixedPath + "/1.html", l_utf8); boost::filesystem::ofstream l_xof(l_xfp, std::ios::binary | std::ios::out); if (l_xof.is_open()) { l_xof.write("\xef\xbb\xbf", 3); l_xof.write(l_content.data(), l_content.size()); l_xof.close(); } } return 0; } catch (const std::bad_alloc &) { ::printf("Mem alloc error"); } catch (const std::exception &a) { std::cout << "Error" << std::endl << a.what() << std::endl; } return -1; }