/* * Find the first */ bool getElement(const std::u32string& html, const std::u32string& tagStartWithAttrs, Position& pos) { size_t index; pos.start = -1; pos.len = -1; if(tagStartWithAttrs.length() == 0 || tagStartWithAttrs[0] != U'<') { fprintf(stderr, "invalid tagStartWithAttrs supplied: must start with <\n"); return false; } TagItems tagItems(tagStartWithAttrs); std::stack<int> stack; size_t tagEndLen = tagItems.tagEnd.length(); size_t tagStartLen = tagItems.tagStart.length(); size_t htmlLen = html.length(); // search for our tagStartWithAttrs if((index = html.find(tagStartWithAttrs)) != std::u32string::npos) { pos.start = index; index += tagStartWithAttrs.length(); while((index = html.find(U"<", index)) != std::u32string::npos) { // check if the tag is a start tag if(index + tagStartLen <= htmlLen) { if(memcmp(html.c_str() + index, tagItems.tagStart.c_str(), tagStartLen*sizeof(tagItems.tagStart[0])) == 0) { // an embedded start tag pushed to the stack stack.push(index); } } // check if the tag is an end tag if(index + tagEndLen <= htmlLen) { if(memcmp(html.c_str() + index, tagItems.tagEnd.c_str(), tagEndLen*sizeof(tagItems.tagEnd[0])) == 0) { if(stack.size()) { // close tag for the last embedded start tag stack.pop(); } else { // we have found the end tag to our initial tagStartWithAttrs pos.len = index - pos.start + tagEndLen; break; } } } ++index; } } return true; }
bool parseLink(const std::u32string& html, Link& link) { size_t index, tmp; bool ret = false; Position pos; static const std::u32string LINK_START = std::u32string(U"<a"); static const std::u32string LINK_END = std::u32string(U"</a>"); static const std::u32string LINK_HREF = std::u32string(U"href="); static const char32_t GT = U'>'; char32_t quoteChar; // find <a if((index = html.find(LINK_START)) != std::u32string::npos) { pos.start = index; index += LINK_START.length(); // find href= if((index = html.find(LINK_HREF, index)) != std::u32string::npos) { index += LINK_HREF.length(); // advance index by ine (quote char) quoteChar = html[index]; ++index; tmp = index; // find the end of the quotation if((index = html.find(quoteChar, index)) != std::u32string::npos) { link.url = html.substr(tmp, index - tmp); // find > if((index = html.find(GT, index)) != std::u32string::npos) { ++index; tmp = index; // find </a> if((index = html.find(LINK_END, index)) != std::u32string::npos) { pos.len = index + LINK_END.length() - pos.start; link.text = html.substr(tmp, index - tmp); link.pos = pos; ret = true; } } } } } return ret; }
/*
 * Derive the bare start tag and the matching end tag from a start-tag
 * prefix.
 *
 * tagStartWithAttrs - start-tag text beginning with '<', optionally followed
 *                     by attributes and/or the closing '>', e.g. U"<div",
 *                     U"<div class=\"x\"" or U"<div>"
 *
 * tagStart receives everything before the first whitespace character or
 * '>' (the whole argument if there is none), e.g. U"<div". tagEnd receives
 * tagStart with '/' inserted after its first character and '>' appended,
 * e.g. U"</div>".
 */
TagItems(const std::u32string& tagStartWithAttrs)
{
    // The tag name ends at the first whitespace or '>'. Splitting on ' ' only
    // would turn U"<div>" into the end tag U"</div>>" and would keep the
    // attributes in the name when they are separated by a tab or newline.
    const size_t nameEnd = tagStartWithAttrs.find_first_of(U" \t\r\n\f>");

    // substr clamps an npos length, so no delimiter means "take everything".
    tagStart = tagStartWithAttrs.substr(0, nameEnd);

    tagEnd = tagStart + U">";
    tagEnd.insert(1, U"/");
}
int findTags(const std::u32string& html, std::map<std::u32string, std::vector<Position> >& tags) { size_t index = 0; size_t htmlLen = html.length(); while((index = html.find(U"<", index)) != std::u32string::npos) { for(std::map<std::u32string, std::vector<Position> >::iterator itr = tags.begin(); itr != tags.end(); ++itr) { const std::u32string& str = itr->first; if(index + str.length() <= htmlLen) { if(memcmp(html.c_str() + index, str.c_str(), str.length()*sizeof(str[0])) == 0) { itr->second.push_back(Position(index, 0)); } } } ++index; } return 0; }
int wmain(int argc, wchar_t *argv[]) { try { if (argc < 2) { std::cout << "No url" << std::endl; return 1; } //Получение страницы посредством get-запроса const Web::Url l_url(Encoding::utf16to8(reinterpret_cast<char16_t *>(argv[1]))); const std::string l_content(get(l_url)); //Извлечение статьи std::string l_text; if (!recognize(l_content, l_text)) { std::cout << "Couldn't recognize" << std::endl; return 1; } //Формирование путей const std::u32string l_utf32path(Encoding::utf8to32(l_url.path())); std::string l_fixedPath; for (auto j(l_utf32path.begin()); j != l_utf32path.end();) { const auto i(*j == '/' ? l_fixedPath += '/', j + 1 : j); j = std::find(i, l_utf32path.end(), '/'); const size_t l_size = std::min(30, std::distance(i, j)); const std::u32string l_part(i, i + l_size); l_fixedPath += Encoding::utf32to8(l_part); } const std::u32string l_u32fn(Encoding::utf8to32(l_url.file())); const std::u32string l_base(l_u32fn.substr(0, l_u32fn.find('.')).substr(0, 30)); const std::string l_fixedName(l_base.empty() ? 
std::string("a.txt") : Encoding::utf32to8(l_base) + ".txt"); //http://boost.2283326.n4.nabble.com/boost-filesystem-path-as-utf-8-tp4320098p4322460.html boost::filesystem::detail::utf8_codecvt_facet l_utf8; const boost::filesystem::path l_dirPath(l_url.host() + l_fixedPath, l_utf8); const boost::filesystem::path l_filePath(l_url.host() + l_fixedPath + '/' + l_fixedName, l_utf8); //Создание каталогов и сохранения статьи в текстовый файл boost::filesystem::create_directories(l_dirPath); boost::filesystem::ofstream l_of(l_filePath, std::ios::binary | std::ios::out); if (!l_of.is_open()) { std::cout << "Output file open error" << std::endl; return -1; } l_of.write("\xef\xbb\xbf", 3); //BOM l_of.write(l_text.data(), l_text.size()); l_of.close(); if (argc > 2) { const boost::filesystem::path l_xfp(l_url.host() + l_fixedPath + "/1.html", l_utf8); boost::filesystem::ofstream l_xof(l_xfp, std::ios::binary | std::ios::out); if (l_xof.is_open()) { l_xof.write("\xef\xbb\xbf", 3); l_xof.write(l_content.data(), l_content.size()); l_xof.close(); } } return 0; } catch (const std::bad_alloc &) { ::printf("Mem alloc error"); } catch (const std::exception &a) { std::cout << "Error" << std::endl << a.what() << std::endl; } return -1; }