예제 #1
0
/**
 * Extracts the first anchor element of the form `<a ... href="url">text</a>`
 * from `html`.
 *
 * The search is purely textual: it looks for "<a", then the next "href=",
 * then a quoted value, then the next '>' and finally the next "</a>".
 *
 * @param html  Text to scan.
 * @param link  Receives the result. On success `link.url` is the attribute
 *              value without its quotes, `link.text` is everything between
 *              the '>' found after the URL and "</a>", and `link.pos` covers
 *              the range from "<a" through the end of "</a>".
 *              `link` is left untouched when the function returns false.
 * @return true if a complete link was found, false otherwise.
 *
 * NOTE(review): "<a" also matches the start of longer tag names (e.g. "<abbr"),
 * and the "href=" search is not limited to the opening tag that was found.
 */
bool parseLink(const std::u32string& html, Link& link) {
  static const std::u32string LINK_START = std::u32string(U"<a");
  static const std::u32string LINK_END = std::u32string(U"</a>");
  static const std::u32string LINK_HREF = std::u32string(U"href=");
  static const char32_t GT = U'>';

  // find <a
  const size_t tagStart = html.find(LINK_START);
  if(tagStart == std::u32string::npos) {
    return false;
  }

  // find href=
  const size_t hrefStart = html.find(LINK_HREF, tagStart + LINK_START.length());
  if(hrefStart == std::u32string::npos) {
    return false;
  }

  // The attribute value must be quoted. Without this check an unquoted value
  // (href=foo) would treat its own first character as the delimiter and
  // produce a meaningless URL.
  const size_t quotePos = hrefStart + LINK_HREF.length();
  if(quotePos >= html.length()) {
    return false;
  }
  const char32_t quoteChar = html[quotePos];
  if(quoteChar != U'"' && quoteChar != U'\'') {
    return false;
  }

  // find the end of the quotation
  const size_t urlStart = quotePos + 1;
  const size_t urlEnd = html.find(quoteChar, urlStart);
  if(urlEnd == std::u32string::npos) {
    return false;
  }

  // find >
  const size_t tagClose = html.find(GT, urlEnd);
  if(tagClose == std::u32string::npos) {
    return false;
  }

  // find </a>
  const size_t textStart = tagClose + 1;
  const size_t textEnd = html.find(LINK_END, textStart);
  if(textEnd == std::u32string::npos) {
    return false;
  }

  // Everything was found: only now write to the output parameter so that a
  // failed parse never leaves `link` partially updated.
  Position pos;
  pos.start = tagStart;
  pos.len = textEnd + LINK_END.length() - tagStart;
  link.url = html.substr(urlStart, urlEnd - urlStart);
  link.text = html.substr(textStart, textEnd - textStart);
  link.pos = pos;
  return true;
}
예제 #2
0
 /**
  * Derives the tag-name prefix and the matching closing tag from an opening
  * tag that may carry attributes.
  *
  * `tagStart` becomes the input up to (not including) the first space, or the
  * whole input when it contains no space. `tagEnd` is `tagStart` followed by
  * '>' with a '/' inserted at index 1, e.g. "<a href=" gives "<a" and "</a>".
  */
 TagItems(const std::u32string& tagStartWithAttrs) {
   // find() yields npos when there is no space (this includes empty input),
   // and substr(0, npos) returns the whole string, so a single path handles
   // both the "with attributes" and the "bare tag" case.
   const size_t nameEnd = tagStartWithAttrs.find(' ');
   tagStart = tagStartWithAttrs.substr(0, nameEnd);

   // '>' is appended before the insert so that index 1 is a valid insert
   // position even when tagStart is empty.
   tagEnd = tagStart + U">";
   tagEnd.insert(1, U"/");
 }
예제 #3
0
파일: wirth.cpp 프로젝트: coder0xff/Plange
/**
 * Builds a grammar from a document that is parsed with `builtins::wirth`.
 *
 * The document is parsed into an abstract syntax graph; the first permutation
 * stored for the graph's root is then walked. Every match in it whose `r`
 * member is the `productionDfa` object is turned into a behavior tree via
 * `process_production` and stored under the production's name.
 *
 * @param nameOfMain      Forwarded unchanged to the `grammar` constructor.
 * @param document        Source text to parse.
 * @param associativities Forwarded unchanged to the `grammar` constructor.
 * @param longestNames    Forwarded unchanged to the `grammar` constructor.
 * @return grammar built from the collected name -> behavior tree map.
 * @throws std::logic_error if a production name is accepted by
 *         `builtins::resolve_builtin`, i.e. the name is reserved.
 */
grammar load_grammar(std::string const & nameOfMain, std::u32string const & document, std::map<std::string, associativity> const & associativities, std::set<std::string> const & longestNames) {
	(void)dont_care;
	parser wirthParser;
	abstract_syntax_graph graph = wirthParser.parse(builtins::wirth, document);
	// Result is not used below; presumably kept for inspection while debugging.
	std::string dotDump = graph.to_dot();
	permutation const & rootPermutation = *graph.permutations[graph.root].begin();
	std::vector<state_machine> machines;
	std::map<std::string, std::shared_ptr<details::behavior_node>> trees;
	for (match const & candidate : rootPermutation) {
		// Only matches whose `r` is the productionDfa object are processed.
		if (&candidate.r != &productionDfa) {
			continue;
		}
		std::shared_ptr<details::behavior_node> productionTree = process_production(document, candidate, graph);
		// The first match of the production's first permutation spans its name.
		match const & nameMatch = (*graph.permutations[candidate].begin())[0];
		std::string productionName = to_utf8(document.substr(nameMatch.document_position, nameMatch.consumed_character_count));
		// Only the boolean result matters; the resolved recognizer is discarded.
		recognizer const * unusedRecognizer;
		if (builtins::resolve_builtin(productionName, unusedRecognizer)) {
			std::string const message = productionName + " is a reserved name.";
			throw std::logic_error(message.c_str()); // name is reserved for a builtin
		}
		trees[productionName] = productionTree;
	}
	return grammar(nameOfMain, trees, associativities, longestNames);
}
/**
 * Entry point: downloads the page at the URL given as the first argument,
 * extracts the article text with `recognize`, and writes it as a UTF-8 text
 * file (with BOM) under `<host><path>/<name>.txt`.
 *
 * Every path segment and the file base name are truncated to at most 30 code
 * points. When more than one argument is given, the raw page content is also
 * written next to the article as `1.html`.
 *
 * @param argc Argument count; must be at least 2.
 * @param argv argv[1] is the URL. Any further argument enables the raw dump.
 * @return 0 on success, 1 when the URL is missing or the article could not be
 *         recognized, -1 when the output file cannot be opened or an
 *         exception is caught.
 */
int wmain(int argc, wchar_t *argv[])
{
	try
	{
		if (argc < 2)
		{
			std::cout << "No url" << std::endl;
			return 1;
		}

		// Fetch the page with a GET request
		const Web::Url l_url(Encoding::utf16to8(reinterpret_cast<char16_t *>(argv[1])));
		const std::string l_content(get(l_url));

		// Extract the article
		std::string l_text;
		if (!recognize(l_content, l_text))
		{
			std::cout << "Couldn't recognize" << std::endl;
			return 1;
		}

		// Build the output paths
		// NOTE(review): segments are copied as-is apart from truncation, so a
		// ".." segment in the URL path reaches the filesystem path unchanged —
		// confirm whether Web::Url normalizes the path before relying on this.
		const std::u32string l_utf32path(Encoding::utf8to32(l_url.path()));
		std::string l_fixedPath;
		for (auto j(l_utf32path.begin()); j != l_utf32path.end();)
		{
			// Copy a leading separator through and start the segment after it.
			auto i(j);
			if (*i == '/')
			{
				l_fixedPath += '/';
				++i;
			}
			j = std::find(i, l_utf32path.end(), '/');
			// Explicit template argument: the literal 30 is an int while
			// std::distance returns the iterator difference type, so plain
			// std::min(30, ...) fails to deduce a common type wherever those
			// two types differ.
			const std::u32string::difference_type l_size =
				std::min<std::u32string::difference_type>(30, std::distance(i, j));
			const std::u32string l_part(i, i + l_size);
			l_fixedPath += Encoding::utf32to8(l_part);
		}
		const std::u32string l_u32fn(Encoding::utf8to32(l_url.file()));
		// Base name: text before the first '.', limited to 30 code points.
		const std::u32string l_base(l_u32fn.substr(0, l_u32fn.find('.')).substr(0, 30));
		const std::string l_fixedName(l_base.empty() ? std::string("a.txt") : Encoding::utf32to8(l_base) + ".txt");
		//http://boost.2283326.n4.nabble.com/boost-filesystem-path-as-utf-8-tp4320098p4322460.html
		boost::filesystem::detail::utf8_codecvt_facet l_utf8;
		const boost::filesystem::path l_dirPath(l_url.host() + l_fixedPath, l_utf8);
		const boost::filesystem::path l_filePath(l_url.host() + l_fixedPath + '/' + l_fixedName, l_utf8);

		// Create the directories and save the article to a text file
		boost::filesystem::create_directories(l_dirPath);
		boost::filesystem::ofstream l_of(l_filePath, std::ios::binary | std::ios::out);
		if (!l_of.is_open())
		{
			std::cout << "Output file open error" << std::endl;
			return -1;
		}
		l_of.write("\xef\xbb\xbf", 3);	//BOM
		l_of.write(l_text.data(), l_text.size());
		l_of.close();

		// Optional: also dump the raw page content; failure to open is ignored.
		if (argc > 2)
		{
			const boost::filesystem::path l_xfp(l_url.host() + l_fixedPath + "/1.html", l_utf8);
			boost::filesystem::ofstream l_xof(l_xfp, std::ios::binary | std::ios::out);
			if (l_xof.is_open())
			{
				l_xof.write("\xef\xbb\xbf", 3);
				l_xof.write(l_content.data(), l_content.size());
				l_xof.close();
			}
		}

		return 0;
	}
	catch (const std::bad_alloc &)
	{
		// printf instead of iostreams here, presumably to avoid further
		// allocation while reporting an out-of-memory condition.
		::printf("Mem alloc error");
	}
	catch (const std::exception &a)
	{
		std::cout << "Error" << std::endl << a.what() << std::endl;
	}
	return -1;
}