コード例 #1
0
ファイル: htmlhound.cpp プロジェクト: andresmeidla/html-hound
/*
 * Find the first occurrence of tagStartWithAttrs in html and the end tag that
 * closes it, skipping over nested elements with the same tag name.
 *
 * pos.start and pos.len are reset to -1 on entry. When the opening tag is
 * found, pos.start receives its offset; when the matching end tag is also
 * found, pos.len receives the distance from pos.start to the end of that tag.
 *
 * Returns false only when tagStartWithAttrs is empty or does not begin with
 * '<' (a message is written to stderr). Returns true otherwise, including
 * when nothing was found -- callers have to inspect pos to tell.
 */
bool getElement(const std::u32string& html, const std::u32string& tagStartWithAttrs, Position& pos) {
  pos.start = -1;
  pos.len = -1;

  const bool beginsWithBracket = tagStartWithAttrs.length() != 0 && tagStartWithAttrs[0] == U'<';
  if(!beginsWithBracket) {
    fprintf(stderr, "invalid tagStartWithAttrs supplied: must start with <\n");
    return false;
  }

  TagItems tagItems(tagStartWithAttrs);
  const size_t openLen = tagItems.tagStart.length();
  const size_t closeLen = tagItems.tagEnd.length();
  const size_t total = html.length();

  // locate the opening tag exactly as supplied (attributes included)
  const size_t first = html.find(tagStartWithAttrs);
  if(first == std::u32string::npos) {
    return true;
  }
  pos.start = first;

  // number of nested start tags that are still waiting for their end tag
  size_t depth = 0;

  // visit every '<' after the opening tag
  for(size_t cursor = html.find(U'<', first + tagStartWithAttrs.length());
      cursor != std::u32string::npos;
      cursor = html.find(U'<', cursor + 1)) {
    // a nested start tag (compared without attributes)
    if(cursor + openLen <= total && html.compare(cursor, openLen, tagItems.tagStart) == 0) {
      ++depth;
    }
    // an end tag: either closes a nested element or is the one we are after
    if(cursor + closeLen <= total && html.compare(cursor, closeLen, tagItems.tagEnd) == 0) {
      if(depth == 0) {
        pos.len = cursor - pos.start + closeLen;
        break;
      }
      --depth;
    }
  }

  return true;
}
コード例 #2
0
ファイル: htmlhound.cpp プロジェクト: andresmeidla/html-hound
/*
 * Extract the first anchor from html into link.
 *
 * The search runs in order: "<a", then "href=", then the character right
 * after "href=" (taken as the quote character) and its next occurrence, then
 * '>', then "</a>". On success link.url holds the text between the quotes,
 * link.text the text between '>' and "</a>", and link.pos the span from "<a"
 * through the end of "</a>"; the function then returns true.
 *
 * Returns false as soon as any step fails. Note that link.url may already
 * have been overwritten in that case (when the closing quote was found but
 * '>' or "</a>" was not).
 */
bool parseLink(const std::u32string& html, Link& link) {
  static const std::u32string LINK_START = std::u32string(U"<a");
  static const std::u32string LINK_END = std::u32string(U"</a>");
  static const std::u32string LINK_HREF = std::u32string(U"href=");
  static const char32_t GT = U'>';
  Position pos;

  // find <a
  const size_t anchorAt = html.find(LINK_START);
  if(anchorAt == std::u32string::npos) {
    return false;
  }
  pos.start = anchorAt;

  // find href=
  const size_t hrefAt = html.find(LINK_HREF, anchorAt + LINK_START.length());
  if(hrefAt == std::u32string::npos) {
    return false;
  }

  // the character following href= is treated as the quote character
  const size_t quoteAt = hrefAt + LINK_HREF.length();
  const char32_t quoteChar = html[quoteAt];
  const size_t urlBegin = quoteAt + 1;

  // find the end of the quotation
  const size_t urlEnd = html.find(quoteChar, urlBegin);
  if(urlEnd == std::u32string::npos) {
    return false;
  }
  link.url = html.substr(urlBegin, urlEnd - urlBegin);

  // find >
  const size_t tagClose = html.find(GT, urlEnd);
  if(tagClose == std::u32string::npos) {
    return false;
  }
  const size_t textBegin = tagClose + 1;

  // find </a>
  const size_t textEnd = html.find(LINK_END, textBegin);
  if(textEnd == std::u32string::npos) {
    return false;
  }

  pos.len = textEnd + LINK_END.length() - pos.start;
  link.text = html.substr(textBegin, textEnd - textBegin);
  link.pos = pos;
  return true;
}
コード例 #3
0
ファイル: htmlhound.cpp プロジェクト: andresmeidla/html-hound
 /*
  * Split an opening tag (optionally with attributes, e.g. U"<div class=x")
  * into the bare start tag (U"<div") and the matching end tag (U"</div>").
  *
  * The tag name ends at the first whitespace character or at '>'. Cutting at
  * '>' as well as at a space keeps an input such as U"<div>" from producing
  * the unmatchable end tag U"</div>>".
  */
 TagItems(const std::u32string& tagStartWithAttrs) {
   // find_first_of returns npos when no delimiter is present, and
   // substr(0, npos) then keeps the whole string
   const size_t nameEnd = tagStartWithAttrs.find_first_of(U" \t\r\n\f>");
   tagStart = tagStartWithAttrs.substr(0, nameEnd);
   // build the end tag by appending '>' and inserting '/' after the first character
   tagEnd = tagStart + U">";
   tagEnd.insert(1, U"/");
 }
コード例 #4
0
ファイル: htmlhound.cpp プロジェクト: andresmeidla/html-hound
int findTags(const std::u32string& html, std::map<std::u32string, std::vector<Position> >& tags) {
  size_t index = 0;
  size_t htmlLen = html.length();
  while((index = html.find(U"<", index)) != std::u32string::npos) {
    for(std::map<std::u32string, std::vector<Position> >::iterator itr = tags.begin(); itr != tags.end(); ++itr) {
      const std::u32string& str = itr->first;
      if(index + str.length() <= htmlLen) {
        if(memcmp(html.c_str() + index, str.c_str(), str.length()*sizeof(str[0])) == 0) {
          itr->second.push_back(Position(index, 0));
        }
      }
    }
    ++index;
  }
  return 0;
}
/*
 * Entry point. Fetches the page at the URL given in argv[1], extracts the
 * article from it and writes the result as a UTF-8 text file (with BOM) under
 * <host><path>/<name>.txt. When a second argument is present, the fetched
 * page content is also written next to it as 1.html.
 *
 * Returns 0 on success, 1 when no URL was given or the page could not be
 * recognized, and -1 when the output file cannot be opened or an exception
 * is caught.
 */
int wmain(int argc, wchar_t *argv[])
{
	try
	{
		if (argc < 2)
		{
			std::cout << "No url" << std::endl;
			return 1;
		}

		// Fetch the page with a GET request
		const Web::Url l_url(Encoding::utf16to8(reinterpret_cast<char16_t *>(argv[1])));
		const std::string l_content(get(l_url));

		// Extract the article
		std::string l_text;
		if (!recognize(l_content, l_text))
		{
			std::cout << "Couldn't recognize" << std::endl;
			return 1;
		}

		// Build the paths: every component of the URL path is cut to at most
		// 30 code points
		const std::u32string l_utf32path(Encoding::utf8to32(l_url.path()));
		std::string l_fixedPath;
		for (auto j(l_utf32path.begin()); j != l_utf32path.end();)
		{
			// Copy a separator through unchanged and step past it
			auto i(j);
			if (*i == '/')
			{
				l_fixedPath += '/';
				++i;
			}
			j = std::find(i, l_utf32path.end(), '/');
			// Explicit template argument: the literal 30 is an int while
			// std::distance returns the iterator difference type, so plain
			// std::min(30, ...) fails to deduce wherever the two types differ
			const size_t l_size = std::min<std::u32string::difference_type>(30, std::distance(i, j));
			const std::u32string l_part(i, i + l_size);
			l_fixedPath += Encoding::utf32to8(l_part);
		}
		// File name: the part before the first '.', cut to 30 code points,
		// with "a.txt" as the fallback when nothing is left
		const std::u32string l_u32fn(Encoding::utf8to32(l_url.file()));
		const std::u32string l_base(l_u32fn.substr(0, l_u32fn.find('.')).substr(0, 30));
		const std::string l_fixedName(l_base.empty() ? std::string("a.txt") : Encoding::utf32to8(l_base) + ".txt");
		//http://boost.2283326.n4.nabble.com/boost-filesystem-path-as-utf-8-tp4320098p4322460.html
		boost::filesystem::detail::utf8_codecvt_facet l_utf8;
		const boost::filesystem::path l_dirPath(l_url.host() + l_fixedPath, l_utf8);
		const boost::filesystem::path l_filePath(l_url.host() + l_fixedPath + '/' + l_fixedName, l_utf8);

		// Create the directories and save the article to a text file
		boost::filesystem::create_directories(l_dirPath);
		boost::filesystem::ofstream l_of(l_filePath, std::ios::binary | std::ios::out);
		if (!l_of.is_open())
		{
			std::cout << "Output file open error" << std::endl;
			return -1;
		}
		l_of.write("\xef\xbb\xbf", 3);	//BOM
		l_of.write(l_text.data(), l_text.size());
		l_of.close();

		// With an extra argument, also save the fetched page content; failure
		// to open this file is ignored
		if (argc > 2)
		{
			const boost::filesystem::path l_xfp(l_url.host() + l_fixedPath + "/1.html", l_utf8);
			boost::filesystem::ofstream l_xof(l_xfp, std::ios::binary | std::ios::out);
			if (l_xof.is_open())
			{
				l_xof.write("\xef\xbb\xbf", 3);
				l_xof.write(l_content.data(), l_content.size());
				l_xof.close();
			}
		}

		return 0;
	}
	catch (const std::bad_alloc &)
	{
		::printf("Mem alloc error");
	}
	catch (const std::exception &a)
	{
		std::cout << "Error" << std::endl << a.what() << std::endl;
	}
	return -1;
}