Ejemplo n.º 1
0
/**
 * Extracts tags, images, navigation URLs, page/image counts and wiki text
 * from the HTML source of a page.
 *
 * Every section is driven by a pattern or value looked up with
 * contains()/value(). A section whose key is guarded by contains() is skipped
 * when the key is absent, leaving the corresponding field of the returned
 * ParsedPage as default-constructed.
 *
 * @param parentPage Page forwarded unchanged to parseImage() for each image match.
 * @param source     HTML source to scan.
 * @param first      Offset added to the zero-based match index given to parseImage().
 * @param limit      Page size used to turn a "{pid}" offset into a page count,
 *                   unless forcedLimit() returns a positive value.
 * @return The fields that could be extracted from the source.
 */
ParsedPage HtmlApi::parsePage(Page *parentPage, const QString &source, int first, int limit) const
{
	ParsedPage ret;

	// Tags: only overwrite the result when the pattern produced at least one tag
	if (contains("Regex/Tags"))
	{
		const QList<Tag> tgs = Tag::FromRegexp(value("Regex/Tags"), source);
		if (!tgs.isEmpty())
		{ ret.tags = tgs; }
	}

	// Images
	// NOTE(review): unlike the other sections, "Regex/Image" is read without a
	// contains() check -- confirm that it is always configured.
	const QRegularExpression rxImages(value("Regex/Image"), QRegularExpression::DotMatchesEverythingOption);
	const QStringList groupNames = rxImages.namedCaptureGroups();
	auto matches = rxImages.globalMatch(source);
	int id = 0;
	while (matches.hasNext())
	{
		const auto match = matches.next();
		QMap<QString, QString> d = multiMatchToMap(match, groupNames);

		// A non-empty "json" capture is parsed and its entries are merged into
		// the capture map, overriding captures that share the same key
		const QString json = d.value("json");
		if (!json.isEmpty())
		{
			const QVariant src = Json::parse(json);
			if (!src.isNull())
			{
				const QMap<QString, QVariant> map = src.toMap();
				for (auto it = map.constBegin(); it != map.constEnd(); ++it)
				{ d[it.key()] = it.value().toString(); }
			}
		}

		QSharedPointer<Image> img = parseImage(parentPage, d, id + first);
		if (!img.isNull())
		{ ret.images.append(img); }

		// The index advances even when parseImage() returned a null image
		id++;
	}

	// Navigation
	if (contains("Regex/NextPage"))
	{
		const QRegularExpression rxNextPage(value("Regex/NextPage"));
		const auto match = rxNextPage.match(source);
		if (match.hasMatch())
		{ ret.urlNextPage = QUrl(match.captured(1)); }
	}
	if (contains("Regex/PrevPage"))
	{
		const QRegularExpression rxPrevPage(value("Regex/PrevPage"));
		const auto match = rxPrevPage.match(source);
		if (match.hasMatch())
		{ ret.urlPrevPage = QUrl(match.captured(1)); }
	}

	// Last page: a fixed "LastPage" value takes precedence over the pattern
	if (contains("LastPage"))
	{ ret.pageCount = value("LastPage").toInt(); }
	else if (contains("Regex/LastPage"))
	{
		const QRegularExpression rxlast(value("Regex/LastPage"));
		const auto match = rxlast.match(source);
		// Thousands separators are stripped before the conversion
		const int cnt = match.hasMatch() ? match.captured(1).remove(",").toInt() : 0;
		if (cnt > 0)
		{
			int pagesCount = cnt;
			bool valid = true;
			if (value("Urls/Tags").contains("{pid}") || (contains("Urls/PagePart") && value("Urls/PagePart").contains("{pid}")))
			{
				// With "{pid}" URLs the captured number is divided by the page
				// size to obtain a page count
				const int forced = forcedLimit();
				const int ppid = forced > 0 ? forced : limit;
				if (ppid > 0)
				{
					// Both operands are positive, so integer division is already the floor
					pagesCount = cnt / ppid + 1;
				}
				else
				{
					// No usable page size: dividing would be undefined, so report nothing
					valid = false;
				}
			}
			if (valid)
			{ ret.pageCount = pagesCount; }
		}
	}

	// Count images
	if (contains("Regex/Count"))
	{
		const QRegularExpression rxcount(value("Regex/Count"));
		const auto match = rxcount.match(source);
		const int cnt = match.hasMatch() ? match.captured(1).remove(",").toInt() : 0;
		if (cnt > 0)
		{ ret.imageCount = cnt; }
	}

	// Wiki
	if (contains("Regex/Wiki"))
	{
		const QRegularExpression rxwiki(value("Regex/Wiki"), QRegularExpression::DotMatchesEverythingOption);
		const auto match = rxwiki.match(source);
		if (match.hasMatch())
		{
			// Strip the wiki link prefix and the "Full entry" paragraph, then
			// turn <h6> headings into title spans
			QString wiki = match.captured(1);
			wiki.remove("/wiki/show?title=");
			wiki.remove(QRegularExpression("<p><a href=\"([^\"]+)\">Full entry &raquo;</a></p>"));
			wiki.replace("<h6>", "<span class=\"title\">").replace("</h6>", "</span>");
			ret.wiki = wiki;
		}
	}

	return ret;
}
Ejemplo n.º 2
0
/**
 * Parses an archive page and stores what was found in the parser's members.
 *
 * The previous results (_images, _redirect, _urlList and the flags of
 * _statusCode) are reset first. A page containing more than one "</aside>"
 * is treated as a front page: the targets of its "View" links are collected
 * into _urlList. Any other page is scanned for image links (appended to
 * _images) and for the first subject span (stored in _threadTitle).
 *
 * @param html HTML source of the page; also stored in _html.
 * @return The updated _statusCode (isFrontpage, hasImages, hasTitle; hasErrors
 *         is always reset to false here).
 */
ParsingStatus ParserArchiveFoolzUs::parseHTML(QString html) {
    QRegExp rxImages("<div class=\"thread_image_box\"[^>]*>[^<]*<a href=\"([^\"]+)\"(?:[^<]+)(<[^>]*>)[^<]*</a>", Qt::CaseInsensitive, QRegExp::RegExp2);
    QRegExp rxThreads("<a href=\"([^\"]+)\"[^>]*>View</a>", Qt::CaseSensitive, QRegExp::RegExp2);
    QRegExp rxTitle("<span class=\"subject\">([^<]+)</span>");

    // Reset the state left over from a previous call
    _html = html;
    _images.clear();
    _redirect.clear();
    _urlList.clear();
    _statusCode.hasErrors = false;
    _statusCode.hasImages = false;
    _statusCode.hasTitle = false;
    _statusCode.isFrontpage = false;

    const bool pageIsFrontpage = html.count("</aside>") > 1;

    if (pageIsFrontpage) {
        _statusCode.isFrontpage = true;

        // Start at offset 0 so a link at the very beginning is not skipped;
        // afterwards resume one character past the previous match start
        for (int pos = rxThreads.indexIn(html, 0); pos != -1; pos = rxThreads.indexIn(html, pos + 1)) {
            QString sUrl = rxThreads.cap(1);
            if (sUrl.isEmpty()) {
                continue;
            }

            // Thread URLs are stored without a trailing slash
            if (sUrl.endsWith("/")) {
                sUrl.chop(1);
            }

            _urlList.append(QUrl(sUrl));
        }
    }
    else {
        // Checking for images
        _IMAGE i;
        i.downloaded = false;
        i.requested = false;

        for (int pos = rxImages.indexIn(html, 0); pos != -1; pos = rxImages.indexIn(html, pos + 1)) {
            const QString uri = rxImages.cap(1);

            // File name is everything after the last '/', or the whole URI if there is none
            i.originalFilename = uri.mid(uri.lastIndexOf("/") + 1);
            i.largeURI = uri;
            i.thumbURI = "";

            _images.append(i);
            _statusCode.hasImages = true;
        }

        // Only the first subject is used; the capture group requires at least
        // one character, so a successful match always yields a non-empty title
        if (rxTitle.indexIn(html, 0) != -1) {
            _threadTitle = rxTitle.cap(1);
            _statusCode.hasTitle = true;
        }
    }

    return _statusCode;
}