/**
 * Extracts everything this API knows how to read from the raw source of a page,
 * using the regular expressions stored under the "Regex/..." settings.
 *
 * @param parentPage Page forwarded unchanged to parseImage() for every image found.
 * @param source     Raw page source the regular expressions are run against.
 * @param first      Offset added to the zero-based index of each match before it is
 *                   handed to parseImage().
 * @param limit      Number of results per page; used to turn a "{pid}" offset into a
 *                   page number when forcedLimit() does not return a positive value.
 * @return A ParsedPage holding the tags, images, next/previous URLs, page count,
 *         image count and wiki text that could be extracted. Fields for which
 *         nothing was found are left as default-constructed by ParsedPage.
 */
ParsedPage HtmlApi::parsePage(Page *parentPage, const QString &source, int first, int limit) const
{
	ParsedPage ret;

	// Getting tags
	if (contains("Regex/Tags")) {
		const QList<Tag> tgs = Tag::FromRegexp(value("Regex/Tags"), source);
		if (!tgs.isEmpty()) {
			ret.tags = tgs;
		}
	}

	// Getting images
	QRegularExpression rxImages(value("Regex/Image"), QRegularExpression::DotMatchesEverythingOption);
	auto matches = rxImages.globalMatch(source);
	int id = 0;
	while (matches.hasNext()) {
		auto match = matches.next();
		QMap<QString, QString> d = multiMatchToMap(match, rxImages.namedCaptureGroups());

		// JSON elements: when a "json" group was captured, every top-level key of the
		// parsed object is merged into the map (overwriting same-named regex captures).
		const QString json = d.value("json");
		if (!json.isEmpty()) {
			const QVariant src = Json::parse(json);
			if (!src.isNull()) {
				const QMap<QString, QVariant> map = src.toMap();
				for (auto it = map.constBegin(); it != map.constEnd(); ++it) {
					d[it.key()] = it.value().toString();
				}
			}
		}

		QSharedPointer<Image> img = parseImage(parentPage, d, id + first);
		if (!img.isNull()) {
			ret.images.append(img);
		}

		// The index advances for every match, including those parseImage() rejected
		id++;
	}

	// Navigation
	if (contains("Regex/NextPage")) {
		QRegularExpression rxNextPage(value("Regex/NextPage"));
		auto match = rxNextPage.match(source);
		if (match.hasMatch()) {
			ret.urlNextPage = QUrl(match.captured(1));
		}
	}
	if (contains("Regex/PrevPage")) {
		QRegularExpression rxPrevPage(value("Regex/PrevPage"));
		auto match = rxPrevPage.match(source);
		if (match.hasMatch()) {
			ret.urlPrevPage = QUrl(match.captured(1));
		}
	}

	// Last page: a fixed "LastPage" setting takes precedence over the regex
	if (contains("LastPage")) {
		ret.pageCount = value("LastPage").toInt();
	} else if (contains("Regex/LastPage")) {
		QRegularExpression rxlast(value("Regex/LastPage"));
		auto match = rxlast.match(source);
		// Thousands separators are stripped before the conversion
		const int cnt = match.hasMatch() ? match.captured(1).remove(",").toInt() : 0;
		if (cnt > 0) {
			const bool usesPid = value("Urls/Tags").contains("{pid}")
				|| (contains("Urls/PagePart") && value("Urls/PagePart").contains("{pid}"));
			if (!usesPid) {
				ret.pageCount = cnt;
			} else {
				// With "{pid}" URLs the captured number is a result offset, not a page
				// number, so it is divided by the number of results per page.
				const int forced = forcedLimit();
				const int perPage = forced > 0 ? forced : limit;
				// Without a positive page size the offset cannot be converted (it would
				// be a division by zero), so the page count is left untouched.
				if (perPage > 0) {
					ret.pageCount = qFloor(static_cast<qreal>(cnt) / static_cast<qreal>(perPage)) + 1;
				}
			}
		}
	}

	// Count images
	if (contains("Regex/Count")) {
		QRegularExpression rxlast(value("Regex/Count"));
		auto match = rxlast.match(source);
		const int cnt = match.hasMatch() ? match.captured(1).remove(",").toInt() : 0;
		if (cnt > 0) {
			ret.imageCount = cnt;
		}
	}

	// Wiki
	if (contains("Regex/Wiki")) {
		QRegularExpression rxwiki(value("Regex/Wiki"), QRegularExpression::DotMatchesEverythingOption);
		auto match = rxwiki.match(source);
		if (match.hasMatch()) {
			QString wiki = match.captured(1);
			// Strip the wiki link prefix and the "Full entry" paragraph, then turn
			// <h6> headings into title spans
			wiki.remove("/wiki/show?title=");
			wiki.remove(QRegularExpression("<p><a href=\"([^\"]+)\">Full entry »</a></p>"));
			wiki.replace("<h6>", "<span class=\"title\">").replace("</h6>", "</span>");
			ret.wiki = wiki;
		}
	}

	return ret;
}
/**
 * Parses one HTML page and refreshes the parser state from it.
 *
 * A page containing more than one "</aside>" is treated as a front page: the
 * targets of its "View" links are collected into _urlList. Any other page is
 * treated as a thread: its image links are collected into _images and the first
 * subject span found becomes _threadTitle.
 *
 * @param html Source of the page to parse; also stored in _html.
 * @return The updated _statusCode (isFrontpage / hasImages / hasTitle flags).
 */
ParsingStatus ParserArchiveFoolzUs::parseHTML(QString html) {
    QRegExp imageRx("<div class=\"thread_image_box\"[^>]*>[^<]*<a href=\"([^\"]+)\"(?:[^<]+)(<[^>]*>)[^<]*</a>", Qt::CaseInsensitive, QRegExp::RegExp2);
    QRegExp threadLinkRx("<a href=\"([^\"]+)\"[^>]*>View</a>", Qt::CaseSensitive, QRegExp::RegExp2);
    QRegExp titleRx("<span class=\"subject\">([^<]+)</span>");

    // Reset everything a previous call may have left behind
    _html = html;
    _images.clear();
    _redirect.clear();
    _urlList.clear();
    _statusCode.hasErrors = false;
    _statusCode.hasImages = false;
    _statusCode.hasTitle = false;
    _statusCode.isFrontpage = html.count("</aside>") > 1;

    if (_statusCode.isFrontpage) {
        // Front page: gather the thread links. Every search (including the first)
        // starts one character past the previous position.
        for (int at = threadLinkRx.indexIn(html, 1); at != -1; at = threadLinkRx.indexIn(html, at + 1)) {
            QString threadUrl = threadLinkRx.cap(1);
            if (threadUrl.isEmpty()) {
                continue;
            }
            // Stored URLs never carry a trailing slash
            if (threadUrl.endsWith("/")) {
                threadUrl.chop(1);
            }
            _urlList.append(QUrl(threadUrl));
        }
        return _statusCode;
    }

    // Thread page: gather the images
    _IMAGE image;
    image.downloaded = false;
    image.requested = false;
    image.thumbURI = "";
    for (int at = imageRx.indexIn(html, 1); at != -1; at = imageRx.indexIn(html, at + 1)) {
        const QString imageUrl = imageRx.cap(1);
        // The file name is whatever follows the last '/' of the URL
        const int nameLength = imageUrl.length() - imageUrl.lastIndexOf("/") - 1;
        image.originalFilename = imageUrl.right(nameLength);
        image.largeURI = imageUrl;
        _images.append(image);
        _statusCode.hasImages = true;
    }

    // Thread page: the first non-empty subject is the thread title
    for (int at = titleRx.indexIn(html, 1); at != -1; at = titleRx.indexIn(html, at + 1)) {
        const QString subject = titleRx.cap(1);
        if (subject != "") {
            _threadTitle = subject;
            _statusCode.hasTitle = true;
            break;
        }
    }

    return _statusCode;
}