bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext) { CURL url(scrURL.m_url); http.SetReferer(scrURL.m_spoof); CStdString strCachePath; if (scrURL.m_isgz) http.SetContentEncoding("gzip"); if (!scrURL.m_cache.IsEmpty()) { URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers/"+cacheContext+"/"+scrURL.m_cache, strCachePath); if (XFILE::CFile::Exists(strCachePath)) { XFILE::CFile file; if (file.Open(strCachePath)) { char* temp = new char[(int)file.GetLength()]; file.Read(temp,file.GetLength()); strHTML.clear(); strHTML.append(temp,temp+file.GetLength()); file.Close(); delete[] temp; return true; } } } CStdString strHTML1(strHTML); if (scrURL.m_post) { CStdString strOptions = url.GetOptions(); strOptions = strOptions.substr(1); url.SetOptions(""); if (!http.Post(url.Get(), strOptions, strHTML1)) return false; } else if (!http.Get(url.Get(), strHTML1)) return false; strHTML = strHTML1; if (scrURL.m_url.Find(".zip") > -1 ) { XFILE::CZipFile file; CStdString strBuffer; int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); if (iSize) { strHTML.clear(); strHTML.append(strBuffer.c_str(),strBuffer.data()+iSize); } } if (!scrURL.m_cache.IsEmpty()) { CStdString strCachePath; URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers/"+cacheContext+"/"+scrURL.m_cache, strCachePath); XFILE::CFile file; if (file.OpenForWrite(strCachePath,true)) file.Write(strHTML.data(),strHTML.size()); file.Close(); } return true; }
bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext) { CURL url(scrURL.m_url); http.SetReferer(scrURL.m_spoof); CStdString strCachePath; if (scrURL.m_isgz) http.SetContentEncoding("gzip"); if (!scrURL.m_cache.empty()) { strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers/" + cacheContext + "/" + scrURL.m_cache); if (XFILE::CFile::Exists(strCachePath)) { XFILE::CFile file; XFILE::auto_buffer buffer; if (file.LoadFile(strCachePath, buffer)) { strHTML.assign(buffer.get(), buffer.length()); return true; } } } CStdString strHTML1(strHTML); if (scrURL.m_post) { CStdString strOptions = url.GetOptions(); strOptions = strOptions.substr(1); url.SetOptions(""); if (!http.Post(url.Get(), strOptions, strHTML1)) return false; } else if (!http.Get(url.Get(), strHTML1)) return false; strHTML = strHTML1; std::string mimeType(http.GetMimeType()); CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType); if (ftype == CMime::FileTypeUnknown) ftype = CMime::GetFileTypeFromContent(strHTML); if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip) { XFILE::CZipFile file; std::string strBuffer; int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz? if (iSize > 0) { strHTML = strBuffer; CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str()); } else CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str()); } std::string reportedCharset(http.GetServerReportedCharset()); if (ftype == CMime::FileTypeHtml) { std::string realHtmlCharset, converted; if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset)) CLog::Log(LOGWARNING, "%s: Can't find precise charset for HTML \"%s\", using \"%s\" as fallback", __FUNCTION__, scrURL.m_url.c_str(), realHtmlCharset.c_str()); else CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for HTML \"%s\"", __FUNCTION__, realHtmlCharset.c_str(), scrURL.m_url.c_str()); strHTML = converted; } else if (ftype == CMime::FileTypeXml) { CXBMCTinyXML xmlDoc; xmlDoc.Parse(strHTML, reportedCharset); std::string realXmlCharset(xmlDoc.GetUsedCharset()); if (!realXmlCharset.empty()) { CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for XML \"%s\"", __FUNCTION__, realXmlCharset.c_str(), scrURL.m_url.c_str()); std::string converted; g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted); strHTML = converted; } } else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0) { std::string realTextCharset, converted; CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset); strHTML = converted; if (reportedCharset != realTextCharset) CLog::Log(LOGWARNING, "%s: Using \"%s\" charset for plain text \"%s\" instead of server reported \"%s\" charset", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str(), reportedCharset.c_str()); else CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for plain text \"%s\"", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str()); } else if (!reportedCharset.empty()) { CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, reportedCharset.c_str(), scrURL.m_url.c_str()); if (reportedCharset != "UTF-8") { std::string converted; g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted); strHTML = converted; } } else CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str()); if (!scrURL.m_cache.empty()) { CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers/" + cacheContext + "/" + scrURL.m_cache); XFILE::CFile file; if (file.OpenForWrite(strCachePath,true)) file.Write(strHTML.data(),strHTML.size()); file.Close(); } return true; }
bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext) { CURL url(scrURL.m_url); http.SetReferer(scrURL.m_spoof); CStdString strCachePath; if (scrURL.m_isgz) http.SetContentEncoding("gzip"); if (!scrURL.m_cache.IsEmpty()) { strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers/" + cacheContext + "/" + scrURL.m_cache); if (XFILE::CFile::Exists(strCachePath)) { XFILE::CFile file; XFILE::auto_buffer buffer; if (file.LoadFile(strCachePath, buffer)) { strHTML.assign(buffer.get(), buffer.length()); return true; } } } CStdString strHTML1(strHTML); if (scrURL.m_post) { CStdString strOptions = url.GetOptions(); strOptions = strOptions.substr(1); url.SetOptions(""); if (!http.Post(url.Get(), strOptions, strHTML1)) return false; } else if (!http.Get(url.Get(), strHTML1)) return false; strHTML = strHTML1; std::string fileCharset(http.GetServerReportedCharset()); if (scrURL.m_url.Find(".zip") > -1 ) { XFILE::CZipFile file; CStdString strBuffer; int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); if (iSize) { fileCharset.clear(); strHTML.clear(); strHTML.append(strBuffer.c_str(),strBuffer.data()+iSize); } } if (!fileCharset.empty() && fileCharset != "UTF-8") { std::string converted; if (g_charsetConverter.ToUtf8(fileCharset, strHTML, converted) && !converted.empty()) strHTML = converted; } if (!scrURL.m_cache.IsEmpty()) { CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath, "scrapers/" + cacheContext + "/" + scrURL.m_cache); XFILE::CFile file; if (file.OpenForWrite(strCachePath,true)) file.Write(strHTML.data(),strHTML.size()); file.Close(); } return true; }