示例#1
0
bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext)
{
  CURL url(scrURL.m_url);
  http.SetReferer(scrURL.m_spoof);
  CStdString strCachePath;

  if (scrURL.m_isgz)
    http.SetContentEncoding("gzip");

  if (!scrURL.m_cache.empty())
  {
    strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/" + cacheContext + "/" + scrURL.m_cache);
    if (XFILE::CFile::Exists(strCachePath))
    {
      XFILE::CFile file;
      XFILE::auto_buffer buffer;
      if (file.LoadFile(strCachePath, buffer))
      {
        strHTML.assign(buffer.get(), buffer.length());
        return true;
      }
    }
  }

  CStdString strHTML1(strHTML);

  if (scrURL.m_post)
  {
    CStdString strOptions = url.GetOptions();
    strOptions = strOptions.substr(1);
    url.SetOptions("");

    if (!http.Post(url.Get(), strOptions, strHTML1))
      return false;
  }
  else
    if (!http.Get(url.Get(), strHTML1))
      return false;

  strHTML = strHTML1;

  std::string mimeType(http.GetMimeType());
  CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
  if (ftype == CMime::FileTypeUnknown)
    ftype = CMime::GetFileTypeFromContent(strHTML);

  if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
  {
    XFILE::CZipFile file;
    std::string strBuffer;
    int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
    if (iSize > 0)
    {
      strHTML = strBuffer;
      CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
    }
    else
      CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
  }

  std::string reportedCharset(http.GetServerReportedCharset());
  if (ftype == CMime::FileTypeHtml)
  {
    std::string realHtmlCharset, converted;
    if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
      CLog::Log(LOGWARNING, "%s: Can't find precise charset for HTML \"%s\", using \"%s\" as fallback", __FUNCTION__, scrURL.m_url.c_str(), realHtmlCharset.c_str());
    else
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for HTML \"%s\"", __FUNCTION__, realHtmlCharset.c_str(), scrURL.m_url.c_str());

    strHTML = converted;
  }
  else if (ftype == CMime::FileTypeXml)
  {
    CXBMCTinyXML xmlDoc;
    xmlDoc.Parse(strHTML, reportedCharset);
    
    std::string realXmlCharset(xmlDoc.GetUsedCharset());
    if (!realXmlCharset.empty())
    {
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for XML \"%s\"", __FUNCTION__, realXmlCharset.c_str(), scrURL.m_url.c_str());
      std::string converted;
      g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
      strHTML = converted;
    }
  }
  else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
  {
    std::string realTextCharset, converted;
    CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
    strHTML = converted;
    if (reportedCharset != realTextCharset)
      CLog::Log(LOGWARNING, "%s: Using \"%s\" charset for plain text \"%s\" instead of server reported \"%s\" charset", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str(), reportedCharset.c_str());
    else
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for plain text \"%s\"", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str());
  }
  else if (!reportedCharset.empty())
  {
    CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, reportedCharset.c_str(), scrURL.m_url.c_str());
    if (reportedCharset != "UTF-8")
    {
      std::string converted;
      g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
      strHTML = converted;
    }
  }
  else
    CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());

  if (!scrURL.m_cache.empty())
  {
    CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/" + cacheContext + "/" + scrURL.m_cache);
    XFILE::CFile file;
    if (file.OpenForWrite(strCachePath,true))
      file.Write(strHTML.data(),strHTML.size());
    file.Close();
  }
  return true;
}