Ejemplo n.º 1
0
bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext)
{
  CURL url(scrURL.m_url);
  http.SetReferer(scrURL.m_spoof);
  CStdString strCachePath;

  if (scrURL.m_isgz)
    http.SetContentEncoding("gzip");

  if (!scrURL.m_cache.IsEmpty())
  {
    URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/"+cacheContext+"/"+scrURL.m_cache,
                              strCachePath);
    if (XFILE::CFile::Exists(strCachePath))
    {
      XFILE::CFile file;
      if (file.Open(strCachePath))
      {
        char* temp = new char[(int)file.GetLength()];
        file.Read(temp,file.GetLength());
        strHTML.clear();
        strHTML.append(temp,temp+file.GetLength());
        file.Close();
        delete[] temp;
        return true;
      }
    }
  }

  CStdString strHTML1(strHTML);

  if (scrURL.m_post)
  {
    CStdString strOptions = url.GetOptions();
    strOptions = strOptions.substr(1);
    url.SetOptions("");

    if (!http.Post(url.Get(), strOptions, strHTML1))
      return false;
  }
  else
    if (!http.Get(url.Get(), strHTML1))
      return false;

  strHTML = strHTML1;

  if (scrURL.m_url.Find(".zip") > -1 )
  {
    XFILE::CZipFile file;
    CStdString strBuffer;
    int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz);
    if (iSize)
    {
      strHTML.clear();
      strHTML.append(strBuffer.c_str(),strBuffer.data()+iSize);
    }
  }

  if (!scrURL.m_cache.IsEmpty())
  {
    CStdString strCachePath;
    URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/"+cacheContext+"/"+scrURL.m_cache,
                              strCachePath);
    XFILE::CFile file;
    if (file.OpenForWrite(strCachePath,true))
      file.Write(strHTML.data(),strHTML.size());
    file.Close();
  }
  return true;
}
Ejemplo n.º 2
0
bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext)
{
  CURL url(scrURL.m_url);
  http.SetReferer(scrURL.m_spoof);
  CStdString strCachePath;

  if (scrURL.m_isgz)
    http.SetContentEncoding("gzip");

  if (!scrURL.m_cache.empty())
  {
    strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/" + cacheContext + "/" + scrURL.m_cache);
    if (XFILE::CFile::Exists(strCachePath))
    {
      XFILE::CFile file;
      XFILE::auto_buffer buffer;
      if (file.LoadFile(strCachePath, buffer))
      {
        strHTML.assign(buffer.get(), buffer.length());
        return true;
      }
    }
  }

  CStdString strHTML1(strHTML);

  if (scrURL.m_post)
  {
    CStdString strOptions = url.GetOptions();
    strOptions = strOptions.substr(1);
    url.SetOptions("");

    if (!http.Post(url.Get(), strOptions, strHTML1))
      return false;
  }
  else
    if (!http.Get(url.Get(), strHTML1))
      return false;

  strHTML = strHTML1;

  std::string mimeType(http.GetMimeType());
  CMime::EFileType ftype = CMime::GetFileTypeFromMime(mimeType);
  if (ftype == CMime::FileTypeUnknown)
    ftype = CMime::GetFileTypeFromContent(strHTML);

  if (ftype == CMime::FileTypeZip || ftype == CMime::FileTypeGZip)
  {
    XFILE::CZipFile file;
    std::string strBuffer;
    int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz); // FIXME: use FileTypeGZip instead of scrURL.m_isgz?
    if (iSize > 0)
    {
      strHTML = strBuffer;
      CLog::Log(LOGDEBUG, "%s: Archive \"%s\" was unpacked in memory", __FUNCTION__, scrURL.m_url.c_str());
    }
    else
      CLog::Log(LOGWARNING, "%s: \"%s\" looks like archive, but cannot be unpacked", __FUNCTION__, scrURL.m_url.c_str());
  }

  std::string reportedCharset(http.GetServerReportedCharset());
  if (ftype == CMime::FileTypeHtml)
  {
    std::string realHtmlCharset, converted;
    if (!CCharsetDetection::ConvertHtmlToUtf8(strHTML, converted, reportedCharset, realHtmlCharset))
      CLog::Log(LOGWARNING, "%s: Can't find precise charset for HTML \"%s\", using \"%s\" as fallback", __FUNCTION__, scrURL.m_url.c_str(), realHtmlCharset.c_str());
    else
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for HTML \"%s\"", __FUNCTION__, realHtmlCharset.c_str(), scrURL.m_url.c_str());

    strHTML = converted;
  }
  else if (ftype == CMime::FileTypeXml)
  {
    CXBMCTinyXML xmlDoc;
    xmlDoc.Parse(strHTML, reportedCharset);
    
    std::string realXmlCharset(xmlDoc.GetUsedCharset());
    if (!realXmlCharset.empty())
    {
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for XML \"%s\"", __FUNCTION__, realXmlCharset.c_str(), scrURL.m_url.c_str());
      std::string converted;
      g_charsetConverter.ToUtf8(realXmlCharset, strHTML, converted);
      strHTML = converted;
    }
  }
  else if (ftype == CMime::FileTypePlainText || StringUtils::CompareNoCase(mimeType.substr(0, 5), "text/") == 0)
  {
    std::string realTextCharset, converted;
    CCharsetDetection::ConvertPlainTextToUtf8(strHTML, converted, reportedCharset, realTextCharset);
    strHTML = converted;
    if (reportedCharset != realTextCharset)
      CLog::Log(LOGWARNING, "%s: Using \"%s\" charset for plain text \"%s\" instead of server reported \"%s\" charset", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str(), reportedCharset.c_str());
    else
      CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for plain text \"%s\"", __FUNCTION__, realTextCharset.c_str(), scrURL.m_url.c_str());
  }
  else if (!reportedCharset.empty())
  {
    CLog::Log(LOGDEBUG, "%s: Using \"%s\" charset for \"%s\"", __FUNCTION__, reportedCharset.c_str(), scrURL.m_url.c_str());
    if (reportedCharset != "UTF-8")
    {
      std::string converted;
      g_charsetConverter.ToUtf8(reportedCharset, strHTML, converted);
      strHTML = converted;
    }
  }
  else
    CLog::Log(LOGDEBUG, "%s: Using content of \"%s\" as binary or text with \"UTF-8\" charset", __FUNCTION__, scrURL.m_url.c_str());

  if (!scrURL.m_cache.empty())
  {
    CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/" + cacheContext + "/" + scrURL.m_cache);
    XFILE::CFile file;
    if (file.OpenForWrite(strCachePath,true))
      file.Write(strHTML.data(),strHTML.size());
    file.Close();
  }
  return true;
}
Ejemplo n.º 3
0
bool CScraperUrl::Get(const SUrlEntry& scrURL, std::string& strHTML, XFILE::CCurlFile& http, const CStdString& cacheContext)
{
  CURL url(scrURL.m_url);
  http.SetReferer(scrURL.m_spoof);
  CStdString strCachePath;

  if (scrURL.m_isgz)
    http.SetContentEncoding("gzip");

  if (!scrURL.m_cache.IsEmpty())
  {
    strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/" + cacheContext + "/" + scrURL.m_cache);
    if (XFILE::CFile::Exists(strCachePath))
    {
      XFILE::CFile file;
      XFILE::auto_buffer buffer;
      if (file.LoadFile(strCachePath, buffer))
      {
        strHTML.assign(buffer.get(), buffer.length());
        return true;
      }
    }
  }

  CStdString strHTML1(strHTML);

  if (scrURL.m_post)
  {
    CStdString strOptions = url.GetOptions();
    strOptions = strOptions.substr(1);
    url.SetOptions("");

    if (!http.Post(url.Get(), strOptions, strHTML1))
      return false;
  }
  else
    if (!http.Get(url.Get(), strHTML1))
      return false;

  strHTML = strHTML1;
  std::string fileCharset(http.GetServerReportedCharset());

  if (scrURL.m_url.Find(".zip") > -1 )
  {
    XFILE::CZipFile file;
    CStdString strBuffer;
    int iSize = file.UnpackFromMemory(strBuffer,strHTML,scrURL.m_isgz);
    if (iSize)
    {
      fileCharset.clear();
      strHTML.clear();
      strHTML.append(strBuffer.c_str(),strBuffer.data()+iSize);
    }
  }

  if (!fileCharset.empty() && fileCharset != "UTF-8")
  {
    std::string converted;
    if (g_charsetConverter.ToUtf8(fileCharset, strHTML, converted) && !converted.empty())
      strHTML = converted;
  }

  if (!scrURL.m_cache.IsEmpty())
  {
    CStdString strCachePath = URIUtils::AddFileToFolder(g_advancedSettings.m_cachePath,
                              "scrapers/" + cacheContext + "/" + scrURL.m_cache);
    XFILE::CFile file;
    if (file.OpenForWrite(strCachePath,true))
      file.Write(strHTML.data(),strHTML.size());
    file.Close();
  }
  return true;
}