/*
	OnButtonAdd()
*/
void CWallPaperCrawlerSettingsDomainDlg::OnButtonAdd(void)
{
	char szValue[MAX_URL+1];

	UpdateData(TRUE);
	strcpyn(szValue,m_strValue,sizeof(szValue));

	CUrl url;
	URL Url;

	// controlla che non contenga caratteri jolly
	if(strchr(szValue,'?') || strchr(szValue,'*'))
	{
		::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_WILDCARDS,szValue);
		return;
	}

	// controlla che sia un url
	if(!url.IsUrl(szValue))
	{
		::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_INVALID_HTTP_URL,szValue);
		return;
	}
		
	// controlla che sia un url HTTP
	if(!url.IsUrlType(szValue,HTTP_URL))
	{
		if(::MessageBoxResourceEx(this->m_hWnd,MB_YESNO|MB_ICONQUESTION,WALLPAPER_PROGRAM_NAME,IDS_QUESTION_INVALID_HTTP_URL,szValue)==IDNO)
			return;
	}

	// elimina l'eventuale '/' finale
	int nLen = strlen(szValue)-1;
	if(szValue[nLen]=='/')
		szValue[nLen] = '\0';

	url.SplitUrl(szValue,&Url);
	BOOL bOnlyHost = TRUE;
	if(strlen(Url.file) > 0)
		bOnlyHost = FALSE;
	if(strlen(Url.dir) > 0)
		if(strcmp(Url.dir,"/")!=0)
			bOnlyHost = FALSE;
	if(!bOnlyHost)
	{
		::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_NOT_HTTP_DOMAIN,szValue,strlen(Url.file) > 0 ? Url.file : Url.dir,Url.host);
		strcpyn(szValue,Url.host,sizeof(szValue));
	}

	if(strlen(szValue) > 0)
	{
		if(m_wndDomainList.FindItem(szValue) < 0)
			m_wndDomainList.SelectItem(m_wndDomainList.AddItem(szValue,0));
		else
			::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_VALUE_EXISTS,szValue);
	}
	else
		::MessageBoxResource(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_INVALID_VALUE);
}
Exemple #2
0
/*
 * Write callback: appends one received chunk to the response buffer kept in
 * the Uri passed as user data (presumably registered with libcurl as
 * CURLOPT_WRITEFUNCTION -- confirm at the registration site).
 *
 * buffer - chunk of received data (not NUL terminated)
 * size   - size of one element
 * nmemb  - number of elements; the chunk is size * nmemb bytes long
 * uri    - the Uri that owns the buffer and points at its CUrl via pcurl
 *
 * On the first call the response code, length and content type are fetched.
 * When the response length is known (request_size > 0) the buffer is
 * allocated once; otherwise it grows with every chunk. The buffer is always
 * kept NUL terminated at index request_size.
 *
 * Returns the number of bytes consumed, or 0 when the chunk could not be
 * stored (missing Uri/CUrl or out of memory).
 */
long CUrl::call_wirte_func(void *buffer, int size, int nmemb, void *uri)
{

	long count = size * nmemb;
	
	Uri *puri = (Uri *)uri;
	if (puri == 0)
		return 0;

	CUrl *pcurl = (CUrl *)puri->pcurl;
	// The code below dereferences pcurl unconditionally, so bail out early
	// instead of crashing when the Uri is not attached to a CUrl.
	if (pcurl == 0)
		return 0;

	if (pcurl->isfirstwirte == true)
	{
		pcurl->get_response_code();
		pcurl->get_response_length();
		pcurl->get_response_contenttype();

		puri->request_size =	pcurl->request_size;
		puri->request_cursize =	0;
	}

	if (pcurl->request_size > 0)
	{
		// Response length known up front: allocate the whole buffer once.
		if (puri->curbuf == NULL)
		{
			puri->curbuf = (char *)malloc(pcurl->request_size+1);
			if (puri->curbuf == NULL)
				return 0;
			// keep the bound used below in step with the allocation
			puri->request_size = pcurl->request_size;
		}

		// More data than announced would overrun the buffer: grow it so the
		// chunk and the terminator still fit.
		if (puri->request_cursize + count > puri->request_size)
		{
			long needed = puri->request_cursize + count;
			char *grown = (char *)realloc(puri->curbuf, needed+1);
			if (grown == NULL)
				return 0;
			puri->curbuf = grown;
			puri->request_size = needed;
		}

		memcpy(puri->curbuf + puri->request_cursize,(const char *)buffer, count);
		puri->request_cursize += count;
		//puri->curbuf_allocsize = pcurl->request_size;
		puri->curbuf_size += count;
		puri->curbuf[puri->request_size] = 0;
	}
	else
	{
		// Response length unknown: the buffer grows chunk by chunk.
		if (puri->curbuf == NULL)
		{
			puri->curbuf = (char *)malloc(count+1);
			if (puri->curbuf == NULL)
				return 0;
			memcpy(puri->curbuf, buffer, count);
			puri->request_cursize = count;
			puri->request_size = count;
			//puri->curbuf_allocsize = count;
			puri->curbuf_size = count;
		} 
		else
		{
			// realloc through a temporary so the old block is not lost
			// (leaked) when the reallocation fails
			char *grown = (char *)realloc(puri->curbuf, puri->request_size + count+1);
			if (grown == NULL)
				return 0;
			puri->curbuf = grown;
			memcpy(puri->curbuf + puri->request_size , buffer, count);
			puri->request_cursize += count;
			puri->request_size += count;
			//puri->curbuf_allocsize += count;
			puri->curbuf_size += count;
		}
		puri->curbuf[puri->request_size] = 0;
	}

	pcurl->isfirstwirte = false;
	return count;
}
Exemple #3
0
void Uri::Requset()
{
	if(isinit == true)
	{
		CUrl curl;
		curl.requset(this);
		pcurl = 0;
	}
	return ;
};
Exemple #4
0
EXPORT_C CUrl* CUrl::NewL(const TParseBase& aFileName)
//
//	Static factory: creates a CUrl for a file on the local file system.
//	The new object sits on the cleanup stack while ConstructL() runs and
//	is popped before being handed to the caller.
	{
	CUrl* self = new(ELeave) CUrl();
	CleanupStack::PushL(self);
	self->ConstructL(aFileName);
	CleanupStack::Pop();
	return self;
	}
Exemple #5
0
EXPORT_C CUrl* CUrl::NewL(const TDesC& aUrl)
//
//	Static factory: creates a CUrl from the url descriptor aUrl.
//	The new object sits on the cleanup stack while ConstructL() runs and
//	is popped before being handed to the caller.
	{
	CUrl* self = new(ELeave) CUrl();
	CleanupStack::PushL(self);
	self->ConstructL(aUrl);
	CleanupStack::Pop();
	return self;
	}
Exemple #6
0
// Returns true when an url with the same resource as Url is already in this
// list (the urls still to visit), false otherwise.
bool CManagedUrlList::CurrentlyListed(CUrl Url) {
	// Compare through the iterator: copying every CUrl just to read its
	// resource is wasted work.
	for (std::list<CUrl>::iterator j = this->begin(); j != this->end(); ++j) {
		// ### This will have to be more specific to match the server too
		if (Url.GetResource() == j->GetResource()) return true;
	}

	// Didn't find it in the list
	return false;
}
Exemple #7
0
// Returns true when an url with the same resource as Url is found in
// m_PastItems (the already visited resources), false otherwise.
bool CManagedUrlList::PreviouslyListed(CUrl Url) {
	// Compare through the iterator: copying every CUrl just to read its
	// resource is wasted work.
	for (std::list<CUrl>::iterator i = m_PastItems.begin(); i != m_PastItems.end(); ++i) {
		// ### This will have to be more specific to match the server too
		if (i->GetResource() == Url.GetResource()) {
			// Already been there
			return true;
		}
	}
	// Couldn't find it
	return false;
}
Exemple #8
0
// Posts RawData to iUrl.
// The socket is opened to the proxy when a proxy url is set and valid,
// otherwise to the host of iUrl. Returns 0 when neither is valid or when
// creating the socket left an error in wsLastError; otherwise returns the
// result of PostHTTP().
int CHttpRequest::Post(CUrl& iUrl, const CString& RawData){
  CString Server;
  int Port;
  if (ProxyURL.StrLength() && Proxy.isValid()) {
    Server = Proxy.GetHost();
    Port = Proxy.GetPortValue();
  } else {
    if (!iUrl.isValid()) return 0;
    Server = iUrl.GetHost();
    Port = iUrl.GetPortValue();
  }

  inetSocket Sock(Port, Server);
  if (wsLastError.StrLength()) return 0;

  // line terminator used in the request: CR LF
  char iE[] = "\r\n";
  return PostHTTP(iUrl, RawData, iE, Sock);
}
Exemple #9
0
// Executes the request for iUrl.
// Clears the previous results, opens a socket to the proxy (when a proxy url
// is set and valid) or to the host of iUrl, then tries GetHTTP10() and, if
// that fails, GetHTTP09(). Returns 1 on success, 0 on failure.
// RStatusValue is set to HTTPR_USER + 1 when there is no valid target and to
// HTTPR_USER + 2 when creating the socket left an error in wsLastError.
int CHttpRequest::Execute(CUrl& iUrl){
#ifdef _U_DEBUG
    cout << "CHttpRequest::Execute()" << endl;
#endif

    // forget whatever the previous request left behind
    RHeader.Free();
    RData.Free();
    RStatus.Free();
    RStatusValue = -1;
    RedirectVector.Clear();

    CString Server;
    int Port;
    if (ProxyURL.StrLength() && Proxy.isValid()) {
      Server = Proxy.GetHost();
      Port = Proxy.GetPortValue();
    } else {
      if (!iUrl.isValid()) {
        RStatusValue = HTTPR_USER + 1;
        return 0;
      }
      Server = iUrl.GetHost();
      Port = iUrl.GetPortValue();
    }

#ifdef _U_DEBUG
    cout << "CHttpRequest::Execute() - creating inetSocket" << endl;
#endif
    inetSocket Sock(Port, Server);
    if (wsLastError.StrLength()) {
#ifdef _U_DEBUG
      cout << "CHttpRequest::Execute() - ERROR at inetSocket - " << wsLastError << endl;
#endif
      RStatusValue = HTTPR_USER + 2;
      return 0;
    }
#ifdef _U_DEBUG
    cout << "CHttpRequest::Execute() - inetSocket created" << endl;
#endif

    // line terminator used in the request: CR LF
    char iE[] = "\r\n";

    if (GetHTTP10(iUrl, iE, Sock)) return 1;
    return GetHTTP09(iUrl, iE, Sock) ? 1 : 0;
}
bool CHttpConnection::connect(const CUrl &url) {
/*
** Host und Port aus CUrl holen
*/
  string host=url.getDomain();
  string sport=url.getPort();
  unsigned int port;

  if(sport=="")
    port=80;
  else
    port=atol(sport.c_str());
/*
** connect für Host und Port aufrufen
*/
  return(connect(host,port));
}
Exemple #11
0
// Attempts an HTTP/0.9 style retrieval of iUrl over Sock.
// The request line is "GET <absolute url>" (or "HEAD ..." when RLimit is 0)
// without a protocol version, followed by the RHeaderParams headers and an
// empty line; iE is the line terminator.
// Returns 1 when data was received (status forced to 200), 0 otherwise;
// RStatusValue becomes HTTPR_USER + 3 when no status was set at all.
int CHttpRequest::GetHTTP09(CUrl& iUrl, const CString& iE, inetSocket& Sock){
  CString Request;

  // request line: method + absolute url, port shown only when it is not 80
  Request += RLimit ? "GET " : "HEAD ";
  Request += iUrl.GetScheme();
  Request += "://";
  Request += iUrl.GetHost();
  if (iUrl.GetPortValue() != 80) {
    Request += ":";
    Request += iUrl.GetPort();
  }
  Request += iUrl.GetUrlPath();
  Request += iE;

  // header lines, one "name: value" per entry
  for (int idx = 0; idx < RHeaderParams.entries_count(); idx++) {
    Request += RHeaderParams.get_name(idx);
    Request += ": ";
    Request += RHeaderParams.get_value(idx);
    Request += iE;
  }
  // empty line ends the header section
  Request += iE;

#ifdef _U_DEBUG
  cout << "# HTTP 0.9 Request: =====" << endl;
  cout << Request;
  cout << "=====================" << endl;
#endif

  // issue request
  if (!Send(Sock, Request)) return 0;

  RHeader.Free();
  RData.Free();
  RStatus.Free();
  RStatusValue = -1;
  ProcessData(Sock);

  if (RData.StrLength()) {
    RHeaderResponse.clear();
    RStatusValue = 200;
    RStatus = "200";
    return 1;
  }

  // nothing received: keep a status set by ProcessData(), else flag our own
  if (RStatusValue == -1) RStatusValue = HTTPR_USER + 3;
  return 0;
}
Exemple #12
0
// Sends an HTTP/1.0 POST of RawData for iUrl over Sock and reads the reply.
// Through a valid proxy the request line carries the absolute url; otherwise
// it carries only the path and a "Host:" header is added. iE is the line
// terminator. Returns 1 when response data was received (status forced to
// 200), 0 otherwise.
int CHttpRequest::PostHTTP(CUrl& iUrl, const CString& RawData, const CString& iE, inetSocket& Sock){
  CString Request;
  const bool bViaProxy = ProxyURL.StrLength() && Proxy.isValid();

  // request line
  Request += "POST ";
  if (bViaProxy) {
    // absolute url, port shown only when it is not 80
    Request += iUrl.GetScheme();
    Request += "://";
    Request += iUrl.GetHost();
    if (iUrl.GetPortValue() != 80) {
      Request += ":";
      Request += iUrl.GetPort();
    }
  }
  Request += iUrl.GetUrlPath();
  Request += " HTTP/1.0";
  Request += iE;

  if (!bViaProxy) {
    Request += "Host: ";
    Request += iUrl.GetHost();
    Request += iE;
  }

  // header lines, one "name: value" per entry
  for (int idx = 0; idx < RHeaderParams.entries_count(); idx++) {
    Request += RHeaderParams.get_name(idx);
    Request += ": ";
    Request += RHeaderParams.get_value(idx);
    Request += iE;
  }
  // empty line, then the body
  Request += iE;
  Request += RawData;

  if (!Send(Sock, Request)) return 0;
  ProcessHeader(Sock);
  ProcessData(Sock);

  if (!RData.StrLength()) return 0;

  RHeaderResponse.clear();
  RStatusValue = 200;
  RStatus = "200";
  return 1;
}
Exemple #13
0
void CUrlManager::Perform()
{
	for (unordered_set<CUrl*>::iterator it = m_sUrl.begin(); it != m_sUrl.end(); )
	{
		CUrl* pUrl = *it;
		pUrl->m_eMultiCode = curl_multi_perform(pUrl->m_pCurlm, &pUrl->m_nStillRunning);
		if (pUrl->m_eMultiCode > CURLM_OK || pUrl->m_nStillRunning == 0)
		{
			pUrl->OnWriteOver();
			it = m_sUrl.erase(it);
			delete pUrl;
		}
		else
		{
			++it;
		}
	}
}
// Logs in for a task by fetching task->loginurl with the given curl handle.
//
// curl    - curl handle passed on to HttpProtocol::curl_login()
// task    - task whose login url is fetched
// urlnode - url node passed on to curl_login()
//
// Returns 1 when the login fetch did not end in one of the error codes (a
// "LOGIN OK" marker is then stored through saveCookie()), -1 when task is
// null, the task has no TaskOtherInfo, the login url cannot be parsed, or
// the fetch failed.
int FetcherManager::doLogin(CURL *curl, Task *task, UrlNode *urlnode) {
    // task must be validated before task->id is read below
    if (!task) {
        return -1;
    }
    InfoCrawler *infocrawler = InfoCrawler::getInstance();
    TaskOtherInfo *taskother = infocrawler->getTaskScheduleManager()->getTaskOtherInfo(task->id);
    if (!taskother) {
        return -1;
    }
    CUrl url;
    url.parse(task->loginurl);

    // an empty url after parsing means the login url was malformed
    if (url.getUrl().empty()) {
        return -1;
    }

    HttpProtocol httpprotocol;
    char downstatistic[512] ;
    downstatistic[0] = 0;
    RESPONSE_HEADER rheader;

    mylog_info(m_pLogGlobalCtrl->infolog, "before login %s - %s:%s:%d",url.getUrl().c_str(),INFO_LOG_SUFFIX);
    int ret = httpprotocol.curl_login(curl, url, urlnode, infocrawler->getConf()->httptimeout, &rheader, downstatistic);
    mylog_info(m_pLogGlobalCtrl->infolog, "after login  %s %s %d - %s:%s:%d",url.getUrl().c_str(), downstatistic, ret,INFO_LOG_SUFFIX);
    /* if (ret == HTTP_FETCH_RET_REDIRECT) { //redirect
         errorlog("LOGIN ERROR: fetched %s  relocated to %s taskid %d\n", url.getUrl().c_str() ,(char *)page.m_sLocation.c_str(),task->id);
     } else*/
    if (ret == HTTP_FETCH_RET_ERROR) {//just discard
        mylog_error(m_pLogGlobalCtrl->errorlog, "login fetched %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), task->id,INFO_LOG_SUFFIX,ret);
    } else if (ret == HTTP_FETCH_RET_ERROR_INVALIDHOST) { //invalid host, can not access
        mylog_error(m_pLogGlobalCtrl->errorlog, "login fetched %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), task->id,INFO_LOG_SUFFIX,ret);
    } else if (ret == HTTP_FETCH_RET_ERROR_UNACCEPTED) { //content is invalid, discard
        mylog_error(m_pLogGlobalCtrl->errorlog, "LOGIN fetched %s unaccepted contenttyped %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), rheader.contenttype.c_str(), task->id,INFO_LOG_SUFFIX,ret);
    } else
    {
        // fetchingcookie is raised only while the marker is being saved
        taskother->fetchingcookie = true;
        // a writable array: a string literal must not be bound to char*
        static char loginok[] = "LOGIN OK";
        saveCookie(task->id, loginok, strlen(loginok));
        taskother->fetchingcookie = false;
        return 1;
    }
    return -1;
}
Exemple #15
0
EXPORT_C TInt CUrl::Compare(CUrl& aUrl, TInt aCompareComps) const
//
//	Compares this url with aUrl, component by component.
//	aCompareComps is a bit mask selecting the components to compare; they
//	are checked in the order scheme, location, username, password, path,
//	query, fragment. The scheme is compared with CompareF() (case
//	insensitive), every other component with Compare() (case sensitive).
//	Returns the result of the first comparison that differs, or 0 when all
//	selected components match.
	{
	TInt diff = 0;

	if (aCompareComps & EUrlScheme)
		diff = Component(EUrlScheme).CompareF(aUrl.Component(EUrlScheme));

	if (diff == 0 && (aCompareComps & EUrlLocation))
		diff = Component(EUrlLocation).Compare(aUrl.Component(EUrlLocation));

	if (diff == 0 && (aCompareComps & EUrlUsername))
		diff = Component(EUrlUsername).Compare(aUrl.Component(EUrlUsername));

	if (diff == 0 && (aCompareComps & EUrlPassword))
		diff = Component(EUrlPassword).Compare(aUrl.Component(EUrlPassword));

	if (diff == 0 && (aCompareComps & EUrlPath))
		diff = Component(EUrlPath).Compare(aUrl.Component(EUrlPath));

	if (diff == 0 && (aCompareComps & EUrlQuery))
		diff = Component(EUrlQuery).Compare(aUrl.Component(EUrlQuery));

	if (diff == 0 && (aCompareComps & EUrlFragment))
		diff = Component(EUrlFragment).Compare(aUrl.Component(EUrlFragment));

	return diff;
	}
Exemple #16
0
// Executes the request for Url with the current request method.
// GET and HEAD go through ExecuteGet(); a GET with a request size limit of 0
// is turned into a HEAD first. POST is not implemented here (it asserts) and,
// like any other method, returns false.
bool CHttpRequest::Execute(const CUrl& Url) {
    Trace(tagHttp, levInfo, ("CHttpRequest - CHttpRequest {%s}", Url.GetBrute().GetBuffer()));
    m_Url = Url;
    ClearResults(true);

    if (m_RequestMethod == htPost) {
        assert(0);
        if (CreateSocket()) {
        }
        return false;
    }

    if (m_RequestMethod == htGet || m_RequestMethod == htHead) {
        // nothing may be downloaded: a HEAD request is enough
        if (m_RequestMethod == htGet && m_RequestSizeLimit == 0) m_RequestMethod = htHead;
        return (ExecuteGet(true) != -1);
    }

    return false;
}
Exemple #17
0
// Opens a request: aborts any request in progress, registers this object
// under a fresh request id, connects to the host of strUrl and opens an HTTP
// request for its path plus extra info using strMethod.
// varUser / varPassword are optional (VT_ERROR means "not supplied"); empty
// strings are passed to InternetConnect() as NULL.
// Returns S_OK on success, the failure code of varGetString(), or
// GetErrorResult() when the connection or the request cannot be opened.
STDMETHODIMP CBHttpRequest::Open(BSTR strMethod, BSTR strUrl, VARIANT_BOOL bAsync, VARIANT varUser, VARIANT varPassword)
{
    CUrl url;
    CStringA strObject;
    CStringA strUser;
    CStringA strPassword;

    Abort();

    // allocate a new request id and register this request under it
    s_cs.Enter();
    s_dwReqID ++;
    m_dwReqID = s_dwReqID;
    s_mapReq.SetAt(m_dwReqID, this);
    s_cs.Leave();

    url.CrackUrl(CBStringA(strUrl));
    m_bAsync = (bAsync != VARIANT_FALSE);

    // object to request = url path followed by the extra info
    strObject = url.GetUrlPath();
    strObject.Append(url.GetExtraInfo());

    if(varUser.vt != VT_ERROR)
    {
        const HRESULT hrUser = varGetString(varUser, strUser);
        if(FAILED(hrUser))
            return hrUser;
    }

    if(varPassword.vt != VT_ERROR)
    {
        const HRESULT hrPassword = varGetString(varPassword, strPassword);
        if(FAILED(hrPassword))
            return hrPassword;
    }

    LPCSTR pszUser = strUser.IsEmpty() ? NULL : (LPCSTR)strUser;
    LPCSTR pszPassword = strPassword.IsEmpty() ? NULL : (LPCSTR)strPassword;

    m_hConnection = InternetConnect(m_hSession, url.GetHostName(), url.GetPortNumber(),
                                    pszUser, pszPassword,
                                    INTERNET_SERVICE_HTTP, 0, m_dwReqID);
    if(m_hConnection == NULL)
        return GetErrorResult();

    m_hFile = HttpOpenRequest(m_hConnection, CBStringA(strMethod), strObject, NULL, NULL, NULL, m_dwFlags, m_dwReqID);
    if(m_hFile == NULL)
        return GetErrorResult();

    m_eventComplete.Set();

    return S_OK;
}
Exemple #18
0
// Splits Url into a vector of strings:
//   [0]  "<scheme>:/" (with one more '/' for the file scheme under _UNIX)
//   [1]  "<host>" or "<host>:<port>" (port shown only when it is not 80);
//        left out when the host is empty
//   ...  the url path split on '/'
// When the first path element is empty it is removed (presumably the empty
// piece produced by the leading '/' of the path).
CVector<CString> CUrlTree::UrlToVector(const CUrl& Url) const {
    CVector<CString> Parts;

    // scheme element
    CString Piece;
    Piece += Url.GetScheme();
    Piece += ":/";
#ifdef _UNIX
    if (Url.GetScheme().Same(g_strProto_FILE)) {
        Piece += "/";
    }
#endif
    Parts += Piece;

    // host element, only when there is a host
    Piece = Url.GetHost();
    if (Piece.GetLength()) {
        if (Url.GetPortValue() != 80) {
            Piece += ":";
            Piece += Url.GetPort();
        }
        Parts += Piece;
    }

    // path elements go after the scheme/host elements
    const int FirstPathIndex = Parts.GetSize();
    CVector<CString> PathParts;
    CString::StrToVector(Url.GetUrlPath(), '/', &PathParts);
    Parts += PathParts;

    if (((int) Parts.GetSize() > FirstPathIndex) && (!Parts[FirstPathIndex].GetLength()))
        Parts.RemoveAt(FirstPathIndex);

    return Parts;
}
Exemple #19
0
	// Interface-query hook for IAuthenticate.
	//
	// pv   - the MonitorSink the query is made on
	// riid - requested interface; only IID_IAuthenticate is served
	// ppv  - receives the IAuthenticate pointer (AddRef'ed) on success, NULL otherwise
	// dw   - unused
	//
	// Reads the WWW-Authenticate header of the current response, extracts the
	// authentication scheme and realm, and asks the plugin host through
	// NPN_GetAuthenticationInfo() for stored credentials, which are saved in
	// m_strUsername / m_strPassword. Returns S_OK when the interface is handed
	// out, E_NOINTERFACE in every other case (including NTLM challenges).
	HRESULT WINAPI MonitorSink::QueryIAuthenticate(void* pv, REFIID riid, LPVOID* ppv, DWORD dw)
	{
		* ppv = NULL;

		if ( pv && InlineIsEqualGUID(riid, IID_IAuthenticate) )
		{
			MonitorSink * pThis = (MonitorSink *)pv;

			if ( pThis->m_pIEHostWindow && ! pThis->m_strURL.IsEmpty() && pThis->m_spTargetProtocol )
			{
				// do { ... } while (false): "break" leaves for the common E_NOINTERFACE exit
				do 
				{
					CComPtr<IWinInetHttpInfo> spWinInetHttpInfo;
					if ( FAILED(pThis->m_spTargetProtocol->QueryInterface(&spWinInetHttpInfo)) ) break;
					if ( ! spWinInetHttpInfo ) break;

					CHAR szRawHeader[8192];		// the raw header returned by IWinInetHttpInfo::QueryInfo() is not Unicode
					DWORD dwBuffSize = ARRAYSIZE(szRawHeader);

					if ( FAILED(spWinInetHttpInfo->QueryInfo(HTTP_QUERY_RAW_HEADERS, szRawHeader, &dwBuffSize, 0, NULL)) ) break;

					CString strHeader;
					HttpRawHeader2CrLfHeader(szRawHeader, strHeader);

					static const WCHAR AUTH_HEAD [] = L"\r\nWWW-Authenticate:";

					LPWSTR lpAuth = NULL;
					size_t nAuthLen = 0;
					if ( ! ExtractFieldValue( strHeader, AUTH_HEAD, & lpAuth, & nAuthLen ) ) break;
					if ( ! lpAuth ) break;

					CString strAuthScheme;
					CString strAuthRealm;

					// The header may look like any of these:
					// WWW-Authenticate: Basic realm="Secure Area"
					// WWW-Authenticate: Digest realm="*****@*****.**", qop="auth,auth-int", nonce="dcd98b7102dd2f0e8b11d0f600bfb0c093", opaque="5ccc069c403ebaf9f0171e9517f40e41"
					// WWW-Authenticate: NTLM
					// WWW-Authenticate: NTLM <auth token>
					LPWSTR pPos = StrStrW(lpAuth, L" ");
					if ( pPos )
					{
						// the scheme is everything up to the first space
						* pPos = L'\0';
						strAuthScheme = lpAuth;

						// the realm is the quoted value after realm=, if present
						do 
						{
							pPos = StrStrIW( pPos + 1, L"realm");
							if ( ! pPos ) break;
							pPos = StrChrW( pPos + 5, L'=');
							if ( ! pPos ) break;
							pPos = StrChrW( pPos + 1, L'"');
							if ( ! pPos ) break;
							LPWSTR lpRealm = pPos + 1;
							pPos = StrChrW( lpRealm, L'"');
							if ( ! pPos ) break;
							* pPos = L'\0';

							strAuthRealm = lpRealm;

						} while (false);

					}
					else
					{
						// no space: the whole value is the scheme
						strAuthScheme = lpAuth;
					}

					VirtualFree( lpAuth, 0, MEM_RELEASE);

					// NPN_GetAuthenticationInfo cannot supply the NTLM domain, so a login
					// is impossible and NTLM is not supported. The check has to look at
					// the scheme: an NTLM challenge carries no realm at all.
					if (strAuthScheme == _T("NTLM")) return E_NOINTERFACE;

					CUrl url;
					if ( url.CrackUrl(pThis->m_strURL) )
					{
						CW2A aScheme(url.GetSchemeName());
						CW2A aHost(url.GetHostName());
						int aPort = url.GetPortNumber();

						char* username = NULL;
						char* password = NULL;
						uint32_t ulen = 0, plen = 0;

						char* szAuthScheme = CStringToUTF8String(strAuthScheme);
						char* szAuthRealm = CStringToUTF8String(strAuthRealm);
						NPError result = NPN_GetAuthenticationInfo(pThis->m_pIEHostWindow->m_pPlugin->m_pNPInstance, aScheme, aHost, aPort, szAuthScheme, szAuthRealm, &username, &ulen, &password, &plen );
						delete[] szAuthScheme;
						delete[] szAuthRealm;
						if (result != NPERR_NO_ERROR) break;

						pThis->m_strUsername = username;
						pThis->m_strPassword = password;

						NPN_MemFree(username);
						NPN_MemFree(password);
					}

					* ppv = dynamic_cast<IAuthenticate *>(pThis);

					((IUnknown*)*ppv)->AddRef();

					return S_OK;

				} while (false);
			}
		}

		return E_NOINTERFACE;
	}
Exemple #20
0
// Worker loop of a crawler thread.
//
// arg - the CCrawl instance whose DownroadFile() does the actual download
//
// Repeatedly takes one url out of map_urls (guarded by mutex_collection),
// parses it and downloads it into two per-thread output files: the raw page
// store and the link structure store. When the queue is empty the thread
// sleeps for a second. The loop ends once b_f_over is set and the thread has
// slept at least 200 times.
void CCrawl::Fetch(void *arg)
{
	string str_url;

	int nGsock = -1;// socket descriptor left over from the previous download
	string strGHost;// host of the previous download

	// create a PSE file holding the page data: PSE + thread id + .txt
	//string ofs_name = DATA_PSE_FILE + "." + CStrFunction::itos(GetCurrentThreadId());//PSE.raw + current thread id

	string ofs_name = DATA_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";
	CPSEFile pse_file(ofs_name);// raw page store in PSE format

	// create a link_for_pse file holding the link data
	ofs_name = DATA_LINK_FOR_PSE_FILE  + CStrFunction::itos(GetCurrentThreadId())+ ".txt";
	CLinkForPSEFile link_for_pse_file(ofs_name);// page structure store

	int isleep_cnt = 0;// number of times this thread has slept on an empty queue

	for(;;)
	{
		WaitForSingleObject(mutex_collection,INFINITE);// lock the url queue

		int cnt = map_urls.size();
		if(cnt > 0)
		{
			// urls collected but not visited yet
			cout<<"collection has "<<cnt<<" unvisited urls"<<endl;
			multimap<string,string>::iterator it = map_urls.begin();
			if(it != map_urls.end())
			{
				// take one url out of the queue of urls to visit
				str_url = (*it).second;
				map_urls.erase(it);

				ReleaseMutex(mutex_collection);

				// split the url; give up on it when parsing fails
				CUrl iurl;

				if(iurl.ParseUrl(str_url) == false)
				{
					cout<<"parse url false in Fetch"<<str_url<<endl;
					continue;
				}

				// The page lives on another host than the previous one: the
				// old socket cannot be reused, so close it and start over.
				if(strGHost != iurl.host_name)
				{
					// -1 means there is no socket to close
					if(nGsock != -1)
						closesocket(nGsock);
					nGsock = -1;
					strGHost = iurl.host_name;
				}

				// download the page for this url and store it in the raw
				// page store and the page structure store
				((CCrawl *)arg)->DownroadFile(&pse_file,&link_for_pse_file,iurl,nGsock);
			}else	
			{
				ReleaseMutex(mutex_collection);
				
			}
		}else
		{
			// the queue of urls to visit is empty: release the lock and wait
			ReleaseMutex(mutex_collection);
			Sleep(1000);
			isleep_cnt++;
		}

		// ">=" rather than "==": the counter keeps growing, so an exact match
		// would be missed for good when b_f_over is raised after the 200th sleep
		if(b_f_over == true && isleep_cnt >= 200)
		{
			break;
		}
	}

	pse_file.Close();
	link_for_pse_file.Close();

}
Exemple #21
0
// Puts url into the map_urls container of urls waiting to be fetched.
//
// The url is dropped when it is null, shorter than 8 characters, cannot be
// normalized or parsed, names an invalid or foreign host, is a rejected ip
// address, or has already been visited or queued (checked through its MD5
// digest). Image urls are not queued; they are written to the history link
// file instead.
void CCrawl::AddUrl(const char * url)
{
	// a null pointer cannot be turned into a string
	if(url == NULL)
	{
		cout<<"the url is empty or too short"<<endl;
		return ;
	}

	string str_url = url;
	if(str_url.length() < 8)
	{
		cout<<"the url is empty or too short"<<endl;
		return ;
	}

	CPage ipage;
	if(ipage.NormalizeUrl(str_url) == false)
		return ;

	CUrl iurl;

	// image urls go to the history link file
	if(iurl.IsImageUrl(str_url))
	{
		if(ofs_link_for_history_file)
		{
			WaitForSingleObject(mutex_link_for_history_file,INFINITE);

			ofs_link_for_history_file<<str_url<<endl;

			ReleaseMutex(mutex_link_for_history_file);
		}

		return ;
	}

	if(iurl.ParseUrl(str_url) == false)
	{
		cout<<"parse url error in AddUrl"<<endl;
		return ;
	}

	if(iurl.IsValidHost(iurl.host_name.c_str()) == false)
	{
		cout<<"not the valid host in AddUrl"<<endl;
		return ;
	}

	if(iurl.IsForeignHost(iurl.host_name.c_str()) )
	{
		cout<<"foreign host in AddUrl"<<endl;
		return ;
	}

	// when the host is an ip address, drop it if that address is rejected
	unsigned long inaddr = 0;
	char *ip = NULL;

	inaddr =(unsigned long) inet_addr(iurl.host_name.c_str());

	if(inaddr != INADDR_NONE)
	{
		// IsValidIp() is handed a private, writable copy of the host name
		ip = new char[iurl.host_name.size() + 1];
		memset(ip,0,iurl.host_name.size() + 1);
		memcpy(ip,iurl.host_name.c_str(),iurl.host_name.size());
		bool ip_ok = iurl.IsValidIp(ip) ? true : false;

		delete []ip;
		ip = NULL;

		if(!ip_ok)
			return ;
	}


	CStrFunction::StrToLower(iurl.host_name,iurl.host_name.size());

	CMD5 imd5;
	imd5.GenerateMd5((unsigned char *)str_url.c_str(),str_url.size());

	string str_digest = imd5.ToString();

	// NOTE(review): set_visited_url_md5 is read here without a lock -- confirm
	// which mutex its writers hold and take it here as well.
	if(set_visited_url_md5.find(str_digest) != set_visited_url_md5.end())
	{
		return ;
	}

	// Look up and insert under one lock, so that two threads adding the same
	// url cannot both find it missing and queue it twice.
	WaitForSingleObject(mutex_unvisited_url_md5,INFINITE);
	if(set_unvisited_url_md5.find(str_digest) != set_unvisited_url_md5.end())
	{
		ReleaseMutex(mutex_unvisited_url_md5);
		return ;
	}
	set_unvisited_url_md5.insert(str_digest);
	ReleaseMutex(mutex_unvisited_url_md5);

	// Queue the url under its host name. map_urls is guarded by
	// mutex_collection (the lock Fetch() holds while it reads and erases),
	// so the insert must take that same lock.
	WaitForSingleObject(mutex_collection,INFINITE);
	map_urls.insert(val_type(iurl.host_name,str_url));
	ReleaseMutex(mutex_collection);

}
Exemple #22
0
EXPORT_C void CUrl::SetL(CUrl& aUrl)
//
//	Replaces this url's descriptor with a heap copy of aUrl's descriptor.
//	The copy is allocated before the old descriptor is deleted.
	{
	HBufC* copy = aUrl.UrlDes().AllocL();
	delete iUrlDes;
	iUrlDes = copy;
	}
int FetcherManager::fetch() {
    InfoCrawler *infocrawler = InfoCrawler::getInstance();
    UrlAnalyseManager *urlAnalyseManager = infocrawler->getUrlAnalyseManager();

    CURL *curl = curl_easy_init();
    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); //just to start the cookie engine
    curl_easy_setopt(curl, CURLOPT_SHARE, sh);

    while(running()) {
        curl_easy_reset(curl);

        UrlNode *urlnode = NULL;
        bool html_from_outer= false;

        urlnode = urlAnalyseManager->getUrlFromOuterHtml();
        if (urlnode) {
            html_from_outer = true;
        } else {
            urlnode = urlAnalyseManager->getUrl();
        }

        if (urlnode == NULL) {
            my_sleep(100 * 1000); //0.1s
            continue;
        }
        if (!(urlnode->task))
        {
            mylog_info(m_pLogGlobalCtrl->infolog, "node task is null %s - %s:%s:%d",urlnode->url,INFO_LOG_SUFFIX);
        }
        TaskOtherInfo *taskother = infocrawler->getTaskScheduleManager()->getTaskOtherInfo(urlnode->taskid);
        int taskbatch = urlnode->taskbatch;
        if (urlnode->needtologin) {
            //need to login and cookie is null
            if (!(infocrawler->getTaskScheduleManager()->getCookieFromTask(urlnode->taskid))) {
                if (taskother->fetchingcookie) {
                    infocrawler->getUrlAnalyseManager()->insertUrl(urlnode);
                    infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(urlnode->task,taskbatch);
#ifdef URLMEMCACHEDB
                    infocrawler->deleteUrlMcLocalThread();
#endif
                    continue;
                } else {
                    doLogin(curl, urlnode->task, urlnode);
                }
            }
        }

        /*if (urlnode->task->sourcetype == SOURCE_TYPE_COMPANY && urlnode->type & URL_TYPE_HOMEPAGE)
        {
            strcat(urlnode->url, "&event=32698647");
            strcpy(urlnode->refererurl, "http://search.china.alibaba.com/tools/validate_redirect.htm?ru=http%253A%252F%252Fsearch.china.alibaba.com%252Fcompany%252Fcompany_search.htm%253Fkeywords%253D%25CA%25D6%25BB%25FA%2526pageSize%253D30%2526n%253Dy%2526showStyle%253Dpopular%2526beginPage%253D4&event=32698647&n=y");
        }*/
        CUrl url;
        url.parse(urlnode->url);
        //wrong url format
        if (url.getUrl().empty()) {
            infocrawler->getTaskScheduleManager()->increaseTaskErrorUrlNum(urlnode->taskid);
            infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(urlnode->task, taskbatch);
            infocrawler->getLocalDbManager()->decidesaveFetched(urlnode);
            delete urlnode;
#ifdef URLMEMCACHEDB
            infocrawler->deleteUrlMcLocalThread();
#endif
            continue;
        }
        Page page;
        Buffer *content = create_buffer(DEFAULT_PAGE_BUF_SIZE);


        //do fetch
        HttpProtocol httpprotocol;
        char downstatistic[512] ;
        downstatistic[0] = 0;
        RESPONSE_HEADER rheader;

//        mylog_info(m_pLogGlobalCtrl->infolog, "before fetch %s %s %llu %d %d  - %s:%s:%d",url.getUrl().c_str(), urlnode->url, urlnode->id, urlnode->taskid, urlnode->errornum,INFO_LOG_SUFFIX);
        //int ret = httpprotocol.fetch(url, content, urlnode, page, infocrawler->getConf()->httptimeout,urlnode->task->tasksendtype);
//        int ret = httpprotocol.curl_fetch(curl, url, content, urlnode, infocrawler->getConf()->httptimeout, urlnode->task->tasksendtype, &rheader, downstatistic);
        int sendtype = urlnode->task->tasksendtype;
        if (urlnode->task->sourcetype == SOURCE_TYPE_COMPANY && urlnode->type & URL_TYPE_HOMEPAGE)
        {
            /*FILE * f = fopen("ali.txt", "rb");
            char line[1024] = {0};
            int i = 0;
            string cookie;
            string post;
            while(fgets(line, 1023, f)) {
            	char *newline = strtrim(line, NULL);
            	if (i++ == 0) {
            		cookie = newline;
            	} else {
            		post = newline;
            	}
            }
            fclose(f);
            */
            sendtype = REQUEST_TYPE_GET;
        }

        int ret = 0;
        if (!html_from_outer) {
            ret = httpprotocol.curl_fetch(curl, url, content, urlnode, infocrawler->getConf()->httptimeout, sendtype, &rheader, downstatistic);
            mylog_info(m_pLogGlobalCtrl->infolog, "after fetched %s %s %d - %s:%s:%d",url.getUrl().c_str(), downstatistic, ret,INFO_LOG_SUFFIX);
        } else {
            add_buffer(content, (char *)urlnode->html.c_str(), urlnode->html.length());
            ret = urlnode->html.length();
            mylog_info(m_pLogGlobalCtrl->infolog, "get url from outer %s %d - %s:%s:%d", url.getUrl().c_str(), ret,INFO_LOG_SUFFIX);
        }

        /*if (ret == HTTP_FETCH_RET_REDIRECT) { //redirect
            int redirectnum = urlnode->redirectnum +1;
            if (redirectnum <= URL_FETCH_REDIRECT_TIMES)
            {
                UrlNode *newurlnode = new UrlNode(urlnode->task,urlnode->topicsource,urlnode->title,urlnode->taskbatch,(char *)urlnode->fatherurl,(char *)page.m_sLocation.c_str(), urlnode->other, urlnode->maxtype,urlnode->type, 0, urlnode->id,redirectnum ,urlnode->page,urlnode->layerid,urlnode->bbsid,urlnode->needtologin);

                newurlnode->insertother(URLNODE_OTHER_TYPE_COOKIE,(char *)page.m_sCookie.c_str(), page.m_sCookie.length());
                errorlog("ERROR: fetched %s %s relocated to %s %llu %d\n", url.getUrl().c_str(), urlnode->url, newurlnode->url, newurlnode->id, newurlnode->taskid);
                infocrawler->getUrlAnalyseManager()->insertUrl(newurlnode);
            }else
            {
                errorlog("ERROR: redirectunm > %d fetched %s %s relocated to %s %d\n", URL_FETCH_REDIRECT_TIMES, url.getUrl().c_str(), urlnode->url, (char * )page.m_sLocation.c_str(), urlnode->taskid);
            }
            urlnode->errornum = 0;
            */
        if (ret == HTTP_FETCH_RET_ERROR) {//just discard
            urlnode->errornum++;
            mylog_error(m_pLogGlobalCtrl->errorlog, "fetched %s  - %s:%s:%d:%d", url.getUrl().c_str(),INFO_LOG_SUFFIX,urlnode->errornum);
            /*} else if (ret == HTTP_FETCH_RET_ERROR_INVALIDHOST) { //invalid host, can not access
            urlnode->errornum++;
                errorlog("ERROR: fetched %s invalidhost %d\n", url.getUrl().c_str(), urlnode->errornum);
                */
        } else if (ret == HTTP_FETCH_RET_ERROR_UNACCEPTED) { //content is invalid, discard
            urlnode->errornum = URL_FETCH_RETRY_TIMES;
            //errorlog("ERROR: fetched %s unaccepted contenttype %d %s\n", url.getUrl().c_str(), urlnode->errornum, page.m_sContentType.c_str());
        } else { //ok
            //increase fetch num
            if (urlnode->type & URL_TYPE_NEEDTOSAVE)
                infocrawler->getTaskScheduleManager()->increaseFetchNum(urlnode->task);

            urlnode->errornum = 0;
            //extract urls and analyse, insert new url into queue
            char nextpageurl[MAX_URL_LEN] ;
            nextpageurl[0] = 0;
            int nextpage = infocrawler->getUrlAnalyseManager()->analyseUrls(urlnode, &rheader, content->data, ret, nextpageurl, html_from_outer);
            if (html_from_outer) {
                nextpage = 0;
                nextpageurl[0] = 0;
            }

            //write content to disk if we need, write fetched url into dist
            if (urlnode->type & URL_TYPE_NEEDTOSAVE) {
                if (urlnode->task->sourcetype == SOURCE_TYPE_BBS)
                {
                    char oldurlnodedata[64];
                    int tasktmp = 0;
                    int pagetmp = 0;
                    ulonglong idtmp = 0;
                    if (InfoCrawler::getInstance()->getLocalDbManager()->alreadyfetched(urlnode,oldurlnodedata))
                    {
                        sscanf(oldurlnodedata, "%llu/%d/%*d/%*d/%d/%*u", &idtmp,&pagetmp, &tasktmp);
                        if (pagetmp == urlnode->page)
                        {
                            int rettmp = infocrawler->getLocalDbManager()->erasecontent(idtmp,tasktmp);
                        }
                    }
                }
                if (urlnode->nextpage == 1 && nextpage >1)
                {
                    urlnode->nextpage = nextpage;
                }
                mylock::get_instance()->get(urlnode->id);
                infocrawler->getLocalDbManager()->savecontent(urlnode, &rheader, content->data, ret, nextpage);
                mylock::get_instance()->put(urlnode->id);
                //infocrawler->getPageManager()->SavePage(content->data, ret, urlnode, &rheader);

                mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s saved content%d %d title %s urlid %llu taskid %d batchid %d - %s:%s:%d",url.getUrl().c_str(), urlnode->errornum, ret,urlnode->title,urlnode->id, urlnode->taskid, urlnode->taskbatch ,INFO_LOG_SUFFIX);
                /*if ((urlnode->nextpage > 1)&& !(urlnode->type & URL_TYPE_HOMEPAGE)) {//if have nextpage, don't not save fetched
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s have next, so don't not save fetched - %s:%s:%d",url.getUrl().c_str(),INFO_LOG_SUFFIX);
                } else {
                    infocrawler->getLocalDbManager()->saveFetched(urlnode);
                    infocrawler->getLocalDbManager()->saveUrl(urlnode, SAVE_FATHER_URL);
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s save fetched%d - %s:%s:%d",url.getUrl().c_str(),urlnode->errornum,INFO_LOG_SUFFIX);
                }*/
                if (urlnode->nowpage == urlnode->totalpage)
                {
                    infocrawler->getLocalDbManager()->saveFetched(urlnode);
                    infocrawler->getLocalDbManager()->saveUrl(urlnode, SAVE_FATHER_URL);

                    //0128.begin()
                    /* char *content=NULL;
                     char dbname[64] = "";
                     char recordname[64] = "";
                     char urldbname[64] = "";
                     DBAccess *dbaccess = DBAccess::getInstance();

                     getContentDBName1(urlnode, dbname);
                     getRecordKeyName1(urlnode, recordname);
                     getUrlDBName1(urlnode, urldbname);

                     int suffix = dbaccess->load(dbname);
                     string fileno;
                     DBD *dbd = dbaccess->get(suffix, fileno, recordname, NULL);
                     if (dbd != NULL)
                     {
                         DbHypertableManager * dbhyper=InfoCrawler::getInstance()->gethyper();
                          ICCONFIG   *ifcong_=InfoCrawler::getInstance()->getConf();

                        // dbhyper->get_now_time();
                        string now_=TimeToString1();
                        //string now_;
                        if( dbhyper->insert_data_to_hypertable_content(urlnode,dbd->datbuf,dbd->datlen_u,string("content_tbl"),string("gbk"),ret,now_) )
                        {
                              char fetchtbl[32];fetchtbl[0]=0;
                             //sprintf(fetchtbl,"fetch_%d_tbl",urlnode->taskid);
                              sprintf(fetchtbl,"fetch_%d_tbl",1);
                             if( dbhyper->insert_data_to_hypertable_fetch(urlnode,string(fetchtbl),ifcong_->spider_id,ret,now_) )
                             {
                                char memorytable[128]; memorytable[0]=0;
                                 //sprintf(memorytable,"url_%d_tbl",urlnode->taskid);
                                 sprintf(memorytable,"url_%d_tbl",1);
                                 dbhyper->insert_data_to_hypertable_memorytable(urlnode,memorytable);
                              }
                         }

                          dbd_free(dbd);
                     }*/
                    //0128.end()
                    /*char * final_content;
                    final_content=NULL;
                    final_content=get_final_content(urlnode);
                    if(final_content !=NULL)
                    {
                        insert_data_to_hypertable(urlnode->fatherurl,final_content);
                        delete []final_content;
                    }*/
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s save fetched%d nowpage %d totalpage %d - %s:%s:%d",url.getUrl().c_str(),urlnode->errornum,urlnode->nowpage,urlnode->totalpage,INFO_LOG_SUFFIX);
                } else
                {
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s have next, so don't not save fetched nowpage %d totalpage %d  - %s:%s:%d",url.getUrl().c_str(),urlnode->nowpage,urlnode->totalpage,INFO_LOG_SUFFIX);
                }
            }
            //insert next page
            if (nextpageurl[0]) {
                UrlNode *newnode  = new UrlNode;
                if (!(urlnode->type & URL_TYPE_HOMEPAGE))
                    newnode->id = urlnode->id;
                newnode->task = urlnode->task;
                newnode->taskid = urlnode->taskid;
                newnode->copyother(urlnode->other,urlnode->maxtype);
                newnode->type = urlnode->type;
                newnode->page = urlnode->page + 1;
                newnode->copyurl(nextpageurl);
                newnode->copyfatherurl(urlnode->fatherurl);
                newnode->layerid  =  urlnode->layerid;
                newnode->needtologin = urlnode->needtologin;
                newnode->taskbatch= urlnode->taskbatch;
                newnode->copytitle(urlnode->title);
                newnode->copytopicsource(urlnode->topicsource);
                mylog_info(m_pLogGlobalCtrl->infolog, " now url %s new url %s title %s - %s:%s:%d",urlnode->url, newnode->url,urlnode->title,INFO_LOG_SUFFIX);
                infocrawler->getUrlAnalyseManager()->insertUrl(newnode, INSERT_URL_FORCED);
            }
        }

        free_buffer(content);

        Task *task = urlnode->task;
        int taskid = urlnode->taskid;

        //if get an error, we will retry but only fixed times
        if (urlnode->errornum > 0 && urlnode->errornum < URL_FETCH_RETRY_TIMES) {
            mylog_error(m_pLogGlobalCtrl->errorlog, "fetched %s reinsert for error %d %llu - %s:%s:%d", url.getUrl().c_str(), urlnode->errornum, urlnode->id,INFO_LOG_SUFFIX);
            infocrawler->getUrlAnalyseManager()->insertUrl(urlnode, INSERT_URL_FORCED,false);
        } else if (urlnode->errornum >= URL_FETCH_RETRY_TIMES) {
            infocrawler->getTaskScheduleManager()->increaseTaskErrorUrlNum(taskid);
            //write error url to DB
//            infocrawler->getDbManager()->WriteFetchError(url.getUrl().c_str(),taskid,taskbatch);
            mylog_error(m_pLogGlobalCtrl->errorlog, "fetched finished and download url %s urlnodeid %llu taskid %d - %s:%s:%d:%d", url.getUrl().c_str(),  urlnode->id, taskid,INFO_LOG_SUFFIX,urlnode->errornum);
            infocrawler->getLocalDbManager()->decidesaveFetched(urlnode);

            if (urlnode->type & URL_TYPE_NEEDTOSAVE)
            {
                infocrawler->getLocalDbManager()->saveUrl(urlnode);
            }
            delete urlnode;
        } else {
            mylog_info(m_pLogGlobalCtrl->infolog, "fetched finished %s error %d %llu fatherurl %s - %s:%s:%d",url.getUrl().c_str(), urlnode->errornum, urlnode->id, urlnode->fatherurl,INFO_LOG_SUFFIX);
            delete urlnode;
        }

        infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(task, taskbatch);

#ifdef URLMEMCACHEDB
        infocrawler->deleteUrlMcLocalThread();
#endif
    }
    curl_easy_cleanup(curl);
    mylog_info(m_pLogGlobalCtrl->infolog, "FetcherManager ISRUNNING false - %s:%s:%d",INFO_LOG_SUFFIX);
}
/*
    ProcessDownload()

    Downloads pData->m_pUrlInfo->m_DownloadUrl over HTTP (WinINet) into the
    file pData->m_pUrlInfo->m_DownloadFile, reporting progress through
    pData->pBindStatusCallback.

    Unless HTTP_FLAG_NO_RESUME is set in m_dwDownloadFlags, an existing local
    file is resumed by sending a "Range: bytes=<size>-" request header.

    Side effects on pData->m_pUrlInfo: m_ContentType, m_dwStatusCode and
    m_ContentLength are overwritten with the values from the response.

    Returns:
        S_OK                    download finished (or nothing left to download)
        E_ABORT                 m_pThis->m_Stopping or the callback's m_Abort was set
        E_HTTP_NET_ERROR        a WinINet call failed
        E_HTTP_NOTFOUND         server answered 404
        E_HTTP_INVALID_STATUS   server answered any other status outside 200..206
        E_HTTP_WRITE_FILE       the output file could not be created or written
        E_NOINTERFACE           response content type does not match the requested one

    NOTE(review): only the HttpSendRequest failure path closes hCon/hIn
    explicitly; every other path relies on FHInternet releasing the handle
    itself. Presumably FHInternet is an owning wrapper - confirm, and confirm
    that the explicit closes below do not double-close.
*/
HRESULT FAsyncDownload::FHttpDownloadTP::ProcessDownload(FAsyncDownData *pData)
{
    HRESULT hr = E_FAIL;

    // Work on an unescaped copy of the URL and split it into its components.
    FString ReqUrl = pData->m_pUrlInfo->m_DownloadUrl;
    UrlUnescapeInPlace(ReqUrl.GetBuffer(), 0);

    CUrl url;
    url.CrackUrl(ReqUrl);

    // Open the WinINet session, through the configured proxy when one is set.
    const tchar* pszUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)";
    FHInternet hIn = NULL;
    if (g_AppSettings.m_Proxy.GetLength() > 0)
    {
        hIn = InternetOpen(pszUserAgent, INTERNET_OPEN_TYPE_PROXY, g_AppSettings.m_Proxy, g_AppSettings.m_ProxyA, 0);
    }
    else
    {
        hIn = InternetOpen(pszUserAgent, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
    }

    if (NULL == hIn)
        return E_HTTP_NET_ERROR;

    FHInternet hCon = InternetConnect(hIn, url.GetHostName(), url.GetPortNumber(), url.GetUserName(), url.GetPassword(), INTERNET_SERVICE_HTTP, 0, 0);

    if (NULL == hCon)
    {
        _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: InternetConnect() failed: %d\n", GetLastError());
        return E_HTTP_NET_ERROR;
    }

    // 15 second receive timeout (value is in milliseconds).
    ULONG ulRecvTimeout = 15000;
    InternetSetOption(hCon, INTERNET_OPTION_RECEIVE_TIMEOUT, &ulRecvTimeout, sizeof(ULONG));

    // Requested resource = URL path followed by the extra info part of the URL.
    FString StrRes = url.GetUrlPath();
    StrRes+= url.GetExtraInfo();

    FHInternet hReq = HttpOpenRequest(hCon, "GET", StrRes, NULL, NULL, NULL, INTERNET_FLAG_NO_CACHE_WRITE | INTERNET_FLAG_DONT_CACHE, 0);

    if (NULL == hReq)
    {
        _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: HttpOpenRequest() failed: %d\n", GetLastError());
        return E_HTTP_NET_ERROR;
    }

    // Size of what is already on disk; stays 0 when resuming is disabled.
    size_type FileSize = 0;

    if (!(pData->m_pUrlInfo->m_dwDownloadFlags & HTTP_FLAG_NO_RESUME))
        FileSize = GetFileSize(pData->m_pUrlInfo->m_DownloadFile);

    // See if file already exists on the disk: if so, ask only for the missing tail.
    if (FileSize > 0)
    {
        FString StrRange;
        StrRange.Format("Range: bytes=%I64d-", FileSize);
        HttpAddRequestHeaders(hReq, StrRange, StrRange.GetLength(), HTTP_ADDREQ_FLAG_ADD_IF_NEW);
    }

    // Always send the application version in a custom header.
    FString StrVersion;
    StrVersion.Format("LTV_VERSION: %s", g_AppSettings.m_AppVersion);
    HttpAddRequestHeaders(hReq, StrVersion, StrVersion.GetLength(), HTTP_ADDREQ_FLAG_ADD_IF_NEW);

    if (!HttpSendRequest(hReq, NULL, 0, NULL, 0))
    {
        int err = GetLastError();
        _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: HttpSendRequest() failed: %d (0x%x)\n", err, HRESULT_FROM_WIN32(err));
        InternetCloseHandle(hCon);
        InternetCloseHandle(hIn);
        return E_HTTP_NET_ERROR;
    }

    const DWORD dwBufferSize = 8192;
    char pBuffer[dwBufferSize];

    FHttpConnection FConn = hReq;

    DWORD dwStatusCode = FConn.GetStatusCode();

    // Remember the content type the caller asked for before overwriting it
    // with the one the server actually sent.
    FString ReqContentType = pData->m_pUrlInfo->m_ContentType;
    pData->m_pUrlInfo->m_ContentType = FConn.GetHeader(HTTP_QUERY_CONTENT_TYPE);
    pData->m_pUrlInfo->m_dwStatusCode = dwStatusCode;

    if (!MatchContentType(ReqContentType, pData->m_pUrlInfo->m_ContentType))
    {
        _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Content type mismatch: %s/%s\n", ReqContentType, pData->m_pUrlInfo->m_ContentType);
        return E_NOINTERFACE; //E_NOINTERFACE = content type mismatch
    }

    // 416 in answer to a Range request for an existing file is treated as
    // "download complete".
    if (dwStatusCode == 416 && FileSize > 0)
    {
        _DBGAlert("FAsyncDownload::FHttpDownloadTP::ProcessDownload: Server status code: %d. Download complete\n", dwStatusCode);
        return S_OK;
    }

    if (dwStatusCode < 200 || dwStatusCode > 206)
    {
        _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Server status code: %d\n", dwStatusCode);
        if (dwStatusCode == 404)
            return E_HTTP_NOTFOUND;
        return E_HTTP_INVALID_STATUS;
    }

    CAtlFile OutFile;

    // Without resume support any previous file is discarded first.
    if (pData->m_pUrlInfo->m_dwDownloadFlags & HTTP_FLAG_NO_RESUME)
        DeleteFile(pData->m_pUrlInfo->m_DownloadFile);

    // NOTE(review): OPEN_ALWAYS does not truncate; when the server ignores the
    // Range header and sends a shorter body than the existing file, old bytes
    // past the new end would remain - confirm whether that can happen.
    hr = OutFile.Create(pData->m_pUrlInfo->m_DownloadFile, GENERIC_WRITE, 0, OPEN_ALWAYS);

    if (FAILED(hr))
    {
        _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: CreateFile failed: 0x%x, %d : %s\n", hr, GetLastError(), pData->m_pUrlInfo->m_DownloadFile);
        return E_HTTP_WRITE_FILE;
    }

    size_type llTotalRead = 0;  // bytes on disk so far (starts at the resume offset)
    size_type llSizeMax = 0;    // total size of the resource, 0 while unknown

    size_type ContentLen = FConn.GetContentLength();

    pData->m_pUrlInfo->m_ContentLength = ContentLen;

    if (dwStatusCode == 206)
    {
        // Partial content: position the file at the offset the server reports
        // and take the total size from the Content-Range header.
        FString FStrRange = FConn.GetHeader(HTTP_QUERY_CONTENT_RANGE);

        if (FStrRange)
        {
           //Content-Range: bytes 21010-47021/47022
           const char* pszBytes = strstr(FStrRange, "bytes ");
           if (pszBytes != NULL)
           {
               // sizeof("bytes") counts the terminating NUL, i.e. it equals
               // the length of "bytes " including the trailing space.
               pszBytes+=sizeof("bytes");
               LONGLONG llOffset = _strtoi64(pszBytes, NULL, 10);
               hr = OutFile.Seek(llOffset, FILE_BEGIN);
               llTotalRead = (size_type)llOffset;
               if (FAILED(hr))
               {
                   // Fixed: the format string previously had three specifiers
                   // but only two arguments; the offset is now passed as well.
                   _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Seek to position %I64d failed: 0x%x, %d\n", llOffset, hr, GetLastError());
               }

               const char* pszTotal = strchr(pszBytes, '/');
               if (pszTotal != NULL)
                   llSizeMax = _strtoi64(pszTotal + 1, NULL, 10);
           }
        }
    }
    else
    {
        // Full response whose length equals the local file size: nothing to do.
        if (ContentLen > 0 && ContentLen == FileSize)
        {
            OutFile.Close();
            return S_OK;
        }
    }

    if (llSizeMax == 0)
        llSizeMax = ContentLen;

    pData->pBindStatusCallback.OnProgress((ULONG)llTotalRead, (ULONG)llSizeMax, BINDSTATUS_BEGINDOWNLOADDATA, L"");

    // Copy the response body to the file until the server signals end of data
    // (a successful read of 0 bytes), an error occurs or an abort is requested.
    DWORD dwBytesRead = 0;
    for (;;)
    {
        if (!InternetReadFile(hReq, pBuffer, dwBufferSize, &dwBytesRead))
        {
            _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: InternetReadFile() failed: %d\n", GetLastError());
            OutFile.Close();
            return E_HTTP_NET_ERROR;
        }

        if (dwBytesRead == 0)
        {
            hr = S_OK;
            break;
        }

        DWORD dwBytesWritten = 0;
        hr = OutFile.Write(pBuffer, dwBytesRead, &dwBytesWritten);

        if (FAILED(hr))
        {
            _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: FileWrite failed: 0x%x, %d\n", hr, GetLastError());
            OutFile.Close();
            return E_HTTP_WRITE_FILE;
        }

        llTotalRead+=dwBytesRead;

        // When the total size is unknown, report the bytes read so far as the maximum.
        pData->pBindStatusCallback.OnProgress((ULONG)llTotalRead, llSizeMax > 0 ? (ULONG)llSizeMax : llTotalRead , BINDSTATUS_DOWNLOADINGDATA, L"");

        if (m_pThis->m_Stopping || pData->pBindStatusCallback.m_Abort)
        {
            // Fixed: stray arguments removed (the format string has no specifiers).
            _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Download aborted\n");
            hr = E_ABORT;
            break;
        }
    }

    OutFile.Close();
    return hr;
}
Exemple #25
0
int RunConverter( int argc, _TCHAR* * argv )
{
#ifdef _DEBUG
	// sleep a bit so we can have time to attach a debugger
	Tell(_T("Sleeping for %d seconds in debug mode."), startupTimeout / 1000);
	Sleep(startupTimeout);
#endif
	int ret = 0;
	wstring name;
	wstring title;
	HANDLE conversionHandle = NULL;

	po::options_description desc("Converts an MPEG-2 Program Stream to a DVR-MS, WMV, or WTV file.");
	po::positional_options_description pos;

	string input;
	string output;
	LONGLONG length;
	bool disableFileLogging;
	bool disableConsoleLogging;
	bool disableAllLogging;
	string interruptName;
	string interruptDirectory;
	string outputDirectory;
	string contentTitle;
	__int64 contentDuration = -1i64;

	desc.add_options()
		("help,?", "Display help message.")
		("input,i", po::value<string>(&input), "an MPEG2 input path. Can be a url.")
		("output,o", po::value<string>(&output), "output path.<type>. Where <type> can be one of \"dvr-ms\", \"wmv\", or \"wtv\"")
		("length,l", po::value<LONGLONG>(&length)->default_value(-1), "the length of the input content in bytes. Only required for a network path such as http." )
		("interrupt-name", po::value<string>(&interruptName), "the file name (without path or extension) of a file that will be created when conversion is to be interrupted.")
		("interrupt-directory", po::value<string>(&interruptDirectory), "the path for this app to look for an interrupt file. An interrupt file is the interrupt_file name with a .interrupt extension. The file itself can be empty.")
		("disable-file-logging", po::value<bool>(&disableFileLogging)->zero_tokens()->default_value(false), "indicates that logging to a file will be disabled.")
		("disable-console-logging", po::value<bool>(&disableConsoleLogging)->zero_tokens()->default_value(false), "indicates that logging to the console will be disabled.")
		("disable-all-logging", po::value<bool>(&disableAllLogging)->zero_tokens()->default_value(false), "indicates that all logging will be disabled.")
		("output-directory,d", po::value<string>(&outputDirectory), "the directory for this app to place conversion output. Only valid if output_path is omitted.")
		("content-title,t", po::value<string>(&contentTitle), "the Title that will be assigned to the output path.<type>.")
		("version,v", po::value<string>()->zero_tokens(), "prints the version of this app.")
		//("content-duration,d", po::value<__int64>(&contentDuration)->default_value(-1i64), "the duration of the input content in seconds." )
		;

	pos.add("input", 1);
	pos.add("output", 1);
	pos.add("length", 1);

	vector<string> args;

	for (int i = 1; i < argc; i++)
		args.push_back(WStringToString(argv[i]));

	po::variables_map variables;

	try
	{
		po::basic_parsed_options<char> oo = po::command_line_parser(args).
			options(desc).positional(pos).run();
		po::store(oo, variables);
		po::notify(variables);
	}
	catch (std::exception e)
	{
		Tell(_T("Invalid command line. Use --help to see options."));
		return -1;
	}

	if (!variables.count("input"))
	{
		bool display = false;
		wstring message;
		if (variables.count("version"))
		{
			message = _T("Version: ");
			message += MPEG2DVRMS_VERSION;
			display = true;
		}
		if (variables.count("help"))
		{
			message = _T("eh... help message not available yet. Hope you have the source!");
			display = true;
		}
		if (!display)
			message = _T("No input file was specified.");
		ret = 100;
		Tell(message);
	}
	else
	{
		try
		{
			//////////////////////////////////////////////////////////////////////////
			// command-line option handling

			LONGLONG contentLength = -1;

			if (variables.count("length"))
				contentLength = length;

			if (variables.count("interrupt-name"))
				name = StringToWString(interruptName);
			else
			{
				name = NewGuid();
				Tell(_T("Generated interrupt name is %s"), name.c_str());
			}

			if (variables.count("interrupt-directory"))
				_conversionFileStoragePath = StringToWString(interruptDirectory);

			if (variables.count("content-title"))
				title = StringToWString(contentTitle);
			else
				title = _T("");

#pragma region input output file handling
			wstring defaultExtension;

			if (IsVista())
				defaultExtension = _T(".dvr-ms");
			else
				defaultExtension = _T(".wtv");

			ATL_URL_SCHEME urlScheme;
			CUrl inputUrl;

			if (!inputUrl.CrackUrl(StringToWString(input).c_str()))
				urlScheme = ATL_URL_SCHEME_FILE;
			else
				urlScheme = inputUrl.GetScheme();

			if (urlScheme == -1)
				urlScheme = ATL_URL_SCHEME_FILE;

			wstring inputPath = StringToWString(input);
			CPath outputPath;

			if (urlScheme == ATL_URL_SCHEME_FILE)
			{
				CPath input = inputPath.c_str();

				if (input.IsFileSpec())
				{
					TCHAR szCurrentDirectory[MAX_PATH];
					if (!GetCurrentDirectory(MAX_PATH, szCurrentDirectory))
						throw CarverLab::Exception(GetLastError());
					wstring currentDirectory = (LPCTSTR)szCurrentDirectory;
					inputPath = currentDirectory + _T("\\") + inputPath.c_str();
					input = inputPath.c_str();
				}
				if (!input.FileExists())
					throw CarverLab::Exception(_T("MPEG2 input path does not exist."));

				if (!variables.count("output"))
					outputPath = inputPath.c_str();
				else
					outputPath = StringToWString(output).c_str();
			}
			else if (urlScheme == ATL_URL_SCHEME_HTTP ||
				urlScheme == ATL_URL_SCHEME_HTTPS)
			{
				if (!variables.count("output"))
				{
					wstring thefullpath;
					CString envString;
					envString.GetEnvironmentVariable(_T("PUBLIC"));
					thefullpath = envString;
					thefullpath += _T("\\Videos\\mpeg2dvrms-output");
					thefullpath += defaultExtension;
					outputPath = thefullpath.c_str();
				}
				else
					outputPath = StringToWString(output).c_str();
			}
			else
				throw CarverLab::Exception(_T("Only http or https URL schemes are supported."));

			bool isUrl = urlScheme != ATL_URL_SCHEME_FILE;

			CPath inPath = inputPath.c_str();

			if (outputPath.GetExtension().MakeLower() == inPath.GetExtension().MakeLower())
				outputPath.RenameExtension(defaultExtension.c_str());

			if (outputPath.GetExtension().MakeLower() == _T(".dvrms"))
			{
				outputPath.RemoveExtension();
				outputPath.AddExtension(_T(".dvr-ms"));
			}
#pragma endregion input output file handling

			//////////////////////////////////////////////////////////////////////////

			conversionHandle = CreateConversion(false, CComBSTR(name.c_str()));

			if (!conversionHandle)
				throw CarverLab::Exception();

			SetConsoleTitle(outputPath);

			Tell(_T("Press ENTER to interrupt and exit."));

			_done = false;
			HANDLE stdinput = GetStdHandle(STD_INPUT_HANDLE);

			std::auto_ptr<InternalThreadData> threadData(new InternalThreadData);
			threadData->activityCallback = ActivityCallback;
			threadData->contentLength = contentLength;
			threadData->conversionHandle = conversionHandle;
			threadData->inputPath = inPath;
			threadData->isUrl = isUrl;
			threadData->outputPath = outputPath;
			threadData->threadData = NULL;
			threadData->userData = NULL;
			threadData->contentTitle = StringToWString(contentTitle);
			threadData->contentDuration = contentDuration;

			_lastConvertedFilePath = outputPath;

			HANDLE thread = CreateThread(NULL, 0, BeginConversion, threadData.get(), 0, NULL);
			if (thread == NULL)
				throw CarverLab::Exception();

			bool shuttingDown = false;
			bool interruptSuccessful = false;	// will be true if the conversion is inactive after InterruptConversion is called

			while (!_done)
			{
				if (!shuttingDown && ((_kbhit() && _getch() == 13) || InterruptNow(name.c_str())))
				{
					shuttingDown = true;
					interruptSuccessful = InterruptConversion(conversionHandle, 30000);	// will wait 30 seconds for the conversion to die
				}

				Sleep(10);
			}

			if (!interruptSuccessful)
			{	// TODO: will need to kill this puppy in an unnice way... awwww
				Tell(_T("InterruptConversion was unsusccessful."));
			}
			// TODO: INFINITE? um... nope. this will need an intervention
			WaitForSingleObject(thread, INFINITE);

			CloseHandle(thread);
		}
		catch (CarverLab::Exception exception)
		{
			Tell(_T("*** Error: %s"), exception.GetErrorString());
			Tell(_T("Exiting..."));
			ret = exception.GetHRESULT();
		}
		catch (...)
		{
			DWORD errorCode = GetLastError();
			wstring error = Exception::GetLastErrorString(errorCode);
			Tell(_T("*** Unhandled Exception: %s"), error.c_str());
			Tell(_T("Exiting..."));
			ret = errorCode;
		}
		if (conversionHandle != NULL)
			CloseConversion(conversionHandle);
	}
#ifdef _DEBUG
	// sleep a bit so we can see any errors
	Tell(_T("Sleeping for %d seconds in debug mode."), sleepTimeout / 1000);
	Sleep(sleepTimeout);
#endif
	return ret;
}
Exemple #26
0
int CHttpRequest::ExecuteGet(bool Recurse) {

    bool bContinueExecuteGet = false;
    bool bKeepAlive = false;

    Trace(tagHttp, levInfo, ("CHttpRequest - ExecuteGet"));

    unsigned int nCredentialsIndex = 0;

    bool Result;

    do {

        if (! bKeepAlive) {
            Result = CreateSocket();
            
            if (! Result) {
                return -1;
            }
        }

        bContinueExecuteGet = false;

        // process authentication

        CString AuthHeader;
        
        switch(m_ServerAuthState.GetLeg()) {
        case AUTHENTICATION_STATE_NONE:            
            break;
        case AUTHENTICATION_STATE_PRECHALENGE:            
        case AUTHENTICATION_STATE_CHALENGE:
            if (! m_ServerAuthState.GetNextHeader(& AuthHeader, nCredentialsIndex))
                return m_RStatusValue;
            break;
        }

        ClearResults(false);

        // add the auth header
        if (AuthHeader.GetLength())
            SetHttpField(g_strHttpAuthorization, AuthHeader);

        /* create socket and get results */
                
        CStringTable Connection;
        if (!m_HttpFields.FindAndCopy(g_strHttpConnection, Connection) || 
            ! Connection.GetValue(g_strHttpConnection).GetLength()) {            
            SetHttpField(g_strHttpConnection, "Keep-Alive");
        }       
        
        
        Result = GetHTTP((float) 1.0);

        // 406: no acceptable objects found (NT4/ISM)
        // if (!Result || (m_RStatusValue == 406)) {            
        //    ClearResults(false);
        //    Result = CreateSocket() && GetHTTP((float) 0.9);
        //    if (! Result) {
        //        return -1;
        //    }
        // }

        Trace(tagHttp, levInfo, ("CHttpRequest - ExecuteGet {%d/%d}", Result, m_RStatusValue));        
        
        if (! Result) {
            return -1;
        }
        
        CString l_RedirectLoc;
        CUrl ResolvedUrl;

        switch(m_RStatusValue) {
        case 407:
            // $(TODO)
            break;
            
        case 401:        

            // any valid authentication stage
            if (m_ServerAuthState.GetLeg() == AUTHENTICATION_STATE_NONE) {         
                // we need to send credentials
                m_ServerAuthState.SetLeg(AUTHENTICATION_STATE_PRECHALENGE);
                bContinueExecuteGet = true;                
            } else if (m_ServerAuthState.GetLeg() == AUTHENTICATION_STATE_CHALENGE) {
                nCredentialsIndex ++;
                // equal, because we do want to run once without username/password credentials
                if (nCredentialsIndex <= m_ServerAuthState.GetSize()) {
                    bContinueExecuteGet = true;
                }
                m_ServerAuthState.SetLeg(AUTHENTICATION_STATE_PRECHALENGE);
                bKeepAlive = false;
            } else if (m_ServerAuthState.GetLeg() == AUTHENTICATION_STATE_PRECHALENGE) {
                m_ServerAuthState.SetLeg(AUTHENTICATION_STATE_CHALENGE);
                bContinueExecuteGet = true;
                bKeepAlive = true;
            }
            
            break;
            
        case 301: /* redirections */
        case 302:
        case 303:
        case 307:

            l_RedirectLoc = m_RFields.FindElement(g_strHttpLocation).GetValue(g_strHttpLocation);
            
            if (! l_RedirectLoc.GetLength()) {
                Trace(tagHttp, levInfo, ("CHttpRequest - GetHTTP - %d redirection without a Location: header.", m_RStatusValue));		      
                break;
	    }
                
            // HTTP 1.1 - Temporary Redirect is 302 and 307
            // m_RRedirections is relevant for final URL address
            // that could be retrieved
            
            ResolvedUrl = m_Url.Resolve(l_RedirectLoc);        
            l_RedirectLoc = ResolvedUrl.GetHttpAll();        
            
            if (m_FollowRedirections && m_RRedirections.Contains(l_RedirectLoc)) {
                // avoid circular redirections
                Trace(tagHttp, levInfo, ("CHttpRequest - GetHTTP - circular redirection %s", l_RedirectLoc.GetBuffer()));		      
                return m_RStatusValue;
            } else m_RRedirections.Add(l_RedirectLoc);
            
            if (!m_FollowRedirections)
                return m_RStatusValue;                
            
            if (m_ClientSocket.GetVerbose()) 
                cout << "\n\t[" << l_RedirectLoc << "]";
            
            ClearResults(false);
            m_Url.SetUrl(l_RedirectLoc);
            bContinueExecuteGet = true;
            
            break;
            
        case 305: /* use proxy */
            
            l_RedirectLoc = m_RFields.FindElement(g_strHttpLocation).GetValue(g_strHttpLocation);
            
            if (! l_RedirectLoc.GetLength())
                break;

            // HTTP 1.1 - Temporary Redirect is 302 and 307
            // m_RRedirections is relevant for final URL address
            // that could be retrieved
            
            ClearResults(false);
            m_Proxy.SetUrl(l_RedirectLoc);

            bContinueExecuteGet = true;

            break;
        };

        Trace(tagHttp, levInfo, ("CHttpRequest - GetHTTP - %d", m_RStatusValue));        

    } while (bContinueExecuteGet);

    return m_RStatusValue;
}
Exemple #27
0
/*
  GetHTTP10()

  Attempts an HTTP/1.0 retrieval of iUrl over Sock. iE is appended after the
  request line and after every header line (the line terminator).

  Returns 0 when the request cannot be sent, 1 on a 200 response, otherwise
  RStatusValue. Redirections (301/302/303/307, when FollowRedirections is set)
  and 305 "use proxy" responses are followed by calling this function again.
*/
int CHttpRequest::GetHTTP10(CUrl& iUrl, const CString& iE, inetSocket& Sock){
  CString Request;

  // through a proxy the request line carries the absolute URL
  const bool bViaProxy = ProxyURL.StrLength() && Proxy.isValid();

  // GET when RLimit is non-zero, HEAD otherwise
  if (RLimit) {
    Request += "GET ";
  } else {
    Request += "HEAD ";
  }

  if (bViaProxy) {
    Request += iUrl.GetScheme();
    Request += "://";
    Request += iUrl.GetHost();
    // the port is only spelled out when it is not 80
    if (iUrl.GetPortValue() != 80) {
      Request += ":";
      Request += iUrl.GetPort();
    }
    Request += iUrl.GetUrlPath();
    Request += " HTTP/1.0";
    Request += iE;
  } else {
    Request += iUrl.GetUrlPath();
    Request += " HTTP/1.0";
    Request += iE;
    Request += "Host: ";
    Request += iUrl.GetHost();
    Request += iE;
  }

  // one "name: value" line per configured request header
  for (int nHeader = 0; nHeader < RHeaderParams.entries_count(); nHeader++) {
    Request += RHeaderParams.get_name(nHeader);
    Request += ": ";
    Request += RHeaderParams.get_value(nHeader);
    Request += iE;
  }

  // empty line closes the header section
  Request += iE;

#ifdef _U_DEBUG
  cout << "# HTTP Request: =====" << endl;
  cout << Request;
  cout << "=====================" << endl;
#endif

  /*
    issue request
    */
  CString RLoc;
  if (!Send(Sock, Request)) {
    return 0;
  }

  ProcessHeader(Sock);
  ProcessData(Sock);

  switch(RStatusValue) {
  case 200:
    return 1;

  case 301:
  case 302:
  case 303:
  case 307:
    {
      if (!FollowRedirections) {
        return RStatusValue;
      }

      RLoc = RHeaderResponse.get_value("Location");

      /*
        HTTP 1.1 - Temporary Redirect is 302 and 307
        RedirectVector is relevant for final URL address
        that could be retrieved
        */
      // nothing to follow, or a location that was already visited
      if (!RLoc.StrLength() || RedirectVector.Contains(RLoc)) {
        return RStatusValue;
      }

      RedirectVector += RLoc;
      CUrl NewURL(RLoc);

      if (Proxy.isValid()) {
        // behind a proxy the same socket is reopened and reused
        Sock.Reopen();
        return GetHTTP10(NewURL, iE, Sock);
      }

      // direct connection: open a socket to the new host
      inetSocket Sock2(NewURL.GetPortValue(), NewURL.GetHost());
      return GetHTTP10(NewURL, iE, Sock2);
    }

  case 305: /* use proxy */
    {
      RLoc = RHeaderResponse.get_value("Location");
      if (!RLoc.StrLength()) {
        return RStatusValue;
      }

      CUrl ProxyTarget(RLoc);
      if (!ProxyTarget.isValid()) {
        return RStatusValue;
      }

      // retry the same URL over a socket connected to the announced proxy
      inetSocket ProxySock(ProxyTarget.GetPortValue(), ProxyTarget.GetHost());
      if (wsLastError.StrLength()) {
        return RStatusValue;
      }
      return GetHTTP10(iUrl, iE, ProxySock);
    }

  default:
    return RStatusValue;
  }
}