/* OnButtonAdd() */ void CWallPaperCrawlerSettingsDomainDlg::OnButtonAdd(void) { char szValue[MAX_URL+1]; UpdateData(TRUE); strcpyn(szValue,m_strValue,sizeof(szValue)); CUrl url; URL Url; // controlla che non contenga caratteri jolly if(strchr(szValue,'?') || strchr(szValue,'*')) { ::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_WILDCARDS,szValue); return; } // controlla che sia un url if(!url.IsUrl(szValue)) { ::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_INVALID_HTTP_URL,szValue); return; } // controlla che sia un url HTTP if(!url.IsUrlType(szValue,HTTP_URL)) { if(::MessageBoxResourceEx(this->m_hWnd,MB_YESNO|MB_ICONQUESTION,WALLPAPER_PROGRAM_NAME,IDS_QUESTION_INVALID_HTTP_URL,szValue)==IDNO) return; } // elimina l'eventuale '/' finale int nLen = strlen(szValue)-1; if(szValue[nLen]=='/') szValue[nLen] = '\0'; url.SplitUrl(szValue,&Url); BOOL bOnlyHost = TRUE; if(strlen(Url.file) > 0) bOnlyHost = FALSE; if(strlen(Url.dir) > 0) if(strcmp(Url.dir,"/")!=0) bOnlyHost = FALSE; if(!bOnlyHost) { ::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_NOT_HTTP_DOMAIN,szValue,strlen(Url.file) > 0 ? Url.file : Url.dir,Url.host); strcpyn(szValue,Url.host,sizeof(szValue)); } if(strlen(szValue) > 0) { if(m_wndDomainList.FindItem(szValue) < 0) m_wndDomainList.SelectItem(m_wndDomainList.AddItem(szValue,0)); else ::MessageBoxResourceEx(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_VALUE_EXISTS,szValue); } else ::MessageBoxResource(this->m_hWnd,MB_OK|MB_ICONERROR,WALLPAPER_PROGRAM_NAME,IDS_ERROR_INVALID_VALUE); }
/* * 将请求返回的结果打印输出到屏幕. */ long CUrl::call_wirte_func(void *buffer, int size, int nmemb, void *uri) { long count = size * nmemb; Uri *puri = (Uri *)uri; CUrl *pcurl = (CUrl *)puri->pcurl; if (pcurl != 0 && pcurl->isfirstwirte == true) { pcurl->get_response_code(); pcurl->get_response_length(); pcurl->get_response_contenttype(); puri->request_size = pcurl->request_size; puri->request_cursize = 0; } if (pcurl->request_size > 0) { if (puri->curbuf == NULL) { puri->curbuf = (char *)malloc(pcurl->request_size+1); } memcpy(puri->curbuf + puri->request_cursize,(const char *)buffer, count); puri->request_cursize += count; //puri->curbuf_allocsize = pcurl->request_size; puri->curbuf_size += count; puri->curbuf[puri->request_size] = 0; } else { if (puri->curbuf == NULL) { puri->curbuf = (char *)malloc(count+1); memcpy(puri->curbuf, buffer, count); puri->request_cursize = count; puri->request_size = count; //puri->curbuf_allocsize = count; puri->curbuf_size = count; } else { puri->curbuf = (char *)realloc(puri->curbuf, puri->request_size + count+1); memcpy(puri->curbuf + puri->request_size , buffer, count); puri->request_cursize += count; puri->request_size += count; //puri->curbuf_allocsize += count; puri->curbuf_size += count; } puri->curbuf[puri->request_size] = 0; } pcurl->isfirstwirte = false; return count; }
void Uri::Requset() { if(isinit == true) { CUrl curl; curl.requset(this); pcurl = 0; } return ; };
EXPORT_C CUrl* CUrl::NewL(const TParseBase& aFileName)
//
// Static factory c'tor for a file on the local file system.
// The new object sits on the cleanup stack while ConstructL() runs and is
// popped again before being returned to the caller.
	{
	CUrl* self = new(ELeave) CUrl();
	CleanupStack::PushL(self);
	self->ConstructL(aFileName);
	CleanupStack::Pop();
	return self;
	}
EXPORT_C CUrl* CUrl::NewL(const TDesC& aUrl)
//
// Static factory c'tor taking the url as a descriptor.
// The new object sits on the cleanup stack while ConstructL() runs and is
// popped again before being returned to the caller.
	{
	CUrl* self = new(ELeave) CUrl();
	CleanupStack::PushL(self);
	self->ConstructL(aUrl);
	CleanupStack::Pop();
	return self;
	}
// Reports whether Url is already in this list, i.e. whether some entry has
// the same resource as Url.
bool CManagedUrlList::CurrentlyListed(CUrl Url)
{
    std::list<CUrl>::iterator Cursor = this->begin();
    while (Cursor != this->end())
    {
        CUrl Candidate = *Cursor;

        // ### This will have to be more specific to match the server too
        if (Url.GetResource() == Candidate.GetResource())
        {
            return true;
        }

        Cursor++;
    }

    // No entry matched
    return false;
}
// Reports whether Url is in m_PastItems, i.e. whether some past entry has the
// same resource as Url.
bool CManagedUrlList::PreviouslyListed(CUrl Url)
{
    std::list<CUrl>::iterator Cursor = m_PastItems.begin();
    while (Cursor != m_PastItems.end())
    {
        CUrl Visited = *Cursor;

        // ### This will have to be more specific to match the server too
        if (Visited.GetResource() == Url.GetResource())
        {
            // Already been there
            return true;
        }

        Cursor++;
    }

    // No past entry matched
    return false;
}
// Opens a socket to the proxy (when ProxyURL is set and Proxy is valid) or to
// the host of iUrl, then hands the POST over to PostHTTP().
// Returns 0 when no valid endpoint exists or the socket reported an error,
// otherwise whatever PostHTTP() returns.
int CHttpRequest::Post(CUrl& iUrl, const CString& RawData){
    CString Server;
    int Port;

    if (ProxyURL.StrLength() && Proxy.isValid()) {
        Server = Proxy.GetHost();
        Port = Proxy.GetPortValue();
    } else {
        if (!iUrl.isValid())
            return 0;
        Server = iUrl.GetHost();
        Port = iUrl.GetPortValue();
    }

    inetSocket Sock(Port, Server);
    if (wsLastError.StrLength())
        return 0;

    // line terminator: characters 13 and 10
    char szLineEnd[] = { 13, 10, 0 };
    return PostHTTP(iUrl, RawData, szLineEnd, Sock);
}
int CHttpRequest::Execute(CUrl& iUrl){ #ifdef _U_DEBUG cout << "CHttpRequest::Execute()" << endl; #endif RHeader.Free(); RData.Free(); RStatus.Free(); RStatusValue = -1; RedirectVector.Clear(); CString Server; int Port; if (ProxyURL.StrLength() && Proxy.isValid()) { Server = Proxy.GetHost(); Port = Proxy.GetPortValue(); } else if (iUrl.isValid()) { Server = iUrl.GetHost(); Port = iUrl.GetPortValue(); } else { RStatusValue = HTTPR_USER + 1; return 0; } #ifdef _U_DEBUG cout << "CHttpRequest::Execute() - creating inetSocket" << endl; #endif inetSocket Sock(Port, Server); if (wsLastError.StrLength()) { #ifdef _U_DEBUG cout << "CHttpRequest::Execute() - ERROR at inetSocket - " << wsLastError << endl; #endif RStatusValue = HTTPR_USER + 2; return 0; } #ifdef _U_DEBUG cout << "CHttpRequest::Execute() - inetSocket created" << endl; #endif char iE[] = "xxxx\0"; sprintf(iE, "%c%c", 13, 10); if (!GetHTTP10(iUrl, iE, Sock)) { if (!GetHTTP09(iUrl, iE, Sock)) { return 0; } else return 1; } else return 1; }
bool CHttpConnection::connect(const CUrl &url) { /* ** Host und Port aus CUrl holen */ string host=url.getDomain(); string sport=url.getPort(); unsigned int port; if(sport=="") port=80; else port=atol(sport.c_str()); /* ** connect für Host und Port aufrufen */ return(connect(host,port)); }
int CHttpRequest::GetHTTP09(CUrl& iUrl, const CString& iE, inetSocket& Sock){ /* attempt a retrieval of HTTP/0.9 */ CString Request; if (RLimit) Request += "GET "; else Request+="HEAD "; Request += iUrl.GetScheme(); Request+="://"; Request += iUrl.GetHost(); if (iUrl.GetPortValue() != 80) { Request+=":"; Request += iUrl.GetPort(); } Request += iUrl.GetUrlPath(); Request+=iE; for (int i=0;i<RHeaderParams.entries_count();i++) { Request+=RHeaderParams.get_name(i); Request+=": "; Request+=RHeaderParams.get_value(i); Request += iE; } Request += iE; #ifdef _U_DEBUG cout << "# HTTP 0.9 Request: =====" << endl; cout << Request; cout << "=====================" << endl; #endif /* issue request */ if (!Send(Sock, Request)) return 0; RHeader.Free(); RData.Free(); RStatus.Free(); RStatusValue = -1; ProcessData(Sock); if (RData.StrLength()) { RHeaderResponse.clear(); RStatusValue = 200; RStatus = "200"; return 1; } else if (RStatusValue != -1) { return 0; } else { RStatusValue = HTTPR_USER + 3; return 0; } }
// Builds and sends an HTTP/1.0 POST for iUrl over Sock, with RawData as body.
// Through a proxy the request line carries the absolute url; otherwise it
// carries only the path and a "Host:" header follows. Every entry of
// RHeaderParams is appended as a header. Lines are terminated by iE.
// Returns 1 and reports status 200 when any data came back, 0 otherwise.
int CHttpRequest::PostHTTP(CUrl& iUrl, const CString& RawData, const CString& iE, inetSocket& Sock){
    CString Request;
    const bool bViaProxy = ProxyURL.StrLength() && Proxy.isValid();

    Request += "POST ";
    if (bViaProxy) {
        // absolute form: <scheme>://<host>[:<port>]
        Request += iUrl.GetScheme();
        Request += "://";
        Request += iUrl.GetHost();
        if (iUrl.GetPortValue() != 80) {
            Request += ":";
            Request += iUrl.GetPort();
        }
    }
    Request += iUrl.GetUrlPath();
    Request += " HTTP/1.0";
    Request += iE;

    if (!bViaProxy) {
        Request += "Host: ";
        Request += iUrl.GetHost();
        Request += iE;
    }

    // caller supplied header parameters
    for (int nParam = 0; nParam < RHeaderParams.entries_count(); nParam++) {
        Request += RHeaderParams.get_name(nParam);
        Request += ": ";
        Request += RHeaderParams.get_value(nParam);
        Request += iE;
    }

    // blank line, then the body
    Request += iE;
    Request += RawData;

    if (!Send(Sock, Request))
        return 0;

    ProcessHeader(Sock);
    ProcessData(Sock);

    if (!RData.StrLength())
        return 0;

    RHeaderResponse.clear();
    RStatusValue = 200;
    RStatus = "200";
    return 1;
}
void CUrlManager::Perform() { for (unordered_set<CUrl*>::iterator it = m_sUrl.begin(); it != m_sUrl.end(); ) { CUrl* pUrl = *it; pUrl->m_eMultiCode = curl_multi_perform(pUrl->m_pCurlm, &pUrl->m_nStillRunning); if (pUrl->m_eMultiCode > CURLM_OK || pUrl->m_nStillRunning == 0) { pUrl->OnWriteOver(); it = m_sUrl.erase(it); delete pUrl; } else { ++it; } } }
int FetcherManager::doLogin(CURL *curl, Task *task, UrlNode *urlnode) { InfoCrawler *infocrawler = InfoCrawler::getInstance(); TaskOtherInfo *taskother = infocrawler->getTaskScheduleManager()->getTaskOtherInfo(task->id); if (!taskother) { return -1; } if (!task) { return -1; } CUrl url; url.parse(task->loginurl); if (url.getUrl().empty()) { return -1; } HttpProtocol httpprotocol; char downstatistic[512] ; downstatistic[0] = 0; RESPONSE_HEADER rheader; mylog_info(m_pLogGlobalCtrl->infolog, "before login %s - %s:%s:%d",url.getUrl().c_str(),INFO_LOG_SUFFIX); int ret = httpprotocol.curl_login(curl, url, urlnode, infocrawler->getConf()->httptimeout, &rheader, downstatistic); mylog_info(m_pLogGlobalCtrl->infolog, "after login %s %s %d - %s:%s:%d",url.getUrl().c_str(), downstatistic, ret,INFO_LOG_SUFFIX); /* if (ret == HTTP_FETCH_RET_REDIRECT) { //redirect errorlog("LOGIN ERROR: fetched %s relocated to %s taskid %d\n", url.getUrl().c_str() ,(char *)page.m_sLocation.c_str(),task->id); } else*/ if (ret == HTTP_FETCH_RET_ERROR) {//just discard mylog_error(m_pLogGlobalCtrl->errorlog, "login fetched %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), task->id,INFO_LOG_SUFFIX,ret); } else if (ret == HTTP_FETCH_RET_ERROR_INVALIDHOST) { //invalid host, can not access mylog_error(m_pLogGlobalCtrl->errorlog, "login fetched %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), task->id,INFO_LOG_SUFFIX,ret); } else if (ret == HTTP_FETCH_RET_ERROR_UNACCEPTED) { //content is invalid, discard mylog_error(m_pLogGlobalCtrl->errorlog, "LOGIN fetched %s unaccepted contenttyped %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), rheader.contenttype.c_str(), task->id,INFO_LOG_SUFFIX,ret); } else { taskother->fetchingcookie = true; static char *loginok = "LOGIN OK"; saveCookie(task->id, loginok, strlen(loginok)); taskother->fetchingcookie = false; return 1; } return -1; }
EXPORT_C TInt CUrl::Compare(CUrl& aUrl, TInt aCompareComps) const
//
// Compares the components selected by the aCompareComps bit mask, in the order
// scheme, location, username, password, path, query, fragment, and returns the
// first non-zero comparison result (0 when every selected component matches).
// Scheme is case insensitive, rest of url is case sensitive
	{
	TInt result = 0;

	if (aCompareComps & EUrlScheme)
		{
		result = Component(EUrlScheme).CompareF(aUrl.Component(EUrlScheme));
		if (result != 0)
			return result;
		}

	if (aCompareComps & EUrlLocation)
		{
		result = Component(EUrlLocation).Compare(aUrl.Component(EUrlLocation));
		if (result != 0)
			return result;
		}

	if (aCompareComps & EUrlUsername)
		{
		result = Component(EUrlUsername).Compare(aUrl.Component(EUrlUsername));
		if (result != 0)
			return result;
		}

	if (aCompareComps & EUrlPassword)
		{
		result = Component(EUrlPassword).Compare(aUrl.Component(EUrlPassword));
		if (result != 0)
			return result;
		}

	if (aCompareComps & EUrlPath)
		{
		result = Component(EUrlPath).Compare(aUrl.Component(EUrlPath));
		if (result != 0)
			return result;
		}

	if (aCompareComps & EUrlQuery)
		{
		result = Component(EUrlQuery).Compare(aUrl.Component(EUrlQuery));
		if (result != 0)
			return result;
		}

	if (aCompareComps & EUrlFragment)
		{
		result = Component(EUrlFragment).Compare(aUrl.Component(EUrlFragment));
		if (result != 0)
			return result;
		}

	return result;
	}
bool CHttpRequest::Execute(const CUrl& Url) { Trace(tagHttp, levInfo, ("CHttpRequest - CHttpRequest {%s}", Url.GetBrute().GetBuffer())); m_Url = Url; ClearResults(true); switch(m_RequestMethod) { case htPost: assert(0); if (CreateSocket()) { } break; case htGet: if (m_RequestSizeLimit == 0) m_RequestMethod = htHead; case htHead: return (ExecuteGet(true) != -1); default: return false; } return false; }
// Prepares a request: aborts any previous one, registers this object in
// s_mapReq under a fresh request id, cracks strUrl, then opens the WinINet
// connection (m_hConnection) and request (m_hFile) handles.
// varUser / varPassword are read only when their type is not VT_ERROR; empty
// strings are passed to InternetConnect as NULL.
// Returns S_OK, the failing HRESULT of varGetString(), or GetErrorResult()
// when a WinINet handle could not be created.
STDMETHODIMP CBHttpRequest::Open(BSTR strMethod, BSTR strUrl, VARIANT_BOOL bAsync, VARIANT varUser, VARIANT varPassword)
{
    CUrl url;
    CStringA strObject;
    CStringA strUser;
    CStringA strPassword;

    Abort();

    // allocate a new request id and register this object under it
    s_cs.Enter();
    m_dwReqID = ++s_dwReqID;
    s_mapReq.SetAt(m_dwReqID, this);
    s_cs.Leave();

    url.CrackUrl(CBStringA(strUrl));
    m_bAsync = (bAsync != VARIANT_FALSE);

    // object name = path followed by the extra info
    strObject = url.GetUrlPath();
    strObject.Append(url.GetExtraInfo());

    if (varUser.vt != VT_ERROR)
    {
        HRESULT hrUser = varGetString(varUser, strUser);
        if (FAILED(hrUser))
            return hrUser;
    }

    if (varPassword.vt != VT_ERROR)
    {
        HRESULT hrPassword = varGetString(varPassword, strPassword);
        if (FAILED(hrPassword))
            return hrPassword;
    }

    m_hConnection = InternetConnect(m_hSession,
                                    url.GetHostName(),
                                    url.GetPortNumber(),
                                    strUser.IsEmpty() ? NULL : (LPCSTR)strUser,
                                    strPassword.IsEmpty() ? NULL : (LPCSTR)strPassword,
                                    INTERNET_SERVICE_HTTP,
                                    0,
                                    m_dwReqID);
    if (m_hConnection == NULL)
        return GetErrorResult();

    m_hFile = HttpOpenRequest(m_hConnection,
                              CBStringA(strMethod),
                              strObject,
                              NULL, NULL, NULL,
                              m_dwFlags,
                              m_dwReqID);
    if (m_hFile == NULL)
        return GetErrorResult();

    m_eventComplete.Set();
    return S_OK;
}
// Splits Url into a vector of strings: "<scheme>:/" first, then the host
// (with ":<port>" when the port is not 80) if there is one, then the path
// split on '/'. An empty first path element is removed.
CVector<CString> CUrlTree::UrlToVector(const CUrl& Url) const {
    CVector<CString> Parts;
    CString Piece;

    // scheme element
    Piece += Url.GetScheme();
    Piece += ":/";
#ifdef _UNIX
    if (Url.GetScheme().Same(g_strProto_FILE)) {
        Piece += "/";
    }
#endif
    Parts += Piece;

    // host element, only when a host is present
    Piece = Url.GetHost();
    if (Piece.GetLength() && Url.GetPortValue() != 80) {
        Piece += ":";
        Piece += Url.GetPort();
    }
    if (Piece.GetLength()) {
        Parts += Piece;
    }

    // index at which the path elements start
    int FirstPathIndex = Parts.GetSize();

    CVector<CString> PathParts;
    CString::StrToVector(Url.GetUrlPath(), '/', &PathParts);
    Parts += PathParts;

    // drop the empty element at the start of the path, if any
    if ((int) Parts.GetSize() > FirstPathIndex) {
        if (!Parts[FirstPathIndex].GetLength()) {
            Parts.RemoveAt(FirstPathIndex);
        }
    }

    return Parts;
}
// Interface query hook for IID_IAuthenticate.
//
// pv    the MonitorSink the query is made on
// riid  requested interface id
// ppv   receives the IAuthenticate pointer on success, NULL otherwise
//
// Reads the WWW-Authenticate header of the current response, extracts the
// authentication scheme and realm, asks the plugin host for matching
// credentials through NPN_GetAuthenticationInfo() and stores them in
// m_strUsername / m_strPassword. Returns S_OK with an AddRef'ed interface, or
// E_NOINTERFACE when the header is missing, the scheme is NTLM or the
// credentials cannot be obtained.
HRESULT WINAPI MonitorSink::QueryIAuthenticate(void* pv, REFIID riid, LPVOID* ppv, DWORD dw)
{
    * ppv = NULL;

    if ( pv && InlineIsEqualGUID(riid, IID_IAuthenticate) )
    {
        MonitorSink * pThis = (MonitorSink *)pv;

        if ( pThis->m_pIEHostWindow && ! pThis->m_strURL.IsEmpty() && pThis->m_spTargetProtocol )
        {
            do
            {
                CComPtr<IWinInetHttpInfo> spWinInetHttpInfo;
                if ( FAILED(pThis->m_spTargetProtocol->QueryInterface(&spWinInetHttpInfo)) ) break;
                if ( ! spWinInetHttpInfo ) break;

                // The raw header returned by IWinInetHttpInfo::QueryInfo() is not Unicode
                CHAR szRawHeader[8192];
                DWORD dwBuffSize = ARRAYSIZE(szRawHeader);
                if ( FAILED(spWinInetHttpInfo->QueryInfo(HTTP_QUERY_RAW_HEADERS, szRawHeader, &dwBuffSize, 0, NULL)) ) break;

                CString strHeader;
                HttpRawHeader2CrLfHeader(szRawHeader, strHeader);

                static const WCHAR AUTH_HEAD [] = L"\r\nWWW-Authenticate:";

                LPWSTR lpAuth = NULL;
                size_t nAuthLen = 0;
                if ( ! ExtractFieldValue( strHeader, AUTH_HEAD, & lpAuth, & nAuthLen ) ) break;
                if ( ! lpAuth ) break;

                CString strAuthScheme;
                CString strAuthRealm;

                // The header value can look like any of these:
                // WWW-Authenticate: Basic realm="Secure Area"
                // WWW-Authenticate: Digest realm="*****@*****.**", qop="auth,auth-int", nonce="dcd98b7102dd2f0e8b11d0f600bfb0c093", opaque="5ccc069c403ebaf9f0171e9517f40e41"
                // WWW-Authenticate: NTLM
                // WWW-Authenticate: NTLM <auth token>
                LPWSTR pPos = StrStrW(lpAuth, L" ");
                if ( pPos )
                {
                    // scheme is everything before the first space
                    * pPos = L'\0';
                    strAuthScheme = lpAuth;

                    // realm is the quoted value following "realm", if present
                    do
                    {
                        pPos = StrStrIW( pPos + 1, L"realm");
                        if ( ! pPos ) break;

                        pPos = StrChrW( pPos + 5, L'=');
                        if ( ! pPos ) break;

                        pPos = StrChrW( pPos + 1, L'"');
                        if ( ! pPos ) break;

                        LPWSTR lpRealm = pPos + 1;
                        pPos = StrChrW( lpRealm, L'"');
                        if ( ! pPos ) break;

                        * pPos = L'\0';
                        strAuthRealm = lpRealm;
                    } while (false);
                }
                else
                {
                    strAuthScheme = lpAuth;
                }

                VirtualFree( lpAuth, 0, MEM_RELEASE);

                // NPN_GetAuthenticationInfo cannot supply the NTLM domain, so a
                // login is not possible and NTLM is not supported. The check is
                // made on the scheme: for "WWW-Authenticate: NTLM" the realm is
                // empty, so testing the realm never rejected NTLM.
                if (strAuthScheme.CompareNoCase(_T("NTLM")) == 0) return E_NOINTERFACE;

                CUrl url;
                if ( url.CrackUrl(pThis->m_strURL) )
                {
                    CW2A aScheme(url.GetSchemeName());
                    CW2A aHost(url.GetHostName());
                    int aPort = url.GetPortNumber();

                    char* username = NULL;
                    char* password = NULL;
                    uint32_t ulen = 0, plen = 0;

                    char* szAuthScheme = CStringToUTF8String(strAuthScheme);
                    char* szAuthRealm = CStringToUTF8String(strAuthRealm);
                    NPError result = NPN_GetAuthenticationInfo(pThis->m_pIEHostWindow->m_pPlugin->m_pNPInstance,
                                                               aScheme, aHost, aPort,
                                                               szAuthScheme, szAuthRealm,
                                                               &username, &ulen,
                                                               &password, &plen );
                    delete[] szAuthScheme;
                    delete[] szAuthRealm;
                    if (result != NPERR_NO_ERROR) break;

                    pThis->m_strUsername = username;
                    pThis->m_strPassword = password;

                    NPN_MemFree(username);
                    NPN_MemFree(password);
                }

                * ppv = dynamic_cast<IAuthenticate *>(pThis);
                ((IUnknown*)*ppv)->AddRef();

                return S_OK;

            } while (false);
        }
    }

    return E_NOINTERFACE;
}
void CCrawl::Fetch(void *arg) { string str_url,host; int nGsock = -1;//之前的套接字文件描述符 string strGHost;//之前的主机号 //生成一个PSE file来存放网页数据 //string ofs_name = DATA_PSE_FILE + "." + CStrFunction::itos(GetCurrentThreadId());//PSE.raw+当前线程号 string ofs_name = DATA_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt CPSEFile pse_file(ofs_name);//创建一个PSE格式的文件,保存为原始网页库 //生成一个link_for_pse file来存放链接数据 ofs_name = DATA_LINK_FOR_PSE_FILE + CStrFunction::itos(GetCurrentThreadId())+ ".txt";//PSE+当前线程号+.txt CLinkForPSEFile link_for_pse_file(ofs_name);//创建一个网页结构库 int isleep_cnt = 0;//线程运行控制参数 for(;;) { WaitForSingleObject(mutex_collection,INFINITE);//互斥锁 int cnt = map_urls.size(); if(cnt > 0) { //已经收集的没有访问的url cout<<"collection has "<<cnt<<" unvisited urls"<<endl; multimap<string,string>::iterator it = map_urls.begin(); if(it != map_urls.end()) { //从带访问的url队列中得到一个url进行访问 str_url = (*it).second; map_urls.erase(it); ReleaseMutex(mutex_collection); //分解url CUrl iurl; //看看url是否有http://,没有则返回 if(iurl.ParseUrl(str_url) == false) { cout<<"parse url false in Fetch"<<str_url<<endl; continue; } //表明现在抓取的网页所在的主机,同之前抓取的网页所在的主机不同 //我们不能利用之前的套接字文件描述符进行CS通信,必须创建新的 //套接字文件描述符进行通信,这是由于循环导致的 if(strGHost != iurl.host_name) { closesocket(nGsock); nGsock = -1; strGHost = iurl.host_name; } //根据URL以及套接字文件描述符抓取URL对应的网页,并保存为原始网页库和网页结构库 ((CCrawl *)arg)->DownroadFile(&pse_file,&link_for_pse_file,iurl,nGsock); cnt = 0; }else { ReleaseMutex(mutex_collection); } }else { //等待访问的url队列map_urls已经没有url了,这是我们需要挂起线程进行等待 ReleaseMutex(mutex_collection); Sleep(1000); isleep_cnt++; } if(b_f_over == true && isleep_cnt == 200)//当线程挂起的次数达到两百的时候,结束调用fetch { break; } } pse_file.Close(); link_for_pse_file.Close(); }
//将url放入map_urls到容器中 void CCrawl::AddUrl(const char * url) { string str_url = url; if(str_url.empty() || str_url.length() < 8) { cout<<"the url is empty or too short"<<endl; return ; } CPage ipage; if(ipage.NormalizeUrl(str_url) == false) return ; CUrl iurl; //图片类型的网页,存放到历史网页链接库中 if(iurl.IsImageUrl(str_url)) { if(ofs_link_for_history_file) { WaitForSingleObject(mutex_link_for_history_file,INFINITE); ofs_link_for_history_file<<str_url<<endl; ReleaseMutex(mutex_link_for_history_file); } return ; } if(iurl.ParseUrl(str_url) == false) { cout<<"parse url error in AddUrl"<<endl; return ; } if(iurl.IsValidHost(iurl.host_name.c_str()) == false) { cout<<"not the valid host in AddUrl"<<endl; return ; } if(iurl.IsForeignHost(iurl.host_name.c_str()) ) { cout<<"foreign host in AddUrl"<<endl; return ; } //如果是阻塞的ip地址,剔除掉 unsigned long inaddr = 0; char *ip = NULL; inaddr =(unsigned long) inet_addr(iurl.host_name.c_str()); if(inaddr != INADDR_NONE) { ip = new char[iurl.host_name.size() + 1]; memset(ip,0,iurl.host_name.size() + 1); memcpy(ip,iurl.host_name.c_str(),iurl.host_name.size()); if(!iurl.IsValidIp(ip)) { delete []ip; ip = NULL; return ; } delete []ip; ip = NULL; } CStrFunction::StrToLower(iurl.host_name,iurl.host_name.size()); CMD5 imd5; imd5.GenerateMd5((unsigned char *)str_url.c_str(),str_url.size()); string str_digest = imd5.ToString(); if(set_visited_url_md5.find(str_digest) != set_visited_url_md5.end()) { return ; } if(set_unvisited_url_md5.find(str_digest) != set_unvisited_url_md5.end()) { return ; } else { WaitForSingleObject(mutex_unvisited_url_md5,INFINITE); set_unvisited_url_md5.insert(str_digest); ReleaseMutex(mutex_unvisited_url_md5); } //确保同一个线程在一个网站上爬取 int cnt = 0; for(;;) { if(1)//??????? { WaitForSingleObject(mutex_visited_url_md5,INFINITE); map_urls.insert(val_type(iurl.host_name,str_url)); ReleaseMutex(mutex_visited_url_md5); break; } else { cnt++; if(cnt%100 == 0) cout<<"~"; if(cnt == 5000) { cout<<"remove it"<<endl; } Sleep(4000); } } }
EXPORT_C void CUrl::SetL(CUrl& aUrl)
//
// Replaces this url's descriptor with a newly allocated copy of aUrl's.
// The copy is made before the old descriptor is deleted.
	{
	HBufC* replacement = aUrl.UrlDes().AllocL();
	delete iUrlDes;
	iUrlDes = replacement;
	}
int FetcherManager::fetch() { InfoCrawler *infocrawler = InfoCrawler::getInstance(); UrlAnalyseManager *urlAnalyseManager = infocrawler->getUrlAnalyseManager(); CURL *curl = curl_easy_init(); curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); //just to start the cookie engine curl_easy_setopt(curl, CURLOPT_SHARE, sh); while(running()) { curl_easy_reset(curl); UrlNode *urlnode = NULL; bool html_from_outer= false; urlnode = urlAnalyseManager->getUrlFromOuterHtml(); if (urlnode) { html_from_outer = true; } else { urlnode = urlAnalyseManager->getUrl(); } if (urlnode == NULL) { my_sleep(100 * 1000); //0.1s continue; } if (!(urlnode->task)) { mylog_info(m_pLogGlobalCtrl->infolog, "node task is null %s - %s:%s:%d",urlnode->url,INFO_LOG_SUFFIX); } TaskOtherInfo *taskother = infocrawler->getTaskScheduleManager()->getTaskOtherInfo(urlnode->taskid); int taskbatch = urlnode->taskbatch; if (urlnode->needtologin) { //need to login and cookie is null if (!(infocrawler->getTaskScheduleManager()->getCookieFromTask(urlnode->taskid))) { if (taskother->fetchingcookie) { infocrawler->getUrlAnalyseManager()->insertUrl(urlnode); infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(urlnode->task,taskbatch); #ifdef URLMEMCACHEDB infocrawler->deleteUrlMcLocalThread(); #endif continue; } else { doLogin(curl, urlnode->task, urlnode); } } } /*if (urlnode->task->sourcetype == SOURCE_TYPE_COMPANY && urlnode->type & URL_TYPE_HOMEPAGE) { strcat(urlnode->url, "&event=32698647"); strcpy(urlnode->refererurl, "http://search.china.alibaba.com/tools/validate_redirect.htm?ru=http%253A%252F%252Fsearch.china.alibaba.com%252Fcompany%252Fcompany_search.htm%253Fkeywords%253D%25CA%25D6%25BB%25FA%2526pageSize%253D30%2526n%253Dy%2526showStyle%253Dpopular%2526beginPage%253D4&event=32698647&n=y"); }*/ CUrl url; url.parse(urlnode->url); //wrong url format if (url.getUrl().empty()) { infocrawler->getTaskScheduleManager()->increaseTaskErrorUrlNum(urlnode->taskid); 
infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(urlnode->task, taskbatch); infocrawler->getLocalDbManager()->decidesaveFetched(urlnode); delete urlnode; #ifdef URLMEMCACHEDB infocrawler->deleteUrlMcLocalThread(); #endif continue; } Page page; Buffer *content = create_buffer(DEFAULT_PAGE_BUF_SIZE); //do fetch HttpProtocol httpprotocol; char downstatistic[512] ; downstatistic[0] = 0; RESPONSE_HEADER rheader; // mylog_info(m_pLogGlobalCtrl->infolog, "before fetch %s %s %llu %d %d - %s:%s:%d",url.getUrl().c_str(), urlnode->url, urlnode->id, urlnode->taskid, urlnode->errornum,INFO_LOG_SUFFIX); //int ret = httpprotocol.fetch(url, content, urlnode, page, infocrawler->getConf()->httptimeout,urlnode->task->tasksendtype); // int ret = httpprotocol.curl_fetch(curl, url, content, urlnode, infocrawler->getConf()->httptimeout, urlnode->task->tasksendtype, &rheader, downstatistic); int sendtype = urlnode->task->tasksendtype; if (urlnode->task->sourcetype == SOURCE_TYPE_COMPANY && urlnode->type & URL_TYPE_HOMEPAGE) { /*FILE * f = fopen("ali.txt", "rb"); char line[1024] = {0}; int i = 0; string cookie; string post; while(fgets(line, 1023, f)) { char *newline = strtrim(line, NULL); if (i++ == 0) { cookie = newline; } else { post = newline; } } fclose(f); */ sendtype = REQUEST_TYPE_GET; } int ret = 0; if (!html_from_outer) { ret = httpprotocol.curl_fetch(curl, url, content, urlnode, infocrawler->getConf()->httptimeout, sendtype, &rheader, downstatistic); mylog_info(m_pLogGlobalCtrl->infolog, "after fetched %s %s %d - %s:%s:%d",url.getUrl().c_str(), downstatistic, ret,INFO_LOG_SUFFIX); } else { add_buffer(content, (char *)urlnode->html.c_str(), urlnode->html.length()); ret = urlnode->html.length(); mylog_info(m_pLogGlobalCtrl->infolog, "get url from outer %s %d - %s:%s:%d", url.getUrl().c_str(), ret,INFO_LOG_SUFFIX); } /*if (ret == HTTP_FETCH_RET_REDIRECT) { //redirect int redirectnum = urlnode->redirectnum +1; if (redirectnum <= URL_FETCH_REDIRECT_TIMES) { UrlNode 
*newurlnode = new UrlNode(urlnode->task,urlnode->topicsource,urlnode->title,urlnode->taskbatch,(char *)urlnode->fatherurl,(char *)page.m_sLocation.c_str(), urlnode->other, urlnode->maxtype,urlnode->type, 0, urlnode->id,redirectnum ,urlnode->page,urlnode->layerid,urlnode->bbsid,urlnode->needtologin); newurlnode->insertother(URLNODE_OTHER_TYPE_COOKIE,(char *)page.m_sCookie.c_str(), page.m_sCookie.length()); errorlog("ERROR: fetched %s %s relocated to %s %llu %d\n", url.getUrl().c_str(), urlnode->url, newurlnode->url, newurlnode->id, newurlnode->taskid); infocrawler->getUrlAnalyseManager()->insertUrl(newurlnode); }else { errorlog("ERROR: redirectunm > %d fetched %s %s relocated to %s %d\n", URL_FETCH_REDIRECT_TIMES, url.getUrl().c_str(), urlnode->url, (char * )page.m_sLocation.c_str(), urlnode->taskid); } urlnode->errornum = 0; */ if (ret == HTTP_FETCH_RET_ERROR) {//just discard urlnode->errornum++; mylog_error(m_pLogGlobalCtrl->errorlog, "fetched %s - %s:%s:%d:%d", url.getUrl().c_str(),INFO_LOG_SUFFIX,urlnode->errornum); /*} else if (ret == HTTP_FETCH_RET_ERROR_INVALIDHOST) { //invalid host, can not access urlnode->errornum++; errorlog("ERROR: fetched %s invalidhost %d\n", url.getUrl().c_str(), urlnode->errornum); */ } else if (ret == HTTP_FETCH_RET_ERROR_UNACCEPTED) { //content is invalid, discard urlnode->errornum = URL_FETCH_RETRY_TIMES; //errorlog("ERROR: fetched %s unaccepted contenttype %d %s\n", url.getUrl().c_str(), urlnode->errornum, page.m_sContentType.c_str()); } else { //ok //increase fetch num if (urlnode->type & URL_TYPE_NEEDTOSAVE) infocrawler->getTaskScheduleManager()->increaseFetchNum(urlnode->task); urlnode->errornum = 0; //extract urls and analyse, insert new url into queue char nextpageurl[MAX_URL_LEN] ; nextpageurl[0] = 0; int nextpage = infocrawler->getUrlAnalyseManager()->analyseUrls(urlnode, &rheader, content->data, ret, nextpageurl, html_from_outer); if (html_from_outer) { nextpage = 0; nextpageurl[0] = 0; } //write content to disk if we 
need, write fetched url into dist if (urlnode->type & URL_TYPE_NEEDTOSAVE) { if (urlnode->task->sourcetype == SOURCE_TYPE_BBS) { char oldurlnodedata[64]; int tasktmp = 0; int pagetmp = 0; ulonglong idtmp = 0; if (InfoCrawler::getInstance()->getLocalDbManager()->alreadyfetched(urlnode,oldurlnodedata)) { sscanf(oldurlnodedata, "%llu/%d/%*d/%*d/%d/%*u", &idtmp,&pagetmp, &tasktmp); if (pagetmp == urlnode->page) { int rettmp = infocrawler->getLocalDbManager()->erasecontent(idtmp,tasktmp); } } } if (urlnode->nextpage == 1 && nextpage >1) { urlnode->nextpage = nextpage; } mylock::get_instance()->get(urlnode->id); infocrawler->getLocalDbManager()->savecontent(urlnode, &rheader, content->data, ret, nextpage); mylock::get_instance()->put(urlnode->id); //infocrawler->getPageManager()->SavePage(content->data, ret, urlnode, &rheader); mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s saved content%d %d title %s urlid %llu taskid %d batchid %d - %s:%s:%d",url.getUrl().c_str(), urlnode->errornum, ret,urlnode->title,urlnode->id, urlnode->taskid, urlnode->taskbatch ,INFO_LOG_SUFFIX); /*if ((urlnode->nextpage > 1)&& !(urlnode->type & URL_TYPE_HOMEPAGE)) {//if have nextpage, don't not save fetched mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s have next, so don't not save fetched - %s:%s:%d",url.getUrl().c_str(),INFO_LOG_SUFFIX); } else { infocrawler->getLocalDbManager()->saveFetched(urlnode); infocrawler->getLocalDbManager()->saveUrl(urlnode, SAVE_FATHER_URL); mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s save fetched%d - %s:%s:%d",url.getUrl().c_str(),urlnode->errornum,INFO_LOG_SUFFIX); }*/ if (urlnode->nowpage == urlnode->totalpage) { infocrawler->getLocalDbManager()->saveFetched(urlnode); infocrawler->getLocalDbManager()->saveUrl(urlnode, SAVE_FATHER_URL); //0128.begin() /* char *content=NULL; char dbname[64] = ""; char recordname[64] = ""; char urldbname[64] = ""; DBAccess *dbaccess = DBAccess::getInstance(); getContentDBName1(urlnode, dbname); getRecordKeyName1(urlnode, 
recordname); getUrlDBName1(urlnode, urldbname); int suffix = dbaccess->load(dbname); string fileno; DBD *dbd = dbaccess->get(suffix, fileno, recordname, NULL); if (dbd != NULL) { DbHypertableManager * dbhyper=InfoCrawler::getInstance()->gethyper(); ICCONFIG *ifcong_=InfoCrawler::getInstance()->getConf(); // dbhyper->get_now_time(); string now_=TimeToString1(); //string now_; if( dbhyper->insert_data_to_hypertable_content(urlnode,dbd->datbuf,dbd->datlen_u,string("content_tbl"),string("gbk"),ret,now_) ) { char fetchtbl[32];fetchtbl[0]=0; //sprintf(fetchtbl,"fetch_%d_tbl",urlnode->taskid); sprintf(fetchtbl,"fetch_%d_tbl",1); if( dbhyper->insert_data_to_hypertable_fetch(urlnode,string(fetchtbl),ifcong_->spider_id,ret,now_) ) { char memorytable[128]; memorytable[0]=0; //sprintf(memorytable,"url_%d_tbl",urlnode->taskid); sprintf(memorytable,"url_%d_tbl",1); dbhyper->insert_data_to_hypertable_memorytable(urlnode,memorytable); } } dbd_free(dbd); }*/ //0128.end() /*char * final_content; final_content=NULL; final_content=get_final_content(urlnode); if(final_content !=NULL) { insert_data_to_hypertable(urlnode->fatherurl,final_content); delete []final_content; }*/ mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s save fetched%d nowpage %d totalpage %d - %s:%s:%d",url.getUrl().c_str(),urlnode->errornum,urlnode->nowpage,urlnode->totalpage,INFO_LOG_SUFFIX); } else { mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s have next, so don't not save fetched nowpage %d totalpage %d - %s:%s:%d",url.getUrl().c_str(),urlnode->nowpage,urlnode->totalpage,INFO_LOG_SUFFIX); } } //insert next page if (nextpageurl[0]) { UrlNode *newnode = new UrlNode; if (!(urlnode->type & URL_TYPE_HOMEPAGE)) newnode->id = urlnode->id; newnode->task = urlnode->task; newnode->taskid = urlnode->taskid; newnode->copyother(urlnode->other,urlnode->maxtype); newnode->type = urlnode->type; newnode->page = urlnode->page + 1; newnode->copyurl(nextpageurl); newnode->copyfatherurl(urlnode->fatherurl); newnode->layerid = 
urlnode->layerid; newnode->needtologin = urlnode->needtologin; newnode->taskbatch= urlnode->taskbatch; newnode->copytitle(urlnode->title); newnode->copytopicsource(urlnode->topicsource); mylog_info(m_pLogGlobalCtrl->infolog, " now url %s new url %s title %s - %s:%s:%d",urlnode->url, newnode->url,urlnode->title,INFO_LOG_SUFFIX); infocrawler->getUrlAnalyseManager()->insertUrl(newnode, INSERT_URL_FORCED); } } free_buffer(content); Task *task = urlnode->task; int taskid = urlnode->taskid; //if get an error, we will retry but only fixed times if (urlnode->errornum > 0 && urlnode->errornum < URL_FETCH_RETRY_TIMES) { mylog_error(m_pLogGlobalCtrl->errorlog, "fetched %s reinsert for error %d %llu - %s:%s:%d", url.getUrl().c_str(), urlnode->errornum, urlnode->id,INFO_LOG_SUFFIX); infocrawler->getUrlAnalyseManager()->insertUrl(urlnode, INSERT_URL_FORCED,false); } else if (urlnode->errornum >= URL_FETCH_RETRY_TIMES) { infocrawler->getTaskScheduleManager()->increaseTaskErrorUrlNum(taskid); //write error url to DB // infocrawler->getDbManager()->WriteFetchError(url.getUrl().c_str(),taskid,taskbatch); mylog_error(m_pLogGlobalCtrl->errorlog, "fetched finished and download url %s urlnodeid %llu taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), urlnode->id, taskid,INFO_LOG_SUFFIX,urlnode->errornum); infocrawler->getLocalDbManager()->decidesaveFetched(urlnode); if (urlnode->type & URL_TYPE_NEEDTOSAVE) { infocrawler->getLocalDbManager()->saveUrl(urlnode); } delete urlnode; } else { mylog_info(m_pLogGlobalCtrl->infolog, "fetched finished %s error %d %llu fatherurl %s - %s:%s:%d",url.getUrl().c_str(), urlnode->errornum, urlnode->id, urlnode->fatherurl,INFO_LOG_SUFFIX); delete urlnode; } infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(task, taskbatch); #ifdef URLMEMCACHEDB infocrawler->deleteUrlMcLocalThread(); #endif } curl_easy_cleanup(curl); mylog_info(m_pLogGlobalCtrl->infolog, "FetcherManager ISRUNNING false - %s:%s:%d",INFO_LOG_SUFFIX); }
HRESULT FAsyncDownload::FHttpDownloadTP::ProcessDownload(FAsyncDownData *pData) { HRESULT hr = E_FAIL; FString ReqUrl = pData->m_pUrlInfo->m_DownloadUrl; UrlUnescapeInPlace(ReqUrl.GetBuffer(), 0); CUrl url; url.CrackUrl(ReqUrl); const tchar* pszUserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)"; FHInternet hIn = NULL; if (g_AppSettings.m_Proxy.GetLength() > 0) { hIn = InternetOpen(pszUserAgent, INTERNET_OPEN_TYPE_PROXY, g_AppSettings.m_Proxy, g_AppSettings.m_ProxyA, 0); } else { hIn = InternetOpen(pszUserAgent, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0); } if (NULL == hIn) return E_HTTP_NET_ERROR; FHInternet hCon = InternetConnect(hIn, url.GetHostName(), url.GetPortNumber(), url.GetUserName(), url.GetPassword(), INTERNET_SERVICE_HTTP, 0, 0); if (NULL == hCon) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: InternetConnect() failed: %d\n", GetLastError()); return E_HTTP_NET_ERROR; } ULONG ulRecvTimeout = 15000; InternetSetOption(hCon, INTERNET_OPTION_RECEIVE_TIMEOUT, &ulRecvTimeout, sizeof(ULONG)); FString StrRes = url.GetUrlPath(); StrRes+= url.GetExtraInfo(); FHInternet hReq = HttpOpenRequest(hCon, "GET", StrRes, NULL, NULL, NULL, INTERNET_FLAG_NO_CACHE_WRITE | INTERNET_FLAG_DONT_CACHE, 0); if (NULL == hReq) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: HttpOpenRequest() failed: %d\n", GetLastError()); return E_HTTP_NET_ERROR; } size_type FileSize = 0; if (!(pData->m_pUrlInfo->m_dwDownloadFlags & HTTP_FLAG_NO_RESUME)) FileSize = GetFileSize(pData->m_pUrlInfo->m_DownloadFile); // See if file already exists on the disk. 
if (FileSize > 0) { FString StrRange; StrRange.Format("Range: bytes=%I64d-", FileSize); HttpAddRequestHeaders(hReq, StrRange, StrRange.GetLength(), HTTP_ADDREQ_FLAG_ADD_IF_NEW); } FString StrVersion; StrVersion.Format("LTV_VERSION: %s", g_AppSettings.m_AppVersion); HttpAddRequestHeaders(hReq, StrVersion, StrVersion.GetLength(), HTTP_ADDREQ_FLAG_ADD_IF_NEW); if (!HttpSendRequest(hReq, NULL, 0, NULL, 0)) { int err = GetLastError(); _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: HttpSendRequest() failed: %d (0x%x)\n", err, HRESULT_FROM_WIN32(err)); InternetCloseHandle(hCon); InternetCloseHandle(hIn); return E_HTTP_NET_ERROR; } const DWORD dwBufferSize = 8192; char pBuffer[dwBufferSize]; FHttpConnection FConn = hReq; DWORD dwStatusCode = FConn.GetStatusCode(); FString ReqContentType = pData->m_pUrlInfo->m_ContentType; pData->m_pUrlInfo->m_ContentType = FConn.GetHeader(HTTP_QUERY_CONTENT_TYPE); pData->m_pUrlInfo->m_dwStatusCode = dwStatusCode; if (!MatchContentType(ReqContentType, pData->m_pUrlInfo->m_ContentType)) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Content type mismatch: %s/%s\n", ReqContentType, pData->m_pUrlInfo->m_ContentType); return E_NOINTERFACE; //E_NOINTERFACE = content type mismatch } if (dwStatusCode == 416 && FileSize > 0) { _DBGAlert("FAsyncDownload::FHttpDownloadTP::ProcessDownload: Server status code: %d. 
Download complete\n", dwStatusCode); return S_OK; } if (dwStatusCode < 200 || dwStatusCode > 206) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Server status code: %d\n", dwStatusCode); if (dwStatusCode == 404) return E_HTTP_NOTFOUND; return E_HTTP_INVALID_STATUS; } CAtlFile OutFile; if (pData->m_pUrlInfo->m_dwDownloadFlags & HTTP_FLAG_NO_RESUME) DeleteFile(pData->m_pUrlInfo->m_DownloadFile); hr = OutFile.Create(pData->m_pUrlInfo->m_DownloadFile, GENERIC_WRITE, 0, OPEN_ALWAYS); if (FAILED(hr)) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: CreateFile failed: 0x%x, %d : %s\n", hr, GetLastError(), pData->m_pUrlInfo->m_DownloadFile); return E_HTTP_WRITE_FILE; } size_type llTotalRead = 0; size_type llSizeMax = 0; size_type ContentLen = FConn.GetContentLength(); pData->m_pUrlInfo->m_ContentLength = ContentLen; if (dwStatusCode == 206) { FString FStrRange = FConn.GetHeader(HTTP_QUERY_CONTENT_RANGE); if (FStrRange) { //Content-Range: bytes 21010-47021/47022 const char* pszBytes = strstr(FStrRange, "bytes "); if (pszBytes != NULL) { pszBytes+=sizeof("bytes"); LONGLONG llOffset = _strtoi64(pszBytes, NULL, 10); hr = OutFile.Seek(llOffset, FILE_BEGIN); llTotalRead = (size_type)llOffset; if (FAILED(hr)) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Seek to position %d failed: 0x%x, %d\n", hr, GetLastError()); } const char* pszTotal = strchr(pszBytes, '/'); if (pszTotal != NULL) llSizeMax = _strtoi64(pszTotal + 1, NULL, 10); } } } else { if (ContentLen > 0 && ContentLen == FileSize) { OutFile.Close(); return S_OK; } } if (llSizeMax == 0) llSizeMax = ContentLen; pData->pBindStatusCallback.OnProgress((ULONG)llTotalRead, (ULONG)llSizeMax, BINDSTATUS_BEGINDOWNLOADDATA, L""); DWORD dwBytesRead = 0; for (;;) { if (!InternetReadFile(hReq, pBuffer, dwBufferSize, &dwBytesRead)) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: InternetReadFile() failed: %d\n", GetLastError()); OutFile.Close(); return 
E_HTTP_NET_ERROR; } if (dwBytesRead == 0) { hr = S_OK; break; } DWORD dwBytesWritten = 0; hr = OutFile.Write(pBuffer, dwBytesRead, &dwBytesWritten); if (FAILED(hr)) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: FileWrite failed: 0x%x, %d\n", hr, GetLastError()); OutFile.Close(); return E_HTTP_WRITE_FILE; } llTotalRead+=dwBytesRead; pData->pBindStatusCallback.OnProgress((ULONG)llTotalRead, llSizeMax > 0 ? (ULONG)llSizeMax : llTotalRead , BINDSTATUS_DOWNLOADINGDATA, L""); if (m_pThis->m_Stopping || pData->pBindStatusCallback.m_Abort) { _DBGAlert("**FAsyncDownload::FHttpDownloadTP::ProcessDownload: Download aborted\n", hr, GetLastError()); hr = E_ABORT; break; } } OutFile.Close(); return hr; }
int RunConverter( int argc, _TCHAR* * argv ) { #ifdef _DEBUG // sleep a bit so we can have time to attach a debugger Tell(_T("Sleeping for %d seconds in debug mode."), startupTimeout / 1000); Sleep(startupTimeout); #endif int ret = 0; wstring name; wstring title; HANDLE conversionHandle = NULL; po::options_description desc("Converts an MPEG-2 Program Stream to a DVR-MS, WMV, or WTV file."); po::positional_options_description pos; string input; string output; LONGLONG length; bool disableFileLogging; bool disableConsoleLogging; bool disableAllLogging; string interruptName; string interruptDirectory; string outputDirectory; string contentTitle; __int64 contentDuration = -1i64; desc.add_options() ("help,?", "Display help message.") ("input,i", po::value<string>(&input), "an MPEG2 input path. Can be a url.") ("output,o", po::value<string>(&output), "output path.<type>. Where <type> can be one of \"dvr-ms\", \"wmv\", or \"wtv\"") ("length,l", po::value<LONGLONG>(&length)->default_value(-1), "the length of the input content in bytes. Only required for a network path such as http." ) ("interrupt-name", po::value<string>(&interruptName), "the file name (without path or extension) of a file that will be created when conversion is to be interrupted.") ("interrupt-directory", po::value<string>(&interruptDirectory), "the path for this app to look for an interrupt file. An interrupt file is the interrupt_file name with a .interrupt extension. 
The file itself can be empty.") ("disable-file-logging", po::value<bool>(&disableFileLogging)->zero_tokens()->default_value(false), "indicates that logging to a file will be disabled.") ("disable-console-logging", po::value<bool>(&disableConsoleLogging)->zero_tokens()->default_value(false), "indicates that logging to the console will be disabled.") ("disable-all-logging", po::value<bool>(&disableAllLogging)->zero_tokens()->default_value(false), "indicates that all logging will be disabled.") ("output-directory,d", po::value<string>(&outputDirectory), "the directory for this app to place conversion output. Only valid if output_path is omitted.") ("content-title,t", po::value<string>(&contentTitle), "the Title that will be assigned to the output path.<type>.") ("version,v", po::value<string>()->zero_tokens(), "prints the version of this app.") //("content-duration,d", po::value<__int64>(&contentDuration)->default_value(-1i64), "the duration of the input content in seconds." ) ; pos.add("input", 1); pos.add("output", 1); pos.add("length", 1); vector<string> args; for (int i = 1; i < argc; i++) args.push_back(WStringToString(argv[i])); po::variables_map variables; try { po::basic_parsed_options<char> oo = po::command_line_parser(args). options(desc).positional(pos).run(); po::store(oo, variables); po::notify(variables); } catch (std::exception e) { Tell(_T("Invalid command line. Use --help to see options.")); return -1; } if (!variables.count("input")) { bool display = false; wstring message; if (variables.count("version")) { message = _T("Version: "); message += MPEG2DVRMS_VERSION; display = true; } if (variables.count("help")) { message = _T("eh... help message not available yet. 
Hope you have the source!"); display = true; } if (!display) message = _T("No input file was specified."); ret = 100; Tell(message); } else { try { ////////////////////////////////////////////////////////////////////////// // command-line option handling LONGLONG contentLength = -1; if (variables.count("length")) contentLength = length; if (variables.count("interrupt-name")) name = StringToWString(interruptName); else { name = NewGuid(); Tell(_T("Generated interrupt name is %s"), name.c_str()); } if (variables.count("interrupt-directory")) _conversionFileStoragePath = StringToWString(interruptDirectory); if (variables.count("content-title")) title = StringToWString(contentTitle); else title = _T(""); #pragma region input output file handling wstring defaultExtension; if (IsVista()) defaultExtension = _T(".dvr-ms"); else defaultExtension = _T(".wtv"); ATL_URL_SCHEME urlScheme; CUrl inputUrl; if (!inputUrl.CrackUrl(StringToWString(input).c_str())) urlScheme = ATL_URL_SCHEME_FILE; else urlScheme = inputUrl.GetScheme(); if (urlScheme == -1) urlScheme = ATL_URL_SCHEME_FILE; wstring inputPath = StringToWString(input); CPath outputPath; if (urlScheme == ATL_URL_SCHEME_FILE) { CPath input = inputPath.c_str(); if (input.IsFileSpec()) { TCHAR szCurrentDirectory[MAX_PATH]; if (!GetCurrentDirectory(MAX_PATH, szCurrentDirectory)) throw CarverLab::Exception(GetLastError()); wstring currentDirectory = (LPCTSTR)szCurrentDirectory; inputPath = currentDirectory + _T("\\") + inputPath.c_str(); input = inputPath.c_str(); } if (!input.FileExists()) throw CarverLab::Exception(_T("MPEG2 input path does not exist.")); if (!variables.count("output")) outputPath = inputPath.c_str(); else outputPath = StringToWString(output).c_str(); } else if (urlScheme == ATL_URL_SCHEME_HTTP || urlScheme == ATL_URL_SCHEME_HTTPS) { if (!variables.count("output")) { wstring thefullpath; CString envString; envString.GetEnvironmentVariable(_T("PUBLIC")); thefullpath = envString; thefullpath += 
_T("\\Videos\\mpeg2dvrms-output"); thefullpath += defaultExtension; outputPath = thefullpath.c_str(); } else outputPath = StringToWString(output).c_str(); } else throw CarverLab::Exception(_T("Only http or https URL schemes are supported.")); bool isUrl = urlScheme != ATL_URL_SCHEME_FILE; CPath inPath = inputPath.c_str(); if (outputPath.GetExtension().MakeLower() == inPath.GetExtension().MakeLower()) outputPath.RenameExtension(defaultExtension.c_str()); if (outputPath.GetExtension().MakeLower() == _T(".dvrms")) { outputPath.RemoveExtension(); outputPath.AddExtension(_T(".dvr-ms")); } #pragma endregion input output file handling ////////////////////////////////////////////////////////////////////////// conversionHandle = CreateConversion(false, CComBSTR(name.c_str())); if (!conversionHandle) throw CarverLab::Exception(); SetConsoleTitle(outputPath); Tell(_T("Press ENTER to interrupt and exit.")); _done = false; HANDLE stdinput = GetStdHandle(STD_INPUT_HANDLE); std::auto_ptr<InternalThreadData> threadData(new InternalThreadData); threadData->activityCallback = ActivityCallback; threadData->contentLength = contentLength; threadData->conversionHandle = conversionHandle; threadData->inputPath = inPath; threadData->isUrl = isUrl; threadData->outputPath = outputPath; threadData->threadData = NULL; threadData->userData = NULL; threadData->contentTitle = StringToWString(contentTitle); threadData->contentDuration = contentDuration; _lastConvertedFilePath = outputPath; HANDLE thread = CreateThread(NULL, 0, BeginConversion, threadData.get(), 0, NULL); if (thread == NULL) throw CarverLab::Exception(); bool shuttingDown = false; bool interruptSuccessful = false; // will be true if the conversion is inactive after InterruptConversion is called while (!_done) { if (!shuttingDown && ((_kbhit() && _getch() == 13) || InterruptNow(name.c_str()))) { shuttingDown = true; interruptSuccessful = InterruptConversion(conversionHandle, 30000); // will wait 30 seconds for the conversion to die 
} Sleep(10); } if (!interruptSuccessful) { // TODO: will need to kill this puppy in an unnice way... awwww Tell(_T("InterruptConversion was unsusccessful.")); } // TODO: INFINITE? um... nope. this will need an intervention WaitForSingleObject(thread, INFINITE); CloseHandle(thread); } catch (CarverLab::Exception exception) { Tell(_T("*** Error: %s"), exception.GetErrorString()); Tell(_T("Exiting...")); ret = exception.GetHRESULT(); } catch (...) { DWORD errorCode = GetLastError(); wstring error = Exception::GetLastErrorString(errorCode); Tell(_T("*** Unhandled Exception: %s"), error.c_str()); Tell(_T("Exiting...")); ret = errorCode; } if (conversionHandle != NULL) CloseConversion(conversionHandle); } #ifdef _DEBUG // sleep a bit so we can see any errors Tell(_T("Sleeping for %d seconds in debug mode."), sleepTimeout / 1000); Sleep(sleepTimeout); #endif return ret; }
int CHttpRequest::ExecuteGet(bool Recurse) { bool bContinueExecuteGet = false; bool bKeepAlive = false; Trace(tagHttp, levInfo, ("CHttpRequest - ExecuteGet")); unsigned int nCredentialsIndex = 0; bool Result; do { if (! bKeepAlive) { Result = CreateSocket(); if (! Result) { return -1; } } bContinueExecuteGet = false; // process authentication CString AuthHeader; switch(m_ServerAuthState.GetLeg()) { case AUTHENTICATION_STATE_NONE: break; case AUTHENTICATION_STATE_PRECHALENGE: case AUTHENTICATION_STATE_CHALENGE: if (! m_ServerAuthState.GetNextHeader(& AuthHeader, nCredentialsIndex)) return m_RStatusValue; break; } ClearResults(false); // add the auth header if (AuthHeader.GetLength()) SetHttpField(g_strHttpAuthorization, AuthHeader); /* create socket and get results */ CStringTable Connection; if (!m_HttpFields.FindAndCopy(g_strHttpConnection, Connection) || ! Connection.GetValue(g_strHttpConnection).GetLength()) { SetHttpField(g_strHttpConnection, "Keep-Alive"); } Result = GetHTTP((float) 1.0); // 406: no acceptable objects found (NT4/ISM) // if (!Result || (m_RStatusValue == 406)) { // ClearResults(false); // Result = CreateSocket() && GetHTTP((float) 0.9); // if (! Result) { // return -1; // } // } Trace(tagHttp, levInfo, ("CHttpRequest - ExecuteGet {%d/%d}", Result, m_RStatusValue)); if (! 
Result) { return -1; } CString l_RedirectLoc; CUrl ResolvedUrl; switch(m_RStatusValue) { case 407: // $(TODO) break; case 401: // any valid authentication stage if (m_ServerAuthState.GetLeg() == AUTHENTICATION_STATE_NONE) { // we need to send credentials m_ServerAuthState.SetLeg(AUTHENTICATION_STATE_PRECHALENGE); bContinueExecuteGet = true; } else if (m_ServerAuthState.GetLeg() == AUTHENTICATION_STATE_CHALENGE) { nCredentialsIndex ++; // equal, because we do want to run once without username/password credentials if (nCredentialsIndex <= m_ServerAuthState.GetSize()) { bContinueExecuteGet = true; } m_ServerAuthState.SetLeg(AUTHENTICATION_STATE_PRECHALENGE); bKeepAlive = false; } else if (m_ServerAuthState.GetLeg() == AUTHENTICATION_STATE_PRECHALENGE) { m_ServerAuthState.SetLeg(AUTHENTICATION_STATE_CHALENGE); bContinueExecuteGet = true; bKeepAlive = true; } break; case 301: /* redirections */ case 302: case 303: case 307: l_RedirectLoc = m_RFields.FindElement(g_strHttpLocation).GetValue(g_strHttpLocation); if (! l_RedirectLoc.GetLength()) { Trace(tagHttp, levInfo, ("CHttpRequest - GetHTTP - %d redirection without a Location: header.", m_RStatusValue)); break; } // HTTP 1.1 - Temporary Redirect is 302 and 307 // m_RRedirections is relevant for final URL address // that could be retrieved ResolvedUrl = m_Url.Resolve(l_RedirectLoc); l_RedirectLoc = ResolvedUrl.GetHttpAll(); if (m_FollowRedirections && m_RRedirections.Contains(l_RedirectLoc)) { // avoid circular redirections Trace(tagHttp, levInfo, ("CHttpRequest - GetHTTP - circular redirection %s", l_RedirectLoc.GetBuffer())); return m_RStatusValue; } else m_RRedirections.Add(l_RedirectLoc); if (!m_FollowRedirections) return m_RStatusValue; if (m_ClientSocket.GetVerbose()) cout << "\n\t[" << l_RedirectLoc << "]"; ClearResults(false); m_Url.SetUrl(l_RedirectLoc); bContinueExecuteGet = true; break; case 305: /* use proxy */ l_RedirectLoc = m_RFields.FindElement(g_strHttpLocation).GetValue(g_strHttpLocation); if (! 
l_RedirectLoc.GetLength()) break; // HTTP 1.1 - Temporary Redirect is 302 and 307 // m_RRedirections is relevant for final URL address // that could be retrieved ClearResults(false); m_Proxy.SetUrl(l_RedirectLoc); bContinueExecuteGet = true; break; }; Trace(tagHttp, levInfo, ("CHttpRequest - GetHTTP - %d", m_RStatusValue)); } while (bContinueExecuteGet); return m_RStatusValue; }
int CHttpRequest::GetHTTP10(CUrl& iUrl, const CString& iE, inetSocket& Sock){ /* attempt a retrieval of HTTP/1.0 */ CString Request; if (ProxyURL.StrLength() && Proxy.isValid()) { if (RLimit) Request += "GET "; else Request+="HEAD "; Request += iUrl.GetScheme(); Request+="://"; Request += iUrl.GetHost(); if (iUrl.GetPortValue() != 80) { Request+=":"; Request += iUrl.GetPort(); } Request += iUrl.GetUrlPath(); Request += " HTTP/1.0"; Request+=iE; } else { if (RLimit) Request += "GET "; else Request+="HEAD "; Request += iUrl.GetUrlPath(); Request += " HTTP/1.0"; Request+=iE; Request += "Host: "; Request += iUrl.GetHost(); Request+=iE; } for (int i=0;i<RHeaderParams.entries_count();i++) { Request+=RHeaderParams.get_name(i); Request+=": "; Request+=RHeaderParams.get_value(i); Request += iE; } Request += iE; #ifdef _U_DEBUG cout << "# HTTP Request: =====" << endl; cout << Request; cout << "=====================" << endl; #endif /* issue request */ CString RLoc; if (!Send(Sock, Request)) return 0; ProcessHeader(Sock); ProcessData(Sock); switch(RStatusValue) { case 200: return 1; case 301: case 302: case 303: case 307: if (!FollowRedirections) return RStatusValue; RLoc = RHeaderResponse.get_value("Location"); if (RLoc.StrLength()) { /* HTTP 1.1 - Temporary Redirect is 302 and 307 RedirectVector is relevant for final URL address that could be retrieved */ if (!RedirectVector.Contains(RLoc)) { RedirectVector+=RLoc; CUrl NewURL(RLoc); if (!Proxy.isValid()) { inetSocket Sock2(NewURL.GetPortValue(), NewURL.GetHost()); return GetHTTP10(NewURL, iE, Sock2); } else { Sock.Reopen(); return GetHTTP10(NewURL, iE, Sock); } } } return RStatusValue; case 305: /* use proxy */ RLoc = RHeaderResponse.get_value("Location"); if (RLoc.StrLength()) { CUrl ProxyURL(RLoc); if (ProxyURL.isValid()) { inetSocket ProxySock(ProxyURL.GetPortValue(), ProxyURL.GetHost()); if (wsLastError.StrLength()) return RStatusValue; return GetHTTP10(iUrl, iE, ProxySock); } } return RStatusValue; default: return 
RStatusValue; } }