C++ (Cpp) CUrl::getUrl примеры использования

Язык программирования: C++ (Cpp)

Класс/Тип: CUrl

Метод/Функция: getUrl

Примеров на hotexamples.com: 2

C++ (Cpp) CUrl::getUrl - 2 примера найдено. Это лучшие примеры C++ (Cpp) кода для CUrl::getUrl, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

GetUrlPath(6)

GetPortValue(6)

GetHost(6)

GetScheme(5)

GetPort(4)

CrackUrl(4)

GetPortNumber(3)

GetHostName(3)

GetExtraInfo(2)

GetResource(2)

getUrl(2)

ConstructL(2)

ParseUrl(2)

getPort(1)

getDomain(1)

IsValidHost(1)

UrlDes(1)

SplitUrl(1)

OnWriteOver(1)

IsValidIp(1)

Component(1)

IsUrlType(1)

IsUrl(1)

IsImageUrl(1)

IsForeignHost(1)

GetUserName(1)

GetSchemeName(1)

GetPassword(1)

GetHttpAll(1)

GetBrute(1)

get_response_code(1)

Пример #1

Показать файл

Файл: FetcherManager.cpp Проект: codrocker/bloomServer

/**
 * Fetches the task's login URL and, on success, stores a marker cookie for the task.
 *
 * @param curl    libcurl easy handle, passed through to HttpProtocol::curl_login.
 * @param task    Task whose loginurl is requested; NULL is rejected.
 * @param urlnode URL node passed through to HttpProtocol::curl_login.
 * @return 1 when curl_login returns anything other than HTTP_FETCH_RET_ERROR,
 *         HTTP_FETCH_RET_ERROR_INVALIDHOST or HTTP_FETCH_RET_ERROR_UNACCEPTED;
 *         -1 when task is NULL, the task has no TaskOtherInfo, the login URL
 *         does not parse, or curl_login reports one of those errors.
 */
int FetcherManager::doLogin(CURL *curl, Task *task, UrlNode *urlnode) {
    // Must be checked before the first dereference (task->id below).
    if (!task) {
        return -1;
    }

    InfoCrawler *infocrawler = InfoCrawler::getInstance();
    TaskOtherInfo *taskother = infocrawler->getTaskScheduleManager()->getTaskOtherInfo(task->id);
    if (!taskother) {
        return -1;
    }

    CUrl url;
    url.parse(task->loginurl);

    // An empty URL after parsing means the login URL is malformed.
    if (url.getUrl().empty()) {
        return -1;
    }

    HttpProtocol httpprotocol;
    char downstatistic[512];
    downstatistic[0] = 0;
    RESPONSE_HEADER rheader;

    mylog_info(m_pLogGlobalCtrl->infolog, "before login %s - %s:%s:%d",url.getUrl().c_str(),INFO_LOG_SUFFIX);
    int ret = httpprotocol.curl_login(curl, url, urlnode, infocrawler->getConf()->httptimeout, &rheader, downstatistic);
    mylog_info(m_pLogGlobalCtrl->infolog, "after login  %s %s %d - %s:%s:%d",url.getUrl().c_str(), downstatistic, ret,INFO_LOG_SUFFIX);

    if (ret == HTTP_FETCH_RET_ERROR || ret == HTTP_FETCH_RET_ERROR_INVALIDHOST) {
        // Generic fetch failure and unreachable host are reported identically.
        mylog_error(m_pLogGlobalCtrl->errorlog, "login fetched %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), task->id,INFO_LOG_SUFFIX,ret);
        return -1;
    }
    if (ret == HTTP_FETCH_RET_ERROR_UNACCEPTED) { //content is invalid, discard
        mylog_error(m_pLogGlobalCtrl->errorlog, "LOGIN fetched %s unaccepted contenttyped %s taskid %d - %s:%s:%d:%d", url.getUrl().c_str(), rheader.contenttype.c_str(), task->id,INFO_LOG_SUFFIX,ret);
        return -1;
    }

    // Any other result counts as a successful login. fetchingcookie is raised
    // only for the duration of the saveCookie call.
    taskother->fetchingcookie = true;
    // A writable array instead of a pointer to a string literal: binding a
    // literal to a non-const char* is ill-formed since C++11.
    static char loginok[] = "LOGIN OK";
    saveCookie(task->id, loginok, strlen(loginok));
    taskother->fetchingcookie = false;
    return 1;
}

Пример #2

Показать файл

Файл: FetcherManager.cpp Проект: codrocker/bloomServer

int FetcherManager::fetch() {
    InfoCrawler *infocrawler = InfoCrawler::getInstance();
    UrlAnalyseManager *urlAnalyseManager = infocrawler->getUrlAnalyseManager();

    CURL *curl = curl_easy_init();
    curl_easy_setopt(curl, CURLOPT_COOKIEFILE, ""); //just to start the cookie engine
    curl_easy_setopt(curl, CURLOPT_SHARE, sh);

    while(running()) {
        curl_easy_reset(curl);

        UrlNode *urlnode = NULL;
        bool html_from_outer= false;

        urlnode = urlAnalyseManager->getUrlFromOuterHtml();
        if (urlnode) {
            html_from_outer = true;
        } else {
            urlnode = urlAnalyseManager->getUrl();
        }

        if (urlnode == NULL) {
            my_sleep(100 * 1000); //0.1s
            continue;
        }
        if (!(urlnode->task))
        {
            mylog_info(m_pLogGlobalCtrl->infolog, "node task is null %s - %s:%s:%d",urlnode->url,INFO_LOG_SUFFIX);
        }
        TaskOtherInfo *taskother = infocrawler->getTaskScheduleManager()->getTaskOtherInfo(urlnode->taskid);
        int taskbatch = urlnode->taskbatch;
        if (urlnode->needtologin) {
            //need to login and cookie is null
            if (!(infocrawler->getTaskScheduleManager()->getCookieFromTask(urlnode->taskid))) {
                if (taskother->fetchingcookie) {
                    infocrawler->getUrlAnalyseManager()->insertUrl(urlnode);
                    infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(urlnode->task,taskbatch);
#ifdef URLMEMCACHEDB
                    infocrawler->deleteUrlMcLocalThread();
#endif
                    continue;
                } else {
                    doLogin(curl, urlnode->task, urlnode);
                }
            }
        }

        /*if (urlnode->task->sourcetype == SOURCE_TYPE_COMPANY && urlnode->type & URL_TYPE_HOMEPAGE)
        {
            strcat(urlnode->url, "&event=32698647");
            strcpy(urlnode->refererurl, "http://search.china.alibaba.com/tools/validate_redirect.htm?ru=http%253A%252F%252Fsearch.china.alibaba.com%252Fcompany%252Fcompany_search.htm%253Fkeywords%253D%25CA%25D6%25BB%25FA%2526pageSize%253D30%2526n%253Dy%2526showStyle%253Dpopular%2526beginPage%253D4&event=32698647&n=y");
        }*/
        CUrl url;
        url.parse(urlnode->url);
        //wrong url format
        if (url.getUrl().empty()) {
            infocrawler->getTaskScheduleManager()->increaseTaskErrorUrlNum(urlnode->taskid);
            infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(urlnode->task, taskbatch);
            infocrawler->getLocalDbManager()->decidesaveFetched(urlnode);
            delete urlnode;
#ifdef URLMEMCACHEDB
            infocrawler->deleteUrlMcLocalThread();
#endif
            continue;
        }
        Page page;
        Buffer *content = create_buffer(DEFAULT_PAGE_BUF_SIZE);


        //do fetch
        HttpProtocol httpprotocol;
        char downstatistic[512] ;
        downstatistic[0] = 0;
        RESPONSE_HEADER rheader;

//        mylog_info(m_pLogGlobalCtrl->infolog, "before fetch %s %s %llu %d %d  - %s:%s:%d",url.getUrl().c_str(), urlnode->url, urlnode->id, urlnode->taskid, urlnode->errornum,INFO_LOG_SUFFIX);
        //int ret = httpprotocol.fetch(url, content, urlnode, page, infocrawler->getConf()->httptimeout,urlnode->task->tasksendtype);
//        int ret = httpprotocol.curl_fetch(curl, url, content, urlnode, infocrawler->getConf()->httptimeout, urlnode->task->tasksendtype, &rheader, downstatistic);
        int sendtype = urlnode->task->tasksendtype;
        if (urlnode->task->sourcetype == SOURCE_TYPE_COMPANY && urlnode->type & URL_TYPE_HOMEPAGE)
        {
            /*FILE * f = fopen("ali.txt", "rb");
            char line[1024] = {0};
            int i = 0;
            string cookie;
            string post;
            while(fgets(line, 1023, f)) {
            	char *newline = strtrim(line, NULL);
            	if (i++ == 0) {
            		cookie = newline;
            	} else {
            		post = newline;
            	}
            }
            fclose(f);
            */
            sendtype = REQUEST_TYPE_GET;
        }

        int ret = 0;
        if (!html_from_outer) {
            ret = httpprotocol.curl_fetch(curl, url, content, urlnode, infocrawler->getConf()->httptimeout, sendtype, &rheader, downstatistic);
            mylog_info(m_pLogGlobalCtrl->infolog, "after fetched %s %s %d - %s:%s:%d",url.getUrl().c_str(), downstatistic, ret,INFO_LOG_SUFFIX);
        } else {
            add_buffer(content, (char *)urlnode->html.c_str(), urlnode->html.length());
            ret = urlnode->html.length();
            mylog_info(m_pLogGlobalCtrl->infolog, "get url from outer %s %d - %s:%s:%d", url.getUrl().c_str(), ret,INFO_LOG_SUFFIX);
        }

        /*if (ret == HTTP_FETCH_RET_REDIRECT) { //redirect
            int redirectnum = urlnode->redirectnum +1;
            if (redirectnum <= URL_FETCH_REDIRECT_TIMES)
            {
                UrlNode *newurlnode = new UrlNode(urlnode->task,urlnode->topicsource,urlnode->title,urlnode->taskbatch,(char *)urlnode->fatherurl,(char *)page.m_sLocation.c_str(), urlnode->other, urlnode->maxtype,urlnode->type, 0, urlnode->id,redirectnum ,urlnode->page,urlnode->layerid,urlnode->bbsid,urlnode->needtologin);

                newurlnode->insertother(URLNODE_OTHER_TYPE_COOKIE,(char *)page.m_sCookie.c_str(), page.m_sCookie.length());
                errorlog("ERROR: fetched %s %s relocated to %s %llu %d\n", url.getUrl().c_str(), urlnode->url, newurlnode->url, newurlnode->id, newurlnode->taskid);
                infocrawler->getUrlAnalyseManager()->insertUrl(newurlnode);
            }else
            {
                errorlog("ERROR: redirectunm > %d fetched %s %s relocated to %s %d\n", URL_FETCH_REDIRECT_TIMES, url.getUrl().c_str(), urlnode->url, (char * )page.m_sLocation.c_str(), urlnode->taskid);
            }
            urlnode->errornum = 0;
            */
        if (ret == HTTP_FETCH_RET_ERROR) {//just discard
            urlnode->errornum++;
            mylog_error(m_pLogGlobalCtrl->errorlog, "fetched %s  - %s:%s:%d:%d", url.getUrl().c_str(),INFO_LOG_SUFFIX,urlnode->errornum);
            /*} else if (ret == HTTP_FETCH_RET_ERROR_INVALIDHOST) { //invalid host, can not access
            urlnode->errornum++;
                errorlog("ERROR: fetched %s invalidhost %d\n", url.getUrl().c_str(), urlnode->errornum);
                */
        } else if (ret == HTTP_FETCH_RET_ERROR_UNACCEPTED) { //content is invalid, discard
            urlnode->errornum = URL_FETCH_RETRY_TIMES;
            //errorlog("ERROR: fetched %s unaccepted contenttype %d %s\n", url.getUrl().c_str(), urlnode->errornum, page.m_sContentType.c_str());
        } else { //ok
            //increase fetch num
            if (urlnode->type & URL_TYPE_NEEDTOSAVE)
                infocrawler->getTaskScheduleManager()->increaseFetchNum(urlnode->task);

            urlnode->errornum = 0;
            //extract urls and analyse, insert new url into queue
            char nextpageurl[MAX_URL_LEN] ;
            nextpageurl[0] = 0;
            int nextpage = infocrawler->getUrlAnalyseManager()->analyseUrls(urlnode, &rheader, content->data, ret, nextpageurl, html_from_outer);
            if (html_from_outer) {
                nextpage = 0;
                nextpageurl[0] = 0;
            }

            //write content to disk if we need, write fetched url into dist
            if (urlnode->type & URL_TYPE_NEEDTOSAVE) {
                if (urlnode->task->sourcetype == SOURCE_TYPE_BBS)
                {
                    char oldurlnodedata[64];
                    int tasktmp = 0;
                    int pagetmp = 0;
                    ulonglong idtmp = 0;
                    if (InfoCrawler::getInstance()->getLocalDbManager()->alreadyfetched(urlnode,oldurlnodedata))
                    {
                        sscanf(oldurlnodedata, "%llu/%d/%*d/%*d/%d/%*u", &idtmp,&pagetmp, &tasktmp);
                        if (pagetmp == urlnode->page)
                        {
                            int rettmp = infocrawler->getLocalDbManager()->erasecontent(idtmp,tasktmp);
                        }
                    }
                }
                if (urlnode->nextpage == 1 && nextpage >1)
                {
                    urlnode->nextpage = nextpage;
                }
                mylock::get_instance()->get(urlnode->id);
                infocrawler->getLocalDbManager()->savecontent(urlnode, &rheader, content->data, ret, nextpage);
                mylock::get_instance()->put(urlnode->id);
                //infocrawler->getPageManager()->SavePage(content->data, ret, urlnode, &rheader);

                mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s saved content%d %d title %s urlid %llu taskid %d batchid %d - %s:%s:%d",url.getUrl().c_str(), urlnode->errornum, ret,urlnode->title,urlnode->id, urlnode->taskid, urlnode->taskbatch ,INFO_LOG_SUFFIX);
                /*if ((urlnode->nextpage > 1)&& !(urlnode->type & URL_TYPE_HOMEPAGE)) {//if have nextpage, don't not save fetched
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s have next, so don't not save fetched - %s:%s:%d",url.getUrl().c_str(),INFO_LOG_SUFFIX);
                } else {
                    infocrawler->getLocalDbManager()->saveFetched(urlnode);
                    infocrawler->getLocalDbManager()->saveUrl(urlnode, SAVE_FATHER_URL);
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s save fetched%d - %s:%s:%d",url.getUrl().c_str(),urlnode->errornum,INFO_LOG_SUFFIX);
                }*/
                if (urlnode->nowpage == urlnode->totalpage)
                {
                    infocrawler->getLocalDbManager()->saveFetched(urlnode);
                    infocrawler->getLocalDbManager()->saveUrl(urlnode, SAVE_FATHER_URL);

                    //0128.begin()
                    /* char *content=NULL;
                     char dbname[64] = "";
                     char recordname[64] = "";
                     char urldbname[64] = "";
                     DBAccess *dbaccess = DBAccess::getInstance();

                     getContentDBName1(urlnode, dbname);
                     getRecordKeyName1(urlnode, recordname);
                     getUrlDBName1(urlnode, urldbname);

                     int suffix = dbaccess->load(dbname);
                     string fileno;
                     DBD *dbd = dbaccess->get(suffix, fileno, recordname, NULL);
                     if (dbd != NULL)
                     {
                         DbHypertableManager * dbhyper=InfoCrawler::getInstance()->gethyper();
                          ICCONFIG   *ifcong_=InfoCrawler::getInstance()->getConf();

                        // dbhyper->get_now_time();
                        string now_=TimeToString1();
                        //string now_;
                        if( dbhyper->insert_data_to_hypertable_content(urlnode,dbd->datbuf,dbd->datlen_u,string("content_tbl"),string("gbk"),ret,now_) )
                        {
                              char fetchtbl[32];fetchtbl[0]=0;
                             //sprintf(fetchtbl,"fetch_%d_tbl",urlnode->taskid);
                              sprintf(fetchtbl,"fetch_%d_tbl",1);
                             if( dbhyper->insert_data_to_hypertable_fetch(urlnode,string(fetchtbl),ifcong_->spider_id,ret,now_) )
                             {
                                char memorytable[128]; memorytable[0]=0;
                                 //sprintf(memorytable,"url_%d_tbl",urlnode->taskid);
                                 sprintf(memorytable,"url_%d_tbl",1);
                                 dbhyper->insert_data_to_hypertable_memorytable(urlnode,memorytable);
                              }
                         }

                          dbd_free(dbd);
                     }*/
                    //0128.end()
                    /*char * final_content;
                    final_content=NULL;
                    final_content=get_final_content(urlnode);
                    if(final_content !=NULL)
                    {
                        insert_data_to_hypertable(urlnode->fatherurl,final_content);
                        delete []final_content;
                    }*/
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s save fetched%d nowpage %d totalpage %d - %s:%s:%d",url.getUrl().c_str(),urlnode->errornum,urlnode->nowpage,urlnode->totalpage,INFO_LOG_SUFFIX);
                } else
                {
                    mylog_info(m_pLogGlobalCtrl->infolog, "fetched %s have next, so don't not save fetched nowpage %d totalpage %d  - %s:%s:%d",url.getUrl().c_str(),urlnode->nowpage,urlnode->totalpage,INFO_LOG_SUFFIX);
                }
            }
            //insert next page
            if (nextpageurl[0]) {
                UrlNode *newnode  = new UrlNode;
                if (!(urlnode->type & URL_TYPE_HOMEPAGE))
                    newnode->id = urlnode->id;
                newnode->task = urlnode->task;
                newnode->taskid = urlnode->taskid;
                newnode->copyother(urlnode->other,urlnode->maxtype);
                newnode->type = urlnode->type;
                newnode->page = urlnode->page + 1;
                newnode->copyurl(nextpageurl);
                newnode->copyfatherurl(urlnode->fatherurl);
                newnode->layerid  =  urlnode->layerid;
                newnode->needtologin = urlnode->needtologin;
                newnode->taskbatch= urlnode->taskbatch;
                newnode->copytitle(urlnode->title);
                newnode->copytopicsource(urlnode->topicsource);
                mylog_info(m_pLogGlobalCtrl->infolog, " now url %s new url %s title %s - %s:%s:%d",urlnode->url, newnode->url,urlnode->title,INFO_LOG_SUFFIX);
                infocrawler->getUrlAnalyseManager()->insertUrl(newnode, INSERT_URL_FORCED);
            }
        }

        free_buffer(content);

        Task *task = urlnode->task;
        int taskid = urlnode->taskid;

        //if get an error, we will retry but only fixed times
        if (urlnode->errornum > 0 && urlnode->errornum < URL_FETCH_RETRY_TIMES) {
            mylog_error(m_pLogGlobalCtrl->errorlog, "fetched %s reinsert for error %d %llu - %s:%s:%d", url.getUrl().c_str(), urlnode->errornum, urlnode->id,INFO_LOG_SUFFIX);
            infocrawler->getUrlAnalyseManager()->insertUrl(urlnode, INSERT_URL_FORCED,false);
        } else if (urlnode->errornum >= URL_FETCH_RETRY_TIMES) {
            infocrawler->getTaskScheduleManager()->increaseTaskErrorUrlNum(taskid);
            //write error url to DB
//            infocrawler->getDbManager()->WriteFetchError(url.getUrl().c_str(),taskid,taskbatch);
            mylog_error(m_pLogGlobalCtrl->errorlog, "fetched finished and download url %s urlnodeid %llu taskid %d - %s:%s:%d:%d", url.getUrl().c_str(),  urlnode->id, taskid,INFO_LOG_SUFFIX,urlnode->errornum);
            infocrawler->getLocalDbManager()->decidesaveFetched(urlnode);

            if (urlnode->type & URL_TYPE_NEEDTOSAVE)
            {
                infocrawler->getLocalDbManager()->saveUrl(urlnode);
            }
            delete urlnode;
        } else {
            mylog_info(m_pLogGlobalCtrl->infolog, "fetched finished %s error %d %llu fatherurl %s - %s:%s:%d",url.getUrl().c_str(), urlnode->errornum, urlnode->id, urlnode->fatherurl,INFO_LOG_SUFFIX);
            delete urlnode;
        }

        infocrawler->getTaskScheduleManager()->decreaseTaskUrlNum(task, taskbatch);

#ifdef URLMEMCACHEDB
        infocrawler->deleteUrlMcLocalThread();
#endif
    }
    curl_easy_cleanup(curl);
    mylog_info(m_pLogGlobalCtrl->infolog, "FetcherManager ISRUNNING false - %s:%s:%d",INFO_LOG_SUFFIX);
}