/* * 返回最后找到的链接的下一个下标,如果没找到返回 0; */ int extract_url(regex_t *re, char *str, Url *ourl) { const size_t nmatch = 2; regmatch_t matchptr[nmatch]; int len; char *p = str; while (regexec(re, p, nmatch, matchptr, 0) != REG_NOMATCH) { len = (matchptr[1].rm_eo - matchptr[1].rm_so); p = p + matchptr[1].rm_so; char *tmp = (char *)calloc(len+1, 1); strncpy(tmp, p, len); tmp[len] = '\0'; p = p + len + (matchptr[0].rm_eo - matchptr[1].rm_eo); /* exclude binary file */ if (is_bin_url(tmp)) { free(tmp); continue; } char *url = attach_domain(tmp, ourl->domain); if (url != NULL) { SPIDER_LOG(SPIDER_LEVEL_DEBUG, "I find a url: %s", url); Surl * surl = (Surl *)malloc(sizeof(Surl)); surl->level = ourl->level + 1; surl->type = TYPE_HTML; /* normalize url */ if ((surl->url = url_normalized(url)) == NULL) { SPIDER_LOG(SPIDER_LEVEL_WARN, "Normalize url fail"); free(surl); continue; } if (iscrawled(surl->url)) { /* if is crawled */ SPIDER_LOG(SPIDER_LEVEL_DEBUG, "I seen this url: %s", surl->url); free(surl->url); free(surl); continue; } else { push_surlqueue(surl); } } } return (p-str); }
static void dns_callback(int result, char type, int count, int ttl, void *addresses, void *arg) { Url * ourl = (Url *)arg; struct in_addr *addrs = (in_addr *)addresses; if (result != DNS_ERR_NONE || count == 0) { SPIDER_LOG(SPIDER_LEVEL_WARN, "Dns resolve fail: %s", ourl->domain); } else { char * ip = inet_ntoa(addrs[0]); SPIDER_LOG(SPIDER_LEVEL_DEBUG, "Dns resolve OK: %s -> %s", ourl->domain, ip); host_ip_map[ourl->domain] = strdup(ip); ourl->ip = strdup(ip); push_ourlqueue(ourl); } event_loopexit(NULL); // not safe for multithreads }
int main(int argc,char *argv[]) { /*实例化一个调度模块*/ SpiderProgramScheduler spiderProSch = new SpiderProgramScheduler(); //3、初始化环境 /* 3.1、通过参数控制是否以守护进程模式运行 3.2、提供帮助信息 3.3、读取配置文件,提取参数 3.4、载入程序动态库 */ if (spiderProSch.init(argc,argv) == 0) { //用SPIDER_LOG打印日志 SPIDER_LOG(SPIDER_LEVEL_ERROR,"app init is failed ,app has exit!"); return -1; } //4、开始程序主流程(控制模块) /*开始主业务流程*/ /* 4.1、将URL种子交给URL管理器。 4.2、分析种子,得到URL IP地址(DNS解析) 4.3、根据URL获取第一个页面资源 4.4、对页面进行处理 4.4.1 解析页面取得URL 4.4.2、生成URL列表,交给URL管理器 4.4.3、对页面进行持久化操作 4.5、 4.6、通过调用epoll框架产生新任务(先检测是否到达最大任务数,功能封装在独立的函数,循环添加,直到epoll监控的最大值) 4.6.1、从URL管理器中取出一个URL 4.6.2、利用URL的ip地址请求资源,将返回的socket句柄加入epoll监控树 4.7、通过epoll_wait()监控事件,触发则,创建线程处理接收的数据(对页面进行处理4.4) 一直在这个地方循环,直到url处理队列已经清空 */ if (spiderProSch.run() == 0) { SPIDER_LOG(SPIDER_LEVEL_ERROR,"main app has exception ,app has exit!"); return -1; } return 0; }
/*
 * Recursively print the base-2 digits of `num` to stdout, most significant
 * digit first, logging each recursion frame at debug level.
 *
 * Prints nothing when num == 0. For negative input, `num % 2` yields 0 or -1,
 * so the output contains "-1" digits; callers should pass non-negative values.
 */
void dec2(int num)
{
    int res;

    if (num == 0)
        return;

    res = num % 2;
    num /= 2;

    /* Recurse first so higher-order digits are printed before this one. */
    dec2(num);

    /*
     * %p requires a void * argument, hence the cast.
     * Assumes SPIDER_LOG forwards its arguments printf-style - TODO confirm.
     */
    SPIDER_LOG(SPIDER_LEVEL_DEBUG,"res:%p,%d,num=%d\n",(void *)&res,res,num);
    printf("%d",res);
}
/*
 * Append `url` to surl_queue if it is non-NULL and passes surl_precheck().
 *
 * The push happens while holding sq_lock. sq_cond is signalled only when the
 * push leaves exactly one element in the queue. Rejected or NULL entries are
 * ignored; nothing is freed here.
 */
void push_surlqueue(Surl *url)
{
    if (url == NULL)
        return;

    if (!surl_precheck(url))
        return;

    SPIDER_LOG(SPIDER_LEVEL_DEBUG, "I want this url: %s", url->url);

    pthread_mutex_lock(&sq_lock);

    surl_queue.push(url);

    /* Signal only when the queue now holds exactly one element. */
    const int became_non_empty = (surl_queue.size() == 1);
    if (became_non_empty) {
        pthread_cond_signal(&sq_cond);
    }

    pthread_mutex_unlock(&sq_lock);
}
/*
 * Bookkeeping run when a worker thread finishes.
 *
 * While holding gctn_lock: decrement g_cur_thread_num, work out how many
 * slots are free relative to g_conf->max_job_num, and call
 * attach_epoll_task() once if exactly one slot is free or twice if more than
 * one is free. Then log the new thread count.
 */
void end_thread()
{
    pthread_mutex_lock(&gctn_lock);

    --g_cur_thread_num;
    const int free_slots = g_conf->max_job_num - g_cur_thread_num;

    /* At most two tasks are attached per call; none if no slot is free. */
    int tasks_to_attach = 0;
    if (free_slots == 1) {
        tasks_to_attach = 1;
    } else if (free_slots > 1) {
        tasks_to_attach = 2;
    }

    for (int i = 0; i < tasks_to_attach; ++i) {
        attach_epoll_task();
    }

    SPIDER_LOG(SPIDER_LEVEL_DEBUG, "End Thread %lu, cur_thread_num=%d", pthread_self(), g_cur_thread_num);

    pthread_mutex_unlock(&gctn_lock);
}
/*
 * Log, at debug level, that the calling thread has started.
 */
void begin_thread()
{
    const pthread_t self = pthread_self();

    SPIDER_LOG(SPIDER_LEVEL_DEBUG, "Begin Thread %lu", self);
}