void * do_crawler(void *item) { char *url_ptr; int clientfd ; static int pages = 0; static int error = 0; char *buf = NULL; urlq_t *url_list_head = NULL, *p, *p_pre; char cur_dir[256]; hash_table *hash_in, *hash_out; /* int tid = pthread_self();*/ int j = 0; int i= 0; int pos_found = 0; char temp[256]; int status = 0; static int a = 0; static int b = 0; static int c = 0; static int d = 0; static int e = 0; static int f = 0; static int g = 0; while(1){ pthread_mutex_lock(&mutex); while (urlqueue.head_ptr->next == NULL){ pthread_cond_wait(&ready, &mutex); } pthread_mutex_lock(&((threadpool_item *)item)->waitlock); ((threadpool_item *)item)->idle = 0; pthread_mutex_unlock(&((threadpool_item *)item)->waitlock); url_ptr = queue_pop(&urlqueue); g++; pthread_mutex_unlock(&mutex); pthread_mutex_lock(&mutex); /*if not visited, set flag = 1*/ if(has_visited(hash, url_ptr) == 1){ pthread_mutex_unlock(&mutex); free(url_ptr); pthread_mutex_lock(&((threadpool_item *)item)->waitlock); ((threadpool_item *)item)->idle =1; pthread_mutex_unlock(&((threadpool_item *)item)->waitlock); d++; continue; } hash_out = has_url(hash, url_ptr); e++; pthread_mutex_unlock(&mutex); if (hash_out == NULL){ printf("error\n"); getchar(); } *temp = '\0'; cur_dir[0] = '\0'; strcpy(cur_dir, url_ptr); j = strlen(cur_dir); for (;cur_dir[j] != '/' && j != 0; j--) ; if(j == 0) cur_dir[j] = '\0'; else cur_dir[j+1] = '\0'; for (i = 0; i < 3; i++){ if((clientfd = open_tcp("127.0.0.1", 80)) < 0){ close_fd(clientfd); continue; } if( http_do_get(clientfd, rootdir, "127.0.0.1", url_ptr) < 0){ close_fd(clientfd); continue; } if(recv_line(clientfd, temp) <= 0){ close_fd(clientfd); continue; } if((status = http_response_status(temp)) == 4){ printf("%s error %d\n",url_ptr, error++); pthread_mutex_lock(&mutex); set_status(hash, url_ptr, 4); set_webg_status(webg, hash_out, 4); pthread_mutex_unlock(&mutex); pthread_mutex_lock(&((threadpool_item *)item)->waitlock); ((threadpool_item *)item)->idle =1; 
pthread_mutex_unlock(&((threadpool_item *)item)->waitlock); close_fd(clientfd); break; } buf = http_response_body(clientfd); close_fd(clientfd); break; } if (status == 4) continue; if(i == 3){ pthread_mutex_lock(&((threadpool_item *)item)->waitlock); ((threadpool_item *)item)->idle =1; pthread_mutex_unlock(&((threadpool_item *)item)->waitlock); close_fd(clientfd); continue; } if (buf == NULL){ pthread_mutex_lock(&((threadpool_item *)item)->waitlock); ((threadpool_item *)item)->idle =1; pthread_mutex_unlock(&((threadpool_item *)item)->waitlock); continue; } printf("%s pages %d\n", url_ptr,pages++); extract_link(buf, cur_dir, &url_list_head); free(buf); buf = NULL; p = url_list_head->next; p_pre = url_list_head; while (p != NULL){ if(strcmp(url_ptr, p->url_ptr) == 0){ p_pre->next = p->next; free(p->url_ptr); free(p); a++; p = p_pre->next; printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g); continue; } pthread_mutex_lock(&mutex); hash_in = has_url(hash, p->url_ptr); if (hash_in != NULL ){ insert_edge(webg, hash_in, hash_out); pthread_mutex_unlock(&mutex); p_pre->next = p->next; free(p->url_ptr); free(p); p = p_pre->next; b++; printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g); continue; } else{ pos_found = insert_vertex(webg, hash_out, p->url_ptr); insert_hash_item(hash, p->url_ptr, pos_found, 0); pthread_mutex_unlock(&mutex); c++; p_pre = p; p = p->next; printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g); } } if(p_pre != url_list_head){ pthread_mutex_lock(&mutex); queue_push(&urlqueue, url_list_head->next, p_pre); f++; pthread_mutex_unlock(&mutex); } free(url_list_head); p = p_pre = url_list_head = NULL; pthread_mutex_lock(&((threadpool_item *)item)->waitlock); ((threadpool_item *)item)->idle = 1; pthread_mutex_unlock(&((threadpool_item *)item)->waitlock); /*printf("next time!\n");*/ } /*printf("over!\n");*/ return NULL; }
/*
 * Fetch `url` with the crawler's HTTP client and append the absolute links
 * found in the response body to `link_list`.
 *
 * crawler    crawler whose http_client is used for the request
 * url        URL to fetch; parsed with http_url_parse_s()
 * link_list  list that extract_absolute_link_s() fills with the links
 *
 * Returns one of:
 *   CRAWLER_OK        status HTTP_OK and a non-NULL body; links extracted
 *   CRAWLER_NULL      status HTTP_OK but the body pointer was NULL
 *   CRAWLER_NONEED    response status other than HTTP_OK
 *   CRAWLER_OVERFLOW  http_do_get() returned RESPONSE_OVERFLOW
 *   CRAWLER_BREAK     http_do_get() returned RESPONSE_FAILED
 *   CRAWLER_TIMEOUT   http_do_get() returned RESPONSE_TIMEOUT
 *   CRAWLER_FAILED    http_do_get() returned REQUEST_FAILED
 *   CRAWLER_UNREACH   http_connect() did not return CONNECT_OK
 *   CRAWLER_UNKNOWN   URL not recognized, request path does not fit the
 *                     local buffer, or unexpected http_do_get() result
 *
 * The connection is closed with http_disconnect() on every path that
 * connected successfully.
 */
int crawler_crawl(link_crawler_t *crawler, char *url, list_t *link_list)
{
    http_url_t http_url;
    char path[4096] = {""};
    const char *page = NULL;
    int status = CRAWLER_NULL;
    int port;
    int written;
    int ecode;
    int response_status;

    if (http_url_parse_s(&http_url, url) != URL_RECOGNIZED) {
        fprintf(stderr, "Unrecognize url: %s\n", url);
        return CRAWLER_UNKNOWN;
    }

    /* Default to port 80 when the URL carries no explicit port.
     * NOTE(review): atoi() reports no errors; this relies on the parser
     * having validated the port text. */
    port = strlen(http_url.port) != 0 ? atoi(http_url.port) : 80;

    /* Request target is "path" or "path?search"; bounded so an oversized
     * URL cannot overrun the local buffer. */
    if (strlen(http_url.search) == 0) {
        written = snprintf(path, sizeof path, "%s", http_url.path);
    } else {
        written = snprintf(path, sizeof path, "%s?%s", http_url.path, http_url.search);
    }
    if (written < 0 || (size_t)written >= sizeof path) {
        fprintf(stderr, "Request path too long: %s\n", url);
        return CRAWLER_UNKNOWN;
    }

    if (http_connect(&crawler->http_client, http_url.host, port) != CONNECT_OK) {
        fprintf(stderr, "Http connect %s:%d failed.\n", http_url.host, port);
        return CRAWLER_UNREACH;
    }

    printf("%s\n", url);
    ecode = http_do_get(&crawler->http_client, path);
    switch (ecode) {
    case RESPONSE_OK:
        response_status = http_response_status(&crawler->http_client);
        if (response_status != HTTP_OK) {
            fprintf(stderr, "Request %s:%d%s failed, Status code: %d.\n", crawler->http_client.connection.host, crawler->http_client.connection.port, path, response_status);
            status = CRAWLER_NONEED;
            break;
        }
        /* get entity_body pointer */
        page = http_response_body(&crawler->http_client);
        if (page != NULL) {
            /* extract links from the buffer and save them into the list */
            extract_absolute_link_s(page, link_list, url);
            status = CRAWLER_OK;
        } else {
            status = CRAWLER_NULL;
            fprintf(stderr, "Request %s:%d%s failed, response body is null.\n", crawler->http_client.connection.host, crawler->http_client.connection.port, path);
        }
        break;
    case RESPONSE_OVERFLOW:
        fprintf(stderr, "Request %s:%d%s do_get receive overflow.\n", crawler->http_client.connection.host, crawler->http_client.connection.port, path);
        status = CRAWLER_OVERFLOW;
        break;
    case RESPONSE_FAILED:
        fprintf(stderr, "Request %s:%d%s do_get receive break.\n", crawler->http_client.connection.host, crawler->http_client.connection.port, path);
        status = CRAWLER_BREAK;
        break;
    case RESPONSE_TIMEOUT:
        fprintf(stderr, "Request %s:%d%s do_get receive timeout.\n", crawler->http_client.connection.host, crawler->http_client.connection.port, path);
        status = CRAWLER_TIMEOUT;
        break;
    case REQUEST_FAILED:
        fprintf(stderr, "Request %s:%d%s do_get request failed.\n", http_url.host, port, path);
        status = CRAWLER_FAILED;
        break;
    default:
        fprintf(stderr, "Unknown ecode %d.\n", ecode);
        status = CRAWLER_UNKNOWN;
        break;
    }

    http_disconnect(&crawler->http_client);
    return status;
}
/*
 * Manual smoke test for the HTTP client: connects to a fixed host, issues
 * one GET, and prints the response status, the body pointer and the body.
 *
 * Command-line arguments are not used.  Always returns 0.
 *
 * Other paths previously tried against 10.205.42.139:
 *   /techqq/a/20090423/000378.htm   /techqq/a/20120111/000508.htm
 *   /techqq/a/20121008/000013.htm   /techqq/a/20121008/000048.htm
 *   /techqq/wlyx.htm                /techqq/index.html
 * Other hosts previously tried: ufp.umeng.com (/login),
 *   blog.umeng.com (/index.php/category/products/).
 *
 * NOTE(review): the results of http_connect() and http_do_get() are not
 * checked here; status and body are printed regardless.
 */
int main(int argc, char **argv)
{
    char *host = "125.211.218.8";
    char *path = "/techqq/zt/2007/firacn/topic_html/xsm.htm";
    const char *body_ptr;
    int status;
    http_client_t http_client1;

    (void)argc;
    (void)argv;

    http_connect(&http_client1, host, 80);
    http_do_get(&http_client1, path);
    status = http_response_status(&http_client1);
    body_ptr = http_response_body(&http_client1);

    printf("---------------------Status-----------------------\n");
    printf("%d\n", status);
    printf("---------------------Body-----------------------\n");
    /* %p requires a pointer to void. */
    printf("%p\n", (const void *)body_ptr);
    /* The body may be NULL; passing NULL for %s is undefined behavior. */
    printf("%s\n", body_ptr != NULL ? body_ptr : "(null)");

    http_disconnect(&http_client1);
    return 0;
}