Esempio n. 1
0
void * do_crawler(void *item)
{
	char *url_ptr;
	int clientfd ;
	static int pages = 0;
	static int error = 0;
	char *buf = NULL;
	urlq_t *url_list_head = NULL, *p, *p_pre;
	char cur_dir[256];
	hash_table *hash_in, *hash_out;
/*	int tid = pthread_self();*/
	int j = 0;
	int i= 0;
	int pos_found = 0;
	char temp[256];
	int status = 0;
	static int a = 0;
	static int b = 0;
	static int c = 0;
	static int d = 0;
	static int e = 0;
	static int f = 0;
	static int g = 0;
	while(1){	
		pthread_mutex_lock(&mutex);
		while (urlqueue.head_ptr->next == NULL){
			pthread_cond_wait(&ready, &mutex);
		}

		pthread_mutex_lock(&((threadpool_item *)item)->waitlock);
		((threadpool_item *)item)->idle = 0;
		pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
		url_ptr = queue_pop(&urlqueue);
		g++;
		pthread_mutex_unlock(&mutex);
		

		pthread_mutex_lock(&mutex);
		/*if not visited, set flag = 1*/
		if(has_visited(hash, url_ptr) == 1){
			pthread_mutex_unlock(&mutex);
			free(url_ptr); 
			pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
			((threadpool_item *)item)->idle =1;
			pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);
			d++;	
			continue;	
		}
		hash_out = has_url(hash, url_ptr);
		e++;
		pthread_mutex_unlock(&mutex);
		if (hash_out == NULL){
			printf("error\n");
			getchar();
		}
		*temp = '\0';
		cur_dir[0] = '\0';
		strcpy(cur_dir, url_ptr);
		j = strlen(cur_dir);
		for (;cur_dir[j] != '/' && j != 0; j--) ;
		if(j == 0)
			cur_dir[j] = '\0';
		else
			cur_dir[j+1] = '\0';

		for (i = 0; i < 3; i++){
			if((clientfd = open_tcp("127.0.0.1", 80)) < 0){
				close_fd(clientfd);
				continue;
			}

			if( http_do_get(clientfd, rootdir, "127.0.0.1", url_ptr) < 0){
				close_fd(clientfd);
				continue;
			}

			if(recv_line(clientfd, temp) <= 0){
				close_fd(clientfd);
				continue;
			}
			if((status = http_response_status(temp))  == 4){
				printf("%s error %d\n",url_ptr, error++);
				pthread_mutex_lock(&mutex);
				set_status(hash, url_ptr, 4);
				set_webg_status(webg, hash_out, 4);
				pthread_mutex_unlock(&mutex);
			
				pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
				((threadpool_item *)item)->idle =1;
				pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
				close_fd(clientfd);
				break;	
			}
			buf = http_response_body(clientfd);
			close_fd(clientfd);
			break;
		}
		if (status == 4)
			continue;
		if(i == 3){
			pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
			((threadpool_item *)item)->idle =1;
			pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
			close_fd(clientfd);
			continue;	
		}
		if (buf == NULL){
			pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
			((threadpool_item *)item)->idle =1;
			pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
			continue;
		}
		printf("%s pages %d\n", url_ptr,pages++);
		extract_link(buf, cur_dir, &url_list_head);
		free(buf);
		buf = NULL;	
		p = url_list_head->next;
		p_pre = url_list_head;
		while (p != NULL){
			
			if(strcmp(url_ptr, p->url_ptr) == 0){
				p_pre->next = p->next;
				free(p->url_ptr);
				free(p);
				a++;
				p = p_pre->next;
				printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g);
				continue;	
			}			
			pthread_mutex_lock(&mutex);
			hash_in = has_url(hash, p->url_ptr);	
			if (hash_in != NULL ){
				insert_edge(webg, hash_in, hash_out);
				pthread_mutex_unlock(&mutex);
				p_pre->next = p->next;
				free(p->url_ptr);
				free(p);
				p = p_pre->next;
				b++;
				printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g);
				continue;
			}
			else{
				pos_found = insert_vertex(webg, hash_out, p->url_ptr);
				insert_hash_item(hash, p->url_ptr, pos_found, 0);	
				pthread_mutex_unlock(&mutex);
				c++;
				p_pre = p;
				p = p->next;
				printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g);
			}
		}
		
		if(p_pre != url_list_head){
			pthread_mutex_lock(&mutex);
			queue_push(&urlqueue, url_list_head->next, p_pre);
			f++;
			pthread_mutex_unlock(&mutex);
		}
		free(url_list_head);
		p = p_pre = url_list_head = NULL;
	
		pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
		((threadpool_item *)item)->idle = 1;
		pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
/*printf("next time!\n");*/
	}
/*printf("over!\n");*/
	return NULL;	
}
Esempio n. 2
0
/*
 * Fetch `url` over HTTP and collect the links found in the response body.
 *
 * crawler:   supplies the http_client used for the connection.
 * url:       URL to fetch; also passed to the link extractor together with
 *            the page.
 * link_list: list handed to extract_absolute_link_s() to receive the links.
 *
 * Returns a CRAWLER_* status:
 *   CRAWLER_OK       body received with HTTP_OK and links extracted
 *   CRAWLER_NULL     HTTP_OK but the response body pointer was NULL
 *   CRAWLER_NONEED   response received with a status other than HTTP_OK
 *   CRAWLER_OVERFLOW / CRAWLER_BREAK / CRAWLER_TIMEOUT / CRAWLER_FAILED
 *                    the matching http_do_get() error code
 *   CRAWLER_UNREACH  http_connect() did not return CONNECT_OK
 *   CRAWLER_UNKNOWN  URL not recognized, request path too long for the local
 *                    buffer, or an unexpected http_do_get() code
 */
int crawler_crawl(link_crawler_t *crawler, char *url, list_t *link_list)
{
    http_url_t http_url;
    char path[4096] = {""};
    int status = CRAWLER_NULL;
    int port, path_len, ecode, response_status;
    const char *page = NULL;

    if(http_url_parse_s(&http_url, url) != URL_RECOGNIZED) {
        fprintf(stderr, "Unrecognize url: %s\n", url);
        return CRAWLER_UNKNOWN;
    }

    /* Default to port 80 when the URL carries no explicit port. */
    port = strlen(http_url.port) != 0 ? atoi(http_url.port) : 80;

    /* Request target is "path" or "path?search". The write is bounded and
     * truncation is treated as an unusable URL rather than sent shortened. */
    if(strlen(http_url.search) == 0) {
        path_len = snprintf(path, sizeof path, "%s", http_url.path);
    } else {
        path_len = snprintf(path, sizeof path, "%s?%s", http_url.path, http_url.search);
    }
    if(path_len < 0 || (size_t)path_len >= sizeof path) {
        fprintf(stderr, "Request path too long for url: %s\n", url);
        return CRAWLER_UNKNOWN;
    }

    if(http_connect(&crawler->http_client, http_url.host, port) != CONNECT_OK) {
        fprintf(stderr, "Http connect %s:%d failed.\n", http_url.host, port);
        return CRAWLER_UNREACH;
    }

    printf("%s\n", url);
    ecode = http_do_get(&crawler->http_client, path);
    switch(ecode) {
    case RESPONSE_OK:
        response_status = http_response_status(&crawler->http_client);
        if(response_status != HTTP_OK) {
            fprintf(stderr, "Request %s:%d%s failed, Status code: %d.\n",
                    crawler->http_client.connection.host,
                    crawler->http_client.connection.port, path, response_status);
            status = CRAWLER_NONEED;
            break;
        }
        /* get entity_body pointer */
        page = http_response_body(&crawler->http_client);
        if(page == NULL) {
            status = CRAWLER_NULL;
            fprintf(stderr, "Request %s:%d%s failed, response body is null.\n",
                    crawler->http_client.connection.host,
                    crawler->http_client.connection.port, path);
            break;
        }
        /* extract links from the buffer and save them into the list */
        extract_absolute_link_s(page, link_list, url);
        status = CRAWLER_OK;
        break;
    case RESPONSE_OVERFLOW:
        fprintf(stderr, "Request %s:%d%s do_get receive overflow.\n",
                crawler->http_client.connection.host,
                crawler->http_client.connection.port, path);
        status = CRAWLER_OVERFLOW;
        break;
    case RESPONSE_FAILED:
        fprintf(stderr, "Request %s:%d%s do_get receive break.\n",
                crawler->http_client.connection.host,
                crawler->http_client.connection.port, path);
        status = CRAWLER_BREAK;
        break;
    case RESPONSE_TIMEOUT:
        fprintf(stderr, "Request %s:%d%s do_get receive timeout.\n",
                crawler->http_client.connection.host,
                crawler->http_client.connection.port, path);
        status = CRAWLER_TIMEOUT;
        break;
    case REQUEST_FAILED:
        fprintf(stderr, "Request %s:%d%s do_get request failed.\n",
                http_url.host, port, path);
        status = CRAWLER_FAILED;
        break;
    default:
        fprintf(stderr, "Unknown ecode %d.\n", ecode);
        status = CRAWLER_UNKNOWN;
        break;
    }

    /* The connection was opened above, so it is closed on every outcome. */
    http_disconnect(&crawler->http_client);
    return status;
}
/*
 * Manual smoke test for the HTTP client: connects to a fixed host on port 80,
 * issues one GET for a fixed path and prints the response status, the address
 * of the body buffer and the body text.
 *
 * The command-line arguments are not used. The results of http_connect() and
 * http_do_get() are not inspected; a failed request shows up as whatever
 * status/body the client reports afterwards.
 */
int main(int argc, char **argv)
{
    char *host = "125.211.218.8";
    char *path = "/techqq/zt/2007/firacn/topic_html/xsm.htm";
    const char *body_ptr;
    int status;
    http_client_t http_client1;

    (void)argc;
    (void)argv;

    http_connect(&http_client1, host, 80);
    http_do_get(&http_client1, path);
    status = http_response_status(&http_client1);
    body_ptr = http_response_body(&http_client1);

    printf("---------------------Status-----------------------\n");
    printf("%d\n", status);
    printf("---------------------Body-----------------------\n");
    printf("%p\n", (const void *)body_ptr);
    /* Passing NULL for %s is undefined behaviour, so print a placeholder
     * when the client has no body to offer. */
    printf("%s\n", body_ptr != NULL ? body_ptr : "(null)");

    http_disconnect(&http_client1);
    return 0;
}