示例#1
0
void * do_crawler(void *item)
{
	char *url_ptr;
	int clientfd ;
	static int pages = 0;
	static int error = 0;
	char *buf = NULL;
	urlq_t *url_list_head = NULL, *p, *p_pre;
	char cur_dir[256];
	hash_table *hash_in, *hash_out;
/*	int tid = pthread_self();*/
	int j = 0;
	int i= 0;
	int pos_found = 0;
	char temp[256];
	int status = 0;
	static int a = 0;
	static int b = 0;
	static int c = 0;
	static int d = 0;
	static int e = 0;
	static int f = 0;
	static int g = 0;
	while(1){	
		pthread_mutex_lock(&mutex);
		while (urlqueue.head_ptr->next == NULL){
			pthread_cond_wait(&ready, &mutex);
		}

		pthread_mutex_lock(&((threadpool_item *)item)->waitlock);
		((threadpool_item *)item)->idle = 0;
		pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
		url_ptr = queue_pop(&urlqueue);
		g++;
		pthread_mutex_unlock(&mutex);
		

		pthread_mutex_lock(&mutex);
		/*if not visited, set flag = 1*/
		if(has_visited(hash, url_ptr) == 1){
			pthread_mutex_unlock(&mutex);
			free(url_ptr); 
			pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
			((threadpool_item *)item)->idle =1;
			pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);
			d++;	
			continue;	
		}
		hash_out = has_url(hash, url_ptr);
		e++;
		pthread_mutex_unlock(&mutex);
		if (hash_out == NULL){
			printf("error\n");
			getchar();
		}
		*temp = '\0';
		cur_dir[0] = '\0';
		strcpy(cur_dir, url_ptr);
		j = strlen(cur_dir);
		for (;cur_dir[j] != '/' && j != 0; j--) ;
		if(j == 0)
			cur_dir[j] = '\0';
		else
			cur_dir[j+1] = '\0';

		for (i = 0; i < 3; i++){
			if((clientfd = open_tcp("127.0.0.1", 80)) < 0){
				close_fd(clientfd);
				continue;
			}

			if( http_do_get(clientfd, rootdir, "127.0.0.1", url_ptr) < 0){
				close_fd(clientfd);
				continue;
			}

			if(recv_line(clientfd, temp) <= 0){
				close_fd(clientfd);
				continue;
			}
			if((status = http_response_status(temp))  == 4){
				printf("%s error %d\n",url_ptr, error++);
				pthread_mutex_lock(&mutex);
				set_status(hash, url_ptr, 4);
				set_webg_status(webg, hash_out, 4);
				pthread_mutex_unlock(&mutex);
			
				pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
				((threadpool_item *)item)->idle =1;
				pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
				close_fd(clientfd);
				break;	
			}
			buf = http_response_body(clientfd);
			close_fd(clientfd);
			break;
		}
		if (status == 4)
			continue;
		if(i == 3){
			pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
			((threadpool_item *)item)->idle =1;
			pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
			close_fd(clientfd);
			continue;	
		}
		if (buf == NULL){
			pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
			((threadpool_item *)item)->idle =1;
			pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
			continue;
		}
		printf("%s pages %d\n", url_ptr,pages++);
		extract_link(buf, cur_dir, &url_list_head);
		free(buf);
		buf = NULL;	
		p = url_list_head->next;
		p_pre = url_list_head;
		while (p != NULL){
			
			if(strcmp(url_ptr, p->url_ptr) == 0){
				p_pre->next = p->next;
				free(p->url_ptr);
				free(p);
				a++;
				p = p_pre->next;
				printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g);
				continue;	
			}			
			pthread_mutex_lock(&mutex);
			hash_in = has_url(hash, p->url_ptr);	
			if (hash_in != NULL ){
				insert_edge(webg, hash_in, hash_out);
				pthread_mutex_unlock(&mutex);
				p_pre->next = p->next;
				free(p->url_ptr);
				free(p);
				p = p_pre->next;
				b++;
				printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g);
				continue;
			}
			else{
				pos_found = insert_vertex(webg, hash_out, p->url_ptr);
				insert_hash_item(hash, p->url_ptr, pos_found, 0);	
				pthread_mutex_unlock(&mutex);
				c++;
				p_pre = p;
				p = p->next;
				printf("a= %d, b= %d, c= %d, d= %d, e= %d, f= %d, g= %d\n", a,b,c,d,e,f,g);
			}
		}
		
		if(p_pre != url_list_head){
			pthread_mutex_lock(&mutex);
			queue_push(&urlqueue, url_list_head->next, p_pre);
			f++;
			pthread_mutex_unlock(&mutex);
		}
		free(url_list_head);
		p = p_pre = url_list_head = NULL;
	
		pthread_mutex_lock(&((threadpool_item *)item)->waitlock);	
		((threadpool_item *)item)->idle = 1;
		pthread_mutex_unlock(&((threadpool_item *)item)->waitlock);	
/*printf("next time!\n");*/
	}
/*printf("over!\n");*/
	return NULL;	
}
void insert_register(struct declared_register *reg)
{
    insert_hash_item(declared_register_table, reg->name, reg);
}