/*
 * Parser worker thread entry point.
 *
 * arg points to a struct input_args shared with the other workers.  The
 * thread repeatedly takes a page from page_queue and scans its content for
 * "link:" tags.  A tag is honoured only when it sits at the very start of the
 * content or directly after a space or newline; the URL is the text that
 * follows the tag up to the next space, newline or end of content.
 *
 * For each URL found, a duplicate is made, the edge callback is invoked with
 * (page URL, new URL) and the duplicate is pushed onto url_queue -- the
 * parser does not free it afterwards.
 *
 * The loop ends when unbounded_buffer_get() returns NULL.  Always returns NULL.
 */
void *parser(void *arg)
{
    static const char tag[] = "link:";
    const size_t tag_len = sizeof tag - 1;
    struct input_args *ctx = (struct input_args *)arg;

    for (;;) {
        struct page *pg = (struct page *)unbounded_buffer_get(ctx->page_queue);
        if (pg == NULL) {
            break;
        }

        char *cursor = pg->content;
        while ((cursor = strstr(cursor, tag)) != NULL) {
            /* Reject a tag glued to preceding text (e.g. "xlink:"). */
            if (cursor > pg->content) {
                char prev = cursor[-1];
                if (prev != ' ' && prev != '\n') {
                    cursor += tag_len;
                    continue;
                }
            }

            /* Find the first delimiter after the tag. */
            char *target = cursor + tag_len;
            char *stop = target;
            while (*stop != ' ' && *stop != '\n' && *stop != '\0') {
                stop++;
            }

            /*
             * Temporarily terminate the URL in place so it can be duplicated
             * as a C string; nothing needs patching when the URL already
             * runs to the end of the content.
             */
            char saved = *stop;
            if (saved != '\0') {
                *stop = '\0';
            }

            char *url = str_duplicate(target);
            ctx->edge(pg->url, url);
            bounded_buffer_put(ctx->url_queue, (void *)url);

            if (saved == '\0') {
                break; /* reached end of content: no more links possible */
            }
            *stop = saved;
            cursor = stop + 1;
        }

        /* The page and both of its strings are released here. */
        mem_free(pg->url);
        mem_free(pg->content);
        mem_free(pg);

        /*
         * Report this page as finished, then wake whoever waits on done_cond
         * (crawl() re-checks for global quiescence when signalled).
         * NOTE(review): unbounded_buffer_done() presumably decrements the
         * queue's in-flight worker count -- confirm against its definition.
         */
        unbounded_buffer_done(ctx->page_queue);
        mutex_lock(ctx->done_mutex);
        cond_signal(ctx->done_cond);
        mutex_unlock(ctx->done_mutex);
    }

    return NULL;
}
void* fill_buffer(void* arg){ params* p; buffer* buf; char temp[BUFFER_LENGTH]; FILE* file; p=(params*)arg; buf=p->buf; file = open_file(p->dictionary); /* Open file */ do{ if(sem_wait(p->empty)==-1) perror("error on sem empty"); if(buf->nb_elem < buf->size){ if(get_next(file, temp)<0){ p->found = true; printf("Password not found\n"); } bounded_buffer_put(buf, temp); } sem_post(p->full); }while(temp!=NULL && !p->found); close_file(file); pthread_exit(NULL); }
int crawl(char *start_url, int download_workers, int parse_workers, int queue_size, char *(*_fetch_fn)(char *url), void (*_edge_fn)(char *from, char *to)) { int i; bounded_buffer_t url_queue; unbounded_buffer_t page_queue; hashset_t url_set; bounded_buffer_init(&url_queue, queue_size); unbounded_buffer_init(&page_queue); hashset_init(&url_set, HASHSET_BUCKETS); bounded_buffer_put(&url_queue, (void *)str_duplicate(start_url)); mutex_t done_mutex; cond_t done_cond; mutex_init(&done_mutex); cond_init(&done_cond); struct input_args in_args; in_args.url_queue = &url_queue; in_args.page_queue = &page_queue; in_args.url_set = &url_set; in_args.fetch = _fetch_fn; in_args.edge = _edge_fn; in_args.done_mutex = &done_mutex; in_args.done_cond = &done_cond; thread_t downloaders[download_workers]; thread_t parsers[parse_workers]; for (i = 0; i < download_workers; i++) thread_create(&downloaders[i], downloader, (void *)&in_args); for (i = 0; i < parse_workers; i++) thread_create(&parsers[i], parser, (void *)&in_args); while (1) { mutex_lock(&done_mutex); mutex_lock(&url_queue.mutex); mutex_lock(&url_queue.worker_mutex); mutex_lock(&page_queue.mutex); mutex_lock(&page_queue.worker_mutex); if (url_queue.count == 0 && url_queue.workers == 0 && page_queue.count == 0 && page_queue.workers == 0) { url_queue.done = 1; page_queue.done = 1; cond_broadcast(&url_queue.empty); cond_broadcast(&url_queue.fill); cond_broadcast(&page_queue.fill); mutex_unlock(&url_queue.mutex); mutex_unlock(&url_queue.worker_mutex); mutex_unlock(&page_queue.mutex); mutex_unlock(&page_queue.worker_mutex); mutex_unlock(&done_mutex); break; } else { mutex_unlock(&url_queue.mutex); mutex_unlock(&url_queue.worker_mutex); mutex_unlock(&page_queue.mutex); mutex_unlock(&page_queue.worker_mutex); cond_wait(&done_cond, &done_mutex); mutex_unlock(&done_mutex); } } for (i = 0; i < download_workers; i++) thread_join(downloaders[i], NULL); for (i = 0; i < parse_workers; i++) thread_join(parsers[i], NULL); 
bounded_buffer_destroy(&url_queue); unbounded_buffer_destroy(&page_queue); hashset_destroy(&url_set); return 0; }