/* Checks the accuracy of the HITS computation.
 *
 * Computes the HITS scores of the following graph:
 *
 *      +-->2---+
 *      |   |   |
 *      |   v   v
 *      1-->5<--3
 *      ^   ^   |
 *      |   |   |
 *      +---4<--+
 *
 * The link matrix L[i,j], where L[i,j] = 1 means 'i' links to 'j', is:
 *
 *      +-         -+
 *      | 0 1 0 0 1 |
 *      | 0 0 1 0 1 |
 *  L = | 0 0 0 1 1 |
 *      | 1 0 0 0 1 |
 *      | 0 0 0 0 0 |
 *      +-         -+
 *
 * The test builds this graph in a temporary PageDB, runs hits_compute() over
 * its link stream and compares the resulting hub (h1) and authority (a1)
 * arrays against the expected values, with a tolerance of 1e-6.
 *
 * tc: CuTest context that collects assertion failures.
 */
void test_hits(CuTest *tc)
{
    char test_dir[] = "test-pagedb-XXXXXX";
    /* On failure mkdtemp() returns NULL and the template contents are not
     * usable as a path, so stop right here. */
    CuAssert(tc, "mkdtemp failed", mkdtemp(test_dir) != NULL);

    /* Start from NULL so the message expression below never reads an
     * indeterminate pointer if page_db_new() fails before storing one. */
    PageDB *db = NULL;
    int ret = page_db_new(&db, test_dir);
    CuAssert(tc, db != NULL ? db->error->message : "NULL", ret == 0);
    /* NOTE(review): presumably makes the database clean up its files when it
     * is deleted -- confirm against the PageDB documentation. */
    db->persist = 0;

    /* One entry per node of the graph; links[i] are the out-links of urls[i]. */
    char *urls[5] = {"1", "2", "3", "4", "5"};
    LinkInfo links_1[] = {{"2", 0.1}, {"5", 0.1}};
    LinkInfo links_2[] = {{"3", 0.1}, {"5", 0.1}};
    LinkInfo links_3[] = {{"4", 0.1}, {"5", 0.1}};
    LinkInfo links_4[] = {{"1", 0.1}, {"5", 0.1}};
    LinkInfo *links[5] = {links_1, links_2, links_3, links_4, 0};
    int n_links[5] = {2, 2, 2, 2, 0};

    for (int i = 0; i < 5; ++i) {
        CrawledPage *cp = crawled_page_new(urls[i]);
        for (int j = 0; j < n_links[i]; ++j) {
            crawled_page_add_link(cp, links[i][j].url, links[i][j].score);
        }
        cp->score = i / 5.0;
        crawled_page_set_hash64(cp, i);

        PageInfoList *pil = NULL;
        CuAssert(tc, db->error->message, page_db_add(db, cp, &pil) == 0);
        page_info_list_delete(pil);
        crawled_page_delete(cp);
    }

    PageDBLinkStream *st = NULL;
    CuAssert(tc, db->error->message, page_db_link_stream_new(&st, db) == 0);
    /* NOTE(review): presumably makes the stream also emit links between
     * pages of the same domain -- confirm. */
    st->only_diff_domain = 0;

    Hits *hits = NULL;
    ret = hits_new(&hits, test_dir, 5);
    CuAssert(tc, hits != NULL ? hits->error->message : "NULL", ret == 0);

    hits->precision = 1e-8;
    CuAssert(tc,
             hits->error->message,
             hits_compute(hits,
                          st,
                          page_db_link_stream_next,
                          page_db_link_stream_reset) == 0);
    page_db_link_stream_delete(st);

    /* Expected hub and authority scores, indexed like urls[]. */
    float h_scores[5] = {0.250, 0.250, 0.250, 0.250, 0.000};
    float a_scores[5] = {0.125, 0.125, 0.125, 0.125, 0.500};

    for (int i = 0; i < 5; ++i) {
        uint64_t idx;
        CuAssert(tc,
                 db->error->message,
                 page_db_get_idx(db, page_db_hash(urls[i]), &idx) == 0);

        /* Assign first, assert afterwards: an assignment passed as a macro
         * argument is only safe if the macro parenthesizes its parameter. */
        float *h_score = mmap_array_idx(hits->h1, idx);
        CuAssertPtrNotNull(tc, h_score);
        float *a_score = mmap_array_idx(hits->a1, idx);
        CuAssertPtrNotNull(tc, a_score);

        CuAssertDblEquals(tc, h_scores[i], *h_score, 1e-6);
        CuAssertDblEquals(tc, a_scores[i], *a_score, 1e-6);
    }

    /* Run the delete before building the assertion: the evaluation order of
     * call arguments is unspecified, so reading hits->error->message in the
     * same argument list as hits_delete(hits) could touch `hits` after
     * hits_delete() has released it. */
    int delete_ret = hits_delete(hits);
    CuAssert(tc, "hits_delete failed", delete_ret == 0);

    page_db_delete(db);
}
/* Load test: simulates a crawl of `test_n_pages` pages and runs HITS on it.
 *
 * A sliding window of N_LINKS + 1 URLs is maintained. On every iteration the
 * oldest URL is dropped, a new one is appended, and a page for the (new)
 * first URL is added to the PageDB linking to the other N_LINKS URLs of the
 * window. Throughput is printed every 10000 pages. Afterwards HITS is
 * computed over the link stream of the database.
 *
 * tc: CuTest context that collects assertion failures.
 */
static void test_page_db_crawl(CuTest *tc)
{
    /* Compile-time sizes: window length and URL buffer capacities. */
    enum { N_LINKS = 10, SEED_URL_SIZE = 100, NEXT_URL_SIZE = 50 };

    printf("%s\n", __func__);

    char test_dir[] = "test-pagedb-XXXXXX";
    /* On failure mkdtemp() returns NULL and the template contents are not
     * usable as a path, so stop right here. */
    CuAssert(tc, "mkdtemp failed", mkdtemp(test_dir) != NULL);

    /* Start from NULL so the message expression below never reads an
     * indeterminate pointer if page_db_new() fails before storing one. */
    PageDB *db = NULL;
    int ret = page_db_new(&db, test_dir);
    CuAssert(tc, db != NULL ? db->error->message : "NULL", ret == 0);
    page_db_set_persist(db, 0);
    page_db_set_domain_temp(db, 20, 60.0);

    /* Seed the window with N_LINKS + 1 heap-allocated URLs. */
    LinkInfo links[N_LINKS + 1];
    for (size_t j = 0; j <= N_LINKS; ++j) {
        char *url = malloc(SEED_URL_SIZE);
        CuAssert(tc, "malloc failed", url != NULL);
        snprintf(url, SEED_URL_SIZE,
                 "http://test_domain_%zu.org/test_url_%zu", j%100, j);
        links[j].url = url;
        links[j].score = j;
    }

    clock_t start = clock();
    for (size_t i = 1; i <= test_n_pages; ++i) {
        if (i % 10000 == 0) {
            double delta = (double)(clock() - start)/(double)CLOCKS_PER_SEC;
            if (delta > 0) {
                printf("%10zuK/%zuK: %.0f pages/sec\n",
                       i/1000, test_n_pages/1000, ((double)i)/delta);
            }
        }

        /* Allocate the incoming URL before touching the window so a failed
         * allocation is reported instead of dereferenced. */
        char *url = malloc(NEXT_URL_SIZE);
        CuAssert(tc, "malloc failed", url != NULL);
        snprintf(url, NEXT_URL_SIZE, "test_url_%zu", i + N_LINKS);

        /* Slide the window: drop the oldest URL, shift, append the new one. */
        free(links[0].url);
        for (size_t j = 0; j < N_LINKS; ++j) {
            links[j] = links[j+1];
        }
        links[N_LINKS].url = url;
        links[N_LINKS].score = i;

        /* The first URL of the window is the crawled page; the remaining
         * ones are its out-links, all with the same fixed score. */
        CrawledPage *cp = crawled_page_new(links[0].url);
        for (size_t j = 1; j <= N_LINKS; ++j) {
            crawled_page_add_link(cp, links[j].url, 0.5);
        }

        PageInfoList *pil = NULL;
        CuAssert(tc, db->error->message, page_db_add(db, cp, &pil) == 0);
        page_info_list_delete(pil);
        crawled_page_delete(cp);
    }

    for (size_t j = 0; j <= N_LINKS; ++j) {
        free(links[j].url);
    }

    PageDBLinkStream *st = NULL;
    CuAssert(tc, db->error->message, page_db_link_stream_new(&st, db) == 0);
    /* NOTE(review): presumably makes the stream also emit links between
     * pages of the same domain -- confirm. */
    st->only_diff_domain = 0;

    Hits *hits = NULL;
    ret = hits_new(&hits, test_dir, test_n_pages);
    CuAssert(tc, hits != NULL ? hits->error->message : "NULL", ret == 0);

    hits->precision = 1e-3;
    HitsError hits_err = hits_compute(hits,
                                      st,
                                      page_db_link_stream_next,
                                      page_db_link_stream_reset);
    /* A hits_error_precision result is deliberately not treated as a test
     * failure; any other non-zero code is. */
    if (hits_err == hits_error_precision) {
        hits_err = 0;
    }
    CuAssert(tc, hits->error->message, hits_err == 0);

    CHECK_DELETE(tc, hits->error->message, hits_delete(hits));
    page_db_link_stream_delete(st);
    page_db_delete(db);
}