void test_link_stream(CuTest *tc) { printf("%s\n", __func__); char test_dir[] = "test-pagedb-XXXXXX"; mkdtemp(test_dir); PageDB *db; int ret = page_db_new(&db, test_dir); CuAssert(tc, db!=0? db->error->message: "NULL", ret == 0); db->persist = 0; /* +----------------+ * | | * a1 --> a2 | * | ^ | * | | v * +----> b1 -----> b2 */ const char *url_a1 = "http://test_a.org/1"; const char *url_a2 = "http://test_a.org/2"; const char *url_b1 = "http://test_b.org/1"; const char *url_b2 = "http://test_b.org/2"; CrawledPage *cp = crawled_page_new(url_a1); crawled_page_add_link(cp, url_a2, 1.0); crawled_page_add_link(cp, url_b1, 1.0); crawled_page_add_link(cp, url_b2, 1.0); CuAssert(tc, db->error->message, page_db_add(db, cp, 0) == 0); crawled_page_delete(cp); cp = crawled_page_new(url_b1); crawled_page_add_link(cp, url_b2, 1.0); crawled_page_add_link(cp, url_a2, 1.0); CuAssert(tc, db->error->message, page_db_add(db, cp, 0) == 0); crawled_page_delete(cp); uint64_t idx[4]; CuAssert(tc, db->error->message, page_db_get_idx(db, page_db_hash(url_a1), idx + 0) == 0); CuAssert(tc, db->error->message, page_db_get_idx(db, page_db_hash(url_a2), idx + 1) == 0); CuAssert(tc, db->error->message, page_db_get_idx(db, page_db_hash(url_b1), idx + 2) == 0); CuAssert(tc, db->error->message, page_db_get_idx(db, page_db_hash(url_b2), idx + 3) == 0); Link links_diff[] = { {.from = idx[0], .to = idx[2]}, // a1 -> b1 {.from = idx[0], .to = idx[3]}, // a1 -> b2
/* Checks the accuracy of the HITS computation.
 *
 * Builds a five page graph inside a temporary PageDB, runs hits_compute()
 * over the database link stream and compares the hub and authority score of
 * every page against precomputed values (tolerance 1e-6).
 */
void test_hits(CuTest *tc) {
    /* Compute the HITS score of the following graph
     *
     *    +-->2---+
     *    |   |   |
     *    |   v   v
     *    1-->5<--3
     *    ^   ^   |
     *    |   |   |
     *    +---4<--+
     *
     * The link matrix L[i,j], where L[i,j] = 1 means 'i' links to 'j' is:
     *
     *        +-         -+
     *        | 0 1 0 0 1 |
     *        | 0 0 1 0 1 |
     *    L = | 0 0 0 1 1 |
     *        | 1 0 0 0 1 |
     *        | 0 0 0 0 0 |
     *        +-         -+
     */
    char test_dir[] = "test-pagedb-XXXXXX";
    mkdtemp(test_dir);

    PageDB *db;
    int ret = page_db_new(&db, test_dir);
    CuAssert(tc,
             db!=0? db->error->message: "NULL",
             ret == 0);
    db->persist = 0;

    /* Page 'i' (urls[i]) links to the n_links[i] entries of links[i];
     * page "5" has no outgoing links. */
    char *urls[5] = {"1", "2", "3", "4", "5" };
    LinkInfo links_1[] = {{"2", 0.1}, {"5", 0.1}};
    LinkInfo links_2[] = {{"3", 0.1}, {"5", 0.1}};
    LinkInfo links_3[] = {{"4", 0.1}, {"5", 0.1}};
    LinkInfo links_4[] = {{"1", 0.1}, {"5", 0.1}};
    LinkInfo *links[5] = {
        links_1, links_2, links_3, links_4, 0
    };
    int n_links[5] = {2, 2, 2, 2, 0};

    for (int i=0; i<5; ++i) {
        CrawledPage *cp = crawled_page_new(urls[i]);
        for (int j=0; j<n_links[i]; ++j)
            crawled_page_add_link(cp, links[i][j].url, links[i][j].score);
        cp->score = i/5.0;
        crawled_page_set_hash64(cp, i);

        PageInfoList *pil;
        CuAssert(tc,
                 db->error->message,
                 page_db_add(db, cp, &pil) == 0);
        page_info_list_delete(pil);
        crawled_page_delete(cp);
    }

    PageDBLinkStream *st;
    CuAssert(tc,
             db->error->message,
             page_db_link_stream_new(&st, db) == 0);
    /* NOTE(review): presumably disables the "different domain only" filter
     * so that every link of the graph is streamed -- confirm against
     * PageDBLinkStream. */
    st->only_diff_domain = 0;

    Hits *hits;
    ret = hits_new(&hits, test_dir, 5);
    CuAssert(tc,
             hits!=0? hits->error->message: "NULL",
             ret == 0);

    hits->precision = 1e-8;
    CuAssert(tc,
             hits->error->message,
             hits_compute(hits,
                          st,
                          page_db_link_stream_next,
                          page_db_link_stream_reset) == 0);
    page_db_link_stream_delete(st);

    uint64_t idx;
    float *h_score;
    float *a_score;
    float h_scores[5] = {0.250, 0.250, 0.250, 0.250, 0.000};
    float a_scores[5] = {0.125, 0.125, 0.125, 0.125, 0.500};

    for (int i=0; i<5; ++i) {
        CuAssert(tc,
                 db->error->message,
                 page_db_get_idx(db, page_db_hash(urls[i]), &idx) == 0);

        CuAssertPtrNotNull(tc,
                           h_score = mmap_array_idx(hits->h1, idx));
        CuAssertPtrNotNull(tc,
                           a_score = mmap_array_idx(hits->a1, idx));

        CuAssertDblEquals(tc, h_scores[i], *h_score, 1e-6);
        CuAssertDblEquals(tc, a_scores[i], *a_score, 1e-6);
    }

    /* The message and the condition used to be arguments of the same
     * CuAssert(), so their evaluation order was unspecified and the message
     * could be read from 'hits' after hits_delete() had already released it.
     * Run the delete first and only look at the message when it failed.
     * NOTE(review): this assumes, like the original assertion did, that a
     * failed hits_delete() leaves 'hits' readable. */
    int delete_ret = hits_delete(hits);
    CuAssert(tc,
             delete_ret != 0? hits->error->message: "",
             delete_ret == 0);

    page_db_delete(db);
}
/* Round-trips a PageInfo through its LMDB value representation and checks
 * that every field read back equals the one that was dumped. */
void test_page_info_serialization(CuTest *tc) {
    printf("%s\n", __func__);

    PageInfo original = {
        .url                 = "test_url_123",
        .first_crawl         = 123,
        .last_crawl          = 456,
        .n_changes           = 100,
        .n_crawls            = 20,
        .score               = 0.7,
        .content_hash_length = 8,
        .content_hash        = "1234567"
    };

    MDB_val dumped;
    CuAssertTrue(tc, page_info_dump(&original, &dumped) == 0);

    /* The score can be read straight out of the dumped value */
    CuAssertDblEquals(tc, 0.7, page_info_dump_get_score(&dumped), 1e-6);

    PageInfo *restored = page_info_load(&dumped);
    CuAssertPtrNotNull(tc, restored);

    /* The dumped buffer is not needed once the PageInfo has been loaded */
    free(dumped.mv_data);

    CuAssertStrEquals(tc, original.url, restored->url);
    CuAssertTrue(tc, original.first_crawl == restored->first_crawl);
    CuAssertTrue(tc, original.last_crawl == restored->last_crawl);
    CuAssertTrue(tc, original.n_changes == restored->n_changes);
    CuAssertTrue(tc, original.n_crawls == restored->n_crawls);
    CuAssertTrue(tc, original.score == restored->score);
    CuAssertTrue(tc, original.content_hash_length == restored->content_hash_length);
    CuAssertStrEquals(tc, original.content_hash, restored->content_hash);

    page_info_delete(restored);
}

/* Exercises the database operations on a very small crawl: two pages, one of
 * them crawled twice with a different content hash. */
void test_page_db_simple(CuTest *tc) {
    printf("%s\n", __func__);

    char tmp_dir[] = "test-pagedb-XXXXXX";
    mkdtemp(tmp_dir);

    PageDB *db;
    int rc = page_db_new(&db, tmp_dir);
    CuAssert(tc,
             db!=0? db->error->message: "NULL",
             rc == 0);
    db->persist = 0;

    /* First crawled page: three outgoing links */
    CrawledPage *yahoo = crawled_page_new("www.yahoo.com");
    crawled_page_add_link(yahoo, "a", 0.1);
    crawled_page_add_link(yahoo, "b", 0.2);
    crawled_page_add_link(yahoo, "www.google.com", 0.3);
    crawled_page_set_hash64(yahoo, 1000);
    yahoo->score = 0.5;

    /* Second crawled page: two outgoing links */
    CrawledPage *bing = crawled_page_new("www.bing.com");
    crawled_page_add_link(bing, "x", 1.1);
    crawled_page_add_link(bing, "y", 1.2);
    crawled_page_set_hash64(bing, 2000);
    bing->score = 0.2;

    PageInfoList *added;
    CuAssert(tc,
             db->error->message,
             page_db_add(db, yahoo, &added) == 0);
    page_info_list_delete(added);

    CuAssert(tc,
             db->error->message,
             page_db_add(db, bing, &added) == 0);
    page_info_list_delete(added);

    /* Same page again, but with a new content hash */
    crawled_page_set_hash64(bing, 3000);
    CuAssert(tc,
             db->error->message,
             page_db_add(db, bing, &added) == 0);
    page_info_list_delete(added);

    /* The score array must hold the page score and the link score */
    MMapArray *scores = 0;
    CuAssert(tc,
             db->error->message,
             page_db_get_scores(db, &scores) == 0);

    size_t idx;
    float *stored_score;

    CuAssert(tc,
             db->error->message,
             page_db_get_idx(db, page_db_hash("www.yahoo.com"), &idx) == 0);
    stored_score = mmap_array_idx(scores, idx);
    CuAssertDblEquals(tc, 0.5, *stored_score, 1e-6);

    CuAssert(tc,
             db->error->message,
             page_db_get_idx(db, page_db_hash("x"), &idx) == 0);
    stored_score = mmap_array_idx(scores, idx);
    CuAssertDblEquals(tc, 1.1, *stored_score, 1e-6);

    CHECK_DELETE(tc, scores->error->message, mmap_array_delete(scores));

    crawled_page_delete(yahoo);
    crawled_page_delete(bing);

    /* Crawl counters: yahoo crawled once, google only linked to, bing
     * crawled twice with one content change */
    char pi_out[1000];
    char *print_pages[] = {"www.yahoo.com", "www.google.com", "www.bing.com"};
    for (size_t p=0; p<3; ++p) {
        PageInfo *info;
        CuAssert(tc,
                 db->error->message,
                 page_db_get_info(db, page_db_hash(print_pages[p]), &info) == 0);
        CuAssertPtrNotNull(tc, info);

        if (p == 0) {
            CuAssertIntEquals(tc, 1, info->n_crawls);
            CuAssertIntEquals(tc, 0, info->n_changes);
        } else if (p == 1) {
            CuAssertIntEquals(tc, 0, info->n_crawls);
        } else if (p == 2) {
            CuAssertIntEquals(tc, 2, info->n_crawls);
            CuAssertIntEquals(tc, 1, info->n_changes);
        }

        page_info_print(info, pi_out);
        page_info_delete(info);

        /* Enable the block below to show the page info on screen, e.g.:
         *
         * Mon Apr  6 15:34:50 2015|Mon Apr  6 15:34:50 2015|1.00e+00|0.00e+00|www.yahoo.com
         * Thu Jan  1 01:00:00 1970|Thu Jan  1 01:00:00 1970|0.00e+00|0.00e+00|www.google.com
         * Mon Apr  6 15:34:50 2015|Mon Apr  6 15:34:50 2015|2.00e+00|1.00e+00|www.bing.com
         */
#if 0
        printf("%s\n", pi_out);
#endif
    }

    /* The link stream must yield exactly these (from, to) index pairs, in
     * this order, and nothing after them */
    PageDBLinkStream *link_stream;
    CuAssert(tc,
             db->error->message,
             page_db_link_stream_new(&link_stream, db) == 0);
    link_stream->only_diff_domain = 0;

    if (link_stream->state == stream_state_init) {
        static const int expected_links[5][2] = {
            {0, 1}, {0, 2}, {0, 3}, {4, 5}, {4, 6}
        };
        Link link;
        int n_seen = 0;
        while (page_db_link_stream_next(link_stream, &link) == stream_state_next) {
            if (n_seen < 5) {
                CuAssertIntEquals(tc, expected_links[n_seen][0], link.from);
                CuAssertIntEquals(tc, expected_links[n_seen][1], link.to);
            } else {
                CuFail(tc, "too many links");
            }
            ++n_seen;
        }
        CuAssertTrue(tc, link_stream->state != stream_state_error);
    }
    page_db_link_stream_delete(link_stream);

    page_db_delete(db);
}
/* Checks that a HashInfoStream over a small database yields, in any order,
 * the hash and PageInfo of every crawled page and of every linked page. */
void test_hashinfo_stream(CuTest *tc) {
    printf("%s\n", __func__);

    char tmp_dir[] = "test-pagedb-XXXXXX";
    mkdtemp(tmp_dir);

    PageDB *db;
    int rc = page_db_new(&db, tmp_dir);
    CuAssert(tc,
             db!=0? db->error->message: "NULL",
             rc == 0);
    db->persist = 0;

    /* Two crawled pages with two links each: 1 -> {a, b}, 2 -> {c, d} */
    const char *crawled_urls[2] = {"1", "2"};
    const char *linked_urls[2][2] = {{"a", "b"}, {"c", "d"}};
    for (int p=0; p<2; ++p) {
        CrawledPage *page = crawled_page_new(crawled_urls[p]);
        for (int l=0; l<2; ++l)
            crawled_page_add_link(page, linked_urls[p][l], 0);
        CuAssert(tc,
                 db->error->message,
                 page_db_add(db, page, 0) == 0);
        crawled_page_delete(page);
    }

    HashInfoStream *stream;
    CuAssert(tc,
             db->error->message,
             hashinfo_stream_new(&stream, db) == 0);
    CuAssert(tc,
             "stream was not initialized",
             stream->state == stream_state_init);

    enum { N_EXPECTED = 6 };
    char *expected_url[N_EXPECTED] = {"1", "a", "b", "2", "c", "d"};
    uint64_t expected_hash[N_EXPECTED];
    int found[N_EXPECTED];
    for (int k=0; k<N_EXPECTED; ++k) {
        expected_hash[k] = page_db_hash(expected_url[k]);
        found[k] = 0;
    }

    /* Pull exactly N_EXPECTED elements; each one must match a known page */
    for (int n=0; n<N_EXPECTED; ++n) {
        uint64_t hash;
        PageInfo *info;
        CuAssert(tc,
                 "stream element expected",
                 hashinfo_stream_next(stream, &hash, &info) == stream_state_next);

        int matched = 0;
        for (int k=0; k<N_EXPECTED; ++k) {
            if (hash != expected_hash[k])
                continue;
            found[k] = 1;
            CuAssertStrEquals(tc, expected_url[k], info->url);
            matched = 1;
        }
        CuAssert(tc, "unexpected page hash", matched);
        page_info_delete(info);
    }
    hashinfo_stream_delete(stream);

    /* Every known page must have shown up */
    for (int k=0; k<N_EXPECTED; ++k)
        CuAssertTrue(tc, found[k]);

    page_db_delete(db);
}
/* Checks that a HashIdxStream pairs every page hash with the index expected
 * for it: pages are expected to be numbered 0..5 in the order
 * "1", "a", "b", "2", "c", "d". */
void test_hashidx_stream(CuTest *tc) {
    printf("%s\n", __func__);

    char tmp_dir[] = "test-pagedb-XXXXXX";
    mkdtemp(tmp_dir);

    PageDB *db;
    int rc = page_db_new(&db, tmp_dir);
    CuAssert(tc,
             db!=0? db->error->message: "NULL",
             rc == 0);
    db->persist = 0;

    /* Two crawled pages with two links each: 1 -> {a, b}, 2 -> {c, d} */
    const char *crawled_urls[2] = {"1", "2"};
    const char *linked_urls[2][2] = {{"a", "b"}, {"c", "d"}};
    for (int p=0; p<2; ++p) {
        CrawledPage *page = crawled_page_new(crawled_urls[p]);
        for (int l=0; l<2; ++l)
            crawled_page_add_link(page, linked_urls[p][l], 0);
        CuAssert(tc,
                 db->error->message,
                 page_db_add(db, page, 0) == 0);
        crawled_page_delete(page);
    }

    HashIdxStream *stream;
    CuAssert(tc,
             db->error->message,
             hashidx_stream_new(&stream, db) == 0);
    CuAssert(tc,
             "stream was not initialized",
             stream->state == stream_state_init);

    /* expected_hash[i] is the hash of the page expected at index i */
    enum { N_EXPECTED = 6 };
    const char *expected_url[N_EXPECTED] = {"1", "a", "b", "2", "c", "d"};
    uint64_t expected_hash[N_EXPECTED];
    for (int k=0; k<N_EXPECTED; ++k)
        expected_hash[k] = page_db_hash(expected_url[k]);

    for (int n=0; n<N_EXPECTED; ++n) {
        uint64_t hash;
        size_t idx;
        CuAssert(tc,
                 "stream element expected",
                 hashidx_stream_next(stream, &hash, &idx) == stream_state_next);

        /* Reject out of range indices before using them as a subscript */
        if (idx >= N_EXPECTED)
            CuFail(tc, "unexpected index");

        CuAssert(tc,
                 "mismatch between index and hash",
                 hash == expected_hash[idx]);
    }
    hashidx_stream_delete(stream);

    page_db_delete(db);
}
/* Load test: adds test_n_pages synthetic pages to a temporary PageDB and
 * then runs HITS over the resulting link stream.
 *
 * The urls are kept in a sliding window of n_links + 1 entries: on each
 * iteration the oldest url is dropped, a new one is appended, the first url
 * of the window becomes the crawled page and the remaining n_links urls
 * become its links. Throughput is printed every 10000 pages.
 */
static void test_page_db_crawl(CuTest *tc) {
    printf("%s\n", __func__);

    char test_dir[] = "test-pagedb-XXXXXX";
    mkdtemp(test_dir);

    PageDB *db;
    int ret = page_db_new(&db, test_dir);
    CuAssert(tc,
             db!=0? db->error->message: "NULL",
             ret == 0);
    page_db_set_persist(db, 0);
    page_db_set_domain_temp(db, 20, 60.0);

    const size_t n_links = 10;
    /* Buffer sizes of the initial urls and of the urls appended later */
    const size_t first_url_size = 100;
    const size_t next_url_size = 50;

    LinkInfo links[n_links + 1];
    for (size_t j=0; j<=n_links; ++j) {
        /* Check the allocation before writing into it, and bound the write
         * to the size that was actually requested. */
        links[j].url = malloc(first_url_size);
        CuAssert(tc, "allocating memory for url", links[j].url != 0);
        snprintf(links[j].url,
                 first_url_size,
                 "http://test_domain_%zu.org/test_url_%zu",
                 j%100,
                 j);
        links[j].score = j;
    }

    clock_t start = clock();
    for (size_t i=1; i<=test_n_pages; ++i) {
        if (i % 10000 == 0) {
            double delta = (double)(clock() - start)/(double)CLOCKS_PER_SEC;
            if (delta > 0) {
                printf("%10zuK/%zuK: %.0f pages/sec\n",
                       i/1000, test_n_pages/1000, ((double)i)/delta);
            }
        }

        /* Slide the window: drop the oldest url, append a fresh one */
        free(links[0].url);
        for (size_t j=0; j<n_links; ++j)
            links[j] = links[j+1];
        links[n_links].url = malloc(next_url_size);
        CuAssert(tc, "allocating memory for url", links[n_links].url != 0);
        snprintf(links[n_links].url,
                 next_url_size,
                 "test_url_%zu",
                 i + n_links);
        links[n_links].score = i;

        CrawledPage *cp = crawled_page_new(links[0].url);
        for (size_t j=1; j<=n_links; ++j)
            crawled_page_add_link(cp, links[j].url, 0.5);

        PageInfoList *pil;
        CuAssert(tc,
                 db->error->message,
                 page_db_add(db, cp, &pil) == 0);
        page_info_list_delete(pil);
        crawled_page_delete(cp);
    }
    for (size_t j=0; j<=n_links; ++j)
        free(links[j].url);

    PageDBLinkStream *st;
    CuAssert(tc,
             db->error->message,
             page_db_link_stream_new(&st, db) == 0);
    st->only_diff_domain = 0;

    Hits *hits;
    ret = hits_new(&hits, test_dir, test_n_pages);
    CuAssert(tc,
             hits!=0? hits->error->message: "NULL",
             ret == 0);

    hits->precision = 1e-3;
    HitsError hits_err = hits_compute(hits,
                                      st,
                                      page_db_link_stream_next,
                                      page_db_link_stream_reset);
    /* Not reaching the requested precision is not a failure for this test */
    if (hits_err == hits_error_precision)
        hits_err = 0;

    CuAssert(tc, hits->error->message, hits_err == 0);

    CHECK_DELETE(tc, hits->error->message, hits_delete(hits));

    page_db_link_stream_delete(st);
    page_db_delete(db);
}
FreqSchedulerError freq_scheduler_cursor_write(FreqScheduler *sch, MDB_cursor *cursor, uint64_t hash, float freq) { if (freq <= 0) return 0; ScheduleKey sk = { .score = 0, .hash = hash }; MDB_val key = { .mv_size = sizeof(sk), .mv_data = &sk, }; MDB_val val = { .mv_size = sizeof(float), .mv_data = &freq, }; int mdb_rc; if ((mdb_rc = mdb_cursor_put(cursor, &key, &val, 0)) != 0) { freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, "adding page to schedule"); freq_scheduler_add_error(sch, mdb_strerror(mdb_rc)); } return sch->error->code; } FreqSchedulerError freq_scheduler_load_simple(FreqScheduler *sch, float freq_default, float freq_scale) { char *error1 = 0; char *error2 = 0; MDB_cursor *cursor = 0; HashInfoStream *st; if (hashinfo_stream_new(&st, sch->page_db) != 0) { error1 = "creating stream"; error2 = st? sch->page_db->error->message: "NULL"; goto on_error; } if (freq_scheduler_cursor_open(sch, &cursor) != 0) goto on_error; StreamState ss; uint64_t hash; PageInfo *pi; while ((ss = hashinfo_stream_next(st, &hash, &pi)) == stream_state_next) { if ((pi->n_crawls > 0) && ((sch->max_n_crawls == 0) || (pi->n_crawls < sch->max_n_crawls)) && !page_info_is_seed(pi)){ float freq = freq_default; if (freq_scale > 0) { float rate = page_info_rate(pi); if (rate > 0) { freq = freq_scale * rate; } } if (freq_scheduler_cursor_write(sch, cursor, hash, freq) != 0) goto on_error; } page_info_delete(pi); } if (ss != stream_state_end) { error1 = "incorrect stream state"; error2 = 0; hashinfo_stream_delete(st); goto on_error; } hashinfo_stream_delete(st); if (freq_scheduler_cursor_commit(sch, cursor) != 0) goto on_error; return sch->error->code; on_error: freq_scheduler_cursor_abort(sch, cursor); freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, error1); freq_scheduler_add_error(sch, error2); return sch->error->code; } FreqSchedulerError freq_scheduler_load_mmap(FreqScheduler *sch, 
MMapArray *freqs) { char *error1 = 0; char *error2 = 0; MDB_cursor *cursor = 0; if (txn_manager_expand( sch->txn_manager, 2*freqs->n_elements*freqs->element_size) != 0) { error1 = "resizing database"; error2 = sch->txn_manager->error->message; goto on_error; } if (freq_scheduler_cursor_open(sch, &cursor) != 0) goto on_error; for (size_t i=0; i<freqs->n_elements; ++i) { PageFreq *f = mmap_array_idx(freqs, i); ScheduleKey sk = { .score = 1.0/f->freq, .hash = f->hash }; MDB_val key = { .mv_size = sizeof(sk), .mv_data = &sk, }; MDB_val val = { .mv_size = sizeof(float), .mv_data = &f->freq, }; int mdb_rc; if ((mdb_rc = mdb_cursor_put(cursor, &key, &val, 0)) != 0) { error1 = "adding page to schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } } if (freq_scheduler_cursor_commit(sch, cursor) != 0) goto on_error; return sch->error->code; on_error: freq_scheduler_cursor_abort(sch, cursor); freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, error1); freq_scheduler_add_error(sch, error2); return sch->error->code; } FreqSchedulerError freq_scheduler_request(FreqScheduler *sch, size_t max_requests, PageRequest **request) { char *error1 = 0; char *error2 = 0; MDB_cursor *cursor = 0; if (freq_scheduler_cursor_open(sch, &cursor) != 0) goto on_error; PageRequest *req = *request = page_request_new(max_requests); if (!req) { error1 = "allocating memory"; goto on_error; } int interrupt_requests = 0; while ((req->n_urls < max_requests) && !interrupt_requests) { MDB_val key; MDB_val val; ScheduleKey sk; float freq; int mdb_rc; int crawl = 0; switch (mdb_rc = mdb_cursor_get(cursor, &key, &val, MDB_FIRST)) { case 0: // copy data before deleting cursor sk = *(ScheduleKey*)key.mv_data; freq = *(float*)val.mv_data; PageInfo *pi = 0; if (page_db_get_info(sch->page_db, sk.hash, &pi) != 0) { error1 = "retrieving PageInfo from PageDB"; error2 = sch->page_db->error->message; goto on_error; } if (pi) { if (sch->margin >= 0) { double elapsed = 
difftime(time(0), 0) - pi->last_crawl; if (elapsed < 1.0/(freq*(1.0 + sch->margin))) interrupt_requests = 1; } crawl = (sch->max_n_crawls == 0) || (pi->n_crawls < sch->max_n_crawls); } if (!interrupt_requests) { if ((mdb_rc = mdb_cursor_del(cursor, 0)) != 0) { error1 = "deleting head of schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } if (crawl) { if (page_request_add_url(req, pi->url) != 0) { error1 = "adding url to request"; goto on_error; } sk.score += 1.0/freq; val.mv_data = &freq; key.mv_data = &sk; if ((mdb_rc = mdb_cursor_put(cursor, &key, &val, 0)) != 0) { error1 = "moving element inside schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } } } page_info_delete(pi); break; case MDB_NOTFOUND: // no more pages left interrupt_requests = 1; break; default: error1 = "getting head of schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } } if (freq_scheduler_cursor_commit(sch, cursor) != 0) goto on_error; return sch->error->code; on_error: freq_scheduler_cursor_abort(sch, cursor); freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, error1); freq_scheduler_add_error(sch, error2); return sch->error->code; } FreqSchedulerError freq_scheduler_add(FreqScheduler *sch, const CrawledPage *page) { if (page_db_add(sch->page_db, page, 0) != 0) { freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, "adding crawled page"); freq_scheduler_add_error(sch, sch->page_db->error->message); } return sch->error->code; } void freq_scheduler_delete(FreqScheduler *sch) { mdb_env_close(sch->txn_manager->env); (void)txn_manager_delete(sch->txn_manager); if (!sch->persist) { char *data = build_path(sch->path, "data.mdb"); char *lock = build_path(sch->path, "lock.mdb"); remove(data); remove(lock); free(data); free(lock); remove(sch->path); } free(sch->path); error_delete(sch->error); free(sch); } FreqSchedulerError freq_scheduler_dump(FreqScheduler *sch, FILE *output) { 
MDB_cursor *cursor; if (freq_scheduler_cursor_open(sch, &cursor) != 0) return sch->error->code; int end = 0; MDB_cursor_op cursor_op = MDB_FIRST; do { int mdb_rc; MDB_val key; MDB_val val; ScheduleKey *key_data; float *val_data; switch (mdb_rc = mdb_cursor_get(cursor, &key, &val, cursor_op)) { case 0: key_data = (ScheduleKey*)key.mv_data; val_data = (float*)val.mv_data; fprintf(output, "%.2e %016"PRIx64" %.2e\n", key_data->score, key_data->hash, *val_data); break; case MDB_NOTFOUND: end = 1; break; default: freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, "iterating over database"); freq_scheduler_add_error(sch, mdb_strerror(mdb_rc)); end = 1; break; } cursor_op = MDB_NEXT; } while (!end); freq_scheduler_cursor_abort(sch, cursor); return sch->error->code; }
/* Checks the accuracy of the PageRank computation */
void test_page_rank(CuTest *tc) {
    printf("%s\n", __func__);

    /* The PageRank is computed for the following graph
     *
     *    +-->2---+
     *    |   |   |
     *    |   v   v
     *    1-->5<--3
     *    ^   ^   |
     *    |   |   |
     *    +---4<--+
     *
     * With L[i,j] = 1 meaning that 'i' links to 'j', the link matrix is:
     *
     *        +-         -+
     *        | 0 1 0 0 1 |
     *        | 0 0 1 0 1 |
     *    L = | 0 0 0 1 1 |
     *        | 1 0 0 0 1 |
     *        | 0 0 0 0 0 |
     *        +-         -+
     *
     * Page "5" has no outbound links, so it is assumed to link to every
     * other page:
     *
     *        +-         -+
     *        | 0 1 0 0 1 |
     *        | 0 0 1 0 1 |
     *    L = | 0 0 0 1 1 |
     *        | 1 0 0 0 1 |
     *        | 1 1 1 1 1 |
     *        +-         -+
     *
     * which gives the out degree:
     *
     *    deg = {2, 2, 2, 2, 5}
     *
     * Dividing each row by its out degree and transposing,
     *
     *    M[i, j] = L[j, i]/deg[j]
     *
     * results in:
     *
     *        +-                   -+
     *        | 0   0   0   0.5 0.2 |
     *        | 0.5 0   0   0   0.2 |
     *    M = | 0   0.5 0   0   0.2 |
     *        | 0   0   0.5 0   0.2 |
     *        | 0.5 0.5 0.5 0.5 0.2 |
     *        +-                   -+
     *
     * For a damping 'd' the PageRank 'PR' satisfies:
     *
     *          1 - d
     *    PR = ------- + (d * M)*PR
     *            N
     *
     * and for d=0.85 the numerical solution is:
     *
     *    PR(1) = PR(2) = PR(3) = PR(4) = 0.15936255
     *    PR(5) = 0.3625498
     */
    char tmp_dir[] = "test-pagedb-XXXXXX";
    mkdtemp(tmp_dir);

    PageDB *db;
    int rc = page_db_new(&db, tmp_dir);
    CuAssert(tc,
             db!=0? db->error->message: "NULL",
             rc == 0);
    db->persist = 0;

    enum { N_PAGES = 5 };

    /* Row 'p' lists the pages linked from urls[p]; only the first
     * out_degree[p] entries of a row are used. */
    char *urls[N_PAGES] = {"1", "2", "3", "4", "5" };
    LinkInfo out_links[N_PAGES][2] = {
        {{"2", 0.1}, {"5", 0.1}},
        {{"3", 0.1}, {"5", 0.1}},
        {{"4", 0.1}, {"5", 0.1}},
        {{"1", 0.1}, {"5", 0.1}},
    };
    int out_degree[N_PAGES] = {2, 2, 2, 2, 0};

    for (int p=0; p<N_PAGES; ++p) {
        CrawledPage *page = crawled_page_new(urls[p]);
        for (int l=0; l<out_degree[p]; ++l) {
            const LinkInfo *out = &out_links[p][l];
            crawled_page_add_link(page, out->url, out->score);
        }
        page->score = p/5.0;
        crawled_page_set_hash64(page, p);

        PageInfoList *added;
        CuAssert(tc,
                 db->error->message,
                 page_db_add(db, page, &added) == 0);
        page_info_list_delete(added);
        crawled_page_delete(page);
    }

    PageDBLinkStream *stream;
    PageRank *rank;
    uint64_t page_idx;
    float *value;

    // Without content scores
    // ------------------------------------------------------------------------
    CuAssert(tc,
             db->error->message,
             page_db_link_stream_new(&stream, db) == 0);
    stream->only_diff_domain = 0;

    rc = page_rank_new(&rank, tmp_dir, N_PAGES);
    CuAssert(tc,
             rank!=0? rank->error->message: "NULL",
             rc == 0);
    rank->precision = 1e-6;

    CuAssert(tc,
             rank->error->message,
             page_rank_compute(rank,
                               stream,
                               page_db_link_stream_next,
                               page_db_link_stream_reset) == 0);
    page_db_link_stream_delete(stream);

    float scores[N_PAGES] = {0.15936255, 0.15936255, 0.15936255, 0.15936255, 0.3625498};
    for (int p=0; p<N_PAGES; ++p) {
        CuAssert(tc,
                 db->error->message,
                 page_db_get_idx(db, page_db_hash(urls[p]), &page_idx) == 0);

        value = mmap_array_idx(rank->value1, page_idx);
        CuAssertPtrNotNull(tc, value);

        CuAssertDblEquals(tc, scores[p], *value, 1e-6);
    }
    CHECK_DELETE(tc, rank->error->message, page_rank_delete(rank));

    // With content scores, damping = 0
    // ------------------------------------------------------------------------
    CuAssert(tc,
             db->error->message,
             page_db_link_stream_new(&stream, db) == 0);
    stream->only_diff_domain = 0;

    rc = page_rank_new(&rank, tmp_dir, N_PAGES);
    CuAssert(tc,
             rank!=0? rank->error->message: "NULL",
             rc == 0);
    rank->precision = 1e-6;
    rank->damping = 0.0;

    CuAssert(tc,
             db->error->message,
             page_db_get_scores(db, &rank->scores) == 0);
    CuAssert(tc,
             rank->error->message,
             page_rank_compute(rank,
                               stream,
                               page_db_link_stream_next,
                               page_db_link_stream_reset) == 0);
    page_db_link_stream_delete(stream);

    /* The expected value of each page is its content score divided by the
     * sum of all content scores; 'scores' is reused to hold them, indexed
     * by database index. */
    float total_score = 0.0;
    for (int k=0; k<N_PAGES; ++k) {
        scores[k] = *((float*)mmap_array_idx(rank->scores, k));
        total_score += scores[k];
    }

    for (int p=0; p<N_PAGES; ++p) {
        CuAssert(tc,
                 db->error->message,
                 page_db_get_idx(db, page_db_hash(urls[p]), &page_idx) == 0);

        value = mmap_array_idx(rank->value1, page_idx);
        CuAssertPtrNotNull(tc, value);

        CuAssertDblEquals(tc, scores[page_idx]/total_score, *value, 1e-6);
    }
    CHECK_DELETE(tc, rank->scores->error->message, mmap_array_delete(rank->scores));
    CHECK_DELETE(tc, rank->error->message, page_rank_delete(rank));

    // With content scores, damping = 0.5
    // ------------------------------------------------------------------------
    CuAssert(tc,
             db->error->message,
             page_db_link_stream_new(&stream, db) == 0);
    stream->only_diff_domain = 0;

    rc = page_rank_new(&rank, tmp_dir, N_PAGES);
    CuAssert(tc,
             rank!=0? rank->error->message: "NULL",
             rc == 0);
    rank->precision = 1e-6;
    rank->damping = 0.5;

    CuAssert(tc,
             db->error->message,
             page_db_get_scores(db, &rank->scores) == 0);
    CuAssert(tc,
             rank->error->message,
             page_rank_compute(rank,
                               stream,
                               page_db_link_stream_next,
                               page_db_link_stream_reset) == 0);
    page_db_link_stream_delete(stream);

    float expected_pr[] = {
        0.06386554621848739,
        0.08739495798319329,
        0.1647058823529412,
        0.25546218487394956,
        0.4285714285714286
    };
    for (int p=0; p<N_PAGES; ++p) {
        CuAssert(tc,
                 db->error->message,
                 page_db_get_idx(db, page_db_hash(urls[p]), &page_idx) == 0);

        value = mmap_array_idx(rank->value1, page_idx);
        CuAssertPtrNotNull(tc, value);

        CuAssertDblEquals(tc, expected_pr[p], *value, 1e-6);
    }
    CHECK_DELETE(tc, rank->scores->error->message, mmap_array_delete(rank->scores));
    CHECK_DELETE(tc, rank->error->message, page_rank_delete(rank));

    page_db_delete(db);
}