Ejemplo n.º 1
0
void
test_link_stream(CuTest *tc) {
     printf("%s\n", __func__);
     char test_dir[] = "test-pagedb-XXXXXX";
     mkdtemp(test_dir);

     PageDB *db;
     int ret = page_db_new(&db, test_dir);
     CuAssert(tc,
              db!=0? db->error->message: "NULL",
              ret == 0);
     db->persist = 0;

     /* +----------------+
      * |                |
      * a1 --> a2        |
      * |      ^         |
      * |      |         v
      * +----> b1 -----> b2
      */
     const char *url_a1 = "http://test_a.org/1";
     const char *url_a2 = "http://test_a.org/2";
     const char *url_b1 = "http://test_b.org/1";
     const char *url_b2 = "http://test_b.org/2";
     CrawledPage *cp = crawled_page_new(url_a1);
     crawled_page_add_link(cp, url_a2, 1.0);
     crawled_page_add_link(cp, url_b1, 1.0);
     crawled_page_add_link(cp, url_b2, 1.0);
     CuAssert(tc,
              db->error->message,
              page_db_add(db, cp, 0) == 0);
     crawled_page_delete(cp);

     cp = crawled_page_new(url_b1);
     crawled_page_add_link(cp, url_b2, 1.0);
     crawled_page_add_link(cp, url_a2, 1.0);
     CuAssert(tc,
              db->error->message,
              page_db_add(db, cp, 0) == 0);
     crawled_page_delete(cp);
     uint64_t idx[4];
     CuAssert(tc,
              db->error->message,
              page_db_get_idx(db, page_db_hash(url_a1), idx + 0) == 0);
     CuAssert(tc,
              db->error->message,
              page_db_get_idx(db, page_db_hash(url_a2), idx + 1) == 0);
     CuAssert(tc,
              db->error->message,
              page_db_get_idx(db, page_db_hash(url_b1), idx + 2) == 0);
     CuAssert(tc,
              db->error->message,
              page_db_get_idx(db, page_db_hash(url_b2), idx + 3) == 0);

     Link links_diff[] = {
          {.from = idx[0], .to = idx[2]}, // a1 -> b1
          {.from = idx[0], .to = idx[3]}, // a1 -> b2
Ejemplo n.º 2
0
/* Checks the accuracy of the HITS computation.
 *
 * A five page graph is stored in a temporary PageDB, hits_compute is run over
 * the link stream of that database and the values found in hits->h1 and
 * hits->a1 are compared against precomputed ones.
 */
void
test_hits(CuTest *tc) {
     /* Compute the HITS score of the following graph
      *        +-->2---+
      *        |   |   |
      *        |   v   v
      *        1-->5<--3
      *        ^   ^   |
      *        |   |   |
      *        +---4<--+
      *
      * The link matrix L[i,j], where L[i,j] = 1 means 'i' links to 'j' is:
      *
      *        +-         -+
      *        | 0 1 0 0 1 |
      *        | 0 0 1 0 1 |
      *    L = | 0 0 0 1 1 |
      *        | 1 0 0 0 1 |
      *        | 0 0 0 0 0 |
      *        +-         -+
      */
     char test_dir[] = "test-pagedb-XXXXXX";
     mkdtemp(test_dir);

     PageDB *db;
     int ret = page_db_new(&db, test_dir);
     CuAssert(tc,
              db!=0? db->error->message: "NULL",
              ret == 0);
     db->persist = 0;

     // Adjacency lists of the graph above. Page "5" has no outgoing links,
     // hence the null list and the zero count in the last slot.
     char *urls[5] = {"1", "2", "3", "4", "5" };
     LinkInfo links_1[] = {{"2", 0.1}, {"5", 0.1}};
     LinkInfo links_2[] = {{"3", 0.1}, {"5", 0.1}};
     LinkInfo links_3[] = {{"4", 0.1}, {"5", 0.1}};
     LinkInfo links_4[] = {{"1", 0.1}, {"5", 0.1}};
     LinkInfo *links[5] = {
          links_1, links_2, links_3, links_4, 0
     };
     int n_links[5] = {2, 2, 2, 2, 0};

     // Store every page, with its links, in the database
     for (int i=0; i<5; ++i) {
          CrawledPage *cp = crawled_page_new(urls[i]);
          for (int j=0; j<n_links[i]; ++j)
               crawled_page_add_link(cp, links[i][j].url, links[i][j].score);
          cp->score = i/5.0;
          crawled_page_set_hash64(cp, i);

          PageInfoList *pil;
          CuAssert(tc,
                   db->error->message,
                   page_db_add(db, cp, &pil) == 0);
          page_info_list_delete(pil);
          crawled_page_delete(cp);
     }

     PageDBLinkStream *st;
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     st->only_diff_domain = 0;

     Hits *hits;
     ret = hits_new(&hits, test_dir, 5);
     CuAssert(tc,
              hits!=0? hits->error->message: "NULL",
              ret == 0);

     hits->precision = 1e-8;
     CuAssert(tc,
              hits->error->message,
              hits_compute(hits,
                           st,
                           page_db_link_stream_next,
                           page_db_link_stream_reset) == 0);
     page_db_link_stream_delete(st);

     uint64_t idx;
     float *h_score;
     float *a_score;
     // Expected contents of hits->h1 and hits->a1 for pages "1" .. "5"
     float h_scores[5] = {0.250, 0.250, 0.250, 0.250, 0.000};
     float a_scores[5] = {0.125, 0.125, 0.125, 0.125, 0.500};

     for (int i=0; i<5; ++i) {
          // the score arrays are addressed by database index, not by 'i'
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[i]), &idx) == 0);

          CuAssertPtrNotNull(tc,
                             h_score = mmap_array_idx(hits->h1, idx));
          CuAssertPtrNotNull(tc,
                             a_score = mmap_array_idx(hits->a1, idx));

          CuAssertDblEquals(tc, h_scores[i], *h_score, 1e-6);
          CuAssertDblEquals(tc, a_scores[i], *a_score, 1e-6);
     }

     // The error message is read only when the delete failed. Passing both
     // hits->error->message and hits_delete(hits) as arguments of a single
     // call leaves their evaluation order unspecified, so the message could
     // end up being fetched from 'hits' after hits_delete had already run
     // (NOTE(review): presumably hits_delete releases 'hits' on success).
     if (hits_delete(hits) != 0)
          CuFail(tc, hits->error->message);

     page_db_delete(db);
}
Ejemplo n.º 3
0
/* Adds a crawled page, and the links found in it, to the page database.
 *
 * All the work is done inside one transaction obtained from db->txn_manager,
 * which is committed at the end or aborted if anything fails:
 *
 *   1. The PageInfo of the crawled page is added/updated in hash2info.
 *   2. The crawled page and each one of its links get an index in hash2idx.
 *      Hashes already present keep their index, new ones take the next free
 *      index (info.n_pages). Links seen for the first time also get a PageInfo.
 *   3. The updated n_pages is written back to the info database.
 *   4. The links of the page are stored, varint encoded, in the links
 *      database.
 *
 * @param db             Database where the page is added.
 * @param page           The crawled page.
 * @param page_info_list If not null it receives a list with the PageInfo of
 *                       the crawled page and of every link that was not yet in
 *                       hash2idx. If null those PageInfo are deleted here.
 *                       NOTE(review): on failure a partially built list is not
 *                       released by this function; confirm who owns it.
 *
 * @return db->error->code. On failure the error is set to
 *         page_db_error_internal, with a description of the failed step.
 *
 * How new PageInfo are created:
      page_db_add ---------------> page_db_add_crawled_page_info
            |                                 |
            |                                 |
            v                                 v
      page_db_add_link_page_info      page_info_new_crawled
            |                                 |
            |                                 |
            |                                 |
            +--------> page_info_new_link <---+
*/
PageDBError
page_db_add(PageDB *db, const CrawledPage *page, PageInfoList **page_info_list) {
     // check if page should be expanded
     if (page_db_expand(db) != 0)
          return db->error->code;

     MDB_txn *txn;

     MDB_cursor *cur_hash2info;
     MDB_cursor *cur_hash2idx;
     MDB_cursor *cur_links;
     MDB_cursor *cur_info;

     MDB_val key;
     MDB_val val;

     // starts at 0 because on_error inspects it even when the failing step
     // was not an LMDB call (txn_manager_begin, for example)
     int mdb_rc = 0;
     char *error = 0;

     uint64_t *diff_id = 0;
     uint64_t *same_id = 0;
     // encoded links; declared here so that on_error can release it
     uint8_t *links_buf = 0;

     // start a new write transaction
     txn = 0;
     if ((txn_manager_begin(db->txn_manager, 0, &txn)) != 0)
          error = db->txn_manager->error->message;
     else if ((mdb_rc = page_db_open_hash2info(txn, &cur_hash2info)) != 0)
          error = "opening hash2info cursor";
     else if ((mdb_rc = page_db_open_hash2idx(txn, &cur_hash2idx)) != 0)
          error = "opening hash2idx cursor";
     else if ((mdb_rc = page_db_open_links(txn, &cur_links)) != 0)
          error = "opening links cursor";
     else if ((mdb_rc = page_db_open_info(txn, &cur_info)) != 0)
          error = "opening info cursor";

     if (error != 0)
          goto on_error;

     // get n_pages
     key.mv_size = sizeof(info_n_pages);
     key.mv_data = info_n_pages;
     if ((mdb_rc = mdb_cursor_get(cur_info, &key, &val, MDB_SET)) != 0) {
          error = "retrieving info.n_pages";
          goto on_error;
     }
     size_t n_pages = *(size_t*)val.mv_data;

     uint64_t cp_hash = page_db_hash(page->url);
     key.mv_size = sizeof(uint64_t);
     key.mv_data = &cp_hash;

     if (db->domain_temp) {
          domain_temp_update(db->domain_temp, (float)page->time);
          domain_temp_heat(db->domain_temp, page_db_hash_get_domain(cp_hash));
     }

     PageInfo *pi;
     if (page_db_add_crawled_page_info(cur_hash2info, &key, page, &pi, &mdb_rc) != 0) {
          error = "adding/updating page info";
          goto on_error;
     }
     // links are one step deeper than the page where they were found
     uint64_t link_depth = pi->depth + 1;

     if (page_info_list) {
          *page_info_list = page_info_list_new(pi, cp_hash);
          if (!*page_info_list) {
               error = "allocating new PageInfo list";
               goto on_error;
          }
     } else {
          page_info_delete(pi);
          pi = 0;
     }

     size_t n_links = crawled_page_n_links(page);
     // store here links inside the same domain as the crawled page
     same_id = malloc((n_links + 1)*sizeof(*same_id));
     // store here links outside the domain of the crawled page
     diff_id = malloc((n_links + 1)*sizeof(*diff_id));
     // next link id is going to be written here
     uint64_t *id = diff_id;
     // number of id's in same_id and diff_id. The first element of diff_id
     // array is reserved for the id of the crawled page, so we start at 1.
     // The first element of same_id will be a copy of the last element of
     // diff_id, so we start at 1 too.
     uint64_t same_i = 1;
     uint64_t diff_i = 1;
     if (!same_id || !diff_id) {
          error = "could not malloc";
          goto on_error;
     }
     // hash of the current URL
     uint64_t hash = cp_hash;
     // Iteration 0 handles the crawled page itself: 'link' is null, 'key' still
     // holds cp_hash and 'id' still points to diff_id[0]. Iterations 1..n_links
     // handle the links.
     for (size_t i=0; i <= n_links; ++i) {
          const LinkInfo *link = i > 0? crawled_page_get_link(page, i - 1): 0;
          if (link) {
               hash = page_db_hash(link->url);
               key.mv_size = sizeof(uint64_t);
               key.mv_data = &hash;

               id = same_domain(page->url, link->url)?
                    same_id + same_i++:
                    diff_id + diff_i++;
          }
          val.mv_size = sizeof(uint64_t);
          val.mv_data = &n_pages;

          switch (mdb_rc = mdb_cursor_put(cur_hash2idx, &key, &val, MDB_NOOVERWRITE)) {
          case MDB_KEYEXIST: // not really an error
               // the hash was already there: 'val' now refers to its index
               *id = *(uint64_t*)val.mv_data;
               break;
          case 0:
               *id = n_pages++;
               if (link) {
                    if (page_db_add_link_page_info(
                             cur_hash2info,
                             &key,
                             cp_hash,
                             link_depth,
                             link,
                             &pi,
                             &mdb_rc) != 0) {
                         error = "adding/updating link info";
                         goto on_error;
                    }
                    if (page_info_list) {
                         if (!(*page_info_list = page_info_list_cons(*page_info_list, pi, hash))) {
                              error = "adding new PageInfo to list";
                              goto on_error;
                         }
                    }
                    else {
                         page_info_delete(pi);
                    }

               }
               break;
          default:
               // without a message a null string would reach page_db_add_error
               error = "storing index in hash2idx";
               goto on_error;
          }
     }

     // store n_pages
     key.mv_size = sizeof(info_n_pages);
     key.mv_data = info_n_pages;
     val.mv_size = sizeof(size_t);
     val.mv_data = &n_pages;
     if ((mdb_rc = mdb_cursor_put(cur_info, &key, &val, 0)) != 0) {
          error = "storing n_pages";
          goto on_error;
     }

     // store links and commit transaction
     // The format for the links is the following:
     //
     // KEY = ID of crawled page
     // VAL = Number of links to different domain,
     //       diff link id 1, diff link id 2, ...
     //       same link id 1, same link id 2, ...

     key.mv_size = sizeof(uint64_t);
     key.mv_data = diff_id; // remember that diff_id[0] is the id of the
                            // crawled page
     // the links are stored as deltas starting from the 'from' page, encoded
     // using varint.
     // One varint for the number of diff links plus one for each link.
     links_buf = malloc(MAX_VARINT_SIZE*(n_links + 1));
     if (!links_buf) {
          error = "allocating memory to store links";
          goto on_error;
     }
     val.mv_data = links_buf;
     uint8_t *buf = links_buf;
     // write number of diff links (substract 1 to take into account this page id)
     buf = varint_encode_uint64(diff_i - 1, buf);
     // write diff links
     for (size_t i=1; i<diff_i; ++i)
          buf = varint_encode_int64((int64_t)diff_id[i] - (int64_t)diff_id[i-1], buf);
     // the deltas of the same domain links continue from the last diff link
     same_id[0] = diff_id[diff_i-1];
     for (size_t i=1; i<same_i; ++i)
          buf = varint_encode_int64((int64_t)same_id[i] - (int64_t)same_id[i-1], buf);

     val.mv_size = (char*)buf - (char*)val.mv_data;
     if ((mdb_rc = mdb_cursor_put(cur_links, &key, &val, 0)) != 0) {
          error = "storing links";
          goto on_error; // links_buf is released there
     }
     free(links_buf);
     free(same_id);
     free(diff_id);
     links_buf = 0;
     same_id = diff_id = 0;

     if (txn_manager_commit(db->txn_manager, txn) != 0) {
          error = db->txn_manager->error->message;
          goto on_error;
     }
     return db->error->code;

on_error:
     // free() accepts null pointers, so no guards are needed
     free(links_buf);
     free(same_id);
     free(diff_id);
     if (txn)
          txn_manager_abort(db->txn_manager, txn);

     page_db_set_error(db, page_db_error_internal, __func__);
     page_db_add_error(db, error);
     if (mdb_rc != 0)
          page_db_add_error(db, mdb_strerror(mdb_rc));

     return db->error->code;
}
Ejemplo n.º 4
0
/* Round trip test for the conversion between PageInfo and LMDB values: a
 * PageInfo is written with page_info_dump, read back with page_info_load and
 * each field of the copy is compared with the original */
void
test_page_info_serialization(CuTest *tc) {
     printf("%s\n", __func__);

     PageInfo original = {
          .url                 = "test_url_123",
          .first_crawl         = 123,
          .last_crawl          = 456,
          .n_changes           = 100,
          .n_crawls            = 20,
          .score               = 0.7,
          .content_hash_length = 8,
          .content_hash        = "1234567"
     };
     MDB_val dumped;

     CuAssertTrue(tc, page_info_dump(&original, &dumped) == 0);
     // the score must be readable directly from the dumped value
     CuAssertDblEquals(tc, 0.7, page_info_dump_get_score(&dumped), 1e-6);

     PageInfo *loaded = page_info_load(&dumped);
     CuAssertPtrNotNull(tc, loaded);

     // the dump is released here: the checks below only look at 'loaded'
     free(dumped.mv_data);

     CuAssertStrEquals(tc, original.url, loaded->url);
     CuAssertTrue(tc, original.first_crawl == loaded->first_crawl);
     CuAssertTrue(tc, original.last_crawl == loaded->last_crawl);
     CuAssertTrue(tc, original.n_changes == loaded->n_changes);
     CuAssertTrue(tc, original.n_crawls == loaded->n_crawls);
     CuAssertTrue(tc, original.score == loaded->score);
     CuAssertTrue(tc, original.content_hash_length == loaded->content_hash_length);
     CuAssertStrEquals(tc, original.content_hash, loaded->content_hash);

     page_info_delete(loaded);
}

/* Tests all the database operations on a very simple crawl of just two pages.
 *
 * "www.yahoo.com" is added once and "www.bing.com" twice, the second time
 * with a different content hash. The test then checks the stored scores, the
 * crawl/change counters of some pages and the links produced by a
 * PageDBLinkStream.
 */
void
test_page_db_simple(CuTest *tc) {
     printf("%s\n", __func__);

     char test_dir[] = "test-pagedb-XXXXXX";
     mkdtemp(test_dir);

     PageDB *db;
     int ret = page_db_new(&db, test_dir);
     CuAssert(tc,
              db!=0? db->error->message: "NULL",
              ret == 0);
     db->persist = 0;

     CrawledPage *cp1 = crawled_page_new("www.yahoo.com");
     crawled_page_add_link(cp1, "a", 0.1);
     crawled_page_add_link(cp1, "b", 0.2);
     crawled_page_add_link(cp1, "www.google.com", 0.3);
     crawled_page_set_hash64(cp1, 1000);
     cp1->score = 0.5;

     CrawledPage *cp2 = crawled_page_new("www.bing.com");
     crawled_page_add_link(cp2, "x", 1.1);
     crawled_page_add_link(cp2, "y", 1.2);
     crawled_page_set_hash64(cp2, 2000);
     cp2->score = 0.2;

     PageInfoList *pil;
     CuAssert(tc,
              db->error->message,
              page_db_add(db, cp1, &pil) == 0);
     page_info_list_delete(pil);

     CuAssert(tc,
              db->error->message,
              page_db_add(db, cp2, &pil) == 0);
     page_info_list_delete(pil);

     // same page again but with another content hash: further down this is
     // expected to show up as a second crawl with one change
     crawled_page_set_hash64(cp2, 3000);
     CuAssert(tc,
              db->error->message,
              page_db_add(db, cp2, &pil) == 0);
     page_info_list_delete(pil);

     MMapArray *scores = 0;
     CuAssert(tc,
              db->error->message,
              page_db_get_scores(db, &scores) == 0);

     // uint64_t, as in every other caller of page_db_get_idx: a size_t is not
     // guaranteed to have the same width
     uint64_t idx;
     CuAssert(tc,
              db->error->message,
              page_db_get_idx(db, page_db_hash("www.yahoo.com"), &idx) == 0);
     CuAssertDblEquals(
          tc,
          0.5,
          *(float*)mmap_array_idx(scores, idx),
          1e-6);
     CuAssert(tc,
              db->error->message,
              page_db_get_idx(db, page_db_hash("x"), &idx) == 0);
     CuAssertDblEquals(
          tc,
          1.1,
          *(float*)mmap_array_idx(scores, idx),
          1e-6);

     CHECK_DELETE(tc, scores->error->message, mmap_array_delete(scores));

     crawled_page_delete(cp1);
     crawled_page_delete(cp2);

     char pi_out[1000];
     char *print_pages[] = {"www.yahoo.com", "www.google.com", "www.bing.com"};
     for (size_t i=0; i<3; ++i) {
          PageInfo *pi;
          CuAssert(tc,
                   db->error->message,
                   page_db_get_info(db, page_db_hash(print_pages[i]), &pi) == 0);

          CuAssertPtrNotNull(tc, pi);

          switch(i) {
          case 0: // www.yahoo.com: crawled once
               CuAssertIntEquals(tc, 1, pi->n_crawls);
               CuAssertIntEquals(tc, 0, pi->n_changes);
               break;
          case 1: // www.google.com: only seen as a link
               CuAssertIntEquals(tc, 0, pi->n_crawls);
               break;
          case 2: // www.bing.com: crawled twice, content hash changed once
               CuAssertIntEquals(tc, 2, pi->n_crawls);
               CuAssertIntEquals(tc, 1, pi->n_changes);
               break;
          }
          page_info_print(pi, pi_out);
          page_info_delete(pi);
/* show on screen the page info:
 *
 * Mon Apr  6 15:34:50 2015|Mon Apr  6 15:34:50 2015|1.00e+00|0.00e+00|www.yahoo.com
 * Thu Jan  1 01:00:00 1970|Thu Jan  1 01:00:00 1970|0.00e+00|0.00e+00|www.google.com
 * Mon Apr  6 15:34:50 2015|Mon Apr  6 15:34:50 2015|2.00e+00|1.00e+00|www.bing.com
 */
#if 0
          printf("%s\n", pi_out);
#endif
     }

     PageDBLinkStream *es;
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&es, db) == 0);
     es->only_diff_domain = 0;

     if (es->state == stream_state_init) {
          Link link;
          int i=0;
          // expected links, as (from, to) database indices:
          // (0,1) (0,2) (0,3) (4,5) (4,6)
          while (page_db_link_stream_next(es, &link) == stream_state_next) {
               switch(i++) {
               case 0:
                    CuAssertIntEquals(tc, 0, link.from);
                    CuAssertIntEquals(tc, 1, link.to);
                    break;
               case 1:
                    CuAssertIntEquals(tc, 0, link.from);
                    CuAssertIntEquals(tc, 2, link.to);
                    break;
               case 2:
                    CuAssertIntEquals(tc, 0, link.from);
                    CuAssertIntEquals(tc, 3, link.to);
                    break;
               case 3:
                    CuAssertIntEquals(tc, 4, link.from);
                    CuAssertIntEquals(tc, 5, link.to);
                    break;
               case 4:
                    CuAssertIntEquals(tc, 4, link.from);
                    CuAssertIntEquals(tc, 6, link.to);
                    break;
               default:
                    CuFail(tc, "too many links");
                    break;
               }
          }
          CuAssertTrue(tc, es->state != stream_state_error);
          // a stream that stops early must not go unnoticed
          CuAssertIntEquals(tc, 5, i);
     }
     page_db_link_stream_delete(es);

     page_db_delete(db);
}
Ejemplo n.º 5
0
/* Stores two crawled pages with two links each and checks that a
 * HashInfoStream returns six elements, each with one of the expected hashes
 * and the URL that goes with it, and that all six hashes are seen */
void
test_hashinfo_stream(CuTest *tc) {
     printf("%s\n", __func__);
     char test_dir[] = "test-pagedb-XXXXXX";
     mkdtemp(test_dir);

     PageDB *db;
     int rc = page_db_new(&db, test_dir);
     CuAssert(tc,
              db!=0? db->error->message: "NULL",
              rc == 0);
     db->persist = 0;

     // crawled[p] links to linked[p][0] and linked[p][1]
     char *crawled[2]   = {"1", "2"};
     char *linked[2][2] = {{"a", "b"}, {"c", "d"}};
     for (int p=0; p<2; ++p) {
          CrawledPage *cp = crawled_page_new(crawled[p]);
          for (int k=0; k<2; ++k)
               crawled_page_add_link(cp, linked[p][k], 0);
          rc = page_db_add(db, cp, 0);
          CuAssert(tc,
                   db->error->message,
                   rc == 0);
          crawled_page_delete(cp);
     }

     HashInfoStream *stream;
     CuAssert(tc,
              db->error->message,
              hashinfo_stream_new(&stream, db) == 0);

     CuAssert(tc,
              "stream was not initialized",
              stream->state == stream_state_init);

     // every URL that must come out of the stream, with its hash
     char *expected_url[6] = {"1", "a", "b", "2", "c", "d"};
     uint64_t expected_hash[6];
     int seen[6] = {0};
     for (int k=0; k<6; ++k)
          expected_hash[k] = page_db_hash(expected_url[k]);

     for (int n=0; n<6; ++n) {
          uint64_t hash;
          PageInfo *pi;
          CuAssert(tc,
                   "stream element expected",
                   hashinfo_stream_next(stream, &hash, &pi) == stream_state_next);

          // no order is assumed: look the hash up among the expected ones
          int match = 0;
          for (int k=0; k<6; ++k) {
               if (hash != expected_hash[k])
                    continue;
               seen[k] = 1;
               CuAssertStrEquals(tc,
                                 expected_url[k],
                                 pi->url);
               match = 1;
          }
          CuAssert(tc, "unexpected page hash", match);
          page_info_delete(pi);
     }
     hashinfo_stream_delete(stream);

     for (int k=0; k<6; ++k)
          CuAssertTrue(tc, seen[k]);

     page_db_delete(db);
}
Ejemplo n.º 6
0
/* Stores two crawled pages with two links each and checks that a
 * HashIdxStream returns six (hash, index) pairs where the index is in 0..5
 * and the hash is the one expected for that index */
void
test_hashidx_stream(CuTest *tc) {
     printf("%s\n", __func__);
     char test_dir[] = "test-pagedb-XXXXXX";
     mkdtemp(test_dir);

     PageDB *db;
     int rc = page_db_new(&db, test_dir);
     CuAssert(tc,
              db!=0? db->error->message: "NULL",
              rc == 0);
     db->persist = 0;

     // crawled[p] links to linked[p][0] and linked[p][1]
     char *crawled[2]   = {"1", "2"};
     char *linked[2][2] = {{"a", "b"}, {"c", "d"}};
     for (int p=0; p<2; ++p) {
          CrawledPage *cp = crawled_page_new(crawled[p]);
          for (int k=0; k<2; ++k)
               crawled_page_add_link(cp, linked[p][k], 0);
          rc = page_db_add(db, cp, 0);
          CuAssert(tc,
                   db->error->message,
                   rc == 0);
          crawled_page_delete(cp);
     }

     HashIdxStream *stream;
     CuAssert(tc,
              db->error->message,
              hashidx_stream_new(&stream, db) == 0);

     CuAssert(tc,
              "stream was not initialized",
              stream->state == stream_state_init);

     // url_of_idx[k] is the URL expected to own index k
     char *url_of_idx[6] = {"1", "a", "b", "2", "c", "d"};
     uint64_t expected_hash[6];
     for (int k=0; k<6; ++k)
          expected_hash[k] = page_db_hash(url_of_idx[k]);

     for (int n=0; n<6; ++n) {
          uint64_t hash;
          size_t idx;
          CuAssert(tc,
                   "stream element expected",
                   hashidx_stream_next(stream, &hash, &idx) == stream_state_next);
          // guard the array access right below
          if (idx > 5)
               CuFail(tc, "unexpected index");
          CuAssert(tc,
                   "mismatch between index and hash",
                   hash == expected_hash[idx]);
     }
     hashidx_stream_delete(stream);

     page_db_delete(db);
}
Ejemplo n.º 7
0
/* Checks the accuracy of the PageRank computation.
 *
 * The same five page graph is ranked three times: without content scores,
 * with content scores and damping 0, and with content scores and damping 0.5.
 */
void
test_page_rank(CuTest *tc) {
     printf("%s\n", __func__);
     /* The graph whose PageRank is computed:
      *
      *        +-->2---+
      *        |   |   |
      *        |   v   v
      *        1-->5<--3
      *        ^   ^   |
      *        |   |   |
      *        +---4<--+
      *
      * Link matrix, L[i,j] = 1 meaning that 'i' links to 'j':
      *
      *        +-         -+
      *        | 0 1 0 0 1 |
      *        | 0 0 1 0 1 |
      *    L = | 0 0 0 1 1 |
      *        | 1 0 0 0 1 |
      *        | 0 0 0 0 0 |
      *        +-         -+
      *
      * Page "5" has no outbound links, so it is assumed to link to every
      * page:
      *
      *        +-         -+
      *        | 0 1 0 0 1 |
      *        | 0 0 1 0 1 |
      *    L = | 0 0 0 1 1 |
      *        | 1 0 0 0 1 |
      *        | 1 1 1 1 1 |
      *        +-         -+
      *
      * Out degree of each page:
      *
      *    deg = {2, 2, 2, 2, 5}
      *
      * Dividing each row by its out degree and transposing,
      *
      *    M[i, j] = L[j, i]/deg[j]
      *
      * gives:
      *
      *        +-                   -+
      *        | 0   0   0   0.5 0.2 |
      *        | 0.5 0   0   0   0.2 |
      *    M = | 0   0.5 0   0   0.2 |
      *        | 0   0   0.5 0   0.2 |
      *        | 0.5 0.5 0.5 0.5 0.2 |
      *        +-                   -+
      *
      * With damping 'd' the PageRank 'PR' satisfies:
      *
      *        1 - d
      *   PR = ----- + (d * M)*PR
      *          N
      *
      * and for d=0.85 the numerical solution is:
      *
      *   PR(1) = PR(2) = PR(3) = PR(4) = 0.15936255
      *   PR(5) = 0.3625498
      */
     char test_dir[] = "test-pagedb-XXXXXX";
     mkdtemp(test_dir);

     PageDB *db;
     int rc = page_db_new(&db, test_dir);
     CuAssert(tc,
              db!=0? db->error->message: "NULL",
              rc == 0);
     db->persist = 0;

     // adjacency lists of the graph; page "5" has none
     char *urls[5] = {"1", "2", "3", "4", "5" };
     LinkInfo out_1[] = {{"2", 0.1}, {"5", 0.1}};
     LinkInfo out_2[] = {{"3", 0.1}, {"5", 0.1}};
     LinkInfo out_3[] = {{"4", 0.1}, {"5", 0.1}};
     LinkInfo out_4[] = {{"1", 0.1}, {"5", 0.1}};
     LinkInfo *out[5] = {
          out_1, out_2, out_3, out_4, 0
     };
     int n_out[5] = {2, 2, 2, 2, 0};

     for (int p=0; p<5; ++p) {
          CrawledPage *cp = crawled_page_new(urls[p]);
          for (int k=0; k<n_out[p]; ++k)
               crawled_page_add_link(cp, out[p][k].url, out[p][k].score);
          cp->score = p/5.0;
          crawled_page_set_hash64(cp, p);

          PageInfoList *pil;
          rc = page_db_add(db, cp, &pil);
          CuAssert(tc,
                   db->error->message,
                   rc == 0);
          page_info_list_delete(pil);
          crawled_page_delete(cp);
     }

     PageDBLinkStream *st;
     PageRank *pr;
     uint64_t idx;
     float *score;

     // Without content scores
     // ------------------------------------------------------------------------
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     st->only_diff_domain = 0;

     rc = page_rank_new(&pr, test_dir, 5);
     CuAssert(tc,
              pr!=0? pr->error->message: "NULL",
              rc == 0);

     pr->precision = 1e-6;

     CuAssert(tc,
              pr->error->message,
              page_rank_compute(pr,
                                st,
                                page_db_link_stream_next,
                                page_db_link_stream_reset) == 0);
     page_db_link_stream_delete(st);

     // the solution worked out in the comment at the top
     float scores[5] =  {0.15936255,  0.15936255,  0.15936255,  0.15936255,  0.3625498};
     for (int p=0; p<5; ++p) {
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[p]), &idx) == 0);

          score = mmap_array_idx(pr->value1, idx);
          CuAssertPtrNotNull(tc, score);

          CuAssertDblEquals(tc, scores[p], *score, 1e-6);
     }
     CHECK_DELETE(tc, pr->error->message, page_rank_delete(pr));

     // With content scores, damping = 0
     // ------------------------------------------------------------------------
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     st->only_diff_domain = 0;

     rc = page_rank_new(&pr, test_dir, 5);
     CuAssert(tc,
              pr!=0? pr->error->message: "NULL",
              rc == 0);

     pr->precision = 1e-6;
     pr->damping = 0.0;

     CuAssert(tc,
              db->error->message,
              page_db_get_scores(db, &pr->scores) == 0);

     CuAssert(tc,
              pr->error->message,
              page_rank_compute(pr,
                                st,
                                page_db_link_stream_next,
                                page_db_link_stream_reset) == 0);

     page_db_link_stream_delete(st);

     // the expected rank of each page is its share of the total content
     // score; both arrays are addressed by database index
     float content[5];
     float total_score = 0.0;
     for (int k=0; k<5; ++k) {
          content[k] = *((float*)mmap_array_idx(pr->scores, k));
          total_score += content[k];
     }

     for (int p=0; p<5; ++p) {
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[p]), &idx) == 0);

          score = mmap_array_idx(pr->value1, idx);
          CuAssertPtrNotNull(tc, score);

          CuAssertDblEquals(tc, content[idx]/total_score, *score, 1e-6);
     }
     CHECK_DELETE(tc, pr->scores->error->message, mmap_array_delete(pr->scores));
     CHECK_DELETE(tc, pr->error->message, page_rank_delete(pr));

     // With content scores, damping = 0.5
     // ------------------------------------------------------------------------
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     st->only_diff_domain = 0;

     rc = page_rank_new(&pr, test_dir, 5);
     CuAssert(tc,
              pr!=0? pr->error->message: "NULL",
              rc == 0);

     pr->precision = 1e-6;
     pr->damping = 0.5;

     CuAssert(tc,
              db->error->message,
              page_db_get_scores(db, &pr->scores) == 0);

     CuAssert(tc,
              pr->error->message,
              page_rank_compute(pr,
                                st,
                                page_db_link_stream_next,
                                page_db_link_stream_reset) == 0);

     page_db_link_stream_delete(st);

     // precomputed ranks for pages "1" .. "5"
     float expected_pr[] = {
          0.06386554621848739,
          0.08739495798319329,
          0.1647058823529412,
          0.25546218487394956,
          0.4285714285714286
     };

     for (int p=0; p<5; ++p) {
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[p]), &idx) == 0);

          score = mmap_array_idx(pr->value1, idx);
          CuAssertPtrNotNull(tc, score);

          CuAssertDblEquals(tc, expected_pr[p], *score, 1e-6);
     }
     CHECK_DELETE(tc, pr->scores->error->message, mmap_array_delete(pr->scores));
     CHECK_DELETE(tc, pr->error->message, page_rank_delete(pr));

     page_db_delete(db);
}