예제 #1
0
/* Builds a PageRank graph from the edge list read from F and computes the
   ranks seeded at the "-" node.

   Every line of F is handed to get_graph_line (); lines it accepts
   contribute one edge FROM -> TO.  An extra edge "-" -> "rillian" is
   always added before reading.  ARGC and ARGV are not used.

   The computed rank vector is released with g_free () and the function
   always returns 0.  */
int
run_page_rank (FILE *f, int argc, char **argv)
{
  PageRank *graph = page_rank_new ();
  /* The seed node is looked up before its outgoing edge is added.  */
  int seed_node = page_rank_find_node (graph, "-");
  char buf[128];
  double *ranks;

  page_rank_add_edge (graph, "-", "rillian");

  while (fgets (buf, sizeof buf, f) != NULL)
    {
      char *src;
      char *dst;
      char *color;	/* Filled in by get_graph_line (), not used here.  */

      if (!get_graph_line (buf, &src, &dst, &color))
	continue;
      page_rank_add_edge (graph, src, dst);
    }

  ranks = page_rank_compute (graph, seed_node, 0.15);
#if 0
  print_page_rank (graph, ranks);
#endif
  g_free (ranks);

  return 0;
}
예제 #2
0
/* Recomputes PageRank for the scorer passed in `state` (a PageRankScorer*).
 *
 * Steps:
 *   1. open a link stream over prs->page_db;
 *   2. if prs->use_content_scores is set, load the content scores into
 *      prs->page_rank->scores;
 *   3. run page_rank_compute() over the stream;
 *   4. release the content scores (if loaded) and the link stream.
 *
 * Returns 0 on success. On failure the error is recorded on `prs` (a short
 * description of the failing step plus the underlying message) and
 * prs->error->code is returned.
 *
 * Resources acquired here are released on every path: the content scores
 * are also deleted when page_rank_compute() fails, and the stream is only
 * deleted if it was actually created.
 */
int
page_rank_scorer_update(void *state) {
     PageRankScorer *prs = (PageRankScorer*)state;

     char *error1 = 0;
     char *error2 = 0;

     /* Set while prs->page_rank->scores holds an array loaded by this call
      * that still has to be released. */
     int scores_loaded = 0;

     PageDBLinkStream *st = 0;
     if (page_db_link_stream_new(&st, prs->page_db) != 0) {
          error1 = "creating link stream";
          error2 = st? "unknown": "NULL";
          goto on_error;
     }

     if (prs->use_content_scores) {
          if (page_db_get_scores(prs->page_db, &prs->page_rank->scores) != 0) {
               error1 = "retrieving content scores";
               error2 = prs->page_db->error->message;
               goto on_error;
          }
          scores_loaded = 1;
     }

     if (page_rank_compute(prs->page_rank,
                           st,
                           page_db_link_stream_next,
                           page_db_link_stream_reset) != 0) {
          error1 = "computing PageRank";
          error2 = prs->page_rank->error->message;
          goto on_error;
     }

     if (scores_loaded) {
          /* Clear the flag first: if the delete fails the array is left
           * alone so its error message can still be read, and the error
           * path must not try to delete it a second time. */
          scores_loaded = 0;
          if (mmap_array_delete(prs->page_rank->scores) != 0) {
               error1 = "deleting content scores";
               error2 = prs->page_rank->scores->error->message;
               goto on_error;
          }
          /* Do not leave a dangling pointer behind. */
          prs->page_rank->scores = 0;
     }

     page_db_link_stream_delete(st);
     return 0;
on_error:
     /* Best-effort release of the content scores; the primary error has
      * already been captured in error1/error2. */
     if (scores_loaded && mmap_array_delete(prs->page_rank->scores) == 0)
          prs->page_rank->scores = 0;

     /* The stream may never have been created. */
     if (st)
          page_db_link_stream_delete(st);

     page_rank_scorer_set_error(prs,  page_rank_scorer_error_internal, __func__);
     page_rank_scorer_add_error(prs, error1);
     page_rank_scorer_add_error(prs, error2);
     return prs->error->code;
}
예제 #3
0
/* Checks the accuracy of the PageRank computation.
 *
 * A five page graph is stored in a temporary PageDB and PageRank is computed
 * three times over it: without content scores, with content scores and
 * damping 0, and with content scores and damping 0.5. Each result is compared
 * against precomputed values with a tolerance of 1e-6.
 */
void
test_page_rank(CuTest *tc) {
     printf("%s\n", __func__);
     /* Compute the PageRank score of the following graph
      *        +-->2---+
      *        |   |   |
      *        |   v   v
      *        1-->5<--3
      *        ^   ^   |
      *        |   |   |
      *        +---4<--+
      *
      * The link matrix L[i,j], where L[i,j] = 1 means 'i' links to 'j' is:
      *
      *        +-         -+
      *        | 0 1 0 0 1 |
      *        | 0 0 1 0 1 |
      *    L = | 0 0 0 1 1 |
      *        | 1 0 0 0 1 |
      *        | 0 0 0 0 0 |
      *        +-         -+
      *
      * Since page "5" has no outbound links, it is assumed it links to every
      * page (itself included):
      *
      *        +-         -+
      *        | 0 1 0 0 1 |
      *        | 0 0 1 0 1 |
      *    L = | 0 0 0 1 1 |
      *        | 1 0 0 0 1 |
      *        | 1 1 1 1 1 |
      *        +-         -+
      *
      * The out degree is:
      *
      *    deg = {2, 2, 2, 2, 5}
      *
      * Dividing each row by the out degree and transposing:
      *
      *    M[i, j] = L[j, i]/deg[j]
      *
      * we get:
      *
      *        +-                   -+
      *        | 0   0   0   0.5 0.2 |
      *        | 0.5 0   0   0   0.2 |
      *    M = | 0   0.5 0   0   0.2 |
      *        | 0   0   0.5 0   0.2 |
      *        | 0.5 0.5 0.5 0.5 0.2 |
      *        +-                   -+
      *
      * If 'd' is the damping then the PageRank 'PR' is:
      *
      *        1 - d
      *   PR = ----- + (d * M)*PR
      *          N
      *
      * For d=0.85 the numerical solution is:
      *
      *   PR(1) = PR(2) = PR(3) = PR(4) = 0.15936255
      *   PR(5) = 0.3625498
      */
     char test_dir[] = "test-pagedb-XXXXXX";
     /* Without a directory nothing below can work, so fail right away. */
     char *tmp_dir = mkdtemp(test_dir);
     CuAssert(tc, "mkdtemp failed", tmp_dir != 0);

     /* Initialized so the message expression below is well defined even if
      * page_db_new fails without storing anything in 'db'. */
     PageDB *db = 0;
     int ret = page_db_new(&db, test_dir);
     CuAssert(tc,
              db!=0? db->error->message: "NULL",
              ret == 0);
     db->persist = 0;

     char *urls[5] = {"1", "2", "3", "4", "5" };
     LinkInfo links_1[] = {{"2", 0.1}, {"5", 0.1}};
     LinkInfo links_2[] = {{"3", 0.1}, {"5", 0.1}};
     LinkInfo links_3[] = {{"4", 0.1}, {"5", 0.1}};
     LinkInfo links_4[] = {{"1", 0.1}, {"5", 0.1}};
     LinkInfo *links[5] = {
          links_1, links_2, links_3, links_4, 0
     };

     int n_links[5] = {2, 2, 2, 2, 0};

     /* Store the five pages; page i gets content score i/5. */
     for (int i=0; i<5; ++i) {
          CrawledPage *cp = crawled_page_new(urls[i]);
          for (int j=0; j<n_links[i]; ++j)
               crawled_page_add_link(cp, links[i][j].url, links[i][j].score);
          cp->score = i/5.0;
          crawled_page_set_hash64(cp, i);
          PageInfoList *pil;
          CuAssert(tc,
                   db->error->message,
                   page_db_add(db, cp, &pil) == 0);
          page_info_list_delete(pil);
          crawled_page_delete(cp);
     }

     // Without content scores
     // ------------------------------------------------------------------------
     PageDBLinkStream *st;
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     st->only_diff_domain = 0;

     PageRank *pr = 0;
     ret = page_rank_new(&pr, test_dir, 5);
     CuAssert(tc,
              pr!=0? pr->error->message: "NULL",
              ret == 0);

     pr->precision = 1e-6;

     CuAssert(tc,
              pr->error->message,
              page_rank_compute(pr,
                                st,
                                page_db_link_stream_next,
                                page_db_link_stream_reset) == 0);
     page_db_link_stream_delete(st);

     uint64_t idx;
     float *score;

     float scores[5] =  {0.15936255,  0.15936255,  0.15936255,  0.15936255,  0.3625498};
     for (int i=0; i<5; ++i) {
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[i]), &idx) == 0);

          /* Assign outside the assertion macro so the result does not depend
           * on how the macro parenthesizes its argument. */
          score = mmap_array_idx(pr->value1, idx);
          CuAssertPtrNotNull(tc, score);

          CuAssertDblEquals(tc, scores[i], *score, 1e-6);
     }
     CHECK_DELETE(tc, pr->error->message, page_rank_delete(pr));

     // With content scores, damping = 0
     // ------------------------------------------------------------------------
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     st->only_diff_domain = 0;

     /* 'pr' was deleted above; reset it so a failing page_rank_new cannot
      * make the message expression dereference a dangling pointer. */
     pr = 0;
     ret = page_rank_new(&pr, test_dir, 5);
     CuAssert(tc,
              pr!=0? pr->error->message: "NULL",
              ret == 0);

     pr->precision = 1e-6;
     pr->damping = 0.0;

     CuAssert(tc,
              db->error->message,
              page_db_get_scores(db, &pr->scores) == 0);

     CuAssert(tc,
              pr->error->message,
              page_rank_compute(pr,
                                st,
                                page_db_link_stream_next,
                                page_db_link_stream_reset) == 0);

     page_db_link_stream_delete(st);

     /* Read back the content scores; the expected rank of each page is its
      * content score divided by the sum of all content scores. */
     float total_score = 0.0;
     for (int i=0; i<5; ++i) {
          score = mmap_array_idx(pr->scores, i);
          CuAssertPtrNotNull(tc, score);
          scores[i] = *score;
          total_score += scores[i];
     }

     for (int i=0; i<5; ++i) {
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[i]), &idx) == 0);

          score = mmap_array_idx(pr->value1, idx);
          CuAssertPtrNotNull(tc, score);

          CuAssertDblEquals(tc, scores[idx]/total_score, *score, 1e-6);
     }
     CHECK_DELETE(tc, pr->scores->error->message, mmap_array_delete(pr->scores));
     CHECK_DELETE(tc, pr->error->message, page_rank_delete(pr));

     // With content scores, damping = 0.5
     // ------------------------------------------------------------------------
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     st->only_diff_domain = 0;

     pr = 0;
     ret = page_rank_new(&pr, test_dir, 5);
     CuAssert(tc,
              pr!=0? pr->error->message: "NULL",
              ret == 0);

     pr->precision = 1e-6;
     pr->damping = 0.5;

     CuAssert(tc,
              db->error->message,
              page_db_get_scores(db, &pr->scores) == 0);

     CuAssert(tc,
              pr->error->message,
              page_rank_compute(pr,
                                st,
                                page_db_link_stream_next,
                                page_db_link_stream_reset) == 0);

     page_db_link_stream_delete(st);

     /* Precomputed reference values for this graph and these content scores. */
     float expected_pr[] = {
          0.06386554621848739,
          0.08739495798319329,
          0.1647058823529412,
          0.25546218487394956,
          0.4285714285714286
     };

     for (int i=0; i<5; ++i) {
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[i]), &idx) == 0);

          score = mmap_array_idx(pr->value1, idx);
          CuAssertPtrNotNull(tc, score);

          CuAssertDblEquals(tc, expected_pr[i], *score, 1e-6);
     }
     CHECK_DELETE(tc, pr->scores->error->message, mmap_array_delete(pr->scores));
     CHECK_DELETE(tc, pr->error->message, page_rank_delete(pr));

     page_db_delete(db);
}