예제 #1
0
파일: test_hits.c 프로젝트: okey/aduana
/* Checks the accuracy of the HITS computation.
 *
 * Stores a five-page graph in a PageDB created inside a fresh temporary
 * directory, runs hits_compute() over the database link stream and compares
 * the resulting h1/a1 arrays with the expected values hard-coded below,
 * using an absolute tolerance of 1e-6.
 *
 * tc: CuTest test case through which every assertion is reported.
 */
void
test_hits(CuTest *tc) {
     /* Compute the HITS score of the following graph
      *        +-->2---+
      *        |   |   |
      *        |   v   v
      *        1-->5<--3
      *        ^   ^   |
      *        |   |   |
      *        +---4<--+
      *
      * The link matrix L[i,j], where L[i,j] = 1 means 'i' links to 'j' is:
      *
      *        +-         -+
      *        | 0 1 0 0 1 |
      *        | 0 0 1 0 1 |
      *    L = | 0 0 0 1 1 |
      *        | 1 0 0 0 1 |
      *        | 0 0 0 0 0 |
      *        +-         -+
      */
     char test_dir[] = "test-pagedb-XXXXXX";
     /* mkdtemp() returns NULL on failure; stop here instead of handing an
      * unusable directory name to page_db_new() and hits_new(). */
     CuAssert(tc,
              "mkdtemp could not create the test directory",
              mkdtemp(test_dir) != NULL);

     /* Start from NULL so the message expression below is well defined even
      * if page_db_new() fails before storing a handle. */
     PageDB *db = NULL;
     int ret = page_db_new(&db, test_dir);
     CuAssert(tc,
              db != NULL? db->error->message: "NULL",
              ret == 0);
     /* NOTE(review): presumably keeps the database files from outliving the
      * test -- confirm against the PageDB implementation. */
     db->persist = 0;

     /* One row per page: its URL, its outgoing links and how many there are.
      * Page "5" has no outgoing links (last row of L is all zeros). */
     char *urls[5] = {"1", "2", "3", "4", "5" };
     LinkInfo links_1[] = {{"2", 0.1}, {"5", 0.1}};
     LinkInfo links_2[] = {{"3", 0.1}, {"5", 0.1}};
     LinkInfo links_3[] = {{"4", 0.1}, {"5", 0.1}};
     LinkInfo links_4[] = {{"1", 0.1}, {"5", 0.1}};
     LinkInfo *links[5] = {
          links_1, links_2, links_3, links_4, NULL
     };
     int n_links[5] = {2, 2, 2, 2, 0};

     /* Add every page, together with its links, to the database. */
     for (int i=0; i<5; ++i) {
          CrawledPage *cp = crawled_page_new(urls[i]);
          for (int j=0; j<n_links[i]; ++j)
               crawled_page_add_link(cp, links[i][j].url, links[i][j].score);
          cp->score = i/5.0;
          crawled_page_set_hash64(cp, i);

          PageInfoList *pil;
          CuAssert(tc,
                   db->error->message,
                   page_db_add(db, cp, &pil) == 0);
          page_info_list_delete(pil);
          crawled_page_delete(cp);
     }

     PageDBLinkStream *st;
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     /* NOTE(review): presumably makes the stream also yield links between
      * pages of the same domain -- confirm in PageDBLinkStream. */
     st->only_diff_domain = 0;

     /* Same NULL-initialisation rationale as for 'db' above. */
     Hits *hits = NULL;
     ret = hits_new(&hits, test_dir, 5);
     CuAssert(tc,
              hits != NULL? hits->error->message: "NULL",
              ret == 0);

     /* Ask for a precision tighter than the 1e-6 tolerance checked below. */
     hits->precision = 1e-8;
     CuAssert(tc,
              hits->error->message,
              hits_compute(hits,
                           st,
                           page_db_link_stream_next,
                           page_db_link_stream_reset) == 0);
     page_db_link_stream_delete(st);

     /* Expected contents of hits->h1 and hits->a1 for pages "1".."5"
      * (presumably the hub and authority scores, in HITS terminology). */
     uint64_t idx;
     float *h_score;
     float *a_score;
     float h_scores[5] = {0.250, 0.250, 0.250, 0.250, 0.000};
     float a_scores[5] = {0.125, 0.125, 0.125, 0.125, 0.500};

     for (int i=0; i<5; ++i) {
          /* Translate the URL into the index used by the score arrays. */
          CuAssert(tc,
                   db->error->message,
                   page_db_get_idx(db, page_db_hash(urls[i]), &idx) == 0);

          CuAssertPtrNotNull(tc,
                             h_score = mmap_array_idx(hits->h1, idx));
          CuAssertPtrNotNull(tc,
                             a_score = mmap_array_idx(hits->a1, idx));

          CuAssertDblEquals(tc, h_scores[i], *h_score, 1e-6);
          CuAssertDblEquals(tc, a_scores[i], *a_score, 1e-6);
     }
     CuAssert(tc,
              hits->error->message,
              hits_delete(hits) == 0);

     page_db_delete(db);
}
예제 #2
0
파일: test_pagedb.c 프로젝트: plafl/aduana
/* Simulates a crawl of test_n_pages pages and runs HITS over the result.
 *
 * A sliding window of n_links + 1 URLs is maintained: on every iteration the
 * oldest URL is dropped, a new one is appended, and the first URL of the
 * window is added to the PageDB as a crawled page linking to all the others.
 * Throughput is printed every 10000 pages. Afterwards hits_compute() is run
 * on the database link stream; a hits_error_precision result is tolerated.
 *
 * tc: CuTest test case through which every assertion is reported.
 */
static void
test_page_db_crawl(CuTest *tc) {
     printf("%s\n", __func__);

     char test_dir[] = "test-pagedb-XXXXXX";
     /* mkdtemp() returns NULL on failure; stop here instead of handing an
      * unusable directory name to page_db_new() and hits_new(). */
     CuAssert(tc,
              "mkdtemp could not create the test directory",
              mkdtemp(test_dir) != NULL);

     /* Start from NULL so the message expression below is well defined even
      * if page_db_new() fails before storing a handle. */
     PageDB *db = NULL;
     int ret = page_db_new(&db, test_dir);
     CuAssert(tc,
              db != NULL? db->error->message: "NULL",
              ret == 0);
     page_db_set_persist(db, 0);
     page_db_set_domain_temp(db, 20, 60.0);

     const size_t n_links = 10;
     /* Large enough for either URL format below, even with two 20-digit
      * size_t values. */
     enum { url_buf_size = 100 };

     /* Initial window: links[0] is the next page to crawl, the rest are the
      * links it will point to. */
     LinkInfo links[n_links + 1];
     for (size_t j=0; j<=n_links; ++j) {
          links[j].url = malloc(url_buf_size);
          /* Writing through a NULL pointer would be undefined behaviour. */
          CuAssert(tc, "out of memory allocating test URL", links[j].url != NULL);
          snprintf(links[j].url, url_buf_size,
                   "http://test_domain_%zu.org/test_url_%zu", j%100, j);
          links[j].score = j;
     }
     clock_t start = clock();
     for (size_t i=1; i<=test_n_pages; ++i) {
          /* Periodic progress report with the average insertion rate. */
          if (i % 10000 == 0) {
               double delta = (double)(clock() - start)/(double)CLOCKS_PER_SEC;
               if (delta > 0) {
                    printf("%10zuK/%zuK: %.0f pages/sec\n",
                           i/1000, test_n_pages/1000, ((double)i)/delta);
               }
          }
          /* Slide the window: drop the oldest URL, shift the rest down by
           * one and append a freshly generated URL at the end. */
          free(links[0].url);
          for (size_t j=0; j<n_links; ++j)
               links[j] = links[j+1];
          links[n_links].url = malloc(url_buf_size);
          CuAssert(tc, "out of memory allocating test URL", links[n_links].url != NULL);
          snprintf(links[n_links].url, url_buf_size, "test_url_%zu", i + n_links);
          links[n_links].score = i;

          /* The first URL of the window is the crawled page; every other
           * URL in the window becomes one of its links. */
          CrawledPage *cp = crawled_page_new(links[0].url);
          for (size_t j=1; j<=n_links; ++j)
               crawled_page_add_link(cp, links[j].url, 0.5);

          PageInfoList *pil;
          CuAssert(tc,
                   db->error->message,
                   page_db_add(db, cp, &pil) == 0);
          page_info_list_delete(pil);
          crawled_page_delete(cp);
     }
     /* Release the URLs still held by the window. */
     for (size_t j=0; j<=n_links; ++j)
          free(links[j].url);

     PageDBLinkStream *st;
     CuAssert(tc,
              db->error->message,
              page_db_link_stream_new(&st, db) == 0);
     /* NOTE(review): presumably makes the stream also yield links between
      * pages of the same domain -- confirm in PageDBLinkStream. */
     st->only_diff_domain = 0;

     /* Same NULL-initialisation rationale as for 'db' above. */
     Hits *hits = NULL;
     ret = hits_new(&hits, test_dir, test_n_pages);
     CuAssert(tc,
              hits != NULL? hits->error->message: "NULL",
              ret == 0);

     hits->precision = 1e-3;
     HitsError hits_err = hits_compute(hits,
                                       st,
                                       page_db_link_stream_next,
                                       page_db_link_stream_reset);
     /* Not reaching the requested precision is not treated as a failure
      * in this test. */
     if (hits_err == hits_error_precision)
          hits_err = 0;

     CuAssert(tc, hits->error->message, hits_err == 0);

     CHECK_DELETE(tc, hits->error->message, hits_delete(hits));

     page_db_link_stream_delete(st);

     page_db_delete(db);
}