/**
 * Close the database and release every resource owned by @p db.
 *
 * The LMDB environment is closed first and the transaction manager is
 * deleted next. When the database is not persistent, the data file, the lock
 * file and the directory at db->path are removed from disk on a best-effort
 * basis: failures to remove them are ignored.
 *
 * @param db Database to delete. A NULL pointer is accepted and ignored.
 *
 * @return 0 on success. If deleting the transaction manager fails, the error
 *         is recorded in @p db and its code is returned; in that case the
 *         environment has already been closed and @p db itself is NOT freed.
 */
PageDBError
page_db_delete(PageDB *db) {
    if (!db)
        return 0;

    mdb_env_close(db->txn_manager->env);
    if (txn_manager_delete(db->txn_manager) != 0) {
        page_db_set_error(db, page_db_error_internal, __func__);
        page_db_add_error(db, "deleting transaction manager");
        page_db_add_error(db, db->txn_manager->error->message);
        return db->error->code;
    }

    if (!db->persist) {
        char *data = build_path(db->path, "data.mdb");
        char *lock = build_path(db->path, "lock.mdb");

        // Proceed even if the data files cannot be deleted from disk.
        // A path that could not be built is skipped instead of being
        // handed to remove() as a null pointer.
        if (data)
            (void)remove(data);
        if (lock)
            (void)remove(lock);
        (void)remove(db->path);

        free(data);
        free(lock);
    }

    free(db->path);
    domain_temp_delete(db->domain_temp);
    error_delete(db->error);
    free(db);

    return 0;
}
/**
 * Delete a PageRank scorer together with the PageRank object it holds.
 *
 * @param prs Scorer to delete. A NULL pointer is accepted and ignored, the
 *            same way page_db_delete() treats a NULL database.
 *
 * @return 0 on success. If page_rank_delete() fails, the error is recorded in
 *         @p prs and its code is returned; @p prs is NOT freed in that case.
 */
PageRankScorerError
page_rank_scorer_delete(PageRankScorer *prs) {
    if (!prs)
        return 0;

    if (page_rank_delete(prs->page_rank) != 0) {
        page_rank_scorer_set_error(prs, page_rank_scorer_error_internal, __func__);
        page_rank_scorer_add_error(prs, "deleting PageRank");
        page_rank_scorer_add_error(
            prs,
            prs->page_rank ? prs->page_rank->error->message : "unknown error");
        return prs->error->code;
    }

    error_delete(prs->error);
    free(prs);
    return 0;
}
/**
 * Tear down the error bookkeeping kept under local_error_key.
 *
 * Deletes the stored error context when one exists, clears error_is_valid
 * and finally destroys the storage key. The result of destroying the key is
 * only checked through assert().
 *
 * NOTE(review): FINI presumably registers this function to run at
 * unload/exit time -- confirm against the macro definition.
 */
FINI static void
_error_fini() {
    struct local_error *stored =
        (struct local_error *)local_storage_get(&local_error_key);
    if (stored)
        error_delete(stored);

    error_is_valid = false;

    UNUSED const bool ret = local_storage_destroy(&local_error_key);
    assert(ret);
}
FreqSchedulerError freq_scheduler_cursor_write(FreqScheduler *sch, MDB_cursor *cursor, uint64_t hash, float freq) { if (freq <= 0) return 0; ScheduleKey sk = { .score = 0, .hash = hash }; MDB_val key = { .mv_size = sizeof(sk), .mv_data = &sk, }; MDB_val val = { .mv_size = sizeof(float), .mv_data = &freq, }; int mdb_rc; if ((mdb_rc = mdb_cursor_put(cursor, &key, &val, 0)) != 0) { freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, "adding page to schedule"); freq_scheduler_add_error(sch, mdb_strerror(mdb_rc)); } return sch->error->code; } FreqSchedulerError freq_scheduler_load_simple(FreqScheduler *sch, float freq_default, float freq_scale) { char *error1 = 0; char *error2 = 0; MDB_cursor *cursor = 0; HashInfoStream *st; if (hashinfo_stream_new(&st, sch->page_db) != 0) { error1 = "creating stream"; error2 = st? sch->page_db->error->message: "NULL"; goto on_error; } if (freq_scheduler_cursor_open(sch, &cursor) != 0) goto on_error; StreamState ss; uint64_t hash; PageInfo *pi; while ((ss = hashinfo_stream_next(st, &hash, &pi)) == stream_state_next) { if ((pi->n_crawls > 0) && ((sch->max_n_crawls == 0) || (pi->n_crawls < sch->max_n_crawls)) && !page_info_is_seed(pi)){ float freq = freq_default; if (freq_scale > 0) { float rate = page_info_rate(pi); if (rate > 0) { freq = freq_scale * rate; } } if (freq_scheduler_cursor_write(sch, cursor, hash, freq) != 0) goto on_error; } page_info_delete(pi); } if (ss != stream_state_end) { error1 = "incorrect stream state"; error2 = 0; hashinfo_stream_delete(st); goto on_error; } hashinfo_stream_delete(st); if (freq_scheduler_cursor_commit(sch, cursor) != 0) goto on_error; return sch->error->code; on_error: freq_scheduler_cursor_abort(sch, cursor); freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, error1); freq_scheduler_add_error(sch, error2); return sch->error->code; } FreqSchedulerError freq_scheduler_load_mmap(FreqScheduler *sch, 
MMapArray *freqs) { char *error1 = 0; char *error2 = 0; MDB_cursor *cursor = 0; if (txn_manager_expand( sch->txn_manager, 2*freqs->n_elements*freqs->element_size) != 0) { error1 = "resizing database"; error2 = sch->txn_manager->error->message; goto on_error; } if (freq_scheduler_cursor_open(sch, &cursor) != 0) goto on_error; for (size_t i=0; i<freqs->n_elements; ++i) { PageFreq *f = mmap_array_idx(freqs, i); ScheduleKey sk = { .score = 1.0/f->freq, .hash = f->hash }; MDB_val key = { .mv_size = sizeof(sk), .mv_data = &sk, }; MDB_val val = { .mv_size = sizeof(float), .mv_data = &f->freq, }; int mdb_rc; if ((mdb_rc = mdb_cursor_put(cursor, &key, &val, 0)) != 0) { error1 = "adding page to schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } } if (freq_scheduler_cursor_commit(sch, cursor) != 0) goto on_error; return sch->error->code; on_error: freq_scheduler_cursor_abort(sch, cursor); freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, error1); freq_scheduler_add_error(sch, error2); return sch->error->code; } FreqSchedulerError freq_scheduler_request(FreqScheduler *sch, size_t max_requests, PageRequest **request) { char *error1 = 0; char *error2 = 0; MDB_cursor *cursor = 0; if (freq_scheduler_cursor_open(sch, &cursor) != 0) goto on_error; PageRequest *req = *request = page_request_new(max_requests); if (!req) { error1 = "allocating memory"; goto on_error; } int interrupt_requests = 0; while ((req->n_urls < max_requests) && !interrupt_requests) { MDB_val key; MDB_val val; ScheduleKey sk; float freq; int mdb_rc; int crawl = 0; switch (mdb_rc = mdb_cursor_get(cursor, &key, &val, MDB_FIRST)) { case 0: // copy data before deleting cursor sk = *(ScheduleKey*)key.mv_data; freq = *(float*)val.mv_data; PageInfo *pi = 0; if (page_db_get_info(sch->page_db, sk.hash, &pi) != 0) { error1 = "retrieving PageInfo from PageDB"; error2 = sch->page_db->error->message; goto on_error; } if (pi) { if (sch->margin >= 0) { double elapsed = 
difftime(time(0), 0) - pi->last_crawl; if (elapsed < 1.0/(freq*(1.0 + sch->margin))) interrupt_requests = 1; } crawl = (sch->max_n_crawls == 0) || (pi->n_crawls < sch->max_n_crawls); } if (!interrupt_requests) { if ((mdb_rc = mdb_cursor_del(cursor, 0)) != 0) { error1 = "deleting head of schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } if (crawl) { if (page_request_add_url(req, pi->url) != 0) { error1 = "adding url to request"; goto on_error; } sk.score += 1.0/freq; val.mv_data = &freq; key.mv_data = &sk; if ((mdb_rc = mdb_cursor_put(cursor, &key, &val, 0)) != 0) { error1 = "moving element inside schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } } } page_info_delete(pi); break; case MDB_NOTFOUND: // no more pages left interrupt_requests = 1; break; default: error1 = "getting head of schedule"; error2 = mdb_strerror(mdb_rc); goto on_error; } } if (freq_scheduler_cursor_commit(sch, cursor) != 0) goto on_error; return sch->error->code; on_error: freq_scheduler_cursor_abort(sch, cursor); freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, error1); freq_scheduler_add_error(sch, error2); return sch->error->code; } FreqSchedulerError freq_scheduler_add(FreqScheduler *sch, const CrawledPage *page) { if (page_db_add(sch->page_db, page, 0) != 0) { freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, "adding crawled page"); freq_scheduler_add_error(sch, sch->page_db->error->message); } return sch->error->code; } void freq_scheduler_delete(FreqScheduler *sch) { mdb_env_close(sch->txn_manager->env); (void)txn_manager_delete(sch->txn_manager); if (!sch->persist) { char *data = build_path(sch->path, "data.mdb"); char *lock = build_path(sch->path, "lock.mdb"); remove(data); remove(lock); free(data); free(lock); remove(sch->path); } free(sch->path); error_delete(sch->error); free(sch); } FreqSchedulerError freq_scheduler_dump(FreqScheduler *sch, FILE *output) { 
MDB_cursor *cursor; if (freq_scheduler_cursor_open(sch, &cursor) != 0) return sch->error->code; int end = 0; MDB_cursor_op cursor_op = MDB_FIRST; do { int mdb_rc; MDB_val key; MDB_val val; ScheduleKey *key_data; float *val_data; switch (mdb_rc = mdb_cursor_get(cursor, &key, &val, cursor_op)) { case 0: key_data = (ScheduleKey*)key.mv_data; val_data = (float*)val.mv_data; fprintf(output, "%.2e %016"PRIx64" %.2e\n", key_data->score, key_data->hash, *val_data); break; case MDB_NOTFOUND: end = 1; break; default: freq_scheduler_set_error(sch, freq_scheduler_error_internal, __func__); freq_scheduler_add_error(sch, "iterating over database"); freq_scheduler_add_error(sch, mdb_strerror(mdb_rc)); end = 1; break; } cursor_op = MDB_NEXT; } while (!end); freq_scheduler_cursor_abort(sch, cursor); return sch->error->code; }