/*
 * histogram - print a histogram of the wordlist at `bfp`, followed by the
 * hapax / single-class token totals.
 *
 * The datastore is opened read-only and walked inside a transaction with
 * ds_histogram_hook, which accumulates into a zero-initialised rhistogram_t.
 * The global message counts mgood/mbad are refreshed from the datastore
 * before the walk.
 *
 * Returns the result of ds_foreach() on success, or EX_ERROR if the
 * environment cannot be initialised, the datastore cannot be opened, or the
 * transaction cannot be started or committed.
 */
ex_t histogram(bfpath *bfp)
{
    ex_t rc;
    uint count;   /* NOTE(review): presumably read by the PCT() macro; keep the name */
    void *dsh, *dbe;
    dsv_t val;
    rhistogram_t hist;

    dbe = ds_init(bfp);
    if (dbe == NULL)
        return EX_ERROR;

    dsh = ds_open(dbe, bfp, DS_READ);
    if (dsh == NULL) {
        /* Release the environment acquired above; every other failure
         * path in this function does the same. */
        ds_cleanup(dbe);
        return EX_ERROR;
    }

    if (DST_OK != ds_txn_begin(dsh)) {
        ds_close(dsh);
        ds_cleanup(dbe);
        fprintf(stderr, "cannot begin transaction!\n");
        return EX_ERROR;
    }

    ds_get_msgcounts(dsh, &val);
    mgood = val.goodcount;
    mbad = val.spamcount;

    memset(&hist, 0, sizeof(hist));
    rc = ds_foreach(dsh, ds_histogram_hook, &hist);

    if (DST_OK != ds_txn_commit(dsh)) {
        ds_close(dsh);
        ds_cleanup(dbe);
        fprintf(stderr, "cannot commit transaction!\n");
        return EX_ERROR;
    }

    /* The datastore is no longer needed once the walk has been committed. */
    ds_close(dsh);
    ds_cleanup(dbe);

    count = print_histogram(&hist);

    /* NOTE(review): percentages are printed only when verbose is 0 --
     * confirm this polarity is intended. */
    if (verbose > 0) {
        printf("hapaxes: ham %7u, spam %7u\n", ham_hapax, spam_hapax);
        printf(" pure: ham %7u, spam %7u\n", ham_only, spam_only);
    } else {
        printf("hapaxes: ham %7u (%5.2f%%), spam %7u (%5.2f%%)\n",
               ham_hapax, PCT(ham_hapax), spam_hapax, PCT(spam_hapax));
        printf(" pure: ham %7u (%5.2f%%), spam %7u (%5.2f%%)\n",
               ham_only, PCT(ham_only), spam_only, PCT(spam_only));
    }

    return rc;
}
/*
 * ds_open_failure - report that the datastore at `bfp` could not be opened,
 * release the environment `dbe` (if any) and terminate the process.
 *
 * bfp: path descriptor whose `filepath` is named in the message.
 * dbe: environment handle to pass to ds_cleanup(); may be NULL.
 *
 * Does not return: always calls exit(EX_ERROR).
 */
static void ds_open_failure(bfpath *bfp, void *dbe)
{
    /* Capture errno before any other library call: the fprintf() below may
     * itself modify errno and would then hide the reason for the failure. */
    const int saved_errno = errno;

    fprintf(stderr, "Error accessing file or directory '%s'.\n", bfp->filepath);

    if (saved_errno != 0)
        fprintf(stderr, "error #%d - %s.\n", saved_errno, strerror(saved_errno));

    if (dbe != NULL)
        ds_cleanup(dbe);

    exit(EX_ERROR);
}
/*
 * dump_wordlist - run ds_dump_hook over the datastore at `bfp` (read-only).
 *
 * Resets the global token_count before the run.  On failure an error is
 * written to stderr; on success, and only when `verbose` is set, the value
 * of token_count is reported on dbgout.
 *
 * Returns the status produced by ds_oper().
 */
static ex_t dump_wordlist(bfpath *bfp)
{
    void *store_env;
    ex_t status;

    token_count = 0;

    store_env = ds_init(bfp);
    status = ds_oper(store_env, bfp, DS_READ, ds_dump_hook, NULL);
    ds_cleanup(store_env);

    if (status != EX_OK) {
        fprintf(stderr, "error dumping tokens!\n");
        return status;
    }

    if (verbose)
        fprintf(dbgout, "%d tokens dumped\n", token_count);

    return status;
}
int main(int argc, char **argv) /*@globals errno,stderr,stdout@*/ { ex_t exitcode = EX_OK; fBogotune = true; /* for rob_compute_spamicity() */ dbgout = stderr; progtype = build_progtype(progname, DB_TYPE); ham_files = filelist_new("ham"); spam_files = filelist_new("spam"); /* process args and read mailboxes */ process_arglist(argc, argv); /* directories from command line and config file are already handled */ if (ds_flag == DS_DSK) { bfpath *bfp; if (ds_path == NULL) ds_path = get_directory(PR_ENV_BOGO); if (ds_path == NULL) ds_path = get_directory(PR_ENV_HOME); if (ds_path == NULL) { fprintf(stderr, "Cannot derive bogofilter directory from environment, aborting.\n"); exit(EX_ERROR); } set_bogohome(ds_path); bfp = bfpath_create(ds_path); if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) { fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath); exit(EX_ERROR); } if (bfp->exists && bfp->isdir) { bfpath_free(bfp); ds_path = mxcat(ds_path, DIRSEP_S, WORDLIST, NULL); bfp = bfpath_create(ds_path); if (!bfpath_check_mode(bfp, BFP_MUST_EXIST)) { fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath); exit(EX_ERROR); } } env = ds_init(bfp); init_wordlist("word", ds_path, 0, WL_REGULAR); } bogotune_init(); if (ds_flag == DS_DSK) load_wordlist(load_hook, train); /* if encoding not yet set, assume old style */ if (encoding == E_UNKNOWN) encoding = E_RAW; if (bogolex_file != NULL) bogolex(); else bogotune(); bogotune_free(); if (ds_flag == DS_DSK) ds_cleanup(env); exit(exitcode); }
/*
 * display_words - print the spam/good counts (and optionally a probability)
 * stored in the datastore at `bfp` for a list of tokens.
 *
 * bfp:              datastore path; its filepath must be non-empty.
 * argc, argv:       tokens to look up.  When argc is 0 on entry, tokens are
 *                   read from stdin with get_token() until it returns
 *                   non-zero; otherwise exactly argc entries of argv are used.
 * show_probability: when true, an extra column computed by calc_prob() from
 *                   the token counts and the datastore message counts is
 *                   printed.
 *
 * Output goes to fpo.  Tokens for which ds_read() returns 1 are skipped
 * silently.
 *
 * Returns EX_OK on success, EX_ERROR if the path is empty, the transaction
 * cannot be started or finished, or a datastore read fails.  If the
 * datastore cannot be opened, ds_open_failure() terminates the process.
 */
static ex_t display_words(bfpath *bfp, int argc, char **argv, bool show_probability)
{
    byte buf[BUFSIZE];
    buff_t *buff;
    const byte *word;
    const char *path = bfp->filepath;
    const char *head_format = !show_probability ? "%-30s %6s %6s\n" : "%-30s %6s %6s %6s\n";
    const char *data_format = !show_probability ? "%-30s %6lu %6lu\n" : "%-30s %6lu %6lu %f\n";
    void *dsh = NULL;   /* initialize to silence bogus gcc warning */
    void *dbe;
    int rv = 0;         /* becomes non-zero once a datastore read has failed */
    int txn_rc;
    ex_t ec = EX_OK;
    dsv_t msgcnts;

    /* protect against broken stat(2) that succeeds for empty names */
    if (path == NULL || *path == '\0') {
        fprintf(stderr, "Expecting non-empty directory or file name.\n");
        return EX_ERROR;
    }

    /* Allocate the token buffer only after validation so that the early
     * return above has nothing to release. */
    buff = buff_new(buf, 0, BUFSIZE);

    dbe = ds_init(bfp);
    dsh = ds_open(dbe, bfp, DS_READ);
    if (dsh == NULL)
        /* print error, cleanup, and exit */
        ds_open_failure(bfp, dbe);

    if (DST_OK != ds_txn_begin(dsh)) {
        ds_close(dsh);
        ds_cleanup(dbe);
        buff_free(buff);
        fprintf(stderr, "Cannot begin transaction.\n");
        return EX_ERROR;
    }

    if (show_probability) {
        ds_get_msgcounts(dsh, &msgcnts);
        /* NOTE(review): robs/robx are presumably inputs to calc_prob(). */
        robs = ROBS;
        robx = ROBX;
    }

    /* The fourth heading is only consumed by the four-column format. */
    fprintf(fpo, head_format, "", "spam", "good", " Fisher");

    /* argc == 0 means "read tokens from stdin"; once the last argv entry
     * has been taken argc is forced to -1 so the loop ends. */
    while (argc >= 0) {
        dsv_t val;
        word_t *token;
        int rc;

        if (argc == 0) {
            if (get_token(buff, stdin) != 0)
                break;
            token = &buff->t;
        } else {
            word = (const byte *) *argv++;
            if (--argc == 0)
                argc = -1;
            token = word_news((const char *)word);
        }

        rc = ds_read(dsh, token, &val);
        switch (rc) {
        case 0: {
            unsigned long spam_count = val.spamcount;
            unsigned long good_count = val.goodcount;

            if (!show_probability) {
                fprintf(fpo, data_format, token->u.text, spam_count, good_count);
            } else {
                double rob_prob = calc_prob(good_count, spam_count,
                                            msgcnts.goodcount, msgcnts.spamcount);
                fprintf(fpo, data_format, token->u.text, spam_count, good_count, rob_prob);
            }
            break;
        }
        case 1:
            /* nothing is printed for this token */
            break;
        default:
            fprintf(stderr, "Cannot read from database.\n");
            ec = EX_ERROR;
            rv = 1;     /* abort, rather than commit, the transaction below */
            break;
        }

        /* Tokens built by word_news() are owned here; the stdin token
         * lives inside buff and must not be freed separately. */
        if (token != &buff->t)
            word_free(token);

        if (rv != 0)
            break;
    }

    txn_rc = (rv != 0) ? ds_txn_abort(dsh) : ds_txn_commit(dsh);
    if (txn_rc != DST_OK) {
        fprintf(stderr, "Cannot %s transaction.\n", rv ? "abort" : "commit");
        ec = EX_ERROR;
    }

    ds_close(dsh);
    ds_cleanup(dbe);
    buff_free(buff);

    return ec;
}
/*
 * load_wordlist - read token lines from fpin and merge them into the
 * datastore at `bfp`, inside a single transaction.
 *
 * Each line is parsed as a word followed by a spam count, a good count and
 * a date, using spanword()/atoi().  Lines shorter than four characters, or
 * whose word is longer than max_token_len (when that is non-zero), are
 * skipped.  Anything left after the date field is an error.  Counts read
 * from the line are added to whatever ds_read() returned for the token
 * before the record is written back.
 *
 * Returns 0 on success, 1 on a parse or datastore error, 2 on a read error
 * on fpin.  A non-zero result means the transaction was aborted.  The
 * process exits with EX_ERROR if the transaction cannot be started or the
 * commit fails; ds_open_failure() exits if the datastore cannot be opened.
 */
static int load_wordlist(bfpath *bfp)
{
    byte buf[BUFSIZE];
    /* NOTE(review): count[] is not referenced by name below; presumably
     * spamcount/goodcount resolve to its elements -- keep the declaration. */
    unsigned long count[IX_SIZE], date;
    unsigned long lineno = 0;
    int loaded = 0;
    int rv = 0;
    YYYYMMDD saved_today = today;
    void *dbe = ds_init(bfp);
    void *dsh = ds_open(dbe, bfp, (dbmode_t)(DS_WRITE | DS_LOAD));

    if (dsh == NULL)
        /* print error, cleanup, and exit */
        ds_open_failure(bfp, dbe);

    memset(buf, '\0', BUFSIZE);

    if (DST_OK != ds_txn_begin(dsh))
        exit(EX_ERROR);

    for (;;) {
        dsv_t entry;
        word_t *token;
        byte *cursor;
        size_t len;

        if (fgets((char *)buf, BUFSIZE, fpin) == NULL) {
            /* end of input, or a stream error */
            if (ferror(fpin)) {
                perror(progname);
                rv = 2;
            }
            break;
        }

        lineno++;

        /* too short. */
        if (strlen((char *)buf) < 4)
            continue;

        /* NOTE(review): spanword() presumably terminates the current field
         * in place and returns the start of the next one -- the length of
         * buf is re-measured after the call for that reason. */
        cursor = spanword(buf);
        len = strlen((const char *)buf);

        /* too long - discard */
        if (max_token_len != 0 && len > max_token_len)
            continue;

        spamcount = (uint) atoi((const char *)cursor);
        if ((int) spamcount < 0)
            spamcount = 0;
        cursor = spanword(cursor);

        goodcount = (uint) atoi((const char *)cursor);
        if ((int) goodcount < 0)
            goodcount = 0;
        cursor = spanword(cursor);

        date = (uint) atoi((const char *)cursor);
        cursor = spanword(cursor);

        if (*cursor != '\0') {
            fprintf(stderr,
                    "%s: Unexpected input [%s] on line %lu. "
                    "Expecting whitespace before count.\n",
                    progname, buf, lineno);
            rv = 1;
            break;
        }

        /* date as YYYYMMDD; a zero date falls back to the saved `today` */
        if (date == 0)
            date = saved_today;

        if (replace_nonascii_characters)
            do_replace_nonascii_characters(buf, len);

        token = word_new(buf, len);
        entry.goodcount = goodcount;
        entry.spamcount = spamcount;
        entry.date = date;

        if (is_count((const char *)buf) && !(maintain && discard_token(token, &entry))) {
            int read_rc;

            loaded += 1;

            /* Slower, but allows multiple lists to be concatenated */
            set_date(date);

            read_rc = ds_read(dsh, token, &entry);
            if (read_rc != 0 && read_rc != 1)
                rv = 1;

            entry.spamcount += spamcount;
            entry.goodcount += goodcount;

            if (ds_write(dsh, token, &entry))
                rv = 1;
        }

        word_free(token);
    }

    if (rv) {
        fprintf(stderr, "read or write error, aborting.\n");
        ds_txn_abort(dsh);
    } else {
        int commit_rc = ds_txn_commit(dsh);

        if (commit_rc == DST_FAILURE || commit_rc == DST_TEMPFAIL) {
            fprintf(stderr, "commit failed\n");
            exit(EX_ERROR);
        }
    }

    ds_close(dsh);
    ds_cleanup(dbe);

    if (verbose)
        fprintf(dbgout, "%d tokens loaded\n", loaded);

    return rv;
}