コード例 #1
0
static rc_t bogotune(void)
{
    bool skip;
    result_t *best;

    int beg, end;
    uint cnt, scan;
    rc_t status = RC_OK;

    beg = time(NULL);

    ham_cutoff = 0.0;
    spam_cutoff = 0.1;

    /* Note: memory usage highest while reading messages */
    /* usage decreases as distribute() converts to count format */

    /* read all messages, merge training sets, look up scoring sets */
    ns_cnt = filelist_read(REG_GOOD, ham_files);
    sp_cnt = filelist_read(REG_SPAM, spam_files);
    cnt = ns_cnt + sp_cnt;

    end = time(NULL);
    if (verbose >= TIME) {
	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
    }

    distribute(REG_GOOD, ns_msglists);
    distribute(REG_SPAM, sp_msglists);

    create_countlists(ns_msglists);
    create_countlists(sp_msglists);

    if (verbose >= TIME && time(NULL) - end > 2) {
	end = time(NULL);
	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
    }

    if (verbose > PARMS+1) {
	tunelist_print(ns_and_sp);
	tunelist_print(ns_msglists);
	tunelist_print(sp_msglists);
    }

    ns_cnt = count_messages(ns_msglists);
    sp_cnt = count_messages(sp_msglists);

    if (ds_flag == DS_DSK && !check_msg_counts())
	exit(exit_zero ? EX_OK : EX_ERROR);

    fflush(stdout);

    check_percent = CHECK_PCT;	/* for checking low scoring spam
				** and high scoring non-spam */

    ns_scores = xcalloc(ns_cnt, sizeof(double));
    sp_scores = xcalloc(sp_cnt, sizeof(double));

    robs = DEFAULT_ROBS;
    robx = DEFAULT_ROBX;
    min_dev = DEFAULT_MIN_DEV;

    if (check_for_high_ns_scores() | check_for_low_sp_scores())
	scoring_error();

    /*
    ** 5.  Calculate x and cache size
    ** Calculate x with bogoutil's -r option (a new addition).
    ** Bound the calculated value within [0.4, 0.6] and set the range to be
    ** investigated to [x-0.1, x+0.1].
    */

    robx = get_robx();
    if (ds_flag == DS_DSK) {
	db_cachesize = calc_db_cachesize();
	printf("Recommended db cache size is %u MB\n", db_cachesize);
    }

    /*
    ** 6.  Calculate fp target
    ** The fp target will be derived thus: score non-spams with s and md as
    ** shipped, and determine the count that will result from a spam cutoff
    ** of 0.95; if that is < 0.25%, try 0.9375 etc.
    */

    min_dev = 0.02;

    /* set target and spam_cutoff */

    if (coerced_target == 0)
	set_thresh(ns_cnt, ns_scores);
    else {
	/* if coerced target ... */
	target = coerced_target;
	spam_cutoff = ns_scores[target-1];
    }

    skip = ROUND(spam_cutoff,100000) < SCAN_CUTOFF;
    printf("False-positive target is %u (cutoff %8.6f)\n", target, spam_cutoff);

#ifdef	TEST
    if (test) {
	printf("m: %8.6f, s: %8.6f, x: %0.16f\n", min_dev, robs, robx);
	if (verbose < PARMS)
	    print_ns_scores(target-2, target+2, 0);
    }
#endif

    if (!esf_flag && (sp_esf < 1.0 || ns_esf < 1.0))
	fprintf(stderr, "Warning:  Using ESF values (sp=%8.6f, ns=%8.6f) from config file.\n", sp_esf, ns_esf);

    /* No longer needed */
    wordhash_free(train);
    train = NULL;

    for (scan=0; scan <= 1 && !skip; scan ++) {
	uint r_count;
	uint rsi, rxi, mdi, spi, nsi;
	result_t *results, *r, *sorted;

	printf("Performing %s scan:\n", scan==0 ? "coarse" : "fine");

	switch (scan) {
	case 0:		/* COARSE */
	    /*
	    ** 7.  Coarsely scan s, md and x
	    ** The coarse s scan will range from 1 to 0.01 in half decades, and the
	    ** coarse md scan will range from 0.05 to 0.45 in steps of 0.05.  The
	    ** coarse x scan will use steps of 0.05. The trough must be surrounded on
	    ** six sides by values below the 33% quantile (unless bounded on one or
	    ** more sides).
	    */
	    init_coarse(robx);
	    break;
	case 1:		/* FINE */
	    /*
	    ** 8.  Finely scan the peak region
	    ** The fine s scan will range over the estimated s +/- half a decade in
	    ** steps of a quarter decade, and the fine md scan will range over the
	    ** estimated md +/- 0.075 in steps of 0.015.  The fine x scan will range
	    ** over the estimated x +/- 0.04 in steps of 0.02.  Scans of s and md
	    ** are bounded by the limits of the coarse scan.  Again, the trough must
	    ** be surrounded on six sides by values below the 33% quantile.  If no
	    ** such trough exists, a warning is given.
	    */
	    init_fine(robs, min_dev, robx, spex, nsex);
	    break;
	}

	r_count = rsval->cnt * mdval->cnt * rxval->cnt * spexp->cnt * nsexp->cnt;
	results = (result_t *) xcalloc(r_count, sizeof(result_t));

	print_all_parms(r_count);

	if (verbose >= SUMMARY) {
	    if (verbose >= SUMMARY+1)
		printf("%3s ", "cnt");
	    if (verbose >= SUMMARY+2)
		printf(" %s %s %s      ", "s", "m", "x");
	    printf(" %4s %5s   %4s %8s %8s %7s %3s %3s\n",
		   "rs", "md", "rx", "spesf", "nsesf", "cutoff", "fp", "fn");
	}

	cnt = 0;
	beg = time(NULL);
	for (rsi = 0; rsi < rsval->cnt; rsi++) {
	  robs = rsval->data[rsi];
	  for (mdi = 0; mdi < mdval->cnt; mdi++) {
	    min_dev = mdval->data[mdi];
	    for (rxi = 0; rxi < rxval->cnt; rxi++) {
	      robx = rxval->data[rxi];
	      for (spi = 0; spi < spexp->cnt; spi++) {
		spex = spexp->data[spi];
		sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
		for (nsi = 0; nsi < nsexp->cnt; nsi++) {
		    uint fp, fn;
		    nsex = nsexp->data[nsi];
		    ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));

		    /* save parms */
		    r = &results[cnt++];
		    r->idx = cnt;
		    r->rsi = rsi; r->rs = robs;
		    r->rxi = rxi; r->rx = robx;
		    r->mdi = mdi; r->md = min_dev;
		    r->spi = spi; r->sp_exp = spex;
		    r->nsi = nsi; r->ns_exp = nsex;

		    if (verbose >= SUMMARY) {
			if (verbose >= SUMMARY+1)
			    printf("%3u ", cnt);
			if (verbose >= SUMMARY+2)
			    printf(" %u %u %u %u %u  ",
				rsi, mdi, rxi, spi, nsi);
			printf("%6.4f %5.3f %5.3f %8.6f %8.6f",
			    robs, min_dev, robx, sp_esf, ns_esf);
			fflush(stdout);
		    }

		    spam_cutoff = 0.01;
		    score_ns(ns_scores);	/* scores in descending order */

		    /* Determine spam_cutoff and false_pos */
		    for (fp = target; fp < ns_cnt; fp += 1) {
			spam_cutoff = ns_scores[fp-1];
			if (spam_cutoff < 0.999999)
			    break;
			if (coerced_target != 0)
			    break;
		    }
		    if (ns_cnt < fp)
			fprintf(stderr,
				"Too few false positives to determine a valid cutoff\n");

		    score_sp(sp_scores);	/* scores in ascending order */
		    fn = get_fn_count(sp_cnt, sp_scores);

		    /* save results */
		    r->co = spam_cutoff;
		    r->fp = fp;
		    r->fn = fn;

		    if (verbose < SUMMARY)
			progress(cnt, r_count);
		    else {
			printf(" %8.6f %2u %3u\n", spam_cutoff, fp, fn);
			fflush(stdout);
		    }

#ifdef	TEST
		    if (test && spam_cutoff < 0.501) {
			printf("co: %0.16f\n", spam_cutoff);
			print_ns_scores(0, fp, 2);
			print_sp_scores(fn-10, fn, 10);
		    }
#endif
		    if (fMakeCheck && cnt >= cMakeCheck)
			break;
		}
		if (fMakeCheck && cnt >= cMakeCheck)
		    break;
	      }
	      if (fMakeCheck && cnt >= cMakeCheck)
		  break;
	    }
	    if (fMakeCheck && cnt >= cMakeCheck)
		break;
	  }
	  fflush(stdout);
	  if (fMakeCheck && cnt >= cMakeCheck)
	      break;
	}

	if (verbose >= TIME) {
	    end = time(NULL);
	    show_elapsed_time(beg, end, cnt, (double)(end-beg)/cnt, "iterations", "secs");
	}

	printf("\n");

	/* Scan complete, now find minima */

	sorted = results_sort(r_count, results);
	top_ten(sorted, r_count);

	best = count_outliers(r_count, sorted, results);
	robs = rsval->data[best->rsi];
	robx = rxval->data[best->rxi];
	min_dev = mdval->data[best->mdi];

	spex = spexp->data[best->spi]; sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
	nsex = nsexp->data[best->nsi]; ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));

	printf(
    "Minimum found at s %6.4f, md %5.3f, x %5.3f, spesf %8.6f, nsesf %8.6f\n",
    		robs, min_dev, robx, sp_esf, ns_esf);
	printf("        fp %u (%6.4f%%), fn %u (%6.4f%%)\n",
		best->fp, best->fp*100.0/ns_cnt,
		best->fn, best->fn*100.0/sp_cnt);
	printf("\n");

	data_free(rsval);
	data_free(rxval);
	data_free(mdval);
	data_free(spexp);
	data_free(nsexp);

	xfree(results);
	xfree(sorted);
    }

    /*
    ** 9.  Suggest possible spam and non-spam cutoff values
    ** With the final x, md and s values, score the spams and non-spams and
    ** sort the non-spam scores decreasing and the spam scores increasing;
    ** then, traverse the non-spam list until the 0.2% point; report cutoffs
    ** that give 0.05%, 0.1% and 0.2% fp.
    */

    final_recommendations(skip);

    return status;
}
コード例 #2
0
ファイル: bogoutil.c プロジェクト: Distrotech/bogofilter
int main(int argc, char *argv[])
{
    ex_t rc = EX_OK;

    bfpath *bfp;
    bfpath_mode mode;

    fBogoutil = true;

    signal_setup();			/* setup to catch signals */
    atexit(bf_exit);

    progtype = build_progtype(progname, DB_TYPE);

    set_today();			/* compute current date for token age */

    process_arglist(argc, argv);
    process_config_files(false, longopts_bogoutil);	/* need to read lock sizes */

    /* Extra or missing parameters */
    if (flag != M_WORD && flag != M_LIST_LOGFILES && argc != optind) {
	fprintf(stderr, "Missing or extraneous argument.\n");
	usage(stderr);
	exit(EX_ERROR);
    }

    bfp = bfpath_create(ds_file);
    if (bogohome == NULL)
	set_bogohome( "." );		/* set default */

    bfpath_set_bogohome(bfp);

    mode = get_mode(flag);
    if (bfpath_check_mode(bfp, mode)) {
	if (bfp->isdir)
	    bfpath_set_filename(bfp, WORDLIST);
    }

    if (!bfpath_check_mode(bfp, mode)) {
	fprintf(stderr, "Can't open wordlist '%s'\n", bfp->filepath);
	exit(EX_ERROR);
    }

    errno = 0;		/* clear error status */

    switch (flag) {
	case M_RECOVER:
	    ds_init(bfp);
	    rc = ds_recover(bfp, false);
	    break;
	case M_CRECOVER:
	    ds_init(bfp);
	    rc = ds_recover(bfp, true);
	    break;
	case M_CHECKPOINT:
	    ds_init(bfp);
	    rc = ds_checkpoint(bfp);
	    break;
	case M_LIST_LOGFILES:
	    dsm_init(bfp);
	    rc = ds_list_logfiles(bfp, argc - optind, argv + optind);
	    break;
	case M_PURGELOGS:
	    ds_init(bfp);
	    rc = ds_purgelogs(bfp);
	    break;
	case M_REMOVEENV:
	    dsm_init(bfp);
	    rc = ds_remove(bfp);
	    break;
	case M_VERIFY:
	    dsm_init(bfp);
	    rc = ds_verify(bfp);
	    break;
	case M_LEAFPAGES:
	    {
		u_int32_t c;

		dsm_init(bfp);
		c = ds_leafpages(bfp);
		if (c == 0xffffffff) {
		    fprintf(stderr, "%s: error getting leaf page count.\n", ds_file);
		    rc = EX_ERROR;
		} else if (c == 0) {
		    puts("UNKNOWN");
		} else {
		    printf("%lu\n", (unsigned long)c);
		}
	    }
	    break;
	case M_PAGESIZE:
	    {
		u_int32_t s;

		dsm_init(bfp);
		s = ds_pagesize(bfp);
		if (s == 0xffffffff) {
		    fprintf(stderr, "%s: error getting page size.\n", ds_file);
		} else if (s == 0) {
		    puts("UNKNOWN");
		} else {
		    printf("%lu\n", (unsigned long)s);
		}
	    }
	    break;
	case M_DUMP:
	    rc = dump_wordlist(bfp);
	    break;
	case M_LOAD:
	    rc = load_wordlist(bfp) ? EX_ERROR : EX_OK;
	    break;
	case M_MAINTAIN:
	    maintain = true;
	    rc = maintain_wordlist_file(bfp);
	    break;
	case M_WORD:
	    argc -= optind;
	    argv += optind;
	    rc = display_words(bfp, argc, argv, prob);
	    break;
	case M_HIST:
	    rc = histogram(bfp);
	    break;
	case M_ROBX:
	    rc = get_robx(bfp);
	    break;
	case M_NONE:
	default:
	    /* should have been handled above */
	    abort();
	    break;
    }

    bfpath_free(bfp);

    return rc;
}