Example #1
0
/*
 * Hand every message on ns_or_sp->msgs either to the global training
 * wordhash or to one of the scoring sets in ns_or_sp->u.sets[].
 *
 * mode      - REG_GOOD marks the messages as good; any other value
 *             marks them as bad.
 * ns_or_sp  - list whose messages are distributed; each item's wh
 *             pointer is cleared once its wordhash has been handed off.
 *
 * Splitting into training and scoring sets only happens when
 * ds_flag == DS_RAM, user_robx < EPS and msg_count_file is not set;
 * otherwise every message goes to scoring set 0.  The training totals
 * are kept in function-static counters, so they accumulate across calls.
 */
static void distribute(int mode, tunelist_t *ns_or_sp)
{
    int good = (mode == REG_GOOD) ? 1 : 0;
    int bad  = 1 - good;

    bool divvy = ds_flag == DS_RAM && user_robx < EPS && !msg_count_file;

    mlhead_t *msgs = ns_or_sp->msgs;
    mlitem_t *item = msgs->head;

    int n_train = 0;
    int n_score = 0;

    /* running totals over all calls to this function */
    static int total_train_good = 0;
    static int total_train_bad  = 0;

    /* target training/scoring ratio, interpolated on the list size */
    double ratio = scale(msgs->count,
                         LIST_COUNT + TEST_COUNT,   /* small count */
                         LIST_COUNT + LIST_COUNT,   /* large count */
                         LIST_COUNT / TEST_COUNT,   /* small ratio */
                         LIST_COUNT / LIST_COUNT);  /* large ratio */

    while (item != NULL) {
        wordhash_t *wh = item->wh;
        bool to_training = divvy && n_train / ratio < n_score + 1;

        if (!to_training) {
            /* scoring set: spread over three bins when dividing */
            uint bin = 0;
            if (divvy)
                bin = MOD(n_score,3);
            msglist_add(ns_or_sp->u.sets[bin], wh);
            n_score += 1;
        }
        else {
            /* training set: fold the message into train, then drop it */
            wordhash_set_counts(wh, good, bad);
            wordhash_add(train, wh, &wordprop_init);
            wordhash_free(wh);
            n_train += 1;
            total_train_good += good;
            total_train_bad  += bad;
        }

        /* the wordhash has been handed off or freed; forget it here */
        item->wh = NULL;
        item = item->next;
    }

    if (divvy) {
        wordhash_insert(train, w_msg_count, sizeof(wordprop_t), &wordprop_init);
        set_msg_counts(total_train_good, total_train_bad);
    }

    if (verbose > 1) {
        printf("%s:  train_count = %d, score_count = %d\n",
               good ? "ns" : "sp",
               n_train, n_score);
    }
}
Example #2
0
/*
 * Parse a "<bad> <good>" count string and pass both values on to
 * set_msg_counts().
 *
 * str - NUL-terminated text.  The leading number is taken as the bad
 *       count; the number following the first space is taken as the
 *       good count.
 *
 * When str contains no space, the good count is treated as 0 instead of
 * reading through strchr()'s NULL result.  msg_count_header_len is also
 * refreshed from msg_count_header.
 */
void set_msg_counts_from_str(char *str)
{
    uint b;
    uint g = 0;
    char *sep;

    b = atoi(str);

    /* strchr() returns NULL when there is no second field */
    sep = strchr(str, ' ');
    if (sep != NULL)
	g = atoi(sep + 1);

    set_msg_counts(g, b);

    msg_count_header_len = strlen(msg_count_header);
}
Example #3
0
/*
 * Per-record callback: copy one database record into the global
 * training wordhash.
 *
 * key      - token being loaded
 * data     - record supplying the spam and good counts for that token
 * userdata - unused
 *
 * Two keys get extra handling: ".MSG_COUNT" feeds set_msg_counts(), and
 * ".ENCODING" is checked against the global encoding (adopting it when
 * still E_UNKNOWN).  A mismatching encoding terminates the program with
 * EX_ERROR.
 *
 * Returns 0; no other value is returned by this function.
 */
static int load_hook(word_t *key, dsv_t *data, void *userdata)
{
    wordprop_t *props;

    (void) userdata;	/* quiet compiler complaint */

    props = wordhash_insert(train, key, sizeof(wordprop_t), &wordprop_init);
    props->cnts.good = data->goodcount;
    props->cnts.bad  = data->spamcount;

    if (word_cmps(key, ".MSG_COUNT") == 0) {
	set_msg_counts(data->goodcount, data->spamcount);
    }

    /* everything below concerns the encoding record only */
    if (word_cmps(key, ".ENCODING") != 0) {
	return 0;
    }

    /* the encoding value is carried in the record's spamcount field */
    if (encoding == E_UNKNOWN) {
	encoding = data->spamcount;
    }

    if (encoding != data->spamcount) {
	fprintf(stderr, "Can't mix database encodings, i.e. utf-8 and any other.\n");
	exit(EX_ERROR);
    }

    return 0;
}