コード例 #1
0
static rc_t bogotune(void)
{
    bool skip;
    result_t *best;

    int beg, end;
    uint cnt, scan;
    rc_t status = RC_OK;

    beg = time(NULL);

    ham_cutoff = 0.0;
    spam_cutoff = 0.1;

    /* Note: memory usage highest while reading messages */
    /* usage decreases as distribute() converts to count format */

    /* read all messages, merge training sets, look up scoring sets */
    ns_cnt = filelist_read(REG_GOOD, ham_files);
    sp_cnt = filelist_read(REG_SPAM, spam_files);
    cnt = ns_cnt + sp_cnt;

    end = time(NULL);
    if (verbose >= TIME) {
	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
    }

    distribute(REG_GOOD, ns_msglists);
    distribute(REG_SPAM, sp_msglists);

    create_countlists(ns_msglists);
    create_countlists(sp_msglists);

    if (verbose >= TIME && time(NULL) - end > 2) {
	end = time(NULL);
	show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec");
    }

    if (verbose > PARMS+1) {
	tunelist_print(ns_and_sp);
	tunelist_print(ns_msglists);
	tunelist_print(sp_msglists);
    }

    ns_cnt = count_messages(ns_msglists);
    sp_cnt = count_messages(sp_msglists);

    if (ds_flag == DS_DSK && !check_msg_counts())
	exit(exit_zero ? EX_OK : EX_ERROR);

    fflush(stdout);

    check_percent = CHECK_PCT;	/* for checking low scoring spam
				** and high scoring non-spam */

    ns_scores = xcalloc(ns_cnt, sizeof(double));
    sp_scores = xcalloc(sp_cnt, sizeof(double));

    robs = DEFAULT_ROBS;
    robx = DEFAULT_ROBX;
    min_dev = DEFAULT_MIN_DEV;

    if (check_for_high_ns_scores() | check_for_low_sp_scores())
	scoring_error();

    /*
    ** 5.  Calculate x and cache size
    ** Calculate x with bogoutil's -r option (a new addition).
    ** Bound the calculated value within [0.4, 0.6] and set the range to be
    ** investigated to [x-0.1, x+0.1].
    */

    robx = get_robx();
    if (ds_flag == DS_DSK) {
	db_cachesize = calc_db_cachesize();
	printf("Recommended db cache size is %u MB\n", db_cachesize);
    }

    /*
    ** 6.  Calculate fp target
    ** The fp target will be derived thus: score non-spams with s and md as
    ** shipped, and determine the count that will result from a spam cutoff
    ** of 0.95; if that is < 0.25%, try 0.9375 etc.
    */

    min_dev = 0.02;

    /* set target and spam_cutoff */

    if (coerced_target == 0)
	set_thresh(ns_cnt, ns_scores);
    else {
	/* if coerced target ... */
	target = coerced_target;
	spam_cutoff = ns_scores[target-1];
    }

    skip = ROUND(spam_cutoff,100000) < SCAN_CUTOFF;
    printf("False-positive target is %u (cutoff %8.6f)\n", target, spam_cutoff);

#ifdef	TEST
    if (test) {
	printf("m: %8.6f, s: %8.6f, x: %0.16f\n", min_dev, robs, robx);
	if (verbose < PARMS)
	    print_ns_scores(target-2, target+2, 0);
    }
#endif

    if (!esf_flag && (sp_esf < 1.0 || ns_esf < 1.0))
	fprintf(stderr, "Warning:  Using ESF values (sp=%8.6f, ns=%8.6f) from config file.\n", sp_esf, ns_esf);

    /* No longer needed */
    wordhash_free(train);
    train = NULL;

    for (scan=0; scan <= 1 && !skip; scan ++) {
	uint r_count;
	uint rsi, rxi, mdi, spi, nsi;
	result_t *results, *r, *sorted;

	printf("Performing %s scan:\n", scan==0 ? "coarse" : "fine");

	switch (scan) {
	case 0:		/* COARSE */
	    /*
	    ** 7.  Coarsely scan s, md and x
	    ** The coarse s scan will range from 1 to 0.01 in half decades, and the
	    ** coarse md scan will range from 0.05 to 0.45 in steps of 0.05.  The
	    ** coarse x scan will use steps of 0.05. The trough must be surrounded on
	    ** six sides by values below the 33% quantile (unless bounded on one or
	    ** more sides).
	    */
	    init_coarse(robx);
	    break;
	case 1:		/* FINE */
	    /*
	    ** 8.  Finely scan the peak region
	    ** The fine s scan will range over the estimated s +/- half a decade in
	    ** steps of a quarter decade, and the fine md scan will range over the
	    ** estimated md +/- 0.075 in steps of 0.015.  The fine x scan will range
	    ** over the estimated x +/- 0.04 in steps of 0.02.  Scans of s and md
	    ** are bounded by the limits of the coarse scan.  Again, the trough must
	    ** be surrounded on six sides by values below the 33% quantile.  If no
	    ** such trough exists, a warning is given.
	    */
	    init_fine(robs, min_dev, robx, spex, nsex);
	    break;
	}

	r_count = rsval->cnt * mdval->cnt * rxval->cnt * spexp->cnt * nsexp->cnt;
	results = (result_t *) xcalloc(r_count, sizeof(result_t));

	print_all_parms(r_count);

	if (verbose >= SUMMARY) {
	    if (verbose >= SUMMARY+1)
		printf("%3s ", "cnt");
	    if (verbose >= SUMMARY+2)
		printf(" %s %s %s      ", "s", "m", "x");
	    printf(" %4s %5s   %4s %8s %8s %7s %3s %3s\n",
		   "rs", "md", "rx", "spesf", "nsesf", "cutoff", "fp", "fn");
	}

	cnt = 0;
	beg = time(NULL);
	for (rsi = 0; rsi < rsval->cnt; rsi++) {
	  robs = rsval->data[rsi];
	  for (mdi = 0; mdi < mdval->cnt; mdi++) {
	    min_dev = mdval->data[mdi];
	    for (rxi = 0; rxi < rxval->cnt; rxi++) {
	      robx = rxval->data[rxi];
	      for (spi = 0; spi < spexp->cnt; spi++) {
		spex = spexp->data[spi];
		sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
		for (nsi = 0; nsi < nsexp->cnt; nsi++) {
		    uint fp, fn;
		    nsex = nsexp->data[nsi];
		    ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));

		    /* save parms */
		    r = &results[cnt++];
		    r->idx = cnt;
		    r->rsi = rsi; r->rs = robs;
		    r->rxi = rxi; r->rx = robx;
		    r->mdi = mdi; r->md = min_dev;
		    r->spi = spi; r->sp_exp = spex;
		    r->nsi = nsi; r->ns_exp = nsex;

		    if (verbose >= SUMMARY) {
			if (verbose >= SUMMARY+1)
			    printf("%3u ", cnt);
			if (verbose >= SUMMARY+2)
			    printf(" %u %u %u %u %u  ",
				rsi, mdi, rxi, spi, nsi);
			printf("%6.4f %5.3f %5.3f %8.6f %8.6f",
			    robs, min_dev, robx, sp_esf, ns_esf);
			fflush(stdout);
		    }

		    spam_cutoff = 0.01;
		    score_ns(ns_scores);	/* scores in descending order */

		    /* Determine spam_cutoff and false_pos */
		    for (fp = target; fp < ns_cnt; fp += 1) {
			spam_cutoff = ns_scores[fp-1];
			if (spam_cutoff < 0.999999)
			    break;
			if (coerced_target != 0)
			    break;
		    }
		    if (ns_cnt < fp)
			fprintf(stderr,
				"Too few false positives to determine a valid cutoff\n");

		    score_sp(sp_scores);	/* scores in ascending order */
		    fn = get_fn_count(sp_cnt, sp_scores);

		    /* save results */
		    r->co = spam_cutoff;
		    r->fp = fp;
		    r->fn = fn;

		    if (verbose < SUMMARY)
			progress(cnt, r_count);
		    else {
			printf(" %8.6f %2u %3u\n", spam_cutoff, fp, fn);
			fflush(stdout);
		    }

#ifdef	TEST
		    if (test && spam_cutoff < 0.501) {
			printf("co: %0.16f\n", spam_cutoff);
			print_ns_scores(0, fp, 2);
			print_sp_scores(fn-10, fn, 10);
		    }
#endif
		    if (fMakeCheck && cnt >= cMakeCheck)
			break;
		}
		if (fMakeCheck && cnt >= cMakeCheck)
		    break;
	      }
	      if (fMakeCheck && cnt >= cMakeCheck)
		  break;
	    }
	    if (fMakeCheck && cnt >= cMakeCheck)
		break;
	  }
	  fflush(stdout);
	  if (fMakeCheck && cnt >= cMakeCheck)
	      break;
	}

	if (verbose >= TIME) {
	    end = time(NULL);
	    show_elapsed_time(beg, end, cnt, (double)(end-beg)/cnt, "iterations", "secs");
	}

	printf("\n");

	/* Scan complete, now find minima */

	sorted = results_sort(r_count, results);
	top_ten(sorted, r_count);

	best = count_outliers(r_count, sorted, results);
	robs = rsval->data[best->rsi];
	robx = rxval->data[best->rxi];
	min_dev = mdval->data[best->mdi];

	spex = spexp->data[best->spi]; sp_esf = ESF_SEL(sp_esf, pow(0.75, spex));
	nsex = nsexp->data[best->nsi]; ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex));

	printf(
    "Minimum found at s %6.4f, md %5.3f, x %5.3f, spesf %8.6f, nsesf %8.6f\n",
    		robs, min_dev, robx, sp_esf, ns_esf);
	printf("        fp %u (%6.4f%%), fn %u (%6.4f%%)\n",
		best->fp, best->fp*100.0/ns_cnt,
		best->fn, best->fn*100.0/sp_cnt);
	printf("\n");

	data_free(rsval);
	data_free(rxval);
	data_free(mdval);
	data_free(spexp);
	data_free(nsexp);

	xfree(results);
	xfree(sorted);
    }

    /*
    ** 9.  Suggest possible spam and non-spam cutoff values
    ** With the final x, md and s values, score the spams and non-spams and
    ** sort the non-spam scores decreasing and the spam scores increasing;
    ** then, traverse the non-spam list until the 0.2% point; report cutoffs
    ** that give 0.05%, 0.1% and 0.2% fp.
    */

    final_recommendations(skip);

    return status;
}
コード例 #2
0
int main(int argc, char* argv[]) {
  if (argc<2) {
    cout << "Usage: ./compute_profile dir" << endl;
    exit(1);
  }
 
  // directory setup
  string BASE_DIR = "/fs/nara-scratch/qwang37/brain_data/"+string(argv[1]);
  string partial_profile;
  string d2s, d2;
  
  // io streams on the final mean_conn_profile file
  ifstream in;
  ofstream out;
  //clock
  clock_t begin_t0;

  // read the dimensions
  int dim_profile, num_records;
  in.open("/fs/nara-scratch/qwang37/brain_data/scripts/dims");
  in >> dim_profile; in >> num_records; in >> dimlow();

  // initialize hash table
  string keyfile = "/fs/nara-scratch/qwang37/brain_data/scripts/keyfile";
  // initialize the coarse map
  cout << "initializing coarse map" << endl;
  string coarsefile = "/fs/nara-scratch/qwang37/brain_data/scripts/coarse_map_file";
  init_coarse(dim_profile, coarsefile);

  // compute conn_profile of subjects in a single data chunk
  cout << "Computing conn profile of data: " << argv[1] << endl;
  mkdir( (BASE_DIR + "/processed").c_str(), S_IRWXU|S_IRWXG|S_IROTH|S_IXOTH );
  d2s = exec("ls " + BASE_DIR + " -l | egrep '^d' | awk '$9~/^S/ {print $9}'");

  stringstream d2s_in(d2s);

    
  d2s_in >> d2;
  while(d2s_in.good()) { // stringstream::goodbit is set to false when any of eofbit, failbit, or badbit is set   
    string totaldir = BASE_DIR + '/' + d2;
    string connfile =  totaldir + "/track_aal_90_0/fdt_matrix3.dot";
    string coordfile = totaldir + "/track_aal_90_0/coords_pruned";
    string coordfile_local2std = totaldir + "/track_aal_90_0/coords_standard";
    string coordfile_std2local = totaldir + "/track_aal_90_0/coords_standard_in_diff";
    // clear conn profile
    init_profile(dim_profile, num_records, keyfile, conn_profile());
    // do something
    cout << "Processing " + totaldir << endl;
    begin_t0 = clock();
    append_conn(connfile, coordfile, coordfile_local2std, coordfile_std2local);
    cout << "completed in " << float(clock()-begin_t0)/CLOCKS_PER_SEC << " seconds" << endl;

    // write
    partial_profile = BASE_DIR + '/' + d2 + "/partial_profile_std2local";
    cout << "writing to disk...\n";
    begin_t0 = clock();
    write_profile(partial_profile);
    cout << "completed in " << float( (clock()-begin_t0)/CLOCKS_PER_SEC ) << " seconds" << endl;

    // move
    rename( (BASE_DIR + '/' + d2).c_str(), (BASE_DIR + "/processed/" + d2).c_str() );

    d2s_in >> d2;
  }

}