static rc_t bogotune(void) { bool skip; result_t *best; int beg, end; uint cnt, scan; rc_t status = RC_OK; beg = time(NULL); ham_cutoff = 0.0; spam_cutoff = 0.1; /* Note: memory usage highest while reading messages */ /* usage decreases as distribute() converts to count format */ /* read all messages, merge training sets, look up scoring sets */ ns_cnt = filelist_read(REG_GOOD, ham_files); sp_cnt = filelist_read(REG_SPAM, spam_files); cnt = ns_cnt + sp_cnt; end = time(NULL); if (verbose >= TIME) { show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec"); } distribute(REG_GOOD, ns_msglists); distribute(REG_SPAM, sp_msglists); create_countlists(ns_msglists); create_countlists(sp_msglists); if (verbose >= TIME && time(NULL) - end > 2) { end = time(NULL); show_elapsed_time(beg, end, ns_cnt + sp_cnt, (double)cnt/(end-beg), "messages", "msg/sec"); } if (verbose > PARMS+1) { tunelist_print(ns_and_sp); tunelist_print(ns_msglists); tunelist_print(sp_msglists); } ns_cnt = count_messages(ns_msglists); sp_cnt = count_messages(sp_msglists); if (ds_flag == DS_DSK && !check_msg_counts()) exit(exit_zero ? EX_OK : EX_ERROR); fflush(stdout); check_percent = CHECK_PCT; /* for checking low scoring spam ** and high scoring non-spam */ ns_scores = xcalloc(ns_cnt, sizeof(double)); sp_scores = xcalloc(sp_cnt, sizeof(double)); robs = DEFAULT_ROBS; robx = DEFAULT_ROBX; min_dev = DEFAULT_MIN_DEV; if (check_for_high_ns_scores() | check_for_low_sp_scores()) scoring_error(); /* ** 5. Calculate x and cache size ** Calculate x with bogoutil's -r option (a new addition). ** Bound the calculated value within [0.4, 0.6] and set the range to be ** investigated to [x-0.1, x+0.1]. */ robx = get_robx(); if (ds_flag == DS_DSK) { db_cachesize = calc_db_cachesize(); printf("Recommended db cache size is %u MB\n", db_cachesize); } /* ** 6. Calculate fp target ** The fp target will be derived thus: score non-spams with s and md as ** shipped, and determine the count that will result from a spam cutoff ** of 0.95; if that is < 0.25%, try 0.9375 etc. */ min_dev = 0.02; /* set target and spam_cutoff */ if (coerced_target == 0) set_thresh(ns_cnt, ns_scores); else { /* if coerced target ... */ target = coerced_target; spam_cutoff = ns_scores[target-1]; } skip = ROUND(spam_cutoff,100000) < SCAN_CUTOFF; printf("False-positive target is %u (cutoff %8.6f)\n", target, spam_cutoff); #ifdef TEST if (test) { printf("m: %8.6f, s: %8.6f, x: %0.16f\n", min_dev, robs, robx); if (verbose < PARMS) print_ns_scores(target-2, target+2, 0); } #endif if (!esf_flag && (sp_esf < 1.0 || ns_esf < 1.0)) fprintf(stderr, "Warning: Using ESF values (sp=%8.6f, ns=%8.6f) from config file.\n", sp_esf, ns_esf); /* No longer needed */ wordhash_free(train); train = NULL; for (scan=0; scan <= 1 && !skip; scan ++) { uint r_count; uint rsi, rxi, mdi, spi, nsi; result_t *results, *r, *sorted; printf("Performing %s scan:\n", scan==0 ? "coarse" : "fine"); switch (scan) { case 0: /* COARSE */ /* ** 7. Coarsely scan s, md and x ** The coarse s scan will range from 1 to 0.01 in half decades, and the ** coarse md scan will range from 0.05 to 0.45 in steps of 0.05. The ** coarse x scan will use steps of 0.05. The trough must be surrounded on ** six sides by values below the 33% quantile (unless bounded on one or ** more sides). */ init_coarse(robx); break; case 1: /* FINE */ /* ** 8. Finely scan the peak region ** The fine s scan will range over the estimated s +/- half a decade in ** steps of a quarter decade, and the fine md scan will range over the ** estimated md +/- 0.075 in steps of 0.015. The fine x scan will range ** over the estimated x +/- 0.04 in steps of 0.02. Scans of s and md ** are bounded by the limits of the coarse scan. Again, the trough must ** be surrounded on six sides by values below the 33% quantile. If no ** such trough exists, a warning is given. */ init_fine(robs, min_dev, robx, spex, nsex); break; } r_count = rsval->cnt * mdval->cnt * rxval->cnt * spexp->cnt * nsexp->cnt; results = (result_t *) xcalloc(r_count, sizeof(result_t)); print_all_parms(r_count); if (verbose >= SUMMARY) { if (verbose >= SUMMARY+1) printf("%3s ", "cnt"); if (verbose >= SUMMARY+2) printf(" %s %s %s ", "s", "m", "x"); printf(" %4s %5s %4s %8s %8s %7s %3s %3s\n", "rs", "md", "rx", "spesf", "nsesf", "cutoff", "fp", "fn"); } cnt = 0; beg = time(NULL); for (rsi = 0; rsi < rsval->cnt; rsi++) { robs = rsval->data[rsi]; for (mdi = 0; mdi < mdval->cnt; mdi++) { min_dev = mdval->data[mdi]; for (rxi = 0; rxi < rxval->cnt; rxi++) { robx = rxval->data[rxi]; for (spi = 0; spi < spexp->cnt; spi++) { spex = spexp->data[spi]; sp_esf = ESF_SEL(sp_esf, pow(0.75, spex)); for (nsi = 0; nsi < nsexp->cnt; nsi++) { uint fp, fn; nsex = nsexp->data[nsi]; ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex)); /* save parms */ r = &results[cnt++]; r->idx = cnt; r->rsi = rsi; r->rs = robs; r->rxi = rxi; r->rx = robx; r->mdi = mdi; r->md = min_dev; r->spi = spi; r->sp_exp = spex; r->nsi = nsi; r->ns_exp = nsex; if (verbose >= SUMMARY) { if (verbose >= SUMMARY+1) printf("%3u ", cnt); if (verbose >= SUMMARY+2) printf(" %u %u %u %u %u ", rsi, mdi, rxi, spi, nsi); printf("%6.4f %5.3f %5.3f %8.6f %8.6f", robs, min_dev, robx, sp_esf, ns_esf); fflush(stdout); } spam_cutoff = 0.01; score_ns(ns_scores); /* scores in descending order */ /* Determine spam_cutoff and false_pos */ for (fp = target; fp < ns_cnt; fp += 1) { spam_cutoff = ns_scores[fp-1]; if (spam_cutoff < 0.999999) break; if (coerced_target != 0) break; } if (ns_cnt < fp) fprintf(stderr, "Too few false positives to determine a valid cutoff\n"); score_sp(sp_scores); /* scores in ascending order */ fn = get_fn_count(sp_cnt, sp_scores); /* save results */ r->co = spam_cutoff; r->fp = fp; r->fn = fn; if (verbose < SUMMARY) progress(cnt, r_count); else { printf(" %8.6f %2u %3u\n", spam_cutoff, fp, fn); fflush(stdout); } #ifdef TEST if (test && spam_cutoff < 0.501) { printf("co: %0.16f\n", spam_cutoff); print_ns_scores(0, fp, 2); print_sp_scores(fn-10, fn, 10); } #endif if (fMakeCheck && cnt >= cMakeCheck) break; } if (fMakeCheck && cnt >= cMakeCheck) break; } if (fMakeCheck && cnt >= cMakeCheck) break; } if (fMakeCheck && cnt >= cMakeCheck) break; } fflush(stdout); if (fMakeCheck && cnt >= cMakeCheck) break; } if (verbose >= TIME) { end = time(NULL); show_elapsed_time(beg, end, cnt, (double)(end-beg)/cnt, "iterations", "secs"); } printf("\n"); /* Scan complete, now find minima */ sorted = results_sort(r_count, results); top_ten(sorted, r_count); best = count_outliers(r_count, sorted, results); robs = rsval->data[best->rsi]; robx = rxval->data[best->rxi]; min_dev = mdval->data[best->mdi]; spex = spexp->data[best->spi]; sp_esf = ESF_SEL(sp_esf, pow(0.75, spex)); nsex = nsexp->data[best->nsi]; ns_esf = ESF_SEL(ns_esf, pow(0.75, nsex)); printf( "Minimum found at s %6.4f, md %5.3f, x %5.3f, spesf %8.6f, nsesf %8.6f\n", robs, min_dev, robx, sp_esf, ns_esf); printf(" fp %u (%6.4f%%), fn %u (%6.4f%%)\n", best->fp, best->fp*100.0/ns_cnt, best->fn, best->fn*100.0/sp_cnt); printf("\n"); data_free(rsval); data_free(rxval); data_free(mdval); data_free(spexp); data_free(nsexp); xfree(results); xfree(sorted); } /* ** 9. Suggest possible spam and non-spam cutoff values ** With the final x, md and s values, score the spams and non-spams and ** sort the non-spam scores decreasing and the spam scores increasing; ** then, traverse the non-spam list until the 0.2% point; report cutoffs ** that give 0.05%, 0.1% and 0.2% fp. */ final_recommendations(skip); return status; }
int main(int argc, char* argv[]) { if (argc<2) { cout << "Usage: ./compute_profile dir" << endl; exit(1); } // directory setup string BASE_DIR = "/fs/nara-scratch/qwang37/brain_data/"+string(argv[1]); string partial_profile; string d2s, d2; // io streams on the final mean_conn_profile file ifstream in; ofstream out; //clock clock_t begin_t0; // read the dimensions int dim_profile, num_records; in.open("/fs/nara-scratch/qwang37/brain_data/scripts/dims"); in >> dim_profile; in >> num_records; in >> dimlow(); // initialize hash table string keyfile = "/fs/nara-scratch/qwang37/brain_data/scripts/keyfile"; // initialize the coarse map cout << "initializing coarse map" << endl; string coarsefile = "/fs/nara-scratch/qwang37/brain_data/scripts/coarse_map_file"; init_coarse(dim_profile, coarsefile); // compute conn_profile of subjects in a single data chunk cout << "Computing conn profile of data: " << argv[1] << endl; mkdir( (BASE_DIR + "/processed").c_str(), S_IRWXU|S_IRWXG|S_IROTH|S_IXOTH ); d2s = exec("ls " + BASE_DIR + " -l | egrep '^d' | awk '$9~/^S/ {print $9}'"); stringstream d2s_in(d2s); d2s_in >> d2; while(d2s_in.good()) { // stringstream::goodbit is set to false when any of eofbit, failbit, or badbit is set string totaldir = BASE_DIR + '/' + d2; string connfile = totaldir + "/track_aal_90_0/fdt_matrix3.dot"; string coordfile = totaldir + "/track_aal_90_0/coords_pruned"; string coordfile_local2std = totaldir + "/track_aal_90_0/coords_standard"; string coordfile_std2local = totaldir + "/track_aal_90_0/coords_standard_in_diff"; // clear conn profile init_profile(dim_profile, num_records, keyfile, conn_profile()); // do something cout << "Processing " + totaldir << endl; begin_t0 = clock(); append_conn(connfile, coordfile, coordfile_local2std, coordfile_std2local); cout << "completed in " << float(clock()-begin_t0)/CLOCKS_PER_SEC << " seconds" << endl; // write partial_profile = BASE_DIR + '/' + d2 + "/partial_profile_std2local"; cout << "writing to disk...\n"; begin_t0 = clock(); write_profile(partial_profile); cout << "completed in " << float( (clock()-begin_t0)/CLOCKS_PER_SEC ) << " seconds" << endl; // move rename( (BASE_DIR + '/' + d2).c_str(), (BASE_DIR + "/processed/" + d2).c_str() ); d2s_in >> d2; } }