/*
 * Writes the checksum report to f.
 *
 * Output is a tab-separated header line, followed by two rows for the
 * combined "all" digest and then two rows for every entry held in
 * results->rgHash (labelled with the entry's key). Each pair of rows is
 * produced by calling print_dline() with its final argument set to 0 and
 * then to 1.
 */
void chksum_print_results(hFILE *f, chksum_results_t *results)
{
    int pass;
    HashIter *rg_iter;
    HashItem *entry;

    hputs("###\tset\tcount\t\tb_seq\tname_b_seq\tb_seq_qual\tb_seq_tags(BC,FI,QT,RT,TC)\n", f);

    /* Totals across everything. */
    for (pass = 0; pass < 2; pass++)
        print_dline(f, "all", &results->all, pass);

    /* One pair of rows per hash entry, in iterator order. */
    rg_iter = HashTableIterCreate();
    for (entry = HashTableIterNext(results->rgHash, rg_iter);
         entry != NULL;
         entry = HashTableIterNext(results->rgHash, rg_iter)) {
        for (pass = 0; pass < 2; pass++)
            print_dline(f, entry->key, entry->data.p, pass);
    }
    HashTableIterDestroy(rg_iter);
}
/*
 * Debugging aid: prints the contents of a cram_stats to stderr.
 *
 * After a "cram_stats:" heading, one "\t<value>\t<count>" line is written
 * for every non-zero slot of st->freqs[0 .. MAX_STAT_VAL-1], followed by one
 * line per entry of the optional hash table st->h. For hash entries the
 * value is recovered by casting the key itself to an integer and the count
 * is taken from data.i.
 */
void cram_stats_dump(cram_stats *st)
{
    int val;
    HashIter *it;
    HashItem *entry;

    fprintf(stderr, "cram_stats:\n");

    /* Values tracked directly in the frequency array. */
    for (val = 0; val < MAX_STAT_VAL; val++) {
        if (st->freqs[val] != 0)
            fprintf(stderr, "\t%d\t%d\n", val, st->freqs[val]);
    }

    /* Nothing more to print when no hash table is attached. */
    if (!st->h)
        return;

    it = HashTableIterCreate();
    while ((entry = HashTableIterNext(st->h, it)) != NULL) {
        int key_val = (int)(size_t)entry->key;
        int count   = (int)entry->data.i;

        fprintf(stderr, "\t%d\t%d\n", key_val, count);
    }
    HashTableIterDestroy(it);
}
int main(int argc, char **argv) { Settings settings; int i; char *bam_file = NULL; samfile_t *fp_bam = NULL; const char *override_intensity_dir = NULL; const char *filter_file = NULL; int ntiles = 0; int nreads = 0; int nst = 0; SurvTable *sts[(N_TILES+1)*N_READS]; int nct = 0; CalTable *cts[(N_TILES+1)*N_READS]; settings.prefix = NULL; settings.quiet = 0; settings.filter_bad_tiles = 0; settings.spatial_filter = 0; settings.region_size = 0; settings.nregions_x = 0; settings.nregions_y = 0; settings.n_bins_left = 2; settings.n_bins_right = 2; settings.cstart[0] = 0; settings.cstart[1] = 0; settings.cstart[2] = 0; settings.intensity_dir = NULL; settings.snp_file = NULL; settings.snp_hash = NULL; settings.read_length[0] = 0; settings.read_length[1] = 0; settings.read_length[2] = 0; settings.working_dir = NULL; settings.cmdline = get_command_line(argc, argv); /* Parse args */ for (i = 1; i < argc && argv[i][0] == '-'; i++) { if (!strcmp(argv[i], "-")) { break; } else if (!strcmp(argv[i], "-intensity-dir")) { if(override_intensity_dir != NULL) { fprintf(stderr, "ERROR: -intensity-dir option specified multiple times\n"); usage(1); } check_arg(i,argc,"-intensity-dir"); override_intensity_dir = argv[++i]; } else if (!strcmp(argv[i], "-snp_file")) { if(settings.snp_file != NULL) { fprintf(stderr, "ERROR: -snp_file specified multiple times\n"); usage(1); } check_arg(i,argc,"-snp_file"); settings.snp_file = argv[++i]; } else if (!strcmp(argv[i], "-filter_file")) { if(filter_file != NULL) { fprintf(stderr, "ERROR: -filter_file specified multiple times\n"); usage(1); } check_arg(i,argc,"-filter_file"); filter_file = argv[++i]; } else if (!strcmp(argv[i], "-q")) { settings.quiet = 1; } else if (!strcmp(argv[i], "-p")) { if(settings.prefix != NULL) { fprintf(stderr, "ERROR: -p option specified multiple times\n"); usage(1); } check_arg(i,argc,"-p"); settings.prefix = argv[++i]; } else if (!strcmp(argv[i], "-filter-bad-tiles")){ settings.filter_bad_tiles = atoi(argv[++i]); 
if(settings.filter_bad_tiles < 1){ fprintf(stderr,"ERROR: invalid argument to -filter_bad_tiles\n"); usage(1); } } else if (!strcmp(argv[i], "-cstart1")){ check_arg(i,argc,"-cstart1"); settings.cstart[1] = atoi(argv[++i]); if(settings.cstart[1] < 1){ fprintf(stderr,"ERROR: invalid argument to -cstart1\n"); usage(1); } /* cycles are indexed from 0 not 1 */ --settings.cstart[1]; } else if (!strcmp(argv[i], "-cstart2")){ check_arg(i,argc,"-cstart2"); settings.cstart[2] = atoi(argv[++i]); if(settings.cstart[2] < 1){ fprintf(stderr,"ERROR: invalid argument to -cstart2\n"); usage(1); } /* cycles are indexed from 0 not 1 */ --settings.cstart[2]; } else if (!strcmp(argv[i], "-cstart")){ check_arg(i,argc,"-cstart"); settings.cstart[0] = atoi(argv[++i]); if(settings.cstart[0] < 1){ fprintf(stderr,"ERROR: invalid argument to -cstart\n"); usage(1); } /* cycles are indexed from 0 not 1 */ --settings.cstart[0]; } else if (!strcmp(argv[i], "-nL")){ check_arg(i,argc,"-nL"); settings.n_bins_left = atoi(argv[++i]); if(settings.n_bins_left < 0){ fprintf(stderr,"ERROR: invalid argument to -nL\n"); usage(1); } } else if (!strcmp(argv[i], "-nR")){ check_arg(i,argc,"-nR"); settings.n_bins_right = atoi(argv[++i]); if(settings.n_bins_right < 0){ fprintf(stderr,"ERROR: invalid argument to -nR\n"); usage(1); } } else if (!strcmp(argv[i], "-h")) { usage(0); } else { fprintf(stderr,"ERROR: Unknown option %s\n", argv[i]); usage(1); } } if ((argc-i) < 1) usage(0); /* preserve starting directory b/c makeSurvTable is going to chdir all over the place */ settings.working_dir = alloc_getcwd(); if (NULL == settings.working_dir) { fprintf(stderr, "ERROR: can't obtain working directory: %s\n", strerror(errno)); exit(EXIT_FAILURE); } /* get absolute intensity dir*/ if (override_intensity_dir) { settings.intensity_dir = get_real_path_name(override_intensity_dir); if (NULL == settings.intensity_dir) { fprintf(stderr, "ERROR: can't process intensity dir: %s\n", override_intensity_dir); exit(EXIT_FAILURE); 
} } else { fprintf(stderr,"ERROR: you must specify an intensity dir\n"); exit(EXIT_FAILURE); } /* read the snp_file */ if (NULL != settings.snp_file) { settings.snp_hash = readSnpFile(settings.snp_file); if (NULL == settings.snp_hash) { fprintf(stderr, "ERROR: reading snp file %s\n", settings.snp_file); exit(EXIT_FAILURE); } } /* read filter file */ if (NULL != filter_file) { FILE *fp = fopen(filter_file, "rb"); if (!fp) die("Can't open filter file %s\n", filter_file); Header filter_header; readHeader(fp, &filter_header); readFilterData(fp, &filter_header); settings.spatial_filter = 1; settings.region_size = filter_header.region_size; settings.nregions_x = filter_header.nregions_x; settings.nregions_y = filter_header.nregions_y; } /* Look for CIF directories */ get_cif_dirs(settings.intensity_dir); /* open the bam file */ bam_file = argv[i++]; fp_bam = samopen(bam_file, "rb", 0); if (NULL == fp_bam) { fprintf(stderr, "ERROR: can't open bam file file %s: %s\n", bam_file, strerror(errno)); exit(EXIT_FAILURE); } /* make the survival table */ nst = makeSurvTable(&settings, fp_bam, sts, &ntiles, &nreads); if (0 == nst) { fprintf(stderr,"ERROR: failed to make survival table\n"); exit(EXIT_FAILURE); } if (!settings.quiet) { fprintf(stderr, "Processed %8d traces\n", nreads); if (NULL != settings.snp_hash) { HashIter *tileIter = HashTableIterCreate(); HashItem *hashItem; size_t nsnps = 0; while ((hashItem = HashTableIterNext(settings.snp_hash, tileIter))) nsnps += hashItem->data.i; fprintf(stderr, "Ignored %lu snps\n", nsnps); } } /* back to where we belong */ checked_chdir(settings.working_dir); if (!settings.spatial_filter) makeGlobalSurvTable(&settings, ntiles, sts); outputSurvTable(&settings, sts); nct = makeCalTable(&settings, sts, cts); if (0 == nct) { fprintf(stderr,"ERROR: failed to make calibration table\n"); exit(EXIT_FAILURE); } outputCalTable(&settings, cts); /* close the bam file */ samclose(fp_bam); freeCalTable(&settings, cts); freeSurvTable(&settings, sts); 
if (NULL != settings.working_dir) free(settings.working_dir); return EXIT_SUCCESS; }
/* * Computes entropy from integer frequencies for various encoding methods and * picks the best encoding. * * FIXME: we could reuse some of the code here for the actual encoding * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman. * * Returns the best codec to use. */ enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) { enum cram_encoding best_encoding = E_NULL; int best_size = INT_MAX, bits; int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k; int *vals = NULL, *freqs = NULL, vals_alloc = 0, *codes; //cram_stats_dump(st); /* Count number of unique symbols */ for (nvals = i = 0; i < MAX_STAT_VAL; i++) { if (!st->freqs[i]) continue; if (nvals >= vals_alloc) { vals_alloc = vals_alloc ? vals_alloc*2 : 1024; vals = realloc(vals, vals_alloc * sizeof(int)); freqs = realloc(freqs, vals_alloc * sizeof(int)); if (!vals || !freqs) { if (vals) free(vals); if (freqs) free(freqs); return E_HUFFMAN; // Cannot do much else atm } } vals[nvals] = i; freqs[nvals] = st->freqs[i]; ntot += freqs[nvals]; if (max_val < i) max_val = i; if (min_val > i) min_val = i; nvals++; } if (st->h) { HashIter *iter= HashTableIterCreate(); HashItem *hi; int i; while ((hi = HashTableIterNext(st->h, iter))) { if (nvals >= vals_alloc) { vals_alloc = vals_alloc ? 
vals_alloc*2 : 1024; vals = realloc(vals, vals_alloc * sizeof(int)); freqs = realloc(freqs, vals_alloc * sizeof(int)); if (!vals || !freqs) return E_HUFFMAN; // Cannot do much else atm } i = (size_t)hi->key; vals[nvals]=i; freqs[nvals] = hi->data.i; ntot += freqs[nvals]; if (max_val < i) max_val = i; if (min_val > i) min_val = i; nvals++; } HashTableIterDestroy(iter); } st->nvals = nvals; assert(ntot == st->nsamp); #if 0 // RANDOMISER switch(random()%10) { case 0: return E_HUFFMAN; case 1: return E_HUFFMAN; //case 1: return E_BETA; // Java doesn't support E_BETA for BYTE vals default: return E_EXTERNAL; } #endif if (nvals <= 1) { free(vals); free(freqs); if (fd->verbose > 1) fprintf(stderr, "0 values => 0 bits\n"); return E_HUFFMAN; } if (fd->verbose > 1) fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n", min_val, max_val, nvals, ntot); /* Theoretical entropy */ if (fd->verbose > 1) { double dbits = 0; for (i = 0; i < nvals; i++) { dbits += freqs[i] * log((double)freqs[i]/ntot); } dbits /= -log(2); if (fd->verbose > 1) fprintf(stderr, "Entropy = %f\n", dbits); } if (nvals > 1 && ntot > 256) { #if 0 /* * CRUDE huffman estimator. Round to closest and round up from 0 * to 1 bit. * * With and without ITF8 incase we have a few discrete values but with * large magnitude. * * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes * may detect the correlation of high bytes to low bytes in multi- * byte values. So this predictor breaks down. */ double dbits = 0; // entropy + ~huffman double dbitsH = 0; double dbitsE = 0; // external entropy + ~huffman double dbitsEH = 0; int F[256] = {0}, n = 0; double e = 0; // accumulated error bits for (i = 0; i < nvals; i++) { double x; int X; unsigned int v = vals[i]; //Better encoding would cope with sign. 
//v = ABS(vals[i])*2+(vals[i]<0); if (!(v & ~0x7f)) { F[v] += freqs[i], n+=freqs[i]; } else if (!(v & ~0x3fff)) { F[(v>>8) |0x80] += freqs[i]; F[ v &0xff] += freqs[i], n+=2*freqs[i]; } else if (!(v & ~0x1fffff)) {