Exemple #1
0
/*
 * Write the checksum report held in 'results' to the stream 'f'.
 *
 * Output order:
 *   1. a tab-separated header row;
 *   2. two lines labelled "all", produced from results->all;
 *   3. two lines per entry of results->rgHash, labelled with the entry's key
 *      and produced from the entry's data pointer.
 *
 * Each pair of lines comes from calling print_dline() with a final argument
 * of 0 and then 1.
 */
void chksum_print_results(hFILE *f, chksum_results_t *results)
{
    HashIter *rg_iter;
    HashItem *entry;
    int pass;

    hputs("###\tset\tcount\t\tb_seq\tname_b_seq\tb_seq_qual\tb_seq_tags(BC,FI,QT,RT,TC)\n", f);

    /* Totals first */
    for (pass = 0; pass < 2; pass++)
        print_dline(f, "all", &results->all, pass);

    /* Then every entry stored in the hash table */
    rg_iter = HashTableIterCreate();
    for (entry = HashTableIterNext(results->rgHash, rg_iter);
         entry != NULL;
         entry = HashTableIterNext(results->rgHash, rg_iter)) {
        for (pass = 0; pass < 2; pass++)
            print_dline(f, entry->key, entry->data.p, pass);
    }
    HashTableIterDestroy(rg_iter);
}
/*
 * Debugging aid: print the contents of a cram_stats object to stderr.
 *
 * After a "cram_stats:" heading, every non-zero slot of st->freqs[] is
 * printed as "<index>\t<count>".  If the hash table st->h is present, each
 * of its entries is printed in the same format, using the entry's key
 * (converted to int) as the value and data.i as the count.
 */
void cram_stats_dump(cram_stats *st) {
    int val;
    HashIter *it;
    HashItem *item;

    fprintf(stderr, "cram_stats:\n");

    /* Counts held directly in the array */
    for (val = 0; val < MAX_STAT_VAL; val++) {
        if (st->freqs[val])
            fprintf(stderr, "\t%d\t%d\n", val, st->freqs[val]);
    }

    /* Nothing more to show when there is no hash table */
    if (!st->h)
        return;

    it = HashTableIterCreate();
    while ((item = HashTableIterNext(st->h, it)) != NULL) {
        int key   = (int)(size_t)item->key;
        int count = (int)item->data.i;

        fprintf(stderr, "\t%d\t%d\n", key, count);
    }
    HashTableIterDestroy(it);
}
Exemple #3
0
int main(int argc, char **argv) {

    Settings settings;
    int i;
    char *bam_file = NULL;
    samfile_t *fp_bam = NULL;
    const char *override_intensity_dir = NULL;
    const char *filter_file = NULL;
    int ntiles = 0;
    int nreads = 0;
    int nst = 0;
    SurvTable *sts[(N_TILES+1)*N_READS];
    int nct = 0;
    CalTable *cts[(N_TILES+1)*N_READS];

    settings.prefix = NULL;
    settings.quiet = 0;
    settings.filter_bad_tiles = 0;
    settings.spatial_filter = 0;
    settings.region_size = 0;
    settings.nregions_x = 0;
    settings.nregions_y = 0;
    settings.n_bins_left = 2;
    settings.n_bins_right = 2;
    settings.cstart[0] = 0;
    settings.cstart[1] = 0;
    settings.cstart[2] = 0;
    settings.intensity_dir = NULL;
    settings.snp_file = NULL;
    settings.snp_hash = NULL;
    settings.read_length[0] = 0;
    settings.read_length[1] = 0;
    settings.read_length[2] = 0;
    settings.working_dir = NULL;
    
    settings.cmdline = get_command_line(argc, argv);

    /* Parse args */
    for (i = 1; i < argc && argv[i][0] == '-'; i++) {
	if (!strcmp(argv[i], "-")) {
	    break;
	} else if (!strcmp(argv[i], "-intensity-dir")) {
            if(override_intensity_dir != NULL) {
		fprintf(stderr, "ERROR: -intensity-dir option specified multiple times\n");
                usage(1);
            }
            check_arg(i,argc,"-intensity-dir");
            override_intensity_dir = argv[++i];

	} else if (!strcmp(argv[i], "-snp_file")) {
            if(settings.snp_file != NULL) {
		fprintf(stderr, "ERROR: -snp_file specified multiple times\n");
                usage(1);
            }
            check_arg(i,argc,"-snp_file");
            settings.snp_file = argv[++i];
	} else if (!strcmp(argv[i], "-filter_file")) {
            if(filter_file != NULL) {
		fprintf(stderr, "ERROR: -filter_file specified multiple times\n");
                usage(1);
            }
            check_arg(i,argc,"-filter_file");
            filter_file = argv[++i];

	} else if (!strcmp(argv[i], "-q")) {
	    settings.quiet = 1;
	} else if (!strcmp(argv[i], "-p")) {
            if(settings.prefix != NULL) {
		fprintf(stderr, "ERROR: -p option specified multiple times\n");
                usage(1);
            }
            check_arg(i,argc,"-p");
            settings.prefix = argv[++i];

	} else if (!strcmp(argv[i], "-filter-bad-tiles")){
            settings.filter_bad_tiles = atoi(argv[++i]);
            if(settings.filter_bad_tiles < 1){
                fprintf(stderr,"ERROR: invalid argument to -filter_bad_tiles\n");
                usage(1);
            }
	} else if (!strcmp(argv[i], "-cstart1")){
            check_arg(i,argc,"-cstart1");
            settings.cstart[1] = atoi(argv[++i]);
            if(settings.cstart[1] < 1){
                fprintf(stderr,"ERROR: invalid argument to -cstart1\n");
                usage(1);
            }
            /* cycles are indexed from 0 not 1 */
            --settings.cstart[1];
	} else if (!strcmp(argv[i], "-cstart2")){
            check_arg(i,argc,"-cstart2");
            settings.cstart[2] = atoi(argv[++i]);
            if(settings.cstart[2] < 1){
                fprintf(stderr,"ERROR: invalid argument to -cstart2\n");
                usage(1);
            }
            /* cycles are indexed from 0 not 1 */
            --settings.cstart[2];
	} else if (!strcmp(argv[i], "-cstart")){
            check_arg(i,argc,"-cstart");
            settings.cstart[0] = atoi(argv[++i]);
            if(settings.cstart[0] < 1){
                fprintf(stderr,"ERROR: invalid argument to -cstart\n");
                usage(1);
            }
            /* cycles are indexed from 0 not 1 */
            --settings.cstart[0];
	} else if (!strcmp(argv[i], "-nL")){
            check_arg(i,argc,"-nL");
            settings.n_bins_left = atoi(argv[++i]);
            if(settings.n_bins_left < 0){
                fprintf(stderr,"ERROR: invalid argument to -nL\n");
                usage(1);
            }
	} else if (!strcmp(argv[i], "-nR")){
            check_arg(i,argc,"-nR");
            settings.n_bins_right = atoi(argv[++i]);
            if(settings.n_bins_right < 0){
                fprintf(stderr,"ERROR: invalid argument to -nR\n");
                usage(1);
            }

	} else if (!strcmp(argv[i], "-h")) {
	    usage(0);
	} else {
            fprintf(stderr,"ERROR: Unknown option %s\n", argv[i]);
	    usage(1);
	}
    }

    if ((argc-i) < 1)
	usage(0);

    /* preserve starting directory b/c makeSurvTable is going to chdir all over the place */
    settings.working_dir = alloc_getcwd();
    if (NULL == settings.working_dir) {
        fprintf(stderr, "ERROR: can't obtain working directory: %s\n",
                strerror(errno));
        exit(EXIT_FAILURE);
    }

    /* get absolute intensity dir*/
    if (override_intensity_dir) {
        settings.intensity_dir = get_real_path_name(override_intensity_dir);
        if (NULL == settings.intensity_dir) {
            fprintf(stderr, "ERROR: can't process intensity dir: %s\n",
                    override_intensity_dir);
            exit(EXIT_FAILURE);
        }
    } else {
        fprintf(stderr,"ERROR: you must specify an intensity dir\n");
        exit(EXIT_FAILURE);
    }

    /* read the snp_file */
    if (NULL != settings.snp_file) {
        settings.snp_hash = readSnpFile(settings.snp_file);
        if (NULL == settings.snp_hash) {
            fprintf(stderr, "ERROR: reading snp file %s\n", settings.snp_file);
            exit(EXIT_FAILURE);
        }
    }

    /* read filter file */
    if (NULL != filter_file) {
        FILE *fp = fopen(filter_file, "rb");
        if (!fp) die("Can't open filter file %s\n", filter_file);
        Header filter_header;
        readHeader(fp, &filter_header);
        readFilterData(fp, &filter_header);
        settings.spatial_filter = 1;
        settings.region_size = filter_header.region_size;
        settings.nregions_x = filter_header.nregions_x;
        settings.nregions_y = filter_header.nregions_y;
    }

    /* Look for CIF directories */
    get_cif_dirs(settings.intensity_dir);

    /* open the bam file */
    bam_file = argv[i++];
    fp_bam = samopen(bam_file, "rb", 0);
    if (NULL == fp_bam) {
        fprintf(stderr, "ERROR: can't open bam file file %s: %s\n",
                bam_file, strerror(errno));
        exit(EXIT_FAILURE);
    }

    /* make the survival table */
    nst = makeSurvTable(&settings, fp_bam, sts, &ntiles, &nreads);
    if (0 == nst) {
        fprintf(stderr,"ERROR: failed to make survival table\n");
        exit(EXIT_FAILURE);
    }

    if (!settings.quiet) {
        fprintf(stderr, "Processed %8d traces\n", nreads);
        if (NULL != settings.snp_hash) {
            HashIter *tileIter = HashTableIterCreate();
            HashItem *hashItem;
            size_t nsnps = 0;
            while ((hashItem = HashTableIterNext(settings.snp_hash, tileIter)))
                nsnps += hashItem->data.i;
            fprintf(stderr, "Ignored %lu snps\n", nsnps);
        }
    }

    /* back to where we belong */
    checked_chdir(settings.working_dir);

    if (!settings.spatial_filter) makeGlobalSurvTable(&settings, ntiles, sts);

    outputSurvTable(&settings, sts);

    nct = makeCalTable(&settings, sts, cts);
    if (0 == nct) {
        fprintf(stderr,"ERROR: failed to make calibration table\n");
        exit(EXIT_FAILURE);
    }

    outputCalTable(&settings, cts);

    /* close the bam file */
    samclose(fp_bam);

    freeCalTable(&settings, cts);

    freeSurvTable(&settings, sts);

    if (NULL != settings.working_dir) free(settings.working_dir);

    return EXIT_SUCCESS;

}
/*
 * Computes entropy from integer frequencies for various encoding methods and
 * picks the best encoding.
 *
 * FIXME: we could reuse some of the code here for the actual encoding
 * parameters too. Eg the best 'k' for SUBEXP or the code lengths for huffman.
 *
 * Returns the best codec to use.
 */
enum cram_encoding cram_stats_encoding(cram_fd *fd, cram_stats *st) {
    enum cram_encoding best_encoding = E_NULL;
    int best_size = INT_MAX, bits;
    int nvals, i, ntot = 0, max_val = 0, min_val = INT_MAX, k;
    int *vals = NULL, *freqs = NULL, vals_alloc = 0, *codes;

    //cram_stats_dump(st);

    /* Count number of unique symbols */
    for (nvals = i = 0; i < MAX_STAT_VAL; i++) {
	if (!st->freqs[i])
	    continue;
	if (nvals >= vals_alloc) {
	    vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
	    vals  = realloc(vals,  vals_alloc * sizeof(int));
	    freqs = realloc(freqs, vals_alloc * sizeof(int));
	    if (!vals || !freqs) {
		if (vals)  free(vals);
		if (freqs) free(freqs);
		return E_HUFFMAN; // Cannot do much else atm
	    }
	}
	vals[nvals] = i;
	freqs[nvals] = st->freqs[i];
	ntot += freqs[nvals];
	if (max_val < i) max_val = i;
	if (min_val > i) min_val = i;
	nvals++;
    }
    if (st->h) {
	HashIter *iter=  HashTableIterCreate();
	HashItem *hi;
	int i;

	while ((hi = HashTableIterNext(st->h, iter))) {
	    if (nvals >= vals_alloc) {
		vals_alloc = vals_alloc ? vals_alloc*2 : 1024;
		vals  = realloc(vals,  vals_alloc * sizeof(int));
		freqs = realloc(freqs, vals_alloc * sizeof(int));
		if (!vals || !freqs)
		    return E_HUFFMAN; // Cannot do much else atm
	    }
	    i = (size_t)hi->key;
	    vals[nvals]=i;
	    freqs[nvals] = hi->data.i;
	    ntot += freqs[nvals];
	    if (max_val < i) max_val = i;
	    if (min_val > i) min_val = i;
	    nvals++;
	}
	HashTableIterDestroy(iter);
    }

    st->nvals = nvals;
    assert(ntot == st->nsamp);

#if 0
    // RANDOMISER
    switch(random()%10) {
    case 0:  return E_HUFFMAN;
    case 1:  return E_HUFFMAN;
    //case 1:  return E_BETA; // Java doesn't support E_BETA for BYTE vals
    default: return E_EXTERNAL;
    }
#endif

    if (nvals <= 1) {
	free(vals);
	free(freqs);

	if (fd->verbose > 1)
	    fprintf(stderr, "0 values => 0 bits\n");

	return E_HUFFMAN;
    }

    if (fd->verbose > 1)
	fprintf(stderr, "Range = %d..%d, nvals=%d, ntot=%d\n",
		min_val, max_val, nvals, ntot);

    /* Theoretical entropy */
    if (fd->verbose > 1) {
	double dbits = 0;
	for (i = 0; i < nvals; i++) {
	    dbits += freqs[i] * log((double)freqs[i]/ntot);
	}
	dbits /= -log(2);
	if (fd->verbose > 1)
	    fprintf(stderr, "Entropy = %f\n", dbits);
    }

    if (nvals > 1 && ntot > 256) {
#if 0
	/*
	 * CRUDE huffman estimator. Round to closest and round up from 0
	 * to 1 bit.
	 *
	 * With and without ITF8 incase we have a few discrete values but with
	 * large magnitude.
	 *
	 * Note rans0/arith0 and Z_HUFFMAN_ONLY vs internal huffman can be
	 * compared in this way, but order-1 (eg rans1) or maybe LZ77 modes
	 * may detect the correlation of high bytes to low bytes in multi-
	 * byte values. So this predictor breaks down.
	 */
	double dbits = 0;  // entropy + ~huffman
	double dbitsH = 0;
	double dbitsE = 0; // external entropy + ~huffman
	double dbitsEH = 0;
	int F[256] = {0}, n = 0;
	double e = 0; // accumulated error bits
	for (i = 0; i < nvals; i++) {
	    double x; int X;
	    unsigned int v = vals[i];

	    //Better encoding would cope with sign.
	    //v = ABS(vals[i])*2+(vals[i]<0);

	    if (!(v & ~0x7f)) {
		F[v]             += freqs[i], n+=freqs[i];
	    } else if (!(v & ~0x3fff)) {
		F[(v>>8) |0x80] += freqs[i];
		F[ v     &0xff] += freqs[i], n+=2*freqs[i];
	    } else if (!(v & ~0x1fffff)) {