Example #1
0
 /**
   * returns 0 on success, -1 on failure.
   */
 int writeToFile(scram_fd* fp) {
     int r1 = scram_put_seq(fp, read1);
     if (r1 == 0 and isPaired()) {
         return scram_put_seq(fp, read2);
     } else {
         return r1;
     }
 }
Example #2
0
int main(int argc, char **argv) {
    scram_fd **in, *out;
    int n_input, i;
    bam_seq_t **s;
    char imode[10], *in_f = "", omode[10], *out_f = "";
    int level = '\0'; // nul terminate string => auto level
    int c, verbose = 0;
    int s_opt = 0, S_opt = 0, embed_ref = 0;
    char *ref_fn = NULL;
    int start, end;
    char ref_name[1024] = {0};
    refs_t *refs = NULL;

    /* Parse command line arguments */
    while ((c = getopt(argc, argv, "u0123456789hvs:S:V:r:XI:O:R:")) != -1) {
	switch (c) {
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	    level = c;
	    break;
	    
	case 'u':
	    level = '0';
	    break;

	case 'h':
	    usage(stdout);
	    return 0;

	case 'v':
	    verbose++;
	    break;

	case 's':
	    s_opt = atoi(optarg);
	    break;

	case 'S':
	    S_opt = atoi(optarg);
	    break;

	case 'V':
	    cram_set_option(NULL, CRAM_OPT_VERSION, optarg);
	    break;

	case 'r':
	    ref_fn = optarg;
	    break;

	case 'X':
	    embed_ref = 1;
	    break;

	case 'I':
	    in_f = parse_format(optarg);
	    break;

	case 'O':
	    out_f = parse_format(optarg);
	    break;

	case 'R': {
	    char *cp = strchr(optarg, ':');
	    if (cp) {
		*cp = 0;
		switch (sscanf(cp+1, "%d-%d", &start, &end)) {
		case 1:
		    end = start;
		    break;
		case 2:
		    break;
		default:
		    fprintf(stderr, "Malformed range format\n");
		    return 1;
		}
	    } else {
		start = INT_MIN;
		end   = INT_MAX;
	    }
	    strncpy(ref_name, optarg, 1023);
	    break;
	}

	case '?':
	    fprintf(stderr, "Unrecognised option: -%c\n", optopt);
	    usage(stderr);
	    return 1;
	}
    }    

    /* Open output file */
    sprintf(omode, "w%s%c", out_f, level);
    if (!(out = scram_open("-", omode))) {
	fprintf(stderr, "Failed to open bam file %s\n", argv[optind+1]);
	return 1;
    }

    /* Open multiple input files */
    sprintf(imode, "r%s%c", in_f, level);
    n_input = argc - optind;
    if (!n_input) {
	fprintf(stderr, "No input files specified.\n");
	return 1;
    }
    if (!(in = malloc(n_input * sizeof(*in))))
	return 1;
    if (!(s = malloc(n_input * sizeof(*s))))
	return 1;
    for (i = 0; i < n_input; i++, optind++) {
	s[i] = NULL;
	if (*in_f == 0)
	    sprintf(imode, "r%s%c", detect_format(argv[optind]), level);
	if (!(in[i] = scram_open(argv[optind], imode))) {
	    fprintf(stderr, "Failed to open bam file %s\n", argv[optind]);
	    return 1;
	}
	if (i && !hdr_compare(scram_get_header(in[0]),
			      scram_get_header(in[i]))) {
	    fprintf(stderr, "Incompatible reference sequence list.\n");
	    fprintf(stderr, "Currently the @SQ lines need to be identical"
		    " in all files.\n");
	    return 1;
	}

	if (!refs && scram_get_refs(in[i]))
	    refs = scram_get_refs(in[i]);

	if (refs && scram_set_option(in[i], CRAM_OPT_SHARED_REF, refs))
	    return 1;
    }

    /* Set any format specific options */
    if (refs)
	scram_set_option(out, CRAM_OPT_SHARED_REF, refs);

    if (scram_set_option(out, CRAM_OPT_VERBOSITY, verbose))
	return 1;
    if (s_opt)
	if (scram_set_option(out, CRAM_OPT_SEQS_PER_SLICE, s_opt))
	    return 1;

    if (S_opt)
	if (scram_set_option(out, CRAM_OPT_SLICES_PER_CONTAINER, S_opt))
	    return 1;

    if (embed_ref)
	if (scram_set_option(out, CRAM_OPT_EMBED_REF, embed_ref))
	    return 1;
    
    /* Copy header and refs from in to out, for writing purposes */
    // FIXME: do proper merging of @PG lines
    // FIXME: track mapping of old PG aux name to new PG aux name per seq
    scram_set_header(out, sam_hdr_dup(scram_get_header(in[0])));

    // Needs doing after loading the header.
    if (ref_fn)
	if (scram_set_option(out, CRAM_OPT_REFERENCE, ref_fn))
	    return 1;

    if (scram_get_header(in[0])) {
	if (scram_write_header(out))
	    return 1;
    }


    /* Do the actual file format conversion */
    fprintf(stderr, "Opening and loading initial seqs\n");
    for (i = 0; i < n_input; i++) {
	if (scram_get_seq(in[i], &s[i]) < 0) {
	    if (scram_close(in[i]))
		return 1;
	    in[i] = NULL;
	    free(s[i]);
	    continue;
	}
    }

    fprintf(stderr, "Merging...\n");
    for (;;) {
	int64_t best_val = INT64_MAX;
	int best_j = 0, j;

	for (j = 0; j < n_input; j++) {
	    bam_seq_t *b = s[j];
	    uint64_t x;
	    if (!in[j])
		continue;

	    x = (((uint64_t)bam_ref(b))<<33)
		| (bam_pos(b)<<2)
		| (bam_strand(b)<<1)
		| !(bam_flag(b) & BAM_FREAD1);
	    if (best_val > x) {
		best_val = x;
		best_j = j;
	    }
	}
	
	if (best_val == INT64_MAX) { // all closed
	    break;
	}

	if (-1 == scram_put_seq(out, s[best_j]))
	    return 1;
	
	if (scram_get_seq(in[best_j], &s[best_j]) < 0) {
	    if (scram_close(in[best_j]))
		return 1;
	    in[best_j] = NULL;
	    free(s[best_j]);
	}
    }

    for (i = 0; i < n_input; i++) {
	if (!in[i])
	    continue;
	scram_close(in[i]);
	if (s[i])
	    free(s[i]);
    }

    /* Finally tidy up and close files */
    if (scram_close(out))
	return 1;
    free(in);
    free(s);

    return 0;
}
Example #3
0
int main(int argc, char **argv) {
    scram_fd *in, *out;
    bam_seq_t *s;
    char imode[10], *in_f = "", omode[10], *out_f = "";
    int level = '\0'; // nul terminate string => auto level
    int c, verbose = 0;
    int s_opt = 0, S_opt = 0, embed_ref = 0, ignore_md5 = 0, decode_md = 0;
    char *ref_fn = NULL;
    int start, end, multi_seq = -1, no_ref = 0;
    int use_bz2 = 0, use_arith = 0, use_lzma = 0;
    char ref_name[1024] = {0};
    refs_t *refs;
    int nthreads = 1;
    t_pool *p = NULL;
    int max_reads = -1;
    enum quality_binning binning = BINNING_NONE;

    /* Parse command line arguments */
    while ((c = getopt(argc, argv, "u0123456789hvs:S:V:r:xXeI:O:R:!MmjJZt:BN:")) != -1) {
	switch (c) {
	case '0': case '1': case '2': case '3': case '4':
	case '5': case '6': case '7': case '8': case '9':
	    level = c;
	    break;
	    
	case 'u':
	    level = '0';
	    break;

	case 'h':
	    usage(stdout);
	    return 0;

	case 'v':
	    verbose++;
	    break;

	case 's':
	    s_opt = atoi(optarg);
	    break;

	case 'S':
	    S_opt = atoi(optarg);
	    break;

	case 'm':
	    decode_md = 1;
	    break;

	case 'V':
	    if (cram_set_option(NULL, CRAM_OPT_VERSION, optarg))
		return 1;
	    break;

	case 'r':
	    ref_fn = optarg;
	    break;

	case 'X':
	    fprintf(stderr, "-X is deprecated in favour of -e.\n");
	case 'e':
	    embed_ref = 1;
	    break;

	case 'x':
	    no_ref = 1;
	    break;

	case 'I':
	    in_f = parse_format(optarg);
	    break;

	case 'O':
	    out_f = parse_format(optarg);
	    break;

	case 'R': {
	    char *cp = strchr(optarg, ':');
	    if (cp) {
		*cp = 0;
		switch (sscanf(cp+1, "%d-%d", &start, &end)) {
		case 1:
		    end = start;
		    break;
		case 2:
		    break;
		default:
		    fprintf(stderr, "Malformed range format\n");
		    return 1;
		}
	    } else {
		start = INT_MIN;
		end   = INT_MAX;
	    }
	    strncpy(ref_name, optarg, 1023);
	    break;
	}

	case '!':
	    ignore_md5 = 1;
	    break;

	case 'M':
	    multi_seq = 1;
	    break;

	case 'j':
#ifdef HAVE_LIBBZ2
	    use_bz2 = 1;
#else
	    fprintf(stderr, "Warning: bzip2 support is not compiled into this"
		    " version.\nPlease recompile.\n");
#endif
	    break;

	case 'J':
	    use_arith = 1;
	    break;

	case 'Z':
#ifdef HAVE_LIBLZMA
	    use_lzma = 1;
#else
	    fprintf(stderr, "Warning: lzma support is not compiled into this"
		    " version.\nPlease recompile.\n");
#endif
	    break;

	case 't':
	    nthreads = atoi(optarg);
	    if (nthreads < 1) {
		fprintf(stderr, "Number of threads needs to be >= 1\n");
		return 1;
	    }
	    break;

	case 'B':
	    binning = BINNING_ILLUMINA;
	    break;

	case 'N': // For debugging
	    max_reads = atoi(optarg);
	    break;

	case '?':
	    fprintf(stderr, "Unrecognised option: -%c\n", optopt);
	    usage(stderr);
	    return 1;
	}
    }    

    if (argc - optind > 2) {
	fprintf(stderr, "Usage: scramble [input_file [output_file]]\n");
	return 1;
    }
    

    /* Open up input and output files */
    sprintf(imode, "r%s%c", in_f, level);
    if (argc - optind > 0) {
	if (*in_f == 0)
	    sprintf(imode, "r%s%c", detect_format(argv[optind]), level);
	if (!(in = scram_open(argv[optind], imode))) {
	    fprintf(stderr, "Failed to open file %s\n", argv[optind]);
	    return 1;
	}
    } else {
	if (!(in = scram_open("-", imode))) {
	    fprintf(stderr, "Failed to open file %s\n", argv[optind]);
	    return 1;
	}
    }
    if (!in->is_bam && ref_fn) {
	cram_load_reference(in->c, ref_fn);
	if (!in->c->refs && !embed_ref) {
	    fprintf(stderr, "Unable to find an appropriate reference.\n"
		    "Please specify a valid reference with "
		    "-r ref.fa option.\n");
	    return 1;
	}
    }

    sprintf(omode, "w%s%c", out_f, level);
    if (argc - optind > 1) {
	if (*out_f == 0)
	    sprintf(omode, "w%s%c", detect_format(argv[optind+1]), level);
	if (!(out = scram_open(argv[optind+1], omode))) {
	    fprintf(stderr, "Failed to open file %s\n", argv[optind+1]);
	    return 1;
	}
    } else {
	if (!(out = scram_open("-", omode))) {
	    fprintf(stderr, "Failed to open file %s\n", argv[optind+1]);
	    return 1;
	}
    }


    /* Set any format specific options */
    scram_set_refs(out, refs = scram_get_refs(in));

    scram_set_option(out, CRAM_OPT_VERBOSITY, verbose);
    if (s_opt)
	if (scram_set_option(out, CRAM_OPT_SEQS_PER_SLICE, s_opt))
	    return 1;

    if (S_opt)
	if (scram_set_option(out, CRAM_OPT_SLICES_PER_CONTAINER, S_opt))
	    return 1;

    if (embed_ref)
	if (scram_set_option(out, CRAM_OPT_EMBED_REF, embed_ref))
	    return 1;

    if (use_bz2)
	if (scram_set_option(out, CRAM_OPT_USE_BZIP2, use_bz2))
	    return 1;

    if (use_arith)
	if (scram_set_option(out, CRAM_OPT_USE_ARITH, use_arith))
	    return 1;

    if (use_lzma)
	if (scram_set_option(out, CRAM_OPT_USE_LZMA, use_lzma))
	    return 1;

    if (binning != BINNING_NONE)
	if (scram_set_option(out, CRAM_OPT_BINNING, binning))
	    return 1;

    if (no_ref)
	if (scram_set_option(out, CRAM_OPT_NO_REF, no_ref))
	    return 1;

    if (multi_seq)
	if (scram_set_option(out, CRAM_OPT_MULTI_SEQ_PER_SLICE, multi_seq))
	    return 1;

    if (decode_md) {
	if (no_ref) {
	    fprintf(stderr, "Cannot use -m in conjunction with -x.\n");
	    return 1;
	}
	if (scram_set_option(in, CRAM_OPT_DECODE_MD, decode_md))
	    return 1;
    }

    if (nthreads > 1) {
	if (NULL == (p = t_pool_init(nthreads*2, nthreads)))
	    return 1;

	if (scram_set_option(in,  CRAM_OPT_THREAD_POOL, p))
	    return 1;
	if (scram_set_option(out, CRAM_OPT_THREAD_POOL, p))
	    return 1;
    }

    if (ignore_md5)
	if (scram_set_option(in, CRAM_OPT_IGNORE_MD5, ignore_md5))
	    return 1;
    

    /* Copy header and refs from in to out, for writing purposes */
    scram_set_header(out, scram_get_header(in));

    // Needs doing after loading the header.
    if (ref_fn) {
	if (scram_set_option(out, CRAM_OPT_REFERENCE, ref_fn))
	    return 1;
    } else {
	// Attempt to fill out a cram->refs[] array from @SQ headers
	scram_set_option(out, CRAM_OPT_REFERENCE, NULL);
    }

    if (scram_get_header(out)) {
	char *arg_list = stringify_argv(argc, argv);

	if (!arg_list)
	    return 1;

	if (sam_hdr_add_PG(scram_get_header(out), "scramble",
			   "VN", PACKAGE_VERSION,
			   "CL", arg_list, NULL))
	    return 1;

	if (scram_write_header(out))
	    return 1;

	free(arg_list);

    }


    /* Support for sub-range queries, currently implemented for CRAM only */
    if (*ref_name != 0) {
	cram_range r;
	int refid;

	if (in->is_bam) {
	    fprintf(stderr, "Currently the -R option is only implemented for CRAM indices\n");
	    return 1;
	}
	    
	cram_index_load(in->c, argv[optind]);

	refid = sam_hdr_name2ref(in->c->header, ref_name);

	if (refid == -1 && *ref_name != '*') {
	    fprintf(stderr, "Unknown reference name '%s'\n", ref_name);
	    return 1;
	}
	r.refid = refid;
	r.start = start;
	r.end = end;
	if (scram_set_option(in, CRAM_OPT_RANGE, &r))
	    return 1;
    }

    /* Do the actual file format conversion */
    s = NULL;

    while (scram_get_seq(in, &s) >= 0) {
	if (-1 == scram_put_seq(out, s)) {
	    fprintf(stderr, "Failed to encode sequence\n");
	    return 1;
	}
	if (max_reads >= 0)
	    if (--max_reads == 0)
		break;
    }

    if (max_reads == -1) {
	switch(scram_eof(in)) {
	case 0:
	    fprintf(stderr, "Failed to decode sequence\n");
	    return 1;
	case 2:
	    fprintf(stderr, "Warning: no end-of-file block identified. "
		    "File may be truncated.\n");
	    break;
	case 1: default:
	    // expected case
	    break;
	}
    }

    /* Finally tidy up and close files */
    if (scram_close(in))
	return 1;
    if (scram_close(out))
	return 1;

    if (p)
	t_pool_destroy(p, 0);

    if (s)
	free(s);

    return 0;
}
Example #4
0
 // return 0 on success, -1 on failure
 int writeToFile(scram_fd* fp) {
     return scram_put_seq(fp, read);
 }