示例#1
1
文件: sam.c 项目: atks/vt
static void copy_check_alignment(const char *infname, const char *informat,
    const char *outfname, const char *outmode, const char *outref)
{
    samFile *in = sam_open(infname, "r");
    samFile *out = sam_open(outfname, outmode);
    bam1_t *aln = bam_init1();
    bam_hdr_t *header = NULL;
    int res;

    if (!in) {
        fail("couldn't open %s", infname);
        goto err;
    }
    if (!out) {
        fail("couldn't open %s with mode %s", outfname, outmode);
        goto err;
    }
    if (!aln) {
        fail("bam_init1() failed");
        goto err;
    }

    if (outref) {
        if (hts_set_opt(out, CRAM_OPT_REFERENCE, outref) < 0) {
            fail("setting reference %s for %s", outref, outfname);
            goto err;
        }
    }

    header = sam_hdr_read(in);
    if (!header) {
        fail("reading header from %s", infname);
        goto err;
    }
    if (sam_hdr_write(out, header) < 0) fail("writing headers to %s", outfname);

    while ((res = sam_read1(in, header, aln)) >= 0) {
        int mod4 = ((intptr_t) bam_get_cigar(aln)) % 4;
        if (mod4 != 0)
            fail("%s CIGAR not 4-byte aligned; offset is 4k+%d for \"%s\"",
                 informat, mod4, bam_get_qname(aln));

        if (sam_write1(out, header, aln) < 0) fail("writing to %s", outfname);
    }
    if (res < -1) {
        fail("failed to read alignment from %s", infname);
    }

 err:
    bam_destroy1(aln);
    bam_hdr_destroy(header);
    if (in) sam_close(in);
    if (out) sam_close(out);
}
示例#2
0
int bam_idxstats(int argc, char *argv[])
{
    hts_idx_t* idx;
    bam_hdr_t* header;
    samFile* fp;

    if (argc < 2) {
        fprintf(pysamerr, "Usage: samtools idxstats <in.bam>\n");
        return 1;
    }
    fp = sam_open(argv[1], "r");
    if (fp == NULL) { fprintf(pysamerr, "[%s] fail to open BAM.\n", __func__); return 1; }
    header = sam_hdr_read(fp);
    idx = sam_index_load(fp, argv[1]);
    if (idx == NULL) { fprintf(pysamerr, "[%s] fail to load the index.\n", __func__); return 1; }

    int i;
    for (i = 0; i < header->n_targets; ++i) {
        // Print out contig name and length
        printf("%s\t%d", header->target_name[i], header->target_len[i]);
        // Now fetch info about it from the meta bin
        uint64_t u, v;
        hts_idx_get_stat(idx, i, &u, &v);
        printf("\t%" PRIu64 "\t%" PRIu64 "\n", u, v);
    }
    // Dump information about unmapped reads
    printf("*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx));
    bam_hdr_destroy(header);
    hts_idx_destroy(idx);
    sam_close(fp);
    return 0;
}
示例#3
0
文件: alignment.cpp 项目: ktym/vg
// remember to clean up with bam_destroy1(b);
bam1_t* alignment_to_bam(const string& sam_header,
                         const Alignment& alignment,
                         const string& refseq,
                         const int32_t refpos,
                         const string& cigar,
                         const string& mateseq,
                         const int32_t matepos,
                         const int32_t tlen) {

    assert(!sam_header.empty());
    string sam_file = "data:" + sam_header + alignment_to_sam(alignment, refseq, refpos, cigar, mateseq, matepos, tlen);
    const char* sam = sam_file.c_str();
    samFile *in = sam_open(sam, "r");
    bam_hdr_t *header = sam_hdr_read(in);
    bam1_t *aln = bam_init1();
    if (sam_read1(in, header, aln) >= 0) {
        bam_hdr_destroy(header);
        sam_close(in); // clean up
        return aln;
    } else {
        cerr << "[vg::alignment] Failure to parse SAM record" << endl
             << sam << endl;
        exit(1);
    }
}
示例#4
0
    BamFilePrivate(const std::string& fn)
        : filename_(fn)
        , firstAlignmentOffset_(-1)
    {
        // ensure we've updated htslib verbosity with requested verbosity here
        hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity);

        // attempt open
        auto f = RawOpen();

#if !defined (PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE
        // sanity check on file
        const int eofCheck = bgzf_check_EOF(f->fp.bgzf);
        if (eofCheck <= 0 ) {
            // 1:  EOF present & correct
            // 2:  not seekable (e.g. reading from stdin)
            // 0:  EOF absent
            // -1: some other error
            std::stringstream e;
            if (eofCheck == 0)
                e << fn << " : is missing EOF block" << std::endl;
            else
                e << fn << " : unknown error while checking EOF block" << std::endl;
            throw std::runtime_error(e.str());
        }
#endif

        // attempt fetch header
        std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get()));
        header_ = internal::BamHeaderMemory::FromRawData(hdr.get());

        // cache first alignment offset
        firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf);
    }
示例#5
0
bufReader initBufReader2(const char*fname,int doCheck,char *fai_fname){
  bufReader ret;
  ret.fn = strdup(fname);
  int newlen=strlen(fname);//<-just to avoid valgrind -O3 uninitialized warning
  ret.fp = openBAM(ret.fn,doCheck);
  if (fai_fname && hts_set_fai_filename(ret.fp, fai_fname) != 0) {
    fprintf(stderr, "[%s] failed to process %s\n",
	    __func__, fai_fname);
    exit(EXIT_FAILURE);
  }
  ret.isEOF =0;
  ret.itr=NULL;
  ret.idx=NULL;
 
  ret.hdr = sam_hdr_read(ret.fp);
  if(strlen(ret.hdr->text)==0){
    fprintf(stderr,"\t-> No header information could be found for BAM/CRAM file: \'%s\' will exit\n",fname);
    exit(1);
  }
  
  checkIfSorted(ret.hdr->text);

  if(ret.hdr==NULL) {
    fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", ret.fn);
    exit(0);
  }
  
  return ret;
}
示例#6
0
int bam_access_openhts(char *hts_file, char *ref_file){
	assert(hts_file != NULL);
	//Assign memory for the file name etc holding struct
	fholder = malloc(sizeof(file_holder));
	check_mem(fholder);
	//Beginning and end of tmp struct for bam access
	fholder->beg = 0; fholder->end = 0x7fffffff;  // The max 32 bit integer.
	//Open a file for read from compressed bam.
	fholder->in = hts_open(hts_file, "r");
	check(fholder->in != 0,"HTS file %s failed to open.",hts_file);
  fholder->idx = sam_index_load(fholder->in,hts_file);
	check(fholder->idx != 0,"HTS index file %s failed to open.",hts_file);
	if(ref_file){
	  hts_set_fai_filename(fholder->in, ref_file);
	}else{

	  if(fholder->in->format.format == cram) log_warn("No reference file provided for a cram input file, if the reference described in the cram header can't be located this script may fail.");
	}
  //Check for generic header read method.
  fholder->head = sam_hdr_read(fholder->in);
	return 0;
error:
	if(fholder->in) hts_close(fholder->in);
	if(fholder) free(fholder);
	return -1;
}
示例#7
0
BAMOrderedReader::BAMOrderedReader(std::string bam_file, std::vector<GenomeInterval>& intervals)
:bam_file(bam_file), intervals(intervals), sam(0), hdr(0), idx(0), itr(0)
{
    const char* fname = bam_file.c_str();
    int len = strlen(fname);
    if ( strcasecmp(".bam",fname+len-4) )
    {
        fprintf(stderr, "[%s:%d %s] Not a BAM file: %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str());
        exit(1);
    }

    sam = sam_open(bam_file.c_str(), "r");
    hdr = sam_hdr_read(sam);
    s = bam_init1();

    idx = bam_index_load(bam_file.c_str());
    if (idx==0)
    {
        fprintf(stderr, "[%s:%d %s] fail to load index for %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str());
        abort();
    }
    else
    {
        index_loaded = true;
    }

    str = {0,0,0};

    intervals_present =  intervals.size()!=0;
    interval_index = 0;

    random_access_enabled = intervals_present && index_loaded;
};
示例#8
0
文件: sam.c 项目: Annak17/partis
static int aux_fields1(void)
{
    static const char sam[] = "data:"
"@SQ\tSN:one\tLN:1000\n"
"@SQ\tSN:two\tLN:500\n"
"r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\n";

    // Canonical form of the alignment record above, as output by sam_format1()
    static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000";

    samFile *in = sam_open(sam, "r");
    bam_hdr_t *header = sam_hdr_read(in);
    bam1_t *aln = bam_init1();
    uint8_t *p;
    uint32_t n;
    kstring_t ks = { 0, 0, NULL };

    if (sam_read1(in, header, aln) >= 0) {
        if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k')
            fail("XA field is '%c', expected 'k'", bam_aux2A(p));

        if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37)
            fail("Xi field is %d, expected 37", bam_aux2i(p));

        if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6)
            fail("Xf field is %.12f, expected pi", bam_aux2f(p));

        if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6)
            fail("Xf field is %.12f, expected e", bam_aux2f(p));

        if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0)
            fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO);

        if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0)
            fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF);

        // TODO Invent and use bam_aux2B()
        if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! (memcmp(p, "Bc", 2) == 0 && (memcpy(&n, p+2, 4), n) == 3 && memcmp(p+6, "\xfe\x00\x02", 3) == 0))
            fail("XB field is %c,..., expected c,-2,0,+2", p[1]);

        if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000)
            fail("ZZ field is %d, expected 1000000", bam_aux2i(p));

        if (sam_format1(header, aln, &ks) < 0)
            fail("can't format record");

        if (strcmp(ks.s, r1) != 0)
            fail("record formatted incorrectly: \"%s\"", ks.s);

        free(ks.s);
    }
    else fail("can't read record");

    bam_destroy1(aln);
    bam_hdr_destroy(header);
    sam_close(in);

    return 1;
}
示例#9
0
    bool init() {
        // Open files
        file_iter = sam_open(file_name.c_str(), "rb", 0);
        replace_iter = sam_open(replace_name.c_str(), "r", 0);
        out_file = sam_open(out_name.c_str(), "wb", 0);
        
        
        if (file_iter == NULL || replace_iter == NULL || out_file == NULL) {
            return false;
        }
        
        // Iterate through BAM
        file_header = sam_hdr_read(file_iter);
        replace_header = sam_hdr_read(replace_iter);
        
        build_translation();

        return true;
    }
示例#10
0
文件: alignment.cpp 项目: ktym/vg
bam_hdr_t* hts_file_header(string& filename, string& header) {
    samFile *in = hts_open(filename.c_str(), "r");
    if (in == NULL) {
        cerr << "[vg::alignment] could not open " << filename << endl;
        exit(1);
    }
    bam_hdr_t *hdr = sam_hdr_read(in);
    header = hdr->text;
    bam_hdr_destroy(hdr);
    hts_close(in);
    return hdr;
}
示例#11
0
文件: sam.c.pysam.c 项目: msto/pysam
samfile_t *samopen(const char *fn, const char *mode, const void *aux)
{
    // hts_open() is really sam_open(), except for #define games
    samFile *hts_fp = hts_open(fn, mode);
    if (hts_fp == NULL)  return NULL;

    samfile_t *fp = malloc(sizeof (samfile_t));
    if (!fp) {
        sam_close(hts_fp);
        return NULL;
    }
    fp->file = hts_fp;
    fp->x.bam = hts_fp->fp.bgzf;
    if (strchr(mode, 'r')) {
        if (aux) {
            if (hts_set_fai_filename(fp->file, aux) != 0) {
                sam_close(hts_fp);
                free(fp);
                return NULL;
            }
        }
        fp->header = sam_hdr_read(fp->file);  // samclose() will free this
        if (fp->header == NULL) {
            sam_close(hts_fp);
            free(fp);
            return NULL;
        }
        fp->is_write = 0;
        if (fp->header->n_targets == 0 && bam_verbose >= 1)
            fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n");
    }
    else {
        enum htsExactFormat fmt = hts_get_format(fp->file)->format;
        fp->header = (bam_hdr_t *)aux;  // For writing, we won't free it
        fp->is_write = 1;
        if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) {
            if (sam_hdr_write(fp->file, fp->header) < 0) {
                if (bam_verbose >= 1)
                    fprintf(samtools_stderr, "[samopen] Couldn't write header\n");
                sam_close(hts_fp);
                free(fp);
                return NULL;
            }
        }
    }

    return fp;
}
示例#12
0
void bam_parser(opt_t *opt) {
    samFile *in = sam_open(opt->in_name, "r");
    if(in == NULL) die("bam_parser: fail to open file '%s'", opt->in_name);
    // if output file exists but not force to overwrite
    if(access(opt->out_name, F_OK)!=-1 && opt->f==false) die("bam_parser: %s exists, use opetion -f to overwrite", opt->out_name);
    bam_hdr_t *header = sam_hdr_read(in);
    bam1_t *aln = bam_init1();
    int8_t *p;
    int32_t n;
    int ret;
    while((ret=sam_read1(in, header, aln)) >= 0)
        printf("name=%s\nflag=%d\nseq=%s\nqual=%s\nlane_id=%d\n", get_read_name(aln), aln->core.flag, get_sequence(aln), get_qualities(aln), get_lane_id(aln));

    bam_destroy1(aln);
    sam_close(in);
}
示例#13
0
static void flanks_sam_open()
{
  if(!futil_path_has_extension(sam_path, ".bam") &&
     !futil_path_has_extension(sam_path, ".sam"))
  {
    cmd_print_usage("Mapped flanks is not .sam or .bam file: %s", sam_path);
  }

  bool isbam = futil_path_has_extension(sam_path, ".bam");

  samfh = sam_open(sam_path, isbam ? "rb" : "rs");
  if(samfh == NULL) die("Cannot open SAM/BAM %s", sam_path);

  // Load BAM header
  bam_header = sam_hdr_read(samfh);
  bamentry = bam_init1();
}
示例#14
0
bufReader initBufReader2(const char*fname){
  bufReader ret;
  ret.fn = strdup(fname);
  int newlen=strlen(fname);//<-just to avoid valgrind -O3 uninitialized warning
  ret.fp = openBAM(ret.fn);
  ret.isEOF =0;
  ret.itr=NULL;
  ret.idx=NULL;
 
  ret.hdr = sam_hdr_read(ret.fp);
  if(ret.hdr==NULL) {
    fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", ret.fn);
    exit(0);
  }
  
  return ret;
}
示例#15
0
文件: alignment.cpp 项目: ktym/vg
int hts_for_each(string& filename, function<void(Alignment&)> lambda) {

    samFile *in = hts_open(filename.c_str(), "r");
    if (in == NULL) return 0;
    bam_hdr_t *hdr = sam_hdr_read(in);
    map<string, string> rg_sample;
    parse_rg_sample_map(hdr->text, rg_sample);
    bam1_t *b = bam_init1();
    while (sam_read1(in, hdr, b) >= 0) {
        Alignment a = bam_to_alignment(b, rg_sample);
        lambda(a);
    }
    bam_destroy1(b);
    bam_hdr_destroy(hdr);
    hts_close(in);
    return 1;

}
示例#16
0
文件: alignment.cpp 项目: ktym/vg
bam_hdr_t* hts_string_header(string& header,
                             map<string, int64_t>& path_length,
                             map<string, string>& rg_sample) {
    stringstream hdr;
    hdr << "@HD\tVN:1.5\tSO:unknown\n";
    for (auto& p : path_length) {
        hdr << "@SQ\tSN:" << p.first << "\t" << "LN:" << p.second << "\n";
    }
    for (auto& s : rg_sample) {
        hdr << "@RG\tID:" << s.first << "\t" << "SM:" << s.second << "\n";
    }
    hdr << "@PG\tID:0\tPN:vg\n";
    header = hdr.str();
    string sam = "data:" + header;
    samFile *in = sam_open(sam.c_str(), "r");
    bam_hdr_t *h = sam_hdr_read(in);
    sam_close(in);
    return h;
}
示例#17
0
static int view_sam(hFILE *hfp, const char *filename)
{
    samFile *in = hts_hopen(hfp, filename, "r");
    if (in == NULL) return 0;
    samFile *out = dup_stdout("w");
    bam_hdr_t *hdr = sam_hdr_read(in);

    if (show_headers) sam_hdr_write(out, hdr);
    if (mode == view_all) {
        bam1_t *b = bam_init1();
        while (sam_read1(in, hdr, b) >= 0)
            sam_write1(out, hdr, b);
        bam_destroy1(b);
    }

    bam_hdr_destroy(hdr);
    hts_close(out);
    hts_close(in);
    return 1;
}
示例#18
0
文件: bwaseqio.c 项目: jwbargsten/bwa
bwa_seqio_t *bwa_bam_open(const char *fn, int which)
{
	bwa_seqio_t *bs;
#ifndef USE_HTSLIB
	bam_header_t *h;
#endif
	bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t));
	bs->is_bam = 1;
	bs->which = which;
#ifdef USE_HTSLIB
	bs->fp = sam_open(fn, "rb");
#else
	bs->fp = bam_open(fn, "r");
#endif
	if (0 == bs->fp) err_fatal_simple("Couldn't open bam file");
#ifdef USE_HTSLIB
	bs->h = sam_hdr_read(bs->fp);
#else
	h = bam_header_read(bs->fp);
	bam_header_destroy(h);
#endif
	return bs;
}
示例#19
0
文件: alignment.cpp 项目: ktym/vg
int hts_for_each_parallel(string& filename, function<void(Alignment&)> lambda) {

    samFile *in = hts_open(filename.c_str(), "r");
    if (in == NULL) return 0;
    bam_hdr_t *hdr = sam_hdr_read(in);
    map<string, string> rg_sample;
    parse_rg_sample_map(hdr->text, rg_sample);

    int thread_count = get_thread_count();
    vector<bam1_t*> bs; bs.resize(thread_count);
    for (auto& b : bs) {
        b = bam_init1();
    }

    bool more_data = true;
#pragma omp parallel shared(in, hdr, more_data, rg_sample)
    {
        int tid = omp_get_thread_num();
        while (more_data) {
            bam1_t* b = bs[tid];
#pragma omp critical (hts_input)
            if (more_data) {
                more_data = sam_read1(in, hdr, b) >= 0;
            }
            if (more_data) {
                Alignment a = bam_to_alignment(b, rg_sample);
                lambda(a);
            }
        }
    }

    for (auto& b : bs) bam_destroy1(b);
    bam_hdr_destroy(hdr);
    hts_close(in);
    return 1;

}
示例#20
0
BM_mappedRead * extractReads(char * bamFile,
                             char ** contigs,
                             int numContigs,
                             uint16_t * groups,
                             char * prettyName,
                             int headersOnly,
                             int minMapQual,
                             int maxMisMatches,
                             int ignoreSuppAlignments,
                             int ignoreSecondaryAlignments) {
    //-----
    // code uses the pattern outlined in samtools view (sam_view.c)
    // thanks lh3!
    //
    int i = 0;
    int result = -1;
    int hh = 0;

    int supp_check = 0x0; // include supp mappings
    if (ignoreSuppAlignments) {
        supp_check |= BAM_FSUPPLEMENTARY;
    }
    if (ignoreSecondaryAlignments) {
        supp_check |= BAM_FSECONDARY;
    }

    // we need to let the users know if their pairings
    // will be corrupted
    int p_corrupt = 0;

    // helper variables
    samFile *in = 0;
    bam_hdr_t *header = NULL;
    bam1_t *b = bam_init1();

    BM_mappedRead * root = 0;
    BM_mappedRead * prev = 0;

    // open file handlers
    if ((in = sam_open(bamFile, "r")) == 0) {
        fprintf(stderr,
                "ERROR: Failed to open \"%s\" for reading.\n",
                bamFile);
    }
    else {
        // retrieve the header
        if ((header = sam_hdr_read(in)) == 0) {
            fprintf(stderr,
                    "ERROR: Failed to read the header from \"%s\".\n",
                    bamFile);
        }
        else {
            // check the index is intact
            hts_idx_t *idx = sam_index_load(in, bamFile); // load index
            if (idx == 0) { // index is unavailable
                fprintf(stderr,
                        "ERROR: Random retrieval only works "\
                        "for indexed files.\n");
            }
            else {
                cfuhash_table_t *pair_buffer = \
                    cfuhash_new_with_initial_size(1000000);
                cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS);

                for (hh = 0; hh < numContigs; ++hh) {
                    // parse a region in the format like `chr2:100-200'
                    hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]);
                    if (iter == NULL) { // reference name is not found
                        fprintf(stderr,
                                "WARNING: Could not find contig: "\
                                "[%s] in BAM: [%s].\n",
                                contigs[hh],
                                bamFile);
                    }

                    // fetch alignments
                    int line = 0;
                    while ((result = sam_itr_next(in, iter, b)) >= 0) {
                        bam1_core_t core = b->core;
                        line += 1;
                        // only high quality?, primary? mappings
                        if ( core.qual < minMapQual)
                            continue;
                        if ((core.flag & supp_check) != 0)
                            continue;
                        if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) {
                            continue;
                        }

                        char * seqId = bam_get_qname(b);
                        char * seq = 0;
                        char * qual = 0;
                        int qual_len = 0;
                        int seq_len = 0;

                        // get sequence and quality
                        if(0 == headersOnly) {
                            // no point allocating unused space
                            seq = calloc(core.l_qseq+1, sizeof(char));
                            qual = calloc(core.l_qseq+1, sizeof(char));
                            uint8_t *s = bam_get_seq(b);
                            if (core.flag&BAM_FREVERSE) {
                                // reverse the read
                                int r = 0;
                                for (i = core.l_qseq-1; i >=0 ; --i) {
                                    seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s,
                                                                       i)];
                                    ++r;
                                }
                            }
                            else {
                                for (i = 0; i < core.l_qseq; ++i) {
                                    seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s,
                                                                       i)];
                                }
                            }
                            seq_len = core.l_qseq;

                            s = bam_get_qual(b);
                            if (s[0] != 0xff) {
                                qual_len = core.l_qseq;
                                for (i = 0; i < core.l_qseq; ++i) {
                                    qual[i] = (char)(s[i] + 33);
                                }
                            }
                            else if (qual != 0) {
                                free(qual);
                                qual = 0;
                            }
                        }

                        // work out pairing information
                        uint8_t rpi = RPI_ERROR;
                        if (core.flag&BAM_FPAIRED) {
                            if(core.flag&BAM_FMUNMAP) {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_SNGL_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SNGL_SEC;
                                }
                            }
                            else {
                                if (core.flag&BAM_FREAD1) {
                                    rpi = RPI_FIR;
                                }
                                else if (core.flag&BAM_FREAD2) {
                                    rpi = RPI_SEC;
                                }
                            }
                        }
                        else {
                            rpi = RPI_SNGL;
                        }

                        // make the funky Id
                        #define MAX_SEQ_ID_LEN 80
                        char * seq_id = calloc(MAX_SEQ_ID_LEN,
                                               sizeof(char));
                        // allocate the string to the buffer but check to
                        // ensure we're not cutting anything off
                        int id_len = snprintf(seq_id,
                                              MAX_SEQ_ID_LEN,
                                              "b_%s;c_%s;r_%s",
                                              prettyName,
                                              contigs[hh],
                                              seqId);
                        if(id_len >= MAX_SEQ_ID_LEN) {
                            seq_id = calloc(id_len+1, sizeof(char));
                            snprintf(seq_id,
                                     id_len+1, // don't forget the NULL!
                                     "b_%s;c_%s;r_%s",
                                     prettyName,
                                     contigs[hh],
                                     seqId);
                        }

                        // make the mapped read struct
                        prev = makeMappedRead(seq_id,
                                              seq,
                                              qual,
                                              id_len,
                                              seq_len,
                                              qual_len,
                                              rpi,
                                              groups[hh],
                                              prev);

                        if (0 == root) { root = prev; }

                        if(rpi == RPI_SNGL || \
                           rpi == RPI_SNGL_FIR || \
                           rpi == RPI_SNGL_SEC) {
                            // we can just add away
                            // indicate singleton reads by pointing the
                            // partner pointer to itself
                            prev->partnerRead = prev;
                        }
                        else {
                            // RPI_FIR or RPI_SEC
                            // work out pairing information using the hash
                            // we append a 1 or 2 to the end so that
                            // we don't accidentally pair 1's with 1's etc.
                            char * stripped_result;
                            if(rpi == RPI_FIR) {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '2');
                            }
                            else {
                                stripped_result = \
                                    pairStripper(seqId,
                                                 core.l_qname-1,
                                                 '1');
                            }

                            char * stripped = seqId;
                            if(stripped_result)
                                stripped = stripped_result;

                            //fprintf(stdout, "SEARCH %s\n", stripped);
                            // now stripped always holds a stripped value
                            // see if it is in the hash already
                            BM_mappedRead * stored_MR = \
                                cfuhash_get(pair_buffer,
                                            stripped);

                            if (0 != stored_MR) {
                                // exists in the hash -> Add the pair info
                                if(rpi == RPI_FIR) {
                                    prev->partnerRead = stored_MR;
                                }
                                else {
                                    stored_MR->partnerRead = prev;
                                }

                                // delete the entry from the hash
                                cfuhash_delete(pair_buffer,
                                               stripped);
                            }
                            else {
                                // we should put it in the hash
                                // make sure to change it into something
                                // we will find next time
                                if(rpi == RPI_FIR)
                                    stripped[strlen(stripped)-1] = '1';
                                else
                                    stripped[strlen(stripped)-1] = '2';

                                // check to make sure we're not overwriting
                                // anything important. cfuhash overwrites
                                // duplicate entries, so we need to grab
                                // it and put it to "SNGL_XXX" before we
                                // lose the pointer
                                BM_mappedRead * OWMMR = \
                                    cfuhash_put(pair_buffer,
                                                stripped, prev);
                                if(OWMMR) {
                                    if(OWMMR->rpi == RPI_FIR)
                                        OWMMR->rpi = RPI_SNGL_FIR;
                                    else
                                        OWMMR->rpi = RPI_SNGL_SEC;
                                    OWMMR->partnerRead = OWMMR;
                                    printPairCorruptionWarning(p_corrupt);
                                    p_corrupt = 1;
                                }


                            }

                            if(stripped_result != 0) { // free this!
                                free(stripped_result);
                                stripped_result = 0;
                            }
                        }
                    }
                    hts_itr_destroy(iter);
                    if (result < -1) {
                        fprintf(stderr, "ERROR: retrieval of reads from "\
                                        "contig:  \"%s\" failed due to "\
                                        "truncated file or corrupt BAM index "\
                                        "file\n", header->target_name[hh]);
                        break;
                    }
                }

                // any entries left in the hash are pairs whose mates did
                // not meet quality standards
                size_t key_size = 0;
                char * key;
                BM_mappedRead * LOMMR;
                size_t pr_size = 1;
                if(cfuhash_each_data(pair_buffer,
                                     (void**)&key,
                                     &key_size,
                                     (void**)&LOMMR,
                                     &pr_size)) {
                    do {
                        // get the mapped read
                        // update it's pairing so we know it's really single
                        if (LOMMR->rpi == RPI_FIR)
                            LOMMR->rpi = RPI_SNGL_FIR;
                        else if (LOMMR->rpi == RPI_SEC)
                            LOMMR->rpi = RPI_SNGL_SEC;

                        // indicate singleton reads by pointing the
                        // partner pointer to itself
                        LOMMR->partnerRead = LOMMR;

                    } while(cfuhash_next_data(pair_buffer,
                                              (void**)&key,
                                              &key_size,
                                              (void**)&LOMMR,
                                              &pr_size));
                }

                cfuhash_clear(pair_buffer);
                cfuhash_destroy(pair_buffer);
            }
            hts_idx_destroy(idx); // destroy the BAM index
        }
    }
    // always do this
    if (in) sam_close(in);
    bam_destroy1(b);
    if ( header ) bam_hdr_destroy(header);

    return root;
}
示例#21
0
static bool init(const parsed_opts_t* opts, state_t** state_out) {
    state_t* retval = (state_t*) calloc(1, sizeof(state_t));
    if (retval == NULL) {
        fprintf(stderr, "[init] Out of memory allocating state struct.\n");
        return false;
    }
    *state_out = retval;

    // Open files
    retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in);
    if (retval->input_file == NULL) {
        fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name);
        return false;
    }
    retval->input_header = sam_hdr_read(retval->input_file);

    retval->output_header = bam_hdr_dup(retval->input_header);
    retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out);

    if (retval->output_file == NULL) {
        print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name);
        return false;
    }

    if (opts->rg_line) {
        // Append new RG line to header.
        // Check does not already exist
        if ( confirm_rg(retval->output_header, opts->rg_id) ) {
            fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n");
            return false;
        }
        retval->rg_id = strdup(opts->rg_id);
        size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2;
        char* new_header = malloc(new_len);
        if (!new_header) {
            fprintf(stderr, "[init] Out of memory whilst writing new header.\n");
            return false;
        }
        sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line);
        free(retval->output_header->text);
        retval->output_header->text = new_header;
        retval->output_header->l_text = (int)new_len - 1;
    } else {
        if (opts->rg_id) {
            // Confirm what has been supplied exists
            if ( !confirm_rg(retval->output_header, opts->rg_id) ) {
                fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n");
                return false;
            }
            retval->rg_id = strdup(opts->rg_id);
        } else {
            if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) {
                fprintf(stderr, "No RG specified on command line or in existing header.\n");
                return false;
            }
        }
    }

    switch (opts->mode) {
        case overwrite_all:
            retval->mode_func = &overwrite_all_func;
            break;
        case orphan_only:
            retval->mode_func = &orphan_only_func;
            break;
    }

    return true;
}
示例#22
0
int main(int argc, char *argv[])
{
    samFile *in;
    char *fn_ref = 0;
    int flag = 0, c, clevel = -1, ignore_sam_err = 0;
    char moder[8];
    bam_hdr_t *h;
    bam1_t *b;
    htsFile *out;
    char modew[8];
    int r = 0, exit_code = 0;
    hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL;
    int nreads = 0;
    int benchmark = 0;

    while ((c = getopt(argc, argv, "IbDCSl:t:i:o:N:B")) >= 0) {
        switch (c) {
        case 'S': flag |= 1; break;
        case 'b': flag |= 2; break;
        case 'D': flag |= 4; break;
        case 'C': flag |= 8; break;
        case 'B': benchmark = 1; break;
        case 'l': clevel = atoi(optarg); flag |= 2; break;
        case 't': fn_ref = optarg; break;
        case 'I': ignore_sam_err = 1; break;
        case 'i': if (add_option(&in_opts,  optarg)) return 1; break;
        case 'o': if (add_option(&out_opts, optarg)) return 1; break;
        case 'N': nreads = atoi(optarg);
        }
    }
    if (argc == optind) {
        fprintf(stderr, "Usage: samview [-bSCSIB] [-N num_reads] [-l level] [-o option=value] <in.bam>|<in.sam>|<in.cram> [region]\n");
        return 1;
    }
    strcpy(moder, "r");
    if (flag&4) strcat(moder, "c");
    else if ((flag&1) == 0) strcat(moder, "b");

    in = sam_open(argv[optind], moder);
    if (in == NULL) {
        fprintf(stderr, "Error opening \"%s\"\n", argv[optind]);
        return EXIT_FAILURE;
    }
    h = sam_hdr_read(in);
    h->ignore_sam_err = ignore_sam_err;
    b = bam_init1();

    strcpy(modew, "w");
    if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
    if (flag&8) strcat(modew, "c");
    else if (flag&2) strcat(modew, "b");
    out = hts_open("-", modew);
    if (out == NULL) {
        fprintf(stderr, "Error opening standard output\n");
        return EXIT_FAILURE;
    }

    /* CRAM output */
    if (flag & 8) {
        int ret;

        // Parse input header and use for CRAM output
        out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text);

        // Create CRAM references arrays
        if (fn_ref)
            ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref);
        else
            // Attempt to fill out a cram->refs[] array from @SQ headers
            ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL);

        if (ret != 0)
            return EXIT_FAILURE;
    }

    // Process any options; currently cram only.
    for (; in_opts;  in_opts = (last=in_opts)->next, free(last)) {
        hts_set_opt(in,  in_opts->opt,  in_opts->val);
        if (in_opts->opt == CRAM_OPT_REFERENCE)
            if (hts_set_opt(out,  in_opts->opt,  in_opts->val) != 0)
                return EXIT_FAILURE;
    }
    for (; out_opts;  out_opts = (last=out_opts)->next, free(last))
        if (hts_set_opt(out, out_opts->opt,  out_opts->val) != 0)
            return EXIT_FAILURE;

    if (!benchmark)
        sam_hdr_write(out, h);
    if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
        int i;
        hts_idx_t *idx;
        if ((idx = sam_index_load(in, argv[optind])) == 0) {
            fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
            return 1;
        }
        for (i = optind + 1; i < argc; ++i) {
            hts_itr_t *iter;
            if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) {
                fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
                continue;
            }
            while ((r = sam_itr_next(in, iter, b)) >= 0) {
                if (!benchmark && sam_write1(out, h, b) < 0) {
                    fprintf(stderr, "Error writing output.\n");
                    exit_code = 1;
                    break;
                }
                if (nreads && --nreads == 0)
                    break;
            }
            hts_itr_destroy(iter);
        }
        hts_idx_destroy(idx);
    } else while ((r = sam_read1(in, h, b)) >= 0) {
        if (!benchmark && sam_write1(out, h, b) < 0) {
            fprintf(stderr, "Error writing output.\n");
            exit_code = 1;
            break;
        }
        if (nreads && --nreads == 0)
            break;
    }

    if (r < -1) {
        fprintf(stderr, "Error parsing input.\n");
        exit_code = 1;
    }

    r = sam_close(out);
    if (r < 0) {
        fprintf(stderr, "Error closing output.\n");
        exit_code = 1;
    }

    bam_destroy1(b);
    bam_hdr_destroy(h);

    r = sam_close(in);
    if (r < 0) {
        fprintf(stderr, "Error closing input.\n");
        exit_code = 1;
    }

    return exit_code;
}
示例#23
0
multiReader::multiReader(int argc,char**argv){
  gz=Z_NULL;
  myglf=NULL;myvcf=NULL;mpil=NULL;bglObj=NULL;
  
  nLines=50;
  
  fname=NULL;
  intName=1;
  minQ = MINQ;

  nInd =0;
  isSim =0;
  args=NULL;
  args = setArgStruct(argc,argv);
  fprintf(args->argumentFile,"\t-> Command: \n");
  for(int i=0;i<argc;i++)
    fprintf(args->argumentFile,"%s ",argv[i]);
  //  fprintf(args->argumentFile,"\n\n");
  if(args->argumentFile!=stderr)
    fprintf(args->argumentFile,"\n\t-> angsd version: %s (htslib: %s) build(%s %s)\n",ANGSD_VERSION,hts_version(),__DATE__,__TIME__); 
  void printTime(FILE *fp);
  printTime(args->argumentFile); 


  //type = args->inputtype;

  if(args->argc==2) {
    if((!strcasecmp(args->argv[1],"-beagle")) ||
       (!strcasecmp(args->argv[1],"-glf")) ||
       (!strcasecmp(args->argv[1],"-glf3")) ||
       (!strcasecmp(args->argv[1],"-pileup")) ||
       (!strcasecmp(args->argv[1],"-vcf-GL")) ||
       (!strcasecmp(args->argv[1],"-vcf-pl")) ||
       (!strcasecmp(args->argv[1],"-glf10_text")) ||
       (!strcasecmp(args->argv[1],"-vcf-GP"))) {
      printArg(stdout,args);
      exit(0);
    }else if ((!strcasecmp(args->argv[1],"-bam")) ||
	      (!strcasecmp(args->argv[1],"-b"))){
      setArgsBam(args);
      exit(0);
    }else
      return;
  }
  
  getOptions(args);
  if(args->fai==NULL){
    int printAndExit =0;
    switch(args->inputtype)
      {
      case INPUT_GLF:
	printAndExit=1;
	break;
      case INPUT_GLF10_TEXT:
	printAndExit=1;
	break;
      case INPUT_GLF3:
	printAndExit=1;
	break;
      case INPUT_BEAGLE:
	printAndExit=1;
	break;
      case INPUT_PILEUP:
	printAndExit=1;
	break;
      }
    if(printAndExit){
      fprintf(stderr,"\t-> Must supply -fai file\n");
      exit(0);
    }
  }
  
  if(args->fai){
    if(!(args->hd=getHeadFromFai(args->fai)))
      exit(0);
  }else{
    if(args->nams.size()==0){
      fprintf(stderr,"\t-> Must choose inputfile -bam/-glf/-glf3/-pileup/-i/-vcf-gl/-vcf-gp/-vcf-pl/-glf10_text filename\n");
      exit(0);
    }
    if(args->inputtype==INPUT_BAM){
      htsFile *in=sam_open(args->nams[0],"r");
      assert(in);
      args->hd= sam_hdr_read(in);
      hts_close(in);
    }
  }
 
  if(!(INPUT_VCF_GL||INPUT_VCF_GP)){
    if(args->hd==NULL){
      fprintf(stderr,"For non-bams you should include -fai arguments\n");
      exit(0);
    }
  }

  if((args->inputtype==INPUT_PILEUP||args->inputtype==INPUT_GLF||args->inputtype==INPUT_GLF3||args->inputtype==INPUT_GLF10_TEXT)){
    if(nInd==0){
      fprintf(stderr,"\t-> Must supply -nInd when using -glf/-glf3/-pileup/-glf10_text files\n");
      exit(0);
    }
  }else
    args->nInd = args->nams.size();

  if(args->inputtype==INPUT_VCF_GP||args->inputtype==INPUT_VCF_GL){
    if(args->regions.size()>1){
      fprintf(stderr,"\t-> Only one region can be specified with using bcf (i doubt more is needed)  will exit\n");
      exit(0);
    }else if(args->regions.size()<=1){
      myvcf = new vcfReader(args->infile,NULL);
      args->hd=bcf_hdr_2_bam_hdr_t(myvcf->hs);
      args->nInd = myvcf->hs->nsamples;
    }
  }
  //make args->hd
  
  revMap = buildRevTable(args->hd);
  args->revMap = revMap;
  setArgsBam(args);
  if(args->inputtype==INPUT_VCF_GL){
    if(args->regions.size()==1){
      char tmp[1024];
      int start=args->regions[0].start;
      int stop=args->regions[0].stop;
      int ref=args->regions[0].refID;
      snprintf(tmp,1024,"%s:%d-%d",args->hd->target_name[ref],start+1,stop);
      //    fprintf(stderr,"tmp:%s\n",tmp);
      //    exit(0);
      myvcf->seek(tmp);
    }
    
  }


  
  if(fname==NULL)
    return;

  gz=Z_NULL;
  gz=gzopen(fname,"r");   
  if(gz==Z_NULL){
    fprintf(stderr,"\t-> Problem opening file: \'%s\'\n",fname);
    exit(0);
  }

  switch(args->inputtype){
  case INPUT_PILEUP:{
    mpil = new mpileup(args->nInd,gz,args->revMap,minQ);
    break;
  }
  case INPUT_GLF:{
    myglf = new glfReader(args->nInd,gz,10,isSim);
    break;
  }
  case INPUT_GLF3:{
    isSim = 1; //Added by FGV on 22/02/2015: GLF3 is always simulated data until a better alternative can be found
    myglf = new glfReader(args->nInd,gz,3,isSim);
    break;
  }
  case INPUT_GLF10_TEXT:{
    myglf_text = new glfReader_text(args->nInd,gz,args->revMap);
    break;
  }

  case INPUT_BEAGLE:{
    bglObj = new beagle_reader(gz,args->revMap,intName,args->nInd);
    break;
  }
    
  default:{
    break;
  }
  }
  if(args->inputtype==INPUT_VCF_GL||args->inputtype==INPUT_VCF_GL){
    fprintf(stderr,"\t-> VCF still beta. Remember that\n");
    fprintf(stderr,"\t   1. indels are are discarded\n");
    fprintf(stderr,"\t   2. will use chrom, pos PL columns\n");
    fprintf(stderr,"\t   3. GL tags are interpreted as log10 and are scaled to ln (NOT USED)\n");
    fprintf(stderr,"\t   4. GP tags are interpreted directly as unscaled post probs (spec says phredscaled...) (NOT USED)\n");
    fprintf(stderr,"\t   5. FILTER column is currently NOT used (not sure what concensus is)\n");
    fprintf(stderr,"\t   6. -sites does NOT work with vcf input but -r does\n");
    fprintf(stderr,"\t   7. vcffilereading is still BETA, please report strange behaviour\n");
  }
}
示例#24
0
int ctx_calls2vcf(int argc, char **argv)
{
  const char *in_path = NULL, *out_path = NULL, *out_type = NULL;
  // Filtering parameters
  int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1;
  // Alignment parameters
  int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1;
  // ref paths
  char const*const* ref_paths = NULL;
  size_t nref_paths = 0;
  // flank file
  const char *sam_path = NULL;

  //
  // Things we figure out by looking at the input
  //
  bool isbubble = false;
  // samples in VCF, (0 for bubble, does not include ref in breakpoint calls)
  size_t i, kmer_size, num_samples;

  //
  // Reference genome
  //
  // Hash map of chromosome name -> sequence
  ChromHash *genome;
  ReadBuffer chroms;

  // Arg parsing
  char cmd[100];
  char shortopts[300];
  cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts));
  int c;

  // silence error messages from getopt_long
  // opterr = 0;

  while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) {
    cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd));
    switch(c) {
      case 0: /* flag set */ break;
      case 'h': cmd_print_usage(NULL); break;
      case 'o': cmd_check(!out_path, cmd); out_path = optarg; break;
      case 'O': cmd_check(!out_type, cmd); out_type = optarg; break;
      case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break;
      case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break;
      case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break;
      case 'A': cmd_check(max_align_len  < 0,cmd); max_align_len  = cmd_uint32(cmd, optarg); break;
      case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break;
      case 'm': nwmatch = cmd_int32(cmd, optarg); break;
      case 'M': nwmismatch = cmd_int32(cmd, optarg); break;
      case 'g': nwgapopen = cmd_int32(cmd, optarg); break;
      case 'G': nwgapextend = cmd_int32(cmd, optarg); break;
      case ':': /* BADARG */
      case '?': /* BADCH getopt_long has already printed error */
        die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]);
      default: ctx_assert2(0, "shouldn't reach here: %c", c);
    }
  }

  // Defaults for unset values
  if(out_path == NULL) out_path = "-";
  if(max_align_len  < 0) max_align_len  = DEFAULT_MAX_ALIGN;
  if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE;

  if(optind+2 > argc)
    cmd_print_usage("Require <in.txt.gz> and at least one reference");

  in_path = argv[optind++];
  ref_paths = (char const*const*)argv + optind;
  nref_paths = argc - optind;

  // These functions call die() on error
  gzFile gzin = futil_gzopen(in_path, "r");

  // Read call file header
  cJSON *json = json_hdr_load(gzin, in_path);

  // Check we can handle the kmer size
  kmer_size = json_hdr_get_kmer_size(json, in_path);
  db_graph_check_kmer_size(kmer_size, in_path);

  // Get format (bubble or breakpoint file)
  cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path);
  if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false;
  else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true;
  else die("Unknown format: '%s'", json_fmt->valuestring);

  status("Reading %s in %s format", futil_inpath_str(in_path),
         isbubble ? "bubble" : "breakpoint");

  if(isbubble) {
    // bubble specific
    if(sam_path == NULL)
      cmd_print_usage("Require -F <flanks.sam> with bubble file");
    if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ;
  }
  else {
    // breakpoint specific
    if(min_mapq >= 0)
      cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls");
  }

  // Open flank file if it exists
  htsFile *samfh = NULL;
  bam_hdr_t *bam_hdr = NULL;
  bam1_t *mflank = NULL;

  if(sam_path)
  {
    if((samfh = hts_open(sam_path, "r")) == NULL)
      die("Cannot open SAM/BAM %s", sam_path);

    // Load BAM header
    bam_hdr = sam_hdr_read(samfh);
    if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path);
    mflank = bam_init1();
  }

  // Output VCF has 0 samples if bubbles file, otherwise has N where N is
  // number of samples/colours in the breakpoint graph
  size_t num_graph_samples = json_hdr_get_ncols(json, in_path);
  size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path);

  num_samples = 0;
  if(!isbubble) {
    // If last colour has "is_ref", drop number of samples by one
    num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1
                                                       : num_graph_samples;
  }

  //
  // Open output file
  //
  if(!out_path) out_path = "-";
  int mode = vcf_misc_get_outtype(out_type, out_path);
  futil_create_output(out_path);
  htsFile *vcffh = hts_open(out_path, modes_htslib[mode]);

  status("[calls2vcf] Reading %s call file with %zu samples",
         isbubble ? "Bubble" : "Breakpoint", num_graph_samples);
  status("[calls2vcf] %zu sample output to: %s format: %s",
         num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]);

  if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq);
  status("[calls2vcf] max alignment length: %i", max_align_len);
  status("[calls2vcf] max VCF allele length: %i", max_allele_len);
  status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i",
         nwmatch, nwmismatch, nwgapopen, nwgapextend);

  // Load reference genome
  read_buf_alloc(&chroms, 1024);
  genome = chrom_hash_init();
  chrom_hash_load(ref_paths, nref_paths, &chroms, genome);

  // convert to upper case
  char *s;
  for(i = 0; i < chroms.len; i++)
    for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s);

  if(!isbubble) brkpnt_check_refs_match(json, genome, in_path);

  bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size,
                                   ref_paths, nref_paths,
                                   chroms.b, chroms.len);

  if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header");

  AlignedCall *call = acall_init();
  CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr);

  scoring_t *scoring = call_decomp_get_scoring(aligner);
  scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
               false, false, 0, 0, 0, 0);

  CallFileEntry centry;
  call_file_entry_alloc(&centry);

  char kmer_str[50];
  sprintf(kmer_str, ";K%zu", kmer_size);

  if(isbubble)
  {
    // Bubble calls
    DecompBubble *bubbles = decomp_bubble_init();

    // Set scoring for aligning 3' flank
    scoring = decomp_bubble_get_scoring(bubbles);
    scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend,
                 true, true, 0, 0, 0, 0);

    while(call_file_read(gzin, in_path, &centry)) {
      do {
        if(sam_read1(samfh, bam_hdr, mflank) < 0)
          die("We've run out of SAM entries!");
      } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY));

      // Align call
      strbuf_reset(&call->info);
      decomp_bubble_call(bubbles, genome, kmer_size, min_mapq,
                         &centry, mflank, bam_hdr, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats));
    decomp_bubble_cpy_stats(bub_stats, bubbles);
    print_bubble_stats(bub_stats);
    ctx_free(bub_stats);

    decomp_bubble_destroy(bubbles);
  }
  else
  {
    // Breakpoint calls
    DecompBreakpoint *breakpoints = decomp_brkpt_init();

    while(call_file_read(gzin, in_path, &centry)) {
      strbuf_reset(&call->info);
      decomp_brkpt_call(breakpoints, genome, num_samples, &centry, call);
      strbuf_append_str(&call->info, kmer_str);
      acall_decompose(aligner, call, max_align_len, max_allele_len);
    }

    // print bubble stats
    DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats));
    decomp_brkpt_cpy_stats(brk_stats, breakpoints);
    print_breakpoint_stats(brk_stats);
    ctx_free(brk_stats);

    decomp_brkpt_destroy(breakpoints);
  }

  // Print stats
  DecomposeStats *astats = ctx_calloc(1, sizeof(*astats));
  call_decomp_cpy_stats(astats, aligner);
  print_acall_stats(astats);
  ctx_free(astats);

  call_file_entry_dealloc(&centry);
  call_decomp_destroy(aligner);
  acall_destroy(call);

  // Finished - clean up
  cJSON_Delete(json);
  gzclose(gzin);

  bcf_hdr_destroy(vcfhdr);
  hts_close(vcffh);

  for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]);
  read_buf_dealloc(&chroms);
  chrom_hash_destroy(genome);

  if(sam_path) {
    hts_close(samfh);
    bam_hdr_destroy(bam_hdr);
    bam_destroy1(mflank);
  }

  return EXIT_SUCCESS;
}
示例#25
0
int main_samview(int argc, char *argv[])
{
	samFile *in;
	char *fn_ref = 0;
	int flag = 0, c, clevel = -1, ignore_sam_err = 0;
	char moder[8];
	bam_hdr_t *h;
	bam1_t *b;

	while ((c = getopt(argc, argv, "IbSl:t:")) >= 0) {
		switch (c) {
		case 'S': flag |= 1; break;
		case 'b': flag |= 2; break;
		case 'l': clevel = atoi(optarg); flag |= 2; break;
		case 't': fn_ref = optarg; break;
		case 'I': ignore_sam_err = 1; break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "Usage: samview [-bSI] [-l level] <in.bam>|<in.sam> [region]\n");
		return 1;
	}
	strcpy(moder, "r");
	if ((flag&1) == 0) strcat(moder, "b");

	in = sam_open(argv[optind], moder, fn_ref);
	h = sam_hdr_read(in);
	h->ignore_sam_err = ignore_sam_err;
	b = bam_init1();

	if ((flag&4) == 0) { // SAM/BAM output
		htsFile *out;
		char modew[8];
		strcpy(modew, "w");
		if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel);
		if (flag&2) strcat(modew, "b");
		out = hts_open("-", modew, 0);
		sam_hdr_write(out, h);
		if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region
			int i;
			hts_idx_t *idx;
			if ((idx = bam_index_load(argv[optind])) == 0) {
				fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__);
				return 1;
			}
			for (i = optind + 1; i < argc; ++i) {
				hts_itr_t *iter;
				if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) {
					fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]);
					continue;
				}
				while (bam_itr_next((BGZF*)in->fp, iter, b) >= 0) sam_write1(out, h, b);
				hts_itr_destroy(iter);
			}
			hts_idx_destroy(idx);
		} else while (sam_read1(in, h, b) >= 0) sam_write1(out, h, b);
		sam_close(out);
	}

	bam_destroy1(b);
	bam_hdr_destroy(h);
	sam_close(in);
	return 0;
}
示例#26
0
int main_bedcov(int argc, char *argv[])
{
    gzFile fp;
    kstring_t str;
    kstream_t *ks;
    hts_idx_t **idx;
    aux_t **aux;
    int *n_plp, dret, i, n, c, min_mapQ = 0;
    int64_t *cnt;
    const bam_pileup1_t **plp;
    int usage = 0;

    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
        { NULL, 0, NULL, 0 }
    };

    while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) {
        switch (c) {
        case 'Q': min_mapQ = atoi(optarg); break;
        default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                  /* else fall-through */
        case '?': usage = 1; break;
        }
        if (usage) break;
    }
    if (usage || optind + 2 > argc) {
        fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
        fprintf(pysam_stderr, "  -Q INT       Only count bases of at least INT quality [0]\n");
        sam_global_opt_help(pysam_stderr, "-.--.");
        return 1;
    }
    memset(&str, 0, sizeof(kstring_t));
    n = argc - optind - 1;
    aux = calloc(n, sizeof(aux_t*));
    idx = calloc(n, sizeof(hts_idx_t*));
    for (i = 0; i < n; ++i) {
        aux[i] = calloc(1, sizeof(aux_t));
        aux[i]->min_mapQ = min_mapQ;
        aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
        if (aux[i]->fp)
            idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
        if (aux[i]->fp == 0 || idx[i] == 0) {
            fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
            return 2;
        }
        // TODO bgzf_set_cache_size(aux[i]->fp, 20);
        aux[i]->header = sam_hdr_read(aux[i]->fp);
        if (aux[i]->header == NULL) {
            fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n",
                    argv[i+optind+1]);
            return 2;
        }
    }
    cnt = calloc(n, 8);

    fp = gzopen(argv[optind], "rb");
    ks = ks_init(fp);
    n_plp = calloc(n, sizeof(int));
    plp = calloc(n, sizeof(bam_pileup1_t*));
    while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
        char *p, *q;
        int tid, beg, end, pos;
        bam_mplp_t mplp;

        for (p = q = str.s; *p && *p != '\t'; ++p);
        if (*p != '\t') goto bed_error;
        *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t';
        if (tid < 0) goto bed_error;
        for (q = p = p + 1; isdigit(*p); ++p);
        if (*p != '\t') goto bed_error;
        *p = 0; beg = atoi(q); *p = '\t';
        for (q = p = p + 1; isdigit(*p); ++p);
        if (*p == '\t' || *p == 0) {
            int c = *p;
            *p = 0; end = atoi(q); *p = c;
        } else goto bed_error;

        for (i = 0; i < n; ++i) {
            if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
            aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
        }
        mplp = bam_mplp_init(n, read_bam, (void**)aux);
        bam_mplp_set_maxcnt(mplp, 64000);
        memset(cnt, 0, 8 * n);
        while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
            if (pos >= beg && pos < end)
                for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
        for (i = 0; i < n; ++i) {
            kputc('\t', &str);
            kputl(cnt[i], &str);
        }
        fputs(str.s, pysam_stdout) & fputc('\n', pysam_stdout);
        bam_mplp_destroy(mplp);
        continue;

bed_error:
        fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s);
    }
    free(n_plp); free(plp);
    ks_destroy(ks);
    gzclose(fp);

    free(cnt);
    for (i = 0; i < n; ++i) {
        if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
        hts_idx_destroy(idx[i]);
        bam_hdr_destroy(aux[i]->header);
        sam_close(aux[i]->fp);
        free(aux[i]);
    }
    free(aux); free(idx);
    free(str.s);
    sam_global_args_free(&ga);
    return 0;
}
示例#27
0
文件: bam_md.c 项目: pd3/samtools
int bam_fillmd(int argc, char *argv[])
{
    int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag;
    samFile *fp = NULL, *fpout = NULL;
    bam_hdr_t *header = NULL;
    faidx_t *fai = NULL;
    char *ref = NULL, mode_w[8], *ref_file;
    bam1_t *b = NULL;
    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;

    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0),
        { NULL, 0, NULL, 0 }
    };

    flt_flag = UPDATE_NM | UPDATE_MD;
    is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0;
    strcpy(mode_w, "w");
    while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) {
        switch (c) {
        case 'r':
            is_realn = 1;
            break;
        case 'e':
            flt_flag |= USE_EQUAL;
            break;
        case 'd':
            flt_flag |= DROP_TAG;
            break;
        case 'q':
            flt_flag |= BIN_QUAL;
            break;
        case 'h':
            flt_flag |= HASH_QNM;
            break;
        case 'N':
            flt_flag &= ~(UPDATE_MD|UPDATE_NM);
            break;
        case 'b':
            is_bam_out = 1;
            break;
        case 'u':
            is_uncompressed = is_bam_out = 1;
            break;
        case 'S':
            break;
        case 'n':
            max_nm = atoi(optarg);
            break;
        case 'C':
            capQ = atoi(optarg);
            break;
        case 'A':
            baq_flag |= 1;
            break;
        case 'E':
            baq_flag |= 2;
            break;
        default:
            if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
            fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c);
        /* else fall-through */
        case '?':
            return calmd_usage();
        }
    }
    if (is_bam_out) strcat(mode_w, "b");
    else strcat(mode_w, "h");
    if (is_uncompressed) strcat(mode_w, "0");
    if (optind + (ga.reference == NULL) >= argc)
        return calmd_usage();
    fp = sam_open_format(argv[optind], "r", &ga.in);
    if (fp == NULL) {
        print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]);
        return 1;
    }

    header = sam_hdr_read(fp);
    if (header == NULL || header->n_targets == 0) {
        fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n");
        goto fail;
    }

    fpout = sam_open_format("-", mode_w, &ga.out);
    if (fpout == NULL) {
        print_error_errno("calmd", "Failed to open output");
        goto fail;
    }
    if (sam_hdr_write(fpout, header) < 0) {
        print_error_errno("calmd", "Failed to write sam header");
        goto fail;
    }

    ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference;
    fai = fai_load(ref_file);

    if (!fai) {
        print_error_errno("calmd", "Failed to open reference file '%s'", ref_file);
        goto fail;
    }

    b = bam_init1();
    if (!b) {
        fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n");
        goto fail;
    }
    while ((ret = sam_read1(fp, header, b)) >= 0) {
        if (b->core.tid >= 0) {
            if (tid != b->core.tid) {
                free(ref);
                ref = fai_fetch(fai, header->target_name[b->core.tid], &len);
                tid = b->core.tid;
                if (ref == 0) { // FIXME: Should this always be fatal?
                    fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n",
                            header->target_name[tid]);
                    if (is_realn || capQ > 10) goto fail; // Would otherwise crash
                }
            }
            if (is_realn) sam_prob_realn(b, ref, len, baq_flag);
            if (capQ > 10) {
                int q = sam_cap_mapq(b, ref, len, capQ);
                if (b->core.qual > q) b->core.qual = q;
            }
            if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm);
        }
        if (sam_write1(fpout, header, b) < 0) {
            print_error_errno("calmd", "failed to write to output file");
            goto fail;
        }
    }
    if (ret < -1) {
        fprintf(stderr, "[bam_fillmd] Error reading input.\n");
        goto fail;
    }
    bam_destroy1(b);
    bam_hdr_destroy(header);

    free(ref);
    fai_destroy(fai);
    sam_close(fp);
    if (sam_close(fpout) < 0) {
        fprintf(stderr, "[bam_fillmd] error when closing output file\n");
        return 1;
    }
    return 0;

fail:
    free(ref);
    if (b) bam_destroy1(b);
    if (header) bam_hdr_destroy(header);
    if (fai) fai_destroy(fai);
    if (fp) sam_close(fp);
    if (fpout) sam_close(fpout);
    return 1;
}
示例#28
0
文件: bam2depth.c 项目: pd3/samtools
int main_depth(int argc, char *argv[])
{
    int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0;
    int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1;
    const bam_pileup1_t **plp;
    char *reg = 0; // specified region
    void *bed = 0; // BED data structure
    char *file_list = NULL, **fn = NULL;
    bam_hdr_t *h = NULL; // BAM header of the 1st input
    aux_t **data;
    bam_mplp_t mplp;
    int last_pos = -1, last_tid = -1, ret;

    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
        { NULL, 0, NULL, 0 }
    };

    // parse the command line
    while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) {
        switch (n) {
        case 'l':
            min_len = atoi(optarg);
            break; // minimum query length
        case 'r':
            reg = strdup(optarg);
            break;   // parsing a region requires a BAM header
        case 'b':
            bed = bed_read(optarg); // BED or position list file can be parsed now
            if (!bed) {
                print_error_errno("depth", "Could not read file \"%s\"", optarg);
                return 1;
            }
            break;
        case 'q':
            baseQ = atoi(optarg);
            break;   // base quality threshold
        case 'Q':
            mapQ = atoi(optarg);
            break;    // mapping quality threshold
        case 'f':
            file_list = optarg;
            break;
        case 'a':
            all++;
            break;
        case 'd':
        case 'm':
            max_depth = atoi(optarg);
            break; // maximum coverage depth
        default:
            if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break;
        /* else fall-through */
        case '?':
            return usage();
        }
    }
    if (optind == argc && !file_list)
        return usage();

    // initialize the auxiliary data structures
    if (file_list)
    {
        if ( read_file_list(file_list,&nfiles,&fn) ) return 1;
        n = nfiles;
        argv = fn;
        optind = 0;
    }
    else
        n = argc - optind; // the number of BAMs on the command line
    data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input
    reg_tid = 0;
    beg = 0;
    end = INT_MAX;  // set the default region
    for (i = 0; i < n; ++i) {
        int rf;
        data[i] = calloc(1, sizeof(aux_t));
        data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM
        if (data[i]->fp == NULL) {
            print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]);
            status = EXIT_FAILURE;
            goto depth_end;
        }
        rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ;
        if (baseQ) rf |= SAM_QUAL;
        if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) {
            fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n");
            return 1;
        }
        if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
            fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
            return 1;
        }
        data[i]->min_mapQ = mapQ;                    // set the mapQ filter
        data[i]->min_len  = min_len;                 // set the qlen filter
        data[i]->hdr = sam_hdr_read(data[i]->fp);    // read the BAM header
        if (data[i]->hdr == NULL) {
            fprintf(stderr, "Couldn't read header for \"%s\"\n",
                    argv[optind+i]);
            status = EXIT_FAILURE;
            goto depth_end;
        }
        if (reg) { // if a region is specified
            hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]);  // load the index
            if (idx == NULL) {
                print_error("depth", "can't load index for \"%s\"", argv[optind+i]);
                status = EXIT_FAILURE;
                goto depth_end;
            }
            data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator
            hts_idx_destroy(idx); // the index is not needed any more; free the memory
            if (data[i]->iter == NULL) {
                print_error("depth", "can't parse region \"%s\"", reg);
                status = EXIT_FAILURE;
                goto depth_end;
            }
        }
    }

    h = data[0]->hdr; // easy access to the header of the 1st BAM
    if (reg) {
        beg = data[0]->iter->beg; // and to the parsed region coordinates
        end = data[0]->iter->end;
        reg_tid = data[0]->iter->tid;
    }

    // the core multi-pileup loop
    mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization
    if (0 < max_depth)
        bam_mplp_set_maxcnt(mplp,max_depth);  // set maximum coverage depth
    n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM
    plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp)
    while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position
        if (pos < beg || pos >= end) continue; // out of range; skip
        if (tid >= h->n_targets) continue;     // diff number of @SQ lines per file?
        if (all) {
            while (tid > last_tid) {
                if (last_tid >= 0 && !reg) {
                    // Deal with remainder or entirety of last tid.
                    while (++last_pos < h->target_len[last_tid]) {
                        // Horribly inefficient, but the bed API is an obfuscated black box.
                        if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
                            continue;
                        fputs(h->target_name[last_tid], stdout);
                        printf("\t%d", last_pos+1);
                        for (i = 0; i < n; i++)
                            putchar('\t'), putchar('0');
                        putchar('\n');
                    }
                }
                last_tid++;
                last_pos = -1;
                if (all < 2)
                    break;
            }

            // Deal with missing portion of current tid
            while (++last_pos < pos) {
                if (last_pos < beg) continue; // out of range; skip
                if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0)
                    continue;
                fputs(h->target_name[tid], stdout);
                printf("\t%d", last_pos+1);
                for (i = 0; i < n; i++)
                    putchar('\t'), putchar('0');
                putchar('\n');
            }

            last_tid = tid;
            last_pos = pos;
        }
        if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue;
        fputs(h->target_name[tid], stdout);
        printf("\t%d", pos+1); // a customized printf() would be faster
        for (i = 0; i < n; ++i) { // base level filters have to go here
            int j, m = 0;
            for (j = 0; j < n_plp[i]; ++j) {
                const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know
                if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos
                else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality
            }
            printf("\t%d", n_plp[i] - m); // this the depth to output
        }
        putchar('\n');
    }
    if (ret < 0) status = EXIT_FAILURE;
    free(n_plp);
    free(plp);
    bam_mplp_destroy(mplp);

    if (all) {
        // Handle terminating region
        if (last_tid < 0 && reg && all > 1) {
            last_tid = reg_tid;
            last_pos = beg-1;
        }
        while (last_tid >= 0 && last_tid < h->n_targets) {
            while (++last_pos < h->target_len[last_tid]) {
                if (last_pos >= end) break;
                if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0)
                    continue;
                fputs(h->target_name[last_tid], stdout);
                printf("\t%d", last_pos+1);
                for (i = 0; i < n; i++)
                    putchar('\t'), putchar('0');
                putchar('\n');
            }
            last_tid++;
            last_pos = -1;
            if (all < 2 || reg)
                break;
        }
    }

depth_end:
    for (i = 0; i < n && data[i]; ++i) {
        bam_hdr_destroy(data[i]->hdr);
        if (data[i]->fp) sam_close(data[i]->fp);
        hts_itr_destroy(data[i]->iter);
        free(data[i]);
    }
    free(data);
    free(reg);
    if (bed) bed_destroy(bed);
    if ( file_list )
    {
        for (i=0; i<n; i++) free(fn[i]);
        free(fn);
    }
    sam_global_args_free(&ga);
    return status;
}
示例#29
0
static int mpileup(mplp_conf_t *conf)
{
    if (conf->nfiles == 0) {
        fprintf(stderr,"[%s] no input file/data given\n", __func__);
        exit(EXIT_FAILURE);
    }

    mplp_ref_t mp_ref = MPLP_REF_INIT;
    conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t));
    conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*));
    conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*));
    conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int));

    // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index
    // must be kept in the memory for the whole time which can be a problem with many bams.
    // Therefore if none or only one region is requested, we initialize the bam iterator as
    // before and free the index. Only when multiple regions are queried, we keep the index.
    int nregs = 0;
    if ( conf->reg_fname )
    {
        if ( conf->reg_is_file )
        {
            conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL);
            if ( !conf->reg ) {
                fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
                exit(EXIT_FAILURE);
            }
        }
        else
        {
            conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL);
            if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) {
                fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname);
                exit(EXIT_FAILURE);
            }
        }
        nregs = regidx_nregs(conf->reg);
        conf->reg_itr = regitr_init(conf->reg);
        regitr_loop(conf->reg_itr);   // region iterator now positioned at the first region
    }

    // read the header of each file in the list and initialize data
    // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least!
    bam_hdr_t *hdr = NULL;      // header of first file in input list
    int i;
    for (i = 0; i < conf->nfiles; ++i) {
        bam_hdr_t *h_tmp;
        conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t));
        conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb");
        if ( !conf->mplp_data[i]->fp )
        {
            fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno));
            exit(EXIT_FAILURE);
        }
        if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) {
            fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n");
            exit(EXIT_FAILURE);
        }
        if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) {
            fprintf(stderr, "[%s] failed to process %s: %s\n",
                    __func__, conf->fai_fname, strerror(errno));
            exit(EXIT_FAILURE);
        }
        conf->mplp_data[i]->conf = conf;
        conf->mplp_data[i]->ref = &mp_ref;
        h_tmp = sam_hdr_read(conf->mplp_data[i]->fp);
        if ( !h_tmp ) {
            fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]);
            exit(EXIT_FAILURE);
        }
        conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet
        conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]);
        if ( conf->mplp_data[i]->bam_id<0 )
        {
            // no usable readgroups in this bam, it can be skipped
            sam_close(conf->mplp_data[i]->fp);
            free(conf->mplp_data[i]);
            bam_hdr_destroy(h_tmp);
            free(conf->files[i]);
            if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1));
            conf->nfiles--;
            i--;
            continue;
        }
        if (conf->reg) {
            hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]);
            if (idx == NULL) {
                fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]);
                exit(EXIT_FAILURE);
            }
            conf->buf.l = 0;
            ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1);
            conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s);
            if ( !conf->mplp_data[i]->iter ) 
            {
                conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                if ( conf->mplp_data[i]->iter ) {
                    fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
                    exit(EXIT_FAILURE);
                }
                fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
                exit(EXIT_FAILURE);
            }
            if ( nregs==1 ) // no need to keep the index in memory
               hts_idx_destroy(idx);
            else
                conf->mplp_data[i]->idx = idx;
        }

        if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */
        else {
            // FIXME: check consistency between h and h_tmp
            bam_hdr_destroy(h_tmp);

            // we store only the first file's header; it's (alleged to be)
            // compatible with the i-th file's target_name lookup needs
            conf->mplp_data[i]->h = hdr;
        }
    }
    // allocate data storage proportionate to number of samples being studied sm->n
    bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n);
    conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int));
    conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int));
    conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*));  

    fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles);
    // write the VCF header
    conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type));
    if (conf->bcf_fp == NULL) {
        fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno));
        exit(EXIT_FAILURE);
    }
    if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads);

    // BCF header creation
    conf->bcf_hdr = bcf_hdr_init("w");
    conf->buf.l = 0;

    if (conf->record_cmd_line)
    {
        ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version());
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);

        conf->buf.l = 0;
        ksprintf(&conf->buf, "##bcftoolsCommand=mpileup");
        for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]);
        kputc('\n', &conf->buf);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }

    if (conf->fai_fname)
    {
        conf->buf.l = 0;
        ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }

    // Translate BAM @SQ tags to BCF ##contig tags
    // todo: use/write new BAM header manipulation routines, fill also UR, M5
    for (i=0; i<hdr->n_targets; i++)
    {
        conf->buf.l = 0;
        ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]);
        bcf_hdr_append(conf->bcf_hdr, conf->buf.s);
    }
    conf->buf.l = 0;

    bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">");
#if CDF_MWU_TESTS
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">");
#endif
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">");
    bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">");
    bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">");
    if ( conf->fmt_flag&B2B_FMT_DP )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">");
    if ( conf->fmt_flag&B2B_FMT_DV )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">");
    if ( conf->fmt_flag&B2B_FMT_DPR )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
    if ( conf->fmt_flag&B2B_INFO_DPR )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">");
    if ( conf->fmt_flag&B2B_FMT_DP4 )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">");
    if ( conf->fmt_flag&B2B_FMT_SP )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">");
    if ( conf->fmt_flag&B2B_FMT_AD )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">");
    if ( conf->fmt_flag&B2B_FMT_ADF )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">");
    if ( conf->fmt_flag&B2B_FMT_ADR )
        bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">");
    if ( conf->fmt_flag&B2B_INFO_AD )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">");
    if ( conf->fmt_flag&B2B_INFO_ADF )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">");
    if ( conf->fmt_flag&B2B_INFO_ADR )
        bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">");
    if ( conf->gvcf )
        gvcf_update_header(conf->gvcf, conf->bcf_hdr);

    int nsmpl;
    const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl);
    for (i=0; i<nsmpl; i++)
        bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]);
    bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr);

    conf->bca = bcf_call_init(-1., conf->min_baseQ);
    conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t));
    conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ;
    conf->bca->min_frac = conf->min_frac;
    conf->bca->min_support = conf->min_support;
    conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE;

    conf->bc.bcf_hdr = conf->bcf_hdr;
    conf->bc.n  = nsmpl;
    conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL));
    if (conf->fmt_flag)
    {
        assert( sizeof(float)==sizeof(int32_t) );
        conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4);
        conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32
        if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) )
        {
            // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample
            conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
            conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t));
            for (i=0; i<nsmpl; i++)
            {
                conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES;
                conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES;
            }
        }
    }

    // init mpileup
    conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data);
    if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter);
    if ( (double)conf->max_depth * conf->nfiles > 1<<20)
        fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles);
    if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 )
        fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl);
    bam_mplp_set_maxcnt(conf->iter, conf->max_depth);
    conf->max_indel_depth = conf->max_indel_depth * nsmpl;
    conf->bcf_rec = bcf_init1();
    bam_mplp_constructor(conf->iter, pileup_constructor);

    // Run mpileup for multiple regions
    if ( nregs )
    {
        int ireg = 0;
        do 
        {
            // first region is already positioned
            if ( ireg++ > 0 )
            {
                conf->buf.l = 0;
                ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end);

                for (i=0; i<conf->nfiles; i++) 
                {
                    hts_itr_destroy(conf->mplp_data[i]->iter);
                    conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s);
                    if ( !conf->mplp_data[i]->iter ) 
                    {
                        conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq);
                        if ( conf->mplp_data[i]->iter ) {
                            fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s);
                            exit(EXIT_FAILURE);
                        }
                        fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]);
                        exit(EXIT_FAILURE);
                    }
                    bam_mplp_reset(conf->iter);
                }
            }
            mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end);
        }
        while ( regitr_loop(conf->reg_itr) );
    }
    else
        mpileup_reg(conf,0,0);

    flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL);

    // clean up
    free(conf->bc.tmp.s);
    bcf_destroy1(conf->bcf_rec);
    if (conf->bcf_fp)
    {
        hts_close(conf->bcf_fp);
        bcf_hdr_destroy(conf->bcf_hdr);
        bcf_call_destroy(conf->bca);
        free(conf->bc.PL);
        free(conf->bc.DP4);
        free(conf->bc.ADR);
        free(conf->bc.ADF);
        free(conf->bc.fmt_arr);
        free(conf->bcr);
    }
    if ( conf->gvcf ) gvcf_destroy(conf->gvcf);
    free(conf->buf.s);
    for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]);
    free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp);
    bam_mplp_destroy(conf->iter);
    bam_hdr_destroy(hdr);
    for (i = 0; i < conf->nfiles; ++i) {
        if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx);
        sam_close(conf->mplp_data[i]->fp);
        if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter);
        free(conf->mplp_data[i]);
    }
    if ( conf->reg_itr ) regitr_destroy(conf->reg_itr);
    free(conf->mplp_data); free(conf->plp); free(conf->n_plp);
    free(mp_ref.ref[0]);
    free(mp_ref.ref[1]);
    return 0;
}
示例#30
0
// currently, this function ONLY works if each read has one hit
static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct)
{
    bam_hdr_t *header;
    bam1_t *b[2];
    int curr, has_prev, pre_end = 0, cur_end = 0;
    kstring_t str;

    str.l = str.m = 0; str.s = 0;
    header = sam_hdr_read(in);
    if (header == NULL) {
        fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n");
        exit(1);
    }
    // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted.
    if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) {
        char *p, *q;
        p = strstr(header->text, "\tSO:coordinate");
        q = strchr(header->text, '\n');
        // Looking for SO:coordinate within the @HD line only
        // (e.g. must ignore in a @CO comment line later in header)
        if ((p != 0) && (p < q)) {
            fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n");
            exit(1);
        }
    }
    sam_hdr_write(out, header);

    b[0] = bam_init1();
    b[1] = bam_init1();
    curr = 0; has_prev = 0;
    while (sam_read1(in, header, b[curr]) >= 0) {
        bam1_t *cur = b[curr], *pre = b[1-curr];
        if (cur->core.flag & BAM_FSECONDARY)
        {
            if ( !remove_reads ) sam_write1(out, header, cur);
            continue; // skip secondary alignments
        }
        if (cur->core.flag & BAM_FSUPPLEMENTARY)
        {
            sam_write1(out, header, cur);
            continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from)
        }
        if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag
        {
            cur->core.flag |= BAM_FUNMAP;
        }
        if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end
        {
            cur_end = bam_endpos(cur);

            // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag
            if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP;
        }
        if (has_prev) { // do we have a pair of reads to examine?
            if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name
                pre->core.flag |= BAM_FPAIRED;
                cur->core.flag |= BAM_FPAIRED;
                sync_mate(pre, cur);

                if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))
                    && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE
                {
                    uint32_t cur5, pre5;
                    cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos;
                    pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos;
                    cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5;
                } else cur->core.isize = pre->core.isize = 0;
                if (add_ct) bam_template_cigar(pre, cur, &str);
                // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution
                if (proper_pair_check && !plausibly_properly_paired(pre,cur)) {
                    pre->core.flag &= ~BAM_FPROPER_PAIR;
                    cur->core.flag &= ~BAM_FPROPER_PAIR;
                }

                // Write out result
                if ( !remove_reads ) {
                    sam_write1(out, header, pre);
                    sam_write1(out, header, cur);
                } else {
                    // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags
                    if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
                    if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
                    if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre);
                    if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur);
                }
                has_prev = 0;
            } else { // unpaired?  clear bad info and write it out
                if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
                    pre->core.flag |= BAM_FUNMAP;
                    pre->core.tid = -1;
                    pre->core.pos = -1;
                }
                pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
                pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);
                if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre);
            }
        } else has_prev = 1;
        curr = 1 - curr;
        pre_end = cur_end;
    }
    if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired
        bam1_t *pre = b[1-curr];
        if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped
            pre->core.flag |= BAM_FUNMAP;
            pre->core.tid = -1;
            pre->core.pos = -1;
        }
        pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0;
        pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR);

        sam_write1(out, header, pre);
    }
    bam_hdr_destroy(header);
    bam_destroy1(b[0]);
    bam_destroy1(b[1]);
    free(str.s);
}