static void copy_check_alignment(const char *infname, const char *informat, const char *outfname, const char *outmode, const char *outref) { samFile *in = sam_open(infname, "r"); samFile *out = sam_open(outfname, outmode); bam1_t *aln = bam_init1(); bam_hdr_t *header = NULL; int res; if (!in) { fail("couldn't open %s", infname); goto err; } if (!out) { fail("couldn't open %s with mode %s", outfname, outmode); goto err; } if (!aln) { fail("bam_init1() failed"); goto err; } if (outref) { if (hts_set_opt(out, CRAM_OPT_REFERENCE, outref) < 0) { fail("setting reference %s for %s", outref, outfname); goto err; } } header = sam_hdr_read(in); if (!header) { fail("reading header from %s", infname); goto err; } if (sam_hdr_write(out, header) < 0) fail("writing headers to %s", outfname); while ((res = sam_read1(in, header, aln)) >= 0) { int mod4 = ((intptr_t) bam_get_cigar(aln)) % 4; if (mod4 != 0) fail("%s CIGAR not 4-byte aligned; offset is 4k+%d for \"%s\"", informat, mod4, bam_get_qname(aln)); if (sam_write1(out, header, aln) < 0) fail("writing to %s", outfname); } if (res < -1) { fail("failed to read alignment from %s", infname); } err: bam_destroy1(aln); bam_hdr_destroy(header); if (in) sam_close(in); if (out) sam_close(out); }
int bam_idxstats(int argc, char *argv[]) { hts_idx_t* idx; bam_hdr_t* header; samFile* fp; if (argc < 2) { fprintf(pysamerr, "Usage: samtools idxstats <in.bam>\n"); return 1; } fp = sam_open(argv[1], "r"); if (fp == NULL) { fprintf(pysamerr, "[%s] fail to open BAM.\n", __func__); return 1; } header = sam_hdr_read(fp); idx = sam_index_load(fp, argv[1]); if (idx == NULL) { fprintf(pysamerr, "[%s] fail to load the index.\n", __func__); return 1; } int i; for (i = 0; i < header->n_targets; ++i) { // Print out contig name and length printf("%s\t%d", header->target_name[i], header->target_len[i]); // Now fetch info about it from the meta bin uint64_t u, v; hts_idx_get_stat(idx, i, &u, &v); printf("\t%" PRIu64 "\t%" PRIu64 "\n", u, v); } // Dump information about unmapped reads printf("*\t0\t0\t%" PRIu64 "\n", hts_idx_get_n_no_coor(idx)); bam_hdr_destroy(header); hts_idx_destroy(idx); sam_close(fp); return 0; }
// remember to clean up with bam_destroy1(b); bam1_t* alignment_to_bam(const string& sam_header, const Alignment& alignment, const string& refseq, const int32_t refpos, const string& cigar, const string& mateseq, const int32_t matepos, const int32_t tlen) { assert(!sam_header.empty()); string sam_file = "data:" + sam_header + alignment_to_sam(alignment, refseq, refpos, cigar, mateseq, matepos, tlen); const char* sam = sam_file.c_str(); samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); if (sam_read1(in, header, aln) >= 0) { bam_hdr_destroy(header); sam_close(in); // clean up return aln; } else { cerr << "[vg::alignment] Failure to parse SAM record" << endl << sam << endl; exit(1); } }
BamFilePrivate(const std::string& fn) : filename_(fn) , firstAlignmentOffset_(-1) { // ensure we've updated htslib verbosity with requested verbosity here hts_verbose = ( PacBio::BAM::HtslibVerbosity == -1 ? 0 : PacBio::BAM::HtslibVerbosity); // attempt open auto f = RawOpen(); #if !defined (PBBAM_NO_CHECK_EOF) || PBBAM_AUTOVALIDATE // sanity check on file const int eofCheck = bgzf_check_EOF(f->fp.bgzf); if (eofCheck <= 0 ) { // 1: EOF present & correct // 2: not seekable (e.g. reading from stdin) // 0: EOF absent // -1: some other error std::stringstream e; if (eofCheck == 0) e << fn << " : is missing EOF block" << std::endl; else e << fn << " : unknown error while checking EOF block" << std::endl; throw std::runtime_error(e.str()); } #endif // attempt fetch header std::unique_ptr<bam_hdr_t, internal::HtslibHeaderDeleter> hdr(sam_hdr_read(f.get())); header_ = internal::BamHeaderMemory::FromRawData(hdr.get()); // cache first alignment offset firstAlignmentOffset_ = bgzf_tell(f->fp.bgzf); }
bufReader initBufReader2(const char*fname,int doCheck,char *fai_fname){ bufReader ret; ret.fn = strdup(fname); int newlen=strlen(fname);//<-just to avoid valgrind -O3 uninitialized warning ret.fp = openBAM(ret.fn,doCheck); if (fai_fname && hts_set_fai_filename(ret.fp, fai_fname) != 0) { fprintf(stderr, "[%s] failed to process %s\n", __func__, fai_fname); exit(EXIT_FAILURE); } ret.isEOF =0; ret.itr=NULL; ret.idx=NULL; ret.hdr = sam_hdr_read(ret.fp); if(strlen(ret.hdr->text)==0){ fprintf(stderr,"\t-> No header information could be found for BAM/CRAM file: \'%s\' will exit\n",fname); exit(1); } checkIfSorted(ret.hdr->text); if(ret.hdr==NULL) { fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", ret.fn); exit(0); } return ret; }
int bam_access_openhts(char *hts_file, char *ref_file){ assert(hts_file != NULL); //Assign memory for the file name etc holding struct fholder = malloc(sizeof(file_holder)); check_mem(fholder); //Beginning and end of tmp struct for bam access fholder->beg = 0; fholder->end = 0x7fffffff; // The max 32 bit integer. //Open a file for read from compressed bam. fholder->in = hts_open(hts_file, "r"); check(fholder->in != 0,"HTS file %s failed to open.",hts_file); fholder->idx = sam_index_load(fholder->in,hts_file); check(fholder->idx != 0,"HTS index file %s failed to open.",hts_file); if(ref_file){ hts_set_fai_filename(fholder->in, ref_file); }else{ if(fholder->in->format.format == cram) log_warn("No reference file provided for a cram input file, if the reference described in the cram header can't be located this script may fail."); } //Check for generic header read method. fholder->head = sam_hdr_read(fholder->in); return 0; error: if(fholder->in) hts_close(fholder->in); if(fholder) free(fholder); return -1; }
BAMOrderedReader::BAMOrderedReader(std::string bam_file, std::vector<GenomeInterval>& intervals) :bam_file(bam_file), intervals(intervals), sam(0), hdr(0), idx(0), itr(0) { const char* fname = bam_file.c_str(); int len = strlen(fname); if ( strcasecmp(".bam",fname+len-4) ) { fprintf(stderr, "[%s:%d %s] Not a BAM file: %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str()); exit(1); } sam = sam_open(bam_file.c_str(), "r"); hdr = sam_hdr_read(sam); s = bam_init1(); idx = bam_index_load(bam_file.c_str()); if (idx==0) { fprintf(stderr, "[%s:%d %s] fail to load index for %s\n", __FILE__, __LINE__, __FUNCTION__, bam_file.c_str()); abort(); } else { index_loaded = true; } str = {0,0,0}; intervals_present = intervals.size()!=0; interval_index = 0; random_access_enabled = intervals_present && index_loaded; };
static int aux_fields1(void) { static const char sam[] = "data:" "@SQ\tSN:one\tLN:1000\n" "@SQ\tSN:two\tLN:500\n" "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:" xstr(PI) "\tXd:d:" xstr(E) "\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,+2\tZZ:i:1000000\n"; // Canonical form of the alignment record above, as output by sam_format1() static const char r1[] = "r1\t0\tone\t500\t20\t8M\t*\t0\t0\tATGCATGC\tqqqqqqqq\tXA:A:k\tXi:i:37\tXf:f:3.14159\tXd:d:2.71828\tXZ:Z:" HELLO "\tXH:H:" BEEF "\tXB:B:c,-2,0,2\tZZ:i:1000000"; samFile *in = sam_open(sam, "r"); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); uint8_t *p; uint32_t n; kstring_t ks = { 0, 0, NULL }; if (sam_read1(in, header, aln) >= 0) { if ((p = check_bam_aux_get(aln, "XA", 'A')) && bam_aux2A(p) != 'k') fail("XA field is '%c', expected 'k'", bam_aux2A(p)); if ((p = check_bam_aux_get(aln, "Xi", 'C')) && bam_aux2i(p) != 37) fail("Xi field is %d, expected 37", bam_aux2i(p)); if ((p = check_bam_aux_get(aln, "Xf", 'f')) && fabs(bam_aux2f(p) - PI) > 1E-6) fail("Xf field is %.12f, expected pi", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "Xd", 'd')) && fabs(bam_aux2f(p) - E) > 1E-6) fail("Xf field is %.12f, expected e", bam_aux2f(p)); if ((p = check_bam_aux_get(aln, "XZ", 'Z')) && strcmp(bam_aux2Z(p), HELLO) != 0) fail("XZ field is \"%s\", expected \"%s\"", bam_aux2Z(p), HELLO); if ((p = check_bam_aux_get(aln, "XH", 'H')) && strcmp(bam_aux2Z(p), BEEF) != 0) fail("XH field is \"%s\", expected \"%s\"", bam_aux2Z(p), BEEF); // TODO Invent and use bam_aux2B() if ((p = check_bam_aux_get(aln, "XB", 'B')) && ! (memcmp(p, "Bc", 2) == 0 && (memcpy(&n, p+2, 4), n) == 3 && memcmp(p+6, "\xfe\x00\x02", 3) == 0)) fail("XB field is %c,..., expected c,-2,0,+2", p[1]); if ((p = check_bam_aux_get(aln, "ZZ", 'I')) && bam_aux2i(p) != 1000000) fail("ZZ field is %d, expected 1000000", bam_aux2i(p)); if (sam_format1(header, aln, &ks) < 0) fail("can't format record"); if (strcmp(ks.s, r1) != 0) fail("record formatted incorrectly: \"%s\"", ks.s); free(ks.s); } else fail("can't read record"); bam_destroy1(aln); bam_hdr_destroy(header); sam_close(in); return 1; }
bool init() { // Open files file_iter = sam_open(file_name.c_str(), "rb", 0); replace_iter = sam_open(replace_name.c_str(), "r", 0); out_file = sam_open(out_name.c_str(), "wb", 0); if (file_iter == NULL || replace_iter == NULL || out_file == NULL) { return false; } // Iterate through BAM file_header = sam_hdr_read(file_iter); replace_header = sam_hdr_read(replace_iter); build_translation(); return true; }
bam_hdr_t* hts_file_header(string& filename, string& header) { samFile *in = hts_open(filename.c_str(), "r"); if (in == NULL) { cerr << "[vg::alignment] could not open " << filename << endl; exit(1); } bam_hdr_t *hdr = sam_hdr_read(in); header = hdr->text; bam_hdr_destroy(hdr); hts_close(in); return hdr; }
samfile_t *samopen(const char *fn, const char *mode, const void *aux) { // hts_open() is really sam_open(), except for #define games samFile *hts_fp = hts_open(fn, mode); if (hts_fp == NULL) return NULL; samfile_t *fp = malloc(sizeof (samfile_t)); if (!fp) { sam_close(hts_fp); return NULL; } fp->file = hts_fp; fp->x.bam = hts_fp->fp.bgzf; if (strchr(mode, 'r')) { if (aux) { if (hts_set_fai_filename(fp->file, aux) != 0) { sam_close(hts_fp); free(fp); return NULL; } } fp->header = sam_hdr_read(fp->file); // samclose() will free this if (fp->header == NULL) { sam_close(hts_fp); free(fp); return NULL; } fp->is_write = 0; if (fp->header->n_targets == 0 && bam_verbose >= 1) fprintf(samtools_stderr, "[samopen] no @SQ lines in the header.\n"); } else { enum htsExactFormat fmt = hts_get_format(fp->file)->format; fp->header = (bam_hdr_t *)aux; // For writing, we won't free it fp->is_write = 1; if (!(fmt == text_format || fmt == sam) || strchr(mode, 'h')) { if (sam_hdr_write(fp->file, fp->header) < 0) { if (bam_verbose >= 1) fprintf(samtools_stderr, "[samopen] Couldn't write header\n"); sam_close(hts_fp); free(fp); return NULL; } } } return fp; }
void bam_parser(opt_t *opt) { samFile *in = sam_open(opt->in_name, "r"); if(in == NULL) die("bam_parser: fail to open file '%s'", opt->in_name); // if output file exists but not force to overwrite if(access(opt->out_name, F_OK)!=-1 && opt->f==false) die("bam_parser: %s exists, use opetion -f to overwrite", opt->out_name); bam_hdr_t *header = sam_hdr_read(in); bam1_t *aln = bam_init1(); int8_t *p; int32_t n; int ret; while((ret=sam_read1(in, header, aln)) >= 0) printf("name=%s\nflag=%d\nseq=%s\nqual=%s\nlane_id=%d\n", get_read_name(aln), aln->core.flag, get_sequence(aln), get_qualities(aln), get_lane_id(aln)); bam_destroy1(aln); sam_close(in); }
static void flanks_sam_open() { if(!futil_path_has_extension(sam_path, ".bam") && !futil_path_has_extension(sam_path, ".sam")) { cmd_print_usage("Mapped flanks is not .sam or .bam file: %s", sam_path); } bool isbam = futil_path_has_extension(sam_path, ".bam"); samfh = sam_open(sam_path, isbam ? "rb" : "rs"); if(samfh == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_header = sam_hdr_read(samfh); bamentry = bam_init1(); }
bufReader initBufReader2(const char*fname){ bufReader ret; ret.fn = strdup(fname); int newlen=strlen(fname);//<-just to avoid valgrind -O3 uninitialized warning ret.fp = openBAM(ret.fn); ret.isEOF =0; ret.itr=NULL; ret.idx=NULL; ret.hdr = sam_hdr_read(ret.fp); if(ret.hdr==NULL) { fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", ret.fn); exit(0); } return ret; }
int hts_for_each(string& filename, function<void(Alignment&)> lambda) { samFile *in = hts_open(filename.c_str(), "r"); if (in == NULL) return 0; bam_hdr_t *hdr = sam_hdr_read(in); map<string, string> rg_sample; parse_rg_sample_map(hdr->text, rg_sample); bam1_t *b = bam_init1(); while (sam_read1(in, hdr, b) >= 0) { Alignment a = bam_to_alignment(b, rg_sample); lambda(a); } bam_destroy1(b); bam_hdr_destroy(hdr); hts_close(in); return 1; }
bam_hdr_t* hts_string_header(string& header, map<string, int64_t>& path_length, map<string, string>& rg_sample) { stringstream hdr; hdr << "@HD\tVN:1.5\tSO:unknown\n"; for (auto& p : path_length) { hdr << "@SQ\tSN:" << p.first << "\t" << "LN:" << p.second << "\n"; } for (auto& s : rg_sample) { hdr << "@RG\tID:" << s.first << "\t" << "SM:" << s.second << "\n"; } hdr << "@PG\tID:0\tPN:vg\n"; header = hdr.str(); string sam = "data:" + header; samFile *in = sam_open(sam.c_str(), "r"); bam_hdr_t *h = sam_hdr_read(in); sam_close(in); return h; }
static int view_sam(hFILE *hfp, const char *filename) { samFile *in = hts_hopen(hfp, filename, "r"); if (in == NULL) return 0; samFile *out = dup_stdout("w"); bam_hdr_t *hdr = sam_hdr_read(in); if (show_headers) sam_hdr_write(out, hdr); if (mode == view_all) { bam1_t *b = bam_init1(); while (sam_read1(in, hdr, b) >= 0) sam_write1(out, hdr, b); bam_destroy1(b); } bam_hdr_destroy(hdr); hts_close(out); hts_close(in); return 1; }
bwa_seqio_t *bwa_bam_open(const char *fn, int which) { bwa_seqio_t *bs; #ifndef USE_HTSLIB bam_header_t *h; #endif bs = (bwa_seqio_t*)calloc(1, sizeof(bwa_seqio_t)); bs->is_bam = 1; bs->which = which; #ifdef USE_HTSLIB bs->fp = sam_open(fn, "rb"); #else bs->fp = bam_open(fn, "r"); #endif if (0 == bs->fp) err_fatal_simple("Couldn't open bam file"); #ifdef USE_HTSLIB bs->h = sam_hdr_read(bs->fp); #else h = bam_header_read(bs->fp); bam_header_destroy(h); #endif return bs; }
int hts_for_each_parallel(string& filename, function<void(Alignment&)> lambda) { samFile *in = hts_open(filename.c_str(), "r"); if (in == NULL) return 0; bam_hdr_t *hdr = sam_hdr_read(in); map<string, string> rg_sample; parse_rg_sample_map(hdr->text, rg_sample); int thread_count = get_thread_count(); vector<bam1_t*> bs; bs.resize(thread_count); for (auto& b : bs) { b = bam_init1(); } bool more_data = true; #pragma omp parallel shared(in, hdr, more_data, rg_sample) { int tid = omp_get_thread_num(); while (more_data) { bam1_t* b = bs[tid]; #pragma omp critical (hts_input) if (more_data) { more_data = sam_read1(in, hdr, b) >= 0; } if (more_data) { Alignment a = bam_to_alignment(b, rg_sample); lambda(a); } } } for (auto& b : bs) bam_destroy1(b); bam_hdr_destroy(hdr); hts_close(in); return 1; }
BM_mappedRead * extractReads(char * bamFile, char ** contigs, int numContigs, uint16_t * groups, char * prettyName, int headersOnly, int minMapQual, int maxMisMatches, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { //----- // code uses the pattern outlined in samtools view (sam_view.c) // thanks lh3! // int i = 0; int result = -1; int hh = 0; int supp_check = 0x0; // include supp mappings if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // we need to let the users know if their pairings // will be corrupted int p_corrupt = 0; // helper variables samFile *in = 0; bam_hdr_t *header = NULL; bam1_t *b = bam_init1(); BM_mappedRead * root = 0; BM_mappedRead * prev = 0; // open file handlers if ((in = sam_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else { // retrieve the header if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "ERROR: Failed to read the header from \"%s\".\n", bamFile); } else { // check the index is intact hts_idx_t *idx = sam_index_load(in, bamFile); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "ERROR: Random retrieval only works "\ "for indexed files.\n"); } else { cfuhash_table_t *pair_buffer = \ cfuhash_new_with_initial_size(1000000); cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS); for (hh = 0; hh < numContigs; ++hh) { // parse a region in the format like `chr2:100-200' hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]); if (iter == NULL) { // reference name is not found fprintf(stderr, "WARNING: Could not find contig: "\ "[%s] in BAM: [%s].\n", contigs[hh], bamFile); } // fetch alignments int line = 0; while ((result = sam_itr_next(in, iter, b)) >= 0) { bam1_core_t core = b->core; line += 1; // only high quality?, primary? mappings if ( core.qual < minMapQual) continue; if ((core.flag & supp_check) != 0) continue; if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) { continue; } char * seqId = bam_get_qname(b); char * seq = 0; char * qual = 0; int qual_len = 0; int seq_len = 0; // get sequence and quality if(0 == headersOnly) { // no point allocating unused space seq = calloc(core.l_qseq+1, sizeof(char)); qual = calloc(core.l_qseq+1, sizeof(char)); uint8_t *s = bam_get_seq(b); if (core.flag&BAM_FREVERSE) { // reverse the read int r = 0; for (i = core.l_qseq-1; i >=0 ; --i) { seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s, i)]; ++r; } } else { for (i = 0; i < core.l_qseq; ++i) { seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s, i)]; } } seq_len = core.l_qseq; s = bam_get_qual(b); if (s[0] != 0xff) { qual_len = core.l_qseq; for (i = 0; i < core.l_qseq; ++i) { qual[i] = (char)(s[i] + 33); } } else if (qual != 0) { free(qual); qual = 0; } } // work out pairing information uint8_t rpi = RPI_ERROR; if (core.flag&BAM_FPAIRED) { if(core.flag&BAM_FMUNMAP) { if (core.flag&BAM_FREAD1) { rpi = RPI_SNGL_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SNGL_SEC; } } else { if (core.flag&BAM_FREAD1) { rpi = RPI_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SEC; } } } else { rpi = RPI_SNGL; } // make the funky Id #define MAX_SEQ_ID_LEN 80 char * seq_id = calloc(MAX_SEQ_ID_LEN, sizeof(char)); // allocate the string to the buffer but check to // ensure we're not cutting anything off int id_len = snprintf(seq_id, MAX_SEQ_ID_LEN, "b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); if(id_len >= MAX_SEQ_ID_LEN) { seq_id = calloc(id_len+1, sizeof(char)); snprintf(seq_id, id_len+1, // don't forget the NULL! "b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); } // make the mapped read struct prev = makeMappedRead(seq_id, seq, qual, id_len, seq_len, qual_len, rpi, groups[hh], prev); if (0 == root) { root = prev; } if(rpi == RPI_SNGL || \ rpi == RPI_SNGL_FIR || \ rpi == RPI_SNGL_SEC) { // we can just add away // indicate singleton reads by pointing the // partner pointer to itself prev->partnerRead = prev; } else { // RPI_FIR or RPI_SEC // work out pairing information using the hash // we append a 1 or 2 to the end so that // we don't accidentally pair 1's with 1's etc. char * stripped_result; if(rpi == RPI_FIR) { stripped_result = \ pairStripper(seqId, core.l_qname-1, '2'); } else { stripped_result = \ pairStripper(seqId, core.l_qname-1, '1'); } char * stripped = seqId; if(stripped_result) stripped = stripped_result; //fprintf(stdout, "SEARCH %s\n", stripped); // now stripped always holds a stripped value // see if it is in the hash already BM_mappedRead * stored_MR = \ cfuhash_get(pair_buffer, stripped); if (0 != stored_MR) { // exists in the hash -> Add the pair info if(rpi == RPI_FIR) { prev->partnerRead = stored_MR; } else { stored_MR->partnerRead = prev; } // delete the entry from the hash cfuhash_delete(pair_buffer, stripped); } else { // we should put it in the hash // make sure to change it into something // we will find next time if(rpi == RPI_FIR) stripped[strlen(stripped)-1] = '1'; else stripped[strlen(stripped)-1] = '2'; // check to make sure we're not overwriting // anything important. cfuhash overwrites // duplicate entries, so we need to grab // it and put it to "SNGL_XXX" before we // lose the pointer BM_mappedRead * OWMMR = \ cfuhash_put(pair_buffer, stripped, prev); if(OWMMR) { if(OWMMR->rpi == RPI_FIR) OWMMR->rpi = RPI_SNGL_FIR; else OWMMR->rpi = RPI_SNGL_SEC; OWMMR->partnerRead = OWMMR; printPairCorruptionWarning(p_corrupt); p_corrupt = 1; } } if(stripped_result != 0) { // free this! free(stripped_result); stripped_result = 0; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "ERROR: retrieval of reads from "\ "contig: \"%s\" failed due to "\ "truncated file or corrupt BAM index "\ "file\n", header->target_name[hh]); break; } } // any entries left in the hash are pairs whose mates did // not meet quality standards size_t key_size = 0; char * key; BM_mappedRead * LOMMR; size_t pr_size = 1; if(cfuhash_each_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)) { do { // get the mapped read // update it's pairing so we know it's really single if (LOMMR->rpi == RPI_FIR) LOMMR->rpi = RPI_SNGL_FIR; else if (LOMMR->rpi == RPI_SEC) LOMMR->rpi = RPI_SNGL_SEC; // indicate singleton reads by pointing the // partner pointer to itself LOMMR->partnerRead = LOMMR; } while(cfuhash_next_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)); } cfuhash_clear(pair_buffer); cfuhash_destroy(pair_buffer); } hts_idx_destroy(idx); // destroy the BAM index } } // always do this if (in) sam_close(in); bam_destroy1(b); if ( header ) bam_hdr_destroy(header); return root; }
static bool init(const parsed_opts_t* opts, state_t** state_out) { state_t* retval = (state_t*) calloc(1, sizeof(state_t)); if (retval == NULL) { fprintf(stderr, "[init] Out of memory allocating state struct.\n"); return false; } *state_out = retval; // Open files retval->input_file = sam_open_format(opts->input_name, "r", &opts->ga.in); if (retval->input_file == NULL) { fprintf(stderr, "[init] Could not open input file: %s\n", opts->input_name); return false; } retval->input_header = sam_hdr_read(retval->input_file); retval->output_header = bam_hdr_dup(retval->input_header); retval->output_file = sam_open_format(opts->output_name == NULL?"-":opts->output_name, "w", &opts->ga.out); if (retval->output_file == NULL) { print_error_errno("addreplacerg", "Could not open output file: %s\n", opts->output_name); return false; } if (opts->rg_line) { // Append new RG line to header. // Check does not already exist if ( confirm_rg(retval->output_header, opts->rg_id) ) { fprintf(stderr, "[init] ID of new RG line specified conflicts with that of an existing header RG line. Overwrite not yet implemented.\n"); return false; } retval->rg_id = strdup(opts->rg_id); size_t new_len = strlen( retval->output_header->text ) + strlen( opts->rg_line ) + 2; char* new_header = malloc(new_len); if (!new_header) { fprintf(stderr, "[init] Out of memory whilst writing new header.\n"); return false; } sprintf(new_header,"%s%s\n", retval->output_header->text, opts->rg_line); free(retval->output_header->text); retval->output_header->text = new_header; retval->output_header->l_text = (int)new_len - 1; } else { if (opts->rg_id) { // Confirm what has been supplied exists if ( !confirm_rg(retval->output_header, opts->rg_id) ) { fprintf(stderr, "RG ID supplied does not exist in header. Supply full @RG line with -r instead?\n"); return false; } retval->rg_id = strdup(opts->rg_id); } else { if ((retval->rg_id = get_first_rgid(retval->output_header)) == NULL ) { fprintf(stderr, "No RG specified on command line or in existing header.\n"); return false; } } } switch (opts->mode) { case overwrite_all: retval->mode_func = &overwrite_all_func; break; case orphan_only: retval->mode_func = &orphan_only_func; break; } return true; }
int main(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; htsFile *out; char modew[8]; int r = 0, exit_code = 0; hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL; int nreads = 0; int benchmark = 0; while ((c = getopt(argc, argv, "IbDCSl:t:i:o:N:B")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'D': flag |= 4; break; case 'C': flag |= 8; break; case 'B': benchmark = 1; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; case 'i': if (add_option(&in_opts, optarg)) return 1; break; case 'o': if (add_option(&out_opts, optarg)) return 1; break; case 'N': nreads = atoi(optarg); } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSCSIB] [-N num_reads] [-l level] [-o option=value] <in.bam>|<in.sam>|<in.cram> [region]\n"); return 1; } strcpy(moder, "r"); if (flag&4) strcat(moder, "c"); else if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder); if (in == NULL) { fprintf(stderr, "Error opening \"%s\"\n", argv[optind]); return EXIT_FAILURE; } h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&8) strcat(modew, "c"); else if (flag&2) strcat(modew, "b"); out = hts_open("-", modew); if (out == NULL) { fprintf(stderr, "Error opening standard output\n"); return EXIT_FAILURE; } /* CRAM output */ if (flag & 8) { int ret; // Parse input header and use for CRAM output out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text); // Create CRAM references arrays if (fn_ref) ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref); else // Attempt to fill out a cram->refs[] array from @SQ headers ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); if (ret != 0) return EXIT_FAILURE; } // Process any options; currently cram only. for (; in_opts; in_opts = (last=in_opts)->next, free(last)) { hts_set_opt(in, in_opts->opt, in_opts->val); if (in_opts->opt == CRAM_OPT_REFERENCE) if (hts_set_opt(out, in_opts->opt, in_opts->val) != 0) return EXIT_FAILURE; } for (; out_opts; out_opts = (last=out_opts)->next, free(last)) if (hts_set_opt(out, out_opts->opt, out_opts->val) != 0) return EXIT_FAILURE; if (!benchmark) sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = sam_index_load(in, argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while ((r = sam_itr_next(in, iter, b)) >= 0) { if (!benchmark && sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } if (nreads && --nreads == 0) break; } hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while ((r = sam_read1(in, h, b)) >= 0) { if (!benchmark && sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } if (nreads && --nreads == 0) break; } if (r < -1) { fprintf(stderr, "Error parsing input.\n"); exit_code = 1; } r = sam_close(out); if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } bam_destroy1(b); bam_hdr_destroy(h); r = sam_close(in); if (r < 0) { fprintf(stderr, "Error closing input.\n"); exit_code = 1; } return exit_code; }
multiReader::multiReader(int argc,char**argv){ gz=Z_NULL; myglf=NULL;myvcf=NULL;mpil=NULL;bglObj=NULL; nLines=50; fname=NULL; intName=1; minQ = MINQ; nInd =0; isSim =0; args=NULL; args = setArgStruct(argc,argv); fprintf(args->argumentFile,"\t-> Command: \n"); for(int i=0;i<argc;i++) fprintf(args->argumentFile,"%s ",argv[i]); // fprintf(args->argumentFile,"\n\n"); if(args->argumentFile!=stderr) fprintf(args->argumentFile,"\n\t-> angsd version: %s (htslib: %s) build(%s %s)\n",ANGSD_VERSION,hts_version(),__DATE__,__TIME__); void printTime(FILE *fp); printTime(args->argumentFile); //type = args->inputtype; if(args->argc==2) { if((!strcasecmp(args->argv[1],"-beagle")) || (!strcasecmp(args->argv[1],"-glf")) || (!strcasecmp(args->argv[1],"-glf3")) || (!strcasecmp(args->argv[1],"-pileup")) || (!strcasecmp(args->argv[1],"-vcf-GL")) || (!strcasecmp(args->argv[1],"-vcf-pl")) || (!strcasecmp(args->argv[1],"-glf10_text")) || (!strcasecmp(args->argv[1],"-vcf-GP"))) { printArg(stdout,args); exit(0); }else if ((!strcasecmp(args->argv[1],"-bam")) || (!strcasecmp(args->argv[1],"-b"))){ setArgsBam(args); exit(0); }else return; } getOptions(args); if(args->fai==NULL){ int printAndExit =0; switch(args->inputtype) { case INPUT_GLF: printAndExit=1; break; case INPUT_GLF10_TEXT: printAndExit=1; break; case INPUT_GLF3: printAndExit=1; break; case INPUT_BEAGLE: printAndExit=1; break; case INPUT_PILEUP: printAndExit=1; break; } if(printAndExit){ fprintf(stderr,"\t-> Must supply -fai file\n"); exit(0); } } if(args->fai){ if(!(args->hd=getHeadFromFai(args->fai))) exit(0); }else{ if(args->nams.size()==0){ fprintf(stderr,"\t-> Must choose inputfile -bam/-glf/-glf3/-pileup/-i/-vcf-gl/-vcf-gp/-vcf-pl/-glf10_text filename\n"); exit(0); } if(args->inputtype==INPUT_BAM){ htsFile *in=sam_open(args->nams[0],"r"); assert(in); args->hd= sam_hdr_read(in); hts_close(in); } } if(!(INPUT_VCF_GL||INPUT_VCF_GP)){ if(args->hd==NULL){ fprintf(stderr,"For non-bams you should include -fai arguments\n"); exit(0); } } if((args->inputtype==INPUT_PILEUP||args->inputtype==INPUT_GLF||args->inputtype==INPUT_GLF3||args->inputtype==INPUT_GLF10_TEXT)){ if(nInd==0){ fprintf(stderr,"\t-> Must supply -nInd when using -glf/-glf3/-pileup/-glf10_text files\n"); exit(0); } }else args->nInd = args->nams.size(); if(args->inputtype==INPUT_VCF_GP||args->inputtype==INPUT_VCF_GL){ if(args->regions.size()>1){ fprintf(stderr,"\t-> Only one region can be specified with using bcf (i doubt more is needed) will exit\n"); exit(0); }else if(args->regions.size()<=1){ myvcf = new vcfReader(args->infile,NULL); args->hd=bcf_hdr_2_bam_hdr_t(myvcf->hs); args->nInd = myvcf->hs->nsamples; } } //make args->hd revMap = buildRevTable(args->hd); args->revMap = revMap; setArgsBam(args); if(args->inputtype==INPUT_VCF_GL){ if(args->regions.size()==1){ char tmp[1024]; int start=args->regions[0].start; int stop=args->regions[0].stop; int ref=args->regions[0].refID; snprintf(tmp,1024,"%s:%d-%d",args->hd->target_name[ref],start+1,stop); // fprintf(stderr,"tmp:%s\n",tmp); // exit(0); myvcf->seek(tmp); } } if(fname==NULL) return; gz=Z_NULL; gz=gzopen(fname,"r"); if(gz==Z_NULL){ fprintf(stderr,"\t-> Problem opening file: \'%s\'\n",fname); exit(0); } switch(args->inputtype){ case INPUT_PILEUP:{ mpil = new mpileup(args->nInd,gz,args->revMap,minQ); break; } case INPUT_GLF:{ myglf = new glfReader(args->nInd,gz,10,isSim); break; } case INPUT_GLF3:{ isSim = 1; //Added by FGV on 22/02/2015: GLF3 is always simulated data until a better alternative can be found myglf = new glfReader(args->nInd,gz,3,isSim); break; } case INPUT_GLF10_TEXT:{ myglf_text = new glfReader_text(args->nInd,gz,args->revMap); break; } case INPUT_BEAGLE:{ bglObj = new beagle_reader(gz,args->revMap,intName,args->nInd); break; } default:{ break; } } if(args->inputtype==INPUT_VCF_GL||args->inputtype==INPUT_VCF_GL){ fprintf(stderr,"\t-> VCF still beta. Remember that\n"); fprintf(stderr,"\t 1. indels are are discarded\n"); fprintf(stderr,"\t 2. will use chrom, pos PL columns\n"); fprintf(stderr,"\t 3. GL tags are interpreted as log10 and are scaled to ln (NOT USED)\n"); fprintf(stderr,"\t 4. GP tags are interpreted directly as unscaled post probs (spec says phredscaled...) (NOT USED)\n"); fprintf(stderr,"\t 5. FILTER column is currently NOT used (not sure what concensus is)\n"); fprintf(stderr,"\t 6. -sites does NOT work with vcf input but -r does\n"); fprintf(stderr,"\t 7. vcffilereading is still BETA, please report strange behaviour\n"); } }
int ctx_calls2vcf(int argc, char **argv) { const char *in_path = NULL, *out_path = NULL, *out_type = NULL; // Filtering parameters int32_t min_mapq = -1, max_align_len = -1, max_allele_len = -1; // Alignment parameters int nwmatch = 1, nwmismatch = -2, nwgapopen = -4, nwgapextend = -1; // ref paths char const*const* ref_paths = NULL; size_t nref_paths = 0; // flank file const char *sam_path = NULL; // // Things we figure out by looking at the input // bool isbubble = false; // samples in VCF, (0 for bubble, does not include ref in breakpoint calls) size_t i, kmer_size, num_samples; // // Reference genome // // Hash map of chromosome name -> sequence ChromHash *genome; ReadBuffer chroms; // Arg parsing char cmd[100]; char shortopts[300]; cmd_long_opts_to_short(longopts, shortopts, sizeof(shortopts)); int c; // silence error messages from getopt_long // opterr = 0; while((c = getopt_long_only(argc, argv, shortopts, longopts, NULL)) != -1) { cmd_get_longopt_str(longopts, c, cmd, sizeof(cmd)); switch(c) { case 0: /* flag set */ break; case 'h': cmd_print_usage(NULL); break; case 'o': cmd_check(!out_path, cmd); out_path = optarg; break; case 'O': cmd_check(!out_type, cmd); out_type = optarg; break; case 'f': cmd_check(!futil_get_force(), cmd); futil_set_force(true); break; case 'F': cmd_check(!sam_path,cmd); sam_path = optarg; break; case 'Q': cmd_check(min_mapq < 0,cmd); min_mapq = cmd_uint32(cmd, optarg); break; case 'A': cmd_check(max_align_len < 0,cmd); max_align_len = cmd_uint32(cmd, optarg); break; case 'L': cmd_check(max_allele_len < 0,cmd); max_allele_len = cmd_uint32(cmd, optarg); break; case 'm': nwmatch = cmd_int32(cmd, optarg); break; case 'M': nwmismatch = cmd_int32(cmd, optarg); break; case 'g': nwgapopen = cmd_int32(cmd, optarg); break; case 'G': nwgapextend = cmd_int32(cmd, optarg); break; case ':': /* BADARG */ case '?': /* BADCH getopt_long has already printed error */ die("`"CMD" "SUBCMD" -h` for help. Bad option: %s", argv[optind-1]); default: ctx_assert2(0, "shouldn't reach here: %c", c); } } // Defaults for unset values if(out_path == NULL) out_path = "-"; if(max_align_len < 0) max_align_len = DEFAULT_MAX_ALIGN; if(max_allele_len < 0) max_allele_len = DEFAULT_MAX_ALLELE; if(optind+2 > argc) cmd_print_usage("Require <in.txt.gz> and at least one reference"); in_path = argv[optind++]; ref_paths = (char const*const*)argv + optind; nref_paths = argc - optind; // These functions call die() on error gzFile gzin = futil_gzopen(in_path, "r"); // Read call file header cJSON *json = json_hdr_load(gzin, in_path); // Check we can handle the kmer size kmer_size = json_hdr_get_kmer_size(json, in_path); db_graph_check_kmer_size(kmer_size, in_path); // Get format (bubble or breakpoint file) cJSON *json_fmt = json_hdr_get(json, "file_format", cJSON_String, in_path); if(strcmp(json_fmt->valuestring,"CtxBreakpoints") == 0) isbubble = false; else if(strcmp(json_fmt->valuestring,"CtxBubbles") == 0) isbubble = true; else die("Unknown format: '%s'", json_fmt->valuestring); status("Reading %s in %s format", futil_inpath_str(in_path), isbubble ? "bubble" : "breakpoint"); if(isbubble) { // bubble specific if(sam_path == NULL) cmd_print_usage("Require -F <flanks.sam> with bubble file"); if(min_mapq < 0) min_mapq = DEFAULT_MIN_MAPQ; } else { // breakpoint specific if(min_mapq >= 0) cmd_print_usage("-Q,--min-mapq <Q> only valid with bubble calls"); } // Open flank file if it exists htsFile *samfh = NULL; bam_hdr_t *bam_hdr = NULL; bam1_t *mflank = NULL; if(sam_path) { if((samfh = hts_open(sam_path, "r")) == NULL) die("Cannot open SAM/BAM %s", sam_path); // Load BAM header bam_hdr = sam_hdr_read(samfh); if(bam_hdr == NULL) die("Cannot load BAM header: %s", sam_path); mflank = bam_init1(); } // Output VCF has 0 samples if bubbles file, otherwise has N where N is // number of samples/colours in the breakpoint graph size_t num_graph_samples = json_hdr_get_ncols(json, in_path); size_t num_graph_nonref = json_hdr_get_nonref_ncols(json, in_path); num_samples = 0; if(!isbubble) { // If last colour has "is_ref", drop number of samples by one num_samples = num_graph_nonref < num_graph_samples ? num_graph_samples-1 : num_graph_samples; } // // Open output file // if(!out_path) out_path = "-"; int mode = vcf_misc_get_outtype(out_type, out_path); futil_create_output(out_path); htsFile *vcffh = hts_open(out_path, modes_htslib[mode]); status("[calls2vcf] Reading %s call file with %zu samples", isbubble ? "Bubble" : "Breakpoint", num_graph_samples); status("[calls2vcf] %zu sample output to: %s format: %s", num_samples, futil_outpath_str(out_path), hsmodes_htslib[mode]); if(isbubble) status("[calls2vcf] min. MAPQ: %i", min_mapq); status("[calls2vcf] max alignment length: %i", max_align_len); status("[calls2vcf] max VCF allele length: %i", max_allele_len); status("[calls2vcf] alignment match:%i mismatch:%i gap open:%i extend:%i", nwmatch, nwmismatch, nwgapopen, nwgapextend); // Load reference genome read_buf_alloc(&chroms, 1024); genome = chrom_hash_init(); chrom_hash_load(ref_paths, nref_paths, &chroms, genome); // convert to upper case char *s; for(i = 0; i < chroms.len; i++) for(s = chroms.b[i].seq.b; *s; s++) *s = toupper(*s); if(!isbubble) brkpnt_check_refs_match(json, genome, in_path); bcf_hdr_t *vcfhdr = make_vcf_hdr(json, in_path, !isbubble, kmer_size, ref_paths, nref_paths, chroms.b, chroms.len); if(bcf_hdr_write(vcffh, vcfhdr) != 0) die("Cannot write VCF header"); AlignedCall *call = acall_init(); CallDecomp *aligner = call_decomp_init(vcffh, vcfhdr); scoring_t *scoring = call_decomp_get_scoring(aligner); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, false, false, 0, 0, 0, 0); CallFileEntry centry; call_file_entry_alloc(¢ry); char kmer_str[50]; sprintf(kmer_str, ";K%zu", kmer_size); if(isbubble) { // Bubble calls DecompBubble *bubbles = decomp_bubble_init(); // Set scoring for aligning 3' flank scoring = decomp_bubble_get_scoring(bubbles); scoring_init(scoring, nwmatch, nwmismatch, nwgapopen, nwgapextend, true, true, 0, 0, 0, 0); while(call_file_read(gzin, in_path, ¢ry)) { do { if(sam_read1(samfh, bam_hdr, mflank) < 0) die("We've run out of SAM entries!"); } while(mflank->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)); // Align call strbuf_reset(&call->info); decomp_bubble_call(bubbles, genome, kmer_size, min_mapq, ¢ry, mflank, bam_hdr, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBubbleStats *bub_stats = ctx_calloc(1, sizeof(*bub_stats)); decomp_bubble_cpy_stats(bub_stats, bubbles); print_bubble_stats(bub_stats); ctx_free(bub_stats); decomp_bubble_destroy(bubbles); } else { // Breakpoint calls DecompBreakpoint *breakpoints = decomp_brkpt_init(); while(call_file_read(gzin, in_path, ¢ry)) { strbuf_reset(&call->info); decomp_brkpt_call(breakpoints, genome, num_samples, ¢ry, call); strbuf_append_str(&call->info, kmer_str); acall_decompose(aligner, call, max_align_len, max_allele_len); } // print bubble stats DecompBreakpointStats *brk_stats = ctx_calloc(1, sizeof(*brk_stats)); decomp_brkpt_cpy_stats(brk_stats, breakpoints); print_breakpoint_stats(brk_stats); ctx_free(brk_stats); decomp_brkpt_destroy(breakpoints); } // Print stats DecomposeStats *astats = ctx_calloc(1, sizeof(*astats)); call_decomp_cpy_stats(astats, aligner); print_acall_stats(astats); ctx_free(astats); call_file_entry_dealloc(¢ry); call_decomp_destroy(aligner); acall_destroy(call); // Finished - clean up cJSON_Delete(json); gzclose(gzin); bcf_hdr_destroy(vcfhdr); hts_close(vcffh); for(i = 0; i < chroms.len; i++) seq_read_dealloc(&chroms.b[i]); read_buf_dealloc(&chroms); chrom_hash_destroy(genome); if(sam_path) { hts_close(samfh); bam_hdr_destroy(bam_hdr); bam_destroy1(mflank); } return EXIT_SUCCESS; }
int main_samview(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; while ((c = getopt(argc, argv, "IbSl:t:")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSI] [-l level] <in.bam>|<in.sam> [region]\n"); return 1; } strcpy(moder, "r"); if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder, fn_ref); h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); if ((flag&4) == 0) { // SAM/BAM output htsFile *out; char modew[8]; strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&2) strcat(modew, "b"); out = hts_open("-", modew, 0); sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = bam_index_load(argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = bam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while (bam_itr_next((BGZF*)in->fp, iter, b) >= 0) sam_write1(out, h, b); hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while (sam_read1(in, h, b) >= 0) sam_write1(out, h, b); sam_close(out); } bam_destroy1(b); bam_hdr_destroy(h); sam_close(in); return 0; }
int main_bedcov(int argc, char *argv[]) { gzFile fp; kstring_t str; kstream_t *ks; hts_idx_t **idx; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; int usage = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), { NULL, 0, NULL, 0 } }; while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage = 1; break; } if (usage) break; } if (usage || optind + 2 > argc) { fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n"); fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n"); sam_global_opt_help(pysam_stderr, "-.--."); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); if (aux[i]->fp) idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); if (aux[i]->header == NULL) { fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n", argv[i+optind+1]); return 2; } } cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } fputs(str.s, pysam_stdout) & fputc('\n', pysam_stdout); bam_mplp_destroy(mplp); continue; bed_error: fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); bam_hdr_destroy(aux[i]->header); sam_close(aux[i]->fp); free(aux[i]); } free(aux); free(idx); free(str.s); sam_global_args_free(&ga); return 0; }
int bam_fillmd(int argc, char *argv[]) { int c, flt_flag, tid = -2, ret, len, is_bam_out, is_uncompressed, max_nm, is_realn, capQ, baq_flag; samFile *fp = NULL, *fpout = NULL; bam_hdr_t *header = NULL; faidx_t *fai = NULL; char *ref = NULL, mode_w[8], *ref_file; bam1_t *b = NULL; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 0, 0, 0), { NULL, 0, NULL, 0 } }; flt_flag = UPDATE_NM | UPDATE_MD; is_bam_out = is_uncompressed = is_realn = max_nm = capQ = baq_flag = 0; strcpy(mode_w, "w"); while ((c = getopt_long(argc, argv, "EqreuNhbSC:n:Ad", lopts, NULL)) >= 0) { switch (c) { case 'r': is_realn = 1; break; case 'e': flt_flag |= USE_EQUAL; break; case 'd': flt_flag |= DROP_TAG; break; case 'q': flt_flag |= BIN_QUAL; break; case 'h': flt_flag |= HASH_QNM; break; case 'N': flt_flag &= ~(UPDATE_MD|UPDATE_NM); break; case 'b': is_bam_out = 1; break; case 'u': is_uncompressed = is_bam_out = 1; break; case 'S': break; case 'n': max_nm = atoi(optarg); break; case 'C': capQ = atoi(optarg); break; case 'A': baq_flag |= 1; break; case 'E': baq_flag |= 2; break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; fprintf(stderr, "[bam_fillmd] unrecognized option '-%c'\n\n", c); /* else fall-through */ case '?': return calmd_usage(); } } if (is_bam_out) strcat(mode_w, "b"); else strcat(mode_w, "h"); if (is_uncompressed) strcat(mode_w, "0"); if (optind + (ga.reference == NULL) >= argc) return calmd_usage(); fp = sam_open_format(argv[optind], "r", &ga.in); if (fp == NULL) { print_error_errno("calmd", "Failed to open input file '%s'", argv[optind]); return 1; } header = sam_hdr_read(fp); if (header == NULL || header->n_targets == 0) { fprintf(stderr, "[bam_fillmd] input SAM does not have header. Abort!\n"); goto fail; } fpout = sam_open_format("-", mode_w, &ga.out); if (fpout == NULL) { print_error_errno("calmd", "Failed to open output"); goto fail; } if (sam_hdr_write(fpout, header) < 0) { print_error_errno("calmd", "Failed to write sam header"); goto fail; } ref_file = argc > optind + 1 ? argv[optind+1] : ga.reference; fai = fai_load(ref_file); if (!fai) { print_error_errno("calmd", "Failed to open reference file '%s'", ref_file); goto fail; } b = bam_init1(); if (!b) { fprintf(stderr, "[bam_fillmd] Failed to allocate bam struct\n"); goto fail; } while ((ret = sam_read1(fp, header, b)) >= 0) { if (b->core.tid >= 0) { if (tid != b->core.tid) { free(ref); ref = fai_fetch(fai, header->target_name[b->core.tid], &len); tid = b->core.tid; if (ref == 0) { // FIXME: Should this always be fatal? fprintf(stderr, "[bam_fillmd] fail to find sequence '%s' in the reference.\n", header->target_name[tid]); if (is_realn || capQ > 10) goto fail; // Would otherwise crash } } if (is_realn) sam_prob_realn(b, ref, len, baq_flag); if (capQ > 10) { int q = sam_cap_mapq(b, ref, len, capQ); if (b->core.qual > q) b->core.qual = q; } if (ref) bam_fillmd1_core(b, ref, len, flt_flag, max_nm); } if (sam_write1(fpout, header, b) < 0) { print_error_errno("calmd", "failed to write to output file"); goto fail; } } if (ret < -1) { fprintf(stderr, "[bam_fillmd] Error reading input.\n"); goto fail; } bam_destroy1(b); bam_hdr_destroy(header); free(ref); fai_destroy(fai); sam_close(fp); if (sam_close(fpout) < 0) { fprintf(stderr, "[bam_fillmd] error when closing output file\n"); return 1; } return 0; fail: free(ref); if (b) bam_destroy1(b); if (header) bam_hdr_destroy(header); if (fai) fai_destroy(fai); if (fp) sam_close(fp); if (fpout) sam_close(fpout); return 1; }
int main_depth(int argc, char *argv[]) { int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_hdr_t *h = NULL; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; int last_pos = -1, last_tid = -1, ret; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), { NULL, 0, NULL, 0 } }; // parse the command line while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); // BED or position list file can be parsed now if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; } break; case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; case 'a': all++; break; case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return usage(); } } if (optind == argc && !file_list) return usage(); // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input reg_tid = 0; beg = 0; end = INT_MAX; // set the default region for (i = 0; i < n; ++i) { int rf; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM if (data[i]->fp == NULL) { print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; if (baseQ) rf |= SAM_QUAL; if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header if (data[i]->hdr == NULL) { fprintf(stderr, "Couldn't read header for \"%s\"\n", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } if (reg) { // if a region is specified hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index if (idx == NULL) { print_error("depth", "can't load index for \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { print_error("depth", "can't parse region \"%s\"", reg); status = EXIT_FAILURE; goto depth_end; } } } h = data[0]->hdr; // easy access to the header of the 1st BAM if (reg) { beg = data[0]->iter->beg; // and to the parsed region coordinates end = data[0]->iter->end; reg_tid = data[0]->iter->tid; } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization if (0 < max_depth) bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? if (all) { while (tid > last_tid) { if (last_tid >= 0 && !reg) { // Deal with remainder or entirety of last tid. while (++last_pos < h->target_len[last_tid]) { // Horribly inefficient, but the bed API is an obfuscated black box. if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } } last_tid++; last_pos = -1; if (all < 2) break; } // Deal with missing portion of current tid while (++last_pos < pos) { if (last_pos < beg) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } last_tid = tid; last_pos = pos; } if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } if (ret < 0) status = EXIT_FAILURE; free(n_plp); free(plp); bam_mplp_destroy(mplp); if (all) { // Handle terminating region if (last_tid < 0 && reg && all > 1) { last_tid = reg_tid; last_pos = beg-1; } while (last_tid >= 0 && last_tid < h->n_targets) { while (++last_pos < h->target_len[last_tid]) { if (last_pos >= end) break; if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } last_tid++; last_pos = -1; if (all < 2 || reg) break; } } depth_end: for (i = 0; i < n && data[i]; ++i) { bam_hdr_destroy(data[i]->hdr); if (data[i]->fp) sam_close(data[i]->fp); hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } sam_global_args_free(&ga); return status; }
static int mpileup(mplp_conf_t *conf) { if (conf->nfiles == 0) { fprintf(stderr,"[%s] no input file/data given\n", __func__); exit(EXIT_FAILURE); } mplp_ref_t mp_ref = MPLP_REF_INIT; conf->gplp = (mplp_pileup_t *) calloc(1,sizeof(mplp_pileup_t)); conf->mplp_data = (mplp_aux_t**) calloc(conf->nfiles, sizeof(mplp_aux_t*)); conf->plp = (const bam_pileup1_t**) calloc(conf->nfiles, sizeof(bam_pileup1_t*)); conf->n_plp = (int*) calloc(conf->nfiles, sizeof(int)); // Allow to run mpileup on multiple regions in one go. This comes at cost: the bai index // must be kept in the memory for the whole time which can be a problem with many bams. // Therefore if none or only one region is requested, we initialize the bam iterator as // before and free the index. Only when multiple regions are queried, we keep the index. int nregs = 0; if ( conf->reg_fname ) { if ( conf->reg_is_file ) { conf->reg = regidx_init(conf->reg_fname,NULL,NULL,0,NULL); if ( !conf->reg ) { fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } else { conf->reg = regidx_init(NULL,regidx_parse_reg,NULL,sizeof(char*),NULL); if ( regidx_insert_list(conf->reg,conf->reg_fname,',') !=0 ) { fprintf(stderr,"Could not parse the regions: %s\n", conf->reg_fname); exit(EXIT_FAILURE); } } nregs = regidx_nregs(conf->reg); conf->reg_itr = regitr_init(conf->reg); regitr_loop(conf->reg_itr); // region iterator now positioned at the first region } // read the header of each file in the list and initialize data // beware: mpileup has always assumed that tid's are consistent in the headers, add sanity check at least! bam_hdr_t *hdr = NULL; // header of first file in input list int i; for (i = 0; i < conf->nfiles; ++i) { bam_hdr_t *h_tmp; conf->mplp_data[i] = (mplp_aux_t*) calloc(1, sizeof(mplp_aux_t)); conf->mplp_data[i]->fp = sam_open(conf->files[i], "rb"); if ( !conf->mplp_data[i]->fp ) { fprintf(stderr, "[%s] failed to open %s: %s\n", __func__, conf->files[i], strerror(errno)); exit(EXIT_FAILURE); } if (hts_set_opt(conf->mplp_data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); exit(EXIT_FAILURE); } if (conf->fai_fname && hts_set_fai_filename(conf->mplp_data[i]->fp, conf->fai_fname) != 0) { fprintf(stderr, "[%s] failed to process %s: %s\n", __func__, conf->fai_fname, strerror(errno)); exit(EXIT_FAILURE); } conf->mplp_data[i]->conf = conf; conf->mplp_data[i]->ref = &mp_ref; h_tmp = sam_hdr_read(conf->mplp_data[i]->fp); if ( !h_tmp ) { fprintf(stderr,"[%s] fail to read the header of %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->mplp_data[i]->h = i ? hdr : h_tmp; // for j==0, "h" has not been set yet conf->mplp_data[i]->bam_id = bam_smpl_add_bam(conf->bsmpl,h_tmp->text,conf->files[i]); if ( conf->mplp_data[i]->bam_id<0 ) { // no usable readgroups in this bam, it can be skipped sam_close(conf->mplp_data[i]->fp); free(conf->mplp_data[i]); bam_hdr_destroy(h_tmp); free(conf->files[i]); if ( i+1<conf->nfiles ) memmove(&conf->files[i],&conf->files[i+1],sizeof(*conf->files)*(conf->nfiles-i-1)); conf->nfiles--; i--; continue; } if (conf->reg) { hts_idx_t *idx = sam_index_load(conf->mplp_data[i]->fp, conf->files[i]); if (idx == NULL) { fprintf(stderr, "[%s] fail to load index for %s\n", __func__, conf->files[i]); exit(EXIT_FAILURE); } conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg+1,conf->reg_itr->end+1); conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->buf.s); if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } if ( nregs==1 ) // no need to keep the index in memory hts_idx_destroy(idx); else conf->mplp_data[i]->idx = idx; } if ( !hdr ) hdr = h_tmp; /* save the header of first file in list */ else { // FIXME: check consistency between h and h_tmp bam_hdr_destroy(h_tmp); // we store only the first file's header; it's (alleged to be) // compatible with the i-th file's target_name lookup needs conf->mplp_data[i]->h = hdr; } } // allocate data storage proportionate to number of samples being studied sm->n bam_smpl_get_samples(conf->bsmpl, &conf->gplp->n); conf->gplp->n_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->m_plp = (int*) calloc(conf->gplp->n, sizeof(int)); conf->gplp->plp = (bam_pileup1_t**) calloc(conf->gplp->n, sizeof(bam_pileup1_t*)); fprintf(stderr, "[%s] %d samples in %d input files\n", __func__, conf->gplp->n, conf->nfiles); // write the VCF header conf->bcf_fp = hts_open(conf->output_fname?conf->output_fname:"-", hts_bcf_wmode(conf->output_type)); if (conf->bcf_fp == NULL) { fprintf(stderr, "[%s] failed to write to %s: %s\n", __func__, conf->output_fname? conf->output_fname : "standard output", strerror(errno)); exit(EXIT_FAILURE); } if ( conf->n_threads ) hts_set_threads(conf->bcf_fp, conf->n_threads); // BCF header creation conf->bcf_hdr = bcf_hdr_init("w"); conf->buf.l = 0; if (conf->record_cmd_line) { ksprintf(&conf->buf, "##bcftoolsVersion=%s+htslib-%s\n",bcftools_version(),hts_version()); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); conf->buf.l = 0; ksprintf(&conf->buf, "##bcftoolsCommand=mpileup"); for (i=1; i<conf->argc; i++) ksprintf(&conf->buf, " %s", conf->argv[i]); kputc('\n', &conf->buf); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } if (conf->fai_fname) { conf->buf.l = 0; ksprintf(&conf->buf, "##reference=file://%s\n", conf->fai_fname); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } // Translate BAM @SQ tags to BCF ##contig tags // todo: use/write new BAM header manipulation routines, fill also UR, M5 for (i=0; i<hdr->n_targets; i++) { conf->buf.l = 0; ksprintf(&conf->buf, "##contig=<ID=%s,length=%d>", hdr->target_name[i], hdr->target_len[i]); bcf_hdr_append(conf->bcf_hdr, conf->buf.s); } conf->buf.l = 0; bcf_hdr_append(conf->bcf_hdr,"##ALT=<ID=*,Description=\"Represents allele(s) other than observed.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=INDEL,Number=0,Type=Flag,Description=\"Indicates that the variant is an INDEL.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IDV,Number=1,Type=Integer,Description=\"Maximum number of reads supporting an indel\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=IMF,Number=1,Type=Float,Description=\"Maximum fraction of reads supporting an indel\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DP,Number=1,Type=Integer,Description=\"Raw read depth\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=VDB,Number=1,Type=Float,Description=\"Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)\",Version=\"3\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias (bigger is better)\">"); #if CDF_MWU_TESTS bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=RPB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Read Position Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=BQB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Base Quality Bias [CDF] (bigger is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQSB2,Number=1,Type=Float,Description=\"Mann-Whitney U test of Mapping Quality vs Strand Bias [CDF] (bigger is better)\">"); #endif bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=SGB,Number=1,Type=Float,Description=\"Segregation based metric.\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=MQ0F,Number=1,Type=Float,Description=\"Fraction of MQ0 reads (smaller is better)\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=I16,Number=16,Type=Float,Description=\"Auxiliary tag used for calling, see description of bcf_callret1_t in bam2bcf.h\">"); bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=QS,Number=R,Type=Float,Description=\"Auxiliary tag used for calling\">"); bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=PL,Number=G,Type=Integer,Description=\"List of Phred-scaled genotype likelihoods\">"); if ( conf->fmt_flag&B2B_FMT_DP ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP,Number=1,Type=Integer,Description=\"Number of high-quality bases\">"); if ( conf->fmt_flag&B2B_FMT_DV ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DV,Number=1,Type=Integer,Description=\"Number of high-quality non-reference bases\">"); if ( conf->fmt_flag&B2B_FMT_DPR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_INFO_DPR ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=DPR,Number=R,Type=Integer,Description=\"Number of high-quality bases observed for each allele\">"); if ( conf->fmt_flag&B2B_FMT_DP4 ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=DP4,Number=4,Type=Integer,Description=\"Number of high-quality ref-fwd, ref-reverse, alt-fwd and alt-reverse bases\">"); if ( conf->fmt_flag&B2B_FMT_SP ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=SP,Number=1,Type=Integer,Description=\"Phred-scaled strand bias P-value\">"); if ( conf->fmt_flag&B2B_FMT_AD ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths\">"); if ( conf->fmt_flag&B2B_FMT_ADF ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADF,Number=R,Type=Integer,Description=\"Allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_FMT_ADR ) bcf_hdr_append(conf->bcf_hdr,"##FORMAT=<ID=ADR,Number=R,Type=Integer,Description=\"Allelic depths on the reverse strand\">"); if ( conf->fmt_flag&B2B_INFO_AD ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=AD,Number=R,Type=Integer,Description=\"Total allelic depths\">"); if ( conf->fmt_flag&B2B_INFO_ADF ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADF,Number=R,Type=Integer,Description=\"Total allelic depths on the forward strand\">"); if ( conf->fmt_flag&B2B_INFO_ADR ) bcf_hdr_append(conf->bcf_hdr,"##INFO=<ID=ADR,Number=R,Type=Integer,Description=\"Total allelic depths on the reverse strand\">"); if ( conf->gvcf ) gvcf_update_header(conf->gvcf, conf->bcf_hdr); int nsmpl; const char **smpl = bam_smpl_get_samples(conf->bsmpl, &nsmpl); for (i=0; i<nsmpl; i++) bcf_hdr_add_sample(conf->bcf_hdr, smpl[i]); bcf_hdr_write(conf->bcf_fp, conf->bcf_hdr); conf->bca = bcf_call_init(-1., conf->min_baseQ); conf->bcr = (bcf_callret1_t*) calloc(nsmpl, sizeof(bcf_callret1_t)); conf->bca->openQ = conf->openQ, conf->bca->extQ = conf->extQ, conf->bca->tandemQ = conf->tandemQ; conf->bca->min_frac = conf->min_frac; conf->bca->min_support = conf->min_support; conf->bca->per_sample_flt = conf->flag & MPLP_PER_SAMPLE; conf->bc.bcf_hdr = conf->bcf_hdr; conf->bc.n = nsmpl; conf->bc.PL = (int32_t*) malloc(15 * nsmpl * sizeof(*conf->bc.PL)); if (conf->fmt_flag) { assert( sizeof(float)==sizeof(int32_t) ); conf->bc.DP4 = (int32_t*) malloc(nsmpl * sizeof(int32_t) * 4); conf->bc.fmt_arr = (uint8_t*) malloc(nsmpl * sizeof(float)); // all fmt_flag fields, float and int32 if ( conf->fmt_flag&(B2B_INFO_DPR|B2B_FMT_DPR|B2B_INFO_AD|B2B_INFO_ADF|B2B_INFO_ADR|B2B_FMT_AD|B2B_FMT_ADF|B2B_FMT_ADR) ) { // first B2B_MAX_ALLELES fields for total numbers, the rest per-sample conf->bc.ADR = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); conf->bc.ADF = (int32_t*) malloc((nsmpl+1)*B2B_MAX_ALLELES*sizeof(int32_t)); for (i=0; i<nsmpl; i++) { conf->bcr[i].ADR = conf->bc.ADR + (i+1)*B2B_MAX_ALLELES; conf->bcr[i].ADF = conf->bc.ADF + (i+1)*B2B_MAX_ALLELES; } } } // init mpileup conf->iter = bam_mplp_init(conf->nfiles, mplp_func, (void**)conf->mplp_data); if ( conf->flag & MPLP_SMART_OVERLAPS ) bam_mplp_init_overlaps(conf->iter); if ( (double)conf->max_depth * conf->nfiles > 1<<20) fprintf(stderr, "Warning: Potential memory hog, up to %.0fM reads in the pileup!\n", (double)conf->max_depth*conf->nfiles); if ( (double)conf->max_depth * conf->nfiles / nsmpl < 250 ) fprintf(stderr, "Note: The maximum per-sample depth with -d %d is %.1fx\n", conf->max_depth,(double)conf->max_depth * conf->nfiles / nsmpl); bam_mplp_set_maxcnt(conf->iter, conf->max_depth); conf->max_indel_depth = conf->max_indel_depth * nsmpl; conf->bcf_rec = bcf_init1(); bam_mplp_constructor(conf->iter, pileup_constructor); // Run mpileup for multiple regions if ( nregs ) { int ireg = 0; do { // first region is already positioned if ( ireg++ > 0 ) { conf->buf.l = 0; ksprintf(&conf->buf,"%s:%u-%u",conf->reg_itr->seq,conf->reg_itr->beg,conf->reg_itr->end); for (i=0; i<conf->nfiles; i++) { hts_itr_destroy(conf->mplp_data[i]->iter); conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->buf.s); if ( !conf->mplp_data[i]->iter ) { conf->mplp_data[i]->iter = sam_itr_querys(conf->mplp_data[i]->idx, conf->mplp_data[i]->h, conf->reg_itr->seq); if ( conf->mplp_data[i]->iter ) { fprintf(stderr,"[E::%s] fail to parse region '%s'\n", __func__, conf->buf.s); exit(EXIT_FAILURE); } fprintf(stderr,"[E::%s] the sequence \"%s\" not found: %s\n",__func__,conf->reg_itr->seq,conf->files[i]); exit(EXIT_FAILURE); } bam_mplp_reset(conf->iter); } } mpileup_reg(conf,conf->reg_itr->beg,conf->reg_itr->end); } while ( regitr_loop(conf->reg_itr) ); } else mpileup_reg(conf,0,0); flush_bcf_records(conf, conf->bcf_fp, conf->bcf_hdr, NULL); // clean up free(conf->bc.tmp.s); bcf_destroy1(conf->bcf_rec); if (conf->bcf_fp) { hts_close(conf->bcf_fp); bcf_hdr_destroy(conf->bcf_hdr); bcf_call_destroy(conf->bca); free(conf->bc.PL); free(conf->bc.DP4); free(conf->bc.ADR); free(conf->bc.ADF); free(conf->bc.fmt_arr); free(conf->bcr); } if ( conf->gvcf ) gvcf_destroy(conf->gvcf); free(conf->buf.s); for (i = 0; i < conf->gplp->n; ++i) free(conf->gplp->plp[i]); free(conf->gplp->plp); free(conf->gplp->n_plp); free(conf->gplp->m_plp); free(conf->gplp); bam_mplp_destroy(conf->iter); bam_hdr_destroy(hdr); for (i = 0; i < conf->nfiles; ++i) { if ( nregs>1 ) hts_idx_destroy(conf->mplp_data[i]->idx); sam_close(conf->mplp_data[i]->fp); if ( conf->mplp_data[i]->iter) hts_itr_destroy(conf->mplp_data[i]->iter); free(conf->mplp_data[i]); } if ( conf->reg_itr ) regitr_destroy(conf->reg_itr); free(conf->mplp_data); free(conf->plp); free(conf->n_plp); free(mp_ref.ref[0]); free(mp_ref.ref[1]); return 0; }
// currently, this function ONLY works if each read has one hit static void bam_mating_core(samFile* in, samFile* out, int remove_reads, int proper_pair_check, int add_ct) { bam_hdr_t *header; bam1_t *b[2]; int curr, has_prev, pre_end = 0, cur_end = 0; kstring_t str; str.l = str.m = 0; str.s = 0; header = sam_hdr_read(in); if (header == NULL) { fprintf(stderr, "[bam_mating_core] ERROR: Couldn't read header\n"); exit(1); } // Accept unknown, unsorted, or queryname sort order, but error on coordinate sorted. if ((header->l_text > 3) && (strncmp(header->text, "@HD", 3) == 0)) { char *p, *q; p = strstr(header->text, "\tSO:coordinate"); q = strchr(header->text, '\n'); // Looking for SO:coordinate within the @HD line only // (e.g. must ignore in a @CO comment line later in header) if ((p != 0) && (p < q)) { fprintf(stderr, "[bam_mating_core] ERROR: Coordinate sorted, require grouped/sorted by queryname.\n"); exit(1); } } sam_hdr_write(out, header); b[0] = bam_init1(); b[1] = bam_init1(); curr = 0; has_prev = 0; while (sam_read1(in, header, b[curr]) >= 0) { bam1_t *cur = b[curr], *pre = b[1-curr]; if (cur->core.flag & BAM_FSECONDARY) { if ( !remove_reads ) sam_write1(out, header, cur); continue; // skip secondary alignments } if (cur->core.flag & BAM_FSUPPLEMENTARY) { sam_write1(out, header, cur); continue; // pass supplementary alignments through unchanged (TODO:make them match read they came from) } if (cur->core.tid < 0 || cur->core.pos < 0) // If unmapped set the flag { cur->core.flag |= BAM_FUNMAP; } if ((cur->core.flag&BAM_FUNMAP) == 0) // If mapped calculate end { cur_end = bam_endpos(cur); // Check cur_end isn't past the end of the contig we're on, if it is set the UNMAP'd flag if (cur_end > (int)header->target_len[cur->core.tid]) cur->core.flag |= BAM_FUNMAP; } if (has_prev) { // do we have a pair of reads to examine? if (strcmp(bam_get_qname(cur), bam_get_qname(pre)) == 0) { // identical pair name pre->core.flag |= BAM_FPAIRED; cur->core.flag |= BAM_FPAIRED; sync_mate(pre, cur); if (pre->core.tid == cur->core.tid && !(cur->core.flag&(BAM_FUNMAP|BAM_FMUNMAP)) && !(pre->core.flag&(BAM_FUNMAP|BAM_FMUNMAP))) // if safe set TLEN/ISIZE { uint32_t cur5, pre5; cur5 = (cur->core.flag&BAM_FREVERSE)? cur_end : cur->core.pos; pre5 = (pre->core.flag&BAM_FREVERSE)? pre_end : pre->core.pos; cur->core.isize = pre5 - cur5; pre->core.isize = cur5 - pre5; } else cur->core.isize = pre->core.isize = 0; if (add_ct) bam_template_cigar(pre, cur, &str); // TODO: Add code to properly check if read is in a proper pair based on ISIZE distribution if (proper_pair_check && !plausibly_properly_paired(pre,cur)) { pre->core.flag &= ~BAM_FPROPER_PAIR; cur->core.flag &= ~BAM_FPROPER_PAIR; } // Write out result if ( !remove_reads ) { sam_write1(out, header, pre); sam_write1(out, header, cur); } else { // If we have to remove reads make sure we do it in a way that doesn't create orphans with bad flags if(pre->core.flag&BAM_FUNMAP) cur->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(cur->core.flag&BAM_FUNMAP) pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if(!(pre->core.flag&BAM_FUNMAP)) sam_write1(out, header, pre); if(!(cur->core.flag&BAM_FUNMAP)) sam_write1(out, header, cur); } has_prev = 0; } else { // unpaired? clear bad info and write it out if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); if ( !remove_reads || !(pre->core.flag&BAM_FUNMAP) ) sam_write1(out, header, pre); } } else has_prev = 1; curr = 1 - curr; pre_end = cur_end; } if (has_prev && !remove_reads) { // If we still have a BAM in the buffer it must be unpaired bam1_t *pre = b[1-curr]; if (pre->core.tid < 0 || pre->core.pos < 0 || pre->core.flag&BAM_FUNMAP) { // If unmapped pre->core.flag |= BAM_FUNMAP; pre->core.tid = -1; pre->core.pos = -1; } pre->core.mtid = -1; pre->core.mpos = -1; pre->core.isize = 0; pre->core.flag &= ~(BAM_FPAIRED|BAM_FMREVERSE|BAM_FPROPER_PAIR); sam_write1(out, header, pre); } bam_hdr_destroy(header); bam_destroy1(b[0]); bam_destroy1(b[1]); free(str.s); }