int main(int argc, char **argv) { dlib::BamHandle in = dlib::BamHandle("bed_test.bam"); dlib::ParsedBed bed = dlib::ParsedBed("bed_test.bed", in.header); bam1_t *b = bam_init1(); size_t diffs = 0; void *lh3bed = bed_read("bed_test.bed"); samFile *so = sam_open("disagreed.bam", "wb9"); sam_hdr_write(so, in.header); size_t disagrees = 0, agrees = 0; int dbr = 0, lh3r = 0; while(in.read(b) != -1) { if(b->core.flag & (BAM_FUNMAP)) continue; if((dbr = bed.bam1_test(b)) != (lh3r = bed_overlap(lh3bed, in.header->target_name[b->core.tid], b->core.pos, bam_endpos(b)))) { LOG_EXIT("dbr: %i. lh3r: %i. Contig: %s. Position: %i. endpos; %i\n", dbr, lh3r, in.header->target_name[b->core.tid], b->core.pos, bam_endpos(b)); if(++disagrees % 100 == 0) LOG_DEBUG("disagrees: %lu.\n", disagrees); sam_write1(so, in.header, b); } else { if(++agrees % 500000 == 0) LOG_DEBUG("agrees: %lu.\n", agrees); } } sam_close(so); bam_destroy1(b); bed_destroy(lh3bed); return EXIT_SUCCESS; }
int bam_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; char **fn = NULL; int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); #define MPLP_PRINT_POS 0x4000 mplp.max_mq = 60; mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN; while ((c = getopt(argc, argv, "Agf:r:l:M:q:Q:uaRC:BDSd:L:b:P:o:e:h:Im:F:EG:6Os")) >= 0) { switch (c) { case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; case 'l': mplp.bed = bed_read(optarg); break; case 'P': mplp.pl_list = strdup(optarg); break; case 'g': mplp.flag |= MPLP_GLF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_GLF; break; case 'a': mplp.flag |= MPLP_NO_ORPHAN | MPLP_REALN; break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'D': mplp.flag |= MPLP_FMT_DP; break; case 'S': mplp.flag |= MPLP_FMT_SP; break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_EXT_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; case 'O': mplp.flag |= MPLP_PRINT_POS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'M': mplp.max_mq = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': mplp.openQ = atoi(optarg); break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': { FILE *fp_rg; char buf[1024]; mplp.rghash = bcf_str2id_init(); if ((fp_rg = fopen(optarg, "r")) == 0) fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... bcf_str2id_add(mplp.rghash, strdup(buf)); fclose(fp_rg); } break; } } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools mpileup [options] in1.bam [in2.bam [...]]\n\n"); fprintf(stderr, "Input options:\n\n"); fprintf(stderr, " -6 assume the quality is in the Illumina-1.3+ encoding\n"); fprintf(stderr, " -A count anomalous read pairs\n"); fprintf(stderr, " -B disable BAQ computation\n"); fprintf(stderr, " -b FILE list of input BAM files [null]\n"); fprintf(stderr, " -C INT parameter for adjusting mapQ; 0 to disable [0]\n"); fprintf(stderr, " -d INT max per-BAM depth to avoid excessive memory usage [%d]\n", mplp.max_depth); fprintf(stderr, " -E extended BAQ for higher sensitivity but lower specificity\n"); fprintf(stderr, " -f FILE faidx indexed reference sequence file [null]\n"); fprintf(stderr, " -G FILE exclude read groups listed in FILE [null]\n"); fprintf(stderr, " -l FILE list of positions (chr pos) or regions (BED) [null]\n"); fprintf(stderr, " -M INT cap mapping quality at INT [%d]\n", mplp.max_mq); fprintf(stderr, " -r STR region in which pileup is generated [null]\n"); fprintf(stderr, " -R ignore RG tags\n"); fprintf(stderr, " -q INT skip alignments with mapQ smaller than INT [%d]\n", mplp.min_mq); fprintf(stderr, " -Q INT skip bases with baseQ/BAQ smaller than INT [%d]\n", mplp.min_baseQ); fprintf(stderr, "\nOutput options:\n\n"); fprintf(stderr, " -D output per-sample DP in BCF (require -g/-u)\n"); fprintf(stderr, " -g generate BCF output (genotype likelihoods)\n"); fprintf(stderr, " -O output base positions on reads (disabled by -g/-u)\n"); fprintf(stderr, " -s output mapping quality (disabled by -g/-u)\n"); fprintf(stderr, " -S output per-sample strand bias P-value in BCF (require -g/-u)\n"); fprintf(stderr, " -u generate uncompress BCF output\n"); fprintf(stderr, "\nSNP/INDEL genotype likelihoods options (effective with `-g' or `-u'):\n\n"); fprintf(stderr, " -e INT Phred-scaled gap extension seq error probability [%d]\n", mplp.extQ); fprintf(stderr, " -F FLOAT minimum fraction of gapped reads for candidates [%g]\n", mplp.min_frac); fprintf(stderr, " -h INT coefficient for homopolymer errors [%d]\n", mplp.tandemQ); fprintf(stderr, " -I do not perform indel calling\n"); fprintf(stderr, " -L INT max per-sample depth for INDEL calling [%d]\n", mplp.max_indel_depth); fprintf(stderr, " -m INT minimum gapped reads for indel candidates [%d]\n", mplp.min_support); fprintf(stderr, " -o INT Phred-scaled gap open sequencing error probability [%d]\n", mplp.openQ); fprintf(stderr, " -P STR comma separated list of platforms for indels [all]\n"); fprintf(stderr, "\n"); fprintf(stderr, "Notes: Assuming diploid individuals.\n\n"); return 1; } if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; mpileup(&mplp,nfiles,fn); for (c=0; c<nfiles; c++) free(fn[c]); free(fn); } else mpileup(&mplp, argc - optind, argv + optind); if (mplp.rghash) bcf_str2id_thorough_destroy(mplp.rghash); free(mplp.reg); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); if (mplp.bed) bed_destroy(mplp.bed); return 0; }
int main(int argc, char *argv[]) { int c, skip = -1, meta = -1, list_chrms = 0, force = 0, print_header = 0, bed_reg = 0; ti_conf_t conf = ti_conf_gff; const char *reheader = NULL; while ((c = getopt(argc, argv, "p:s:b:e:0S:c:lhfBr:")) >= 0) { switch (c) { case 'B': bed_reg = 1; break; case '0': conf.preset |= TI_FLAG_UCSC; break; case 'S': skip = atoi(optarg); break; case 'c': meta = optarg[0]; break; case 'p': if (strcmp(optarg, "gff") == 0) conf = ti_conf_gff; else if (strcmp(optarg, "bed") == 0) conf = ti_conf_bed; else if (strcmp(optarg, "sam") == 0) conf = ti_conf_sam; else if (strcmp(optarg, "vcf") == 0 || strcmp(optarg, "vcf4") == 0) conf = ti_conf_vcf; else if (strcmp(optarg, "psltbl") == 0) conf = ti_conf_psltbl; else { fprintf(stderr, "[main] unrecognized preset '%s'\n", optarg); return 1; } break; case 's': conf.sc = atoi(optarg); break; case 'b': conf.bc = atoi(optarg); break; case 'e': conf.ec = atoi(optarg); break; case 'l': list_chrms = 1; break; case 'h': print_header = 1; break; case 'f': force = 1; break; case 'r': reheader = optarg; break; } } if (skip >= 0) conf.line_skip = skip; if (meta >= 0) conf.meta_char = meta; if (optind == argc) { fprintf(stderr, "\n"); fprintf(stderr, "Program: tabix (TAB-delimited file InderXer)\n"); fprintf(stderr, "Version: %s\n\n", PACKAGE_VERSION); fprintf(stderr, "Usage: tabix <in.tab.bgz> [region1 [region2 [...]]]\n\n"); fprintf(stderr, "Options: -p STR preset: gff, bed, sam, vcf, psltbl [gff]\n"); fprintf(stderr, " -s INT sequence name column [1]\n"); fprintf(stderr, " -b INT start column [4]\n"); fprintf(stderr, " -e INT end column; can be identical to '-b' [5]\n"); fprintf(stderr, " -S INT skip first INT lines [0]\n"); fprintf(stderr, " -c CHAR symbol for comment/meta lines [#]\n"); fprintf(stderr, " -r FILE replace the header with the content of FILE [null]\n"); fprintf(stderr, " -B region1 is a BED file (entire file will be read)\n"); fprintf(stderr, " -0 zero-based coordinate\n"); fprintf(stderr, " -h print the header lines\n"); fprintf(stderr, " -l list chromosome names\n"); fprintf(stderr, " -f force to overwrite the index\n"); fprintf(stderr, "\n"); return 1; } if (list_chrms) { ti_index_t *idx; int i, n; const char **names; idx = ti_index_load(argv[optind]); if (idx == 0) { fprintf(stderr, "[main] fail to load the index file.\n"); return 1; } names = ti_seqname(idx, &n); for (i = 0; i < n; ++i) printf("%s\n", names[i]); free(names); ti_index_destroy(idx); return 0; } if (reheader) return reheader_file(reheader,argv[optind],conf.meta_char); struct stat stat_tbi,stat_vcf; char *fnidx = calloc(strlen(argv[optind]) + 5, 1); strcat(strcpy(fnidx, argv[optind]), ".tbi"); if (optind + 1 == argc) { if (force == 0) { if (stat(fnidx, &stat_tbi) == 0) { // Before complaining, check if the VCF file isn't newer. This is a common source of errors, // people tend not to notice that tabix failed stat(argv[optind], &stat_vcf); if ( stat_vcf.st_mtime <= stat_tbi.st_mtime ) { fprintf(stderr, "[tabix] the index file exists. Please use '-f' to overwrite.\n"); free(fnidx); return 1; } } } if ( bgzf_check_bgzf(argv[optind])!=1 ) { fprintf(stderr,"[tabix] was bgzip used to compress this file? %s\n", argv[optind]); free(fnidx); return 1; } return ti_index_build(argv[optind], &conf); } { // retrieve tabix_t *t; // Common source of errors: new VCF is used with an old index stat(fnidx, &stat_tbi); stat(argv[optind], &stat_vcf); if ( force==0 && stat_vcf.st_mtime > stat_tbi.st_mtime ) { fprintf(stderr, "[tabix] the index file is older than the vcf file. Please use '-f' to overwrite or reindex.\n"); free(fnidx); return 1; } free(fnidx); if ((t = ti_open(argv[optind], 0)) == 0) { fprintf(stderr, "[main] fail to open the data file.\n"); return 1; } if (strcmp(argv[optind+1], ".") == 0) { // retrieve all ti_iter_t iter; const char *s; int len; iter = ti_query(t, 0, 0, 0); while ((s = ti_read(t, iter, &len)) != 0) { fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); } else { // retrieve from specified regions int i, len; ti_iter_t iter; const char *s; const ti_conf_t *idxconf; if (ti_lazy_index_load(t) < 0 && bed_reg == 0) { fprintf(stderr,"[tabix] failed to load the index file.\n"); return 1; } idxconf = ti_get_conf(t->idx); if ( print_header ) { // If requested, print the header lines here iter = ti_query(t, 0, 0, 0); while ((s = ti_read(t, iter, &len)) != 0) { if ((int)(*s) != idxconf->meta_char) break; fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); } if (bed_reg) { extern int bed_overlap(const void *_h, const char *chr, int beg, int end); extern void *bed_read(const char *fn); extern void bed_destroy(void *_h); const ti_conf_t *conf_ = idxconf? idxconf : &conf; // use the index file if available void *bed = bed_read(argv[optind+1]); // load the BED file ti_interval_t intv; if (bed == 0) { fprintf(stderr, "[main] fail to read the BED file.\n"); return 1; } iter = ti_query(t, 0, 0, 0); while ((s = ti_read(t, iter, &len)) != 0) { int c; ti_get_intv(conf_, len, (char*)s, &intv); c = *intv.se; *intv.se = '\0'; if (bed_overlap(bed, intv.ss, intv.beg, intv.end)) { *intv.se = c; puts(s); } *intv.se = c; } ti_iter_destroy(iter); bed_destroy(bed); } else { for (i = optind + 1; i < argc; ++i) { int tid, beg, end; if (ti_parse_region(t->idx, argv[i], &tid, &beg, &end) == 0) { iter = ti_queryi(t, tid, beg, end); while ((s = ti_read(t, iter, &len)) != 0) { fputs(s, stdout); fputc('\n', stdout); } ti_iter_destroy(iter); } // else fprintf(stderr, "[main] invalid region: unknown target name or minus interval.\n"); } } } ti_close(t); } return 0; }
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:")) >= 0) { switch (n) { case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold } } if (optind == argc) { fprintf(stderr, "Usage: bam2depth [-r reg] [-q baseQthres] [-Q mapQthres] [-b in.bed] <in1.bam> [...]\n"); return 1; } // initialize the auxiliary data structures n = argc - optind; // the number of BAMs on the command line data = (aux_t **) calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = (aux_t *) calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization n_plp = (int*) calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = (bam_pileup1_t **) calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); return 0; }
int main_depth(int argc, char *argv[]) { int i, n, tid, reg_tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0; int all = 0, status = EXIT_SUCCESS, nfiles, max_depth = -1; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_hdr_t *h = NULL; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; int last_pos = -1, last_tid = -1, ret; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), { NULL, 0, NULL, 0 } }; // parse the command line while ((n = getopt_long(argc, argv, "r:b:q:Q:l:f:am:d:", lopts, NULL)) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); // BED or position list file can be parsed now if (!bed) { print_error_errno("depth", "Could not read file \"%s\"", optarg); return 1; } break; case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; case 'a': all++; break; case 'd': case 'm': max_depth = atoi(optarg); break; // maximum coverage depth default: if (parse_sam_global_opt(n, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': return usage(); } } if (optind == argc && !file_list) return usage(); // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input reg_tid = 0; beg = 0; end = INT_MAX; // set the default region for (i = 0; i < n; ++i) { int rf; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = sam_open_format(argv[optind+i], "r", &ga.in); // open BAM if (data[i]->fp == NULL) { print_error_errno("depth", "Could not open \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } rf = SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ; if (baseQ) rf |= SAM_QUAL; if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, rf)) { fprintf(stderr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(stderr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header if (data[i]->hdr == NULL) { fprintf(stderr, "Couldn't read header for \"%s\"\n", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } if (reg) { // if a region is specified hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index if (idx == NULL) { print_error("depth", "can't load index for \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { print_error("depth", "can't parse region \"%s\"", reg); status = EXIT_FAILURE; goto depth_end; } } } h = data[0]->hdr; // easy access to the header of the 1st BAM if (reg) { beg = data[0]->iter->beg; // and to the parsed region coordinates end = data[0]->iter->end; reg_tid = data[0]->iter->tid; } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization if (0 < max_depth) bam_mplp_set_maxcnt(mplp,max_depth); // set maximum coverage depth n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) while ((ret=bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (tid >= h->n_targets) continue; // diff number of @SQ lines per file? if (all) { while (tid > last_tid) { if (last_tid >= 0 && !reg) { // Deal with remainder or entirety of last tid. while (++last_pos < h->target_len[last_tid]) { // Horribly inefficient, but the bed API is an obfuscated black box. if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } } last_tid++; last_pos = -1; if (all < 2) break; } // Deal with missing portion of current tid while (++last_pos < pos) { if (last_pos < beg) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } last_tid = tid; last_pos = pos; } if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } if (ret < 0) status = EXIT_FAILURE; free(n_plp); free(plp); bam_mplp_destroy(mplp); if (all) { // Handle terminating region if (last_tid < 0 && reg && all > 1) { last_tid = reg_tid; last_pos = beg-1; } while (last_tid >= 0 && last_tid < h->n_targets) { while (++last_pos < h->target_len[last_tid]) { if (last_pos >= end) break; if (bed && bed_overlap(bed, h->target_name[last_tid], last_pos, last_pos + 1) == 0) continue; fputs(h->target_name[last_tid], stdout); printf("\t%d", last_pos+1); for (i = 0; i < n; i++) putchar('\t'), putchar('0'); putchar('\n'); } last_tid++; last_pos = -1; if (all < 2 || reg) break; } } depth_end: for (i = 0; i < n && data[i]; ++i) { bam_hdr_destroy(data[i]->hdr); if (data[i]->fp) sam_close(data[i]->fp); hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } sam_global_args_free(&ga); return status; }
int main_depth(int argc, char *argv[]) #endif { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_header_t *h = 0; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); break; // BED or position list file can be parsed now case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; } } if (optind == argc && !file_list) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); fprintf(stderr, "Options:\n"); fprintf(stderr, " -b <bed> list of positions or regions\n"); fprintf(stderr, " -f <list> list of input BAM filenames, one per line [null]\n"); fprintf(stderr, " -l <int> minQLen\n"); fprintf(stderr, " -q <int> base quality threshold\n"); fprintf(stderr, " -Q <int> mapping quality threshold\n"); fprintf(stderr, " -r <chr:from-to> region\n"); fprintf(stderr, "\n"); return 1; } // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(void*)); // data[i] for the i-th input beg = 0; end = 1<<30; tid = -1; // set the default region for (i = 0; i < n; ++i) { bam_header_t *htmp; data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = bam_open(argv[optind+i], "r"); // open BAM data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter htmp = bam_header_read(data[i]->fp); // read the BAM header if (i == 0) { h = htmp; // keep the header of the 1st BAM if (reg) bam_parse_region(h, reg, &tid, &beg, &end); // also parse the region } else bam_header_destroy(htmp); // if not the 1st BAM, trash the header if (tid >= 0) { // if a region is specified and parsed successfully bam_index_t *idx = bam_index_load(argv[optind+i]); // load the index data[i]->iter = bam_iter_query(idx, tid, beg, end); // set the iterator bam_index_destroy(idx); // the index is not needed any more; phase out of the memory } } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization bam_mplp_set_maxcnt(mplp,2147483647); // set max_depth to int max n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(void*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam1_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); bam_header_destroy(h); for (i = 0; i < n; ++i) { bam_close(data[i]->fp); if (data[i]->iter) bam_iter_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } return 0; }
int main_samview(int argc, char *argv[]) { int index; for(index = 0; index < argc; index++) { printf("The %d is %s\n",index,argv[index]); } getchar();return 0; int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; int is_long_help = 0, n_threads = 0; int64_t count = 0; samFile *in = 0, *out = 0, *un_out=0; bam_hdr_t *header = NULL; char out_mode[5], out_un_mode[5], *out_format = ""; char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; samview_settings_t settings = { .rghash = NULL, .min_mapQ = 0, .flag_on = 0, .flag_off = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, .subsam_frac = -1., .library = NULL, .bed = NULL, }; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; /* parse command-line options */ strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:", lopts, NULL)) >= 0) { switch (c) { case 's': if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { srand(settings.subsam_seed); settings.subsam_seed = rand(); } settings.subsam_frac = strtod(q, &q); break; case 'm': settings.min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; case 't': fn_list = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; case 'o': fn_out = strdup(optarg); break; case 'U': fn_un_out = strdup(optarg); break; case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; case 'L': if ((settings.bed = bed_read(optarg)) == NULL) { print_error_errno("view", "Could not read file \"%s\"", optarg); ret = 1; goto view_end; } break; case 'r': if (add_read_group_single("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; case 'R': if (add_read_groups_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; /* REMOVED as htslib doesn't support this //case 'x': out_format = "x"; break; //case 'X': out_format = "X"; break; */ case '?': is_long_help = 1; break; case 'B': settings.remove_B = 1; break; case '@': n_threads = strtol(optarg, 0, 0); break; case 'x': { if (strlen(optarg) != 2) { fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); return usage(stderr, EXIT_FAILURE, is_long_help); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); settings.remove_aux[settings.remove_aux_len-1] = optarg; } break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(stderr, EXIT_FAILURE, is_long_help); break; } } if (compress_level >= 0 && !*out_format) out_format = "b"; if (is_header_only) is_header = 1; // File format auto-detection first if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL); if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL); // Overridden by manual -b, -C if (*out_format) out_mode[1] = out_un_mode[1] = *out_format; out_mode[2] = out_un_mode[2] = '\0'; // out_(un_)mode now 1 or 2 bytes long, followed by nul. if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; strcat(out_mode, tmp); strcat(out_un_mode, tmp); } if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... fn_in = (optind < argc)? argv[optind] : "-"; // generate the fn_list if necessary if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference); // open file handlers if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { print_error_errno("view", "failed to open \"%s\" for reading", fn_in); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(in, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); ret = 1; goto view_end; } if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... char *tmp; int l; tmp = drop_rg(header->text, settings.rghash, &l); free(header->text); header->text = tmp; header->l_text = l; } if (!is_count) { if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output"); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (fn_un_out) { if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(un_out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(un_out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } } } if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } if (is_header_only) goto view_end; // no need to print alignments if (optind + 1 >= argc) { // convert/print the entire file bam1_t *b = bam_init1(); int r; while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (r < -1) { fprintf(stderr, "[main_samview] truncated file.\n"); ret = 1; } bam_destroy1(b); } else { // retrieve alignments in specified regions int i; bam1_t *b; hts_idx_t *idx = sam_index_load(in, fn_in); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); ret = 1; goto view_end; } b = bam_init1(); for (i = optind + 1; i < argc; ++i) { int result; hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' if (iter == NULL) { // region invalid or reference name not found int beg, end; if (hts_parse_reg(argv[i], &beg, &end)) fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); else fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]); continue; } // fetch alignments while ((result = sam_itr_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); ret = 1; break; } } bam_destroy1(b); hts_idx_destroy(idx); // destroy the BAM index } view_end: if (is_count && ret == 0) printf("%" PRId64 "\n", count); // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); if ( header ) bam_hdr_destroy(header); if (settings.bed) bed_destroy(settings.bed); if (settings.rghash) { khint_t k; for (k = 0; k < kh_end(settings.rghash); ++k) if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); kh_destroy(rg, settings.rghash); } if (settings.remove_aux_len) { free(settings.remove_aux); } return ret; } static int usage(FILE *fp, int exit_status, int is_long_help) { fprintf(fp, "\n" "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n" "\n" "Options:\n" // output options " -b output BAM\n" " -C output CRAM (requires -T)\n" " -1 use fast BAM compression (implies -b)\n" " -u uncompressed BAM output (implies -b)\n" " -h include header in SAM output\n" " -H print SAM header only (no alignments)\n" " -c print only the count of matching records\n" " -o FILE output file name [stdout]\n" " -U FILE output reads not selected by filters to FILE [null]\n" // extra input " -t FILE FILE listing reference names and lengths (see long help) [null]\n" // read filters " -L FILE only include reads overlapping this BED FILE [null]\n" " -r STR only include reads in read group STR [null]\n" " -R FILE only include reads with read group listed in FILE [null]\n" " -q INT only include reads with mapping quality >= INT [0]\n" " -l STR only include reads in library STR [null]\n" " -m INT only include reads with number of CIGAR operations consuming\n" " query sequence >= INT [0]\n" " -f INT only include reads with all bits set in INT set in FLAG [0]\n" " -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" // read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" " -s FLOAT integer part sets seed of random number generator [0];\n" " rest sets fraction of templates to subsample [no subsampling]\n" // general options " -@, --threads INT\n" " number of BAM/CRAM compression threads [0]\n" " -? print long help, including note about region specification\n" " -S ignored (input format is auto-detected)\n"); sam_global_opt_help(fp, "-.O.T"); fprintf(fp, "\n"); if (is_long_help) fprintf(fp, "Notes:\n" "\n" "1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" " Further control over the CRAM format can be specified by using the\n" " --output-fmt-option, e.g. to specify the number of sequences per slice\n" " and to use avoid reference based compression:\n" "\n" "\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" "\t --output-fmt-option no_ref -o out.cram in.bam\n" "\n" " Options can also be specified as a comma separated list within the\n" " --output-fmt value too. For example this is equivalent to the above\n" "\n" "\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" "\t -o out.cram in.bam\n" "\n" "2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" " two fields of each line consisting of the reference name and the\n" " corresponding sequence length. The `.fai' file generated by \n" " `samtools faidx' is suitable for use as this file. This may be an\n" " empty file if reads are unaligned.\n" "\n" "3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n" "\n" "4. BAM->SAM conversion: samtools view -h in.bam\n" "\n" "5. A region should be presented in one of the following formats:\n" " `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" " specified, the input alignment file must be a sorted and indexed\n" " alignment (BAM/CRAM) file.\n" "\n" "6. Option `-u' is preferred over `-b' when the output is piped to\n" " another samtools command.\n" "\n"); return exit_status; }
int bam_mpileup(int argc, char *argv[]) { int c; const char *file_list = NULL; char **fn = NULL; int nfiles = 0, use_orphan = 0; mplp_conf_t mplp; memset(&mplp, 0, sizeof(mplp_conf_t)); mplp.min_baseQ = 13; mplp.capQ_thres = 0; mplp.max_depth = 250; mplp.max_indel_depth = 250; mplp.openQ = 40; mplp.extQ = 20; mplp.tandemQ = 100; mplp.min_frac = 0.002; mplp.min_support = 1; mplp.flag = MPLP_NO_ORPHAN | MPLP_REALN | MPLP_SMART_OVERLAPS; mplp.argc = argc; mplp.argv = argv; mplp.rflag_filter = BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP; mplp.output_fname = NULL; static const struct option lopts[] = { {"rf", required_argument, NULL, 1}, // require flag {"ff", required_argument, NULL, 2}, // filter flag {"incl-flags", required_argument, NULL, 1}, {"excl-flags", required_argument, NULL, 2}, {"output", required_argument, NULL, 3}, {"open-prob", required_argument, NULL, 4}, {"illumina1.3+", no_argument, NULL, '6'}, {"count-orphans", no_argument, NULL, 'A'}, {"bam-list", required_argument, NULL, 'b'}, {"no-BAQ", no_argument, NULL, 'B'}, {"no-baq", no_argument, NULL, 'B'}, {"adjust-MQ", required_argument, NULL, 'C'}, {"adjust-mq", required_argument, NULL, 'C'}, {"max-depth", required_argument, NULL, 'd'}, {"redo-BAQ", no_argument, NULL, 'E'}, {"redo-baq", no_argument, NULL, 'E'}, {"fasta-ref", required_argument, NULL, 'f'}, {"exclude-RG", required_argument, NULL, 'G'}, {"exclude-rg", required_argument, NULL, 'G'}, {"positions", required_argument, NULL, 'l'}, {"region", required_argument, NULL, 'r'}, {"ignore-RG", no_argument, NULL, 'R'}, {"ignore-rg", no_argument, NULL, 'R'}, {"min-MQ", required_argument, NULL, 'q'}, {"min-mq", required_argument, NULL, 'q'}, {"min-BQ", required_argument, NULL, 'Q'}, {"min-bq", required_argument, NULL, 'Q'}, {"ignore-overlaps", no_argument, NULL, 'x'}, {"BCF", no_argument, NULL, 'g'}, {"bcf", no_argument, NULL, 'g'}, {"VCF", no_argument, NULL, 'v'}, {"vcf", no_argument, NULL, 'v'}, {"output-BP", no_argument, NULL, 'O'}, {"output-bp", no_argument, NULL, 'O'}, {"output-MQ", no_argument, NULL, 's'}, {"output-mq", no_argument, NULL, 's'}, {"output-tags", required_argument, NULL, 't'}, {"uncompressed", no_argument, NULL, 'u'}, {"ext-prob", required_argument, NULL, 'e'}, {"gap-frac", required_argument, NULL, 'F'}, {"tandem-qual", required_argument, NULL, 'h'}, {"skip-indels", no_argument, NULL, 'I'}, {"max-idepth", required_argument, NULL, 'L'}, {"min-ireads ", required_argument, NULL, 'm'}, {"per-sample-mF", no_argument, NULL, 'p'}, {"per-sample-mf", no_argument, NULL, 'p'}, {"platforms", required_argument, NULL, 'P'}, {NULL, 0, NULL, 0} }; while ((c = getopt_long(argc, argv, "Agf:r:l:q:Q:uRC:BDSd:L:b:P:po:e:h:Im:F:EG:6OsVvxt:",lopts,NULL)) >= 0) { switch (c) { case 'x': mplp.flag &= ~MPLP_SMART_OVERLAPS; break; case 1 : mplp.rflag_require = bam_str2flag(optarg); if ( mplp.rflag_require<0 ) { fprintf(stderr,"Could not parse --rf %s\n", optarg); return 1; } break; case 2 : mplp.rflag_filter = bam_str2flag(optarg); if ( mplp.rflag_filter<0 ) { fprintf(stderr,"Could not parse --ff %s\n", optarg); return 1; } break; case 3 : mplp.output_fname = optarg; break; case 4 : mplp.openQ = atoi(optarg); break; case 'f': mplp.fai = fai_load(optarg); if (mplp.fai == 0) return 1; mplp.fai_fname = optarg; break; case 'd': mplp.max_depth = atoi(optarg); break; case 'r': mplp.reg = strdup(optarg); break; case 'l': // In the original version the whole BAM was streamed which is inefficient // with few BED intervals and big BAMs. Todo: devise a heuristic to determine // best strategy, that is streaming or jumping. mplp.bed = bed_read(optarg); if (!mplp.bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; } break; case 'P': mplp.pl_list = strdup(optarg); break; case 'p': mplp.flag |= MPLP_PER_SAMPLE; break; case 'g': mplp.flag |= MPLP_BCF; break; case 'v': mplp.flag |= MPLP_BCF | MPLP_VCF; break; case 'u': mplp.flag |= MPLP_NO_COMP | MPLP_BCF; break; case 'B': mplp.flag &= ~MPLP_REALN; break; case 'D': mplp.fmt_flag |= B2B_FMT_DP; fprintf(stderr, "[warning] samtools mpileup option `-D` is functional, but deprecated. Please switch to `-t DP` in future.\n"); break; case 'S': mplp.fmt_flag |= B2B_FMT_SP; fprintf(stderr, "[warning] samtools mpileup option `-S` is functional, but deprecated. Please switch to `-t SP` in future.\n"); break; case 'V': mplp.fmt_flag |= B2B_FMT_DV; fprintf(stderr, "[warning] samtools mpileup option `-V` is functional, but deprecated. Please switch to `-t DV` in future.\n"); break; case 'I': mplp.flag |= MPLP_NO_INDEL; break; case 'E': mplp.flag |= MPLP_REDO_BAQ; break; case '6': mplp.flag |= MPLP_ILLUMINA13; break; case 'R': mplp.flag |= MPLP_IGNORE_RG; break; case 's': mplp.flag |= MPLP_PRINT_MAPQ; break; case 'O': mplp.flag |= MPLP_PRINT_POS; break; case 'C': mplp.capQ_thres = atoi(optarg); break; case 'q': mplp.min_mq = atoi(optarg); break; case 'Q': mplp.min_baseQ = atoi(optarg); break; case 'b': file_list = optarg; break; case 'o': { char *end; long value = strtol(optarg, &end, 10); // Distinguish between -o INT and -o FILE (a bit of a hack!) if (*end == '\0') mplp.openQ = value; else mplp.output_fname = optarg; } break; case 'e': mplp.extQ = atoi(optarg); break; case 'h': mplp.tandemQ = atoi(optarg); break; case 'A': use_orphan = 1; break; case 'F': mplp.min_frac = atof(optarg); break; case 'm': mplp.min_support = atoi(optarg); break; case 'L': mplp.max_indel_depth = atoi(optarg); break; case 'G': { FILE *fp_rg; char buf[1024]; mplp.rghash = khash_str2int_init(); if ((fp_rg = fopen(optarg, "r")) == 0) fprintf(stderr, "(%s) Fail to open file %s. Continue anyway.\n", __func__, optarg); while (!feof(fp_rg) && fscanf(fp_rg, "%s", buf) > 0) // this is not a good style, but forgive me... khash_str2int_inc(mplp.rghash, strdup(buf)); fclose(fp_rg); } break; case 't': mplp.fmt_flag |= parse_format_flag(optarg); break; default: fprintf(stderr,"Invalid option: '%c'\n", c); return 1; } } if ( !(mplp.flag&MPLP_REALN) && mplp.flag&MPLP_REDO_BAQ ) { fprintf(stderr,"Error: The -B option cannot be combined with -E\n"); return 1; } if (use_orphan) mplp.flag &= ~MPLP_NO_ORPHAN; if (argc == 1) { print_usage(stderr, &mplp); return 1; } int ret; if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; ret = mpileup(&mplp,nfiles,fn); for (c=0; c<nfiles; c++) free(fn[c]); free(fn); } else ret = mpileup(&mplp, argc - optind, argv + optind); if (mplp.rghash) khash_str2int_destroy_free(mplp.rghash); free(mplp.reg); free(mplp.pl_list); if (mplp.fai) fai_destroy(mplp.fai); if (mplp.bed) bed_destroy(mplp.bed); return ret; }
int main_depth(int argc, char *argv[]) { int i, n, tid, beg, end, pos, *n_plp, baseQ = 0, mapQ = 0, min_len = 0, status = EXIT_SUCCESS, nfiles; const bam_pileup1_t **plp; char *reg = 0; // specified region void *bed = 0; // BED data structure char *file_list = NULL, **fn = NULL; bam_hdr_t *h = NULL; // BAM header of the 1st input aux_t **data; bam_mplp_t mplp; // parse the command line while ((n = getopt(argc, argv, "r:b:q:Q:l:f:")) >= 0) { switch (n) { case 'l': min_len = atoi(optarg); break; // minimum query length case 'r': reg = strdup(optarg); break; // parsing a region requires a BAM header case 'b': bed = bed_read(optarg); // BED or position list file can be parsed now if (!bed) { print_error_errno("Could not read file \"%s\"", optarg); return 1; } break; case 'q': baseQ = atoi(optarg); break; // base quality threshold case 'Q': mapQ = atoi(optarg); break; // mapping quality threshold case 'f': file_list = optarg; break; } } if (optind == argc && !file_list) { fprintf(pysamerr, "\n"); fprintf(pysamerr, "Usage: samtools depth [options] in1.bam [in2.bam [...]]\n"); fprintf(pysamerr, "Options:\n"); fprintf(pysamerr, " -b <bed> list of positions or regions\n"); fprintf(pysamerr, " -f <list> list of input BAM filenames, one per line [null]\n"); fprintf(pysamerr, " -l <int> read length threshold (ignore reads shorter than <int>)\n"); fprintf(pysamerr, " -q <int> base quality threshold\n"); fprintf(pysamerr, " -Q <int> mapping quality threshold\n"); fprintf(pysamerr, " -r <chr:from-to> region\n"); fprintf(pysamerr, "\n"); return 1; } // initialize the auxiliary data structures if (file_list) { if ( read_file_list(file_list,&nfiles,&fn) ) return 1; n = nfiles; argv = fn; optind = 0; } else n = argc - optind; // the number of BAMs on the command line data = calloc(n, sizeof(aux_t*)); // data[i] for the i-th input beg = 0; end = 1<<30; // set the default region for (i = 0; i < n; ++i) { data[i] = calloc(1, sizeof(aux_t)); data[i]->fp = sam_open(argv[optind+i], "r"); // open BAM if (data[i]->fp == NULL) { print_error_errno("Could not open \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } if (hts_set_opt(data[i]->fp, CRAM_OPT_REQUIRED_FIELDS, SAM_FLAG | SAM_RNAME | SAM_POS | SAM_MAPQ | SAM_CIGAR | SAM_SEQ)) { fprintf(pysamerr, "Failed to set CRAM_OPT_REQUIRED_FIELDS value\n"); return 1; } if (hts_set_opt(data[i]->fp, CRAM_OPT_DECODE_MD, 0)) { fprintf(pysamerr, "Failed to set CRAM_OPT_DECODE_MD value\n"); return 1; } data[i]->min_mapQ = mapQ; // set the mapQ filter data[i]->min_len = min_len; // set the qlen filter data[i]->hdr = sam_hdr_read(data[i]->fp); // read the BAM header if (reg) { // if a region is specified hts_idx_t *idx = sam_index_load(data[i]->fp, argv[optind+i]); // load the index if (idx == NULL) { print_error("can't load index for \"%s\"", argv[optind+i]); status = EXIT_FAILURE; goto depth_end; } data[i]->iter = sam_itr_querys(idx, data[i]->hdr, reg); // set the iterator hts_idx_destroy(idx); // the index is not needed any more; free the memory if (data[i]->iter == NULL) { print_error("can't parse region \"%s\"", reg); status = EXIT_FAILURE; goto depth_end; } } } h = data[0]->hdr; // easy access to the header of the 1st BAM if (reg) { beg = data[0]->iter->beg; // and to the parsed region coordinates end = data[0]->iter->end; } // the core multi-pileup loop mplp = bam_mplp_init(n, read_bam, (void**)data); // initialization n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads (internal in mplp) while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) { // come to the next covered position if (pos < beg || pos >= end) continue; // out of range; skip if (bed && bed_overlap(bed, h->target_name[tid], pos, pos + 1) == 0) continue; // not in BED; skip fputs(h->target_name[tid], stdout); printf("\t%d", pos+1); // a customized printf() would be faster for (i = 0; i < n; ++i) { // base level filters have to go here int j, m = 0; for (j = 0; j < n_plp[i]; ++j) { const bam_pileup1_t *p = plp[i] + j; // DON'T modfity plp[][] unless you really know if (p->is_del || p->is_refskip) ++m; // having dels or refskips at tid:pos else if (bam_get_qual(p->b)[p->qpos] < baseQ) ++m; // low base quality } printf("\t%d", n_plp[i] - m); // this the depth to output } putchar('\n'); } free(n_plp); free(plp); bam_mplp_destroy(mplp); depth_end: for (i = 0; i < n && data[i]; ++i) { bam_hdr_destroy(data[i]->hdr); if (data[i]->fp) sam_close(data[i]->fp); hts_itr_destroy(data[i]->iter); free(data[i]); } free(data); free(reg); if (bed) bed_destroy(bed); if ( file_list ) { for (i=0; i<n; i++) free(fn[i]); free(fn); } return status; }
int main(int argc, char *argv[]) { int c, i, n, ret, res; int tid, pos, *n_plp; cmdopt_t o; bam_mplp_t mplp; const bam_pileup1_t **plp; aux_t **data; bam_hdr_t *h = 0; sv_t sv1; qual_sum_t qual2; khiter_t k_iter; khash_t(sv_hash) *sv_h = kh_init(sv_hash); khash_t(sv_geno) *geno_h = kh_init(sv_geno); khash_t(colmap) *smp_cols; khash_t(ped) *ped_h = 0; mempool_t *mp; char **samples; o.min_q = 40; o.min_s = 80; o.min_len = 150; o.min_dp = 10; o.bed = 0, o.fnped = 0, o.mi_prob=0.005; while ((c = getopt(argc, argv, "hq:s:l:d:b:p:m:")) >= 0) { if (c == 'h') { usage(stderr, &o); return 0; } else if (c == 'q') o.min_q = atoi(optarg); else if (c == 's') o.min_s = atoi(optarg); else if (c == 'l') o.min_len = atoi(optarg); else if (c == 'd') o.min_dp = atoi(optarg); else if (c == 'p') o.fnped = optarg; else if (c == 'm') o.mi_prob = atof(optarg); else if (c == 'b') { if ((o.bed = bed_read(optarg)) == NULL) { return -1; } } } if (o.mi_prob < 0.0000000000001 || o.mi_prob > 0.1) { fprintf(stderr, "Error. Probability of a mendelian inconsistency must be between 0.1 and 0.0000000000001.\n"); } if (argc - optind < 1) { usage(stderr, &o); return 1; } // Open files and initalize aux data // n = argc - optind; data = calloc(n, sizeof(aux_t*)); samples = (char**)malloc(n * sizeof(char*)); for (i = 0; i < n; ++i) { data[i] = calloc(1, sizeof (aux_t)); data[i]->fp = sam_open(argv[optind + i], "r"); if (!data[i]->fp) { fprintf(stderr, "Input file \"%s\" could not be opened.\n", argv[optind + 1]); return 1; } data[i]->min_mapq = o.min_q; data[i]->min_as = o.min_s; data[i]->min_len = o.min_len; data[i]->hdr = sam_hdr_read(data[i]->fp); if (!data[i]->hdr) { fprintf(stderr, "Could not read the header for input file \"%s\".\n", argv[optind + 1]); return 1; } samples[i] = find_sample(data[i]->hdr, &res); if (!samples[i]) { fprintf(stderr, "Warning. No sample name detected for bam %s. Using filename\n", argv[optind + i]); samples[i] = argv[optind + i]; } } h = data[0]->hdr; smp_cols = map_samples(samples, n); if (o.fnped) { if ((ped_h = read_ped(o.fnped, smp_cols)) == 0) { return -1; } } // The core data processing loop // mplp = bam_mplp_init(n, read_bam, (void**)data); n_plp = calloc(n, sizeof(int)); // n_plp[i] is the number of covering reads from the i-th BAM plp = calloc(n, sizeof(bam_pileup1_t*)); // plp[i] points to the array of covering reads in mplp //quals = (qual_vec_t*)calloc(n, sizeof(qual_vec_t)); mp = mp_init(); while ((ret = bam_mplp_auto(mplp, &tid, &pos, n_plp, plp)) > 0) { // iterate of positions with coverage int n_sv; if (o.bed && tid >= 0 && !bed_overlap(o.bed, h->target_name[tid], pos, pos+1)) continue; n_sv = plp2sv(h, tid, pos, n, n_plp, plp, sv_h); if (n_sv > 1) { fprintf(stderr, "Warning: more than two alleles detected at %s:%d\n", h->target_name[tid], pos); } if (n_sv) { fprintf(stderr, "SV detected at %d:%d\n", tid, pos); for (k_iter = kh_begin(sv_h); k_iter != kh_end(sv_h); ++k_iter) { if (kh_exist(sv_h, k_iter)) { sv1 = kh_value(sv_h, k_iter); fprintf(stderr, "SV tid1=%d, tid2=%d, pos1=%d, pos2=%d, ori1=%d, ori2=%d, allele=%d\n", sv1.tid1, sv1.tid2, sv1.pos1, sv1.pos2, sv1.ori1, sv1.ori2, sv1.allele); } } res = get_qual_data(h, tid, pos, n, n_plp, plp, n_sv + 1, sv_h, geno_h, mp); if (res < 0) { fprintf(stderr, "Error collecting quality data from reads\n"); return -1; } kh_clear(sv_hash, sv_h); } } print_header(h, optind, n, argv); genotype_sv(h, n, geno_h, o.min_dp, ped_h, o.mi_prob); free(n_plp); free(plp); bam_mplp_destroy(mplp); mp_destroy(mp); if (o.bed) bed_destroy(o.bed); for (i = 0; i < n; ++i) { bam_hdr_destroy(data[i]->hdr); sam_close(data[i]->fp); free(data[i]); free(samples[i]); } free(data); free(samples); kh_destroy(sv_hash, sv_h); kh_destroy(sv_geno, geno_h); kh_destroy(colmap, smp_cols); kh_destroy(ped, ped_h); return 0; }
int main_view(int argc, char *argv[]) { int i, c, n_files = 0, out_bcf = 0, clevel = -1, multi_flag = 0, excl = 0, not_vcf = 0, in_mem = 0, u_set = 0; long seekn = -1, n_rec = LONG_MAX, n_read = 0; bgtm_t *bm = 0; bcf1_t *b; htsFile *out = 0; char modew[8], *reg = 0, *site_flt = 0; void *bed = 0; int n_groups = 0; char *gexpr[BGT_MAX_GROUPS], *aexpr = 0, *dbfn = 0, *fmt = 0; bgt_file_t **files = 0; fmf_t *vardb = 0; while ((c = getopt(argc, argv, "ubs:r:l:CMGB:ef:g:a:i:n:SHt:d:")) >= 0) { if (c == 'b') out_bcf = 1; else if (c == 'r') reg = optarg; else if (c == 'l') clevel = atoi(optarg); else if (c == 'e') excl = 1; else if (c == 'u') u_set = 1; else if (c == 'B') bed = bed_read(optarg); else if (c == 'C') multi_flag |= BGT_F_SET_AC; else if (c == 'G') multi_flag |= BGT_F_NO_GT; else if (c == 'S') multi_flag |= BGT_F_NO_GT | BGT_F_CNT_AL, not_vcf = 1; else if (c == 'H') multi_flag |= BGT_F_NO_GT | BGT_F_CNT_HAP, not_vcf = 1; else if (c == 'M') in_mem = 1; else if (c == 'i') seekn = atol(optarg) - 1; else if (c == 'n') n_rec = atol(optarg); else if (c == 'f') site_flt = optarg; else if (c == 't') fmt = optarg, not_vcf = 1; else if (c == 'd') dbfn = optarg; else if (c == 's' && n_groups < BGT_MAX_GROUPS) gexpr[n_groups++] = optarg; else if (c == 'a') aexpr = optarg; } if (n_rec < 0) { fprintf(stderr, "[E::%s] option -n must be at least 0.\n", __func__); return 1; } if (clevel > 9) clevel = 9; if (u_set) clevel = 0, out_bcf = 1; if (n_groups > 1) multi_flag |= BGT_F_SET_AC; if (argc - optind < 1) { fprintf(stderr, "Usage: bgt %s [options] <bgt-prefix> [...]", argv[0]); fputc('\n', stderr); fprintf(stderr, "Options:\n"); fprintf(stderr, " Sample selection:\n"); fprintf(stderr, " -s EXPR samples list (,sample1,sample2 or a file or expr; see Notes below) [all]\n"); fprintf(stderr, " Site selection:\n"); fprintf(stderr, " -r STR region [all]\n"); fprintf(stderr, " -B FILE extract variants overlapping BED FILE []\n"); fprintf(stderr, " -e exclude variants overlapping BED FILE (effective with -B)\n"); fprintf(stderr, " -i INT process from the INT-th record (1-based) []\n"); fprintf(stderr, " -n INT process at most INT records []\n"); fprintf(stderr, " -d FILE variant annotations in FMF (to work with -a) []\n"); fprintf(stderr, " -M load variant annotations in RAM (only with -d)\n"); fprintf(stderr, " -a EXPR alleles list chr:1basedPos:refLen:seq (,allele1,allele2 or a file or expr) []\n"); fprintf(stderr, " -f STR frequency filters []\n"); fprintf(stderr, " VCF output:\n"); fprintf(stderr, " -b BCF output (effective without -S/-H)\n"); fprintf(stderr, " -l INT compression level for BCF [default]\n"); fprintf(stderr, " -u equivalent to -bl0 (overriding -b and -l)\n"); fprintf(stderr, " -G don't output sample genotypes\n"); fprintf(stderr, " -C write AC/AN to the INFO field (auto applied with -f or multipl -s)\n"); fprintf(stderr, " Non-VCF output:\n"); fprintf(stderr, " -S show samples with a set of alleles (with -a)\n"); fprintf(stderr, " -H count of haplotypes with a set of alleles (with -a)\n"); fprintf(stderr, " -t STR comma-delimited list of fields to output. Accepted variables:\n"); fprintf(stderr, " AC, AN, AC#, AN#, CHROM, POS, END, REF, ALT (# for a group number)\n"); fprintf(stderr, "Notes:\n"); fprintf(stderr, " For option -s/-a, EXPR can be one of:\n"); fprintf(stderr, " 1) comma-delimited list following a colon/comma. e.g. -s,NA12878,NA12044\n"); fprintf(stderr, " 2) space-delimited file with the first column giving a sample/allele name. e.g. -s list.txt\n"); fprintf(stderr, " 3) expression if .spl/-d file contains metadata. e.g.: -s\"gender=='M'&&population!='CEU'\"\n"); fprintf(stderr, " If multiple -s is specified, the AC/AN of the first group will be written to VCF INFO AC1/AN1,\n"); fprintf(stderr, " the second to AC2/AN2, etc.\n"); return 1; } if (dbfn && in_mem) vardb = fmf_read(dbfn), dbfn = 0; if ((multi_flag&(BGT_F_CNT_AL|BGT_F_CNT_HAP)) && aexpr == 0) { fprintf(stderr, "[E::%s] -a must be specified when -S/-H is in use.\n", __func__); return 1; } n_files = argc - optind; files = (bgt_file_t**)calloc(n_files, sizeof(bgt_file_t*)); for (i = 0; i < n_files; ++i) { files[i] = bgt_open(argv[optind+i]); if (files[i] == 0) { fprintf(stderr, "[E::%s] failed to open BGT with prefix '%s'\n", __func__, argv[optind+i]); return 1; // FIXME: memory leak } } bm = bgtm_reader_init(n_files, files); bgtm_set_flag(bm, multi_flag); if (site_flt && bgtm_set_flt_site(bm, site_flt) != 0) { fprintf(stderr, "[E::%s] failed to set frequency filters. Syntax error?\n", __func__); return 1; } if (reg && bgtm_set_region(bm, reg) < 0) { fprintf(stderr, "[E::%s] failed to set region. Region format error?\n", __func__); return 1; } if (bed) bgtm_set_bed(bm, bed, excl); if (fmt && bgtm_set_table(bm, fmt) < 0) { fprintf(stderr, "[E::%s] failed to set tabular output.\n", __func__); return 1; } if (seekn > 0) bgtm_set_start(bm, seekn); if (aexpr) { int n_al; n_al = bgtm_set_alleles(bm, aexpr, vardb, dbfn); if (n_al < 0) { fprintf(stderr, "[E::%s] failed to set alleles.\n", __func__); return 1; } else if (n_al == 0) fprintf(stderr, "[W::%s] no alleles selected.\n", __func__); } for (i = 0; i < n_groups; ++i) { if (bgtm_add_group(bm, gexpr[i]) < 0) { fprintf(stderr, "[E::%s] failed to add sample group '%s'.\n", __func__, gexpr[i]); return 1; } } bgtm_prepare(bm); // bgtm_prepare() generates the VCF header if (!not_vcf) { strcpy(modew, "w"); if (out_bcf) strcat(modew, "b"); sprintf(modew + strlen(modew), "%d", clevel); out = hts_open("-", modew, 0); vcf_hdr_write(out, bm->h_out); } b = bcf_init1(); while (bgtm_read(bm, b) >= 0 && n_read < n_rec) { if (out) vcf_write1(out, bm->h_out, b); if (fmt && bm->n_fields > 0) puts(bm->tbl_line.s); ++n_read; } bcf_destroy1(b); if (not_vcf && bm->n_aal > 0) { if (bm->flag & BGT_F_CNT_HAP) { bgt_hapcnt_t *hc; int n_hap; char *s; hc = bgtm_hapcnt(bm, &n_hap); s = bgtm_hapcnt_print_destroy(bm, n_hap, hc); fputs(s, stdout); free(s); } if (bm->flag & BGT_F_CNT_AL) { char *s; if ((s = bgtm_alcnt_print(bm)) != 0) fputs(s, stdout); free(s); } } if (out) hts_close(out); bgtm_reader_destroy(bm); if (bed) bed_destroy(bed); for (i = 0; i < n_files; ++i) bgt_close(files[i]); free(files); if (vardb) fmf_destroy(vardb); return 0; }