static int mplp_func(void *data, bam1_t *b) { extern int bam_realn(bam1_t *b, const char *ref); extern int bam_prob_realn_core(bam1_t *b, const char *ref, int ref_len, int flag); extern int bam_cap_mapQ(bam1_t *b, char *ref, int ref_len, int thres); char *ref; mplp_aux_t *ma = (mplp_aux_t*)data; int ret, skip = 0, ref_len; do { int has_ref; ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); if (ret < 0) break; // The 'B' cigar operation is not part of the specification, considering as obsolete. // bam_remove_B(b); if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) { // exclude unmapped reads skip = 1; continue; } if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) { skip = 1; continue; } if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) { skip = 1; continue; } if (ma->conf->bed) { // test overlap skip = !bed_overlap(ma->conf->bed, ma->h->target_name[b->core.tid], b->core.pos, bam_endpos(b)); if (skip) continue; } if (ma->conf->rghash) { // exclude read groups uint8_t *rg = bam_aux_get(b, "RG"); skip = (rg && khash_str2int_get(ma->conf->rghash, (const char*)(rg+1), NULL)==0); if (skip) continue; } if (ma->conf->flag & MPLP_ILLUMINA13) { int i; uint8_t *qual = bam_get_qual(b); for (i = 0; i < b->core.l_qseq; ++i) qual[i] = qual[i] > 31? qual[i] - 31 : 0; } if (ma->conf->fai && b->core.tid >= 0) { has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", __func__, b->core.pos, ref_len, b->core.tid); skip = 1; continue; } } else { has_ref = 0; } skip = 0; if (has_ref && (ma->conf->flag&MPLP_REALN)) bam_prob_realn_core(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); if (has_ref && ma->conf->capQ_thres > 10) { int q = bam_cap_mapQ(b, ref, ref_len, ma->conf->capQ_thres); if (q < 0) skip = 1; else if (b->core.qual > q) b->core.qual = q; } if (b->core.qual < ma->conf->min_mq) skip = 1; else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) skip = 1; } while (skip); return ret; }
// This function reads a BAM alignment from one BAM file. static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); if ((int)b->core.qual < aux->min_mapQ) b->core.flag |= BAM_FUNMAP; return ret; }
int samfetch(samfile_t *fp, const bam_index_t *idx, int tid, int beg, int end, void *data, bam_fetch_f func) { bam1_t *b = bam_init1(); hts_itr_t *iter = sam_itr_queryi(idx, tid, beg, end); int ret; while ((ret = sam_itr_next(fp->file, iter, b)) >= 0) func(b, data); hts_itr_destroy(iter); bam_destroy1(b); return (ret == -1)? 0 : ret; }
static int read_bam(void *data, bam1_t *b) { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret; while (1) { ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->header, b); if ( ret<0 ) break; if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; if ( (int)b->core.qual < aux->min_mapQ ) continue; break; } return ret; }
// This function reads a BAM alignment from one BAM file. static int read_bam(void *data, bam1_t *b) // read level filters better go here to avoid pileup { aux_t *aux = (aux_t*)data; // data in fact is a pointer to an auxiliary structure int ret; while (1) { ret = aux->iter? sam_itr_next(aux->fp, aux->iter, b) : sam_read1(aux->fp, aux->hdr, b); if ( ret<0 ) break; if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue; if ( (int)b->core.qual < aux->min_mapQ ) continue; if ( aux->min_len && bam_cigar2qlen(b->core.n_cigar, bam_get_cigar(b)) < aux->min_len ) continue; break; } return ret; }
bool bam_streamer:: next() { if (nullptr == _bfp) return false; int ret; if (nullptr == _hitr) { ret = samread(_bfp, _brec._bp); } else { ret = sam_itr_next(_bfp->file, _hitr, _brec._bp); } _is_record_set=(ret >= 0); if (_is_record_set) _record_no++; return _is_record_set; }
BM_mappedRead * extractReads(char * bamFile, char ** contigs, int numContigs, uint16_t * groups, char * prettyName, int headersOnly, int minMapQual, int maxMisMatches, int ignoreSuppAlignments, int ignoreSecondaryAlignments) { //----- // code uses the pattern outlined in samtools view (sam_view.c) // thanks lh3! // int i = 0; int result = -1; int hh = 0; int supp_check = 0x0; // include supp mappings if (ignoreSuppAlignments) { supp_check |= BAM_FSUPPLEMENTARY; } if (ignoreSecondaryAlignments) { supp_check |= BAM_FSECONDARY; } // we need to let the users know if their pairings // will be corrupted int p_corrupt = 0; // helper variables samFile *in = 0; bam_hdr_t *header = NULL; bam1_t *b = bam_init1(); BM_mappedRead * root = 0; BM_mappedRead * prev = 0; // open file handlers if ((in = sam_open(bamFile, "r")) == 0) { fprintf(stderr, "ERROR: Failed to open \"%s\" for reading.\n", bamFile); } else { // retrieve the header if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "ERROR: Failed to read the header from \"%s\".\n", bamFile); } else { // check the index is intact hts_idx_t *idx = sam_index_load(in, bamFile); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "ERROR: Random retrieval only works "\ "for indexed files.\n"); } else { cfuhash_table_t *pair_buffer = \ cfuhash_new_with_initial_size(1000000); cfuhash_set_flag(pair_buffer, CFUHASH_FROZEN_UNTIL_GROWS); for (hh = 0; hh < numContigs; ++hh) { // parse a region in the format like `chr2:100-200' hts_itr_t *iter = sam_itr_querys(idx, header, contigs[hh]); if (iter == NULL) { // reference name is not found fprintf(stderr, "WARNING: Could not find contig: "\ "[%s] in BAM: [%s].\n", contigs[hh], bamFile); } // fetch alignments int line = 0; while ((result = sam_itr_next(in, iter, b)) >= 0) { bam1_core_t core = b->core; line += 1; // only high quality?, primary? mappings if ( core.qual < minMapQual) continue; if ((core.flag & supp_check) != 0) continue; if(bam_aux2i(bam_aux_get(b, "NM")) > maxMisMatches) { continue; } char * seqId = bam_get_qname(b); char * seq = 0; char * qual = 0; int qual_len = 0; int seq_len = 0; // get sequence and quality if(0 == headersOnly) { // no point allocating unused space seq = calloc(core.l_qseq+1, sizeof(char)); qual = calloc(core.l_qseq+1, sizeof(char)); uint8_t *s = bam_get_seq(b); if (core.flag&BAM_FREVERSE) { // reverse the read int r = 0; for (i = core.l_qseq-1; i >=0 ; --i) { seq[r]="=TGKCYSBAWRDMHVN"[bam_seqi(s, i)]; ++r; } } else { for (i = 0; i < core.l_qseq; ++i) { seq[i]="=ACMGRSVTWYHKDBN"[bam_seqi(s, i)]; } } seq_len = core.l_qseq; s = bam_get_qual(b); if (s[0] != 0xff) { qual_len = core.l_qseq; for (i = 0; i < core.l_qseq; ++i) { qual[i] = (char)(s[i] + 33); } } else if (qual != 0) { free(qual); qual = 0; } } // work out pairing information uint8_t rpi = RPI_ERROR; if (core.flag&BAM_FPAIRED) { if(core.flag&BAM_FMUNMAP) { if (core.flag&BAM_FREAD1) { rpi = RPI_SNGL_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SNGL_SEC; } } else { if (core.flag&BAM_FREAD1) { rpi = RPI_FIR; } else if (core.flag&BAM_FREAD2) { rpi = RPI_SEC; } } } else { rpi = RPI_SNGL; } // make the funky Id #define MAX_SEQ_ID_LEN 80 char * seq_id = calloc(MAX_SEQ_ID_LEN, sizeof(char)); // allocate the string to the buffer but check to // ensure we're not cutting anything off int id_len = snprintf(seq_id, MAX_SEQ_ID_LEN, "b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); if(id_len >= MAX_SEQ_ID_LEN) { seq_id = calloc(id_len+1, sizeof(char)); snprintf(seq_id, id_len+1, // don't forget the NULL! "b_%s;c_%s;r_%s", prettyName, contigs[hh], seqId); } // make the mapped read struct prev = makeMappedRead(seq_id, seq, qual, id_len, seq_len, qual_len, rpi, groups[hh], prev); if (0 == root) { root = prev; } if(rpi == RPI_SNGL || \ rpi == RPI_SNGL_FIR || \ rpi == RPI_SNGL_SEC) { // we can just add away // indicate singleton reads by pointing the // partner pointer to itself prev->partnerRead = prev; } else { // RPI_FIR or RPI_SEC // work out pairing information using the hash // we append a 1 or 2 to the end so that // we don't accidentally pair 1's with 1's etc. char * stripped_result; if(rpi == RPI_FIR) { stripped_result = \ pairStripper(seqId, core.l_qname-1, '2'); } else { stripped_result = \ pairStripper(seqId, core.l_qname-1, '1'); } char * stripped = seqId; if(stripped_result) stripped = stripped_result; //fprintf(stdout, "SEARCH %s\n", stripped); // now stripped always holds a stripped value // see if it is in the hash already BM_mappedRead * stored_MR = \ cfuhash_get(pair_buffer, stripped); if (0 != stored_MR) { // exists in the hash -> Add the pair info if(rpi == RPI_FIR) { prev->partnerRead = stored_MR; } else { stored_MR->partnerRead = prev; } // delete the entry from the hash cfuhash_delete(pair_buffer, stripped); } else { // we should put it in the hash // make sure to change it into something // we will find next time if(rpi == RPI_FIR) stripped[strlen(stripped)-1] = '1'; else stripped[strlen(stripped)-1] = '2'; // check to make sure we're not overwriting // anything important. cfuhash overwrites // duplicate entries, so we need to grab // it and put it to "SNGL_XXX" before we // lose the pointer BM_mappedRead * OWMMR = \ cfuhash_put(pair_buffer, stripped, prev); if(OWMMR) { if(OWMMR->rpi == RPI_FIR) OWMMR->rpi = RPI_SNGL_FIR; else OWMMR->rpi = RPI_SNGL_SEC; OWMMR->partnerRead = OWMMR; printPairCorruptionWarning(p_corrupt); p_corrupt = 1; } } if(stripped_result != 0) { // free this! free(stripped_result); stripped_result = 0; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "ERROR: retrieval of reads from "\ "contig: \"%s\" failed due to "\ "truncated file or corrupt BAM index "\ "file\n", header->target_name[hh]); break; } } // any entries left in the hash are pairs whose mates did // not meet quality standards size_t key_size = 0; char * key; BM_mappedRead * LOMMR; size_t pr_size = 1; if(cfuhash_each_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)) { do { // get the mapped read // update it's pairing so we know it's really single if (LOMMR->rpi == RPI_FIR) LOMMR->rpi = RPI_SNGL_FIR; else if (LOMMR->rpi == RPI_SEC) LOMMR->rpi = RPI_SNGL_SEC; // indicate singleton reads by pointing the // partner pointer to itself LOMMR->partnerRead = LOMMR; } while(cfuhash_next_data(pair_buffer, (void**)&key, &key_size, (void**)&LOMMR, &pr_size)); } cfuhash_clear(pair_buffer); cfuhash_destroy(pair_buffer); } hts_idx_destroy(idx); // destroy the BAM index } } // always do this if (in) sam_close(in); bam_destroy1(b); if ( header ) bam_hdr_destroy(header); return root; }
int main(int argc, char *argv[]) { samFile *in; char *fn_ref = 0; int flag = 0, c, clevel = -1, ignore_sam_err = 0; char moder[8]; bam_hdr_t *h; bam1_t *b; htsFile *out; char modew[8]; int r = 0, exit_code = 0; hts_opt *in_opts = NULL, *out_opts = NULL, *last = NULL; int nreads = 0; int benchmark = 0; while ((c = getopt(argc, argv, "IbDCSl:t:i:o:N:B")) >= 0) { switch (c) { case 'S': flag |= 1; break; case 'b': flag |= 2; break; case 'D': flag |= 4; break; case 'C': flag |= 8; break; case 'B': benchmark = 1; break; case 'l': clevel = atoi(optarg); flag |= 2; break; case 't': fn_ref = optarg; break; case 'I': ignore_sam_err = 1; break; case 'i': if (add_option(&in_opts, optarg)) return 1; break; case 'o': if (add_option(&out_opts, optarg)) return 1; break; case 'N': nreads = atoi(optarg); } } if (argc == optind) { fprintf(stderr, "Usage: samview [-bSCSIB] [-N num_reads] [-l level] [-o option=value] <in.bam>|<in.sam>|<in.cram> [region]\n"); return 1; } strcpy(moder, "r"); if (flag&4) strcat(moder, "c"); else if ((flag&1) == 0) strcat(moder, "b"); in = sam_open(argv[optind], moder); if (in == NULL) { fprintf(stderr, "Error opening \"%s\"\n", argv[optind]); return EXIT_FAILURE; } h = sam_hdr_read(in); h->ignore_sam_err = ignore_sam_err; b = bam_init1(); strcpy(modew, "w"); if (clevel >= 0 && clevel <= 9) sprintf(modew + 1, "%d", clevel); if (flag&8) strcat(modew, "c"); else if (flag&2) strcat(modew, "b"); out = hts_open("-", modew); if (out == NULL) { fprintf(stderr, "Error opening standard output\n"); return EXIT_FAILURE; } /* CRAM output */ if (flag & 8) { int ret; // Parse input header and use for CRAM output out->fp.cram->header = sam_hdr_parse_(h->text, h->l_text); // Create CRAM references arrays if (fn_ref) ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, fn_ref); else // Attempt to fill out a cram->refs[] array from @SQ headers ret = cram_set_option(out->fp.cram, CRAM_OPT_REFERENCE, NULL); if (ret != 0) return EXIT_FAILURE; } // Process any options; currently cram only. for (; in_opts; in_opts = (last=in_opts)->next, free(last)) { hts_set_opt(in, in_opts->opt, in_opts->val); if (in_opts->opt == CRAM_OPT_REFERENCE) if (hts_set_opt(out, in_opts->opt, in_opts->val) != 0) return EXIT_FAILURE; } for (; out_opts; out_opts = (last=out_opts)->next, free(last)) if (hts_set_opt(out, out_opts->opt, out_opts->val) != 0) return EXIT_FAILURE; if (!benchmark) sam_hdr_write(out, h); if (optind + 1 < argc && !(flag&1)) { // BAM input and has a region int i; hts_idx_t *idx; if ((idx = sam_index_load(in, argv[optind])) == 0) { fprintf(stderr, "[E::%s] fail to load the BAM index\n", __func__); return 1; } for (i = optind + 1; i < argc; ++i) { hts_itr_t *iter; if ((iter = sam_itr_querys(idx, h, argv[i])) == 0) { fprintf(stderr, "[E::%s] fail to parse region '%s'\n", __func__, argv[i]); continue; } while ((r = sam_itr_next(in, iter, b)) >= 0) { if (!benchmark && sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } if (nreads && --nreads == 0) break; } hts_itr_destroy(iter); } hts_idx_destroy(idx); } else while ((r = sam_read1(in, h, b)) >= 0) { if (!benchmark && sam_write1(out, h, b) < 0) { fprintf(stderr, "Error writing output.\n"); exit_code = 1; break; } if (nreads && --nreads == 0) break; } if (r < -1) { fprintf(stderr, "Error parsing input.\n"); exit_code = 1; } r = sam_close(out); if (r < 0) { fprintf(stderr, "Error closing output.\n"); exit_code = 1; } bam_destroy1(b); bam_hdr_destroy(h); r = sam_close(in); if (r < 0) { fprintf(stderr, "Error closing input.\n"); exit_code = 1; } return exit_code; }
static int mplp_func(void *data, bam1_t *b) { char *ref; mplp_aux_t *ma = (mplp_aux_t*)data; int ret, ref_len; while (1) { int has_ref; ret = ma->iter? sam_itr_next(ma->fp, ma->iter, b) : sam_read1(ma->fp, ma->h, b); if (ret < 0) break; // The 'B' cigar operation is not part of the specification, considering as obsolete. // bam_remove_B(b); if (b->core.tid < 0 || (b->core.flag&BAM_FUNMAP)) continue; // exclude unmapped reads if (ma->conf->rflag_require && !(ma->conf->rflag_require&b->core.flag)) continue; if (ma->conf->rflag_filter && ma->conf->rflag_filter&b->core.flag) continue; if (ma->conf->bed) { // test overlap regitr_t *itr = ma->conf->bed_itr; int beg = b->core.pos, end = bam_endpos(b)-1; int overlap = regidx_overlap(ma->conf->bed, ma->h->target_name[b->core.tid],beg,end, itr); if ( !ma->conf->bed_logic && !overlap ) { // exclude only reads which are fully contained in the region while ( regitr_overlap(itr) ) { if ( beg < itr->beg ) { overlap = 1; break; } if ( end > itr->end ) { overlap = 1; break; } } } if ( !overlap ) continue; } if ( bam_smpl_get_sample_id(ma->conf->bsmpl,ma->bam_id,b)<0 ) continue; if (ma->conf->flag & MPLP_ILLUMINA13) { int i; uint8_t *qual = bam_get_qual(b); for (i = 0; i < b->core.l_qseq; ++i) qual[i] = qual[i] > 31? qual[i] - 31 : 0; } if (ma->conf->fai && b->core.tid >= 0) { has_ref = mplp_get_ref(ma, b->core.tid, &ref, &ref_len); if (has_ref && ref_len <= b->core.pos) { // exclude reads outside of the reference sequence fprintf(stderr,"[%s] Skipping because %d is outside of %d [ref:%d]\n", __func__, b->core.pos, ref_len, b->core.tid); continue; } } else { has_ref = 0; } if (has_ref && (ma->conf->flag&MPLP_REALN)) sam_prob_realn(b, ref, ref_len, (ma->conf->flag & MPLP_REDO_BAQ)? 7 : 3); if (has_ref && ma->conf->capQ_thres > 10) { int q = sam_cap_mapq(b, ref, ref_len, ma->conf->capQ_thres); if (q < 0) continue; // skip else if (b->core.qual > q) b->core.qual = q; } if (b->core.qual < ma->conf->min_mq) continue; else if ((ma->conf->flag&MPLP_NO_ORPHAN) && (b->core.flag&BAM_FPAIRED) && !(b->core.flag&BAM_FPROPER_PAIR)) continue; return ret; }; return ret; }
int main_samview(int argc, char *argv[]) { int index; for(index = 0; index < argc; index++) { printf("The %d is %s\n",index,argv[index]); } getchar();return 0; int c, is_header = 0, is_header_only = 0, ret = 0, compress_level = -1, is_count = 0; int is_long_help = 0, n_threads = 0; int64_t count = 0; samFile *in = 0, *out = 0, *un_out=0; bam_hdr_t *header = NULL; char out_mode[5], out_un_mode[5], *out_format = ""; char *fn_in = 0, *fn_out = 0, *fn_list = 0, *q, *fn_un_out = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; samview_settings_t settings = { .rghash = NULL, .min_mapQ = 0, .flag_on = 0, .flag_off = 0, .min_qlen = 0, .remove_B = 0, .subsam_seed = 0, .subsam_frac = -1., .library = NULL, .bed = NULL, }; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, 'O', 0, 'T'), { "threads", required_argument, NULL, '@' }, { NULL, 0, NULL, 0 } }; /* parse command-line options */ strcpy(out_mode, "w"); strcpy(out_un_mode, "w"); while ((c = getopt_long(argc, argv, "SbBcCt:h1Ho:O:q:f:F:ul:r:?T:R:L:s:@:m:x:U:", lopts, NULL)) >= 0) { switch (c) { case 's': if ((settings.subsam_seed = strtol(optarg, &q, 10)) != 0) { srand(settings.subsam_seed); settings.subsam_seed = rand(); } settings.subsam_frac = strtod(q, &q); break; case 'm': settings.min_qlen = atoi(optarg); break; case 'c': is_count = 1; break; case 'S': break; case 'b': out_format = "b"; break; case 'C': out_format = "c"; break; case 't': fn_list = strdup(optarg); break; case 'h': is_header = 1; break; case 'H': is_header_only = 1; break; case 'o': fn_out = strdup(optarg); break; case 'U': fn_un_out = strdup(optarg); break; case 'f': settings.flag_on |= strtol(optarg, 0, 0); break; case 'F': settings.flag_off |= strtol(optarg, 0, 0); break; case 'q': settings.min_mapQ = atoi(optarg); break; case 'u': compress_level = 0; break; case '1': compress_level = 1; break; case 'l': settings.library = strdup(optarg); break; case 'L': if ((settings.bed = bed_read(optarg)) == NULL) { print_error_errno("view", "Could not read file \"%s\"", optarg); ret = 1; goto view_end; } break; case 'r': if (add_read_group_single("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; case 'R': if (add_read_groups_file("view", &settings, optarg) != 0) { ret = 1; goto view_end; } break; /* REMOVED as htslib doesn't support this //case 'x': out_format = "x"; break; //case 'X': out_format = "X"; break; */ case '?': is_long_help = 1; break; case 'B': settings.remove_B = 1; break; case '@': n_threads = strtol(optarg, 0, 0); break; case 'x': { if (strlen(optarg) != 2) { fprintf(stderr, "main_samview: Error parsing -x auxiliary tags should be exactly two characters long.\n"); return usage(stderr, EXIT_FAILURE, is_long_help); } settings.remove_aux = (char**)realloc(settings.remove_aux, sizeof(char*) * (++settings.remove_aux_len)); settings.remove_aux[settings.remove_aux_len-1] = optarg; } break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) != 0) return usage(stderr, EXIT_FAILURE, is_long_help); break; } } if (compress_level >= 0 && !*out_format) out_format = "b"; if (is_header_only) is_header = 1; // File format auto-detection first if (fn_out) sam_open_mode(out_mode+1, fn_out, NULL); if (fn_un_out) sam_open_mode(out_un_mode+1, fn_un_out, NULL); // Overridden by manual -b, -C if (*out_format) out_mode[1] = out_un_mode[1] = *out_format; out_mode[2] = out_un_mode[2] = '\0'; // out_(un_)mode now 1 or 2 bytes long, followed by nul. if (compress_level >= 0) { char tmp[2]; tmp[0] = compress_level + '0'; tmp[1] = '\0'; strcat(out_mode, tmp); strcat(out_un_mode, tmp); } if (argc == optind && isatty(STDIN_FILENO)) return usage(stdout, EXIT_SUCCESS, is_long_help); // potential memory leak... fn_in = (optind < argc)? argv[optind] : "-"; // generate the fn_list if necessary if (fn_list == 0 && ga.reference) fn_list = samfaipath(ga.reference); // open file handlers if ((in = sam_open_format(fn_in, "r", &ga.in)) == 0) { print_error_errno("view", "failed to open \"%s\" for reading", fn_in); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(in, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if ((header = sam_hdr_read(in)) == 0) { fprintf(stderr, "[main_samview] fail to read the header from \"%s\".\n", fn_in); ret = 1; goto view_end; } if (settings.rghash) { // FIXME: I do not know what "bam_header_t::n_text" is for... char *tmp; int l; tmp = drop_rg(header->text, settings.rghash, &l); free(header->text); header->text = tmp; header->l_text = l; } if (!is_count) { if ((out = sam_open_format(fn_out? fn_out : "-", out_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_out? fn_out : "standard output"); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_mode[1] == 'b' || out_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } if (fn_un_out) { if ((un_out = sam_open_format(fn_un_out, out_un_mode, &ga.out)) == 0) { print_error_errno("view", "failed to open \"%s\" for writing", fn_un_out); ret = 1; goto view_end; } if (fn_list) { if (hts_set_fai_filename(un_out, fn_list) != 0) { fprintf(stderr, "[main_samview] failed to use reference \"%s\".\n", fn_list); ret = 1; goto view_end; } } if (*out_format || is_header || out_un_mode[1] == 'b' || out_un_mode[1] == 'c' || (ga.out.format != sam && ga.out.format != unknown_format)) { if (sam_hdr_write(un_out, header) != 0) { fprintf(stderr, "[main_samview] failed to write the SAM header\n"); ret = 1; goto view_end; } } } } if (n_threads > 1) { if (out) hts_set_threads(out, n_threads); } if (is_header_only) goto view_end; // no need to print alignments if (optind + 1 >= argc) { // convert/print the entire file bam1_t *b = bam_init1(); int r; while ((r = sam_read1(in, header, b)) >= 0) { // read one alignment from `in' if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } if (r < -1) { fprintf(stderr, "[main_samview] truncated file.\n"); ret = 1; } bam_destroy1(b); } else { // retrieve alignments in specified regions int i; bam1_t *b; hts_idx_t *idx = sam_index_load(in, fn_in); // load index if (idx == 0) { // index is unavailable fprintf(stderr, "[main_samview] random alignment retrieval only works for indexed BAM or CRAM files.\n"); ret = 1; goto view_end; } b = bam_init1(); for (i = optind + 1; i < argc; ++i) { int result; hts_itr_t *iter = sam_itr_querys(idx, header, argv[i]); // parse a region in the format like `chr2:100-200' if (iter == NULL) { // region invalid or reference name not found int beg, end; if (hts_parse_reg(argv[i], &beg, &end)) fprintf(stderr, "[main_samview] region \"%s\" specifies an unknown reference name. Continue anyway.\n", argv[i]); else fprintf(stderr, "[main_samview] region \"%s\" could not be parsed. Continue anyway.\n", argv[i]); continue; } // fetch alignments while ((result = sam_itr_next(in, iter, b)) >= 0) { if (!process_aln(header, b, &settings)) { if (!is_count) { if (check_sam_write1(out, header, b, fn_out, &ret) < 0) break; } count++; } else { if (un_out) { if (check_sam_write1(un_out, header, b, fn_un_out, &ret) < 0) break; } } } hts_itr_destroy(iter); if (result < -1) { fprintf(stderr, "[main_samview] retrieval of region \"%s\" failed due to truncated file or corrupt BAM index file\n", argv[i]); ret = 1; break; } } bam_destroy1(b); hts_idx_destroy(idx); // destroy the BAM index } view_end: if (is_count && ret == 0) printf("%" PRId64 "\n", count); // close files, free and return if (in) check_sam_close("view", in, fn_in, "standard input", &ret); if (out) check_sam_close("view", out, fn_out, "standard output", &ret); if (un_out) check_sam_close("view", un_out, fn_un_out, "file", &ret); free(fn_list); free(fn_out); free(settings.library); free(fn_un_out); sam_global_args_free(&ga); if ( header ) bam_hdr_destroy(header); if (settings.bed) bed_destroy(settings.bed); if (settings.rghash) { khint_t k; for (k = 0; k < kh_end(settings.rghash); ++k) if (kh_exist(settings.rghash, k)) free((char*)kh_key(settings.rghash, k)); kh_destroy(rg, settings.rghash); } if (settings.remove_aux_len) { free(settings.remove_aux); } return ret; } static int usage(FILE *fp, int exit_status, int is_long_help) { fprintf(fp, "\n" "Usage: samtools view [options] <in.bam>|<in.sam>|<in.cram> [region ...]\n" "\n" "Options:\n" // output options " -b output BAM\n" " -C output CRAM (requires -T)\n" " -1 use fast BAM compression (implies -b)\n" " -u uncompressed BAM output (implies -b)\n" " -h include header in SAM output\n" " -H print SAM header only (no alignments)\n" " -c print only the count of matching records\n" " -o FILE output file name [stdout]\n" " -U FILE output reads not selected by filters to FILE [null]\n" // extra input " -t FILE FILE listing reference names and lengths (see long help) [null]\n" // read filters " -L FILE only include reads overlapping this BED FILE [null]\n" " -r STR only include reads in read group STR [null]\n" " -R FILE only include reads with read group listed in FILE [null]\n" " -q INT only include reads with mapping quality >= INT [0]\n" " -l STR only include reads in library STR [null]\n" " -m INT only include reads with number of CIGAR operations consuming\n" " query sequence >= INT [0]\n" " -f INT only include reads with all bits set in INT set in FLAG [0]\n" " -F INT only include reads with none of the bits set in INT set in FLAG [0]\n" // read processing " -x STR read tag to strip (repeatable) [null]\n" " -B collapse the backward CIGAR operation\n" " -s FLOAT integer part sets seed of random number generator [0];\n" " rest sets fraction of templates to subsample [no subsampling]\n" // general options " -@, --threads INT\n" " number of BAM/CRAM compression threads [0]\n" " -? print long help, including note about region specification\n" " -S ignored (input format is auto-detected)\n"); sam_global_opt_help(fp, "-.O.T"); fprintf(fp, "\n"); if (is_long_help) fprintf(fp, "Notes:\n" "\n" "1. This command now auto-detects the input format (BAM/CRAM/SAM).\n" " Further control over the CRAM format can be specified by using the\n" " --output-fmt-option, e.g. to specify the number of sequences per slice\n" " and to use avoid reference based compression:\n" "\n" "\tsamtools view -C --output-fmt-option seqs_per_slice=5000 \\\n" "\t --output-fmt-option no_ref -o out.cram in.bam\n" "\n" " Options can also be specified as a comma separated list within the\n" " --output-fmt value too. For example this is equivalent to the above\n" "\n" "\tsamtools view --output-fmt cram,seqs_per_slice=5000,no_ref \\\n" "\t -o out.cram in.bam\n" "\n" "2. The file supplied with `-t' is SPACE/TAB delimited with the first\n" " two fields of each line consisting of the reference name and the\n" " corresponding sequence length. The `.fai' file generated by \n" " `samtools faidx' is suitable for use as this file. This may be an\n" " empty file if reads are unaligned.\n" "\n" "3. SAM->BAM conversion: samtools view -bT ref.fa in.sam.gz\n" "\n" "4. BAM->SAM conversion: samtools view -h in.bam\n" "\n" "5. A region should be presented in one of the following formats:\n" " `chr1', `chr2:1,000' and `chr3:1000-2,000'. When a region is\n" " specified, the input alignment file must be a sorted and indexed\n" " alignment (BAM/CRAM) file.\n" "\n" "6. Option `-u' is preferred over `-b' when the output is piped to\n" " another samtools command.\n" "\n"); return exit_status; }
/*! @abstract Merge multiple sorted BAM. @param is_by_qname whether to sort by query name @param out output BAM file name @param mode sam_open() mode to be used to create the final output file (overrides level settings from UNCOMP and LEVEL1 flags) @param headers name of SAM file from which to copy '@' header lines, or NULL to copy them from the first file to be merged @param n number of files to be merged @param fn names of files to be merged @param flag flags that control how the merge is undertaken @param reg region to merge @param n_threads number of threads to use (passed to htslib) @discussion Padding information may NOT correctly maintained. This function is NOT thread safe. */ int bam_merge_core2(int by_qname, const char *out, const char *mode, const char *headers, int n, char * const *fn, int flag, const char *reg, int n_threads) { samFile *fpout, **fp; heap1_t *heap; bam_hdr_t *hout = NULL; int i, j, *RG_len = NULL; uint64_t idx = 0; char **RG = NULL; hts_itr_t **iter = NULL; bam_hdr_t **hdr = NULL; trans_tbl_t *translation_tbl = NULL; // Is there a specified pre-prepared header to use for output? if (headers) { samFile* fpheaders = sam_open(headers, "r"); if (fpheaders == NULL) { const char *message = strerror(errno); fprintf(pysamerr, "[bam_merge_core] cannot open '%s': %s\n", headers, message); return -1; } hout = sam_hdr_read(fpheaders); sam_close(fpheaders); } g_is_by_qname = by_qname; fp = (samFile**)calloc(n, sizeof(samFile*)); heap = (heap1_t*)calloc(n, sizeof(heap1_t)); iter = (hts_itr_t**)calloc(n, sizeof(hts_itr_t*)); hdr = (bam_hdr_t**)calloc(n, sizeof(bam_hdr_t*)); translation_tbl = (trans_tbl_t*)calloc(n, sizeof(trans_tbl_t)); // prepare RG tag from file names if (flag & MERGE_RG) { RG = (char**)calloc(n, sizeof(char*)); RG_len = (int*)calloc(n, sizeof(int)); for (i = 0; i != n; ++i) { int l = strlen(fn[i]); const char *s = fn[i]; if (l > 4 && strcmp(s + l - 4, ".bam") == 0) l -= 4; for (j = l - 1; j >= 0; --j) if (s[j] == '/') break; ++j; l -= j; RG[i] = (char*)calloc(l + 1, 1); RG_len[i] = l; strncpy(RG[i], s + j, l); } } // open and read the header from each file for (i = 0; i < n; ++i) { bam_hdr_t *hin; fp[i] = sam_open(fn[i], "r"); if (fp[i] == NULL) { int j; fprintf(pysamerr, "[bam_merge_core] fail to open file %s\n", fn[i]); for (j = 0; j < i; ++j) sam_close(fp[j]); free(fp); free(heap); // FIXME: possible memory leak return -1; } hin = sam_hdr_read(fp[i]); if (hout) trans_tbl_init(hout, hin, translation_tbl+i, flag & MERGE_COMBINE_RG, flag & MERGE_COMBINE_PG); else { // As yet, no headers to merge into... hout = bam_hdr_dup(hin); // ...so no need to translate header into itself trans_tbl_init(hout, hin, translation_tbl+i, true, true); } // TODO sam_itr_next() doesn't yet work for SAM files, // so for those keep the headers around for use with sam_read1() if (hts_get_format(fp[i])->format == sam) hdr[i] = hin; else { bam_hdr_destroy(hin); hdr[i] = NULL; } if ((translation_tbl+i)->lost_coord_sort && !by_qname) { fprintf(pysamerr, "[bam_merge_core] Order of targets in file %s caused coordinate sort to be lost\n", fn[i]); } } // Transform the header into standard form pretty_header(&hout->text,hout->l_text); // If we're only merging a specified region move our iters to start at that point if (reg) { int* rtrans = rtrans_build(n, hout->n_targets, translation_tbl); int tid, beg, end; const char *name_lim = hts_parse_reg(reg, &beg, &end); char *name = malloc(name_lim - reg + 1); memcpy(name, reg, name_lim - reg); name[name_lim - reg] = '\0'; tid = bam_name2id(hout, name); free(name); if (tid < 0) { fprintf(pysamerr, "[%s] Malformated region string or undefined reference name\n", __func__); return -1; } for (i = 0; i < n; ++i) { hts_idx_t *idx = sam_index_load(fp[i], fn[i]); // (rtrans[i*n+tid]) Look up what hout tid translates to in input tid space int mapped_tid = rtrans[i*hout->n_targets+tid]; if (mapped_tid != INT32_MIN) { iter[i] = sam_itr_queryi(idx, mapped_tid, beg, end); } else { iter[i] = sam_itr_queryi(idx, HTS_IDX_NONE, 0, 0); } hts_idx_destroy(idx); if (iter[i] == NULL) break; } free(rtrans); } else { for (i = 0; i < n; ++i) { if (hdr[i] == NULL) { iter[i] = sam_itr_queryi(NULL, HTS_IDX_REST, 0, 0); if (iter[i] == NULL) break; } else iter[i] = NULL; } } if (i < n) { fprintf(pysamerr, "[%s] Memory allocation failed\n", __func__); return -1; } // Load the first read from each file into the heap for (i = 0; i < n; ++i) { heap1_t *h = heap + i; h->i = i; h->b = bam_init1(); if ((iter[i]? sam_itr_next(fp[i], iter[i], h->b) : sam_read1(fp[i], hdr[i], h->b)) >= 0) { bam_translate(h->b, translation_tbl + i); h->pos = ((uint64_t)h->b->core.tid<<32) | (uint32_t)((int32_t)h->b->core.pos+1)<<1 | bam_is_rev(h->b); h->idx = idx++; } else { h->pos = HEAP_EMPTY; bam_destroy1(h->b); h->b = NULL; } } // Open output file and write header if ((fpout = sam_open(out, mode)) == 0) { fprintf(pysamerr, "[%s] fail to create the output file.\n", __func__); return -1; } sam_hdr_write(fpout, hout); if (!(flag & MERGE_UNCOMP)) hts_set_threads(fpout, n_threads); // Begin the actual merge ks_heapmake(heap, n, heap); while (heap->pos != HEAP_EMPTY) { bam1_t *b = heap->b; if (flag & MERGE_RG) { uint8_t *rg = bam_aux_get(b, "RG"); if (rg) bam_aux_del(b, rg); bam_aux_append(b, "RG", 'Z', RG_len[heap->i] + 1, (uint8_t*)RG[heap->i]); } sam_write1(fpout, hout, b); if ((j = (iter[heap->i]? sam_itr_next(fp[heap->i], iter[heap->i], b) : sam_read1(fp[heap->i], hdr[heap->i], b))) >= 0) { bam_translate(b, translation_tbl + heap->i); heap->pos = ((uint64_t)b->core.tid<<32) | (uint32_t)((int)b->core.pos+1)<<1 | bam_is_rev(b); heap->idx = idx++; } else if (j == -1) { heap->pos = HEAP_EMPTY; bam_destroy1(heap->b); heap->b = NULL; } else fprintf(pysamerr, "[bam_merge_core] '%s' is truncated. Continue anyway.\n", fn[heap->i]); ks_heapadjust(heap, 0, n, heap); } // Clean up and close if (flag & MERGE_RG) { for (i = 0; i != n; ++i) free(RG[i]); free(RG); free(RG_len); } for (i = 0; i < n; ++i) { trans_tbl_destroy(translation_tbl + i); hts_itr_destroy(iter[i]); bam_hdr_destroy(hdr[i]); sam_close(fp[i]); } bam_hdr_destroy(hout); sam_close(fpout); free(translation_tbl); free(fp); free(heap); free(iter); free(hdr); return 0; }
int methyltest_main(int argc, char** argv) { parse_methyltest_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models = read_models_fofn(opt::models_fofn, mtest_alphabet); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize writers OutputHandles handles; handles.site_writer = fopen(std::string(opt::bam_file + ".methyltest.sites.bed").c_str(), "w"); handles.read_writer = fopen(std::string(opt::bam_file + ".methyltest.reads.tsv").c_str(), "w"); handles.strand_writer = fopen(std::string(opt::bam_file + ".methyltest.strand.tsv").c_str(), "w"); // Write a header to the reads.tsv file fprintf(handles.read_writer, "name\tsum_ll_ratio\tn_cpg\tcomplement_model\ttags\n"); // strand header fprintf(handles.strand_writer, "name\tsum_ll_ratio\tn_cpg\tmodel\n"); // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_processed = 0; size_t num_records_buffered = 0; Progress progress("[methyltest]"); do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_processed + i; if( (record->core.flag & BAM_FUNMAP) == 0) { calculate_methylation_for_read(models, name_map, fai, hdr, record, read_idx, handles); } } num_reads_processed += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); assert(num_records_buffered == 0); progress.end(); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup fclose(handles.site_writer); fclose(handles.read_writer); fclose(handles.strand_writer); sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return EXIT_SUCCESS; }
void train_one_round(const Fast5Map& name_map, size_t round) { const PoreModelMap& current_models = PoreModelSet::get_models(opt::trained_model_type); // Initialize the training summary stats for each kmer for each model ModelTrainingMap model_training_data; for(auto current_model_iter = current_models.begin(); current_model_iter != current_models.end(); current_model_iter++) { // one summary entry per kmer in the model std::vector<StateSummary> summaries(current_model_iter->second.get_num_states()); model_training_data[current_model_iter->first] = summaries; } // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; Progress progress("[methyltrain]"); do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { add_aligned_events(name_map, fai, hdr, record, read_idx, clip_start, clip_end, round, model_training_data); } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } if(opt::progress) { fprintf(stderr, "Realigned %zu reads in %.1lfs\r", num_reads_realigned, progress.get_elapsed_seconds()); } } while(result >= 0); assert(num_records_buffered == 0); progress.end(); // open the summary file std::stringstream summary_fn; summary_fn << "methyltrain" << opt::out_suffix << ".summary"; FILE* summary_fp = fopen(summary_fn.str().c_str(), "w"); fprintf(summary_fp, "model_short_name\tkmer\tnum_matches\tnum_skips\t" "num_stays\tnum_events_for_training\twas_trained\t" "trained_level_mean\ttrained_level_stdv\n"); // open the tsv file with the raw training data std::stringstream training_fn; training_fn << "methyltrain" << opt::out_suffix << ".round" << round << ".events.tsv"; std::ofstream training_ofs(training_fn.str()); // write out a header for the training data StateTrainingData::write_header(training_ofs); // iterate over models: template, complement_pop1, complement_pop2 for(auto model_training_iter = model_training_data.begin(); model_training_iter != model_training_data.end(); model_training_iter++) { // Initialize the trained model from the input model auto current_model_iter = current_models.find(model_training_iter->first); assert(current_model_iter != current_models.end()); std::string model_name = model_training_iter->first; std::string model_short_name = current_model_iter->second.metadata.get_short_name(); // Initialize the new model from the current model PoreModel updated_model = current_model_iter->second; uint32_t k = updated_model.k; const std::vector<StateSummary>& summaries = model_training_iter->second; // Generate the complete set of kmers std::string gen_kmer(k, 'A'); std::vector<std::string> all_kmers; for(size_t ki = 0; ki < summaries.size(); ++ki) { all_kmers.push_back(gen_kmer); mtrain_alphabet->lexicographic_next(gen_kmer); } assert(gen_kmer == std::string(k, 'A')); assert(all_kmers.front() == std::string(k, 'A')); assert(all_kmers.back() == std::string(k, 'T')); // Update means for each kmer #pragma omp parallel for for(size_t ki = 0; ki < summaries.size(); ++ki) { assert(ki < all_kmers.size()); std::string kmer = all_kmers[ki]; // write the observed values to a tsv file #pragma omp critical { for(size_t ei = 0; ei < summaries[ki].events.size(); ++ei) { summaries[ki].events[ei].write_tsv(training_ofs, model_short_name, kmer); } } bool is_m_kmer = kmer.find('M') != std::string::npos; bool update_kmer = opt::training_target == TT_ALL_KMERS || (is_m_kmer && opt::training_target == TT_METHYLATED_KMERS) || (!is_m_kmer && opt::training_target == TT_UNMETHYLATED_KMERS); bool trained = false; // only train if there are a sufficient number of events for this kmer if(update_kmer && summaries[ki].events.size() >= opt::min_number_of_events_to_train) { // train a mixture model where a minority of k-mers aren't methylated ParamMixture mixture; float incomplete_methylation_rate = 0.05f; std::string um_kmer = mtrain_alphabet->unmethylate(kmer); size_t um_ki = mtrain_alphabet->kmer_rank(um_kmer.c_str(), k); // Initialize the training parameters. If this is a kmer containing // a methylation site we train a two component mixture, otherwise // just fit a gaussian float major_weight = is_m_kmer ? 1 - incomplete_methylation_rate : 1.0f; mixture.log_weights.push_back(log(major_weight)); mixture.params.push_back(current_model_iter->second.get_parameters(ki)); if(is_m_kmer) { // add second unmethylated component mixture.log_weights.push_back(std::log(incomplete_methylation_rate)); mixture.params.push_back(current_model_iter->second.get_parameters(um_ki)); } if(opt::verbose > 1) { fprintf(stderr, "INIT__MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(mixture.log_weights[0]), mixture.params[0].level_mean, mixture.params[0].level_stdv, std::exp(mixture.log_weights[1]), mixture.params[1].level_mean, mixture.params[1].level_stdv); } ParamMixture trained_mixture = train_gaussian_mixture(summaries[ki].events, mixture); if(opt::verbose > 1) { fprintf(stderr, "TRAIN_MIX %s\t%s\t[%.2lf %.2lf %.2lf]\t[%.2lf %.2lf %.2lf]\n", model_training_iter->first.c_str(), kmer.c_str(), std::exp(trained_mixture.log_weights[0]), trained_mixture.params[0].level_mean, trained_mixture.params[0].level_stdv, std::exp(trained_mixture.log_weights[1]), trained_mixture.params[1].level_mean, trained_mixture.params[1].level_stdv); } #pragma omp critical updated_model.states[ki] = trained_mixture.params[0]; if (model_stdv()) { ParamMixture ig_mixture; // weights ig_mixture.log_weights = trained_mixture.log_weights; // states ig_mixture.params.emplace_back(trained_mixture.params[0]); if(is_m_kmer) { ig_mixture.params.emplace_back(current_model_iter->second.get_parameters(um_ki)); } // run training auto trained_ig_mixture = train_invgaussian_mixture(summaries[ki].events, ig_mixture); LOG("methyltrain", debug) << "IG_INIT__MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << std::fixed << std::setprecision(5) << ig_mixture.params[0].sd_mean << " " << ig_mixture.params[1].sd_mean << "]" << std::endl << "IG_TRAIN_MIX " << model_training_iter->first.c_str() << " " << kmer.c_str() << " [" << trained_ig_mixture.params[0].sd_mean << " " << trained_ig_mixture.params[1].sd_mean << "]" << std::endl; // update state #pragma omp critical { updated_model.states[ki] = trained_ig_mixture.params[0]; } } trained = true; } #pragma omp critical { fprintf(summary_fp, "%s\t%s\t%d\t%d\t%d\t%zu\t%d\t%.2lf\t%.2lf\n", model_short_name.c_str(), kmer.c_str(), summaries[ki].num_matches, summaries[ki].num_skips, summaries[ki].num_stays, summaries[ki].events.size(), trained, updated_model.states[ki].level_mean, updated_model.states[ki].level_stdv); } // add the updated model into the collection (or replace what is already there) PoreModelSet::insert_model(opt::trained_model_type, updated_model); } } // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); fclose(summary_fp); }
int scorereads_main(int argc, char** argv) { parse_scorereads_options(argc, argv); omp_set_num_threads(opt::num_threads); Fast5Map name_map(opt::reads_file); ModelMap models; if (!opt::models_fofn.empty()) models = read_models_fofn(opt::models_fofn); // Open the BAM and iterate over reads // load bam file htsFile* bam_fh = sam_open(opt::bam_file.c_str(), "r"); assert(bam_fh != NULL); // load bam index file std::string index_filename = opt::bam_file + ".bai"; hts_idx_t* bam_idx = bam_index_load(index_filename.c_str()); assert(bam_idx != NULL); // read the bam header bam_hdr_t* hdr = sam_hdr_read(bam_fh); // load reference fai file faidx_t *fai = fai_load(opt::genome_file.c_str()); hts_itr_t* itr; // If processing a region of the genome, only emit events aligned to this window int clip_start = -1; int clip_end = -1; if(opt::region.empty()) { // TODO: is this valid? itr = sam_itr_queryi(bam_idx, HTS_IDX_START, 0, 0); } else { fprintf(stderr, "Region: %s\n", opt::region.c_str()); itr = sam_itr_querys(bam_idx, hdr, opt::region.c_str()); hts_parse_reg(opt::region.c_str(), &clip_start, &clip_end); } #ifndef H5_HAVE_THREADSAFE if(opt::num_threads > 1) { fprintf(stderr, "You enabled multi-threading but you do not have a threadsafe HDF5\n"); fprintf(stderr, "Please recompile nanopolish's built-in libhdf5 or run with -t 1\n"); exit(1); } #endif // Initialize iteration std::vector<bam1_t*> records(opt::batch_size, NULL); for(size_t i = 0; i < records.size(); ++i) { records[i] = bam_init1(); } int result; size_t num_reads_realigned = 0; size_t num_records_buffered = 0; do { assert(num_records_buffered < records.size()); // read a record into the next slot in the buffer result = sam_itr_next(bam_fh, itr, records[num_records_buffered]); num_records_buffered += result >= 0; // realign if we've hit the max buffer size or reached the end of file if(num_records_buffered == records.size() || result < 0) { #pragma omp parallel for schedule(dynamic) for(size_t i = 0; i < num_records_buffered; ++i) { bam1_t* record = records[i]; size_t read_idx = num_reads_realigned + i; if( (record->core.flag & BAM_FUNMAP) == 0) { //load read std::string read_name = bam_get_qname(record); std::string fast5_path = name_map.get_path(read_name); SquiggleRead sr(read_name, fast5_path); // TODO: early exit when have processed all of the reads in readnames if (!opt::readnames.empty() && std::find(opt::readnames.begin(), opt::readnames.end(), read_name) == opt::readnames.end() ) continue; for(size_t strand_idx = 0; strand_idx < NUM_STRANDS; ++strand_idx) { std::vector<EventAlignment> ao = alignment_from_read(sr, strand_idx, read_idx, models, fai, hdr, record, clip_start, clip_end); if (ao.size() == 0) continue; // Update pore model based on alignment if ( opt::calibrate ) recalibrate_model(sr, strand_idx, ao, false); double score = model_score(sr, strand_idx, fai, ao, 500); if (score > 0) continue; #pragma omp critical(print) std::cout << read_name << " " << ( strand_idx ? "complement" : "template" ) << " " << sr.pore_model[strand_idx].name << " " << score << std::endl; } } } num_reads_realigned += num_records_buffered; num_records_buffered = 0; } } while(result >= 0); // cleanup records for(size_t i = 0; i < records.size(); ++i) { bam_destroy1(records[i]); } // cleanup sam_itr_destroy(itr); bam_hdr_destroy(hdr); fai_destroy(fai); sam_close(bam_fh); hts_idx_destroy(bam_idx); return 0; }
loci_stats *bam_access_get_position_base_counts(char *chr, int posn){ char *region = NULL; hts_itr_t *iter = NULL; bam1_t* b = NULL; bam_plp_t buf; loci_stats *stats = malloc(sizeof(loci_stats *)); check_mem(stats); stats->base_counts = malloc(sizeof(int) * 4); check_mem(stats->base_counts); stats->base_counts[0] = 0; stats->base_counts[1] = 0; stats->base_counts[2] = 0; stats->base_counts[3] = 0; fholder->stats = stats; region = malloc((sizeof(char *) * (strlen(chr)+1))+sizeof(":")+sizeof("-")+(sizeof(char)*((no_of_digits(posn)*2)+1))); sprintf(region,"%s:%d-%d",chr,posn,posn); fholder->beg = posn; fholder->end = posn; // initialize pileup buf = bam_plp_init(pileup_func, (void *)fholder); bam_plp_set_maxcnt(buf,maxitercnt); /* sam_fetch(fholder->in, fholder->idx, ref, fholder->beg, fholder->end, buf, fetch_algo_func); */ //Replace fetch with iterator for htslib compatibility. b = bam_init1(); iter = sam_itr_querys(fholder->idx, fholder->head, region); int result; int count = 0; while ((result = sam_itr_next(fholder->in, iter, b)) >= 0) { if(b->core.qual < min_map_qual || (b->core.flag & BAM_FUNMAP) || !(b->core.flag & BAM_FPROPER_PAIR) || (b->core.flag & BAM_FMUNMAP)//Proper pair and mate unmapped || (b->core.flag & BAM_FDUP)//1024 is PCR/optical duplicate || (b->core.flag & BAM_FSECONDARY) || (b->core.flag & BAM_FQCFAIL)//Secondary alignment, quality fail || (b->core.flag & BAM_FSUPPLEMENTARY) ) continue; count++; bam_plp_push(buf, b); } sam_itr_destroy(iter); bam_plp_push(buf, 0); int tid, pos, n_plp = -1; const bam_pileup1_t *pil; while ( (pil=bam_plp_next(buf, &tid, &pos, &n_plp)) > 0) { if((pos+1) != posn) continue; int i=0; for(i=0;i<n_plp;i++){ const bam_pileup1_t *p = pil + i; int qual = bam_get_qual(p->b)[p->qpos]; uint8_t c = bam_seqi(bam_get_seq(p->b), p->qpos); if(!(p->is_del) && qual >= min_base_qual && p->b->core.qual >= min_map_qual){ //&& (c == 1 /*A*/|| c == 2 /*C*/|| c == 4 /*G*/|| c == 8 /*T*/)){ //Now we add a new read pos struct to the list since the read is valid. //char cbase = toupper(bam_nt16_rev_table[c]); switch(c){ case 1: fholder->stats->base_counts[0]++; break; case 2: fholder->stats->base_counts[1]++; break; case 4: fholder->stats->base_counts[2]++; break; case 8: fholder->stats->base_counts[3]++; break; default: break; }; // End of args switch statement */ } } } //End of iteration through pileup //bam_plp_push(buf, 0); // finalize pileup bam_plp_destroy(buf); free(region); return fholder->stats; error: //if(region) free(region); if(fholder->stats){ if(fholder->stats->base_counts) free(fholder->stats->base_counts); free(fholder->stats); } if(iter) sam_itr_destroy(iter); if(b) bam_destroy1(b); if(region) free(region); return NULL; }