bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, n_targets = 0; gzFile fp; kstream_t *ks; kstring_t *str; kstring_t samstr = { 0, 0, NULL }; if (fn == 0) return 0; fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); if (fp == 0) return 0; ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { ksprintf(&samstr, "@SQ\tSN:%s", str->s); ks_getuntil(ks, 0, str, &dret); ksprintf(&samstr, "\tLN:%d\n", atoi(str->s)); n_targets++; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); free(samstr.s); fprintf(pysamerr, "[sam_header_read2] %d sequences loaded.\n", n_targets); return header; }
bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, ret; gzFile fp; kstream_t *ks; kstring_t *str; kh_ref_t *hash; khiter_t k; hash = kh_init(ref); fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); assert(fp); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { char *s = strdup(str->s); int len, i; i = kh_size(hash); ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); header = hash2header(hash); kh_destroy(ref, hash); return header; }
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; khash_t(s) *hash; mask32_t *q = 0; kstring_t *str; int i, dret, c, last = 0; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc <= optind + 1) { fprintf(stderr, "Usage: uniq-dist <in.mask.fa> <in-sorted.list>\n"); return 1; } str = (kstring_t*)calloc(1, sizeof(kstring_t)); fprintf(stderr, "[uniq-dist] loading mask...\n"); hash = load_mask(argv[optind]); fp = gzopen(argv[optind+1], "r"); ks = ks_init(fp); fprintf(stderr, "[uniq-dist] calculating unique distance...\n"); while (ks_getuntil(ks, 0, str, &dret) >= 0) { khint_t k; mask32_t *p; int pos; k = kh_get(s, hash, str->s); p = (k != kh_end(hash))? &kh_val(hash, k) : 0; ks_getuntil(ks, 0, str, &dret); pos = atoi(str->s) - 1; if (p && pos >= 0 && pos < p->ori_len) { if (p != q) q = p; // change of reference else { if (last >= pos) { fprintf(stderr, "[uniq-dist] out of order: %s:%d <= %d\n", kh_key(hash, k), pos+1, last+1); } else { for (i = last, c = 0; i < pos; ++i) if (p->mask[i/32] & 1u<<i%32) ++c; if (last > 0) printf("%s\t%d\t%d\t%d\n", kh_key(hash, k), last, pos, c); } } last = pos; } if (dret != '\n') while ((c = ks_getc(ks)) != -1 && c != '\n'); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // hash table is not freed... return 0; }
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; khash_t(s) *hash; kstring_t *str; int dret, c, complement = 0; while ((c = getopt(argc, argv, "c")) >= 0) { switch (c) { case 'c': complement = 1; break; } } if (argc <= optind + 1) { fprintf(stderr, "Usage: apply_mask_l [-c] <in.mask.fa> <in.list>\n"); return 1; } str = (kstring_t*)calloc(1, sizeof(kstring_t)); fprintf(stderr, "[apply_mask_l] loading mask...\n"); hash = load_mask(argv[optind]); fp = gzopen(argv[optind+1], "r"); ks = ks_init(fp); fprintf(stderr, "[apply_mask_l] filtering list...\n"); while (ks_getuntil(ks, 0, str, &dret) >= 0) { khint_t k; mask32_t *p; int pos, do_print = 0; k = kh_get(s, hash, str->s); p = (k != kh_end(hash))? &kh_val(hash, k) : 0; ks_getuntil(ks, 0, str, &dret); pos = atoi(str->s) - 1; if (complement == 0) { if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32)) do_print = 1; } else { if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32) == 0) do_print = 1; } if (do_print) printf("%s\t%d", kh_key(hash, k), pos + 1); if (dret != '\n') { if (do_print) putchar('\t'); while ((c = ks_getc(ks)) != -1 && c != '\n') if (do_print) putchar(c); } if (do_print) putchar('\n'); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // hash table is not freed... return 0; }
reghash_t *stk_reg_read(const char *fn) { reghash_t *h = kh_init(reg); gzFile fp; kstream_t *ks; int dret; kstring_t *str; // read the list str = calloc(1, sizeof(kstring_t)); fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { int beg = -1, end = -1; reglist_t *p; khint_t k = kh_get(reg, h, str->s); if (k == kh_end(h)) { int ret; char *s = strdup(str->s); k = kh_put(reg, h, s, &ret); memset(&kh_val(h, k), 0, sizeof(reglist_t)); } p = &kh_val(h, k); if (dret != '\n') { if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { beg = atoi(str->s); if (dret != '\n') { if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { end = atoi(str->s); if (end < 0) end = -1; } } } } // skip the rest of the line if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column if (beg < 0) beg = 0, end = INT_MAX; if (p->n == p->m) { p->m = p->m? p->m<<1 : 4; p->a = realloc(p->a, p->m * 8); } p->a[p->n++] = (uint64_t)beg<<32 | end; } ks_destroy(ks); gzclose(fp); free(str->s); free(str); return h; }
bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, ret, error = 0; gzFile fp; kstream_t *ks; kstring_t *str; kh_ref_t *hash; khiter_t k; if (fn == 0) return 0; fp = gzopen(fn, "r"); if (fp == 0) return 0; hash = kh_init(ref); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { char *s = malloc(strlen(str->s) + 1); strcpy(s,str->s); int len, i; i = kh_size(hash); ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); if (ret == 0) { Rprintf("[sam_header_read2] duplicated sequence name: %s\n", s); error = 1; } kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); Rprintf("[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); if (error) return 0; header = hash2header(hash); kh_destroy(ref, hash); return header; }
int stk_mutfa(int argc, char *argv[]) { khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; kstream_t *ks; int l, i, dret; kstring_t *str; khint_t k; if (argc < 3) { fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n"); fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n"); fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); return 1; } // read the list str = calloc(1, sizeof(kstring_t)); fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { char *s = strdup(str->s); int beg = 0, ret; reglist_t *p; k = kh_get(reg, h, s); if (k == kh_end(h)) { k = kh_put(reg, h, s, &ret); memset(&kh_val(h, k), 0, sizeof(reglist_t)); } p = &kh_val(h, k); if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col ks_getuntil(ks, 0, str, &dret); // 3rd col ks_getuntil(ks, 0, str, &dret); // 4th col // skip the rest of the line if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); if (isalpha(str->s[0]) && str->l == 1) { if (p->n == p->m) { p->m = p->m? p->m<<1 : 4; p->a = realloc(p->a, p->m * 8); } p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; } } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // mutfa fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; k = kh_get(reg, h, seq->name.s); if (k != kh_end(h)) { p = &kh_val(h, k); for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32; if (beg < seq->seq.l) seq->seq.s[beg] = (int)p->a[i]; } } printf(">%s", seq->name.s); for (i = 0; i < l; ++i) { if (i%60 == 0) putchar('\n'); putchar(seq->seq.s[i]); } putchar('\n'); }
/**
 * Read the header of a VCF/BCF file.
 *
 * For binary input the call is forwarded to bcf_hdr_read().  For text
 * input, lines are read until the first one that starts with a single '#'
 * (the sample line), which is stored last.  If fp->fn_aux is set, a
 * "##contig=<ID=name,length=len>" line is generated for every
 * "name length" line of that file and inserted just before the sample line.
 * After parsing, a tabix index is looked up with tbx_index_load(); any
 * sequence name it lists that has no contig record in the header is added
 * with length -1.
 *
 * @param fp  open file handle
 * @return    the parsed header, or 0 if a non-header line is met before the
 *            sample line or the auxiliary contig list cannot be opened
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
    if (!fp->is_bin) {
        kstring_t txt, *s = &fp->line;
        bcf_hdr_t *h;
        h = bcf_hdr_init();
        txt.l = txt.m = 0; txt.s = 0;
        while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
            if (s->l == 0) continue; /* blank line */
            if (s->s[0] != '#') {
                if (hts_verbose >= 2)
                    fprintf(stderr, "[E::%s] no sample line\n", __func__);
                free(txt.s);
                bcf_hdr_destroy(h);
                return 0;
            }
            if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
                int dret;
                gzFile f;
                kstream_t *ks;
                kstring_t tmp;
                tmp.l = tmp.m = 0; tmp.s = 0;
                f = gzopen(fp->fn_aux, "r");
                if (f == 0) {
                    /* unreadable contig list: fail like the other error
                     * path instead of reading through a NULL handle */
                    if (hts_verbose >= 2)
                        fprintf(stderr, "[E::%s] fail to open '%s'\n", __func__, fp->fn_aux);
                    free(txt.s);
                    bcf_hdr_destroy(h);
                    return 0;
                }
                ks = ks_init(f);
                while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
                    int c;
                    kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
                    ks_getuntil(ks, 0, &tmp, &dret);
                    kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
                    kputsn(">\n", 2, &txt);
                    if (dret != '\n')
                        while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
                }
                free(tmp.s);
                ks_destroy(ks);
                gzclose(f);
            }
            kputsn(s->s, s->l, &txt);
            if (s->s[1] != '#') break; /* sample line reached: header text is complete */
            kputc('\n', &txt);
        }
        h->l_text = txt.l + 1; // including NULL
        h->text = txt.s;
        bcf_hdr_parse(h);

        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
            int i, n, need_sync = 0;
            const char **names = tbx_seqnames(idx, &n);
            for (i=0; i<n; i++)
            {
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue; /* already declared in the header */
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0); // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
            }
            free(names);
            tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
        }
        return h;
    } else return bcf_hdr_read((BGZF*)fp->fp);
}