/* composition */ int stk_comp(int argc, char *argv[]) { gzFile fp; kseq_t *seq; int l, c, upper_only = 0; reghash_t *h = 0; reglist_t dummy; while ((c = getopt(argc, argv, "ur:")) >= 0) { switch (c) { case 'u': upper_only = 1; break; case 'r': h = stk_reg_read(optarg); break; } } if (argc == optind) { fprintf(stderr, "Usage: seqtk comp [-u] [-r in.bed] <in.fa>\n\n"); fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n"); return 1; } fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r"); seq = kseq_init(fp); dummy.n= dummy.m = 1; dummy.a = calloc(1, 8); while ((l = kseq_read(seq)) >= 0) { int i, k; reglist_t *p = 0; if (h) { khint_t k = kh_get(reg, h, seq->name.s); if (k != kh_end(h)) p = &kh_val(h, k); } else { p = &dummy; dummy.a[0] = l; } for (k = 0; p && k < p->n; ++k) { int beg = p->a[k]>>32, end = p->a[k]&0xffffffff; int la, lb, lc, na, nb, nc, cnt[11]; if (beg > 0) la = seq->seq.s[beg-1], lb = seq_nt16_table[la], lc = bitcnt_table[lb]; else la = 'a', lb = -1, lc = 0; na = seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; memset(cnt, 0, 11 * sizeof(int)); for (i = beg; i < end; ++i) { int is_CpG = 0, a, b, c; a = na; b = nb; c = nc; na = seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb]; if (b == 2 || b == 10) { // C or Y if (nb == 4 || nb == 5) is_CpG = 1; } else if (b == 4 || b == 5) { // G or R if (lb == 2 || lb == 10) is_CpG = 1; } if (upper_only == 0 || isupper(a)) { if (c > 1) ++cnt[c+2]; if (c == 1) ++cnt[seq_nt16to4_table[b]]; if (b == 10 || b == 5) ++cnt[9]; else if (c == 2) { ++cnt[8]; } if (is_CpG) { ++cnt[7]; if (b == 10 || b == 5) ++cnt[10]; } } la = a; lb = b; lc = c; } if (h) printf("%s\t%d\t%d", seq->name.s, beg, end); else printf("%s\t%d", seq->name.s, l); for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]); putchar('\n'); } fflush(stdout); } free(dummy.a); kseq_destroy(seq); gzclose(fp); return 0; }
int stk_subseq(int argc, char *argv[]) { khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; int l, i, j, c, is_tab = 0, line = 1024; khint_t k; while ((c = getopt(argc, argv, "tl:")) >= 0) { switch (c) { case 't': is_tab = 1; break; case 'l': line = atoi(optarg); break; } } if (optind + 2 > argc) { fprintf(stderr, "\n"); fprintf(stderr, "Usage: seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n\n"); fprintf(stderr, "Options: -t TAB delimited output\n"); fprintf(stderr, " -l INT sequence line length [%d]\n\n", line); fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n"); return 1; } h = stk_reg_read(argv[optind+1]); // subseq fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; k = kh_get(reg, h, seq->name.s); if (k == kh_end(h)) continue; p = &kh_val(h, k); for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32, end = p->a[i]; if (beg >= seq->seq.l) { fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l); continue; } if (end > seq->seq.l) end = seq->seq.l; if (is_tab == 0) { printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s); if (beg > 0 || (int)p->a[i] != INT_MAX) { if (end == INT_MAX) { if (beg) printf(":%d", beg+1); } else printf(":%d-%d", beg+1, end); } } else printf("%s\t%d\t", seq->name.s, beg + 1); if (end > seq->seq.l) end = seq->seq.l; for (j = 0; j < end - beg; ++j) { if (is_tab == 0 && j % line == 0) putchar('\n'); putchar(seq->seq.s[j + beg]); } putchar('\n'); if (seq->qual.l != seq->seq.l || is_tab) continue; printf("+"); for (j = 0; j < end - beg; ++j) { if (j % line == 0) putchar('\n'); putchar(seq->qual.s[j + beg]); } putchar('\n'); } } // free kseq_destroy(seq); gzclose(fp); stk_reg_destroy(h); return 0; }
int stk_maskseq(int argc, char *argv[]) { khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; int l, i, j, c, is_complement = 0, is_lower = 0; khint_t k; while ((c = getopt(argc, argv, "cl")) >= 0) { switch (c) { case 'c': is_complement = 1; break; case 'l': is_lower = 1; break; } } if (argc - optind < 2) { fprintf(pysamerr, "Usage: seqtk maskseq [-cl] <in.fa> <in.bed>\n\n"); fprintf(pysamerr, "Options: -c mask the complement regions\n"); fprintf(pysamerr, " -l soft mask (to lower cases)\n"); return 1; } h = stk_reg_read(argv[optind+1]); // maskseq fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { k = kh_get(reg, h, seq->name.s); if (k == kh_end(h)) { // not found in the hash table if (is_complement) { for (j = 0; j < l; ++j) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; } } else { reglist_t *p = &kh_val(h, k); if (!is_complement) { for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32, end = p->a[i]; if (beg >= seq->seq.l) { fprintf(pysamerr, "[maskseq] start position >= the sequence length.\n"); continue; } if (end >= seq->seq.l) end = seq->seq.l; if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]); else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N'; } } else { int8_t *mask = calloc(seq->seq.l, 1); for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32, end = p->a[i]; if (end >= seq->seq.l) end = seq->seq.l; for (j = beg; j < end; ++j) mask[j] = 1; } for (j = 0; j < l; ++j) if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N'; free(mask); } } printf(">%s", seq->name.s); for (j = 0; j < seq->seq.l; ++j) { if (j%60 == 0) putchar('\n'); putchar(seq->seq.s[j]); } putchar('\n'); }