Exemple #1
0
/*
 * composition: entry point for `seqtk comp [-u] [-r in.bed] <in.fa>`.
 *
 * Reads every record from <in.fa> ("-" selects stdin) and prints one
 * tab-separated line per sequence, or one line per region when -r is given.
 * Each line ends with 11 counters in the order given by the usage message:
 *   #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts
 *
 * Options:
 *   -u       count only positions whose base is an upper-case letter
 *   -r FILE  count inside the regions loaded by stk_reg_read(FILE); sequences
 *            without an entry in that table produce no output
 *
 * Returns 0 on success, 1 on a usage error, if the input cannot be opened,
 * or if the scratch region list cannot be allocated.
 *
 * NOTE(review): the table loaded by -r is not released before returning;
 * confirm whether it should be handed to stk_reg_destroy().
 */
int stk_comp(int argc, char *argv[])
{
	gzFile fp;
	kseq_t *seq;
	int l, c, upper_only = 0;
	reghash_t *h = 0;
	reglist_t dummy;
	while ((c = getopt(argc, argv, "ur:")) >= 0) {
		switch (c) {
			case 'u': upper_only = 1; break;
			case 'r': h = stk_reg_read(optarg); break;
		}
	}
	if (argc == optind) {
		fprintf(stderr, "Usage:  seqtk comp [-u] [-r in.bed] <in.fa>\n\n");
		fprintf(stderr, "Output format: chr, length, #A, #C, #G, #T, #2, #3, #4, #CpG, #tv, #ts, #CpG-ts\n");
		return 1;
	}
	fp = (strcmp(argv[optind], "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(argv[optind], "r");
	if (fp == 0) { /* do not hand a NULL stream to kseq_init() */
		fprintf(stderr, "[comp] failed to open '%s'\n", argv[optind]);
		return 1;
	}
	seq = kseq_init(fp);
	/* Without -r, every sequence is treated as a single region [0, l). */
	dummy.n = dummy.m = 1;
	dummy.a = calloc(1, sizeof *dummy.a);
	if (dummy.a == 0) {
		fprintf(stderr, "[comp] out of memory\n");
		kseq_destroy(seq);
		gzclose(fp);
		return 1;
	}
	while ((l = kseq_read(seq)) >= 0) {
		int i, k;
		reglist_t *p = 0;
		if (h) {
			khint_t itr = kh_get(reg, h, seq->name.s);
			if (itr != kh_end(h)) p = &kh_val(h, itr);
		} else {
			p = &dummy;
			dummy.a[0] = l; /* high 32 bits (start) = 0, low 32 bits (end) = l */
		}
		for (k = 0; p && k < p->n; ++k) {
			/* Each entry packs the start in the high 32 bits and the end in the low 32 bits. */
			int beg = p->a[k]>>32, end = p->a[k]&0xffffffff;
			/* Never scan past the sequence; the reported end stays as given. */
			int stop = end < l? end : l;
			int cnt[11];
			memset(cnt, 0, sizeof cnt);
			if (beg >= 0 && beg < stop) {
				/*
				 * lb: 4-bit code of the previous base (-1 when there is none).
				 * na/nb/nc: next raw byte, its 4-bit code and the bitcnt_table
				 * value of that code (presumably the number of bases the code
				 * can stand for -- confirm against bitcnt_table).
				 * Bytes go through unsigned char so that values >= 0x80 cannot
				 * become negative table indices or invalid isupper() arguments.
				 */
				int lb = -1, na, nb, nc;
				if (beg > 0) lb = seq_nt16_table[(unsigned char)seq->seq.s[beg-1]];
				na = (unsigned char)seq->seq.s[beg]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
				for (i = beg; i < stop; ++i) {
					int is_CpG = 0;
					int a = na, b = nb, nbits = nc; /* current byte, code, bit count */
					/*
					 * One-base look-ahead. For i == l-1 this reads s[l], which
					 * presumably holds the terminator kseq writes -- confirm.
					 */
					na = (unsigned char)seq->seq.s[i+1]; nb = seq_nt16_table[na]; nc = bitcnt_table[nb];
					if (b == 2 || b == 10) { // C or Y
						if (nb == 4 || nb == 5) is_CpG = 1;
					} else if (b == 4 || b == 5) { // G or R
						if (lb == 2 || lb == 10) is_CpG = 1;
					}
					if (upper_only == 0 || isupper(a)) {
						if (nbits > 1) ++cnt[nbits+2];                /* #2, #3, #4 */
						if (nbits == 1) ++cnt[seq_nt16to4_table[b]];  /* #A, #C, #G, #T */
						if (b == 10 || b == 5) ++cnt[9];              /* #ts: Y or R */
						else if (nbits == 2) ++cnt[8];                /* #tv: any other two-base code */
						if (is_CpG) {
							++cnt[7];                             /* #CpG */
							if (b == 10 || b == 5) ++cnt[10];     /* #CpG-ts */
						}
					}
					lb = b;
				}
			}
			if (h) printf("%s\t%d\t%d", seq->name.s, beg, end);
			else printf("%s\t%d", seq->name.s, l);
			for (i = 0; i < 11; ++i) printf("\t%d", cnt[i]);
			putchar('\n');
		}
		fflush(stdout);
	}
	free(dummy.a);
	kseq_destroy(seq);
	gzclose(fp);
	return 0;
}
Exemple #2
0
int stk_subseq(int argc, char *argv[])
{
	khash_t(reg) *h = kh_init(reg);
	gzFile fp;
	kseq_t *seq;
	int l, i, j, c, is_tab = 0, line = 1024;
	khint_t k;
	while ((c = getopt(argc, argv, "tl:")) >= 0) {
		switch (c) {
		case 't': is_tab = 1; break;
		case 'l': line = atoi(optarg); break;
		}
	}
	if (optind + 2 > argc) {
		fprintf(stderr, "\n");
		fprintf(stderr, "Usage:   seqtk subseq [options] <in.fa> <in.bed>|<name.list>\n\n");
		fprintf(stderr, "Options: -t       TAB delimited output\n");
		fprintf(stderr, "         -l INT   sequence line length [%d]\n\n", line);
		fprintf(stderr, "Note: Use 'samtools faidx' if only a few regions are intended.\n\n");
		return 1;
	}
	h = stk_reg_read(argv[optind+1]);
	// subseq
	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	seq = kseq_init(fp);
	while ((l = kseq_read(seq)) >= 0) {
		reglist_t *p;
		k = kh_get(reg, h, seq->name.s);
		if (k == kh_end(h)) continue;
		p = &kh_val(h, k);
		for (i = 0; i < p->n; ++i) {
			int beg = p->a[i]>>32, end = p->a[i];
			if (beg >= seq->seq.l) {
				fprintf(stderr, "[subseq] %s: %d >= %ld\n", seq->name.s, beg, seq->seq.l);
				continue;
			}
			if (end > seq->seq.l) end = seq->seq.l;
			if (is_tab == 0) {
				printf("%c%s", seq->qual.l == seq->seq.l? '@' : '>', seq->name.s);
				if (beg > 0 || (int)p->a[i] != INT_MAX) {
					if (end == INT_MAX) {
						if (beg) printf(":%d", beg+1);
					} else printf(":%d-%d", beg+1, end);
				}
			} else printf("%s\t%d\t", seq->name.s, beg + 1);
			if (end > seq->seq.l) end = seq->seq.l;
			for (j = 0; j < end - beg; ++j) {
				if (is_tab == 0 && j % line == 0) putchar('\n');
				putchar(seq->seq.s[j + beg]);
			}
			putchar('\n');
			if (seq->qual.l != seq->seq.l || is_tab) continue;
			printf("+");
			for (j = 0; j < end - beg; ++j) {
				if (j % line == 0) putchar('\n');
				putchar(seq->qual.s[j + beg]);
			}
			putchar('\n');
		}
	}
	// free
	kseq_destroy(seq);
	gzclose(fp);
	stk_reg_destroy(h);
	return 0;
}
Exemple #3
0
int stk_maskseq(int argc, char *argv[])
{
	khash_t(reg) *h = kh_init(reg);
	gzFile fp;
	kseq_t *seq;
	int l, i, j, c, is_complement = 0, is_lower = 0;
	khint_t k;
	while ((c = getopt(argc, argv, "cl")) >= 0) {
		switch (c) {
		case 'c': is_complement = 1; break;
		case 'l': is_lower = 1; break;
		}
	}
	if (argc - optind < 2) {
		fprintf(pysamerr, "Usage:   seqtk maskseq [-cl] <in.fa> <in.bed>\n\n");
		fprintf(pysamerr, "Options: -c     mask the complement regions\n");
		fprintf(pysamerr, "         -l     soft mask (to lower cases)\n");
		return 1;
	}
	h = stk_reg_read(argv[optind+1]);
	// maskseq
	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	seq = kseq_init(fp);
	while ((l = kseq_read(seq)) >= 0) {
		k = kh_get(reg, h, seq->name.s);
		if (k == kh_end(h)) { // not found in the hash table
			if (is_complement) {
				for (j = 0; j < l; ++j)
					seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
			}
		} else {
			reglist_t *p = &kh_val(h, k);
			if (!is_complement) {
				for (i = 0; i < p->n; ++i) {
					int beg = p->a[i]>>32, end = p->a[i];
					if (beg >= seq->seq.l) {
						fprintf(pysamerr, "[maskseq] start position >= the sequence length.\n");
						continue;
					}
					if (end >= seq->seq.l) end = seq->seq.l;
					if (is_lower) for (j = beg; j < end; ++j) seq->seq.s[j] = tolower(seq->seq.s[j]);
					else for (j = beg; j < end; ++j) seq->seq.s[j] = 'N';
				}
			} else {
				int8_t *mask = calloc(seq->seq.l, 1);
				for (i = 0; i < p->n; ++i) {
					int beg = p->a[i]>>32, end = p->a[i];
					if (end >= seq->seq.l) end = seq->seq.l;
					for (j = beg; j < end; ++j) mask[j] = 1;
				}
				for (j = 0; j < l; ++j)
					if (mask[j] == 0) seq->seq.s[j] = is_lower? tolower(seq->seq.s[j]) : 'N';
				free(mask);
			}
		}
		printf(">%s", seq->name.s);
		for (j = 0; j < seq->seq.l; ++j) {
			if (j%60 == 0) putchar('\n');
			putchar(seq->seq.s[j]);
		}
		putchar('\n');
	}