Exemplo n.º 1
0
bam_header_t *sam_header_read2(const char *fn)
{
    bam_header_t *header;
    int c, dret, n_targets = 0;
    gzFile fp;
    kstream_t *ks;
    kstring_t *str;
    kstring_t samstr = { 0, 0, NULL };
    if (fn == 0) return 0;
    fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
    if (fp == 0) return 0;
    ks = ks_init(fp);
    str = (kstring_t*)calloc(1, sizeof(kstring_t));
    while (ks_getuntil(ks, 0, str, &dret) > 0) {
        ksprintf(&samstr, "@SQ\tSN:%s", str->s);
        ks_getuntil(ks, 0, str, &dret);
        ksprintf(&samstr, "\tLN:%d\n", atoi(str->s));
        n_targets++;
        if (dret != '\n')
            while ((c = ks_getc(ks)) != '\n' && c != -1);
    }
    ks_destroy(ks);
    gzclose(fp);
    free(str->s); free(str);
    header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : "");
    free(samstr.s);
    fprintf(pysamerr, "[sam_header_read2] %d sequences loaded.\n", n_targets);
    return header;
}
Exemplo n.º 2
0
bam_header_t *sam_header_read2(const char *fn)
{
	bam_header_t *header;
	int c, dret, ret;
	gzFile fp;
	kstream_t *ks;
	kstring_t *str;
	kh_ref_t *hash;
	khiter_t k;
	hash = kh_init(ref);
	fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
	assert(fp);
	ks = ks_init(fp);
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	while (ks_getuntil(ks, 0, str, &dret) > 0) {
		char *s = strdup(str->s);
		int len, i;
		i = kh_size(hash);
		ks_getuntil(ks, 0, str, &dret);
		len = atoi(str->s);
		k = kh_put(ref, hash, s, &ret);
		kh_value(hash, k) = (uint64_t)len<<32 | i;
		if (dret != '\n')
			while ((c = ks_getc(ks)) != '\n' && c != -1);
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
	header = hash2header(hash);
	kh_destroy(ref, hash);
	return header;
}
Exemplo n.º 3
0
int main(int argc, char *argv[])
{
	gzFile fp;
	kstream_t *ks;
	khash_t(s) *hash;
	mask32_t *q = 0;
	kstring_t *str;
	int i, dret, c, last = 0;

	while ((c = getopt(argc, argv, "")) >= 0) {
	}
	if (argc <= optind + 1) {
		fprintf(stderr, "Usage: uniq-dist <in.mask.fa> <in-sorted.list>\n");
		return 1;
	}

	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	fprintf(stderr, "[uniq-dist] loading mask...\n");
	hash = load_mask(argv[optind]);
	fp = gzopen(argv[optind+1], "r");
	ks = ks_init(fp);
	fprintf(stderr, "[uniq-dist] calculating unique distance...\n");
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		khint_t k;
		mask32_t *p;
		int pos;
		k = kh_get(s, hash, str->s);
		p = (k != kh_end(hash))? &kh_val(hash, k) : 0;
		ks_getuntil(ks, 0, str, &dret);
		pos = atoi(str->s) - 1;
		if (p && pos >= 0 && pos < p->ori_len) {
			if (p != q) q = p; // change of reference
			else {
				if (last >= pos) {
					fprintf(stderr, "[uniq-dist] out of order: %s:%d <= %d\n", kh_key(hash, k), pos+1, last+1);
				} else {
					for (i = last, c = 0; i < pos; ++i)
						if (p->mask[i/32] & 1u<<i%32) ++c;
					if (last > 0) printf("%s\t%d\t%d\t%d\n", kh_key(hash, k), last, pos, c);
				}
			}
			last = pos;
		}
		if (dret != '\n')
			while ((c = ks_getc(ks)) != -1 && c != '\n');
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	// hash table is not freed...
	return 0;
}
Exemplo n.º 4
0
int main(int argc, char *argv[])
{
	gzFile fp;
	kstream_t *ks;
	khash_t(s) *hash;
	kstring_t *str;
	int dret, c, complement = 0;

	while ((c = getopt(argc, argv, "c")) >= 0) {
		switch (c) {
			case 'c': complement = 1; break;
		}
	}
	if (argc <= optind + 1) {
		fprintf(stderr, "Usage: apply_mask_l [-c] <in.mask.fa> <in.list>\n");
		return 1;
	}

	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	fprintf(stderr, "[apply_mask_l] loading mask...\n");
	hash = load_mask(argv[optind]);
	fp = gzopen(argv[optind+1], "r");
	ks = ks_init(fp);
	fprintf(stderr, "[apply_mask_l] filtering list...\n");
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		khint_t k;
		mask32_t *p;
		int pos, do_print = 0;
		k = kh_get(s, hash, str->s);
		p = (k != kh_end(hash))? &kh_val(hash, k) : 0;
		ks_getuntil(ks, 0, str, &dret);
		pos = atoi(str->s) - 1;
		if (complement == 0) {
			if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32)) do_print = 1;
		} else {
			if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32) == 0) do_print = 1;
		}
		if (do_print) printf("%s\t%d", kh_key(hash, k), pos + 1);
		if (dret != '\n') {
			if (do_print) putchar('\t');
			while ((c = ks_getc(ks)) != -1 && c != '\n')
				if (do_print) putchar(c);
		}
		if (do_print) putchar('\n');
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	// hash table is not freed...
	return 0;
}
Exemplo n.º 5
0
reghash_t *stk_reg_read(const char *fn)
{
	reghash_t *h = kh_init(reg);
	gzFile fp;
	kstream_t *ks;
	int dret;
	kstring_t *str;
	// read the list
	str = calloc(1, sizeof(kstring_t));
	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
	ks = ks_init(fp);
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		int beg = -1, end = -1;
		reglist_t *p;
		khint_t k = kh_get(reg, h, str->s);
		if (k == kh_end(h)) {
			int ret;
			char *s = strdup(str->s);
			k = kh_put(reg, h, s, &ret);
			memset(&kh_val(h, k), 0, sizeof(reglist_t));
		}
		p = &kh_val(h, k);
		if (dret != '\n') {
			if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
				beg = atoi(str->s);
				if (dret != '\n') {
					if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) {
						end = atoi(str->s);
						if (end < 0) end = -1;
					}
				}
			}
		}
		// skip the rest of the line
		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
		if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
		if (beg < 0) beg = 0, end = INT_MAX;
		if (p->n == p->m) {
			p->m = p->m? p->m<<1 : 4;
			p->a = realloc(p->a, p->m * 8);
		}
		p->a[p->n++] = (uint64_t)beg<<32 | end;
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	return h;
}
Exemplo n.º 6
0
bam_header_t *sam_header_read2(const char *fn)
{
	bam_header_t *header;
	int c, dret, ret, error = 0;
	gzFile fp;
	kstream_t *ks;
	kstring_t *str;
	kh_ref_t *hash;
	khiter_t k;
	if (fn == 0) return 0;
	fp = gzopen(fn, "r");
	if (fp == 0) return 0;
	hash = kh_init(ref);
	ks = ks_init(fp);
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	while (ks_getuntil(ks, 0, str, &dret) > 0) {
		char *s = malloc(strlen(str->s) + 1);
		strcpy(s,str->s);

		int len, i;
		i = kh_size(hash);
		ks_getuntil(ks, 0, str, &dret);
		len = atoi(str->s);
		k = kh_put(ref, hash, s, &ret);
		if (ret == 0) {
			Rprintf("[sam_header_read2] duplicated sequence name: %s\n", s);
			error = 1;
		}
		kh_value(hash, k) = (uint64_t)len<<32 | i;
		if (dret != '\n')
			while ((c = ks_getc(ks)) != '\n' && c != -1);
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	Rprintf("[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
	if (error) return 0;
	header = hash2header(hash);
	kh_destroy(ref, hash);
	return header;
}
Exemplo n.º 7
0
int stk_mutfa(int argc, char *argv[])
{
	khash_t(reg) *h = kh_init(reg);
	gzFile fp;
	kseq_t *seq;
	kstream_t *ks;
	int l, i, dret;
	kstring_t *str;
	khint_t k;
	if (argc < 3) {
		fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n");
		fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n");
		fprintf(stderr, "      'chr  1-based-pos  any  base-changed-to'.\n");
		return 1;
	}
	// read the list
	str = calloc(1, sizeof(kstring_t));
	fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
	ks = ks_init(fp);
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		char *s = strdup(str->s);
		int beg = 0, ret;
		reglist_t *p;
		k = kh_get(reg, h, s);
		if (k == kh_end(h)) {
			k = kh_put(reg, h, s, &ret);
			memset(&kh_val(h, k), 0, sizeof(reglist_t));
		}
		p = &kh_val(h, k);
		if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col
		ks_getuntil(ks, 0, str, &dret); // 3rd col
		ks_getuntil(ks, 0, str, &dret); // 4th col
		// skip the rest of the line
		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
		if (isalpha(str->s[0]) && str->l == 1) {
			if (p->n == p->m) {
				p->m = p->m? p->m<<1 : 4;
				p->a = realloc(p->a, p->m * 8);
			}
			p->a[p->n++] = (uint64_t)beg<<32 | str->s[0];
		}
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	// mutfa
	fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
	seq = kseq_init(fp);
	while ((l = kseq_read(seq)) >= 0) {
		reglist_t *p;
		k = kh_get(reg, h, seq->name.s);
		if (k != kh_end(h)) {
			p = &kh_val(h, k);
			for (i = 0; i < p->n; ++i) {
				int beg = p->a[i]>>32;
				if (beg < seq->seq.l)
					seq->seq.s[beg] = (int)p->a[i];
			}
		}
		printf(">%s", seq->name.s);
		for (i = 0; i < l; ++i) {
			if (i%60 == 0) putchar('\n');
			putchar(seq->seq.s[i]);
		}
		putchar('\n');
	}
Exemplo n.º 8
0
Arquivo: vcf.c Projeto: goshng/cocoa
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
	if (!fp->is_bin) {
		kstring_t txt, *s = &fp->line;
		bcf_hdr_t *h;
		h = bcf_hdr_init();
		txt.l = txt.m = 0; txt.s = 0;
		while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
			if (s->l == 0) continue;
			if (s->s[0] != '#') {
				if (hts_verbose >= 2)
					fprintf(stderr, "[E::%s] no sample line\n", __func__);
				free(txt.s);
				bcf_hdr_destroy(h);
				return 0;
			}
			if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
				int dret;
				gzFile f;
				kstream_t *ks;
				kstring_t tmp;
				tmp.l = tmp.m = 0; tmp.s = 0;
				f = gzopen(fp->fn_aux, "r");
				ks = ks_init(f);
				while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
					int c;
					kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
					ks_getuntil(ks, 0, &tmp, &dret);
					kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
					kputsn(">\n", 2, &txt);
					if (dret != '\n')
						while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
				}
				free(tmp.s);
				ks_destroy(ks);
				gzclose(f);
			}
			kputsn(s->s, s->l, &txt);
			if (s->s[1] != '#') break;
			kputc('\n', &txt);
		}
		h->l_text = txt.l + 1; // including NULL
		h->text = txt.s;
		bcf_hdr_parse(h);
        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
			int i, n, need_sync = 0;
			const char **names = tbx_seqnames(idx, &n);
			for (i=0; i<n; i++)
			{
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue;
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
			}
			free(names);
			tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
		}
		return h;
	} else return bcf_hdr_read((BGZF*)fp->fp);
}