Ejemplo n.º 1
0
/*
 * Build a BAM header from a text list of reference sequences.
 *
 * Each record in `fn` is read as two delimiter-separated fields, a sequence
 * name followed by its length; the remainder of the line is skipped.
 * Passing "-" as `fn` reads from stdin.
 *
 * Returns the header built by hash2header(), or NULL when `fn` is NULL or
 * the input cannot be opened.  When a name occurs more than once, only its
 * first occurrence is kept.
 */
bam_header_t *sam_header_read2(const char *fn)
{
	bam_header_t *header;
	int c, dret, ret;
	gzFile fp;
	kstream_t *ks;
	kstring_t *str;
	kh_ref_t *hash;
	khiter_t k;
	if (fn == 0) return 0;
	fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r");
	/* an assert() would disappear under NDEBUG; fail explicitly instead */
	if (fp == 0) return 0;
	hash = kh_init(ref);
	ks = ks_init(fp);
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	while (ks_getuntil(ks, 0, str, &dret) > 0) {
		char *s = strdup(str->s);
		int len, i;
		i = kh_size(hash); /* index this name receives if it is new */
		ks_getuntil(ks, 0, str, &dret);
		len = atoi(str->s);
		k = kh_put(ref, hash, s, &ret);
		if (ret == 0) {
			/* Name already present: the table keeps its original key, so
			 * the copy must be released, and `i` does not correspond to
			 * a new entry, so the stored value is left untouched. */
			free(s);
		} else {
			/* length in the high 32 bits, insertion index in the low 32 */
			kh_value(hash, k) = (uint64_t)len<<32 | i;
		}
		/* discard anything after the length field on this line */
		if (dret != '\n')
			while ((c = ks_getc(ks)) != '\n' && c != -1);
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
	header = hash2header(hash);
	kh_destroy(ref, hash);
	return header;
}
Ejemplo n.º 2
0
/*
 * Read a reference list (name, then length, per record) from `fn` -- "-"
 * selects stdin -- render it as "@SQ" header lines and hand that text to
 * sam_hdr_parse().  Returns 0 when `fn` is null or cannot be opened.
 */
bam_header_t *sam_header_read2(const char *fn)
{
    kstring_t sq_text = { 0, 0, NULL };
    kstring_t *field;
    kstream_t *stream;
    bam_header_t *parsed;
    gzFile in;
    int ch, delim, n_seqs = 0;

    if (fn == 0)
        return 0;
    if (strcmp(fn, "-") == 0)
        in = gzdopen(fileno(stdin), "r");
    else
        in = gzopen(fn, "r");
    if (in == 0)
        return 0;

    stream = ks_init(in);
    field = (kstring_t*)calloc(1, sizeof(kstring_t));
    for (;;) {
        if (ks_getuntil(stream, 0, field, &delim) <= 0)
            break;
        /* first field: sequence name */
        ksprintf(&sq_text, "@SQ\tSN:%s", field->s);
        /* second field: sequence length */
        ks_getuntil(stream, 0, field, &delim);
        ksprintf(&sq_text, "\tLN:%d\n", atoi(field->s));
        ++n_seqs;
        if (delim == '\n')
            continue;
        /* drop whatever follows the length on this line */
        do {
            ch = ks_getc(stream);
        } while (ch != '\n' && ch != -1);
    }
    ks_destroy(stream);
    gzclose(in);
    free(field->s);
    free(field);

    /* an empty list leaves sq_text.s unset, so pass "" in that case */
    parsed = sam_hdr_parse(sq_text.l, sq_text.s ? sq_text.s : "");
    free(sq_text.s);
    fprintf(pysamerr, "[sam_header_read2] %d sequences loaded.\n", n_seqs);
    return parsed;
}
Ejemplo n.º 3
0
Archivo: vcf.c Proyecto: 9beckert/TIR
/*
 * Load reference names from the dictionary file `fn` into a VCF handle.
 *
 * The first field of every line is registered in the handle's reference
 * hash and appended (NUL-terminated) to the header's name buffer; the rest
 * of the line is ignored.  The header is re-synchronised afterwards.
 *
 * Returns 0 on success or when `bp` is not a VCF handle, and -1 when `bp`,
 * `h` or `fn` is NULL or the file cannot be opened (in which case `h` is
 * left unmodified).
 */
int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
{
	vcf_t *v;
	gzFile fp;
	kstream_t *ks;
	kstring_t s, rn;
	int dret;
	if (bp == 0) return -1;
	if (!bp->is_vcf) return 0;
	if (h == 0 || fn == 0) return -1;
	fp = gzopen(fn, "r");
	if (fp == 0) return -1; /* do not hand a NULL file to the stream reader */
	s.l = s.m = 0; s.s = 0;
	/* continue appending to the header's existing name buffer */
	rn.m = rn.l = h->l_nm; rn.s = h->name;
	v = (vcf_t*)bp->v;
	ks = ks_init(fp);
	while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
		bcf_str2id_add(v->refhash, strdup(s.s));
		kputs(s.s, &rn); kputc('\0', &rn);
		/* skip the remaining columns of this line */
		if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
	}
	ks_destroy(ks);
	gzclose(fp);
	/* the buffer may have been reallocated while growing */
	h->l_nm = rn.l; h->name = rn.s;
	bcf_hdr_sync(h);
	free(s.s);
	return 0;
}
Ejemplo n.º 4
0
/*
 * uniq-dist: for a sorted list of (sequence, 1-based position) records,
 * print for each pair of consecutive positions on the same sequence the
 * number of mask bits set between them.
 *
 * argv[optind]   mask file, loaded with load_mask()
 * argv[optind+1] position list
 *
 * Returns 0 on success, 1 on a usage error or when the list cannot be opened.
 */
int main(int argc, char *argv[])
{
	gzFile fp;
	kstream_t *ks;
	khash_t(s) *hash;
	mask32_t *q = 0;
	kstring_t *str;
	int i, dret, c, last = 0;

	while ((c = getopt(argc, argv, "")) >= 0) {
	}
	if (argc <= optind + 1) {
		fprintf(stderr, "Usage: uniq-dist <in.mask.fa> <in-sorted.list>\n");
		return 1;
	}

	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	fprintf(stderr, "[uniq-dist] loading mask...\n");
	hash = load_mask(argv[optind]);
	fp = gzopen(argv[optind+1], "r");
	if (fp == 0) {
		fprintf(stderr, "[uniq-dist] fail to open '%s'\n", argv[optind+1]);
		free(str);
		return 1;
	}
	ks = ks_init(fp);
	fprintf(stderr, "[uniq-dist] calculating unique distance...\n");
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		khint_t k;
		mask32_t *p;
		int pos;
		k = kh_get(s, hash, str->s);
		p = (k != kh_end(hash))? &kh_val(hash, k) : 0;
		ks_getuntil(ks, 0, str, &dret);
		pos = atoi(str->s) - 1; /* convert to 0-based */
		if (p && pos >= 0 && pos < p->ori_len) {
			if (p != q) q = p; // change of reference
			else {
				if (last >= pos) {
					fprintf(stderr, "[uniq-dist] out of order: %s:%d <= %d\n", kh_key(hash, k), pos+1, last+1);
				} else {
					/* count the mask bits set in [last, pos) */
					for (i = last, c = 0; i < pos; ++i)
						if (p->mask[i/32] & 1u<<i%32) ++c;
					if (last > 0) printf("%s\t%d\t%d\t%d\n", kh_key(hash, k), last, pos, c);
				}
			}
			last = pos;
		}
		/* skip the rest of the line */
		if (dret != '\n')
			while ((c = ks_getc(ks)) != -1 && c != '\n');
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	// hash table is not freed...
	return 0;
}
Ejemplo n.º 5
0
/*
 * apply_mask_l: filter a list of (sequence, 1-based position, ...) records
 * against a mask.  A record is echoed when its position's mask bit is set,
 * or, with -c, when the bit is clear.  Records whose sequence is unknown or
 * whose position lies outside [1, ori_len] are never printed.
 *
 * Returns 0 on success, 1 on a usage error or when the list cannot be opened.
 */
int main(int argc, char *argv[])
{
	gzFile fp;
	kstream_t *ks;
	khash_t(s) *hash;
	kstring_t *str;
	int dret, c, complement = 0;

	while ((c = getopt(argc, argv, "c")) >= 0) {
		switch (c) {
			case 'c': complement = 1; break;
		}
	}
	if (argc <= optind + 1) {
		fprintf(stderr, "Usage: apply_mask_l [-c] <in.mask.fa> <in.list>\n");
		return 1;
	}

	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	fprintf(stderr, "[apply_mask_l] loading mask...\n");
	hash = load_mask(argv[optind]);
	fp = gzopen(argv[optind+1], "r");
	if (fp == 0) {
		fprintf(stderr, "[apply_mask_l] fail to open '%s'\n", argv[optind+1]);
		free(str);
		return 1;
	}
	ks = ks_init(fp);
	fprintf(stderr, "[apply_mask_l] filtering list...\n");
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		khint_t k;
		mask32_t *p;
		int pos, do_print = 0;
		k = kh_get(s, hash, str->s);
		p = (k != kh_end(hash))? &kh_val(hash, k) : 0;
		ks_getuntil(ks, 0, str, &dret);
		pos = atoi(str->s) - 1; /* convert to 0-based */
		/* pos < 0 (position 0 or non-numeric text) would make the shift
		 * count negative, so such records are rejected up front */
		if (p && pos >= 0 && pos < p->ori_len) {
			int masked = (p->mask[pos/32] & 1u<<pos%32) != 0;
			do_print = complement? !masked : masked;
		}
		if (do_print) printf("%s\t%d", kh_key(hash, k), pos + 1);
		/* copy (or just consume) the remaining columns of the line */
		if (dret != '\n') {
			if (do_print) putchar('\t');
			while ((c = ks_getc(ks)) != -1 && c != '\n')
				if (do_print) putchar(c);
		}
		if (do_print) putchar('\n');
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	// hash table is not freed...
	return 0;
}
Ejemplo n.º 6
0
/*
 * Read a region list from `fn` ("-" selects stdin) into a hash keyed by
 * sequence name.
 *
 * Each line holds a name optionally followed by one or two numeric columns.
 * Every line appends one 64-bit entry (begin in the high 32 bits, end in the
 * low bits) to the list stored under its name:
 *   - no numeric column: the range [0, INT_MAX);
 *   - a single positive coordinate: treated as 1-based and stored as
 *     [beg-1, beg);
 *   - two columns: stored as given.
 *
 * Returns the hash, or NULL when the input cannot be opened.
 */
reghash_t *stk_reg_read(const char *fn)
{
	reghash_t *h;
	gzFile fp;
	kstream_t *ks;
	int dret;
	kstring_t *str;
	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
	if (fp == 0) return 0; /* nothing allocated yet, so nothing to release */
	h = kh_init(reg);
	// read the list
	str = calloc(1, sizeof(kstring_t));
	ks = ks_init(fp);
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		int beg = -1, end = -1;
		reglist_t *p;
		khint_t k = kh_get(reg, h, str->s);
		if (k == kh_end(h)) {
			/* first time this name is seen: insert an empty list */
			int ret;
			char *s = strdup(str->s);
			k = kh_put(reg, h, s, &ret);
			memset(&kh_val(h, k), 0, sizeof(reglist_t));
		}
		p = &kh_val(h, k);
		if (dret != '\n') {
			/* cast: <ctype.h> functions require an unsigned char value */
			if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
				beg = atoi(str->s);
				if (dret != '\n') {
					if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit((unsigned char)str->s[0])) {
						end = atoi(str->s);
						if (end < 0) end = -1;
					}
				}
			}
		}
		// skip the rest of the line
		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
		if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column
		if (beg < 0) beg = 0, end = INT_MAX;
		if (p->n == p->m) {
			/* grow geometrically; each entry is 8 bytes */
			p->m = p->m? p->m<<1 : 4;
			p->a = realloc(p->a, p->m * 8);
		}
		p->a[p->n++] = (uint64_t)beg<<32 | end;
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	return h;
}
Ejemplo n.º 7
0
/*
 * Fetch the next record from `pf` into `r`.
 *
 * Lines are read one at a time; a line that paf_parse() rejects (negative
 * result) is skipped and the next one is tried.  Returns the non-negative
 * paf_parse() result for the first accepted line, or the negative value
 * from ks_getuntil() once no further line can be read.
 */
int paf_read(paf_file_t *pf, paf_rec_t *r)
{
	int status, delim;

	for (;;) {
		status = ks_getuntil((kstream_t*)pf->fp, KS_SEP_LINE, &pf->buf, &delim);
		if (status < 0)
			return status;
		status = paf_parse(pf->buf.l, pf->buf.s, r);
		if (status >= 0)
			return status;
	}
}
Ejemplo n.º 8
0
/*
 * Build a BAM header from a text list of reference sequences.
 *
 * Each record in `fn` is read as two delimiter-separated fields, a sequence
 * name followed by its length; the remainder of the line is skipped.
 *
 * Returns the header built by hash2header(), or NULL when `fn` is NULL, the
 * file cannot be opened, or a sequence name occurs more than once (each
 * duplicate is reported through Rprintf).
 */
bam_header_t *sam_header_read2(const char *fn)
{
	bam_header_t *header;
	int c, dret, ret, error = 0;
	gzFile fp;
	kstream_t *ks;
	kstring_t *str;
	kh_ref_t *hash;
	khiter_t k;
	if (fn == 0) return 0;
	fp = gzopen(fn, "r");
	if (fp == 0) return 0;
	hash = kh_init(ref);
	ks = ks_init(fp);
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	while (ks_getuntil(ks, 0, str, &dret) > 0) {
		int len, i;
		char *s = malloc(strlen(str->s) + 1);
		strcpy(s,str->s);

		i = kh_size(hash); /* index this name receives if it is new */
		ks_getuntil(ks, 0, str, &dret);
		len = atoi(str->s);
		k = kh_put(ref, hash, s, &ret);
		if (ret == 0) {
			Rprintf("[sam_header_read2] duplicated sequence name: %s\n", s);
			error = 1;
			/* the table kept its original key, so this copy is unowned */
			free(s);
		} else {
			/* length in the high 32 bits, insertion index in the low 32 */
			kh_value(hash, k) = (uint64_t)len<<32 | i;
		}
		/* discard anything after the length field on this line */
		if (dret != '\n')
			while ((c = ks_getc(ks)) != '\n' && c != -1);
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	Rprintf("[sam_header_read2] %d sequences loaded.\n", kh_size(hash));
	if (error) {
		/* no header will be built, so the keys and the table itself are
		 * still owned here and must be released before bailing out */
		for (k = kh_begin(hash); k != kh_end(hash); ++k)
			if (kh_exist(hash, k)) free((char*)kh_key(hash, k));
		kh_destroy(ref, hash);
		return 0;
	}
	header = hash2header(hash);
	kh_destroy(ref, hash);
	return header;
}
Ejemplo n.º 9
0
Archivo: prob1.c Proyecto: NCIP/alview
/*
 * Read an allele-frequency-spectrum prior from `fn` ("-" selects stdin).
 *
 * Every line starting with "[afs] " is expected to carry M+1 entries of the
 * form "<k><sep><value>"; the values are accumulated into ma->phi[M-k].
 * After reading, phi is normalised to sum to one, the prior together with
 * the derived heterozygosity and theta is printed to stderr, and the indel
 * prior is set via bcf_p1_indel_prior().
 *
 * Returns 0 on success, -1 when the file cannot be opened or a number in an
 * "[afs]" line fails to convert.
 */
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn)
{
	gzFile fp;
	kstring_t s;
	kstream_t *ks;
	long double sum;
	int dret, k, status = 0;
	memset(&s, 0, sizeof(kstring_t));
	fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
	if (fp == 0) return -1;
	ks = ks_init(fp);
	memset(ma->phi, 0, sizeof(double) * (ma->M + 1));
	while (status == 0 && ks_getuntil(ks, '\n', &s, &dret) >= 0) {
		if (strstr(s.s, "[afs] ") == s.s) {
			char *p = s.s + 6;
			for (k = 0; k <= ma->M; ++k) {
				int x;
				double y;
				/* errno is only meaningful if cleared before the call */
				errno = 0;
				x = strtol(p, &p, 10);
				if (x != k && (errno == EINVAL || errno == ERANGE)) { status = -1; break; }
				/* step over the separator, but never past the terminator */
				if (*p) ++p;
				errno = 0;
				y = strtod(p, &p);
				if (y == 0. && (errno == EINVAL || errno == ERANGE)) { status = -1; break; }
				ma->phi[ma->M - k] += y;
			}
		}
	}
	/* single cleanup point so the error path releases everything too */
	ks_destroy(ks);
	gzclose(fp);
	free(s.s);
	if (status < 0) return -1;
	for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k];
	fprintf(stderr, "[prior]");
	for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum;
	for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]);
	fputc('\n', stderr);
	for (sum = 0., k = 1; k < ma->M; ++k) sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1));
	fprintf(stderr, "heterozygosity=%lf, ",  (double)sum);
	for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M;
	fprintf(stderr, "theta=%lf\n", (double)sum);
	bcf_p1_indel_prior(ma, MC_DEF_INDEL);
	return 0;
}
Ejemplo n.º 10
0
Archivo: vcf.c Proyecto: 9beckert/TIR
/*
 * Read the header of a VCF stream; non-VCF handles are delegated to
 * bcf_hdr_read().
 *
 * "##" meta lines are concatenated (newline-terminated) into the header
 * text.  Reading stops at the first line that starts with a single '#':
 * its tab-separated columns from the tenth onwards are stored as
 * NUL-separated sample names.  Lines shorter than two characters are
 * ignored.
 *
 * Returns the new header, or NULL if allocation fails or a non-'#' line is
 * met before the sample line.
 */
bcf_hdr_t *vcf_hdr_read(bcf_t *bp)
{
	kstring_t meta, smpl;
	int dret;
	vcf_t *v;
	bcf_hdr_t *h;
	if (!bp->is_vcf) return bcf_hdr_read(bp);
	h = calloc(1, sizeof(bcf_hdr_t));
	if (h == 0) return 0;
	v = (vcf_t*)bp->v;
	v->line.l = 0;
	memset(&meta, 0, sizeof(kstring_t));
	memset(&smpl, 0, sizeof(kstring_t));
	while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) {
		if (v->line.l < 2) continue;
		if (v->line.s[0] != '#') { // no sample line
			/* release what was collected so far instead of leaking it */
			free(meta.s); free(smpl.s); free(h);
			return 0;
		}
		if (v->line.s[1] == '#') {
			kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta);
		} else {
			int k;
			ks_tokaux_t aux;
			char *p;
			for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
				/* the first nine columns are fixed; the rest are samples */
				if (k >= 9) {
					kputsn(p, aux.p - p, &smpl);
					kputc('\0', &smpl);
				}
			}
			break;
		}
	}
	kputc('\0', &meta);
	h->name = 0;
	h->sname = smpl.s; h->l_smpl = smpl.l;
	h->txt = meta.s; h->l_txt = meta.l;
	bcf_hdr_sync(h);
	return h;
}
Ejemplo n.º 11
0
/*
 * seqtk mutfa: apply single-base substitutions to FASTA sequences.
 *
 * argv[1] is the input FASTA and argv[2] the substitution list; "-" selects
 * stdin for either.  The list is read first into a hash keyed by sequence
 * name; each sequence is then patched and printed with 60 bases per line.
 * Returns 1 after printing the usage text when fewer than two arguments
 * follow the command name.
 */
int stk_mutfa(int argc, char *argv[])
{
	khash_t(reg) *h = kh_init(reg);
	gzFile fp;
	kseq_t *seq;
	kstream_t *ks;
	int l, i, dret;
	kstring_t *str;
	khint_t k;
	if (argc < 3) {
		fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n");
		fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n");
		fprintf(stderr, "      'chr  1-based-pos  any  base-changed-to'.\n");
		return 1;
	}
	// read the list
	str = calloc(1, sizeof(kstring_t));
	fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r");
	ks = ks_init(fp);
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		// 1st col: sequence name, copied so it can serve as a hash key
		// NOTE(review): when the name is already in the hash this copy is
		// not inserted and does not appear to be freed -- confirm
		char *s = strdup(str->s);
		int beg = 0, ret;
		reglist_t *p;
		k = kh_get(reg, h, s);
		if (k == kh_end(h)) {
			// unseen name: insert it with a zeroed list
			k = kh_put(reg, h, s, &ret);
			memset(&kh_val(h, k), 0, sizeof(reglist_t));
		}
		p = &kh_val(h, k);
		if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col
		ks_getuntil(ks, 0, str, &dret); // 3rd col
		ks_getuntil(ks, 0, str, &dret); // 4th col
		// skip the rest of the line
		if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n');
		// only a single alphabetic character is accepted as a replacement
		if (isalpha(str->s[0]) && str->l == 1) {
			if (p->n == p->m) {
				// grow the list geometrically; entries are 8 bytes each
				p->m = p->m? p->m<<1 : 4;
				p->a = realloc(p->a, p->m * 8);
			}
			// 0-based position in the high 32 bits, new base in the low bits
			p->a[p->n++] = (uint64_t)beg<<32 | str->s[0];
		}
	}
	ks_destroy(ks);
	gzclose(fp);
	free(str->s); free(str);
	// mutfa
	fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r");
	seq = kseq_init(fp);
	while ((l = kseq_read(seq)) >= 0) {
		reglist_t *p;
		k = kh_get(reg, h, seq->name.s);
		if (k != kh_end(h)) {
			p = &kh_val(h, k);
			for (i = 0; i < p->n; ++i) {
				// unpack the position; substitutions past the end are ignored
				int beg = p->a[i]>>32;
				if (beg < seq->seq.l)
					seq->seq.s[beg] = (int)p->a[i];
			}
		}
		printf(">%s", seq->name.s);
		// the newline is emitted before every 60th base, including the first
		for (i = 0; i < l; ++i) {
			if (i%60 == 0) putchar('\n');
			putchar(seq->seq.s[i]);
		}
		putchar('\n');
	}
Ejemplo n.º 12
0
Archivo: arg.c Proyecto: lh3/fastARG
/*
 * Load an ARG from an already opened stream.
 *
 * Every record starts with a one-character tag:
 *   C / R : an edge "n1 n2 beg end n_mut mut...", with n1 > n2 >= 0;
 *   N     : two integers stored in a->n and a->m;
 *   S     : the root node id followed by a 0/1 string of length a->m;
 *   other : the rest of the line is ignored.
 *
 * Malformed input is reported on stderr and terminates the process with
 * exit(1).  The stream reader is destroyed before returning; `fp` itself is
 * not closed here.
 */
arg_t *arg_load(gzFile fp)
{
	kstream_t *ks;
	kstring_t *str;
	int dret, lineno = 0;
	arg_t *a;
	a = arg_init();
	ks = ks_init(fp);
	str = (kstring_t*)calloc(1, sizeof(kstring_t));
	while (ks_getuntil(ks, 0, str, &dret) >= 0) {
		int n1, n2, i;
		++lineno;
		if (str->l != 1) {
			fprintf(stderr, "[arg_load] invalid initial character at line %d\n", lineno);
			exit(1);
		}
		if (str->s[0] == 'C' || str->s[0] == 'R') {
			argnode_t *an1, *an2;
			int beg, end, n_mut;
			// read
			ks_getuntil(ks, 0, str, &dret); n1 = atoi(str->s);
			ks_getuntil(ks, 0, str, &dret); n2 = atoi(str->s);
			/* n2 < 0 would index before the start of a->node below */
			if (n2 < 0 || n1 <= n2) {
				fprintf(stderr, "[arg_load] invalid edge (%d)\n", lineno);
				exit(1);
			}
			// add to the node array
			if (n1 + 1 > a->max_size) {
				int old_size = a->max_size;
				a->max_size = n1 + 1;
				kroundup32(a->max_size);
				a->node = (argnode_t*)realloc(a->node, sizeof(argnode_t) * a->max_size);
				/* zero only the newly added tail */
				memset(a->node + old_size, 0, (a->max_size - old_size) * sizeof(argnode_t));
			}
			an1 = a->node + n1; an2 = a->node + n2;
			an1->nid = n1; an2->nid = n2;
			if (an1->n_nei == 3 || an2->n_nei == 3) {
				fprintf(stderr, "[arg_load] multifurcated node: %d or %d (%d)\n", n1, n2, lineno);
				exit(1);
			}
			an1->nei[an1->n_nei++] = n2; an2->nei[an2->n_nei++] = n1;
			// read intervals
			ks_getuntil(ks, 0, str, &dret); beg = atoi(str->s);
			ks_getuntil(ks, 0, str, &dret); end = atoi(str->s);
			if (an2->end) { // a recombination node
				if (an2->end == beg) an2->x = an2->end, an2->end = end;
				else if (an2->beg == end) { // and also swap
					int x = an2->nei[1]; an2->nei[1] = an2->nei[2]; an2->nei[2] = x;
					an2->x = an2->beg, an2->beg = beg;
				} else {
					fprintf(stderr, "[arg_load] inconsisten interval at node %d (%d)\n", n2, lineno);
					exit(1);
				}
			} else an2->beg = beg, an2->end = end;
			// read mutations
			ks_getuntil(ks, 0, str, &dret); n_mut = atoi(str->s);
			/* a negative count must not reach the size computation */
			if (n_mut > 0) {
				an2->mut = (int*)realloc(an2->mut, sizeof(int) * (an2->n_mut + n_mut));
				for (i = 0; i < n_mut; ++i) {
					ks_getuntil(ks, 0, str, &dret);
					an2->mut[an2->n_mut++] = atoi(str->s);
				}
			}
		} else if (str->s[0] == 'N') {
			ks_getuntil(ks, 0, str, &dret); a->n = atoi(str->s);
			ks_getuntil(ks, 0, str, &dret); a->m = atoi(str->s);
		} else if (str->s[0] == 'S') {
			ks_getuntil(ks, 0, str, &dret); a->root = atoi(str->s);
			ks_getuntil(ks, 0, str, &dret);
			if (str->l != a->m) {
				fprintf(stderr, "[arg_load] inconsistent root sequence (%d)\n", lineno);
				exit(1);
			}
			/* one bit per site, packed into 64-bit words */
			a->rootseq = (uint64_t*)calloc((a->m + 63) / 64, 8);
			for (i = 0; i < a->m; ++i)
				if (str->s[i] == '1') arg_setseq1(a->rootseq, i);
		} else ks_getuntil(ks, '\n', str, &dret);
	}
	free(str->s); free(str);
	ks_destroy(ks);
	return a;
}
Ejemplo n.º 13
0
/*
 * ace2sam: convert an ACE assembly file to SAM.
 *
 * Options: -p writes padded SAM, -c also writes the contig sequence as a
 * SAM record.  Per the usage text, the headerless SAM goes to stdout and
 * the header to stderr.  Returns 1 after printing usage when no input file
 * is given.
 */
int main(int argc, char *argv[])
{
    gzFile fp;
    kstream_t *ks;
    kstring_t s, t[N_TMPSTR];
    int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0;
    long m_cigar = 0, n_cigar = 0;
    unsigned *af, *cigar = 0;

    while ((c = getopt(argc, argv, "pc")) >= 0) {
        switch (c) {
            case 'p': is_padded = 1; break;
            case 'c': write_cns = 1; break;
        }
    }
    if (argc == optind) {
        fprintf(stderr, "\nUsage:   ace2sam [-pc] <in.ace>\n\n");
        fprintf(stderr, "Options: -p     output padded SAM\n");
        fprintf(stderr, "         -c     write the contig sequence in SAM\n\n");
        fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n");
        fprintf(stderr, "       2. The order of reads in AF and in RD must be identical\n");
        fprintf(stderr, "       3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n");
        fprintf(stderr, "       4. This program writes the headerless SAM to stdout and header to stderr\n\n");
        return 1;
    }

    // start with empty scratch strings and an empty AF array
    s.l = s.m = 0; s.s = 0;
    af_n = af_max = af_i = 0; af = 0;
    for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0;
    // NOTE(review): the stdin test inspects argv[1] while the file opened is
    // argv[optind]; these differ whenever options are given -- confirm intent
    fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
    ks = ks_init(fp);
    // the file is consumed token by token; each record type is recognised by its leading keyword
    while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
        if (strcmp(s.s, "CO") == 0) { // contig sequence
            kstring_t *cns;
            t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line
            af_n = af_i = 0; // reset the af array
            ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name
            ks_getuntil(ks, '\n', &s, &dret); // read the whole line
            while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence
            remove_pads(&t[1], &t[2]); // construct the unpadded sequence
            // compute the array for mapping padded positions to unpadded positions
            // ('*' marks a pad: it does not advance the unpadded coordinate)
            p2u = realloc(p2u, t[1].m * sizeof(int));
            for (i = k = 0; i < t[1].l; ++i) {
                p2u[i] = k;
                if (t[1].s[i] != '*') ++k;
            }
            // write out the SAM header and contig sequences
            // (header lines are prefixed "H ", sequence lines "S ", both on stderr)
            fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line
            cns = &t[is_padded?1:2];
            fprintf(stderr, "S >%s\n", t[0].s);
            for (i = 0; i < cns->l; i += LINE_LEN) {
                fputs("S ", stderr);
                for (k = 0; k < LINE_LEN && i + k < cns->l; ++k)
                    fputc(cns->s[i + k], stderr);
                fputc('\n', stderr);
            }

// __padded2cigar(sp): walk a padded sequence and emit alternating run
// lengths through write_cigar(): runs of non-'*' characters with op code 0,
// runs of '*' with op code 2 (the length is shifted left by 4 bits).
#define __padded2cigar(sp) do { \
        int i, l_M = 0, l_D = 0; \
        for (i = 0; i < sp.l; ++i) { \
            if (sp.s[i] == '*') { \
                if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
                ++l_D; l_M = 0; \
            } else { \
                if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
                ++l_M; l_D = 0; \
            } \
        } \
        if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \
        else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \
    } while (0)

            if (write_cns) { // write the consensus SAM line (dummy read)
                n_cigar = 0;
                if (is_padded) __padded2cigar(t[1]);
                else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4);
                // the SAM line is assembled in t[4]: name, flag 516, reference name, position 1, MAPQ 60
                kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]);
                // low 4 bits select the operator letter, the remaining bits are the run length
                for (i = 0; i < n_cigar; ++i) {
                    kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]);
                }
                kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]);
            }
        } else if (strcmp(s.s, "BQ") == 0) { // contig quality
Ejemplo n.º 14
0
/*
 * samtools bedcov: for every region of a BED file, report the summed
 * per-base pileup depth in each input alignment file.
 *
 * argv[optind] is the BED file; the remaining arguments are indexed
 * alignment files.  Each BED line is echoed to pysam_stdout followed by one
 * tab-separated count per alignment file.  -Q INT is stored as each input's
 * min_mapQ.
 *
 * Returns 0 on success, 1 on a usage error, and 2 when an alignment file,
 * its index or header, or the BED file cannot be read.
 */
int main_bedcov(int argc, char *argv[])
{
    gzFile fp;
    kstring_t str;
    kstream_t *ks;
    hts_idx_t **idx;
    aux_t **aux;
    int *n_plp, dret, i, n, c, min_mapQ = 0;
    int64_t *cnt;
    const bam_pileup1_t **plp;
    int usage = 0;

    sam_global_args ga = SAM_GLOBAL_ARGS_INIT;
    static const struct option lopts[] = {
        SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0),
        { NULL, 0, NULL, 0 }
    };

    while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) {
        switch (c) {
        case 'Q': min_mapQ = atoi(optarg); break;
        default:  if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break;
                  /* else fall-through */
        case '?': usage = 1; break;
        }
        if (usage) break;
    }
    if (usage || optind + 2 > argc) {
        fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n");
        fprintf(pysam_stderr, "  -Q INT       Only count bases of at least INT quality [0]\n");
        sam_global_opt_help(pysam_stderr, "-.--.");
        return 1;
    }
    memset(&str, 0, sizeof(kstring_t));
    n = argc - optind - 1; /* number of alignment files */
    aux = calloc(n, sizeof(aux_t*));
    idx = calloc(n, sizeof(hts_idx_t*));
    for (i = 0; i < n; ++i) {
        aux[i] = calloc(1, sizeof(aux_t));
        aux[i]->min_mapQ = min_mapQ;
        aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in);
        if (aux[i]->fp)
            idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
        if (aux[i]->fp == 0 || idx[i] == 0) {
            fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
            return 2;
        }
        // TODO bgzf_set_cache_size(aux[i]->fp, 20);
        aux[i]->header = sam_hdr_read(aux[i]->fp);
        if (aux[i]->header == NULL) {
            fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n",
                    argv[i+optind+1]);
            return 2;
        }
    }
    cnt = calloc(n, 8);

    fp = gzopen(argv[optind], "rb");
    if (fp == 0) {
        fprintf(pysam_stderr, "ERROR: fail to open BED file '%s'\n", argv[optind]);
        return 2;
    }
    ks = ks_init(fp);
    n_plp = calloc(n, sizeof(int));
    plp = calloc(n, sizeof(bam_pileup1_t*));
    while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
        char *p, *q;
        int tid, beg, end, pos;
        bam_mplp_t mplp;

        /* column 1: reference name, resolved against the first header */
        for (p = q = str.s; *p && *p != '\t'; ++p);
        if (*p != '\t') goto bed_error;
        *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t';
        if (tid < 0) goto bed_error;
        /* column 2: start */
        for (q = p = p + 1; isdigit(*p); ++p);
        if (*p != '\t') goto bed_error;
        *p = 0; beg = atoi(q); *p = '\t';
        /* column 3: end, followed by a tab or the end of the line */
        for (q = p = p + 1; isdigit(*p); ++p);
        if (*p == '\t' || *p == 0) {
            int saved = *p;
            *p = 0; end = atoi(q); *p = saved;
        } else goto bed_error;

        for (i = 0; i < n; ++i) {
            if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
            aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
        }
        mplp = bam_mplp_init(n, read_bam, (void**)aux);
        bam_mplp_set_maxcnt(mplp, 64000);
        memset(cnt, 0, 8 * n);
        /* only pileup columns inside [beg, end) contribute */
        while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
            if (pos >= beg && pos < end)
                for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
        for (i = 0; i < n; ++i) {
            kputc('\t', &str);
            kputl(cnt[i], &str);
        }
        /* two statements: the operands of '&' have no guaranteed order */
        fputs(str.s, pysam_stdout);
        fputc('\n', pysam_stdout);
        bam_mplp_destroy(mplp);
        continue;

bed_error:
        fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s);
    }
    free(n_plp); free(plp);
    ks_destroy(ks);
    gzclose(fp);

    free(cnt);
    for (i = 0; i < n; ++i) {
        if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
        hts_idx_destroy(idx[i]);
        bam_hdr_destroy(aux[i]->header);
        sam_close(aux[i]->fp);
        free(aux[i]);
    }
    free(aux); free(idx);
    free(str.s);
    sam_global_args_free(&ga);
    return 0;
}
Ejemplo n.º 15
0
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) {
  cerr << "Input: " << input << "\tOutput: "<<output<<endl;

  kstream_t *ks;
  kstring_t str = {0,0,0};    
  gzFile fp = gzopen(input, "r");
  VarBuffer vbuf(1000);
  int prev_rid = -1;
  if(fp==NULL) {
    fprintf(stderr,"problem opening %s\n",input);
    exit(1);
  }

  char *out_fname = (char *)malloc(strlen(output)+5);
  strcpy(out_fname,output);
  strcat(out_fname,".tmp");
  if(fileexists(out_fname)) {
    fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname);
    exit(1);
  }
  printf("depth: %s\n",out_fname);
  gzFile depth_fp = gzopen(out_fname, "wb1");
  strcpy(out_fname,output);
  strcat(out_fname,".bcf");
  if(fileexists(out_fname)) {
    fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname);
    exit(1);
  }
  printf("variants: %s\n",out_fname);
  htsFile *variant_fp=hts_open(out_fname,"wb1");
  if(variant_fp==NULL) {
    fprintf(stderr,"problem opening %s\n",input);
    exit(1);    
  }

  ks = ks_init(fp);
  htsFile *hfp=hts_open(input, "r");
  bcf_hdr_t *hdr_in =  bcf_hdr_read(hfp);
  hts_close(hfp);
  //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R)
  bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD");
  assert(  bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0);

  //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer)
  bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ");
  assert(  bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0);


  //  bcf_hdr_t  *hdr_out=hdr_in;
  bcf_hdr_t *hdr_out =  bcf_hdr_dup(hdr_in);
  remove_hdr_lines(hdr_out,BCF_HL_INFO);
  remove_hdr_lines(hdr_out,BCF_HL_FLT);
  bcf_hdr_sync(hdr_out);

  //here we add FORMAT/PF. which is the pass filter flag for alts.
  assert(  bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0);

  args_t *norm_args = init_vcfnorm(hdr_out,ref);
  norm_args->check_ref |= CHECK_REF_WARN;
  bcf1_t *bcf_rec = bcf_init();
  bcf_hdr_write(variant_fp, hdr_out);
  kstring_t work1 = {0,0,0};            
  int buf[5];
  ks_tokaux_t aux;
  int ndec=0;
  int ref_len,alt_len;
  while(    ks_getuntil(ks, '\n', &str, 0) >=0) {
    //    fprintf(stderr,"%s\n",str.s);
    if(str.s[0]!='#')  {
      char *ptr = kstrtok(str.s,"\t",&aux);//chrom
      ptr = kstrtok(NULL,NULL,&aux);//pos
      work1.l=0;
      kputsn(str.s,ptr-str.s-1, &work1);   
      buf[0] =  bcf_hdr_name2id(hdr_in, work1.s);
      assert(      buf[0]>=0);
      buf[1]=atoi(ptr)-1;
      ptr = kstrtok(NULL,NULL,&aux);//ID
      ptr = kstrtok(NULL,NULL,&aux);//REF

      ref_len=0;
      while(ptr[ref_len]!='\t') ref_len++;

      ptr = kstrtok(NULL,NULL,&aux);//ALT

      bool is_variant=false;
      alt_len=0;
      while(ptr[alt_len]!='\t') alt_len++;
      if(ptr[0]!='.') 
	is_variant=true;

      char * QUAL_ptr = kstrtok(NULL, NULL, &aux);
      assert (QUAL_ptr != NULL);
      
      for(int i=0;i<2;i++)  ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO

      //find END if it is there
      char *end_ptr=strstr(ptr,"END=") ;
      if(end_ptr!=NULL) 
	buf[2]=atoi(end_ptr+4)-1;
      else
	buf[2]=buf[1]+alt_len-1;

      ptr  = kstrtok(NULL,NULL,&aux);//FORMAT
      //find index of DP (if present)
      //if not present, dont output anything (indels ignored)

      char *DP_ptr = find_format(ptr,"DP");
      int GQX = 0;
      int QUAL = 0;

      // AH: change code to use the minimum of GQ and QUAL fields if
      // GQX is not defined. See here:
      // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file
      // "GQXGenotype quality. GQX is the minimum of the GQ value
      // and the QUAL column. In general, these are similar values;
      // taking the minimum makes GQX the more conservative measure of
      // genotype quality."
      if(DP_ptr!=NULL) {
	buf[3]=atoi(DP_ptr);
	char *GQX_ptr = find_format(ptr,"GQX");
	if (GQX_ptr == NULL) 
	  {
	    GQX_ptr = find_format(ptr,"GQ");
	    GQX = atoi(GQX_ptr);
	    if (QUAL_ptr[0] != '.') 
	      {
		QUAL = atoi(QUAL_ptr);
		if (QUAL < GQX)
		  GQX = QUAL;
	      }
	  }
	else
	  {
	    GQX = atoi(GQX_ptr);
	  }
	
	//trying to reduce entropy on GQ to get better compression performance.
	//1. rounds down to nearest 10. 
	//2. sets gq to min(gq,100). 
	buf[4]=GQX/10;
	buf[4]*=10;
	if(buf[4]>100) buf[4]=100;

	//	printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]);
	if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int)))
	  die("ERROR: problem writing "+(string)out_fname+".tmp");
      }
      if(is_variant) {//wass this a variant? if so write it out to the bcf
	norm_args->ntotal++;
	vcf_parse(&str,hdr_in,bcf_rec);
	//	cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl;
	if(prev_rid!=bcf_rec->rid) 
	  vbuf.flush(variant_fp,hdr_out);
	else
	  vbuf.flush(bcf_rec->pos,variant_fp,hdr_out);
	prev_rid=bcf_rec->rid;
	int32_t pass = bcf_has_filter(hdr_in, bcf_rec, ".");
	bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1);
	bcf_update_filter(hdr_out,bcf_rec,NULL,0);
	if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3
	  norm_args->nsplit++;
	  split_multiallelic_to_biallelics(norm_args,bcf_rec );
	  for(int i=0;i<norm_args->ntmp_lines;i++){
	    remove_info(norm_args->tmp_lines[i]);
	    if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH)
	      ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf);
	    else
	      if(exit_on_mismatch)
		die("vcf did not match the reference");
	      else
		norm_args->nskipped++;
	  }
	}
	else {
	  remove_info(bcf_rec);
	  if( realign(norm_args,bcf_rec) !=  ERR_REF_MISMATCH)
	    ndec+=decompose(bcf_rec,hdr_out,vbuf);
	  else
	    if(exit_on_mismatch)
	      die("vcf did not match the reference");
	    else
	      norm_args->nskipped++;
	}
	vbuf.flush(bcf_rec->pos,variant_fp,hdr_out);
      }
    }
  }
  vbuf.flush(variant_fp,hdr_out);
  bcf_hdr_destroy(hdr_in);
  bcf_hdr_destroy(hdr_out);
  bcf_destroy1(bcf_rec);
  ks_destroy(ks);
  gzclose(fp);
  gzclose(depth_fp);  
  free(str.s);
  free(work1.s);
  hts_close(variant_fp);
  destroy_data(norm_args);
  fprintf(stderr,"Variant lines   total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped);
  fprintf(stderr,"Decomposed %d MNPs\n", ndec);


  fprintf(stderr,"Indexing %s\n",out_fname);
  bcf_index_build(out_fname, BCF_LIDX_SHIFT);
  free(out_fname);
  return 0;
}
Ejemplo n.º 16
0
Archivo: vcf.c Proyecto: 9beckert/TIR
/*
 * Read the next record from `bp` into `b`.
 *
 * When bp->is_vcf is zero the call is forwarded to bcf_read().  Otherwise one
 * text line is read from v->ks and its tab-separated fields are decoded by
 * column index k:
 *   k == 0      reference name, mapped to an id through v->refhash; unseen
 *               names are added to the hash and appended to h->name
 *   k == 1      position, stored as atoi(field) - 1
 *   k == 5      quality (0 when the field does not start with a digit)
 *   k == 2..8   remaining fixed fields, appended NUL-terminated to b->str;
 *               bcf_sync(b) is called once field 8 has been appended
 *   k >= 9      per-sample data, written to slot k-9 of each b->gi[i].data
 *
 * Returns -1 if no line could be read, otherwise v->line.l + 1.
 *
 * NOTE(review): sample fields are decoded without validation (e.g. GT reads
 * q[0], q[1] and q[2] unconditionally) -- input is presumably expected to be
 * well-formed; confirm before feeding untrusted data.
 */
int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b)
{
	int dret, k, i, sync = 0;
	vcf_t *v = (vcf_t*)bp->v;
	char *p, *q;
	kstring_t str, rn;
	ks_tokaux_t aux, a2;
	if (!bp->is_vcf) return bcf_read(bp, h, b);
	v->line.l = 0;
	// `str` wraps the record's string buffer (restarted at length 0) so kputs() can grow it
	str.l = 0; str.m = b->m_str; str.s = b->str;
	// `rn` wraps the header's name buffer so new reference names can be appended to it
	rn.l = rn.m = h->l_nm; rn.s = h->name;
	if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1;
	b->n_smpl = h->n_smpl;
	for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) {
		// terminate the current field in place (aux.p presumably points at the field's end -- see klib kstrtok)
		*(char*)aux.p = 0;
		if (k == 0) { // reference name
			int tid = bcf_str2id(v->refhash, p);
			if (tid < 0) {
				// name not seen before: register it and remember that the header must be re-synced
				tid = bcf_str2id_add(v->refhash, strdup(p));
				kputs(p, &rn); kputc('\0', &rn);
				sync = 1;
			}
			b->tid = tid;
		} else if (k == 1) { // position; one is subtracted from the parsed value
			b->pos = atoi(p) - 1;
		} else if (k == 5) { // qual
			b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0;
		} else if (k <= 8) { // variable length strings
			kputs(p, &str); kputc('\0', &str);
			// write the (possibly reallocated) buffer back into the record after every append
			b->l_str = str.l; b->m_str = str.m; b->str = str.s;
			if (k == 8) bcf_sync(b);
		} else { // k >= 9: per-sample column; the sample index is k-9
			if (strncmp(p, "./.", 3) == 0) {
				// field starts with "./.": fill this sample's slot of every known format field with a fixed value
				for (i = 0; i < b->n_gi; ++i) {
					if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
						// presumably bit 7 marks a missing genotype -- TODO confirm against the BCF spec in use
						((uint8_t*)b->gi[i].data)[k-9] = 1<<7;
					} else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
						((uint8_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
						((int32_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
						((uint16_t*)b->gi[i].data)[k-9] = 0;
					} else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
						// y = n_alleles*(n_alleles+1)/2 values per sample, one byte each
						int y = b->n_alleles * (b->n_alleles + 1) / 2;
						memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y);
					} else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
						// same count as PL, but stored as 4-byte floats
						int y = b->n_alleles * (b->n_alleles + 1) / 2;
						memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4);
					}
				}
				goto endblock;
			}
			// split the sample column on ':'; the i-th sub-field is decoded according to b->gi[i].fmt.
			// Sub-fields are not NUL-terminated here; strtol()/strtod() stop at the first non-numeric character.
			for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) {
				if (b->gi[i].fmt == bcf_str2int("GT", 2)) {
					// packs (q[0]-'0') at bits 3 and up, (q[2]-'0') in the low bits, and sets bit 6 when q[1] is not '/'
					((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6;
				} else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) {
					// add .499 before truncating, then cap at 255 to fit one byte
					double _x = strtod(q, &q);
					int x = (int)(_x + .499);
					if (x > 255) x = 255;
					((uint8_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("SP", 2)) {
					// capped at 0xffff although the slot is 32 bits wide
					int x = strtol(q, &q, 10);
					if (x > 0xffff) x = 0xffff;
					((uint32_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("DP", 2)) {
					int x = strtol(q, &q, 10);
					if (x > 0xffff) x = 0xffff;
					((uint16_t*)b->gi[i].data)[k-9] = x;
				} else if (b->gi[i].fmt == bcf_str2int("PL", 2)) {
					int x, y, j;
					uint8_t *data = (uint8_t*)b->gi[i].data;
					y = b->n_alleles * (b->n_alleles + 1) / 2;
					for (j = 0; j < y; ++j) {
						x = strtol(q, &q, 10);
						if (x > 255) x = 255;
						data[(k-9) * y + j] = x;
						++q; // step over the character following the number (presumably the ',' separator)
					}
				} else if (b->gi[i].fmt == bcf_str2int("GL", 2)) {
					int j, y;
					float x, *data = (float*)b->gi[i].data;
					y = b->n_alleles * (b->n_alleles + 1) / 2;
					for (j = 0; j < y; ++j) {
						x = strtod(q, &q);
						// positive inputs are stored as -x/10; non-positive inputs are stored unchanged
						data[(k-9) * y + j] = x > 0? -x/10. : x;
						++q; // step over the character following the number (presumably the ',' separator)
					}
				}
			}
		endblock: i = i; // no-op statement: a label must be followed by a statement
		}
	}
	// hand the (possibly grown) name buffer back to the header
	h->l_nm = rn.l; h->name = rn.s;
	if (sync) bcf_hdr_sync(h);
	return v->line.l + 1;
}
Ejemplo n.º 17
0
Archivo: vcf.c Proyecto: goshng/cocoa
/*
 * Read and parse the header of a VCF/BCF file.
 *
 * Binary input is forwarded to bcf_hdr_read().  For text input, header lines
 * are accumulated up to and including the first line that starts with a
 * single '#' (the sample line).  When fp->fn_aux is set, one
 * "##contig=<ID=NAME,length=LEN>" line is synthesised from the first two
 * fields of every line of that file (fields are split with klib's separator
 * 0, i.e. whitespace) and inserted immediately before the sample line.
 * After parsing, every sequence name found in the tabix index of fp->fn but
 * missing from the header is added as a contig with length "-1".
 *
 * Returns the new header, or 0 on failure:
 *   - a non-empty line not starting with '#' precedes the sample line,
 *   - fp->fn_aux cannot be opened,
 *   - the input contains no header text at all.
 * On failure everything allocated here is released.
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
	kstring_t txt = { 0, 0, 0 }, *s;
	bcf_hdr_t *h;
	tbx_t *idx;

	if (fp->is_bin) return bcf_hdr_read((BGZF*)fp->fp);

	s = &fp->line;
	h = bcf_hdr_init();
	while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
		if (s->l == 0) continue;
		if (s->s[0] != '#') {
			if (hts_verbose >= 2)
				fprintf(stderr, "[E::%s] no sample line\n", __func__);
			goto fail;
		}
		if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
			int dret;
			gzFile f;
			kstream_t *ks;
			kstring_t tmp = { 0, 0, 0 };
			f = gzopen(fp->fn_aux, "r");
			if (f == 0) {
				// previously a NULL handle was passed on to ks_init()/gzclose()
				if (hts_verbose >= 2)
					fprintf(stderr, "[E::%s] fail to open file '%s'\n", __func__, fp->fn_aux);
				goto fail;
			}
			ks = ks_init(f);
			while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
				int c;
				kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
				ks_getuntil(ks, 0, &tmp, &dret);
				kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
				kputsn(">\n", 2, &txt);
				if (dret != '\n')
					while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
			}
			free(tmp.s);
			ks_destroy(ks);
			gzclose(f);
		}
		kputsn(s->s, s->l, &txt);
		if (s->s[1] != '#') break; // sample line reached: the header is complete
		kputc('\n', &txt);
	}
	if (txt.s == 0) {
		// nothing was read; do not hand a NULL text buffer to the parser
		if (hts_verbose >= 2)
			fprintf(stderr, "[E::%s] could not read the header\n", __func__);
		goto fail;
	}
	h->l_text = txt.l + 1; // including NULL
	h->text = txt.s;       // ownership of the buffer moves to the header
	bcf_hdr_parse(h);

	// check tabix index, are all contigs listed in the header? add the missing ones
	idx = tbx_index_load(fp->fn);
	if (idx) {
		int i, n, need_sync = 0;
		const char **names = tbx_seqnames(idx, &n);
		for (i = 0; i < n; i++) {
			bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
			if (hrec) continue;
			hrec = (bcf_hrec_t*) calloc(1, sizeof(bcf_hrec_t));
			hrec->key = strdup("contig");
			bcf_hrec_add_key(hrec, "ID", strlen("ID"));
			bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
			bcf_hrec_add_key(hrec, "length", strlen("length"));
			bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0);   // what is a good default value?
			bcf_hdr_add_hrec(h, hrec);
			need_sync = 1;
		}
		free(names);
		tbx_destroy(idx);
		if (need_sync) {
			bcf_hdr_sync(h);
			bcf_hdr_fmt_text(h);
		}
	}
	return h;

fail:
	free(txt.s);
	bcf_hdr_destroy(h);
	return 0;
}
Ejemplo n.º 18
0
/*
 * samtools bedcov: for every interval of a BED file, print the interval line
 * followed by one tab-separated number per input alignment file.  The number
 * is the sum of the pileup sizes reported by bam_mplp_auto() over the
 * positions pos with beg <= pos < end.
 *
 * Usage: samtools bedcov [-Q minMapQ] <in.bed> <in1.bam> [...]
 *
 * -Q is stored in each aux[i]->min_mapQ; the filtering itself is presumably
 * done by the read_bam callback -- confirm there.
 *
 * Returns 0 on success, 1 on a usage error, and 2 when an alignment file, its
 * index, its header or the BED file cannot be opened or read.  Malformed BED
 * lines are reported on stderr and skipped.
 */
int main_bedcov(int argc, char *argv[])
{
    gzFile fp;
    kstring_t str;
    kstream_t *ks;
    hts_idx_t **idx;
    aux_t **aux;
    int *n_plp, dret, i, n, c, min_mapQ = 0;
    int64_t *cnt;
    const bam_pileup1_t **plp;

    while ((c = getopt(argc, argv, "Q:")) >= 0) {
        switch (c) {
        case 'Q': min_mapQ = atoi(optarg); break;
        }
    }
    if (optind + 2 > argc) {
        fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n");
        return 1;
    }
    memset(&str, 0, sizeof(kstring_t));
    n = argc - optind - 1; // number of alignment files; at least 1 after the check above
    aux = calloc(n, sizeof(aux_t*));
    idx = calloc(n, sizeof(hts_idx_t*));
    for (i = 0; i < n; ++i) {
        aux[i] = calloc(1, sizeof(aux_t));
        aux[i]->min_mapQ = min_mapQ;
        aux[i]->fp = sam_open(argv[i+optind+1], "r");
        // only look for an index once the file itself is open; idx[i] is still 0 from calloc otherwise
        if (aux[i]->fp)
            idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]);
        if (aux[i]->fp == 0 || idx[i] == 0) {
            fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]);
            return 2;
        }
        // TODO bgzf_set_cache_size(aux[i]->fp, 20);
        aux[i]->header = sam_hdr_read(aux[i]->fp);
        if (aux[i]->header == 0) {
            // the header of the first file is dereferenced for every BED line below
            fprintf(stderr, "ERROR: fail to read the header of '%s'\n", argv[i+optind+1]);
            return 2;
        }
    }
    cnt = calloc(n, sizeof(int64_t));

    fp = gzopen(argv[optind], "rb");
    if (fp == 0) {
        // without this check an unreadable BED file silently produced no output
        fprintf(stderr, "ERROR: fail to open BED file '%s'\n", argv[optind]);
        return 2;
    }
    ks = ks_init(fp);
    n_plp = calloc(n, sizeof(int));
    plp = calloc(n, sizeof(bam_pileup1_t*));
    while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
        char *p, *q;
        int tid, beg, end, pos;
        bam_mplp_t mplp;

        // column 1: sequence name, resolved against the header of the first alignment file
        for (p = q = str.s; *p && *p != '\t'; ++p);
        if (*p != '\t') goto bed_error;
        *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t';
        if (tid < 0) goto bed_error;
        // column 2: start; the line is patched temporarily so atoi() sees just this field
        for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
        if (*p != '\t') goto bed_error;
        *p = 0; beg = atoi(q); *p = '\t';
        // column 3: end; may be followed by further columns or by the end of the line
        for (q = p = p + 1; isdigit((unsigned char)*p); ++p);
        if (*p == '\t' || *p == 0) {
            char saved = *p;
            *p = 0; end = atoi(q); *p = saved;
        } else goto bed_error;

        for (i = 0; i < n; ++i) {
            if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
            aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end);
        }
        mplp = bam_mplp_init(n, read_bam, (void**)aux);
        bam_mplp_set_maxcnt(mplp, 64000);
        memset(cnt, 0, sizeof(int64_t) * n);
        while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0)
            if (pos >= beg && pos < end) // positions outside [beg, end) are not counted
                for (i = 0; i < n; ++i) cnt[i] += n_plp[i];
        // append the per-file totals to the original line and print it
        for (i = 0; i < n; ++i) {
            kputc('\t', &str);
            kputl(cnt[i], &str);
        }
        puts(str.s);
        bam_mplp_destroy(mplp);
        continue;

bed_error:
        fprintf(stderr, "Errors in BED line '%s'\n", str.s);
    }
    free(n_plp); free(plp);
    ks_destroy(ks);
    gzclose(fp);

    free(cnt);
    for (i = 0; i < n; ++i) {
        if (aux[i]->iter) hts_itr_destroy(aux[i]->iter);
        hts_idx_destroy(idx[i]);
        bam_hdr_destroy(aux[i]->header);
        sam_close(aux[i]->fp);
        free(aux[i]);
    }
    free(aux); free(idx);
    free(str.s);
    return 0;
}
Ejemplo n.º 19
0
/*
 * naivepca: principal component analysis of a genotype matrix.
 *
 * Usage: naivepca [-m min_maf] <in.txt>     ("-" reads standard input)
 *
 * Input: one sample per line -- a name, then tabs/spaces, then a string with
 * one character per site.  A digit '0'..'9' is the genotype value; any other
 * character counts as missing.  Lines with a single field, or with a site
 * count different from the first accepted line, are skipped with a warning.
 *
 * Steps: (1) read the matrix, (2) centre each site on its mean mu and scale
 * it by sqrt(p*(1-p)) with p = mu / max genotype value, zeroing missing
 * entries and sites failing the -m filter, (3) form the sample-by-sample
 * product divided by the number of sites, (4) run n_eigen_symm() on it and
 * print, per sample, its name followed by n_rows tab-separated values.
 *
 * Returns 0 on success, 1 on a usage error, 2 if the input cannot be opened,
 * and 3 if the input contains no usable row.
 */
int main(int argc, char *argv[])
{
	int c, dret, lineno = 0, n_rows = 0, m_rows = 0, n_cols = 0, max_hap = 0;
	int64_t n_missing = 0, n_tot = 0;
	gzFile fp;
	kstream_t *ks;
	kstring_t str = {0,0,0};
	int8_t **C = 0;
	double **M, *X, min_maf = 0.0;
	char **names = 0;

	// Unmask the invalid-operation and divide-by-zero SSE exceptions so such arithmetic
	// traps instead of silently yielding NaN/Inf; overflow and underflow stay masked.
	_MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_DIV_ZERO));
	while ((c = getopt(argc, argv, "m:")) >= 0) {
		if (c == 'm') min_maf = atof(optarg);
	}
	if (argc - optind == 0) {
		fprintf(stderr, "Usage: naivepca [-m min_maf] <in.txt>\n");
		return 1;
	}

	fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r");
	if (fp == 0) {
		fprintf(stderr, "[E::%s] failed to open file '%s'. Abort.\n", __func__, argv[optind]);
		return 2;
	}
	ks = ks_init(fp);

	// read the matrix into C
	while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) {
		int8_t *q;
		char *p, *name = str.s;
		int i;
		++lineno;
		// split off the sample name, then skip the blanks before the genotype string
		for (p = str.s; *p && *p != '\t' && *p != ' '; ++p);
		if (*p) {
			*p++ = 0;
			for (; *p && (*p == '\t' || *p == ' '); ++p);
		}
		if (*p == 0) {
			fprintf(stderr, "[W::%s] line %d has one field; skipped.\n", __func__, lineno);
			continue;
		}
		// the first accepted line fixes the number of sites; later lines must match it
		if (n_cols != 0) {
			if (n_cols != str.s + str.l - p) {
				fprintf(stderr, "[W::%s] line %d has a different number of columns; skipped.\n", __func__, lineno);
				continue;
			}
		} else n_cols = str.s + str.l - p;
		if (n_rows == m_rows) {
			m_rows = m_rows? m_rows<<1 : 16;
			C = (int8_t**)realloc(C, m_rows * sizeof(int8_t*));
			names = (char**)realloc(names, m_rows * sizeof(char*));
		}
		names[n_rows] = strdup(name);
		// one int8_t per site (this used to be over-allocated with sizeof(double))
		q = C[n_rows++] = (int8_t*)calloc(n_cols, sizeof(int8_t));
		for (i = 0; i < n_cols; ++i) {
			if (p[i] >= '0' && p[i] <= '9') q[i] = p[i] - '0';
			else q[i] = -1, ++n_missing;
			max_hap = max_hap > q[i]? max_hap : q[i];
		}
		n_tot += n_cols;
	}
	free(str.s);
	if (n_rows == 0) {
		// With FP exceptions unmasked, the 0/0 in the statistics below would trap; stop cleanly instead.
		fprintf(stderr, "[E::%s] no usable rows in the input. Abort.\n", __func__);
		free(C); free(names);
		ks_destroy(ks);
		gzclose(fp);
		return 3;
	}
	fprintf(stderr, "[M::%s] read %d samples and %d sites; ploidy is %d\n", __func__, n_rows, n_cols, max_hap);
	// scaled by 100 so the number matches the '%' printed after it
	fprintf(stderr, "[M::%s] %.3f%% of genotypes are missing\n", __func__, 100.0 * (double)n_missing / n_tot);

	{ // normalize the matrix into M
		int i, j, *sum, *cnt, n_dropped = 0;
		double *mu, *pp;
		sum = (int*)calloc(n_cols, sizeof(int));
		cnt = (int*)calloc(n_cols, sizeof(int));
		mu = (double*)calloc(n_cols, sizeof(double));
		pp = (double*)calloc(n_cols, sizeof(double));
		for (i = 0; i < n_rows; ++i) {
			int8_t *q = C[i];
			for (j = 0; j < n_cols; ++j)
				if (q[j] >= 0) sum[j] += q[j], ++cnt[j];
		}
		for (j = 0; j < n_cols; ++j) {
			if (cnt[j] > 0) {
				mu[j] = (double)sum[j] / cnt[j];
				// max_hap is 0 when no non-zero genotype was seen; avoid the trapping 0/0
				pp[j] = max_hap > 0? mu[j] / max_hap : 0.;
				if (pp[j] < min_maf || 1. - pp[j] < min_maf) ++n_dropped;
			} else ++n_dropped;
		}
		fprintf(stderr, "[M::%s] %d rare sites are dropped\n", __func__, n_dropped);
		M = (double**)calloc(n_rows, sizeof(double*));
		for (i = 0; i < n_rows; ++i) {
			int8_t *q = C[i];
			double *r;
			r = M[i] = (double*)calloc(n_cols, sizeof(double));
			// missing entries, filtered sites and monomorphic sites (p == 0 or p == 1) contribute 0
			for (j = 0; j < n_cols; ++j)
				r[j] = q[j] < 0 || pp[j] < min_maf || 1. - pp[j] < min_maf || pp[j] == 0. || 1 - pp[j] == 0. ? 0. : (q[j] - mu[j]) / sqrt(pp[j] * (1. - pp[j]));
		}
		free(sum); free(cnt); free(mu); free(pp);
		for (i = 0; i < n_rows; ++i) free(C[i]);
		free(C);
	}

	{ // multiplication
		int i, j, k;
		X = (double*)calloc(n_rows * n_rows, sizeof(double));
		for (i = 0; i < n_rows; ++i) {
			double *zi = M[i];
			// the product is symmetric: compute the lower triangle and mirror it
			for (j = 0; j <= i; ++j) {
				double t = 0., *zj = M[j];
				for (k = 0; k < n_cols; ++k)
					t += zi[k] * zj[k];
				X[i*n_rows + j] = X[j*n_rows + i] = t / n_cols;
			}
		}
		for (i = 0; i < n_rows; ++i) free(M[i]);
		free(M);
	}

	{ // print eigenvectors
		double *ev;
		int i, j;
		evsrt_t *evsrt;
		ev = (double*)calloc(n_rows, sizeof(double));
		evsrt = (evsrt_t*)calloc(n_rows, sizeof(evsrt_t));
		n_eigen_symm(X, n_rows, ev);
		// pair each eigenvalue with its original index so columns can be emitted in sorted order
		for (i = 0; i < n_rows; ++i)
			evsrt[i].ev = ev[i], evsrt[i].i = i;
		ks_introsort(ev, n_rows, evsrt);
		for (i = 0; i < n_rows; ++i) {
			printf("%s", names[i]);
			for (j = 0; j < n_rows; ++j)
				printf("\t%.6f", X[i*n_rows + evsrt[j].i] * evsrt[j].ev);
			putchar('\n');
			free(names[i]);
		}
		free(ev); free(evsrt);
		free(X); free(names);
	}
	
	ks_destroy(ks);
	gzclose(fp);
	return 0;
}