bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, ret; gzFile fp; kstream_t *ks; kstring_t *str; kh_ref_t *hash; khiter_t k; hash = kh_init(ref); fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); assert(fp); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { char *s = strdup(str->s); int len, i; i = kh_size(hash); ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); header = hash2header(hash); kh_destroy(ref, hash); return header; }
bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, n_targets = 0; gzFile fp; kstream_t *ks; kstring_t *str; kstring_t samstr = { 0, 0, NULL }; if (fn == 0) return 0; fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); if (fp == 0) return 0; ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { ksprintf(&samstr, "@SQ\tSN:%s", str->s); ks_getuntil(ks, 0, str, &dret); ksprintf(&samstr, "\tLN:%d\n", atoi(str->s)); n_targets++; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); free(samstr.s); fprintf(pysamerr, "[sam_header_read2] %d sequences loaded.\n", n_targets); return header; }
int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn) { vcf_t *v; gzFile fp; kstream_t *ks; kstring_t s, rn; int dret; if (bp == 0) return -1; if (!bp->is_vcf) return 0; s.l = s.m = 0; s.s = 0; rn.m = rn.l = h->l_nm; rn.s = h->name; v = (vcf_t*)bp->v; fp = gzopen(fn, "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { bcf_str2id_add(v->refhash, strdup(s.s)); kputs(s.s, &rn); kputc('\0', &rn); if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret); } ks_destroy(ks); gzclose(fp); h->l_nm = rn.l; h->name = rn.s; bcf_hdr_sync(h); free(s.s); return 0; }
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; khash_t(s) *hash; mask32_t *q = 0; kstring_t *str; int i, dret, c, last = 0; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc <= optind + 1) { fprintf(stderr, "Usage: uniq-dist <in.mask.fa> <in-sorted.list>\n"); return 1; } str = (kstring_t*)calloc(1, sizeof(kstring_t)); fprintf(stderr, "[uniq-dist] loading mask...\n"); hash = load_mask(argv[optind]); fp = gzopen(argv[optind+1], "r"); ks = ks_init(fp); fprintf(stderr, "[uniq-dist] calculating unique distance...\n"); while (ks_getuntil(ks, 0, str, &dret) >= 0) { khint_t k; mask32_t *p; int pos; k = kh_get(s, hash, str->s); p = (k != kh_end(hash))? &kh_val(hash, k) : 0; ks_getuntil(ks, 0, str, &dret); pos = atoi(str->s) - 1; if (p && pos >= 0 && pos < p->ori_len) { if (p != q) q = p; // change of reference else { if (last >= pos) { fprintf(stderr, "[uniq-dist] out of order: %s:%d <= %d\n", kh_key(hash, k), pos+1, last+1); } else { for (i = last, c = 0; i < pos; ++i) if (p->mask[i/32] & 1u<<i%32) ++c; if (last > 0) printf("%s\t%d\t%d\t%d\n", kh_key(hash, k), last, pos, c); } } last = pos; } if (dret != '\n') while ((c = ks_getc(ks)) != -1 && c != '\n'); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // hash table is not freed... return 0; }
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; khash_t(s) *hash; kstring_t *str; int dret, c, complement = 0; while ((c = getopt(argc, argv, "c")) >= 0) { switch (c) { case 'c': complement = 1; break; } } if (argc <= optind + 1) { fprintf(stderr, "Usage: apply_mask_l [-c] <in.mask.fa> <in.list>\n"); return 1; } str = (kstring_t*)calloc(1, sizeof(kstring_t)); fprintf(stderr, "[apply_mask_l] loading mask...\n"); hash = load_mask(argv[optind]); fp = gzopen(argv[optind+1], "r"); ks = ks_init(fp); fprintf(stderr, "[apply_mask_l] filtering list...\n"); while (ks_getuntil(ks, 0, str, &dret) >= 0) { khint_t k; mask32_t *p; int pos, do_print = 0; k = kh_get(s, hash, str->s); p = (k != kh_end(hash))? &kh_val(hash, k) : 0; ks_getuntil(ks, 0, str, &dret); pos = atoi(str->s) - 1; if (complement == 0) { if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32)) do_print = 1; } else { if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32) == 0) do_print = 1; } if (do_print) printf("%s\t%d", kh_key(hash, k), pos + 1); if (dret != '\n') { if (do_print) putchar('\t'); while ((c = ks_getc(ks)) != -1 && c != '\n') if (do_print) putchar(c); } if (do_print) putchar('\n'); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // hash table is not freed... return 0; }
reghash_t *stk_reg_read(const char *fn) { reghash_t *h = kh_init(reg); gzFile fp; kstream_t *ks; int dret; kstring_t *str; // read the list str = calloc(1, sizeof(kstring_t)); fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { int beg = -1, end = -1; reglist_t *p; khint_t k = kh_get(reg, h, str->s); if (k == kh_end(h)) { int ret; char *s = strdup(str->s); k = kh_put(reg, h, s, &ret); memset(&kh_val(h, k), 0, sizeof(reglist_t)); } p = &kh_val(h, k); if (dret != '\n') { if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { beg = atoi(str->s); if (dret != '\n') { if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { end = atoi(str->s); if (end < 0) end = -1; } } } } // skip the rest of the line if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column if (beg < 0) beg = 0, end = INT_MAX; if (p->n == p->m) { p->m = p->m? p->m<<1 : 4; p->a = realloc(p->a, p->m * 8); } p->a[p->n++] = (uint64_t)beg<<32 | end; } ks_destroy(ks); gzclose(fp); free(str->s); free(str); return h; }
int paf_read(paf_file_t *pf, paf_rec_t *r) { int ret, dret; file_read_more: ret = ks_getuntil((kstream_t*)pf->fp, KS_SEP_LINE, &pf->buf, &dret); if (ret < 0) return ret; ret = paf_parse(pf->buf.l, pf->buf.s, r); if (ret < 0) goto file_read_more; return ret; }
bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, ret, error = 0; gzFile fp; kstream_t *ks; kstring_t *str; kh_ref_t *hash; khiter_t k; if (fn == 0) return 0; fp = gzopen(fn, "r"); if (fp == 0) return 0; hash = kh_init(ref); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { char *s = malloc(strlen(str->s) + 1); strcpy(s,str->s); int len, i; i = kh_size(hash); ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); if (ret == 0) { Rprintf("[sam_header_read2] duplicated sequence name: %s\n", s); error = 1; } kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); Rprintf("[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); if (error) return 0; header = hash2header(hash); kh_destroy(ref, hash); return header; }
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn) { gzFile fp; kstring_t s; kstream_t *ks; long double sum; int dret, k; memset(&s, 0, sizeof(kstring_t)); fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); memset(ma->phi, 0, sizeof(double) * (ma->M + 1)); while (ks_getuntil(ks, '\n', &s, &dret) >= 0) { if (strstr(s.s, "[afs] ") == s.s) { char *p = s.s + 6; for (k = 0; k <= ma->M; ++k) { int x; double y; x = strtol(p, &p, 10); if (x != k && (errno == EINVAL || errno == ERANGE)) return -1; ++p; y = strtod(p, &p); if (y == 0. && (errno == EINVAL || errno == ERANGE)) return -1; ma->phi[ma->M - k] += y; } } } ks_destroy(ks); gzclose(fp); free(s.s); for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k]; fprintf(stderr, "[prior]"); for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum; for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]); fputc('\n', stderr); for (sum = 0., k = 1; k < ma->M; ++k) sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1)); fprintf(stderr, "heterozygosity=%lf, ", (double)sum); for (sum = 0., k = 1; k <= ma->M; ++k) sum += k * ma->phi[ma->M - k] / ma->M; fprintf(stderr, "theta=%lf\n", (double)sum); bcf_p1_indel_prior(ma, MC_DEF_INDEL); return 0; }
bcf_hdr_t *vcf_hdr_read(bcf_t *bp) { kstring_t meta, smpl; int dret; vcf_t *v; bcf_hdr_t *h; if (!bp->is_vcf) return bcf_hdr_read(bp); h = calloc(1, sizeof(bcf_hdr_t)); v = (vcf_t*)bp->v; v->line.l = 0; memset(&meta, 0, sizeof(kstring_t)); memset(&smpl, 0, sizeof(kstring_t)); while (ks_getuntil(v->ks, '\n', &v->line, &dret) >= 0) { if (v->line.l < 2) continue; if (v->line.s[0] != '#') return 0; // no sample line if (v->line.s[0] == '#' && v->line.s[1] == '#') { kputsn(v->line.s, v->line.l, &meta); kputc('\n', &meta); } else if (v->line.s[0] == '#') { int k; ks_tokaux_t aux; char *p; for (p = kstrtok(v->line.s, "\t\n", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { if (k >= 9) { kputsn(p, aux.p - p, &smpl); kputc('\0', &smpl); } } break; } } kputc('\0', &meta); h->name = 0; h->sname = smpl.s; h->l_smpl = smpl.l; h->txt = meta.s; h->l_txt = meta.l; bcf_hdr_sync(h); return h; }
int stk_mutfa(int argc, char *argv[]) { khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; kstream_t *ks; int l, i, dret; kstring_t *str; khint_t k; if (argc < 3) { fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n"); fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n"); fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); return 1; } // read the list str = calloc(1, sizeof(kstring_t)); fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { char *s = strdup(str->s); int beg = 0, ret; reglist_t *p; k = kh_get(reg, h, s); if (k == kh_end(h)) { k = kh_put(reg, h, s, &ret); memset(&kh_val(h, k), 0, sizeof(reglist_t)); } p = &kh_val(h, k); if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col ks_getuntil(ks, 0, str, &dret); // 3rd col ks_getuntil(ks, 0, str, &dret); // 4th col // skip the rest of the line if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); if (isalpha(str->s[0]) && str->l == 1) { if (p->n == p->m) { p->m = p->m? p->m<<1 : 4; p->a = realloc(p->a, p->m * 8); } p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; } } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // mutfa fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; k = kh_get(reg, h, seq->name.s); if (k != kh_end(h)) { p = &kh_val(h, k); for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32; if (beg < seq->seq.l) seq->seq.s[beg] = (int)p->a[i]; } } printf(">%s", seq->name.s); for (i = 0; i < l; ++i) { if (i%60 == 0) putchar('\n'); putchar(seq->seq.s[i]); } putchar('\n'); }
arg_t *arg_load(gzFile fp) { kstream_t *ks; kstring_t *str; int dret, lineno = 0; arg_t *a; a = arg_init(); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) >= 0) { int n1, n2, i; ++lineno; if (str->l != 1) { fprintf(stderr, "[arg_load] invalid initial character at line %d\n", lineno); exit(1); } if (str->s[0] == 'C' || str->s[0] == 'R') { argnode_t *an1, *an2; int beg, end, n_mut; // read ks_getuntil(ks, 0, str, &dret); n1 = atoi(str->s); ks_getuntil(ks, 0, str, &dret); n2 = atoi(str->s); if (n1 <= n2) { fprintf(stderr, "[arg_load] invalid edge (%d)\n", lineno); exit(1); } // add to the node array if (n1 + 1 > a->max_size) { int old_size = a->max_size; a->max_size = n1 + 1; kroundup32(a->max_size); a->node = (argnode_t*)realloc(a->node, sizeof(argnode_t) * a->max_size); memset(a->node + old_size, 0, (a->max_size - old_size) * sizeof(argnode_t)); } an1 = a->node + n1; an2 = a->node + n2; an1->nid = n1; an2->nid = n2; if (an1->n_nei == 3 || an2->n_nei == 3) { fprintf(stderr, "[arg_load] multifurcated node: %d or %d (%d)\n", n1, n2, lineno); exit(1); } an1->nei[an1->n_nei++] = n2; an2->nei[an2->n_nei++] = n1; // read intervals ks_getuntil(ks, 0, str, &dret); beg = atoi(str->s); ks_getuntil(ks, 0, str, &dret); end = atoi(str->s); if (an2->end) { // a recombination node if (an2->end == beg) an2->x = an2->end, an2->end = end; else if (an2->beg == end) { // and also swap int x = an2->nei[1]; an2->nei[1] = an2->nei[2]; an2->nei[2] = x; an2->x = an2->beg, an2->beg = beg; } else { fprintf(stderr, "[arg_load] inconsisten interval at node %d (%d)\n", n2, lineno); exit(1); } } else an2->beg = beg, an2->end = end; // read mutations ks_getuntil(ks, 0, str, &dret); n_mut = atoi(str->s); if (n_mut) { an2->mut = (int*)realloc(an2->mut, sizeof(int) * (an2->n_mut + n_mut)); for (i = 0; i < n_mut; ++i) { ks_getuntil(ks, 0, str, &dret); an2->mut[an2->n_mut++] = atoi(str->s); } } // save interval and swap if necessary } else if (str->s[0] == 'N') { ks_getuntil(ks, 0, str, &dret); a->n = atoi(str->s); ks_getuntil(ks, 0, str, &dret); a->m = atoi(str->s); } else if (str->s[0] == 'S') { ks_getuntil(ks, 0, str, &dret); a->root = atoi(str->s); ks_getuntil(ks, 0, str, &dret); if (str->l != a->m) { fprintf(stderr, "[arg_load] inconsistent root sequence (%d)\n", lineno); exit(1); } a->rootseq = (uint64_t*)calloc((a->m + 63) / 64, 8); for (i = 0; i < a->m; ++i) if (str->s[i] == '1') arg_setseq1(a->rootseq, i); } else ks_getuntil(ks, '\n', str, &dret); } free(str->s); free(str); ks_destroy(ks); return a; }
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; kstring_t s, t[N_TMPSTR]; int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0; long m_cigar = 0, n_cigar = 0; unsigned *af, *cigar = 0; while ((c = getopt(argc, argv, "pc")) >= 0) { switch (c) { case 'p': is_padded = 1; break; case 'c': write_cns = 1; break; } } if (argc == optind) { fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n"); fprintf(stderr, "Options: -p output padded SAM\n"); fprintf(stderr, " -c write the contig sequence in SAM\n\n"); fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n"); fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n"); return 1; } s.l = s.m = 0; s.s = 0; af_n = af_max = af_i = 0; af = 0; for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; fp = strcmp(argv[1], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { if (strcmp(s.s, "CO") == 0) { // contig sequence kstring_t *cns; t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line af_n = af_i = 0; // reset the af array ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name ks_getuntil(ks, '\n', &s, &dret); // read the whole line while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence remove_pads(&t[1], &t[2]); // construct the unpadded sequence // compute the array for mapping padded positions to unpadded positions p2u = realloc(p2u, t[1].m * sizeof(int)); for (i = k = 0; i < t[1].l; ++i) { p2u[i] = k; if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; fprintf(stderr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { fputs("S ", stderr); for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) fputc(cns->s[i + k], stderr); fputc('\n', stderr); } #define __padded2cigar(sp) do { \ int i, l_M = 0, l_D = 0; \ for (i = 0; i < sp.l; ++i) { \ if (sp.s[i] == '*') { \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ ++l_D; l_M = 0; \ } else { \ if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ ++l_M; l_D = 0; \ } \ } \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ } while (0) if (write_cns) { // write the consensus SAM line (dummy read) n_cigar = 0; if (is_padded) __padded2cigar(t[1]); else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]); for (i = 0; i < n_cigar; ++i) { kputw(cigar[i]>>4, &t[4]); kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); } kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]); } } else if (strcmp(s.s, "BQ") == 0) { // contig quality
int main_bedcov(int argc, char *argv[]) { gzFile fp; kstring_t str; kstream_t *ks; hts_idx_t **idx; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; int usage = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), { NULL, 0, NULL, 0 } }; while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage = 1; break; } if (usage) break; } if (usage || optind + 2 > argc) { fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n"); fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n"); sam_global_opt_help(pysam_stderr, "-.--."); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); if (aux[i]->fp) idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); if (aux[i]->header == NULL) { fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n", argv[i+optind+1]); return 2; } } cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } fputs(str.s, pysam_stdout) & fputc('\n', pysam_stdout); bam_mplp_destroy(mplp); continue; bed_error: fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); bam_hdr_destroy(aux[i]->header); sam_close(aux[i]->fp); free(aux[i]); } free(aux); free(idx); free(str.s); sam_global_args_free(&ga); return 0; }
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) { cerr << "Input: " << input << "\tOutput: "<<output<<endl; kstream_t *ks; kstring_t str = {0,0,0}; gzFile fp = gzopen(input, "r"); VarBuffer vbuf(1000); int prev_rid = -1; if(fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } char *out_fname = (char *)malloc(strlen(output)+5); strcpy(out_fname,output); strcat(out_fname,".tmp"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("depth: %s\n",out_fname); gzFile depth_fp = gzopen(out_fname, "wb1"); strcpy(out_fname,output); strcat(out_fname,".bcf"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("variants: %s\n",out_fname); htsFile *variant_fp=hts_open(out_fname,"wb1"); if(variant_fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } ks = ks_init(fp); htsFile *hfp=hts_open(input, "r"); bcf_hdr_t *hdr_in = bcf_hdr_read(hfp); hts_close(hfp); //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0); //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0); // bcf_hdr_t *hdr_out=hdr_in; bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr_in); remove_hdr_lines(hdr_out,BCF_HL_INFO); remove_hdr_lines(hdr_out,BCF_HL_FLT); bcf_hdr_sync(hdr_out); //here we add FORMAT/PF. which is the pass filter flag for alts. assert( bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0); args_t *norm_args = init_vcfnorm(hdr_out,ref); norm_args->check_ref |= CHECK_REF_WARN; bcf1_t *bcf_rec = bcf_init(); bcf_hdr_write(variant_fp, hdr_out); kstring_t work1 = {0,0,0}; int buf[5]; ks_tokaux_t aux; int ndec=0; int ref_len,alt_len; while( ks_getuntil(ks, '\n', &str, 0) >=0) { // fprintf(stderr,"%s\n",str.s); if(str.s[0]!='#') { char *ptr = kstrtok(str.s,"\t",&aux);//chrom ptr = kstrtok(NULL,NULL,&aux);//pos work1.l=0; kputsn(str.s,ptr-str.s-1, &work1); buf[0] = bcf_hdr_name2id(hdr_in, work1.s); assert( buf[0]>=0); buf[1]=atoi(ptr)-1; ptr = kstrtok(NULL,NULL,&aux);//ID ptr = kstrtok(NULL,NULL,&aux);//REF ref_len=0; while(ptr[ref_len]!='\t') ref_len++; ptr = kstrtok(NULL,NULL,&aux);//ALT bool is_variant=false; alt_len=0; while(ptr[alt_len]!='\t') alt_len++; if(ptr[0]!='.') is_variant=true; char * QUAL_ptr = kstrtok(NULL, NULL, &aux); assert (QUAL_ptr != NULL); for(int i=0;i<2;i++) ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO //find END if it is there char *end_ptr=strstr(ptr,"END=") ; if(end_ptr!=NULL) buf[2]=atoi(end_ptr+4)-1; else buf[2]=buf[1]+alt_len-1; ptr = kstrtok(NULL,NULL,&aux);//FORMAT //find index of DP (if present) //if not present, dont output anything (indels ignored) char *DP_ptr = find_format(ptr,"DP"); int GQX = 0; int QUAL = 0; // AH: change code to use the minimum of GQ and QUAL fields if // GQX is not defined. See here: // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file // "GQXGenotype quality. GQX is the minimum of the GQ value // and the QUAL column. In general, these are similar values; // taking the minimum makes GQX the more conservative measure of // genotype quality." if(DP_ptr!=NULL) { buf[3]=atoi(DP_ptr); char *GQX_ptr = find_format(ptr,"GQX"); if (GQX_ptr == NULL) { GQX_ptr = find_format(ptr,"GQ"); GQX = atoi(GQX_ptr); if (QUAL_ptr[0] != '.') { QUAL = atoi(QUAL_ptr); if (QUAL < GQX) GQX = QUAL; } } else { GQX = atoi(GQX_ptr); } //trying to reduce entropy on GQ to get better compression performance. //1. rounds down to nearest 10. //2. sets gq to min(gq,100). buf[4]=GQX/10; buf[4]*=10; if(buf[4]>100) buf[4]=100; // printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]); if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int))) die("ERROR: problem writing "+(string)out_fname+".tmp"); } if(is_variant) {//wass this a variant? if so write it out to the bcf norm_args->ntotal++; vcf_parse(&str,hdr_in,bcf_rec); // cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl; if(prev_rid!=bcf_rec->rid) vbuf.flush(variant_fp,hdr_out); else vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); prev_rid=bcf_rec->rid; int32_t pass = bcf_has_filter(hdr_in, bcf_rec, "."); bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1); bcf_update_filter(hdr_out,bcf_rec,NULL,0); if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3 norm_args->nsplit++; split_multiallelic_to_biallelics(norm_args,bcf_rec ); for(int i=0;i<norm_args->ntmp_lines;i++){ remove_info(norm_args->tmp_lines[i]); if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH) ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } } else { remove_info(bcf_rec); if( realign(norm_args,bcf_rec) != ERR_REF_MISMATCH) ndec+=decompose(bcf_rec,hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); } } } vbuf.flush(variant_fp,hdr_out); bcf_hdr_destroy(hdr_in); bcf_hdr_destroy(hdr_out); bcf_destroy1(bcf_rec); ks_destroy(ks); gzclose(fp); gzclose(depth_fp); free(str.s); free(work1.s); hts_close(variant_fp); destroy_data(norm_args); fprintf(stderr,"Variant lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped); fprintf(stderr,"Decomposed %d MNPs\n", ndec); fprintf(stderr,"Indexing %s\n",out_fname); bcf_index_build(out_fname, BCF_LIDX_SHIFT); free(out_fname); return 0; }
int vcf_read(bcf_t *bp, bcf_hdr_t *h, bcf1_t *b) { int dret, k, i, sync = 0; vcf_t *v = (vcf_t*)bp->v; char *p, *q; kstring_t str, rn; ks_tokaux_t aux, a2; if (!bp->is_vcf) return bcf_read(bp, h, b); v->line.l = 0; str.l = 0; str.m = b->m_str; str.s = b->str; rn.l = rn.m = h->l_nm; rn.s = h->name; if (ks_getuntil(v->ks, '\n', &v->line, &dret) < 0) return -1; b->n_smpl = h->n_smpl; for (p = kstrtok(v->line.s, "\t", &aux), k = 0; p; p = kstrtok(0, 0, &aux), ++k) { *(char*)aux.p = 0; if (k == 0) { // ref int tid = bcf_str2id(v->refhash, p); if (tid < 0) { tid = bcf_str2id_add(v->refhash, strdup(p)); kputs(p, &rn); kputc('\0', &rn); sync = 1; } b->tid = tid; } else if (k == 1) { // pos b->pos = atoi(p) - 1; } else if (k == 5) { // qual b->qual = (p[0] >= '0' && p[0] <= '9')? atof(p) : 0; } else if (k <= 8) { // variable length strings kputs(p, &str); kputc('\0', &str); b->l_str = str.l; b->m_str = str.m; b->str = str.s; if (k == 8) bcf_sync(b); } else { // k > 9 if (strncmp(p, "./.", 3) == 0) { for (i = 0; i < b->n_gi; ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 1<<7; } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { ((int32_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { ((uint16_t*)b->gi[i].data)[k-9] = 0; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { int y = b->n_alleles * (b->n_alleles + 1) / 2; memset((uint8_t*)b->gi[i].data + (k - 9) * y, 0, y); } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { int y = b->n_alleles * (b->n_alleles + 1) / 2; memset((float*)b->gi[i].data + (k - 9) * y, 0, y * 4); } } goto endblock; } for (q = kstrtok(p, ":", &a2), i = 0; q && i < b->n_gi; q = kstrtok(0, 0, &a2), ++i) { if (b->gi[i].fmt == bcf_str2int("GT", 2)) { ((uint8_t*)b->gi[i].data)[k-9] = (q[0] - '0')<<3 | (q[2] - '0') | (q[1] == '/'? 0 : 1) << 6; } else if (b->gi[i].fmt == bcf_str2int("GQ", 2)) { double _x = strtod(q, &q); int x = (int)(_x + .499); if (x > 255) x = 255; ((uint8_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("SP", 2)) { int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; ((uint32_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("DP", 2)) { int x = strtol(q, &q, 10); if (x > 0xffff) x = 0xffff; ((uint16_t*)b->gi[i].data)[k-9] = x; } else if (b->gi[i].fmt == bcf_str2int("PL", 2)) { int x, y, j; uint8_t *data = (uint8_t*)b->gi[i].data; y = b->n_alleles * (b->n_alleles + 1) / 2; for (j = 0; j < y; ++j) { x = strtol(q, &q, 10); if (x > 255) x = 255; data[(k-9) * y + j] = x; ++q; } } else if (b->gi[i].fmt == bcf_str2int("GL", 2)) { int j, y; float x, *data = (float*)b->gi[i].data; y = b->n_alleles * (b->n_alleles + 1) / 2; for (j = 0; j < y; ++j) { x = strtod(q, &q); data[(k-9) * y + j] = x > 0? -x/10. : x; ++q; } } } endblock: i = i; } } h->l_nm = rn.l; h->name = rn.s; if (sync) bcf_hdr_sync(h); return v->line.l + 1; }
bcf_hdr_t *vcf_hdr_read(htsFile *fp) { if (!fp->is_bin) { kstring_t txt, *s = &fp->line; bcf_hdr_t *h; h = bcf_hdr_init(); txt.l = txt.m = 0; txt.s = 0; while (hts_getline(fp, KS_SEP_LINE, s) >= 0) { if (s->l == 0) continue; if (s->s[0] != '#') { if (hts_verbose >= 2) fprintf(stderr, "[E::%s] no sample line\n", __func__); free(txt.s); bcf_hdr_destroy(h); return 0; } if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here int dret; gzFile f; kstream_t *ks; kstring_t tmp; tmp.l = tmp.m = 0; tmp.s = 0; f = gzopen(fp->fn_aux, "r"); ks = ks_init(f); while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) { int c; kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt); ks_getuntil(ks, 0, &tmp, &dret); kputs(",length=", &txt); kputw(atol(tmp.s), &txt); kputsn(">\n", 2, &txt); if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line } free(tmp.s); ks_destroy(ks); gzclose(f); } kputsn(s->s, s->l, &txt); if (s->s[1] != '#') break; kputc('\n', &txt); } h->l_text = txt.l + 1; // including NULL h->text = txt.s; bcf_hdr_parse(h); // check tabix index, are all contigs listed in the header? add the missing ones tbx_t *idx = tbx_index_load(fp->fn); if ( idx ) { int i, n, need_sync = 0; const char **names = tbx_seqnames(idx, &n); for (i=0; i<n; i++) { bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]); if ( hrec ) continue; hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t)); hrec->key = strdup("contig"); bcf_hrec_add_key(hrec, "ID", strlen("ID")); bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0); bcf_hrec_add_key(hrec, "length", strlen("length")); bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0); // what is a good default value? bcf_hdr_add_hrec(h, hrec); need_sync = 1; } free(names); tbx_destroy(idx); if ( need_sync ) { bcf_hdr_sync(h); bcf_hdr_fmt_text(h); } } return h; } else return bcf_hdr_read((BGZF*)fp->fp); }
int main_bedcov(int argc, char *argv[]) { gzFile fp; kstring_t str; kstream_t *ks; hts_idx_t **idx; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; while ((c = getopt(argc, argv, "Q:")) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n"); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = sam_open(argv[i+optind+1], "r"); idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); } cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); bam_hdr_destroy(aux[i]->header); sam_close(aux[i]->fp); free(aux[i]); } free(aux); free(idx); free(str.s); return 0; }
int main(int argc, char *argv[]) { int c, dret, lineno = 0, n_rows = 0, m_rows = 0, n_cols = 0, max_hap = 0; int64_t n_missing = 0, n_tot = 0; gzFile fp; kstream_t *ks; kstring_t str = {0,0,0}; int8_t **C = 0; double **M, *X, min_maf = 0.0; char **names = 0; // _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_DIV_ZERO)); _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_DIV_ZERO)); while ((c = getopt(argc, argv, "m:")) >= 0) { if (c == 'm') min_maf = atof(optarg); } if (argc - optind == 0) { fprintf(stderr, "Usage: naivepca [-m min_maf] <in.txt>\n"); return 1; } fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[E::%s] failed to open file '%s'. Abort.\n", __func__, argv[optind]); return 2; } ks = ks_init(fp); // read the matrix into C while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { int8_t *q; char *p, *name = str.s; int i; ++lineno; for (p = str.s; *p && *p != '\t' && *p != ' '; ++p); if (*p) { *p++ = 0; for (; *p && (*p == '\t' || *p == ' '); ++p); } if (*p == 0) { fprintf(stderr, "[W::%s] line %d has one field; skipped.\n", __func__, lineno); continue; } if (n_cols != 0) { if (n_cols != str.s + str.l - p) { fprintf(stderr, "[W::%s] line %d has a different number of columns; skipped.\n", __func__, lineno); continue; } } else n_cols = str.s + str.l - p; if (n_rows == m_rows) { m_rows = m_rows? m_rows<<1 : 16; C = (int8_t**)realloc(C, m_rows * sizeof(int8_t*)); names = (char**)realloc(names, m_rows * sizeof(char*)); } names[n_rows] = strdup(name); q = C[n_rows++] = (int8_t*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_cols; ++i) { if (p[i] >= '0' && p[i] <= '9') q[i] = p[i] - '0'; else q[i] = -1, ++n_missing; max_hap = max_hap > q[i]? max_hap : q[i]; } n_tot += n_cols; } free(str.s); fprintf(stderr, "[M::%s] read %d samples and %d sites; ploidy is %d\n", __func__, n_rows, n_cols, max_hap); fprintf(stderr, "[M::%s] %.3f%% of genotypes are missing\n", __func__, (double)n_missing / n_tot); { // normalize the matrix into M int i, j, *sum, *cnt, n_dropped = 0; double *mu, *pp; sum = (int*)calloc(n_cols, sizeof(int)); cnt = (int*)calloc(n_cols, sizeof(int)); mu = (double*)calloc(n_cols, sizeof(double)); pp = (double*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; for (j = 0; j < n_cols; ++j) if (q[j] >= 0) sum[j] += q[j], ++cnt[j]; } for (j = 0; j < n_cols; ++j) { if (cnt[j] > 0) { mu[j] = (double)sum[j] / cnt[j]; pp[j] = mu[j] / max_hap; if (pp[j] < min_maf || 1. - pp[j] < min_maf) ++n_dropped; } else ++n_dropped; } fprintf(stderr, "[M::%s] %d rare sites are dropped\n", __func__, n_dropped); M = (double**)calloc(n_rows, sizeof(double*)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; double *r; r = M[i] = (double*)calloc(n_cols, sizeof(double)); for (j = 0; j < n_cols; ++j) r[j] = q[j] < 0 || pp[j] < min_maf || 1. - pp[j] < min_maf || pp[j] == 0. || 1 - pp[j] == 0. ? 0. : (q[j] - mu[j]) / sqrt(pp[j] * (1. - pp[j])); } free(sum); free(cnt); free(mu); free(pp); for (i = 0; i < n_rows; ++i) free(C[i]); free(C); } { // multiplication int i, j, k; X = (double*)calloc(n_rows * n_rows, sizeof(double)); for (i = 0; i < n_rows; ++i) { double *zi = M[i]; for (j = 0; j <= i; ++j) { double t = 0., *zj = M[j]; for (k = 0; k < n_cols; ++k) t += zi[k] * zj[k]; X[i*n_rows + j] = X[j*n_rows + i] = t / n_cols; } } for (i = 0; i < n_rows; ++i) free(M[i]); free(M); } { // print eigan vectors double *ev; int i, j; evsrt_t *evsrt; ev = (double*)calloc(n_rows, sizeof(double)); evsrt = (evsrt_t*)calloc(n_rows, sizeof(evsrt_t)); n_eigen_symm(X, n_rows, ev); for (i = 0; i < n_rows; ++i) evsrt[i].ev = ev[i], evsrt[i].i = i; ks_introsort(ev, n_rows, evsrt); for (i = 0; i < n_rows; ++i) { printf("%s", names[i]); for (j = 0; j < n_rows; ++j) printf("\t%.6f", X[i*n_rows + evsrt[j].i] * evsrt[j].ev); putchar('\n'); free(names[i]); } free(ev); free(evsrt); free(X); free(names); } ks_destroy(ks); gzclose(fp); return 0; }