/*
 * Command-line front end for the ks_* validation calls.
 *
 * Options (all parsed with getopt):
 *   -u <username>  -p <password>  -t <tenant>   required
 *   -r <int>       revoke_refresh, defaults to 30 and must be non-zero
 * Positional arguments: <url> followed by one of
 *   validate_login <arg1> <arg2>
 *   validate_token <arg1>
 * The command name is matched case-insensitively.
 *
 * Returns usage() for malformed invocations, 1 when ks_init() reports
 * failure, and 0 otherwise (also when the validation call yields NULL,
 * in which case nothing is printed).
 */
int main(int argc, char **argv)
{
    struct ks_config config;
    const char *command;
    char *username;
    int opt, n_rest;

    memset(&config, 0, sizeof config);
    config.revoke_refresh = 30;

    while ((opt = getopt(argc, argv, "u:p:t:r:")) != -1) {
        if (opt == 'u')
            config.username = optarg;
        else if (opt == 'p')
            config.password = optarg;
        else if (opt == 't')
            config.tenant = optarg;
        else if (opt == 'r')
            config.revoke_refresh = atoi(optarg);
        else
            return usage();
    }

    /* Need at least <url> and a command word after the options. */
    if (optind + 2 > argc)
        return usage();
    if (!config.username || !config.password || !config.tenant || !config.revoke_refresh)
        return usage();

    config.url = argv[optind++];
    n_rest = argc - optind;     /* command word plus its arguments */
    command = argv[optind];

    if (n_rest == 3 && strcasecmp(command, "validate_login") == 0) {
        if (ks_init(&config))
            return 1;
        username = ks_validate_login(argv[optind + 1], argv[optind + 2]);
    } else if (n_rest == 2 && strcasecmp(command, "validate_token") == 0) {
        if (ks_init(&config))
            return 1;
        username = ks_validate_token(argv[optind + 1]);
    } else {
        return usage();
    }

    /* Both commands report success the same way: print and release the name. */
    if (username) {
        printf("%s\n", username);
        free(username);
    }
    ks_deinit();
    return 0;
}
/*
 * Read sequence names from the text file `fn` and register them with a
 * VCF handle.
 *
 * For every line, the first whitespace-delimited token is added to the
 * handle's refhash (as a strdup'd copy) and appended, NUL-terminated, to
 * the header's name buffer; the rest of the line is skipped. Afterwards
 * h->l_nm / h->name are updated and bcf_hdr_sync(h) is called.
 *
 * Returns 0 on success or when `bp` is not a VCF handle (nothing is done
 * in that case), and -1 when `bp` is NULL, `fn` is NULL, or the file
 * cannot be opened.
 */
int vcf_dictread(bcf_t *bp, bcf_hdr_t *h, const char *fn)
{
    vcf_t *v;
    gzFile fp;
    kstream_t *ks;
    kstring_t s, rn;
    int dret;

    if (bp == 0) return -1;
    if (!bp->is_vcf) return 0;
    if (fn == 0) return -1;

    /* Open first so that a failure leaves the header untouched. */
    fp = gzopen(fn, "r");
    if (fp == 0) return -1;

    s.l = s.m = 0; s.s = 0;
    /* Continue appending to the header's existing name buffer. */
    rn.m = rn.l = h->l_nm; rn.s = h->name;
    v = (vcf_t*)bp->v;
    ks = ks_init(fp);
    while (ks_getuntil(ks, 0, &s, &dret) >= 0) {
        bcf_str2id_add(v->refhash, strdup(s.s));
        kputs(s.s, &rn); kputc('\0', &rn);
        /* Only the first column matters; discard the remainder of the line. */
        if (dret != '\n') ks_getuntil(ks, '\n', &s, &dret);
    }
    ks_destroy(ks);
    gzclose(fp);
    h->l_nm = rn.l; h->name = rn.s;
    bcf_hdr_sync(h);
    free(s.s);
    return 0;
}
bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, ret; gzFile fp; kstream_t *ks; kstring_t *str; kh_ref_t *hash; khiter_t k; hash = kh_init(ref); fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); assert(fp); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { char *s = strdup(str->s); int len, i; i = kh_size(hash); ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); fprintf(stderr, "[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); header = hash2header(hash); kh_destroy(ref, hash); return header; }
bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, n_targets = 0; gzFile fp; kstream_t *ks; kstring_t *str; kstring_t samstr = { 0, 0, NULL }; if (fn == 0) return 0; fp = (strcmp(fn, "-") == 0)? gzdopen(fileno(stdin), "r") : gzopen(fn, "r"); if (fp == 0) return 0; ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { ksprintf(&samstr, "@SQ\tSN:%s", str->s); ks_getuntil(ks, 0, str, &dret); ksprintf(&samstr, "\tLN:%d\n", atoi(str->s)); n_targets++; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); header = sam_hdr_parse(samstr.l, samstr.s? samstr.s : ""); free(samstr.s); fprintf(pysamerr, "[sam_header_read2] %d sequences loaded.\n", n_targets); return header; }
/*
 * Open a PAF input for reading.
 *
 * A NULL `fn` or the name "-" selects standard input; any other value is
 * opened as a file path. Returns a newly allocated paf_file_t whose `fp`
 * member holds the kstream created over the opened file, or NULL when the
 * input cannot be opened.
 */
paf_file_t *paf_open(const char *fn)
{
    paf_file_t *handle;
    kstream_t *stream;
    gzFile in;

    if (fn == 0 || strcmp(fn, "-") == 0)
        in = gzdopen(fileno(stdin), "r");
    else
        in = gzopen(fn, "r");
    if (in == 0)
        return 0;

    stream = ks_init(in);
    handle = (paf_file_t*)calloc(1, sizeof(paf_file_t));
    handle->fp = stream;
    return handle;
}
/*
 * HAL entry point.
 *
 * Runs the initialisation calls below in a fixed order, starts the driver
 * process when a driver package was loaded as the last boot module, and
 * then loops forever driving the scheduler. The function never returns:
 * it ends in an unconditional while (1) loop.
 */
void __init __noreturn hal_main()
{
	/* "At the dawn of creation, who sowed the seeds of love?" -- the very beginning of everything */
	km_cluster_init();
	build_ram_list();

	/* Go back to ARCH, we have inited the basic paging allocator */
	hal_arch_init(HAL_ARCH_INIT_PHASE_EARLY);

	/* KERNEL */
	kc_init();
	kp_init();
	ks_init();
	hal_malloc_init();
	hal_dpc_init();
	hal_time_init();
	hal_arch_init(HAL_ARCH_INIT_PHASE_MIDDLE);
	hal_console_init();
	printk("GridOS 启动中...\n");
	ke_module_entry();
	local_irq_enable();

	/* Driver package loading, and it must be the last file */
	hal_boot_module_loop(start_driver_ctx);
	if (last_package_id == driver_package_id && driver_pakcage)
	{
		ke_startup_driver_process(driver_pakcage, driver_size);
	}
	else
	{
		/* A non-zero driver_size means a package was seen but was not the last module. */
		if (driver_size)
			printk("Driver package is not the last one, BSS in it may overlay the useful file data after it...");
		else
			printk("No driver package was loaded...");
	}
	printk("Hal startup ok.\n");
	kernel_test();

	while (1)
	{
		kt_schedule_driver();

		/* If have no process, sleep */
		if (!kt_schedule_running_count())
			dumy_idle_ops(0);
		else
			kt_schedule();
	}
}
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; khash_t(s) *hash; mask32_t *q = 0; kstring_t *str; int i, dret, c, last = 0; while ((c = getopt(argc, argv, "")) >= 0) { } if (argc <= optind + 1) { fprintf(stderr, "Usage: uniq-dist <in.mask.fa> <in-sorted.list>\n"); return 1; } str = (kstring_t*)calloc(1, sizeof(kstring_t)); fprintf(stderr, "[uniq-dist] loading mask...\n"); hash = load_mask(argv[optind]); fp = gzopen(argv[optind+1], "r"); ks = ks_init(fp); fprintf(stderr, "[uniq-dist] calculating unique distance...\n"); while (ks_getuntil(ks, 0, str, &dret) >= 0) { khint_t k; mask32_t *p; int pos; k = kh_get(s, hash, str->s); p = (k != kh_end(hash))? &kh_val(hash, k) : 0; ks_getuntil(ks, 0, str, &dret); pos = atoi(str->s) - 1; if (p && pos >= 0 && pos < p->ori_len) { if (p != q) q = p; // change of reference else { if (last >= pos) { fprintf(stderr, "[uniq-dist] out of order: %s:%d <= %d\n", kh_key(hash, k), pos+1, last+1); } else { for (i = last, c = 0; i < pos; ++i) if (p->mask[i/32] & 1u<<i%32) ++c; if (last > 0) printf("%s\t%d\t%d\t%d\n", kh_key(hash, k), last, pos, c); } } last = pos; } if (dret != '\n') while ((c = ks_getc(ks)) != -1 && c != '\n'); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // hash table is not freed... return 0; }
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; khash_t(s) *hash; kstring_t *str; int dret, c, complement = 0; while ((c = getopt(argc, argv, "c")) >= 0) { switch (c) { case 'c': complement = 1; break; } } if (argc <= optind + 1) { fprintf(stderr, "Usage: apply_mask_l [-c] <in.mask.fa> <in.list>\n"); return 1; } str = (kstring_t*)calloc(1, sizeof(kstring_t)); fprintf(stderr, "[apply_mask_l] loading mask...\n"); hash = load_mask(argv[optind]); fp = gzopen(argv[optind+1], "r"); ks = ks_init(fp); fprintf(stderr, "[apply_mask_l] filtering list...\n"); while (ks_getuntil(ks, 0, str, &dret) >= 0) { khint_t k; mask32_t *p; int pos, do_print = 0; k = kh_get(s, hash, str->s); p = (k != kh_end(hash))? &kh_val(hash, k) : 0; ks_getuntil(ks, 0, str, &dret); pos = atoi(str->s) - 1; if (complement == 0) { if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32)) do_print = 1; } else { if (p && pos < p->ori_len && (p->mask[pos/32]&1u<<pos%32) == 0) do_print = 1; } if (do_print) printf("%s\t%d", kh_key(hash, k), pos + 1); if (dret != '\n') { if (do_print) putchar('\t'); while ((c = ks_getc(ks)) != -1 && c != '\n') if (do_print) putchar(c); } if (do_print) putchar('\n'); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // hash table is not freed... return 0; }
reghash_t *stk_reg_read(const char *fn) { reghash_t *h = kh_init(reg); gzFile fp; kstream_t *ks; int dret; kstring_t *str; // read the list str = calloc(1, sizeof(kstring_t)); fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { int beg = -1, end = -1; reglist_t *p; khint_t k = kh_get(reg, h, str->s); if (k == kh_end(h)) { int ret; char *s = strdup(str->s); k = kh_put(reg, h, s, &ret); memset(&kh_val(h, k), 0, sizeof(reglist_t)); } p = &kh_val(h, k); if (dret != '\n') { if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { beg = atoi(str->s); if (dret != '\n') { if (ks_getuntil(ks, 0, str, &dret) > 0 && isdigit(str->s[0])) { end = atoi(str->s); if (end < 0) end = -1; } } } } // skip the rest of the line if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); if (end < 0 && beg > 0) end = beg, beg = beg - 1; // if there is only one column if (beg < 0) beg = 0, end = INT_MAX; if (p->n == p->m) { p->m = p->m? p->m<<1 : 4; p->a = realloc(p->a, p->m * 8); } p->a[p->n++] = (uint64_t)beg<<32 | end; } ks_destroy(ks); gzclose(fp); free(str->s); free(str); return h; }
bcf_t *vcf_open(const char *fn, const char *mode) { bcf_t *bp; vcf_t *v; if (strchr(mode, 'b')) return bcf_open(fn, mode); bp = calloc(1, sizeof(bcf_t)); v = calloc(1, sizeof(vcf_t)); bp->is_vcf = 1; bp->v = v; v->refhash = bcf_str2id_init(); if (strchr(mode, 'r')) { v->fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r"); v->ks = ks_init(v->fp); } else if (strchr(mode, 'w')) v->fpout = strcmp(fn, "-")? fopen(fn, "w") : stdout; return bp; }
bam_header_t *sam_header_read2(const char *fn) { bam_header_t *header; int c, dret, ret, error = 0; gzFile fp; kstream_t *ks; kstring_t *str; kh_ref_t *hash; khiter_t k; if (fn == 0) return 0; fp = gzopen(fn, "r"); if (fp == 0) return 0; hash = kh_init(ref); ks = ks_init(fp); str = (kstring_t*)calloc(1, sizeof(kstring_t)); while (ks_getuntil(ks, 0, str, &dret) > 0) { char *s = malloc(strlen(str->s) + 1); strcpy(s,str->s); int len, i; i = kh_size(hash); ks_getuntil(ks, 0, str, &dret); len = atoi(str->s); k = kh_put(ref, hash, s, &ret); if (ret == 0) { Rprintf("[sam_header_read2] duplicated sequence name: %s\n", s); error = 1; } kh_value(hash, k) = (uint64_t)len<<32 | i; if (dret != '\n') while ((c = ks_getc(ks)) != '\n' && c != -1); } ks_destroy(ks); gzclose(fp); free(str->s); free(str); Rprintf("[sam_header_read2] %d sequences loaded.\n", kh_size(hash)); if (error) return 0; header = hash2header(hash); kh_destroy(ref, hash); return header; }
/*
 * Load an allele-frequency-spectrum prior from `fn` ("-" for stdin).
 *
 * Every line that starts with "[afs] " is expected to hold M+1 pairs of
 * the form "<k><sep><value>"; the values are accumulated into
 * ma->phi[M-k]. After reading, phi is normalised to sum to 1, a summary
 * (the prior, heterozygosity and theta) is printed to stderr, and
 * bcf_p1_indel_prior(ma, MC_DEF_INDEL) is called.
 *
 * Returns 0 on success and -1 when the input cannot be opened or a number
 * fails to parse. All resources are released on every path.
 */
int bcf_p1_read_prior(bcf_p1aux_t *ma, const char *fn)
{
    gzFile fp;
    kstring_t s;
    kstream_t *ks;
    long double sum;
    int dret, k, failed = 0;

    memset(&s, 0, sizeof(kstring_t));
    fp = strcmp(fn, "-")? gzopen(fn, "r") : gzdopen(fileno(stdin), "r");
    if (fp == 0) return -1;
    ks = ks_init(fp);
    memset(ma->phi, 0, sizeof(double) * (ma->M + 1));
    while (!failed && ks_getuntil(ks, '\n', &s, &dret) >= 0) {
        if (strstr(s.s, "[afs] ") == s.s) {
            char *p = s.s + 6;
            for (k = 0; k <= ma->M; ++k) {
                int x;
                double y;
                /* errno is only meaningful if cleared before the call. */
                errno = 0;
                x = strtol(p, &p, 10);
                if (x != k && (errno == EINVAL || errno == ERANGE)) { failed = 1; break; }
                /* Step over the separator, but never past the terminator. */
                if (*p) ++p;
                errno = 0;
                y = strtod(p, &p);
                if (y == 0. && (errno == EINVAL || errno == ERANGE)) { failed = 1; break; }
                ma->phi[ma->M - k] += y;
            }
        }
    }
    ks_destroy(ks);
    gzclose(fp);
    free(s.s);
    if (failed) return -1;

    for (sum = 0., k = 0; k <= ma->M; ++k) sum += ma->phi[k];
    fprintf(stderr, "[prior]");
    for (k = 0; k <= ma->M; ++k) ma->phi[k] /= sum;
    for (k = 0; k <= ma->M; ++k) fprintf(stderr, " %d:%.3lg", k, ma->phi[ma->M - k]);
    fputc('\n', stderr);
    for (sum = 0., k = 1; k < ma->M; ++k)
        sum += ma->phi[ma->M - k] * (2.* k * (ma->M - k) / ma->M / (ma->M - 1));
    fprintf(stderr, "heterozygosity=%lf, ", (double)sum);
    for (sum = 0., k = 1; k <= ma->M; ++k)
        sum += k * ma->phi[ma->M - k] / ma->M;
    fprintf(stderr, "theta=%lf\n", (double)sum);
    bcf_p1_indel_prior(ma, MC_DEF_INDEL);
    return 0;
}
htsFile * sam_popen(char *cmd) { htsFile *fp = (htsFile*)calloc(1, sizeof(htsFile)); int fid, fid2; assert(fp); popen_fd = popen(cmd, "r"); //Global fid = fileno(popen_fd); fid2 = dup(fid); //otherwise, the file descriptor is closed by zlib and pclose() won't work!! if(popen_fd == NULL) return 0; if(fp == NULL) return 0; hFILE *hfile = hdopen(fid2, "r"); //does this exist? if(hfile == NULL) return 0; fp->is_be = ed_is_big(); BGZF *gzfp = bgzf_hopen(hfile, "r"); fp->fp.voidp = ks_init(gzfp); fp->format.format = sam; return(fp); }
/*
 * Read the header of a VCF/BCF file.
 *
 * Binary input is forwarded to bcf_hdr_read(). For text input, lines are
 * collected up to and including the first line that starts with a single
 * '#' (the sample line). If fp->fn_aux is set, one
 * "##contig=<ID=name,length=len>" line per record of that two-column file
 * is inserted just before the sample line. The collected text is parsed
 * with bcf_hdr_parse(). If a tabix index can be loaded for fp->fn, every
 * sequence name it lists that has no contig record in the header is added
 * with length -1.
 *
 * Returns the header, or NULL when a non-header line is met before the
 * sample line, when fp->fn_aux cannot be opened, or when the input
 * contains no header text at all.
 */
bcf_hdr_t *vcf_hdr_read(htsFile *fp)
{
    if (!fp->is_bin) {
        kstring_t txt, *s = &fp->line;
        bcf_hdr_t *h;
        h = bcf_hdr_init();
        txt.l = txt.m = 0; txt.s = 0;
        while (hts_getline(fp, KS_SEP_LINE, s) >= 0) {
            if (s->l == 0) continue;
            if (s->s[0] != '#') {
                if (hts_verbose >= 2)
                    fprintf(stderr, "[E::%s] no sample line\n", __func__);
                free(txt.s);
                bcf_hdr_destroy(h);
                return 0;
            }
            if (s->s[1] != '#' && fp->fn_aux) { // insert contigs here
                int dret;
                gzFile f;
                kstream_t *ks;
                kstring_t tmp;
                tmp.l = tmp.m = 0; tmp.s = 0;
                f = gzopen(fp->fn_aux, "r");
                if (f == 0) {
                    /* Do not stream from a NULL gzFile; treat as a failed read. */
                    if (hts_verbose >= 2)
                        fprintf(stderr, "[E::%s] fail to open '%s'\n", __func__, fp->fn_aux);
                    free(txt.s);
                    bcf_hdr_destroy(h);
                    return 0;
                }
                ks = ks_init(f);
                while (ks_getuntil(ks, 0, &tmp, &dret) >= 0) {
                    int c;
                    kputs("##contig=<ID=", &txt); kputs(tmp.s, &txt);
                    ks_getuntil(ks, 0, &tmp, &dret);
                    kputs(",length=", &txt); kputw(atol(tmp.s), &txt);
                    kputsn(">\n", 2, &txt);
                    if (dret != '\n')
                        while ((c = ks_getc(ks)) != '\n' && c != -1); // skip the rest of the line
                }
                free(tmp.s);
                ks_destroy(ks);
                gzclose(f);
            }
            kputsn(s->s, s->l, &txt);
            if (s->s[1] != '#') break;
            kputc('\n', &txt);
        }
        if (txt.s == 0) {
            /* Nothing was read: there is no text to parse. */
            if (hts_verbose >= 2)
                fprintf(stderr, "[E::%s] could not read any header lines\n", __func__);
            bcf_hdr_destroy(h);
            return 0;
        }
        h->l_text = txt.l + 1; // including NULL
        h->text = txt.s;
        bcf_hdr_parse(h);

        // check tabix index, are all contigs listed in the header? add the missing ones
        tbx_t *idx = tbx_index_load(fp->fn);
        if ( idx )
        {
            int i, n, need_sync = 0;
            const char **names = tbx_seqnames(idx, &n);
            for (i=0; i<n; i++)
            {
                bcf_hrec_t *hrec = bcf_hdr_get_hrec(h, BCF_DT_CTG, (char*) names[i]);
                if ( hrec ) continue;
                hrec = (bcf_hrec_t*) calloc(1,sizeof(bcf_hrec_t));
                hrec->key = strdup("contig");
                bcf_hrec_add_key(hrec, "ID", strlen("ID"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, (char*) names[i], strlen(names[i]), 0);
                bcf_hrec_add_key(hrec, "length", strlen("length"));
                bcf_hrec_set_val(hrec, hrec->nkeys-1, "-1", strlen("-1"), 0); // what is a good default value?
                bcf_hdr_add_hrec(h, hrec);
                need_sync = 1;
            }
            free(names);
            tbx_destroy(idx);
            if ( need_sync )
            {
                bcf_hdr_sync(h);
                bcf_hdr_fmt_text(h);
            }
        }
        return h;
    } else return bcf_hdr_read((BGZF*)fp->fp);
}
int stk_mutfa(int argc, char *argv[]) { khash_t(reg) *h = kh_init(reg); gzFile fp; kseq_t *seq; kstream_t *ks; int l, i, dret; kstring_t *str; khint_t k; if (argc < 3) { fprintf(stderr, "Usage: seqtk mutfa <in.fa> <in.snp>\n\n"); fprintf(stderr, "Note: <in.snp> contains at least four columns per line which are:\n"); fprintf(stderr, " 'chr 1-based-pos any base-changed-to'.\n"); return 1; } // read the list str = calloc(1, sizeof(kstring_t)); fp = strcmp(argv[2], "-")? gzopen(argv[2], "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, str, &dret) >= 0) { char *s = strdup(str->s); int beg = 0, ret; reglist_t *p; k = kh_get(reg, h, s); if (k == kh_end(h)) { k = kh_put(reg, h, s, &ret); memset(&kh_val(h, k), 0, sizeof(reglist_t)); } p = &kh_val(h, k); if (ks_getuntil(ks, 0, str, &dret) > 0) beg = atol(str->s) - 1; // 2nd col ks_getuntil(ks, 0, str, &dret); // 3rd col ks_getuntil(ks, 0, str, &dret); // 4th col // skip the rest of the line if (dret != '\n') while ((dret = ks_getc(ks)) > 0 && dret != '\n'); if (isalpha(str->s[0]) && str->l == 1) { if (p->n == p->m) { p->m = p->m? p->m<<1 : 4; p->a = realloc(p->a, p->m * 8); } p->a[p->n++] = (uint64_t)beg<<32 | str->s[0]; } } ks_destroy(ks); gzclose(fp); free(str->s); free(str); // mutfa fp = strcmp(argv[1], "-")? gzopen(argv[1], "r") : gzdopen(fileno(stdin), "r"); seq = kseq_init(fp); while ((l = kseq_read(seq)) >= 0) { reglist_t *p; k = kh_get(reg, h, seq->name.s); if (k != kh_end(h)) { p = &kh_val(h, k); for (i = 0; i < p->n; ++i) { int beg = p->a[i]>>32; if (beg < seq->seq.l) seq->seq.s[beg] = (int)p->a[i]; } } printf(">%s", seq->name.s); for (i = 0; i < l; ++i) { if (i%60 == 0) putchar('\n'); putchar(seq->seq.s[i]); } putchar('\n'); }
/*
 * Parse an ARG description from an open gzFile.
 *
 * Input is a stream of whitespace-delimited records, each introduced by a
 * one-character tag:
 *   C / R  <n1> <n2> <beg> <end> <n_mut> <mut>...   edge between nodes n1
 *          and n2 (n1 > n2 >= 0 is required) with its interval and
 *          mutation list
 *   N      <n> <m>                                  stored in a->n, a->m
 *   S      <root> <seq>                             root node id and its
 *          sequence; <seq> must have length a->m and is stored as a bit
 *          set with a bit raised for every '1'
 *   other  the rest of the line is ignored
 *
 * Any malformed record prints a diagnostic carrying the record counter
 * and terminates the process with exit(1). The caller keeps ownership of
 * `fp`; it is not closed here. Returns the populated arg_t.
 */
arg_t *arg_load(gzFile fp)
{
    kstream_t *ks;
    kstring_t *str;
    int dret, lineno = 0;
    arg_t *a;

    a = arg_init();
    ks = ks_init(fp);
    str = (kstring_t*)calloc(1, sizeof(kstring_t));
    while (ks_getuntil(ks, 0, str, &dret) >= 0) {
        int n1, n2, i;
        ++lineno;
        if (str->l != 1) {
            fprintf(stderr, "[arg_load] invalid initial character at line %d\n", lineno);
            exit(1);
        }
        if (str->s[0] == 'C' || str->s[0] == 'R') {
            argnode_t *an1, *an2;
            int beg, end, n_mut;
            // read
            ks_getuntil(ks, 0, str, &dret); n1 = atoi(str->s);
            ks_getuntil(ks, 0, str, &dret); n2 = atoi(str->s);
            /* A negative id would index before the start of the node array. */
            if (n2 < 0 || n1 <= n2) {
                fprintf(stderr, "[arg_load] invalid edge (%d)\n", lineno);
                exit(1);
            }
            // add to the node array
            if (n1 + 1 > a->max_size) {
                int old_size = a->max_size;
                argnode_t *grown;
                a->max_size = n1 + 1;
                kroundup32(a->max_size);
                grown = (argnode_t*)realloc(a->node, sizeof(argnode_t) * a->max_size);
                if (grown == 0) {
                    fprintf(stderr, "[arg_load] out of memory (%d)\n", lineno);
                    exit(1);
                }
                a->node = grown;
                /* Newly added slots must start zeroed (n_nei, end, mut, ...). */
                memset(a->node + old_size, 0, (a->max_size - old_size) * sizeof(argnode_t));
            }
            an1 = a->node + n1; an2 = a->node + n2;
            an1->nid = n1; an2->nid = n2;
            if (an1->n_nei == 3 || an2->n_nei == 3) {
                fprintf(stderr, "[arg_load] multifurcated node: %d or %d (%d)\n", n1, n2, lineno);
                exit(1);
            }
            an1->nei[an1->n_nei++] = n2;
            an2->nei[an2->n_nei++] = n1;
            // read intervals
            ks_getuntil(ks, 0, str, &dret); beg = atoi(str->s);
            ks_getuntil(ks, 0, str, &dret); end = atoi(str->s);
            if (an2->end) { // a recombination node
                if (an2->end == beg) an2->x = an2->end, an2->end = end;
                else if (an2->beg == end) { // and also swap
                    int x = an2->nei[1];
                    an2->nei[1] = an2->nei[2]; an2->nei[2] = x;
                    an2->x = an2->beg, an2->beg = beg;
                } else {
                    fprintf(stderr, "[arg_load] inconsisten interval at node %d (%d)\n", n2, lineno);
                    exit(1);
                }
            } else an2->beg = beg, an2->end = end;
            // read mutations
            ks_getuntil(ks, 0, str, &dret); n_mut = atoi(str->s);
            /* Ignore non-positive counts; a negative one would shrink the array. */
            if (n_mut > 0) {
                int *mut = (int*)realloc(an2->mut, sizeof(int) * (an2->n_mut + n_mut));
                if (mut == 0) {
                    fprintf(stderr, "[arg_load] out of memory (%d)\n", lineno);
                    exit(1);
                }
                an2->mut = mut;
                for (i = 0; i < n_mut; ++i) {
                    ks_getuntil(ks, 0, str, &dret);
                    an2->mut[an2->n_mut++] = atoi(str->s);
                }
            }
            // save interval and swap if necessary
        } else if (str->s[0] == 'N') {
            ks_getuntil(ks, 0, str, &dret); a->n = atoi(str->s);
            ks_getuntil(ks, 0, str, &dret); a->m = atoi(str->s);
        } else if (str->s[0] == 'S') {
            ks_getuntil(ks, 0, str, &dret); a->root = atoi(str->s);
            ks_getuntil(ks, 0, str, &dret);
            if (str->l != a->m) {
                fprintf(stderr, "[arg_load] inconsistent root sequence (%d)\n", lineno);
                exit(1);
            }
            /* One bit per site, rounded up to whole 64-bit words. */
            a->rootseq = (uint64_t*)calloc((a->m + 63) / 64, 8);
            for (i = 0; i < a->m; ++i)
                if (str->s[i] == '1') arg_setseq1(a->rootseq, i);
        } else ks_getuntil(ks, '\n', str, &dret);
    }
    free(str->s); free(str);
    ks_destroy(ks);
    return a;
}
int main(int argc, char *argv[]) { gzFile fp; kstream_t *ks; kstring_t s, t[N_TMPSTR]; int dret, i, k, af_n, af_max, af_i, c, is_padded = 0, write_cns = 0, *p2u = 0; long m_cigar = 0, n_cigar = 0; unsigned *af, *cigar = 0; while ((c = getopt(argc, argv, "pc")) >= 0) { switch (c) { case 'p': is_padded = 1; break; case 'c': write_cns = 1; break; } } if (argc == optind) { fprintf(stderr, "\nUsage: ace2sam [-pc] <in.ace>\n\n"); fprintf(stderr, "Options: -p output padded SAM\n"); fprintf(stderr, " -c write the contig sequence in SAM\n\n"); fprintf(stderr, "Notes: 1. Fields must appear in the following order: (CO->[BQ]->(AF)->(RD->QA))\n"); fprintf(stderr, " 2. The order of reads in AF and in RD must be identical\n"); fprintf(stderr, " 3. Except in BQ, words and numbers must be separated by a single SPACE or TAB\n"); fprintf(stderr, " 4. This program writes the headerless SAM to stdout and header to stderr\n\n"); return 1; } s.l = s.m = 0; s.s = 0; af_n = af_max = af_i = 0; af = 0; for (i = 0; i < N_TMPSTR; ++i) t[i].l = t[i].m = 0, t[i].s = 0; fp = strcmp(argv[1], "-")? 
gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); ks = ks_init(fp); while (ks_getuntil(ks, 0, &s, &dret) >= 0) { if (strcmp(s.s, "CO") == 0) { // contig sequence kstring_t *cns; t[0].l = t[1].l = t[2].l = t[3].l = t[4].l = 0; // 0: name; 1: padded ctg; 2: unpadded ctg/padded read; 3: unpadded read; 4: SAM line af_n = af_i = 0; // reset the af array ks_getuntil(ks, 0, &s, &dret); kputs(s.s, &t[0]); // contig name ks_getuntil(ks, '\n', &s, &dret); // read the whole line while (ks_getuntil(ks, '\n', &s, &dret) >= 0 && s.l > 0) kputsn(s.s, s.l, &t[1]); // read the padded consensus sequence remove_pads(&t[1], &t[2]); // construct the unpadded sequence // compute the array for mapping padded positions to unpadded positions p2u = realloc(p2u, t[1].m * sizeof(int)); for (i = k = 0; i < t[1].l; ++i) { p2u[i] = k; if (t[1].s[i] != '*') ++k; } // write out the SAM header and contig sequences fprintf(stderr, "H @SQ\tSN:%s\tLN:%llu\n", t[0].s, (unsigned long long)(t[is_padded?1:2].l)); // The SAM header line cns = &t[is_padded?1:2]; fprintf(stderr, "S >%s\n", t[0].s); for (i = 0; i < cns->l; i += LINE_LEN) { fputs("S ", stderr); for (k = 0; k < LINE_LEN && i + k < cns->l; ++k) fputc(cns->s[i + k], stderr); fputc('\n', stderr); } #define __padded2cigar(sp) do { \ int i, l_M = 0, l_D = 0; \ for (i = 0; i < sp.l; ++i) { \ if (sp.s[i] == '*') { \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ ++l_D; l_M = 0; \ } else { \ if (l_D) write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ ++l_M; l_D = 0; \ } \ } \ if (l_M) write_cigar(cigar, n_cigar, m_cigar, l_M<<4); \ else write_cigar(cigar, n_cigar, m_cigar, l_D<<4 | 2); \ } while (0) if (write_cns) { // write the consensus SAM line (dummy read) n_cigar = 0; if (is_padded) __padded2cigar(t[1]); else write_cigar(cigar, n_cigar, m_cigar, t[2].l<<4); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t516\t", &t[4]); kputsn(t[0].s, t[0].l, &t[4]); kputs("\t1\t60\t", &t[4]); for (i = 0; i < n_cigar; ++i) { kputw(cigar[i]>>4, &t[4]); 
kputc("MIDNSHP=X"[cigar[i]&0xf], &t[4]); } kputs("\t*\t0\t0\t", &t[4]); kputsn(t[2].s, t[2].l, &t[4]); kputs("\t*", &t[4]); } } else if (strcmp(s.s, "BQ") == 0) { // contig quality
int main_bedcov(int argc, char *argv[]) { gzFile fp; kstring_t str; kstream_t *ks; hts_idx_t **idx; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; int usage = 0; sam_global_args ga = SAM_GLOBAL_ARGS_INIT; static const struct option lopts[] = { SAM_OPT_GLOBAL_OPTIONS('-', 0, '-', '-', 0), { NULL, 0, NULL, 0 } }; while ((c = getopt_long(argc, argv, "Q:", lopts, NULL)) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; default: if (parse_sam_global_opt(c, optarg, lopts, &ga) == 0) break; /* else fall-through */ case '?': usage = 1; break; } if (usage) break; } if (usage || optind + 2 > argc) { fprintf(pysam_stderr, "Usage: samtools bedcov [options] <in.bed> <in1.bam> [...]\n\n"); fprintf(pysam_stderr, " -Q INT Only count bases of at least INT quality [0]\n"); sam_global_opt_help(pysam_stderr, "-.--."); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = sam_open_format(argv[i+optind+1], "r", &ga.in); if (aux[i]->fp) idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(pysam_stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); if (aux[i]->header == NULL) { fprintf(pysam_stderr, "ERROR: failed to read header for '%s'\n", argv[i+optind+1]); return 2; } } cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; if (tid < 0) goto 
bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { kputc('\t', &str); kputl(cnt[i], &str); } fputs(str.s, pysam_stdout) & fputc('\n', pysam_stdout); bam_mplp_destroy(mplp); continue; bed_error: fprintf(pysam_stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); bam_hdr_destroy(aux[i]->header); sam_close(aux[i]->fp); free(aux[i]); } free(aux); free(idx); free(str.s); sam_global_args_free(&ga); return 0; }
int ingest1(const char *input,const char *output,char *ref,bool exit_on_mismatch=true) { cerr << "Input: " << input << "\tOutput: "<<output<<endl; kstream_t *ks; kstring_t str = {0,0,0}; gzFile fp = gzopen(input, "r"); VarBuffer vbuf(1000); int prev_rid = -1; if(fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } char *out_fname = (char *)malloc(strlen(output)+5); strcpy(out_fname,output); strcat(out_fname,".tmp"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("depth: %s\n",out_fname); gzFile depth_fp = gzopen(out_fname, "wb1"); strcpy(out_fname,output); strcat(out_fname,".bcf"); if(fileexists(out_fname)) { fprintf(stderr,"%s file already exists. will not overwrite\n",out_fname); exit(1); } printf("variants: %s\n",out_fname); htsFile *variant_fp=hts_open(out_fname,"wb1"); if(variant_fp==NULL) { fprintf(stderr,"problem opening %s\n",input); exit(1); } ks = ks_init(fp); htsFile *hfp=hts_open(input, "r"); bcf_hdr_t *hdr_in = bcf_hdr_read(hfp); hts_close(hfp); //this is a hack to fix gvcfs where AD is incorrectly defined in the header. (vcf4.2 does not technically allow Number=R) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"AD"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=AD,Number=R,Type=Integer,Description=\"Allelic depths for the ref and alt alleles in the order listed. 
For indels this value only includes reads which confidently support each allele (posterior prob 0.999 or higher that read contains indicated allele vs all other intersecting indel alleles)\">") == 0); //this is a hack to fix broken gvcfs where GQ is incorrectly labelled as float (v4.3 spec says it should be integer) bcf_hdr_remove(hdr_in,BCF_HL_FMT,"GQ"); assert( bcf_hdr_append(hdr_in,"##FORMAT=<ID=GQ,Number=1,Type=Integer,Description=\"Genotype Quality\">") == 0); // bcf_hdr_t *hdr_out=hdr_in; bcf_hdr_t *hdr_out = bcf_hdr_dup(hdr_in); remove_hdr_lines(hdr_out,BCF_HL_INFO); remove_hdr_lines(hdr_out,BCF_HL_FLT); bcf_hdr_sync(hdr_out); //here we add FORMAT/PF. which is the pass filter flag for alts. assert( bcf_hdr_append(hdr_out,"##FORMAT=<ID=PF,Number=A,Type=Integer,Description=\"variant was PASS filter in original sample gvcf\">") == 0); args_t *norm_args = init_vcfnorm(hdr_out,ref); norm_args->check_ref |= CHECK_REF_WARN; bcf1_t *bcf_rec = bcf_init(); bcf_hdr_write(variant_fp, hdr_out); kstring_t work1 = {0,0,0}; int buf[5]; ks_tokaux_t aux; int ndec=0; int ref_len,alt_len; while( ks_getuntil(ks, '\n', &str, 0) >=0) { // fprintf(stderr,"%s\n",str.s); if(str.s[0]!='#') { char *ptr = kstrtok(str.s,"\t",&aux);//chrom ptr = kstrtok(NULL,NULL,&aux);//pos work1.l=0; kputsn(str.s,ptr-str.s-1, &work1); buf[0] = bcf_hdr_name2id(hdr_in, work1.s); assert( buf[0]>=0); buf[1]=atoi(ptr)-1; ptr = kstrtok(NULL,NULL,&aux);//ID ptr = kstrtok(NULL,NULL,&aux);//REF ref_len=0; while(ptr[ref_len]!='\t') ref_len++; ptr = kstrtok(NULL,NULL,&aux);//ALT bool is_variant=false; alt_len=0; while(ptr[alt_len]!='\t') alt_len++; if(ptr[0]!='.') is_variant=true; char * QUAL_ptr = kstrtok(NULL, NULL, &aux); assert (QUAL_ptr != NULL); for(int i=0;i<2;i++) ptr = kstrtok(NULL,NULL,&aux);// gets us to INFO //find END if it is there char *end_ptr=strstr(ptr,"END=") ; if(end_ptr!=NULL) buf[2]=atoi(end_ptr+4)-1; else buf[2]=buf[1]+alt_len-1; ptr = kstrtok(NULL,NULL,&aux);//FORMAT //find index of DP (if 
present) //if not present, dont output anything (indels ignored) char *DP_ptr = find_format(ptr,"DP"); int GQX = 0; int QUAL = 0; // AH: change code to use the minimum of GQ and QUAL fields if // GQX is not defined. See here: // https://support.basespace.illumina.com/knowledgebase/articles/144844-vcf-file // "GQXGenotype quality. GQX is the minimum of the GQ value // and the QUAL column. In general, these are similar values; // taking the minimum makes GQX the more conservative measure of // genotype quality." if(DP_ptr!=NULL) { buf[3]=atoi(DP_ptr); char *GQX_ptr = find_format(ptr,"GQX"); if (GQX_ptr == NULL) { GQX_ptr = find_format(ptr,"GQ"); GQX = atoi(GQX_ptr); if (QUAL_ptr[0] != '.') { QUAL = atoi(QUAL_ptr); if (QUAL < GQX) GQX = QUAL; } } else { GQX = atoi(GQX_ptr); } //trying to reduce entropy on GQ to get better compression performance. //1. rounds down to nearest 10. //2. sets gq to min(gq,100). buf[4]=GQX/10; buf[4]*=10; if(buf[4]>100) buf[4]=100; // printf("%d\t%d\t%d\t%d\t%d\n",buf[0],buf[1],buf[2],buf[3],buf[4]); if(gzwrite(depth_fp,buf,5*sizeof(int))!=(5*sizeof(int))) die("ERROR: problem writing "+(string)out_fname+".tmp"); } if(is_variant) {//wass this a variant? 
if so write it out to the bcf norm_args->ntotal++; vcf_parse(&str,hdr_in,bcf_rec); // cerr<<bcf_rec->rid<<":"<<bcf_rec->pos<<endl; if(prev_rid!=bcf_rec->rid) vbuf.flush(variant_fp,hdr_out); else vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); prev_rid=bcf_rec->rid; int32_t pass = bcf_has_filter(hdr_in, bcf_rec, "."); bcf_update_format_int32(hdr_out,bcf_rec,"PF",&pass,1); bcf_update_filter(hdr_out,bcf_rec,NULL,0); if(bcf_rec->n_allele>2) {//split multi-allelics (using vcfnorm.c from bcftools1.3 norm_args->nsplit++; split_multiallelic_to_biallelics(norm_args,bcf_rec ); for(int i=0;i<norm_args->ntmp_lines;i++){ remove_info(norm_args->tmp_lines[i]); if(realign(norm_args,norm_args->tmp_lines[i]) != ERR_REF_MISMATCH) ndec+=decompose(norm_args->tmp_lines[i],hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } } else { remove_info(bcf_rec); if( realign(norm_args,bcf_rec) != ERR_REF_MISMATCH) ndec+=decompose(bcf_rec,hdr_out,vbuf); else if(exit_on_mismatch) die("vcf did not match the reference"); else norm_args->nskipped++; } vbuf.flush(bcf_rec->pos,variant_fp,hdr_out); } } } vbuf.flush(variant_fp,hdr_out); bcf_hdr_destroy(hdr_in); bcf_hdr_destroy(hdr_out); bcf_destroy1(bcf_rec); ks_destroy(ks); gzclose(fp); gzclose(depth_fp); free(str.s); free(work1.s); hts_close(variant_fp); destroy_data(norm_args); fprintf(stderr,"Variant lines total/split/realigned/skipped:\t%d/%d/%d/%d\n", norm_args->ntotal,norm_args->nsplit,norm_args->nchanged,norm_args->nskipped); fprintf(stderr,"Decomposed %d MNPs\n", ndec); fprintf(stderr,"Indexing %s\n",out_fname); bcf_index_build(out_fname, BCF_LIDX_SHIFT); free(out_fname); return 0; }
int main_bedcov(int argc, char *argv[]) { gzFile fp; kstring_t str; kstream_t *ks; hts_idx_t **idx; aux_t **aux; int *n_plp, dret, i, n, c, min_mapQ = 0; int64_t *cnt; const bam_pileup1_t **plp; while ((c = getopt(argc, argv, "Q:")) >= 0) { switch (c) { case 'Q': min_mapQ = atoi(optarg); break; } } if (optind + 2 > argc) { fprintf(stderr, "Usage: samtools bedcov <in.bed> <in1.bam> [...]\n"); return 1; } memset(&str, 0, sizeof(kstring_t)); n = argc - optind - 1; aux = calloc(n, sizeof(aux_t*)); idx = calloc(n, sizeof(hts_idx_t*)); for (i = 0; i < n; ++i) { aux[i] = calloc(1, sizeof(aux_t)); aux[i]->min_mapQ = min_mapQ; aux[i]->fp = sam_open(argv[i+optind+1], "r"); idx[i] = sam_index_load(aux[i]->fp, argv[i+optind+1]); if (aux[i]->fp == 0 || idx[i] == 0) { fprintf(stderr, "ERROR: fail to open index BAM file '%s'\n", argv[i+optind+1]); return 2; } // TODO bgzf_set_cache_size(aux[i]->fp, 20); aux[i]->header = sam_hdr_read(aux[i]->fp); } cnt = calloc(n, 8); fp = gzopen(argv[optind], "rb"); ks = ks_init(fp); n_plp = calloc(n, sizeof(int)); plp = calloc(n, sizeof(bam_pileup1_t*)); while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { char *p, *q; int tid, beg, end, pos; bam_mplp_t mplp; for (p = q = str.s; *p && *p != '\t'; ++p); if (*p != '\t') goto bed_error; *p = 0; tid = bam_name2id(aux[0]->header, q); *p = '\t'; if (tid < 0) goto bed_error; for (q = p = p + 1; isdigit(*p); ++p); if (*p != '\t') goto bed_error; *p = 0; beg = atoi(q); *p = '\t'; for (q = p = p + 1; isdigit(*p); ++p); if (*p == '\t' || *p == 0) { int c = *p; *p = 0; end = atoi(q); *p = c; } else goto bed_error; for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); aux[i]->iter = sam_itr_queryi(idx[i], tid, beg, end); } mplp = bam_mplp_init(n, read_bam, (void**)aux); bam_mplp_set_maxcnt(mplp, 64000); memset(cnt, 0, 8 * n); while (bam_mplp_auto(mplp, &tid, &pos, n_plp, plp) > 0) if (pos >= beg && pos < end) for (i = 0; i < n; ++i) cnt[i] += n_plp[i]; for (i = 0; i < n; ++i) { 
kputc('\t', &str); kputl(cnt[i], &str); } puts(str.s); bam_mplp_destroy(mplp); continue; bed_error: fprintf(stderr, "Errors in BED line '%s'\n", str.s); } free(n_plp); free(plp); ks_destroy(ks); gzclose(fp); free(cnt); for (i = 0; i < n; ++i) { if (aux[i]->iter) hts_itr_destroy(aux[i]->iter); hts_idx_destroy(idx[i]); bam_hdr_destroy(aux[i]->header); sam_close(aux[i]->fp); free(aux[i]); } free(aux); free(idx); free(str.s); return 0; }
int main(int argc, char *argv[]) { int choice = atoi(argv[1]); bool isPairEnd = false; if (choice) { char *kmerPath = argv[2]; char *refPath = argv[3]; char *taxonomyNodesPath = argv[4]; char *giTaxidPath = argv[5]; char *dirPath = argv[6]; //vector<uint64_t> fKmer; _kmer = (uint8_t) atoi(argv[7]); string bwt_s; vector<uint32_t> nKmerTaxonID; fprintf(stderr,"start preprocessing......\n"); uint64_t hash_index_size = (uint64_t)1 <<((PREINDEXLEN<<1) + 1); uint64_t *hash_index = new uint64_t[hash_index_size](); preprocess(refPath, kmerPath, taxonomyNodesPath, giTaxidPath, bwt_s, nKmerTaxonID, hash_index); fprintf(stderr,"writing index....\n"); //char *dirPath = "."; // bwt bt(bwt_s.c_str(), bwt_s.length(),hash_index); bt.bwt_init(); bt.write_info(dirPath, nKmerTaxonID); //uint64_t sp,ep; //bt.exactMatch("GCTTCGCTGTTATTGGCACCAATTGGATCAC",31, sp,ep); } else { char *readPath; char *taxonomyNodesPath; char * dirPath; char *readPath_s; fprintf(stderr,"%d", argc); if (argc > 7 ) { isPairEnd = true; readPath = argv[2]; readPath_s = argv[3]; taxonomyNodesPath = argv[4]; dirPath = argv[5]; _kmer = (uint8_t) atoi(argv[6]); _interval = atoi(argv[7]); } else { readPath = argv[2]; taxonomyNodesPath = argv[3]; dirPath = argv[4]; _kmer = (uint8_t) atoi(argv[5]); _interval = atoi(argv[6]); } --_kmer; bwt bt(_kmer); fprintf(stderr,"loading index\n"); bt.load_info(dirPath); taxonTree(taxonomyNodesPath); //map<uint32_t, uint32_t>::iterator it = taxonomyTree.begin(); //while (it!=taxonomyTree.end()) { // cout<<it->first<<"\t"<<it->second<<endl; // ++it; //} //cout<<taxonomyTree.size()<<endl; fprintf(stderr,"classifying...\n"); gzFile fp; //uint32_t *nKmerTaxonID = bt.taxonIDTab; fp = gzopen(readPath, "r"); if (!fp) return FILE_OPEN_ERROR; kstream_t *_fp = ks_init(fp); kseq_t *seqs = (kseq_t *) calloc(N_NEEDED, sizeof(kseq_t)); if (!seqs) return MEM_ALLOCATE_ERROR; // //parameters for pair-end reads gzFile fp_s; kstream_t *_fp_s; kseq_t *seqs_s; if (isPairEnd) { fp_s = gzopen(readPath_s, 
"r"); if (!fp_s) return FILE_OPEN_ERROR; _fp_s = ks_init(fp_s); seqs_s = (kseq_t *) calloc(N_NEEDED, sizeof(kseq_t)); if (!seqs_s) return MEM_ALLOCATE_ERROR; } if (!seqs) return MEM_ALLOCATE_ERROR; cly_r *results = (cly_r *)calloc(2 * N_NEEDED, sizeof(cly_r)); struct timeval tv1, tv2; int n_seqs; total_sequences = 0; gettimeofday(&tv1,NULL); if (isPairEnd) { while ((n_seqs = read_reads(_fp, seqs, N_NEEDED) )> 0 && read_reads(_fp_s, seqs_s, N_NEEDED)) { classify_seq(seqs, n_seqs , bt, results, 0); classify_seq(seqs_s, n_seqs, bt, results, 1); output_results(results, n_seqs, isPairEnd); total_sequences += n_seqs; } } else { while ((n_seqs = read_reads(_fp, seqs, N_NEEDED) )> 0) { classify_seq(seqs, n_seqs , bt, results, 0); output_results(results, n_seqs, isPairEnd); total_sequences += n_seqs; } } gettimeofday(&tv2,NULL); report_stats(tv1,tv2); //fprintf(stderr,"%f seconds\n",((float)t)/CLOCKS_PER_SEC); if (results) free(results); if (seqs) free(seqs); } //char *st; //uint32_t *uts; //uint64_t z; //load_index(dirPath, st, uts, &z ); //cerr<<<<endl; //fprintf(stderr,"%s\n",st); //uint64_t sp, ep; //bwt b(st, uts, z); //b.bwt_init(); //char *read = "GGCT"; //cout<<b.exactMatch(read,4,sp,ep )<<endl; //cout<<sp<<"\t"<<ep<<endl; //read reads file //output(kmerValue, kmerInfo, _2kmers); return NORMAL_EXIT; }
int main(int argc, char *argv[]) { int c, dret, lineno = 0, n_rows = 0, m_rows = 0, n_cols = 0, max_hap = 0; int64_t n_missing = 0, n_tot = 0; gzFile fp; kstream_t *ks; kstring_t str = {0,0,0}; int8_t **C = 0; double **M, *X, min_maf = 0.0; char **names = 0; // _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_OVERFLOW | _MM_MASK_UNDERFLOW | _MM_MASK_DIV_ZERO)); _MM_SET_EXCEPTION_MASK(_MM_GET_EXCEPTION_MASK() & ~(_MM_MASK_INVALID | _MM_MASK_DIV_ZERO)); while ((c = getopt(argc, argv, "m:")) >= 0) { if (c == 'm') min_maf = atof(optarg); } if (argc - optind == 0) { fprintf(stderr, "Usage: naivepca [-m min_maf] <in.txt>\n"); return 1; } fp = strcmp(argv[optind], "-")? gzopen(argv[optind], "r") : gzdopen(fileno(stdin), "r"); if (fp == 0) { fprintf(stderr, "[E::%s] failed to open file '%s'. Abort.\n", __func__, argv[optind]); return 2; } ks = ks_init(fp); // read the matrix into C while (ks_getuntil(ks, KS_SEP_LINE, &str, &dret) >= 0) { int8_t *q; char *p, *name = str.s; int i; ++lineno; for (p = str.s; *p && *p != '\t' && *p != ' '; ++p); if (*p) { *p++ = 0; for (; *p && (*p == '\t' || *p == ' '); ++p); } if (*p == 0) { fprintf(stderr, "[W::%s] line %d has one field; skipped.\n", __func__, lineno); continue; } if (n_cols != 0) { if (n_cols != str.s + str.l - p) { fprintf(stderr, "[W::%s] line %d has a different number of columns; skipped.\n", __func__, lineno); continue; } } else n_cols = str.s + str.l - p; if (n_rows == m_rows) { m_rows = m_rows? m_rows<<1 : 16; C = (int8_t**)realloc(C, m_rows * sizeof(int8_t*)); names = (char**)realloc(names, m_rows * sizeof(char*)); } names[n_rows] = strdup(name); q = C[n_rows++] = (int8_t*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_cols; ++i) { if (p[i] >= '0' && p[i] <= '9') q[i] = p[i] - '0'; else q[i] = -1, ++n_missing; max_hap = max_hap > q[i]? 
max_hap : q[i]; } n_tot += n_cols; } free(str.s); fprintf(stderr, "[M::%s] read %d samples and %d sites; ploidy is %d\n", __func__, n_rows, n_cols, max_hap); fprintf(stderr, "[M::%s] %.3f%% of genotypes are missing\n", __func__, (double)n_missing / n_tot); { // normalize the matrix into M int i, j, *sum, *cnt, n_dropped = 0; double *mu, *pp; sum = (int*)calloc(n_cols, sizeof(int)); cnt = (int*)calloc(n_cols, sizeof(int)); mu = (double*)calloc(n_cols, sizeof(double)); pp = (double*)calloc(n_cols, sizeof(double)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; for (j = 0; j < n_cols; ++j) if (q[j] >= 0) sum[j] += q[j], ++cnt[j]; } for (j = 0; j < n_cols; ++j) { if (cnt[j] > 0) { mu[j] = (double)sum[j] / cnt[j]; pp[j] = mu[j] / max_hap; if (pp[j] < min_maf || 1. - pp[j] < min_maf) ++n_dropped; } else ++n_dropped; } fprintf(stderr, "[M::%s] %d rare sites are dropped\n", __func__, n_dropped); M = (double**)calloc(n_rows, sizeof(double*)); for (i = 0; i < n_rows; ++i) { int8_t *q = C[i]; double *r; r = M[i] = (double*)calloc(n_cols, sizeof(double)); for (j = 0; j < n_cols; ++j) r[j] = q[j] < 0 || pp[j] < min_maf || 1. - pp[j] < min_maf || pp[j] == 0. || 1 - pp[j] == 0. ? 0. : (q[j] - mu[j]) / sqrt(pp[j] * (1. 
- pp[j])); } free(sum); free(cnt); free(mu); free(pp); for (i = 0; i < n_rows; ++i) free(C[i]); free(C); } { // multiplication int i, j, k; X = (double*)calloc(n_rows * n_rows, sizeof(double)); for (i = 0; i < n_rows; ++i) { double *zi = M[i]; for (j = 0; j <= i; ++j) { double t = 0., *zj = M[j]; for (k = 0; k < n_cols; ++k) t += zi[k] * zj[k]; X[i*n_rows + j] = X[j*n_rows + i] = t / n_cols; } } for (i = 0; i < n_rows; ++i) free(M[i]); free(M); } { // print eigan vectors double *ev; int i, j; evsrt_t *evsrt; ev = (double*)calloc(n_rows, sizeof(double)); evsrt = (evsrt_t*)calloc(n_rows, sizeof(evsrt_t)); n_eigen_symm(X, n_rows, ev); for (i = 0; i < n_rows; ++i) evsrt[i].ev = ev[i], evsrt[i].i = i; ks_introsort(ev, n_rows, evsrt); for (i = 0; i < n_rows; ++i) { printf("%s", names[i]); for (j = 0; j < n_rows; ++j) printf("\t%.6f", X[i*n_rows + evsrt[j].i] * evsrt[j].ev); putchar('\n'); free(names[i]); } free(ev); free(evsrt); free(X); free(names); } ks_destroy(ks); gzclose(fp); return 0; }