/**
 * Reads the header section of a VCF file.
 *
 * Lines are read from vcf_fh until the "#CHROM" line has been consumed.
 * Every line starting with "##" and the "#CHROM" line itself are counted
 * in vcf_info->n_header_line. Any other line seen before "#CHROM" is
 * reported with my_err().
 *
 * The "#CHROM" line is split on tabs. The leading columns are compared
 * against vcf_fix_headers (a mismatch only produces a warning) and every
 * remaining column is taken as a sample name: vcf_info->n_sample is set
 * to the number of such columns and vcf_info->sample_names is filled
 * with one util_str_dup() result per sample.
 *
 * If the stream ends before a "#CHROM" line is seen, the function returns
 * without setting n_sample or sample_names.
 *
 * vcf_info->buf is tokenized in place (delimiters are overwritten by
 * strsep).
 */
void vcf_read_header(gzFile vcf_fh, VCFInfo *vcf_info) {
  char *line, *cur, *token, *p;
  int tok_num, n_col;
  int n_fix_header, i;
  /* only tabs delimit columns; sample names are split on tabs alone */
  const char delim[] = "\t";

  n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *);

  vcf_info->n_header_line = 0;

  while(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) != -1) {
    line = vcf_info->buf;

    if(util_str_starts_with(line, "##")) {
      /* meta-information header line */
      vcf_info->n_header_line += 1;
    }
    else if(util_str_starts_with(line, "#CHROM")) {
      /* this should be the last header line; it lists the fixed
       * fields followed by the sample names
       */
      vcf_info->n_header_line += 1;

      /* strsep() produces one token per delimiter character plus one,
       * so the column count is known before the line is tokenized and
       * the sample name array can be sized up front.
       */
      n_col = 1;
      for(p = line; *p != '\0'; p++) {
        if(strchr(delim, *p) != NULL) {
          n_col += 1;
        }
      }

      /* Without this check a short header line gives a negative sample
       * count and a wrapped-around allocation size below.
       * NOTE(review): assumes my_err() does not return -- confirm.
       */
      if(n_col < n_fix_header) {
        my_err("expected at least %d columns in #CHROM header line "
               "but got %d", n_fix_header, n_col);
      }

      vcf_info->n_sample = n_col - n_fix_header;
      vcf_info->sample_names = my_malloc(sizeof(char *) * vcf_info->n_sample);

      /* single pass: check the fixed column names, then collect the
       * sample names from the remaining columns
       */
      cur = line;
      tok_num = 0;
      i = 0;
      while((token = strsep(&cur, delim)) != NULL) {
        if(tok_num < n_fix_header) {
          if(strcmp(token, vcf_fix_headers[tok_num]) != 0) {
            my_warn("expected token %d to be %s but got '%s'",
                    tok_num, vcf_fix_headers[tok_num], token);
          }
        } else {
          vcf_info->sample_names[i] = util_str_dup(token);
          i += 1;
        }
        tok_num += 1;
      }
      break;
    }
    else {
      my_err("expected last line in header to start with #CHROM");
    }
  }
}
/** * Gets next line of IMPUTE file and parses it into ImputeInfo datastructure. * * IMPUTE files are described here: * http://www.stats.ox.ac.uk/~marchini/software/gwas/file_format.html * * example line: * --- rs149201999 16050408 T C 0.966 0.034 0 0.395 0.467 .... * * If geno_probs array is non-null genotype probabilities are parsed and * stored in the provided array. The array must be of length * n_samples*3. * * If haplotypes array is non-null phased genotypes are parsed and * stored in the provided array. The array must be of length * n_samples*2. * * IMPUTE files contain EITHER haplotypes OR genotypes so only * one of geno_probs or haplotypes should be non-null (at most). * * Returns 0 on success, -1 if at EOF. */ int impute_read_line(gzFile fh, ImputeInfo *impute_info, SNP *snp, float *geno_probs, char *haplotypes) { char *cur, *token; int n_fix_header, ref_len, alt_len; size_t tok_num; const char delim[] = " \t"; /* read a line */ if(util_gzgetline(fh, &impute_info->buf, &impute_info->buf_size) == -1) { return -1; } cur = impute_info->buf; tok_num = 0; /* SNP name, often just set to "---" */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } /* SNP identifier (rs_id) */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } util_strncpy(snp->name, token, sizeof(snp->name)); /* pos */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } snp->pos = util_parse_long(token); /* ref allele */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } util_strncpy(snp->allele1, token, sizeof(snp->allele1)); /* alt allele */ token = strsep(&cur, delim); if(token == NULL) { my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER); } alt_len = util_strncpy(snp->allele2, token, sizeof(snp->allele2)); 
/* now parse haplotypes and/or genotype likelihoods */ if(geno_probs && haplotypes) { my_err("impute2 files contain EITHER genotypes or haplotypes, but " "both requested\n"); } else if(geno_probs) { impute_parse_geno_probs(geno_probs, cur, impute_info->n_samples); } else if(haplotypes) { impute_parse_haplotypes(haplotypes, cur, impute_info->n_samples); } }
/**
 * Returns the next tab-delimited field of a VCF line, or reports an
 * error through my_err() when the line has run out of fields.
 * n_expected is only used in the error message. *cur is advanced past
 * the field by strsep().
 */
static char *vcf_require_token(char **cur, const char *delim,
                               int n_expected) {
  char *token = strsep(cur, delim);

  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", n_expected);
  }
  return token;
}


/**
 * Gets next line of VCF file and parses it into VCFInfo datastructure.
 *
 * The position, ID, ref allele and alt allele are stored in snp; the
 * QUAL, FILTER, INFO and FORMAT fields are stored in vcf_info. The
 * chromosome field is read but not stored. vcf_info->ref_len and
 * vcf_info->alt_len are always set to 0.
 *
 * If geno_probs array is non-null genotype likelihoods are parsed and
 * stored in the provided array. The array must be of length
 * n_sample*3.
 *
 * If haplotypes array is non-null phased genotypes are parsed and
 * stored in the provided array. The array must be of length
 * n_sample*2.
 *
 * The line buffer in vcf_info is tokenized in place.
 *
 * Returns 0 on success, -1 if at EOF.
 */
int vcf_read_line(gzFile vcf_fh, VCFInfo *vcf_info, SNP *snp,
                  float *geno_probs, char *haplotypes) {
  char *cur, *token;
  int n_fix_header;

  /* Used to allow space or tab delimiters here but now only allow
   * tab. This is because VCF specification indicates that fields
   * should be tab-delimited, and occasionally some fields contain
   * spaces.
   */
  const char delim[] = "\t";

  n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *);

  /* read a line */
  if(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) == -1) {
    return -1;
  }

  cur = vcf_info->buf;

  /* chrom: we don't bother to store chromosome since we store
   * SNPs from each chromosome in their own table
   */
  vcf_require_token(&cur, delim, n_fix_header);

  /* pos */
  token = vcf_require_token(&cur, delim, n_fix_header);
  snp->pos = util_parse_long(token);

  /* ID */
  token = vcf_require_token(&cur, delim, n_fix_header);
  util_strncpy(snp->name, token, sizeof(snp->name));

  /* ref: used to warn when a long allele was truncated by the copy,
   * but that made the program too chatty, so the recorded length is
   * simply zeroed
   */
  token = vcf_require_token(&cur, delim, n_fix_header);
  util_strncpy(snp->allele1, token, sizeof(snp->allele1));
  vcf_info->ref_len = 0;

  /* alt: truncation warning disabled for the same reason as ref */
  token = vcf_require_token(&cur, delim, n_fix_header);
  util_strncpy(snp->allele2, token, sizeof(snp->allele2));
  vcf_info->alt_len = 0;

  /* qual */
  token = vcf_require_token(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->qual, token, sizeof(vcf_info->qual));

  /* filter */
  token = vcf_require_token(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->filter, token, sizeof(vcf_info->filter));

  /* info */
  token = vcf_require_token(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->info, token, sizeof(vcf_info->info));

  /* format */
  token = vcf_require_token(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->format, token, sizeof(vcf_info->format));

  /* now parse haplotypes and/or genotype likelihoods */
  if(geno_probs && haplotypes) {
    /* Both genotype probs and haplotypes requested.
     * Need to copy string because it is modified
     * by the tokenizing in the parsing functions.
     *
     * This could be made more efficient by doing the parsing
     * of both types of data at same time
     */
    char *cur_copy = NULL;

    /* strsep() leaves cur NULL when FORMAT was the last field on the
     * line; strlen()/strcpy() must not be called on it in that case.
     * NULL is then handed to both parsers, as the single-request
     * branches below already do.
     */
    if(cur != NULL) {
      cur_copy = my_malloc(strlen(cur)+1);
      strcpy(cur_copy, cur);
    }

    vcf_parse_geno_probs(vcf_info, geno_probs, cur_copy);
    if(cur_copy != NULL) {
      my_free(cur_copy);
    }

    vcf_parse_haplotypes(vcf_info, haplotypes, cur);
  }
  else if(geno_probs) {
    vcf_parse_geno_probs(vcf_info, geno_probs, cur);
  }
  else if(haplotypes) {
    vcf_parse_haplotypes(vcf_info, haplotypes, cur);
  }

  return 0;
}