示例#1
0
文件: vcf.c 项目: smozaffari/WASP
/**
 * Reads the header of a VCF file, up to and including the "#CHROM" line.
 *
 * Each line beginning with "##", and the "#CHROM" line itself, is counted
 * in vcf_info->n_header_line. The leading columns of the "#CHROM" line are
 * compared against vcf_fix_headers and a warning is issued for every
 * mismatch. All columns after the fixed ones are taken as sample names:
 * their count is stored in vcf_info->n_sample and a copy of each name is
 * stored in the newly allocated vcf_info->sample_names array.
 *
 * Any line that starts with neither "##" nor "#CHROM" is reported with
 * my_err. If the input ends before a "#CHROM" line is seen, the function
 * returns without setting n_sample or sample_names.
 *
 * vcf_info->buf is used as the line buffer and is modified by tokenizing.
 */
void vcf_read_header(gzFile vcf_fh, VCFInfo *vcf_info) {
  char *line, *cur, *token;
  int tok_num;
  int n_fix_header, i;

  /* only tabs separate columns; some fields may contain spaces */
  const char delim[] = "\t";

  n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *);

  vcf_info->n_header_line = 0;

  while(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) != -1) {
    line = vcf_info->buf;

    if(util_str_starts_with(line, "##")) {
      /* meta-information header line */
      vcf_info->n_header_line += 1;
    }
    else if(util_str_starts_with(line, "#CHROM")) {
      /* this should be last header line that contains list of fixed fields */
      vcf_info->n_header_line += 1;

      /* strsep overwrites delimiters in place, so keep an untouched
       * copy of the line for the second pass that extracts sample names
       */
      cur = vcf_info->buf;
      line = util_str_dup(vcf_info->buf);

      /* first pass: validate fixed column names and count all columns */
      tok_num = 0;
      while((token = strsep(&cur, delim)) != NULL) {
	if(tok_num < n_fix_header) {
	  if(strcmp(token, vcf_fix_headers[tok_num]) != 0) {
	    my_warn("expected token %d to be %s but got '%s'",
		    tok_num, vcf_fix_headers[tok_num], token);
	  }
	}
	tok_num += 1;
      }

      /* A header with fewer columns than the fixed set would make the
       * sample count negative and turn the allocation size below into
       * a huge unsigned value; report it explicitly instead.
       * NOTE(review): assumes my_err does not return, as the rest of
       * the parsing code appears to - confirm.
       */
      if(tok_num < n_fix_header) {
	my_err("expected at least %d columns in #CHROM header line "
	       "but got %d", n_fix_header, tok_num);
      }
      vcf_info->n_sample = tok_num - n_fix_header;

      /*
       * second pass: read sample names from remaining part of header
       */
      vcf_info->sample_names = my_malloc(sizeof(char *) * vcf_info->n_sample);
      cur = line;
      tok_num = 0;
      i = 0;
      while((token = strsep(&cur, delim)) != NULL) {
	if(tok_num >= n_fix_header) {
	  vcf_info->sample_names[i] = util_str_dup(token);
	  i += 1;
	}
	tok_num += 1;
      }
      my_free(line);

      break;
    } else {
      my_err("expected last line in header to start with #CHROM");
    }
  }
}
示例#2
0
文件: impute.c 项目: Q-KIM/WASP
/**
 * Gets next line of IMPUTE file and parses it into ImputeInfo datastructure.
 *
 * IMPUTE files are described here:
 * http://www.stats.ox.ac.uk/~marchini/software/gwas/file_format.html
 *
 * example line:
 *     --- rs149201999 16050408 T C 0.966 0.034 0 0.395 0.467 ....
 *
 * The first five space- or tab-separated tokens are consumed as:
 * SNP name (ignored), SNP identifier (copied to snp->name), position
 * (snp->pos), first allele (snp->allele1) and second allele
 * (snp->allele2). A missing token is reported with my_err.
 *
 * If geno_probs array is non-null genotype probabilities are parsed and
 * stored in the provided array. The array must be of length
 * n_samples*3.
 *
 * If haplotypes array is non-null phased genotypes are parsed and
 * stored in the provided array. The array must be of length
 * n_samples*2.
 *
 * IMPUTE files contain EITHER haplotypes OR genotypes so only
 * one of geno_probs or haplotypes should be non-null (at most).
 * Passing both is reported with my_err.
 *
 * Returns 0 on success, -1 if at EOF.
 */
int impute_read_line(gzFile fh, ImputeInfo *impute_info, SNP *snp,
		     float *geno_probs, char *haplotypes) {
  char *cur, *token;
  const char delim[] = " \t";

  /* read a line */
  if(util_gzgetline(fh, &impute_info->buf, &impute_info->buf_size) == -1) {
    return -1;
  }

  /* strsep tokenizes the line buffer in place */
  cur = impute_info->buf;

  /* SNP name, often just set to "---"; read only to skip past it */
  token = strsep(&cur, delim);
  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER);
  }

  /* SNP identifier (rs_id) */
  token = strsep(&cur, delim);
  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER);
  }
  util_strncpy(snp->name, token, sizeof(snp->name));

  /* pos */
  token = strsep(&cur, delim);
  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER);
  }
  snp->pos = util_parse_long(token);

  /* ref allele */
  token = strsep(&cur, delim);
  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER);
  }
  util_strncpy(snp->allele1, token, sizeof(snp->allele1));

  /* alt allele */
  token = strsep(&cur, delim);
  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", IMPUTE_FIX_HEADER);
  }
  util_strncpy(snp->allele2, token, sizeof(snp->allele2));

  /* now parse haplotypes and/or genotype likelihoods from the
   * remainder of the line
   */
  if(geno_probs && haplotypes) {
    my_err("impute2 files contain EITHER genotypes or haplotypes, but "
	   "both requested\n");
  }
  else if(geno_probs) {
    impute_parse_geno_probs(geno_probs, cur, impute_info->n_samples);
  }
  else if(haplotypes) {
    impute_parse_haplotypes(haplotypes, cur, impute_info->n_samples);
  }

  /* the original fell off the end of this non-void function; callers
   * are documented to receive 0 on success
   */
  return 0;
}
示例#3
0
文件: vcf.c 项目: smozaffari/WASP
/**
 * Returns the next delimiter-separated field of a VCF data line and
 * advances *cur past it. If the line has no more fields the problem
 * is reported with my_err (n_fix_header is only used in the message).
 */
static char *vcf_read_line_next_field(char **cur, const char *delim,
				      int n_fix_header) {
  char *token = strsep(cur, delim);

  if(token == NULL) {
    my_err("expected at least %d tokens per line\n", n_fix_header);
  }
  return token;
}


/**
 * Gets next line of VCF file and parses it into VCFInfo datastructure.
 *
 * The fixed tab-separated fields are consumed in order: chrom (skipped),
 * pos (snp->pos), ID (snp->name), ref (snp->allele1), alt (snp->allele2),
 * qual, filter, info and format (stored in the matching vcf_info members).
 * vcf_info->ref_len and vcf_info->alt_len are set to 0.
 *
 * If geno_probs array is non-null genotype likelihoods are parsed and
 * stored in the provided array. The array must be of length
 * n_sample*3.
 *
 * If haplotypes array is non-null phased genotypes are parsed and
 * stored in the provided array. The array must be of length
 * n_sample*2.
 *
 * Returns 0 on success, -1 if at EOF.
 */
int vcf_read_line(gzFile vcf_fh, VCFInfo *vcf_info, SNP *snp,
		  float *geno_probs, char *haplotypes) {
  char *cur, *token;
  int n_fix_header;

  /* Used to allow space or tab delimiters here but now only allow
   * tab.  This is because VCF specification indicates that fields
   * should be tab-delimited, and occasionally some fields contain
   * spaces.
   */
  const char delim[] = "\t";

  n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *);

  /* read a line */
  if(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) == -1) {
    return -1;
  }

  /* fields are split in place inside the line buffer */
  cur = vcf_info->buf;

  /* chrom: we don't bother to store chromosome since we store
   * SNPs from each chromosome in their own table
   */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);

  /* pos */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  snp->pos = util_parse_long(token);

  /* ID */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  util_strncpy(snp->name, token, sizeof(snp->name));

  /* ref */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  util_strncpy(snp->allele1, token, sizeof(snp->allele1));

  /* used to record the untruncated allele length and warn about
   * truncations, but that makes program too chatty if there are a
   * lot of them
   */
  vcf_info->ref_len = 0;

  /* alt */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  util_strncpy(snp->allele2, token, sizeof(snp->allele2));
  vcf_info->alt_len = 0;

  /* qual */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->qual, token, sizeof(vcf_info->qual));

  /* filter */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->filter, token, sizeof(vcf_info->filter));

  /* info */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->info, token, sizeof(vcf_info->info));

  /* format */
  token = vcf_read_line_next_field(&cur, delim, n_fix_header);
  util_strncpy(vcf_info->format, token, sizeof(vcf_info->format));

  /* now parse haplotypes and/or genotype likelihoods */
  if(geno_probs && haplotypes) {
    /* Both genotype probs and haplotypes requested.
     * Need to copy string because it is modified
     * by the tokenizing in the parsing functions.
     *
     * This could be made more efficient by doing the parsing
     * of both types of data at same time
     */
    if(cur == NULL) {
      /* Line ended right after the format field, so strsep left
       * nothing to copy. Hand the parser the same NULL it would
       * receive in the single-request branches instead of calling
       * strlen on a NULL pointer.
       */
      vcf_parse_geno_probs(vcf_info, geno_probs, NULL);
    } else {
      char *cur_copy;

      cur_copy = my_malloc(strlen(cur)+1);
      strcpy(cur_copy, cur);

      vcf_parse_geno_probs(vcf_info, geno_probs, cur_copy);
      my_free(cur_copy);
    }

    vcf_parse_haplotypes(vcf_info, haplotypes, cur);
  } else if(geno_probs) {
    vcf_parse_geno_probs(vcf_info, geno_probs, cur);
  } else if(haplotypes) {
    vcf_parse_haplotypes(vcf_info, haplotypes, cur);
  }

  return 0;
}