Example #1
0
/**
 * Prints to stderr a guess at the FASTQ quality encoding, based on the
 * smallest and largest quality characters that were observed.
 *
 * min_qual / max_qual are raw quality characters. A value of -1 for
 * either of them means that no quality values were seen; in that case
 * a note is printed and no guess is made.
 */
static void report_qual_type(char min_qual, char max_qual) {
  fprintf(stderr, "\n");
  fprintf(stderr, "guessing quality format:\n");

  /* compare against (char)-1 so that the sentinel is also recognised
   * on platforms where plain char is unsigned
   */
  if((min_qual == (char)-1) || (max_qual == (char)-1)) {
    fprintf(stderr, "  no valid quality scores to guess quality type from\n");
    /* nothing to base a guess on: do not fall through and report a
     * format derived from the sentinel value
     */
    return;
  }

  fprintf(stderr, "  min_qual:%c, max_qual:%c\n", min_qual, max_qual);

  if(min_qual < MIN_QUAL_SOLEXA) {
    fprintf(stderr, "  quality vals appear to be Sanger / Illum 1.8+ format"
            " (Phred+33)\n");

    /* a low minimum together with a very high maximum suggests that two
     * encodings are mixed in the same file
     */
    if(max_qual >= 'h') {
      my_warn("%s:%d: quality vals may be mix of Phred+33 and Phred+64\n"
              "         You should probably fix this.", __FILE__, __LINE__);
    }
    return;
  }

  if(min_qual < MIN_QUAL_ILLUM_1_3) {
    my_warn("%s:%d: quality vals appear to be OLD solexa format, "
            "may need to convert prior to processing.\n", __FILE__, __LINE__);
  }
  else if(min_qual < MIN_QUAL_ILLUM_1_5) {
    fprintf(stderr, "  quality vals appear to be Illumina 1.3+ format "
            "(Phred+64)\n  should probably use -I flag for bwa aln "
            "(relevant only if using -q argument)\n");
  }
  else {
    fprintf(stderr, "  quality vals appear to be Illumina 1.5+ format "
            "(Phred+64)\n  should probably use -I flag for bwa aln "
            "(relevant only if using -q argument)\n");
  }
}
Example #2
0
/**
 * Looks up a seed in the seed table and fills in the provided
 * SeedMatch structure: n_kmer receives the number of kmers the seed
 * expands to, kmer_ids receives the id of each of those kmers, and
 * n_match receives the sum of the table's match counts for them.
 *
 * A seed that contains ambiguity codes is first expanded into the
 * non-ambiguous sequences it can represent, so it may yield several
 * kmer ids. A seed without ambiguity codes always yields exactly one.
 * The kmer ids can be used to index matches in the seed match table.
 */
void seed_table_lookup(SeedTable *seed_tab, unsigned char *nucs, 
		       SeedMatch *seed_match) {
  unsigned char **resolved;
  unsigned int idx, id;

  if(!ambi_has_ambi(nucs, seed_tab->seed_len)) {
    /* plain seed: it corresponds to exactly one kmer */
    id = kmer_nucs_to_id(nucs, seed_tab->seed_len);
    seed_match->n_match = seed_tab->n_match[id];
    seed_match->n_kmer = 1;
    seed_match->kmer_ids[0] = id;
    return;
  }

  /* expand the ambiguous seed into the table's scratch array of
   * non-ambiguous sequences
   */
  seed_match->n_kmer = ambi_resolve(nucs, seed_tab->seed_len,
                                    seed_tab->unambig_nucs,
                                    SEED_TABLE_MAX_UNAMBIG);

  if(seed_match->n_kmer == 0) {
    my_warn("seed contains too many ambiguous nucleotides");
  }

  /* record the id of every expanded kmer and add up their matches;
   * with n_kmer == 0 the loop body never runs and n_match stays 0
   */
  resolved = seed_tab->unambig_nucs;
  seed_match->n_match = 0;
  for(idx = 0; idx < seed_match->n_kmer; idx++) {
    id = kmer_nucs_to_id(resolved[idx], seed_tab->seed_len);
    seed_match->n_match += seed_tab->n_match[id];
    seed_match->kmer_ids[idx] = id;
  }
}
Example #3
0
/*
** Returns the next segment of input read from fd.
**
** The function keeps a static buffer (buff) and a static cursor (i)
** between calls. When the cursor is at the start of the buffer or on
** its terminating '\0', up to 4096 bytes are read from fd into the
** buffer. Otherwise the data left over from a previous call is used.
** The cursor is then advanced to the next '\n', ';' or '\0' and the
** result of read_one_line() on the start of that segment is returned.
**
** fd:  file descriptor to read from.
** ret: used only as a local scratch variable; it is assigned before it
**      is ever read, so the value passed by the caller has no effect.
**
** Returns NULL when no data could be obtained (end of input or error).
** Because of the static state the function is not reentrant.
*/
char		*my_read(int fd, int ret)
{
  static int	i = 0;
  static char	buff[4097];

  /* buffer is empty or fully consumed: fetch a new chunk */
  if (i == 0 || buff[i] == '\0')
    {
      i = 0;
      /* NOTE(review): my_str_isprintable() is called before buff[ret]
      ** is set to '\0' below, so after a short read it may also look at
      ** bytes left from an earlier, longer read - confirm this is
      ** harmless. my_fread() presumably re-reads the input in another
      ** way when the data is not printable; verify against its
      ** definition. */
      if ((ret = read(fd, buff, 4096)) > 0 && !my_str_isprintable(buff))
	ret = my_fread(buff, fd);
      if (ret <= 0)
        {
	  /* ret == 0 is a silent end of input, a negative value is
	  ** reported through my_warn() */
	  if (ret)
	    my_warn(NULL, fd);
          return (NULL);
        }
      /* terminate the data; assumes ret <= 4096 also holds when it
      ** comes from my_fread() - TODO confirm */
      buff[ret] = '\0';
    }
  /* from here on ret holds the start offset of the current segment */
  ret = i;
  /* jump_inhibitors() receives the cursor by address and is expected to
  ** advance it (presumably skipping over quoted or escaped characters;
  ** verify against its definition) */
  while (buff[i] && buff[i] != '\n' && buff[i] != ';')
    jump_inhibitors(buff, &i);
  /* step over the delimiter so the next call starts after it */
  if (buff[i])
    i = i + 1;
  return (read_one_line(buff + ret));
}
Example #4
0
/**
 * Checks that every base of the read's sequence line (line2) is a
 * recognised nucleotide or ambiguity code. On the first unrecognised
 * base a warning is issued, the read's status is set to FASTQ_ERR and
 * the scan stops. Returns the read's (possibly updated) status.
 */
static int check_seq(ReadSeq *read) {
  /* accepted nucleotide identifiers, including ambiguity codes */
  static const char *valid_nucs = "ATCGNatcgnMRWSYKmrwsyk";
  const char *candidate;
  char base;
  int pos;

  for(pos = 0; pos < read->read_len; pos++) {
    base = read->line2[pos];

    /* walk the list of accepted characters until the base is found or
     * the end of the list is reached
     */
    candidate = valid_nucs;
    while((*candidate != base) && (*candidate != '\0')) {
      candidate++;
    }

    if(*candidate != base) {
      /* ran off the end of the list without finding the base */
      my_warn("%s:%d: read contains invalid base '%c'", 
              __FILE__, __LINE__, base);
      read->status = FASTQ_ERR;
      break;
    }
  }

  return read->status;
}
Example #5
0
/**
 * Checks that the header has expected 7 fields. This could be changed
 * to allow for variety of header types.
 *
 * The fields are parsed from read->line1 into the machine, lane, tile,
 * x, y, run_num and type attributes of the read. If fewer than 7 fields
 * can be parsed a warning is issued and the read's status is set to
 * FASTQ_ERR. Returns the read's (possibly updated) status.
 */
static int check_header(ReadSeq *read) {
  char *offset;
  int assigned;

  /* parse read attributes from header line, assuming it has standard
   * formatting. Example header line:
   * IPAR1:1:2:18330:12837#0/1
   */
  offset = index(read->line1, ':');
  if(offset == NULL) {
    assigned = 0;
  } else {
    /* replace first ':' with ' ', so that string directive of
     * sscanf stops after parsing machine name
     */
    offset[0] = ' ';

    /* parse attributes
     * NOTE(review): "%s" places no limit on the number of characters
     * stored in read->machine; consider adding a field width that
     * matches the size of that buffer.
     */
    assigned = sscanf(read->line1, "@%s %d:%d:%d:%d#%d/%d",
                      read->machine, &read->lane, &read->tile, &read->x, 
                      &read->y,  &read->run_num, &read->type);

    /* restore the header line */
    offset[0] = ':';

    if(assigned < 0) {
      /* sscanf returns EOF when the input ends before the first
       * conversion; report that as "no fields parsed"
       */
      assigned = 0;
    }
  }

  if(assigned != 7) {
    /* failed to completely parse header */
    my_warn("%s:%d: could only parse %d out of 7 expected fields from header",
            __FILE__, __LINE__, assigned);
    read->status = FASTQ_ERR;
  }

  return read->status;
}
Example #6
0
/**
 * Parses a read in fastq format.
 * Returns FASTQ_END at end of file, FASTQ_OK on success, FASTQ_ERR on problem
 *
 * The four lines of the record are read into the provided ReadSeq, the
 * '+' separator line is checked, the sequence and quality lines are
 * required to be non-empty and of equal length, and finally the bases
 * and quality characters themselves are validated. The return value is
 * also stored in read->status.
 */
static int parse_fastq_read(ReadSeq *read, gzFile f) {
  size_t qual_len;

  read->status = FASTQ_OK;

  read_fastq_lines(read, f);
  if(read->status != FASTQ_OK) {
    return read->status;
  }

  /* NOTE: strict validation of the header fields (check_header) is
   * currently not performed
   */

  /* third line should start with '+' separator */
  if(read->line3[0] != '+') {
    my_warn("%s:%d: third line does not start with '+'", 
            __FILE__, __LINE__);
    read->status = FASTQ_ERR;
    return read->status;
  }
   
  /* check length of read and quality */
  read->read_len = strlen(read->line2);
  qual_len = strlen(read->line4);
  
  if(read->read_len < 1) {
    my_warn("%s:%d: read has no bases\n", __FILE__, __LINE__);
    /* an empty read is a problem: flag it, otherwise the caller would
     * treat the record as valid although its quality range was never
     * computed
     */
    read->status = FASTQ_ERR;
    return read->status;
  }

  /* quality line must have one score per base */
  if(read->read_len != qual_len) {
    /* cast so that the arguments match the %ld conversions */
    my_warn("%s:%d: read len (%ld) does not match quality score len (%ld)",
            __FILE__, __LINE__, (long)read->read_len, (long)qual_len);
    read->status = FASTQ_ERR;
    return read->status;
  }

  /* both checks set read->status to FASTQ_ERR on failure */
  check_seq(read);
  check_qual(read);

  return read->status;
}
Example #7
0
/**
 * Reads the header of a VCF file.
 *
 * Lines starting with "##" are counted and skipped. The line starting
 * with "#CHROM" ends the header: its leading tab-delimited tokens are
 * compared against vcf_fix_headers (a warning is issued for each
 * mismatch) and every remaining token is stored as a sample name.
 *
 * On return vcf_info->n_header_line holds the number of header lines
 * read. If a "#CHROM" line was found, vcf_info->n_sample and
 * vcf_info->sample_names (newly allocated, as are the names) are set.
 * Any other line encountered before "#CHROM" is reported with my_err().
 */
void vcf_read_header(gzFile vcf_fh, VCFInfo *vcf_info) {
  char *line, *cur, *token;
  int tok_num;
  int n_fix_header, i;
  
  /* fields are delimited by tabs only (not by spaces) */
  const char delim[] = "\t";

  n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *);

  vcf_info->n_header_line = 0;
  
  while(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) != -1) {
    line = vcf_info->buf;
  
    if(util_str_starts_with(line, "##")) {
      /* header line */
      vcf_info->n_header_line += 1;
    }
    else if(util_str_starts_with(line, "#CHROM")) {
      /* this should be last header line that contains list of fixed fields */
      vcf_info->n_header_line += 1;

      /* strsep overwrites the delimiters, so keep a copy of the line
       * for the second pass that extracts the sample names
       */
      cur = vcf_info->buf;
      line = util_str_dup(vcf_info->buf);

      /* first pass: check fixed fields and count all tokens */
      tok_num = 0;
      while((token = strsep(&cur, delim)) != NULL) {
        if(tok_num < n_fix_header) {
          if(strcmp(token, vcf_fix_headers[tok_num]) != 0) {
            my_warn("expected token %d to be %s but got '%s'",
                    tok_num, vcf_fix_headers[tok_num], token);
          }
        }
        tok_num += 1;
      }

      /* a header with fewer columns than there are fixed fields has no
       * samples; never let the count (and the allocation size derived
       * from it) become negative
       */
      if(tok_num > n_fix_header) {
        vcf_info->n_sample = tok_num - n_fix_header;
      } else {
        vcf_info->n_sample = 0;
      }

      /*
       * read sample names from remaining part of header
       */
      vcf_info->sample_names = my_malloc(sizeof(char *) * vcf_info->n_sample);
      cur = line;
      tok_num = 0;
      i = 0;
      while((token = strsep(&cur, delim)) != NULL) {
        if(tok_num >= n_fix_header) {
          vcf_info->sample_names[i] = util_str_dup(token);
          i += 1;
        }
        tok_num += 1;
      }
      my_free(line);

      break;
    } else {
      my_err("expected last line in header to start with #CHROM");
    }
  }
}
Example #8
0
/**
 * Determines the lowest and highest quality character of the read's
 * quality line (line4), stores them in read->min_qual / read->max_qual
 * and verifies that both lie within [MIN_QUAL, MAX_QUAL]. A value
 * outside that range produces a warning and sets the read's status to
 * FASTQ_ERR. Returns the read's (possibly updated) status.
 */
static int check_qual(ReadSeq *read) {
  char q;
  int pos;

  /* -1 marks "no quality value seen yet" */
  read->min_qual = -1;
  read->max_qual = -1;

  for(pos = 0; pos < read->read_len; pos++) {
    q = read->line4[pos];

    if(read->min_qual == -1) {
      /* first value seen: it is both the minimum and the maximum */
      read->min_qual = q;
      read->max_qual = q;
      continue;
    }

    if(q < read->min_qual) {
      read->min_qual = q;
    }
    if(q > read->max_qual) {
      read->max_qual = q;
    }
  }

  /* lower bound */
  if(read->min_qual < MIN_QUAL) {
    my_warn("%s:%d: read has invalid quality value with ascii code %d",
            __FILE__, __LINE__, read->min_qual);
    read->status = FASTQ_ERR;
  }

  /* upper bound */
  if(read->max_qual > MAX_QUAL) {
    my_warn("%s:%d: read has invalid quality value with ascii code %d",
            __FILE__, __LINE__, read->max_qual);
    read->status = FASTQ_ERR;
  }

  return read->status;
}
Example #9
0
/**
 * Records a genomic position (offset) as a match for the given seed.
 *
 * A seed containing ambiguity codes is expanded to the non-ambiguous
 * sequences it can represent and the offset is recorded for each of
 * them. If the seed cannot be expanded (too many ambiguous
 * nucleotides) a warning is issued and nothing is recorded. The match
 * memory of the table is set up on first use.
 */
void seed_table_add_match(SeedTable *seed_tab, unsigned int offset,
			  unsigned char *nucs) {
  unsigned char **seeds;
  unsigned int idx, id, slot;
  int n_seeds;

  /* lazily set up the memory that holds the matches */
  if(seed_tab->match_buf == NULL) {
    seed_tab_init_match_mem(seed_tab);
  }

  if(!ambi_has_ambi(nucs, seed_tab->seed_len)) {
    /* nothing to expand: the seed itself is the only sequence */
    seeds = &nucs;
    n_seeds = 1;
  } else {
    /* expand ambiguity codes into the table's scratch array */
    n_seeds = ambi_resolve(nucs, seed_tab->seed_len,
                           seed_tab->unambig_nucs, 
                           SEED_TABLE_MAX_UNAMBIG);

    if(n_seeds == 0) {
      my_warn("seed contains too many ambiguous nucleotides");
      return;
    }
    seeds = seed_tab->unambig_nucs;
  }

  for(idx = 0; idx < n_seeds; idx++) {
    id = kmer_nucs_to_id(seeds[idx], seed_tab->seed_len);

    /* slot is the number of matches already stored for this kmer and
     * therefore the index of the next free element
     */
    slot = seed_tab->cur[id];
    if(slot >= seed_tab->n_match[id]) {
      my_err("%s:%d: more matches than expected to kmer", 
             __FILE__, __LINE__);
    }

    /* store the genomic position and move on to the next free element */
    seed_tab->match[id][slot] = offset;
    seed_tab->cur[id]++;
  }
}
Example #10
0
/**
 * Checks that a line read from a fastq file was read completely, i.e.
 * that it ends with a newline character. An empty line is accepted
 * as-is.
 *
 * If the newline is missing, a warning is issued, the read's status is
 * set to FASTQ_ERR and the remainder of the line is consumed from the
 * file so that the next read starts at the beginning of a line. The
 * first few of the skipped characters are echoed to stderr.
 */
static void check_line_len(ReadSeq *read, char *line, gzFile f) {
  size_t len;
  long n;
  /* gzgetc returns the byte as a non-negative int, or -1 at end of
   * file or on error; an int is needed to tell a 0xFF byte from -1
   */
  int c;

  len = strlen(line);
  if(len == 0) {
    return;
  }
  if(line[len-1] == '\n') {
    return;
  }
  
  /* line did not terminate with a '\n' */
  my_warn("%s:%d: line did not terminate with '\\n':  \n'%s'\n", 
          __FILE__, __LINE__, line);

  read->status = FASTQ_ERR;

  /* seek in file until next '\n' is found */
  n = 0;
  while((c = gzgetc(f)) != -1) {
    n++;

    /* only report the first few skipped characters */
    if(n < 10) {
      if(isprint(c)) {
        fprintf(stderr, "  extra character %ld: '%c'\n", n, c);
      }
      else {
        fprintf(stderr, "  unprintable extra character %ld: '\\%d'\n", n, c);
      }
    } else if(n == 10) {
      fprintf(stderr, "  ...\n");      
    }
    
    if(c == '\n') {
      fprintf(stderr, "  read %ld extra characters to reach end of line\n", n);
      return;
    }
  }

  fprintf(stderr, "  read %ld extra characters to reach end of file\n", n);
}
Example #11
0
/**
 * Increments the match counters for a seed without storing where the
 * match is located.
 *
 * A seed containing ambiguity codes is expanded to the non-ambiguous
 * sequences it can represent and each of them is counted. If the seed
 * cannot be expanded (too many ambiguous nucleotides) a warning is
 * issued and nothing is counted.
 */
void seed_table_count_match(SeedTable *seed_tab, unsigned char *nucs) {
  /* default: no ambiguity codes, the seed itself is the only sequence */
  unsigned char **seeds = &nucs;
  int n_seeds = 1;
  unsigned int id;
  int idx;

  if(ambi_has_ambi(nucs, seed_tab->seed_len)) {
    /* expand ambiguity codes into the table's scratch array */
    n_seeds = ambi_resolve(nucs, seed_tab->seed_len,
                           seed_tab->unambig_nucs, 
                           SEED_TABLE_MAX_UNAMBIG);

    if(n_seeds == 0) {
      my_warn("seed contains too many ambiguous nucleotides");
      return;
    }
    seeds = seed_tab->unambig_nucs;
  }

  for(idx = 0; idx < n_seeds; idx++) {
    id = kmer_nucs_to_id(seeds[idx], seed_tab->seed_len);

    /* refuse to wrap the per-kmer counter around */
    if(seed_tab->n_match[id] == UINT_MAX) {
      my_err("%s:%d maximum number of seed matches (%u) "
             "exceeded for kmer %u", __FILE__, __LINE__, 
             UINT_MAX, id);
    }

    /* one more match for this kmer and for the table as a whole */
    seed_tab->n_match[id] += 1;
    seed_tab->total_match += 1;
  }
}
Example #12
0
/**
 * Reads every record from reads_f, maps it to the genome with the
 * provided mapper and writes the result.
 *
 * reads_format selects the parser (READS_FORMAT_FASTQ or
 * READS_FORMAT_QSEQ). Records that fail to parse, or whose length
 * differs from the read length the mapper was set up with, are skipped
 * and counted as errors; at most FASTQ_MAX_WARN warnings are printed
 * for them.
 *
 * With output_type OUTPUT_TYPE_SINGLE everything is written via
 * output_files (unmapped reads to output_files[0]). Otherwise unmapped
 * reads go to unmapped_out_file, multiply mapping reads to
 * multi_out_file and uniquely mapping reads to output_files.
 *
 * A summary of the counts is printed to stderr when the input is
 * exhausted.
 */
void map_reads(gzFile *output_files, gzFile multi_out_file, 
	       gzFile unmapped_out_file, gzFile reads_f, Mapper *mapper,
	       int reads_format, int output_type) {
  FastqRead fastq_read;
  MapRead map_read;
  long warn_count, n_fastq_rec, n_fastq_err;
  long n_map_uniq, n_map_multi, n_map_none;

  map_read.fwd_nucs = my_new(unsigned char, FASTQ_MAX_LINE);
  map_read.rev_nucs = my_new(unsigned char, FASTQ_MAX_LINE);

  n_map_uniq = n_map_multi = n_map_none = 0;
  warn_count = n_fastq_err = n_fastq_rec = 0;

  /* loop over all records in FASTQ file */
  while(TRUE) {
    long r = 0;

    /* read fastq record from file */
    if(reads_format == READS_FORMAT_FASTQ) {
      r = fastq_parse_read(&fastq_read, reads_f);
    }
    else if(reads_format == READS_FORMAT_QSEQ) {
      r = fastq_parse_qseq_read(&fastq_read, reads_f);
    } 
    else {
      my_err("%s:%d: unknown read format", __FILE__, __LINE__);
    }
    
    if(r == FASTQ_END) {
      /* we have reached the end of the file */
      break;
    }

    if(r == FASTQ_ERR) {
      /* this fastq record contains an error */
      if(warn_count < FASTQ_MAX_WARN) {
        warn_count += 1;
        my_warn("%s:%d: skipping invalid fastq record:\n", 
                __FILE__, __LINE__);
        fprintf(stderr, "  %s\n  %s\n  %s\n  %s\n", fastq_read.line1, 
                fastq_read.line2, fastq_read.line3, fastq_read.line4);
      }
      n_fastq_err += 1;
    }
    else if(fastq_read.read_len != mapper->seed_finder_fwd->read_len) {
      /* read length is not the one the mapper expects; apply the same
       * limit on the number of warnings as for invalid records, so
       * that a file of wrong-length reads does not flood stderr
       */
      if(warn_count < FASTQ_MAX_WARN) {
        warn_count += 1;
        my_warn("%s:%d: specified read length is %u, but got %d, "
                "skipping read\n",  __FILE__, __LINE__, 
                mapper->seed_finder_fwd->read_len, fastq_read.read_len);
      }
      n_fastq_err += 1;
    }
    else if(r == FASTQ_OK) {
      n_fastq_rec += 1;

      read_from_fastq_record(&map_read, &fastq_read);

      /* progress indicator */
      if((n_fastq_rec % 1000000) == 0) {
        fprintf(stderr, ".");
      }

      /* try to map this read to genome */
      mapper_map_one_read(mapper, &map_read);

      if(map_read.map_code == MAP_CODE_NONE) {
        /* read does not map to genome */
        n_map_none += 1;
        if(output_type == OUTPUT_TYPE_SINGLE) {
          write_unmapped_read(output_files[0], &map_read);
        } else {
          write_unmapped_read(unmapped_out_file, &map_read);
        }
      }
      else if(map_read.map_code == MAP_CODE_MULTI) {
        /* read maps to multiple genomic locations */
        n_map_multi += 1;

        if(output_type == OUTPUT_TYPE_SINGLE) {
          write_read(output_files, mapper->chr_tab, &map_read, FALSE);
        } else {
          write_read(&multi_out_file, mapper->chr_tab, &map_read, FALSE);
        }
      }
      else if(map_read.map_code == MAP_CODE_UNIQUE) {
        /* read maps to single genomic location */
        n_map_uniq += 1;
        
        if(output_type == OUTPUT_TYPE_SINGLE) {
          write_read(output_files, mapper->chr_tab, &map_read, FALSE);
        } else {
          write_read(output_files, mapper->chr_tab, &map_read, TRUE);
        }
      }
      else {
        my_err("%s:%d: unknown mapping code", __FILE__, __LINE__);
      }
    } else {
      my_err("%s:%d: unknown fastq status", __FILE__, __LINE__);
    }
  }

  /* summary */
  fprintf(stderr, "\ndone\n");
  fprintf(stderr, "fastq errors: %ld\n", n_fastq_err);
  fprintf(stderr, "fastq records (without errors): %ld\n", n_fastq_rec);
  fprintf(stderr, "unmapped reads: %ld\n", n_map_none);
  fprintf(stderr, "uniquely mapping reads: %ld\n", n_map_uniq);
  fprintf(stderr, "multiply mapping reads: %ld\n", n_map_multi);

  my_free(map_read.fwd_nucs);
  my_free(map_read.rev_nucs);
}
Example #13
0
/**
 * Merges the SNPs of n_vcf VCF files and writes the merged lines to
 * stdout.
 *
 * One "current" SNP is kept per file. In every round the file(s) whose
 * current SNP has the lowest (chrom, pos) are determined, one output
 * line is written for them, and those files are advanced to their next
 * SNP. This is repeated until every file is exhausted.
 *
 * Genotype likelihoods and haplotypes are only used if the first SNP
 * of every non-empty file provides them. Progress and diagnostic
 * messages go to stderr.
 */
void merge_vcf(int n_vcf, char **vcf_filenames) {
  FileInfo *f_info;
  int n_done, n_chrom, i, *is_lowest, *lowest, n_lowest;
  int ret, use_geno_probs, use_haplotypes;
  Chromosome *chrom_tab;

  f_info = init_file_info(n_vcf, vcf_filenames);
  
  /* find chromosomes that are present in ALL VCFs */
  chrom_tab = chrom_table_intersect(f_info, n_vcf, &n_chrom);
  n_done = 0;
  is_lowest = my_malloc(sizeof(int) * n_vcf);
  lowest = my_malloc(sizeof(int) * n_vcf);

  /* only use genotypes and haplotypes if they are present in ALL files */
  use_geno_probs = TRUE;
  use_haplotypes = TRUE;
  
  /* read first SNP from all files */
  for(i = 0; i < n_vcf; i++) {
    ret = vcf_read_line(f_info[i].gzf, f_info[i].vcf, &f_info[i].cur_snp);
    if(ret == -1) {
      /* file is over */
      n_done += 1;
      f_info[i].is_done = TRUE;
      my_warn("file %s contains no SNPs\n", vcf_filenames[i]);
      f_info[i].cur_chrom = NULL;
    } else {
      set_cur_chrom(&f_info[i], chrom_tab, n_chrom);

      /* report only the first file that lacks the information */
      if(!f_info[i].cur_snp.has_geno_probs) {
        if(use_geno_probs) {
          fprintf(stderr, "Not using genotype likelihoods (GL) because "
                  "not present in file %s\n", vcf_filenames[i]);
        }
        use_geno_probs = FALSE;
      }
      if(!f_info[i].cur_snp.has_haplotypes) {
        if(use_haplotypes) {
          fprintf(stderr, "Not using genotypes (GT) because "
                  "not present in file %s\n", vcf_filenames[i]);
        }
        use_haplotypes = FALSE;
      }
    }
  }
  
  fprintf(stderr, "parsing files\n");

  while(n_done < n_vcf) {
    /* find SNP(s) with lowest (chrom, pos) */
    find_lowest(f_info, n_vcf, is_lowest, lowest, &n_lowest);

    /* merge counts and write line for these SNPs */
    write_output(stdout, f_info, n_vcf, is_lowest, lowest,
                 use_geno_probs, use_haplotypes);
    
    /* advance files with lowest SNPs */
    for(i = 0; i < n_vcf; i++) {
      if(!f_info[i].is_done && is_lowest[i]) {
        if(vcf_read_line(f_info[i].gzf, f_info[i].vcf, &f_info[i].cur_snp) == -1) {
          /* have reached end of this file */
          n_done += 1;
          f_info[i].is_done = TRUE;
        }
      }
    }
  }
  
  fprintf(stderr, "done!\n");

  free_file_info(f_info, n_vcf);
  for(i = 0; i < n_chrom; i++) {
    my_free(chrom_tab[i].name);
    my_free(chrom_tab[i].assembly);
  }
  my_free(chrom_tab);
  my_free(is_lowest);
  /* release the second scratch array as well */
  my_free(lowest);
}
Example #14
0
/**
 * Splits a gzipped fastq file into several smaller gzipped files.
 *
 * usage: <program> <fastq_file.txt.gz> <output_dir>
 *
 * Valid records are written to files named <prefix>.<n>.txt.gz, with
 * READS_PER_FILE records per file (the last file may hold fewer).
 * Invalid records are reported on stderr and skipped. After all
 * records have been processed a guess at the quality encoding and the
 * number of written / invalid records are printed to stderr.
 *
 * Returns 0 on success; exits with status 2 on wrong usage.
 */
int main(int argc, char **argv) {
  ReadSeq read;
  gzFile gzf, out_gzf;
  long rec_num, total_written, line_num, n_err;
  int file_num = 0;
  char min_qual, max_qual;
  char *input_filename, *output_dir, *prefix;
  char output_filename[MAX_LINE];
  int n_written;
  
  if(argc != 3) {
    fprintf(stderr, "usage: %s <fastq_file.txt.gz> <output_dir>\n", argv[0]);
    exit(2);
  }

  input_filename = argv[1];
  output_dir = argv[2];

  gzf = util_must_gzopen(input_filename, "rb");

  prefix = get_prefix(output_dir, input_filename);

  rec_num = 0;        /* records written to the current output file */
  total_written = 0;  /* records written to all output files */
  line_num = 1;
  n_err = 0;
  min_qual = max_qual = -1;

  out_gzf = NULL;

  while(TRUE) {
    long r = parse_fastq_read(&read, gzf);

    if(r == FASTQ_END) {
      /* we have reached the end of the file */
      break;
    }

    if(r == FASTQ_ERR) {
      my_warn("%s:%d: invalid fastq record starting on line %ld:\n", 
              __FILE__, __LINE__, line_num);
      n_err += 1;
      fprintf(stderr, "  %s\n  %s\n  %s\n  %s\n", read.line1, 
              read.line2, read.line3, read.line4);
    }
    
    if(r == FASTQ_OK) {
      /* record max and min quality values observed */
      if((min_qual == -1) || (min_qual > read.min_qual)) {
        min_qual = read.min_qual;
      }
      if((max_qual == -1) || (max_qual < read.max_qual)) {
        max_qual = read.max_qual;
      }

      /* start a new output file for the first record and whenever the
       * current file already holds READS_PER_FILE records
       */
      if(out_gzf == NULL || rec_num >= READS_PER_FILE) {
        if(out_gzf) {
          /* close old output file */
          fprintf(stderr, "\n");
          gzclose(out_gzf);
        }
        
        file_num += 1;
        
        n_written = snprintf(output_filename, MAX_LINE, "%s.%d.txt.gz", 
                             prefix, file_num);

        /* snprintf reports truncation by returning a value that is
         * greater than or equal to the buffer size, and an error by
         * returning a negative value
         */
        if(n_written < 0 || n_written >= MAX_LINE) {
          my_err("%s:%d: filename too long\n", __FILE__, __LINE__);
        }

        fprintf(stderr, "writing to file '%s'\n",  output_filename);
        out_gzf = util_must_gzopen(output_filename, "wb");

        rec_num = 0;
      }
      
      /* write record to file; gzprintf returns the number of bytes
       * written, which is 0 or negative on failure
       */
      n_written = gzprintf(out_gzf, "%s\n%s\n%s\n%s\n", read.line1,
                           read.line2, read.line3, read.line4);
      if(n_written <= 0) {
        my_err("%s:%d: failed to write to output file", __FILE__, __LINE__);
      }

      rec_num += 1;
      total_written += 1;

      /* progress indicator, only advanced by records actually written */
      if((total_written % 100000) == 0) {
        fprintf(stderr, ".");
      }
    }

    line_num += 4;
  }

  report_qual_type(min_qual, max_qual);

  /* report the total over all output files, not only the last one */
  fprintf(stderr, "\n");
  fprintf(stderr, "fastq records: written=%ld, errors=%ld\n",
          total_written, n_err);

  gzclose(gzf);
  if(out_gzf) {
    gzclose(out_gzf);
  }

  my_free(prefix);

  return 0;
}
Example #15
0
/** 
 * Reads the four lines that make up one fastq record into line1..line4
 * of the provided ReadSeq and strips their trailing whitespace.
 *
 * Returns FASTQ_END (also stored in read->status) when the file ends
 * before the first line of a record. If the first line does not start
 * with '@', or the file ends in the middle of a record, a warning is
 * issued and the status is set to FASTQ_ERR. Lines that could not be
 * read are set to the empty string. Otherwise the read's status is
 * returned as left by the line-length checks.
 */
static int read_fastq_lines(ReadSeq *read, gzFile f) {
  char *lines[4];
  int k, rest;

  lines[0] = read->line1;
  lines[1] = read->line2;
  lines[2] = read->line3;
  lines[3] = read->line4;

  /* header line */
  if(gzgets(f, lines[0], MAX_LINE) == NULL) {
    /* end of file */
    read->status = FASTQ_END;
    for(k = 0; k < 4; k++) {
      lines[k][0] = '\0';
    }
    return FASTQ_END;
  }

  /* the header has to start with '@' */
  if(lines[0][0] != '@') {
    my_warn("%s:%d: fastq header line does not start with '@'",
            __FILE__, __LINE__);
    read->status = FASTQ_ERR;
    for(k = 1; k < 4; k++) {
      lines[k][0] = '\0';
    }
    /* move ahead in file to next line that starts with '@' */
    seek_next_header(f);
    return read->status;
  }

  check_line_len(read, lines[0], f);
  util_str_rstrip(lines[0]);

  /* sequence, separator and quality lines are all read the same way */
  for(k = 1; k < 4; k++) {
    if(gzgets(f, lines[k], MAX_LINE) == NULL) {
      /* end of file */
      my_warn("%s:%d: fastq file ended mid-record\n", __FILE__, __LINE__);
      read->status = FASTQ_ERR;
      /* blank this line and the ones that were not reached */
      for(rest = k; rest < 4; rest++) {
        lines[rest][0] = '\0';
      }
      return FASTQ_ERR;
    }
    check_line_len(read, lines[k], f);
    util_str_rstrip(lines[k]);
  }

  return read->status;
}
Example #16
0
/**
 * Parses the per-sample genotype columns of a VCF line into an array
 * of haplotype values.
 *
 * cur points at the tab-delimited genotype columns of the line. For
 * each column the GT field (located via the format string stored in
 * vcf_info) is parsed as "a|b" or "a/b" and the two alleles are
 * appended to haplotypes, which must have room for
 * vcf_info->n_sample * 2 values. Genotypes that cannot be parsed, or
 * that hold alleles other than 0 and 1, are stored as
 * VCF_GTYPE_MISSING.
 *
 * my_err() is called if the format has no GT field or if the number of
 * genotype values differs from the expected number. A warning is
 * issued once for unphased genotypes and once for genotype strings
 * that cannot be parsed.
 */
void vcf_parse_haplotypes(VCFInfo *vcf_info, char *haplotypes,
			  char *cur) {
  int gt_idx, hap1, hap2, i, n;
  /* each kind of warning is only issued once per program run */
  static int warn_phase = TRUE;
  static int warn_parse = TRUE;
  long expect_haps, n_haps;
  char gt_str[VCF_MAX_FORMAT];
  
  /* columns are delimited by tabs only (not by spaces) */
  char delim[] = "\t";
  char inner_delim[] = ":";
  char *inner_cur, *tok, *inner_tok;

  /* get index of GT token in format string*/
  gt_idx = get_format_index(vcf_info->format, "GT");
  if(gt_idx == -1) {
    my_err("%s:%d: VCF format string does not specify GT token "
           "so cannot obtain haplotypes. Format string: '%s'.\n"
           "To use this file, you must run snp2h5 without "
           "the --haplotype option.",
           __FILE__, __LINE__, vcf_info->format);
  }
  
  expect_haps = vcf_info->n_sample * 2;
  
  n_haps = 0;
  
  while((tok = strsep(&cur, delim)) != NULL) {
    /* Each genotype string is delimited by ':'
     * The GT portions of the string are delimited by '/' or '|'
     * '|' indicates phased, '/' indicates unphased.
     */
    util_strncpy(gt_str, tok, sizeof(gt_str));
    
    /* step through the ':'-delimited fields up to the GT field */
    i = 0;
    inner_cur = gt_str;
    while((i <= gt_idx) && (inner_tok = strsep(&inner_cur, inner_delim)) != NULL) {
      if(i == gt_idx) {
        /* start from a defined value in case parsing stores nothing */
        hap1 = VCF_GTYPE_MISSING;
        hap2 = VCF_GTYPE_MISSING;

        n = sscanf(inner_tok, "%d|%d", &hap1, &hap2);
        if(n != 2) {
          /* try with '/' separator instead */
          n = sscanf(inner_tok, "%d/%d", &hap1, &hap2);

          if(n == 2) {
            if(warn_phase) {
              my_warn("%s:%d: some genotypes are unphased (delimited "
                      "with '/' instead of '|')\n", __FILE__, __LINE__);
              warn_phase = FALSE;
            }
          } else {
            if(warn_parse) {
              my_warn("%s:%d: could not parse some genotype "
                      "strings that look like: '%s'\n", __FILE__, __LINE__,
                      inner_tok);
              warn_parse = FALSE;
            }
            /* a partial parse may have stored one allele: discard it */
            hap1 = VCF_GTYPE_MISSING;
            hap2 = VCF_GTYPE_MISSING;
          }
        }

        if((hap1 != VCF_GTYPE_MISSING && hap1 != 0 && hap1 != 1)  ||
           (hap2 != VCF_GTYPE_MISSING && hap2 != 0 && hap2 != 1)) {

          /* Copy number polymorphisms and multi-allelic SNPs
           * can have values other than 0 and 1 (e.g. 3, 4, ...).
           * Combined haplotype test does not currently deal with 
           * these. Set the genotypes to MISSING (-1)
           */
          hap1 = VCF_GTYPE_MISSING;
          hap2 = VCF_GTYPE_MISSING;
        }

        /* never write past the end of the haplotypes array */
        if((n_haps + 2) > expect_haps) {
          my_err("%s:%d: more genotypes per line than expected",
                 __FILE__, __LINE__);
        }
        haplotypes[n_haps] = hap1;
        haplotypes[n_haps+1] = hap2;

        n_haps += 2;
      }
      
      i++;
    }
  }

  if(n_haps != expect_haps) {
    my_err("%s:%d: expected %ld genotype values per line, but got "
           "%ld", __FILE__, __LINE__, expect_haps, n_haps);
  }
}