/**
 * Prints a guess of the quality score encoding to stderr, based on the
 * smallest and largest quality characters that were observed.
 *
 * min_qual, max_qual: extreme quality characters seen in the input. A
 * value of -1 for either one means that no quality scores were observed.
 */
static void report_qual_type(char min_qual, char max_qual) {
  fprintf(stderr, "\n");
  fprintf(stderr, "guessing quality format:\n");

  if((min_qual == -1) || (max_qual == -1)) {
    /* Nothing was observed. Stop here: otherwise the -1 sentinel would
     * be printed as a character and classified as a real quality value.
     */
    fprintf(stderr, " no valid quality scores to guess quality type from\n");
    return;
  }

  fprintf(stderr, " min_qual:%c, max_qual:%c\n", min_qual, max_qual);

  if(min_qual < MIN_QUAL_SOLEXA) {
    fprintf(stderr, " quality vals appear to be Sanger / Illum 1.8+ format"
            " (Phred+33)\n");

    /* a low minimum combined with a maximum at or above 'h' triggers
     * the mixed-encoding warning
     */
    if(max_qual >= 'h') {
      my_warn("%s:%d: quality vals may be mix of Phred+33 and Phred+64\n"
              " You should probably fix this.", __FILE__, __LINE__);
    }
    return;
  }

  if(min_qual < MIN_QUAL_ILLUM_1_3) {
    my_warn("%s:%d: quality vals appear to be OLD solexa format, "
            "may need to convert prior to processing.\n",
            __FILE__, __LINE__);
  } else if(min_qual < MIN_QUAL_ILLUM_1_5) {
    fprintf(stderr, " quality vals appear to be Illumina 1.3+ format "
            "(Phred+64)\n should probably use -I flag for bwa aln "
            "(relevant only if using -q argument)\n");
  } else {
    fprintf(stderr, " quality vals appear to be Illumina 1.5+ format "
            "(Phred+64)\n should probably use -I flag for bwa aln "
            "(relevant only if using -q argument)\n");
  }
}
/**
 * Fills in the provided SeedMatch with the kmer id(s) that correspond
 * to the provided nucleotide array and with the total number of
 * matches recorded for those kmers in the seed table.
 *
 * A seed without ambiguity codes yields exactly one kmer id. A seed
 * with ambiguity codes is expanded into every non-ambiguous seed it
 * can represent, so it may yield several kmer ids. The ids can be used
 * to index matches in the seed match table.
 */
void seed_table_lookup(SeedTable *seed_tab, unsigned char *nucs,
                       SeedMatch *seed_match) {
  unsigned int id, k;
  unsigned char **resolved;

  if(!ambi_has_ambi(nucs, seed_tab->seed_len)) {
    /* plain seed: a single kmer, looked up directly */
    id = kmer_nucs_to_id(nucs, seed_tab->seed_len);
    seed_match->kmer_ids[0] = id;
    seed_match->n_kmer = 1;
    seed_match->n_match = seed_tab->n_match[id];
    return;
  }

  /* expand the ambiguous seed into the table's scratch buffer */
  seed_match->n_kmer = ambi_resolve(nucs, seed_tab->seed_len,
                                    seed_tab->unambig_nucs,
                                    SEED_TABLE_MAX_UNAMBIG);
  if(seed_match->n_kmer == 0) {
    my_warn("seed contains too many ambiguous nucleotides");
  }
  resolved = seed_tab->unambig_nucs;

  /* record the id of every expanded seed and sum their match counts */
  seed_match->n_match = 0;
  k = 0;
  while(k < seed_match->n_kmer) {
    id = kmer_nucs_to_id(resolved[k], seed_tab->seed_len);
    seed_match->n_match += seed_tab->n_match[id];
    seed_match->kmer_ids[k] = id;
    k++;
  }
}
/*
** Returns the next logical line available on fd, or NULL when nothing
** more could be read.
**
** Data is buffered in a static 4097-byte buffer that is refilled with
** read() whenever the previous contents have been fully consumed. A
** line ends at '\n', at ';' or at the end of the buffered data; the
** scan is advanced by jump_inhibitors() and the result is produced by
** read_one_line() from the start of the line.
**
** fd:  descriptor to read from.
** ret: used only as a local scratch variable; the value passed in is
**      overwritten before it is read.
**
** NOTE(review): a line that straddles two read() chunks is returned in
** two pieces; confirm whether callers rely on that.
*/
char *my_read(int fd, int ret)
{
  static int i = 0;
  static char buff[4097];

  if (i == 0 || buff[i] == '\0')
    {
      i = 0;
      ret = read(fd, buff, sizeof(buff) - 1);
      if (ret > 0)
        {
          /*
          ** Terminate the fresh data BEFORE scanning it: without this
          ** my_str_isprintable() would also look at stale bytes left
          ** behind by a previous, longer read.
          */
          buff[ret] = '\0';
          if (!my_str_isprintable(buff))
            ret = my_fread(buff, fd);
        }
      if (ret <= 0)
        {
          /* negative: read error, reported; zero: end of input */
          if (ret)
            my_warn(NULL, fd);
          return (NULL);
        }
      /* presumably my_fread() returns the usable length of buff - confirm */
      buff[ret] = '\0';
    }
  /* remember where this line starts, then look for its end */
  ret = i;
  while (buff[i] && buff[i] != '\n' && buff[i] != ';')
    jump_inhibitors(buff, &i);
  /* step over the separator so the next call starts on the next line */
  if (buff[i])
    i = i + 1;
  return (read_one_line(buff + ret));
}
/**
 * Checks that every base of the read (second fastq line) is a known
 * nucleotide identifier. On the first unknown base a warning is
 * issued, the read status is set to FASTQ_ERR and scanning stops.
 *
 * Returns the (possibly updated) status of the read.
 */
static int check_seq(ReadSeq *read) {
  /* accepted nucleotide identifiers, including ambiguity codes */
  static const char *valid_nucs = "ATCGNatcgnMRWSYKmrwsyk";
  int pos;

  for(pos = 0; pos < read->read_len; pos++) {
    char base = read->line2[pos];

    if(strchr(valid_nucs, base) == NULL) {
      my_warn("%s:%d: read contains invalid base '%c'", __FILE__, __LINE__,
              base);
      read->status = FASTQ_ERR;
      break;
    }
  }

  return read->status;
}
/**
 * Checks that the header has the expected 7 fields and stores them in
 * the read. This could be changed to allow for a variety of header
 * types.
 *
 * If fewer than 7 fields can be parsed a warning is issued and the
 * read status is set to FASTQ_ERR. Returns the status of the read.
 */
static int check_header(ReadSeq *read) {
  char *offset;
  int assigned; /* sscanf returns int; may be EOF (negative) */

  /* parse read attributes from header line, assuming it has standard
   * formatting. Example header line:
   *   IPAR1:1:2:18330:12837#0/1
   */
  offset = strchr(read->line1, ':');
  if(offset == NULL) {
    assigned = 0;
  } else {
    /* temporarily replace first ':' with ' ', so that string directive
     * of sscanf stops after parsing machine name
     */
    offset[0] = ' ';

    /* NOTE(review): "%s" is unbounded, so read->machine must be able to
     * hold the longest possible header token - confirm its size.
     */
    assigned = sscanf(read->line1, "@%s %d:%d:%d:%d#%d/%d",
                      read->machine, &read->lane, &read->tile,
                      &read->x, &read->y, &read->run_num, &read->type);

    /* restore the header line */
    offset[0] = ':';

    if(assigned < 0) {
      /* EOF: input ended before the first conversion */
      assigned = 0;
    }
  }

  if(assigned != 7) {
    /* failed to completely parse header */
    my_warn("%s:%d: could only parse %d out of 7 expected fields from header",
            __FILE__, __LINE__, assigned);
    read->status = FASTQ_ERR;
  }

  return read->status;
}
/**
 * Parses a read in fastq format.
 *
 * Reads the four lines of the record, then checks the '+' separator
 * line, the read and quality lengths, the bases and the quality
 * values. Header fields are not validated here (check_header() is not
 * called).
 *
 * Returns FASTQ_END at end of file, FASTQ_OK on success, FASTQ_ERR on
 * problem. The same value is stored in read->status.
 */
static int parse_fastq_read(ReadSeq *read, gzFile f) {
  size_t qual_len;

  read->status = FASTQ_OK;

  read_fastq_lines(read, f);
  if(read->status != FASTQ_OK) {
    return read->status;
  }

  /* third line should start with '+' separator */
  if(read->line3[0] != '+') {
    my_warn("%s:%d: third line does not start with '+'",
            __FILE__, __LINE__);
    read->status = FASTQ_ERR;
    return read->status;
  }

  /* check length of read and quality */
  read->read_len = strlen(read->line2);
  qual_len = strlen(read->line4);

  if(read->read_len < 1) {
    /* An empty read must be reported as an error: check_qual() is not
     * run for it, so its min/max quality fields would otherwise be
     * consumed by the caller without having been set for this record.
     */
    my_warn("%s:%d: read has no bases\n", __FILE__, __LINE__);
    read->status = FASTQ_ERR;
    return read->status;
  }

  /* fourth line should hold one quality score per base */
  if(read->read_len != qual_len) {
    /* cast explicitly so the arguments match the %ld conversions */
    my_warn("%s:%d: read len (%ld) does not match quality score len (%ld)",
            __FILE__, __LINE__, (long)read->read_len, (long)qual_len);
    read->status = FASTQ_ERR;
    return read->status;
  }

  check_seq(read);
  check_qual(read);

  return read->status;
}
/**
 * Reads the header of a VCF file.
 *
 * Lines starting with "##" are counted as header lines. The line
 * starting with "#CHROM" ends the header: its leading tab-separated
 * columns are compared against vcf_fix_headers (a mismatch only
 * produces a warning) and every remaining column is stored as a sample
 * name. Any other line is reported through my_err().
 *
 * On return vcf_info->n_header_line, vcf_info->n_sample and
 * vcf_info->sample_names are set; the sample names are newly allocated
 * copies.
 */
void vcf_read_header(gzFile vcf_fh, VCFInfo *vcf_info) {
  char *line, *cur, *token;
  int tok_num;
  int n_fix_header, i;
  const char delim[] = "\t";

  n_fix_header = sizeof(vcf_fix_headers) / sizeof(const char *);

  vcf_info->n_header_line = 0;

  while(util_gzgetline(vcf_fh, &vcf_info->buf, &vcf_info->buf_size) != -1) {
    line = vcf_info->buf;

    if(util_str_starts_with(line, "##")) {
      /* header line */
      vcf_info->n_header_line += 1;
    }
    else if(util_str_starts_with(line, "#CHROM")) {
      /* this should be last header line that contains list of fixed fields */
      vcf_info->n_header_line += 1;

      /* strsep() overwrites delimiters, so keep an intact copy of the
       * line for the second pass that extracts the sample names
       */
      cur = vcf_info->buf;
      line = util_str_dup(vcf_info->buf);

      /* first pass: verify fixed columns and count all columns */
      tok_num = 0;
      while((token = strsep(&cur, delim)) != NULL) {
        if(tok_num < n_fix_header) {
          if(strcmp(token, vcf_fix_headers[tok_num]) != 0) {
            my_warn("expected token %d to be %s but got '%s'",
                    tok_num, vcf_fix_headers[tok_num], token);
          }
        }
        tok_num += 1;
      }

      if(tok_num < n_fix_header) {
        /* a truncated line would otherwise give a negative sample
         * count and a bogus allocation size below
         */
        my_err("%s:%d: #CHROM header line has only %d fields, "
               "expected at least %d", __FILE__, __LINE__,
               tok_num, n_fix_header);
        /* only reached if my_err() returns */
        vcf_info->n_sample = 0;
      } else {
        vcf_info->n_sample = tok_num - n_fix_header;
      }

      /*
       * read sample names from remaining part of header
       */
      vcf_info->sample_names = my_malloc(sizeof(char *) * vcf_info->n_sample);
      cur = line;
      tok_num = 0;
      i = 0;
      while((token = strsep(&cur, delim)) != NULL) {
        if(tok_num >= n_fix_header) {
          vcf_info->sample_names[i] = util_str_dup(token);
          i += 1;
        }
        tok_num += 1;
      }

      my_free(line);
      break;
    } else {
      my_err("expected last line in header to start with #CHROM");
    }
  }
}
/**
 * Records the lowest and highest quality characters of the read in
 * read->min_qual / read->max_qual (both stay -1 when the read is empty)
 * and verifies that they lie within [MIN_QUAL, MAX_QUAL].
 *
 * A value outside that range produces a warning and sets the read
 * status to FASTQ_ERR. Returns the status of the read.
 */
static int check_qual(ReadSeq *read) {
  int pos;
  char q;

  /* -1 marks "nothing seen yet" */
  read->min_qual = -1;
  read->max_qual = -1;

  for(pos = 0; pos < read->read_len; pos++) {
    q = read->line4[pos];

    if(read->min_qual == -1) {
      /* first value seen: it is both the minimum and the maximum */
      read->min_qual = q;
      read->max_qual = q;
      continue;
    }

    if(q < read->min_qual) {
      read->min_qual = q;
    }
    if(q > read->max_qual) {
      read->max_qual = q;
    }
  }

  /* lower bound */
  if(read->min_qual < MIN_QUAL) {
    my_warn("%s:%d: read has invalid quality value with ascii code %d",
            __FILE__, __LINE__, read->min_qual);
    read->status = FASTQ_ERR;
  }

  /* upper bound */
  if(read->max_qual > MAX_QUAL) {
    my_warn("%s:%d: read has invalid quality value with ascii code %d",
            __FILE__, __LINE__, read->max_qual);
    read->status = FASTQ_ERR;
  }

  return read->status;
}
/**
 * Stores the location (offset) of a seed match in the provided seed
 * table, once for every non-ambiguous kmer the seed can represent.
 *
 * The match memory of the table is initialised on first use. A seed
 * that cannot be expanded (too many ambiguity codes) is skipped with a
 * warning. my_err() is called when a kmer receives more matches than
 * its recorded n_match count.
 */
void seed_table_add_match(SeedTable *seed_tab, unsigned int offset,
                          unsigned char *nucs) {
  unsigned char **seeds;
  unsigned int id, slot, s;
  int n_seeds;

  if(seed_tab->match_buf == NULL) {
    seed_tab_init_match_mem(seed_tab);
  }

  if(!ambi_has_ambi(nucs, seed_tab->seed_len)) {
    /* unambiguous seed: use it as is */
    seeds = &nucs;
    n_seeds = 1;
  } else {
    /* expand ambiguity codes into all possible non-ambiguous seqs */
    n_seeds = ambi_resolve(nucs, seed_tab->seed_len,
                           seed_tab->unambig_nucs,
                           SEED_TABLE_MAX_UNAMBIG);
    if(n_seeds == 0) {
      my_warn("seed contains too many ambiguous nucleotides");
      return;
    }
    seeds = seed_tab->unambig_nucs;
  }

  for(s = 0; s < n_seeds; s++) {
    id = kmer_nucs_to_id(seeds[s], seed_tab->seed_len);

    /* number of matches already stored for this kmer */
    slot = seed_tab->cur[id];
    if(slot >= seed_tab->n_match[id]) {
      my_err("%s:%d: more matches than expected to kmer",
             __FILE__, __LINE__);
    }

    /* store the genomic position, then advance to the next free slot */
    seed_tab->match[id][slot] = offset;
    seed_tab->cur[id] += 1;
  }
}
/**
 * Checks that a line that was just read ends with '\n', i.e. that it
 * fit completely into the line buffer.
 *
 * An empty line or a line ending in '\n' is accepted silently.
 * Otherwise a warning is issued, the read status is set to FASTQ_ERR
 * and the rest of the over-long line is consumed from the file (the
 * first few extra characters are echoed to stderr), so that the next
 * read starts on a fresh line.
 */
static void check_line_len(ReadSeq *read, char *line, gzFile f) {
  size_t len;
  long n; /* number of extra characters consumed; printed with %ld */
  int c;  /* must be int: gzgetc() returns a byte value or -1 */

  len = strlen(line);
  if(len == 0 || line[len-1] == '\n') {
    return;
  }

  /* line did not terminate with a '\n' */
  my_warn("%s:%d: line did not terminate with '\\n': \n'%s'\n",
          __FILE__, __LINE__, line);
  read->status = FASTQ_ERR;

  /* seek in file until next '\n' is found */
  n = 0;
  while((c = gzgetc(f)) != -1) {
    n++;

    if(n < 10) {
      if(isprint(c)) {
        fprintf(stderr, " extra character %ld: '%c'\n", n, c);
      } else {
        fprintf(stderr, " unprintable extra character %ld: '\\%d'\n", n, c);
      }
    } else if(n == 10) {
      /* stop echoing individual characters after the first few */
      fprintf(stderr, " ...\n");
    }

    if(c == '\n') {
      fprintf(stderr, " read %ld extra characters to reach end of line\n", n);
      return;
    }
  }

  fprintf(stderr, " read %ld extra characters to reach end of file\n", n);
}
/**
 * Counts a seed match without storing its location: the match count
 * of every non-ambiguous kmer the seed can represent is incremented,
 * as is the total match count of the table.
 *
 * A seed that cannot be expanded (too many ambiguity codes) is skipped
 * with a warning. my_err() is called when a kmer's count has already
 * reached UINT_MAX.
 */
void seed_table_count_match(SeedTable *seed_tab, unsigned char *nucs) {
  /* default: no ambiguity codes, the seed itself is the only kmer */
  unsigned char **seeds = &nucs;
  int n_seeds = 1;
  unsigned int id;
  int s;

  if(ambi_has_ambi(nucs, seed_tab->seed_len)) {
    /* expand ambiguity codes into all possible non-ambiguous seqs */
    n_seeds = ambi_resolve(nucs, seed_tab->seed_len,
                           seed_tab->unambig_nucs,
                           SEED_TABLE_MAX_UNAMBIG);
    if(n_seeds == 0) {
      my_warn("seed contains too many ambiguous nucleotides");
      return;
    }
    seeds = seed_tab->unambig_nucs;
  }

  for(s = 0; s < n_seeds; s++) {
    id = kmer_nucs_to_id(seeds[s], seed_tab->seed_len);

    /* refuse to wrap the per-kmer counter around */
    if(seed_tab->n_match[id] == UINT_MAX) {
      my_err("%s:%d maximum number of seed matches (%u) "
             "exceeded for kmer %u", __FILE__, __LINE__,
             UINT_MAX, id);
    }

    seed_tab->n_match[id] += 1;
    seed_tab->total_match += 1;
  }
}
void map_reads(gzFile *output_files, gzFile multi_out_file, gzFile unmapped_out_file, gzFile reads_f, Mapper *mapper, int reads_format, int output_type) { FastqRead fastq_read; MapRead map_read; long warn_count, n_fastq_rec, n_fastq_err; long n_map_uniq, n_map_multi, n_map_none; map_read.fwd_nucs = my_new(unsigned char, FASTQ_MAX_LINE); map_read.rev_nucs = my_new(unsigned char, FASTQ_MAX_LINE); n_map_uniq = n_map_multi = n_map_none = 0; warn_count = n_fastq_err = n_fastq_rec = 0; /* loop over all records in FASTQ file */ while(TRUE) { long r = 0; /* read fastq record from file */ if(reads_format == READS_FORMAT_FASTQ) { r = fastq_parse_read(&fastq_read, reads_f); } else if(reads_format == READS_FORMAT_QSEQ) { r = fastq_parse_qseq_read(&fastq_read, reads_f); } else { my_err("%s:%d: unknown read format", __FILE__, __LINE__); } if(r == FASTQ_END) { /* we have reached the end of the file */ break; } if(r == FASTQ_ERR) { /* this fastq record contains an error */ if(warn_count < FASTQ_MAX_WARN) { warn_count += 1; my_warn("%s:%d: skipping invalid fastq record:\n", __FILE__, __LINE__); fprintf(stderr, " %s\n %s\n %s\n %s\n", fastq_read.line1, fastq_read.line2, fastq_read.line3, fastq_read.line4); } n_fastq_err += 1; } else if(fastq_read.read_len != mapper->seed_finder_fwd->read_len) { /* check that read length is correct */ warn_count += 1; my_warn("%s:%d: specified read length is %u, but got %d, " "skipping read\n", __FILE__, __LINE__, mapper->seed_finder_fwd->read_len, fastq_read.read_len); n_fastq_err += 1; } else if(r == FASTQ_OK) { n_fastq_rec += 1; read_from_fastq_record(&map_read, &fastq_read); if((n_fastq_rec % 1000000) == 0) { fprintf(stderr, "."); } /* try to map this read to genome */ mapper_map_one_read(mapper, &map_read); if(map_read.map_code == MAP_CODE_NONE) { /* read does not map to genome */ n_map_none += 1; if(output_type == OUTPUT_TYPE_SINGLE) { write_unmapped_read(output_files[0], &map_read); } else { write_unmapped_read(unmapped_out_file, &map_read); } 
} else if(map_read.map_code == MAP_CODE_MULTI) { /* read maps to multiple genomic locations */ n_map_multi += 1; if(output_type == OUTPUT_TYPE_SINGLE) { write_read(output_files, mapper->chr_tab, &map_read, FALSE); } else { write_read(&multi_out_file, mapper->chr_tab, &map_read, FALSE); } } else if(map_read.map_code == MAP_CODE_UNIQUE) { /* read maps to single genomic location */ n_map_uniq += 1; if(output_type == OUTPUT_TYPE_SINGLE) { write_read(output_files, mapper->chr_tab, &map_read, FALSE); } else { write_read(output_files, mapper->chr_tab, &map_read, TRUE); } } else { my_err("%s:%d: unknown mapping code", __FILE__, __LINE__); } } else { my_err("%s:%d: unknown fastq status", __FILE__, __LINE__); } } fprintf(stderr, "\ndone\n"); fprintf(stderr, "fastq errors: %ld\n", n_fastq_err); fprintf(stderr, "fastq records (without errors): %ld\n", n_fastq_rec); fprintf(stderr, "unmapped reads: %ld\n", n_map_none); fprintf(stderr, "uniquely mapping reads: %ld\n", n_map_uniq); fprintf(stderr, "multiply mapping reads: %ld\n", n_map_multi); my_free(map_read.fwd_nucs); my_free(map_read.rev_nucs); }
void merge_vcf(int n_vcf, char **vcf_filenames) { FileInfo *f_info; int n_done, n_chrom, i, *is_lowest, *lowest, n_lowest; int ret, use_geno_probs, use_haplotypes; Chromosome *chrom_tab; f_info = init_file_info(n_vcf, vcf_filenames); /* find chromosomes that are present in ALL VCFs */ chrom_tab = chrom_table_intersect(f_info, n_vcf, &n_chrom); n_done = 0; is_lowest = my_malloc(sizeof(int) * n_vcf); lowest = my_malloc(sizeof(int) * n_vcf); /* only use genotypes and haplotypes if they are present in ALL files */ use_geno_probs = TRUE; use_haplotypes = TRUE; /* read first SNP from all files */ for(i = 0; i < n_vcf; i++) { ret = vcf_read_line(f_info[i].gzf, f_info[i].vcf, &f_info[i].cur_snp); if(ret == -1) { /* file is over */ n_done += 1; f_info[i].is_done = TRUE; my_warn("file %s contains no SNPs\n", vcf_filenames[i]); f_info[i].cur_chrom = NULL; } else { set_cur_chrom(&f_info[i], chrom_tab, n_chrom); if(!f_info[i].cur_snp.has_geno_probs) { if(use_geno_probs) { fprintf(stderr, "Not using genotype likelihoods (GL) because " "not present in file %s\n", vcf_filenames[i]); } use_geno_probs = FALSE; } if(!f_info[i].cur_snp.has_haplotypes) { if(use_haplotypes) { fprintf(stderr, "Not using genotypes (GT) because " "not present in file %s\n", vcf_filenames[i]); } use_haplotypes = FALSE; } } } fprintf(stderr, "parsing files\n"); while(n_done < n_vcf) { /* find SNP(s) with lowest (chrom, pos) */ find_lowest(f_info, n_vcf, is_lowest, lowest, &n_lowest); /* merge counts and write line for these SNPs */ write_output(stdout, f_info, n_vcf, is_lowest, lowest, use_geno_probs, use_haplotypes); /* advance files with lowest SNPs */ for(i = 0; i < n_vcf; i++) { if(!f_info[i].is_done && is_lowest[i]) { if(vcf_read_line(f_info[i].gzf, f_info[i].vcf, &f_info[i].cur_snp) == -1) { /* have reached end of this file */ n_done += 1; f_info[i].is_done = TRUE; } } } } fprintf(stderr, "done!\n"); free_file_info(f_info, n_vcf); for(i = 0; i < n_chrom; i++) { my_free(chrom_tab[i].name); 
my_free(chrom_tab[i].assembly); } my_free(chrom_tab); my_free(is_lowest); }
int main(int argc, char **argv) { ReadSeq read; gzFile gzf, out_gzf; long rec_num, line_num, n_err; int file_num = 0; char min_qual, max_qual; char *input_filename, *output_dir, *prefix; char output_filename[MAX_LINE]; int n_written; if(argc != 3) { fprintf(stderr, "usage: %s <fastq_file.txt.gz> <output_dir>\n", argv[0]); exit(2); } input_filename = argv[1]; output_dir = argv[2]; gzf = util_must_gzopen(input_filename, "rb"); prefix = get_prefix(output_dir, input_filename); rec_num = 0; line_num = 1; n_err = 0; min_qual = max_qual = -1; out_gzf = NULL; rec_num = 0; while(TRUE) { long r = 0; r = parse_fastq_read(&read, gzf); if(r == FASTQ_END) { /* we have reached the end of the file */ break; } if(r == FASTQ_ERR) { my_warn("%s:%d: invalid fastq record starting on line %ld:\n", __FILE__, __LINE__, line_num); n_err += 1; fprintf(stderr, " %s\n %s\n %s\n %s\n", read.line1, read.line2, read.line3, read.line4); } if(r == FASTQ_OK) { /* record max and min quality values observed */ if((min_qual == -1) || (min_qual > read.min_qual)) { min_qual = read.min_qual; } if((max_qual == -1) || (max_qual < read.max_qual)) { max_qual = read.max_qual; } if(out_gzf == NULL || rec_num > READS_PER_FILE) { if(out_gzf) { /* close old output file */ fprintf(stderr, "\n"); gzclose(out_gzf); } file_num += 1; n_written = snprintf(output_filename, MAX_LINE, "%s.%d.txt.gz", prefix, file_num); if(n_written > MAX_LINE) { my_err("%s:%d: filename too long\n", __FILE__, __LINE__); } fprintf(stderr, "writing to file '%s'\n", output_filename); out_gzf = util_must_gzopen(output_filename, "wb"); rec_num = 0; } /* write record to file */ n_written = gzprintf(out_gzf, "%s\n%s\n%s\n%s\n", read.line1, read.line2, read.line3, read.line4); if(n_written == 0) { my_err("%s:%d: failed to write to output file", __FILE__, __LINE__); } rec_num += 1; } line_num += 4; if((rec_num % 100000) == 0) { fprintf(stderr, "."); } } report_qual_type(min_qual, max_qual); fprintf(stderr, "\n"); fprintf(stderr, "fastq records: 
written=%ld, errors=%ld\n", rec_num, n_err); gzclose(gzf); if(out_gzf) { gzclose(out_gzf); } my_free(prefix); return 0; }
/**
 * Reads the four lines that make up one fastq record into
 * read->line1 .. read->line4, stripping trailing whitespace from each.
 *
 * Returns FASTQ_END when the file ends before the first line, and
 * FASTQ_ERR when the first line does not start with '@' (the file is
 * then advanced to the next header) or when the file ends in the
 * middle of a record. Lines that were not read are set to the empty
 * string. Otherwise the status of the read is returned, which
 * check_line_len() may have set to FASTQ_ERR.
 */
static int read_fastq_lines(ReadSeq *read, gzFile f) {
  char *rest[3]; /* the three lines that follow the header */
  int k, j;

  rest[0] = read->line2;
  rest[1] = read->line3;
  rest[2] = read->line4;

  /* header line */
  if(gzgets(f, read->line1, MAX_LINE) == NULL) {
    /* end of file */
    read->status = FASTQ_END;
    read->line1[0] = '\0';
    for(j = 0; j < 3; j++) {
      rest[j][0] = '\0';
    }
    return FASTQ_END;
  }

  if(read->line1[0] != '@') {
    /* not a header: flag the record and resynchronise on next header */
    my_warn("%s:%d: fastq header line does not start with '@'",
            __FILE__, __LINE__);
    read->status = FASTQ_ERR;
    for(j = 0; j < 3; j++) {
      rest[j][0] = '\0';
    }
    seek_next_header(f);
    return read->status;
  }

  check_line_len(read, read->line1, f);
  util_str_rstrip(read->line1);

  /* sequence, separator and quality lines */
  for(k = 0; k < 3; k++) {
    if(gzgets(f, rest[k], MAX_LINE) == NULL) {
      /* end of file */
      my_warn("%s:%d: fastq file ended mid-record\n", __FILE__, __LINE__);
      read->status = FASTQ_ERR;
      /* blank this line and all lines after it */
      for(j = k; j < 3; j++) {
        rest[j][0] = '\0';
      }
      return FASTQ_ERR;
    }
    check_line_len(read, rest[k], f);
    util_str_rstrip(rest[k]);
  }

  return read->status;
}
/**
 * Parses the genotype (GT) fields of one VCF line into the provided
 * haplotypes array, two values per sample.
 *
 * vcf_info:   provides the FORMAT string and the expected sample count
 * haplotypes: output array with room for vcf_info->n_sample * 2 values
 * cur:        tab-separated per-sample genotype columns; the string is
 *             modified by strsep()
 *
 * Values other than 0 and 1, and genotypes that cannot be parsed, are
 * stored as VCF_GTYPE_MISSING. my_err() is called when the FORMAT
 * string has no GT token or when the number of parsed values differs
 * from the expected number. The warnings about unphased and about
 * unparsable genotypes are each printed only once per program run.
 */
void vcf_parse_haplotypes(VCFInfo *vcf_info, char *haplotypes,
                          char *cur) {
  int gt_idx, hap1, hap2, i, n;
  static int warn_phase = TRUE;
  static int warn_parse = TRUE;
  long expect_haps, n_haps;
  char gt_str[VCF_MAX_FORMAT];

  char delim[] = "\t";
  char inner_delim[] = ":";
  char *inner_cur, *tok, *inner_tok;

  /* get index of GT token in format string */
  gt_idx = get_format_index(vcf_info->format, "GT");

  if(gt_idx == -1) {
    my_err("%s:%d: VCF format string does not specify GT token "
           "so cannot obtain haplotypes. Format string: '%s'.\n"
           "To use this file, you must run snp2h5 without "
           "the --haplotype option.", __FILE__, __LINE__, vcf_info->format);
  }

  expect_haps = vcf_info->n_sample * 2;
  n_haps = 0;

  while((tok = strsep(&cur, delim)) != NULL) {
    /* Each genotype string is delimited by ':'
     * The GT portions of the string are delimited by '/' or '|'
     * '|' indicates phased, '/' indicates unphased.
     */
    util_strncpy(gt_str, tok, sizeof(gt_str));

    /* walk the ':'-separated fields up to and including the GT field */
    i = 0;
    inner_cur = gt_str;
    while((i <= gt_idx) &&
          (inner_tok = strsep(&inner_cur, inner_delim)) != NULL) {
      if(i == gt_idx) {
        n = sscanf(inner_tok, "%d|%d", &hap1, &hap2);
        if(n != 2) {
          /* try with '/' separator instead */
          n = sscanf(inner_tok, "%d/%d", &hap1, &hap2);
          if(n == 2) {
            if(warn_phase) {
              /* the message takes no genotype argument */
              my_warn("%s:%d: some genotypes are unphased (delimited "
                      "with '/' instead of '|')\n", __FILE__, __LINE__);
              warn_phase = FALSE;
            }
          } else {
            if(warn_parse) {
              my_warn("%s:%d: could not parse some genotype "
                      "strings that look like: '%s'\n", __FILE__, __LINE__,
                      inner_tok);
              warn_parse = FALSE;
            }
            hap1 = VCF_GTYPE_MISSING;
            hap2 = VCF_GTYPE_MISSING;
          }
        }

        if((hap1 != VCF_GTYPE_MISSING && hap1 != 0 && hap1 != 1) ||
           (hap2 != VCF_GTYPE_MISSING && hap2 != 0 && hap2 != 1)) {
          /* Copy number polymorphisms and multi-allelic SNPs
           * can have values other than 0 and 1 (e.g. 3, 4, ...).
           * Combined haplotype test does not currently deal with
           * these. Set the genotypes to MISSING (-1)
           */
          hap1 = VCF_GTYPE_MISSING;
          hap2 = VCF_GTYPE_MISSING;
        }

        /* never write past the space reserved for the expected samples */
        if((n_haps + 2) > expect_haps) {
          my_err("%s:%d: more genotypes per line than expected",
                 __FILE__, __LINE__);
        }
        haplotypes[n_haps] = hap1;
        haplotypes[n_haps+1] = hap2;

        n_haps += 2;
      }

      i++;
    }
  }

  if(n_haps != expect_haps) {
    my_err("%s:%d: expected %ld genotype values per line, but got "
           "%ld", __FILE__, __LINE__, expect_haps, n_haps);
  }
}