// Read VCF file void vcf_file::scan_file(const string &chr, const string &exclude_chr, bool force_write_index) { bool filter_by_chr = (chr != ""); bool exclude_by_chr = (exclude_chr != ""); string index_filename = filename + ".vcfidx"; bool could_read_index_file = false; if (force_write_index == false) could_read_index_file = read_index_file(index_filename); string CHROM, last_CHROM=""; int POS, last_POS = -1; if (could_read_index_file == false) { printLOG("Building new index file.\n"); string line, CHROM, last_CHROM = ""; streampos filepos; char c; N_entries=0; N_indv = 0; while (!feof()) { filepos = get_filepos(); c = peek(); if ((c == '\n') || (c == '\r')) { read_line(line); continue; } else if (c == EOF) break; if (c == '#') { read_line(line); if (line[1] == '#') { // Meta information parse_meta(line); } else { // Must be header information: #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... ) parse_header(line); } } else { // Must be a data line read_CHROM_and_POS_and_skip_remainder_of_line(CHROM, POS); if (last_CHROM != CHROM) { printLOG("\tScanning Chromosome: " + CHROM + "\n"); last_CHROM = CHROM; } if (POS == last_POS) { one_off_warning("\tWarning - file contains entries with the same position. 
This is not supported by vcftools, and may cause unexpected behaviour.\n"); } last_POS = POS; entry_file_locations.push_back(filepos); N_entries++; } } write_index_file(index_filename); } printLOG("File contains " + int2str(N_entries) + " entries and " + int2str(N_indv) + " individuals.\n"); vector<string> meta_lines = meta; meta.resize(0); for (unsigned int ui=0; ui<meta_lines.size(); ui++) parse_meta(meta_lines[ui]); has_genotypes = (N_indv > 0); bool already_found_required_chr = false; bool already_filtered_required_chr = false; if ((exclude_by_chr == true) || (filter_by_chr == true)) { printLOG("Filtering by chromosome.\n"); for (unsigned int ui=0; ui<N_entries; ui++) { if (already_found_required_chr == true) { printLOG("Skipping Remainder.\n"); entry_file_locations.erase(entry_file_locations.begin()+ui, entry_file_locations.end()); break; } if (already_filtered_required_chr == true) { printLOG("Skipping Remainder.\n"); break; } set_filepos(entry_file_locations[ui]); read_CHROM_only(CHROM); if (last_CHROM != CHROM) { printLOG("\tChromosome: " + CHROM + "\n"); if ((filter_by_chr == true) && (last_CHROM == chr)) already_found_required_chr = true; if ((exclude_by_chr == true) && (last_CHROM == exclude_chr)) already_filtered_required_chr = true; last_CHROM = CHROM; } if ((exclude_by_chr == true) && (CHROM == exclude_chr)) { entry_file_locations[ui] = -1; continue; } if ((filter_by_chr == true) && (CHROM != chr)) { entry_file_locations[ui] = -1; continue; } } sort(entry_file_locations.begin(), entry_file_locations.end()); while((entry_file_locations.size() > 0) && (entry_file_locations[0] < 0)) entry_file_locations.pop_front(); N_entries = entry_file_locations.size(); printLOG("Keeping " + int2str(N_entries) + " entries on specified chromosomes.\n"); } include_indv.clear(); include_indv.resize(N_indv, true); include_entry.clear(); include_entry.resize(N_entries, true); include_genotype.clear(); include_genotype.resize(N_entries, vector<bool>(N_indv, true)); }
/**
 * Open has to be called before any of the other functions are called.
 * Throws a string exception if it is unable to open the index file, or if
 * there is a format error in the sarray index file.
 *
 * Will throw an exception if a file set is already open.
 *
 * \param sidx_file The index file to open; it is stored in index_file and
 *                  passed to read_index_file().
 */
void open(std::string sidx_file) {
  // Check the precondition before touching any state, so that a rejected
  // call does not overwrite the index_file of the array that is already open.
  ASSERT_MSG(!array_open, "sarray already open");
  index_file = sidx_file;
  index_info = read_index_file(index_file);
  initialize();
}