// Read VCF file void vcf_file::scan_file(const string &chr, const string &exclude_chr, bool force_write_index) { bool filter_by_chr = (chr != ""); bool exclude_by_chr = (exclude_chr != ""); string index_filename = filename + ".vcfidx"; bool could_read_index_file = false; if (force_write_index == false) could_read_index_file = read_index_file(index_filename); string CHROM, last_CHROM=""; int POS, last_POS = -1; if (could_read_index_file == false) { printLOG("Building new index file.\n"); string line, CHROM, last_CHROM = ""; streampos filepos; char c; N_entries=0; N_indv = 0; while (!feof()) { filepos = get_filepos(); c = peek(); if ((c == '\n') || (c == '\r')) { read_line(line); continue; } else if (c == EOF) break; if (c == '#') { read_line(line); if (line[1] == '#') { // Meta information parse_meta(line); } else { // Must be header information: #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... ) parse_header(line); } } else { // Must be a data line read_CHROM_and_POS_and_skip_remainder_of_line(CHROM, POS); if (last_CHROM != CHROM) { printLOG("\tScanning Chromosome: " + CHROM + "\n"); last_CHROM = CHROM; } if (POS == last_POS) { one_off_warning("\tWarning - file contains entries with the same position. This is not supported by vcftools, and may cause unexpected behaviour.\n"); } last_POS = POS; entry_file_locations.push_back(filepos); N_entries++; } } write_index_file(index_filename); } printLOG("File contains " + int2str(N_entries) + " entries and " + int2str(N_indv) + " individuals.\n"); vector<string> meta_lines = meta; meta.resize(0); for (unsigned int ui=0; ui<meta_lines.size(); ui++) parse_meta(meta_lines[ui]); has_genotypes = (N_indv > 0); bool already_found_required_chr = false; bool already_filtered_required_chr = false; if ((exclude_by_chr == true) || (filter_by_chr == true)) { printLOG("Filtering by chromosome.\n"); for (unsigned int ui=0; ui<N_entries; ui++) { if (already_found_required_chr == true) { printLOG("Skipping Remainder.\n"); entry_file_locations.erase(entry_file_locations.begin()+ui, entry_file_locations.end()); break; } if (already_filtered_required_chr == true) { printLOG("Skipping Remainder.\n"); break; } set_filepos(entry_file_locations[ui]); read_CHROM_only(CHROM); if (last_CHROM != CHROM) { printLOG("\tChromosome: " + CHROM + "\n"); if ((filter_by_chr == true) && (last_CHROM == chr)) already_found_required_chr = true; if ((exclude_by_chr == true) && (last_CHROM == exclude_chr)) already_filtered_required_chr = true; last_CHROM = CHROM; } if ((exclude_by_chr == true) && (CHROM == exclude_chr)) { entry_file_locations[ui] = -1; continue; } if ((filter_by_chr == true) && (CHROM != chr)) { entry_file_locations[ui] = -1; continue; } } sort(entry_file_locations.begin(), entry_file_locations.end()); while((entry_file_locations.size() > 0) && (entry_file_locations[0] < 0)) entry_file_locations.pop_front(); N_entries = entry_file_locations.size(); printLOG("Keeping " + int2str(N_entries) + " entries on specified chromosomes.\n"); } include_indv.clear(); include_indv.resize(N_indv, true); include_entry.clear(); include_entry.resize(N_entries, true); include_genotype.clear(); include_genotype.resize(N_entries, vector<bool>(N_indv, true)); }
// Read VCF file void vcf_file::scan_file(const string &chr, const string &exclude_chr) { printLOG("Scanning " + filename + " ... \n"); bool filter_by_chr = (chr != ""); bool exclude_by_chr = (exclude_chr != ""); string line, tmp; N_indv = 0; unsigned int N_read = 0; istringstream ss; string last_CHROM = ""; N_entries=0; string CHROM; bool finish = false; int last_POS = -1; int POS; streampos filepos; while(!feof()) { filepos = get_filepos(); read_line(line); if (line.length() <= 2) continue; if (line[0] == '#') { if (line[1] == '#') { // Meta information parse_meta(line); } else { // Must be header information: #CHROM POS ID REF ALT QUAL FILTER INFO (FORMAT NA00001 NA00002 ... ) parse_header(line); } } else { // Must be a data line ss.clear(); ss.str(line); ss >> CHROM; N_read++; if ((filter_by_chr == true) && (last_CHROM == chr) && (CHROM != chr)) { // Presuming the file to be sorted (it should be), we have already found the chromosome we wanted, so there's no need to continue. printLOG("\tCompleted reading required chromosome. Skipping remainder of file.\n"); finish = true; break; } if (CHROM != last_CHROM) { printLOG("Currently scanning CHROM: " + CHROM); if ((exclude_by_chr == true) && (CHROM == exclude_chr)) printLOG(" - excluded."); printLOG("\n"); last_CHROM = CHROM; last_POS = -1; } if ((exclude_by_chr == true) && (CHROM == exclude_chr)) continue; if (filter_by_chr == true) { // For speed, only parse the entry if it's needed if (CHROM == chr) { ss >> POS; if (POS < last_POS) error("VCF file is not sorted at: " + CHROM + ":" + int2str(POS)); last_POS = POS; entry_file_locations.push_back(filepos); N_entries++; } } else { ss >> POS; if (POS < last_POS) error("VCF file is not sorted at: " + CHROM + ":" + int2str(POS)); last_POS = POS; entry_file_locations.push_back(filepos); N_entries++; } }