Ejemplo n.º 1
0
// Scan the VCF file and build the list of entry file positions.
//
// Unless force_write_index is true, first tries to load an existing index
// (filename + ".vcfidx") with read_index_file(). If no index could be read,
// the whole file is scanned: "##" lines go to parse_meta(), the remaining
// "#" line goes to parse_header(), and the file position of every data line
// is appended to entry_file_locations. The new index is then written with
// write_index_file().
//
// Afterwards the stored meta lines are re-parsed, entries are optionally
// restricted by chromosome, and the include_indv / include_entry /
// include_genotype masks are reset so that everything is included.
//
// chr               - if non-empty, keep only entries on this chromosome.
// exclude_chr       - if non-empty, drop entries on this chromosome.
// force_write_index - if true, skip reading any existing index and rebuild it.
//
// The chromosome filter stops early once the requested chromosome has been
// passed, so it relies on each chromosome's entries being contiguous.
void vcf_file::scan_file(const string &chr, const string &exclude_chr, bool force_write_index)
{
	bool filter_by_chr = (chr != "");
	bool exclude_by_chr = (exclude_chr != "");
	string index_filename = filename + ".vcfidx";
	bool could_read_index_file = false;
	if (force_write_index == false)
		could_read_index_file = read_index_file(index_filename);

	if (could_read_index_file == false)
	{
		printLOG("Building new index file.\n");
		// Scan-phase state only; the chromosome filter below keeps its own.
		string line, CHROM, last_CHROM = "";
		int POS = -1, last_POS = -1;
		streampos filepos;
		// NOTE(review): c is a char compared against EOF below; if peek()
		// returns an int this comparison can misfire on some platforms - confirm.
		char c;
		N_entries=0;
		N_indv = 0;

		while (!feof())
		{
			// Remember where this line starts before consuming any of it.
			filepos = get_filepos();
			c = peek();

			if ((c == '\n') || (c == '\r'))
			{	// Blank line: consume and ignore.
				read_line(line);
				continue;
			}
			else if (c == EOF)
				break;

			if (c == '#')
			{
				read_line(line);
				if ((line.size() > 1) && (line[1] == '#'))
				{	// Meta information
					parse_meta(line);
				}
				else
				{	// Must be header information: #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	(FORMAT	NA00001 NA00002 ... )
					parse_header(line);
				}
			}
			else
			{	// Must be a data line
				read_CHROM_and_POS_and_skip_remainder_of_line(CHROM, POS);
				if (last_CHROM != CHROM)
				{
					printLOG("\tScanning Chromosome: " + CHROM + "\n");
					last_CHROM = CHROM;
					// Positions are per chromosome: forget the previous one so the
					// first entry of a new chromosome cannot trigger the
					// duplicate-position warning.
					last_POS = -1;
				}
				if (POS == last_POS)
				{
					one_off_warning("\tWarning - file contains entries with the same position. This is not supported by vcftools, and may cause unexpected behaviour.\n");
				}
				last_POS = POS;
				entry_file_locations.push_back(filepos);
				N_entries++;
			}
		}

		write_index_file(index_filename);
	}

	printLOG("File contains " + int2str(N_entries) + " entries and " + int2str(N_indv) + " individuals.\n");
	// Re-run parse_meta over a copy of the stored meta lines after emptying meta.
	vector<string> meta_lines = meta; meta.resize(0);
	for (unsigned int ui=0; ui<meta_lines.size(); ui++)
		parse_meta(meta_lines[ui]);
	has_genotypes = (N_indv > 0);

	if ((exclude_by_chr == true) || (filter_by_chr == true))
	{
		printLOG("Filtering by chromosome.\n");
		string CHROM, last_CHROM = "";
		bool already_found_required_chr = false;
		bool already_filtered_required_chr = false;
		for (unsigned int ui=0; ui<N_entries; ui++)
		{
			if (already_found_required_chr == true)
			{	// Passed the requested chromosome: nothing further can match.
				printLOG("Skipping Remainder.\n");
				entry_file_locations.erase(entry_file_locations.begin()+ui, entry_file_locations.end());
				break;
			}
			// Once the excluded chromosome has been passed the remainder can be
			// kept untouched - but only if we are not also filtering for a
			// specific chromosome, otherwise the remaining entries still need
			// to be checked against chr.
			if ((already_filtered_required_chr == true) && (filter_by_chr == false))
			{
				printLOG("Skipping Remainder.\n");
				break;
			}

			set_filepos(entry_file_locations[ui]);
			read_CHROM_only(CHROM);

			if (last_CHROM != CHROM)
			{
				printLOG("\tChromosome: " + CHROM + "\n");
				if ((filter_by_chr == true) && (last_CHROM == chr))
					already_found_required_chr = true;

				if ((exclude_by_chr == true) && (last_CHROM == exclude_chr))
					already_filtered_required_chr = true;

				last_CHROM = CHROM;
			}
			// Mark rejected entries with -1; they are removed after the loop.
			if ((exclude_by_chr == true) && (CHROM == exclude_chr))
			{
				entry_file_locations[ui] = -1;
				continue;
			}
			if ((filter_by_chr == true) && (CHROM != chr))
			{
				entry_file_locations[ui] = -1;
				continue;
			}
		}
		// Sorting moves the -1 markers to the front, where they are popped off.
		sort(entry_file_locations.begin(), entry_file_locations.end());
		while((entry_file_locations.size() > 0) && (entry_file_locations[0] < 0))
			entry_file_locations.pop_front();

		N_entries = entry_file_locations.size();
		printLOG("Keeping " + int2str(N_entries) + " entries on specified chromosomes.\n");
	}

	// Reset all inclusion masks so that every individual, entry and genotype is included.
	include_indv.clear();
	include_indv.resize(N_indv, true);
	include_entry.clear();
	include_entry.resize(N_entries, true);
	include_genotype.clear();
	include_genotype.resize(N_entries, vector<bool>(N_indv, true));
}
Ejemplo n.º 2
0
 /**
  * Opens the sarray described by the given index file.
  *
  * Open has to be called before any of the other functions are called.
  * Throws a string exception if it is unable to open the index file, or if
  * there is a format error in the sarray index file.
  *
  * Will throw an exception if a file set is already open. The check is made
  * before any member is modified, so a rejected call leaves index_file
  * unchanged.
  *
  * \param sidx_file Path of the sarray index file to read.
  */
 void open(std::string sidx_file) {
   // Validate the precondition first so a failed call cannot clobber the
   // index_file recorded for the array that is already open.
   ASSERT_MSG(!array_open, "sarray already open");
   index_file = sidx_file;
   index_info = read_index_file(index_file);
   initialize();
 }