Exemplo n.º 1
0
// Read VCF file
void vcf_file::scan_file(const string &chr, const string &exclude_chr, bool force_write_index)
{
	bool filter_by_chr = (chr != "");
	bool exclude_by_chr = (exclude_chr != "");
	string index_filename = filename + ".vcfidx";
	bool could_read_index_file = false;
	if (force_write_index == false)
		could_read_index_file = read_index_file(index_filename);
	string CHROM, last_CHROM="";
	int POS, last_POS = -1;
	if (could_read_index_file == false)
	{
		printLOG("Building new index file.\n");
		string line, CHROM, last_CHROM = "";
		streampos filepos;
		char c;
		N_entries=0;
		N_indv = 0;

		while (!feof())
		{
			filepos = get_filepos();
			c = peek();

			if ((c == '\n') || (c == '\r'))
			{
				read_line(line);
				continue;
			}
			else if (c == EOF)
				break;

			if (c == '#')
			{
				read_line(line);
				if (line[1] == '#')
				{	// Meta information
					parse_meta(line);
				}
				else
				{	// Must be header information: #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	(FORMAT	NA00001 NA00002 ... )
					parse_header(line);
				}
			}
			else
			{	// Must be a data line
				read_CHROM_and_POS_and_skip_remainder_of_line(CHROM, POS);
				if (last_CHROM != CHROM)
				{
					printLOG("\tScanning Chromosome: " + CHROM + "\n");
					last_CHROM = CHROM;
				}
				if (POS == last_POS)
				{
					one_off_warning("\tWarning - file contains entries with the same position. This is not supported by vcftools, and may cause unexpected behaviour.\n");
				}
				last_POS = POS;
				entry_file_locations.push_back(filepos);
				N_entries++;
			}
		}

		write_index_file(index_filename);
	}

	printLOG("File contains " + int2str(N_entries) + " entries and " + int2str(N_indv) + " individuals.\n");
	vector<string> meta_lines = meta; meta.resize(0);
	for (unsigned int ui=0; ui<meta_lines.size(); ui++)
		parse_meta(meta_lines[ui]);
	has_genotypes = (N_indv > 0);

	bool already_found_required_chr = false;
	bool already_filtered_required_chr = false;
	if ((exclude_by_chr == true) || (filter_by_chr == true))
	{
		printLOG("Filtering by chromosome.\n");
		for (unsigned int ui=0; ui<N_entries; ui++)
		{
			if (already_found_required_chr == true)
			{
				printLOG("Skipping Remainder.\n");
				entry_file_locations.erase(entry_file_locations.begin()+ui, entry_file_locations.end());
				break;
			}
			if (already_filtered_required_chr == true)
			{
				printLOG("Skipping Remainder.\n");
				break;
			}

			set_filepos(entry_file_locations[ui]);
			read_CHROM_only(CHROM);

			if (last_CHROM != CHROM)
			{
				printLOG("\tChromosome: " + CHROM + "\n");
				if ((filter_by_chr == true) && (last_CHROM == chr))
					already_found_required_chr = true;

				if ((exclude_by_chr == true) && (last_CHROM == exclude_chr))
					already_filtered_required_chr = true;

				last_CHROM = CHROM;
			}
			if ((exclude_by_chr == true) && (CHROM == exclude_chr))
			{
				entry_file_locations[ui] = -1;
				continue;
			}
			if ((filter_by_chr == true) && (CHROM != chr))
			{
				entry_file_locations[ui] = -1;
				continue;
			}
		}
		sort(entry_file_locations.begin(), entry_file_locations.end());
		while((entry_file_locations.size() > 0) && (entry_file_locations[0] < 0))
			entry_file_locations.pop_front();

		N_entries = entry_file_locations.size();
		printLOG("Keeping " + int2str(N_entries) + " entries on specified chromosomes.\n");
	}

	include_indv.clear();
	include_indv.resize(N_indv, true);
	include_entry.clear();
	include_entry.resize(N_entries, true);
	include_genotype.clear();
	include_genotype.resize(N_entries, vector<bool>(N_indv, true));
}
// Read VCF file
void vcf_file::scan_file(const string &chr, const string &exclude_chr)
{
	printLOG("Scanning " + filename + " ... \n");

	bool filter_by_chr = (chr != "");
	bool exclude_by_chr = (exclude_chr != "");
	string line, tmp;
	N_indv = 0;
	unsigned int N_read = 0;
	istringstream ss;
	string last_CHROM = "";
	N_entries=0;
	string CHROM;
	bool finish = false;
	int last_POS = -1;
	int POS;
	streampos filepos;

	while(!feof())
	{
		filepos = get_filepos();
		read_line(line);

		if (line.length() <= 2)
			continue;

		if (line[0] == '#')
		{
			if (line[1] == '#')
			{	// Meta information
				parse_meta(line);
			}
			else
			{	// Must be header information: #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	(FORMAT	NA00001 NA00002 ... )
				parse_header(line);
			}
		}
		else
		{	// Must be a data line
			ss.clear(); ss.str(line);
			ss >> CHROM;

			N_read++;

			if ((filter_by_chr == true) && (last_CHROM == chr) && (CHROM != chr))
			{	// Presuming the file to be sorted (it should be), we have already found the chromosome we wanted, so there's no need to continue.
				printLOG("\tCompleted reading required chromosome. Skipping remainder of file.\n");
				finish = true;
				break;
			}

			if (CHROM != last_CHROM)
			{
				printLOG("Currently scanning CHROM: " + CHROM);
				if ((exclude_by_chr == true) && (CHROM == exclude_chr))
					printLOG(" - excluded.");
				printLOG("\n");
				last_CHROM = CHROM;
				last_POS = -1;
			}

			if ((exclude_by_chr == true) && (CHROM == exclude_chr))
				continue;

			if (filter_by_chr == true)
			{	// For speed, only parse the entry if it's needed
				if (CHROM == chr)
				{
					ss >> POS;
					if (POS < last_POS)
						error("VCF file is not sorted at: " + CHROM + ":" + int2str(POS));
					last_POS = POS;
					entry_file_locations.push_back(filepos);
					N_entries++;
				}
			}
			else
			{
				ss >> POS;
				if (POS < last_POS)
					error("VCF file is not sorted at: " + CHROM + ":" + int2str(POS));
				last_POS = POS;
				entry_file_locations.push_back(filepos);
				N_entries++;
			}
		}