Пример #1
0
// Reads SNP records from the text file `f` and stores them in this object.
//
// For every line of the file, six whitespace-separated fields are extracted
// and an SNP object is built from fields 0, 1, 3, 4 and 5 (field 2 is read
// but not passed on). The SNP is appended to SNPs, its name to SNPIds, and
// the mapping name -> position is stored in SNPIdsIdx. Finally nSNPs is set
// to the number of SNPs collected.
//
// All of the work shown here is done only when communicator->mpiRoot is set
// (presumably the MPI root process -- confirm against the communicator class).
//
// NOTE(review): the field layout looks like a PLINK .bim file (chromosome,
// id, genetic distance, position, allele 1, allele 2) -- confirm.
// NOTE(review): this excerpt ends before the closing brace of the function,
// so anything done after the root-only section is not documented here.
void Genotype::readBIMFile(std::string f)
{
  std::ifstream file;
  std::string line;
  
  if(communicator->mpiRoot)
  {
    misc.message << "Reading SNPs data from file [ " << f << " ] ..." << std::endl;
    
    // NOTE(review): the result of open() itself is not tested; the code relies
    // on misc.checkFileExists() -- presumably it reports/aborts on a missing file.
    misc.checkFileExists(f);
    file.open(f.c_str());

    // Drop any previously loaded SNP information before refilling.
    this->SNPs.clear();
    this->SNPIds.clear();
    this->SNPIdsIdx.clear();
    // Zero-based position of the SNP currently being read within SNPs/SNPIds.
    int idx = 0;
    while(getline(file,line))
    {
      // getline() has just succeeded, so this guard is not expected to trigger.
      if(!file)
      {
	break;
      }
      // TODO (translated from the original note): check that the line has 6 elements.
      // NOTE(review): no such check is performed; fields missing from a short
      // line are left as empty strings.
      std::istringstream sstemp(line);
      
      std::string temp[6];
      sstemp >> temp[0] >> temp[1] >> temp[2] >> temp[3] >> temp[4] >> temp[5];
      
      // temp[2] is intentionally not forwarded to the SNP constructor.
      SNP snp = SNP(temp[0], temp[1], temp[3], temp[4], temp[5]);
      
      this->SNPs.push_back(snp);
      this->SNPIds.push_back(snp.name);
      // SNP names must be unique. The check runs after the SNP has already
      // been appended; if misc.error() returns, the map entry below is
      // overwritten with the index of the later occurrence.
      if(this->SNPIdsIdx.count(snp.name) != 0)
      {
        misc.error("Error: The SNP with name: " + snp.name + " appears more than one time in the genotypes BIM file.", 0);
      }
      this->SNPIdsIdx[snp.name] = idx;
      idx++;
    }
    file.close();

    this->nSNPs = SNPs.size();
  
    misc.message << this->nSNPs << " SNPs found." << std::endl;
  }
Пример #2
0
// Addition on 29/12/13 (to be compatible with bowtie2):
// CIGAR_indels is field #5 in the sam format
// CIGAR_mismatches is the field that starts with MD:Z.
void ReadMapping::determine_snps(const DNAString& dna_str, const string& quality_str, string CIGAR_indels, string CIGAR_mismatches, size_t ref_pos)
{
	if(CIGAR_mismatches.size() == 0)
		return;

	// This regexp was taken from the SAM manual
	// /^MD:Z:([0-9]+)(([ACGTN]|\^[ACGTN]+)[0-9]+)*$/)
	string::const_iterator p = CIGAR_mismatches.begin();
	if((*p!='M') || (*(p+1)!='D') || (*(p+2)!=':') || (*(p+3)!='Z') || (*(p+4)!=':')) {
		throw Illegal_mapping(CIGAR_mismatches, "Illegal SNPs description, expected MD:Z: at the beginning of CIGAR_mismatches format but found something different", __PRETTY_FUNCTION__);
	}
	p += 5;

	// Read the offset to the first SNP
	size_t	read_pos = 0, ref_deleted = 0;
	while((*p>='0') && (*p<='9')) {
		read_pos *= 10;
		read_pos += (*(p++)-'0');
	}
	// Set the offset to 0 
	read_pos--;
	// Keep on reading the next snps until we reach the end
	while(p != CIGAR_mismatches.end()) {
		// First we expect the ref's char
		if((*p!='^') && ((*p<'A') || (*p>'Z')))
			throw Illegal_mapping(CIGAR_mismatches, string("Illegal SNPs description (2), expected CIGAR_mismatches format but found something different"), __PRETTY_FUNCTION__);

		if(*p=='^') {
			p++;
			while(((*p<'0') || (*p>'9')) && (p != CIGAR_mismatches.end())) {
				ref_deleted++;
				m_snps.push_back(SNP(SNP::DELETION, ref_pos+read_pos+ref_deleted, read_pos, *p, '-', 40));
				p++;
			}
		}
		else {
			read_pos++;
			m_snps.push_back(SNP(SNP::MISMATCH, ref_pos+read_pos+ref_deleted, read_pos, *p, dna_str.get(read_pos), quality_str.at(read_pos)));
			p++;
		}

		// There must be at least one more number
		if((*p < '0') || (*p > '9'))
			throw Illegal_mapping(CIGAR_mismatches, "Illegal SNPs description, expeected CIGAR_mismatches format but found something different", __PRETTY_FUNCTION__);
		unsigned int o = 0;
		while((*p>='0') && (*p<='9')) {
			o *= 10;
			o += (*(p++)-'0');
		}
		read_pos += o;
	}

//cerr << endl << "--------------------------------------------------------------" << endl;
//for(vector<SNP>::iterator itt=m_snps.begin(); itt!=m_snps.end(); itt++) {
//	cerr << (itt-m_snps.begin()) << '\t' << itt->ref_pos << '\t' << itt->read_pos << '\t' << (itt->snp_type==SNP::MISMATCH) << (itt->snp_type==SNP::INSERTION) << (itt->snp_type==SNP::DELETION) << endl;
//}
//cerr << endl << "+++" << endl << endl;
	// Now we have to go through the insertion/deletion CIGAR string and correct mismatches accordingly
	// Examples: 6M1D9M1I132M, 31M1I106M	
	p = CIGAR_indels.begin();
	// Make sure that the end of the string is legal (so we don't have to check it all the time in the loop below)
	char ch = *(CIGAR_indels.rbegin());
	if((ch!='I') && (ch!='D') && (ch!='M')) {
		throw Illegal_mapping(CIGAR_indels, string("Illegal CIGAR-indels description (3)"), __PRETTY_FUNCTION__);
	}

	int read_pos2 = 0;
	size_t ref_pos2 = ref_pos;
	while(p!=CIGAR_indels.end()) {
		size_t o = 0;
		while((*p >= '0') && (*p <= '9')) {
			o *= 10;
			o += (*(p++)-'0');
		}
		if(o == 0) {
			throw Illegal_mapping(CIGAR_indels, string("Illegal CIGAR-indels description (4)"), __PRETTY_FUNCTION__);
		}
		vector<SNP>::iterator ps;
		size_t pos;
		switch (*(p++)) {
			case 'M':
				read_pos2 += o;
				ref_pos2 += o;
				break;
			case 'D':
				// Nothig to do here - this case was already taken care of in CIGAR_mismatches
				ref_pos2 += o;
				break;
			case 'I':
				for(ps=m_snps.begin(); ps!=m_snps.end(); ps++) {
					if(ps->read_pos >= read_pos2) {
						ps->read_pos += o;
					}
				}
				read_pos += o;

				for(ps=m_snps.begin(); (ps!=m_snps.end()) && (ps->read_pos < read_pos2); ps++)
					;
				// At this point we don't provide any information about the insetion except that it exists
				m_snps.insert(ps, SNP(SNP::INSERTION, ref_pos2-1, read_pos2-1, '-', '-', '-'));
				read_pos2 += o;
//				ref_pos2 += o;

//				pos = ps-m_snps.begin();
//				while(o > 0) {
//					ps = m_snps.begin()+pos;
//					pos++;
//					m_snps.insert(ps, SNP(SNP::INSERTION, ref_pos2++, read_pos2-1, '-', dna_str.get(read_pos2), quality_str.at(read_pos2)));
//					o--;
//					read_pos2++;
//				}
				break;
			default:
				throw Illegal_mapping(CIGAR_indels, string("Unexpected CIGAR-indels operation found"), __PRETTY_FUNCTION__);
		}
	}

	// Sanity check
	if(((read_pos+1) != dna_str.size()) || (read_pos2 != dna_str.size()))
		throw Illegal_mapping(CIGAR_indels+string(", ")+CIGAR_mismatches, "snps and offsets do not sum to expected read length", __PRETTY_FUNCTION__);

//for(vector<SNP>::iterator itt=m_snps.begin(); itt!=m_snps.end(); itt++) {
//	cerr << (itt-m_snps.begin()) << '\t' << itt->ref_pos << '\t' << itt->read_pos << '\t' << (itt->snp_type==SNP::MISMATCH) << (itt->snp_type==SNP::INSERTION) << (itt->snp_type==SNP::DELETION) << endl;
//}
}