void Genotype::readBIMFile(std::string f) { std::ifstream file; std::string line; if(communicator->mpiRoot) { misc.message << "Reading SNPs data from file [ " << f << " ] ..." << std::endl; misc.checkFileExists(f); file.open(f.c_str()); this->SNPs.clear(); this->SNPIds.clear(); this->SNPIdsIdx.clear(); int idx = 0; while(getline(file,line)) { if(!file) { break; } std::istringstream sstemp(line); //->Comprova que te 6 elements. std::string temp[6]; sstemp >> temp[0] >> temp[1] >> temp[2] >> temp[3] >> temp[4] >> temp[5]; SNP snp = SNP(temp[0], temp[1], temp[3], temp[4], temp[5]); this->SNPs.push_back(snp); this->SNPIds.push_back(snp.name); if(this->SNPIdsIdx.count(snp.name) != 0) { misc.error("Error: The SNP with name: " + snp.name + " appears more than one time in the genotypes BIM file.", 0); } this->SNPIdsIdx[snp.name] = idx; idx++; } file.close(); this->nSNPs = SNPs.size(); misc.message << this->nSNPs << " SNPs found." << std::endl; }
// Addition on 29/12/13 (to be compatible with bowtie2): // CIGAR_indels is field #5 in the sam format // CIGAR_mismatches is the field that starts with MD:Z. void ReadMapping::determine_snps(const DNAString& dna_str, const string& quality_str, string CIGAR_indels, string CIGAR_mismatches, size_t ref_pos) { if(CIGAR_mismatches.size() == 0) return; // This regexp was taken from the SAM manual // /^MD:Z:([0-9]+)(([ACGTN]|\^[ACGTN]+)[0-9]+)*$/) string::const_iterator p = CIGAR_mismatches.begin(); if((*p!='M') || (*(p+1)!='D') || (*(p+2)!=':') || (*(p+3)!='Z') || (*(p+4)!=':')) { throw Illegal_mapping(CIGAR_mismatches, "Illegal SNPs description, expected MD:Z: at the beginning of CIGAR_mismatches format but found something different", __PRETTY_FUNCTION__); } p += 5; // Read the offset to the first SNP size_t read_pos = 0, ref_deleted = 0; while((*p>='0') && (*p<='9')) { read_pos *= 10; read_pos += (*(p++)-'0'); } // Set the offset to 0 read_pos--; // Keep on reading the next snps until we reach the end while(p != CIGAR_mismatches.end()) { // First we expect the ref's char if((*p!='^') && ((*p<'A') || (*p>'Z'))) throw Illegal_mapping(CIGAR_mismatches, string("Illegal SNPs description (2), expected CIGAR_mismatches format but found something different"), __PRETTY_FUNCTION__); if(*p=='^') { p++; while(((*p<'0') || (*p>'9')) && (p != CIGAR_mismatches.end())) { ref_deleted++; m_snps.push_back(SNP(SNP::DELETION, ref_pos+read_pos+ref_deleted, read_pos, *p, '-', 40)); p++; } } else { read_pos++; m_snps.push_back(SNP(SNP::MISMATCH, ref_pos+read_pos+ref_deleted, read_pos, *p, dna_str.get(read_pos), quality_str.at(read_pos))); p++; } // There must be at least one more number if((*p < '0') || (*p > '9')) throw Illegal_mapping(CIGAR_mismatches, "Illegal SNPs description, expeected CIGAR_mismatches format but found something different", __PRETTY_FUNCTION__); unsigned int o = 0; while((*p>='0') && (*p<='9')) { o *= 10; o += (*(p++)-'0'); } read_pos += o; } //cerr << endl << "--------------------------------------------------------------" << endl; //for(vector<SNP>::iterator itt=m_snps.begin(); itt!=m_snps.end(); itt++) { // cerr << (itt-m_snps.begin()) << '\t' << itt->ref_pos << '\t' << itt->read_pos << '\t' << (itt->snp_type==SNP::MISMATCH) << (itt->snp_type==SNP::INSERTION) << (itt->snp_type==SNP::DELETION) << endl; //} //cerr << endl << "+++" << endl << endl; // Now we have to go through the insertion/deletion CIGAR string and correct mismatches accordingly // Examples: 6M1D9M1I132M, 31M1I106M p = CIGAR_indels.begin(); // Make sure that the end of the string is legal (so we don't have to check it all the time in the loop below) char ch = *(CIGAR_indels.rbegin()); if((ch!='I') && (ch!='D') && (ch!='M')) { throw Illegal_mapping(CIGAR_indels, string("Illegal CIGAR-indels description (3)"), __PRETTY_FUNCTION__); } int read_pos2 = 0; size_t ref_pos2 = ref_pos; while(p!=CIGAR_indels.end()) { size_t o = 0; while((*p >= '0') && (*p <= '9')) { o *= 10; o += (*(p++)-'0'); } if(o == 0) { throw Illegal_mapping(CIGAR_indels, string("Illegal CIGAR-indels description (4)"), __PRETTY_FUNCTION__); } vector<SNP>::iterator ps; size_t pos; switch (*(p++)) { case 'M': read_pos2 += o; ref_pos2 += o; break; case 'D': // Nothig to do here - this case was already taken care of in CIGAR_mismatches ref_pos2 += o; break; case 'I': for(ps=m_snps.begin(); ps!=m_snps.end(); ps++) { if(ps->read_pos >= read_pos2) { ps->read_pos += o; } } read_pos += o; for(ps=m_snps.begin(); (ps!=m_snps.end()) && (ps->read_pos < read_pos2); ps++) ; // At this point we don't provide any information about the insetion except that it exists m_snps.insert(ps, SNP(SNP::INSERTION, ref_pos2-1, read_pos2-1, '-', '-', '-')); read_pos2 += o; // ref_pos2 += o; // pos = ps-m_snps.begin(); // while(o > 0) { // ps = m_snps.begin()+pos; // pos++; // m_snps.insert(ps, SNP(SNP::INSERTION, ref_pos2++, read_pos2-1, '-', dna_str.get(read_pos2), quality_str.at(read_pos2))); // o--; // read_pos2++; // } break; default: throw Illegal_mapping(CIGAR_indels, string("Unexpected CIGAR-indels operation found"), __PRETTY_FUNCTION__); } } // Sanity check if(((read_pos+1) != dna_str.size()) || (read_pos2 != dna_str.size())) throw Illegal_mapping(CIGAR_indels+string(", ")+CIGAR_mismatches, "snps and offsets do not sum to expected read length", __PRETTY_FUNCTION__); //for(vector<SNP>::iterator itt=m_snps.begin(); itt!=m_snps.end(); itt++) { // cerr << (itt-m_snps.begin()) << '\t' << itt->ref_pos << '\t' << itt->read_pos << '\t' << (itt->snp_type==SNP::MISMATCH) << (itt->snp_type==SNP::INSERTION) << (itt->snp_type==SNP::DELETION) << endl; //} }