// ---------------------------------------------------------------------------- ESpecType SpecType( const string& spectype ) // ---------------------------------------------------------------------------- { static map<string, ESpecType> typemap; if ( typemap.empty() ) { typemap["Integer"] = eType_Integer; typemap["Float"] = eType_Float; typemap["Flag"] = eType_Flag; typemap["Character"] = eType_Character; typemap["String"] = eType_String; } try { return typemap[spectype]; } catch( ... ) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Warning, 0, "CVcfReader::xProcessMetaLineInfo: Unrecognized line or record type.", ILineError::eProblem_GeneralParsingError) ); pErr->Throw(); return eType_String; } };
// ---------------------------------------------------------------------------- bool CVcfReader::xProcessMetaLineFilter( const string& line, CRef<CSeq_annot> pAnnot, IMessageListener* pEC) // ---------------------------------------------------------------------------- { const string prefix = "##FILTER=<"; const string postfix = ">"; if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) { return false; } try { vector<string> fields; string key, id, description; string info = line.substr( prefix.length(), line.length() - prefix.length() - postfix.length() ); NStr::Tokenize( info, ",", fields ); NStr::SplitInTwo( fields[0], "=", key, id ); if ( key != "ID" ) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, 0, "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"ID\".", ILineError::eProblem_BadFilterLine) ); pErr->Throw(); } NStr::SplitInTwo( fields[1], "=", key, description ); if ( key != "Description" ) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, 0, "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"Description\".", ILineError::eProblem_BadFilterLine) ); pErr->Throw(); } m_FilterSpecs[id] = CVcfFilterSpec( id, description ); } catch (CObjReaderLineException& err) { ProcessError(err, pEC); } return true; }
// --------------------------------------------------------------------------- void CGtfReader::ReadSeqAnnots( TAnnots& annots, ILineReader& lr, ILineErrorListener* pEC) // ---------------------------------------------------------------------------- { xProgressInit(lr); string line; while (xGetLine(lr, line)) { if (IsCanceled()) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Info, 0, "Reader stopped by user.", ILineError::eProblem_ProgressInfo)); ProcessError(*pErr, pEC); annots.clear(); return; } xReportProgress(pEC); try { if (xIsTrackTerminator(line)) { continue; } if (x_ParseBrowserLineGff(line, m_CurrentBrowserInfo)) { continue; } if (xParseTrackLine(line, pEC)) { continue; } if (x_ParseFeatureGff(line, annots, pEC)) { continue; } } catch(CObjReaderLineException& err) { err.SetLineNumber(m_uLineNumber); } } }
// ---------------------------------------------------------------------------- CRef< CSeq_annot > CVcfReader::ReadSeqAnnot( ILineReader& lr, IMessageListener* pEC ) // ---------------------------------------------------------------------------- { CRef< CSeq_annot > annot( new CSeq_annot ); CRef< CAnnot_descr > desc( new CAnnot_descr ); annot->SetDesc( *desc ); annot->SetData().SetFtable(); m_Meta.Reset( new CAnnotdesc ); m_Meta->SetUser().SetType().SetStr( "vcf-meta-info" ); while ( ! lr.AtEOF() ) { m_uLineNumber++; string line = *(++lr); NStr::TruncateSpacesInPlace( line ); if (xProcessMetaLine(line, annot, pEC)) { continue; } if (xProcessHeaderLine(line, annot)) { continue; } if (xProcessDataLine(line, annot, pEC)) { continue; } // still here? not good! AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Warning, 0, "CVcfReader::ReadSeqAnnot: Unrecognized line or record type.", ILineError::eProblem_GeneralParsingError) ); ProcessWarning(*pErr, pEC); } return annot; }
// --------------------------------------------------------------------------- bool CVcfReader::xNormalizeData( CVcfData& data, IMessageListener* pEC) // --------------------------------------------------------------------------- { // make sure none of the alternatives is equal to the reference: for (size_t u=0; u < data.m_Alt.size(); ++u) { if (data.m_Alt[u] == data.m_strRef) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, 0, "CVcfReader::xNormalizeData: Invalid alternative.", ILineError::eProblem_GeneralParsingError)); ProcessError(*pErr, pEC); return false; } } // normalize ref/alt by trimming common prefices and adjusting location bool trimComplete = false; while (!data.m_strRef.empty()) { char leadBase = data.m_strRef[0]; for (size_t u=0; u < data.m_Alt.size(); ++u) { if (!NStr::StartsWith(data.m_Alt[u], leadBase)) { trimComplete = true; break; } } if (trimComplete) { break; } data.m_strRef = data.m_strRef.substr(1); for (size_t u=0; u < data.m_Alt.size(); ++u) { data.m_Alt[u] = data.m_Alt[u].substr(1); } data.m_iPos++; } // normalize ref/alt by trimming common postfixes and adjusting location trimComplete = false; size_t refSize = data.m_strRef.size(); size_t trimSize = 0; while (refSize > trimSize) { string postfix = data.m_strRef.substr(refSize-1-trimSize, trimSize+1); for (size_t u=0; u < data.m_Alt.size(); ++u) { size_t altSize = data.m_Alt[u].size(); if (altSize < trimSize+1) { trimComplete = true; break; } string postfixA = data.m_Alt[u].substr(altSize-1-trimSize, trimSize+1); if (postfix != postfixA) { trimComplete = true; break; } } if (trimComplete) { break; } trimSize++; } if (trimSize > 0) { data.m_strRef = data.m_strRef.substr(0, data.m_strRef.size()-trimSize); for (size_t u=0; u < data.m_Alt.size(); ++u) { data.m_Alt[u] = data.m_Alt[u].substr(0, data.m_Alt[u].size()-trimSize); } } return true; }
// ---------------------------------------------------------------------------- bool CVcfReader::xParseData( const string& line, CVcfData& data, IMessageListener* pEC) // ---------------------------------------------------------------------------- { vector<string> columns; NStr::Tokenize( line, "\t", columns, NStr::eMergeDelims ); if ( columns.size() < 8 ) { return false; } try { data.m_strLine = line; data.m_strChrom = columns[0]; data.m_iPos = NStr::StringToInt( columns[1] ); NStr::Tokenize( columns[2], ";", data.m_Ids, NStr::eNoMergeDelims ); if ( (data.m_Ids.size() == 1) && (data.m_Ids[0] == ".") ) { data.m_Ids.clear(); } data.m_strRef = columns[3]; NStr::Tokenize( columns[4], ",", data.m_Alt, NStr::eNoMergeDelims ); if ( columns[5] != "." ) { data.m_pdQual = new double( NStr::StringToDouble( columns[5] ) ); } data.m_strFilter = columns[6]; vector<string> infos; if ( columns[7] != "." ) { NStr::Tokenize( columns[7], ";", infos, NStr::eMergeDelims ); for ( vector<string>::iterator it = infos.begin(); it != infos.end(); ++it ) { string key, value; NStr::SplitInTwo( *it, "=", key, value ); data.m_Info[key] = vector<string>(); NStr::Tokenize( value, ",", data.m_Info[key] ); } } if ( columns.size() > 8 ) { NStr::Tokenize( columns[8], ":", data.m_FormatKeys, NStr::eMergeDelims ); for ( size_t u=9; u < columns.size(); ++u ) { vector<string> values; NStr::Tokenize( columns[u], ":", values, NStr::eMergeDelims ); data.m_GenotypeData[ m_GenotypeHeaders[u-9] ] = values; } } } catch ( ... ) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, 0, "Unable to parse given VCF data (syntax error).", ILineError::eProblem_GeneralParsingError)); ProcessError(*pErr, pEC); return false; } if (!xNormalizeData(data, pEC)) { return false; } //assign set type: //test for all SNVs bool maybeAllSnv = (data.m_strRef.size() == 1); if (maybeAllSnv) { for (size_t u=0; u < data.m_Alt.size(); ++u) { if (data.m_Alt[u].size() != 1) { maybeAllSnv = false; break; } } if (maybeAllSnv) { data.m_SetType = CVcfData::ST_ALL_SNV; return true; } } //test for all mnvs: bool maybeAllMnv = true; size_t refSize = data.m_strRef.size(); for (size_t u=0; u < data.m_Alt.size(); ++u) { if (data.m_Alt[u].size() != refSize) { maybeAllMnv = false; break; } } if (maybeAllMnv) { data.m_SetType = CVcfData::ST_ALL_MNV; return true; } //test for all insertions: bool maybeAllIns = true; for (size_t u=0; u < data.m_Alt.size(); ++u) { if (! NStr::StartsWith(data.m_Alt[u], data.m_strRef)) { maybeAllIns = false; break; } } if (maybeAllIns) { data.m_SetType = CVcfData::ST_ALL_INS; return true; } //test for all deletions: // note: even it is all deletions we are not able to process them // as such because those deletions would be at different ASN1 // locations. Hence we punt to "indel" if there is more than one // alternative. bool maybeAllDel = false; for (size_t u=0; u < data.m_Alt.size(); ++u) { if (data.m_Alt.size() == 1 && data.m_Alt[0].empty()) { maybeAllDel = true; } } if (maybeAllDel) { data.m_SetType = CVcfData::ST_ALL_DEL; return true; } data.m_SetType = CVcfData::ST_MIXED; return true; }
// ---------------------------------------------------------------------------- bool CVcfReader::xAssignVariantProps( CVcfData& data, CRef<CSeq_feat> pFeat, IMessageListener* pEC) // ---------------------------------------------------------------------------- { typedef CVariantProperties VP; CVcfData::INFOS& infos = data.m_Info; VP& props = pFeat->SetData().SetVariation().SetVariant_prop(); CVcfData::INFOS::iterator it; props.SetResource_link() = 0; props.SetGene_location() = 0; props.SetEffect() = 0; props.SetMapping() = 0; props.SetFrequency_based_validation() = 0; props.SetGenotype() = 0; props.SetQuality_check() = 0; //byte F0 props.SetVersion() = 5; //superbyte F1 it = infos.find("SLO"); if (infos.end() != it) { props.SetResource_link() |= VP::eResource_link_submitterLinkout; infos.erase(it); } it = infos.find("S3D"); if (infos.end() != it) { props.SetResource_link() |= VP::eResource_link_has3D; infos.erase(it); } it = infos.find("TPA"); if (infos.end() != it) { props.SetResource_link() |= VP::eResource_link_provisional; infos.erase(it); } it = infos.find("PM"); if (infos.end() != it) { props.SetResource_link() |= VP::eResource_link_preserved; infos.erase(it); } it = infos.find("CLN"); if (infos.end() != it) { props.SetResource_link() |= VP::eResource_link_clinical; infos.erase(it); } //todo: INFO ID=PMC it = infos.find("PMC"); if (infos.end() != it) { infos.erase(it); } it = infos.find("PMID"); if (infos.end() != it) { vector<string> pmids = it->second; for (vector<string>::const_iterator cit = pmids.begin(); cit != pmids.end(); ++cit) { try { string db, tag; NStr::SplitInTwo(*cit, ":", db, tag); if (db != "PM") { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Warning, 0, "CVcfReader::xAssignVariantProps: Invalid PMID database ID.", ILineError::eProblem_GeneralParsingError) ); ProcessWarning(*pErr, pEC); continue; } CRef<CDbtag> pDbtag(new CDbtag); pDbtag->SetDb(db); pDbtag->SetTag().SetId( NStr::StringToInt(tag)); pFeat->SetDbxref().push_back(pDbtag); } catch(...) {} } infos.erase(it); } //superbyte F2 it = infos.find("R5"); if (infos.end() != it) { props.SetGene_location() |= VP::eGene_location_near_gene_5; infos.erase(it); } it = infos.find("R3"); if (infos.end() != it) { props.SetGene_location() |= VP::eGene_location_near_gene_3; infos.erase(it); } it = infos.find("INT"); if (infos.end() != it) { props.SetGene_location() |= VP::eGene_location_intron; infos.erase(it); } it = infos.find("DSS"); if (infos.end() != it) { props.SetGene_location() |= VP::eGene_location_donor; infos.erase(it); } it = infos.find("ASS"); if (infos.end() != it) { props.SetGene_location() |= VP::eGene_location_acceptor; infos.erase(it); } it = infos.find("U5"); if (infos.end() != it) { props.SetGene_location() |= VP::eGene_location_utr_5; infos.erase(it); } it = infos.find("U3"); if (infos.end() != it) { props.SetGene_location() |= CVariantProperties::eGene_location_utr_3; infos.erase(it); } it = infos.find("SYN"); if (infos.end() != it) { props.SetGene_location() |= VP::eEffect_synonymous; infos.erase(it); } it = infos.find("NSN"); if (infos.end() != it) { props.SetGene_location() |= VP::eEffect_stop_gain; infos.erase(it); } it = infos.find("NSM"); if (infos.end() != it) { props.SetGene_location() |= VP::eEffect_missense; infos.erase(it); } it = infos.find("NSF"); if (infos.end() != it) { props.SetGene_location() |= VP::eEffect_frameshift; infos.erase(it); } //byte F3 it = infos.find("WGT"); if (infos.end() != it) { int weight = NStr::StringToInt( infos["WGT"][0] ); switch(weight) { default: break; case 1: props.SetMap_weight() = VP::eMap_weight_is_uniquely_placed; infos.erase(it); break; case 2: props.SetMap_weight() = VP::eMap_weight_placed_twice_on_same_chrom; infos.erase(it); break; case 3: props.SetMap_weight() = VP::eMap_weight_placed_twice_on_diff_chrom; infos.erase(it); break; case 10: props.SetMap_weight() = VP::eMap_weight_many_placements; break; } } it = infos.find("ASP"); if (infos.end() != it) { props.SetMapping() |= VP::eMapping_is_assembly_specific; infos.erase(it); } it = infos.find("CFL"); if (infos.end() != it) { props.SetMapping() |= VP::eMapping_has_assembly_conflict; infos.erase(it); } it = infos.find("OTH"); if (infos.end() != it) { props.SetMapping() |= VP::eMapping_has_other_snp; infos.erase(it); } //byte F4 it = infos.find("OTH"); if (infos.end() != it) { props.SetMapping() |= VP::eFrequency_based_validation_above_5pct_all; infos.erase(it); } it = infos.find("G5A"); if (infos.end() != it) { props.SetMapping() |= VP::eFrequency_based_validation_above_5pct_1plus; infos.erase(it); } it = infos.find("VLD"); if (infos.end() != it) { props.SetMapping() |= VP::eFrequency_based_validation_validated; infos.erase(it); } it = infos.find("MUT"); if (infos.end() != it) { props.SetMapping() |= VP::eFrequency_based_validation_is_mutation; infos.erase(it); } it = infos.find("GMAF"); if (infos.end() != it) { props.SetAllele_frequency() = NStr::StringToDouble(infos["GMAF"][0]); infos.erase(it); } //byte F5 it = infos.find("GNO"); if (infos.end() != it) { props.SetGenotype() |= VP::eGenotype_has_genotypes; infos.erase(it); } it = infos.find("HD"); if (infos.end() != it) { props.SetResource_link() |= VP::eResource_link_genotypeKit; infos.erase(it); } //byte F6 if (infos.end() != infos.find("PH3")) { CRef<CDbtag> pDbtag(new CDbtag); pDbtag->SetDb("BioProject"); pDbtag->SetTag().SetId(60835); pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag); } if (infos.end() != infos.find("KGPhase1")) { CRef<CDbtag> pDbtag(new CDbtag); pDbtag->SetDb("BioProject"); pDbtag->SetTag().SetId(28889); pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag); } //byte F7 //byte F8 //no relevant information found in VCF //byte F9 it = infos.find("GCF"); if (infos.end() != it) { props.SetQuality_check() |= VP::eQuality_check_genotype_conflict; infos.erase(it); } it = infos.find("NOV"); if (infos.end() != it) { props.SetQuality_check() |= VP::eQuality_check_non_overlapping_alleles; infos.erase(it); } it = infos.find("WTD"); if (infos.end() != it) { props.SetQuality_check() |= VP::eQuality_check_withdrawn_by_submitter; infos.erase(it); } it = infos.find("NOC"); if (infos.end() != it) { props.SetQuality_check() |= VP::eQuality_check_contig_allele_missing; infos.erase(it); } return true; }
// ---------------------------------------------------------------------------- void CUCSCRegionReader::x_SetFeatureLocation( CRef<CSeq_feat>& feature, const vector<string>& fields ) // ---------------------------------------------------------------------------- { // // Note: // BED convention for specifying intervals is 0-based, first in, first out. // ASN convention for specifying intervals is 0-based, first in, last in. // Hence, conversion BED->ASN leaves the first leaves the "from" coordinate // unchanged, and decrements the "to" coordinate by one. // CRef<CSeq_loc> location(new CSeq_loc); int from, to; from = to = -1; //already established: We got at least three columns try { from = NStr::StringToInt(fields[1], NStr::fAllowCommas)-1; } catch(std::exception&) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, m_uLineNumber, "Invalid data line: Bad \"SeqStart\" value." ) ); pErr->Throw(); } to = from; if (fields.size()>2) try { to = NStr::StringToInt(fields[2], NStr::fAllowCommas) - 1; } catch(std::exception&) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, m_uLineNumber, "Invalid data line: Bad \"SeqStop\" value.") ); pErr->Throw(); } if (from == to) { location->SetPnt().SetPoint(from); } else if (from < to) { location->SetInt().SetFrom(from); location->SetInt().SetTo(to); } else { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, m_uLineNumber, "Invalid data line: \"SeqStop\" less than \"SeqStart\"." ) ); pErr->Throw(); } size_t strand_field = 3; if (strand_field < fields.size()) { string strand = fields[strand_field]; if (strand != "+" && strand != "-" && strand != ".") { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, m_uLineNumber, "Invalid data line: Invalid strand character." ) ); pErr->Throw(); } location->SetStrand(( fields[strand_field] == "+" ) ? eNa_strand_plus : eNa_strand_minus ); } try { CRef<CSeq_id> id = CReadUtil::AsSeqId(fields[0], m_iFlags, false); //CRef<CSeq_id> id (new CSeq_id(fields[0], CSeq_id::fParse_AnyRaw | m_iFlags)); location->SetId(*id); feature->SetLocation(*location); } catch(CSeqIdException&) { AutoPtr<CObjReaderLineException> pErr( CObjReaderLineException::Create( eDiag_Error, m_uLineNumber, "Malformed sequence id:" ) ); pErr->Throw(); } }