Beispiel #1
0
//  ----------------------------------------------------------------------------
//  Translate the textual type of a VCF meta-information spec ("Integer",
//  "Float", "Flag", "Character" or "String") into the matching ESpecType.
//
//  The comparison is case sensitive. For any other spelling a
//  CObjReaderLineException of warning severity is raised through Throw().
//  ----------------------------------------------------------------------------
ESpecType SpecType( 
    const string& spectype )
//  ----------------------------------------------------------------------------
{
    static map<string, ESpecType> typemap;
    if ( typemap.empty() ) {
        typemap["Integer"] = eType_Integer;
        typemap["Float"] = eType_Float;
        typemap["Flag"] = eType_Flag;
        typemap["Character"] = eType_Character;
        typemap["String"] = eType_String;
    }

    // Look the name up with find(): operator[] never throws, it would insert
    // a value-initialized entry for every unknown name and hand that back,
    // so an unrecognized type could never be detected (and the static table
    // would keep growing).
    map<string, ESpecType>::const_iterator cit = typemap.find( spectype );
    if ( cit != typemap.end() ) {
        return cit->second;
    }

    AutoPtr<CObjReaderLineException> pErr(
        CObjReaderLineException::Create(
        eDiag_Warning,
        0,
        "CVcfReader::xProcessMetaLineInfo: Unrecognized line or record type.",
        ILineError::eProblem_GeneralParsingError) );
    pErr->Throw();
    // Throw() is expected not to return; the return statement only keeps
    // every control path well formed.
    return eType_String;
}
Beispiel #2
0
//  ----------------------------------------------------------------------------
//  Handle a "##FILTER=<ID=...,Description=...>" meta line.
//
//  Returns false if the line does not start with "##FILTER=<" or does not end
//  with ">", i.e. if it is not a FILTER meta line at all. Otherwise the line
//  is claimed and true is returned, whether or not its content was usable:
//  a well formed line adds an entry to m_FilterSpecs, a malformed one raises
//  a CObjReaderLineException that is handed to ProcessError().
//
//  pAnnot is not used by this function.
//  ----------------------------------------------------------------------------
bool
CVcfReader::xProcessMetaLineFilter(
    const string& line,
    CRef<CSeq_annot> pAnnot,
    IMessageListener* pEC)
//  ----------------------------------------------------------------------------
{
    const string prefix = "##FILTER=<";
    const string postfix = ">";

    if ( ! NStr::StartsWith( line, prefix ) || ! NStr::EndsWith( line, postfix ) ) {
        return false;
    }
    
    try {
        // Strip the "##FILTER=<" ... ">" wrapper and split what is left at
        // the commas. Note that this also splits at commas inside the
        // description text; only the first two pieces are looked at.
        vector<string> fields;
        string info = line.substr( 
            prefix.length(), line.length() - prefix.length() - postfix.length() );
        NStr::Tokenize( info, ",", fields );

        // The field vector may hold fewer than two entries (e.g. "<>" or
        // "<ID=q10>"). Never index past its end; a missing field simply
        // leaves its key empty and is reported like a misspelled one.
        string idKey, id;
        if ( ! fields.empty() ) {
            NStr::SplitInTwo( fields[0], "=", idKey, id );
        }
        if ( idKey != "ID" ) {
            AutoPtr<CObjReaderLineException> pErr(
                CObjReaderLineException::Create(
                eDiag_Error,
                0,
                "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"ID\".",
                ILineError::eProblem_BadFilterLine) );
            pErr->Throw();
        }

        string descriptionKey, description;
        if ( fields.size() > 1 ) {
            NStr::SplitInTwo( fields[1], "=", descriptionKey, description );
        }
        if ( descriptionKey != "Description" ) {
            AutoPtr<CObjReaderLineException> pErr(
                CObjReaderLineException::Create(
                eDiag_Error,
                0,
                "CVcfReader::xProcessMetaLineInfo: ##FILTER with bad or missing \"Description\".",
                ILineError::eProblem_BadFilterLine) );
            pErr->Throw();
        }
        m_FilterSpecs[id] = CVcfFilterSpec( id, description );        
    }
    catch (CObjReaderLineException& err) {
        ProcessError(err, pEC);
    }
    return true;
}
Beispiel #3
0
//  ---------------------------------------------------------------------------
//  Read all lines available through xGetLine() and feed them to the GTF line
//  handlers.
//
//  Each line is offered, in this order, to the track terminator check, the
//  browser line parser, the track line parser and the feature parser; the
//  first one that accepts the line ends the processing of that line.
//
//  If IsCanceled() reports true, an informational message is passed to
//  ProcessError(), annots is emptied and the function returns at once.
//
//  A CObjReaderLineException raised while handling a line is tagged with the
//  current line number and handed to ProcessError().
//  ---------------------------------------------------------------------------
void
CGtfReader::ReadSeqAnnots(
    TAnnots& annots,
    ILineReader& lr,
    ILineErrorListener* pEC)
//  ----------------------------------------------------------------------------
{
    xProgressInit(lr);

    string line;
    while (xGetLine(lr, line)) {
        if (IsCanceled()) {
            AutoPtr<CObjReaderLineException> pErr(
                CObjReaderLineException::Create(
                eDiag_Info,
                0,
                "Reader stopped by user.",
                ILineError::eProblem_ProgressInfo));
            ProcessError(*pErr, pEC);
            annots.clear();
            return;
        }
        xReportProgress(pEC);
        try {
            if (xIsTrackTerminator(line)) {
                continue;
            }
            if (x_ParseBrowserLineGff(line, m_CurrentBrowserInfo)) {
                continue;
            }
            if (xParseTrackLine(line, pEC)) {
                continue;
            }
            if (x_ParseFeatureGff(line, annots, pEC)) {
                continue;
            }
        }
        catch(CObjReaderLineException& err) {
            // Stamping the line number alone would discard the problem
            // silently; pass it on so it actually gets reported.
            err.SetLineNumber(m_uLineNumber);
            ProcessError(err, pEC);
        }
    }
}
Beispiel #4
0
//  ----------------------------------------------------------------------------
//  Read lines from lr until it reports EOF and build a single Seq-annot from
//  them.
//
//  The annot is created with an (initially empty) descriptor set and a
//  feature table; m_Meta is reset to a fresh user object typed
//  "vcf-meta-info". Every line is whitespace-trimmed and then offered to the
//  meta line, header line and data line handlers, in that order. A line that
//  none of them accepts is reported through ProcessWarning().
//  ----------------------------------------------------------------------------
CRef< CSeq_annot >
CVcfReader::ReadSeqAnnot(
    ILineReader& lr,
    IMessageListener* pEC ) 
//  ----------------------------------------------------------------------------
{
    CRef< CSeq_annot > pAnnot( new CSeq_annot );
    CRef< CAnnot_descr > pDescr( new CAnnot_descr );
    pAnnot->SetDesc( *pDescr );
    pAnnot->SetData().SetFtable();

    m_Meta.Reset( new CAnnotdesc );
    m_Meta->SetUser().SetType().SetStr( "vcf-meta-info" );

    for ( ; ! lr.AtEOF(); ) {
        ++m_uLineNumber;
        string text = *(++lr);
        NStr::TruncateSpacesInPlace( text );

        // Short-circuit evaluation keeps the handler order and makes sure
        // no further handler runs once one has accepted the line.
        const bool handled =
            xProcessMetaLine( text, pAnnot, pEC )  ||
            xProcessHeaderLine( text, pAnnot )  ||
            xProcessDataLine( text, pAnnot, pEC );
        if ( handled ) {
            continue;
        }

        // nobody wanted this line
        AutoPtr<CObjReaderLineException> pWarning(
            CObjReaderLineException::Create(
            eDiag_Warning,
            0,
            "CVcfReader::ReadSeqAnnot: Unrecognized line or record type.",
            ILineError::eProblem_GeneralParsingError) );
        ProcessWarning( *pWarning, pEC );
    }
    return pAnnot;
}
Beispiel #5
0
//  ---------------------------------------------------------------------------
//  Reduce the reference and alternative alleles of a record to the part in
//  which they actually differ.
//
//  1. If any alternative is identical to the reference, an error is passed
//     to ProcessError() and false is returned.
//  2. Leading bases shared by the reference and all alternatives are removed
//     from all of them; data.m_iPos is advanced by one per removed base.
//  3. Trailing bases shared by the reference and all alternatives are removed
//     as well; the position is not affected by this.
//
//  Returns true if the record passed step 1.
//  ---------------------------------------------------------------------------
bool
CVcfReader::xNormalizeData(
    CVcfData& data,
    IMessageListener* pEC)
//  ---------------------------------------------------------------------------
{
    const size_t altCount = data.m_Alt.size();

    // step 1: an alternative must differ from the reference
    for (size_t a = 0; a < altCount; ++a) {
        if (data.m_Alt[a] == data.m_strRef) {
            AutoPtr<CObjReaderLineException> pErr(
                CObjReaderLineException::Create(
                eDiag_Error,
                0,
                "CVcfReader::xNormalizeData: Invalid alternative.",
                ILineError::eProblem_GeneralParsingError));
            ProcessError(*pErr, pEC);
            return false;
        }
    }

    // step 2: count the leading bases every allele has in common, moving the
    // position along with each of them, then cut them off in one go
    size_t leadCount = 0;
    for ( ; leadCount < data.m_strRef.size(); ++leadCount) {
        const char refBase = data.m_strRef[leadCount];
        bool sharedByAll = true;
        for (size_t a = 0; sharedByAll  &&  a < altCount; ++a) {
            const string& alt = data.m_Alt[a];
            sharedByAll = 
                (leadCount < alt.size()  &&  alt[leadCount] == refBase);
        }
        if (!sharedByAll) {
            break;
        }
        ++data.m_iPos;
    }
    if (leadCount > 0) {
        data.m_strRef.erase(0, leadCount);
        for (size_t a = 0; a < altCount; ++a) {
            data.m_Alt[a].erase(0, leadCount);
        }
    }

    // step 3: same idea from the other end. Comparing one base per round is
    // sufficient because all bases behind it were found equal in the rounds
    // before.
    const size_t refLen = data.m_strRef.size();
    size_t tailCount = 0;
    for ( ; tailCount < refLen; ++tailCount) {
        const char refBase = data.m_strRef[refLen - 1 - tailCount];
        bool sharedByAll = true;
        for (size_t a = 0; sharedByAll  &&  a < altCount; ++a) {
            const string& alt = data.m_Alt[a];
            sharedByAll = 
                (tailCount < alt.size()  &&  
                    alt[alt.size() - 1 - tailCount] == refBase);
        }
        if (!sharedByAll) {
            break;
        }
    }
    if (tailCount > 0) {
        data.m_strRef.resize(refLen - tailCount);
        for (size_t a = 0; a < altCount; ++a) {
            data.m_Alt[a].resize(data.m_Alt[a].size() - tailCount);
        }
    }
    return true;
}
Beispiel #6
0
//  ----------------------------------------------------------------------------
//  Split a VCF data line into its columns, store them in data, normalize the
//  alleles and classify the record.
//
//  Columns are tab separated; fewer than eight columns make the function
//  return false without reporting anything. Column use:
//    0 chromosome, 1 position, 2 ids (';' separated, "." meaning none),
//    3 reference, 4 alternatives (',' separated), 5 quality ("." meaning
//    none), 6 filter, 7 info ("." meaning none, otherwise ';' separated
//    key[=value,value,...] items), 8 format keys (':' separated),
//    9 and up: per sample values (':' separated), keyed by the entry of
//    m_GenotypeHeaders at the same relative position.
//
//  Any exception raised while extracting the columns is converted into a
//  generic syntax error that is passed to ProcessError(), and false is
//  returned. False is also returned if xNormalizeData() rejects the record.
//  On success data.m_SetType is one of ST_ALL_SNV, ST_ALL_MNV, ST_ALL_INS,
//  ST_ALL_DEL or ST_MIXED and true is returned.
//  ----------------------------------------------------------------------------
bool
CVcfReader::xParseData(
    const string& line,
    CVcfData& data,
    IMessageListener* pEC)
//  ----------------------------------------------------------------------------
{
    vector<string> columns;
    NStr::Tokenize( line, "\t", columns, NStr::eMergeDelims );
    if ( columns.size() < 8 ) {
        return false;
    }

    // Sample columns are matched to m_GenotypeHeaders by index further down.
    // A line with more sample columns than there are header entries would
    // index past the end of the headers, so turn it down right here.
    if ( columns.size() > 9  &&  
            columns.size() - 9 > m_GenotypeHeaders.size() ) {
        AutoPtr<CObjReaderLineException> pErr(
            CObjReaderLineException::Create(
            eDiag_Error,
            0,
            "CVcfReader::xParseData: More genotype columns than genotype headers.",
            ILineError::eProblem_GeneralParsingError));
        ProcessError(*pErr, pEC);
        return false;
    }

    try {
        data.m_strLine = line;

        data.m_strChrom = columns[0];
        data.m_iPos = NStr::StringToInt( columns[1] );
        NStr::Tokenize( columns[2], ";", data.m_Ids, NStr::eNoMergeDelims );
        if ( (data.m_Ids.size() == 1)  &&  (data.m_Ids[0] == ".") ) {
            data.m_Ids.clear();
        }
        data.m_strRef = columns[3];
        NStr::Tokenize( columns[4], ",", data.m_Alt, NStr::eNoMergeDelims );
        if ( columns[5] != "." ) {
            // NOTE(review): raw allocation; presumably owned and released by
            // CVcfData - confirm, in particular for reused data objects.
            data.m_pdQual = new double( NStr::StringToDouble( columns[5] ) );
        }
        data.m_strFilter = columns[6];

        vector<string> infos;
        if ( columns[7] != "." ) {
            NStr::Tokenize( columns[7], ";", infos, NStr::eMergeDelims );
            for ( vector<string>::iterator it = infos.begin(); 
                it != infos.end(); ++it ) 
            {
                string key, value;
                NStr::SplitInTwo( *it, "=", key, value );
                data.m_Info[key] = vector<string>();
                NStr::Tokenize( value, ",", data.m_Info[key] );
            }
        }
        if ( columns.size() > 8 ) {
            NStr::Tokenize( columns[8], ":", data.m_FormatKeys, NStr::eMergeDelims );

            for ( size_t u=9; u < columns.size(); ++u ) {
                vector<string> values;
                NStr::Tokenize( columns[u], ":", values, NStr::eMergeDelims );
                data.m_GenotypeData[ m_GenotypeHeaders[u-9] ] = values;
            }
        }
    }
    catch ( ... ) {
        AutoPtr<CObjReaderLineException> pErr(
            CObjReaderLineException::Create(
            eDiag_Error,
            0,
            "Unable to parse given VCF data (syntax error).",
            ILineError::eProblem_GeneralParsingError));
        ProcessError(*pErr, pEC);
        return false;
    }

    if (!xNormalizeData(data, pEC)) {
        return false;
    }

    //assign set type:

    //test for all SNVs
    bool maybeAllSnv = (data.m_strRef.size() == 1);
    if (maybeAllSnv) {
        for (size_t u=0; u < data.m_Alt.size(); ++u) {
            if (data.m_Alt[u].size() != 1) {
                maybeAllSnv = false;
                break;
            }
        }
        if (maybeAllSnv) {
            data.m_SetType = CVcfData::ST_ALL_SNV;
            return true;
        }
    }

    //test for all mnvs:
    bool maybeAllMnv = true;
    size_t refSize = data.m_strRef.size();
    for (size_t u=0; u < data.m_Alt.size(); ++u) {
        if (data.m_Alt[u].size() != refSize) {
            maybeAllMnv = false;
            break;
        }
    }
    if (maybeAllMnv) {
        data.m_SetType = CVcfData::ST_ALL_MNV;
        return true;
    }

    //test for all insertions:
    bool maybeAllIns = true;
    for (size_t u=0; u < data.m_Alt.size(); ++u) {
        if (! NStr::StartsWith(data.m_Alt[u], data.m_strRef)) {
            maybeAllIns = false;
            break;
        }
    }
    if (maybeAllIns) {
        data.m_SetType = CVcfData::ST_ALL_INS;
        return true;
    }

    //test for all deletions:
    // note: even it is all deletions we are not able to process them 
    // as such because those deletions would be at different ASN1
    // locations. Hence we punt to "indel" if there is more than one
    // alternative.
    // The test does not depend on any particular alternative, so no loop is
    // needed: it holds exactly for a single, empty alternative.
    if (data.m_Alt.size() == 1  &&  data.m_Alt[0].empty()) {
        data.m_SetType = CVcfData::ST_ALL_DEL;
        return true;
    }

    data.m_SetType = CVcfData::ST_MIXED;
    return true;
}
Beispiel #7
0
//  ----------------------------------------------------------------------------
bool
CVcfReader::xAssignVariantProps(
    CVcfData& data,
    CRef<CSeq_feat> pFeat,
    IMessageListener* pEC)
//  ----------------------------------------------------------------------------
{
    typedef CVariantProperties VP;

    CVcfData::INFOS& infos = data.m_Info;
    VP& props = pFeat->SetData().SetVariation().SetVariant_prop(); 
    CVcfData::INFOS::iterator it;

    props.SetResource_link() = 0;
    props.SetGene_location() = 0;
    props.SetEffect() = 0;
    props.SetMapping() = 0;
    props.SetFrequency_based_validation() = 0;
    props.SetGenotype() = 0;
    props.SetQuality_check() = 0;

    //byte F0
    props.SetVersion() = 5;

    //superbyte F1
    it = infos.find("SLO");
    if (infos.end() != it) {
        props.SetResource_link() |= VP::eResource_link_submitterLinkout; 
        infos.erase(it);
    }
    it = infos.find("S3D");
    if (infos.end() != it) {
        props.SetResource_link() |= VP::eResource_link_has3D; 
        infos.erase(it);
    }
    it = infos.find("TPA");
    if (infos.end() != it) {
        props.SetResource_link() |= VP::eResource_link_provisional; 
        infos.erase(it);
    }
    it = infos.find("PM");
    if (infos.end() != it) {
        props.SetResource_link() |= VP::eResource_link_preserved; 
        infos.erase(it);
    }
    it = infos.find("CLN");
    if (infos.end() != it) {
        props.SetResource_link() |= VP::eResource_link_clinical; 
        infos.erase(it);
    }
    //todo: INFO ID=PMC
    it = infos.find("PMC");
    if (infos.end() != it) {
        infos.erase(it);
    }
    it = infos.find("PMID");
    if (infos.end() != it) {
        vector<string> pmids = it->second;
        for (vector<string>::const_iterator cit = pmids.begin();
            cit != pmids.end(); ++cit)
        {
            try {
                string db, tag;
                NStr::SplitInTwo(*cit, ":", db, tag);
                if (db != "PM") {
                    AutoPtr<CObjReaderLineException> pErr(
                        CObjReaderLineException::Create(
                        eDiag_Warning,
                        0,
                        "CVcfReader::xAssignVariantProps: Invalid PMID database ID.",
                        ILineError::eProblem_GeneralParsingError) );
                    ProcessWarning(*pErr, pEC);
                    continue;
                }
                CRef<CDbtag> pDbtag(new CDbtag);
                pDbtag->SetDb(db);
                pDbtag->SetTag().SetId(
                    NStr::StringToInt(tag));
                pFeat->SetDbxref().push_back(pDbtag);
            }
            catch(...) {}
        }
        infos.erase(it);
    }

    //superbyte F2
    it = infos.find("R5");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eGene_location_near_gene_5; 
        infos.erase(it);
    }
    it = infos.find("R3");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eGene_location_near_gene_3; 
        infos.erase(it);
    }
    it = infos.find("INT");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eGene_location_intron; 
        infos.erase(it);
    }
    it = infos.find("DSS");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eGene_location_donor; 
        infos.erase(it);
    }
    it = infos.find("ASS");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eGene_location_acceptor; 
        infos.erase(it);
    }
    it = infos.find("U5");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eGene_location_utr_5; 
        infos.erase(it);
    }
    it = infos.find("U3");
    if (infos.end() != it) {
        props.SetGene_location() |= CVariantProperties::eGene_location_utr_3; 
        infos.erase(it);
    }

    it = infos.find("SYN");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eEffect_synonymous; 
        infos.erase(it);
    }
    it = infos.find("NSN");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eEffect_stop_gain; 
        infos.erase(it);
    }
    it = infos.find("NSM");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eEffect_missense; 
        infos.erase(it);
    }
    it = infos.find("NSF");
    if (infos.end() != it) {
        props.SetGene_location() |= VP::eEffect_frameshift; 
        infos.erase(it);
    }

    //byte F3
    it = infos.find("WGT");
    if (infos.end() != it) {
        int weight = NStr::StringToInt( infos["WGT"][0] ); 
        switch(weight) {
        default:
            break;
        case 1:
            props.SetMap_weight() = VP::eMap_weight_is_uniquely_placed;
            infos.erase(it);
            break;
        case 2:
            props.SetMap_weight() = VP::eMap_weight_placed_twice_on_same_chrom;
            infos.erase(it);
            break;
        case 3:
            props.SetMap_weight() = VP::eMap_weight_placed_twice_on_diff_chrom;
            infos.erase(it);
            break;
        case 10:
            props.SetMap_weight() = VP::eMap_weight_many_placements;
            break;
        }
    }

    it = infos.find("ASP");
    if (infos.end() != it) {
        props.SetMapping() |= VP::eMapping_is_assembly_specific; 
        infos.erase(it);
    }
    it = infos.find("CFL");
    if (infos.end() != it) {
        props.SetMapping() |= VP::eMapping_has_assembly_conflict; 
        infos.erase(it);
    }
    it = infos.find("OTH");
    if (infos.end() != it) {
        props.SetMapping() |= VP::eMapping_has_other_snp; 
        infos.erase(it);
    }

    //byte F4
    it = infos.find("OTH");
    if (infos.end() != it) {
        props.SetMapping() |= VP::eFrequency_based_validation_above_5pct_all; 
        infos.erase(it);
    }
    it = infos.find("G5A");
    if (infos.end() != it) {
        props.SetMapping() |= VP::eFrequency_based_validation_above_5pct_1plus; 
        infos.erase(it);
    }
    it = infos.find("VLD");
    if (infos.end() != it) {
        props.SetMapping() |= VP::eFrequency_based_validation_validated; 
        infos.erase(it);
    }
    it = infos.find("MUT");
    if (infos.end() != it) {
        props.SetMapping() |= VP::eFrequency_based_validation_is_mutation; 
        infos.erase(it);
    }
    it = infos.find("GMAF");
    if (infos.end() != it) {
        props.SetAllele_frequency() = NStr::StringToDouble(infos["GMAF"][0]);
        infos.erase(it);
    }

    //byte F5
    it = infos.find("GNO");
    if (infos.end() != it) {
        props.SetGenotype() |= VP::eGenotype_has_genotypes; 
        infos.erase(it);
    }
    it = infos.find("HD");
    if (infos.end() != it) {
        props.SetResource_link() |= VP::eResource_link_genotypeKit; 
        infos.erase(it);
    }

    //byte F6
    if (infos.end() != infos.find("PH3")) {
        CRef<CDbtag> pDbtag(new CDbtag);
        pDbtag->SetDb("BioProject");
        pDbtag->SetTag().SetId(60835);
        pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
    }
    if (infos.end() != infos.find("KGPhase1")) {
        CRef<CDbtag> pDbtag(new CDbtag);
        pDbtag->SetDb("BioProject");
        pDbtag->SetTag().SetId(28889);
        pFeat->SetData().SetVariation().SetOther_ids().push_back(pDbtag);
    }

    //byte F7

    //byte F8
    //no relevant information found in VCF

    //byte F9
    it = infos.find("GCF");
    if (infos.end() != it) {
        props.SetQuality_check() |= VP::eQuality_check_genotype_conflict;
        infos.erase(it);
    }
    it = infos.find("NOV");
    if (infos.end() != it) {
        props.SetQuality_check() |= VP::eQuality_check_non_overlapping_alleles;
        infos.erase(it);
    }
    it = infos.find("WTD");
    if (infos.end() != it) {
        props.SetQuality_check() |= VP::eQuality_check_withdrawn_by_submitter;
        infos.erase(it);
    }
    it = infos.find("NOC");
    if (infos.end() != it) {
        props.SetQuality_check() |= VP::eQuality_check_contig_allele_missing;
        infos.erase(it);
    }
    return true;
}
//  ----------------------------------------------------------------------------
//  Build the location of feature from the columns of a region line:
//    fields[0]  sequence id
//    fields[1]  start
//    fields[2]  stop (optional; defaults to the start)
//    fields[3]  strand (optional; "+", "-" or ".")
//
//  Equal start and stop produce a point location, otherwise an interval.
//  Any problem (non-numeric or non-positive start, non-numeric stop, stop
//  before start, unknown strand character, malformed sequence id) is raised
//  as a CObjReaderLineException carrying the current line number.
//  ----------------------------------------------------------------------------
void CUCSCRegionReader::x_SetFeatureLocation(
    CRef<CSeq_feat>& feature,
    const vector<string>& fields )
//  ----------------------------------------------------------------------------
{
    //
    //  Note:
    //  ASN convention for specifying intervals is 0-based, first in, last in.
    //  Both the start and the stop value of the input are decremented by one
    //  before they are stored, i.e. the input is treated as 1-based, first
    //  in, last in.
    //

    CRef<CSeq_loc> location(new CSeq_loc);
    int from, to;
    from = to = -1;

    // the caller is expected to have made sure that fields[0] and fields[1]
    // exist; everything beyond that is checked here
    try {
        from = NStr::StringToInt(fields[1], NStr::fAllowCommas)-1;
    }
    catch(const std::exception&) {
        AutoPtr<CObjReaderLineException> pErr(
            CObjReaderLineException::Create(
            eDiag_Error,
            m_uLineNumber,
            "Invalid data line: Bad \"SeqStart\" value." ) );
        pErr->Throw();
    }
    if (from < 0) {
        // a start of 0 or less has no 0-based counterpart; storing the
        // negative result would turn it into a bogus position
        AutoPtr<CObjReaderLineException> pErr(
            CObjReaderLineException::Create(
            eDiag_Error,
            m_uLineNumber,
            "Invalid data line: Bad \"SeqStart\" value." ) );
        pErr->Throw();
    }
    to = from;

    if (fields.size()>2) {
        try {
            to = NStr::StringToInt(fields[2], NStr::fAllowCommas) - 1;
        }
        catch(const std::exception&) {
            AutoPtr<CObjReaderLineException> pErr(
                CObjReaderLineException::Create(
                eDiag_Error,
                m_uLineNumber,
                "Invalid data line: Bad \"SeqStop\" value.") );
            pErr->Throw();
        }
    }

    if (from == to) {
        location->SetPnt().SetPoint(from);
    }
    else if (from < to) {
        location->SetInt().SetFrom(from);
        location->SetInt().SetTo(to);
    }
    else {
        AutoPtr<CObjReaderLineException> pErr(
            CObjReaderLineException::Create(
            eDiag_Error,
            m_uLineNumber,
            "Invalid data line: \"SeqStop\" less than \"SeqStart\"." ) );
        pErr->Throw();
    }

    const size_t strand_field = 3;
    if (strand_field < fields.size()) {
        const string& strand = fields[strand_field];
        if (strand != "+"  &&  strand != "-"  &&  strand != ".") {
            AutoPtr<CObjReaderLineException> pErr(
                CObjReaderLineException::Create(
                eDiag_Error,
                m_uLineNumber,
                "Invalid data line: Invalid strand character." ) );
            pErr->Throw();
        }
        if (strand == "+") {
            location->SetStrand(eNa_strand_plus);
        }
        else if (strand == "-") {
            location->SetStrand(eNa_strand_minus);
        }
        // ".": accepted, but it does not name a strand - leave the strand
        // unset instead of recording it as minus
    }
    try
    {
        CRef<CSeq_id> id = CReadUtil::AsSeqId(fields[0], m_iFlags, false);
        location->SetId(*id);
        feature->SetLocation(*location);
    }
    catch(CSeqIdException&)
    {
        AutoPtr<CObjReaderLineException> pErr(
            CObjReaderLineException::Create(
            eDiag_Error,
            m_uLineNumber,
            "Malformed sequence id:" ) );
        pErr->Throw();
    }
    
}