Пример #1
0
void convertHitsToASQG(const std::string& indexPrefix, const StringVector& hitsFilenames, std::ostream* pASQGWriter)
{
    // Load the suffix array index and the reverse suffix array index
    // Note these are not the full suffix arrays
    SuffixArray* pFwdSAI = new SuffixArray(indexPrefix + SAI_EXT);
    SuffixArray* pRevSAI = new SuffixArray(indexPrefix + RSAI_EXT);

    // Load the ReadInfoTable for the queries to look up the ID and lengths of the hits
    ReadInfoTable* pQueryRIT = new ReadInfoTable(opt::readsFile);

    // If the target file is not the query file, load its ReadInfoTable
    ReadInfoTable* pTargetRIT;
    if(!opt::targetFile.empty() && opt::targetFile != opt::readsFile)
        pTargetRIT = new ReadInfoTable(opt::targetFile);
    else
        pTargetRIT = pQueryRIT;

    bool bIsSelfCompare = pTargetRIT == pQueryRIT;

    // Convert the hits to overlaps and write them to the asqg file as initial edges
    for(StringVector::const_iterator iter = hitsFilenames.begin(); iter != hitsFilenames.end(); ++iter)
    {
        printf("[%s] parsing file %s\n", PROGRAM_IDENT, iter->c_str());
        std::istream* pReader = createReader(*iter);
    
        // Read each hit sequentially, converting it to an overlap
        std::string line;
        while(getline(*pReader, line))
        {
            size_t readIdx;
            size_t totalEntries;
            bool isSubstring;
            OverlapVector ov;
            OverlapCommon::parseHitsString(line, pQueryRIT, pTargetRIT, pFwdSAI, pRevSAI, bIsSelfCompare, readIdx, totalEntries, ov, isSubstring);
            for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter)
            {
                ASQG::EdgeRecord edgeRecord(*iter);
                edgeRecord.write(*pASQGWriter);
            }
        }
        delete pReader;

        // delete the hits file
        unlink(iter->c_str());
    }

    // Deallocate data
    if(pTargetRIT != pQueryRIT)
        delete pTargetRIT;
    delete pFwdSAI;
    delete pRevSAI;
    delete pQueryRIT;
}
Пример #2
0
// Build a StringGraph from the ASQG file named by filename.
//
// Records are expected in three consecutive sections: header, vertex, edge.
// A header, vertex or edge record that appears outside its section prints an
// error (with the 1-based line number of the offending record) to std::cerr
// and terminates the process with EXIT_FAILURE. Records of any other type
// are skipped.
//
// filename          - path handed to createReader
// minOverlap        - edges whose overlap's minimum overlap length is below
//                     this value are not added to the graph
// allowContainments - forwarded to SGAlgorithms::createEdgesFromOverlap
//
// Returns a graph allocated with new; this function does not free it, so the
// caller is expected to take ownership.
StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigned int minOverlap, 
                              bool allowContainments)
{
    // Initialize graph
    StringGraph* pGraph = new StringGraph;

    std::istream* pReader = createReader(filename);

    // The comparison below is done in int, so convert the threshold once
    const int minOverlapLength = static_cast<int>(minOverlap);

    // stage tracks the section being read: 0 = header, 1 = vertices, 2 = edges
    int stage = 0;
    int line = 0;
    std::string recordLine;
    while(getline(*pReader, recordLine))
    {
        // Count the line before parsing it so that the diagnostics below
        // report 1-based line numbers, matching what editors display
        ++line;

        ASQG::RecordType rt = ASQG::getRecordType(recordLine);
        switch(rt)
        {
            case ASQG::RT_HEADER:
            {
                // Headers are only valid before the first vertex record
                if(stage != 0)
                {
                    std::cerr << "Error: Unexpected header record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::HeaderRecord headerRecord(recordLine);
                const SQG::IntTag& overlapTag = headerRecord.getOverlapTag();
                if(overlapTag.isInitialized())
                    pGraph->setMinOverlap(overlapTag.get());
                else
                    pGraph->setMinOverlap(0);

                // The error rate is only set when the header provides it
                const SQG::FloatTag& errorRateTag = headerRecord.getErrorRateTag();
                if(errorRateTag.isInitialized())
                    pGraph->setErrorRate(errorRateTag.get());

                const SQG::IntTag& containmentTag = headerRecord.getContainmentTag();
                if(containmentTag.isInitialized())
                    pGraph->setContainmentFlag(containmentTag.get());
                else
                    pGraph->setContainmentFlag(true); // conservatively assume containments are present

                // A missing transitive tag is treated as "transitive edges present"
                const SQG::IntTag& transitiveTag = headerRecord.getTransitiveTag();
                if(!transitiveTag.isInitialized())
                {
                    std::cerr << "Warning: ASQG does not have transitive tag\n";
                    pGraph->setTransitiveFlag(true);
                }
                else
                {
                    pGraph->setTransitiveFlag(transitiveTag.get());
                }

                break;
            }
            case ASQG::RT_VERTEX:
            {
                // The first vertex record ends the header section
                if(stage == 0)
                    stage = 1;

                if(stage != 1)
                {
                    std::cerr << "Error: Unexpected vertex record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::VertexRecord vertexRecord(recordLine);
                const SQG::IntTag& ssTag = vertexRecord.getSubstringTag();

                // Vertices are constructed in storage provided by the graph's allocator
                Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(vertexRecord.getID(), vertexRecord.getSeq());
                if(ssTag.isInitialized() && ssTag.get() == 1)
                {
                    // Vertex is a substring of some other vertex, mark it as contained
                    pVertex->setContained(true);
                    pGraph->setContainmentFlag(true);
                }
                pGraph->addVertex(pVertex);
                break;
            }
            case ASQG::RT_EDGE:
            {
                // The first edge record ends the vertex section. An edge seen
                // while still in the header section leaves stage at 0 and is
                // rejected by the check below.
                if(stage == 1)
                    stage = 2;

                if(stage != 2)
                {
                    std::cerr << "Error: Unexpected edge record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::EdgeRecord edgeRecord(recordLine);
                const Overlap& ovr = edgeRecord.getOverlap();

                // Add the edge to the graph, unless the overlap is too short
                if(ovr.match.getMinOverlapLength() >= minOverlapLength)
                    SGAlgorithms::createEdgesFromOverlap(pGraph, ovr, allowContainments);
                break;
            }
        }
    }

    // The input is fully consumed; release the reader before the graph passes
    delete pReader;

    // Remove any duplicate edges
    SGDuplicateVisitor dupVisit;
    pGraph->visit(dupVisit);

    SGGraphStatsVisitor statsVisit;
    pGraph->visit(statsVisit);

    // Disabled: removal of identical and substring vertices.
    // Remove identical vertices
    // This is much cheaper to do than remove via
    // SGContainRemove as no remodelling needs to occur
   /*
    SGIdenticalRemoveVisitor irv;
    pGraph->visit(irv);

    // Remove substring vertices
    while(pGraph->hasContainment())
    {
        SGContainRemoveVisitor crv;
        pGraph->visit(crv);
    }
*/
    return pGraph;
}