void convertHitsToASQG(const std::string& indexPrefix, const StringVector& hitsFilenames, std::ostream* pASQGWriter) { // Load the suffix array index and the reverse suffix array index // Note these are not the full suffix arrays SuffixArray* pFwdSAI = new SuffixArray(indexPrefix + SAI_EXT); SuffixArray* pRevSAI = new SuffixArray(indexPrefix + RSAI_EXT); // Load the ReadInfoTable for the queries to look up the ID and lengths of the hits ReadInfoTable* pQueryRIT = new ReadInfoTable(opt::readsFile); // If the target file is not the query file, load its ReadInfoTable ReadInfoTable* pTargetRIT; if(!opt::targetFile.empty() && opt::targetFile != opt::readsFile) pTargetRIT = new ReadInfoTable(opt::targetFile); else pTargetRIT = pQueryRIT; bool bIsSelfCompare = pTargetRIT == pQueryRIT; // Convert the hits to overlaps and write them to the asqg file as initial edges for(StringVector::const_iterator iter = hitsFilenames.begin(); iter != hitsFilenames.end(); ++iter) { printf("[%s] parsing file %s\n", PROGRAM_IDENT, iter->c_str()); std::istream* pReader = createReader(*iter); // Read each hit sequentially, converting it to an overlap std::string line; while(getline(*pReader, line)) { size_t readIdx; size_t totalEntries; bool isSubstring; OverlapVector ov; OverlapCommon::parseHitsString(line, pQueryRIT, pTargetRIT, pFwdSAI, pRevSAI, bIsSelfCompare, readIdx, totalEntries, ov, isSubstring); for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter) { ASQG::EdgeRecord edgeRecord(*iter); edgeRecord.write(*pASQGWriter); } } delete pReader; // delete the hits file unlink(iter->c_str()); } // Deallocate data if(pTargetRIT != pQueryRIT) delete pTargetRIT; delete pFwdSAI; delete pRevSAI; delete pQueryRIT; }
//
// Load the ASQG file `filename` into a newly allocated StringGraph.
//
// Records are expected in the order: header, then vertices, then edges.
// A record that appears out of order prints an error naming its line
// (counted from 1) to stderr and terminates the program.
//
// filename          ASQG file to read (opened through createReader)
// minOverlap        edges whose minimum overlap length is below this value
//                   are not added to the graph
// allowContainments forwarded to SGAlgorithms::createEdgesFromOverlap
//
// Returns the graph allocated here with new; it is not freed by this
// function.
//
StringGraph* SGUtil::loadASQG(const std::string& filename, const unsigned int minOverlap, bool allowContainments)
{
    // Initialize graph
    StringGraph* pGraph = new StringGraph;

    std::istream* pReader = createReader(filename);

    // stage tracks which section of the file we are in:
    // 0 = header, 1 = vertices, 2 = edges
    int stage = 0;
    int line = 0;
    std::string recordLine;
    while(getline(*pReader, recordLine))
    {
        // Advance the counter before handling the record so that the
        // diagnostics below report 1-based line numbers.
        ++line;

        // Records whose type matches none of the cases below are skipped
        ASQG::RecordType rt = ASQG::getRecordType(recordLine);
        switch(rt)
        {
            case ASQG::RT_HEADER:
            {
                if(stage != 0)
                {
                    std::cerr << "Error: Unexpected header record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::HeaderRecord headerRecord(recordLine);

                // Minimum overlap defaults to 0 when the tag is absent
                const SQG::IntTag& overlapTag = headerRecord.getOverlapTag();
                if(overlapTag.isInitialized())
                    pGraph->setMinOverlap(overlapTag.get());
                else
                    pGraph->setMinOverlap(0);

                // The error rate is only set when the tag is present
                const SQG::FloatTag& errorRateTag = headerRecord.getErrorRateTag();
                if(errorRateTag.isInitialized())
                    pGraph->setErrorRate(errorRateTag.get());

                const SQG::IntTag& containmentTag = headerRecord.getContainmentTag();
                if(containmentTag.isInitialized())
                    pGraph->setContainmentFlag(containmentTag.get());
                else
                    pGraph->setContainmentFlag(true); // conservatively assume containments are present

                // A missing transitive tag is reported and treated as true
                const SQG::IntTag& transitiveTag = headerRecord.getTransitiveTag();
                if(!transitiveTag.isInitialized())
                {
                    std::cerr << "Warning: ASQG does not have transitive tag\n";
                    pGraph->setTransitiveFlag(true);
                }
                else
                {
                    pGraph->setTransitiveFlag(transitiveTag.get());
                }
                break;
            }
            case ASQG::RT_VERTEX:
            {
                // progress the stage if we are done the header
                if(stage == 0)
                    stage = 1;

                if(stage != 1)
                {
                    std::cerr << "Error: Unexpected vertex record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::VertexRecord vertexRecord(recordLine);
                const SQG::IntTag& ssTag = vertexRecord.getSubstringTag();

                // The vertex is placement-constructed using the graph's allocator
                Vertex* pVertex = new(pGraph->getVertexAllocator()) Vertex(vertexRecord.getID(), vertexRecord.getSeq());
                if(ssTag.isInitialized() && ssTag.get() == 1)
                {
                    // Vertex is a substring of some other vertex, mark it as contained
                    pVertex->setContained(true);
                    pGraph->setContainmentFlag(true);
                }
                pGraph->addVertex(pVertex);
                break;
            }
            case ASQG::RT_EDGE:
            {
                // progress the stage once the vertex section has ended
                if(stage == 1)
                    stage = 2;

                if(stage != 2)
                {
                    std::cerr << "Error: Unexpected edge record found at line " << line << "\n";
                    exit(EXIT_FAILURE);
                }

                ASQG::EdgeRecord edgeRecord(recordLine);
                const Overlap& ovr = edgeRecord.getOverlap();

                // Add the edge to the graph
                if(ovr.match.getMinOverlapLength() >= static_cast<int>(minOverlap))
                    SGAlgorithms::createEdgesFromOverlap(pGraph, ovr, allowContainments);
                break;
            }
        }
    }

    // Remove any duplicate edges
    SGDuplicateVisitor dupVisit;
    pGraph->visit(dupVisit);

    SGGraphStatsVisitor statsVisit;
    pGraph->visit(statsVisit);

    // Remove identical vertices
    // This is much cheaper to do than remove via
    // SGContainRemove as no remodelling needs to occur
    /*
    SGIdenticalRemoveVisitor irv;
    pGraph->visit(irv);

    // Remove substring vertices
    while(pGraph->hasContainment())
    {
        SGContainRemoveVisitor crv;
        pGraph->visit(crv);
    }
    */

    delete pReader;
    return pGraph;
}