void convertHitsToASQG(const std::string& indexPrefix, const StringVector& hitsFilenames, std::ostream* pASQGWriter) { // Load the suffix array index and the reverse suffix array index // Note these are not the full suffix arrays SuffixArray* pFwdSAI = new SuffixArray(indexPrefix + SAI_EXT); SuffixArray* pRevSAI = new SuffixArray(indexPrefix + RSAI_EXT); // Load the ReadInfoTable for the queries to look up the ID and lengths of the hits ReadInfoTable* pQueryRIT = new ReadInfoTable(opt::readsFile); // If the target file is not the query file, load its ReadInfoTable ReadInfoTable* pTargetRIT; if(!opt::targetFile.empty() && opt::targetFile != opt::readsFile) pTargetRIT = new ReadInfoTable(opt::targetFile); else pTargetRIT = pQueryRIT; bool bIsSelfCompare = pTargetRIT == pQueryRIT; // Convert the hits to overlaps and write them to the asqg file as initial edges for(StringVector::const_iterator iter = hitsFilenames.begin(); iter != hitsFilenames.end(); ++iter) { printf("[%s] parsing file %s\n", PROGRAM_IDENT, iter->c_str()); std::istream* pReader = createReader(*iter); // Read each hit sequentially, converting it to an overlap std::string line; while(getline(*pReader, line)) { size_t readIdx; size_t totalEntries; bool isSubstring; OverlapVector ov; OverlapCommon::parseHitsString(line, pQueryRIT, pTargetRIT, pFwdSAI, pRevSAI, bIsSelfCompare, readIdx, totalEntries, ov, isSubstring); for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter) { ASQG::EdgeRecord edgeRecord(*iter); edgeRecord.write(*pASQGWriter); } } delete pReader; // delete the hits file unlink(iter->c_str()); } // Deallocate data if(pTargetRIT != pQueryRIT) delete pTargetRIT; delete pFwdSAI; delete pRevSAI; delete pQueryRIT; }
// Convert a line from a hits file into a vector of overlaps and sets the flag // indicating whether the read was found to be a substring of other reads // Only the forward read table is used since we only care about the IDs and length // of the read, not the sequence, so that we don't need an explicit reverse read table void OverlapCommon::parseHitsString(const std::string& hitString, const ReadInfoTable* pQueryRIT, const ReadInfoTable* pTargetRIT, const SuffixArray* pFwdSAI, const SuffixArray* pRevSAI, bool bCheckIDs, size_t& readIdx, size_t& sumBlockSize, OverlapVector& outVector, bool& isSubstring) { OverlapVector outvec; std::istringstream convertor(hitString); sumBlockSize = 0; // Read the overlap blocks for a read size_t numBlocks; convertor >> readIdx >> isSubstring >> numBlocks; //std::cout << "<Read> idx: " << readIdx << " count: " << numBlocks << "\n"; for(size_t i = 0; i < numBlocks; ++i) { // Read the block OverlapBlock record; convertor >> record; //std::cout << "\t" << record << "\n"; // Iterate through the range and write the overlaps for(int64_t j = record.ranges.interval[0].lower; j <= record.ranges.interval[0].upper; ++j) { sumBlockSize += 1; const SuffixArray* pCurrSAI = (record.flags.isTargetRev()) ? pRevSAI : pFwdSAI; const ReadInfo& queryInfo = pQueryRIT->getReadInfo(readIdx); int64_t saIdx = j; // The index of the second read is given as the position in the SuffixArray index const ReadInfo& targetInfo = pTargetRIT->getReadInfo(pCurrSAI->get(saIdx).getID()); // Skip self alignments and non-canonical (where the query read has a lexo. higher name) if(queryInfo.id != targetInfo.id) { Overlap o = record.toOverlap(queryInfo.id, targetInfo.id, queryInfo.length, targetInfo.length); //std::cout << queryInfo.id << " " << targetInfo.id << " " << queryInfo.length << " " << targetInfo.length << "\n"; // The alignment logic above has the potential to produce duplicate alignments // To avoid this, we skip overlaps where the id of the first coord is lexo. lower than // the second or the match is a containment and the query is reversed (containments can be // output up to 4 times total). if(bCheckIDs && (o.id[0] < o.id[1] || (o.match.isContainment() && record.flags.isQueryRev()))) continue; outVector.push_back(o); } } } }
std::string parseDupHits(const StringVector& hitsFilenames, const std::string& out_prefix) { // Load the suffix array index and the reverse suffix array index // Note these are not the full suffix arrays SuffixArray* pFwdSAI = new SuffixArray(opt::prefix + SAI_EXT); SuffixArray* pRevSAI = new SuffixArray(opt::prefix + RSAI_EXT); // Load the read table to look up the lengths of the reads and their ids. // When rmduping a set of reads, the ReadInfoTable can actually be larger than the // BWT if the names of the reads are very long. Previously, when two reads // are duplicated, the read with the lexographically lower read name was chosen // to be kept. To save memory here, we break ties using the index in the ReadInfoTable // instead. This allows us to avoid loading the read names. ReadInfoTable* pRIT = new ReadInfoTable(opt::readsFile, pFwdSAI->getNumStrings(), RIO_NUMERICID); std::string outFile = out_prefix + ".fa"; std::string dupFile = out_prefix + ".dups.fa"; std::ostream* pWriter = createWriter(outFile); std::ostream* pDupWriter = createWriter(dupFile); size_t substringRemoved = 0; size_t identicalRemoved = 0; size_t kept = 0; size_t buffer_size = SequenceProcessFramework::BUFFER_SIZE; // The reads must be output in their original ordering. // The hits are in the blocks of buffer_size items. We read // buffer_size items from the first hits file, then buffer_size // from the second and so on until all the hits have been processed. size_t num_files = hitsFilenames.size(); std::vector<std::istream*> reader_vec(num_files, 0); for(size_t i = 0; i < num_files; ++i) { std::cout << "Opening " << hitsFilenames[i] << "\n"; reader_vec[i] = createReader(hitsFilenames[i]); } bool done = false; size_t currReaderIdx = 0; size_t numRead = 0; size_t numReadersDone = 0; std::string line; while(!done) { // Parse a line from the current file bool valid = getline(*reader_vec[currReaderIdx], line); ++numRead; // Deal with switching the active reader and the end of files if(!valid || numRead == buffer_size) { // Switch the reader currReaderIdx = (currReaderIdx + 1) % num_files; numRead = 0; // Break once all the readers are invalid if(!valid) { ++numReadersDone; if(numReadersDone == num_files) { done = true; break; } } } // Parse the data if(valid) { std::string id; std::string sequence; std::string hitsStr; size_t readIdx; size_t numCopies; bool isSubstring; std::stringstream parser(line); parser >> id; parser >> sequence; getline(parser, hitsStr); OverlapVector ov; OverlapCommon::parseHitsString(hitsStr, pRIT, pRIT, pFwdSAI, pRevSAI, true, readIdx, numCopies, ov, isSubstring); bool isContained = false; if(isSubstring) { ++substringRemoved; isContained = true; } else { for(OverlapVector::iterator iter = ov.begin(); iter != ov.end(); ++iter) { if(iter->isContainment() && iter->getContainedIdx() == 0) { // This read is contained by some other read ++identicalRemoved; isContained = true; break; } } } SeqItem item = {id, sequence}; std::stringstream meta; meta << id << " NumDuplicates=" << numCopies; if(isContained) { // The read's index in the sequence data base // is needed when removing it from the FM-index. // In the output fasta, we set the reads ID to be the index // and record its old id in the fasta header. std::stringstream newID; newID << item.id << ",seqrank=" << readIdx; item.id = newID.str(); // Write some metadata with the fasta record item.write(*pDupWriter, meta.str()); } else { ++kept; // Write the read item.write(*pWriter, meta.str()); } } } for(size_t i = 0; i < num_files; ++i) { delete reader_vec[i]; unlink(hitsFilenames[i].c_str()); } printf("[%s] Removed %zu substring reads\n", PROGRAM_IDENT, substringRemoved); printf("[%s] Removed %zu identical reads\n", PROGRAM_IDENT, identicalRemoved); printf("[%s] Kept %zu reads\n", PROGRAM_IDENT, kept); // Delete allocated data delete pFwdSAI; delete pRevSAI; delete pRIT; delete pWriter; delete pDupWriter; return dupFile; }