// // Main // int overlapLongMain(int argc, char** argv) { parseOverlapLongOptions(argc, argv); // Open output file std::ostream* pASQGWriter = createWriter(opt::outFile); // Build and write the ASQG header ASQG::HeaderRecord headerRecord; headerRecord.setOverlapTag(opt::minOverlap); headerRecord.setErrorRateTag(opt::errorRate); headerRecord.setInputFileTag(opt::readsFile); headerRecord.setTransitiveTag(true); headerRecord.write(*pASQGWriter); // Determine which index files to use. If a target file was provided, // use the index of the target reads std::string indexPrefix; if(!opt::targetFile.empty()) indexPrefix = stripFilename(opt::targetFile); else indexPrefix = stripFilename(opt::readsFile); BWT* pBWT = new BWT(indexPrefix + BWT_EXT, opt::sampleRate); SampledSuffixArray* pSSA = new SampledSuffixArray(indexPrefix + SAI_EXT, SSA_FT_SAI); Timer* pTimer = new Timer(PROGRAM_IDENT); pBWT->printInfo(); // Read the sequence file and write vertex records for each // Also store the read names in a vector of strings ReadTable reads; SeqReader* pReader = new SeqReader(opt::readsFile, SRF_NO_VALIDATION); SeqRecord record; while(pReader->get(record)) { reads.addRead(record.toSeqItem()); ASQG::VertexRecord vr(record.id, record.seq.toString()); vr.write(*pASQGWriter); if(reads.getCount() % 100000 == 0) printf("Read %zu sequences\n", reads.getCount()); } delete pReader; pReader = NULL; BWTIndexSet index; index.pBWT = pBWT; index.pSSA = pSSA; index.pReadTable = &reads; // Make a prefix for the temporary hits files size_t n_reads = reads.getCount(); omp_set_num_threads(opt::numThreads); #pragma omp parallel for for(size_t read_idx = 0; read_idx < n_reads; ++read_idx) { const SeqItem& curr_read = reads.getRead(read_idx); printf("read %s %zubp\n", curr_read.id.c_str(), curr_read.seq.length()); SequenceOverlapPairVector sopv = KmerOverlaps::retrieveMatches(curr_read.seq.toString(), opt::seedLength, opt::minOverlap, 1 - opt::errorRate, 100, index); printf("Found %zu matches\n", 
sopv.size()); for(size_t i = 0; i < sopv.size(); ++i) { std::string match_id = reads.getRead(sopv[i].match_idx).id; // We only want to output each edge once so skip this overlap // if the matched read has a lexicographically lower ID if(curr_read.id > match_id) continue; std::string ao = ascii_overlap(sopv[i].sequence[0], sopv[i].sequence[1], sopv[i].overlap, 50); printf("\t%s\t[%d %d] ID=%s OL=%d PI:%.2lf C=%s\n", ao.c_str(), sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, match_id.c_str(), sopv[i].overlap.getOverlapLength(), sopv[i].overlap.getPercentIdentity(), sopv[i].overlap.cigar.c_str()); // Convert to ASQG SeqCoord sc1(sopv[i].overlap.match[0].start, sopv[i].overlap.match[0].end, sopv[i].overlap.length[0]); SeqCoord sc2(sopv[i].overlap.match[1].start, sopv[i].overlap.match[1].end, sopv[i].overlap.length[1]); // KmerOverlaps returns the coordinates of the overlap after flipping the reads // to ensure the strand matches. The ASQG file wants the coordinate of the original // sequencing strand. Flip here if necessary if(sopv[i].is_reversed) sc2.flip(); // Convert the SequenceOverlap the ASQG's overlap format Overlap ovr(curr_read.id, sc1, match_id, sc2, sopv[i].is_reversed, -1); ASQG::EdgeRecord er(ovr); er.setCigarTag(sopv[i].overlap.cigar); er.setPercentIdentityTag(sopv[i].overlap.getPercentIdentity()); #pragma omp critical { er.write(*pASQGWriter); } } } // Cleanup delete pReader; delete pBWT; delete pSSA; delete pASQGWriter; delete pTimer; if(opt::numThreads > 1) pthread_exit(NULL); return 0; }
// Propagation step linking the set view x to the integer view y, where y
// is the sum of the weights (from the parallel arrays elements/weights)
// of the elements contained in x.
//
// While x is unassigned, the weights of the still-undecided elements are
// used to derive a lower and an upper bound for y, and elements that are
// too heavy for the remaining capacity are excluded from x. Once x is
// assigned, y is set to the exact weight and the propagator is subsumed.
//
// Returns ES_NOFIX if the exclusion step modified x, ES_FIX otherwise;
// the GECODE_ME_CHECK macros handle a failed modification.
//
// NOTE(review): the template<class View> header of this member definition
// is not visible in this chunk - confirm it precedes this line in the file.
ExecStatus Weights<View>::propagate(Space& home, const ModEventDelta&) {
  ModEvent me = ME_SET_NONE;

  if (!x.assigned()) {
    // One slot per entry of elements: the entry's weight if the element
    // is currently in the unknown set of x, and 0 otherwise.
    const int size = elements.size();
    Region r(home);
    int* unknownWeights = r.alloc<int>(size);

    UnknownRanges<View> unknownRanges(x);
    Iter::Ranges::ToValues<UnknownRanges<View> > unknownValues(unknownRanges);
    for (int k = 0; k < size; ++k) {
      // Both sequences are walked in step; an element smaller than the
      // current unknown value (or past the end) is not unknown.
      const bool isUnknown =
        unknownValues() && !(elements[k] < unknownValues.val());
      if (!isUnknown) {
        unknownWeights[k] = 0;
        continue;
      }
      assert(elements[k] == unknownValues.val());
      unknownWeights[k] = weights[k];
      ++unknownValues;
    }

    // Ascending order: most negative weights first, heaviest last
    IntLess lessThan;
    Support::quicksort<int>(unknownWeights, size, lessThan);

    // Upper limit on how many further elements x can still take
    const int maxAdditions =
      static_cast<int>(std::min(x.unknownSize(),
                                x.cardMax() - x.glbSize()));

    // Weight contributed by the elements that are already in x
    GlbRanges<View> lowerBound(x);
    const int fixedWeight =
      weightI<GlbRanges<View> >(elements, weights, lowerBound);

    // fixedWeight plus up to maxAdditions-1 of the most negative weights.
    // This is the best case for "everything else" when testing whether a
    // single candidate element still fits below the maximum of y.
    int baseWeight = fixedWeight;
    for (int k = 0; k < maxAdditions-1 && unknownWeights[k] < 0; ++k)
      baseWeight += unknownWeights[k];

    // Lowest total: also take the next weight if it is negative as well
    int minTotal = baseWeight;
    if (maxAdditions > 0 && unknownWeights[maxAdditions-1] < 0)
      minTotal += unknownWeights[maxAdditions-1];

    // Number of elements that must still be added to reach cardMin.
    // x is not modified before the last use of this value below.
    const unsigned int stillRequired = x.cardMin() - x.glbSize();

    // If the required additions cannot all be negative (or there is no
    // negative weight at all), summing the first stillRequired sorted
    // weights may give a tighter lower bound.
    const bool requiredReachNonNegative =
      stillRequired > 0 && unknownWeights[stillRequired - 1] >= 0;
    if (requiredReachNonNegative || unknownWeights[0] >= 0) {
      int forcedTotal = fixedWeight;
      unsigned int k = 0;
      while (k < stillRequired) {
        forcedTotal += unknownWeights[k];
        ++k;
      }
      minTotal = std::max(minTotal, forcedTotal);
    }

    // Highest total: fixedWeight plus the heaviest positive weights,
    // taking at most maxAdditions of them from the end of the array
    int maxTotal = fixedWeight;
    for (int k = 0; k < maxAdditions && unknownWeights[size-k-1] > 0; ++k)
      maxTotal += unknownWeights[size-k-1];

    // Tighten y to the interval [minTotal, maxTotal]
    GECODE_ME_CHECK(y.gq(home, minTotal));
    GECODE_ME_CHECK(y.lq(home, maxTotal));

    // Remove from x every unknown element flagged as overweight for the
    // capacity left once baseWeight is accounted for
    const int remainingCapacity = y.max() - baseWeight;
    UnknownRanges<View> candidateRanges(x);
    Iter::Ranges::ToValues<UnknownRanges<View> >
      candidateValues(candidateRanges);
    OverweightValues<Iter::Ranges::ToValues<UnknownRanges<View> > >
      overweight(remainingCapacity, elements, weights, candidateValues);
    Iter::Values::ToRanges<OverweightValues<
      Iter::Ranges::ToValues<UnknownRanges<View> > > >
      overweightRanges(overweight);
    me = x.excludeI(home, overweightRanges);
    GECODE_ME_CHECK(me);
  }

  // Checked again on purpose: the exclusion above may have assigned x
  if (x.assigned()) {
    // With x fixed, y is exactly the weight of x and nothing is left to do
    GlbRanges<View> assignedSet(x);
    const int exactWeight =
      weightI<GlbRanges<View> >(elements, weights, assignedSet);
    GECODE_ME_CHECK(y.eq(home, exactWeight));
    return home.ES_SUBSUMED(*this);
  }

  if (me_modified(me))
    return ES_NOFIX;
  return ES_FIX;
}