void testCluster() { int w=40; string forced_edges = "" ; map<string, string> labels; WindowsStorage windows = WindowsStorage(labels); Sequence seq = {"", "", "", "", NULL}; windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT", seq, 0, 0); windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA", seq, 0, 0); windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAG", seq, 0, 0); windows.add("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAC", seq, 0, 0); windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTT", seq, 0, 0); windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTAAA", seq, 0, 0); windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTGGG", seq, 0, 0); windows.add("TTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTCCC", seq, 0, 0); windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTT", seq, 0, 0); windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGAAAAAAAA", seq, 0, 0); windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGG", seq, 0, 0); windows.add("GGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGCCCCCCCC", seq, 0, 0); windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCTTTTTTTTTTTTTTT", seq, 0, 0); windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCAAAAAAGCTAAAAAA", seq, 0, 0); windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGTCTAGGGGG", seq, 0, 0); windows.add("CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCATGCCCCCC", seq, 0, 0); windows.sort(); list<pair <junction, size_t> > sort_clones = windows.getSortedList(); comp_matrix comp=comp_matrix(sort_clones); //create matrix using junctions comp.compare( cout, Cluster); //save matrix file comp.save("comp_matrix.data"); //reset matrix comp.del(); //create matrix using matrix file comp.load("comp_matrix.data"); //save matrix file comp.save("comp_matrix2.data"); //test clustering list <list <junction> > cluster ; //0 différence admise / taille mini 10 / 0 cluster possible cluster = comp.cluster(forced_edges, w, cout, 0, 10) ; TAP_TEST(cluster.size()==0, TEST_CLUSTER, "no cluster here (cluster.size()=" <<cluster.size() << ")" ) ; //epsilon 0// taille mini 1 / 16 cluster possible cluster = 
comp.cluster(forced_edges, w, cout, 0, 1) ; TAP_TEST(cluster.size()==16, TEST_CLUSTER, "expected 16 clusters (cluster.size()=" <<cluster.size() << ")") ; //epsilon 1// taille mini 3 / 1 cluster possible cluster = comp.cluster(forced_edges, w, cout, 1, 3) ; TAP_TEST(cluster.size()==1, TEST_CLUSTER, "expected 1 cluster (cluster.size()=" <<cluster.size() << ")") ; //epsilon 3// taille mini 3 / 2 cluster possible cluster = comp.cluster(forced_edges, w, cout, 3, 3) ; TAP_TEST(cluster.size()==2, TEST_CLUSTER, "expected 2 clusters (cluster.size()=" <<cluster.size() << ")") ; //epsilon 20 // taille mini 3 / 4 cluster possible cluster = comp.cluster(forced_edges, w, cout, 20, 3) ; TAP_TEST(cluster.size()==4, TEST_CLUSTER, "expected 4 clusters (cluster.size()=" <<cluster.size() << ")") ; //del matrix comp.del(); }
/**
 * Reads every sequence from `reads`, segments it with a KmerMultiSegmenter and
 * stores the resulting windows in a freshly allocated WindowsStorage.
 *
 * Reading stops when `reads` has no more sequences, or as soon as
 * `reads->next()` throws an invalid_argument (a warning is then printed and
 * the reads processed so far are kept).
 *
 * @param reads                     source of the reads
 * @param w                         window length passed to getJunction(); also
 *                                  the minimal read length required to keep an
 *                                  unsegmented read as a clone
 * @param windows_labels            labels handed to the WindowsStorage constructor
 * @param only_labeled_windows      when true, a segmented read is stored only if
 *                                  isInterestingJunction() accepts its junction
 * @param keep_unsegmented_as_clone when true, unsegmented reads of length >= w
 *                                  are stored with their full sequence as junction
 * @param nb_expected               forwarded to KmerMultiSegmenter
 * @param nb_reads_for_evalue       forwarded to KmerMultiSegmenter
 * @return the WindowsStorage allocated here with `new`; it is not freed in this
 *         function (presumably the caller is responsible for deleting it)
 */
WindowsStorage *WindowExtractor::extract(OnlineFasta *reads,
                                         size_t w,
                                         map<string, string> &windows_labels, bool only_labeled_windows,
                                         bool keep_unsegmented_as_clone,
                                         double nb_expected, int nb_reads_for_evalue) {
  init_stats();

  WindowsStorage *windowsStorage = new WindowsStorage(windows_labels);
  windowsStorage->setMaximalNbReadsPerWindow(max_reads_per_window);

  // Total number of bases seen so far, only used by the progress output
  unsigned long long int bp_total = 0;

  while (reads->hasNext()) {

    try {
      reads->next();
    }
    // Caught by const reference: avoids copying the exception object and
    // slicing a possible derived exception type
    catch (const invalid_argument &e) {
      cout << endl;
      cerr << WARNING_STRING << "Error in getting a new read: " << e.what() << endl;
      cerr << WARNING_STRING << "Vidjil stops the analysis here, after " << nb_reads << " reads." << endl;
      break ;
    }

    nb_reads++;

    if (out_affects) {
      *out_affects << reads->getSequence();
    }

    KmerMultiSegmenter kmseg(reads->getSequence(), multigermline, out_affects, nb_expected, nb_reads_for_evalue);

    KmerSegmenter *seg = kmseg.the_kseg ;

    // Window length threshold: a segmented read whose junction is empty
    // for this w is re-flagged as too short for the window
    junction junc ;
    if (seg->isSegmented()) {
      junc = seg->getJunction(w);
      if (!junc.size()) {
        seg->setSegmentationStatus(UNSEG_TOO_SHORT_FOR_WINDOW);
      }
    }

    int read_length = seg->getSequence().sequence.length();

    // Update stats
    stats[seg->getSegmentationStatus()].insert(read_length);

    // isSegmented() is deliberately queried again rather than cached: the
    // segmentation status may have been changed just above
    if (seg->isSegmented()) {

      // Filter: only the storage of the window is conditional,
      // the statistics below are updated for every segmented read
      if (!only_labeled_windows || windowsStorage->isInterestingJunction(junc)) {
        // Store the window
        windowsStorage->add(junc, reads->getSequence(), seg->getSegmentationStatus(), seg->segmented_germline);
      }

      // Update stats
      stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ;
      stats_reads[seg->system].addScore(read_length);

      if (out_segmented) {
        *out_segmented << *seg ; // KmerSegmenter output (V/N/J)
      }
    } else {
      if (keep_unsegmented_as_clone && (reads->getSequence().sequence.length() >= w)) {
        // Keep the unsegmented read, taking the full sequence as the junction
        windowsStorage->add(reads->getSequence().sequence, reads->getSequence(), seg->getSegmentationStatus(), seg->segmented_germline);
        stats[TOTAL_SEG_AND_WINDOW].insert(read_length) ; // TODO: rather count that in a pseudo-germline such as 'TRG!'
      }

      if (out_unsegmented) {
        *out_unsegmented << *seg ;
      }

      // Detailed unsegmented output, one stream per segmentation status.
      // Unless unsegmented_detail_full is set, the UNSEG_TOO_FEW_ZERO and
      // UNSEG_TOO_SHORT reads are left out.
      if (out_unsegmented_detail && (seg->getSegmentationStatus() >= STATS_FIRST_UNSEG)) {
        if (unsegmented_detail_full || (seg->getSegmentationStatus() != UNSEG_TOO_FEW_ZERO && seg->getSegmentationStatus() != UNSEG_TOO_SHORT))
          *out_unsegmented_detail[seg->getSegmentationStatus()] << *seg ;
      }
    }

    // Last line of detailed affects output
    if (out_affects) {
      *out_affects << "#>" << seg->label << " " << seg->getInfoLine() << endl << endl;
    }

    // Progress bar: one dot every PROGRESS_POINT reads, and a summary
    // line every PROGRESS_POINT * PROGRESS_LINE reads
    bp_total += read_length;

    if (!(nb_reads % PROGRESS_POINT)) {
      cout << "." ;

      if (!(nb_reads % (PROGRESS_POINT * PROGRESS_LINE)))
        cout << setw(10) << nb_reads / 1000 << "k reads " << fixed << setprecision(2) << setw(14) << bp_total / 1E6 << " Mbp" << endl ;

      cout.flush() ;
    }
  }
  cout << endl ;

  fillStatsClones(windowsStorage);

  return windowsStorage;
}