void align_reads(vector<pair<string, string> >& reference, string& read_file, string& sam_file, vector<reference_index>& refindex) { time_t tstrt, tbgn, tnd; time(&tstrt); /*//Can be used for analyzing the difference between LAST and NanoBLASTer ifstream fp_nano; string nano_input; string nano_file = "last_but_not_nano.txt"; char *nano = new char[nano_file.length() + 1]; strcpy(nano, nano_file.c_str()); fp_nano.open(nano, ifstream::in); unordered_map<string, int> nano_read; while(getline(fp_nano, nano_input)) { //cout << "" << nano_input << endl; nano_read[nano_input] = 1; //continue; } cout << "Total Size of Nano Read = " << nano_read.size() << endl << endl; fp_nano.close(); delete [] nano; */ ifstream fp_read; ofstream fp_sam; char *read = new char[read_file.length() + 1]; strcpy(read, read_file.c_str()); char *sam = new char[sam_file.length() + 1]; strcpy(sam, sam_file.c_str()); fp_read.open(read, ifstream::in); fp_sam.open(sam, ofstream::out | ofstream::app); string input, read_name, ref_name; string readseq, refgenome; string slash = "/"; int map = 0; int count = 0; int cant_map = 0; int invalid_count = 0; fp_csv << "cnt, red_nam, red_len, red_dir, ref_nam, ref_len, ref_pos, score, span, " << "percent, aln_len, spn_rat, aln_tim, tot_tim" << endl; getline(fp_read, input); while(!fp_read.eof()) { //int find = input.find(slash); //if(find != string::npos) // read_name = input.substr(1, find - 1); //else // read_name = input.substr(1); read_name = input.substr(1); readseq = ""; while(getline(fp_read, input)) { if(input.length() == 0) continue; if(input.at(0) == '>') break; readseq += input; } //getline(fp_read, input); //getline(fp_read, input); //ratio problem with channel_46_read_98_1406145606_2D //if(read_name.find("channel_407_read_0_1405093831_2D") == std::string::npos)//to optimize the output //if(read_name.find("channel_17_read_24_1405524767_2D") == std::string::npos)//small read to optimize //if(read_name.find("channel_201_read_10_1405541481_2D") == std::string::npos)//to compare version 1 and 2 //if(read_name.find("channel_64_read_7_1403826200_template") == std::string::npos)//max length reads analysis //if(read_name.find("channel_424_read_1_1403566249_template") == std::string::npos)//found in last but not in nano //if(read_name.find("2D") == std::string::npos)//03-09-2015 //if(read_name.find("channel_237_read_42_1406145606_2D") == std::string::npos) //if(read_name.find("channel_322_read_11_1405524767_template") == std::string::npos) //if(read_name.find("channel_171_read_2_1403855963_2D") == std::string::npos)//20 times higher than last //if(read_name.find("channel_82_read_0_1403855963_2D") == std::string::npos)//20 times higher than last //if(read_name.find("channel_221_read_19_1406145606_2D") == std::string::npos)//has maximul length of deletion //if(read_name.find("channel_415_read_6_1406242409_template") == std::string::npos)//has 5 times less length than last //if(read_name.find("channel_167_read_19_1403811400_2D") == std::string::npos)//analyze output validity //if(read_name.find("channel_474_read_32_1405524767_template") == std::string::npos)//found in last and nano repeat //if(read_name.find("channel_468_read_12_1403811400_complement") == std::string::npos)//cause exception in nano repeat //if(read_name.find("channel_345_read_7_1403811400_2D") == std::string::npos)//max length increased //if(read_name.find("channel_104_read_1_1403551548_template") == std::string::npos)//different in edit not lis //if(read_name.find("channel_216_read_0_1403551548_template") == std::string::npos)//different in lis not edit //if(read_name.find("channel_118_read_6_1403551548_template") == std::string::npos)//different in lis and edit //if(read_name.find("channel_486_read_0_1403566249_template") == std::string::npos)//reverse problem //if(nano_read.find(read_name) == nano_read.end()) //if(read_name.find("channel_352_read_34_1405541481_template") == std::string::npos)//Why there are multiple results //if(read_name.find("channel_68_read_22_1405541481_template") == std::string::npos)//multiple results, boundary problem //if(read_name.find("channel_261_read_39_1405541481_template") == std::string::npos)//multiple result indexing //if(read_name.find("channel_302_read_2_1403855963_2D") == std::string::npos)//found in mms not in ssg = align length //if(read_name.find("channel_243_read_0_1403595798_template") == std::string::npos)//found in 40655 not in lis+edit //if(read_name.find("channel_452_read_46_1405541481_template") == std::string::npos)//same problem as above //if(read_name.find("channel_431_read_2_1403915857_template") == std::string::npos)//require top 40 tuple list to solve //readseq = readseq.substr(readseq.length() / 2, readseq.length() - readseq.length() / 2); //if(read_name.find("channel_199_read_0_1403841073_template") == std::string::npos)//solved //if(read_name.find("channel_480_read_91_1406242409_template") == std::string::npos)//in last and not in nano //if(read_name.find("channel_389_read_57_1406242409_template") == std::string::npos)//solved //if(read_name.find("channel_56_read_1_1403826200_template") == std::string::npos) //if(read_name.find("channel_356_read_29_1406242409_template") == std::string::npos)// < 40 in nano very weird //if(read_name.find("channel_131_read_5_1403826200_template") == string::npos)// < 100 in nano seems weird //if(read_name.find("channel_75_read_80_1406145606_template") == std::string::npos)//80% last not found now solved // continue; if(count >= MAXREAD && MAXREAD != 0) break; count += 1; //if(count < 11762) continue; cout << count << ") " << read_name << endl; if(readseq.length() < MINREADLEN || readseq.length() > MAXREADLEN)//03-09-2015 { cout << "Invalid String Found" << endl; invalid_count += 1; count -= 1; fp_sam << read_name << "\t4\t*\t0\t0\t*\t*\t0\t0\t" << readseq << "\t*" << endl; time(&tnd); //fp_csv << count << ", " << readseq.length() << ", 0, 0, 0, " << // "0, 0, 0, 0, 0, 0, 0, " << difftime(tnd, tstrt) << endl; continue; } if(count <= MINREAD) continue; //if(count < 318) continue; time(&tbgn); if(DEBUG == 99) fp_csv << count << ", " << read_name << ", " << readseq.length() << ", "; upper_case(readseq); //reverse_str(readseq); //readseq = reverse_complement(readseq); int match_info, global_match = -1, indpos; int match, max_match = 0, match_index, dir; vector<vector<string> > list_final_result; //time_t start, end; //clock_t t_start, t_end; //for(int i = 0; i < reference.size(); i++) { //vector<pair<int, pair<int, int> > > kmer_ref; vector<pair<int, vector<pair<int, int> > > > kmer_ref; //cout << "Analysis for forward:" << endl; // //time(&start); //t_start = clock(); read_vs_reference(readseq, read_name, FF, refindex, kmer_ref); //t_end = clock(); //time(&end); //cout << "Total time taken for calling forward read_vs_ref = " << difftime(end, start) << endl; //t_lookup += t_end - t_start; //align(readseq, read_name, FF, refindex, kmer_ref, final_result); //cout << "Data for reverse:" << endl; //time(&start); //t_start = clock(); if(SINGLE == 1) { string reverse = reverse_complement(readseq); read_vs_reference(reverse, read_name, FR, refindex, kmer_ref); } //t_end = clock(); //time(&end); //cout << "Total time taken for calling reverse read_vs_ref = " << difftime(end, start) << endl; //t_lookup += t_end - t_start; //cout << endl <<endl; //uncomment here for aligninng read list_final_result.clear(); align(readseq, read_name, FR, refindex, kmer_ref, list_final_result); if(list_final_result.size() == 0) { cant_map += 1; kmer_ref.clear(); //time(&tnd); //fp_csv << difftime(tnd, tbgn) << ", " << difftime(tnd, tstrt) << endl; fp_sam << read_name << "\t4\t*\t0\t0\t*\t*\t0\t0\t" << readseq << "\t*" << endl; continue; } kmer_ref.clear(); } for(int i = 0; i < list_final_result.size(); i++) { vector<string>& final_result = list_final_result[i]; fp_sam << final_result[0]; for(int k = 1; k < final_result.size(); k++) { fp_sam << "\t" << final_result[k]; //cout << i << ": " << output[k] << endl; } fp_sam << endl; map += 1; final_result.clear(); } /* if(list_final_result.size() == 0 && SAM_FORMAT == 1) { fp_sam << read_name << "\t4\t*\t0\t0\t*\t*\t0\t0\t" << readseq << "\t*" << endl; } */ //time(&tnd); list_final_result.clear(); if(DEBUG == 99) fp_csv << endl;//difftime(tnd, tbgn) << ", " << difftime(tnd, tstrt) << endl; //cout << "\nTime taken to process " << count << "th read = " << difftime(tnd, tstrt) << "\n" << endl; //break; } cout << endl; cout << "Overall Statistics - " << endl; cout << "total reference size = " << reference.size() << endl << endl; cout << "Total read = " << count << endl << endl; cout << "Total read mapped = " << map << endl << endl; cout << "Total unmapped read = " << cant_map << endl << endl; cout << "Out of range read (< 100 or > 15000) = " << invalid_count << endl << endl; //cout << "Total MAX_MATCHED (= " << MAX_MATCHED << ") Reads = " << MAX_SCORED << endl << endl; fp_read.close(); fp_sam.close(); delete [] read; delete [] sam; }
// kmer rank REQUIRE( kmer_rank("AAAAA", 5) == 0 ); REQUIRE( kmer_rank("GATGA", 5) == 568 ); REQUIRE( kmer_rank("TTTTT", 5) == 1023 ); REQUIRE( kmer_rank("GATGA", 5) == rc_kmer_rank("TCATC", 5 ) ); // lexicographic increment std::string str = "AAAAA"; lexicographic_next(str); REQUIRE( str == "AAAAC" ); str = "AAAAT"; lexicographic_next(str); REQUIRE( str == "AAACA" ); // complement, reverse complement REQUIRE( complement('A') == 'T' ); REQUIRE( reverse_complement("GATGA") == "TCATC" ); } TEST_CASE( "math", "[math]") { GaussianParameters params; params.mean = 4; params.stdv = 2; params.log_stdv = log(params.stdv); REQUIRE( normal_pdf(2.25, params) == Approx(0.1360275) ); REQUIRE( log_normal_pdf(2.25, params) == Approx(log(normal_pdf(2.25, params))) ); }