/* 'ReadBuffer::flush' writes the current contents of the buffer to the designated output file, and then clears the contents to be ready to receive the next reads. */ void ReadBuffer::flush() { // std::cout << "in flush " << std::endl; #pragma omp parallel for for (int i=0; i<m_currentsize ; i++ ) { // std::cout << "before GetCloseEnd " << std::endl; GetCloseEnd(m_CHROMOSOME, m_rawreads[i]); // std::cout << "after GetCloseEnd " << std::endl; if (m_rawreads[i].hasCloseEnd()) { //if (m_rawreads[i].Name == "@DD7DT8Q1:4:1106:17724:13906#GTACCT/1") { // std::cout << "m_rawreads[i].hasCloseEnd()" << std::endl; //} updateReadAfterCloseEndMapping(m_rawreads[i]); #pragma omp critical m_filteredReads.push_back(m_rawreads[i]); } else { //if (m_rawreads[i].Name == "@DD7DT8Q1:4:1106:17724:13906#GTACCT/1") { // std::cout << "m_rawreads[i] no close end" << std::endl; //} #pragma omp critical m_OneEndMappedReads.push_back(m_rawreads[i]); } } //std::cout << "end of flush " << std::endl; m_rawreads.clear(); m_currentsize = 0; //std::cout << "existing flush " << std::endl; }
/* 'ReadBuffer::flush' writes the current contents of the buffer to the designated output file, and then clears the contents to be ready to receive the next reads. */ void ReadBuffer::flush() { // std::cout << "in flush " << std::endl; #pragma omp parallel for for (int i=0; i<m_currentsize ; i++ ) { // std::cout << "before GetCloseEnd " << std::endl; //std::map<std::string, unsigned>::iterator it = g_ReadSeq2Index.find(m_rawreads[i].UnmatchedSeq); //if (it == g_ReadSeq2Index.end()) { //if (m_rawreads[i].MapperSplit ) { // std::cout << "skip close end search" << std::endl; //} if (m_rawreads[i].MapperSplit == false) GetCloseEnd(m_CHROMOSOME, m_rawreads[i]); if (m_rawreads[i].hasCloseEnd()) { updateReadAfterCloseEndMapping(m_rawreads[i]); #pragma omp critical { // g_ReadSeq2Index.insert(std::pair<std::string, unsigned> (m_rawreads[i].UnmatchedSeq, m_filteredReads.size())); m_rawreads[i].SampleName2Number.insert(std::pair <std::string, unsigned> (m_rawreads[i].Tag, 1)); m_filteredReads.push_back(m_rawreads[i]); } } // else { //if (m_rawreads[i].Name == "@DD7DT8Q1:4:1106:17724:13906#GTACCT/1") { // std::cout << "m_rawreads[i] no close end" << std::endl; //} //#pragma omp critical //m_OneEndMappedReads.push_back(m_rawreads[i]); // } } //else { // SampleName2Number std::map <std::string, unsigned> SampleName2Number; // #pragma omp critical // { // unsigned ReadIndex = it -> second; // m_filteredReads[ReadIndex] // std::map <std::string, unsigned>::iterator it_SampleName = m_filteredReads[ReadIndex].SampleName2Number.find(m_rawreads[i].Tag); // // if (it_SampleName == m_filteredReads[ReadIndex].SampleName2Number.end()) { // //std::cout << "adding " << m_rawreads[i].Tag << "\t1" << std::endl; // m_filteredReads[ReadIndex].SampleName2Number.insert(std::pair <std::string, unsigned> (m_rawreads[i].Tag, 1)); // } // else { // // it_SampleName -> second++; // //std::cout << "increasing " << m_rawreads[i].Tag << "\t" << it_SampleName -> second << std::endl; // } // } //} // std::cout << "after GetCloseEnd " << std::endl; } //std::cout << "end of flush " << std::endl; m_rawreads.clear(); m_currentsize = 0; //std::cout << "existing flush " << std::endl; }
void CombineReads(const std::string & CurrentChrSeq, const char & Strand, const std::vector <SPLIT_READ> & input_reads, const std::vector <unsigned int> & Index_Of_Useful_Reads, std::vector <SPLIT_READ> & output_reads) { //std::cout << "start of CombineReads" << std::endl; std::string Spacer = ""; unsigned Max_ReadLength = 0; unsigned Max_AssembledLength = 0; unsigned Min_LeftMostPos = input_reads[Index_Of_Useful_Reads[0]].LeftMostPos; SPLIT_READ output_one_read;// = input_reads[Index_Of_Useful_Reads[0]]; unsigned Index2Read4Copy = 0; //unsigned Min_Close_Size = 10000; for (unsigned ReadIndex = 0; ReadIndex < Index_Of_Useful_Reads.size(); ReadIndex++) { //if (input_reads[Index_Of_Useful_Reads[ReadIndex]].UP_Close.size() < Min_Close_Size) { // Min_Close_Size = input_reads[Index_Of_Useful_Reads[ReadIndex]].UP_Close.size(); // Index2Read4Copy = ReadIndex; // input_reads[Index_Of_Useful_Reads[Index2Read4Copy]] //} //std::cout << Strand << " " << input_reads[Index_Of_Useful_Reads[ReadIndex]].UP_Close.size() << std::endl; //std::cout << input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos << " " << input_reads[Index_Of_Useful_Reads[ReadIndex]].UnmatchedSeq << std::endl; if (input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos < (int)Min_LeftMostPos) Min_LeftMostPos = input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos; if (input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength() > (short)Max_ReadLength) Max_ReadLength = input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength(); if (input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos + (unsigned)input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength() > Max_AssembledLength) Max_AssembledLength = input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos + input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength(); } Max_AssembledLength = Max_AssembledLength - Min_LeftMostPos; if ((float)Max_AssembledLength < Max_ReadLength * 1.3) return; std::cout << "Max_AssembledLength " << Max_AssembledLength << std::endl; for (unsigned ReadIndex = 0; ReadIndex < Index_Of_Useful_Reads.size(); ReadIndex++) { Spacer.clear(); if (Strand == '+') { for (unsigned SpacerIndex = 0; SpacerIndex < Max_AssembledLength + Min_LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength(); SpacerIndex++) Spacer += " "; //std::cout << Spacer << (input_reads[Index_Of_Useful_Reads[ReadIndex]].UnmatchedSeq) << std::endl; } else { for (unsigned SpacerIndex = 0; SpacerIndex < input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - Min_LeftMostPos; SpacerIndex++) Spacer += " "; //std::cout << Spacer << (input_reads[Index_Of_Useful_Reads[ReadIndex]].UnmatchedSeq) << std::endl; } } if (Strand == '+') { // UnmatchedSeq std::cout << "+ Max_ReadLength " << Max_ReadLength << "\t" << "Min_LeftMostPos " << Min_LeftMostPos << "\nref: \n" << ReverseComplement(CurrentChrSeq.substr(Min_LeftMostPos, Max_AssembledLength)) << std::endl; } else if (Strand == '-') { std::cout << "- Max_ReadLength " << Max_ReadLength << "\t" << "Min_LeftMostPos " << Min_LeftMostPos << "\nref: \n" << CurrentChrSeq.substr(Min_LeftMostPos, Max_AssembledLength) << std::endl; } unsigned Count[5][Max_AssembledLength]; float Ratio[5][Max_AssembledLength]; for (short i = 0; i < 5; i++) { for (unsigned j = 0; j < Max_AssembledLength; j++) { Count[i][j] = 0; Ratio[i][j] = 0.0; } } if (Strand == '+') { for (unsigned ReadIndex = 0; ReadIndex < Index_Of_Useful_Reads.size(); ReadIndex++) { //std::cout << std::endl; for (short BaseIndex = 0; BaseIndex < input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength(); BaseIndex++) { //std::cout << input_reads[Index_Of_Useful_Reads[ReadIndex]].UnmatchedSeq[BaseIndex]; switch (input_reads[Index_Of_Useful_Reads[ReadIndex]].getUnmatchedSeq()[BaseIndex]) { case 'A': Count[0][Max_AssembledLength + Min_LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength() + BaseIndex]++; break; // 00000000 case 'C': Count[1][Max_AssembledLength + Min_LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength() + BaseIndex]++; break; // 00010000 case 'G': Count[2][Max_AssembledLength + Min_LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength() + BaseIndex]++; break; // 00100000 case 'T': Count[3][Max_AssembledLength + Min_LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength() + BaseIndex]++; break; // 00110000 default: Count[4][Max_AssembledLength + Min_LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength() + BaseIndex]++; // 01000000 } } } } else if (Strand == '-') { for (unsigned ReadIndex = 0; ReadIndex < Index_Of_Useful_Reads.size(); ReadIndex++) { //std::cout << std::endl; for (short BaseIndex = 0; BaseIndex < input_reads[Index_Of_Useful_Reads[ReadIndex]].getReadLength(); BaseIndex++) { //std::cout << input_reads[Index_Of_Useful_Reads[ReadIndex]].UnmatchedSeq[BaseIndex]; switch (input_reads[Index_Of_Useful_Reads[ReadIndex]].getUnmatchedSeq()[BaseIndex]) { case 'A': Count[0][input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - Min_LeftMostPos + BaseIndex]++; break; // 00000000 case 'C': Count[1][input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - Min_LeftMostPos + BaseIndex]++; break; // 00010000 case 'G': Count[2][input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - Min_LeftMostPos + BaseIndex]++; break; // 00100000 case 'T': Count[3][input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - Min_LeftMostPos + BaseIndex]++; break; // 00110000 default: Count[4][input_reads[Index_Of_Useful_Reads[ReadIndex]].LeftMostPos - Min_LeftMostPos + BaseIndex]++; // 01000000 } } } } float Sum; for (unsigned PosIndex = 0; PosIndex < Max_AssembledLength; PosIndex++) { Sum = Count[0][PosIndex] + Count[1][PosIndex] + Count[2][PosIndex] + Count[3][PosIndex] + Count[4][PosIndex]; //std::cout << Count[0][PosIndex] << " " << Count[1][PosIndex] << " " << Count[2][PosIndex] << " " << Count[3][PosIndex] << " " << Count[4][PosIndex] << std::endl; for (unsigned BaseIndex = 0; BaseIndex < 5; BaseIndex++) { Ratio[BaseIndex][PosIndex] = Count[BaseIndex][PosIndex] / Sum; } } std::string OutputOneStr = ""; const float RatioCutoff = 0.66; unsigned Max_Base_Count = 0; short Max_Base_Count_Index = -1; for (unsigned PosIndex = 0; PosIndex < Max_AssembledLength; PosIndex++) { if (Ratio[0][PosIndex] > RatioCutoff) { OutputOneStr += "A"; continue; } if (Ratio[1][PosIndex] > RatioCutoff) { OutputOneStr += "C"; continue; } if (Ratio[2][PosIndex] > RatioCutoff) { OutputOneStr += "G"; continue; } if (Ratio[3][PosIndex] > RatioCutoff) { OutputOneStr += "T"; continue; } for (short BaseIndex = 0; BaseIndex < 4; BaseIndex++) { if (Count[BaseIndex][PosIndex] > Max_Base_Count && Count[BaseIndex][PosIndex] >= 3) Max_Base_Count_Index = BaseIndex; } //if (Max_Base_Count_Index != -1) { switch (Max_Base_Count_Index) { case 0: OutputOneStr += "A"; break; case 1: OutputOneStr += "C"; break; case 2: OutputOneStr += "G"; break; case 3: OutputOneStr += "T"; break; case -1: OutputOneStr += "N"; break; default: break; } //} //else OutputOneStr += "N"; } if (Strand == '+') { std::cout << "Final merged string +: original\n" << (OutputOneStr) << std::endl; std::cout << "Final merged string +: convert to ref\n" << ReverseComplement(OutputOneStr) << std::endl; } else { std::cout << "Final merged string -: original\n" << (OutputOneStr) << std::endl; std::cout << "Final merged string : convert to ref\n" << (OutputOneStr) << std::endl; }//std::cout << "Final merged string: -\n" << (OutputOneStr) << std::endl; //std::cout << "here1" << std::endl; unsigned Count_N = 0; for (unsigned pos_index = 0; pos_index < OutputOneStr.size(); pos_index++) { if (OutputOneStr[pos_index] == 'N') Count_N++; } if ((float)Count_N < OutputOneStr.size() * 0.05) { unsigned Min_Close_Size = 10000; Index2Read4Copy = 0; // if the best one cannot be found due to N or whatever reasons, use the first read as the template for copy. //std::cout << "Original Index2Read4Copy: " << Index2Read4Copy << std::endl; for (unsigned ReadIndex = 0; ReadIndex < Index_Of_Useful_Reads.size(); ReadIndex++) { if (input_reads[Index_Of_Useful_Reads[ReadIndex]].UP_Close.size() < Min_Close_Size && OutputOneStr.find(input_reads[Index_Of_Useful_Reads[ReadIndex]].getUnmatchedSeq()) !=std::string::npos) { // quick fix here: need more work Min_Close_Size = input_reads[Index_Of_Useful_Reads[ReadIndex]].UP_Close.size(); Index2Read4Copy = ReadIndex; // input_reads[Index_Of_Useful_Reads[Index2Read4Copy]] //std::cout << "Changed Index2Read4Copy: " << Index2Read4Copy << std::endl; } } //std::cout << "here2" << std::endl; output_one_read = input_reads[Index_Of_Useful_Reads[Index2Read4Copy]]; //std::cout << "here2a" << std::endl; output_one_read.setUnmatchedSeq( OutputOneStr ); //update std::map <std::string, int> ReadCountPerSample; GetReadCountPerSample(input_reads, Index_Of_Useful_Reads, output_one_read); //std::cout << "here2b" << std::endl; //std::cout << "Before: " << output_one_read.UP_Close.size() << std::endl; output_one_read.UP_Close.clear(); //std::cout << "here3" << std::endl; output_one_read.Thickness = Index_Of_Useful_Reads.size(); //std::cout << "Thickness " << output_one_read.Thickness << std::endl; GetCloseEnd(CurrentChrSeq, output_one_read); //std::cout << "After: " << output_one_read.UP_Close.size() << std::endl; output_reads.push_back(output_one_read); //std::cout << "here4" << std::endl; //std::cout << "end of CombineReads" << std::endl; } }