indexData intern_readIndex(string filename){ string line; ifstream myFile; indexData toReturn; toReturn.mlindex1=0; toReturn.mlindex2=0; bool isFirstLine =true; //initialize the values for the likelihood of matches or mismatches for(int i=0;i<64;i++){ if(i == 0) likeMatch[i] = -3.0; // this is vrong, hope it's never accessed else likeMatch[i] = log1p( -pow(10.0,i/-10.0) )/log(10); likeMismatch[i] = i/-10.0; #ifdef DEBUG2 cout<<"qual = "<<i<<endl; cout<<likeMatch[i]<<endl; cout<<likeMismatch[i]<<endl; #endif } //reading the files // myFile.open(filename.c_str(), ios::in); // if (myFile.is_open()){ vector<string> allLinesIndex = allTokens(filename,'\n'); //while ( getline (myFile,line)){ for(unsigned int i=0;i<allLinesIndex.size();i++){ line = allLinesIndex[i]; if(line.empty()) continue; line+=' '; // cerr<<"line #"<<line<<"#"<<toReturn.isDoubleIndex<<endl; if(isFirstLine){ if(line[0] == '#'){ unsigned int i=0; int numberOfFields=0; bool inWS=true; while(i<line.length()){ if( isspace(line[i])){ inWS=true; }else{ if(inWS){ numberOfFields++; } inWS=false; } i++; } if(numberOfFields==2){ toReturn.isDoubleIndex=false; }else{ if(numberOfFields==3 || numberOfFields==5){ toReturn.isDoubleIndex=true; }else{ cerr << "Must have 2, 3 or 5 fields"<<endl; exit(1); } } }else{ cerr << "First line must begin with #"<<endl; exit(1); } isFirstLine=false; }else{ int i=0; int fieldIndex=0; bool inWS=false; int lastOneNW=0; string foundName; while(i<int(line.length())){ if( isspace(line[i]) && i==0){ cerr<<line<<endl; cerr << "First character cannot be a space"<<endl; exit(1); } if( isspace(line[i]) ){ if(!inWS){ //found a field //first field, first index if(fieldIndex==0){ toReturn.indices1.push_back(toUpperCase(line.substr(lastOneNW,i-lastOneNW))); if(toReturn.mlindex1 < (i-lastOneNW)){ toReturn.mlindex1 =(i-lastOneNW); } }else{ //second field, either name of single ind or second index if(fieldIndex==1){ if(toReturn.isDoubleIndex){ toReturn.indices2.push_back(toUpperCase(line.substr(lastOneNW,i-lastOneNW))); if(toReturn.mlindex2 < (i-lastOneNW)){ toReturn.mlindex2 =(i-lastOneNW); } }else{ foundName=line.substr(lastOneNW,i-lastOneNW); //duplicated names ? if(toReturn.namesMap.find( foundName ) != toReturn.namesMap.end()){ cerr<<"Warning: The sequence name is duplicated "<<foundName<<endl; //exit(1); }else{ toReturn.namesMap[ foundName ] = ""; } toReturn.names.push_back( foundName ); } }else if(fieldIndex==2){ //sequence name when two indices if(toReturn.isDoubleIndex){ //duplicated names foundName=line.substr(lastOneNW,i-lastOneNW); if(toReturn.namesMap.find( foundName ) != toReturn.namesMap.end()){ cerr<<"Warning: The sequence name is duplicated "<<foundName<<endl; //exit(1); }else{ toReturn.namesMap[ foundName ] = ""; } toReturn.names.push_back( foundName ); }else{ //it's a comment for single index toReturn.namesMap[ foundName ] += line.substr(lastOneNW,i-lastOneNW); // cerr<<"Single index file cannot have 3 fields"<<endl; // exit(1); } }else{ //it's a comment again toReturn.namesMap[ foundName ] += line.substr(lastOneNW,i-lastOneNW); } } fieldIndex++; } inWS=true; }else{ if(inWS) lastOneNW=i; inWS=false; } i++; } //ending while(i<line.length()){ } // ending else firstline } // ending while myFile.good() ){ //checking for size // cout<<toReturn.indices1.size()<<endl; // cout<<toReturn.indices2.size()<<endl; // cout<<toReturn.names.size()<<endl; if(toReturn.isDoubleIndex) if((toReturn.indices1.size() != toReturn.indices2.size()) ){ cerr << "Size of the fields inconsistent "<<filename<<endl; exit(1); } if(toReturn.indices1.size() != toReturn.names.size() ){ cerr << "Size of the fields inconsistent "<<filename<<endl; exit(1); } //checking for valid dna for(unsigned int i=0;i<toReturn.indices1.size();i++){ if(!isValidDNA(toReturn.indices1[i])){ cerr << "Index " << toReturn.indices1[i] <<" is not a valid DNA sequence"<<endl; exit(1); } if(toReturn.isDoubleIndex) if(!isValidDNA(toReturn.indices2[i])){ cerr << "Index " << toReturn.indices2[i] <<" is not a valid DNA sequence"<<endl; exit(1); } } return toReturn; }
void VerticesExtractor::call_RAY_SLAVE_MODE_ADD_EDGES(){ MACRO_COLLECT_PROFILING_INFORMATION(); if(this->m_outbox==NULL){ m_rank=m_parameters->getRank(); this->m_mode=m_mode; this->m_outbox=m_outbox; this->m_outboxAllocator=m_outboxAllocator; } if(m_finished){ return; } if(!m_checkedCheckpoint){ m_checkedCheckpoint=true; if(m_parameters->hasCheckpoint("GenomeGraph")){ cout<<"Rank "<<m_parameters->getRank()<<": checkpoint GenomeGraph exists, not extracting vertices."<<endl; Message aMessage(NULL,0,MASTER_RANK,RAY_MPI_TAG_VERTICES_DISTRIBUTED,m_parameters->getRank()); m_outbox->push_back(aMessage); m_finished=true; return; } } #ifdef ASSERT assert(m_pendingMessages>=0); #endif if(m_pendingMessages!=0){ return; } if(m_mode_send_vertices_sequence_id%10000==0 &&m_mode_send_vertices_sequence_id_position==0 &&m_mode_send_vertices_sequence_id<(int)m_myReads->size()){ string reverse=""; if(m_reverseComplementVertex==true){ reverse="(reverse complement) "; } printf("Rank %i is adding edges %s[%i/%i]\n",m_parameters->getRank(),reverse.c_str(),(int)m_mode_send_vertices_sequence_id+1,(int)m_myReads->size()); fflush(stdout); m_derivative.addX(m_mode_send_vertices_sequence_id); m_derivative.printStatus(SLAVE_MODES[RAY_SLAVE_MODE_ADD_EDGES],RAY_SLAVE_MODE_ADD_EDGES); m_derivative.printEstimatedTime(m_myReads->size()); } if(m_mode_send_vertices_sequence_id==(int)m_myReads->size()){ MACRO_COLLECT_PROFILING_INFORMATION(); // flush data flushAll(m_outboxAllocator,m_outbox,m_parameters->getRank()); if(m_pendingMessages==0){ #ifdef ASSERT assert(m_bufferedDataForIngoingEdges.isEmpty()); assert(m_bufferedDataForOutgoingEdges.isEmpty()); #endif Message aMessage(NULL,0, MASTER_RANK, RAY_MPI_TAG_VERTICES_DISTRIBUTED,m_parameters->getRank()); m_outbox->push_back(aMessage); m_finished=true; printf("Rank %i is adding edges [%i/%i] (completed)\n",m_parameters->getRank(),(int)m_mode_send_vertices_sequence_id,(int)m_myReads->size()); fflush(stdout); m_bufferedDataForIngoingEdges.showStatistics(m_parameters->getRank()); m_bufferedDataForOutgoingEdges.showStatistics(m_parameters->getRank()); m_derivative.writeFile(&cout); } }else{ MACRO_COLLECT_PROFILING_INFORMATION(); /* * Decode the DNA sequence * and store it in a local buffer. */ if(m_mode_send_vertices_sequence_id_position==0){ (*m_myReads)[(m_mode_send_vertices_sequence_id)]->getSeq(m_readSequence,m_parameters->getColorSpaceMode(),false); //cout<<"DEBUG Read="<<*m_mode_send_vertices_sequence_id<<" color="<<m_parameters->getColorSpaceMode()<<" Seq= "<<m_readSequence<<endl; } int len=strlen(m_readSequence); if(len<m_parameters->getWordSize()){ m_hasPreviousVertex=false; (m_mode_send_vertices_sequence_id)++; (m_mode_send_vertices_sequence_id_position)=0; return; } MACRO_COLLECT_PROFILING_INFORMATION(); char memory[1000]; int maximumPosition=len-m_parameters->getWordSize()+1; #ifdef ASSERT assert(m_readSequence!=NULL); #endif int p=(m_mode_send_vertices_sequence_id_position); memcpy(memory,m_readSequence+p,m_parameters->getWordSize()); memory[m_parameters->getWordSize()]='\0'; MACRO_COLLECT_PROFILING_INFORMATION(); if(isValidDNA(memory)){ MACRO_COLLECT_PROFILING_INFORMATION(); Kmer currentForwardKmer=wordId(memory); /* TODO: possibly don't flush k-mer that are not lower. not sure it that would work though. -Seb */ /* * previousForwardKmer -> currentForwardKmer * previousReverseKmer <- currentReverseKmer */ /* * Push the kmer */ MACRO_COLLECT_PROFILING_INFORMATION(); if(m_hasPreviousVertex){ MACRO_COLLECT_PROFILING_INFORMATION(); // outgoing edge // PreviousVertex(*) -> CurrentVertex Rank outgoingRank=m_parameters->_vertexRank(&m_previousVertex); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,m_previousVertex.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,currentForwardKmer.getU64(i)); } if(m_bufferedDataForOutgoingEdges.flush(outgoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_OUT_EDGES_DATA,m_outboxAllocator,m_outbox,m_parameters->getRank(),false)){ m_pendingMessages++; } // ingoing edge // PreviousVertex -> CurrentVertex(*) Rank ingoingRank=m_parameters->_vertexRank(¤tForwardKmer); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,m_previousVertex.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,currentForwardKmer.getU64(i)); } if(m_bufferedDataForIngoingEdges.flush(ingoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_IN_EDGES_DATA,m_outboxAllocator,m_outbox,m_parameters->getRank(),false)){ m_pendingMessages++; } MACRO_COLLECT_PROFILING_INFORMATION(); } // reverse complement // Kmer currentReverseKmer=currentForwardKmer. complementVertex(m_parameters->getWordSize(),m_parameters->getColorSpaceMode()); if(m_hasPreviousVertex){ MACRO_COLLECT_PROFILING_INFORMATION(); // outgoing edge // Rank outgoingRank=m_parameters->_vertexRank(¤tReverseKmer); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,currentReverseKmer.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,m_previousVertexRC.getU64(i)); } MACRO_COLLECT_PROFILING_INFORMATION(); if(m_bufferedDataForOutgoingEdges.flush(outgoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_OUT_EDGES_DATA,m_outboxAllocator,m_outbox,m_parameters->getRank(),false)){ m_pendingMessages++; } MACRO_COLLECT_PROFILING_INFORMATION(); // ingoing edge Rank ingoingRank=m_parameters->_vertexRank(&m_previousVertexRC); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,currentReverseKmer.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,m_previousVertexRC.getU64(i)); } MACRO_COLLECT_PROFILING_INFORMATION(); if(m_bufferedDataForIngoingEdges.flush(ingoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_IN_EDGES_DATA,m_outboxAllocator,m_outbox,m_parameters->getRank(),false)){ m_pendingMessages++; } MACRO_COLLECT_PROFILING_INFORMATION(); } // there is a previous vertex. m_hasPreviousVertex=true; m_previousVertex=currentForwardKmer; m_previousVertexRC=currentReverseKmer; }else{ m_hasPreviousVertex=false; } MACRO_COLLECT_PROFILING_INFORMATION(); (m_mode_send_vertices_sequence_id_position++); if((m_mode_send_vertices_sequence_id_position)==maximumPosition){ m_hasPreviousVertex=false; (m_mode_send_vertices_sequence_id)++; (m_mode_send_vertices_sequence_id_position)=0; } } MACRO_COLLECT_PROFILING_INFORMATION(); }
void VerticesExtractor::process(int*m_mode_send_vertices_sequence_id, ArrayOfReads*m_myReads, bool*m_reverseComplementVertex, int rank, StaticVector*m_outbox, bool*m_mode_send_vertices, int wordSize, int size, RingAllocator*m_outboxAllocator, int*m_mode ){ if(this->m_outbox==NULL){ m_rank=rank; this->m_mode=m_mode; this->m_outbox=m_outbox; this->m_outboxAllocator=m_outboxAllocator; } #ifdef ASSERT assert(m_pendingMessages>=0); #endif if(m_pendingMessages!=0){ return; } if(m_finished){ return; } if(*m_mode_send_vertices_sequence_id%100000==0 &&m_mode_send_vertices_sequence_id_position==0 &&*m_mode_send_vertices_sequence_id<(int)m_myReads->size()){ string reverse=""; if(*m_reverseComplementVertex==true){ reverse="(reverse complement) "; } printf("Rank %i is computing vertices & edges %s[%i/%i]\n",rank,reverse.c_str(),(int)*m_mode_send_vertices_sequence_id+1,(int)m_myReads->size()); fflush(stdout); } if(*m_mode_send_vertices_sequence_id>(int)m_myReads->size()-1){ // flush data flushAll(m_outboxAllocator,m_outbox,rank); if(m_pendingMessages==0){ #ifdef ASSERT assert(m_bufferedData.isEmpty()); assert(m_bufferedDataForIngoingEdges.isEmpty()); assert(m_bufferedDataForOutgoingEdges.isEmpty()); #endif Message aMessage(NULL,0, MASTER_RANK, RAY_MPI_TAG_VERTICES_DISTRIBUTED,rank); m_outbox->push_back(aMessage); m_finished=true; printf("Rank %i is computing vertices & edges [%i/%i] (completed)\n",rank,(int)*m_mode_send_vertices_sequence_id,(int)m_myReads->size()); fflush(stdout); m_bufferedData.showStatistics(m_parameters->getRank()); m_bufferedDataForIngoingEdges.showStatistics(m_parameters->getRank()); m_bufferedDataForOutgoingEdges.showStatistics(m_parameters->getRank()); } }else{ if(m_mode_send_vertices_sequence_id_position==0){ (*m_myReads)[(*m_mode_send_vertices_sequence_id)]->getSeq(m_readSequence,m_parameters->getColorSpaceMode(),false); //cout<<"DEBUG Read="<<*m_mode_send_vertices_sequence_id<<" color="<<m_parameters->getColorSpaceMode()<<" Seq= "<<m_readSequence<<endl; } int len=strlen(m_readSequence); if(len<wordSize){ m_hasPreviousVertex=false; (*m_mode_send_vertices_sequence_id)++; (m_mode_send_vertices_sequence_id_position)=0; return; } char memory[1000]; int lll=len-wordSize+1; #ifdef ASSERT assert(m_readSequence!=NULL); #endif int p=(m_mode_send_vertices_sequence_id_position); memcpy(memory,m_readSequence+p,wordSize); memory[wordSize]='\0'; if(isValidDNA(memory)){ Kmer a=wordId(memory); int rankToFlush=0; rankToFlush=m_parameters->_vertexRank(&a); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedData.addAt(rankToFlush,a.getU64(i)); } if(m_bufferedData.flush(rankToFlush,KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_VERTICES_DATA,m_outboxAllocator,m_outbox,rank,false)){ m_pendingMessages++; } if(m_hasPreviousVertex){ // outgoing edge int outgoingRank=m_parameters->_vertexRank(&m_previousVertex); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,m_previousVertex.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,a.getU64(i)); } if(m_bufferedDataForOutgoingEdges.needsFlushing(outgoingRank,2*KMER_U64_ARRAY_SIZE)){ if(m_bufferedData.flush(outgoingRank,KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_VERTICES_DATA,m_outboxAllocator,m_outbox,rank,true)){ m_pendingMessages++; } } if(m_bufferedDataForOutgoingEdges.flush(outgoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_OUT_EDGES_DATA,m_outboxAllocator,m_outbox,rank,false)){ m_pendingMessages++; } // ingoing edge int ingoingRank=m_parameters->_vertexRank(&a); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,m_previousVertex.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,a.getU64(i)); } if(m_bufferedDataForIngoingEdges.needsFlushing(ingoingRank,2*KMER_U64_ARRAY_SIZE)){ if(m_bufferedData.flush(ingoingRank,KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_VERTICES_DATA,m_outboxAllocator,m_outbox,rank,true)){ m_pendingMessages++; } } if(m_bufferedDataForIngoingEdges.flush(ingoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_IN_EDGES_DATA,m_outboxAllocator,m_outbox,rank,false)){ m_pendingMessages++; } } // reverse complement Kmer b=complementVertex(&a,wordSize,m_parameters->getColorSpaceMode()); rankToFlush=m_parameters->_vertexRank(&b); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedData.addAt(rankToFlush,b.getU64(i)); } if(m_bufferedData.flush(rankToFlush,KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_VERTICES_DATA,m_outboxAllocator,m_outbox,rank,false)){ m_pendingMessages++; } if(m_hasPreviousVertex){ // outgoing edge int outgoingRank=m_parameters->_vertexRank(&b); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,b.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForOutgoingEdges.addAt(outgoingRank,m_previousVertexRC.getU64(i)); } if(m_bufferedDataForOutgoingEdges.needsFlushing(outgoingRank,2*KMER_U64_ARRAY_SIZE)){ if(m_bufferedData.flush(outgoingRank,1*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_VERTICES_DATA,m_outboxAllocator,m_outbox,rank,true)){ m_pendingMessages++; } } if(m_bufferedDataForOutgoingEdges.flush(outgoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_OUT_EDGES_DATA,m_outboxAllocator,m_outbox,rank,false)){ m_pendingMessages++; } // ingoing edge int ingoingRank=m_parameters->_vertexRank(&m_previousVertexRC); for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,b.getU64(i)); } for(int i=0;i<KMER_U64_ARRAY_SIZE;i++){ m_bufferedDataForIngoingEdges.addAt(ingoingRank,m_previousVertexRC.getU64(i)); } if(m_bufferedDataForIngoingEdges.needsFlushing(ingoingRank,2*KMER_U64_ARRAY_SIZE)){ if(m_bufferedData.flush(ingoingRank,1*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_VERTICES_DATA,m_outboxAllocator,m_outbox,rank,true)){ m_pendingMessages++; } } if(m_bufferedDataForIngoingEdges.flush(ingoingRank,2*KMER_U64_ARRAY_SIZE,RAY_MPI_TAG_IN_EDGES_DATA,m_outboxAllocator,m_outbox,rank,false)){ m_pendingMessages++; } } // there is a previous vertex. m_hasPreviousVertex=true; m_previousVertex=a; m_previousVertexRC=b; }else{ m_hasPreviousVertex=false; } (m_mode_send_vertices_sequence_id_position++); if((m_mode_send_vertices_sequence_id_position)==lll){ m_hasPreviousVertex=false; (*m_mode_send_vertices_sequence_id)++; (m_mode_send_vertices_sequence_id_position)=0; } } }