int Application::go(int argc,char *argv[]) { // Process command line CommandLine cmd(argc,argv,""); if(cmd.numArgs()!=3) throw "\n\ threshold <in.schema> <input-dir> <threshold>\n\ \n\ "; String schemaFile=cmd.arg(0); String inDir=cmd.arg(1); threshold=cmd.arg(2).asDouble(); // Misc. initialization cout.precision(10); cerr.precision(10); BOOM::catchFloatOverflow(); // Load the training set schema=Schema(schemaFile); D=schema.getNumContinuous(); trainingSet.load(inDir,schema); // Apply thresholding to the data applyThreshold(); // Save the results trainingSet.save(); return 0; }
size_t total_length( SequenceSet const& set ) { return std::accumulate( set.begin(), set.end(), 0, [] (size_t c, Sequence const& s) { return c + s.length(); } ); }
void filter_max_sequence_length( SequenceSet& set, size_t max_length ) { size_t index = 0; while( index < set.size() ) { if( set.at(index).length() > max_length ) { set.remove(index); } else { ++index; } } }
void filter_min_max_sequence_length( SequenceSet& set, size_t min_length, size_t max_length ) { size_t index = 0; while( index < set.size() ) { auto len = set.at(index).length(); if( len < min_length || len > max_length ) { set.remove(index); } else { ++index; } } }
void SessionHandler::expected(const SequenceSet& commands, const Array& /*fragments*/) { checkAttached(); if (getState()->hasState()) { // Replay if (commands.empty()) throw IllegalStateException( QPID_MSG(getState()->getId() << ": has state but client is attaching as new session.")); // TODO aconway 2008-05-12: support replay of partial commands. // Here we always round down to the last command boundary. SessionPoint expectedPoint = commands.empty() ? SequenceNumber(0) : SessionPoint(commands.front(),0); SessionState::ReplayRange replay = getState()->senderExpected(expectedPoint); sendCommandPoint(expectedPoint); std::for_each(replay.begin(), replay.end(), out); // replay } else sendCommandPoint(getState()->senderGetCommandPoint()); }
utils::Bitvector gap_sites( SequenceSet const& set, std::string const& gap_chars ) { // Edge case. if( set.size() == 0 ) { return utils::Bitvector(); } // Init result bitvector to all true. Then, for every site that is not a gap, reset to false. auto result = utils::Bitvector( set[0].length(), true ); // Init lookup array. auto lookup = utils::CharLookup<bool>( false ); lookup.set_selection_upper_lower( gap_chars, true ); // Process all sequences in the set. for( auto const& seq : set ) { if( seq.length() != result.size() ) { throw std::runtime_error( "Cannot calculate gap_sites() if SequenceSet is not an alignment." ); } // Process the sites of the sequence. If it is not a gap, set it to false in the bitvector. // This way, only sites that are all-gap will remain with value `true` in the end. for( size_t i = 0; i < seq.length(); ++i ) { if( ! lookup[ seq[ i ] ] ) { result.set( i, false ); } } } return result; }
void SessionHandler::flush(bool expected, bool confirmed, bool completed) { checkAttached(); if (expected) { SequenceSet expectSet; if (getState()->hasState()) expectSet.add(getState()->receiverGetExpected().command); peer.expected(expectSet, Array()); } if (confirmed) { SequenceSet confirmSet; if (!getState()->receiverGetUnknownComplete().empty()) confirmSet.add(getState()->receiverGetUnknownComplete().front(), getState()->receiverGetReceived().command); peer.confirmed(confirmSet, Array()); } if (completed) peer.completed(getState()->receiverGetUnknownComplete(), true); }
TEST( Sequence, PhylipReaderAaSequential ) { // Skip test if no data availabe. NEEDS_TEST_DATA; // Load sequence file. std::string infile = environment->data_dir + "sequence/aa_3_384_s.phylip"; SequenceSet sset; PhylipReader() .label_length( 10 ) .valid_chars( amino_acid_codes_all() ) .read( from_file(infile), sset); // Check data. EXPECT_EQ( 3, sset.size() ); EXPECT_EQ( 384, sset[0].length() ); EXPECT_EQ( "CATH_HUMAN", sset[2].label() ); EXPECT_EQ( "G-AVTPVKNQ", sset[0].sites().substr( 160, 10 ) ); }
TEST( Sequence, PhylipReaderDnaSequential ) { // Skip test if no data availabe. NEEDS_TEST_DATA; // Load sequence file. std::string infile = environment->data_dir + "sequence/dna_5_42_s.phylip"; SequenceSet sset; PhylipReader() .label_length( 10 ) .valid_chars( nucleic_acid_codes_all() ) .read( from_file(infile), sset); // Check data. EXPECT_EQ( 5, sset.size() ); EXPECT_EQ( 42, sset[0].length() ); EXPECT_EQ( "Salmo gair", sset[1].label() ); EXPECT_EQ( "CGGGCACGGT", sset[1].sites().substr( 30, 10 ) ); }
TEST( Sequence, PhylipReaderDnaInterleaved ) { // Skip test if no data availabe. NEEDS_TEST_DATA; // Load sequence file. std::string infile = environment->data_dir + "sequence/dna_5_42_i.phylip"; SequenceSet sset; PhylipReader() .label_length( 10 ) .mode( PhylipReader::Mode::kInterleaved ) .valid_chars( nucleic_acid_codes_all() ) .read( from_file(infile), sset); // Check data. EXPECT_EQ( 5, sset.size() ); EXPECT_EQ( 42, sset[0].length() ); EXPECT_EQ( "Turkey", sset[0].label() ); EXPECT_EQ( "AAGCTNGGGC", sset[0].sites().substr( 0, 10 ) ); }
void computeMIonCPU(SequenceSet& sequences, Matrix<float>& MI) { const int numChars = NUMPROTEINCHARS; const int sequenceLength = sequences.getSequenceLength(); const int numSequences = sequences.getNumberOfSequences(); const double epsilon=1e-6; timeval start, end; gettimeofday(&start, 0); for (int k = 0; k < LOOPCOUNT; k++) { //iterate over all column combinations for (int j = 0; j < sequenceLength; j++) { for (int i = 0; i <= j; i++) { //absolute number of occurrences of character pairs x,y: N_ij(x,y) int twoPointOccs[numChars][numChars]; memset(twoPointOccs, 0, sizeof(twoPointOccs)); //iterate through all sequences and compute two-point occurrences for (int seq = 0; seq < numSequences; seq++) twoPointOccs[sequences.getData(seq, i)][sequences.getData(seq, j)]++; /* puts("===START==="); for (int m=0; m<numChars; m++) { for (int n=0; n<numChars; n++) printf("%d %d: %d\n", m, n, twoPointOccs[m][n]); puts(""); } puts("===STOP ==="); */ double MI_ij = 0; //sum over all x and y for (int x = 0; x < numChars; x++) { if (sequences.getOnePointProb(x, i) < epsilon) continue; for (int y = 0; y < numChars; y++) { if (sequences.getOnePointProb(y, j) < epsilon || twoPointOccs[x][y] == 0) continue; double p_ij_xy = double(twoPointOccs[x][y]) / double(numSequences); MI_ij += p_ij_xy * log2(p_ij_xy / (sequences.getOnePointProb(x, i) * sequences.getOnePointProb(y, j))); } } MI.set(i, j, MI_ij); } } } gettimeofday(&end, 0); std::cout << "execution time: " << (end.tv_sec - start.tv_sec ) * 1000 + ( end.tv_usec - start.tv_usec) / 1000 << " milliseconds" << std::endl; }
void Application::applyThreshold() { const int numSeqs=trainingSet.size(); for(int i=0 ; i<numSeqs ; ++i) { EmissionSequence &S=*trainingSet[i]; const int L=S.length(); for(int pos=0 ; pos<L ; ++pos) { Emission &e=S[pos]; GSL::Vector &V=e.getContinuous(); for(int d=0 ; d<D ; ++d) V[d]=V[d]>=threshold ? threshold : 0; } } }
void Results::completed(const SequenceSet& set) { //call complete on those listeners whose ids fall within the set Listeners::iterator i = listeners.begin(); while (i != listeners.end()) { if (set.contains(i->first)) { i->second->completed(); listeners.erase(i++); } else { i++; } } }
bool is_alignment( SequenceSet const& set ) { if( set.size() == 0 ) { return true; } size_t length = set[0].length(); for( auto& s : set ) { if( s.length() != length ) { return false; } } return true; }
void parse_seqs_file (SequenceSet& allSeqs, int& numSeq, char* fname) { ifstream seq_file(fname); string tmp_str; while (getline(seq_file, tmp_str)) { int seq_len = tmp_str.size(); Sequence ht_tmp_seq (seq_len+1+1, 0); ht_tmp_seq[0] = '*'; for(int i = 0; i < seq_len; i ++) ht_tmp_seq[i+1] = tmp_str.at(i); ht_tmp_seq[seq_len+1] = '#'; allSeqs.push_back(ht_tmp_seq); ++ numSeq; } seq_file.close(); }
void writeClusterView( string fpath , SequenceSet& allModelSeqs, SequenceSet& allDataSeqs ){ int numSeq = allDataSeqs.size(); SequenceSet allCOSeqs (numSeq, Sequence(0)); vector<int> pos(numSeq, 0); while (true) { set<int> insertion_ids; for (int i = 0; i < numSeq; i ++) { if (pos[i] >= allModelSeqs[i].size()) continue; if (allModelSeqs[i][pos[i]] == '-') insertion_ids.insert(i); } if (insertion_ids.size() != 0) { // insertion exists for (int i = 0; i < numSeq; i ++) { if (insertion_ids.find(i)==insertion_ids.end()) // not in set allCOSeqs[i].push_back('-'); else { // in set allCOSeqs[i].push_back(allDataSeqs[i][pos[i]++]); } } } else { // no insertion for (int i = 0; i < numSeq; i ++) allCOSeqs[i].push_back(allDataSeqs[i][pos[i]++]); } // terminating bool terminated = true; for (int i = 0; i < numSeq; i ++) if (pos[i] != allModelSeqs[i].size()) { terminated = false; break; } if (terminated) break; } ofstream co_out (fpath.c_str()); for (int i = 0; i < numSeq; i ++) { for (int j = 0; j < allCOSeqs[i].size(); j++) { co_out << allCOSeqs[i][j]; } co_out << endl; } co_out.close(); }
int Application::go(int argc,char *argv[]) { // Process command line CommandLine cmd(argc,argv,""); if(cmd.numArgs()!=5) throw "\n\ hack-known-sites [options] <training-dir> <schema.txt> <signal-name> <margin-around-peak> <motifs>\n\ "; String dir=cmd.arg(0); String schemaFile=cmd.arg(1); String signalName=cmd.arg(2); int margin=cmd.arg(3).asInt(); String motifs=cmd.arg(4); // Misc. initialization cout.precision(10); cerr.precision(10); BOOM::catchFloatOverflow(); BOOM::Vector<String> &motifV=*motifs.getFields(","); Regex r("([^\/]+)\\.fastb"); // Load schema Schema schema(schemaFile); sigID=schema.lookupContinuousID(signalName); openID=schema.lookupContinuousID("unpaired"); alphabet=schema.getAlphabet(0); // Load the training set cerr<<"loading training data"<<endl; trainingSet.load(dir,schema); // Process sequences cerr<<"processing sequences"<<endl; BOOM::Vector<float> all, cluster; BOOM::Array1D< BOOM::Vector<double> > offsets(2*WINDOW_SIZE+1); BOOM::Vector<double> motifClusterSizes, otherClusterSizes; const int N=trainingSet.size(); int nextSeqID=1; for(int i=0 ; i<N ; ++i) { EmissionSequence &S=*trainingSet[i]; if(!r.search(S.getFilename())) throw S.getFilename(); String substrate=r[1]; int begin; double prevX=0; double maxX=-1; int maxPos; const int L=S.length(); for(int pos=0 ; pos<L ; ++pos) { const Emission &e=S[pos]; const double x=e[sigID]; all.push_back(e[openID]); if(prevX==0 && x>0) { begin=pos; maxX=x; maxPos=pos; } else if(prevX>0 && x>0 && x>maxX) { maxX=x; maxPos=pos; } else if(prevX>0 && x==0) { int from, to; if(1) { if(findMotif(maxPos,from,to,motifV,margin,S)) { /* cout<<substrate<<"\tpositive\tgroup\t"<<begin<<"\t"<<pos<<"\t1\t+\t.\n"; cout<<substrate<<"\tmotif\tsite\t"<<from<<"\t"<<to<<"\t1\t+\t.\n"; */ //cout<<maxX<<endl; /* EmissionSequence *subseq=S.getSubsequence(begin,pos-begin); Sequence *seq=subseq->getDiscreteSeq(0); delete subseq; cout<<">"<<nextSeqID++<<endl; seq->printOn(cout,schema.getAlphabet(0)); cout<<endl; delete seq; */ for(int delta=-WINDOW_SIZE ; delta<=WINDOW_SIZE ; ++delta) { //int x=to+delta; int x=maxPos+delta; if(x<0 || x>=L) continue; double y=S[x][openID]; offsets[delta+WINDOW_SIZE].push_back(y); } //motifClusterSizes.push_back(pos-begin); } } else {//for(int i=begin ; i<pos ; ++i) { /* cout<<substrate<<"\tnegative\tgroup\t"<<begin+1<<"\t"<<pos<<"\t1\t+\t.\n"; cout<<substrate<<"\tdecoy\tsite\t"<<maxPos-margin<<"\t"<<maxPos+margin<<"\t1\t+\t.\n"; */ //cout<<maxX<<endl; /* //EmissionSequence *subseq=S.getSubsequence(begin,pos-begin); if(maxPos>margin && maxPos+margin<L) { EmissionSequence *subseq=S.getSubsequence(maxPos-margin,2*margin); Sequence *seq=subseq->getDiscreteSeq(0); delete subseq; cout<<">"<<nextSeqID++<<endl; seq->printOn(cout,schema.getAlphabet(0)); cout<<endl; delete seq; } */ for(int delta=-WINDOW_SIZE ; delta<=WINDOW_SIZE ; ++delta) { int x=maxPos+delta; if(x<0 || x>=L) continue; double y=S[x][openID]; offsets[delta+WINDOW_SIZE].push_back(y); } //cluster.push_back(S[i][openID]); int start=maxPos-margin, stop=maxPos+margin+1; if(start<0) start=0; if(stop>L) stop=L; else for(int i=start ; i<stop ; ++i) cluster.push_back(S[i][openID]); otherClusterSizes.push_back(pos-begin); } } prevX=x; } } /* SummaryStats ss(all), ssC(cluster); SummaryStats sizes1(motifClusterSizes), sizes2(otherClusterSizes); cerr<<"overall stats: "<<ss.getMean()<<" +/- "<<ss.getStdDev()<<endl; cerr<<"cluster stats: "<<ssC.getMean()<<" +/- "<<ssC.getStdDev()<<" N="<<ssC.getN()<<endl; cerr<<"motif cluster sizes: "<<sizes1.getMean()<<" +/- "<<sizes1.getStdDev()<<" N="<<sizes1.getN()<<endl; cerr<<"non-motif cluster sizes: "<<sizes2.getMean()<<" +/- "<<sizes2.getStdDev()<<" N="<<sizes2.getN()<<endl; */ for(int delta=-WINDOW_SIZE ; delta<=WINDOW_SIZE ; ++delta) { int x=delta+WINDOW_SIZE; SummaryStats ss(offsets[x]); cout<<delta<<"\t"<<ss.getMean()<<endl; } return 0; }
Tensor5D CVX_ADMM_MSA (SequenceSet& allSeqs, vector<int>& lenSeqs, int T2, string& dir_path) { // 1. initialization int numSeq = allSeqs.size(); vector<Tensor4D> C (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE, vector<double>(NUM_MOVEMENT, 0.0))))); vector<Tensor4D> W_1 (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE, vector<double>(NUM_MOVEMENT, 0.0))))); vector<Tensor4D> W_2 (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE, vector<double>(NUM_MOVEMENT, 0.0))))); vector<Tensor4D> Y (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE, vector<double>(NUM_MOVEMENT, 0.0))))); tensor5D_init (C, allSeqs, lenSeqs, T2); tensor5D_init (W_1, allSeqs, lenSeqs, T2); tensor5D_init (W_2, allSeqs, lenSeqs, T2); tensor5D_init (Y, allSeqs, lenSeqs, T2); set_C (C, allSeqs); // 2. ADMM iteration int iter = 0; double mu = MU; double prev_CoZ = MAX_DOUBLE; while (iter < MAX_ADMM_ITER) { // 2a. Subprogram: FrankWolf Algorithm // NOTE: parallelize this for to enable parallelism #ifdef PARRALLEL_COMPUTING #pragma omp parallel for #endif for (int n = 0; n < numSeq; n++) first_subproblem (W_1[n], W_2[n], Y[n], C[n], mu, allSeqs[n]); // 2b. Subprogram: second_subproblem (W_1, W_2, Y, mu, allSeqs, lenSeqs); // 2d. update Y: Y += mu * (W_1 - W_2) for (int n = 0; n < numSeq; n ++) tensor4D_lin_update (Y[n], W_1[n], W_2[n], mu); // 2e. print out tracking info double CoZ = 0.0; for (int n = 0; n < numSeq; n++) CoZ += tensor4D_frob_prod(C[n], W_2[n]); double W1mW2 = 0.0; for (int n = 0; n < numSeq; n ++) { int T1 = W_1[n].size(); for (int i = 0; i < T1; i ++) for (int j = 0; j < T2; j ++) for (int d = 0; d < NUM_DNA_TYPE; d ++) for (int m = 0; m < NUM_MOVEMENT; m ++) { double value = (W_1[n][i][j][d][m] - W_2[n][i][j][d][m]); W1mW2 = max( fabs(value), W1mW2 ) ; } } ///////////////////////////////////Copy from Main///////////////////////////////////////// int T2m = T2; Tensor tensor (T2m, Matrix (NUM_DNA_TYPE, vector<double>(NUM_DNA_TYPE, 0.0))); Matrix mat_insertion (T2m, vector<double> (NUM_DNA_TYPE, 0.0)); for (int n = 0; n < numSeq; n ++) { int T1 = W_2[n].size(); for (int i = 0; i < T1; i ++) { for (int j = 0; j < T2m; j ++) { for (int d = 0; d < NUM_DNA_TYPE; d ++) { for (int m = 0; m < NUM_MOVEMENT; m ++) { if (m == DELETION_A or m == MATCH_A) tensor[j][d][dna2T3idx('A')] += max(0.0, W_2[n][i][j][d][m]); else if (m == DELETION_T or m == MATCH_T) tensor[j][d][dna2T3idx('T')] += max(0.0, W_2[n][i][j][d][m]); else if (m == DELETION_C or m == MATCH_C) tensor[j][d][dna2T3idx('C')] += max(0.0, W_2[n][i][j][d][m]); else if (m == DELETION_G or m == MATCH_G) tensor[j][d][dna2T3idx('G')] += max(0.0, W_2[n][i][j][d][m]); else if (m == DELETION_START or m == MATCH_START) tensor[j][d][dna2T3idx('*')] += max(0.0, W_2[n][i][j][d][m]); else if (m == DELETION_END or m == MATCH_END) tensor[j][d][dna2T3idx('#')] += max(0.0, W_2[n][i][j][d][m]); else if (m == INSERTION) mat_insertion[j][d] += max(0.0, W_2[n][i][j][d][m]); } } } } } Trace trace (0, Cell(2)); // 1d: j, 2d: ATCG refined_viterbi_algo (trace, tensor, mat_insertion); Sequence recSeq; for (int i = 0; i < trace.size(); i ++) if (trace[i].action != INSERTION) { recSeq.push_back(trace[i].acidB); if (trace[i].acidB == '#') break; } ////////////////////////////////END copy from MAIN///////////////////////////////////////////////////// SequenceSet allModelSeqs, allDataSeqs; double obj_rounded = 0.0; for (int n = 0; n < numSeq; n ++) { Sequence model_seq = recSeq, data_seq = allSeqs[n]; data_seq.erase(data_seq.begin()); model_seq.erase(model_seq.begin()); data_seq.erase(data_seq.end()-1); model_seq.erase(model_seq.end()-1); // align sequences locally Plane plane (data_seq.size()+1, Trace(model_seq.size()+1, Cell(2))); Trace trace (0, Cell(2)); smith_waterman (model_seq, data_seq, plane, trace); // get the objective of rounded result for (int i = 0; i < trace.size(); i ++) { if (trace[i].acidA == '-' && trace[i].acidB != '-') obj_rounded += 1.0;//C_I; else if (trace[i].acidA != '-' && trace[i].acidB == '-') obj_rounded += 1.0;//C_D; else if (trace[i].acidA == trace[i].acidB) obj_rounded += 0.0;//C_M; else if (trace[i].acidA != trace[i].acidB) obj_rounded += 1.0;//C_MM; } model_seq.clear(); data_seq.clear(); for (int i = 0; i < trace.size(); i ++) model_seq.push_back(trace[i].acidA); for (int i = 0; i < trace.size(); i ++) data_seq.push_back(trace[i].acidB); allModelSeqs.push_back(model_seq); allDataSeqs.push_back(data_seq); } //writeClusterView( dir_path+to_string(iter), allModelSeqs, allDataSeqs ); // cerr << "=============================================================================" << endl; char COZ_val [50], w1mw2_val [50]; sprintf(COZ_val, "%6f", CoZ); sprintf(w1mw2_val, "%6f", W1mW2); cerr << "ADMM_iter = " << iter << ", C o Z = " << COZ_val << ", Wdiff_max = " << w1mw2_val << ", obj_rounded = " << obj_rounded << endl; // cerr << "sub1_Obj = CoW_1+0.5*mu*||W_1-Z+1/mu*Y_1||^2 = " << sub1_cost << endl; // cerr << "sub2_Obj = ||W_2-Z+1/mu*Y_2||^2 = " << sub2_cost << endl; // 2f. stopping conditions if (ADMM_EARLY_STOP_TOGGLE and iter > MIN_ADMM_ITER) if ( W1mW2 < EPS_Wdiff ) { cerr << "CoZ Converges. ADMM early stop!" << endl; break; } prev_CoZ = CoZ; iter ++; } cout << "W_1: " << endl; for (int i = 0; i < numSeq; i ++) tensor4D_dump(W_1[i]); cout << "W_2: " << endl; for (int i = 0; i < numSeq; i ++) tensor4D_dump(W_2[i]); return W_2; }
/* We resolve the second subproblem through sky-plane projection */ Sequence second_subproblem (Tensor5D& W_1, Tensor5D& W_2, Tensor5D& Y, double& mu, SequenceSet& allSeqs, vector<int> lenSeqs) { /*{{{*/ int numSeq = allSeqs.size(); int T2 = W_2[0][0].size(); // reinitialize W_2 to all-zero matrix for (int n = 0; REINIT_W_ZERO_TOGGLE and n < numSeq; n ++) { int T1 = W_2[n].size(); for (int i = 0; i < T1; i ++) for (int j = 0; j < T2; j ++) for (int d = 0; d < NUM_DNA_TYPE; d ++) for (int m = 0; m < NUM_MOVEMENT; m ++) W_2[n][i][j][d][m] = 0.0; } vector<Tensor4D> delta (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE, vector<double>(NUM_MOVEMENT, 0.0))))); tensor5D_init (delta, allSeqs, lenSeqs, T2); Tensor tensor (T2, Matrix (NUM_DNA_TYPE, vector<double>(NUM_DNA_TYPE, 0.0))); Matrix mat_insertion (T2, vector<double>(NUM_DNA_TYPE, 0.0)); Trace trace (0, Cell(2)); // 1d: j, 2d: ATCG int fw_iter = -1; while (fw_iter < MAX_2nd_FW_ITER) { fw_iter ++; // 1. compute delta #ifdef PARRALLEL_COMPUTING //#pragma omp parallel for #endif for (int n = 0; n < numSeq; n ++) { int T1 = W_2[n].size(); for (int i = 0; i < T1; i ++) { for (int j = 0; j < T2; j ++) for (int d = 0; d < NUM_DNA_TYPE; d ++) for (int m = 0; m < NUM_MOVEMENT; m ++) { delta[n][i][j][d][m] = -1.0* mu * (W_2[n][i][j][d][m] - W_1[n][i][j][d][m]) + Y[n][i][j][d][m]; #ifdef SECOND_SUBPROBLEM_DEBUG if (delta[n][i][j][d][m] > 0) cout <<"delta: " << n << "," << i << "," << j << "," << d << "," << m << ": " << delta[n][i][j][d][m] << endl; #endif if (m == DELETION_A or m == MATCH_A) tensor[j][d][dna2T3idx('A')] += max(0.0, delta[n][i][j][d][m]); else if (m == DELETION_T or m == MATCH_T) tensor[j][d][dna2T3idx('T')] += max(0.0, delta[n][i][j][d][m]); else if (m == DELETION_C or m == MATCH_C) tensor[j][d][dna2T3idx('C')] += max(0.0, delta[n][i][j][d][m]); else if (m == DELETION_G or m == MATCH_G) tensor[j][d][dna2T3idx('G')] += max(0.0, delta[n][i][j][d][m]); else if (m == DELETION_START or m == MATCH_START) tensor[j][d][dna2T3idx('*')] += max(0.0, delta[n][i][j][d][m]); else if (m == DELETION_END or m == MATCH_END) tensor[j][d][dna2T3idx('#')] += max(0.0, delta[n][i][j][d][m]); else if (m == INSERTION) { mat_insertion[j][d] += max(0.0, delta[n][i][j][d][m]); } } } } #ifdef SECOND_SUBPROBLEM_DEBUG cout << "tensor transition input list:" << endl; for (int j = 0; j < T2; j ++) for (int d = 0; d < NUM_DNA_TYPE; d ++) for (int k = 0; k < NUM_DNA_TYPE; k ++) { if (tensor[j][d][k] > 0) cout << "(" << j << ", " << d << ", " << k << ")=" << tensor[j][d][k] << endl; } #endif double delta_square = 0.0; for (int n = 0; n < numSeq; n ++) delta_square += tensor4D_frob_prod (delta[n], delta[n]); //cout << "delta_square: " << delta_square << endl; if ( delta_square < 1e-12 ) { //cout << "small delta. early stop." << endl; break; } // 2. determine the trace: run viterbi algorithm trace.clear(); refined_viterbi_algo (trace, tensor, mat_insertion); Tensor5D S (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE, vector<double>(NUM_MOVEMENT, 0.0))))); tensor5D_init (S, allSeqs, lenSeqs, T2); // 3. recover values for S // 3b. set a number of selected elements to 1 for (int t = 0; t < trace.size(); t++) { int sj = trace[t].location[0]; int sd = trace[t].location[1]; int sm = dna2T3idx(trace[t].acidB); // cout << trace[t].acidB; for (int n = 0; n < numSeq; n ++) { int T1 = S[n].size(); for (int i = 0; i < T1; i ++) { for (int m = 0; m < NUM_MOVEMENT; m ++) if (delta[n][i][sj][sd][m] > 0.0) { if (m == DEL_BASE_IDX + sm or m == MTH_BASE_IDX + sm) S[n][i][sj][sd][m] = 1.0; else if (m == INSERTION and trace[t].action == INSERTION) { S[n][i][sj][sd][m] = 1.0; } } } } } // cout << endl; #ifdef SECOND_SUBPROBLEM_DEBUG cout << "Result of viterbi:" << endl; for (int t = 0; t < trace.size(); t++) cout << "(" << trace[t].location[0] << ", " << trace[t].acidA << ", "<< trace[t].acidB << ")=" << trace[t].score << endl; double S_s = 0.0; for (int n = 0; n < numSeq; n ++) S_s += tensor4D_frob_prod (S[n], S[n]); cout << "S_s: " << S_s << endl; for (int n = 0; n < numSeq; n ++) tensor4D_dump(S[n]); #endif // 4. Exact Line search: determine the optimal step size \gamma // gamma = [ ( Y_2 + mu*W - mu*Z ) dot (W_2 - S) ] / || W_2 - S ||^2 // ---------------combo------------------ double numerator = 0.0, denominator = 0.0; for (int n = 0; n < numSeq; n ++) { int T1 = S[n].size(); for (int i = 0; i < T1; i ++) for (int j = 0; j < T2; j ++) for (int d = 0; d < NUM_DNA_TYPE; d ++) for (int m = 0; m < NUM_MOVEMENT; m ++) { double wms = W_2[n][i][j][d][m] - S[n][i][j][d][m]; numerator += (-1.0*Y[n][i][j][d][m] + mu*W_2[n][i][j][d][m] - mu*W_1[n][i][j][d][m]) * wms; denominator += mu * wms * wms; } } #ifdef SECOND_SUBPROBLEM_DEBUG cout << "numerator: " << numerator << ", denominator: " << denominator << endl; #endif if ( denominator < 10e-6) { //cout << "small denominator: " << denominator << endl; break; } double gamma = numerator / denominator; if (fw_iter == 0) gamma = 1.0; gamma = max(gamma, 0.0); gamma = min(gamma, 1.0); // cout << "gamma: " << gamma << ", mu*||W-S||^2: " << denominator << endl; // 3. update W for (int n = 0; n < numSeq; n ++) { int T1 = S[n].size(); for (int i = 0; i < T1; i ++) for (int j = 0; j < T2; j ++) for (int d = 0; d < NUM_DNA_TYPE; d ++) for (int m = 0; m < NUM_MOVEMENT; m ++) W_2[n][i][j][d][m] = (1-gamma) * W_2[n][i][j][d][m] + gamma* S[n][i][j][d][m]; } // 4. output iteration tracking info // second_subproblem_log(fw_iter, W, Z, Y, mu); // 5. early stop condition if (fabs(gamma) < EPS_2nd_FW) break; } Sequence recSeq; for (int t = 0; t < trace.size(); t++) recSeq.push_back(trace[t].acidB); return recSeq; /*}}}*/ }
void merge_duplicate_sequences( SequenceSet& set, MergeDuplicateSequencesCountPolicy count_policy, std::string const& counter_prefix ) { // Helper to keep track of which sequences (at position i in the set) occured how often. struct Duplicate { size_t index; size_t count; }; // Find duplicates and count their occurences. // TODO this is a very memory intense step. find an algo that does not need to copy all sites... std::unordered_map< std::string, Duplicate > dup_map; size_t i = 0; while( i < set.size() ) { auto& seq = set[i]; if( dup_map.count( seq.sites() ) == 0 ) { // If it is a new sequence, init the map entry and move to the next sequence. dup_map[ seq.sites() ].index = i; dup_map[ seq.sites() ].count = 1; ++i; } else { // If we already saw that sequence, increment its counter, and remove the sequence // from the set. Do not increment i - we deleted a sequence, so staying at the // position automatically "moves" to the next one. ++dup_map[ seq.sites() ].count; set.remove(i); } } // We do not need to relabel sequences. if( count_policy == MergeDuplicateSequencesCountPolicy::kDiscard ) { return; } // Relabel using the counts. for( size_t j = 0; j < set.size(); ++j ) { auto& seq = set[j]; // The sequence needs to be in the map, as we added it in the previous step. // It also needs to have the same index, as we never changed that. assert( dup_map.count(seq.sites()) > 0 ); assert( dup_map[ seq.sites() ].index == j ); // Append the count to the label. auto count = dup_map[ seq.sites() ].count; if( count_policy == MergeDuplicateSequencesCountPolicy::kAppendToLabel ) { auto new_label = seq.label() + counter_prefix + std::to_string(count); seq.label( new_label ); } else { // We already skipped the discard case, so this here should not happen. assert( false ); } } }
void SessionHandler::confirmed(const SequenceSet& commands, const Array& /*fragments*/) { checkAttached(); // Ignore non-contiguous confirmations. if (!commands.empty() && commands.front() >= getState()->senderGetReplayPoint()) getState()->senderConfirmed(commands.rangesBegin()->last()); }
void computeMIonGPU(SequenceSet& sequence, Matrix<float>& MI, bool GPU) { // initializes context and kernel and stores them OCL ocl(GPU); cl_int oclError1, oclError2; timeval start, end; // memory sizes size_t sequenceLength = sequence.getSequenceLength(); size_t numSequences = sequence.getNumberOfSequences(); // matrix MI is of size numElements size_t numElements = sequenceLength * sequenceLength; size_t sequenceSize = sequence.getNumberOfSequences() * sequenceLength; size_t onePointProbsSize = sequenceLength * NUMPROTEINCHARS; // host memory float * dst = new float[MI.size()]; memset(dst, 0, MI.size()); // device memory for sequences, one point probablities and resulting matrix cl_mem oclDevSrcSequence, oclDevSrcOnePointProbs, oclDevDstMI; // size for a work group: each workgroup computes one matrix entry, thus computes the correlation // one time for each character => 25 work items are sufficient size_t localWorkSize[2] = { 5, 5 }; if (sequenceLength % localWorkSize[0] != 0) throw std::runtime_error("sequence length ^ 2 not divisable by local work size"); // global work size defines the total amount of threads over all work group, thus needs to be a multiple of the local // work size in each dimension. size_t globalWorkSize[2] = { sequenceLength, sequenceLength }; // create buffer on device, one for each input array oclDevSrcSequence = clCreateBuffer( ocl.oclContext, CL_MEM_READ_ONLY, sizeof(cl_uchar) * sequenceSize, 0, &oclError1); oclDevSrcOnePointProbs = clCreateBuffer(ocl.oclContext, CL_MEM_READ_ONLY, sizeof(cl_float) * onePointProbsSize, 0, &oclError2); oclError1 |= oclError2; oclDevDstMI = clCreateBuffer( ocl.oclContext, CL_MEM_WRITE_ONLY, sizeof(cl_float) * numElements, 0, &oclError2); oclError1 |= oclError2; if (oclError1 != CL_SUCCESS) { std::cout << "error while allocating buffers" << std::endl; exit(1); } // set buffer to appropriate kernel arguments oclError1 = clSetKernelArg(ocl.oclKernel, 0, sizeof(cl_mem), (void*)&oclDevSrcSequence); oclError1 |= clSetKernelArg(ocl.oclKernel, 1, sizeof(cl_mem), (void*)&oclDevSrcOnePointProbs); oclError1 |= clSetKernelArg(ocl.oclKernel, 2, sizeof(cl_mem), (void*)&oclDevDstMI); oclError1 |= clSetKernelArg(ocl.oclKernel, 3, sizeof(cl_uint), &sequenceLength); oclError1 |= clSetKernelArg(ocl.oclKernel, 4, sizeof(cl_uint), &numSequences); if (oclError1 != CL_SUCCESS) { std::cout << "error while setting arguments: " << ocl.oclErrorString(oclError1) << std::endl; exit(1); } // copy host memory to device, non-blocking copy oclError1 = clEnqueueWriteBuffer( ocl.oclCmdQueue, oclDevSrcSequence, CL_FALSE, 0, sizeof(cl_uchar) * sequenceSize, (const void *) sequence.getData(), 0, 0, 0); oclError1 |= clEnqueueWriteBuffer( ocl.oclCmdQueue, oclDevSrcOnePointProbs, CL_FALSE, 0, sizeof(cl_float) * onePointProbsSize, (const void *) sequence.getOnePointProbs(), 0, 0, 0); if (oclError1 != CL_SUCCESS) { std::cout << "error while writing to device " << ocl.oclErrorString(oclError1) << std::endl; exit(1); } // execute kernel LOOPCOUNT times and measure execution time // TODO LOOPCOUNT aendern, um Kernel mehrfach auszufuehren gettimeofday(&start, 0); for (int i = 0; i < LOOPCOUNT; ++i) { oclError1 = clEnqueueNDRangeKernel( ocl.oclCmdQueue, ocl.oclKernel, 2, // dimension 0, globalWorkSize, localWorkSize, 0, 0, 0); if (oclError1 != CL_SUCCESS) { std::cout << "error while executing kernel: " << ocl.oclErrorString(oclError1) << std::endl; exit(1); } } // clFinish blocks until all issued commands so far are completed, necessary for computing execution time oclError1 = clFinish(ocl.oclCmdQueue); gettimeofday(&end, 0); // read memory from device, store in temporary array and if no error happend copy to result matrix oclError1 = clEnqueueReadBuffer( ocl.oclCmdQueue, oclDevDstMI, CL_TRUE, 0, sizeof(cl_float) * numElements, dst, 0, 0, 0); if (oclError1 != CL_SUCCESS) { std::cout << "error while reading from device: " << ocl.oclErrorString(oclError1) << std::endl; exit(1); } std::cout << "execution time: " << (end.tv_sec - start.tv_sec ) * 1000 + ( end.tv_usec - start.tv_usec) / 1000 << " milliseconds" << std::endl; // fill the matrix with the computed results MI.copyElements(dst); // release used memory, can cause really bad crashes otherwise clReleaseMemObject(oclDevSrcSequence); clReleaseMemObject(oclDevSrcOnePointProbs); clReleaseMemObject(oclDevDstMI); }
void SessionImpl::releaseImpl(qpid::messaging::Message& m) { SequenceSet set; set.add(MessageImplAccess::get(m).getInternalId()); session.messageRelease(set, true); }
void SessionImpl::rejectImpl(qpid::messaging::Message& m) { SequenceSet set; set.add(MessageImplAccess::get(m).getInternalId()); session.messageReject(set); }