Example #1
0
int Application::go(int argc,char *argv[])
{
  // Process command line
  CommandLine cmd(argc,argv,"");
  if(cmd.numArgs()!=3)
    throw "\n\
threshold <in.schema> <input-dir> <threshold>\n\
\n\
";
  String schemaFile=cmd.arg(0);
  String inDir=cmd.arg(1);
  threshold=cmd.arg(2).asDouble();

  // Misc. initialization
  cout.precision(10);
  cerr.precision(10);
  BOOM::catchFloatOverflow();

  // Load the training set
  schema=Schema(schemaFile);
  D=schema.getNumContinuous();
  trainingSet.load(inDir,schema);

  // Apply thresholding to the data
  applyThreshold();

  // Save the results
  trainingSet.save();

  return 0;
}
Example #2
0
/**
 * @brief Return the sum of the lengths of all sequences in the set.
 *
 * @param set Sequences to measure.
 * @return Total number of sites over all sequences; 0 for an empty set.
 */
size_t total_length( SequenceSet const& set )
{
    // std::accumulate carries the running sum in the type of the init value.
    // Seeding with a plain `0` would accumulate in int and truncate large totals,
    // so the seed is an explicit size_t.
    return std::accumulate( set.begin(), set.end(), size_t{ 0 },
        [] ( size_t sum, Sequence const& seq ) {
            return sum + seq.length();
        }
    );
}
Example #3
0
/**
 * @brief Remove every sequence from the set that is longer than `max_length`.
 *
 * Sequences with a length of exactly `max_length` are kept.
 */
void filter_max_sequence_length( SequenceSet& set, size_t max_length )
{
    // The position only advances past sequences that are kept: after a removal,
    // the same position is examined again.
    for( size_t pos = 0; pos < set.size(); ) {
        if( set.at( pos ).length() <= max_length ) {
            ++pos;
            continue;
        }
        set.remove( pos );
    }
}
Example #4
0
/**
 * @brief Remove every sequence whose length lies outside `[min_length, max_length]`.
 *
 * Both bounds are inclusive: sequences of exactly `min_length` or `max_length` are kept.
 */
void filter_min_max_sequence_length( SequenceSet& set, size_t min_length, size_t max_length )
{
    // The position only advances past sequences that are kept: after a removal,
    // the same position is examined again.
    for( size_t pos = 0; pos < set.size(); ) {
        auto const current = set.at( pos ).length();
        bool const within_bounds = ( current >= min_length ) && ( current <= max_length );
        if( within_bounds ) {
            ++pos;
        } else {
            set.remove( pos );
        }
    }
}
void SessionHandler::expected(const SequenceSet& commands, const Array& /*fragments*/) {
    checkAttached();

    // Without stored session state there is nothing to replay:
    // just send the sender's current command point.
    if (!getState()->hasState()) {
        sendCommandPoint(getState()->senderGetCommandPoint());
        return;
    }

    // Replay path: the session has state, so the peer must name the commands it expects.
    if (commands.empty()) {
        throw IllegalStateException(
            QPID_MSG(getState()->getId() << ": has state but client is attaching as new session."));
    }

    // TODO aconway 2008-05-12: support replay of partial commands.
    // Here we always round down to the last command boundary.
    SessionPoint expectedPoint = commands.empty() ? SequenceNumber(0) : SessionPoint(commands.front(),0);
    SessionState::ReplayRange replay = getState()->senderExpected(expectedPoint);
    sendCommandPoint(expectedPoint);

    // Push every frame of the replay range through the output handler.
    std::for_each(replay.begin(), replay.end(), out);
}
Example #6
0
/**
 * @brief Mark the sites (columns) of an alignment that consist only of gap characters.
 *
 * @param set       Sequences to inspect; all must have the same length.
 * @param gap_chars Characters that count as gaps.
 * @return Bitvector with one bit per site; a bit is `true` if every sequence has a gap
 *         character at that site. An empty set yields a default-constructed Bitvector.
 * @throws std::runtime_error if a sequence length differs from that of the first sequence.
 */
utils::Bitvector gap_sites( SequenceSet const& set, std::string const& gap_chars )
{
    // Nothing to inspect.
    if( set.size() == 0 ) {
        return utils::Bitvector();
    }

    // Start with every site flagged as all-gap; any non-gap character clears its site.
    auto gap_mask = utils::Bitvector( set[0].length(), true );

    // Per-character table: true for the characters selected by gap_chars.
    auto is_gap = utils::CharLookup<bool>( false );
    is_gap.set_selection_upper_lower( gap_chars, true );

    for( auto const& sequence : set ) {
        // All sequences have to be as long as the mask (i.e., as the first sequence).
        if( sequence.length() != gap_mask.size() ) {
            throw std::runtime_error(
                "Cannot calculate gap_sites() if SequenceSet is not an alignment."
            );
        }

        for( size_t site = 0; site < sequence.length(); ++site ) {
            if( is_gap[ sequence[ site ] ] ) {
                continue;
            }
            // A single non-gap character is enough to disqualify the site.
            gap_mask.set( site, false );
        }
    }

    return gap_mask;
}
void SessionHandler::flush(bool expected, bool confirmed, bool completed) {
    checkAttached();
    if (expected)  {
        SequenceSet expectSet;
        if (getState()->hasState())
            expectSet.add(getState()->receiverGetExpected().command);
        peer.expected(expectSet, Array());
    }
    if (confirmed) {
        SequenceSet confirmSet;
        if (!getState()->receiverGetUnknownComplete().empty()) 
            confirmSet.add(getState()->receiverGetUnknownComplete().front(),
                           getState()->receiverGetReceived().command);
        peer.confirmed(confirmSet, Array());
    }
    if (completed)
        peer.completed(getState()->receiverGetUnknownComplete(), true);
}
Example #8
0
// Reads the amino acid Phylip fixture "aa_3_384_s.phylip" and spot-checks the result:
// number of sequences, length of the first one, one label, and a 10-character slice of sites.
TEST( Sequence, PhylipReaderAaSequential )
{
    // Skip test if no data available.
    NEEDS_TEST_DATA;

    // Load sequence file.
    // The reader uses a label length of 10 and accepts all amino acid codes. No mode is set
    // here, so the reader's default applies (presumably sequential, going by the test name).
    std::string infile = environment->data_dir + "sequence/aa_3_384_s.phylip";
    SequenceSet sset;
    PhylipReader()
        .label_length( 10 )
        .valid_chars( amino_acid_codes_all() )
        .read( from_file(infile), sset);

    // Check data.
    EXPECT_EQ( 3, sset.size() );
    EXPECT_EQ( 384,          sset[0].length() );
    EXPECT_EQ( "CATH_HUMAN", sset[2].label() );
    EXPECT_EQ( "G-AVTPVKNQ", sset[0].sites().substr( 160, 10 ) );
}
Example #9
0
// Reads the DNA Phylip fixture "dna_5_42_s.phylip" and spot-checks the result:
// number of sequences, length of the first one, one label, and a 10-character slice of sites.
TEST( Sequence, PhylipReaderDnaSequential )
{
    // Skip test if no data available.
    NEEDS_TEST_DATA;

    // Load sequence file.
    // The reader uses a label length of 10 and accepts all nucleic acid codes. No mode is set
    // here, so the reader's default applies (presumably sequential, going by the test name).
    std::string infile = environment->data_dir + "sequence/dna_5_42_s.phylip";
    SequenceSet sset;
    PhylipReader()
        .label_length( 10 )
        .valid_chars( nucleic_acid_codes_all() )
        .read( from_file(infile), sset);

    // Check data.
    EXPECT_EQ( 5, sset.size() );
    EXPECT_EQ( 42,           sset[0].length() );
    EXPECT_EQ( "Salmo gair", sset[1].label() );
    EXPECT_EQ( "CGGGCACGGT", sset[1].sites().substr( 30, 10 ) );
}
Example #10
0
// Reads the DNA Phylip fixture "dna_5_42_i.phylip" in interleaved mode and spot-checks the
// result: number of sequences, length and label of the first one, and its first 10 sites.
TEST( Sequence, PhylipReaderDnaInterleaved )
{
    // Skip test if no data available.
    NEEDS_TEST_DATA;

    // Load sequence file.
    // The reader uses a label length of 10, is switched to interleaved mode explicitly,
    // and accepts all nucleic acid codes.
    std::string infile = environment->data_dir + "sequence/dna_5_42_i.phylip";
    SequenceSet sset;
    PhylipReader()
        .label_length( 10 )
        .mode( PhylipReader::Mode::kInterleaved )
        .valid_chars( nucleic_acid_codes_all() )
        .read( from_file(infile), sset);

    // Check data.
    EXPECT_EQ( 5, sset.size() );
    EXPECT_EQ( 42,           sset[0].length() );
    EXPECT_EQ( "Turkey",     sset[0].label() );
    EXPECT_EQ( "AAGCTNGGGC", sset[0].sites().substr( 0, 10 ) );
}
Example #11
0
// Computes the mutual information MI(i, j) for every column pair i <= j of the sequence set
// on the CPU and stores it via MI.set(i, j, ...). The whole computation is repeated LOOPCOUNT
// times, and the elapsed wall-clock time is printed to stdout.
void computeMIonCPU(SequenceSet& sequences,	Matrix<float>& MI) {
	const int numChars = NUMPROTEINCHARS;
	const int sequenceLength = sequences.getSequenceLength();
	const int numSequences = sequences.getNumberOfSequences();

	// one-point probabilities below this value are treated as zero
	const double epsilon=1e-6;

	timeval start, end;
	gettimeofday(&start, 0);

	for (int run = 0; run < LOOPCOUNT; run++) {
		// visit all column pairs (i, j) with i <= j
		for (int j = 0; j < sequenceLength; j++) {
			for (int i = 0; i <= j; i++) {
				// N_ij(x,y): how often character x in column i co-occurs with y in column j
				int twoPointOccs[numChars][numChars];
				memset(twoPointOccs, 0, sizeof(twoPointOccs));
				for (int s = 0; s < numSequences; s++) {
					twoPointOccs[sequences.getData(s, i)][sequences.getData(s, j)]++;
				}

				// accumulate p_ij(x,y) * log2( p_ij(x,y) / (p_i(x) * p_j(y)) ) over all x, y
				double MI_ij = 0;
				for (int x = 0; x < numChars; x++) {
					if (sequences.getOnePointProb(x, i) < epsilon) {
						continue;
					}
					for (int y = 0; y < numChars; y++) {
						if (sequences.getOnePointProb(y, j) < epsilon) {
							continue;
						}
						if (twoPointOccs[x][y] == 0) {
							continue;
						}
						double p_ij_xy = double(twoPointOccs[x][y]) / double(numSequences);
						MI_ij += p_ij_xy * log2(p_ij_xy / (sequences.getOnePointProb(x, i) * sequences.getOnePointProb(y, j)));
					}
				}
				MI.set(i, j, MI_ij);
			}
		}
	}

	gettimeofday(&end, 0);
	std::cout << "execution time: "
			<< (end.tv_sec - start.tv_sec ) * 1000 +  ( end.tv_usec - start.tv_usec) / 1000
			<< " milliseconds" << std::endl;
}
Example #12
0
void Application::applyThreshold()
{
  const int numSeqs=trainingSet.size();
  for(int i=0 ; i<numSeqs ; ++i) {
    EmissionSequence &S=*trainingSet[i];
    const int L=S.length();
    for(int pos=0 ; pos<L ; ++pos) {
      Emission &e=S[pos];
      GSL::Vector &V=e.getContinuous();
      for(int d=0 ; d<D ; ++d) V[d]=V[d]>=threshold ? threshold : 0;
    }
  }
}
Example #13
0
void Results::completed(const SequenceSet& set)
{
    // Notify every listener whose id is contained in the set, then drop it from the map.
    for (Listeners::iterator i = listeners.begin(); i != listeners.end(); ) {
        if (!set.contains(i->first)) {
            ++i;
            continue;
        }
        i->second->completed();
        // Post-increment: erase() receives the current position while i has already moved on.
        listeners.erase(i++);
    }
}
Example #14
0
/**
 * @brief Return whether all sequences in the set have the same length.
 *
 * An empty set is considered to be an alignment.
 */
bool is_alignment( SequenceSet const& set )
{
    if( set.size() == 0 ) {
        return true;
    }

    // The first sequence sets the length that all others have to match.
    size_t const expected_length = set[0].length();
    bool all_equal = true;
    for( auto const& seq : set ) {
        if( seq.length() != expected_length ) {
            all_equal = false;
            break;
        }
    }
    return all_equal;
}
Example #15
0
// Reads the file `fname` line by line. Every line becomes one sequence, framed by a leading
// '*' and a trailing '#', that is appended to allSeqs; numSeq is incremented once per line.
void parse_seqs_file (SequenceSet& allSeqs, int& numSeq, char* fname) {
    ifstream seq_file(fname);
    string line;
    while (getline(seq_file, line)) {
        int line_len = line.size();
        // two extra slots for the start and end markers
        Sequence framed_seq (line_len+2, 0);
        framed_seq[0] = '*';
        for (int k = 1; k <= line_len; k ++) {
            framed_seq[k] = line.at(k-1);
        }
        framed_seq[line_len+1] = '#';
        allSeqs.push_back(framed_seq);
        numSeq ++;
    }
    seq_file.close();
}
Example #16
0
// Merges the pairwise (model, data) alignments into one column-wise view and writes it to
// the file `fpath`, one output line per sequence.
// In each step, if any model sequence has a '-' at its current position, only those
// sequences emit their data character and advance; all others emit '-'. If no model
// sequence has a '-', every sequence emits its data character and advances.
void writeClusterView( string fpath , SequenceSet& allModelSeqs, SequenceSet& allDataSeqs ){

    int numSeq = allDataSeqs.size();
    SequenceSet allCOSeqs (numSeq, Sequence(0));
    vector<int> pos(numSeq, 0);

    bool finished = false;
    while (!finished) {
        // sequences whose model has a '-' at the current position
        set<int> insertion_ids;
        for (int i = 0; i < numSeq; i ++) {
            if (pos[i] < allModelSeqs[i].size() && allModelSeqs[i][pos[i]] == '-') {
                insertion_ids.insert(i);
            }
        }

        // emit one column
        const bool no_insertion = (insertion_ids.size() == 0);
        for (int i = 0; i < numSeq; i ++) {
            const bool advance = no_insertion || insertion_ids.find(i) != insertion_ids.end();
            if (advance) {
                allCOSeqs[i].push_back(allDataSeqs[i][pos[i]++]);
            } else {
                allCOSeqs[i].push_back('-');
            }
        }

        // done once every position sits exactly at the end of its model sequence
        finished = true;
        for (int i = 0; i < numSeq; i ++) {
            if (pos[i] != allModelSeqs[i].size()) {
                finished = false;
                break;
            }
        }
    }

    // write the merged view, one line per sequence
    ofstream co_out (fpath.c_str());
    for (int i = 0; i < numSeq; i ++) {
        for (int j = 0; j < allCOSeqs[i].size(); j++) {
            co_out << allCOSeqs[i][j];
        }
        co_out << endl;
    }
    co_out.close();
}
Example #17
0
// Entry point of the hack-known-sites tool.
//
// Loads a training set, then scans every sequence for runs of positive values in the
// continuous track named <signal-name>. A run ends at the first position where the signal
// drops back to exactly 0; a run still open at the end of a sequence is not processed.
// For every finished run for which findMotif() returns true, the values of the "unpaired"
// track in a window of +/-WINDOW_SIZE positions around the run's peak are collected,
// grouped by offset from the peak. Finally, one line "<offset>\t<mean>" per offset is
// printed to stdout.
//
// Throws the usage text (a C string) if the number of arguments is not 5, and throws the
// filename of a sequence if that filename does not match the ".fastb" pattern.
// Always returns 0.
int Application::go(int argc,char *argv[])
{
  // Process command line
  CommandLine cmd(argc,argv,"");
  if(cmd.numArgs()!=5) 
    throw "\n\
hack-known-sites [options] <training-dir> <schema.txt> <signal-name> <margin-around-peak> <motifs>\n\
";
  String dir=cmd.arg(0);
  String schemaFile=cmd.arg(1);
  String signalName=cmd.arg(2);
  int margin=cmd.arg(3).asInt();
  String motifs=cmd.arg(4);

  // Misc. initialization
  cout.precision(10);
  cerr.precision(10);
  BOOM::catchFloatOverflow();
  // The comma-separated motif list is split into fields.
  // NOTE(review): the pointer returned by getFields() is dereferenced and bound to a
  // reference; nothing in this function deletes it -- confirm who owns it.
  BOOM::Vector<String> &motifV=*motifs.getFields(",");
  // Captures the run of non-'/' characters directly in front of ".fastb".
  // NOTE(review): "\/" is not a standard C++ escape sequence; compilers may warn here.
  Regex r("([^\/]+)\\.fastb");

  // Load schema
  Schema schema(schemaFile);
  sigID=schema.lookupContinuousID(signalName);
  openID=schema.lookupContinuousID("unpaired");
  alphabet=schema.getAlphabet(0);

  // Load the training set
  cerr<<"loading training data"<<endl;
  trainingSet.load(dir,schema);

  // Process sequences
  cerr<<"processing sequences"<<endl;
  // `all` is filled below, but only read by the commented-out statistics further down;
  // `cluster`, `motifClusterSizes` and `otherClusterSizes` are only touched in dead or
  // commented-out code.
  BOOM::Vector<float> all, cluster;
  // offsets[k] collects the "unpaired" values seen at offset (k-WINDOW_SIZE) from a peak.
  BOOM::Array1D< BOOM::Vector<double> > offsets(2*WINDOW_SIZE+1);
  BOOM::Vector<double> motifClusterSizes, otherClusterSizes;
  const int N=trainingSet.size();
  int nextSeqID=1;
  for(int i=0 ; i<N ; ++i) {
    EmissionSequence &S=*trainingSet[i];
    if(!r.search(S.getFilename())) throw S.getFilename();
    String substrate=r[1];
    // State of the run scanner for this sequence:
    //   begin  - position where the current run of positive signal started
    //   prevX  - signal value at the previous position
    //   maxX   - largest signal value seen in the current run
    //   maxPos - position of that largest value
    // NOTE(review): begin and maxPos are uninitialized until the first run starts; they are
    // only read after a run has ended, which requires a run to have started first.
    int begin;
    double prevX=0;
    double maxX=-1;
    int maxPos;
    const int L=S.length();
    for(int pos=0 ; pos<L ; ++pos) {
      const Emission &e=S[pos];
      const double x=e[sigID];
      all.push_back(e[openID]);
      // Rising edge: a new run starts here.
      if(prevX==0 && x>0) { begin=pos; maxX=x; maxPos=pos; }
      // Inside a run: track the peak.
      else if(prevX>0 && x>0 && x>maxX) { maxX=x; maxPos=pos; }
      // Falling edge: the run [begin,pos) has ended, process it.
      else if(prevX>0 && x==0) {
	int from, to;
	// The condition is the constant 1, so only this branch ever runs;
	// the else branch below is dead code.
	if(1) {
	  if(findMotif(maxPos,from,to,motifV,margin,S)) {
	  /*
	  cout<<substrate<<"\tpositive\tgroup\t"<<begin<<"\t"<<pos<<"\t1\t+\t.\n";
	  cout<<substrate<<"\tmotif\tsite\t"<<from<<"\t"<<to<<"\t1\t+\t.\n";
	  */
	  //cout<<maxX<<endl;
	  /*
	  EmissionSequence *subseq=S.getSubsequence(begin,pos-begin);
	  Sequence *seq=subseq->getDiscreteSeq(0);
	  delete subseq;
	  cout<<">"<<nextSeqID++<<endl;
	  seq->printOn(cout,schema.getAlphabet(0));
	  cout<<endl;
	  delete seq;
	  */

	  // Collect the "unpaired" values around the peak, skipping positions that fall
	  // outside the sequence.
	  for(int delta=-WINDOW_SIZE ; delta<=WINDOW_SIZE ; ++delta) {
	    //int x=to+delta;
	    int x=maxPos+delta;
	    if(x<0 || x>=L) continue;
	    double y=S[x][openID];
	    offsets[delta+WINDOW_SIZE].push_back(y);
	  }

	  //motifClusterSizes.push_back(pos-begin);
	  }
	}
	else {//for(int i=begin ; i<pos ; ++i) {
	  /*
	  cout<<substrate<<"\tnegative\tgroup\t"<<begin+1<<"\t"<<pos<<"\t1\t+\t.\n";
	  cout<<substrate<<"\tdecoy\tsite\t"<<maxPos-margin<<"\t"<<maxPos+margin<<"\t1\t+\t.\n";
	  */
	  //cout<<maxX<<endl;
	  /*
	  //EmissionSequence *subseq=S.getSubsequence(begin,pos-begin);
	  if(maxPos>margin && maxPos+margin<L) {
	    EmissionSequence *subseq=S.getSubsequence(maxPos-margin,2*margin);
	    Sequence *seq=subseq->getDiscreteSeq(0);
	    delete subseq;
	    cout<<">"<<nextSeqID++<<endl;
	    seq->printOn(cout,schema.getAlphabet(0));
	    cout<<endl;
	    delete seq;
	  }
	  */
	  for(int delta=-WINDOW_SIZE ; delta<=WINDOW_SIZE ; ++delta) {
	    int x=maxPos+delta;
	    if(x<0 || x>=L) continue;
	    double y=S[x][openID];
	    offsets[delta+WINDOW_SIZE].push_back(y);
	  }

	  //cluster.push_back(S[i][openID]);
	  int start=maxPos-margin, stop=maxPos+margin+1;
	  if(start<0) start=0;
	  // NOTE(review): the `else` binds to `if(stop>L)`, so the loop below only runs when
	  // stop did not need clamping -- confirm whether that was intended.
	  if(stop>L) stop=L;
	  else for(int i=start ; i<stop ; ++i) 
	    cluster.push_back(S[i][openID]);
	  otherClusterSizes.push_back(pos-begin);
	}
      }
      prevX=x;
    }
  }
  /*
  SummaryStats ss(all), ssC(cluster);
  SummaryStats sizes1(motifClusterSizes), sizes2(otherClusterSizes);
  cerr<<"overall stats: "<<ss.getMean()<<" +/- "<<ss.getStdDev()<<endl;
  cerr<<"cluster stats: "<<ssC.getMean()<<" +/- "<<ssC.getStdDev()<<" N="<<ssC.getN()<<endl;
  cerr<<"motif cluster sizes: "<<sizes1.getMean()<<" +/- "<<sizes1.getStdDev()<<" N="<<sizes1.getN()<<endl;
  cerr<<"non-motif cluster sizes: "<<sizes2.getMean()<<" +/- "<<sizes2.getStdDev()<<" N="<<sizes2.getN()<<endl;
  */
  // Report the mean "unpaired" value for every offset from the peak.
  for(int delta=-WINDOW_SIZE ; delta<=WINDOW_SIZE ; ++delta) {
    int x=delta+WINDOW_SIZE;
    SummaryStats ss(offsets[x]);
    cout<<delta<<"\t"<<ss.getMean()<<endl;
  }

  return 0;
}
Example #18
0
/*
 * ADMM outer loop of the convex MSA solver.
 *
 * Allocates the per-sequence tensors C, W_1, W_2 and Y, then iterates up to MAX_ADMM_ITER
 * times. Each iteration:
 *   - calls first_subproblem() for every sequence,
 *   - calls second_subproblem() once for all sequences,
 *   - updates the multiplier Y from W_1 and W_2,
 *   - computes tracking values (C o W_2 and the largest absolute entry of W_1 - W_2),
 *   - decodes a model sequence from W_2 via refined_viterbi_algo(), aligns it to every
 *     input sequence via smith_waterman(), and counts the non-matching columns,
 *   - prints one tracking line to stderr.
 * When ADMM_EARLY_STOP_TOGGLE is set, the loop ends early once iter > MIN_ADMM_ITER and
 * the largest difference between W_1 and W_2 drops below EPS_Wdiff.
 *
 * allSeqs   input sequences; they are accessed as if framed by a start and an end marker,
 *           since the first and last element of each are stripped before alignment
 * lenSeqs   passed through to tensor5D_init() and second_subproblem()
 * T2        size of the second tensor dimension (model length)
 * dir_path  only used by a commented-out call to writeClusterView()
 *
 * Dumps W_1 and W_2 to stdout before returning. Returns W_2.
 */
Tensor5D CVX_ADMM_MSA (SequenceSet& allSeqs, vector<int>& lenSeqs, int T2, string& dir_path) {
    // 1. initialization
    int numSeq = allSeqs.size();
    vector<Tensor4D> C (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    vector<Tensor4D> W_1 (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    vector<Tensor4D> W_2 (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    vector<Tensor4D> Y (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    tensor5D_init (C, allSeqs, lenSeqs, T2);
    tensor5D_init (W_1, allSeqs, lenSeqs, T2);
    tensor5D_init (W_2, allSeqs, lenSeqs, T2);
    tensor5D_init (Y, allSeqs, lenSeqs, T2);
    set_C (C, allSeqs);

    // 2. ADMM iteration
    int iter = 0;
    double mu = MU;
    // NOTE: prev_CoZ is written every iteration, but never read in this function.
    double prev_CoZ = MAX_DOUBLE;
    while (iter < MAX_ADMM_ITER) {
        // 2a. Subprogram: FrankWolf Algorithm
        // NOTE: parallelize this for to enable parallelism
#ifdef PARRALLEL_COMPUTING
#pragma omp parallel for
#endif
        for (int n = 0; n < numSeq; n++)
            first_subproblem (W_1[n], W_2[n], Y[n], C[n], mu, allSeqs[n]);

        // 2b. Subprogram:
        second_subproblem (W_1, W_2, Y, mu, allSeqs, lenSeqs);

        // 2d. update Y: Y += mu * (W_1 - W_2)
        for (int n = 0; n < numSeq; n ++)
            tensor4D_lin_update (Y[n], W_1[n], W_2[n], mu);

        // 2e. compute tracking info
        double CoZ = 0.0;
        for (int n = 0; n < numSeq; n++)
            CoZ += tensor4D_frob_prod(C[n], W_2[n]);
        // largest absolute entry of W_1 - W_2 over all sequences
        double W1mW2 = 0.0;
        for (int n = 0; n < numSeq; n ++) {
            int T1 = W_1[n].size();
            for (int i = 0; i < T1; i ++)
                for (int j = 0; j < T2; j ++)
                    for (int d = 0; d < NUM_DNA_TYPE; d ++)
                        for (int m = 0; m < NUM_MOVEMENT; m ++) {
                            double value = (W_1[n][i][j][d][m] - W_2[n][i][j][d][m]);
                            W1mW2 = max( fabs(value), W1mW2 ) ;
                        }
        }

        // Decode a model sequence from the current W_2.
        // The positive parts of W_2 are summed over all sequences and positions i:
        // deletion/match moves go into `tensor`, indexed by the character the move
        // stands for; insertion moves go into `mat_insertion`.
        int T2m = T2;
        Tensor tensor (T2m, Matrix (NUM_DNA_TYPE, vector<double>(NUM_DNA_TYPE, 0.0)));
        Matrix mat_insertion (T2m, vector<double> (NUM_DNA_TYPE, 0.0));
        for (int n = 0; n < numSeq; n ++) {
            int T1 = W_2[n].size();
            for (int i = 0; i < T1; i ++) {
                for (int j = 0; j < T2m; j ++) {
                    for (int d = 0; d < NUM_DNA_TYPE; d ++) {
                        for (int m = 0; m < NUM_MOVEMENT; m ++) {
                            if (m == DELETION_A or m == MATCH_A)
                                tensor[j][d][dna2T3idx('A')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_T or m == MATCH_T)
                                tensor[j][d][dna2T3idx('T')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_C or m == MATCH_C)
                                tensor[j][d][dna2T3idx('C')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_G or m == MATCH_G)
                                tensor[j][d][dna2T3idx('G')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_START or m == MATCH_START)
                                tensor[j][d][dna2T3idx('*')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_END or m == MATCH_END)
                                tensor[j][d][dna2T3idx('#')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == INSERTION)
                                mat_insertion[j][d] += max(0.0, W_2[n][i][j][d][m]);
                        }
                    }
                }
            }
        }
        Trace trace (0, Cell(2)); // 1d: j, 2d: ATCG
        refined_viterbi_algo (trace, tensor, mat_insertion);

        // The model sequence consists of the acidB of all non-insertion steps,
        // up to and including the end marker '#'.
        Sequence recSeq;
        for (int i = 0; i < trace.size(); i ++)
            if (trace[i].action != INSERTION) {
                recSeq.push_back(trace[i].acidB);
                if (trace[i].acidB == '#') break;
            }

        // Align the model sequence to every input sequence and score the result.
        SequenceSet allModelSeqs, allDataSeqs;
        double obj_rounded = 0.0;
        for (int n = 0; n < numSeq; n ++) {
            // strip the first and the last element of both sequences
            Sequence model_seq = recSeq, data_seq = allSeqs[n];
            data_seq.erase(data_seq.begin());
            model_seq.erase(model_seq.begin());
            data_seq.erase(data_seq.end()-1);
            model_seq.erase(model_seq.end()-1);

            // align sequences locally
            Plane plane (data_seq.size()+1, Trace(model_seq.size()+1, Cell(2)));
            Trace trace (0, Cell(2)); // shadows the outer trace on purpose: per-sequence alignment
            smith_waterman (model_seq, data_seq, plane, trace);

            // get the objective of rounded result: every column that is not a match costs 1
            for (int i = 0; i < trace.size(); i ++) {
                if (trace[i].acidA == '-' && trace[i].acidB != '-')
                    obj_rounded += 1.0;//C_I;
                else if (trace[i].acidA != '-' && trace[i].acidB == '-')
                    obj_rounded += 1.0;//C_D;
                else if (trace[i].acidA == trace[i].acidB)
                    obj_rounded += 0.0;//C_M;
                else if (trace[i].acidA != trace[i].acidB)
                    obj_rounded += 1.0;//C_MM;
            }

            // replace both sequences by their aligned (gapped) versions
            model_seq.clear(); data_seq.clear();
            for (int i = 0; i < trace.size(); i ++)
                model_seq.push_back(trace[i].acidA);
            for (int i = 0; i < trace.size(); i ++)
                data_seq.push_back(trace[i].acidB);
            allModelSeqs.push_back(model_seq);
            allDataSeqs.push_back(data_seq);
        }
        //writeClusterView( dir_path+to_string(iter), allModelSeqs, allDataSeqs );

        // Print tracking info.
        // snprintf bounds the write: "%6f" of a double can be far longer than 50 characters
        // for large magnitudes, which would overflow these buffers with plain sprintf.
        char COZ_val [50], w1mw2_val [50];
        snprintf(COZ_val, sizeof(COZ_val), "%6f", CoZ);
        snprintf(w1mw2_val, sizeof(w1mw2_val), "%6f", W1mW2);
        cerr << "ADMM_iter = " << iter
            << ", C o Z = " << COZ_val
            << ", Wdiff_max = " << w1mw2_val
            << ", obj_rounded = " << obj_rounded
            << endl;
        // cerr << "sub1_Obj = CoW_1+0.5*mu*||W_1-Z+1/mu*Y_1||^2 = " << sub1_cost << endl;
        // cerr << "sub2_Obj = ||W_2-Z+1/mu*Y_2||^2 = " << sub2_cost << endl;

        // 2f. stopping conditions
        if (ADMM_EARLY_STOP_TOGGLE and iter > MIN_ADMM_ITER)
            if ( W1mW2 < EPS_Wdiff ) {
                cerr << "CoZ Converges. ADMM early stop!" << endl;
                break;
            }
        prev_CoZ = CoZ;
        iter ++;
    }
    cout << "W_1: " << endl;
    for (int i = 0; i < numSeq; i ++) tensor4D_dump(W_1[i]);
    cout << "W_2: " << endl;
    for (int i = 0; i < numSeq; i ++) tensor4D_dump(W_2[i]);
    return W_2;
}
Example #19
0
/* We resolve the second subproblem through sky-plane projection */
/*
 * Updates W_2 in place by a Frank-Wolfe style iteration (at most MAX_2nd_FW_ITER + 1 rounds).
 * Each round:
 *   1. computes delta = -mu * (W_2 - W_1) + Y and adds its positive parts to `tensor`
 *      (deletion/match moves, indexed by character) and `mat_insertion` (insertion moves),
 *   2. runs refined_viterbi_algo() on these to obtain a trace,
 *   3. builds the tensor S that has 1.0 at the entries selected by the trace, 0.0 elsewhere,
 *   4. computes the step size gamma by exact line search, clipped to [0, 1]
 *      (forced to 1 in the first round),
 *   5. sets W_2 = (1 - gamma) * W_2 + gamma * S.
 * The loop ends early if delta is (almost) zero, if the line search denominator is tiny,
 * or if gamma drops below EPS_2nd_FW.
 *
 * W_1, Y, mu   read only (although passed by non-const reference)
 * W_2          updated in place; must contain at least one non-empty element, as T2 is
 *              read from W_2[0][0]
 * allSeqs      its size determines the number of sequences; also passed to tensor5D_init()
 * lenSeqs      passed to tensor5D_init(); taken by value, so it is copied on every call
 *
 * Returns the acidB characters of the trace of the last Viterbi run
 * (an empty sequence if the loop ended before the first run).
 */
Sequence second_subproblem (Tensor5D& W_1, Tensor5D& W_2, Tensor5D& Y, double& mu, SequenceSet& allSeqs, vector<int> lenSeqs) {
    /*{{{*/
    int numSeq = allSeqs.size();
    int T2 = W_2[0][0].size();
    // reinitialize W_2 to all-zero matrix
    // (the toggle is part of the loop condition, so the loop body never runs when it is off)
    for (int n = 0; REINIT_W_ZERO_TOGGLE and n < numSeq; n ++) {
        int T1 = W_2[n].size();
        for (int i = 0; i < T1; i ++)  
            for (int j = 0; j < T2; j ++) 
                for (int d = 0; d < NUM_DNA_TYPE; d ++) 
                    for (int m = 0; m < NUM_MOVEMENT; m ++) 
                        W_2[n][i][j][d][m] = 0.0;
    }

    vector<Tensor4D> delta (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));  
    tensor5D_init (delta, allSeqs, lenSeqs, T2);
    // NOTE(review): tensor and mat_insertion are created once, outside of the loop, and are
    // only ever added to (+=) inside of it. Contributions of earlier rounds hence stay in
    // them in later rounds -- confirm whether they should be reset per round.
    Tensor tensor (T2, Matrix (NUM_DNA_TYPE, vector<double>(NUM_DNA_TYPE, 0.0)));
    Matrix mat_insertion (T2, vector<double>(NUM_DNA_TYPE, 0.0));

    Trace trace (0, Cell(2)); // 1d: j, 2d: ATCG
    // Starts at -1 and is incremented at the top of the loop body, so the first round runs
    // with fw_iter == 0.
    int fw_iter = -1;
    while (fw_iter < MAX_2nd_FW_ITER) {
        fw_iter ++;
        // 1. compute delta
#ifdef PARRALLEL_COMPUTING
//#pragma omp parallel for
#endif
        for (int n = 0; n < numSeq; n ++) {
            int T1 = W_2[n].size();
            for (int i = 0; i < T1; i ++) { 
                for (int j = 0; j < T2; j ++) 
                    for (int d = 0; d < NUM_DNA_TYPE; d ++) 
                        for (int m = 0; m < NUM_MOVEMENT; m ++) {
                            delta[n][i][j][d][m] = -1.0* mu * (W_2[n][i][j][d][m] - W_1[n][i][j][d][m]) + Y[n][i][j][d][m];
#ifdef SECOND_SUBPROBLEM_DEBUG
                            if (delta[n][i][j][d][m] > 0)
                                cout <<"delta: " << n << "," << i << "," << j << "," << d  << "," << m << ": "
                                    << delta[n][i][j][d][m] << endl;
#endif
                            // Only the positive part of delta contributes. Deletion and match
                            // moves are summed per character they stand for; insertions are
                            // summed separately.
                            if (m == DELETION_A or m == MATCH_A)
                                tensor[j][d][dna2T3idx('A')] += max(0.0, delta[n][i][j][d][m]);
                            else if (m == DELETION_T or m == MATCH_T)
                                tensor[j][d][dna2T3idx('T')] += max(0.0, delta[n][i][j][d][m]);
                            else if (m == DELETION_C or m == MATCH_C)
                                tensor[j][d][dna2T3idx('C')] += max(0.0, delta[n][i][j][d][m]);
                            else if (m == DELETION_G or m == MATCH_G)
                                tensor[j][d][dna2T3idx('G')] += max(0.0, delta[n][i][j][d][m]);
                            else if (m == DELETION_START or m == MATCH_START)
                                tensor[j][d][dna2T3idx('*')] += max(0.0, delta[n][i][j][d][m]);
                            else if (m == DELETION_END or m == MATCH_END)
                                tensor[j][d][dna2T3idx('#')] += max(0.0, delta[n][i][j][d][m]);
                            else if (m == INSERTION) {
                                mat_insertion[j][d] += max(0.0, delta[n][i][j][d][m]);
                            }
                        }
            }
        }
#ifdef SECOND_SUBPROBLEM_DEBUG
        cout << "tensor transition input list:" << endl;
        for (int j = 0; j < T2; j ++) 
            for (int d = 0; d < NUM_DNA_TYPE; d ++) 
                for (int k = 0; k < NUM_DNA_TYPE; k ++) {
                    if (tensor[j][d][k] > 0)
                        cout << "(" << j << ", " << d << ", " << k << ")=" << tensor[j][d][k] << endl;
                }
#endif

        // Stop if delta is (almost) zero. tensor4D_frob_prod(x, x) is presumably the sum of
        // the squared entries of x -- confirm against its definition.
        double delta_square = 0.0;
        for (int n = 0; n < numSeq; n ++) 
            delta_square += tensor4D_frob_prod (delta[n], delta[n]);
        //cout << "delta_square: " << delta_square << endl;
        if ( delta_square < 1e-12 ) {
            //cout << "small delta. early stop." << endl;
            break;
        }

        // 2. determine the trace: run viterbi algorithm
        trace.clear();
	refined_viterbi_algo (trace, tensor, mat_insertion);
        Tensor5D S (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE, vector<double>(NUM_MOVEMENT, 0.0))))); 
        tensor5D_init (S, allSeqs, lenSeqs, T2);

        // 3. recover values for S 
        // 3b. set a number of selected elements to 1
        // For every step of the trace, and for every sequence n and position i, the moves
        // with positive delta that agree with the step are selected: the deletion/match move
        // of the step's character, or the insertion move if the step is an insertion.
        for (int t = 0; t < trace.size(); t++) {
            int sj = trace[t].location[0];
            int sd = trace[t].location[1];
            int sm = dna2T3idx(trace[t].acidB);
            // cout << trace[t].acidB;
            for (int n = 0; n < numSeq; n ++) {
                int T1 = S[n].size();
                for (int i = 0; i < T1; i ++) {
                    for (int m = 0; m < NUM_MOVEMENT; m ++)
                        if (delta[n][i][sj][sd][m] > 0.0) { 
                            if (m == DEL_BASE_IDX + sm or m == MTH_BASE_IDX + sm)
                                S[n][i][sj][sd][m] = 1.0;
                            else if (m == INSERTION and trace[t].action == INSERTION) {
                                S[n][i][sj][sd][m] = 1.0;
                            }
                        }
                }
            }
        }
        // cout <<  endl;

#ifdef SECOND_SUBPROBLEM_DEBUG
        cout << "Result of viterbi:" << endl;
        for (int t = 0; t < trace.size(); t++) 
            cout << "(" <<  trace[t].location[0] << ", " << trace[t].acidA << ", "<< trace[t].acidB << ")=" << trace[t].score << endl;
        double S_s = 0.0;
        for (int n = 0; n < numSeq; n ++) 
            S_s += tensor4D_frob_prod (S[n], S[n]);
        cout << "S_s: " << S_s << endl;
        for (int n = 0; n < numSeq; n ++) 
            tensor4D_dump(S[n]);
#endif

        // 4. Exact Line search: determine the optimal step size \gamma
        // gamma = [ ( Y_2 + mu*W - mu*Z ) dot (W_2 - S) ] / || W_2 - S ||^2
        //           ---------------combo------------------
        // As implemented below:
        //   numerator   = sum of (-Y + mu*W_2 - mu*W_1) * (W_2 - S)
        //   denominator = mu * sum of (W_2 - S)^2
        double numerator = 0.0, denominator = 0.0;
        for (int n = 0; n < numSeq; n ++) {
            int T1 = S[n].size();
            for (int i = 0; i < T1; i ++) 
                for (int j = 0; j < T2; j ++) 
                    for (int d = 0; d < NUM_DNA_TYPE; d ++) 
                        for (int m = 0; m < NUM_MOVEMENT; m ++) {
                            double wms = W_2[n][i][j][d][m] - S[n][i][j][d][m];
                            numerator += (-1.0*Y[n][i][j][d][m] + mu*W_2[n][i][j][d][m] - mu*W_1[n][i][j][d][m]) * wms;
                            denominator += mu * wms * wms;
                        }
        }
#ifdef SECOND_SUBPROBLEM_DEBUG
        cout << "numerator: " << numerator << ", denominator: " << denominator << endl;
#endif
        // Stop if W_2 and S (almost) coincide; this also avoids dividing by (almost) zero.
        // Note that 10e-6 is 1e-5.
        if ( denominator < 10e-6) {
            //cout << "small denominator: " << denominator << endl;
            break;
        }
        double gamma = numerator / denominator;
        // The first round always takes the full step; after that, gamma is clipped to [0, 1].
        if (fw_iter == 0) gamma = 1.0;
        gamma = max(gamma, 0.0);
        gamma = min(gamma, 1.0);
        // cout << "gamma: " << gamma << ", mu*||W-S||^2: " << denominator << endl;

        // 3. update W
        // W_2 moves towards S by the fraction gamma.
        for (int n = 0; n < numSeq; n ++) {
            int T1 = S[n].size();
            for (int i = 0; i < T1; i ++) 
                for (int j = 0; j < T2; j ++) 
                    for (int d = 0; d < NUM_DNA_TYPE; d ++) 
                        for (int m = 0; m < NUM_MOVEMENT; m ++)
                            W_2[n][i][j][d][m] = (1-gamma) * W_2[n][i][j][d][m] + gamma* S[n][i][j][d][m];
        }

        // 4. output iteration tracking info
        // second_subproblem_log(fw_iter, W, Z, Y, mu);
        // 5. early stop condition
        if (fabs(gamma) < EPS_2nd_FW) break; 
    }
    
    // Spell out the sequence of the last trace. Unlike in CVX_ADMM_MSA, insertion steps
    // are not filtered out here.
    Sequence recSeq;
    for (int t = 0; t < trace.size(); t++) 
        recSeq.push_back(trace[t].acidB);
    return recSeq;
    /*}}}*/
}
Example #20
0
/**
 * @brief Remove all sequences whose sites duplicate those of an earlier sequence in the set.
 *
 * The first occurrence of each distinct sites string is kept, all later occurrences are
 * removed. Depending on the policy, the number of occurrences is then added to the label
 * of the kept sequence.
 *
 * @param set            Set to deduplicate; modified in place.
 * @param count_policy   `kDiscard` leaves the labels untouched; `kAppendToLabel` appends
 *                       `counter_prefix` followed by the count to each remaining label.
 * @param counter_prefix Text put between the original label and the count.
 */
void merge_duplicate_sequences(
    SequenceSet& set,
    MergeDuplicateSequencesCountPolicy count_policy,
    std::string const& counter_prefix
) {
    // Helper to keep track of which sequences (at position i in the set) occurred how often.
    struct Duplicate
    {
        size_t index;
        size_t count;
    };

    // Find duplicates and count their occurrences.
    // TODO this is a very memory intense step. find an algo that does not need to copy all sites...
    std::unordered_map< std::string, Duplicate > dup_map;
    size_t i = 0;
    while( i < set.size() ) {
        auto const& sites = set[i].sites();

        // One lookup per sequence instead of hashing the full sites string several times.
        auto const entry = dup_map.find( sites );
        if( entry == dup_map.end() ) {

            // If it is a new sequence, init the map entry and move to the next sequence.
            dup_map.emplace( sites, Duplicate{ i, 1 } );
            ++i;

        } else {

            // If we already saw that sequence, increment its counter, and remove the sequence
            // from the set. Do not increment i - we deleted a sequence, so staying at the
            // position automatically "moves" to the next one.
            // The counter is updated first, as `sites` might refer into the removed sequence.
            ++entry->second.count;
            set.remove(i);
        }
    }

    // We do not need to relabel sequences.
    if( count_policy == MergeDuplicateSequencesCountPolicy::kDiscard ) {
        return;
    }

    // Relabel using the counts.
    for( size_t j = 0; j < set.size(); ++j ) {
        auto& seq = set[j];

        // The sequence needs to be in the map, as we added it in the previous step.
        // It also needs to have the same index: removals only ever happen behind
        // all sequences that are already in the map.
        auto const entry = dup_map.find( seq.sites() );
        assert( entry != dup_map.end() );
        assert( entry->second.index == j );

        // Append the count to the label.
        auto const count = entry->second.count;
        if( count_policy == MergeDuplicateSequencesCountPolicy::kAppendToLabel ) {
            auto new_label = seq.label() + counter_prefix + std::to_string(count);
            seq.label( new_label );

        } else {

            // We already skipped the discard case, so this here should not happen.
            assert( false );
        }
    }
}
/**
 * Handle a "confirmed" notification for a set of commands.
 *
 * After verifying the handler is attached, the confirmation is forwarded to
 * the session state only if the set is non-empty and its first command is at
 * or beyond the sender's replay point. Only the last command of the first
 * range in the set is reported; the fragments argument is unused.
 *
 * @param commands  Set of command ids being confirmed.
 */
void SessionHandler::confirmed(const SequenceSet& commands, const Array& /*fragments*/) {
    checkAttached();

    // An empty set confirms nothing.
    if (commands.empty())
        return;

    // Ignore non-contiguous confirmations: the set has to start at or after
    // the current replay point to be usable.
    if (!(commands.front() >= getState()->senderGetReplayPoint()))
        return;

    // Confirm up to the end of the first contiguous range.
    getState()->senderConfirmed(commands.rangesBegin()->last());
}
Example #22
0
/**
 * Run the OpenCL kernel held by an OCL helper over a sequence set and copy
 * the resulting sequenceLength x sequenceLength float matrix into MI.
 *
 * Steps: allocate three device buffers (sequence data, one point
 * probabilities, result), bind them and the two size arguments to the kernel,
 * upload the inputs, enqueue the kernel LOOPCOUNT times, print the measured
 * wall-clock time, read the result back and hand it to MI.copyElements().
 *
 * @param sequence  Source of the sequence data and one point probabilities.
 * @param MI        Result matrix; must hold at least sequenceLength^2 elements.
 * @param GPU       Forwarded to the OCL constructor.
 *
 * @throws std::runtime_error if the sequence length is not a multiple of the
 *         local work size, or if MI is too small to receive the result.
 *
 * Any OpenCL failure prints a message to std::cout and terminates the process
 * with exit(1).
 */
void computeMIonGPU(SequenceSet& sequence, Matrix<float>& MI, bool GPU)
{
	// initializes context and kernel and stores them
	OCL ocl(GPU);

	cl_int oclError1, oclError2;

	timeval start, end;

	// memory sizes
	const size_t sequenceLength = sequence.getSequenceLength();
	const size_t numSequences = sequence.getNumberOfSequences();

	// the result matrix read back from the device has numElements entries
	const size_t numElements = sequenceLength * sequenceLength;
	const size_t sequenceSize = numSequences * sequenceLength;
	const size_t onePointProbsSize = sequenceLength * NUMPROTEINCHARS;

	// number of elements of the host side result matrix
	const size_t resultSize = MI.size();

	// size for a work group: each workgroup computes one matrix entry, thus computes the correlation
	// one time for each character => 25 work items are sufficient
	size_t localWorkSize[2] = { 5, 5 };

	// Validate before anything is allocated so that throwing cannot leak.
	if (sequenceLength % localWorkSize[0] != 0) throw std::runtime_error("sequence length ^ 2 not divisable by local work size");

	// The device result (numElements floats) is read into a host buffer of
	// resultSize floats; refuse to run if that read would overflow it.
	if (resultSize < numElements) throw std::runtime_error("result matrix is smaller than sequence length ^ 2");

	// global work size defines the total amount of threads over all work group, thus needs to be a multiple of the local
	// work size in each dimension.
	size_t globalWorkSize[2] = { sequenceLength, sequenceLength };

	// The kernel arguments are passed with sizeof(cl_uint), so they must be
	// real cl_uint objects rather than the first bytes of a size_t.
	const cl_uint sequenceLengthArg = static_cast<cl_uint>(sequenceLength);
	const cl_uint numSequencesArg = static_cast<cl_uint>(numSequences);

	// host memory, value-initialized so that every float starts at zero
	float * dst = new float[resultSize]();

	// device memory for sequences, one point probablities and resulting matrix
	cl_mem oclDevSrcSequence, oclDevSrcOnePointProbs, oclDevDstMI;

	// create buffer on device, one for each input array
	oclDevSrcSequence = clCreateBuffer(		ocl.oclContext,
											CL_MEM_READ_ONLY,
											sizeof(cl_uchar) * sequenceSize,
											0, &oclError1);

	oclDevSrcOnePointProbs = clCreateBuffer(ocl.oclContext,
											CL_MEM_READ_ONLY,
											sizeof(cl_float) * onePointProbsSize,
											0, &oclError2);
	oclError1 |= oclError2;

	oclDevDstMI = clCreateBuffer(			ocl.oclContext,
											CL_MEM_WRITE_ONLY,
											sizeof(cl_float) * numElements,
											0, &oclError2);
	oclError1 |= oclError2;

	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while allocating buffers" << std::endl;
		exit(1);
	}

	// set buffer to appropriate kernel arguments
	oclError1 = clSetKernelArg(ocl.oclKernel, 0, sizeof(cl_mem), (void*)&oclDevSrcSequence);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 1, sizeof(cl_mem), (void*)&oclDevSrcOnePointProbs);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 2, sizeof(cl_mem), (void*)&oclDevDstMI);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 3, sizeof(cl_uint), &sequenceLengthArg);
	oclError1 |= clSetKernelArg(ocl.oclKernel, 4, sizeof(cl_uint), &numSequencesArg);

	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while setting arguments: " << ocl.oclErrorString(oclError1) << std::endl;
		exit(1);
	}

	// copy host memory to device, non-blocking copy
	oclError1 = clEnqueueWriteBuffer(	ocl.oclCmdQueue,
						oclDevSrcSequence,
						CL_FALSE,
						0,
						sizeof(cl_uchar) * sequenceSize,
						(const void *) sequence.getData(),
						0, 0, 0);

	oclError1 |= clEnqueueWriteBuffer(	ocl.oclCmdQueue,
						oclDevSrcOnePointProbs,
						CL_FALSE,
						0,
						sizeof(cl_float) * onePointProbsSize,
						(const void *) sequence.getOnePointProbs(),
						0, 0, 0);

	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while writing to device " << ocl.oclErrorString(oclError1) << std::endl;
		exit(1);
	}

	// execute kernel LOOPCOUNT times and measure execution time
	// TODO: change LOOPCOUNT to execute the kernel multiple times
	gettimeofday(&start, 0);
	for (int i = 0; i < LOOPCOUNT; ++i) {
		oclError1 = clEnqueueNDRangeKernel(	ocl.oclCmdQueue,
							ocl.oclKernel,
							2,			// dimension
							0,
							globalWorkSize,
							localWorkSize,
							0, 0, 0);


		if (oclError1 != CL_SUCCESS) {
			std::cout << "error while executing kernel: " << ocl.oclErrorString(oclError1) << std::endl;
			exit(1);
		}
	}

	// clFinish blocks until all issued commands so far are completed, necessary for computing execution time
	oclError1 = clFinish(ocl.oclCmdQueue);
	gettimeofday(&end, 0);

	// a failed clFinish means the kernel runs did not complete; do not read back garbage
	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while waiting for kernel completion: " << ocl.oclErrorString(oclError1) << std::endl;
		exit(1);
	}

	// read memory from device, store in temporary array and if no error happened copy to result matrix
	oclError1 = clEnqueueReadBuffer(	ocl.oclCmdQueue,
						oclDevDstMI,
						CL_TRUE,
						0,
						sizeof(cl_float) * numElements,
						dst,
						0, 0, 0);

	if (oclError1 != CL_SUCCESS) {
		std::cout << "error while reading from device: " << ocl.oclErrorString(oclError1) << std::endl;
		exit(1);
	}

	std::cout << "execution time: " 
			<< (end.tv_sec - start.tv_sec ) * 1000 +  ( end.tv_usec - start.tv_usec) / 1000
			<< " milliseconds" << std::endl;

	// fill the matrix with the computed results
	MI.copyElements(dst);

	// release used memory, can cause really bad crashes otherwise
	clReleaseMemObject(oclDevSrcSequence);
	clReleaseMemObject(oclDevSrcOnePointProbs);
	clReleaseMemObject(oclDevDstMI);

	// The temporary host buffer is no longer needed once its contents were
	// handed to the matrix.
	// NOTE(review): assumes copyElements() copies out of dst rather than
	// keeping the pointer - confirm against Matrix.
	delete[] dst;
}
Example #23
0
/**
 * Release a single message on the underlying session.
 *
 * The message's internal id is placed into a one-element SequenceSet, which
 * is passed to session.messageRelease() together with the flag value true.
 *
 * @param m  Message whose internal id is released.
 */
void SessionImpl::releaseImpl(qpid::messaging::Message& m)
{
    // messageRelease() takes a set of ids, so wrap the single id in one.
    SequenceSet releasedIds;
    releasedIds.add(MessageImplAccess::get(m).getInternalId());

    session.messageRelease(releasedIds, true);
}
Example #24
0
/**
 * Reject a single message on the underlying session.
 *
 * The message's internal id is placed into a one-element SequenceSet, which
 * is then passed to session.messageReject().
 *
 * @param m  Message whose internal id is rejected.
 */
void SessionImpl::rejectImpl(qpid::messaging::Message& m)
{
    // messageReject() takes a set of ids, so wrap the single id in one.
    SequenceSet rejectedIds;
    rejectedIds.add(MessageImplAccess::get(m).getInternalId());

    session.messageReject(rejectedIds);
}