예제 #1
0
/// Times repeated smith_waterman() calls on two random "ATGC" strings and
/// prints one comma-separated record to std::cout.
///
/// Record layout: seed, hpx::get_os_thread_count(), length, grain_size,
/// iterations, and the value returned by the timer's elapsed().
///
/// @param rng         Generator both sequences are drawn from.
/// @param seed        Only echoed in the output record; rng is not seeded here.
/// @param length      Number of characters in each generated sequence.
/// @param grain_size  Forwarded unchanged to smith_waterman().
/// @param iterations  Number of smith_waterman() calls inside the timed region.
void benchmark_sw(
    boost::random::mt19937& rng
  , boost::uint32_t seed
  , boost::uint32_t length
  , boost::uint32_t grain_size
  , boost::uint32_t iterations = 1 << 10
    )
{
    std::string const alphabet("ATGC");

    boost::random::uniform_int_distribution<boost::uint32_t>
        pick(0, alphabet.size() - 1);

    // Build both inputs one character at a time. The draw order (a first,
    // then b) determines which random numbers land in which sequence.
    std::string a;
    std::string b;
    for (boost::uint32_t pos = 0; pos < length; ++pos)
    {
        a.push_back(alphabet[pick(rng)]);
        b.push_back(alphabet[pick(rng)]);
    }

    // Timed region: only the alignment calls sit between timer construction
    // and the elapsed() query.
    hpx::util::high_resolution_timer timer;

    for (boost::uint32_t run = 0; run < iterations; ++run)
    {
        smith_waterman(a, b, grain_size);
    }

    double const elapsed = timer.elapsed();

    std::cout << seed << ",";
    std::cout << hpx::get_os_thread_count() << ",";
    std::cout << length << ",";
    std::cout << grain_size << ",";
    std::cout << iterations << ",";
    std::cout << elapsed << "\n";
}
예제 #2
0
int main()
{
  GestureDB db("gesture_data.db");
  db.load();

  //cout<<*db<<endl;
  /// fetch the 7th gesture
  Gesture* gesture = db[7];
  cout<<*gesture<<endl;
  cout<<gesture->numPoint();

  gesture->filter();
  //cout<<gesture->numPoint();

  gesture->sample(17);
  cout<<*gesture<<endl;

  cout<<"similarity score"<<smith_waterman(gesture, gesture)<<endl;

  //db.save();

  return 0;
}
예제 #3
0
파일: main.c 프로젝트: misaelh/AMS
/*
 * Command-line entry point: runs smith_waterman() on the two sequences given
 * as arguments.
 *
 * Usage: swalign SEQ1 SEQ2
 *
 * Prints the usage line and exits with status 1 unless exactly two arguments
 * are supplied; otherwise exits with status 0. The alignment result is
 * computed but not printed.
 */
int main(int argc, const char **argv) {

    if (argc != 3) {
        printf("usage: swalign SEQ1 SEQ2\n");
        exit(1);
    }

    int q_len = strlen(argv[1]);
    int db_len = strlen(argv[2]);

    /* strlen() does not count the terminating NUL, so each working buffer
       needs one extra byte; without it strcpy() writes past the array. */
    char q[q_len + 1], db[db_len + 1];
    int result;

    strcpy(q, argv[1]);
    strcpy(db, argv[2]);

    /* The trailing 12 and 2 are passed through unchanged; presumably scoring
       parameters -- confirm against smith_waterman()'s declaration. */
    result = smith_waterman(q,q_len,db,db_len,12,2);
    (void)result; /* not reported anywhere yet */

    exit(0);
}
예제 #4
0
int main (int argn, char** argv) {
    // 1. parse cmd 
    parse_cmd_line(argn, argv);

    // 2. input DNA sequence file
    int numSeq = 0;
    SequenceSet allSeqs (0, Sequence());
    parse_seqs_file(allSeqs, numSeq, trainFname);
    vector<int> lenSeqs (numSeq, 0);
    for (int n = 0; n < numSeq; n ++) 
        lenSeqs[n] = allSeqs[n].size();

    // pre-info
    cout << "#########################################################" << endl;
    cout << "ScoreMatch: " << C_M;
    cout << ", ScoreInsertion: " << C_I;
    cout << ", ScoreDeletion: " << C_D;
    cout << ", ScoreMismatch: " << C_MM << endl;
    for (int n = 0; n < numSeq; n ++) 
        sequence_dump(allSeqs, n);

    // 3. relaxed convex program: ADMM-based algorithm
    string dir_path = string(trainFname)+".trace/";
    //system((string("rm -rf ")+dir_path).c_str());
    //system((string("mkdir ")+dir_path).c_str());
    // omp_set_num_threads(NUM_THREADS);
    int T2 = get_init_model_length (lenSeqs) + LENGTH_OFFSET; // model_seq_length
    time_t begin = time(NULL);
    vector<Tensor4D> W = CVX_ADMM_MSA (allSeqs, lenSeqs, T2, dir_path);
    time_t end = time(NULL);

    // 4. output the result
    // a. tuple view
    cout << ">>>>>>>>>>>>>>>>>>>>>>>TupleView<<<<<<<<<<<<<<<<<<<<<<<<" << endl;
    for (int n = 0; n < numSeq; n ++) {
        cout << "n = " << n << endl;
        tensor4D_dump(W[n]);
    }
    // b. sequence view
    cout << ">>>>>>>>>>>>>>>>>>>>>>>SequenceView<<<<<<<<<<<<<<<<<<<<<<<<" << endl;
    int T2m = T2;
    Tensor tensor (T2m, Matrix (NUM_DNA_TYPE, vector<double>(NUM_DNA_TYPE, 0.0)));
    Matrix mat_insertion (T2m, vector<double> (NUM_DNA_TYPE, 0.0));
    for (int n = 0; n < numSeq; n ++) {
        int T1 = W[n].size();
        for (int i = 0; i < T1; i ++) { 
            for (int j = 0; j < T2m; j ++) {
                for (int d = 0; d < NUM_DNA_TYPE; d ++) {
                    for (int m = 0; m < NUM_MOVEMENT; m ++) {
                        if (m == DELETION_A or m == MATCH_A)
                            tensor[j][d][dna2T3idx('A')] += max(0.0, W[n][i][j][d][m]);
                        else if (m == DELETION_T or m == MATCH_T)
                            tensor[j][d][dna2T3idx('T')] += max(0.0, W[n][i][j][d][m]);
                        else if (m == DELETION_C or m == MATCH_C)
                            tensor[j][d][dna2T3idx('C')] += max(0.0, W[n][i][j][d][m]);
                        else if (m == DELETION_G or m == MATCH_G)
                            tensor[j][d][dna2T3idx('G')] += max(0.0, W[n][i][j][d][m]);
                        else if (m == DELETION_START or m == MATCH_START)
                            tensor[j][d][dna2T3idx('*')] += max(0.0, W[n][i][j][d][m]);
                        else if (m == DELETION_END or m == MATCH_END)
                            tensor[j][d][dna2T3idx('#')] += max(0.0, W[n][i][j][d][m]);
                        else if (m == INSERTION) 
                            mat_insertion[j][d] += max(0.0, W[n][i][j][d][m]);
                    }
                }
            }
        }
    }
    Trace trace (0, Cell(2)); // 1d: j, 2d: ATCG
    refined_viterbi_algo (trace, tensor, mat_insertion);
    // for (int i = 0; i < trace.size(); i ++) 
    //    cout << trace[i].toString() << endl;
    for (int n = 0; n < numSeq; n ++) {
        char buffer [50];
        sprintf (buffer, "Seq%5d", n);
        cout << buffer << ": ";
        for (int j = 0; j < allSeqs[n].size(); j ++) 
            cout << allSeqs[n][j];
        cout << endl;
    }
    Sequence recSeq;
    cout << "SeqRecov: ";
    for (int i = 0; i < trace.size(); i ++) 
        if (trace[i].action != INSERTION) {
            cout << trace[i].acidB;
            recSeq.push_back(trace[i].acidB);
            if (trace[i].acidB == '#') break;
        }
    cout << endl;
    cout << ">>>>>>>>>>>>>>>>>>>>>>>MatchingView<<<<<<<<<<<<<<<<<<<<<<<<" << endl;
    // NOTE: rounding scheme
    SequenceSet allModelSeqs, allDataSeqs;
    for (int n = 0; n < numSeq; n ++) {
        Sequence model_seq = recSeq, data_seq = allSeqs[n];
       data_seq.erase(data_seq.begin());
       model_seq.erase(model_seq.begin());
        data_seq.erase(data_seq.end()-1);
        model_seq.erase(model_seq.end()-1);

        // align sequences locally
        Plane plane (data_seq.size()+1, Trace(model_seq.size()+1, Cell(2)));
        Trace trace (0, Cell(2));
        smith_waterman (model_seq, data_seq, plane, trace);

        // 4. output the result
        model_seq.clear(); data_seq.clear();
        for (int i = 0; i < trace.size(); i ++) 
            model_seq.push_back(trace[i].acidA);
        for (int i = 0; i < trace.size(); i ++) 
            data_seq.push_back(trace[i].acidB);
        allModelSeqs.push_back(model_seq);
        allDataSeqs.push_back(data_seq);
        for (int i = 0; i < model_seq.size(); i ++) cout << model_seq[i];
        cout << endl;
        for (int i = 0; i < data_seq.size(); i ++) cout << data_seq[i];
        cout << endl;
    }
    cout << ">>>>>>>>>>>>>>>>>>>>>ClustalOmegaView<<<<<<<<<<<<<<<<<<<<<<" << endl;
    writeClusterView(string(trainFname)+".co", allModelSeqs, allDataSeqs);
	
    
    cout << "#########################################################" << endl;
    cout << "Time Spent: " << end - begin << " seconds" << endl;
    return 0;
}
예제 #5
0
/// Runs the ADMM iteration for the relaxed alignment program and returns W_2.
///
/// Each iteration: solves first_subproblem() per sequence, then
/// second_subproblem() jointly, updates the multiplier Y via
/// tensor4D_lin_update(), and writes one tracking line to cerr containing
/// C o Z, the largest |W_1 - W_2| entry, and the cost of a rounded solution
/// (decoded with refined_viterbi_algo and re-aligned with smith_waterman).
/// The loop ends after MAX_ADMM_ITER iterations, or earlier when early
/// stopping is enabled and the largest |W_1 - W_2| entry drops below
/// EPS_Wdiff. Before returning, W_1 and W_2 are dumped to cout.
///
/// @param allSeqs   Input sequences; one set of tensors is kept per sequence.
/// @param lenSeqs   Forwarded to tensor5D_init() and second_subproblem().
/// @param T2        Model sequence length (second tensor dimension).
/// @param dir_path  Currently unused (the only consumer is commented out).
/// @return          W_2 after the last iteration.
Tensor5D CVX_ADMM_MSA (SequenceSet& allSeqs, vector<int>& lenSeqs, int T2, string& dir_path) {
    // 1. initialization
    int numSeq = allSeqs.size();
    vector<Tensor4D> C (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    vector<Tensor4D> W_1 (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    vector<Tensor4D> W_2 (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    vector<Tensor4D> Y (numSeq, Tensor4D(0, Tensor(T2, Matrix(NUM_DNA_TYPE,
                        vector<double>(NUM_MOVEMENT, 0.0)))));
    tensor5D_init (C, allSeqs, lenSeqs, T2);
    tensor5D_init (W_1, allSeqs, lenSeqs, T2);
    tensor5D_init (W_2, allSeqs, lenSeqs, T2);
    tensor5D_init (Y, allSeqs, lenSeqs, T2);
    set_C (C, allSeqs);

    // 2. ADMM iteration
    int iter = 0;
    double mu = MU;
    while (iter < MAX_ADMM_ITER) {
        // 2a. first subproblem, one independent solve per sequence.
        // The guard macro keeps its existing (misspelled) name because it is
        // defined outside this function.
#ifdef PARRALLEL_COMPUTING
#pragma omp parallel for
#endif
        for (int n = 0; n < numSeq; n++)
            first_subproblem (W_1[n], W_2[n], Y[n], C[n], mu, allSeqs[n]);

        // 2b. second subproblem, over all sequences at once
        second_subproblem (W_1, W_2, Y, mu, allSeqs, lenSeqs);

        // 2d. update Y: Y += mu * (W_1 - W_2)
        for (int n = 0; n < numSeq; n ++)
            tensor4D_lin_update (Y[n], W_1[n], W_2[n], mu);

        // 2e. tracking quantities
        double CoZ = 0.0;
        for (int n = 0; n < numSeq; n++)
            CoZ += tensor4D_frob_prod(C[n], W_2[n]);
        double W1mW2 = 0.0; // largest absolute entry of W_1 - W_2
        for (int n = 0; n < numSeq; n ++) {
            int T1 = W_1[n].size();
            for (int i = 0; i < T1; i ++)
                for (int j = 0; j < T2; j ++)
                    for (int d = 0; d < NUM_DNA_TYPE; d ++)
                        for (int m = 0; m < NUM_MOVEMENT; m ++) {
                            double value = (W_1[n][i][j][d][m] - W_2[n][i][j][d][m]);
                            W1mW2 = max( fabs(value), W1mW2 ) ;
                        }
        }

        // Rounded solution, same decoding as the driver's "sequence view":
        // accumulate the non-negative part of W_2 per model position and
        // symbol, keeping INSERTION moves in a separate matrix.
        int T2m = T2;
        Tensor tensor (T2m, Matrix (NUM_DNA_TYPE, vector<double>(NUM_DNA_TYPE, 0.0)));
        Matrix mat_insertion (T2m, vector<double> (NUM_DNA_TYPE, 0.0));
        for (int n = 0; n < numSeq; n ++) {
            int T1 = W_2[n].size();
            for (int i = 0; i < T1; i ++) {
                for (int j = 0; j < T2m; j ++) {
                    for (int d = 0; d < NUM_DNA_TYPE; d ++) {
                        for (int m = 0; m < NUM_MOVEMENT; m ++) {
                            if (m == DELETION_A or m == MATCH_A)
                                tensor[j][d][dna2T3idx('A')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_T or m == MATCH_T)
                                tensor[j][d][dna2T3idx('T')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_C or m == MATCH_C)
                                tensor[j][d][dna2T3idx('C')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_G or m == MATCH_G)
                                tensor[j][d][dna2T3idx('G')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_START or m == MATCH_START)
                                tensor[j][d][dna2T3idx('*')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == DELETION_END or m == MATCH_END)
                                tensor[j][d][dna2T3idx('#')] += max(0.0, W_2[n][i][j][d][m]);
                            else if (m == INSERTION)
                                mat_insertion[j][d] += max(0.0, W_2[n][i][j][d][m]);
                        }
                    }
                }
            }
        }
        Trace trace (0, Cell(2)); // 1d: j, 2d: ATCG
        refined_viterbi_algo (trace, tensor, mat_insertion);

        // Recovered model sequence: non-insertion steps up to the first '#'.
        Sequence recSeq;
        for (int i = 0; i < trace.size(); i ++)
            if (trace[i].action != INSERTION) {
                recSeq.push_back(trace[i].acidB);
                if (trace[i].acidB == '#') break;
            }

        // Re-align every input against the recovered sequence and count the
        // non-matching columns.
        SequenceSet allModelSeqs, allDataSeqs;
        double obj_rounded = 0.0;
        for (int n = 0; n < numSeq; n ++) {
            // copies with the first and last element removed
            Sequence model_seq = recSeq, data_seq = allSeqs[n];
            data_seq.erase(data_seq.begin());
            model_seq.erase(model_seq.begin());
            data_seq.erase(data_seq.end()-1);
            model_seq.erase(model_seq.end()-1);

            // align sequences locally
            Plane plane (data_seq.size()+1, Trace(model_seq.size()+1, Cell(2)));
            Trace alignment (0, Cell(2));
            smith_waterman (model_seq, data_seq, plane, alignment);

            // objective of the rounded result: unit cost for every insertion,
            // deletion and mismatch, zero for a match
            for (int i = 0; i < alignment.size(); i ++) {
                if (alignment[i].acidA == '-' && alignment[i].acidB != '-')
                    obj_rounded += 1.0;//C_I;
                else if (alignment[i].acidA != '-' && alignment[i].acidB == '-')
                    obj_rounded += 1.0;//C_D;
                else if (alignment[i].acidA == alignment[i].acidB)
                    obj_rounded += 0.0;//C_M;
                else if (alignment[i].acidA != alignment[i].acidB)
                    obj_rounded += 1.0;//C_MM;
            }

            // aligned forms, kept for the (currently disabled) per-iteration
            // cluster-view dump below
            model_seq.clear(); data_seq.clear();
            for (int i = 0; i < alignment.size(); i ++)
                model_seq.push_back(alignment[i].acidA);
            for (int i = 0; i < alignment.size(); i ++)
                data_seq.push_back(alignment[i].acidB);
            allModelSeqs.push_back(model_seq);
            allDataSeqs.push_back(data_seq);
        }
        //writeClusterView( dir_path+to_string(iter), allModelSeqs, allDataSeqs );

        // Bounded formatting: "%6f" of a very large double needs far more
        // than 50 characters, so snprintf is used to truncate instead of
        // writing past the buffers.
        char COZ_val [50], w1mw2_val [50];
        snprintf(COZ_val, sizeof(COZ_val), "%6f", CoZ);
        snprintf(w1mw2_val, sizeof(w1mw2_val), "%6f", W1mW2);
        cerr << "ADMM_iter = " << iter
            << ", C o Z = " << COZ_val
            << ", Wdiff_max = " << w1mw2_val
            << ", obj_rounded = " << obj_rounded
            << endl;

        // 2f. stopping condition. The criterion is the W_1/W_2 gap even
        // though the log message mentions CoZ.
        if (ADMM_EARLY_STOP_TOGGLE and iter > MIN_ADMM_ITER)
            if ( W1mW2 < EPS_Wdiff ) {
                cerr << "CoZ Converges. ADMM early stop!" << endl;
                break;
            }
        iter ++;
    }
    cout << "W_1: " << endl;
    for (int i = 0; i < numSeq; i ++) tensor4D_dump(W_1[i]);
    cout << "W_2: " << endl;
    for (int i = 0; i < numSeq; i ++) tensor4D_dump(W_2[i]);
    return W_2;
}
예제 #6
0
/// Reads batches of sequence pairs through two FASTAReader objects and prints
/// an alignment for every pair in each batch.
///
/// argv[1] and argv[2] name the two input files. Returns 1 after printing an
/// error message when fewer than two file names are given, 0 otherwise.
int main(int argc, char* argv[])
{
    if(argc < 3)
    {
        std::cout << "Error: no se especificaron suficientes archivos de entrada.";
        std::cout << std::endl;
        return 1;
    }

    std::string filename1 = argv[1];
    std::string filename2 = argv[2];

    FASTAReader reader1(filename1);
    FASTAReader reader2(filename2);

    reader1.setDefault(0);
    reader2.setDefault(1);

    // substitution matrix (declared but not passed to any call below)
    int smatrix[]{ 5, -4, -4, -4,
                -4,  5, -4, -4,
                -4, -4,  5, -4,
                -4, -4, -4,  5};

    // scoring parameters handed to smith_waterman()
    int match = 5;
    int mismatch = -4;
    int gap_open   = 10;
    int gap_extend =  1;

    // Everything declared inside the parallel region is per-thread; the two
    // readers above are declared outside it.
    #pragma omp parallel
    {
        int seq_len = DEFAULT_SEQ_LEN;

        // sequence storage for one batch of VSIZE pairs
        Buffer<int16_t> seqs1(seq_len * VSIZE, ALNSIZE);
        Buffer<int16_t> seqs2(seq_len * VSIZE, ALNSIZE);

        // identifiers of the sequences in the batch
        std::vector<std::string> seqs1_ids(VSIZE);
        std::vector<std::string> seqs2_ids(VSIZE);

        // length of every sequence in the batch
        int seqs1_len[VSIZE];
        int seqs2_len[VSIZE];

        // flag storage filled by smith_waterman() and read by sw_backtrack(),
        // plus per-pair score and end position
        Buffer<int8_t> flags(seq_len * seq_len * VSIZE, ALNSIZE);
        int16_t __attribute((aligned(ALNSIZE))) scores[VSIZE];
        int16_t __attribute((aligned(ALNSIZE))) ipos[VSIZE];
        int16_t __attribute((aligned(ALNSIZE))) jpos[VSIZE];

        // working arrays; F is reset to -inf and H to 0 before every batch
        int16_t inf = gap_open + gap_extend + 1;
        int bsize = 128 * VSIZE;
        Buffer<int16_t> F(bsize, ALNSIZE);
        Buffer<int16_t> H(bsize, ALNSIZE);

        // text of the two aligned sequences for one pair
        char aln1[256];
        char aln2[256];

        // alignment start position, handed to sw_backtrack()
        int x0, y0;

        while(read_seqs(reader1, reader2, &seqs1, &seqs2, seqs1_len, seqs2_len,
              &seqs1_ids, &seqs2_ids))
        {
            // one more than the longest sequence on each side of the batch
            int max_x = *std::max_element(seqs1_len, seqs1_len + VSIZE) + 1;
            int max_y = *std::max_element(seqs2_len, seqs2_len + VSIZE) + 1;

            F.clear(-inf);
            H.clear(0);

            smith_waterman(seqs1.data(), seqs2.data(), match, mismatch, gap_open, gap_extend,
                           flags.data(), scores, ipos, jpos, max_x, max_y, F.data(), H.data());

            // backtrack and print each pair of the batch in turn
            for(int lane = 0; lane < VSIZE; lane++)
            {
                sw_backtrack(lane, flags.data(), seqs1.data(), seqs2.data(), max_x, max_y,
                    aln1, aln2, ipos[lane], jpos[lane], x0, y0);
                print_alignment (stdout, seqs1_ids, seqs2_ids, scores,
                    aln1, aln2, strlen(aln1), lane);
            }
        }
    }
    return 0;
}
예제 #7
0
/*
 * Command-line front end for the similarity metrics.
 *
 * Usage: <program> <SimMetric> <string1> <string2>
 *
 * - With the wrong number of arguments: prints the usage text and the list
 *   of metric names from SIMMETS, returns 1.
 * - With SimMetric == "all": runs every metric in turn on the two strings.
 * - Otherwise: runs the metric selected by which_type(argv[1]) and prints
 *   one line with the metric name, the two strings, the raw metric value and
 *   the similarity as a percentage. Returns 1 for an unknown metric name.
 *
 * The strings come straight from the command line, so every formatted write
 * into the fixed 50-byte buffers is bounded with snprintf(); overlong input
 * is truncated in the report rather than overflowing the stack.
 */
int main(int argc, char *argv[]) {
    if (argc != 4) {
        printf("usage: \n\t$ %s <SimMetric> <string1> <string2>\n", basename(argv[0]));
        printf("\nWhere SimMetric is one of:\n");
        int i;
        for (i=0; i < SIMMETC; i++) {
            if (i > 0)
                printf(",");
            printf(" %s", SIMMETS[i]);
        }
        printf("\n");
        return (1);
    }
    else if (strcmp(argv[1], "all") == 0) {
        /* Metric names in the order they are reported. */
        static const char *const all_metrics[] = {
            "block_distance",
            "cosine",
            "dice",
            "euclidean_distance",
            "jaccard",
            "jaro",
            "jaro_winkler",
            "levenshtein",
            "matching_coefficient",
            "monge_elkan",
            "needleman_wunch",
            "overlap_coefficient",
            "qgrams_distance",
            "smith_waterman",
            "smith_waterman_gotoh",
            "soundex",
            "metaphone",
            "double_metaphone"
        };
        size_t k;

        /* Re-enter main() once per metric with argv[1] swapped out; the
           name is only read, never written through. */
        for (k = 0; k < sizeof all_metrics / sizeof all_metrics[0]; k++) {
            argv[1] = (char *) all_metrics[k];
            main(argc, argv);
        }
        return (EXIT_SUCCESS);
    }
    else {
        float similarity = 0;
        const char *sm_name;
        char metrics[50], compare[50];

        snprintf(compare, sizeof compare, "%10s & %-10s", argv[2], argv[3]);
        switch (which_type(argv[1])) {
            case 0:
            case 1:
                sm_name = "Block Distance";
                snprintf(metrics, sizeof metrics, "%d", block_distance(argv[2], argv[3]));
                similarity = block_distance_similarity(argv[2], argv[3]);
                break;
            case 2:
            case 3:
                sm_name = "Cosine Similarity";
                similarity = cosine_similarity(argv[2], argv[3]);
                snprintf(metrics, sizeof metrics, "%f", similarity);
                break;
            case 4:
                sm_name = "Dice Similarity";
                similarity = dice_similarity(argv[2], argv[3]);
                snprintf(metrics, sizeof metrics, "%f", similarity);
                break;
            case 5:
            case 6:
                sm_name = "Euclidean Distance";
                snprintf(metrics, sizeof metrics, "%3.2f", euclidean_distance(argv[2], argv[3]));
                similarity = euclidean_distance_similarity(argv[2], argv[3]);
                break;
            case 7:
            case 8:
                sm_name = "Jaccard Similarity";
                similarity = jaccard_similarity(argv[2], argv[3]);
                snprintf(metrics, sizeof metrics, "%f", similarity);
                break;
            case 9:
            case 10:
                sm_name = "Jaro Similarity";
                similarity = jaro_similarity(argv[2], argv[3]);
                snprintf(metrics, sizeof metrics, "%f", similarity);
                break;
            case 11:
            case 12:
                sm_name = "Jaro Winkler Similarity";
                similarity = jaro_winkler_similarity(argv[2], argv[3]);
                snprintf(metrics, sizeof metrics, "%f", similarity);
                break;
            case 13:
            case 14:
                sm_name = "Levenshtein Distance";
                snprintf(metrics, sizeof metrics, "%d", levenshtein(argv[2], argv[3]));
                similarity = levenshtein_similarity(argv[2], argv[3]);
                break;
            case 15:
            case 16:
                sm_name = "Matching Coefficient SimMetrics";
                snprintf(metrics, sizeof metrics, "%3.2f", matching_coefficient(argv[2], argv[3]));
                similarity = matching_coefficient_similarity(argv[2], argv[3]);
                break;
            case 17:
            case 18:
                sm_name = "Monge Elkan Similarity";
                similarity = monge_elkan_similarity(argv[2], argv[3]);
                snprintf(metrics, sizeof metrics, "%f", similarity);
                break;
            case 19:
            case 20:
                sm_name = "Needleman Wunch SimMetrics";
                snprintf(metrics, sizeof metrics, "%3.2f", needleman_wunch(argv[2], argv[3]));
                similarity = needleman_wunch_similarity(argv[2], argv[3]);
                break;
            case 21:
            case 22:
                sm_name = "Overlap Coefficient Similarity";
                similarity = overlap_coefficient_similarity(argv[2], argv[3]);
                snprintf(metrics, sizeof metrics, "%f", similarity);
                break;
            case 23:
            case 24:
                sm_name = "QGrams Distance";
                snprintf(metrics, sizeof metrics, "%d", qgrams_distance(argv[2], argv[3]));
                similarity = qgrams_distance_similarity(argv[2], argv[3]);
                break;
            case 25:
            case 26:
                sm_name = "Smith Waterman SimMetrics";
                snprintf(metrics, sizeof metrics, "%3.2f", smith_waterman(argv[2], argv[3]));
                similarity = smith_waterman_similarity(argv[2], argv[3]);
                break;
            case 27:
            case 28:
                sm_name = "Smith Waterman Gotoh SimMetrics";
                snprintf(metrics, sizeof metrics, "%3.2f", smith_waterman_gotoh(argv[2], argv[3]));
                similarity = smith_waterman_gotoh_similarity(argv[2], argv[3]);
                break;
            case 29:
            case 30: {
                /* The phonetic encoders return strings that this code
                   releases with free(). */
                sm_name = "Soundex Phonetics";
                char *s1 = soundex(argv[2]);
                char *s2 = soundex(argv[3]);
                snprintf(metrics, sizeof metrics, "%s & %s", s1, s2);
                free(s1);
                free(s2);
                similarity = soundex_similarity(argv[2], argv[3]);
                break;
            }
            case 31:
            case 32: {
                sm_name = "Metaphone Phonetics";
                char *m1 = metaphone(argv[2]);
                char *m2 = metaphone(argv[3]);
                snprintf(metrics, sizeof metrics, "%s & %s", m1, m2);
                free(m1);
                free(m2);
                similarity = metaphone_similarity(argv[2], argv[3]);
                break;
            }
            case 33:
            case 34: {
                sm_name = "Double Metaphone Phonetics";
                char *dm1 = double_metaphone(argv[2]);
                char *dm2 = double_metaphone(argv[3]);
                snprintf(metrics, sizeof metrics, "%s & %s", dm1, dm2);
                free(dm1);
                free(dm2);
                similarity = double_metaphone_similarity(argv[2], argv[3]);
                break;
            }
            default:
               printf("Unknown SimMetric %s, not found.\n", argv[1]);
               return (1);
        }

        printf("%-31s between %-25s is %12s ", sm_name, compare, metrics);
        printf("and yields a %3.0f%% similarity\n", similarity * 100);

        return (EXIT_SUCCESS);
    }
}
예제 #8
0
/**
 * Builds the element-to-element map between representations X and Y.
 *
 * Steps, in order:
 *  1. store_sse_pair_score() fills the pair-score matrix in map.
 *  2. The custom gap penalty arrays (when allocated) are zeroed.
 *  3. For every anchor pair (anchor_x[a], anchor_y[a]) the gap penalty at
 *     both indices is set to options.far_far_away, and every other cell in
 *     that row and that column gets score options.far_far_away and cosine
 *     -0.9999, leaving the anchor cell itself untouched.
 *  4. smith_waterman() runs on the pair scores and fills map->x2y / map->y2x.
 *  5. map_assigned_score() is called, then mapped pairs whose cosine is
 *     below options.far_away_cosine are overwritten with
 *     options.far_far_away; the remaining pairs are counted and their
 *     absolute length difference is averaged into map->avg_length_mismatch.
 *
 * @param penalty_params  Gap penalty settings; its custom arrays may be NULL.
 * @param X_rep, Y_rep    The two representations; N_full gives the sizes.
 * @param R, alpha        Passed through to store_sse_pair_score().
 * @param F_effective     Out: set to minus the number of retained pairs.
 * @param map             In/out: scores, cosines and the x2y / y2x mapping.
 * @param anchor_x, anchor_y, anchor_size
 *                        Optional anchors; ignored unless both arrays are
 *                        non-NULL.
 * @return 0. The process exits with status 1 if options.current_algorithm
 *         is not SEQUENTIAL.
 */
int find_map ( Penalty_parametrization * penalty_params,
               Representation *X_rep,  Representation *Y_rep,
               double ** R,
               double alpha, double * F_effective,  Map *map,
               int *anchor_x, int *anchor_y, int anchor_size) {

    int i, j, map_size, a;
    int NX=X_rep->N_full, NY=Y_rep->N_full;
    double aln_score;

    /* "sse_pair_score" is the matrix of values (exp weight)*(-1 for SSE mismatch) */
    store_sse_pair_score (X_rep, Y_rep, R,  alpha, map);

    /* start from zero custom gap penalties */
    if (penalty_params->custom_gap_penalty_x)
        memset (penalty_params->custom_gap_penalty_x, 0, NX*sizeof(double) );
    if (penalty_params->custom_gap_penalty_y)
        memset (penalty_params->custom_gap_penalty_y, 0, NY*sizeof(double) );

    /* if the anchors are given, somebody insists
       they should be considered as already aligned */
    if (anchor_x && anchor_y) {
        for (a=0; a<anchor_size; a++){

            /* row of the anchor: every column except the anchored one */
            i = anchor_x[a];
            if (penalty_params->custom_gap_penalty_x)
                penalty_params->custom_gap_penalty_x[i] = options.far_far_away;
            for (j=0; j<NY; j++) {
                if ( j ==  anchor_y[a] )  continue;
                map->sse_pair_score[i][j]  =  options.far_far_away;
                map->cosine[i][j] = -0.9999;
            }

            /* column of the anchor: every row except the anchored one */
            j = anchor_y[a];
            if (penalty_params->custom_gap_penalty_y)
                penalty_params->custom_gap_penalty_y[j] = options.far_far_away;
            for (i=0; i<NX; i++) {
                if ( i ==  anchor_x[a] )  continue;
                map->sse_pair_score [i][j] = options.far_far_away;
                map->cosine[i][j] = -0.9999;
            }
        }
    }

    /* dynamic programming using the sse_pair_score */
    if (options.current_algorithm == SEQUENTIAL) {
        double **similarity_score =  map->sse_pair_score;
        smith_waterman (penalty_params, NX, NY, similarity_score, map->x2y, map->y2x, &aln_score);

    } else {
        /* shouldn't we have checked for this before? */
        fprintf (stderr, "%s:%d: Unrecognized algorithm type.\n",__FILE__, __LINE__);
        exit (1);
    }

    map_assigned_score (X_rep, map);
    map -> avg_length_mismatch = 0;
    map_size = 0;
    for (i=0; i<NX; i++) {
        j = map->x2y[i];
        if ( j<0) continue;

        if ( map->cosine[i][j] < options.far_away_cosine) {
            /* pair is too dissimilar: overwrite both directions */
            map->x2y[i] =  options.far_far_away;
            map->y2x[j] =  options.far_far_away; /*everything crashes; investigate later */
        } else {
            map_size++;
            map->avg_length_mismatch += fabs(X_rep->length[i] - Y_rep->length[j]);
        }
    }

    /* With no retained pair the average is left at 0 rather than dividing
       by zero. */
    if (map_size > 0)
        map->avg_length_mismatch /= map_size;
    *F_effective = -map_size;

    return 0;
}