Exemple #1
0
// Compute the similarity of two equal-length vectors using the measure
//   selected by 'sim'.
//
// vec1, vec2  the vectors to compare; either may be NULL
// veclen      number of elements in each vector
// normalize   forwarded unchanged to the measures that accept it
//
// Returns 1.0 when veclen is zero, -1.0 when either vector is missing,
//   0.0 for FrCM_NONE or an unrecognized measure (the latter is also
//   reported through FrMissedCase), and otherwise the value of the
//   selected measure.  Measures implemented as distances/dissimilarities
//   are converted with (1.0 - value); a few others are rescaled as noted
//   on the individual cases below.
double FrVectorSimilarity(FrClusteringMeasure sim,
                          const double *vec1, const double *vec2,
                          size_t veclen, bool normalize)
{
    double score = 0.0 ;

    if (veclen == 0)
    {
        // two empty vectors are considered identical
        score = 1.0 ;
    }
    else if (!vec1 || !vec2)
    {
        // a missing vector yields the maximum difference
        score = -1.0 ;
    }
    else
    {
        switch (sim)
        {
        case FrCM_COSINE:
            score = cosine_similarity(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_EUCLIDEAN:
            score = 1.0 - euclidean_distance(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_MANHATTAN:
            score = 1.0 - manhattan_distance(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_JACCARD:
            score = jaccard_coefficient(vec1,vec2,veclen) ;
            break ;
        case FrCM_SIMPSON:
            score = simpson_coefficient(vec1,vec2,veclen) ;
            break ;
        case FrCM_EXTSIMPSON:
            score = extended_simpson_coefficient(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_DICE:
            score = dice_coefficient(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_ANTIDICE:
            score = antidice_coefficient(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_TANIMOTO:
            score = tanimoto_coefficient(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_BRAUN_BLANQUET:
            score = braun_blanquet_coefficient(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_KULCZYNSKI1:
            score = kulczynski_measure1(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_KULCZYNSKI2:
            score = kulczynski_measure2(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_OCHIAI:
            score = ochiai_measure(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_SOKALSNEATH:
            score = sokal_sneath_measure(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_MCCONNAUGHEY:
            // the raw measure lies in -1.0...+1.0, so shift and halve it
            score = (mcConnaughey_measure(vec1,vec2,veclen,normalize)+1.0) / 2.0 ;
            break ;
        case FrCM_LANCEWILLIAMS:
            score = 1.0 - lance_williams_distance(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_BRAYCURTIS:
            score = bray_curtis_measure(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_CANBERRA:
            score = 1.0 - canberra_measure(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_CIRCLEPROD:
            // scale by the vector length (nonzero here, see check above)
            score = circle_product(vec1,vec2,veclen,normalize) / veclen ;
            break ;
        case FrCM_CZEKANOWSKI:
            score = czekanowski_measure(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_ROBINSON:
            // NOTE(review): halving presumably maps a 0...2 range onto
            //   0...1 -- confirm against robinson_coefficient()
            score = robinson_coefficient(vec1,vec2,veclen,normalize) / 2.0 ;
            break ;
        case FrCM_DRENNAN:
            score = 1.0 - drennan_dissimilarity(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_SIMILARITYRATIO:
            score = similarity_ratio(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_JENSENSHANNON:
            score = 1.0 - jensen_shannon_divergence(vec1,vec2,veclen,normalize) ;
            break ;
        case FrCM_MOUNTFORD:
            score = mountford_coefficient(vec1,vec2,veclen) ;
            break ;
        case FrCM_FAGER_MCGOWAN:
            score = fager_mcgowan_coefficient(vec1,vec2,veclen) ;
            break ;
        case FrCM_TRIPARTITE:
            score = tripartite_similarity_index(vec1,vec2,veclen) ;
            break ;
        case FrCM_BIN_DICE:
            score = binary_dice_coefficient(vec1,vec2,veclen) ;
            break ;
        case FrCM_BIN_ANTIDICE:
            score = binary_antidice_coefficient(vec1,vec2,veclen) ;
            break ;
        case FrCM_BIN_GAMMA:
            score = binary_gamma_coefficient(vec1,vec2,veclen) ;
            break ;
        case FrCM_NONE:
            score = 0.0 ;
            break ;
        default:
            // unknown measure: report it and fall back to "no similarity"
            FrMissedCase("FrVectorSimilarity()") ;
            score = 0.0 ;
            break ;
        }
    }
    return score ;
}
/*
 * Write two phonetic codes into buf as "<code1> & <code2>" (truncating to
 * buflen if necessary) and then release both with free().
 *
 * A NULL code is printed as an empty string instead of being handed to "%s",
 * which would be undefined behavior.
 */
static void format_code_pair(char *buf, size_t buflen, char *code1, char *code2)
{
    snprintf(buf, buflen, "%s & %s", code1 ? code1 : "", code2 ? code2 : "");
    free(code1);
    free(code2);
}

/*
 * Command-line driver: compare two strings with a named similarity metric.
 *
 *   argv[1]  metric name, or "all" to run a fixed list of metrics in turn
 *   argv[2]  first string
 *   argv[3]  second string
 *
 * With the wrong number of arguments a usage message listing SIMMETS is
 * printed.  Otherwise one line per metric is printed, containing the metric's
 * display name, the two strings, the raw metric value (or the phonetic codes)
 * and the similarity as a percentage.
 *
 * Returns EXIT_SUCCESS on success; EXIT_FAILURE for a usage error, an unknown
 * metric name, or (in "all" mode) when any individual metric run failed.
 */
int main(int argc, char *argv[]) {
    /* Generous fixed size for the two display buffers; all writes into them
     * are bounded with snprintf, so longer inputs are truncated rather than
     * overflowing the stack. */
    enum { TEXT_BUF_SIZE = 256 };

    if (argc != 4) {
        printf("usage: \n\t$ %s <SimMetric> <string1> <string2>\n", basename(argv[0]));
        printf("\nWhere SimMetric is one of:\n");
        for (int i = 0; i < SIMMETC; i++) {
            if (i > 0)
                printf(",");
            printf(" %s", SIMMETS[i]);
        }
        printf("\n");
        return EXIT_FAILURE;
    }

    if (strcmp(argv[1], "all") == 0) {
        /* Metrics exercised by "all", in output order. */
        static const char *const all_metrics[] = {
            "block_distance",
            "cosine",
            "dice",
            "euclidean_distance",
            "jaccard",
            "jaro",
            "jaro_winkler",
            "levenshtein",
            "matching_coefficient",
            "monge_elkan",
            "needleman_wunch",
            "overlap_coefficient",
            "qgrams_distance",
            "smith_waterman",
            "smith_waterman_gotoh",
            "soundex",
            "metaphone",
            "double_metaphone",
        };
        int status = EXIT_SUCCESS;

        /* Re-enter main once per metric with argv[1] swapped for its name. */
        for (size_t i = 0; i < sizeof all_metrics / sizeof all_metrics[0]; i++) {
            /* The name is only read by the nested call, so dropping const
             * here is safe. */
            argv[1] = (char *)all_metrics[i];
            if (main(argc, argv) != EXIT_SUCCESS)
                status = EXIT_FAILURE;
        }
        return status;
    }

    float similarity = 0;
    const char *sm_name = NULL;
    char metrics[TEXT_BUF_SIZE];
    char compare[TEXT_BUF_SIZE];

    snprintf(compare, sizeof compare, "%10s & %-10s", argv[2], argv[3]);

    /* which_type() turns the metric name into an index; most metrics are
     * reachable through two adjacent indices (presumably alternative
     * spellings in SIMMETS -- verify against that table). */
    switch (which_type(argv[1])) {
        case 0:
        case 1:
            sm_name = "Block Distance";
            snprintf(metrics, sizeof metrics, "%d", block_distance(argv[2], argv[3]));
            similarity = block_distance_similarity(argv[2], argv[3]);
            break;
        case 2:
        case 3:
            sm_name = "Cosine Similarity";
            similarity = cosine_similarity(argv[2], argv[3]);
            snprintf(metrics, sizeof metrics, "%f", similarity);
            break;
        case 4:
            sm_name = "Dice Similarity";
            similarity = dice_similarity(argv[2], argv[3]);
            snprintf(metrics, sizeof metrics, "%f", similarity);
            break;
        case 5:
        case 6:
            sm_name = "Euclidean Distance";
            snprintf(metrics, sizeof metrics, "%3.2f", euclidean_distance(argv[2], argv[3]));
            similarity = euclidean_distance_similarity(argv[2], argv[3]);
            break;
        case 7:
        case 8:
            sm_name = "Jaccard Similarity";
            similarity = jaccard_similarity(argv[2], argv[3]);
            snprintf(metrics, sizeof metrics, "%f", similarity);
            break;
        case 9:
        case 10:
            sm_name = "Jaro Similarity";
            similarity = jaro_similarity(argv[2], argv[3]);
            snprintf(metrics, sizeof metrics, "%f", similarity);
            break;
        case 11:
        case 12:
            sm_name = "Jaro Winkler Similarity";
            similarity = jaro_winkler_similarity(argv[2], argv[3]);
            snprintf(metrics, sizeof metrics, "%f", similarity);
            break;
        case 13:
        case 14:
            sm_name = "Levenshtein Distance";
            snprintf(metrics, sizeof metrics, "%d", levenshtein(argv[2], argv[3]));
            similarity = levenshtein_similarity(argv[2], argv[3]);
            break;
        case 15:
        case 16:
            sm_name = "Matching Coefficient SimMetrics";
            snprintf(metrics, sizeof metrics, "%3.2f", matching_coefficient(argv[2], argv[3]));
            similarity = matching_coefficient_similarity(argv[2], argv[3]);
            break;
        case 17:
        case 18:
            sm_name = "Monge Elkan Similarity";
            similarity = monge_elkan_similarity(argv[2], argv[3]);
            snprintf(metrics, sizeof metrics, "%f", similarity);
            break;
        case 19:
        case 20:
            sm_name = "Needleman Wunch SimMetrics";
            snprintf(metrics, sizeof metrics, "%3.2f", needleman_wunch(argv[2], argv[3]));
            similarity = needleman_wunch_similarity(argv[2], argv[3]);
            break;
        case 21:
        case 22:
            sm_name = "Overlap Coefficient Similarity";
            similarity = overlap_coefficient_similarity(argv[2], argv[3]);
            snprintf(metrics, sizeof metrics, "%f", similarity);
            break;
        case 23:
        case 24:
            sm_name = "QGrams Distance";
            snprintf(metrics, sizeof metrics, "%d", qgrams_distance(argv[2], argv[3]));
            similarity = qgrams_distance_similarity(argv[2], argv[3]);
            break;
        case 25:
        case 26:
            sm_name = "Smith Waterman SimMetrics";
            snprintf(metrics, sizeof metrics, "%3.2f", smith_waterman(argv[2], argv[3]));
            similarity = smith_waterman_similarity(argv[2], argv[3]);
            break;
        case 27:
        case 28:
            sm_name = "Smith Waterman Gotoh SimMetrics";
            snprintf(metrics, sizeof metrics, "%3.2f", smith_waterman_gotoh(argv[2], argv[3]));
            similarity = smith_waterman_gotoh_similarity(argv[2], argv[3]);
            break;
        case 29:
        case 30: {
            /* Phonetic metrics show the two encodings instead of a number. */
            char *s1 = soundex(argv[2]);
            char *s2 = soundex(argv[3]);
            sm_name = "Soundex Phonetics";
            format_code_pair(metrics, sizeof metrics, s1, s2);
            similarity = soundex_similarity(argv[2], argv[3]);
            break;
        }
        case 31:
        case 32: {
            char *m1 = metaphone(argv[2]);
            char *m2 = metaphone(argv[3]);
            sm_name = "Metaphone Phonetics";
            format_code_pair(metrics, sizeof metrics, m1, m2);
            similarity = metaphone_similarity(argv[2], argv[3]);
            break;
        }
        case 33:
        case 34: {
            char *dm1 = double_metaphone(argv[2]);
            char *dm2 = double_metaphone(argv[3]);
            sm_name = "Double Metaphone Phonetics";
            format_code_pair(metrics, sizeof metrics, dm1, dm2);
            similarity = double_metaphone_similarity(argv[2], argv[3]);
            break;
        }
        default:
            printf("Unknown SimMetric %s, not found.\n", argv[1]);
            return EXIT_FAILURE;
    }

    printf("%-31s between %-25s is %12s ", sm_name, compare, metrics);
    printf("and yields a %3.0f%% similarity\n", similarity * 100);

    return EXIT_SUCCESS;
}