Esempio n. 1
0
/*
 * Check whether `line` starts with text that matches `keyword`.
 *
 * Both strings are decoded from UTF-8 and compared one code point at a time,
 * starting at the beginning of each. For every character of the line:
 *   - if it is a hanzi and the current keyword character is a hanzi, the two
 *     must be identical;
 *   - if it is a hanzi and the current keyword character is a letter, the
 *     (lower-cased) keyword is compared with the pinyin readings of the hanzi:
 *       MatchModeFirstLetter - the keyword letter must equal the first letter
 *                              of one of the readings;
 *       MatchModeFull        - one complete reading must be spelled out by
 *                              the keyword starting at the current position;
 *   - if it is a hanzi and the keyword character is neither, the match fails;
 *   - otherwise the two characters must be identical (case-sensitive).
 * The whole keyword has to be consumed; the line may be longer.
 *
 * line         UTF-8 text to test
 * line_length  length handed to utf8vector_create() for `line`
 * keyword      UTF-8 keyword; handed to utf8vector_create() with length -1
 * mode         how letters in the keyword are compared with pinyin readings
 *
 * Returns -1 when the line does not match (or when the keyword buffer cannot
 * be allocated), otherwise the number of hanzi in the line that were matched
 * through their pinyin. Hanzi matched by an identical hanzi are not counted.
 */
int match_line_with_keyword(const char *line, int line_length, const char *keyword, MatchMode mode)
{
    MYLOG("line_length %d", line_length);

    wchar_t line_char, keyword_char;
    int match_hanzi_count = 0;

    utf8vector line_vector = utf8vector_create(line, line_length);
    utf8vector keyword_vector = utf8vector_create(keyword, -1);

    int keyword_length = utf8vector_uni_count(keyword_vector);
    int keyword_index = 0;

    /*
     * Ask for at least one element: malloc(0) is allowed to return NULL, and
     * an empty keyword must not be mistaken for an allocation failure.
     */
    size_t keyword_capacity = keyword_length > 0 ? (size_t)keyword_length : 1;
    wchar_t *keyword_uni = malloc(keyword_capacity * sizeof *keyword_uni);
    if (keyword_uni == NULL)
    {
        utf8vector_free(line_vector);
        utf8vector_free(keyword_vector);
        return -1;
    }

    /* Decode the keyword once so it can be indexed (and re-read) freely. */
    while (keyword_index < keyword_length
            && (keyword_char = utf8vector_next_unichar(keyword_vector)) != '\0')
    {
        keyword_uni[keyword_index] = keyword_char;
        keyword_index++;
    }
    /* Never look at slots that were not filled in by the loop above. */
    keyword_length = keyword_index;

    int match_rt = 1;
    keyword_index = 0;

    while ((line_char = utf8vector_next_unichar(line_vector)) != '\0'
            && keyword_index < keyword_length)
    {
        keyword_char = keyword_uni[keyword_index];

        if (pinyin_ishanzi(line_char))
        {
            if (pinyin_ishanzi(keyword_char))
            {
                /* hanzi against hanzi: must be the very same character */
                if (line_char != keyword_char)
                {
                    match_rt = 0;
                    break;
                }
            }
            else if (pinyin_isabc(keyword_char))
            {
                /* hanzi against letter(s): compare with the pinyin readings */
                keyword_char = pinyin_lowercase(keyword_char);

                /*
                 * Start from NULL so that the free() below stays valid even
                 * if the lookup does not store anything into `pinyins`.
                 */
                const char **pinyins = NULL;
                int count = pinyin_get_pinyins_by_unicode(line_char, &pinyins);

                if (mode == MatchModeFirstLetter)
                {
                    int finded = 0;
                    for (int i = 0; i < count; i++)
                    {
                        if (keyword_char == pinyins[i][0])
                        {
                            finded = 1;
                            break;
                        }
                    }

                    if (finded == 0)
                        match_rt = 0;
                    else
                        match_hanzi_count++;
                }
                else if (mode == MatchModeFull)
                {
                    int finded = 0;
                    for (int i = 0; i < count; i++)
                    {
                        int kindex_start = keyword_index;
                        const char *pinyin = pinyins[i];
                        int j = 0;
                        char pinyin_char;

                        /* walk the reading and the keyword in lock-step */
                        while ((pinyin_char = pinyin[j]) != '\0' && kindex_start < keyword_length)
                        {
                            if (pinyin_char != pinyin_lowercase(keyword_uni[kindex_start]))
                            {
                                break;
                            }
                            j++;
                            kindex_start++;
                        }

                        /* matched only if the whole reading was consumed */
                        if (pinyin_char == '\0')
                        {
                            finded = 1;
                            /*
                             * Leave the index on the last consumed letter;
                             * the increment at the end of the outer loop
                             * moves it past the reading.
                             */
                            keyword_index = kindex_start - 1;
                            break;
                        }
                    }

                    if (finded == 0)
                        match_rt = 0;
                    else
                        match_hanzi_count++;
                }

                free(pinyins);

                if (match_rt == 0)
                    break;
            }
            else
            {
                /* keyword character is neither a hanzi nor a letter */
                match_rt = 0;
                break;
            }
        }
        else
        {
            /* non-hanzi line character: exact comparison */
            if (line_char != keyword_char)
            {
                match_rt = 0;
                break;
            }
        }

        keyword_index++;
    }

    /* The line ran out before the whole keyword was consumed. */
    if (match_rt == 1 && keyword_index < keyword_length)
        match_rt = 0;

    free(keyword_uni);
    utf8vector_free(line_vector);
    utf8vector_free(keyword_vector);

    if (match_rt == 0)
        return -1;
    else
        return match_hanzi_count;
}
Esempio n. 2
0
File: hz2py.c Progetto: donliu/unp
/*
 * Print `line` to stdout with every hanzi replaced by its pinyin, followed by
 * a newline.
 *
 * line                 UTF-8 text to convert
 * line_length          length handed to utf8vector_create() for `line`
 * add_blank            non-zero: print a space after every converted hanzi,
 *                      and one before it when the previous character was not
 *                      a hanzi
 * polyphone_support    non-zero: print all readings of a hanzi separated by
 *                      '|'; zero: print only the first reading
 * first_letter_only    non-zero: print only the first letter of each reading
 * convert_double_char  non-zero: print the code points U+FF01..U+FF5E as the
 *                      character 0xFEE0 positions lower, and U+3000 as a
 *                      plain space
 * show_tones           non-zero: look up the tones of the hanzi and print
 *                      tones[i] as a decimal number after the i-th reading;
 *                      in this mode repeated readings are not collapsed
 *
 * A hanzi for which no reading is found, and any other character that is not
 * converted, is copied through as its original UTF-8 bytes.
 */
void hz2py(const char *line,
        int line_length,
        int add_blank,
        int polyphone_support,
        int first_letter_only,
        int convert_double_char,
        int show_tones)
{
    /* Code points used by the convert_double_char handling. */
    enum
    {
        FULLWIDTH_FIRST = 0xFF01,    /* 65281 */
        FULLWIDTH_LAST = 0xFF5E,     /* 65374 */
        FULLWIDTH_OFFSET = 0xFEE0,   /* 65248 */
        IDEOGRAPHIC_SPACE = 0x3000   /* 12288 */
    };

    wchar_t uni_char;
    wchar_t last_uni_char = 0;
    const char *utf8;
    int utf8_length;

    utf8vector line_vector = utf8vector_create(line, line_length);

    while ((uni_char = utf8vector_next_unichar_with_raw(line_vector, &utf8, &utf8_length)) != '\0')
    {
        if (pinyin_ishanzi(uni_char))
        {
            const char **pinyins = NULL;
            int count = pinyin_get_pinyins_by_unicode(uni_char, &pinyins);

            if (count == 0)
            {
                /* no reading known: pass the original bytes through */
                printf("%.*s", utf8_length, utf8);
            }
            else
            {
                char *tones = NULL;
                int print_count = 0;

                if (show_tones)
                    pinyin_get_tones_by_unicode(uni_char, &tones);

                /* separate the pinyin from a preceding non-hanzi character */
                if (add_blank && last_uni_char != 0 && !pinyin_ishanzi(last_uni_char))
                    printf(" ");

                for (int i = 0; i < count; i++)
                {
                    const char *pinyin = pinyins[i];

                    if (!show_tones)
                    {
                        /*
                         * Without tones, a reading (or first letter) that was
                         * already seen at a lower index is printed only once.
                         */
                        int has_print = 0;
                        for (int j = 0; j < i; j++)
                        {
                            int same = first_letter_only
                                ? (pinyins[j][0] == pinyin[0])
                                : (strcmp(pinyins[j], pinyin) == 0);
                            if (same)
                            {
                                has_print = 1;
                                break;
                            }
                        }

                        if (has_print)
                            continue;
                    }

                    if (print_count > 0)
                        printf("|");

                    if (first_letter_only)
                        printf("%c", pinyin[0]);
                    else
                        printf("%s", pinyin);
                    print_count++;

                    /* tones stays NULL if the lookup did not provide any */
                    if (show_tones && tones != NULL)
                        printf("%d", tones[i]);

                    if (!polyphone_support)
                        break;
                }

                if (add_blank)
                    printf(" ");

                free(tones);
            }

            free(pinyins);
        }
        else if (convert_double_char
                && uni_char >= FULLWIDTH_FIRST && uni_char <= FULLWIDTH_LAST)
        {
            /* %c expects an int, and wchar_t need not be one */
            printf("%c", (int)(uni_char - FULLWIDTH_OFFSET));
        }
        else if (convert_double_char && uni_char == IDEOGRAPHIC_SPACE)
        {
            printf(" ");
        }
        else
        {
            printf("%.*s", utf8_length, utf8);
        }

        last_uni_char = uni_char;
    }

    printf("\n");
    utf8vector_free(line_vector);
}