/*
 * Returns 1 when `letter` equals the first character of any of the `count`
 * pinyin readings in `pinyins`, 0 otherwise.
 */
static int keyword_matches_first_letter(wchar_t letter, const char **pinyins, int count)
{
    for (int i = 0; i < count; i++) {
        if (letter == pinyins[i][0]) {
            return 1;
        }
    }
    return 0;
}

/*
 * Tries to spell out one complete pinyin reading with the keyword characters
 * beginning at keyword_uni[start]. Readings are tried in order and the first
 * one that is matched in full wins; keyword characters are lowercased before
 * being compared.
 *
 * Returns the index one past the last keyword character consumed, or -1 when
 * no reading is fully spelled out by the keyword.
 */
static int keyword_match_full_pinyin(const char **pinyins, int count,
                                     const wchar_t *keyword_uni, int start,
                                     int keyword_length)
{
    for (int i = 0; i < count; i++) {
        const char *pinyin = pinyins[i];
        int k = start;
        int j = 0;

        while (pinyin[j] != '\0' && k < keyword_length) {
            if (pinyin[j] != pinyin_lowercase(keyword_uni[k])) {
                break;
            }
            j++;
            k++;
        }
        /* Reaching the terminator means the whole reading was matched. */
        if (pinyin[j] == '\0') {
            return k;
        }
    }
    return -1;
}

/*
 * Checks whether `line` starts with `keyword`, allowing hanzi in the line to
 * be matched by pinyin letters in the keyword.
 *
 * The line and the keyword are walked character by character:
 *   - a non-hanzi line character must equal the keyword character exactly;
 *   - a hanzi line character matches an identical hanzi in the keyword;
 *   - a hanzi line character matches keyword letters according to `mode`:
 *       MatchModeFirstLetter: one keyword letter equal to the first letter of
 *                             any pinyin reading of the hanzi;
 *       MatchModeFull:        consecutive keyword letters spelling out one
 *                             complete pinyin reading of the hanzi.
 * The whole keyword must be consumed; the line may be longer than the keyword.
 *
 * line         UTF-8 text to test.
 * line_length  length handed to utf8vector_create for `line`.
 * keyword      UTF-8 keyword; handed to utf8vector_create with length -1
 *              (presumably "NUL-terminated" -- confirm against utf8vector).
 * mode         pinyin matching strategy.
 *
 * Returns -1 when the line does not match (or when the keyword buffer cannot
 * be allocated); otherwise the number of hanzi that were matched through
 * their pinyin. Hanzi matched by an identical hanzi are not counted.
 */
int match_line_with_keyword(const char *line, int line_length, const char *keyword, MatchMode mode)
{
    MYLOG("line_length %d", line_length);

    wchar_t line_char, keyword_char;
    int match_hanzi_count = 0;
    int match_rt = 1;

    utf8vector line_vector = utf8vector_create(line, line_length);
    utf8vector keyword_vector = utf8vector_create(keyword, -1);
    int keyword_length = utf8vector_uni_count(keyword_vector);

    /*
     * Always request at least one element so that an empty keyword does not
     * depend on what a zero-sized allocation returns, and zero-fill so that no
     * slot is ever read uninitialized.
     */
    wchar_t *keyword_uni = calloc(keyword_length > 0 ? (size_t)keyword_length : 1,
                                  sizeof *keyword_uni);
    if (keyword_uni == NULL) {
        utf8vector_free(line_vector);
        utf8vector_free(keyword_vector);
        return -1;
    }

    /* Decode the keyword once; never write past the allocated element count. */
    int keyword_index = 0;
    while (keyword_index < keyword_length &&
           (keyword_char = utf8vector_next_unichar(keyword_vector)) != '\0') {
        keyword_uni[keyword_index++] = keyword_char;
    }

    keyword_index = 0;
    while ((line_char = utf8vector_next_unichar(line_vector)) != '\0' &&
           keyword_index < keyword_length) {
        keyword_char = keyword_uni[keyword_index];

        if (!pinyin_ishanzi(line_char) || pinyin_ishanzi(keyword_char)) {
            /* Plain character, or hanzi against hanzi: exact comparison. */
            if (line_char != keyword_char) {
                match_rt = 0;
                break;
            }
        } else if (pinyin_isabc(keyword_char)) {
            /* Hanzi in the line against letters in the keyword. */
            keyword_char = pinyin_lowercase(keyword_char);

            /* NULL so the free() below is safe even if the lookup sets nothing. */
            const char **pinyins = NULL;
            int count = pinyin_get_pinyins_by_unicode(line_char, &pinyins);

            if (mode == MatchModeFirstLetter) {
                if (keyword_matches_first_letter(keyword_char, pinyins, count)) {
                    match_hanzi_count++;
                } else {
                    match_rt = 0;
                }
            } else if (mode == MatchModeFull) {
                int end = keyword_match_full_pinyin(pinyins, count, keyword_uni,
                                                    keyword_index, keyword_length);
                if (end < 0) {
                    match_rt = 0;
                } else {
                    /* The increment at the bottom of the loop moves to `end`. */
                    keyword_index = end - 1;
                    match_hanzi_count++;
                }
            }

            free(pinyins);
            if (match_rt == 0) {
                break;
            }
        } else {
            /* Keyword character is neither hanzi nor a letter. */
            match_rt = 0;
            break;
        }

        keyword_index++;
    }

    /* The line ended before the whole keyword was consumed. */
    if (match_rt == 1 && keyword_index < keyword_length) {
        match_rt = 0;
    }

    free(keyword_uni);
    utf8vector_free(line_vector);
    utf8vector_free(keyword_vector);

    return match_rt == 0 ? -1 : match_hanzi_count;
}
/*
 * Returns 1 when the reading at `index` repeats an earlier entry of `pinyins`
 * and would therefore print the same text again, 0 otherwise. With
 * `first_letter_only` set only the first letters are compared, otherwise the
 * complete strings.
 */
static int hz2py_already_printed(const char **pinyins, int index, int first_letter_only)
{
    for (int j = 0; j < index; j++) {
        if (first_letter_only) {
            if (pinyins[j][0] == pinyins[index][0]) {
                return 1;
            }
        } else if (strcmp(pinyins[j], pinyins[index]) == 0) {
            return 1;
        }
    }
    return 0;
}

/*
 * Writes `line` to stdout with every hanzi replaced by its pinyin, followed by
 * a newline.
 *
 * line                 UTF-8 input text.
 * line_length          length handed to utf8vector_create for `line`.
 * add_blank            non-zero: print a space after each converted hanzi, and
 *                      before one that directly follows a non-hanzi character.
 * polyphone_support    non-zero: print every reading, separated by '|';
 *                      zero: print only the first reading.
 * first_letter_only    non-zero: print only the first letter of each reading.
 * convert_double_char  non-zero: print fullwidth forms (U+FF01..U+FF5E) and
 *                      the ideographic space (U+3000) as their ASCII
 *                      counterparts.
 * show_tones           non-zero: append the numeric tone after each reading.
 *                      Repeated readings are only suppressed when this is
 *                      zero.
 *
 * Hanzi without any known reading and all other characters are echoed as
 * their original UTF-8 bytes.
 */
void hz2py(const char *line, int line_length, int add_blank, int polyphone_support, int first_letter_only, int convert_double_char, int show_tones)
{
    enum {
        HZ2PY_FULLWIDTH_LOW = 0xFF00,     /* exclusive: first form is U+FF01 */
        HZ2PY_FULLWIDTH_HIGH = 0xFF5F,    /* exclusive: last form is U+FF5E */
        HZ2PY_FULLWIDTH_OFFSET = 0xFEE0,  /* fullwidth code point minus ASCII */
        HZ2PY_IDEOGRAPHIC_SPACE = 0x3000
    };

    wchar_t uni_char;
    wchar_t last_uni_char = 0;
    const char *utf8;
    int utf8_length;
    utf8vector line_vector = utf8vector_create(line, line_length);

    while ((uni_char = utf8vector_next_unichar_with_raw(line_vector, &utf8, &utf8_length)) != '\0') {
        if (pinyin_ishanzi(uni_char)) {
            const char **pinyins = NULL;
            int count = pinyin_get_pinyins_by_unicode(uni_char, &pinyins);

            if (count == 0) {
                /* No reading known: echo the character unchanged. */
                printf("%.*s", utf8_length, utf8);
            } else {
                char *tones = NULL;
                if (show_tones) {
                    pinyin_get_tones_by_unicode(uni_char, &tones);
                }

                /* Separate the pinyin from preceding non-hanzi text. */
                if (add_blank && last_uni_char != 0 && !pinyin_ishanzi(last_uni_char)) {
                    printf(" ");
                }

                int print_count = 0;
                for (int i = 0; i < count; i++) {
                    /*
                     * Without tones, identical readings are indistinguishable
                     * in the output, so each one is printed only once.
                     */
                    if (!show_tones && hz2py_already_printed(pinyins, i, first_letter_only)) {
                        continue;
                    }

                    if (print_count > 0) {
                        printf("|");
                    }
                    if (first_letter_only) {
                        printf("%c", pinyins[i][0]);
                    } else {
                        printf("%s", pinyins[i]);
                    }
                    print_count++;

                    /* Skip the tone if the lookup did not supply any. */
                    if (show_tones && tones != NULL) {
                        printf("%d", tones[i]);
                    }
                    if (!polyphone_support) {
                        break;
                    }
                }

                if (add_blank) {
                    printf(" ");
                }
                free(tones);
            }
            free(pinyins);
        } else if (convert_double_char && uni_char > HZ2PY_FULLWIDTH_LOW && uni_char < HZ2PY_FULLWIDTH_HIGH) {
            /* %c expects an int; wchar_t is not an int on every platform. */
            printf("%c", (int)(uni_char - HZ2PY_FULLWIDTH_OFFSET));
        } else if (convert_double_char && uni_char == HZ2PY_IDEOGRAPHIC_SPACE) {
            printf(" ");
        } else {
            printf("%.*s", utf8_length, utf8);
        }

        last_uni_char = uni_char;
    }

    printf("\n");
    utf8vector_free(line_vector);
}