/*
 *  call-seq:
 *    utf8_titleize(string)
 *
 *  Returns a title case string: the first alphabetic character that follows
 *  the start of the string, a space or a punctuation character is replaced by
 *  its title-case form. Returns nil when the input cannot be decoded as UTF-8.
 *
 *    Glib.utf8_titleize('привет всем') #=> Привет Всем
 */
static VALUE utf8_titleize(VALUE self, VALUE string)
{
  VALUE titleized;
  gchar *utf8;
  long pos, byte_len, char_len;
  gunichar *ucs4, ch;
  gboolean at_word_start = TRUE;

  Check_Type(string, T_STRING);
  byte_len = RSTRING_LEN(string);

  ucs4 = g_utf8_to_ucs4(StringValuePtr(string), byte_len, NULL, &char_len, NULL);
  if (!ucs4) {
    /* Decoding failed. */
    return Qnil;
  }

  for (pos = 0; pos < char_len; pos++) {
    ch = ucs4[pos];

    /* Only the first letter after a word boundary is converted. */
    if (at_word_start && g_unichar_isalpha(ch)) {
      ucs4[pos] = g_unichar_totitle(ch);
      at_word_start = FALSE;
    }

    /* Whitespace and punctuation both open a new word. */
    if (g_unichar_isspace(ch) || g_unichar_ispunct(ch)) {
      at_word_start = TRUE;
    }
  }

  utf8 = g_ucs4_to_utf8(ucs4, -1, NULL, NULL, NULL);
  titleized = rb_str_new2(utf8);

  g_free(ucs4);
  g_free(utf8);

  return titleized;
}
static void test_valid_turkish() { long nwritten; long nread; char *res; int i; long size; gunichar *verify; unsigned char *back; unsigned char buf[2]; static int map_size = sizeof(gsm_turkish_to_unicode_map) / sizeof(unsigned short) / 2; for (i = 0; i < map_size; i++) { unsigned short c = gsm_turkish_to_unicode_map[i*2]; if (c & 0x1b00) { buf[0] = 0x1b; buf[1] = c & 0x7f; size = 2; } else { size = 1; buf[0] = c & 0x7f; } res = convert_gsm_to_utf8_with_lang(buf, size, &nread, &nwritten, 0, 1, 1); g_assert(res); if (g_test_verbose()) g_print("size: %ld, nread:%ld, nwritten:%ld, %s\n", size, nread, nwritten, res); g_assert(nread == size); verify = g_utf8_to_ucs4(res, -1, NULL, NULL, NULL); g_assert(verify[0] == gsm_turkish_to_unicode_map[i*2+1]); g_assert(verify[1] == 0); g_assert(nwritten == UTF8_LENGTH(verify[0])); back = convert_utf8_to_gsm_with_lang(res, -1, &nread, &nwritten, 0, 1, 1); g_assert(back); g_assert(nwritten == size); if (c & 0x1b00) { g_assert(back[0] == 0x1b); g_assert(back[1] == (c & 0x7f)); } else { g_assert(back[0] == (c & 0x7f)); } g_free(back); g_free(verify); g_free(res); } }
/*
 * utf8_to_ucs2:
 * @utf8: NUL-terminated UTF-8 input.
 * @ucs2: destination buffer.
 * @ucs2_len: capacity of @ucs2 in gunichar2 units, including the terminator.
 *
 * Copies @utf8 into @ucs2 as UCS-2. Code points that do not fit into a single
 * 16-bit unit (>= 0x10000) and surrogate values (0xd800-0xdfff) are dropped.
 * At most @ucs2_len - 1 input code points are examined.
 *
 * Returns TRUE on success; FALSE if @ucs2 is NULL, @ucs2_len leaves no room
 * for the terminator, or @utf8 cannot be decoded.
 */
BOOL
utf8_to_ucs2(const gchar *utf8, gunichar2 *ucs2, int ucs2_len)
{
	int i;
	glong items_read = 0;
	glong count = 0;
	gunichar *ucs4;
	gunichar2 *ptr;

	/* Without room for at least the terminator nothing can be written. */
	if (ucs2 == NULL || ucs2_len < 1) {
		return FALSE;
	}

	ucs2_len--;	/* Space for null terminator */

	ucs4 = g_utf8_to_ucs4(utf8, -1, &items_read, &count, NULL);
	if (ucs4 == NULL) {
		return FALSE;
	}

	ptr = ucs2;
	for (i = 0; (i < count) && (i < ucs2_len); i++) {
		if (ucs4[i] < 0x10000 && !(ucs4[i] >= 0xd800 && ucs4[i] < 0xe000)) {
			*ptr = (gunichar2)ucs4[i];
			ptr++;
		}	/* we're simply ignoring any chars that don't fit into ucs2 */
	}

	/*
	 * Terminate right after the last unit actually written. Using the loop
	 * index here would leave uninitialised units in front of the terminator
	 * whenever a character was skipped above.
	 */
	*ptr = 0;

	/* free the intermediate ucs4 string */
	GdipFree(ucs4);

	return TRUE;
}
int is_name_char(const char *offset) { /* TODO: manage length*/ gunichar *converted; gunichar value; if (*offset == '\0') return FALSE; if (is_name_start_char(offset)) return TRUE; /* Easy ASCII-only tests */ if (*offset == '-') return TRUE; if (*offset == '.') return TRUE; if (*offset >= '0' && *offset <= '9') return TRUE; /* TODO: error checking */ converted = g_utf8_to_ucs4(offset, 1, NULL, NULL, NULL); value = *converted; free(converted); /* More expensive Unicode checks */ if (value == 0xB7) return TRUE; if (value >= 0x0300 && value <= 0x036F) return TRUE; if (value >= 0x203F && value <= 0x2040) return TRUE; return FALSE; }
/*
 * Returns the code point located immediately before the cursor in the
 * client's surrounding text.
 *
 * Returns 0 when the client does not support surrounding text, when the
 * cursor is at position 0, when the text cannot be decoded as UTF-8, or when
 * the reported cursor position lies beyond the decoded text.
 */
static gunichar
lx_tn_engine_get_prev_surrounding_char (LxTNEngine *lx_tn_engine,
                                        IBusEngine *ibus_engine)
{
  IBusText *surrounding;
  guint     cursor_pos;
  guint     anchor_pos;
  gunichar *u_surrounding;
  glong     n_chars = 0;
  gunichar  ret = 0;

  if (!is_client_support_surrounding (ibus_engine))
    return 0;

  ibus_engine_get_surrounding_text (ibus_engine,
                                    &surrounding, &cursor_pos, &anchor_pos);
  u_surrounding = g_utf8_to_ucs4 (ibus_text_get_text (surrounding), -1,
                                  NULL, &n_chars, NULL);
  if (u_surrounding)
    {
      /* Only index the array when the cursor really lies inside the text. */
      if (cursor_pos > 0 && (gulong) cursor_pos <= (gulong) n_chars)
        ret = u_surrounding[cursor_pos - 1];
      g_free (u_surrounding);
    }

  return ret;
}
/** score_string: * @str input string * @desired_glyphs string containing highly desired glyphs * * Generates an ipsum with an attempt at maximizing the amount of differing * neighbour pairs. * * Returns the score for a string, a higher score is a better string. */ static int score_string (const char *str, const char *desired_glyphs) { /* we pick slightly larger than a power of two, to aovid aliasing of things * starting on multiples of 512 in the unicode set. */ #define ADJ_DIM 1023 gunichar *ustr; char adjacency_matrix[ADJ_DIM*ADJ_DIM]={0,}; gunichar *p; if (!str || str[0] == 0) return 0; ustr = g_utf8_to_ucs4 (str, -1, NULL, NULL, NULL); if (!ustr) return 0; /* walk throguh the string ..*/ for (p = ustr; p[1]; p++) { gunichar x = p[0]; /* .. using the current .. */ gunichar y = p[1]; /* .. and the next characters unicode position ..*/ if (x==' ' || y == ' ') continue; /* (bailing if one of them is a space) */ x %= ADJ_DIM; /* with unicode positions wrapped down to our */ y %= ADJ_DIM; /* matrix dimensions */ /* mark cell in matrix as visited */ adjacency_matrix[y * ADJ_DIM + x] = 1; } /* count number of distinct pairs encountered (permitting some collisions, * in a bloom-filter like manner) */ { int i; int sum = 0; if (desired_glyphs) for (i = 0; ustr[i]; i++) { int j; for (j = 0; desired_glyphs[j]; j++) if (desired_glyphs[j] == ustr[i]) sum ++; } for (i = 0; i < ADJ_DIM * ADJ_DIM ; i ++) sum += adjacency_matrix[i] * 2; g_free (ustr); return sum; } }
void SmkyManufacturerDatabase::setFistAndLastLetters(const std::string & firstLetters, const std::string & lastLetters, int minLength) { if (m_hunspell && m_lastHunspellResult) { m_hunspell->free_list(&m_lastHunspellResult, m_lastHunspellResultCount); m_lastHunspellResult = NULL; } m_lastFirstLastLetterResults.clear(); auto_g_free_array<gunichar> first16 = g_utf8_to_ucs4(firstLetters.c_str(), -1, NULL, NULL, NULL); auto_g_free_array<gunichar> last16 = g_utf8_to_ucs4(lastLetters.c_str(), -1, NULL, NULL, NULL); if (first16 && last16) { gunichar * p = first16; while (*p) *p = g_unichar_tolower(*p), ++p; p = last16; while (*p) *p = g_unichar_tolower(*p), ++p; for (std::map<std::string, int>::iterator iter = m_words.begin(); iter != m_words.end(); ++iter) { const std::string & word = iter->first; const char * wordStr = word.c_str(); gunichar firstLetter = g_unichar_tolower(g_utf8_get_char(wordStr)); if (wcschr(first16.as<wchar_t>(), (wchar_t) firstLetter)) { // first letter is a match! const char * next = wordStr; int wordLength = 1; while ((next = g_utf8_next_char(wordStr)) && *next) wordStr = next, ++wordLength; if (wordLength >= minLength) { gunichar lastLetter = g_unichar_tolower(g_utf8_get_char(wordStr)); if (wcschr(last16.as<wchar_t>(), (wchar_t) lastLetter)) { DEBUG_CALLBACK("First-last letter match: %s-%s -> %s", firstLetters.c_str(), lastLetters.c_str(), word.c_str()); m_lastFirstLastLetterResults.push_back(word.c_str()); } } else { DEBUG_CALLBACK("'%s' is discarded because it's too short: %d letters, %d min", word.c_str(), wordLength, minLength); } } } } }
/*
 * Converts `str` to UCS-4 NUM_ITERATIONS times, discarding each result.
 * `len` is not used: the conversion relies on the NUL terminator of `str`.
 * Always returns 0.
 */
static int
grind_utf8_to_ucs4 (const char *str, gsize len)
{
  int remaining = NUM_ITERATIONS;

  while (remaining-- > 0)
    g_free (g_utf8_to_ucs4 (str, -1, NULL, NULL, NULL));

  return 0;
}
/**
 * gsdl_tokenizer_new_from_string:
 * @str: String to be parsed.
 * @err: Return location for a %GError to be set on failure, may be NULL.
 *
 * Creates a new tokenizer consuming the given string. The filename will be set to "<string>".
 * The string is decoded from UTF-8 up front; if that fails, @err is set by the
 * conversion and no tokenizer is created.
 *
 * Returns: A new %GSDLTokenizer, or NULL on failure.
 */
GSDLTokenizer* gsdl_tokenizer_new_from_string(const char *str, GError **err) {
	GSDLTokenizer* self = g_slice_new0(GSDLTokenizer);
	self->filename = "<string>";

	self->stringbuf = g_utf8_to_ucs4(str, -1, NULL, NULL, err);
	if (!self->stringbuf) {
		/* Give the half-built tokenizer back to the slice allocator. */
		g_slice_free(GSDLTokenizer, self);
		return NULL;
	}

	self->channel = NULL;
	self->line = 1;
	self->col = 1;
	self->peek_avail = false;

	return self;
}
/*
 * Returns the text between the startedit and endedit marks of tb's buffer as
 * a newly allocated, NUL-terminated UCS-4 string in NFC form. The caller owns
 * the result and releases it with g_free(). Returns NULL if the text cannot
 * be converted.
 */
gunichar *gglk_text_line_input_get(GglkText *tb)
{
    GtkTextIter b, e;
    gchar *line_utf8, *line_utf8_normal;
    gunichar *line_ucs4;

    gtk_text_buffer_get_iter_at_mark(tb->buffer, &b, tb->startedit);
    gtk_text_buffer_get_iter_at_mark(tb->buffer, &e, tb->endedit);

    line_utf8 = gtk_text_buffer_get_text(tb->buffer, &b, &e, FALSE);
    line_utf8_normal = g_utf8_normalize(line_utf8, -1, G_NORMALIZE_NFC);

    /*
     * Convert the normalised text (it used to be computed and then ignored).
     * g_utf8_normalize() returns NULL for invalid UTF-8; fall back to the raw
     * text in that case.
     */
    line_ucs4 = g_utf8_to_ucs4(line_utf8_normal ? line_utf8_normal : line_utf8,
                               -1, NULL, NULL, NULL);

    g_free(line_utf8_normal);
    g_free(line_utf8);

    return line_ucs4;
}
/* Currently not used */
/*
 * Fills StringDetails[i].Width with the horizontal advance of the i-th code
 * point of the UTF-8 string `utf8`, using the FreeType face behind gdiFont.
 * At most StringDetailElements entries are written.
 *
 * Returns the number of entries written; 0 if the face cannot be locked, the
 * string cannot be decoded, or the string is empty.
 */
static int
CalculateStringWidthsUTF8 (cairo_t *ct, GDIPCONST GpFont *gdiFont, const BYTE *utf8, unsigned long StringDetailElements, GpStringDetailStruct *StringDetails)
{
	FT_Face			face;
	unsigned long		i;
	unsigned long		limit;
	gunichar		*ucs4 = NULL;
	cairo_font_face_t	*Font;
	GpStringDetailStruct	*CurrentDetail;
	glong			NumOfGlyphs = 0;	/* left untouched by a failed conversion */
	cairo_matrix_t		matrix;

#ifdef DRAWSTRING_DEBUG
	printf("CalculateStringWidths(font, %s, %lu, details) called\n", utf8, StringDetailElements);
#endif

	Font = (cairo_font_face_t *)gdiFont->cairofnt;
	face = gdip_cairo_ft_font_lock_face(Font);
	if (!face)
		return 0;

	cairo_get_font_matrix(ct, &matrix);
	cairo_matrix_scale(&matrix, gdiFont->sizeInPixels, gdiFont->sizeInPixels);

	ucs4 = g_utf8_to_ucs4 ((const gchar *) utf8, (glong)-1, NULL, &NumOfGlyphs, NULL);

	if ((ucs4 == NULL) || (NumOfGlyphs <= 0)) {
		/* Release everything acquired so far before bailing out. */
		if (ucs4 != NULL)
			GdipFree(ucs4);
		gdip_cairo_ft_font_unlock_face(Font);
		return 0;
	}

	/* Never write past the caller's array. */
	limit = (unsigned long) NumOfGlyphs;
	if (limit > StringDetailElements)
		limit = StringDetailElements;

	CurrentDetail = StringDetails;
	for (i = 0; i < limit; i++) {
		FT_Load_Glyph (face, FT_Get_Char_Index (face, ucs4[i]), FT_LOAD_DEFAULT);
		CurrentDetail->Width = DOUBLE_FROM_26_6 (face->glyph->advance.x);
		CurrentDetail++;
	}

	gdip_cairo_ft_font_unlock_face(Font);

	GdipFree(ucs4);

#ifdef DRAWSTRING_DEBUG
	printf("CalculateStringWidths: string >%s< translated into %ld glyphs\n", utf8, NumOfGlyphs);
#endif
	return (int) limit;
}
std::wstring fromUtf8(const std::string &str) { long readed, writed; wchar_t *errMsg = NULL; wchar_t *res = g_utf8_to_ucs4(str.c_str(), str.length(), &readed, &writed, &errMsg); if (! res) { if (errMsg) throw Exception(errMsg); else throw Exception(L"Error converting text from UTF-8"); } std::wstring s(res); free(res); return s; }
/*
 * Returns a pointer into view->string for the character at column x of the
 * row selected by y, or NULL when no line with segments can be found.
 *
 * y is first flipped against the widget height; if it then exceeds the
 * number of lines, x is reset to 0 and y to the line count minus one. Lines
 * without segments are skipped forward. A wide character counts as two
 * columns.
 */
static char *
gnt_text_view_get_p(GntTextView *view, int x, int y)
{
	int n;
	int i = 0;
	GntWidget *wid = GNT_WIDGET(view);
	GntTextLine *line;
	GList *lines;
	GList *segs;
	GntTextSegment *seg;
	gchar *pos;

	n = g_list_length(view->list);
	y = wid->priv.height - y;
	if (n < y) {
		x = 0;
		y = n - 1;
	}

	lines = g_list_nth(view->list, y - 1);
	if (!lines)
		return NULL;

	/* Skip forward over lines that carry no segments. */
	do {
		line = lines->data;
		lines = lines->next;
	} while (line && !line->segments && lines);

	if (!line || !line->segments) /* no valid line */
		return NULL;

	segs = line->segments;
	seg = (GntTextSegment *)segs->data;
	pos = view->string->str + seg->start;
	x = MIN(x, line->length);

	while (++i <= x) {
		gunichar ch;

		pos = g_utf8_next_char(pos);

		/*
		 * Decode only the character under the cursor. Converting the whole
		 * remainder of the buffer on every step just to look at its first
		 * code point is quadratic and allocates each time.
		 */
		ch = g_utf8_get_char_validated(pos, -1);
		if (ch != (gunichar) -1 && ch != (gunichar) -2 && g_unichar_iswide(ch))
			i++;
	}

	return pos;
}
/*
 * Lua binding: replaces the contents of the Entry at stack index 1 with the
 * UTF-8 string at index 2. The cursor is placed at the end of the new text
 * and the view offset is reset.
 *
 * Returns 0 (no Lua results) on success. Raises a Lua error if the string
 * cannot be converted to UCS-4; the previous buffer is kept in that case,
 * but the entry has already been marked dirty.
 */
static int entry_set(LuaState *L)
{
	Entry *e = ms_lua_checkclass(L, CLASS, 1);
	const char *line = luaL_checkstring(L, 2);
	GError *error = NULL;	/* GLib requires error slots to start out NULL */
	glong written = 0;
	gunichar *buffer;

	e->dirty = TRUE; // was 0 - why?

	buffer = g_utf8_to_ucs4(line, -1, NULL, &written, &error);
	if (!buffer) {
		lua_pushfstring(L, "Entry:set() - UCS4 conversion failed: %s",
				error ? error->message : "unknown error");
		if (error)
			g_error_free(error);
		return lua_error(L);
	}

	g_free(e->buffer);
	e->bufsize = e->bufused = written;
	e->curs_off = e->bufused;
	e->view_off = 0;
	e->buffer = buffer;
	return 0;
}
/*
 * Puts `text` into the current line-input view and, when `accept` is set,
 * accepts the line right away. If there is no line-input view, a status
 * message is shown instead and nothing else happens.
 */
static void do_set_text(const char *text, gboolean accept)
{
    GglkText *tb = GGLK_TEXT(gglk_get_line_input_view());

    if (tb) {
        gunichar *ucs4 = g_utf8_to_ucs4(text, -1, NULL, NULL, NULL);

        gglk_text_line_input_set(tb, tb->line_maxlen, ucs4);
        g_free(ucs4);

        if (accept)
            gglk_text_line_input_accept(tb);
    } else {
        /* An empty text is reported as a "clear" request. */
        char *msg = (strcmp(text, "") == 0)
            ? g_strdup("No window to clear")
            : g_strdup_printf("No window to accept %s", text);

        sglk_status_set_mesg(msg);
        g_free(msg);
    }
}
bool PhraseLargeTable3::load_text(FILE * infile){ char pinyin[256]; char phrase[256]; phrase_token_t token; size_t freq; while (!feof(infile)) { int num = fscanf(infile, "%255s %255s %u %ld", pinyin, phrase, &token, &freq); if (4 != num) continue; if (feof(infile)) break; glong phrase_len = g_utf8_strlen(phrase, -1); ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); add_index(phrase_len, new_phrase, token); g_free(new_phrase); } return true; }
int is_name_start_char(const char *offset) { /* TODO: manage length*/ gunichar *converted; gunichar value; if (*offset == '\0') return FALSE; /* Easy ASCII-only tests */ if (*offset == ':') return TRUE; if (*offset >= 'A' && *offset <= 'Z') return TRUE; if (*offset == '_') return TRUE; if (*offset >= 'a' && *offset <= 'z') return TRUE; /* TODO: error checking */ converted = g_utf8_to_ucs4(offset, 1, NULL, NULL, NULL); value = *converted; free(converted); /* More expensive Unicode checks */ if (value >= 0xC0 && value <= 0xD6) return TRUE; if (value >= 0xD8 && value <= 0xF6) return TRUE; if (value >= 0xF8 && value <= 0x2FF) return TRUE; if (value >= 0x370 && value <= 0x37D) return TRUE; if (value >= 0x37F && value <= 0x1FFF) return TRUE; if (value >= 0x200C && value <= 0x200D) return TRUE; if (value >= 0x2070 && value <= 0x218F) return TRUE; if (value >= 0x2C00 && value <= 0x2FEF) return TRUE; if (value >= 0x3001 && value <= 0xD7FF) return TRUE; if (value >= 0xF900 && value <= 0xFDCF) return TRUE; if (value >= 0xFDF0 && value <= 0xFFFD) return TRUE; if (value >= 0x10000 && value <= 0xEFFFF) return TRUE; return FALSE; }
/* Punycode encoder, RFC 3492 section 6.3. The algorithm is
 * sufficiently bizarre that it's not really worth trying to explain
 * here.
 *
 * input_utf8 / input_utf8_length: the UTF-8 text to encode and the length
 * passed to g_utf8_to_ucs4().
 * output: string the encoded form is appended to.
 *
 * Returns TRUE on success. Returns FALSE if the input cannot be converted
 * from UTF-8 or if one of the delta overflow checks trips; in the overflow
 * case characters may already have been appended to output.
 */
static gboolean
punycode_encode (const gchar *input_utf8,
                 gsize        input_utf8_length,
                 GString     *output)
{
  guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit;
  gunichar n, m, *input;
  glong input_length;
  gboolean success = FALSE;

  /* Convert from UTF-8 to Unicode code points */
  input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL,
                          &input_length, NULL);
  if (!input)
    return FALSE;

  /* Copy basic chars */
  for (j = num_basic_chars = 0; j < input_length; j++)
    {
      if (PUNYCODE_IS_BASIC (input[j]))
        {
          g_string_append_c (output, g_ascii_tolower (input[j]));
          num_basic_chars++;
        }
    }
  /* A '-' delimiter follows the basic characters, if there were any. */
  if (num_basic_chars)
    g_string_append_c (output, '-');

  handled_chars = num_basic_chars;

  /* Encode non-basic chars */
  delta = 0;
  bias = PUNYCODE_INITIAL_BIAS;
  n = PUNYCODE_INITIAL_N;
  while (handled_chars < input_length)
    {
      /* let m = the minimum {non-basic} code point >= n in the input */
      for (m = G_MAXUINT, j = 0; j < input_length; j++)
        {
          if (input[j] >= n && input[j] < m)
            m = input[j];
        }

      /* Refuse to continue if advancing delta below would overflow a guint. */
      if (m - n > (G_MAXUINT - delta) / (handled_chars + 1))
        goto fail;
      delta += (m - n) * (handled_chars + 1);
      n = m;

      for (j = 0; j < input_length; j++)
        {
          if (input[j] < n)
            {
              /* delta wrapping around to 0 is treated as overflow. */
              if (++delta == 0)
                goto fail;
            }
          else if (input[j] == n)
            {
              /* Emit delta as a variable-length sequence of digits. */
              q = delta;
              for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)
                {
                  if (k <= bias)
                    t = PUNYCODE_TMIN;
                  else if (k >= bias + PUNYCODE_TMAX)
                    t = PUNYCODE_TMAX;
                  else
                    t = k - bias;
                  if (q < t)
                    break;
                  digit = t + (q - t) % (PUNYCODE_BASE - t);
                  g_string_append_c (output, encode_digit (digit));
                  q = (q - t) / (PUNYCODE_BASE - t);
                }

              g_string_append_c (output, encode_digit (q));
              /* The third argument is true only for the first non-basic
               * character handled. */
              bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars);
              delta = 0;
              handled_chars++;
            }
        }

      delta++;
      n++;
    }

  success = TRUE;

  /* Shared exit: the decoded input is released on success and failure alike. */
 fail:
  g_free (input);
  return success;
}
int main(int argc, char * argv[]){ setlocale(LC_ALL, ""); SystemTableInfo system_table_info; bool retval = system_table_info.load("../../data/table.conf"); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } /* init phrase table */ FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("../../data/phrase_index.bin"); phrase_table.load(chunk, NULL); const pinyin_table_info_t * phrase_files = system_table_info.get_table_info(); /* init phrase index */ FacadePhraseIndex phrase_index; if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); /* init bi-gram */ Bigram system_bigram; system_bigram.attach("../../data/bigram.db", ATTACH_READONLY); Bigram user_bigram; gfloat lambda = system_table_info.get_lambda(); /* init phrase lookup */ PhraseLookup phrase_lookup(lambda, &phrase_table, &phrase_index, &system_bigram, &user_bigram); /* try one sentence */ char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; /* check non-ucs4 characters */ const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters are not accepted.\n"); g_free(sentence); continue; } try_phrase_lookup(&phrase_lookup, sentence, len); g_free(sentence); } free(linebuf); return 0; }
int main(int argc, char * argv[]){ const char * evals_text = "evals.text"; pinyin_option_t options = USE_TONE; FacadeChewingTable largetable; MemoryChunk * chunk = new MemoryChunk; chunk->load("pinyin_index.bin"); largetable.load(options, chunk, NULL); FacadePhraseTable2 phrase_table; chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk, NULL); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); Bigram user_bigram; user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE); PinyinLookup2 pinyin_lookup(options, &largetable, &phrase_index, &system_bigram, &user_bigram); /* open evals.text. */ FILE * evals_file = fopen(evals_text, "r"); if ( NULL == evals_file ) { fprintf(stderr, "Can't open file:%s\n", evals_text); exit(ENOENT); } PhraseTokens phrase_tokens; memset(phrase_tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(phrase_tokens); /* Evaluates the correction rate of test text documents. */ size_t tested_count = 0; size_t passed_count = 0; char* linebuf = NULL; size_t size = 0; TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t)); phrase_token_t token = null_token; while( getline(&linebuf, &size, evals_file) ) { if ( feof(evals_file) ) break; if ( '\n' == linebuf[strlen(linebuf)-1] ) linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); token = null_token; if ( 0 != phrase_len ) { int result = phrase_table.search(phrase_len, phrase, phrase_tokens); int num = get_first_token(phrase_tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } if ( null_token == token ) { if ( tokens->len ) { /* one test. 
*/ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { tested_count ++; passed_count ++; } else { tested_count ++; } g_array_set_size(tokens, 0); } } else { g_array_append_val(tokens, token); } } if ( tokens->len ) { /* one test. */ if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) { tested_count ++; passed_count ++; } else { tested_count ++; } } parameter_t rate = passed_count / (parameter_t) tested_count; printf("correction rate:%f\n", rate); g_array_free(tokens, TRUE); fclose(evals_file); free(linebuf); phrase_index.destroy_tokens(phrase_tokens); return 0; }
/*
 * Checks one test vector against GLib's Unicode conversion functions.
 *
 * line:     line number of the vector, used only in failure messages.
 * utf8:     the UTF-8 byte string under test.
 * status:   the expected classification of utf8.
 * ucs4:     the expected UCS-4 form of utf8.
 * ucs4_len: number of code points in ucs4.
 *
 * Every mismatch is reported through fail() and ends the check for this
 * vector immediately; buffers allocated up to that point are not released on
 * those early returns.
 */
static void
process (gint      line,
         gchar    *utf8,
         Status    status,
         gunichar *ucs4,
         gint      ucs4_len)
{
  const gchar *end;
  gboolean is_valid = g_utf8_validate (utf8, -1, &end);
  GError *error = NULL;
  glong items_read, items_written;

  /* g_utf8_validate() must agree with the expected status: only VALID
   * vectors may validate. */
  switch (status)
    {
    case VALID:
      if (!is_valid)
        {
          fail ("line %d: valid but g_utf8_validate returned FALSE\n", line);
          return;
        }
      break;
    case NOTUNICODE:
    case INCOMPLETE:
    case OVERLONG:
    case MALFORMED:
      if (is_valid)
        {
          fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line);
          return;
        }
      break;
    }

  if (status == INCOMPLETE)
    {
      gunichar *ucs4_result;

      /* Without an items_read slot, truncated input must be reported as
       * G_CONVERT_ERROR_PARTIAL_INPUT. */
      ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error);

      if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT))
        {
          fail ("line %d: incomplete input not properly detected\n", line);
          return;
        }
      g_clear_error (&error);

      /* With an items_read slot, the conversion must return a result and
       * must report that it stopped before the end of the input. */
      ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error);

      if (!ucs4_result || items_read == strlen (utf8))
        {
          fail ("line %d: incomplete input not properly detected\n", line);
          return;
        }

      g_free (ucs4_result);
    }

  if (status == VALID || status == NOTUNICODE)
    {
      gunichar *ucs4_result;
      gchar *utf8_result;

      /* UTF-8 -> UCS-4 through the checking converter. */
      ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error);
      if (!ucs4_result)
        {
          fail ("line %d: conversion to ucs4 failed: %s\n", line, error->message);
          return;
        }

      if (!ucs4_equal (ucs4_result, ucs4) ||
          items_read != strlen (utf8) ||
          items_written != ucs4_len)
        {
          fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
          return;
        }

      g_free (ucs4_result);

      /* Same conversion through the "fast" variant. */
      ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written);

      if (!ucs4_equal (ucs4_result, ucs4) ||
          items_written != ucs4_len)
        {
          fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
          return;
        }

      /* UCS-4 -> UTF-8 must reproduce the original bytes. */
      utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error);
      if (!utf8_result)
        {
          fail ("line %d: conversion back to utf8 failed: %s", line, error->message);
          return;
        }

      if (strcmp (utf8_result, utf8) != 0 ||
          items_read != ucs4_len ||
          items_written != strlen (utf8))
        {
          fail ("line %d: conversion back to utf8 did not match original\n", line);
          return;
        }

      g_free (utf8_result);
      g_free (ucs4_result);
    }

  if (status == VALID)
    {
      gunichar2 *utf16_expected_tmp;
      gunichar2 *utf16_expected;
      gunichar2 *utf16_from_utf8;
      gunichar2 *utf16_from_ucs4;
      gunichar *ucs4_result;
      gsize bytes_written;
      gint n_chars;
      gchar *utf8_result;

      /* The reference UTF-16 text is produced by g_convert(); the target
       * encoding name depends on the host byte order. */
#if G_BYTE_ORDER == G_LITTLE_ENDIAN
#define TARGET "UTF-16LE"
#else
#define TARGET "UTF-16"
#endif

      if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8",
                                                         NULL, &bytes_written, NULL)))
        {
          fail ("line %d: could not convert to UTF-16 via g_convert\n", line);
          return;
        }

      /* zero-terminate and remove BOM */
      n_chars = bytes_written / 2;
      if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
        {
          n_chars--;
          utf16_expected = g_new (gunichar2, n_chars + 1);
          memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars);
        }
      else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
        {
          fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
          return;
        }
      else
        {
          utf16_expected = g_new (gunichar2, n_chars + 1);
          memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars);
        }

      utf16_expected[n_chars] = '\0';

      /* UTF-8 -> UTF-16 */
      if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error)))
        {
          fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
          return;
        }

      if (items_read != strlen (utf8) ||
          utf16_count (utf16_from_utf8) != items_written)
        {
          fail ("line %d: length error in conversion to ucs16\n", line);
          return;
        }

      /* UCS-4 -> UTF-16 */
      if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error)))
        {
          fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
          return;
        }

      if (items_read != ucs4_len ||
          utf16_count (utf16_from_ucs4) != items_written)
        {
          fail ("line %d: length error in conversion to ucs16\n", line);
          return;
        }

      /* Both UTF-16 results must equal the g_convert() reference. */
      if (!utf16_equal (utf16_from_utf8, utf16_expected) ||
          !utf16_equal (utf16_from_ucs4, utf16_expected))
        {
          fail ("line %d: results of conversion to ucs16 do not match\n", line);
          return;
        }

      /* UTF-16 -> UTF-8 */
      if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error)))
        {
          fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message);
          return;
        }

      if (items_read != utf16_count (utf16_from_utf8) ||
          items_written != strlen (utf8))
        {
          fail ("line %d: length error in conversion from ucs16 to utf8\n", line);
          return;
        }

      /* UTF-16 -> UCS-4 */
      if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error)))
        {
          fail ("line %d: conversion back to utf8/ucs4 failed\n", line);
          return;
        }

      if (items_read != utf16_count (utf16_from_utf8) ||
          items_written != ucs4_len)
        {
          fail ("line %d: length error in conversion from ucs16 to ucs4\n", line);
          return;
        }

      /* Both round trips must land on the original inputs. */
      if (strcmp (utf8, utf8_result) != 0 ||
          !ucs4_equal (ucs4, ucs4_result))
        {
          fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line);
          return;
        }

      g_free (utf16_expected_tmp);
      g_free (utf16_expected);
      g_free (utf16_from_utf8);
      g_free (utf16_from_ucs4);
      g_free (utf8_result);
      g_free (ucs4_result);
    }
}
int main(int argc, char * argv[]){ int i = 1; bool train_pi_gram = true; const char * bigram_filename = "bigram.db"; setlocale(LC_ALL, ""); while ( i < argc ){ if ( strcmp("--help", argv[i]) == 0){ print_help(); exit(0); }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){ train_pi_gram = false; }else if ( strcmp("--bigram-file", argv[i]) == 0){ if ( ++i >= argc ) { print_help(); exit(EINVAL); } bigram_filename = argv[i]; }else{ print_help(); exit(EINVAL); } ++i; } PhraseLargeTable2 phrase_table; /* init phrase table */ MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk); FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); Bigram bigram; bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); char* linebuf = NULL; size_t size = 0; phrase_token_t last_token, cur_token = last_token = 0; while( getline(&linebuf, &size, stdin) ){ if ( feof(stdin) ) break; linebuf[strlen(linebuf)-1] = '\0'; glong phrase_len = 0; ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL); phrase_token_t token = null_token; if ( 0 != phrase_len ) { phrase_index.clear_tokens(tokens); int result = phrase_table.search(phrase_len, phrase, tokens); int num = get_first_token(tokens, token); if ( !(result & SEARCH_OK) ) token = null_token; g_free(phrase); phrase = NULL; } last_token = cur_token; cur_token = token; /* skip null_token in second word. */ if ( null_token == cur_token ) continue; /* training uni-gram */ phrase_index.add_unigram_frequency(cur_token, 1); /* skip pi-gram training. 
*/ if ( null_token == last_token ){ if ( !train_pi_gram ) continue; last_token = sentence_start; } /* train bi-gram */ SingleGram * single_gram = NULL; bigram.load(last_token, single_gram); if ( NULL == single_gram ){ single_gram = new SingleGram; } guint32 freq, total_freq; /* increase freq */ if (single_gram->get_freq(cur_token, freq)) assert(single_gram->set_freq(cur_token, freq + 1)); else assert(single_gram->insert_freq(cur_token, 1)); /* increase total freq */ single_gram->get_total_freq(total_freq); single_gram->set_total_freq(total_freq + 1); bigram.store(last_token, single_gram); delete single_gram; } phrase_index.destroy_tokens(tokens); free(linebuf); if (!save_phrase_index(&phrase_index)) exit(ENOENT); return 0; }
int main(int argc, char * argv[]){ SystemTableInfo2 system_table_info; bool retval = system_table_info.load("../../data/table.conf"); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } PhraseLargeTable3 largetable; FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); TABLE_PHONETIC_TYPE type = system_table_info.get_table_phonetic_type(); if (!load_phrase_table(phrase_files, NULL, &largetable, &phrase_index, type)) exit(ENOENT); #if 0 MemoryChunk * chunk = new MemoryChunk; largetable.store(chunk); largetable.load(chunk); #endif char* linebuf = NULL; size_t size = 0; ssize_t read; while ((read = getline(&linebuf, &size, stdin)) != -1) { if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; glong phrase_len = g_utf8_strlen(linebuf, -1); ucs4_t * new_phrase = g_utf8_to_ucs4(linebuf, -1, NULL, NULL, NULL); if (0 == phrase_len) continue; PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); guint32 start = record_time(); size_t i = 0; for (i = 0; i < bench_times; ++i){ phrase_index.clear_tokens(tokens); largetable.search(phrase_len, new_phrase, tokens); } print_time(start, bench_times); /* test search continued information. 
*/ int retval = SEARCH_NONE; for (i = 1; i < phrase_len; ++i) { phrase_index.clear_tokens(tokens); retval = largetable.search(i, new_phrase, tokens); if (retval & SEARCH_CONTINUED) printf("return continued information with length:%ld\n", i); } phrase_index.clear_tokens(tokens); retval = largetable.search(phrase_len, new_phrase, tokens); if (retval & SEARCH_OK) { for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { GArray * array = tokens[i]; if (NULL == array) continue; for (size_t k = 0; k < array->len; ++k) { phrase_token_t token = g_array_index (array, phrase_token_t, k); printf("token:%d\t", token); } } printf("\n"); } phrase_index.destroy_tokens(tokens); g_free(new_phrase); } if ( linebuf ) free(linebuf); /* mask out all index items. */ largetable.mask_out(0x0, 0x0); return 0; }
char *ipsumat_generate (const char *dict_path, const char *charset, const char *desired_glyphs, int max_wordlen, int max_words) { gunichar *p; gunichar *ucharset; gunichar *udesired_glyphs = NULL; int count = 0; int best_sentence[MAX_WORDS]={0,}; int sentence[MAX_WORDS]={0,}; char *words_str = NULL; gunichar *uwords_str = NULL; GList *words = NULL; GString *word = g_string_new (""); int best_score = 0; int i; if (!dict_path) dict_path = "/usr/share/dict/words"; g_file_get_contents (dict_path, &words_str, NULL, NULL); if (!words_str) return g_strdup ("problem opening dictionary"); uwords_str = g_utf8_to_ucs4 (words_str, -1, NULL, NULL, NULL); if (charset == NULL) charset = "abcdefghijklmnopqrstuvwxyz"; ucharset = g_utf8_to_ucs4 (charset, -1, NULL, NULL, NULL); if (desired_glyphs) udesired_glyphs = g_utf8_to_ucs4 (desired_glyphs, -1, NULL, NULL, NULL); if (max_words > MAX_WORDS) max_words = MAX_WORDS; for (p = uwords_str; *p; p++) { switch (*p) { case '\n': case '\r': case ' ': case '\t': if (word->len) { int skip = 0; int i; gunichar *uword = g_utf8_to_ucs4 (word->str, -1, NULL, NULL, NULL); for (i = 0; uword[i]; i++) { int k; skip++; for (k = 0; ucharset[k]; k++) if (ucharset[k]==uword[i]) { skip--;break; } } if (word->len > max_wordlen) skip++; if (!skip) { words = g_list_prepend (words, g_strdup (word->str)); count ++; } g_free (uword); } g_string_assign (word, ""); break; default: g_string_append_unichar (word, *p); break; } } g_free (ucharset); g_free (words_str); g_free (uwords_str); for (i = 0; i < attempts; i ++) { GString *example = g_string_new (""); int j; for (j = 0; j < max_words; j ++) { int n; const char *str; n = rand()%count; { int k; for (k = 0; k < j; k++) if (sentence[k]==n) { /* we try once more if it collides with already picked * random number,. 
- but this value will stick */ n = rand()%count; break; } } sentence[j] = n; str = g_list_nth_data (words, n); if (str) { if (j) g_string_append (example, " "); g_string_append (example, str); } } float score = score_string ((void*)example->str, desired_glyphs); if (score >= best_score) { for (j = 0; j < max_words; j ++) best_sentence[j] = sentence[j]; best_score = score; } g_string_free (example, TRUE); } if (print_score) printf ("Score: %i\n", best_score); { char *ret = NULL; int j; GString *s = g_string_new (""); if (desired_glyphs && desired_glyphs[0]) { g_string_append (s, desired_glyphs); g_string_append (s, " "); } for (j = 0; j < max_words; j ++) { const char *str; str = g_list_nth_data (words, best_sentence[j]); if (str) { if (j) g_string_append (s, " "); g_string_append (s, str); } } ret = strdup (s->str); g_string_free (s, TRUE); g_free (udesired_glyphs); return ret; } }
int main(int argc, char * argv[]){ int i = 1; bool gen_extra_enter = false; setlocale(LC_ALL, ""); //deal with options. while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0) { print_help(); exit(0); } else if (strcmp("--generate-extra-enter", argv[i]) == 0) { gen_extra_enter = true; } else { print_help(); exit(EINVAL); } ++i; } /* init phrase table */ FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk, NULL); /* init phrase index */ FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } //check non-ucs4 characters const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); printf("\n"); continue; } //do segment stuff GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep)); segment(&phrase_table, &phrase_index, sentence, len, strings); //print out the split phrase for ( glong i = 0; i < strings->len; ++i ) { SegmentStep * step = &g_array_index(strings, SegmentStep, i); char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL); printf("%d %s\n", step->m_handle, string); g_free(string); } /* print extra enter */ if ( gen_extra_enter ) printf("\n"); g_array_free(strings, TRUE); g_free(sentence); } /* print enter at file tail */ printf("\n"); return 0; }
/*
 * GObject "set_property" handler for GimpNumberPairEntry.
 *
 * Dispatches on property_id: most properties are forwarded to the matching
 * gimp_number_pair_entry_set_*() function, a few are stored directly in the
 * private structure. Unknown ids are reported through
 * G_OBJECT_WARN_INVALID_PROPERTY_ID.
 */
static void
gimp_number_pair_entry_set_property (GObject      *object,
                                     guint         property_id,
                                     const GValue *value,
                                     GParamSpec   *pspec)
{
  GimpNumberPairEntry        *entry = GIMP_NUMBER_PAIR_ENTRY (object);
  GimpNumberPairEntryPrivate *priv  = GIMP_NUMBER_PAIR_ENTRY_GET_PRIVATE (entry);

  switch (property_id)
    {
    case PROP_LEFT_NUMBER:
      {
        const gdouble left = g_value_get_double (value);

        /* keep the current right number, replace only the left one */
        gimp_number_pair_entry_set_values (entry, left, priv->right_number);
      }
      break;

    case PROP_RIGHT_NUMBER:
      {
        const gdouble right = g_value_get_double (value);

        /* keep the current left number, replace only the right one */
        gimp_number_pair_entry_set_values (entry, priv->left_number, right);
      }
      break;

    case PROP_DEFAULT_LEFT_NUMBER:
      {
        const gdouble left = g_value_get_double (value);

        gimp_number_pair_entry_set_default_values (entry,
                                                   left,
                                                   priv->default_right_number);
      }
      break;

    case PROP_DEFAULT_RIGHT_NUMBER:
      {
        const gdouble right = g_value_get_double (value);

        gimp_number_pair_entry_set_default_values (entry,
                                                   priv->default_left_number,
                                                   right);
      }
      break;

    case PROP_USER_OVERRIDE:
      gimp_number_pair_entry_set_user_override (entry,
                                                g_value_get_boolean (value));
      break;

    case PROP_SEPARATORS:
      {
        const gchar *text = g_value_get_string (value);

        /* drop the previous separator list before storing the new one */
        g_free (priv->separators);
        priv->separators     = NULL;
        priv->num_separators = 0;

        /* separators are kept as UCS-4 code points together with their count */
        if (text != NULL)
          priv->separators = g_utf8_to_ucs4 (text, -1, NULL,
                                             &priv->num_separators, NULL);
      }
      break;

    case PROP_DEFAULT_TEXT:
      gimp_number_pair_entry_set_default_text (entry,
                                               g_value_get_string (value));
      break;

    case PROP_ALLOW_SIMPLIFICATION:
      priv->allow_simplification = g_value_get_boolean (value);
      break;

    case PROP_MIN_VALID_VALUE:
      priv->min_valid_value = g_value_get_double (value);
      break;

    case PROP_MAX_VALID_VALUE:
      priv->max_valid_value = g_value_get_double (value);
      break;

    case PROP_RATIO:
      gimp_number_pair_entry_set_ratio (entry, g_value_get_double (value));
      break;

    case PROP_ASPECT:
      gimp_number_pair_entry_set_aspect (entry, g_value_get_enum (value));
      break;

    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
      break;
    }
}
int main(int argc, char * argv[]){ PhraseLargeTable2 largetable; FacadePhraseIndex phrase_index; if (!load_phrase_table(NULL, &largetable, &phrase_index)) exit(ENOENT); MemoryChunk * chunk = new MemoryChunk; largetable.store(chunk); largetable.load(chunk); char* linebuf = NULL; size_t size = 0; ssize_t read; while ((read = getline(&linebuf, &size, stdin)) != -1) { if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } if ( strcmp ( linebuf, "quit" ) == 0) break; glong phrase_len = g_utf8_strlen(linebuf, -1); ucs4_t * new_phrase = g_utf8_to_ucs4(linebuf, -1, NULL, NULL, NULL); if (0 == phrase_len) continue; PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); guint32 start = record_time(); for (size_t i = 0; i < bench_times; ++i){ phrase_index.clear_tokens(tokens); largetable.search(phrase_len, new_phrase, tokens); } print_time(start, bench_times); phrase_index.clear_tokens(tokens); int retval = largetable.search(phrase_len, new_phrase, tokens); if (retval & SEARCH_OK) { for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) { GArray * array = tokens[i]; if (NULL == array) continue; for (size_t k = 0; k < array->len; ++k) { phrase_token_t token = g_array_index (array, phrase_token_t, k); printf("token:%d\t", token); } } printf("\n"); } phrase_index.destroy_tokens(tokens); g_free(new_phrase); } if ( linebuf ) free(linebuf); return 0; }
int main(int argc, char * argv[]){ int i = 1; bool gen_extra_enter = false; setlocale(LC_ALL, ""); /* deal with options */ while ( i < argc ){ if ( strcmp ("--help", argv[i]) == 0 ){ print_help(); exit(0); } else if ( strcmp("--generate-extra-enter", argv[i]) == 0 ){ gen_extra_enter = true; } else { print_help(); exit(EINVAL); } ++i; } /* init phrase table */ FacadePhraseTable2 phrase_table; MemoryChunk * chunk = new MemoryChunk; chunk->load("phrase_index.bin"); phrase_table.load(chunk, NULL); /* init phrase index */ FacadePhraseIndex phrase_index; if (!load_phrase_index(&phrase_index)) exit(ENOENT); /* init bi-gram */ Bigram system_bigram; system_bigram.attach("bigram.db", ATTACH_READONLY); Bigram user_bigram; /* init phrase lookup */ PhraseLookup phrase_lookup(&phrase_table, &phrase_index, &system_bigram, &user_bigram); CONTEXT_STATE state, next_state; GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); /* split the sentence */ char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, stdin)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } /* check non-ucs4 characters */ const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); printf("\n"); continue; } /* only new-line persists. 
*/ if ( 0 == num_of_chars ) { printf("\n"); continue; } state = CONTEXT_INIT; int result = phrase_table.search( 1, sentence, tokens); g_array_append_val( current_ucs4, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { int result = phrase_table.search( 1, sentence + i, tokens); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ g_array_append_val(current_ucs4, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4); /* save the current character */ g_array_set_size(current_ucs4, 0); g_array_append_val(current_ucs4, sentence[i]); state = next_state; } if ( current_ucs4->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4); g_array_set_size(current_ucs4, 0); } /* print extra enter */ if ( gen_extra_enter ) printf("\n"); } phrase_index.destroy_tokens(tokens); /* print enter at file tail */ printf("\n"); g_array_free(current_ucs4, TRUE); free(linebuf); return 0; }
void feed_line (const char * phrase, const char * pinyin, const guint32 freq){ phrase_item * new_phrase_ptr = (phrase_item *) malloc( sizeof(phrase_item)); new_phrase_ptr->length = g_utf8_strlen(phrase, -1); /* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp * where is the code which I don't want to touch. :-) */ if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) { fprintf(stderr, "too long phrase:%s\t%s\t%d\n", phrase, pinyin, freq); free(new_phrase_ptr); return; } new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL); PinyinDefaultParser parser; NullPinyinValidator validator; PinyinKeyVector keys; PinyinKeyPosVector poses; keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey)); poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos)); parser.parse(validator, keys, poses, pinyin); GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr); pinyin_and_freq_item value_item; value_item.pinyin = keys; value_item.freq = freq; if(new_phrase_ptr->length != value_item.pinyin->len){ fprintf(stderr, "error:phrase:%s\tpinyin:%s\n", phrase, pinyin); return; } if ( array == NULL){ array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item)); g_array_append_val(array, value_item); g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); return; } bool found = false; for ( size_t i = 0; i < array->len ; ++i){ pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i); int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data, (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len); if ( result == 0 ){ printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", phrase, pinyin, freq); old_value_item->freq += freq; found = true; } } g_array_free(poses, TRUE); if ( !found ){ g_array_append_val(array, value_item); g_tree_insert(g_pinyin_tree, new_phrase_ptr, array); }else g_array_free(keys, TRUE); free(new_phrase_ptr); //g_array_free(keys, TRUE); }
int main(int argc, char * argv[]){ FILE * input = stdin; FILE * output = stdout; setlocale(LC_ALL, ""); GError * error = NULL; GOptionContext * context; context = g_option_context_new("- n-gram segment"); g_option_context_add_main_entries(context, entries, NULL); if (!g_option_context_parse(context, &argc, &argv, &error)) { g_print("option parsing failed:%s\n", error->message); exit(EINVAL); } if (outputfile) { output = fopen(outputfile, "w"); if (NULL == output) { perror("open file failed"); exit(EINVAL); } } if (argc > 2) { fprintf(stderr, "too many arguments.\n"); exit(EINVAL); } if (2 == argc) { input = fopen(argv[1], "r"); if (NULL == input) { perror("open file failed"); exit(EINVAL); } } SystemTableInfo2 system_table_info; bool retval = system_table_info.load(SYSTEM_TABLE_INFO); if (!retval) { fprintf(stderr, "load table.conf failed.\n"); exit(ENOENT); } /* init phrase table */ FacadePhraseTable3 phrase_table; phrase_table.load(SYSTEM_PHRASE_INDEX, NULL); /* init phrase index */ FacadePhraseIndex phrase_index; const pinyin_table_info_t * phrase_files = system_table_info.get_default_tables(); if (!load_phrase_index(phrase_files, &phrase_index)) exit(ENOENT); /* init bi-gram */ Bigram system_bigram; system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY); Bigram user_bigram; gfloat lambda = system_table_info.get_lambda(); /* init phrase lookup */ PhraseLookup phrase_lookup(lambda, &phrase_table, &phrase_index, &system_bigram, &user_bigram); CONTEXT_STATE state, next_state; GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t)); PhraseTokens tokens; memset(tokens, 0, sizeof(PhraseTokens)); phrase_index.prepare_tokens(tokens); /* split the sentence */ char * linebuf = NULL; size_t size = 0; ssize_t read; while( (read = getline(&linebuf, &size, input)) != -1 ){ if ( '\n' == linebuf[strlen(linebuf) - 1] ) { linebuf[strlen(linebuf) - 1] = '\0'; } /* check non-ucs4 characters */ const glong num_of_chars = g_utf8_strlen(linebuf, -1); glong len = 0; ucs4_t * 
sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL); if ( len != num_of_chars ) { fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf); fprintf(output, "%d \n", null_token); continue; } /* only new-line persists. */ if ( 0 == num_of_chars ) { fprintf(output, "%d \n", null_token); continue; } state = CONTEXT_INIT; int result = phrase_table.search( 1, sentence, tokens); g_array_append_val( current_ucs4, sentence[0]); if ( result & SEARCH_OK ) state = CONTEXT_SEGMENTABLE; else state = CONTEXT_UNKNOWN; for ( int i = 1; i < num_of_chars; ++i) { int result = phrase_table.search( 1, sentence + i, tokens); if ( result & SEARCH_OK ) next_state = CONTEXT_SEGMENTABLE; else next_state = CONTEXT_UNKNOWN; if ( state == next_state ){ g_array_append_val(current_ucs4, sentence[i]); continue; } assert ( state != next_state ); if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); /* save the current character */ g_array_set_size(current_ucs4, 0); g_array_append_val(current_ucs4, sentence[i]); state = next_state; } if ( current_ucs4->len ) { /* this seems always true. */ if ( state == CONTEXT_SEGMENTABLE ) deal_with_segmentable(&phrase_lookup, current_ucs4, output); if ( state == CONTEXT_UNKNOWN ) deal_with_unknown(current_ucs4, output); g_array_set_size(current_ucs4, 0); } /* print extra enter */ if ( gen_extra_enter ) fprintf(output, "%d \n", null_token); g_free(sentence); } phrase_index.destroy_tokens(tokens); /* print enter at file tail */ fprintf(output, "%d \n", null_token); g_array_free(current_ucs4, TRUE); free(linebuf); fclose(input); fclose(output); return 0; }