Exemple #1
0
/*
 *  call-seq:
 *    utf8_titleize(string)
 *
 *  Returns a copy of the string in which the first alphabetic character of
 *  every word is converted to title case. A new word starts at the beginning
 *  of the string and after every whitespace or punctuation character.
 *  Returns nil when the string cannot be converted to UCS-4.
 *
 *    Glib.utf8_titleize('привет всем') #=> Привет Всем
 */
static VALUE utf8_titleize(VALUE self, VALUE string)
{
  VALUE titleized;
  gchar *utf8_out;
  long pos, byte_count, char_count;
  gunichar *codepoints;
  gboolean at_word_start = TRUE;

  Check_Type(string, T_STRING);

  byte_count = RSTRING_LEN(string);
  codepoints = g_utf8_to_ucs4(StringValuePtr(string), byte_count, NULL, &char_count, NULL);
  if (!codepoints)
    return Qnil;

  for (pos = 0; pos < char_count; pos++) {
    const gunichar ch = codepoints[pos];

    /* Only the first letter following a word boundary is title-cased. */
    if (at_word_start && g_unichar_isalpha(ch)) {
      codepoints[pos] = g_unichar_totitle(ch);
      at_word_start = FALSE;
    }

    /* Whitespace and punctuation both open a new word. */
    if (g_unichar_isspace(ch) || g_unichar_ispunct(ch))
      at_word_start = TRUE;
  }

  utf8_out = g_ucs4_to_utf8(codepoints, -1, NULL, NULL, NULL);
  titleized = rb_str_new2(utf8_out);
  g_free(codepoints);
  g_free(utf8_out);

  return titleized;
}
Exemple #2
0
/*
 * Round-trips every entry of gsm_turkish_to_unicode_map: the GSM code is
 * converted to UTF-8, checked against the expected code point stored next to
 * it in the map, and converted back to GSM.
 */
static void test_valid_turkish()
{
	static int map_size =
		sizeof(gsm_turkish_to_unicode_map) / sizeof(unsigned short) / 2;
	unsigned char buf[2];
	int i;

	for (i = 0; i < map_size; i++) {
		const unsigned short c = gsm_turkish_to_unicode_map[i*2];
		/* Codes with bits of 0x1b00 set are sent as 0x1b + low 7 bits. */
		const int escaped = (c & 0x1b00) != 0;
		long size = escaped ? 2 : 1;
		long nread;
		long nwritten;
		char *res;
		gunichar *verify;
		unsigned char *back;

		if (escaped) {
			buf[0] = 0x1b;
			buf[1] = c & 0x7f;
		} else {
			buf[0] = c & 0x7f;
		}

		/* GSM -> UTF-8 */
		res = convert_gsm_to_utf8_with_lang(buf, size, &nread, &nwritten, 0, 1, 1);
		g_assert(res);

		if (g_test_verbose())
			g_print("size: %ld, nread:%ld, nwritten:%ld, %s\n",
				size, nread, nwritten, res);

		g_assert(nread == size);

		/* The result must decode to exactly the mapped code point. */
		verify = g_utf8_to_ucs4(res, -1, NULL, NULL, NULL);

		g_assert(verify[0] == gsm_turkish_to_unicode_map[i*2+1]);
		g_assert(verify[1] == 0);

		g_assert(nwritten == UTF8_LENGTH(verify[0]));

		/* UTF-8 -> GSM must reproduce the original byte sequence. */
		back = convert_utf8_to_gsm_with_lang(res, -1, &nread, &nwritten, 0, 1, 1);

		g_assert(back);

		g_assert(nwritten == size);
		if (escaped) {
			g_assert(back[0] == 0x1b);
			g_assert(back[1] == (c & 0x7f));
		} else {
			g_assert(back[0] == (c & 0x7f));
		}

		g_free(back);
		g_free(verify);
		g_free(res);
	}
}
/*
 * utf8_to_ucs2:
 * @utf8: NUL-terminated UTF-8 input.
 * @ucs2: destination buffer.
 * @ucs2_len: capacity of @ucs2 in 16-bit units, including the terminator.
 *
 * Copies @utf8 into @ucs2 as UCS-2. Code points that do not fit into a
 * single 16-bit unit (above U+FFFF, or in the surrogate range) are dropped.
 * At most @ucs2_len - 1 input characters are examined and the output is
 * always zero-terminated directly after the last stored unit.
 *
 * Returns TRUE on success, FALSE when the buffer cannot hold a terminator or
 * the input cannot be converted.
 */
BOOL
utf8_to_ucs2(const gchar *utf8, gunichar2 *ucs2, int ucs2_len) {
	int 		i;
	glong		count;
	gunichar	*ucs4;
	gunichar2	*ptr;

	/* Without room for the terminator nothing can be written safely. */
	if (ucs2 == NULL || ucs2_len <= 0) {
		return FALSE;
	}

	count = 0;

	ucs2_len--;	/* Space for null terminator */

	ucs4 = g_utf8_to_ucs4(utf8, -1, NULL, &count, NULL);
	if (ucs4 == NULL) {
		return FALSE;
	}

	ptr = ucs2;
	for (i = 0; (i < count) && (i < ucs2_len); i++) {
		if (ucs4[i] < 0x10000 && !(ucs4[i] >= 0xd800 && ucs4[i] < 0xe000)) {
			*ptr = (gunichar2)ucs4[i];
			ptr++;
		}	/* we're simply ignoring any chars that don't fit into ucs2 */
	}
	/* Terminate at the write position: skipped characters make it lag
	 * behind i, and terminating at ucs2[i] would leave a gap of
	 * uninitialized units in the middle of the string. */
	*ptr = 0;

	/* The intermediate buffer comes from GLib, so release it with g_free. */
	g_free(ucs4);

	return TRUE;
}
Exemple #4
0
int
is_name_char(const char *offset)
{
  /* TODO: manage length*/
  gunichar *converted;
  gunichar value;

  if (*offset == '\0') return FALSE;
  if (is_name_start_char(offset)) return TRUE;

  /* Easy ASCII-only tests */
  if (*offset == '-') return TRUE;
  if (*offset == '.') return TRUE;
  if (*offset >= '0' && *offset <= '9') return TRUE;

  /* TODO: error checking */
  converted = g_utf8_to_ucs4(offset, 1, NULL, NULL, NULL);
  value = *converted;
  free(converted);

  /* More expensive Unicode checks */
  if (value == 0xB7) return TRUE;
  if (value >= 0x0300 && value <= 0x036F) return TRUE;
  if (value >= 0x203F && value <= 0x2040) return TRUE;

  return FALSE;
}
Exemple #5
0
/*
 * Returns the character immediately before the cursor in the client's
 * surrounding text, or 0 when the client does not support surrounding text,
 * the cursor is at the start, the cursor position lies outside the text, or
 * the text cannot be converted to UCS-4.
 */
static gunichar
lx_tn_engine_get_prev_surrounding_char (LxTNEngine *lx_tn_engine,
                                        IBusEngine *ibus_engine)
{
  if (is_client_support_surrounding (ibus_engine))
    {
      IBusText *surrounding;
      guint     cursor_pos;
      guint     anchor_pos;
      gunichar *u_surrounding;
      glong     n_chars = 0;
      gunichar  ret = 0;

      ibus_engine_get_surrounding_text (ibus_engine,
                                        &surrounding, &cursor_pos, &anchor_pos);
      u_surrounding = g_utf8_to_ucs4 (ibus_text_get_text (surrounding), -1,
                                      NULL, &n_chars, NULL);
      if (u_surrounding)
        {
          /* Only index the buffer when the reported cursor position is
           * inside the converted text; a stale or bogus position must not
           * read past the end of the array. */
          if (cursor_pos > 0 && (gulong) cursor_pos <= (gulong) n_chars)
            ret = u_surrounding[cursor_pos - 1];
          g_free (u_surrounding);
        }

      return ret;
    }

  return 0;
}
Exemple #6
0
/** score_string:
 * @str              input string (UTF-8)
 * @desired_glyphs   string containing highly desired glyphs (UTF-8), may be
 *                   NULL
 *
 * Scores a candidate string: every occurrence of a desired glyph adds 1, and
 * every distinct pair of neighbouring non-space characters adds 2.
 *
 * Returns the score for a string, a higher score is a better string. An
 * empty, NULL or unconvertible @str scores 0.
 */
static int score_string (const char *str,
                         const char *desired_glyphs)
{
  /* we pick slightly larger than a power of two, to avoid aliasing of things
   * starting on multiples of 512 in the unicode set.
   */
#define ADJ_DIM   1023
  gunichar *ustr;
  gunichar *udesired = NULL;
  char     *adjacency_matrix;
  gunichar *p;
  int       sum = 0;
  int       i;

  if (!str || str[0] == 0)
    return 0;

  ustr = g_utf8_to_ucs4 (str, -1, NULL, NULL, NULL);

  if (!ustr)
    return 0;

  /* Compare code points with code points: matching the raw UTF-8 bytes of
   * desired_glyphs against UCS-4 values only works for ASCII. */
  if (desired_glyphs)
    udesired = g_utf8_to_ucs4 (desired_glyphs, -1, NULL, NULL, NULL);

  if (udesired)
    for (i = 0; ustr[i]; i++)
      {
        int j;
        for (j = 0; udesired[j]; j++)
          if (udesired[j] == ustr[i])
            sum ++;
      }

  /* The matrix is about 1 MB, too large for the stack, so it lives on the
   * heap (zero-initialized). */
  adjacency_matrix = g_malloc0 (ADJ_DIM * ADJ_DIM);

  /* walk through the string ..*/
  for (p = ustr; p[1]; p++)
    {
      gunichar x = p[0]; /* .. using the current .. */
      gunichar y = p[1]; /* .. and the next characters unicode position ..*/
      char    *cell;

      if (x==' ' || y == ' ')
        continue;  /* (bailing if one of them is a space) */

      x %= ADJ_DIM; /* with unicode positions wrapped down to our */
      y %= ADJ_DIM; /*  matrix dimensions */

      /* count each distinct pair once, the first time its cell is marked
       * (permitting some collisions, in a bloom-filter like manner) */
      cell = &adjacency_matrix[y * ADJ_DIM + x];
      if (!*cell)
        {
          *cell = 1;
          sum += 2;
        }
    }

  g_free (adjacency_matrix);
  g_free (udesired);
  g_free (ustr);
  return sum;
}
// Rebuilds m_lastFirstLastLetterResults with every word of m_words whose
// first letter is one of firstLetters and whose last letter is one of
// lastLetters (both compared lower-cased) and that is at least minLength
// characters long. Any previous hunspell result list is released first.
// Nothing is collected when either letter set cannot be converted to UCS-4.
void SmkyManufacturerDatabase::setFistAndLastLetters(const std::string & firstLetters, const std::string & lastLetters, int minLength)
{
	if (m_hunspell && m_lastHunspellResult)
	{
		m_hunspell->free_list(&m_lastHunspellResult, m_lastHunspellResultCount);
		m_lastHunspellResult = NULL;
	}
	m_lastFirstLastLetterResults.clear();
	auto_g_free_array<gunichar> first16 = g_utf8_to_ucs4(firstLetters.c_str(), -1, NULL, NULL, NULL);
	auto_g_free_array<gunichar> last16 = g_utf8_to_ucs4(lastLetters.c_str(), -1, NULL, NULL, NULL);
	if (first16 && last16)
	{
		// Lower-case both letter sets in place so the lookups below are
		// case-insensitive.
		gunichar * p = first16;
		while (*p)
			*p = g_unichar_tolower(*p), ++p;
		p = last16;
		while (*p)
			*p = g_unichar_tolower(*p), ++p;
		for (std::map<std::string, int>::iterator iter = m_words.begin(); iter != m_words.end(); ++iter)
		{
			const std::string & word = iter->first;
			// An empty word has "first letter" 0, which wcschr() would
			// match against the terminator of first16; the scan for the
			// last character below would then step past the end of the
			// string. Skip such entries.
			if (word.empty())
				continue;
			const char * wordStr = word.c_str();
			gunichar firstLetter = g_unichar_tolower(g_utf8_get_char(wordStr));
			// NOTE(review): the wcschr() lookups reinterpret the UCS-4
			// buffers as wchar_t strings, which presumably relies on
			// wchar_t being 32 bits wide; confirm for the target platform.
			if (wcschr(first16.as<wchar_t>(), (wchar_t) firstLetter))
			{	// first letter is a match!
				// Walk to the last character, counting characters on the way.
				const char * next = wordStr;
				int wordLength = 1;
				while ((next = g_utf8_next_char(wordStr)) && *next)
					wordStr = next, ++wordLength;
				if (wordLength >= minLength) {
					gunichar lastLetter = g_unichar_tolower(g_utf8_get_char(wordStr));
					if (wcschr(last16.as<wchar_t>(), (wchar_t) lastLetter))
					{
						DEBUG_CALLBACK("First-last letter match: %s-%s -> %s", firstLetters.c_str(), lastLetters.c_str(), word.c_str());
						m_lastFirstLastLetterResults.push_back(word.c_str());
					}
				}
				else {
					DEBUG_CALLBACK("'%s' is discarded because it's too short: %d letters, %d min", word.c_str(), wordLength, minLength);
				}
			}
		}
	}
}
/*
 * Benchmark body: converts str from UTF-8 to UCS-4 NUM_ITERATIONS times,
 * releasing each result immediately. The len argument is not used; the
 * string is treated as NUL-terminated. Always returns 0.
 */
static int
grind_utf8_to_ucs4 (const char *str, gsize len)
{
  int pass = 0;

  while (pass < NUM_ITERATIONS)
    {
      gunichar *converted = g_utf8_to_ucs4 (str, -1, NULL, NULL, NULL);

      g_free (converted);
      pass++;
    }

  return 0;
}
Exemple #9
0
/**
 * gsdl_tokenizer_new_from_string:
 * @str: String to be parsed.
 * @err: Return location for a %GError to be set on failure, may be NULL.
 *
 * Creates a new tokenizer consuming the given string. The filename will be set to "&lt;string&gt;".
 * The string is converted to UCS-4 up front; if that conversion fails, @err is
 * set by the conversion and no tokenizer is returned.
 *
 * Returns: A new %GSDLTokenizer, or NULL on failure.
 */
GSDLTokenizer* gsdl_tokenizer_new_from_string(const char *str, GError **err) {
	GSDLTokenizer* self = g_slice_new0(GSDLTokenizer);
	self->filename = "<string>";
	self->stringbuf = g_utf8_to_ucs4(str, -1, NULL, NULL, err);

	if (!self->stringbuf) {
		/* Release the half-built tokenizer instead of leaking it. */
		g_slice_free(GSDLTokenizer, self);
		return NULL;
	}

	self->channel = NULL;
	self->line = 1;
	self->col = 1;
	self->peek_avail = false;

	return self;
}
Exemple #10
0
/*
 * Returns the text between the startedit and endedit marks of tb's buffer
 * as a newly allocated, zero-terminated UCS-4 string in NFC normal form.
 * The caller owns the result and releases it with g_free(). Returns NULL if
 * the text cannot be converted.
 */
gunichar *gglk_text_line_input_get(GglkText *tb)
{
    GtkTextIter b, e;
    gchar *line_utf8, *line_utf8_normal;
    gunichar *line_ucs4;

    gtk_text_buffer_get_iter_at_mark(tb->buffer, &b, tb->startedit);
    gtk_text_buffer_get_iter_at_mark(tb->buffer, &e, tb->endedit);
    line_utf8 = gtk_text_buffer_get_text(tb->buffer, &b, &e, FALSE);

    /* Convert the normalized text. The normalized copy used to be computed
     * and then dropped, which leaked it and returned unnormalized input.
     * g_utf8_normalize() returns NULL for invalid UTF-8; fall back to the raw
     * text in that case so the conversion reports the failure itself. */
    line_utf8_normal = g_utf8_normalize(line_utf8, -1, G_NORMALIZE_NFC);
    line_ucs4 = g_utf8_to_ucs4(line_utf8_normal ? line_utf8_normal : line_utf8,
                               -1, NULL, NULL, NULL);

    g_free(line_utf8_normal);
    g_free(line_utf8);
    return line_ucs4;
}
Exemple #11
0
/* Currently not used */
/*
 * Loads the glyph of every character of the UTF-8 string with FreeType and
 * stores its horizontal advance in the Width field of consecutive
 * StringDetails entries. At most StringDetailElements entries are written.
 *
 * Returns the number of entries filled in; 0 when the font face cannot be
 * locked, the string is empty, or the string cannot be converted.
 */
static int
CalculateStringWidthsUTF8 (cairo_t *ct, GDIPCONST GpFont *gdiFont, const BYTE *utf8, unsigned long StringDetailElements, GpStringDetailStruct *StringDetails)
{
	FT_Face			face;
	glong			i;
	gunichar		*ucs4 = NULL;
	cairo_font_face_t	*Font;
	GpStringDetailStruct	*CurrentDetail;
	glong			NumOfGlyphs = 0;
	cairo_matrix_t		matrix;

#ifdef DRAWSTRING_DEBUG
	printf("CalculateStringWidths(font, %s, %lu, details) called\n", utf8, StringDetailElements);
#endif

	Font = (cairo_font_face_t *)gdiFont->cairofnt;
	face = gdip_cairo_ft_font_lock_face(Font);
	if (!face)
		return 0;

	/* NOTE(review): the scaled matrix is not used again in this function. */
	cairo_get_font_matrix(ct, &matrix);
	cairo_matrix_scale(&matrix, gdiFont->sizeInPixels, gdiFont->sizeInPixels);

	ucs4 = g_utf8_to_ucs4 ((const gchar *) utf8, (glong)-1, NULL, &NumOfGlyphs, NULL);

	/* The count is only meaningful when the conversion succeeded. */
	if (ucs4 == NULL)
		NumOfGlyphs = 0;

	/* Never write past the end of the caller's array. */
	if ((unsigned long) NumOfGlyphs > StringDetailElements)
		NumOfGlyphs = (glong) StringDetailElements;

	CurrentDetail=StringDetails;
	for (i = 0; i < NumOfGlyphs; i++) {
		FT_Load_Glyph (face, FT_Get_Char_Index (face, ucs4[i]), FT_LOAD_DEFAULT);
		CurrentDetail->Width = DOUBLE_FROM_26_6 (face->glyph->advance.x);
		CurrentDetail++;
	}

	/* Single exit: the face is always unlocked and the buffer always freed,
	 * including for empty or unconvertible strings. */
	gdip_cairo_ft_font_unlock_face(Font);

	g_free(ucs4);

#ifdef DRAWSTRING_DEBUG
	printf("CalculateStringWidths: string >%s< translated into %ld glyphs\n", utf8, NumOfGlyphs);
#endif
	return NumOfGlyphs;
}
Exemple #12
0
std::wstring fromUtf8(const std::string &str)
{
    long readed, writed;
    wchar_t *errMsg = NULL;
    
    wchar_t *res = g_utf8_to_ucs4(str.c_str(), str.length(), &readed,
            &writed, &errMsg);
    if (! res) {
        if (errMsg)
            throw Exception(errMsg);
        else
            throw Exception(L"Error converting text from UTF-8");
    }

    std::wstring s(res);
    free(res);
    
    return s;
}
Exemple #13
0
/*
 * Returns a pointer into view->string for the position (x, y), or NULL when
 * no line with segments is found for y.
 *
 * y is first flipped against the widget height; if it then exceeds the
 * number of lines, the position falls back to x = 0 on line n - 1. Lines
 * without segments are skipped by following the list's next links. x is
 * clamped to the line length, and a wide character consumes two columns.
 */
static char *
gnt_text_view_get_p(GntTextView *view, int x, int y)
{
	int n;
	int i = 0;
	GntWidget *wid = GNT_WIDGET(view);
	GntTextLine *line;
	GList *lines;
	GList *segs;
	GntTextSegment *seg;
	gchar *pos;

	n = g_list_length(view->list);
	y = wid->priv.height - y;
	if (n < y) {
		x = 0;
		y = n - 1;
	}

	lines = g_list_nth(view->list, y - 1);
	if (!lines)
		return NULL;
	do {
		line = lines->data;
		lines = lines->next;
	} while (line && !line->segments && lines);

	if (!line || !line->segments) /* no valid line */
		return NULL;
	segs = line->segments;
	seg = (GntTextSegment *)segs->data;
	pos = view->string->str + seg->start;
	x = MIN(x, line->length);
	while (++i <= x) {
		gunichar ch;

		/* Never step past the terminator of the backing string. */
		if (*pos == '\0')
			break;
		pos = g_utf8_next_char(pos);

		/* Decode just the character at pos; converting the whole rest of
		 * the string for every column is quadratic and fails if any later
		 * byte is invalid. */
		ch = g_utf8_get_char_validated(pos, -1);
		if (ch != (gunichar)-1 && ch != (gunichar)-2 && g_unichar_iswide(ch))
			i++;
	}
	return pos;
}
Exemple #14
0
/*
 * Lua binding Entry:set(line): replaces the entry's buffer with the UCS-4
 * conversion of line, puts the cursor at the end and resets the view offset.
 * Returns no Lua values; raises a Lua error when line cannot be converted.
 * The entry is marked dirty before the conversion is attempted.
 */
static int entry_set(LuaState *L)
{
    Entry *e         = ms_lua_checkclass(L, CLASS, 1);
    const char *line = luaL_checkstring(L, 2);

    e->dirty = TRUE; // was 0 - why?
    /* GLib requires the GError pointer to be NULL before it is passed in. */
    GError *error = NULL;
    glong written;
    gunichar *buffer = g_utf8_to_ucs4(line, -1, NULL, &written, &error);
    if  (buffer) {
        g_free(e->buffer);

        e->bufsize = e->bufused = written;
        e->curs_off = e->bufused;
        e->view_off = 0;
        e->buffer = buffer;
        return 0;
    } else {
        lua_pushfstring(L, "Entry:set() - UCS4 conversion failed: %s",
                        error ? error->message : "unknown error");
        /* Free before lua_error(), which does not return. */
        g_clear_error(&error);
        return lua_error(L);
    }
}
Exemple #15
0
/*
 * Puts text into the current line-input view and, when accept is set,
 * accepts the line. When there is no line-input view, a status message is
 * shown instead and nothing else happens.
 */
static void do_set_text(const char *text, gboolean accept)
{
    GglkText *view = GGLK_TEXT(gglk_get_line_input_view());

    if(view) {
	gunichar *ucs4 = g_utf8_to_ucs4(text, -1, NULL, NULL, NULL);

	gglk_text_line_input_set(view, view->line_maxlen, ucs4);
	g_free(ucs4);

	if(accept)
	    gglk_text_line_input_accept(view);
    } else {
	/* An empty text is a request to clear the input line. */
	char *status = (strcmp(text, "") == 0)
	    ? g_strdup("No window to clear")
	    : g_strdup_printf("No window to accept %s", text);

	sglk_status_set_mesg(status);
	g_free(status);
    }
}
// Reads whitespace separated records "pinyin phrase token freq" from infile
// and adds every phrase with its token to the index via add_index().
// Records that do not parse into four fields, or whose phrase cannot be
// converted to UCS-4, are skipped. Always returns true.
bool PhraseLargeTable3::load_text(FILE * infile){
    char pinyin[256];
    char phrase[256];
    phrase_token_t token;
    size_t freq;

    while (!feof(infile)) {
        // %zu matches size_t; %ld is only correct where long has the same
        // size and representation.
        int num = fscanf(infile, "%255s %255s %u %zu",
                         pinyin, phrase, &token, &freq);

        // fscanf reports both end of file and read errors as EOF; a read
        // error leaves feof() false and would otherwise spin forever.
        if (EOF == num)
            break;

        if (4 != num)
            continue;

        // NOTE(review): a final record that ends exactly at end of file is
        // dropped here, as before; presumably every record is expected to
        // be newline terminated. Confirm.
        if (feof(infile))
            break;

        glong phrase_len = 0;
        ucs4_t * new_phrase = g_utf8_to_ucs4(phrase, -1, NULL, &phrase_len, NULL);
        if (NULL == new_phrase)
            continue;   // not valid UTF-8

        add_index(phrase_len, new_phrase, token);

        g_free(new_phrase);
    }
    return true;
}
Exemple #17
0
int
is_name_start_char(const char *offset)
{
  /* TODO: manage length*/
  gunichar *converted;
  gunichar value;

  if (*offset == '\0') return FALSE;

  /* Easy ASCII-only tests */
  if (*offset == ':') return TRUE;
  if (*offset >= 'A' && *offset <= 'Z') return TRUE;
  if (*offset == '_') return TRUE;
  if (*offset >= 'a' && *offset <= 'z') return TRUE;

  /* TODO: error checking */
  converted = g_utf8_to_ucs4(offset, 1, NULL, NULL, NULL);
  value = *converted;
  free(converted);

  /* More expensive Unicode checks */
  if (value >= 0xC0 && value <= 0xD6) return TRUE;
  if (value >= 0xD8 && value <= 0xF6) return TRUE;
  if (value >= 0xF8 && value <= 0x2FF) return TRUE;
  if (value >= 0x370 && value <= 0x37D) return TRUE;
  if (value >= 0x37F && value <= 0x1FFF) return TRUE;
  if (value >= 0x200C && value <= 0x200D) return TRUE;
  if (value >= 0x2070 && value <= 0x218F) return TRUE;
  if (value >= 0x2C00 && value <= 0x2FEF) return TRUE;
  if (value >= 0x3001 && value <= 0xD7FF) return TRUE;
  if (value >= 0xF900 && value <= 0xFDCF) return TRUE;
  if (value >= 0xFDF0 && value <= 0xFFFD) return TRUE;
  if (value >= 0x10000 && value <= 0xEFFFF) return TRUE;

  return FALSE;
}
Exemple #18
0
/* Punycode encoder, RFC 3492 section 6.3. The algorithm is
 * sufficiently bizarre that it's not really worth trying to explain
 * here.
 *
 * Appends the encoding of the first input_utf8_length bytes of input_utf8
 * to output. Returns TRUE on success and FALSE when the input cannot be
 * converted to UCS-4 or when delta would overflow. On an overflow failure
 * output may already contain part of the encoding.
 */
static gboolean
punycode_encode (const gchar *input_utf8,
                 gsize        input_utf8_length,
		 GString     *output)
{
  guint delta, handled_chars, num_basic_chars, bias, j, q, k, t, digit;
  gunichar n, m, *input;
  glong input_length;
  gboolean success = FALSE;

  /* Convert from UTF-8 to Unicode code points */
  input = g_utf8_to_ucs4 (input_utf8, input_utf8_length, NULL,
			  &input_length, NULL);
  if (!input)
    return FALSE;

  /* Copy basic chars */
  for (j = num_basic_chars = 0; j < input_length; j++)
    {
      if (PUNYCODE_IS_BASIC (input[j]))
	{
	  /* Basic characters are emitted lower-cased. */
	  g_string_append_c (output, g_ascii_tolower (input[j]));
	  num_basic_chars++;
	}
    }
  /* The '-' delimiter is only written when at least one basic char was. */
  if (num_basic_chars)
    g_string_append_c (output, '-');

  handled_chars = num_basic_chars;

  /* Encode non-basic chars */
  delta = 0;
  bias = PUNYCODE_INITIAL_BIAS;
  n = PUNYCODE_INITIAL_N;
  while (handled_chars < input_length)
    {
      /* let m = the minimum {non-basic} code point >= n in the input */
      for (m = G_MAXUINT, j = 0; j < input_length; j++)
	{
	  if (input[j] >= n && input[j] < m)
	    m = input[j];
	}

      /* Bail out before the multiplication below can overflow delta. */
      if (m - n > (G_MAXUINT - delta) / (handled_chars + 1))
	goto fail;
      delta += (m - n) * (handled_chars + 1);
      n = m;

      for (j = 0; j < input_length; j++)
	{
	  if (input[j] < n)
	    {
	      /* A wrap to zero means delta overflowed. */
	      if (++delta == 0)
		goto fail;
	    }
	  else if (input[j] == n)
	    {
	      /* Emit delta as a variable-length integer: one digit per
	       * step of k, with the threshold t clamped to the range
	       * PUNYCODE_TMIN..PUNYCODE_TMAX. */
	      q = delta;
	      for (k = PUNYCODE_BASE; ; k += PUNYCODE_BASE)
		{
		  if (k <= bias)
		    t = PUNYCODE_TMIN;
		  else if (k >= bias + PUNYCODE_TMAX)
		    t = PUNYCODE_TMAX;
		  else
		    t = k - bias;
		  if (q < t)
		    break;
		  digit = t + (q - t) % (PUNYCODE_BASE - t);
		  g_string_append_c (output, encode_digit (digit));
		  q = (q - t) / (PUNYCODE_BASE - t);
		}

	      /* The remaining q is the final digit of this integer. */
	      g_string_append_c (output, encode_digit (q));
	      /* The third argument is true only for the first non-basic
	       * character that gets encoded. */
	      bias = adapt (delta, handled_chars + 1, handled_chars == num_basic_chars);
	      delta = 0;
	      handled_chars++;
	    }
	}

      delta++;
      n++;
    }

  success = TRUE;

  /* Shared exit: the UCS-4 copy is freed on success and on overflow. */
 fail:
  g_free (input);
  return success;
}
/* Interactive phrase lookup test: loads the tables from ../../data, then
 * reads sentences from stdin, one per line, and runs try_phrase_lookup() on
 * each until end of input or a line reading "quit". */
int main(int argc, char * argv[]){
    setlocale(LC_ALL, "");

    SystemTableInfo system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("../../data/phrase_index.bin");
    phrase_table.load(chunk, NULL);

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_table_info();

    /* init phrase index */
    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach("../../data/bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    gfloat lambda = system_table_info.get_lambda();

    /* init phrase lookup */
    PhraseLookup phrase_lookup(lambda,
                               &phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);

    /* try one sentence */
    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        /* Trim the newline using the length getline() reported;
         * strlen(linebuf) - 1 underflows when the buffer starts with a
         * NUL byte. */
        if ( read > 0 && '\n' == linebuf[read - 1] ) {
            linebuf[read - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        /* A failed conversion is rejected explicitly rather than relying
         * on the two character counts happening to differ. */
        if ( NULL == sentence || len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters are not accepted.\n");
            g_free(sentence);
            continue;
        }

        try_phrase_lookup(&phrase_lookup, sentence, len);
        g_free(sentence);
    }

    free(linebuf);
    return 0;
}
/* Evaluates the correction rate on "evals.text". Each line of the file is
 * looked up as a phrase; consecutive known phrases are collected into one
 * test, which is run through do_one_test() when an unknown or empty line
 * (or the end of the file) is reached. Prints passed / tested at the end. */
int main(int argc, char * argv[]){
    const char * evals_text = "evals.text";

    pinyin_option_t options = USE_TONE;
    FacadeChewingTable largetable;

    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("pinyin_index.bin");
    largetable.load(options, chunk, NULL);

    FacadePhraseTable2 phrase_table;
    chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk, NULL);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    Bigram system_bigram;
    system_bigram.attach("bigram.db", ATTACH_READONLY);
    Bigram user_bigram;
    user_bigram.attach(NULL, ATTACH_CREATE|ATTACH_READWRITE);

    PinyinLookup2 pinyin_lookup(options, &largetable, &phrase_index,
                               &system_bigram, &user_bigram);

    /* open evals.text. */
    FILE * evals_file = fopen(evals_text, "r");
    if ( NULL == evals_file ) {
        fprintf(stderr, "Can't open file:%s\n", evals_text);
        exit(ENOENT);
    }

    PhraseTokens phrase_tokens;
    memset(phrase_tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(phrase_tokens);

    /* Evaluates the correction rate of test text documents. */
    size_t tested_count = 0; size_t passed_count = 0;
    char* linebuf = NULL; size_t size = 0; ssize_t read;
    TokenVector tokens = g_array_new(FALSE, TRUE, sizeof(phrase_token_t));

    phrase_token_t token = null_token;
    /* getline() returns -1 at end of file and on error; testing the result
     * for truth never ends the loop on a read error. */
    while( (read = getline(&linebuf, &size, evals_file)) != -1 ) {
        if ( read > 0 && '\n' == linebuf[read - 1] )
            linebuf[read - 1] = '\0';

        glong phrase_len = 0;
        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);

        token = null_token;
        if ( 0 != phrase_len ) {
            /* Start every search from empty token arrays so that results
             * of earlier lines cannot be picked up again. */
            phrase_index.clear_tokens(phrase_tokens);
            int result = phrase_table.search(phrase_len, phrase, phrase_tokens);
            get_first_token(phrase_tokens, token);

            if ( !(result & SEARCH_OK) )
                token = null_token;
        }

        /* Freed for empty lines too, which still yield an allocated buffer. */
        g_free(phrase);
        phrase = NULL;

        if ( null_token == token ) {
            if ( tokens->len ) { /* one test. */
                if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
                    tested_count ++; passed_count ++;
                } else {
                    tested_count ++;
                }
                g_array_set_size(tokens, 0);
            }
        } else {
            g_array_append_val(tokens, token);
        }
    }

    if ( tokens->len ) { /* one test. */
        if ( do_one_test(&pinyin_lookup, &phrase_index, tokens) ) {
            tested_count ++; passed_count ++;
        } else {
            tested_count ++;
        }
    }

    parameter_t rate = passed_count / (parameter_t) tested_count;
    printf("correction rate:%f\n", rate);

    g_array_free(tokens, TRUE);
    fclose(evals_file);
    free(linebuf);

    phrase_index.destroy_tokens(phrase_tokens);

    return 0;
}
/*
 * Checks one test case. utf8 is the input string, status its expected
 * classification, and ucs4 / ucs4_len the expected UCS-4 decoding. line is
 * only used in the failure messages.
 *
 * Every mismatch is reported through fail() and ends the check at once.
 * NOTE(review): those early returns skip the g_free() calls further down,
 * so buffers allocated up to that point are not released on failure.
 */
static void
process (gint      line,
	 gchar    *utf8,
	 Status    status,
	 gunichar *ucs4,
	 gint      ucs4_len)
{
  const gchar *end;
  gboolean is_valid = g_utf8_validate (utf8, -1, &end);
  GError *error = NULL;
  glong items_read, items_written;

  /* g_utf8_validate must agree with the expected classification: only
   * VALID input may validate. */
  switch (status)
    {
    case VALID:
      if (!is_valid)
	{
	  fail ("line %d: valid but g_utf8_validate returned FALSE\n", line);
	  return;
	}
      break;
    case NOTUNICODE:
    case INCOMPLETE:
    case OVERLONG:
    case MALFORMED:
      if (is_valid)
	{
	  fail ("line %d: invalid but g_utf8_validate returned TRUE\n", line);
	  return;
	}
      break;
    }

  if (status == INCOMPLETE)
    {
      gunichar *ucs4_result;      

      /* Without an items_read pointer the conversion is expected to report
       * G_CONVERT_ERROR_PARTIAL_INPUT. */
      ucs4_result = g_utf8_to_ucs4 (utf8, -1, NULL, NULL, &error);

      if (!error || !g_error_matches (error, G_CONVERT_ERROR, G_CONVERT_ERROR_PARTIAL_INPUT))
	{
	  fail ("line %d: incomplete input not properly detected\n", line);
	  return;
	}
      g_clear_error (&error);

      /* With an items_read pointer the conversion is expected to succeed
       * and to report that it read less than the whole string. */
      ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, NULL, &error);

      if (!ucs4_result || items_read == strlen (utf8))
	{
	  fail ("line %d: incomplete input not properly detected\n", line);
	  return;
	}

      g_free (ucs4_result);
    }

  if (status == VALID || status == NOTUNICODE)
    {
      gunichar *ucs4_result;
      gchar *utf8_result;

      /* UTF-8 -> UCS-4 with the checking converter: content and both
       * counts must match the expectation. */
      ucs4_result = g_utf8_to_ucs4 (utf8, -1, &items_read, &items_written, &error);
      if (!ucs4_result)
	{
	  fail ("line %d: conversion to ucs4 failed: %s\n", line, error->message);
	  return;
	}
      
      if (!ucs4_equal (ucs4_result, ucs4) ||
	  items_read != strlen (utf8) ||
	  items_written != ucs4_len)
	{
	  fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
	  return;
	}

      g_free (ucs4_result);

      /* The same conversion through g_utf8_to_ucs4_fast. */
      ucs4_result = g_utf8_to_ucs4_fast (utf8, -1, &items_written);
      
      if (!ucs4_equal (ucs4_result, ucs4) ||
	  items_written != ucs4_len)
	{
	  fail ("line %d: results of conversion to ucs4 do not match expected.\n", line);
	  return;
	}

      /* UCS-4 -> UTF-8 must reproduce the original string. */
      utf8_result = g_ucs4_to_utf8 (ucs4_result, -1, &items_read, &items_written, &error);
      if (!utf8_result)
	{
	  fail ("line %d: conversion back to utf8 failed: %s", line, error->message);
	  return;
	}

      if (strcmp (utf8_result, utf8) != 0 ||
	  items_read != ucs4_len ||
	  items_written != strlen (utf8))
	{
	  fail ("line %d: conversion back to utf8 did not match original\n", line);
	  return;
	}

      g_free (utf8_result);
      g_free (ucs4_result);
    }

  if (status == VALID)
    {
      gunichar2 *utf16_expected_tmp;
      gunichar2 *utf16_expected;
      gunichar2 *utf16_from_utf8;
      gunichar2 *utf16_from_ucs4;
      gunichar *ucs4_result;
      gsize bytes_written;
      gint n_chars;
      gchar *utf8_result;

#if G_BYTE_ORDER == G_LITTLE_ENDIAN
#define TARGET "UTF-16LE"
#else
#define TARGET "UTF-16"
#endif

      /* Reference UTF-16 encoding, produced by g_convert. */
      if (!(utf16_expected_tmp = (gunichar2 *)g_convert (utf8, -1, TARGET, "UTF-8",
							 NULL, &bytes_written, NULL)))
	{
	  fail ("line %d: could not convert to UTF-16 via g_convert\n", line);
	  return;
	}

      /* zero-terminate and remove BOM
       */
      n_chars = bytes_written / 2;
      if (utf16_expected_tmp[0] == 0xfeff) /* BOM */
	{
	  n_chars--;
	  utf16_expected = g_new (gunichar2, n_chars + 1);
	  memcpy (utf16_expected, utf16_expected_tmp + 1, sizeof(gunichar2) * n_chars);
	}
      else if (utf16_expected_tmp[0] == 0xfffe) /* ANTI-BOM */
	{
	  fail ("line %d: conversion via iconv to \"UTF-16\" is not native-endian\n", line);
	  return;
	}
      else
	{
	  utf16_expected = g_new (gunichar2, n_chars + 1);
	  memcpy (utf16_expected, utf16_expected_tmp, sizeof(gunichar2) * n_chars);
	}

      utf16_expected[n_chars] = '\0';
      
      /* UTF-8 -> UTF-16 */
      if (!(utf16_from_utf8 = g_utf8_to_utf16 (utf8, -1, &items_read, &items_written, &error)))
	{
	  fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
	  return;
	}

      if (items_read != strlen (utf8) ||
	  utf16_count (utf16_from_utf8) != items_written)
	{
	  fail ("line %d: length error in conversion to ucs16\n", line);
	  return;
	}

      /* UCS-4 -> UTF-16 */
      if (!(utf16_from_ucs4 = g_ucs4_to_utf16 (ucs4, -1, &items_read, &items_written, &error)))
	{
	  fail ("line %d: conversion to ucs16 failed: %s\n", line, error->message);
	  return;
	}

      if (items_read != ucs4_len ||
	  utf16_count (utf16_from_ucs4) != items_written)
	{
	  fail ("line %d: length error in conversion to ucs16\n", line);
	  return;
	}

      /* Both UTF-16 results must equal the g_convert reference. */
      if (!utf16_equal (utf16_from_utf8, utf16_expected) ||
	  !utf16_equal (utf16_from_ucs4, utf16_expected))
	{
	  fail ("line %d: results of conversion to ucs16 do not match\n", line);
	  return;
	}

      /* UTF-16 -> UTF-8 */
      if (!(utf8_result = g_utf16_to_utf8 (utf16_from_utf8, -1, &items_read, &items_written, &error)))
	{
	  fail ("line %d: conversion back to utf8 failed: %s\n", line, error->message);
	  return;
	}

      if (items_read != utf16_count (utf16_from_utf8) ||
	  items_written != strlen (utf8))
	{
	  fail ("line %d: length error in conversion from ucs16 to utf8\n", line);
	  return;
	}

      /* UTF-16 -> UCS-4 */
      if (!(ucs4_result = g_utf16_to_ucs4 (utf16_from_ucs4, -1, &items_read, &items_written, &error)))
	{
	  fail ("line %d: conversion back to utf8/ucs4 failed\n", line);
	  return;
	}

      if (items_read != utf16_count (utf16_from_utf8) ||
	  items_written != ucs4_len)
	{
	  fail ("line %d: length error in conversion from ucs16 to ucs4\n", line);
	  return;
	}

      /* Both round trips must end at the original data. */
      if (strcmp (utf8, utf8_result) != 0 ||
	  !ucs4_equal (ucs4, ucs4_result))
	{
	  fail ("line %d: conversion back to utf8/ucs4 did not match original\n", line);
	  return;
	}
      
      g_free (utf16_expected_tmp);
      g_free (utf16_expected);
      g_free (utf16_from_utf8);
      g_free (utf16_from_ucs4);
      g_free (utf8_result);
      g_free (ucs4_result);
    }
}
Exemple #22
0
/* Trains uni-gram and bi-gram frequencies from stdin, one phrase per line.
 *
 * Options:
 *   --help                     print usage and exit
 *   --skip-pi-gram-training    do not train pairs that start a sentence
 *   --bigram-file <file>       bi-gram database to update (default bigram.db)
 *
 * Lines that are empty or not found in the phrase table act as sentence
 * boundaries. The updated phrase index is saved before exiting. */
int main(int argc, char * argv[]){
    int i = 1;
    bool train_pi_gram = true;
    const char * bigram_filename = "bigram.db";

    setlocale(LC_ALL, "");
    while ( i < argc ){
        if ( strcmp("--help", argv[i]) == 0){
            print_help();
            exit(0);
        }else if ( strcmp("--skip-pi-gram-training", argv[i]) == 0 ){
            train_pi_gram = false;
        }else if ( strcmp("--bigram-file", argv[i]) == 0){
            if ( ++i >= argc ) {
                print_help();
                exit(EINVAL);
            }
            bigram_filename = argv[i];
        }else{
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    PhraseLargeTable2 phrase_table;
    /* init phrase table */
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk);

    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    Bigram bigram;
    bigram.attach(bigram_filename, ATTACH_CREATE|ATTACH_READWRITE);

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(tokens);

    char* linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    phrase_token_t last_token, cur_token = last_token = 0;
    /* getline() returns -1 at end of file and on error; testing the result
     * for truth never ends the loop on a read error. */
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        /* Only strip a newline that is really there, so that a final line
         * without one keeps its last character. */
        if ( read > 0 && '\n' == linebuf[read - 1] )
            linebuf[read - 1] = '\0';

        glong phrase_len = 0;
        ucs4_t * phrase = g_utf8_to_ucs4(linebuf, -1, NULL, &phrase_len, NULL);

        phrase_token_t token = null_token;
        if ( 0 != phrase_len ) {
            phrase_index.clear_tokens(tokens);
            int result = phrase_table.search(phrase_len, phrase, tokens);
            get_first_token(tokens, token);
            if ( !(result & SEARCH_OK) )
                token = null_token;
        }

        /* Freed for empty lines too, which still yield an allocated buffer. */
        g_free(phrase);
        phrase = NULL;

        last_token = cur_token;
        cur_token = token;

        /* skip null_token in second word. */
        if ( null_token == cur_token )
            continue;

        /* training uni-gram */
        phrase_index.add_unigram_frequency(cur_token, 1);

        /* skip pi-gram training. */
        if ( null_token == last_token ){
            if ( !train_pi_gram )
                continue;
            last_token = sentence_start;
        }

        /* train bi-gram */
        SingleGram * single_gram = NULL;
        bigram.load(last_token, single_gram);

        if ( NULL == single_gram ){
            single_gram = new SingleGram;
        }
        guint32 freq, total_freq;
        /* increase freq; the update runs outside assert() so that it still
         * happens when assertions are compiled out with NDEBUG. */
        bool updated;
        if (single_gram->get_freq(cur_token, freq))
            updated = single_gram->set_freq(cur_token, freq + 1);
        else
            updated = single_gram->insert_freq(cur_token, 1);
        assert(updated);
        (void) updated;
        /* increase total freq */
        single_gram->get_total_freq(total_freq);
        single_gram->set_total_freq(total_freq + 1);

        bigram.store(last_token, single_gram);
        delete single_gram;
    }

    phrase_index.destroy_tokens(tokens);
    free(linebuf);

    if (!save_phrase_index(&phrase_index))
        exit(ENOENT);

    return 0;
}
Exemple #23
0
/*
 * Interactive lookup/benchmark driver for PhraseLargeTable3.
 *
 * Loads "../../data/table.conf" and the default phrase tables, then reads
 * phrases from stdin, one per line, until EOF or a line reading "quit".
 * For every non-empty phrase it:
 *   - times bench_times full-length searches,
 *   - reports each proper prefix length whose search result has the
 *     SEARCH_CONTINUED bit set,
 *   - prints every token found for the full phrase when SEARCH_OK is set.
 *
 * Exits with ENOENT when the configuration or the phrase tables cannot be
 * loaded; returns 0 otherwise.
 */
int main(int argc, char * argv[]){
    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load("../../data/table.conf");
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    PhraseLargeTable3 largetable;
    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_default_tables();

    TABLE_PHONETIC_TYPE type = system_table_info.get_table_phonetic_type();
    if (!load_phrase_table(phrase_files, NULL,
                           &largetable, &phrase_index, type))
        exit(ENOENT);

#if 0
    MemoryChunk * chunk = new MemoryChunk;
    largetable.store(chunk);
    largetable.load(chunk);
#endif

    char* linebuf = NULL; size_t size = 0; ssize_t read;
    while ((read = getline(&linebuf, &size, stdin)) != -1) {
        /* strip the trailing newline; the length guard keeps us from
         * touching linebuf[-1] when the C string is empty. */
        size_t line_len = strlen(linebuf);
        if ( line_len > 0 && '\n' == linebuf[line_len - 1] ) {
            linebuf[line_len - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        /* test for an empty line before converting, so that nothing is
         * allocated (and leaked) for lines that are skipped. */
        glong phrase_len = g_utf8_strlen(linebuf, -1);
        if (0 == phrase_len)
            continue;

        /* g_utf8_to_ucs4 returns NULL for invalid UTF-8 input. */
        ucs4_t * new_phrase = g_utf8_to_ucs4(linebuf, -1, NULL, NULL, NULL);
        if (NULL == new_phrase) {
            fprintf(stderr, "invalid utf-8 input:%s.\n", linebuf);
            continue;
        }

        PhraseTokens tokens;
        memset(tokens, 0, sizeof(PhraseTokens));
        phrase_index.prepare_tokens(tokens);

        guint32 start = record_time();
        size_t i = 0;
        for (i = 0; i < bench_times; ++i){
            phrase_index.clear_tokens(tokens);
            largetable.search(phrase_len, new_phrase, tokens);
        }
        print_time(start, bench_times);

        /* test search continued information. */
        int result = SEARCH_NONE;
        for (i = 1; i < (size_t) phrase_len; ++i) {
            phrase_index.clear_tokens(tokens);
            result = largetable.search(i, new_phrase, tokens);
            if (result & SEARCH_CONTINUED)
                printf("return continued information with length:%zu\n", i);
        }

        phrase_index.clear_tokens(tokens);
        result = largetable.search(phrase_len, new_phrase, tokens);

        if (result & SEARCH_OK) {
            for (size_t lib = 0; lib < PHRASE_INDEX_LIBRARY_COUNT; ++lib) {
                GArray * array = tokens[lib];
                if (NULL == array)
                    continue;

                for (size_t k = 0; k < array->len; ++k) {
                    phrase_token_t token = g_array_index
                        (array, phrase_token_t, k);

                    printf("token:%d\t", token);
                }
            }
            printf("\n");
        }

        phrase_index.destroy_tokens(tokens);
        g_free(new_phrase);
    }

    /* free(NULL) is a no-op, so no guard is needed. */
    free(linebuf);

    /* mask out all index items. */
    largetable.mask_out(0x0, 0x0);

    return 0;
}
Exemple #24
0
/*
 * ipsumat_generate:
 * @dict_path:      path of a whitespace-separated word list; NULL selects
 *                  "/usr/share/dict/words".
 * @charset:        UTF-8 string of permitted characters; NULL selects the
 *                  lowercase ASCII letters a-z.
 * @desired_glyphs: passed to score_string() for every candidate sentence
 *                  and, when non-empty, prefixed to the result.
 * @max_wordlen:    words whose UTF-8 byte length exceeds this are dropped.
 * @max_words:      words per sentence; clamped to MAX_WORDS.
 *
 * Collects every dictionary word made only of @charset characters, then
 * makes `attempts` random sentences of @max_words words and keeps the one
 * with the highest score_string() result (ties go to the later attempt).
 * When no word qualifies the random search is skipped, because there is
 * nothing to pick from and rand() % 0 would be undefined.
 *
 * Returns: a newly allocated string.  On success it is allocated with
 * strdup(); when the dictionary or charset cannot be read or decoded a
 * g_strdup()'ed diagnostic message is returned instead.
 */
char *ipsumat_generate (const char *dict_path,
                        const char *charset,
                        const char *desired_glyphs,
                        int         max_wordlen,
                        int         max_words)
{
  gunichar *p;
  gunichar *ucharset;

  int      count = 0;
  int      best_sentence[MAX_WORDS]={0,};
  int      sentence[MAX_WORDS]={0,};
  char    *words_str = NULL;
  gunichar *uwords_str = NULL;
  GList   *words = NULL;
  GString *word;
  int      best_score = 0;
  int      i;
  if (!dict_path)
    dict_path = "/usr/share/dict/words";

  g_file_get_contents (dict_path, &words_str, NULL, NULL);

  if (!words_str)
    return g_strdup ("problem opening dictionary");

  /* g_utf8_to_ucs4 returns NULL when its input is not valid UTF-8 */
  uwords_str = g_utf8_to_ucs4 (words_str, -1, NULL, NULL, NULL);
  if (!uwords_str)
    {
      g_free (words_str);
      return g_strdup ("problem decoding dictionary");
    }

  if (charset == NULL)
    charset = "abcdefghijklmnopqrstuvwxyz";

  ucharset = g_utf8_to_ucs4 (charset, -1, NULL, NULL, NULL);
  if (!ucharset)
    {
      g_free (words_str);
      g_free (uwords_str);
      return g_strdup ("problem decoding charset");
    }

  if (max_words > MAX_WORDS)
    max_words = MAX_WORDS;

  /* split the dictionary on whitespace, keeping the acceptable words */
  word = g_string_new ("");
  for (p = uwords_str; *p; p++)
    {
      switch (*p)
      {
        case '\n':
        case '\r':
        case ' ':
        case '\t':
          if (word->len)
          {
            int skip = 0;
            int c;
            gunichar *uword = g_utf8_to_ucs4 (word->str, -1, NULL, NULL, NULL);
            /* skip ends up counting the characters missing from charset */
            for (c = 0; uword[c]; c++)
              {
                int k;
                skip++;
                for (k = 0; ucharset[k]; k++)
                  if (ucharset[k]==uword[c])
                    {
                      skip--;break;
                    }
              }
            if (word->len > max_wordlen)
              skip++;

            if (!skip)
            {
              /* prepending means index 0 is the most recently added word */
              words = g_list_prepend (words, g_strdup (word->str));
              count ++;
            }
            g_free (uword);
          }
          g_string_assign (word, "");
          break;
        default:
          g_string_append_unichar (word, *p);
          break;
      }
    }
  g_string_free (word, TRUE);
  g_free (ucharset);
  g_free (words_str);
  g_free (uwords_str);

  /* random search; skipped entirely when there are no candidate words */
  for (i = 0; count > 0 && i < attempts; i ++)
    {
      GString *example = g_string_new ("");
      int j;
      for (j = 0; j < max_words; j ++)
        {
          int n;
          const char *str;
          n = rand()%count;
 
          {
            int k;
            for (k = 0; k < j; k++)
              if (sentence[k]==n)
                {
                  /* we try once more if it collides with already picked
                   * random number,. - but this value will stick */
                  n = rand()%count;
                  break;
                }
          }
          sentence[j] = n;
          
          str = g_list_nth_data (words, n);
          if (str)
          {
            if (j)
              g_string_append (example, " ");
            g_string_append (example, str);
          }
        }
      float score = score_string ((void*)example->str, desired_glyphs);
      if (score >= best_score)
      {
        for (j = 0; j < max_words; j ++)
          best_sentence[j] = sentence[j];
        best_score = score;
      }
      g_string_free (example, TRUE);
    }

  if (print_score)
    printf ("Score: %i\n", best_score);

  {
    char *ret = NULL;
    int j;
    GString *s = g_string_new ("");

    if (desired_glyphs && desired_glyphs[0])
      {
        g_string_append (s, desired_glyphs);
        g_string_append (s, " ");
      }

    for (j = 0; j < max_words; j ++)
    {
      const char *str;
      str = g_list_nth_data (words, best_sentence[j]);
      if (str)
        {
          if (j)
            g_string_append (s, " ");
          g_string_append (s, str);
        }
    }
    ret = strdup (s->str);
    g_string_free (s, TRUE);
    /* the list owns g_strdup()'ed copies of the words */
    g_list_free_full (words, g_free);
    return ret;
  }
}
Exemple #25
0
int main(int argc, char * argv[]){
    int i = 1;
    bool gen_extra_enter = false;

    setlocale(LC_ALL, "");
    //deal with options.
    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0) {
            print_help();
            exit(0);
        } else if (strcmp("--generate-extra-enter", argv[i]) == 0) {
            gen_extra_enter = true;
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk, NULL);

    /* init phrase index */
    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    char * linebuf = NULL;
    size_t size = 0;
    ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        //check non-ucs4 characters
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
            printf("\n");
            continue;
        }

        //do segment stuff
        GArray * strings = g_array_new(TRUE, TRUE, sizeof(SegmentStep));
        segment(&phrase_table, &phrase_index, sentence, len, strings);

        //print out the split phrase
        for ( glong i = 0; i < strings->len; ++i ) {
            SegmentStep * step = &g_array_index(strings, SegmentStep, i);
            char * string = g_ucs4_to_utf8( step->m_phrase, step->m_phrase_len, NULL, NULL, NULL);
            printf("%d %s\n", step->m_handle, string);
            g_free(string);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            printf("\n");

        g_array_free(strings, TRUE);
        g_free(sentence);
    }

    /* print enter at file tail */
    printf("\n");
    return 0;
}
/*
 * GObject set_property handler for GimpNumberPairEntry.
 *
 * Dispatches on property_id: number, default-number, override, text, ratio
 * and aspect properties are forwarded to the matching
 * gimp_number_pair_entry_set_*() function, while the separators,
 * simplification flag and valid-value bounds are written straight into the
 * private struct.  Unknown ids are reported through
 * G_OBJECT_WARN_INVALID_PROPERTY_ID.
 */
static void
gimp_number_pair_entry_set_property (GObject      *object,
                                     guint         property_id,
                                     const GValue *value,
                                     GParamSpec   *pspec)
{
  GimpNumberPairEntry        *entry = GIMP_NUMBER_PAIR_ENTRY (object);
  GimpNumberPairEntryPrivate *priv  = GIMP_NUMBER_PAIR_ENTRY_GET_PRIVATE (entry);

  switch (property_id)
    {
    case PROP_LEFT_NUMBER:
      {
        /* replace the left value, keep the current right value */
        gdouble new_left = g_value_get_double (value);

        gimp_number_pair_entry_set_values (entry, new_left, priv->right_number);
      }
      break;

    case PROP_RIGHT_NUMBER:
      {
        /* replace the right value, keep the current left value */
        gdouble new_right = g_value_get_double (value);

        gimp_number_pair_entry_set_values (entry, priv->left_number, new_right);
      }
      break;

    case PROP_DEFAULT_LEFT_NUMBER:
      {
        gdouble new_default_left = g_value_get_double (value);

        gimp_number_pair_entry_set_default_values (entry,
                                                   new_default_left,
                                                   priv->default_right_number);
      }
      break;

    case PROP_DEFAULT_RIGHT_NUMBER:
      {
        gdouble new_default_right = g_value_get_double (value);

        gimp_number_pair_entry_set_default_values (entry,
                                                   priv->default_left_number,
                                                   new_default_right);
      }
      break;

    case PROP_USER_OVERRIDE:
      {
        gboolean override = g_value_get_boolean (value);

        gimp_number_pair_entry_set_user_override (entry, override);
      }
      break;

    case PROP_SEPARATORS:
      {
        const gchar *text = g_value_get_string (value);

        /* drop the previous separators, then decode the new ones if any */
        g_free (priv->separators);
        priv->separators     = NULL;
        priv->num_separators = 0;

        if (text != NULL)
          priv->separators = g_utf8_to_ucs4 (text, -1,
                                             NULL, &priv->num_separators, NULL);
      }
      break;

    case PROP_DEFAULT_TEXT:
      {
        const gchar *text = g_value_get_string (value);

        gimp_number_pair_entry_set_default_text (entry, text);
      }
      break;

    case PROP_ALLOW_SIMPLIFICATION:
      priv->allow_simplification = g_value_get_boolean (value);
      break;

    case PROP_MIN_VALID_VALUE:
      priv->min_valid_value = g_value_get_double (value);
      break;

    case PROP_MAX_VALID_VALUE:
      priv->max_valid_value = g_value_get_double (value);
      break;

    case PROP_RATIO:
      {
        gdouble ratio = g_value_get_double (value);

        gimp_number_pair_entry_set_ratio (entry, ratio);
      }
      break;

    case PROP_ASPECT:
      {
        gint aspect = g_value_get_enum (value);

        gimp_number_pair_entry_set_aspect (entry, aspect);
      }
      break;

    default:
      G_OBJECT_WARN_INVALID_PROPERTY_ID (object, property_id, pspec);
      break;
    }
}
/*
 * Interactive lookup/benchmark driver for PhraseLargeTable2.
 *
 * Loads the phrase table, round-trips it through a MemoryChunk
 * (store followed by load), then reads phrases from stdin, one per line,
 * until EOF or a line reading "quit".  For every non-empty phrase it times
 * bench_times searches and prints the tokens found when the search result
 * has the SEARCH_OK bit set.
 *
 * Exits with ENOENT when the phrase table cannot be loaded; returns 0
 * otherwise.
 */
int main(int argc, char * argv[]){
    PhraseLargeTable2 largetable;
    FacadePhraseIndex phrase_index;

    if (!load_phrase_table(NULL, &largetable, &phrase_index))
        exit(ENOENT);

    MemoryChunk * chunk = new MemoryChunk;
    largetable.store(chunk);
    largetable.load(chunk);

    char* linebuf = NULL; size_t size = 0; ssize_t read;
    while ((read = getline(&linebuf, &size, stdin)) != -1) {
        /* strip the trailing newline; the length guard keeps us from
         * touching linebuf[-1] when the C string is empty. */
        size_t line_len = strlen(linebuf);
        if ( line_len > 0 && '\n' == linebuf[line_len - 1] ) {
            linebuf[line_len - 1] = '\0';
        }

        if ( strcmp ( linebuf, "quit" ) == 0)
            break;

        /* test for an empty line before converting, so that nothing is
         * allocated (and leaked) for lines that are skipped. */
        glong phrase_len = g_utf8_strlen(linebuf, -1);
        if (0 == phrase_len)
            continue;

        /* g_utf8_to_ucs4 returns NULL for invalid UTF-8 input. */
        ucs4_t * new_phrase = g_utf8_to_ucs4(linebuf, -1, NULL, NULL, NULL);
        if (NULL == new_phrase) {
            fprintf(stderr, "invalid utf-8 input:%s.\n", linebuf);
            continue;
        }

        PhraseTokens tokens;
        memset(tokens, 0, sizeof(PhraseTokens));
        phrase_index.prepare_tokens(tokens);

        guint32 start = record_time();
        for (size_t i = 0; i < bench_times; ++i){
            phrase_index.clear_tokens(tokens);
            largetable.search(phrase_len, new_phrase, tokens);
        }
        print_time(start, bench_times);

        phrase_index.clear_tokens(tokens);
        int retval = largetable.search(phrase_len, new_phrase, tokens);

        if (retval & SEARCH_OK) {
            for (size_t i = 0; i < PHRASE_INDEX_LIBRARY_COUNT; ++i) {
                GArray * array = tokens[i];
                if (NULL == array)
                    continue;

                for (size_t k = 0; k < array->len; ++k) {
                    phrase_token_t token = g_array_index
                        (array, phrase_token_t, k);

                    printf("token:%d\t", token);
                }
            }
            printf("\n");
        }

        phrase_index.destroy_tokens(tokens);
        g_free(new_phrase);
    }

    /* free(NULL) is a no-op, so no guard is needed. */
    free(linebuf);
    return 0;
}
Exemple #28
0
int main(int argc, char * argv[]){
    int i = 1;
    bool gen_extra_enter = false;

    setlocale(LC_ALL, "");
    /* deal with options */
    while ( i < argc ){
        if ( strcmp ("--help", argv[i]) == 0 ){
            print_help();
            exit(0);
        } else if ( strcmp("--generate-extra-enter", argv[i]) == 0 ){
            gen_extra_enter = true;
        } else {
            print_help();
            exit(EINVAL);
        }
        ++i;
    }

    /* init phrase table */
    FacadePhraseTable2 phrase_table;
    MemoryChunk * chunk = new MemoryChunk;
    chunk->load("phrase_index.bin");
    phrase_table.load(chunk, NULL);

    /* init phrase index */
    FacadePhraseIndex phrase_index;
    if (!load_phrase_index(&phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach("bigram.db", ATTACH_READONLY);
    Bigram user_bigram;

    /* init phrase lookup */
    PhraseLookup phrase_lookup(&phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);


    CONTEXT_STATE state, next_state;
    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(tokens);

    /* split the sentence */
    char * linebuf = NULL; size_t size = 0; ssize_t read;
    while( (read = getline(&linebuf, &size, stdin)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
            printf("\n");
            continue;
        }

        /* only new-line persists. */
        if ( 0  == num_of_chars ) {
            printf("\n");
            continue;
        }

        state = CONTEXT_INIT;
        int result = phrase_table.search( 1, sentence, tokens);
        g_array_append_val( current_ucs4, sentence[0]);
        if ( result & SEARCH_OK )
            state = CONTEXT_SEGMENTABLE;
        else
            state = CONTEXT_UNKNOWN;

        for ( int i = 1; i < num_of_chars; ++i) {
            int result = phrase_table.search( 1, sentence + i, tokens);
            if ( result & SEARCH_OK )
                next_state = CONTEXT_SEGMENTABLE;
            else
                next_state = CONTEXT_UNKNOWN;

            if ( state == next_state ){
                g_array_append_val(current_ucs4, sentence[i]);
                continue;
            }

            assert ( state != next_state );
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4);

            /* save the current character */
            g_array_set_size(current_ucs4, 0);
            g_array_append_val(current_ucs4, sentence[i]);
            state = next_state;
        }

        if ( current_ucs4->len ) {
            /* this seems always true. */
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4);
            g_array_set_size(current_ucs4, 0);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            printf("\n");
    }
    phrase_index.destroy_tokens(tokens);

    /* print enter at file tail */
    printf("\n");
    g_array_free(current_ucs4, TRUE);
    free(linebuf);
    return 0;
}
void feed_line (const char * phrase, const char * pinyin, const guint32 freq){
    phrase_item * new_phrase_ptr = (phrase_item *)
	malloc( sizeof(phrase_item));     
    new_phrase_ptr->length = g_utf8_strlen(phrase, -1);
	/* FIXME: modify ">" to ">=" according to pinyin_large_table.cpp
	 *	where is the code which I don't want to touch. :-)
	 */
	if (new_phrase_ptr->length >= MAX_PHRASE_LENGTH ) {
		fprintf(stderr, "too long phrase:%s\t%s\t%d\n", phrase,
			pinyin, freq);
		free(new_phrase_ptr);
		return;
	}
    new_phrase_ptr->uniphrase = g_utf8_to_ucs4(phrase, -1, NULL, NULL, NULL);
    
    PinyinDefaultParser parser;
    NullPinyinValidator validator;
    PinyinKeyVector keys;
    PinyinKeyPosVector poses;
    
    keys = g_array_new(FALSE, FALSE, sizeof( PinyinKey));
    poses = g_array_new(FALSE, FALSE, sizeof( PinyinKeyPos));
    parser.parse(validator, keys, poses, pinyin);

    GArray * array = (GArray *)g_tree_lookup(g_pinyin_tree, new_phrase_ptr);

    pinyin_and_freq_item value_item;
    value_item.pinyin = keys;
    value_item.freq = freq;
    
    if(new_phrase_ptr->length != value_item.pinyin->len){
	fprintf(stderr, "error:phrase:%s\tpinyin:%s\n", phrase, pinyin);
	return;
    }

    if ( array == NULL){
	array = g_array_new(FALSE, TRUE, sizeof(pinyin_and_freq_item));
	g_array_append_val(array, value_item);
	g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
	return;
    }
    bool found = false;
    for ( size_t i = 0; i < array->len ; ++i){
	pinyin_and_freq_item * old_value_item = &g_array_index(array, pinyin_and_freq_item, i);
	int result = pinyin_exact_compare((PinyinKey *)value_item.pinyin->data, 
					  (PinyinKey *)old_value_item->pinyin->data , value_item.pinyin->len);
	if ( result == 0 ){
	    printf("Duplicate item: phrase:%s\tpinyin:%s\tfreq:%u\n", 
		   phrase, pinyin, freq);
	    old_value_item->freq += freq;
	    found = true;
	}
    }

    g_array_free(poses, TRUE);
    
    if ( !found ){
	g_array_append_val(array, value_item);
	g_tree_insert(g_pinyin_tree, new_phrase_ptr, array);
    }else
	g_array_free(keys, TRUE);

    free(new_phrase_ptr);
    //g_array_free(keys, TRUE);
}
Exemple #30
0
int main(int argc, char * argv[]){
    FILE * input = stdin;
    FILE * output = stdout;

    setlocale(LC_ALL, "");

    GError * error = NULL;
    GOptionContext * context;

    context = g_option_context_new("- n-gram segment");
    g_option_context_add_main_entries(context, entries, NULL);
    if (!g_option_context_parse(context, &argc, &argv, &error)) {
        g_print("option parsing failed:%s\n", error->message);
        exit(EINVAL);
    }

    if (outputfile) {
        output = fopen(outputfile, "w");
        if (NULL == output) {
            perror("open file failed");
            exit(EINVAL);
        }
    }

    if (argc > 2) {
        fprintf(stderr, "too many arguments.\n");
        exit(EINVAL);
    }

    if (2 == argc) {
        input = fopen(argv[1], "r");
        if (NULL == input) {
            perror("open file failed");
            exit(EINVAL);
        }
    }

    SystemTableInfo2 system_table_info;

    bool retval = system_table_info.load(SYSTEM_TABLE_INFO);
    if (!retval) {
        fprintf(stderr, "load table.conf failed.\n");
        exit(ENOENT);
    }

    /* init phrase table */
    FacadePhraseTable3 phrase_table;
    phrase_table.load(SYSTEM_PHRASE_INDEX, NULL);

    /* init phrase index */
    FacadePhraseIndex phrase_index;

    const pinyin_table_info_t * phrase_files =
        system_table_info.get_default_tables();

    if (!load_phrase_index(phrase_files, &phrase_index))
        exit(ENOENT);

    /* init bi-gram */
    Bigram system_bigram;
    system_bigram.attach(SYSTEM_BIGRAM, ATTACH_READONLY);
    Bigram user_bigram;

    gfloat lambda = system_table_info.get_lambda();

    /* init phrase lookup */
    PhraseLookup phrase_lookup(lambda,
                               &phrase_table, &phrase_index,
                               &system_bigram, &user_bigram);


    CONTEXT_STATE state, next_state;
    GArray * current_ucs4 = g_array_new(TRUE, TRUE, sizeof(ucs4_t));

    PhraseTokens tokens;
    memset(tokens, 0, sizeof(PhraseTokens));
    phrase_index.prepare_tokens(tokens);

    /* split the sentence */
    char * linebuf = NULL; size_t size = 0; ssize_t read;
    while( (read = getline(&linebuf, &size, input)) != -1 ){
        if ( '\n' ==  linebuf[strlen(linebuf) - 1] ) {
            linebuf[strlen(linebuf) - 1] = '\0';
        }

        /* check non-ucs4 characters */
        const glong num_of_chars = g_utf8_strlen(linebuf, -1);
        glong len = 0;
        ucs4_t * sentence = g_utf8_to_ucs4(linebuf, -1, NULL, &len, NULL);
        if ( len != num_of_chars ) {
            fprintf(stderr, "non-ucs4 characters encountered:%s.\n", linebuf);
            fprintf(output, "%d \n", null_token);
            continue;
        }

        /* only new-line persists. */
        if ( 0  == num_of_chars ) {
            fprintf(output, "%d \n", null_token);
            continue;
        }

        state = CONTEXT_INIT;
        int result = phrase_table.search( 1, sentence, tokens);
        g_array_append_val( current_ucs4, sentence[0]);
        if ( result & SEARCH_OK )
            state = CONTEXT_SEGMENTABLE;
        else
            state = CONTEXT_UNKNOWN;

        for ( int i = 1; i < num_of_chars; ++i) {
            int result = phrase_table.search( 1, sentence + i, tokens);
            if ( result & SEARCH_OK )
                next_state = CONTEXT_SEGMENTABLE;
            else
                next_state = CONTEXT_UNKNOWN;

            if ( state == next_state ){
                g_array_append_val(current_ucs4, sentence[i]);
                continue;
            }

            assert ( state != next_state );
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4, output);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4, output);

            /* save the current character */
            g_array_set_size(current_ucs4, 0);
            g_array_append_val(current_ucs4, sentence[i]);
            state = next_state;
        }

        if ( current_ucs4->len ) {
            /* this seems always true. */
            if ( state == CONTEXT_SEGMENTABLE )
                deal_with_segmentable(&phrase_lookup, current_ucs4, output);

            if ( state == CONTEXT_UNKNOWN )
                deal_with_unknown(current_ucs4, output);
            g_array_set_size(current_ucs4, 0);
        }

        /* print extra enter */
        if ( gen_extra_enter )
            fprintf(output, "%d \n", null_token);

        g_free(sentence);
    }
    phrase_index.destroy_tokens(tokens);

    /* print enter at file tail */
    fprintf(output, "%d \n", null_token);
    g_array_free(current_ucs4, TRUE);
    free(linebuf);
    fclose(input);
    fclose(output);
    return 0;
}