//check if the specified word is a whitespace. FRISO_API int friso_whitespace( friso_charset_t charset, friso_task_t task ) { if ( charset == FRISO_UTF8 ) return utf8_whitespace(task->unicode); else if ( charset == FRISO_GBK ) return gbk_whitespace(task->buffer); return 0; }
/*
 * Collect all the CJK dictionary words that start at the current word.
 *
 * The entry for the word already held in task->buffer is added first,
 * without any check. Then up to friso->max_len - 1 further words are read
 * from idx, each one is appended to the accumulated string, and every
 * accumulated string found in the __LEX_CJK_WORDS__ dictionary is added too.
 * Reading stops at the end of the input, at a whitespace or at a non-CJK word.
 *
 * Side effects: task->buffer, task->bytes and task->unicode are overwritten
 * while reading ahead. idx is a local copy, so task->idx is not touched here.
 *
 * @param friso     supplies the dictionary (dic) and the match limit (max_len).
 * @param task      the segmentation task, task->buffer holds the first word.
 * @param idx       the position to start reading the following words from.
 * @return friso_array_t holding the matched entries. The entries are the
 *         ones stored in the dictionary (fetched with friso_dic_get),
 *         no entry is created for the array.
 */
__STATIC_API__ friso_array_t get_next_match(
        friso_t friso, friso_task_t task, uint_t idx )
{
    uint_t count;
    string_buffer_t word = new_string_buffer_with_string( task->buffer );
    friso_array_t match = new_array_list_with_opacity( friso->max_len );

    //the first word always goes into the match list.
    array_list_add( match,
            friso_dic_get( friso->dic, __LEX_CJK_WORDS__, task->buffer ) );

    for ( count = 1; count < friso->max_len; count++ ) {
        task->bytes = read_next_word( task, &idx, task->buffer );
        if ( task->bytes == 0 ) {
            break;
        }

        task->unicode = get_utf8_unicode( task->buffer );
        if ( utf8_whitespace( task->unicode )
                || ! utf8_cjk_string( task->unicode ) ) {
            break;
        }

        //extend the candidate word and look it up.
        string_buffer_append( word, task->buffer );
        if ( ! friso_dic_match( friso->dic, __LEX_CJK_WORDS__, word->buffer ) ) {
            continue;
        }

        /*
         * store the address of the entry kept by the dictionary,
         * so there is nothing to allocate or to release for it.
         */
        array_list_add( match,
                friso_dic_get( friso->dic, __LEX_CJK_WORDS__, word->buffer ) );
    }

    //the candidate buffer is not needed anymore.
    free_string_buffer( word );

    return match;
}
//get the next cjk word from the current position, with simple mode. __STATIC_API__ friso_hits_t next_simple_cjk( friso_t friso, friso_task_t task ) { uint_t t, idx = task->idx, __length__; string_buffer_t sb = new_string_buffer_with_string( task->buffer ); lex_entry_t e = friso_dic_get( friso->dic, __LEX_CJK_WORDS__, sb->buffer ); /* * here bak the e->length in the task->hits->type. * we will use it to count the task->idx. * for the sake of use less variable. */ __length__ = e->length; for ( t = 1; t < friso->max_len && ( task->bytes = read_next_word( task, &idx, task->buffer ) ) != 0; t++ ) { task->unicode = get_utf8_unicode( task->buffer ); if ( utf8_whitespace( task->unicode ) ) break; if ( ! utf8_cjk_string( task->unicode ) ) break; string_buffer_append( sb, task->buffer ); //check the existence of the word by search the dictionary. if ( friso_dic_match( friso->dic, __LEX_CJK_WORDS__, sb->buffer ) ) { e = friso_dic_get( friso->dic, __LEX_CJK_WORDS__, sb->buffer ); } } //correct the offset of the segment. task->idx += ( e->length - __length__ ); free_string_buffer( sb ); //free the buffer //reset the hits. task->hits->word = e->word; task->hits->type = __FRISO_SYS_WORDS__; return task->hits; }
/*
 * Get the next segmentation result, this is the friso interface function.
 *
 * Words waiting in task->poll are served first. Otherwise words are read
 * from task->idx until one of them starts a segment:
 *  - a CJK word found in the __LEX_CJK_WORDS__ dictionary is handled by
 *    next_complex_cjk or next_simple_cjk, depending on _mode.
 *  - a half-width or full-width letter/digit that is not an english
 *    punctuation is handled by next_basic_latin.
 * Whitespaces and any other word are skipped.
 *
 * @param friso     the friso instance, supplies the dictionary.
 * @param _mode     __FRISO_COMPLEX_MODE__ selects the complex CJK matching,
 *                  any other value selects the simple one.
 * @param task      the segmentation task, its idx, bytes, buffer, unicode
 *                  and hits fields are updated.
 * @return task->hits for the next word, or NULL when no word is left.
 */
__EXTERN_API__ friso_hits_t friso_next(
        friso_t friso, friso_mode_t _mode, friso_task_t task )
{
    lex_entry_t lex = NULL;

    /*
     * serve the word poll first if it is not empty.
     * often synonyms and newly created words are stored in the poll.
     */
    if ( ! link_list_empty( task->poll ) ) {
        lex = ( lex_entry_t ) link_list_remove_first( task->poll );
        task->hits->word = lex->word;
        task->hits->offset = task->idx;

        if ( lex->type == __LEX_OTHER_WORDS__ ) {
            task->hits->type = __FRISO_NEW_WORDS__;
            //entries of the other words type are released once they are taken.
            free_lex_entry( lex );
        } else {
            task->hits->type = __FRISO_SYS_WORDS__;
        }

        return task->hits;
    }

    while ( task->idx < task->length ) {
        //read the next word from the current position.
        task->bytes = read_next_word( task, &task->idx, task->buffer );
        if ( task->bytes == 0 ) {
            break;
        }

        task->unicode = get_utf8_unicode( task->buffer );
        if ( utf8_whitespace( task->unicode ) ) {
            continue;
        }

        //task->idx is already behind the word, so step back to its start.
        task->hits->offset = task->idx - task->bytes;

        //CJK word known by the dictionary.
        if ( utf8_cjk_string( task->unicode )
                && friso_dic_match( friso->dic, __LEX_CJK_WORDS__, task->buffer ) ) {
            return ( _mode == __FRISO_COMPLEX_MODE__ )
                ? next_complex_cjk( friso, task )
                : next_simple_cjk( friso, task );
        }

        //basic latin letters and digits.
        if ( utf8_halfwidth_letter_digit( task->unicode )
                || utf8_fullwidth_letter_digit( task->unicode ) ) {
            if ( utf8_en_punctuation( task->unicode ) ) {
                continue;
            }
            return next_basic_latin( friso, task );
        }

        //placeholders: these two kinds of word are recognized but skipped.
        if ( utf8_letter_number( task->unicode ) ) {
        } else if ( utf8_other_number( task->unicode ) ) {
        }
    }

    return NULL;
}
/*
 * Get the next latin word from the current position.
 *
 * Starting with the word already held in task->buffer, following words are
 * appended while they are half-width or full-width letters/digits. Then:
 *  1. trailing english punctuation (except '%') is removed from the result.
 *  2. if the latin run was stopped by a CJK word, the latin part plus up to
 *     friso->mix_len CJK words is looked up in the __LEX_MIX_WORDS__ dictionary.
 *  3. if no mixed word matched and the latin part is numeric, one more word is
 *     read and appended when it is found in the __LEX_CJK_UNITS__ dictionary.
 *
 * Side effects: task->idx, task->bytes, task->buffer and task->unicode are
 * updated, task->hits->word and task->hits->type are set.
 *
 * @param friso     supplies the dictionary (dic) and mix_len.
 * @param task      the segmentation task, task->buffer holds the first word.
 * @return task->hits. Its type is __FRISO_SYS_WORDS__ with the word of the
 *         dictionary entry when a mixed word matched, __FRISO_NEW_WORDS__
 *         with the result of string_buffer_devote( sb ) otherwise.
 */
__STATIC_API__ friso_hits_t next_basic_latin( friso_t friso, friso_task_t task ) {

    //__convert: set to 1 once a mixed word is matched.
    //t: first a flag (1 = the latin run was stopped by a word that is not a
    //   letter/digit), later reused as the counter of the mixed word loop.
    char __convert = 0, t = 0;
    string_buffer_t sb, temp;
    lex_entry_t e = NULL;

    //full-half width and upper-lower case exchange.
    //NOTE(review): ___LATAIN_FULL_UPPER_CHECK___ is a macro defined outside
    //of this block, what it reads or writes can not be verified from here.
    task->unicode = get_utf8_unicode( task->buffer );
    ___LATAIN_FULL_UPPER_CHECK___

    //create a new string buffer that starts with the task->buffer.
    sb = new_string_buffer_with_string( task->buffer );

    //segmentation: extend the latin word as far as possible.
    while ( ( task->bytes = read_next_word( task, &task->idx, task->buffer ) ) != 0 ) {
        task->unicode = get_utf8_unicode( task->buffer );

        //a whitespace ends the word, it stays consumed.
        if ( utf8_whitespace( task->unicode ) ) break;

        //an english punctuation ends the word unless it is one to keep,
        //it stays consumed as well.
        if ( utf8_en_punctuation( task->unicode ) && ! utf8_keep_punctuation( task->buffer ) ) break;

        //any other word that is not a letter/digit ends the word too:
        //give it back to the input and remember that for the mixed word check.
        if ( ! ( utf8_halfwidth_letter_digit( task->unicode ) || utf8_fullwidth_letter_digit( task->unicode ) ) ) {
            task->idx -= task->bytes;
            t = 1;
            break;
        }

        //full-half width and upper-lower case convert
        ___LATAIN_FULL_UPPER_CHECK___

        //append the word to the buffer.
        string_buffer_append( sb, task->buffer );
    }

    /*
     * clear the useless english punctuation from the end of the buffer.
     * a trailing '%' is kept.
     */
    for ( ; sb->length > 0 && sb->buffer[ sb->length - 1 ] != '%' && is_en_punctuation( sb->buffer[ sb->length - 1 ] ); ) {
        sb->buffer[ --sb->length ] = '\0';
    }

    /*
     * find the chinese and english mixed word,
     * or a single chinese unit.
     */
    if ( t == 1 ) {
        //task->unicode still belongs to the word that stopped the latin run.
        if ( utf8_cjk_string( task->unicode ) ) {

            //temp string buffer: the latin part plus the CJK words read below.
            temp = new_string_buffer_with_string( sb->buffer );
            for ( t = 0; t < friso->mix_len && ( task->bytes = read_next_word( task , &task->idx, task->buffer ) ) != 0; t++ ) {
                task->unicode = get_utf8_unicode( task->buffer );

                //give a non-CJK word back to the input and stop.
                if ( ! utf8_cjk_string( task->unicode ) ) {
                    task->idx -= task->bytes;
                    break;
                }
                string_buffer_append( temp, task->buffer );

                //check the mixed word dictionary.
                //the loop goes on after a match, so the last match is the one kept.
                if ( friso_dic_match( friso->dic, __LEX_MIX_WORDS__, temp->buffer ) ) {
                    __convert = 1;
                    //get the lexicon entry from the dictionary.
                    e = friso_dic_get( friso->dic, __LEX_MIX_WORDS__, temp->buffer );
                }
            }

            //correct the segmentation offset:
            //go back by what was appended to temp beyond the matched entry,
            //or beyond the latin part when nothing matched.
            task->idx -= ( temp->length - ( e == NULL ? sb->length : e->length ) );
            free_string_buffer( temp );

            //no match for mix word, try to find a single chinese unit.
            if ( __convert == 0 ) {
                //check if it is a string made up with numeric
                if ( utf8_numeric_string( sb->buffer ) && ( task->bytes = read_next_word( task, &task->idx, task->buffer ) ) != 0 ) {
                    //check the single chinese units dictionary.
                    if ( friso_dic_match( friso->dic, __LEX_CJK_UNITS__, task->buffer ) ) {
                        string_buffer_append( sb, task->buffer );
                    } else {
                        //not a unit: give the word back to the input.
                        task->idx -= task->bytes;
                    }
                }
            }    //end convert condition
        }
    }

    if ( __convert == 1 ) {
        //the word comes from the dictionary entry, sb is not needed anymore.
        free_string_buffer( sb );
        task->hits->word = e->word;
        task->hits->type = __FRISO_SYS_WORDS__;
    } else {
        /*
         * adjust the string buffer.
         * here we do not trim the buffer (string_buffer_trim) cause its
         * allocations will be free after the call of friso_next - sooner or
         * later it will be released. if your memory almost run out, you
         * should call string_buffer_trim. or we save the time to do the
         * allocations and copy the buffer.
         * NOTE(review): who releases the word returned by
         * string_buffer_devote is not visible from here - confirm.
         */
        //string_buffer_trim( sb );
        task->hits->word = string_buffer_devote( sb );
        task->hits->type = __FRISO_NEW_WORDS__;
    }

    return task->hits;
}