Ejemplo n.º 1
0
/*
 * Report whether the word currently held by the task is a Chinese (CJK)
 * string under the given charset.
 *
 * For FRISO_UTF8 the decision is made on task->unicode; for FRISO_GBK it
 * is made on task->buffer. Any other charset yields 0.
 *
 * @param charset  charset the task text is encoded in.
 * @param task     segmentation task holding the current word.
 * @return int     result of the charset-specific check, or 0 (false) for
 *                 an unrecognised charset.
 */
FRISO_API int friso_cn_string( 
	friso_charset_t charset, 
	friso_task_t task )
{
    if ( charset == FRISO_UTF8 ) {
        return utf8_cjk_string( task->unicode );
    }

    if ( charset == FRISO_GBK ) {
        return gbk_cn_string( task->buffer );
    }

    /* charset not handled here */
    return 0;
}
Ejemplo n.º 2
0
/*
 * Collect every dictionary word that begins with the word currently held
 *	in task->buffer, reading further words from byte offset idx.
 *
 * The entry for task->buffer itself is always the first item. After that,
 *	up to friso->max_len - 1 more words are read and appended one by one;
 *	each time the accumulated string is found in the CJK dictionary its
 *	entry is added. Reading stops at end of input, at whitespace, or at
 *	the first non-CJK word.
 *
 * idx is received by value, so the caller's position is not moved, but
 *	task->buffer, task->bytes and task->unicode are overwritten.
 *
 * @return friso_array_t holding all the matches. The items are the
 *	entries returned by friso_dic_get (addresses stored in the dictionary,
 *	not copies), so only the array itself needs to be released.
 */
__STATIC_API__ friso_array_t get_next_match( friso_t friso, friso_task_t task, uint_t idx ) {

	uint_t count = 1;
	string_buffer_t prefix = new_string_buffer_with_string( task->buffer );

	/* dynamic array for the matches, seeded with the starting word. */
	friso_array_t found = new_array_list_with_opacity( friso->max_len );
	array_list_add( found, friso_dic_get( friso->dic, __LEX_CJK_WORDS__, task->buffer ) );

	while ( count < friso->max_len ) {
		task->bytes = read_next_word( task, &idx, task->buffer );
		if ( task->bytes == 0 ) {
			break;
		}

		task->unicode = get_utf8_unicode( task->buffer );
		if ( utf8_whitespace( task->unicode ) || ! utf8_cjk_string( task->unicode ) ) {
			break;
		}

		/* extend the accumulated string with the word just read. */
		string_buffer_append( prefix, task->buffer );

		/*
		 * record the dictionary's own entry rather than a new one:
		 *	nothing extra to allocate or free for the items.
		 */
		if ( friso_dic_match( friso->dic, __LEX_CJK_WORDS__, prefix->buffer ) ) {
			array_list_add( found, friso_dic_get( friso->dic, __LEX_CJK_WORDS__, prefix->buffer ) );
		}

		count++;
	}

	/* release the accumulation buffer. */
	free_string_buffer( prefix );

	return found;
}
Ejemplo n.º 3
0
/*
 * Get the next CJK word from the current position, simple mode.
 *
 * Starting from the word in task->buffer, up to friso->max_len - 1 more
 *	words are read and appended; the last accumulated string found in the
 *	CJK dictionary wins. task->idx is then advanced by the difference
 *	between the winning entry's length and the starting entry's length.
 *
 * @return task->hits with word set to the winning entry's word and type
 *	set to __FRISO_SYS_WORDS__.
 */
__STATIC_API__ friso_hits_t next_simple_cjk( friso_t friso, friso_task_t task ) {

	uint_t count = 1, pos = task->idx, first_len;
	string_buffer_t prefix = new_string_buffer_with_string( task->buffer );
	lex_entry_t best = friso_dic_get( friso->dic, __LEX_CJK_WORDS__, prefix->buffer );

	/* remember the starting entry's length; needed to fix task->idx below. */
	first_len = best->length;

	while ( count < friso->max_len ) {
		task->bytes = read_next_word( task, &pos, task->buffer );
		if ( task->bytes == 0 ) {
			break;
		}

		task->unicode = get_utf8_unicode( task->buffer );
		if ( utf8_whitespace( task->unicode ) || ! utf8_cjk_string( task->unicode ) ) {
			break;
		}

		string_buffer_append( prefix, task->buffer );

		/* keep the entry of the latest accumulated string the dictionary knows. */
		if ( friso_dic_match( friso->dic, __LEX_CJK_WORDS__, prefix->buffer ) ) {
			best = friso_dic_get( friso->dic, __LEX_CJK_WORDS__, prefix->buffer );
		}

		count++;
	}

	/* move the segment offset past the extra bytes of the chosen word. */
	task->idx += ( best->length - first_len );
	free_string_buffer( prefix );

	/* fill in the hit. */
	task->hits->word = best->word;
	task->hits->type = __FRISO_SYS_WORDS__;

	return task->hits;
}
Ejemplo n.º 4
0
/*
 * Get the next segmentation result.
 *	This is the friso interface function.
 *
 * Words waiting in task->poll are handed out first. Otherwise the text is
 *	scanned from task->idx, whitespace is skipped, and the first word that
 *	starts a segment is dispatched to the matching routine.
 *
 * @param friso  segmenter instance (its dictionary is consulted here).
 * @param _mode  __FRISO_COMPLEX_MODE__ selects next_complex_cjk for CJK
 *               words; any other value selects next_simple_cjk.
 * @param task   segmentation task; idx, bytes, unicode, buffer and hits
 *               are updated.
 * @return task->hits describing the word found, or NULL when the scan
 *	ends without producing one.
 */
__EXTERN_API__ friso_hits_t friso_next( friso_t friso, friso_mode_t _mode, friso_task_t task ) {

	lex_entry_t queued = NULL;

	/*
	 * serve the word poll first if it is not empty.
	 * 	synonyms and newly created words are often stored there.
	 */
	if ( ! link_list_empty( task->poll ) ) {
		queued = ( lex_entry_t ) link_list_remove_first( task->poll );

		task->hits->word = queued->word;
		task->hits->offset = task->idx;

		if ( queued->type == __LEX_OTHER_WORDS__ ) {
			task->hits->type = __FRISO_NEW_WORDS__;
			/* entries of type "other words" are released once handed out. */
			free_lex_entry( queued );
		} else {
			task->hits->type = __FRISO_SYS_WORDS__;
		}

		return task->hits;
	}

	while ( task->idx < task->length ) {
		/* read the next word from the current position. */
		task->bytes = read_next_word( task, &task->idx, task->buffer );
		if ( task->bytes == 0 ) {
			break;
		}

		task->unicode = get_utf8_unicode( task->buffer );
		if ( utf8_whitespace( task->unicode ) ) {
			continue;
		}

		/* task->idx is already past the word, so step back to its start. */
		task->hits->offset = task->idx - task->bytes;

		/* a CJK word known to the dictionary starts a CJK segment. */
		if ( utf8_cjk_string( task->unicode ) 
				&& friso_dic_match( friso->dic, __LEX_CJK_WORDS__, task->buffer ) ) {
			if ( _mode != __FRISO_COMPLEX_MODE__ ) {
				return next_simple_cjk( friso, task );
			}
			return next_complex_cjk( friso, task );
		}

		/* half-width or full-width letter/digit: basic latin segment. */
		if ( utf8_halfwidth_letter_digit( task->unicode ) 
				|| utf8_fullwidth_letter_digit( task->unicode ) ) {
			if ( ! utf8_en_punctuation( task->unicode ) ) {
				return next_basic_latin( friso, task );
			}
			continue;
		}

		/* these two classes are recognised but produce nothing here. */
		if ( utf8_letter_number( task->unicode ) ) {
			continue;
		}
		if ( utf8_other_number( task->unicode ) ) {
			continue;
		}
	}

	return NULL;
}
Ejemplo n.º 5
0
/*
 * get the next cjk word from the current position, with complex mode.
 *	this is the core of the mmseg chinese word segmentation algorithm.
 *	we use four rules to filter the matched chunks and get the best one
 *		as the final result.
 *
 * Up to three layers of matches are collected with get_next_match: every
 *	first-layer word, every word that can follow it, and every word that
 *	can follow that one. Each combination becomes a chunk of 1 to 3 words;
 *	the first word of the winning chunk is the result.
 *
 * On entry task->bytes is subtracted from task->idx to get back to the
 *	start of the first word, so it must hold that word's byte length.
 *
 * @param friso  segmenter instance (dictionary and max_len are used).
 * @param task   segmentation task; idx, bytes, buffer and hits are updated.
 * @return task->hits with word set to the chosen entry's word and type set
 *	to __FRISO_SYS_WORDS__.
 *
 * @see mmseg_core_invoke( chunks );
 */
__STATIC_API__ friso_hits_t next_complex_cjk( friso_t friso, friso_task_t task ) {

	register uint_t x, y, z;
	/*backup the task->bytes here: get_next_match overwrites it.*/
	uint_t __idx__ = task->bytes;
	lex_entry_t fe, se, te;
	friso_chunk_t e;
	friso_array_t words, chunks;
	friso_array_t smatch, tmatch, fmatch = get_next_match( friso, task, task->idx );

	/*
	 * here:
	 *		if the length of the fmatch is 1, we don't have to
	 *	continue the following work. ( no matter what we get the same result. )
	 */
	if ( fmatch->length == 1 ) {
		task->hits->word =  ( ( lex_entry_t ) fmatch->items[0] )->word;
		task->hits->type = __FRISO_SYS_WORDS__;
		free_array_list( fmatch );
		
		return task->hits;
	}

	chunks = new_array_list();
	/*step back to the start of the first word; from here on __idx__
		is reused as the look-ahead read position.*/
	task->idx -= __idx__;
	

	for ( x = 0; x < fmatch->length; x++ ) 
	{
		/*get the word and try the second layer match*/
		fe = ( lex_entry_t ) array_list_get( fmatch, x );
		__idx__ = task->idx + fe->length;
		/*
		 * store the result of the read: the test below relies on
		 *	task->bytes, which must be 0 when nothing could be read
		 *	(otherwise a stale value and a stale task->buffer left by
		 *	get_next_match could be taken for a fresh word).
		 */
		task->bytes = read_next_word( task, &__idx__, task->buffer );

		if ( task->bytes != 0 
				&& utf8_cjk_string( get_utf8_unicode( task->buffer ) ) 
				&& friso_dic_match( friso->dic, __LEX_CJK_WORDS__, task->buffer ) ) {

			//get the next matches
			smatch = get_next_match( friso, task, __idx__ );
			for ( y = 0; y < smatch->length; y++ ) 
			{
				/*get the word and try the third layer match*/
				se = ( lex_entry_t ) array_list_get( smatch, y );
				__idx__ = task->idx + fe->length + se->length;
				/*same as above: keep task->bytes in sync with the read.*/
				task->bytes = read_next_word( task, &__idx__, task->buffer );

				if ( task->bytes != 0 
						&& utf8_cjk_string( get_utf8_unicode( task->buffer ) )
						&& friso_dic_match( friso->dic, __LEX_CJK_WORDS__, task->buffer ) ) {

					//get the matches.
					tmatch = get_next_match( friso, task, __idx__ );
					for ( z = 0; z < tmatch->length; z++ ) 
					{
						/*three words chunk.*/
						te = ( lex_entry_t ) array_list_get( tmatch, z );
						words = new_array_list_with_opacity(3);
						array_list_add( words, fe );
						array_list_add( words, se );
						array_list_add( words, te );
						array_list_add( chunks, 
								new_chunk( words, fe->length + se->length + te->length ) );
					}
					free_array_list( tmatch );
				} else {
					/*no third layer: two words chunk.*/
					words = new_array_list_with_opacity(2);
					array_list_add( words, fe );
					array_list_add( words, se );
					//add the chunk
					array_list_add( chunks,
							new_chunk( words, fe->length + se->length ) );
				}
			}
			free_array_list( smatch );
		} else {
			/*no second layer: one word chunk.*/
			words = new_array_list_with_opacity(1);
			array_list_add( words, fe );
			array_list_add( chunks, new_chunk( words, fe->length ) );
		}
	}
	free_array_list( fmatch );

	/*
	 * filter the chunks with the four rules of the mmseg algorithm
	 *		and get best chunk as the final result.
	 * @see mmseg_core_invoke( chunks );
	 * @date 2012-12-13
	 *
	 * NOTE(review): the chunks array itself is not released in this
	 *	function; presumably mmseg_core_invoke takes care of it, but the
	 *	single-chunk branch does not go through it - confirm.
	 */
	if ( chunks->length > 1 ) {
		e = mmseg_core_invoke( chunks );
	} else {
		e = ( friso_chunk_t ) chunks->items[0];
	}
	fe = ( lex_entry_t ) e->words->items[0];
	task->hits->word = fe->word;
	task->hits->type = __FRISO_SYS_WORDS__;
	task->idx += fe->length;						//reset the idx of the task.
	/*NOTE(review): e->words is an array list but is handed to free_chunk;
		confirm that free_chunk is the right release routine for it.*/
	free_chunk( e->words );
	free_chunk( e );
	
	return task->hits;
}
Ejemplo n.º 6
0
/*
 * get the next latin word from the current position.
 *
 * Starts with the word already held in task->buffer, then keeps reading
 *	and appending words until the input ends, or a whitespace, a
 *	non-kept english punctuation, or a word that is neither a half-width
 *	nor a full-width letter/digit is met. In the last case that word is
 *	pushed back (task->idx is rewound) and, if it is CJK, the mixed word
 *	dictionary is tried; failing that, a numeric buffer may take one
 *	following word found in the single chinese units dictionary.
 *
 * @param friso  segmenter instance (dictionary and mix_len are used).
 * @param task   segmentation task; idx, bytes, unicode, buffer and hits
 *               are updated.
 * @return task->hits: for a mixed word hit, word is the dictionary entry's
 *	word and type is __FRISO_SYS_WORDS__; otherwise word is the string
 *	handed over by string_buffer_devote and type is __FRISO_NEW_WORDS__.
 */
__STATIC_API__ friso_hits_t next_basic_latin( friso_t friso, friso_task_t task ) {

	/* __convert: set to 1 once a mixed word is found in the dictionary.
	 * t: first a flag (1 = the read loop stopped on a pushed-back word),
	 *	later reused as the counter of the mixed word loop. */
	char __convert = 0, t = 0;
	string_buffer_t sb, temp;
	lex_entry_t e = NULL;

	//full-half width and upper-lower case exchange.
	//NOTE(review): the macro is defined outside this view; presumably it
	//	normalizes task->buffer using the locals of this function - confirm.
	task->unicode = get_utf8_unicode( task->buffer );
	___LATAIN_FULL_UPPER_CHECK___

	//create a new string buffer and append the task->buffer inside.
	sb = new_string_buffer_with_string( task->buffer );


	//segmentation.
	while ( ( task->bytes = read_next_word( task, &task->idx, task->buffer ) ) != 0 ) {

		task->unicode = get_utf8_unicode( task->buffer );

		//whitespace and non-kept punctuation end the word and stay consumed.
		if ( utf8_whitespace( task->unicode ) ) break;
		if ( utf8_en_punctuation( task->unicode ) && ! utf8_keep_punctuation( task->buffer ) ) break; 
		//any other non letter/digit word is pushed back for later handling.
		if ( ! ( utf8_halfwidth_letter_digit( task->unicode ) || utf8_fullwidth_letter_digit( task->unicode ) ) ) {
			task->idx -= task->bytes;
			t = 1;
			break;
		}

		//full-half width and upper-lower case convert
		___LATAIN_FULL_UPPER_CHECK___

		//append the word to the buffer.
		string_buffer_append( sb, task->buffer );
	}

	/*clear the useless english punctuation from
		the end of the buffer ('%' is kept).*/
	for ( ; sb->length > 0 
			&& sb->buffer[ sb->length - 1 ] != '%' 
			&& is_en_punctuation( sb->buffer[ sb->length - 1 ] ); ) {
		sb->buffer[ --sb->length ] = '\0';
	}

	/*
	 * find the chinese and english mixed word,
	 * 		or single chinese units.
	 * only reached when the read loop stopped on a pushed-back word;
	 *	task->unicode still holds that word's code point.*/
	if ( t == 1 ) {
		if ( utf8_cjk_string( task->unicode ) ) {
			//temp string buffer: the latin part plus the cjk words read below.
			temp = new_string_buffer_with_string( sb->buffer );

			//read at most friso->mix_len cjk words.
			for ( t = 0; 
				t < friso->mix_len 
				&& ( task->bytes = read_next_word( task , &task->idx, task->buffer ) ) != 0; t++ ) {
				
				task->unicode = get_utf8_unicode( task->buffer );

				//push back the first non-cjk word and stop.
				if ( ! utf8_cjk_string( task->unicode ) ) {
					task->idx -= task->bytes;
					break;
				}

				string_buffer_append( temp, task->buffer );

				//check the mixed word dictionary.
				if ( friso_dic_match( friso->dic, __LEX_MIX_WORDS__, temp->buffer ) ) {
					__convert = 1;
					//get the lexicon entry from the dictionary.
					e = friso_dic_get( friso->dic, __LEX_MIX_WORDS__, temp->buffer );
				}
			}

			//correct the segmentation offset: give back the bytes appended
			//	to temp beyond the matched entry (or beyond sb when no match).
			task->idx -= ( temp->length - ( e == NULL ? sb->length : e->length ) );
			free_string_buffer( temp );

			//no match for mix word, try to find a single chinese unit.
			if ( __convert == 0 ) {
				//check if it is a string made up of numerics
				if ( utf8_numeric_string( sb->buffer ) 
						&& ( task->bytes = read_next_word( task, &task->idx, task->buffer ) ) != 0 ) {

					//check the single chinese units dictionary.
					if ( friso_dic_match( friso->dic, __LEX_CJK_UNITS__, task->buffer ) ) {
						string_buffer_append( sb, task->buffer );
					} else {
						//not a unit: push the word back.
						task->idx -= task->bytes;
					}
				}
			}	//end convert condition

		}
	}


	if ( __convert == 1 ) {
		//mixed word found: the result is the dictionary entry's word,
		//	so the local buffer is no longer needed.
		free_string_buffer( sb );
		task->hits->word = e->word;
		task->hits->type = __FRISO_SYS_WORDS__;
	} else {
		/*
		 * adjust the string buffer.
		 *		here we do not trim the buffer because its allocations will be
		 *	freed after the call of friso_next - sooner or later it will be released.
		 *	if your memory has almost run out, you should call string_buffer_trim.
		 *	otherwise we save the time of the allocation and the copy of the buffer.
		 */
		//string_buffer_trim( sb );
		task->hits->word = string_buffer_devote( sb );
		task->hits->type = __FRISO_NEW_WORDS__;
	}

	return task->hits;
}