Beispiel #1
0
/*
 * Check whether the given string is made up of numeric characters.
 *
 * The check is delegated to the charset-specific routine:
 * utf8_numeric_string() for FRISO_UTF8 and gbk_numeric_string()
 * for FRISO_GBK.
 *
 * @param charset  charset the buffer is encoded in.
 * @param buffer   string to inspect.
 * @return the value of the charset-specific checker, or 0 when the
 *         charset is neither FRISO_UTF8 nor FRISO_GBK.
 */
FRISO_API int friso_numeric_string( 
	friso_charset_t charset, 
	char *buffer )
{
    /* stays 0 for a charset that has no numeric checker. */
    int numeric = 0;

    if ( charset == FRISO_UTF8 ) {
	numeric = utf8_numeric_string( buffer );
    } else if ( charset == FRISO_GBK ) {
	numeric = gbk_numeric_string( buffer );
    }

    return numeric;
}
Beispiel #2
0
/*
 * Get the next basic-latin token starting at the current task position.
 *
 * The character already held in task->buffer seeds the token; further
 * characters are pulled with read_next_word() until whitespace, a
 * non-kept English punctuation mark, or a character that is neither a
 * half-width nor a full-width letter/digit is met. When the token was
 * stopped by a CJK character the function additionally tries to build a
 * mixed word (__LEX_MIX_WORDS__) or, for a numeric token, to attach a
 * single CJK unit (__LEX_CJK_UNITS__).
 *
 * @param friso  supplies the dictionary (friso->dic) and friso->mix_len,
 *               the upper bound on the mixed-word lookahead iterations.
 * @param task   segmentation state; idx, bytes, unicode, buffer and
 *               hits are read and updated here.
 * @return task->hits, with word/type set either to the matched
 *         dictionary entry (__FRISO_SYS_WORDS__) or to the collected
 *         string buffer content (__FRISO_NEW_WORDS__).
 */
__STATIC_API__ friso_hits_t next_basic_latin( friso_t friso, friso_task_t task ) {

	/*
	 * __convert: set to 1 below once a mixed word has been matched.
	 * t: first a flag (1 = the scan stopped on a non-latin character),
	 *	later reused as the counter of the mixed-word lookahead loop.
	 * NOTE(review): ___LATAIN_FULL_UPPER_CHECK___ is defined elsewhere and
	 *	may reference these locals by name - confirm before renaming.
	 */
	char __convert = 0, t = 0;
	string_buffer_t sb, temp;
	lex_entry_t e = NULL;

	//full-half width and upper-lower case conversion of the first character.
	//(done by the macro, which is not visible here - presumably it rewrites
	// task->buffer in place; TODO confirm.)
	task->unicode = get_utf8_unicode( task->buffer );
	___LATAIN_FULL_UPPER_CHECK___

	//create a new string buffer seeded with the content of task->buffer.
	sb = new_string_buffer_with_string( task->buffer );


	//segmentation: read one word at a time until read_next_word returns 0.
	while ( ( task->bytes = read_next_word( task, &task->idx, task->buffer ) ) != 0 ) {

		task->unicode = get_utf8_unicode( task->buffer );

		//stop on whitespace, or on english punctuation that is not kept.
		//(task->idx is not rewound here, so the stop character stays consumed.)
		if ( utf8_whitespace( task->unicode ) ) break;
		if ( utf8_en_punctuation( task->unicode ) && ! utf8_keep_punctuation( task->buffer ) ) break; 
		//neither a half-width nor a full-width letter/digit:
		//give the character back (rewind idx) and flag it with t = 1.
		if ( ! ( utf8_halfwidth_letter_digit( task->unicode ) || utf8_fullwidth_letter_digit( task->unicode ) ) ) {
			task->idx -= task->bytes;
			t = 1;
			break;
		}

		//full-half width and upper-lower case conversion.
		___LATAIN_FULL_UPPER_CHECK___

		//append the word to the string buffer.
		string_buffer_append( sb, task->buffer );
	}

	/*strip trailing english punctuation from the end of the
		buffer; a trailing '%' is left in place.*/
	for ( ; sb->length > 0 
			&& sb->buffer[ sb->length - 1 ] != '%' 
			&& is_en_punctuation( sb->buffer[ sb->length - 1 ] ); ) {
		sb->buffer[ --sb->length ] = '\0';
	}

	/*
	 * find a chinese-english mixed word,
	 * 		or a single chinese unit.
	 * only tried when the scan stopped on a non-latin character (t == 1)
	 * and that character - still in task->unicode - is CJK.*/
	if ( t == 1 ) {
		if ( utf8_cjk_string( task->unicode ) ) {
			//temp string buffer: starts as a copy of the latin part.
			temp = new_string_buffer_with_string( sb->buffer );

			//look ahead at most friso->mix_len words; t is the counter now.
			for ( t = 0; 
				t < friso->mix_len 
				&& ( task->bytes = read_next_word( task , &task->idx, task->buffer ) ) != 0; t++ ) {
				
				task->unicode = get_utf8_unicode( task->buffer );

				//a non-CJK character ends the lookahead; give it back.
				if ( ! utf8_cjk_string( task->unicode ) ) {
					task->idx -= task->bytes;
					break;
				}

				string_buffer_append( temp, task->buffer );

				//check the mixed word dictionary.
				//a later match overwrites e, so the last match found is kept.
				if ( friso_dic_match( friso->dic, __LEX_MIX_WORDS__, temp->buffer ) ) {
					__convert = 1;
					//get the lexicon entry from the dictionary.
					e = friso_dic_get( friso->dic, __LEX_MIX_WORDS__, temp->buffer );
				}
			}

			//correct the segmentation offset: rewind idx by temp->length
			//minus e->length on a match, or minus sb->length without one.
			//(presumably e->length is the byte length of the matched word - TODO confirm.)
			task->idx -= ( temp->length - ( e == NULL ? sb->length : e->length ) );
			free_string_buffer( temp );

			//no match for a mixed word, try to find a single chinese unit.
			if ( __convert == 0 ) {
				//only when the latin part is a numeric string
				//and one more word can be read.
				if ( utf8_numeric_string( sb->buffer ) 
						&& ( task->bytes = read_next_word( task, &task->idx, task->buffer ) ) != 0 ) {

					//check the single chinese units dictionary:
					//append the unit on a match, otherwise give the word back.
					if ( friso_dic_match( friso->dic, __LEX_CJK_UNITS__, task->buffer ) ) {
						string_buffer_append( sb, task->buffer );
					} else {
						task->idx -= task->bytes;
					}
				}
			}	//end convert condition

		}
	}


	if ( __convert == 1 ) {
		//mixed word matched: sb is no longer needed, the result is
		//the word of the dictionary entry.
		free_string_buffer( sb );
		task->hits->word = e->word;
		task->hits->type = __FRISO_SYS_WORDS__;
	} else {
		/*
		 * hand the string buffer content over as the result.
		 *		the buffer is deliberately not trimmed here: its allocation
		 *	will be freed after the call of friso_next anyway, so the extra
		 *	allocation and copy are skipped.
		 *	if memory is tight, call string_buffer_trim before devoting.
		 */
		//string_buffer_trim( sb );
		task->hits->word = string_buffer_devote( sb );
		task->hits->type = __FRISO_NEW_WORDS__;
	}

	return task->hits;
}