Exemplo n.º 1
0
/*
 * Segment `text` against a fixed index, optionally skipping atoms that do
 * not continue the current match.
 *
 * The text is consumed one atom at a time; the byte length of an atom is
 * whatever get_mblen() returns for its first byte. For each position:
 *   1. The atom is looked up among the children of the current node.
 *   2. On a miss, up to `max_seek` following atoms are probed against the
 *      SAME node. The first one that matches becomes the current atom and
 *      the scan position jumps to it.
 *   3. If there is still no match, failure links are followed until a node
 *      with a matching child is found or the root is reached (in which case
 *      the root itself becomes the current node).
 *   4. The `output` and `extra_outputs` of the resulting node are passed to
 *      add_to_result().
 *
 * Parameters:
 *   acseg_index - index to search; its state must be AC_INDEX_FIXED.
 *   text        - input bytes (`data`) and their length (`len`).
 *   max_seek    - maximum number of look-ahead atoms probed per position.
 *
 * Returns the object created by acseg_result_init(). It is returned with
 * nothing added when the index state is not AC_INDEX_FIXED.
 */
acseg_result_t *
acseg_full_seg(acseg_index_t *acseg_index, acseg_str_t *text,int max_seek)
{
	int j, current_pos, seeks;
	acseg_str_t atom, atom2, tmp_atom;
	acseg_result_t *seg_result;
	acseg_index_item_t *index_item, *s_index_item, *tmp_s_index_item;

	seg_result = acseg_result_init();
	if (acseg_index->state != AC_INDEX_FIXED) {
		return seg_result;
	}

	current_pos = j = 0;
	index_item = acseg_index->root;
	while (j < text->len) {
		seeks = 0;
		atom.data = &(text->data[j]);
		atom.len = get_mblen( ((u_char) atom.data[0]) );
		tmp_atom = atom;
		tmp_s_index_item = s_index_item = find_child_index_item(index_item, &atom);

		/*
		 * Look-ahead: probe the atoms after the current one against the
		 * current node. The bound is on the look-ahead offset (not on
		 * current_pos alone) so the probe never reads text->data at or
		 * beyond index text->len.
		 */
		while (tmp_s_index_item == NULL &&
		       seeks < max_seek &&
		       current_pos + tmp_atom.len < text->len) {
			atom2.data = &(text->data[current_pos + tmp_atom.len]);
			atom2.len = get_mblen( ((u_char) atom2.data[0]) );
			/* NOTE(review): looks like a leftover debug trace - confirm before removing. */
			print_atom(&atom2);
			tmp_s_index_item = find_child_index_item(index_item, &atom2);
			seeks++;
			if (tmp_s_index_item != NULL) {
				/* Resume the main scan at the atom that matched. */
				current_pos = j = current_pos + tmp_atom.len;
				atom = atom2;
				s_index_item = tmp_s_index_item;
				break;
			}
			/* Still no match: step the probe window one atom forward. */
			current_pos = current_pos + tmp_atom.len;
			tmp_atom = atom2;
		}

		/* No direct or look-ahead match: fall back along failure links. */
		while (s_index_item == NULL) {
			if (index_item == acseg_index->root) {
				s_index_item = index_item;
				break;
			}
			index_item = index_item->failure;
			s_index_item = find_child_index_item(index_item, &atom);
		}

		index_item = s_index_item;

		add_to_result(seg_result, index_item->output);
		add_to_result(seg_result, index_item->extra_outputs);

		/* A failed look-ahead may have moved current_pos; re-sync it with j. */
		current_pos = j = j + atom.len;
	}

	return seg_result;
}
Exemplo n.º 2
0
/*
 * Segment `text` against a fixed index without any look-ahead.
 *
 * The text is read atom by atom (atom length = get_mblen() of the atom's
 * first byte). For every atom the current node is advanced to its matching
 * child; on a miss, failure links are followed until a match is found or
 * the root is reached, in which case the root becomes the current node.
 * The `output` and `extra_outputs` of the node reached are then handed to
 * add_to_result().
 *
 * Returns the object created by acseg_result_init(); nothing is added to it
 * when the index state is not AC_INDEX_FIXED.
 */
acseg_result_t *
acseg_full_seg3(acseg_index_t *acseg_index, acseg_str_t *text)
{
    acseg_result_t *result = acseg_result_init();
    acseg_index_item_t *node, *next;
    acseg_str_t atom;
    int pos;

    if (acseg_index->state != AC_INDEX_FIXED) {
        return result;
    }

    node = acseg_index->root;
    for (pos = 0; pos < text->len; pos += atom.len) {
        atom.data = &(text->data[pos]);
        atom.len = get_mblen( ((u_char) atom.data[0]) );

        /* Try the current node first, then retreat along failure links. */
        next = find_child_index_item(node, &atom);
        while (next == NULL && node != acseg_index->root) {
            node = node->failure;
            next = find_child_index_item(node, &atom);
        }

        /* A miss that survived the walk means we are at the root: stay there. */
        if (next == NULL) {
            next = node;
        }
        node = next;

        add_to_result(result, node->output);
        add_to_result(result, node->extra_outputs);
    }

    return result;
}
Exemplo n.º 3
0
/*
 * Build a word list for `text` using `dict_array`.
 *
 * Atoms (length = get_mblen() of the first byte) are fed one by one into an
 * atom iterator created with capacity argument MAX_WORD_SIZE. Whenever the
 * iterator's free_list is NULL after an add, fkw_do_seg() is run on it
 * (presumably this means the iterator is full - confirm against
 * atom_iterator_add). After the text is exhausted, the iterator is popped
 * and fkw_do_seg() is repeated while its worker_list still has at least two
 * entries.
 *
 * Returns the word list allocated here. The collector filled in by
 * mc_calloc() is stored in the list's `mc` field; the iterator's own
 * collector is destroyed before returning.
 */
fkw_word_list_t *
fkw_full_seg(fkw_str_t *text, fkw_dict_array_t *dict_array)
{
	fkw_atom_iter_t *iter;
	fkw_word_list_t *words;
	mc_collector_t *collector = NULL;
	fkw_str_t atom;
	u_char *cursor;

	iter = atom_iter_init(MAX_WORD_SIZE);

	/* The collector pointer starts out NULL; whatever mc_calloc stores in
	 * it is recorded on the list itself. */
	words = (fkw_word_list_t *) mc_calloc(&collector, sizeof(fkw_word_list_t));
	words->start = NULL;
	words->end   = NULL;
	words->num   = 0;
	words->mc    = collector;

	/* Feed every atom of the text into the iterator. */
	for (cursor = text->data;
	     (cursor - text->data) < text->len;
	     cursor += atom.len) {
		atom.data = cursor;
		atom.len  = get_mblen((u_char)cursor[0]);

		atom_iterator_add(iter, &atom);
		if (iter->free_list == NULL) {
			fkw_do_seg(dict_array, iter, words);
		}
	}

	/* Drain what is left: pop once, then segment and pop while at least
	 * two workers remain. */
	for (fkw_atom_iter_pop(iter);
	     iter->worker_list && iter->worker_list->next;
	     fkw_atom_iter_pop(iter)) {
		fkw_do_seg(dict_array, iter, words);
	}

	/* Release the atom iterator's memory. */
	mc_destory(iter->mc);
	return words;
}
Exemplo n.º 4
0
/*
 * Add `phrase` to an index that has not been fixed yet.
 *
 * The phrase is split into atoms (atom length = get_mblen() of the atom's
 * first byte; the tree key is ord_utf8_wch() of the atom). Starting at the
 * root's child tree, existing nodes are followed for as long as a child
 * with the atom's key exists; one new node is then created and inserted for
 * every remaining atom. Finally a copy of the phrase is appended to the
 * `output` list of the node that corresponds to the last atom.
 *
 * New nodes and the phrase copy are created with acseg_index->mc passed as
 * the collector.
 *
 * Returns acseg_index on success. Returns NULL when the index state is
 * AC_INDEX_FIXED, or when the phrase yields no atom at all (e.g. an empty
 * phrase), in which case the index is left untouched.
 */
acseg_index_t *
acseg_index_add(acseg_index_t *acseg_index, acseg_str_t *phrase)
{
	int i;

	acseg_rbtree_key_t rbtree_key;

	acseg_str_t atom;
	acseg_str_t * new_phrase;

	acseg_rbtree_t *childs_rbtree;
	acseg_rbtree_node_t * rbtree_sentinel, *s_node, *insert_node;

	acseg_index_item_t *index_item, *new_index_item;

	if (acseg_index->state == AC_INDEX_FIXED){
		return NULL;
	}

	childs_rbtree = acseg_index->root->childs_rbtree;
	rbtree_sentinel = childs_rbtree->sentinel;

	index_item = NULL;
	new_index_item = NULL;

	/* Phase 1: follow the part of the phrase that is already in the index. */
	i = 0;
	while (i < phrase->len) {
		atom.data = &(phrase->data[i]);
		atom.len = get_mblen((u_char) atom.data[0]);

		rbtree_key = ord_utf8_wch((char *) atom.data);
		s_node = acseg_rbtree_search(childs_rbtree, rbtree_key);
		if (s_node == NULL){
			break;
		}

		i = i + atom.len;
		index_item = (acseg_index_item_t *) s_node->data;
		childs_rbtree = index_item->childs_rbtree;
	}

	/* Phase 2: create a node for every atom that was not found. */
	while (i < phrase->len){
		atom.data = &(phrase->data[i]);
		atom.len = get_mblen((u_char) atom.data[0]);

		rbtree_key = ord_utf8_wch((char *) atom.data);
		new_index_item = create_index_item(&atom, rbtree_sentinel, &(acseg_index->mc));

		/* insert node */
		insert_node = create_rbtree_node(rbtree_key, new_index_item, &(acseg_index->mc));
		acseg_rbtree_insert(childs_rbtree, insert_node);

		index_item = new_index_item;
		childs_rbtree = new_index_item->childs_rbtree;
		i = i + atom.len;
	}

	/*
	 * Neither loop ran, so no node was reached: there is no output list to
	 * attach the phrase to. Bail out instead of dereferencing NULL below.
	 */
	if (index_item == NULL) {
		return NULL;
	}

	new_phrase = (acseg_str_t *) mc_calloc(&(acseg_index->mc), sizeof(acseg_str_t));

	acseg_copy_str_t(new_phrase, phrase, &(acseg_index->mc));

	acseg_list_add(index_item->output, new_phrase, &(acseg_index->mc));

	return acseg_index;
}
Exemplo n.º 5
0
/**
 * Convenience overload: checks whether the characters held by `s` qualify as
 * valid UTF-8. The value get_mblen() returns for the string's bytes and size
 * is forwarded, together with the byte pointer, to the pointer-based
 * check_mb() overload, whose result is returned unchanged.
 */
bool check_mb(const std::string& s)
{
    const char* const bytes = s.data();
    const int next = get_mblen(bytes, s.size());
    return check_mb(bytes, next);
}