/**
 * Scan `text` through the index automaton and collect the outputs of every
 * state reached, allowing a bounded look-ahead over unmatched atoms.
 *
 * For each atom (one multi-byte character, sized by get_mblen() on its lead
 * byte) the current state's children are searched.  When the atom has no
 * child, up to `max_seek` following atoms are probed against the same state;
 * if one of them has a child, scanning jumps to that atom and the atoms in
 * between are skipped.  If none matches, the original atom is resolved by
 * following `failure` links until a child is found or the root is reached.
 *
 * @param acseg_index  index to match against; must be in AC_INDEX_FIXED state
 * @param text         text to scan
 * @param max_seek     maximum number of following atoms probed per position
 * @return the result object created by acseg_result_init(); it is returned
 *         with nothing added when the index is not in AC_INDEX_FIXED state
 */
acseg_result_t *
acseg_full_seg(acseg_index_t *acseg_index, acseg_str_t *text, int max_seek)
{
    int j, current_pos, seeks;
    acseg_str_t atom, atom2, tmp_atom;
    acseg_result_t *seg_result;
    acseg_index_item_t *index_item, *s_index_item, *tmp_s_index_item;

    seg_result = acseg_result_init();

    if (acseg_index->state != AC_INDEX_FIXED) {
        return seg_result;
    }

    current_pos = j = 0;
    index_item = acseg_index->root;

    while (j < text->len) {
        seeks = 0;

        atom.data = &(text->data[j]);
        atom.len = get_mblen((u_char) atom.data[0]);
        tmp_atom = atom;

        tmp_s_index_item = s_index_item =
            find_child_index_item(index_item, &atom);

        /*
         * Look-ahead: probe the atoms after the unmatched one.  The bound is
         * on the index that is actually dereferenced below
         * (current_pos + tmp_atom.len), so the lead byte of atom2 is never
         * read from beyond the end of the text.
         */
        while (tmp_s_index_item == NULL
               && seeks < max_seek
               && current_pos + tmp_atom.len < text->len)
        {
            atom2.data = &(text->data[current_pos + tmp_atom.len]);
            atom2.len = get_mblen((u_char) atom2.data[0]);

            /* NOTE(review): looks like leftover debug output; confirm
             * before removing. */
            print_atom(&atom2);

            tmp_s_index_item = find_child_index_item(index_item, &atom2);
            seeks++;

            if (tmp_s_index_item != NULL) {
                /* Resume scanning at the atom that matched. */
                current_pos = j = current_pos + tmp_atom.len;
                atom = atom2;
                s_index_item = tmp_s_index_item;
                break;
            }

            /* No match: slide the probe window one atom forward. */
            current_pos = current_pos + tmp_atom.len;
            tmp_atom = atom2;
        }

        /* Still unmatched: fall back along failure links, stopping at root. */
        while (s_index_item == NULL) {
            if (index_item == acseg_index->root) {
                s_index_item = index_item;
                break;
            }
            index_item = index_item->failure;
            s_index_item = find_child_index_item(index_item, &atom);
        }

        index_item = s_index_item;

        add_to_result(seg_result, index_item->output);
        add_to_result(seg_result, index_item->extra_outputs);

        /* Advance past the consumed atom and re-sync the probe position. */
        current_pos = j = j + atom.len;
    }

    return seg_result;
}
/**
 * Scan `text` through the index automaton and collect the outputs of every
 * state reached.
 *
 * The text is consumed one atom at a time (atom length comes from
 * get_mblen() on the lead byte).  For each atom the automaton moves to the
 * matching child of the current state, following `failure` links while no
 * child exists; when even the root has no matching child the state stays at
 * the root.
 *
 * @param acseg_index  index to match against; must be in AC_INDEX_FIXED state
 * @param text         text to scan
 * @return the result object created by acseg_result_init(); it is returned
 *         with nothing added when the index is not in AC_INDEX_FIXED state
 */
acseg_result_t *
acseg_full_seg3(acseg_index_t *acseg_index, acseg_str_t *text)
{
    int pos;
    acseg_str_t atom;
    acseg_result_t *result;
    acseg_index_item_t *state, *next;

    result = acseg_result_init();

    if (acseg_index->state == AC_INDEX_FIXED) {
        state = acseg_index->root;

        for (pos = 0; pos < text->len; pos = pos + atom.len) {
            atom.data = &(text->data[pos]);
            atom.len = get_mblen((u_char) atom.data[0]);

            /* Try the current state, then each failure state in turn. */
            next = find_child_index_item(state, &atom);
            while (next == NULL && state != acseg_index->root) {
                state = state->failure;
                next = find_child_index_item(state, &atom);
            }

            /* No transition anywhere: `state` is the root and stays there. */
            if (next != NULL) {
                state = next;
            }

            add_to_result(result, state->output);
            add_to_result(result, state->extra_outputs);
        }
    }

    return result;
}
/**
 * Segment `text` into a word list using the dictionaries in `dict_array`.
 *
 * The text is fed atom by atom (atom length from get_mblen() on the lead
 * byte) into an atom iterator created with MAX_WORD_SIZE.  fkw_do_seg() is
 * run each time the iterator's free_list is NULL after an add, and again
 * while draining the iterator once the text is exhausted.
 *
 * The word list is allocated from a new collector, which is stored in the
 * list's `mc` field; the iterator's own collector is destroyed before
 * returning.
 *
 * @param text        text to segment
 * @param dict_array  dictionaries passed through to fkw_do_seg()
 * @return the newly allocated word list
 */
fkw_word_list_t *
fkw_full_seg(fkw_str_t *text, fkw_dict_array_t *dict_array)
{
    fkw_atom_iter_t *iter;
    mc_collector_t *collector;
    fkw_word_list_t *words;
    fkw_str_t atom;
    u_char *cursor;

    iter = atom_iter_init(MAX_WORD_SIZE);

    /* The word list lives in its own collector, handed back via words->mc. */
    collector = NULL;
    words = (fkw_word_list_t *) mc_calloc(&collector, sizeof(fkw_word_list_t));
    words->start = NULL;
    words->end = NULL;
    words->num = 0;
    words->mc = collector;

    /* Feed phase: push every atom of the text into the iterator. */
    for (cursor = text->data;
         (cursor - text->data) < text->len;
         cursor = cursor + atom.len)
    {
        atom.data = cursor;
        atom.len = get_mblen((u_char) cursor[0]);

        atom_iterator_add(iter, &atom);

        if (iter->free_list == NULL) {
            fkw_do_seg(dict_array, iter, words);
        }
    }

    /* Drain phase: pop, then segment for as long as more than one worker
     * entry remains. */
    for (fkw_atom_iter_pop(iter);
         iter->worker_list && iter->worker_list->next;
         fkw_atom_iter_pop(iter))
    {
        fkw_do_seg(dict_array, iter, words);
    }

    /* Release the atom iterator's memory. */
    mc_destory(iter->mc);

    return words;
}
/**
 * Insert `phrase` into the index trie.
 *
 * The phrase is walked atom by atom (atom length from get_mblen() on the
 * lead byte, tree key from ord_utf8_wch()).  Existing nodes are reused for
 * the longest prefix already present; new index items are created and
 * inserted for the remaining atoms.  A copy of the phrase is then appended
 * to the output list of the item for the final atom.  All allocations go
 * through the index's collector (acseg_index->mc).
 *
 * @param acseg_index  index to extend; must not be in AC_INDEX_FIXED state
 * @param phrase       phrase to add
 * @return `acseg_index` on success; NULL when the index is already fixed or
 *         when the phrase contains no atoms (nothing is added)
 */
acseg_index_t *
acseg_index_add(acseg_index_t *acseg_index, acseg_str_t *phrase)
{
    int i;
    acseg_rbtree_key_t rbtree_key;
    acseg_str_t atom;
    acseg_str_t *new_phrase;
    acseg_rbtree_t *childs_rbtree;
    acseg_rbtree_node_t *rbtree_sentinel, *s_node, *insert_node;
    acseg_index_item_t *index_item, *new_index_item;

    if (acseg_index->state == AC_INDEX_FIXED) {
        return NULL;
    }

    childs_rbtree = acseg_index->root->childs_rbtree;
    rbtree_sentinel = childs_rbtree->sentinel;

    index_item = NULL;
    i = 0;

    /* Phase 1: follow the prefix of the phrase that is already indexed. */
    while (i < phrase->len) {
        atom.data = &(phrase->data[i]);
        /* Lead byte is passed as unsigned, as the scanning functions do. */
        atom.len = get_mblen((unsigned char) atom.data[0]);
        rbtree_key = ord_utf8_wch((char *) atom.data);

        s_node = acseg_rbtree_search(childs_rbtree, rbtree_key);
        if (s_node == NULL) {
            break;
        }

        i = i + atom.len;
        index_item = (acseg_index_item_t *) s_node->data;
        childs_rbtree = index_item->childs_rbtree;
    }

    /* Phase 2: create one new item per remaining atom. */
    while (i < phrase->len) {
        atom.data = &(phrase->data[i]);
        atom.len = get_mblen((unsigned char) atom.data[0]);
        rbtree_key = ord_utf8_wch((char *) atom.data);

        new_index_item = create_index_item(&atom, rbtree_sentinel,
                                           &(acseg_index->mc));

        insert_node = create_rbtree_node(rbtree_key, new_index_item,
                                         &(acseg_index->mc));
        acseg_rbtree_insert(childs_rbtree, insert_node);

        index_item = new_index_item;
        childs_rbtree = new_index_item->childs_rbtree;
        i = i + atom.len;
    }

    /*
     * An empty phrase visits no item, so there is no output list to attach
     * to.  Bail out before allocating instead of dereferencing NULL.
     */
    if (index_item == NULL) {
        return NULL;
    }

    new_phrase = (acseg_str_t *) mc_calloc(&(acseg_index->mc),
                                           sizeof(acseg_str_t));
    acseg_copy_str_t(new_phrase, phrase, &(acseg_index->mc));
    acseg_list_add(index_item->output, new_phrase, &(acseg_index->mc));

    return acseg_index;
}
/**
 * Helper to check if the given characters qualify as valid UTF-8 encoded.
 *
 * std::string convenience overload: obtains a length from get_mblen() over
 * the string's bytes and forwards the buffer together with that length to
 * the pointer-based check_mb overload.
 */
bool check_mb(const std::string& s) {
  const char* bytes = s.data();
  int mb_len = get_mblen(bytes, s.size());
  return check_mb(bytes, mb_len);
}