/** * Reads the start and end positions of each token stored in the file * produced by Tokenize's --output_offsets option. */ vector_uima_offset* load_uima_offsets(const VersatileEncodingConfig* vec,const char* name) { U_FILE* f; f=u_fopen(vec,name,U_READ); if (f==NULL) { return NULL; } vector_int* v=new_vector_int(); Ustring* line=new_Ustring(); int a,b,c; while (EOF!=readline(line,f)) { u_sscanf(line->str,"%d%d%d",&a,&b,&c); vector_int_add(v,b); vector_int_add(v,c); } free_Ustring(line); u_fclose(f); return (vector_uima_offset*)v; }
/**
 * Looks up the token s and returns its number.
 *
 * A token seen for the first time is duplicated, appended to 'tokens' and
 * given the index returned by that insertion as its number; a zeroed
 * occurrence counter is appended to 'n_occur' at the same time. In every
 * case the occurrence counter of the token is then incremented.
 */
int get_token_number(unichar* s,vector_ptr* tokens,struct hash_table* hashtable,vector_int* n_occur) {
    int lookup_status;
    struct any* entry=get_value(hashtable,s,HT_INSERT_IF_NEEDED,&lookup_status);

    if (lookup_status==HT_KEY_ADDED) {
        /* Brand new token: its number is its position in the token array */
        int new_number=vector_ptr_add(tokens,u_strdup(s));
        entry->_int=new_number;
        vector_int_add(n_occur,0);
    }

    int token_number=entry->_int;
    /* One more occurrence of this token */
    n_occur->tab[token_number]+=1;
    return token_number;
}
int main() { vector_int a; vector_int_init(&a,3); printf("<<<init &a 3>>>\n"); print_vector_int(&a, stdout); assert(3 == vector_int_size(&a)); assert(3 == vector_int_capacity(&a)); assert(0 == vector_int_e(&a, 0)); vector_int_insert(&a, 2, 10); printf("<<<insert &a, 2, 10>>>\n"); assert(4 == vector_int_capacity(&a)); assert(4 == vector_int_size(&a)); assert(10 == vector_int_e(&a, 2)); assert(10 == *vector_int_e_ptr(&a, 2)); print_vector_int(&a, stdout); vector_int_push_back(&a, 60); printf("<<<push back &a, 60>>>\n"); assert(60 == vector_int_tail(&a)); assert(8 == vector_int_capacity(&a)); assert(5 == vector_int_size(&a)); assert(60 == vector_int_max(&a)); assert(4 == vector_int_which_max(&a)); assert(1 == vector_int_contains(&a, 60)); assert(0 == vector_int_contains(&a, 100)); print_vector_int(&a, stdout); vector_int_resize_min(&a); printf("<<<resize min &a>>>\n"); assert(60 == vector_int_tail(&a)); assert(5 == vector_int_capacity(&a)); assert(5 == vector_int_size(&a)); print_vector_int(&a, stdout); printf("<<<pop_back &a>>>\n"); assert(60 == vector_int_pop_back(&a)); assert(5 == vector_int_capacity(&a)); assert(4 == vector_int_size(&a)); print_vector_int(&a, stdout); vector_int_set(&a, 2, -12); printf("<<<set &a 2 12>>>\n"); assert(-12 == VECTOR(a)[2]); assert(5 == vector_int_capacity(&a)); assert(4 == vector_int_size(&a)); assert(-12 == vector_int_min(&a)); assert(2 == vector_int_which_min(&a)); print_vector_int(&a, stdout); vector_int_set(&a, 1, 12); int min, max; int which_min, which_max; vector_int_minmax(&a,&min, &max); vector_int_which_minmax(&a, &which_min, &which_max); assert(-12 == min); assert(12 == max); assert(2 == which_min); assert(1 == which_max); vector_int_reserve(&a, 10); printf("<<<reserve &a 10>>>\n"); assert(10 == vector_int_capacity(&a)); assert(4 == vector_int_size(&a)); print_vector_int(&a, stdout); vector_int_null(&a); printf("<<<null &a>>>\n"); assert(10 == vector_int_capacity(&a)); assert(4 == vector_int_size(&a)); assert(0 == 
VECTOR(a)[2]); print_vector_int(&a, stdout); vector_int_fill(&a, 15); printf("<<<fill &a 15>>>\n"); assert(10 == vector_int_capacity(&a)); assert(4 == vector_int_size(&a)); assert(15 == VECTOR(a)[2]); print_vector_int(&a, stdout); vector_int_clear(&a); printf("<<<clear &a>>>\n"); assert(10 == vector_int_capacity(&a)); assert(0 == vector_int_size(&a)); print_vector_int(&a, stdout); vector_int_destroy(&a); printf("<<<destroy &a>>>\n"); // assert(0 == vector_int_capacity(&a)); // assert(0 == vector_int_size(&a)); // print_vector_int(&a, stdout); int hehe[5] = {1,2,3,4,5}; vector_int_init_copy(&a, hehe, 5); printf("<<<init_copy &a {1,2,3,4,5}>>>\n"); assert(5 == vector_int_capacity(&a)); assert(5 == vector_int_size(&a)); assert(3 == VECTOR(a)[2]); print_vector_int(&a, stdout); int hehe2[5]; vector_int_copy_to(&a, hehe2); for (int i = 0; i < 5; i++) assert(hehe[i] == hehe2[i]); vector_int_add_constant(&a, 1); printf("<<<add_constant &a 1>>>\n"); assert(4 == VECTOR(a)[2]); print_vector_int(&a, stdout); vector_int a2; vector_int af; vector_int_init(&af,0); vector_int_init_value(&a2, 5, 5,4,3,2,1); vector_int_add(&a2, &a); assert(7 == VECTOR(a2)[2]); print_vector_int(&a2, stdout); vector_int_ele_freq_min_max(&af, &a2, &min, &max); print_vector_int(&af,stdout); vector_int_ele_freq_min_max(&af, &a2, &min, &max); print_vector_int(&af,stdout); vector_int_sub(&a2, &a); assert(3 == VECTOR(a2)[2]); print_vector_int(&a2, stdout); vector_int_cumsum(&a2, &a); assert(20 == VECTOR(a2)[4]); print_vector_int(&a2, stdout); int sum = vector_int_sum(&a); assert(sum == VECTOR(a2)[4]); vector_int b; printf("<<<copy &a &b>>>\n"); print_vector_int(&a, stdout); vector_int_copy(&b, &a); for (int i = 0; i < vector_int_size(&b); i++) assert(VECTOR(a)[i] == VECTOR(b)[i]); print_vector_int(&b, stdout); vector_int b2; vector_int_init_value(&b2, 5, 0,1,2,3,4,5); printf("<<<init_value &b2 5 0,1,2,3,4,5>>>\n"); assert(5 == vector_int_capacity(&b2)); assert(5 == vector_int_size(&b2)); assert(2 == 
VECTOR(b2)[2]); print_vector_int(&b2, stdout); vector_int_remove_section(&b2, 1, 3); printf("<<<remove_section &b2 1 3>>>\n"); assert(5 == vector_int_capacity(&b2)); assert(3 == vector_int_size(&b2)); assert(3 == VECTOR(b2)[1]); print_vector_int(&b2, stdout); vector_int_remove(&b2, 1); printf("<<<remove &b2, 1>>>\n"); assert(5 == vector_int_capacity(&b2)); assert(2 == vector_int_size(&b2)); assert(4 == VECTOR(b2)[1]); print_vector_int(&b2, stdout); vector_int c; vector_int_init_value_end(&c, 14, 1,2,14,4); printf("<<<init_value_end &c 14 1,2,14,4>>>\n"); assert(2 == vector_int_capacity(&c)); assert(2 == vector_int_size(&c)); assert(2 == VECTOR(c)[1]); print_vector_int(&c, stdout); vector_int v1,v2,res,res2; vector_int_init_value(&v1, 8, 4,5,2,3,7,1,6,1); vector_int_init_value(&v2, 8, 3,4,1,7,2,5,1,3); vector_int_init(&res, 1); vector_int_init(&res2, 1); print_vector_int(&res2, stdout); vector_int_order_inc2(&v1, &v2, &res, 8); vector_int_scan_tie(&res2, &v1, &res, 8); printf("<<<order&v1, &v2, &res, 7>>>\n"); print_vector_int(&v1, stdout); print_vector_int(&v2, stdout); print_vector_int(&res, stdout); print_vector_int(&res2, stdout); assert(7 == VECTOR(res)[0]); assert(5 == VECTOR(res)[1]); assert(2 == VECTOR(res)[2]); assert(3 == VECTOR(res)[3]); assert(0 == VECTOR(res)[4]); assert(1 == VECTOR(res)[5]); assert(6 == VECTOR(res)[6]); assert(4 == VECTOR(res)[7]); assert(0 == VECTOR(res2)[0]); assert(0 == VECTOR(res2)[1]); assert(2 == VECTOR(res2)[2]); assert(3 == VECTOR(res2)[3]); assert(4 == VECTOR(res2)[4]); assert(5 == VECTOR(res2)[5]); assert(6 == VECTOR(res2)[6]); assert(7 == VECTOR(res2)[7]); vector_double d; vector_double_init(&d, 3); print_vector_double(&d, stdout); vector_double e; vector_double_init_value(&e, 3, 1.0, 1.2, 1.3); print_vector_double(&e, stdout); vector_double f; vector_double_init_value_end(&f, 1.4, 1.0, 1.3, 1.4, 10.0); print_vector_double(&f, stdout); vector_test_t g; vector_test_t_init(&g, 3); test_t tmp_test_t = {1,3}; 
vector_test_t_set(&g, 1, tmp_test_t); print_vector_test_t(&g, stdout); assert(1 == vector_test_t_contains_op(&g, tmp_test_t, test_t_op)); vector_char aaa; vector_char_init(&aaa, 4); VECTOR(aaa)[0] = 'a'; VECTOR(aaa)[1] = 'a'; VECTOR(aaa)[2] = 'a'; VECTOR(aaa)[3] = '\0'; printf("%s\n",VECTOR(aaa)); vector_int haha; vector_int_init_value_end(&haha, -1, 3,4,7,1,6,5, -1); print_vector_int(&haha, stdout); vector_int order; vector_int_init(&order, 0); vector_int_order_inc(&haha, &order,8); print_vector_int(&order, stdout); vector_int_order_dec(&haha, &order,8); print_vector_int(&order, stdout); vector_int_destroy(&order); vector_int_destroy(&haha); return 0; }
/**
 * This function removes all non ambiguous outputs from the given match list.
 * A cell is kept only if are_ambiguous() holds between it and one of its
 * direct neighbours in the list; every other cell is freed and unlinked.
 * On return, *list points to the first kept cell, or is NULL if nothing
 * was kept.
 *
 * If renumber is non NULL, we have renumber[x]=y, where x is the position
 * of a match in the filtered list, and y its corresponding number in the
 * unfiltered original one. A NULL renumber is accepted: the list is then
 * filtered without recording anything.
 */
void filter_unambiguous_outputs(struct match_list* *list,vector_int* renumber) {
    if (*list==NULL) return;
    struct match_list* tmp;
    /* Last cell we decided to keep, NULL while no cell has been kept yet */
    struct match_list* previous=NULL;
    struct match_list* l=*list;
    /* Non zero when the current cell is known to be ambiguous with the
     * cell that precedes it */
    int previous_was_identical=0;
    /* Position of the current cell in the original, unfiltered list */
    int original_match_number=-1;
    while (l!=NULL) {
        original_match_number++;
        if (previous==NULL) {
            /* Case 1: we are at the beginning of the list */
            /* Case 1a: there is only one cell */
            if (l->next==NULL) {
                free_match_list(l);
                *list=NULL;
                return;
            }
            /* Case 1b: there is a next cell, but it's not ambiguous with the current one */
            if (!are_ambiguous(l,l->next)) {
                /* We have to delete the current cell. *list is fixed later,
                 * either by case 1a or by case 1c */
                tmp=l->next;
                free_match_list_element(l);
                l=tmp;
                continue;
            }
            /* Case 1c: the next cell is an ambiguous one, we can move on */
            /* Now we know the list head element */
            *list=l;
            previous=l;
            previous_was_identical=1;
            l=l->next;
            if (renumber!=NULL) {
                vector_int_add(renumber,original_match_number);
            }
            continue;
        }
        /* Case 2: there is a previous cell */
        if (previous_was_identical) {
            if (renumber!=NULL) {
                vector_int_add(renumber,original_match_number);
            }
            /* Case 2a: we know that we have to keep this current cell, but
             * we must check if the next is also an ambiguous one */
            if (l->next==NULL) {
                /* No next cell ? We're done then */
                return;
            }
            previous_was_identical=are_ambiguous(l,l->next);
            previous=l;
            l=l->next;
            continue;
        }
        /* Case 2b: previous cell is different, so we have to test the next one
         * to know whether we must keep the current one or not */
        if (l->next==NULL) {
            /* No next cell ? We have to delete the current one and then
             * we are done */
            free_match_list_element(l);
            previous->next=NULL;
            return;
        }
        previous_was_identical=are_ambiguous(l,l->next);
        if (previous_was_identical) {
            /* We have to keep the current cell */
            previous=l;
            l=l->next;
            if (renumber!=NULL) {
                vector_int_add(renumber,original_match_number);
            }
            continue;
        }
        /* Final case, the next cell is not ambiguous, so we have to delete
         * the current one */
        tmp=l;
        l=l->next;
        free_match_list_element(tmp);
        previous->next=l;
    }
}
/**
 * Explores the given dictionary to match the given word.
 *
 * The function walks the dictionary state found at 'offset' and recursively
 * follows its transitions against word[pos_word...]. Besides exact matches
 * (a transition letter equal to the word letter, or whose uppercase form is
 * the word letter), it tries the error operations enabled in 'cfg', as long
 * as the corresponding counters have not reached their maximum:
 *   - SP_SWAP:   rewrites the last recorded change into a swap;
 *   - SP_CHANGE: follows a transition whose letter differs from the word one;
 *   - SP_SUPPR:  follows a transition without consuming a word letter;
 *   - SP_INSERT: consumes a word letter without leaving the current state.
 * Each operation is recorded as a (position, kind) pair in cfg->pairs and is
 * undone after the recursive call, so cfg's counters and pairs are restored
 * before returning. 'output' is restored to its entry state on the paths
 * that go through the transition loop.
 *
 * offset:    position of the dictionary state to explore
 * word:      '\0'-terminated word being checked
 * pos_word:  index of the current letter in word
 * d:         dictionary being explored
 * cfg:       limits, counters and recorded error pairs
 * output:    output accumulated along the followed transitions
 * list:      forwarded to deal_with_matches when a match is found
 * base:      forwarded to deal_with_matches; reset to output->len after a
 *            final state (presumably the start of the current entry's
 *            output -- confirm against deal_with_matches)
 * inflected: letters of the dictionary path followed so far
 */
static void explore_dic(int offset,unichar* word,int pos_word,Dictionary* d,SpellCheckConfig* cfg, Ustring* output,SpellCheckHypothesis* *list,int base,Ustring* inflected) {
    /* Entry values, needed by the SP_SUPPR and SP_INSERT recursive calls */
    int original_offset=offset;
    int original_base=base;
    int final,n_transitions,inf_code;
    /* z = state of the output when we enter the function */
    int z=save_output(output);
    /* Number of recorded pairs on entry, used to undo our own additions */
    int size_pairs=cfg->pairs->nbelems;
    offset=read_dictionary_state(d,offset,&final,&n_transitions,&inf_code);
    if (final) {
        if (word[pos_word]=='\0') {
            /* If we have a match */
            deal_with_matches(d,inflected->str,inf_code,output,cfg,base,list);
        }
        base=output->len;
    }
    /* If we are at the end of the token, then we stop */
    if (word[pos_word]=='\0') {
        return;
    }
    /* Length of inflected on entry, used to drop what each attempt appends */
    unsigned int l2=inflected->len;
    unichar c;
    int dest_offset;
    for (int i=0;i<n_transitions;i++) {
        /* Every transition starts from the output as it was on entry */
        restore_output(z,output);
        offset=read_dictionary_transition(d,offset,&c,&dest_offset,output);
        /* For backup_output, see comment below */
        int backup_output=save_output(output);
        if (c==word[pos_word] || word[pos_word]==u_toupper(c)) {
            /* Regular match: same letter, or the word letter is the
             * uppercase form of the transition letter */
            u_strcat(inflected,c);
            explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
        } else {
            /* We deal with the SP_SWAP case, made of 2 SP_CHANGE_XXX */
            if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SWAP!=cfg->max_SP_SWAP
                && is_letter_swap(cfg,word,pos_word,inflected,c)) {
                /* We don't modify the number of errors since we override an existing
                 * SP_CHANGE_XXX one */
                cfg->current_SP_SWAP++;
                /* We override the previous change, keeping a copy of the
                 * last recorded pair so that it can be put back afterwards */
                int a=cfg->pairs->tab[cfg->pairs->nbelems-2];
                int b=cfg->pairs->tab[cfg->pairs->nbelems-1];
                cfg->pairs->tab[cfg->pairs->nbelems-2]=pos_word-1;
                cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_SWAP_DEFAULT;
                u_strcat(inflected,c);
                explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
                cfg->pairs->tab[cfg->pairs->nbelems-2]=a;
                cfg->pairs->tab[cfg->pairs->nbelems-1]=b;
                cfg->current_SP_SWAP--;
            } else /* We deal with the SP_CHANGE case */
            if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_CHANGE!=cfg->max_SP_CHANGE
                /* We want letters, not spaces or anything else */
                && is_letter(c,NULL)
                /* We do not allow the replacement of a lowercase letter by an uppercase
                 * letter at the beginning of the word like Niserable, unless the whole word
                 * is in uppercase or the letter is the same, modulo the case */
                && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL) || word[0]==u_toupper(c)))) {
                cfg->current_errors++;
                cfg->current_SP_CHANGE++;
                /* Now we test all possible kinds of change */
                vector_int_add(cfg->pairs,pos_word);
                u_strcat(inflected,c);
                /* We always add the default case */
                vector_int_add(cfg->pairs,SP_CHANGE_DEFAULT);
                /* Number of pairs including ours: the variants below reset
                 * to it and only rewrite the kind of the last pair */
                int n_elem=cfg->pairs->nbelems;
                explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
                /* Then we test the accent case */
                if (u_deaccentuate(c)==u_deaccentuate(word[pos_word])) {
                    /* After a call to explore_dic, we must restore the output.
                     * But, when dealing with SP_CHANGE_XXX ops, we must restore the
                     * output including the output associated to the current transition,
                     * which is why we don't use z (output before the current transition)
                     * but backup_output */
                    restore_output(backup_output,output);
                    cfg->pairs->nbelems=n_elem;
                    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_DIACRITIC;
                    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
                }
                /* And the case variations */
                if (u_tolower(c)==u_tolower(word[pos_word])) {
                    restore_output(backup_output,output);
                    cfg->pairs->nbelems=n_elem;
                    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_CASE;
                    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
                }
                /* And finally the position on keyboard */
                if (areCloseOnKeyboard(c,word[pos_word],cfg->keyboard)) {
                    restore_output(backup_output,output);
                    cfg->pairs->nbelems=n_elem;
                    cfg->pairs->tab[cfg->pairs->nbelems-1]=SP_CHANGE_KEYBOARD;
                    explore_dic(dest_offset,word,pos_word+1,d,cfg,output,list,base,inflected);
                }
                /* We forget the pair we recorded for this change */
                cfg->pairs->nbelems=size_pairs;
                cfg->current_errors--;
                cfg->current_SP_CHANGE--;
                /* End of the SP_CHANGE case */
            }
        }
        /* Back to the state right after reading the current transition:
         * its output is kept, the letter appended to inflected is dropped */
        restore_output(backup_output,output);
        truncate(inflected,l2);
        /* Now we deal with the SP_SUPPR case */
        if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_SUPPR!=cfg->max_SP_SUPPR
            /* We want letters, not spaces or anything else */
            && is_letter(c,NULL)) {
            cfg->current_errors++;
            cfg->current_SP_SUPPR++;
            vector_int_add(cfg->pairs,pos_word);
            /* The kind depends on whether the transition letter repeats
             * the previous letter of the word */
            if (pos_word>=1 && c==word[pos_word-1]) {
                vector_int_add(cfg->pairs,SP_SUPPR_DOUBLE);
            } else {
                vector_int_add(cfg->pairs,SP_SUPPR_DEFAULT);
            }
            u_strcat(inflected,c);
            /* We follow the transition but stay on the same word letter */
            explore_dic(dest_offset,word,pos_word,d,cfg,output,list,original_base,inflected);
            truncate(inflected,l2);
            cfg->pairs->nbelems=size_pairs;
            cfg->current_errors--;
            cfg->current_SP_SUPPR--;
        }
    }
    restore_output(z,output);
    /* Finally, we deal with the SP_INSERT case, by calling again the current
     * function with the same parameters, except pos_word that will be increased of 1 */
    if (cfg->current_errors!=cfg->max_errors && cfg->current_SP_INSERT!=cfg->max_SP_INSERT
        /* We want letters, not spaces or anything else */
        && is_letter(word[pos_word],NULL)
        /* We do not allow the insertion of a capital letter at the beginning of
         * the word like Astreet, unless the whole word is in uppercase like ASTREET */
        && (cfg->allow_uppercase_initial || pos_word>0 || (!is_upper(word[0],NULL) || is_upper(word[1],NULL)))) {
        cfg->current_errors++;
        cfg->current_SP_INSERT++;
        vector_int_add(cfg->pairs,pos_word);
        /* The kind depends on whether the skipped word letter repeats the
         * previous one */
        if (pos_word>=1 && word[pos_word]==word[pos_word-1]) {
            vector_int_add(cfg->pairs,SP_INSERT_DOUBLE);
        } else {
            vector_int_add(cfg->pairs,SP_INSERT_DEFAULT);
        }
        explore_dic(original_offset,word,pos_word+1,d,cfg,output,list,original_base,inflected);
        truncate(inflected,l2);
        cfg->pairs->nbelems=size_pairs;
        cfg->current_errors--;
        cfg->current_SP_INSERT--;
    }
    /* Finally, we restore the output as it was when we enter the function */
    restore_output(z,output);
}
void char_by_char_tokenization(U_FILE* f,U_FILE* coded_text,U_FILE* output,Alphabet* alph, vector_ptr* tokens,struct hash_table* hashtable, vector_int* n_occur,vector_int* n_enter_pos, int *SENTENCES,int *TOKENS_TOTAL,int *WORDS_TOTAL, int *DIGITS_TOTAL) { int c; unichar s[MAX_TAG_LENGTH]; int n; char ENTER; int COUNT=0; int current_megabyte=0; c=u_fgetc(f); while (c!=EOF) { COUNT++; if ((COUNT/(1024*512))!=current_megabyte) { current_megabyte++; u_printf("%d megabytes read... \r",(COUNT/(1024*512))); } if (c==' ' || c==0x0d || c==0x0a) { ENTER=0; if (c=='\n') { ENTER=1; } // if the char is a separator, we jump all the separators while ((c=u_fgetc(f))==' ' || c==0x0d || c==0x0a) { if (c=='\n') ENTER=1; COUNT++; } s[0]=' '; s[1]='\0'; n=get_token_number(s,tokens,hashtable,n_occur); /* If there is a \n, we note it */ if (ENTER==1) { vector_int_add(n_enter_pos,*TOKENS_TOTAL); } (*TOKENS_TOTAL)++; fwrite(&n,4,1,coded_text); } else if (c=='{') { s[0]='{'; int z=1; while (z<(MAX_TAG_LENGTH-1) && (c=u_fgetc(f))!='}' && c!='{' && c!='\n') { s[z++]=(unichar)c; COUNT++; } if (c=='\n') { // if the tag contains a return fatal_error("Error: a tag containing a new-line sequence has been found\n"); } if (z==(MAX_TAG_LENGTH-1) || c!='}') { // if the tag has no ending } if (z==(MAX_TAG_LENGTH-1)) {z--;} s[z]='\0'; fatal_error("Error: a tag without ending } has been found:\n==>%S<==\n",s); } s[z]='}'; s[z+1]='\0'; if (!u_strcmp(s,"{S}")) { // if we have found a sentence delimiter (*SENTENCES)++; } else { if (u_strcmp(s,"{STOP}") && !check_tag_token(s)) { // if a tag is incorrect, we exit fatal_error("The text contains an invalid tag. 
Unitex cannot process it."); } } n=get_token_number(s,tokens,hashtable,n_occur); (*TOKENS_TOTAL)++; fwrite(&n,4,1,coded_text); c=u_fgetc(f); } else { s[0]=(unichar)c; s[1]='\0'; n=get_token_number(s,tokens,hashtable,n_occur); (*TOKENS_TOTAL)++; if (is_letter((unichar)c,alph)) (*WORDS_TOTAL)++; else if (c>='0' && c<='9') (*DIGITS_TOTAL)++; fwrite(&n,4,1,coded_text); c=u_fgetc(f); } } for (n=0;n<tokens->nbelems;n++) { u_fprintf(output,"%S\n",tokens->tab[n],output); } }
/**
 * Appends one token shift to the given snt offsets. A shift is stored as
 * three consecutive integers: the token position, the shift before it and
 * the shift after it.
 */
void add_snt_offsets(vector_int* snt_offsets,int token_pos,int shift_before,int shift_after) {
    const int triple[3]={token_pos,shift_before,shift_after};
    for (int k=0;k<3;k++) {
        vector_int_add(snt_offsets,triple[k]);
    }
}