/** * Loads an existing token file. */ void load_token_file(char* filename,int mask_encoding_compatibility_input,vector_ptr* tokens,struct hash_table* hashtable,vector_int* n_occur) { U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,filename,U_READ); if (f==NULL) { fatal_error("Cannot open token file %s\n",filename); } unichar tmp[1024]; if (EOF==u_fgets_limit2(tmp,1024,f)) { fatal_error("Unexpected empty token file %s\n",filename); } while (EOF!=u_fgets_limit2(tmp,1024,f)) { int n=get_token_number(tmp,tokens,hashtable,n_occur); /* We decrease the number of occurrences, in order to have all those numbers equal to 0 */ n_occur->tab[n]--; } u_fclose(f); }
/**
 * This function takes a compound word and tokenizes it according to
 * the given text tokens. The result is an integer sequence that is
 * stored in 'token_sequence'. Each integer represents a token number,
 * and the sequence is ended by -1.
 *
 * Example: "sans raison" may be turned into (121,1,1643,-1)
 *
 * Returns 1 on success, 0 if a token of the compound word is not found
 * among the given text tokens.
 *
 * WARNING: every token of the compound word is supposed to be present
 * in the given text tokens.
 *
 * NOTE(review): 'token_sequence' is assumed to be large enough to hold
 * all the tokens plus the final -1 — caller's responsibility (TODO confirm).
 */
int build_token_sequence(unichar* compound_word,struct text_tokens* tokens,int* token_sequence) {
struct list_ustring* list=tokenize(compound_word,WORD_BY_WORD_TOKENIZATION,NULL);
struct list_ustring* tmp;
int i=0;
while (list!=NULL) {
   token_sequence[i]=get_token_number(list->string,tokens);
   if (token_sequence[i]==-1) {
      error("Unknown token <%S> in build_token_sequence\n",list->string);
      /* Fix: the original code returned here without releasing the rest
       * of the token list, leaking every remaining element. We free them
       * before bailing out. */
      while (list!=NULL) {
         tmp=list;
         list=list->next;
         free_list_ustring_element(tmp);
      }
      return 0;
   }
   i++;
   /* Each element is consumed and freed as we advance */
   tmp=list;
   list=list->next;
   free_list_ustring_element(tmp);
}
/* We put the final -1 */
token_sequence[i]=-1;
return 1;
}
void char_by_char_tokenization(U_FILE* f,U_FILE* coded_text,U_FILE* output,Alphabet* alph, vector_ptr* tokens,struct hash_table* hashtable, vector_int* n_occur,vector_int* n_enter_pos, int *SENTENCES,int *TOKENS_TOTAL,int *WORDS_TOTAL, int *DIGITS_TOTAL) { int c; unichar s[MAX_TAG_LENGTH]; int n; char ENTER; int COUNT=0; int current_megabyte=0; c=u_fgetc(f); while (c!=EOF) { COUNT++; if ((COUNT/(1024*512))!=current_megabyte) { current_megabyte++; u_printf("%d megabytes read... \r",(COUNT/(1024*512))); } if (c==' ' || c==0x0d || c==0x0a) { ENTER=0; if (c=='\n') { ENTER=1; } // if the char is a separator, we jump all the separators while ((c=u_fgetc(f))==' ' || c==0x0d || c==0x0a) { if (c=='\n') ENTER=1; COUNT++; } s[0]=' '; s[1]='\0'; n=get_token_number(s,tokens,hashtable,n_occur); /* If there is a \n, we note it */ if (ENTER==1) { vector_int_add(n_enter_pos,*TOKENS_TOTAL); } (*TOKENS_TOTAL)++; fwrite(&n,4,1,coded_text); } else if (c=='{') { s[0]='{'; int z=1; while (z<(MAX_TAG_LENGTH-1) && (c=u_fgetc(f))!='}' && c!='{' && c!='\n') { s[z++]=(unichar)c; COUNT++; } if (c=='\n') { // if the tag contains a return fatal_error("Error: a tag containing a new-line sequence has been found\n"); } if (z==(MAX_TAG_LENGTH-1) || c!='}') { // if the tag has no ending } if (z==(MAX_TAG_LENGTH-1)) {z--;} s[z]='\0'; fatal_error("Error: a tag without ending } has been found:\n==>%S<==\n",s); } s[z]='}'; s[z+1]='\0'; if (!u_strcmp(s,"{S}")) { // if we have found a sentence delimiter (*SENTENCES)++; } else { if (u_strcmp(s,"{STOP}") && !check_tag_token(s)) { // if a tag is incorrect, we exit fatal_error("The text contains an invalid tag. 
Unitex cannot process it."); } } n=get_token_number(s,tokens,hashtable,n_occur); (*TOKENS_TOTAL)++; fwrite(&n,4,1,coded_text); c=u_fgetc(f); } else { s[0]=(unichar)c; s[1]='\0'; n=get_token_number(s,tokens,hashtable,n_occur); (*TOKENS_TOTAL)++; if (is_letter((unichar)c,alph)) (*WORDS_TOTAL)++; else if (c>='0' && c<='9') (*DIGITS_TOTAL)++; fwrite(&n,4,1,coded_text); c=u_fgetc(f); } } for (n=0;n<tokens->nbelems;n++) { u_fprintf(output,"%S\n",tokens->tab[n],output); } }