Esempio n. 1
0
/**
 * Reads a previously saved token file and registers its content.
 *
 * The first line of the file is read and thrown away. Each following
 * line is handed to get_token_number(), and the occurrence counter of
 * the token number it returns is then decremented by one.
 *
 * The function stops through fatal_error() if the file cannot be opened
 * or if not even one line can be read from it.
 */
void load_token_file(char* filename,int mask_encoding_compatibility_input,vector_ptr* tokens,struct hash_table* hashtable,vector_int* n_occur) {
enum { LINE_CAPACITY=1024 };
unichar line[LINE_CAPACITY];
U_FILE* token_file=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,filename,U_READ);
if (NULL==token_file) {
   fatal_error("Cannot open token file %s\n",filename);
}
/* The first line is consumed but never registered as a token */
if (u_fgets_limit2(line,LINE_CAPACITY,token_file)==EOF) {
   fatal_error("Unexpected empty token file %s\n",filename);
}
for (;;) {
   if (u_fgets_limit2(line,LINE_CAPACITY,token_file)==EOF) {
      break;
   }
   int token_index=get_token_number(line,tokens,hashtable,n_occur);
   /* Registering the token is presumed to count one occurrence of it;
    * we take it back so that every loaded token ends up with a count of 0 */
   n_occur->tab[token_index]-=1;
}
u_fclose(token_file);
}
/**
 * This function takes a compound word and tokenizes it according to
 * the given text tokens. The result is an integer sequence that is
 * stored in 'token_sequence'. Each integer represents a token number,
 * and the sequence is ended by -1.
 *
 * Example: "sans raison" may be turned into (121,1,1643,-1)
 *
 * Returns 1 on success. If a token of the compound word has no number
 * in 'tokens', an error message is printed and 0 is returned; in that
 * case the slot of the unknown token holds -1, so the partial sequence
 * is still ended by -1.
 *
 * The token list produced by tokenize() is entirely released before
 * returning, on the success path as well as on the failure path.
 *
 * WARNING: every token of the compound word is supposed to be present
 *          in the given text tokens.
 * WARNING: no bound is checked on 'token_sequence'; the caller must
 *          provide room for one integer per token plus the final -1.
 */
int build_token_sequence(unichar* compound_word,struct text_tokens* tokens,int* token_sequence) {
struct list_ustring* list=tokenize(compound_word,WORD_BY_WORD_TOKENIZATION,NULL);
struct list_ustring* tmp;
int i=0;
while (list!=NULL) {
   token_sequence[i]=get_token_number(list->string,tokens);
   if (token_sequence[i]==-1) {
      /* The message must be printed before the element is released,
       * since it refers to list->string */
      error("Unknown token <%S> in build_token_sequence\n",list->string);
      /* We free the unknown token and all the tokens that follow it,
       * otherwise the rest of the list would be leaked */
      while (list!=NULL) {
         tmp=list;
         list=list->next;
         free_list_ustring_element(tmp);
      }
      return 0;
   }
   i++;
   tmp=list;
   list=list->next;
   free_list_ustring_element(tmp);
}
/* We put the final -1 */
token_sequence[i]=-1;
return 1;
}
Esempio n. 3
0
/**
 * Tokenizes the text read from 'f' one character at a time.
 *
 * Three kinds of tokens are produced:
 *  - any run of spaces, carriage returns and line feeds becomes one
 *    single " " token;
 *  - a text between '{' and '}' becomes one tag token; "{S}" is counted
 *    as a sentence delimiter, "{STOP}" is accepted as is, and any other
 *    tag must be accepted by check_tag_token();
 *  - any other character becomes a one-character token.
 *
 * For each token, its number is obtained with get_token_number() and
 * written to 'coded_text' as 4 raw bytes. Once the input is exhausted,
 * every entry of 'tokens' is printed to 'output', one per line.
 *
 * f            : input text
 * coded_text   : receives the token numbers
 * output       : receives the final token list
 * alph         : alphabet passed to is_letter()
 * tokens, hashtable, n_occur : token storage passed to get_token_number()
 * n_enter_pos  : receives the index of each separator token whose run
 *                contained a line feed
 * SENTENCES    : incremented for each "{S}"
 * TOKENS_TOTAL : incremented for each token written
 * WORDS_TOTAL  : incremented for each one-character token that is a letter
 * DIGITS_TOTAL : incremented for each one-character token in '0'..'9'
 *                that is not a letter
 *
 * The function stops through fatal_error() on a tag that contains a
 * line feed, that has no closing '}' (including a tag interrupted by the
 * end of the input), that is too long for the tag buffer, or that is
 * rejected by check_tag_token().
 *
 * NOTE(review): the value returned by fwrite() is not checked, and the
 * hard-coded size 4 assumes that an int is 4 bytes long.
 */
void char_by_char_tokenization(U_FILE* f,U_FILE* coded_text,U_FILE* output,Alphabet* alph,
                               vector_ptr* tokens,struct hash_table* hashtable,
                               vector_int* n_occur,vector_int* n_enter_pos,
                               int *SENTENCES,int *TOKENS_TOTAL,int *WORDS_TOTAL,
                               int *DIGITS_TOTAL) {
int c;
unichar s[MAX_TAG_LENGTH];
int n;
char ENTER;
int COUNT=0;
int current_megabyte=0;
c=u_fgetc(f);
while (c!=EOF) {
   COUNT++;
   /* Progress display: one step every 1024*512 characters read
    * (presumably 2 bytes per character, hence "megabytes") */
   if ((COUNT/(1024*512))!=current_megabyte) {
      /* COUNT may have grown by more than one step inside the inner
       * loops below, so we resynchronize instead of adding 1 */
      current_megabyte=COUNT/(1024*512);
      u_printf("%d megabytes read...         \r",current_megabyte);
   }
   if (c==' ' || c==0x0d || c==0x0a) {
      ENTER=0;
      if (c=='\n') {
         ENTER=1;
      }
      // if the char is a separator, we jump all the separators
      while ((c=u_fgetc(f))==' ' || c==0x0d || c==0x0a) {
         if (c=='\n') ENTER=1;
         COUNT++;
      }
      /* The whole run of separators is represented by one space token */
      s[0]=' ';
      s[1]='\0';
      n=get_token_number(s,tokens,hashtable,n_occur);
      /* If there is a \n, we note it */
      if (ENTER==1) {
         vector_int_add(n_enter_pos,*TOKENS_TOTAL);
      }
      (*TOKENS_TOTAL)++;
      fwrite(&n,4,1,coded_text);
      /* Here, c already contains the first character after the separators */
   }
   else if (c=='{') {
     s[0]='{';
     int z=1;
     /* We copy the tag content; the test on z keeps room for the
      * final '}' and '\0'. We also stop on the end of the input, so
      * that EOF is never stored as a character of the tag */
     while (z<(MAX_TAG_LENGTH-1) && (c=u_fgetc(f))!='}' && c!='{' && c!='\n' && c!=EOF) {
        s[z++]=(unichar)c;
        COUNT++;
     }
     if (c=='\n') {
        // if the tag contains a return
        fatal_error("Error: a tag containing a new-line sequence has been found\n");
     }
     if (z==(MAX_TAG_LENGTH-1) || c!='}') {
        // if the tag has no ending }
        if (z==(MAX_TAG_LENGTH-1)) {z--;}
        s[z]='\0';
        fatal_error("Error: a tag without ending } has been found:\n==>%S<==\n",s);
     }
     s[z]='}';
     s[z+1]='\0';
     if (!u_strcmp(s,"{S}")) {
        // if we have found a sentence delimiter
        (*SENTENCES)++;
     } else {
        if (u_strcmp(s,"{STOP}") && !check_tag_token(s)) {
           // if a tag is incorrect, we exit
           fatal_error("The text contains an invalid tag. Unitex cannot process it.");
        }
     }
     n=get_token_number(s,tokens,hashtable,n_occur);
     (*TOKENS_TOTAL)++;
     fwrite(&n,4,1,coded_text);
     c=u_fgetc(f);
   }
   else {
      /* Any other character is a token on its own */
      s[0]=(unichar)c;
      s[1]='\0';
      n=get_token_number(s,tokens,hashtable,n_occur);
      (*TOKENS_TOTAL)++;
      if (is_letter((unichar)c,alph)) (*WORDS_TOTAL)++;
      else if (c>='0' && c<='9') (*DIGITS_TOTAL)++;
      fwrite(&n,4,1,coded_text);
      c=u_fgetc(f);
   }
}
/* We save the token list, one token per line. The format string has a
 * single %S, so only the token is passed */
for (n=0;n<tokens->nbelems;n++) {
   u_fprintf(output,"%S\n",tokens->tab[n]);
}
}