示例#1
0
/**
 * For each token of the text, we compute its associated control byte.
 * We use the unknown word file 'err' in order to determine if a token
 * must be matched by <!DIC>
 */
void compute_token_controls(Alphabet* alph,const char* err,struct locate_parameters* p) {
    struct string_hash* ERR=load_key_list(err,p->mask_encoding_compatibility_input);
    int n=p->tokens->size;
    for (int i=0; i<n; i++) {
        p->token_control[i]=get_control_byte(p->tokens->value[i],alph,ERR,p->tokenization_policy);
    }
    free_string_hash(ERR);
}
示例#2
0
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int language=-1;
char alphabet[FILENAME_MAX]="";
char name_bin[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty dictionary file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(name_bin,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'i': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty information file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(info,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (name_bin[0]=='\0') {
   error("You must specify the .bin dictionary to use\n");
   return USAGE_ERROR_CODE;
}

if (output[0]=='\0') {
   error("You must specify the output dictionary file name\n");
   return USAGE_ERROR_CODE;
}

if (language==-1) {
   error("You must specify the language\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return USAGE_ERROR_CODE;
   }
}

char name_inf[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   get_path(name_bin,name_inf);
   strcat(name_inf,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(&vec,name_inf);
   if (forbiddenWords==NULL) {
       /* If there was no file, we don't want to block the process */
       forbiddenWords=new_string_hash(DONT_USE_VALUES);
   }
}

strcpy(name_inf,name_bin);
name_inf[strlen(name_bin)-3]='\0';
strcat(name_inf,"inf");
Dictionary* d=new_Dictionary(&vec,name_bin,name_inf);
if (d==NULL) {
    error("Cannot load dictionary %s\n",name_bin);
    free_string_hash(forbiddenWords);
    free_alphabet(alph);
    return DEFAULT_ERROR_CODE;
}

char tmp[FILENAME_MAX];
strcpy(tmp,argv[options.vars()->optind]);
strcat(tmp,".tmp");

U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[options.vars()->optind]);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   // here we return 0 in order to do not block the preprocessing
   // in the Unitex/GramLab IDE interface, if no dictionary was applied
   // so that there is no "err" file
   return SUCCESS_RETURN_CODE;
}

U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

U_FILE* res=u_fopen(&vec,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   u_fclose(new_unknown_words);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   u_fclose(words);
   return DEFAULT_ERROR_CODE;
}

U_FILE* debug=NULL;
if ((*info)!='\0') {
   debug=u_fopen(&vec,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
  case DUTCH:
    analyse_dutch_unknown_words(alph,
                                d,
                                words,
                                res,
                                debug,
                                new_unknown_words,
                                forbiddenWords);
    break;
  case GERMAN:
    analyse_german_compounds(alph,
                             d,
                             words,
                             res,
                             debug,
                             new_unknown_words);
    break;
  case NORWEGIAN:
    analyse_norwegian_unknown_words(alph,
                                    d,
                                    words,
                                    res,
                                    debug,
                                    new_unknown_words,
                                    forbiddenWords);
    break;
  case RUSSIAN:
     init_russian(&UTAG);
     analyse_compounds(alph,
                       d,
                       words,
                       res,
                       debug,
                       new_unknown_words,
                       UTAG);
     break;
}

free_alphabet(alph);
free_Dictionary(d);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
af_remove(argv[options.vars()->optind]);
af_rename(tmp,argv[options.vars()->optind]);
u_fclose(res);

if (debug!=NULL) {
   u_fclose(debug);
}

return SUCCESS_RETURN_CODE;
}
示例#3
0
文件: PolyLex.cpp 项目: adri87/Q-A
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int language=-1;
char alphabet[FILENAME_MAX]="";
char dictionary[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_PolyLex,lopts_PolyLex,&index,vars))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'd': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty dictionary file name\n");
             }
             strcpy(dictionary,vars->optarg);
             break;
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'i': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty information file name\n");
             }
             strcpy(info,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

if (dictionary[0]=='\0') {
   fatal_error("You must specify the .bin dictionary to use\n");
}
if (output[0]=='\0') {
   fatal_error("You must specify the output dictionary file name\n");
}
if (language==-1) {
   fatal_error("You must specify the language\n");
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      fatal_error("Cannot load alphabet file %s\n",alphabet);
   }
}
char temp[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   get_path(dictionary,temp);
   strcat(temp,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(temp,mask_encoding_compatibility_input);
}
u_printf("Loading BIN file...\n");
struct BIN_free_info bin_free;
const unsigned char* bin=load_abstract_BIN_file(dictionary,&bin_free);
if (bin==NULL) {
   error("Cannot load bin file %s\n",dictionary);
   free_alphabet(alph);
   free_string_hash(forbiddenWords);
   return 1;
}
strcpy(temp,dictionary);
temp[strlen(dictionary)-3]='\0';
strcat(temp,"inf");
u_printf("Loading INF file...\n");
struct INF_free_info inf_free;
const struct INF_codes* inf=load_abstract_INF_file(temp,&inf_free);
if (inf==NULL) {
   error("Cannot load inf file %s\n",temp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_string_hash(forbiddenWords);
   return 1;
}
char tmp[FILENAME_MAX];
strcpy(tmp,argv[vars->optind]);
strcat(tmp,".tmp");
U_FILE* words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[vars->optind]);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   free_string_hash(forbiddenWords);
   // here we return 0 in order to do not block the preprocessing
   // in the Unitex Java interface, if no dictionary was applied
   // so that there is no "err" file
   return 0;
}
U_FILE* new_unknown_words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   free_string_hash(forbiddenWords);
   return 1;
}

U_FILE* res=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   u_fclose(new_unknown_words);
   free_string_hash(forbiddenWords);
   return 1;
}
U_FILE* debug=NULL;
if (info!=NULL) {
   debug=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
case DUTCH: analyse_dutch_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case GERMAN: analyse_german_compounds(alph,bin,inf,words,res,debug,new_unknown_words); break;
case NORWEGIAN: analyse_norwegian_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case RUSSIAN:
   init_russian(&UTAG);
   analyse_compounds(alph,bin,inf,words,res,debug,new_unknown_words,UTAG);
   break;
}

free_alphabet(alph);
free_abstract_BIN(bin,&bin_free);
free_abstract_INF(inf,&inf_free);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
af_remove(argv[vars->optind]);
af_rename(tmp,argv[vars->optind]);
u_fclose(res);
if (debug!=NULL) {
   u_fclose(debug);
}
free_OptVars(vars);
return 0;
}