/**
 * Computes the control byte of every token of the text and stores it in
 * p->token_control, at the same index as the token in p->tokens.
 * The unknown word file 'err' is loaded as a key list and handed to
 * get_control_byte(); it is used to determine if a token must be
 * matched by <!DIC>.
 */
void compute_token_controls(Alphabet* alph,const char* err,struct locate_parameters* p) {
    struct string_hash* unknown_words=load_key_list(err,p->mask_encoding_compatibility_input);
    const int token_count=p->tokens->size;
    int pos=0;
    while (pos<token_count) {
        p->token_control[pos]=get_control_byte(p->tokens->value[pos],alph,unknown_words,p->tokenization_policy);
        pos++;
    }
    /* The key list is only needed while the control bytes are being computed */
    free_string_hash(unknown_words);
}
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } int language=-1; char alphabet[FILENAME_MAX]=""; char name_bin[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty dictionary file name\n"); return USAGE_ERROR_CODE; } strcpy(name_bin,options.vars()->optarg); break; case 'o': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output file name\n"); return USAGE_ERROR_CODE; } strcpy(output,options.vars()->optarg); break; case 'i': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty information file name\n"); return USAGE_ERROR_CODE; } strcpy(info,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? 
error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_PolyLex[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (name_bin[0]=='\0') { error("You must specify the .bin dictionary to use\n"); return USAGE_ERROR_CODE; } if (output[0]=='\0') { error("You must specify the output dictionary file name\n"); return USAGE_ERROR_CODE; } if (language==-1) { error("You must specify the language\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return USAGE_ERROR_CODE; } } char name_inf[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(name_bin,name_inf); strcat(name_inf,"ForbiddenWords.txt"); forbiddenWords=load_key_list(&vec,name_inf); if (forbiddenWords==NULL) { /* If there was no file, we don't want to block the process */ forbiddenWords=new_string_hash(DONT_USE_VALUES); } } strcpy(name_inf,name_bin); name_inf[strlen(name_bin)-3]='\0'; strcat(name_inf,"inf"); Dictionary* d=new_Dictionary(&vec,name_bin,name_inf); if (d==NULL) { error("Cannot load dictionary %s\n",name_bin); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[options.vars()->optind]); strcat(tmp,".tmp"); U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[options.vars()->optind]); free_Dictionary(d); 
free_string_hash(forbiddenWords); free_alphabet(alph); // here we return 0 in order to do not block the preprocessing // in the Unitex/GramLab IDE interface, if no dictionary was applied // so that there is no "err" file return SUCCESS_RETURN_CODE; } U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* res=u_fopen(&vec,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); u_fclose(new_unknown_words); u_fclose(words); free_Dictionary(d); free_string_hash(forbiddenWords); free_alphabet(alph); u_fclose(words); return DEFAULT_ERROR_CODE; } U_FILE* debug=NULL; if ((*info)!='\0') { debug=u_fopen(&vec,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case GERMAN: analyse_german_compounds(alph, d, words, res, debug, new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph, d, words, res, debug, new_unknown_words, forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph, d, words, res, debug, new_unknown_words, UTAG); break; } free_alphabet(alph); free_Dictionary(d); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[options.vars()->optind]); af_rename(tmp,argv[options.vars()->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } return SUCCESS_RETURN_CODE; }
int main_PolyLex(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int language=-1; char alphabet[FILENAME_MAX]=""; char dictionary[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; char info[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; int val,index=-1; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_PolyLex,lopts_PolyLex,&index,vars))) { switch(val) { case 'D': language=DUTCH; break; case 'G': language=GERMAN; break; case 'N': language=NORWEGIAN; break; case 'R': language=RUSSIAN; break; case 'a': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty alphabet file name\n"); } strcpy(alphabet,vars->optarg); break; case 'd': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty dictionary file name\n"); } strcpy(dictionary,vars->optarg); break; case 'o': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty output file name\n"); } strcpy(output,vars->optarg); break; case 'i': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty information file name\n"); } strcpy(info,vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_PolyLex[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; } index=-1; } if 
(vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } if (dictionary[0]=='\0') { fatal_error("You must specify the .bin dictionary to use\n"); } if (output[0]=='\0') { fatal_error("You must specify the output dictionary file name\n"); } if (language==-1) { fatal_error("You must specify the language\n"); } Alphabet* alph=NULL; if (alphabet[0]!='\0') { u_printf("Loading alphabet...\n"); alph=load_alphabet(alphabet); if (alph==NULL) { fatal_error("Cannot load alphabet file %s\n",alphabet); } } char temp[FILENAME_MAX]; struct string_hash* forbiddenWords=NULL; if (language==DUTCH || language==NORWEGIAN) { get_path(dictionary,temp); strcat(temp,"ForbiddenWords.txt"); forbiddenWords=load_key_list(temp,mask_encoding_compatibility_input); } u_printf("Loading BIN file...\n"); struct BIN_free_info bin_free; const unsigned char* bin=load_abstract_BIN_file(dictionary,&bin_free); if (bin==NULL) { error("Cannot load bin file %s\n",dictionary); free_alphabet(alph); free_string_hash(forbiddenWords); return 1; } strcpy(temp,dictionary); temp[strlen(dictionary)-3]='\0'; strcat(temp,"inf"); u_printf("Loading INF file...\n"); struct INF_free_info inf_free; const struct INF_codes* inf=load_abstract_INF_file(temp,&inf_free); if (inf==NULL) { error("Cannot load inf file %s\n",temp); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_string_hash(forbiddenWords); return 1; } char tmp[FILENAME_MAX]; strcpy(tmp,argv[vars->optind]); strcat(tmp,".tmp"); U_FILE* words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ); if (words==NULL) { error("Cannot open word list file %s\n",argv[vars->optind]); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); free_string_hash(forbiddenWords); // here we return 0 in order to do not block the preprocessing // in the Unitex Java interface, if no dictionary was applied // so that there is no "err" file return 0; } U_FILE* 
new_unknown_words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,tmp,U_WRITE); if (new_unknown_words==NULL) { error("Cannot open temporary word list file %s\n",tmp); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); free_string_hash(forbiddenWords); return 1; } U_FILE* res=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output,U_APPEND); if (res==NULL) { error("Cannot open result file %s\n",output); free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); return 1; } U_FILE* debug=NULL; if (info!=NULL) { debug=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,info,U_WRITE); if (debug==NULL) { error("Cannot open debug file %s\n",info); } } struct utags UTAG; switch(language) { case DUTCH: analyse_dutch_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break; case GERMAN: analyse_german_compounds(alph,bin,inf,words,res,debug,new_unknown_words); break; case NORWEGIAN: analyse_norwegian_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break; case RUSSIAN: init_russian(&UTAG); analyse_compounds(alph,bin,inf,words,res,debug,new_unknown_words,UTAG); break; } free_alphabet(alph); free_abstract_BIN(bin,&bin_free); free_abstract_INF(inf,&inf_free); u_fclose(words); u_fclose(new_unknown_words); free_string_hash(forbiddenWords); af_remove(argv[vars->optind]); af_rename(tmp,argv[vars->optind]); u_fclose(res); if (debug!=NULL) { u_fclose(debug); } free_OptVars(vars); return 0; }