コード例 #1
0
ファイル: Reg2Grf.cpp プロジェクト: adri87/Q-A
int main_Reg2Grf(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;

int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Reg2Grf,lopts_Reg2Grf,&index,vars))) {
   switch(val) {
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Reg2Grf[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

U_FILE* f=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (f==NULL) {
   fatal_error("Cannot open file %s\n",argv[vars->optind]);
}
/* We read the regular expression in the file */
unichar exp[REG_EXP_MAX_LENGTH];
if ((REG_EXP_MAX_LENGTH-1)==u_fgets(exp,REG_EXP_MAX_LENGTH,f)) {
   fatal_error("Too long regular expression\n");
}
u_fclose(f);
char grf_name[FILENAME_MAX];
get_path(argv[vars->optind],grf_name);
strcat(grf_name,"regexp.grf");
if (!reg2grf(exp,grf_name,encoding_output,bom_output)) {
   return 1;
}
free_OptVars(vars);
u_printf("Expression converted.\n");
return 0;
}
コード例 #2
0
ファイル: TEI2Txt.cpp プロジェクト: adri87/Q-A
int main_TEI2Txt(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

char output[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_TEI2Txt,lopts_TEI2Txt,&index,vars))) {
   switch(val) {
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_TEI2Txt[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

if(output[0]=='\0') {
   remove_extension(argv[vars->optind],output);
	strcat(output,".txt");
}
tei2txt(argv[vars->optind],output,encoding_output,bom_output);
free_OptVars(vars);
return 0;
}
コード例 #3
0
int main_Normalize(int argc,char* const argv[]) {
if (argc==1) {
  usage();
  return SUCCESS_RETURN_CODE;
}
int mode=KEEP_CARRIAGE_RETURN;
int separator_normalization=1;
char rules[FILENAME_MAX]="";
char input_offsets[FILENAME_MAX]="";
char output_offsets[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int convLFtoCRLF=1;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Normalize,lopts_Normalize,&index))) {
   switch(val) {
   case 'l': convLFtoCRLF=0; break;
   case 'n': mode=REMOVE_CARRIAGE_RETURN; break;
   case 'r': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty replacement rule file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(rules,options.vars()->optarg);
             break;
   case 1: separator_normalization=0; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
              error("Empty input_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
              error("Empty output_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty input offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(input_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty output offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(output_offsets,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Normalize[index].name);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
  error("Invalid arguments: rerun with --help\n");
  return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

vector_offset* v_input_offsets=NULL;
vector_offset* v_output_offsets=NULL;
U_FILE* f_output_offsets=NULL;

if (output_offsets[0]!='\0') {
  /* We deal with offsets only if we have to produce output offsets */
  if (input_offsets[0]!='\0') {
    v_input_offsets=load_offsets(&vec,input_offsets);
  }
  f_output_offsets=u_fopen(&vec, output_offsets, U_WRITE);
  if (f_output_offsets==NULL) {
    error("Cannot create offset file %s\n",output_offsets);
    return DEFAULT_ERROR_CODE;
  }
  v_output_offsets=new_vector_offset();
}
char tmp_file[FILENAME_MAX];
get_extension(argv[options.vars()->optind],tmp_file);
if (!strcmp(tmp_file, ".snt")) {
   /* If the file to process has already the .snt extension, we temporary rename it to
   * .snt.normalizing */
  strcpy(tmp_file,argv[options.vars()->optind]);
  strcat(tmp_file,".normalizing");
  af_rename(argv[options.vars()->optind],tmp_file);
} else {
   strcpy(tmp_file,argv[options.vars()->optind]);
}
/* We set the destination file */
char dest_file[FILENAME_MAX];
remove_extension(argv[options.vars()->optind],dest_file);
strcat(dest_file,".snt");
u_printf("Normalizing %s...\n",argv[options.vars()->optind]);

int return_value = normalize(tmp_file,
                             dest_file,
                             &vec,
                             mode,
                             convLFtoCRLF,
                             rules,
                             v_output_offsets,
                             separator_normalization);
u_printf("\n");
/* If we have used a temporary file, we delete it */
if (strcmp(tmp_file,argv[options.vars()->optind])) {
   af_remove(tmp_file);
}
process_offsets(v_input_offsets,v_output_offsets,f_output_offsets);
u_fclose(f_output_offsets);
free_vector_offset(v_input_offsets);
free_vector_offset(v_output_offsets);
u_printf((return_value==SUCCESS_RETURN_CODE) ? "Done.\n" : "Unsuccessfull.\n");

return return_value;
}
コード例 #4
0
InstallLogger::InstallLogger(int argc,char* const argv[]) :
  ule(ule_default_init), init_done(0) {
  ClearUniLoggerSpaceStruct(0);
  if (argc==1) {
    usage();
    return;
  }

  Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
  int bom_output = DEFAULT_BOM_OUTPUT;
  int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
  int val,index=-1;
  bool only_verify_arguments = false;
  UnitexGetOpt options;
  while (EOF!=(val=options.parse_long(argc,argv,optstring_CreateLog,lopts_CreateLog,&index))) {
     switch(val) {
       case 'V': only_verify_arguments = true;
                 break;
       case 'h': usage();
                 return;
       case 'n': ule.store_file_in_content = 0;
                 break;
       case 'i': ule.store_file_in_content = 1;
                 break;
       case 'o': ule.store_file_out_content = 1;
                 break;
       case 'u': ule.store_file_out_content = 0;
                 break;
       case 's': ule.store_list_file_in_content = 1;
                 break;
       case 't': ule.store_list_file_in_content = 0;
                 break;
       case 'r': ule.store_list_file_out_content = 1;
                 break;
       case 'f': ule.store_list_file_out_content = 0;
                 break;
       case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                             error("Missing argument for option --%s\n",lopts_CreateLog[index].name);
                 return;            
       case 'k': if (options.vars()->optarg[0]=='\0') {
                    error("Empty input_encoding argument\n");
                    return;
                 }
                 decode_reading_encoding_parameter(&mask_encoding_compatibility_input,options.vars()->optarg);
                 break;
       case 'q': if (options.vars()->optarg[0]=='\0') {
                    error("Empty output_encoding argument\n");
                    return;
                 }
                 decode_writing_encoding_parameter(&encoding_output,&bom_output,options.vars()->optarg);
                 break;
       case 'g': ClearUniLoggerSpaceStruct(1);
                 return;
       case 'p': if (options.vars()->optarg[0]=='\0') {
                    error("You must specify a non empty param file\n");
                    return;
                 }
                 ClearUniLoggerSpaceStruct(1);
                 LoadParamFile(options.vars()->optarg);
                 return;
       case 'l': if (options.vars()->optarg[0]=='\0') {
                    error("You must specify a non empty log filename\n");
                    return;
                 }
                 if (ule.szNameLog != NULL) {
                     free((void*)ule.szNameLog);
                 }
                 ule.szNameLog = strdup(options.vars()->optarg);
                 break;
     
       case 'd': if (options.vars()->optarg[0]=='\0') {
                    error("You must specify a non empty directory\n");
                    return;
                 }
                 if (ule.szPathLog != NULL) {
                     free((void*)ule.szPathLog);
                 }    
                 ule.szPathLog = strdup(options.vars()->optarg);
                 break;

       case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                             error("Invalid option --%s\n",options.vars()->optarg);
                 return;
     }
     index=-1;
  }

  if (options.vars()->optind!=argc-1) {
  }

  if (only_verify_arguments) {
    // freeing all allocated memory
    return;
  }

  if (AddActivityLogger(&ule) != 0) {
    init_done = 1;
  } else {
    ClearUniLoggerSpaceStruct(1);
  }
}
コード例 #5
0
ファイル: ConcorDiff.cpp プロジェクト: adri87/Q-A
int main_ConcorDiff(int argc,char* const argv[]) {
if (argc==1) {
	usage();
	return 0;
}


int val,index=-1;
char* out=NULL;
char* font=NULL;
int size=0;
char foo;
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_ConcorDiff,lopts_ConcorDiff,&index,vars))) {
   switch(val) {
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file\n");
             }
             out=strdup(vars->optarg);
             if (out==NULL) {
                fatal_alloc_error("main_ConcorDiff");
             }
             break;
   case 'f': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty font name\n");
             }
             font=strdup(vars->optarg);
             if (font==NULL) {
                fatal_alloc_error("main_ConcorDiff");
             }
             break;
   case 's': if (1!=sscanf(vars->optarg,"%d%c",&size,&foo)
                 || size<=0) {
                /* foo is used to check that the font size is not like "45gjh" */
                fatal_error("Invalid font size argument: %s\n",vars->optarg);
             }
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_ConcorDiff[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   }
   index=-1;
}

if (out==NULL) {
   fatal_error("You must specify the output file\n");
}
if (font==NULL) {
   fatal_error("You must specify the font to use\n");
}
if (size==0) {
   fatal_error("You must specify the font size to use\n");
}
if (vars->optind!=argc-2) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}
diff(encoding_output,bom_output,mask_encoding_compatibility_input,argv[vars->optind],argv[vars->optind+1],out,font,size);
free(out);
free(font);
free_OptVars(vars);
return 0;
}
コード例 #6
0
int main_Flatten(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int RTN=1;
int depth=10;
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char foo;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_Flatten,lopts_Flatten,&index))) {
   switch(val) {
   case 'f': RTN=0; break;
   case 'r': RTN=1; break;
   case 'd': if (1!=sscanf(options.vars()->optarg,"%d%c",&depth,&foo) || depth<=0) {
                /* foo is used to check that the depth is not like "45gjh" */
                error("Invalid depth argument: %s\n",options.vars()->optarg);
                return USAGE_ERROR_CODE;
             }
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'h': usage(); return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Flatten[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

u_printf("Loading %s...\n",argv[options.vars()->optind]);
struct FST2_free_info fst2_free;
Fst2* origin=load_abstract_fst2(&vec,argv[options.vars()->optind],1,&fst2_free);
if (origin==NULL) {
   error("Cannot load %s\n",argv[options.vars()->optind]);
   return DEFAULT_ERROR_CODE;
}

char temp[FILENAME_MAX];
strcpy(temp,argv[options.vars()->optind]);
strcat(temp,".tmp.fst2");

switch (flatten_fst2(origin,depth,temp,&vec,RTN)) {
   case EQUIVALENT_FST:
      u_printf("The resulting grammar is an equivalent finite-state transducer.\n");
      break;
   case APPROXIMATIVE_FST:
      u_printf("The resulting grammar is a finite-state approximation.\n");
      break;
   case EQUIVALENT_RTN:
      u_printf("The resulting grammar is an equivalent FST2 (RTN).\n");
      break;
   default: 
      error("Internal state error in Flatten's main\n");
      free_abstract_Fst2(origin,&fst2_free);
      return DEFAULT_ERROR_CODE;
}

free_abstract_Fst2(origin,&fst2_free);
af_remove(argv[options.vars()->optind]);
af_rename(temp,argv[options.vars()->optind]);

return SUCCESS_RETURN_CODE;
}
コード例 #7
0
int main_MultiFlex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

char output[FILENAME_MAX]="";
char config_dir[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char pkgdir[FILENAME_MAX]="";
char* named=NULL;
int is_korean=0;
// default policy is to compile only out of date graphs
GraphRecompilationPolicy graph_recompilation_policy = ONLY_OUT_OF_DATE;
//Current language's alphabet
int error_check_status=SIMPLE_AND_COMPOUND_WORDS;
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_MultiFlex,lopts_MultiFlex,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty DELAF file name\n");
                free(named);
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free(named);
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': strcpy(config_dir,options.vars()->optarg); break;
   case 'K': is_korean=1;
             break;
   case 's': error_check_status=ONLY_SIMPLE_WORDS; break;
   case 'c': error_check_status=ONLY_COMPOUND_WORDS; break;
   case 'f': graph_recompilation_policy = ALWAYS_RECOMPILE; break;
   case 'n': graph_recompilation_policy = NEVER_RECOMPILE;  break;
   case 't': graph_recompilation_policy = ONLY_OUT_OF_DATE; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free(named);
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free(named);
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'p': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty package directory name\n");
                free(named);
                return USAGE_ERROR_CODE;
             }
             strcpy(pkgdir,options.vars()->optarg);
             break;
   case 'r': if (named==NULL) {
                  named=strdup(options.vars()->optarg);
                  if (named==NULL) {
                     alloc_error("main_Grf2Fst2");
                     return ALLOC_ERROR_CODE;
                  }
             } else {
                   char* more_names = (char*)realloc((void*)named,strlen(named)+strlen(options.vars()->optarg)+2);
                 if (more_names) {
                  named = more_names;
                 } else {
                  alloc_error("main_MultiFlex");
                  free(named);
                  return ALLOC_ERROR_CODE;
                 }
                 strcat(named,";");
                 strcat(named,options.vars()->optarg);
             }
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free(named);
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_MultiFlex[index].name);
             free(named);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free(named);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free(named);
   return USAGE_ERROR_CODE;
}

if (output[0]=='\0') {
   error("You must specify the output DELAF name\n");
   free(named);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free(named);
  return SUCCESS_RETURN_CODE;
}

//Load morphology description
char morphology[FILENAME_MAX];
new_file(config_dir,"Morphology.txt",morphology);
//int config_files_status=CONFIG_FILES_OK;
Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   //Load alphabet
   alph=load_alphabet(&vec,alphabet,1);  //To be done once at the beginning of the inflection
   if (alph==NULL) {
      error("Cannot open alphabet file %s\n",alphabet);
      free(named);
      return DEFAULT_ERROR_CODE;
   }
}
//Init equivalence files
char equivalences[FILENAME_MAX];
new_file(config_dir,"Equivalences.txt",equivalences);

/* Korean */
Korean* korean=NULL;
if (is_korean) {
   if (alph==NULL) {
      error("Cannot initialize Korean data with a NULL alphabet\n");
      free(named);
      return DEFAULT_ERROR_CODE;
   }
    korean=new Korean(alph);
}
MultiFlex_ctx* p_multiFlex_ctx=new_MultiFlex_ctx(config_dir,
                                                 morphology,
                                                 equivalences,
                                                 &vec,
                                                 korean,
                                                 pkgdir,
                                                 named,
                                                 graph_recompilation_policy);

//DELAC inflection
int return_value = inflect(argv[options.vars()->optind],output,p_multiFlex_ctx,alph,error_check_status);

free(named);

for (int count_free_fst2=0;count_free_fst2<p_multiFlex_ctx->n_fst2;count_free_fst2++) {
    free_abstract_Fst2(p_multiFlex_ctx->fst2[count_free_fst2],&(p_multiFlex_ctx->fst2_free[count_free_fst2]));
    p_multiFlex_ctx->fst2[count_free_fst2] = NULL;
}

free_alphabet(alph);

free_MultiFlex_ctx(p_multiFlex_ctx);

if (korean!=NULL) {
    delete korean;
}

u_printf("Done.\n");
return return_value;
}
コード例 #8
0
int main_Untokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";
char dynamicSntDir[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
int range_start,range_stop,use_range;
int token_step_number=0;
range_start=range_stop=use_range=0;
char foo=0;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Untokenize,lopts_Untokenize,&index))) {
   switch(val) {
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                   error("You must specify a non empty snt dir name\n");
                   return USAGE_ERROR_CODE;
                }
                strcpy(dynamicSntDir,options.vars()->optarg);
                break;
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty token file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(token_file,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;

   case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&token_step_number,&foo) || token_step_number<=0) {
                /* foo is used to check that the search limit is not like "45gjh" */
                error("Invalid token numbering argument: %s\n",options.vars()->optarg);
                return USAGE_ERROR_CODE;
             }
             break;
   case 'r': {
                int param1 = 0;
                int param2 = 0;
                int ret_scan = sscanf(options.vars()->optarg,"%d,%d%c",&param1,&param2,&foo);
                if (ret_scan == 2) {
                    range_start = param1;
                    range_stop  = param2;
                    use_range=1;
                    if (((range_start < -1)) || (range_stop < -1)) {
                        /* foo is used to check that the search limit is not like "45gjh" */
                        error("Invalid stop count argument: %s\n",options.vars()->optarg);
                        return USAGE_ERROR_CODE;
                    }
                }
                else
                    if (1!=sscanf(options.vars()->optarg,"%d%c",&range_start,&foo) || (range_start < -1)) {
                        /* foo is used to check that the search limit is not like "45gjh" */
                        error("Invalid stop count argument: %s\n",options.vars()->optarg);
                        return USAGE_ERROR_CODE;
                    }
                    use_range=1;
             }
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Untokenize[index].name);
             return USAGE_ERROR_CODE;            
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];

if (dynamicSntDir[0]=='\0') {
    get_snt_path(argv[options.vars()->optind],dynamicSntDir);
}

strcpy(text_cod,dynamicSntDir);
strcat(text_cod,"text.cod");
strcpy(enter_pos,dynamicSntDir);
strcat(enter_pos,"enter.pos");
strcpy(tokens_txt,dynamicSntDir);
strcat(tokens_txt,"tokens.txt");

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return DEFAULT_ERROR_CODE;
   }
}

ABSTRACTMAPFILE* af_text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
if (af_text_cod==NULL) {
  error("Cannot open file %s\n",text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

ABSTRACTMAPFILE* af_enter_pos=af_open_mapfile(enter_pos,MAPFILE_OPTION_READ,0);
if (af_enter_pos==NULL) {
  error("Cannot open file %s\n",enter_pos);
  af_close_mapfile(af_text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

U_FILE* text = u_fopen(&vec,argv[options.vars()->optind],U_WRITE);
if (text==NULL) {
  error("Cannot create text file %s\n",argv[options.vars()->optind]);
  af_close_mapfile(af_enter_pos);
  af_close_mapfile(af_text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

struct text_tokens* tok=load_text_tokens(&vec,tokens_txt);
u_printf("Untokenizing text...\n");
size_t nb_item = af_get_mapfile_size(af_text_cod)/sizeof(int);
const int* buf=(const int*)af_get_mapfile_pointer(af_text_cod);

size_t nb_item_enter_pos=0;
const int* buf_enter=NULL;

if (af_enter_pos!=NULL) {
    buf_enter=(const int*)af_get_mapfile_pointer(af_enter_pos);
    if (buf_enter!=NULL) {
        nb_item_enter_pos=af_get_mapfile_size(af_enter_pos)/sizeof(int);
    }
}

size_t count_pos=0;
for (size_t i=0;i<nb_item;i++) {
    int is_in_range=1;
    if ((use_range!=0) && (i<(size_t)range_start)) {
        is_in_range=0;
    }
    if ((use_range!=0) && (range_stop!=0) && (i>(size_t)range_stop)) {
        is_in_range=0;
    }
    int is_newline=0;
    if (count_pos<nb_item_enter_pos) {
        if (i==(size_t)(*(buf_enter+count_pos))) {
            is_newline = 1;
            count_pos++;
        }
    }

    if (is_in_range!=0) {
        if (token_step_number != 0)
            if ((i%token_step_number)==0)
                u_fprintf(text,"\n\nToken %d : ", (int)i);

        if (is_newline!=0) {
            u_fprintf(text,"\n", tok->token[*(buf+i)]);
        }
        else {
			u_fputs(tok->token[*(buf+i)], text);
        }
    }
}

af_release_mapfile_pointer(af_text_cod,buf);
af_release_mapfile_pointer(af_enter_pos,buf_enter);
af_close_mapfile(af_enter_pos);
af_close_mapfile(af_text_cod);
free_text_tokens(tok);
u_fclose(text);
free_alphabet(alph);

u_printf("\nDone.\n");
return SUCCESS_RETURN_CODE;
}
コード例 #9
0
ファイル: Extract.cpp プロジェクト: adri87/Q-A
int main_Extract(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int val,index=-1;
char extract_matching_units=1;
char text_name[FILENAME_MAX]="";
char concord_ind[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Extract,lopts_Extract,&index,vars))) {
   switch(val) {
   case 'y': extract_matching_units=1; break;
   case 'n': extract_matching_units=0; break;
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'i': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty concordance file name\n");
             }
             strcpy(concord_ind,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Extract[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   }
   index=-1;
}

if (output[0]=='\0') {
   fatal_error("You must specify the output text file\n");
}
if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
strcpy(text_name,argv[vars->optind]);

struct snt_files* snt_files=new_snt_files(text_name);
ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0);
if (text==NULL) {
   error("Cannot open %s\n",snt_files->text_cod);
   return 1;
}
struct text_tokens* tok=load_text_tokens(snt_files->tokens_txt,mask_encoding_compatibility_input);
if (tok==NULL) {
   error("Cannot load token list %s\n",snt_files->tokens_txt);
   af_close_mapfile(text);
   return 1;
}
if (tok->SENTENCE_MARKER==-1) {
   error("The text does not contain any sentence marker {S}\n");
   af_close_mapfile(text);
   free_text_tokens(tok);
   return 1;
}

if (concord_ind[0]=='\0') {
   char tmp[FILENAME_MAX];
   get_extension(text_name,tmp);
   if (strcmp(tmp,"snt")) {
      fatal_error("Unable to find the concord.ind file. Please explicit it\n");
   }
   remove_extension(text_name,concord_ind);
   strcat(concord_ind,"_snt");
   strcat(concord_ind,PATH_SEPARATOR_STRING);
   strcat(concord_ind,"concord.ind");
}
U_FILE* concord=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,concord_ind,U_READ);
if (concord==NULL) {
   error("Cannot open concordance %s\n",concord_ind);
   af_close_mapfile(text);
   free_text_tokens(tok);
   return 1;
}
U_FILE* result=u_fopen_creating_versatile_encoding(encoding_output,bom_output,output,U_WRITE);
if (result==NULL) {
   error("Cannot write output file %s\n",output);
   af_close_mapfile(text);
   u_fclose(concord);
   free_text_tokens(tok);
   return 1;
}
free_snt_files(snt_files);
extract_units(extract_matching_units,text,tok,concord,result);
af_close_mapfile(text);
u_fclose(concord);
u_fclose(result);
free_text_tokens(tok);
free_OptVars(vars);
u_printf("Done.\n");
return 0;
}
コード例 #10
0
ファイル: Tokenize.cpp プロジェクト: adri87/Q-A
int main_Tokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";

Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int mode=NORMAL;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Tokenize,lopts_Tokenize,&index,vars))) {
   switch(val) {
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'c': mode=CHAR_BY_CHAR; break;
   case 'w': mode=NORMAL; break;
   case 't': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty token file name\n");
             }
             strcpy(token_file,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Tokenize[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
U_FILE* text;
U_FILE* out;
U_FILE* output;
U_FILE* enter;
char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];
Alphabet* alph=NULL;

get_snt_path(argv[vars->optind],text_cod);
strcat(text_cod,"text.cod");
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tokens.txt");
get_snt_path(argv[vars->optind],enter_pos);
strcat(enter_pos,"enter.pos");
text=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (text==NULL) {
   fatal_error("Cannot open text file %s\n",argv[vars->optind]);
}
if (alphabet[0]!='\0') {
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      u_fclose(text);
      return 1;
   }
}
out=u_fopen(BINARY,text_cod,U_WRITE);
if (out==NULL) {
   error("Cannot create file %s\n",text_cod);
   u_fclose(text);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}
enter=u_fopen(BINARY,enter_pos,U_WRITE);
if (enter==NULL) {
   error("Cannot create file %s\n",enter_pos);
   u_fclose(text);
   u_fclose(out);
   if (alph!=NULL) {
      free_alphabet(alph);
   }
   return 1;
}


vector_ptr* tokens=new_vector_ptr(4096);
vector_int* n_occur=new_vector_int(4096);
vector_int* n_enter_pos=new_vector_int(4096);
struct hash_table* hashtable=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
                                            (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);
if (token_file[0]!='\0') {
   load_token_file(token_file,mask_encoding_compatibility_input,tokens,hashtable,n_occur);
}

output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot create file %s\n",tokens_txt);
   u_fclose(text);
   u_fclose(out);
   u_fclose(enter);
   if (alph!=NULL) {
      free_alphabet(alph);
   }

   free_hash_table(hashtable);
   free_vector_ptr(tokens,free);
   free_vector_int(n_occur);
   free_vector_int(n_enter_pos);

   return 1;
}
u_fprintf(output,"0000000000\n");

int SENTENCES=0;
int TOKENS_TOTAL=0;
int WORDS_TOTAL=0;
int DIGITS_TOTAL=0;
u_printf("Tokenizing text...\n");
if (mode==NORMAL) {
   normal_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
		   &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
}
else {
   char_by_char_tokenization(text,out,output,alph,tokens,hashtable,n_occur,n_enter_pos,
		   &SENTENCES,&TOKENS_TOTAL,&WORDS_TOTAL,&DIGITS_TOTAL);
}
u_printf("\nDone.\n");
save_new_line_positions(enter,n_enter_pos);
u_fclose(enter);
u_fclose(text);
u_fclose(out);
u_fclose(output);
write_number_of_tokens(tokens_txt,encoding_output,bom_output,tokens->nbelems);
// we compute some statistics
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"stats.n");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   compute_statistics(output,tokens,alph,SENTENCES,TOKENS_TOTAL,WORDS_TOTAL,DIGITS_TOTAL);
   u_fclose(output);
}
// we save the tokens by frequence
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_freq.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_frequence(output,tokens,n_occur);
   u_fclose(output);
}
// we save the tokens by alphabetical order
get_snt_path(argv[vars->optind],tokens_txt);
strcat(tokens_txt,"tok_by_alph.txt");
output=u_fopen_creating_versatile_encoding(encoding_output,bom_output,tokens_txt,U_WRITE);
if (output==NULL) {
   error("Cannot write %s\n",tokens_txt);
}
else {
   sort_and_save_by_alph_order(output,tokens,n_occur);
   u_fclose(output);
}
free_hash_table(hashtable);
free_vector_ptr(tokens,free);
free_vector_int(n_occur);
free_vector_int(n_enter_pos);
if (alph!=NULL) {
   free_alphabet(alph);
}
free_OptVars(vars);
return 0;
}
コード例 #11
0
int main_XMLizer(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int output_style=TEI;
char output[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char normalization[FILENAME_MAX]="";
char segmentation[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int convLFtoCRLF=1;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_XMLizer,lopts_XMLizer,&index))) {
   switch(val) {
   case 'x': output_style=XML; break;
   case 't': output_style=TEI; break;
   case 'n': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty normalization grammar name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(normalization,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 's': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty segmentation grammar name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(segmentation,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_XMLizer[index].name);
             return USAGE_ERROR_CODE;                         
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case '?': index==-1  ? error("Invalid option -%c\n",options.vars()->optopt) :
                          error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;             
  
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (segmentation[0]=='\0') {
   error("You must specify the segmentation grammar to use\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char input[FILENAME_MAX];
strcpy(input,argv[options.vars()->optind]);
char snt[FILENAME_MAX];
remove_extension(input,snt);
strcat(snt,"_tmp.snt");
char tmp[FILENAME_MAX];
remove_extension(input,tmp);
strcat(tmp,".tmp");
normalize(input,snt,&vec,KEEP_CARRIAGE_RETURN,convLFtoCRLF,normalization,NULL,1);
struct fst2txt_parameters* p=new_fst2txt_parameters();
p->vec=vec;
p->input_text_file=strdup(snt);
if (p->input_text_file ==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}

p->output_text_file_is_temp=1;
p->output_text_file=strdup(tmp);
if (p->output_text_file==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}
p->fst_file=strdup(segmentation);
if (p->fst_file==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}
p->alphabet_file=strdup(alphabet);
if (p->alphabet_file==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}

p->output_policy=MERGE_OUTPUTS;
p->tokenization_policy=WORD_BY_WORD_TOKENIZATION;
p->space_policy=DONT_START_WITH_SPACE;

main_fst2txt(p);

free_fst2txt_parameters(p);

if (output[0]=='\0') {
  remove_extension(input,output);
	strcat(output,".xml");
}

int return_value = xmlize(&vec,snt,output,output_style);

af_remove(snt);
af_remove(tmp);

return return_value;
}
コード例 #12
0
ファイル: TEI2Txt.cpp プロジェクト: UnitexGramLab/unitex-core
int main_TEI2Txt(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

char output[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_TEI2Txt,lopts_TEI2Txt,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_TEI2Txt[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

if(output[0]=='\0') {
  remove_extension(argv[options.vars()->optind],output);
    strcat(output,".txt");
}

int return_value = tei2txt(argv[options.vars()->optind],output,&vec);

return return_value;
}
コード例 #13
0
ファイル: BuildKrMwuDic.cpp プロジェクト: adri87/Q-A
/**
 * The same than main, but no call to setBufferMode.
 */
int main_BuildKrMwuDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int val,index=-1;
char output[FILENAME_MAX]="";
char inflection_dir[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char dic_bin[FILENAME_MAX]="";
char dic_inf[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_BuildKrMwuDic,lopts_BuildKrMwuDic,&index,vars))) {
   switch(val) {
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'd': if (vars->optarg[0]=='\0') {
                fatal_error("Empty inflection directory\n");
             }
             strcpy(inflection_dir,vars->optarg);
             break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'b': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty binary dictionary name\n");
             }
             strcpy(dic_bin,vars->optarg);
             remove_extension(dic_bin,dic_inf);
             strcat(dic_inf,".inf");
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_BuildKrMwuDic[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   }
   index=-1;
}
if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}
if (output[0]=='\0') {
   fatal_error("Output file must be specified\n");
}
if (inflection_dir[0]=='\0') {
   fatal_error("Inflection directory must be specified\n");
}
if (alphabet[0]=='\0') {
   fatal_error("Alphabet file must be specified\n");
}
if (dic_bin[0]=='\0') {
   fatal_error("Binary dictionary must be specified\n");
}

U_FILE* delas=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (delas==NULL) {
   fatal_error("Cannot open %s\n",argv[vars->optind]);
}
U_FILE* grf=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,output,U_WRITE);
if (grf==NULL) {
   fatal_error("Cannot open %s\n",output);
}
Alphabet* alph=load_alphabet(alphabet,1);
if (alph==NULL) {
   fatal_error("Cannot open alphabet file %s\n",alphabet);
}
Korean* korean=new Korean(alph);
MultiFlex_ctx* multiFlex_ctx = (MultiFlex_ctx*)malloc(sizeof(MultiFlex_ctx));
if (multiFlex_ctx==NULL) {
   fatal_alloc_error("main_BuildKrMwuDic");
}
strcpy(multiFlex_ctx->inflection_directory,inflection_dir);
if (init_transducer_tree(multiFlex_ctx)) {
   fatal_error("init_transducer_tree error\n");
}
struct l_morpho_t* pL_MORPHO=init_langage_morph();
if (pL_MORPHO == NULL) {
   fatal_error("init_langage_morph error\n");
}

unsigned char* bin=load_BIN_file(dic_bin);
struct INF_codes* inf=load_INF_file(dic_inf);

create_mwu_dictionary(delas,grf,multiFlex_ctx,korean,pL_MORPHO,encoding_output,
       bom_output,mask_encoding_compatibility_input,bin,inf);

free(bin);
free_INF_codes(inf);
u_fclose(delas);
u_fclose(grf);
free_alphabet(alph);
delete korean;
free_transducer_tree(multiFlex_ctx);
for (int count_free_fst2=0;count_free_fst2<multiFlex_ctx->n_fst2;count_free_fst2++) {
    free_abstract_Fst2(multiFlex_ctx->fst2[count_free_fst2],&(multiFlex_ctx->fst2_free[count_free_fst2]));
    multiFlex_ctx->fst2[count_free_fst2]=NULL;
}
free_language_morpho(pL_MORPHO);
free(multiFlex_ctx);
free_OptVars(vars);
u_printf("Done.\n");
return 0;
}
コード例 #14
0
/**
 * The same than main, but no call to setBufferMode.
 */
int main_BuildKrMwuDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int val,index=-1;
char output[FILENAME_MAX]="";
char inflection_dir[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char dic_bin[FILENAME_MAX]="";
char dic_inf[FILENAME_MAX]="";

// default policy is to compile only out of date graphs
GraphRecompilationPolicy graph_recompilation_policy = ONLY_OUT_OF_DATE;

VersatileEncodingConfig vec=VEC_DEFAULT;

bool only_verify_arguments = false;

UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_BuildKrMwuDic,lopts_BuildKrMwuDic,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("Empty inflection directory\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(inflection_dir,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'b': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty binary dictionary name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(dic_bin,options.vars()->optarg);
             remove_extension(dic_bin,dic_inf);
             strcat(dic_inf,".inf");
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case 'f': graph_recompilation_policy = ALWAYS_RECOMPILE; break;
   case 'n': graph_recompilation_policy = NEVER_RECOMPILE;  break;
   case 't': graph_recompilation_policy = ONLY_OUT_OF_DATE; break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_BuildKrMwuDic[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   }
   index=-1;
}
if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}
if (output[0]=='\0') {
   error("Output file must be specified\n");
   return USAGE_ERROR_CODE;
}
if (inflection_dir[0]=='\0') {
   error("Inflection directory must be specified\n");
   return USAGE_ERROR_CODE;
}
if (alphabet[0]=='\0') {
   error("Alphabet file must be specified\n");
   return USAGE_ERROR_CODE;
}
if (dic_bin[0]=='\0') {
   error("Binary dictionary must be specified\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory 
  return SUCCESS_RETURN_CODE;
}

U_FILE* delas=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (delas==NULL) {
   error("Cannot open %s\n",argv[options.vars()->optind]);
   return DEFAULT_ERROR_CODE;
}

U_FILE* grf=u_fopen(&vec,output,U_WRITE);
if (grf==NULL) {
   error("Cannot open %s\n",output);
   u_fclose(delas);  
   return DEFAULT_ERROR_CODE;
}

Alphabet* alph=load_alphabet(&vec,alphabet,1);
if (alph==NULL) {
   u_fclose(grf);
   u_fclose(delas);
   error("Cannot open alphabet file %s\n",alphabet);
   return DEFAULT_ERROR_CODE;
}
Korean* korean=new Korean(alph);

MultiFlex_ctx* multiFlex_ctx=new_MultiFlex_ctx(inflection_dir,
                                               NULL,
                                               NULL,
                                               &vec,
                                               korean,
                                               NULL,
                                               NULL,
                                               graph_recompilation_policy);

Dictionary* d=new_Dictionary(&vec,dic_bin,dic_inf);

create_mwu_dictionary(delas,grf,multiFlex_ctx,d);

free_Dictionary(d);
u_fclose(delas);
u_fclose(grf);
free_alphabet(alph);
delete korean;
for (int count_free_fst2=0;count_free_fst2<multiFlex_ctx->n_fst2;count_free_fst2++) {
    free_abstract_Fst2(multiFlex_ctx->fst2[count_free_fst2],&(multiFlex_ctx->fst2_free[count_free_fst2]));
    multiFlex_ctx->fst2[count_free_fst2]=NULL;
}
free_MultiFlex_ctx(multiFlex_ctx);
u_printf("Done.\n");
return SUCCESS_RETURN_CODE;
}
コード例 #15
0
int main_ElagComp(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char compilename[FILENAME_MAX]="";
char directory[FILENAME_MAX]="";
char grammar[FILENAME_MAX]="";
char rule_file[FILENAME_MAX]="";
char lang[FILENAME_MAX]="";
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_ElagComp,lopts_ElagComp,&index))) {
   switch(val) {
   case 'l': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty language definition file\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(lang,options.vars()->optarg);
             break;
   case 'r': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty rule file\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(rule_file,options.vars()->optarg);
             get_path(rule_file,directory);
             break;
   case 'g': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty grammar file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(grammar,options.vars()->optarg);
             get_path(grammar,directory);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(compilename,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_ElagComp[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (lang[0]=='\0') {
   error("You must define the language definition file\n");
   return USAGE_ERROR_CODE;
}
if ((rule_file[0]=='\0' && grammar[0]=='\0')
     || (rule_file[0]!='\0' && grammar[0]!='\0')) {
   error("You must define a rule list OR a grammar\n");
   return USAGE_ERROR_CODE;
}
if (options.vars()->optind!=argc) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (rule_file[0]=='\0' && grammar[0]=='\0') {
   error("You must specified a grammar or a rule file name\n");
   return USAGE_ERROR_CODE;
}

if (rule_file[0]!='\0' && grammar[0]!='\0') {
   error("Cannot handle both a rule file and a grammar\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

language_t* language = load_language_definition(&vec,lang);
if (rule_file[0]!='\0') {
   /* If we work with a rule list */
   if (compilename[0]=='\0') {
      int l=(int)strlen(rule_file);
      if (strcmp(rule_file+l-4,".lst")==0) {
         strcpy(compilename,rule_file);
         strcpy(compilename+l-4,".rul");
      } else {
         sprintf(compilename,"%s.rul",rule_file);
      }
   }
   if (compile_elag_rules(rule_file,compilename,&vec,language)==-1) {
      error("An error occurred while compiling %s\n",compilename);
      free_language_t(language);
      return DEFAULT_ERROR_CODE;
   }
   u_printf("\nElag grammars are compiled in %s.\n",compilename);
} else {
   /* If we must compile a single grammar */
   char elg_file[FILENAME_MAX];
   get_extension(grammar,elg_file);
   if (strcmp(elg_file,".fst2")) {
     error("Grammar '%s' should be a .fst2 file\n");
     free_language_t(language);
     return DEFAULT_ERROR_CODE;
   }
   remove_extension(grammar,elg_file);
   strcat(elg_file,".elg");
   if (compile_elag_grammar(grammar,elg_file,&vec,language)==-1) {
     error("An error occured while compiling %s\n",grammar);
     free_language_t(language);
     return DEFAULT_ERROR_CODE;
   }
   u_printf("Elag grammar is compiled into %s.\n",elg_file);
}
free_language_t(language);
return SUCCESS_RETURN_CODE;
}
コード例 #16
0
ファイル: Cassys.cpp プロジェクト: adri87/Q-A
int main_Cassys(int argc,char* const argv[]) {
	if (argc==1) {
		usage();
		return 0;
	}

	char transducer_list_file_name[FILENAME_MAX];
	bool has_transducer_list = false;

	char text_file_name[FILENAME_MAX];
	bool has_text_file_name = false;

	char alphabet_file_name[FILENAME_MAX];
    char transducer_filename_prefix[FILENAME_MAX];
	bool has_alphabet = false;
    char negation_operator[0x20];

	Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
	int bom_output = DEFAULT_BOM_OUTPUT;
	int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
    int must_create_directory = 1;
    int in_place = 0;

    struct transducer_name_and_mode_linked_list* transducer_name_and_mode_linked_list_arg=NULL;

	// decode the command line
	int val;
	int index = 1;
    negation_operator[0]='\0';
    transducer_filename_prefix[0]='\0';
	struct OptVars* vars=new_OptVars();
	while (EOF != (val = getopt_long_TS(argc, argv, optstring_Cassys,
			lopts_Cassys, &index, vars))) {
		switch (val) {
		case 'h': usage(); 
                  free_OptVars(vars);
                  free_transducer_name_and_mode_linked_list(transducer_name_and_mode_linked_list_arg); 
                  return 0;
		case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
		case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
		case 't': {
			if (vars -> optarg[0] == '\0') {
				fatal_error("Command line error : Empty file name argument\n");
			}

			char extension_text_name[FILENAME_MAX];
			get_extension(vars -> optarg, extension_text_name);
			if (strcmp(extension_text_name, ".snt") != 0) {
				fatal_error(
						"Command line error : File name argument %s must be a preprocessed snt file\n",
						vars -> optarg);
			}

			strcpy(text_file_name, vars -> optarg);
			has_text_file_name = true;

			break;
		}
		case 'l': {
			if(vars -> optarg[0] == '\0'){
				fatal_error("Command line error : Empty transducer list argument\n");
			} else {
				strcpy(transducer_list_file_name, vars -> optarg);
				has_transducer_list = true;
			}
			break;
		}
		case 'r': {
			if(vars -> optarg[0] == '\0'){
				fatal_error("Command line error : Empty transducer directory argument\n");
			} else {
				strcpy(transducer_filename_prefix, vars -> optarg);
				has_transducer_list = true;
			}
			break;
		}
		case 's': {
			if(vars -> optarg[0] == '\0'){
				fatal_error("Command line error : Empty transducer filename argument\n");
			} else {
				transducer_name_and_mode_linked_list_arg=add_transducer_linked_list_new_name(transducer_name_and_mode_linked_list_arg,vars -> optarg);
			}
			break;
		}
		case 'm': {
			if(vars -> optarg[0] == '\0'){
				fatal_error("Command line error : Empty transducer mode argument\n");
			} else {
				set_last_transducer_linked_list_mode_by_string(transducer_name_and_mode_linked_list_arg,vars -> optarg);
			}
			break;
		}
		case 'a':{
			if (vars -> optarg[0] == '\0') {
				fatal_error("Command line error : Empty alphabet argument\n");
			} else {
				strcpy(alphabet_file_name, vars -> optarg);
				has_alphabet = true;
			}
			break;
		}
   	    case 'g': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify an argument for negation operator\n");
             }
             if ((strcmp(vars->optarg,"minus")!=0) && (strcmp(vars->optarg,"-")!=0) && 
                 (strcmp(vars->optarg,"tilde")!=0) && (strcmp(vars->optarg,"~")!=0))
             {
                 fatal_error("You must specify a valid argument for negation operator\n");
             }
             strcpy(negation_operator,vars->optarg);
             break;
        case 'i': {
            in_place = 1;
			break;
		}
        case 'd': {
            must_create_directory = 0;
			break;
		}
		default :{
			fatal_error("Unknown option : %c\n",val);
			break;
		}
		}
	}
	index = -1;

	if(has_alphabet == false){
		fatal_error("Command line error : no alphabet provided\nRerun with --help\n");
	}
	if(has_text_file_name == false){
		fatal_error("Command line error : no text file provided\nRerun with --help\n");
	}
	if((has_transducer_list == false) && (transducer_name_and_mode_linked_list_arg == NULL)){
		fatal_error("Command line error : no transducer list provided\nRerun with --help\n");
	}



	// Load the list of transducers from the file transducer list and stores it in a list
	//struct fifo *transducer_list = load_transducer(transducer_list_file_name);
    if ((transducer_name_and_mode_linked_list_arg == NULL) && has_transducer_list)
        transducer_name_and_mode_linked_list_arg = load_transducer_list_file(transducer_list_file_name);
    struct fifo *transducer_list=load_transducer_from_linked_list(transducer_name_and_mode_linked_list_arg,transducer_filename_prefix);

	cascade(text_file_name, in_place, must_create_directory, transducer_list, alphabet_file_name,negation_operator,encoding_output,bom_output,mask_encoding_compatibility_input);
	free_fifo(transducer_list);
    free_OptVars(vars);
    free_transducer_name_and_mode_linked_list(transducer_name_and_mode_linked_list_arg);
	return 0;
}
コード例 #17
0
ファイル: Fst2Txt.cpp プロジェクト: anukat2015/unitex-core
int main_Fst2Txt(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

struct fst2txt_parameters* p=new_fst2txt_parameters();
char in_offsets[FILENAME_MAX]="";
char out_offsets[FILENAME_MAX]="";
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_Fst2Txt,lopts_Fst2Txt,&index))) {
   switch(val) {
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             p->input_text_file=strdup(options.vars()->optarg);
             if (p->input_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text output file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             p->output_text_file=strdup(options.vars()->optarg);
			 p->output_text_file_is_temp=0;
             if (p->output_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;                
             }
             p->alphabet_file=strdup(options.vars()->optarg);
             if (p->alphabet_file==NULL) {
               alloc_error("main_Fst2Txt");
               free_fst2txt_parameters(p);
               return ALLOC_ERROR_CODE;               
             }
             break;
   case 'M': p->output_policy=MERGE_OUTPUTS; break;
   case 'R': p->output_policy=REPLACE_OUTPUTS; break;
   case 'c': p->tokenization_policy=CHAR_BY_CHAR_TOKENIZATION; break;
   case 'w': p->tokenization_policy=WORD_BY_WORD_TOKENIZATION; break;
   case 's': p->space_policy=START_WITH_SPACE; break;
   case 'x': p->space_policy=DONT_START_WITH_SPACE; break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free_fst2txt_parameters(p);
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Fst2Txt[index].name);
             free_fst2txt_parameters(p);
             return USAGE_ERROR_CODE; 
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;                 
             }
             decode_reading_encoding_parameter(&(p->vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE; 
             }
             decode_writing_encoding_parameter(&(p->vec.encoding_output),&(p->vec.bom_output),options.vars()->optarg);
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE; 
             }
             strcpy(in_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE; 
             }
             strcpy(out_offsets,options.vars()->optarg);
             break;
   case 'l': p->convLFtoCRLF=0; break;
   case 'r': p->keepCR = 1; break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_fst2txt_parameters(p);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_fst2txt_parameters(p);
   return USAGE_ERROR_CODE;
}

if (p->input_text_file==NULL) {
   error("You must specify the text file\n");
   free_fst2txt_parameters(p);
   return USAGE_ERROR_CODE;   
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_fst2txt_parameters(p);
  return SUCCESS_RETURN_CODE;
}

if (out_offsets[0]!='\0') {
	/* We deal with offsets only if the program is expected to produce some */
	if (in_offsets[0]!='\0') {
		p->v_in_offsets=load_offsets(&(p->vec),in_offsets);
		if (p->v_in_offsets==NULL) {
			error("Cannot load offset file %s\n",in_offsets);
      free_fst2txt_parameters(p);
      return DEFAULT_ERROR_CODE;      
		}
	} else {
		/* If there is no input offset file, we create an empty offset vector
		 * in order to avoid testing whether the vector is NULL or not */
		p->v_in_offsets=new_vector_offset(1);
	}
	p->f_out_offsets=u_fopen(&(p->vec),out_offsets,U_WRITE);
	if (p->f_out_offsets==NULL) {
		error("Cannot create file %s\n",out_offsets);
    free_fst2txt_parameters(p);
    return DEFAULT_ERROR_CODE;     
	}
}

if (p->output_text_file == NULL) {
	char tmp[FILENAME_MAX];
	remove_extension(p->input_text_file, tmp);
	strcat(tmp, ".tmp");
	p->output_text_file_is_temp=1;
	p->output_text_file = strdup(tmp);
	if (p->output_text_file == NULL) {
		alloc_error("main_Fst2Txt");
		free_fst2txt_parameters(p);
		return ALLOC_ERROR_CODE;
	}
}
p->fst_file=strdup(argv[options.vars()->optind]);
if (p->fst_file==NULL) {
   alloc_error("main_Fst2Txt");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;   
}

int result=main_fst2txt(p);

free_fst2txt_parameters(p);
return result;
}
コード例 #18
0
ファイル: PolyLex.cpp プロジェクト: adri87/Q-A
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int language=-1;
char alphabet[FILENAME_MAX]="";
char dictionary[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_PolyLex,lopts_PolyLex,&index,vars))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'd': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty dictionary file name\n");
             }
             strcpy(dictionary,vars->optarg);
             break;
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'i': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty information file name\n");
             }
             strcpy(info,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

if (dictionary[0]=='\0') {
   fatal_error("You must specify the .bin dictionary to use\n");
}
if (output[0]=='\0') {
   fatal_error("You must specify the output dictionary file name\n");
}
if (language==-1) {
   fatal_error("You must specify the language\n");
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      fatal_error("Cannot load alphabet file %s\n",alphabet);
   }
}
char temp[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   get_path(dictionary,temp);
   strcat(temp,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(temp,mask_encoding_compatibility_input);
}
u_printf("Loading BIN file...\n");
struct BIN_free_info bin_free;
const unsigned char* bin=load_abstract_BIN_file(dictionary,&bin_free);
if (bin==NULL) {
   error("Cannot load bin file %s\n",dictionary);
   free_alphabet(alph);
   free_string_hash(forbiddenWords);
   return 1;
}
strcpy(temp,dictionary);
temp[strlen(dictionary)-3]='\0';
strcat(temp,"inf");
u_printf("Loading INF file...\n");
struct INF_free_info inf_free;
const struct INF_codes* inf=load_abstract_INF_file(temp,&inf_free);
if (inf==NULL) {
   error("Cannot load inf file %s\n",temp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_string_hash(forbiddenWords);
   return 1;
}
char tmp[FILENAME_MAX];
strcpy(tmp,argv[vars->optind]);
strcat(tmp,".tmp");
U_FILE* words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[vars->optind]);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   free_string_hash(forbiddenWords);
   // here we return 0 in order to do not block the preprocessing
   // in the Unitex Java interface, if no dictionary was applied
   // so that there is no "err" file
   return 0;
}
U_FILE* new_unknown_words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   free_string_hash(forbiddenWords);
   return 1;
}

U_FILE* res=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   u_fclose(new_unknown_words);
   free_string_hash(forbiddenWords);
   return 1;
}
U_FILE* debug=NULL;
if (info!=NULL) {
   debug=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
case DUTCH: analyse_dutch_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case GERMAN: analyse_german_compounds(alph,bin,inf,words,res,debug,new_unknown_words); break;
case NORWEGIAN: analyse_norwegian_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case RUSSIAN:
   init_russian(&UTAG);
   analyse_compounds(alph,bin,inf,words,res,debug,new_unknown_words,UTAG);
   break;
}

free_alphabet(alph);
free_abstract_BIN(bin,&bin_free);
free_abstract_INF(inf,&inf_free);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
af_remove(argv[vars->optind]);
af_rename(tmp,argv[vars->optind]);
u_fclose(res);
if (debug!=NULL) {
   u_fclose(debug);
}
free_OptVars(vars);
return 0;
}
コード例 #19
0
int main_SpellCheck(int argc,char* const argv[]) {
if (argc==1) {
    usage();
    return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char mode=0;
char snt[FILENAME_MAX]="";
char txt[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char output_set=0;
char output_op='A';
SpellCheckConfig config;
config.max_errors=1;
config.max_SP_INSERT=1;
config.max_SP_SUPPR=1;
config.max_SP_SWAP=1;
config.max_SP_CHANGE=1;
for (int i=0;i<N_SPSubOp;i++) {
    config.score[i]=default_scores[i];
}
config.min_length1=4;
config.min_length2=6;
config.min_length3=12;
config.input_op='D';
config.keyboard=NULL;
config.allow_uppercase_initial=0;
char foo;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_SpellCheck,lopts_SpellCheck,&index))) {
   switch(val) {
   case 's': {
       strcpy(snt,options.vars()->optarg);
       mode='s';
       break;
   }
   case 'f': {
       strcpy(txt,options.vars()->optarg);
       mode='f';
       break;
   }
   case 'o': {
       if (options.vars()->optarg!=NULL) {
           strcpy(output,options.vars()->optarg);
       }
       output_set=1;
       break;
   }
   case 'I': {
       if (!strcmp(options.vars()->optarg,"D") || !strcmp(options.vars()->optarg,"M") || !strcmp(options.vars()->optarg,"U")) {
           config.input_op=options.vars()->optarg[0];
       } else {
       error("Invalid argument %s for option --input-op: should in [DMU]\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 'O': {
       if (!strcmp(options.vars()->optarg,"O") || !strcmp(options.vars()->optarg,"A")) {
           output_op=options.vars()->optarg[0];
       } else {
           error("Invalid argument %s for option --output-op: should in [OA]\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 1: {
       config.keyboard=get_Keyboard(options.vars()->optarg);
       if (config.keyboard==NULL) {
           error("Invalid argument %s for option --keyboard:\nUse --show-keyboards to see possible values\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 2: {
       print_available_keyboards(U_STDOUT);
       return SUCCESS_RETURN_CODE;
   }
   case 10: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_errors,&foo)) {
           error("Invalid argument %s for --max-errors: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 11: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_INSERT,&foo)) {
           error("Invalid argument %s for --max-insert: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 12: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_SUPPR,&foo)) {
           error("Invalid argument %s for --max-suppr: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 13: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_CHANGE,&foo)) {
           error("Invalid argument %s for --max-change: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 14: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_SWAP,&foo)) {
           error("Invalid argument %s for --max-swap: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 20: {
       int* scores=config.score;
       if (N_SPSubOp!=sscanf(options.vars()->optarg,"%d,%d,%d,%d,%d,%d,%d,%d,%d%c",
            scores,scores+1,scores+2,scores+3,scores+4,scores+5,
            scores+6,scores+7,scores+8,&foo)) {
            error("Invalid argument %s for option --scores. See --help-scores\n",options.vars()->optarg);
        return USAGE_ERROR_CODE;
       }
       break;
   }
   case 21: {
       usage_scores();
       return SUCCESS_RETURN_CODE;
   }
   case 22: {
       if (3!=sscanf(options.vars()->optarg,"%u,%u,%u%c",
            &config.min_length1,&config.min_length2,&config.min_length3,&foo)) {
            error("Invalid argument %s for option --min-lengths\n",options.vars()->optarg);
        return USAGE_ERROR_CODE;
       }
       break;
   }
   case 23: {
       if (!strcmp(options.vars()->optarg,"yes")) {
           config.allow_uppercase_initial=1;
       } else if (!strcmp(options.vars()->optarg,"no")) {
           config.allow_uppercase_initial=0;
       } else {
           error("Invalid argument %s for option --upper-initial\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_SpellCheck[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind==argc) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (mode==0) {
  error("You must use either --snt or --file\n");
  return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

config.n_dics=argc-options.vars()->optind;
config.dics=(Dictionary**)malloc(config.n_dics*sizeof(Dictionary*));
if (config.dics==NULL) {
    alloc_error("main_SpellCheck");
  return ALLOC_ERROR_CODE;
}

for (int i=0;i<config.n_dics;i++) {
    config.dics[i]=new_Dictionary(&vec,argv[i+options.vars()->optind]);
    if (config.dics[i]==NULL) {
        error("Cannot load dictionary %s\n",argv[i+options.vars()->optind]);
    }
}

config.out=U_STDOUT;
config.n_input_lines=0;
config.n_output_lines=0;

if (mode=='s') {
    /* When working with a .snt, we actually want to work on its err file */
    get_snt_path(snt,txt);
    strcat(txt,"err");
    /* the output must be dlf, and we note the number of lines in the existing
     * dlf file, if any */
    get_snt_path(snt,output);
    strcat(output,"dlf.n");
    U_FILE* f=u_fopen(&vec,output,U_READ);
    if (f!=NULL) {
        u_fscanf(f,"%d",&(config.n_output_lines));
        u_fclose(f);
    }
    get_snt_path(snt,output);
    strcat(output,"dlf");
    output_set=1;
    /* and we force the values for -I and -O */
    config.input_op='U';
    output_op='A';
} else {
    /* If mode=='f', we don't have anything to do since we already
     * defined the default output to stdout */
}

if (output_set) {
    if (output_op=='O') {
        config.out=u_fopen(&vec,output,U_WRITE);
    } else {
        config.out=u_fopen(&vec,output,U_APPEND);
    }
    if (config.out==NULL) {
        error("Cannot open output file %s\n",output);
    for (int i=0;i<config.n_dics;i++) {
      free_Dictionary(config.dics[i]);
    }
    free(config.dics);
    return DEFAULT_ERROR_CODE;
    }
}

config.modified_input=NULL;
char modified_input[FILENAME_MAX]="";
if (config.input_op!='D') {
    strcpy(modified_input,txt);
    strcat(modified_input,".tmp");
    config.modified_input=u_fopen(&vec,modified_input,U_WRITE);
    if (config.modified_input==NULL) {
        error("Cannot open tmp file %s\n",modified_input);
    if (config.out!=U_STDOUT) {
      u_fclose(config.out);
    }
    for (int i=0;i<config.n_dics;i++) {
      free_Dictionary(config.dics[i]);
    }
    free(config.dics);
    return DEFAULT_ERROR_CODE;
    }
}

config.in=u_fopen(&vec,txt,U_READ);
if (config.in==NULL) {
    error("Cannot open file %s\n",txt);
  u_fclose(config.modified_input);
  if (config.out!=U_STDOUT) {
    u_fclose(config.out);
  }
  for (int i=0;i<config.n_dics;i++) {
    free_Dictionary(config.dics[i]);
  }
  free(config.dics);
  return DEFAULT_ERROR_CODE;
}

/* We perform spellchecking */
spellcheck(&config);

/* And we clean */

u_fclose(config.in);

if (config.modified_input!=NULL) {
  /* If we used a tmp file because the input file has to be modified,
   * it's now time to actually modify it */
  u_fclose(config.modified_input);
  af_remove(txt);
  af_rename(modified_input,txt);
}

if (config.out!=U_STDOUT) {
  u_fclose(config.out);
}

for (int i=0;i<config.n_dics;i++) {
  free_Dictionary(config.dics[i]);
}
free(config.dics);

/* Finally, we update the dlf.n and err.n files if mode=='s' */
if (mode=='s') {
    get_snt_path(snt,output);
    strcat(output,"err.n");
    U_FILE* f=u_fopen(&vec,output,U_WRITE);
    if (f!=NULL) {
        u_fprintf(f,"%d",config.n_input_lines);
        u_fclose(f);
    }
    if (config.input_op!='D') {
        get_snt_path(snt,output);
        strcat(output,"dlf.n");
        U_FILE* fw=u_fopen(&vec,output,U_WRITE);
        if (fw!=NULL) {
            u_fprintf(fw,"%d",config.n_output_lines);
            u_fclose(fw);
        }
    }
}

return SUCCESS_RETURN_CODE;
}
コード例 #20
0
/**
 * The same than main, but no call to setBufferMode.
 */
int main_KeyWords(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;

char tokens[FILENAME_MAX];
char output[FILENAME_MAX]="";
char alph[FILENAME_MAX]="";
char cdic[FILENAME_MAX]="";
unichar* code=u_strdup("XXX");
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_KeyWords,lopts_KeyWords,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output\n");
                free(code);
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(alph,options.vars()->optarg);
             break;
   case 'f': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty forbidden code\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             free(code);
             code=u_strdup(options.vars()->optarg);
             break;
   case 'c': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty file name\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             strcpy(cdic,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free(code);
             return SUCCESS_RETURN_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free(code);
                return USAGE_ERROR_CODE;                
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_KeyWords[index].name);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free(code);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind==argc || options.vars()->optind==argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free(code);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free(code);
  return SUCCESS_RETURN_CODE;
}

Alphabet* alphabet=NULL;
if (alph[0]!='\0') {
  alphabet=load_alphabet(&vec,alph);
  if (alphabet==NULL) {
    error("Cannot load alphabet file %s\n",alph);
    free(code);
    return DEFAULT_ERROR_CODE;    
  }
}

strcpy(tokens,argv[(options.vars()->optind++)]);
if (output[0]=='\0') {
	get_path(tokens,output);
	strcat(output,"keywords.txt");
}
struct string_hash_ptr* keywords=load_tokens_by_freq(tokens,&vec);
filter_non_letter_keywords(keywords,alphabet);
if (cdic[0]!='\0') {
	load_compound_words(cdic,&vec,keywords);
}

for (;options.vars()->optind!=argc;(options.vars()->optind)++) {
	filter_keywords_with_dic(keywords,argv[options.vars()->optind],&vec,alphabet);
}
merge_case_equivalent_unknown_words(keywords,alphabet);
struct string_hash* forbidden_lemmas=compute_forbidden_lemmas(keywords,code);
remove_keywords_with_forbidden_lemma(keywords,forbidden_lemmas);
free_string_hash(forbidden_lemmas);
vector_ptr* sorted=sort_keywords(keywords);
U_FILE* f_output=u_fopen(&vec,output,U_WRITE);

if (f_output==NULL) {
	error("Cannot write in file %s\n",output);
  free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
  free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
  free_alphabet(alphabet);
  free(code);
  return DEFAULT_ERROR_CODE;  
}

dump_keywords(sorted,f_output);

u_fclose(f_output);
free_vector_ptr(sorted,(void(*)(void*))free_KeyWord_list);
free_string_hash_ptr(keywords,(void(*)(void*))free_KeyWord_list);
free_alphabet(alphabet);
free(code);

return SUCCESS_RETURN_CODE;
}
コード例 #21
0
ファイル: SortTxt.cpp プロジェクト: UnitexGramLab/unitex-core
int main_SortTxt(int argc, char* const argv[]) {
  if (argc == 1) {
    usage();
    return SUCCESS_RETURN_CODE;
  }

  struct sort_infos* inf = new_sort_infos();
  if(!inf) {
    return ALLOC_ERROR_CODE;
  }

  int mode = DEFAULT;
  char line_info[FILENAME_MAX] = "";
  char sort_order[FILENAME_MAX] = "";
  VersatileEncodingConfig vec = { DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT,
      DEFAULT_ENCODING_OUTPUT, DEFAULT_BOM_OUTPUT };
  int val, index = -1;
  bool only_verify_arguments = false;
  UnitexGetOpt options;
  while (EOF != (val = options.parse_long(argc, argv, optstring_SortTxt,
      lopts_SortTxt, &index))) {
    switch (val) {
    case 'n':
      inf->REMOVE_DUPLICATES = 1;
      break;
    case 'd':
      inf->REMOVE_DUPLICATES = 0;
      break;
    case 'r':
      inf->REVERSE = -1;
      break;
    case 'o':
      if (options.vars()->optarg[0] == '\0') {
        error("You must specify a non empty sort order file name\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      strcpy(sort_order, options.vars()->optarg);
      break;
    case 'l':
      if (options.vars()->optarg[0] == '\0') {
        error("You must specify a non empty information file name\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      strcpy(line_info, options.vars()->optarg);
      break;
    case 't':
      mode = THAI;
      break;
    case 'f':
      inf->factorize_inflectional_codes = 1;
      break;
    case 'V': only_verify_arguments = true;
      break;
    case 'h':
      usage();
      free_sort_infos(inf);
      return SUCCESS_RETURN_CODE;
    case 'k':
      if (options.vars()->optarg[0] == '\0') {
        error("Empty input_encoding argument\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      decode_reading_encoding_parameter(
          &(vec.mask_encoding_compatibility_input), options.vars()->optarg);
      break;
    case 'q':
      if (options.vars()->optarg[0] == '\0') {
        error("Empty output_encoding argument\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      decode_writing_encoding_parameter(&(vec.encoding_output),
          &(vec.bom_output), options.vars()->optarg);
      break;
    case ':':
        index == -1 ? error("Missing argument for option -%c\n", options.vars()->optopt) :
                      error("Missing argument for option --%s\n",lopts_SortTxt[index].name);
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
    case '?':
        index == -1 ? error("Invalid option -%c\n", options.vars()->optopt) :
                      error("Invalid option --%s\n", options.vars()->optarg);
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
    }
    index = -1;
  }

  if (options.vars()->optind != argc - 1) {
    error("Invalid arguments: rerun with --help\n");
    free_sort_infos(inf);
    return USAGE_ERROR_CODE;
  }

  if (only_verify_arguments) {
    // freeing all allocated memory
    free_sort_infos(inf);
    return SUCCESS_RETURN_CODE;
  }

  if (sort_order[0] != '\0') {
    read_char_order(&vec, sort_order, inf);
  }

  char new_name[FILENAME_MAX];
  strcpy(new_name, argv[options.vars()->optind]);
  strcat(new_name, ".new");

  inf->f = u_fopen(&vec, argv[options.vars()->optind], U_READ);
  if (inf->f == NULL) {
    error("Cannot open file %s\n", argv[options.vars()->optind]);
    free_sort_infos(inf);
    return DEFAULT_ERROR_CODE;
  }

  inf->f_out = u_fopen(&vec, new_name, U_WRITE);
  if (inf->f_out == NULL) {
    error("Cannot open temporary file %s\n", new_name);
    u_fclose(inf->f);
    free_sort_infos(inf);
    return DEFAULT_ERROR_CODE;
  }

  switch (mode) {
  case DEFAULT:
    sort(inf);
    break;
  case THAI:
    sort_thai(inf);
    break;
  }
  if (line_info[0] != '\0') {
    U_FILE* F = u_fopen(&vec, line_info, U_WRITE);
    if (F == NULL) {
      error("Cannot write %s\n", line_info);
    } else {
      u_fprintf(F, "%d\n", inf->resulting_line_number);
      u_fclose(F);
    }
  }

  u_fclose(inf->f_out);
  u_fclose(inf->f);
  af_remove(argv[options.vars()->optind]);
  af_rename(new_name, argv[options.vars()->optind]);
  free_sort_infos(inf);

  u_printf("Done.\n");
  return SUCCESS_RETURN_CODE;
}
コード例 #22
0
/**
 * This is the customized diff program designed to compare grf files.
 */
int main_SelectOutput(int argc,char* const argv[]) {
if (argc==1) {
	usage();
	return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_SelectOutput,lopts_SelectOutput,&index))) {
   switch(val) {
   case 'V': only_verify_arguments = true;
             break;    
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case 'e':
   case 'o':
	   {
		   enum stdwrite_kind swk = (val == 'o') ? stdwrite_kind_out : stdwrite_kind_err;
		   if (strcmp(options.vars()->optarg,"on") == 0)
		   {
		       SetStdWriteCB(swk,0,NULL,NULL);
		   }
		   else
		   if (strcmp(options.vars()->optarg,"off") == 0)
		   {
		       SetStdWriteCB(swk,1,NULL,NULL);
		   }
		   else 
		   {
		       error("Invalid option --%s, must be 'on' or 'off'\n",options.vars()->optarg);
		       return USAGE_ERROR_CODE;
		   }
		   break;
	   }
   
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_SelectOutput[index].name);
             return USAGE_ERROR_CODE;            
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

  // keep this for further modifications
  if (only_verify_arguments) {
    // freeing all allocated memory
    return SUCCESS_RETURN_CODE;
  }


	return SUCCESS_RETURN_CODE;
}
コード例 #23
0
ファイル: Evamb.cpp プロジェクト: adri87/Q-A
int main_Evamb(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int val,index=-1;
int sentence_number=-1;
const char* outfilename=NULL;
char output_name_buffer[FILENAME_MAX]="";

Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;


struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Evamb,lopts_Evamb,&index,vars))) {
   switch(val) {
   case 's':   {  char c_foo;
                  if (1!=sscanf(vars->optarg,"%d%c",&sentence_number,&c_foo) || sentence_number<=0) {
                    /* foo is used to check that the sentence number is not like "45gjh" */
                    fatal_error("Invalid sentence number: %s\n",vars->optarg);
                  }
                }
                break;
      case 'o': if (vars->optarg[0]=='\0') {
                   fatal_error("You must specify a non empty output file name\n");
                }
                strcpy(output_name_buffer,vars->optarg);
                outfilename=output_name_buffer;
                break;
      case 'k': if (vars->optarg[0]=='\0') {
                  fatal_error("Empty input_encoding argument\n");
                }
                decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
                break;
      case 'q': if (vars->optarg[0]=='\0') {
                  fatal_error("Empty output_encoding argument\n");
                }
                decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
                break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_Evamb[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

u_printf("Loading '%s'...\n",argv[vars->optind]);
Tfst* tfst=open_text_automaton(argv[vars->optind]);
if (tfst==NULL) {
   fatal_error("Unable to load '%s'\n",argv[vars->optind]);
}
if (sentence_number>tfst->N) {
   fatal_error("Invalid sentence number %d: should be in [1;%d]\n",sentence_number,tfst->N);
}
U_FILE* outfile = (outfilename == NULL) ? U_STDOUT : u_fopen_creating_versatile_encoding(encoding_output,bom_output, outfilename, U_WRITE);
if (outfile==NULL) {
    close_text_automaton(tfst);
    free_OptVars(vars);
    error("Cannot create file %s\n",outfilename);
    return 1;
}
if (sentence_number==-1) {
   /* If we have to evaluate the ambiguity rate of the whole automaton */
   double lognp_total=0.0;
   double lmoy_total=0.0;
   double maxlogamb=0.0;
   double minlogamb=(double)INT_MAX;
   /* This is the number of bad automata in the text .fst2 */
   int n_bad_automata=0;
   int maxambno=-1;
   int minambno=-1;
   for (sentence_number=1;sentence_number<=tfst->N;sentence_number++) {
      load_sentence(tfst,sentence_number);
      SingleGraph graph=tfst->automaton;
      if (graph->number_of_states==0 || graph->states[0]->outgoing_transitions==NULL) {
         n_bad_automata++;
         error("Sentence %d: empty automaton\n",sentence_number);
      } else {
         /* log(number of paths) */
         double lognp;
         /* minimum/maximum path length */
         int lmin,lmax;
         /* Approximation of the sentence length */
         double lmoy;
         /* log(ambiguity rate) */
         double logamb;
         lognp=evaluate_ambiguity(graph,&lmin,&lmax);
         lmoy=(double)(lmin+lmax)/2.0;
         logamb=lognp/lmoy;
         if (maxlogamb<logamb) {
            maxlogamb=logamb;
            maxambno=sentence_number;
         }
         if (minlogamb>logamb) {
            minlogamb=logamb;
            minambno=sentence_number;
         }
         u_printf("Sentence %d            \r",sentence_number);
         lognp_total=lognp_total+lognp;
         lmoy_total=lmoy_total+lmoy;
      }
   }
   if (n_bad_automata>=tfst->N) {
      error("No stats to print because no non-empty sentence automata were found.\n");
   } else {
      u_fprintf(outfile,"%d/%d sentence%s taken into account\n",tfst->N-n_bad_automata,tfst->N,(tfst->N>1)?"s":"");
      u_fprintf(outfile,"Average ambiguity rate=%.3f\n",exp(lognp_total/lmoy_total));
      u_fprintf(outfile,"Minimum ambiguity rate=%.3f (sentence %d)\n",exp(minlogamb),minambno);
      u_fprintf(outfile,"Maximum ambiguity rate=%.3f (sentence %d)\n",exp(maxlogamb),maxambno);
   }
} else {
   /* If we have to evaluate the ambiguity rate of a single sentence automaton */
   load_sentence(tfst,sentence_number);
   SingleGraph graph=tfst->automaton;
   if (graph->number_of_states==0) {
      error("Sentence %d: empty automaton\n",sentence_number);
   } else {
      int min;
      int max;
      double lognp=evaluate_ambiguity(graph,&min,&max);
      double lmoy=(double)(min+max)/2.0;
      u_fprintf(outfile,"Sentence %d: ambiguity rate=%.3f\n",sentence_number,exp(lognp/lmoy));
   }
}
if (outfile!=U_STDOUT)
  u_fclose(outfile);

close_text_automaton(tfst);
free_OptVars(vars);
return 0;
}
コード例 #24
0
ファイル: GrfDiff.cpp プロジェクト: UnitexGramLab/unitex-core
/**
 * This is the customized diff program designed to compare grf files.
 */
int main_GrfDiff(int argc,char* const argv[]) {
if (argc==1) {
    usage();
    return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char output[FILENAME_MAX]="";
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_GrfDiff,lopts_GrfDiff,&index))) {
   switch(val) {
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case 1: {
       strcpy(output,options.vars()->optarg);
       break;
   }
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_GrfDiff[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-2) {
  error("Invalid arguments: rerun with --help\n");
  return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

U_FILE* f=U_STDOUT;
if (output[0]!='\0') {
    /* Since the output is supposed to be a diff-like one, there is no point
     * in outputing in a variable encoding, so we force UTF8 */
    f=u_fopen(UTF8,output,U_WRITE);
    if (f==NULL) {
        error("Cannot create file %s\n",output);
    return DEFAULT_ERROR_CODE;
    }
}

Grf* a=load_Grf(&vec,argv[options.vars()->optind]);
if (a==NULL) {
  if (f!=U_STDOUT) {
    u_fclose(f);
  }
  return DEFAULT_ERROR_CODE;
}

Grf* b=load_Grf(&vec,argv[options.vars()->optind+1]);
if (b==NULL) {
    free_Grf(a);
  if (f!=U_STDOUT) {
    u_fclose(f);
  }
  return DEFAULT_ERROR_CODE;
}

GrfDiff* diff=grf_diff(a,b);
free_Grf(a);
free_Grf(b);
print_diff(f,diff);
if (f!=U_STDOUT) {
  u_fclose(f);
}
int different=diff->diff_ops->nbelems!=0;
free_GrfDiff(diff);
return different;
}
コード例 #25
0
ファイル: PolyLex.cpp プロジェクト: UnitexGramLab/unitex-core
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int language=-1;
char alphabet[FILENAME_MAX]="";
char name_bin[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty dictionary file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(name_bin,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'i': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty information file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(info,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (name_bin[0]=='\0') {
   error("You must specify the .bin dictionary to use\n");
   return USAGE_ERROR_CODE;
}

if (output[0]=='\0') {
   error("You must specify the output dictionary file name\n");
   return USAGE_ERROR_CODE;
}

if (language==-1) {
   error("You must specify the language\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return USAGE_ERROR_CODE;
   }
}

char name_inf[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   get_path(name_bin,name_inf);
   strcat(name_inf,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(&vec,name_inf);
   if (forbiddenWords==NULL) {
       /* If there was no file, we don't want to block the process */
       forbiddenWords=new_string_hash(DONT_USE_VALUES);
   }
}

strcpy(name_inf,name_bin);
name_inf[strlen(name_bin)-3]='\0';
strcat(name_inf,"inf");
Dictionary* d=new_Dictionary(&vec,name_bin,name_inf);
if (d==NULL) {
    error("Cannot load dictionary %s\n",name_bin);
    free_string_hash(forbiddenWords);
    free_alphabet(alph);
    return DEFAULT_ERROR_CODE;
}

char tmp[FILENAME_MAX];
strcpy(tmp,argv[options.vars()->optind]);
strcat(tmp,".tmp");

U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[options.vars()->optind]);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   // here we return 0 in order to do not block the preprocessing
   // in the Unitex/GramLab IDE interface, if no dictionary was applied
   // so that there is no "err" file
   return SUCCESS_RETURN_CODE;
}

U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

U_FILE* res=u_fopen(&vec,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   u_fclose(new_unknown_words);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   u_fclose(words);
   return DEFAULT_ERROR_CODE;
}

U_FILE* debug=NULL;
if ((*info)!='\0') {
   debug=u_fopen(&vec,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
  case DUTCH:
    analyse_dutch_unknown_words(alph,
                                d,
                                words,
                                res,
                                debug,
                                new_unknown_words,
                                forbiddenWords);
    break;
  case GERMAN:
    analyse_german_compounds(alph,
                             d,
                             words,
                             res,
                             debug,
                             new_unknown_words);
    break;
  case NORWEGIAN:
    analyse_norwegian_unknown_words(alph,
                                    d,
                                    words,
                                    res,
                                    debug,
                                    new_unknown_words,
                                    forbiddenWords);
    break;
  case RUSSIAN:
     init_russian(&UTAG);
     analyse_compounds(alph,
                       d,
                       words,
                       res,
                       debug,
                       new_unknown_words,
                       UTAG);
     break;
}

free_alphabet(alph);
free_Dictionary(d);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
af_remove(argv[options.vars()->optind]);
af_rename(tmp,argv[options.vars()->optind]);
u_fclose(res);

if (debug!=NULL) {
   u_fclose(debug);
}

return SUCCESS_RETURN_CODE;
}
コード例 #26
0
/**
 * The same than main, but no call to setBufferMode.
 */
int main_Concord(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int val,index=-1;
struct conc_opt* concord_options = new_conc_opt();
char foo;
VersatileEncodingConfig vec=VEC_DEFAULT;
int ret;
char offset_file[FILENAME_MAX]="";
char PRLG[FILENAME_MAX]="";
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Concord,lopts_Concord,&index))) {
   switch(val) {
   case 'f': if (options.vars()->optarg[0]=='\0') {
                error("Empty font name argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             concord_options->fontname=strdup(options.vars()->optarg);
             if (concord_options->fontname==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 's': if (1!=sscanf(options.vars()->optarg,"%d%c",&(concord_options->fontsize),&foo)) {
                /* foo is used to check that the font size is not like "45gjh" */
                error("Invalid font size argument: %s\n",options.vars()->optarg);
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             break;
   case 'l': ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->left_context),&foo,&foo);
             if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->left_context<0) {
                error("Invalid left context argument: %s\n",options.vars()->optarg);
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             if (ret==2) {
                concord_options->left_context_until_eos=1;
             }
             break;
   case 'r': ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->right_context),&foo,&foo);
             if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->right_context<0) {
                error("Invalid right context argument: %s\n",options.vars()->optarg);
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             if (ret==2) {
                concord_options->right_context_until_eos=1;
             }
             break;
   case 'L': concord_options->convLFtoCRLF=0; break;
   case 0: concord_options->sort_mode=TEXT_ORDER; break;
   case 1: concord_options->sort_mode=LEFT_CENTER; break;
   case 2: concord_options->sort_mode=LEFT_RIGHT; break;
   case 3: concord_options->sort_mode=CENTER_LEFT; break;
   case 4: concord_options->sort_mode=CENTER_RIGHT; break;
   case 5: concord_options->sort_mode=RIGHT_LEFT; break;
   case 6: concord_options->sort_mode=RIGHT_CENTER; break;
   case 7: concord_options->result_mode=DIFF_; break;
   case 8: concord_options->only_ambiguous=1; break;
   case 9: {
     strcpy(PRLG,options.vars()->optarg);
     char* pos=strchr(PRLG,',');
     if (pos==NULL || pos==PRLG || *(pos+1)=='\0') {
       error("Invalid argument for option --PRLG: %s\n",options.vars()->optarg);
       free_conc_opt(concord_options);
       return USAGE_ERROR_CODE;
     }
     *pos='\0';
     strcpy(offset_file,pos+1);
     break;
   }
   case 10: concord_options->only_matches=1; break;
   case 11: concord_options->result_mode=LEMMATIZE_; break;
   case 12: concord_options->result_mode=CSV_; break;
   case 'H': concord_options->result_mode=HTML_; break;
   case 't': {
     concord_options->result_mode=TEXT_;
     if (options.vars()->optarg!=NULL) {
       strcpy(concord_options->output,options.vars()->optarg);
     }
     break;
   }
   case 'g': concord_options->result_mode=GLOSSANET_;
             if (options.vars()->optarg[0]=='\0') {
                error("Empty glossanet script argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                   
             }
             concord_options->script=strdup(options.vars()->optarg);
             if (concord_options->script==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;                   
             }
             break;
   case 'p': concord_options->result_mode=SCRIPT_;
             if (options.vars()->optarg[0]=='\0') {
                error("Empty script argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             concord_options->script=strdup(options.vars()->optarg);
             if (concord_options->script==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;                 
             }
             break;
   case 'i': concord_options->result_mode=INDEX_; break;
   case 'u': concord_options->result_mode=UIMA_;
             if (options.vars()->optarg!=NULL) {
                strcpy(offset_file,options.vars()->optarg);
             }
             concord_options->original_file_offsets=1;
             break;
   case 'e': concord_options->result_mode=XML_;
             if (options.vars()->optarg!=NULL) {
                strcpy(offset_file, options.vars()->optarg);
                concord_options->original_file_offsets=1;
             }
             break;
   case 'w': concord_options->result_mode=XML_WITH_HEADER_; 
             if (options.vars()->optarg!=NULL) {
                strcpy(offset_file, options.vars()->optarg);
                concord_options->original_file_offsets = 1;
             }
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_offsets argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->input_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_offsets argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->output_offsets,options.vars()->optarg);
             break;
   case 'A': concord_options->result_mode=AXIS_; break;
   case 'x': concord_options->result_mode=XALIGN_; break;
   case 'm': concord_options->result_mode=MERGE_;
             if (options.vars()->optarg[0]=='\0') {
                error("Empty output file name argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("Empty alphabet argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;                
             }
             concord_options->sort_alphabet=strdup(options.vars()->optarg);
             if (concord_options->sort_alphabet==NULL) {
                alloc_error("main_Concord");
                free_conc_opt(concord_options);
                return ALLOC_ERROR_CODE;            }
             break;
   case 'T': concord_options->thai_mode=1; break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("Empty snt directory argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             strcpy(concord_options->working_directory,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;             
   case 'h': usage();
             free_conc_opt(concord_options); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Concord[index].name);
             free_conc_opt(concord_options);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_conc_opt(concord_options);
             return USAGE_ERROR_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_conc_opt(concord_options);
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_conc_opt(concord_options);
   return USAGE_ERROR_CODE;
}
if (concord_options->fontname==NULL || concord_options->fontsize<=0) {
   if (concord_options->result_mode==HTML_ || concord_options->result_mode==GLOSSANET_) {
      error("The specified output mode is an HTML file: you must specify font parameters\n");
      free_conc_opt(concord_options);
      return USAGE_ERROR_CODE;      
   }
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_conc_opt(concord_options);
  return SUCCESS_RETURN_CODE;
}

U_FILE* concor=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (concor==NULL) {
   error("Cannot open concordance index file %s\n",argv[options.vars()->optind]);
   free_conc_opt(concord_options);
   return DEFAULT_ERROR_CODE;   
}

if (concord_options->working_directory[0]=='\0') {
   get_path(argv[options.vars()->optind],concord_options->working_directory);
}
if (concord_options->only_matches) {
  concord_options->left_context=0;
  concord_options->right_context=0;
}
/* We compute the name of the files associated to the text */
struct snt_files* snt_files=new_snt_files_from_path(concord_options->working_directory);
ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0);
if (text==NULL) {
  error("Cannot open file %s\n",snt_files->text_cod);
  free_snt_files(snt_files);
  u_fclose(concor);
  free_conc_opt(concord_options);
  return DEFAULT_ERROR_CODE;
}
struct text_tokens* tok=load_text_tokens(&vec,snt_files->tokens_txt);
if (tok==NULL) {
  error("Cannot load text token file %s\n",snt_files->tokens_txt);
  af_close_mapfile(text);
  free_snt_files(snt_files);
  u_fclose(concor);
  free_conc_opt(concord_options);
  return DEFAULT_ERROR_CODE;
}

U_FILE* f_enter=u_fopen(BINARY,snt_files->enter_pos,U_READ);
int n_enter_char=0;
int* enter_pos=NULL;
/* New lines are encoded in 'enter.pos' files. Those files will disappear in the future */
if (f_enter==NULL) {
  error("Cannot open file %s\n",snt_files->enter_pos);
}
else {
  long size=get_file_size(f_enter);
  enter_pos=(int*)malloc(size);
  if (enter_pos==NULL) {
    alloc_error("main_Concord");
    u_fclose(f_enter);
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return ALLOC_ERROR_CODE;     
  }
  n_enter_char=(int)fread(enter_pos,sizeof(int),size/sizeof(int),f_enter);
  if (n_enter_char!=(int)(size/sizeof(int))) {
    error("Read error on enter.pos file in main_Concord\n");
    u_fclose(f_enter);
    free(enter_pos);
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;
  }
  u_fclose(f_enter);
}
if (concord_options->result_mode==INDEX_ || concord_options->result_mode==UIMA_ || 
    concord_options->result_mode==XML_ || concord_options->result_mode==XML_WITH_HEADER_ ||
    concord_options->result_mode==AXIS_) {
   /* We force some options for index, uima and axis files */
   concord_options->left_context=0;
   concord_options->right_context=0;
   concord_options->sort_mode=TEXT_ORDER;
}
if (concord_options->only_ambiguous && concord_options->result_mode!=LEMMATIZE_) {
  /* We force text order when displaying only ambiguous outputs */
  concord_options->sort_mode=TEXT_ORDER;
}
if (concord_options->result_mode==HTML_ || concord_options->result_mode==DIFF_ || concord_options->result_mode==LEMMATIZE_) {
  /* We need the offset file if and only if we have to produce
   * an html concordance with positions in .snt file */
  concord_options->snt_offsets=load_snt_offsets(snt_files->snt_offsets_pos);
  if (concord_options->snt_offsets==NULL) {
    error("Cannot read snt offset file %s\n",snt_files->snt_offsets_pos);
    free(enter_pos);    
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;    
  }
}
if (offset_file[0]!='\0') {
  concord_options->uima_offsets=load_uima_offsets(&vec,offset_file);
  if (concord_options->uima_offsets==NULL) {
    error("Cannot read offset file %s\n",offset_file);
    free(enter_pos); 
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;    
  }
}
if (PRLG[0]!='\0') {
  concord_options->PRLG_data=load_PRLG_data(&vec,PRLG);
  if (concord_options->PRLG_data==NULL) {
    error("Cannot read PRLG file %s\n",PRLG);
    free(enter_pos);
    free_text_tokens(tok);
    af_close_mapfile(text);
    free_snt_files(snt_files);
    u_fclose(concor);
    free_conc_opt(concord_options);
    return DEFAULT_ERROR_CODE;    
  }
}
if (concord_options->result_mode==CSV_) {
  concord_options->sort_mode=TEXT_ORDER;
  concord_options->only_matches=1;
}

/* Once we have set all parameters, we call the function that
 * will actually create the concordance. */
create_concordance(&vec,concor,text,tok,n_enter_char,enter_pos,concord_options);

free(enter_pos);
free_text_tokens(tok);
af_close_mapfile(text);
free_snt_files(snt_files);
u_fclose(concor);
free_conc_opt(concord_options);

u_printf("Done.\n");

return SUCCESS_RETURN_CODE;
}
コード例 #27
0
int main_Uncompress(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char output[FILENAME_MAX]="";
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Uncompress,lopts_Uncompress,&index))) {
   switch(val) {
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;             
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Uncompress[index].name);
             return USAGE_ERROR_CODE;                         
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

if (output[0]=='\0') {
   remove_extension(argv[options.vars()->optind],output);
   strcat(output,".dic");
}

U_FILE* f=u_fopen(&vec,output,U_WRITE);
if (f==NULL) {
   error("Cannot open file %s\n",output);
   return DEFAULT_ERROR_CODE;
}

char inf_file[FILENAME_MAX];
remove_extension(argv[options.vars()->optind],inf_file);
strcat(inf_file,".inf");
u_printf("Uncompressing %s...\n",argv[options.vars()->optind]);
Dictionary* d=new_Dictionary(&vec,argv[options.vars()->optind],inf_file);

if (d!=NULL) {
  rebuild_dictionary(d,f);
}

u_fclose(f);
free_Dictionary(d);
u_printf("Done.\n");

return SUCCESS_RETURN_CODE;
}
コード例 #28
0
int main_RebuildTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val, index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
int save_statistics=1;
while (EOF!=(val=options.parse_long(argc,argv,optstring_RebuildTfst,lopts_RebuildTfst,&index))) {
   switch (val) {
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'S': save_statistics = 0;
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h':
      usage();
      return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n", options.vars()->optopt) :
                         error("Missing argument for option --%s\n", lopts_RebuildTfst[index].name);
     return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n", options.vars()->optopt) :
                         error("Invalid option --%s\n", options.vars()->optarg);
     return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char input_tfst[FILENAME_MAX];
char input_tind[FILENAME_MAX];
strcpy(input_tfst,argv[options.vars()->optind]);
remove_extension(input_tfst,input_tind);
strcat(input_tind,".tind");

u_printf("Loading %s...\n",input_tfst);

Tfst* tfst = open_text_automaton(&vec,input_tfst);
if (tfst==NULL) {
   error("Unable to load %s automaton\n",input_tfst);
   return DEFAULT_ERROR_CODE;
}

char basedir[FILENAME_MAX];
get_path(input_tfst,basedir);
char output_tfst[FILENAME_MAX];
sprintf(output_tfst, "%s.new.tfst",input_tfst);
char output_tind[FILENAME_MAX];
sprintf(output_tind, "%s.new.tind",input_tfst);

U_FILE* f_tfst;
if ((f_tfst = u_fopen(&vec,output_tfst,U_WRITE)) == NULL) {
   error("Unable to open %s for writing\n", output_tfst);
   close_text_automaton(tfst);
   return DEFAULT_ERROR_CODE;
}

U_FILE* f_tind;
if ((f_tind = u_fopen(BINARY,output_tind,U_WRITE)) == NULL) {
   u_fclose(f_tfst);
   close_text_automaton(tfst);
   error("Unable to open %s for writing\n", output_tind);
   return DEFAULT_ERROR_CODE;
}
/* We use this hash table to rebuild files tfst_tags_by_freq/alph.txt */
struct hash_table* form_frequencies=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
        (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);

u_fprintf(f_tfst,"%010d\n",tfst->N);
for (int i = 1; i <= tfst->N; i++) {
   if ((i % 100) == 0) {
      u_printf("%d/%d sentences rebuilt...\n", i, tfst->N);
   }
   load_sentence(tfst,i);

   char grfname[FILENAME_MAX];
   sprintf(grfname, "%ssentence%d.grf", basedir, i);
   unichar** tags=NULL;
   int n_tags=-1;
   if (fexists(grfname)) {
      /* If there is a .grf for the current sentence, then we must
       * take it into account */
      if (0==pseudo_main_Grf2Fst2(&vec,grfname,0,NULL,1,1,NULL,NULL,0)) {
         /* We proceed only if the graph compilation was a success */
         char fst2name[FILENAME_MAX];
         sprintf(fst2name, "%ssentence%d.fst2", basedir, i);
         struct FST2_free_info fst2_free;
         Fst2* fst2=load_abstract_fst2(&vec,fst2name,0,&fst2_free);
         af_remove(fst2name);
         free_SingleGraph(tfst->automaton,NULL);
         tfst->automaton=create_copy_of_fst2_subgraph(fst2,1);
         tags=create_tfst_tags(fst2,&n_tags);
         free_abstract_Fst2(fst2,&fst2_free);
      } else {
         error("Error: %s is not a valid sentence automaton\n",grfname);
      }
   }
   save_current_sentence(tfst,f_tfst,f_tind,tags,n_tags,form_frequencies);
   if (tags!=NULL) {
      /* If necessary, we free the tags we created */
      for (int count_tags=0;count_tags<n_tags;count_tags++) {
         free(tags[count_tags]);
      }
      free(tags);
   }
}

u_printf("Text automaton rebuilt.\n");

u_fclose(f_tind);
u_fclose(f_tfst);
close_text_automaton(tfst);

/* Finally, we save statistics */
if (save_statistics) {
    char tfst_tags_by_freq[FILENAME_MAX];
    char tfst_tags_by_alph[FILENAME_MAX];
    strcpy(tfst_tags_by_freq, basedir);
    strcat(tfst_tags_by_freq, "tfst_tags_by_freq.txt");
    strcpy(tfst_tags_by_alph, basedir);
    strcat(tfst_tags_by_alph, "tfst_tags_by_alph.txt");
    U_FILE* f_tfst_tags_by_freq = u_fopen(&vec, tfst_tags_by_freq, U_WRITE);
    if (f_tfst_tags_by_freq == NULL) {
        error("Cannot open %s\n", tfst_tags_by_freq);
    }
    U_FILE* f_tfst_tags_by_alph = u_fopen(&vec, tfst_tags_by_alph, U_WRITE);
    if (f_tfst_tags_by_alph == NULL) {
        error("Cannot open %s\n", tfst_tags_by_alph);
    }
    sort_and_save_tfst_stats(form_frequencies, f_tfst_tags_by_freq, f_tfst_tags_by_alph);
    u_fclose(f_tfst_tags_by_freq);
    u_fclose(f_tfst_tags_by_alph);
}
free_hash_table(form_frequencies);

/* make a backup and replace old automaton with new */
char backup_tfst[FILENAME_MAX];
char backup_tind[FILENAME_MAX];
sprintf(backup_tfst,"%s.bck",input_tfst);
sprintf(backup_tind,"%s.bck",input_tind);
/* We remove the existing backup files, if any */
af_remove(backup_tfst);
af_remove(backup_tind);
af_rename(input_tfst,backup_tfst);
af_rename(input_tind,backup_tind);
af_rename(output_tfst,input_tfst);
af_rename(output_tind,input_tind);
u_printf("\nYou can find a backup of the original files in:\n    %s\nand %s\n",
         backup_tfst,backup_tind);

return SUCCESS_RETURN_CODE;
}
コード例 #29
0
/*
 * This function behaves in the same way that a main one, except that it does
 * not invoke the setBufferMode function.
 */
int main_LocateTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char text[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
int is_korean=0;
int tilde_negation_operator=1;
int selected_negation_operator=0;
int tagging=0;
int single_tags_only=0;
int match_word_boundaries=1;
MatchPolicy match_policy=LONGEST_MATCHES;
OutputPolicy output_policy=IGNORE_OUTPUTS;
AmbiguousOutputPolicy ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS;
VariableErrorPolicy variable_error_policy=IGNORE_VARIABLE_ERRORS;
int search_limit=NO_MATCH_LIMIT;
char foo;
vector_ptr* injected=new_vector_ptr();
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_LocateTfst,lopts_LocateTfst,&index))) {
   switch(val) {
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty .tfst name\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             strcpy(text,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet name\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'K': is_korean=1;
   	   	   	  match_word_boundaries=0;
              break;
   case 'l': search_limit=NO_MATCH_LIMIT; break;
   case 'g': if (options.vars()->optarg[0]=='\0') {
                error("You must specify an argument for negation operator\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;
             }
             selected_negation_operator=1;
             if ((strcmp(options.vars()->optarg,"minus")==0) || (strcmp(options.vars()->optarg,"-")==0)) {
                 tilde_negation_operator=0;
             }
             else
             if ((strcmp(options.vars()->optarg,"tilde")!=0) && (strcmp(options.vars()->optarg,"~")!=0)) {
                 error("You must specify a valid argument for negation operator\n");
                 free_vector_ptr(injected);
                 return USAGE_ERROR_CODE;                 
             }
             break;
   case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&search_limit,&foo) || search_limit<=0) {
                /* foo is used to check that the search limit is not like "45gjh" */
                error("Invalid search limit argument: %s\n",options.vars()->optarg);
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             break;
   case 'S': match_policy=SHORTEST_MATCHES; break;
   case 'L': match_policy=LONGEST_MATCHES; break;
   case 'A': match_policy=ALL_MATCHES; break;
   case 'I': output_policy=IGNORE_OUTPUTS; break;
   case 'M': output_policy=MERGE_OUTPUTS; break;
   case 'R': output_policy=REPLACE_OUTPUTS; break;
   case 'X': variable_error_policy=EXIT_ON_VARIABLE_ERRORS; break;
   case 'Y': variable_error_policy=IGNORE_VARIABLE_ERRORS; break;
   case 'Z': variable_error_policy=BACKTRACK_ON_VARIABLE_ERRORS; break;
   case 'b': ambiguous_output_policy=ALLOW_AMBIGUOUS_OUTPUTS; break;
   case 'z': ambiguous_output_policy=IGNORE_AMBIGUOUS_OUTPUTS; break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case 1: tagging=1; break;
   case 2: single_tags_only=1; break;
   case 3: match_word_boundaries=0; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_vector_ptr(injected);
                return USAGE_ERROR_CODE;                
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'v': {
	   unichar* key=u_strdup(options.vars()->optarg);
	   unichar* value=u_strchr(key,'=');
	   if (value==NULL) {
		   error("Invalid variable injection: %s\n",options.vars()->optarg);
       free_vector_ptr(injected);
       return USAGE_ERROR_CODE;       
	   }
	   (*value)='\0';
	   value++;
	   value=u_strdup(value);
	   vector_ptr_add(injected,key);
	   vector_ptr_add(injected,value);
	   break;
   }
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_LocateTfst[index].name);
             free_vector_ptr(injected);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_vector_ptr(injected);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_vector_ptr(injected);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_vector_ptr(injected);
  return SUCCESS_RETURN_CODE;
}

if (selected_negation_operator==0) {
    get_graph_compatibility_mode_by_file(&vec,&tilde_negation_operator);
}

char grammar[FILENAME_MAX];
char output[FILENAME_MAX];
strcpy(grammar,argv[options.vars()->optind]);
get_path(text,output);
strcat(output,"concord.ind");

int OK=locate_tfst(text,
                   grammar,
                   alphabet,
                   output,
                   &vec,
                   match_policy,
                   output_policy,
                   ambiguous_output_policy,
                   variable_error_policy,
                   search_limit,
                   is_korean,
                   tilde_negation_operator,
                   injected,
                   tagging,
                   single_tags_only,
                   match_word_boundaries);

free_vector_ptr(injected);

return (!OK);
}
コード例 #30
0
ファイル: CheckDic.cpp プロジェクト: adri87/Q-A
int main_CheckDic(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}

int is_a_DELAF=-1;
int strict_unprotected=0;
int skip_path=0;
char alph[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
int space_warnings=1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_CheckDic,lopts_CheckDic,&index,vars))) {
   switch(val) {
   case 'f': is_a_DELAF=1; break;
   case 's': is_a_DELAF=0; break;
   case 'h': usage(); return 0;
   case 'r': strict_unprotected=1; break;
   case 't': strict_unprotected=0; break;
   case 'n': space_warnings=0; break;
   case 'p': skip_path=1; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("Empty alphabet argument\n");
             }
             strcpy(alph,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_CheckDic[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (is_a_DELAF==-1 || vars->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return 1;
}

U_FILE* dic=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (dic==NULL) {
	fatal_error("Cannot open dictionary %s\n",argv[vars->optind]);
}
Alphabet* alphabet0=NULL;
if (alph[0]!='\0') {
   alphabet0=load_alphabet(alph,1);
}
char output_filename[FILENAME_MAX];
get_path(argv[vars->optind],output_filename);
strcat(output_filename,"CHECK_DIC.TXT");
U_FILE* out=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output_filename,U_WRITE);
if (out==NULL) {
	u_fclose(dic);
	fatal_error("Cannot create %s\n",output_filename);
}
u_printf("Checking %s...\n",argv[vars->optind]);
unichar line[CHECKDIC_LINE_SIZE];
int line_number=1;
/*
 * We declare and initialize an array in order to know which
 * letters are used in the dictionary.
 */
int i;
char* alphabet=(char*)malloc(sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
if (alphabet==NULL) {
	fatal_alloc_error("CheckDic's main");
}
memset(alphabet,0,sizeof(char)*MAX_NUMBER_OF_UNICODE_CHARS);
/*
 * We use two structures for the storage of the codes found in the
 * dictionary. Note that 'semantic_codes' is used to store both grammatical and
 * semantic codes.
 */
struct string_hash* semantic_codes=new_string_hash();
struct string_hash* inflectional_codes=new_string_hash();
struct string_hash* simple_lemmas=new_string_hash(DONT_USE_VALUES);
struct string_hash* compound_lemmas=new_string_hash(DONT_USE_VALUES);
int n_simple_entries=0;
int n_compound_entries=0;
/*
 * We read all the lines and check them.
 */
while (EOF!=u_fgets_limit2(line,DIC_LINE_SIZE,dic)) {
   if (line[0]=='\0') {
		/* If we have an empty line, we print a unicode error message
		 * into the output file */
		u_fprintf(out,"Line %d: empty line\n",line_number);
	}
	else if (line[0]=='/') {
		/* If a line starts with '/', it is a commment line, so
		 * we ignore it */
	}
	else {
		/* If we have a line to check, we check it according to the
		 * dictionary type */
		check_DELA_line(line,out,is_a_DELAF,line_number,alphabet,semantic_codes,
		                inflectional_codes,simple_lemmas,compound_lemmas,
		                &n_simple_entries,&n_compound_entries,alphabet0,strict_unprotected);
	}
	/* At regular intervals, we display a message on the standard
	 * output to show that the program is working */
	if (line_number%10000==0) {
		u_printf("%d lines read...\r",line_number);
	}
	line_number++;
}
u_printf("%d lines read\n",line_number-1);
u_fclose(dic);
/*
 * Once we have checked all the lines, we print some informations
 * in the output file.
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"-------------  Stats  -------------\n");
u_fprintf(out,"-----------------------------------\n");
if (skip_path != 0) { 
    char filename_without_path[FILENAME_MAX];
    remove_path(argv[vars->optind],filename_without_path);
    u_fprintf(out,"File: %s\n",filename_without_path);
}
else {
    u_fprintf(out,"File: %s\n",argv[vars->optind]);
}
u_fprintf(out,"Type: %s\n",is_a_DELAF?"DELAF":"DELAS");
u_fprintf(out,"%d line%s read\n",line_number-1,(line_number-1>1)?"s":"");
u_fprintf(out,"%d simple entr%s ",n_simple_entries,(n_simple_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",simple_lemmas->size,(simple_lemmas->size>1)?"s":"");
u_fprintf(out,"%d compound entr%s ",n_compound_entries,(n_compound_entries>1)?"ies":"y");
u_fprintf(out,"for %d distinct lemma%s\n",compound_lemmas->size,(compound_lemmas->size>1)?"s":"");
/**
 * We print the list of the characters that are used, with
 * their unicode numbers shown in hexadecimal. This can be useful
 * to detect different characters that are graphically identical
 * like 'A' (upper of latin 'a' or upper of greek alpha ?).
 */
u_fprintf(out,"-----------------------------------\n");
u_fprintf(out,"----  All chars used in forms  ----\n");
u_fprintf(out,"-----------------------------------\n");
unichar r[4];
unichar r2[7];
r[1]=' ';
r[2]='(';
r[3]='\0';
r2[5]='\n';
r2[6]='\0';
for (i=0;i<MAX_NUMBER_OF_UNICODE_CHARS;i++) {
	if (alphabet[i]) {
      u_fprintf(out,"%C (%04X)\n",i,i);
	}
}
/*
 * Then we print the list of all grammatical and semantic codes used in the
 * dictionary. If a code contains a non ASCII character, a space or a tabulation,
 * we print a warning.
 */
u_fprintf(out,"-------------------------------------------------------------\n");
u_fprintf(out,"----  %3d grammatical/semantic code%s",semantic_codes->size,(semantic_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-------------------------------------------------------------\n");
unichar comment[2000];
for (i=0;i<semantic_codes->size;i++) {
	/* We print the code, followed if necessary by a warning */
	u_fprintf(out,"%S",semantic_codes->value[i]);
	if (warning_on_code(semantic_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
/*
 * Finally, we print the list of inflectional codes,
 * with warnings in the case of non ASCII letters, spaces
 * or tabulations.
 */
u_fprintf(out,"-----------------------------------------------------\n");
u_fprintf(out,"----  %3d inflectional code%s",inflectional_codes->size,(inflectional_codes->size>1)?"s used in dictionary  ----\n":" used in dictionary  -----\n");
u_fprintf(out,"-----------------------------------------------------\n");


for (i=0;i<inflectional_codes->size;i++) {
	u_fprintf(out,"%S",inflectional_codes->value[i]);
	if (warning_on_code(inflectional_codes->value[i],comment,space_warnings)) {
		u_fprintf(out," %S",comment);
	}
	u_fprintf(out,"\n");
}
u_fclose(out);
free_OptVars(vars);
u_printf("Done.\n");
/* Note that we don't free anything since it would only waste time */

free(alphabet);
if (alphabet0!=NULL) {
   free_alphabet(alphabet0);
}
#if (defined(UNITEX_LIBRARY) || defined(UNITEX_RELEASE_MEMORY_AT_EXIT))
/* cleanup for no leak on library */
free_string_hash(semantic_codes);
free_string_hash(inflectional_codes);
free_string_hash(simple_lemmas);
free_string_hash(compound_lemmas);
#endif
return 0;
}