示例#1
0
/**
 * Builds the text concordances 'out1' and 'out2' from the two concordance
 * index files 'in1' and 'in2'.
 *
 * For each index, pseudo_main_Concord() is invoked with the "--diff" option,
 * then the file named "concord.txt" (appended to the path obtained from
 * get_path(in1)) is renamed to the requested output, after removing any
 * previous file with that output name.
 *
 * NOTE(review): the "concord.txt" path is computed from 'in1' only and is
 * reused as is for 'in2'; this presumably relies on both index files being in
 * the same directory -- confirm against callers.
 */
void create_text_concordances(const VersatileEncodingConfig* vec,const char* in1,const char* in2,const char* out1,const char* out2) {
char concord_txt[FILENAME_MAX];

/* First concordance */
pseudo_main_Concord(vec,in1,NULL,0,20,80,NULL,"--diff",NULL,NULL,0,0,0);
get_path(in1,concord_txt);
strcat(concord_txt,"concord.txt");
af_remove(out1);
af_rename(concord_txt,out1);

/* Second concordance: the same "concord.txt" path is used again */
pseudo_main_Concord(vec,in2,NULL,0,20,80,NULL,"--diff",NULL,NULL,0,0,0);
af_remove(out2);
af_rename(concord_txt,out2);
}
/**
 * This function takes a unicode string representing a regular expression and
 * compiles it into a .grf file. It returns 1 in case of success; 0 otherwise.
 */
int reg2grf(const unichar* regexp,const char* name_grf, const VersatileEncodingConfig* vec) {
if (regexp[0]=='\0') {
   error("You must specify a non empty regular expression\n");
   return 0;
}
U_FILE* out=u_fopen(vec,name_grf,U_WRITE);
if (out==NULL) {
   error("Cannot open the output file for the regular expression\n");
   return 0;
}
struct reg2grf_info* INFO=new_reg2grf_info();
/* We create the initial and final states that must have numbers 0 and 1 */
add_state(INFO,u_strdup("<E>"));
add_state(INFO,u_strdup(""));
/* We print the grf header */
u_fprintf(out,"#Unigraph\n");
u_fprintf(out,"SIZE 1313 950\n");
u_fprintf(out,"FONT Times New Roman:  12\n");
u_fprintf(out,"OFONT Times New Roman:B 12\n");
u_fprintf(out,"BCOLOR 16777215\n");
u_fprintf(out,"FCOLOR 0\n");
u_fprintf(out,"ACOLOR 12632256\n");
u_fprintf(out,"SCOLOR 16711680\n");
u_fprintf(out,"CCOLOR 255\n");
u_fprintf(out,"DBOXES y\n");
u_fprintf(out,"DFRAME y\n");
u_fprintf(out,"DDATE y\n");
u_fprintf(out,"DFILE y\n");
u_fprintf(out,"DDIR y\n");
u_fprintf(out,"DRIG n\n");
u_fprintf(out,"DRST n\n");
u_fprintf(out,"FITS 100\n");
u_fprintf(out,"PORIENT L\n");
u_fprintf(out,"#\n");

int input_state;
int output_state;
int result=reg_2_grf(regexp,&input_state,&output_state,INFO);
if (result!=1) {
   u_fclose(out);
   af_remove(name_grf);
   free_reg2grf_info(INFO);
   if (result==0) {
      error("Syntax error in regular expression\n");
   }
   return 0;
}
/* If the compilation has successed, we must link the resulting automaton piece
 * to the grf's initial and final states */
add_transition(0,input_state,INFO);
add_transition(output_state,1,INFO);
save_states(out,INFO);
free_reg2grf_info(INFO);
u_fclose(out);
return 1;
}
示例#3
0
static void remove_file_in_path(char* path, const char* filename, int mandatory)
{
	if (!path)
		return;
	char * end_path = path + strlen(path);
	strcpy(end_path, filename);
	if (mandatory || fexists(path))
		af_remove(path);
	*end_path = '\0';
}
示例#4
0
/**
 * Applies the grammar p->fst_file to the text p->text_file.
 *
 * The text is read from p->text_file and the result is written to
 * p->temp_file; at the end, p->text_file is removed and p->temp_file is
 * renamed to take its place.
 *
 * The resources opened or loaded here (files, text buffer, grammar and
 * optional alphabet) are stored in the corresponding fields of 'p'.
 *
 * Returns 0 on success, 1 if a file cannot be opened or if the grammar or
 * the alphabet cannot be loaded.
 */
int main_fst2txt(struct fst2txt_parameters* p) {
    p->f_input=u_fopen_existing_versatile_encoding(p->mask_encoding_compatibility_input,p->text_file,U_READ);
    if (p->f_input==NULL) {
        error("Cannot open file %s\n",p->text_file);
        return 1;
    }

    p->text_buffer=new_buffer_for_file(UNICHAR_BUFFER,p->f_input,CAPACITY_LIMIT);
    p->buffer=p->text_buffer->unichar_buffer;

    p->f_output=u_fopen_creating_versatile_encoding(p->encoding_output,p->bom_output,p->temp_file,U_WRITE);
    if (p->f_output==NULL) {
        error("Cannot open temporary file %s\n",p->temp_file);
        u_fclose(p->f_input);
        return 1;
    }

    p->fst2=load_abstract_fst2(p->fst_file,1,NULL);
    if (p->fst2==NULL) {
        error("Cannot load grammar %s\n",p->fst_file);
        u_fclose(p->f_input);
        u_fclose(p->f_output);
        return 1;
    }

    /* The alphabet is optional */
    if (p->alphabet_file!=NULL && p->alphabet_file[0]!='\0') {
       p->alphabet=load_alphabet(p->alphabet_file);
       if (p->alphabet==NULL) {
          error("Cannot load alphabet file %s\n",p->alphabet_file);
          u_fclose(p->f_input);
          u_fclose(p->f_output);
          free_abstract_Fst2(p->fst2,NULL);
          /* The grammar has just been released: we must not leave a dangling
           * pointer in 'p', since on success the grammar stays attached to
           * 'p' and whoever releases 'p' would otherwise free it twice */
          p->fst2=NULL;
          return 1;
       }
    }

    u_printf("Applying %s in %s mode...\n",p->fst_file,(p->output_policy==MERGE_OUTPUTS)?"merge":"replace");
    build_state_token_trees(p);
    parse_text(p);
    u_fclose(p->f_input);
    u_fclose(p->f_output);
    /* The processed text replaces the original one */
    af_remove(p->text_file);
    af_rename(p->temp_file,p->text_file);
    u_printf("Done.\n");
    return 0;
}
示例#5
0
/**
 * Entry point of the PolyLex program.
 *
 * Parses the command line, loads the optional alphabet and the .bin/.inf
 * dictionary, then analyses the words of the word list file given as the
 * last argument with the analyser of the selected language (Dutch, German,
 * Norwegian or Russian). Results are appended to the output dictionary
 * (-o), the words written to the temporary "<word list>.tmp" file replace
 * the original word list file, and an optional information file (-i)
 * is written.
 *
 * Returns SUCCESS_RETURN_CODE on success (and also when the word list file
 * cannot be opened, see below), USAGE_ERROR_CODE on invalid arguments and
 * DEFAULT_ERROR_CODE when a dictionary or a working file cannot be opened.
 */
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int language=-1;
char alphabet[FILENAME_MAX]="";
char name_bin[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_PolyLex,lopts_PolyLex,&index))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty dictionary file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(name_bin,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'i': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty information file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(info,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

/* Exactly one non-option argument is expected: the word list file */
if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (name_bin[0]=='\0') {
   error("You must specify the .bin dictionary to use\n");
   return USAGE_ERROR_CODE;
}

if (output[0]=='\0') {
   error("You must specify the output dictionary file name\n");
   return USAGE_ERROR_CODE;
}

if (language==-1) {
   error("You must specify the language\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // nothing has been allocated yet, so there is nothing to free
  return SUCCESS_RETURN_CODE;
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return USAGE_ERROR_CODE;
   }
}

char name_inf[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   /* For these languages, a "ForbiddenWords.txt" file is looked for at the
    * path obtained from the .bin dictionary name */
   get_path(name_bin,name_inf);
   strcat(name_inf,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(&vec,name_inf);
   if (forbiddenWords==NULL) {
       /* If there was no file, we don't want to block the process */
       forbiddenWords=new_string_hash(DONT_USE_VALUES);
   }
}

/* The .inf name is built by replacing the last 3 characters of the .bin
 * name by "inf". We must make sure that there are at least 3 characters,
 * otherwise the index below would wrap around and write out of bounds */
size_t bin_length=strlen(name_bin);
if (bin_length<3) {
   error("Invalid .bin dictionary name %s\n",name_bin);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return USAGE_ERROR_CODE;
}
strcpy(name_inf,name_bin);
name_inf[bin_length-3]='\0';
strcat(name_inf,"inf");
Dictionary* d=new_Dictionary(&vec,name_bin,name_inf);
if (d==NULL) {
    error("Cannot load dictionary %s\n",name_bin);
    free_string_hash(forbiddenWords);
    free_alphabet(alph);
    return DEFAULT_ERROR_CODE;
}

/* The new word list is written to a temporary file that will replace
 * the original word list file at the end */
char tmp[FILENAME_MAX];
strcpy(tmp,argv[options.vars()->optind]);
strcat(tmp,".tmp");

U_FILE* words=u_fopen(&vec,argv[options.vars()->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[options.vars()->optind]);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   // here we return 0 in order to do not block the preprocessing
   // in the Unitex/GramLab IDE interface, if no dictionary was applied
   // so that there is no "err" file
   return SUCCESS_RETURN_CODE;
}

U_FILE* new_unknown_words=u_fopen(&vec,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

U_FILE* res=u_fopen(&vec,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   u_fclose(new_unknown_words);
   /* 'words' must be closed exactly once */
   u_fclose(words);
   free_Dictionary(d);
   free_string_hash(forbiddenWords);
   free_alphabet(alph);
   return DEFAULT_ERROR_CODE;
}

/* The information file is optional: if it cannot be opened, we just
 * report the problem and go on without it */
U_FILE* debug=NULL;
if ((*info)!='\0') {
   debug=u_fopen(&vec,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
  case DUTCH:
    analyse_dutch_unknown_words(alph,
                                d,
                                words,
                                res,
                                debug,
                                new_unknown_words,
                                forbiddenWords);
    break;
  case GERMAN:
    analyse_german_compounds(alph,
                             d,
                             words,
                             res,
                             debug,
                             new_unknown_words);
    break;
  case NORWEGIAN:
    analyse_norwegian_unknown_words(alph,
                                    d,
                                    words,
                                    res,
                                    debug,
                                    new_unknown_words,
                                    forbiddenWords);
    break;
  case RUSSIAN:
     init_russian(&UTAG);
     analyse_compounds(alph,
                       d,
                       words,
                       res,
                       debug,
                       new_unknown_words,
                       UTAG);
     break;
}

free_alphabet(alph);
free_Dictionary(d);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
/* The temporary word list replaces the original one */
af_remove(argv[options.vars()->optind]);
af_rename(tmp,argv[options.vars()->optind]);
u_fclose(res);

if (debug!=NULL) {
   u_fclose(debug);
}

return SUCCESS_RETURN_CODE;
}
示例#6
0
int main_Flatten(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int RTN=1;
int depth=10;
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char foo;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_Flatten,lopts_Flatten,&index))) {
   switch(val) {
   case 'f': RTN=0; break;
   case 'r': RTN=1; break;
   case 'd': if (1!=sscanf(options.vars()->optarg,"%d%c",&depth,&foo) || depth<=0) {
                /* foo is used to check that the depth is not like "45gjh" */
                error("Invalid depth argument: %s\n",options.vars()->optarg);
                return USAGE_ERROR_CODE;
             }
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'h': usage(); return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Flatten[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

u_printf("Loading %s...\n",argv[options.vars()->optind]);
struct FST2_free_info fst2_free;
Fst2* origin=load_abstract_fst2(&vec,argv[options.vars()->optind],1,&fst2_free);
if (origin==NULL) {
   error("Cannot load %s\n",argv[options.vars()->optind]);
   return DEFAULT_ERROR_CODE;
}

char temp[FILENAME_MAX];
strcpy(temp,argv[options.vars()->optind]);
strcat(temp,".tmp.fst2");

switch (flatten_fst2(origin,depth,temp,&vec,RTN)) {
   case EQUIVALENT_FST:
      u_printf("The resulting grammar is an equivalent finite-state transducer.\n");
      break;
   case APPROXIMATIVE_FST:
      u_printf("The resulting grammar is a finite-state approximation.\n");
      break;
   case EQUIVALENT_RTN:
      u_printf("The resulting grammar is an equivalent FST2 (RTN).\n");
      break;
   default: 
      error("Internal state error in Flatten's main\n");
      free_abstract_Fst2(origin,&fst2_free);
      return DEFAULT_ERROR_CODE;
}

free_abstract_Fst2(origin,&fst2_free);
af_remove(argv[options.vars()->optind]);
af_rename(temp,argv[options.vars()->optind]);

return SUCCESS_RETURN_CODE;
}
示例#7
0
int main_SortTxt(int argc, char* const argv[]) {
  if (argc == 1) {
    usage();
    return SUCCESS_RETURN_CODE;
  }

  struct sort_infos* inf = new_sort_infos();
  if(!inf) {
    return ALLOC_ERROR_CODE;
  }

  int mode = DEFAULT;
  char line_info[FILENAME_MAX] = "";
  char sort_order[FILENAME_MAX] = "";
  VersatileEncodingConfig vec = { DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT,
      DEFAULT_ENCODING_OUTPUT, DEFAULT_BOM_OUTPUT };
  int val, index = -1;
  bool only_verify_arguments = false;
  UnitexGetOpt options;
  while (EOF != (val = options.parse_long(argc, argv, optstring_SortTxt,
      lopts_SortTxt, &index))) {
    switch (val) {
    case 'n':
      inf->REMOVE_DUPLICATES = 1;
      break;
    case 'd':
      inf->REMOVE_DUPLICATES = 0;
      break;
    case 'r':
      inf->REVERSE = -1;
      break;
    case 'o':
      if (options.vars()->optarg[0] == '\0') {
        error("You must specify a non empty sort order file name\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      strcpy(sort_order, options.vars()->optarg);
      break;
    case 'l':
      if (options.vars()->optarg[0] == '\0') {
        error("You must specify a non empty information file name\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      strcpy(line_info, options.vars()->optarg);
      break;
    case 't':
      mode = THAI;
      break;
    case 'f':
      inf->factorize_inflectional_codes = 1;
      break;
    case 'V': only_verify_arguments = true;
      break;
    case 'h':
      usage();
      free_sort_infos(inf);
      return SUCCESS_RETURN_CODE;
    case 'k':
      if (options.vars()->optarg[0] == '\0') {
        error("Empty input_encoding argument\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      decode_reading_encoding_parameter(
          &(vec.mask_encoding_compatibility_input), options.vars()->optarg);
      break;
    case 'q':
      if (options.vars()->optarg[0] == '\0') {
        error("Empty output_encoding argument\n");
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
      }
      decode_writing_encoding_parameter(&(vec.encoding_output),
          &(vec.bom_output), options.vars()->optarg);
      break;
    case ':':
        index == -1 ? error("Missing argument for option -%c\n", options.vars()->optopt) :
                      error("Missing argument for option --%s\n",lopts_SortTxt[index].name);
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
    case '?':
        index == -1 ? error("Invalid option -%c\n", options.vars()->optopt) :
                      error("Invalid option --%s\n", options.vars()->optarg);
        free_sort_infos(inf);
        return USAGE_ERROR_CODE;
    }
    index = -1;
  }

  if (options.vars()->optind != argc - 1) {
    error("Invalid arguments: rerun with --help\n");
    free_sort_infos(inf);
    return USAGE_ERROR_CODE;
  }

  if (only_verify_arguments) {
    // freeing all allocated memory
    free_sort_infos(inf);
    return SUCCESS_RETURN_CODE;
  }

  if (sort_order[0] != '\0') {
    read_char_order(&vec, sort_order, inf);
  }

  char new_name[FILENAME_MAX];
  strcpy(new_name, argv[options.vars()->optind]);
  strcat(new_name, ".new");

  inf->f = u_fopen(&vec, argv[options.vars()->optind], U_READ);
  if (inf->f == NULL) {
    error("Cannot open file %s\n", argv[options.vars()->optind]);
    free_sort_infos(inf);
    return DEFAULT_ERROR_CODE;
  }

  inf->f_out = u_fopen(&vec, new_name, U_WRITE);
  if (inf->f_out == NULL) {
    error("Cannot open temporary file %s\n", new_name);
    u_fclose(inf->f);
    free_sort_infos(inf);
    return DEFAULT_ERROR_CODE;
  }

  switch (mode) {
  case DEFAULT:
    sort(inf);
    break;
  case THAI:
    sort_thai(inf);
    break;
  }
  if (line_info[0] != '\0') {
    U_FILE* F = u_fopen(&vec, line_info, U_WRITE);
    if (F == NULL) {
      error("Cannot write %s\n", line_info);
    } else {
      u_fprintf(F, "%d\n", inf->resulting_line_number);
      u_fclose(F);
    }
  }

  u_fclose(inf->f_out);
  u_fclose(inf->f);
  af_remove(argv[options.vars()->optind]);
  af_rename(new_name, argv[options.vars()->optind]);
  free_sort_infos(inf);

  u_printf("Done.\n");
  return SUCCESS_RETURN_CODE;
}
示例#8
0
int main_SpellCheck(int argc,char* const argv[]) {
if (argc==1) {
    usage();
    return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
char mode=0;
char snt[FILENAME_MAX]="";
char txt[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char output_set=0;
char output_op='A';
SpellCheckConfig config;
config.max_errors=1;
config.max_SP_INSERT=1;
config.max_SP_SUPPR=1;
config.max_SP_SWAP=1;
config.max_SP_CHANGE=1;
for (int i=0;i<N_SPSubOp;i++) {
    config.score[i]=default_scores[i];
}
config.min_length1=4;
config.min_length2=6;
config.min_length3=12;
config.input_op='D';
config.keyboard=NULL;
config.allow_uppercase_initial=0;
char foo;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_SpellCheck,lopts_SpellCheck,&index))) {
   switch(val) {
   case 's': {
       strcpy(snt,options.vars()->optarg);
       mode='s';
       break;
   }
   case 'f': {
       strcpy(txt,options.vars()->optarg);
       mode='f';
       break;
   }
   case 'o': {
       if (options.vars()->optarg!=NULL) {
           strcpy(output,options.vars()->optarg);
       }
       output_set=1;
       break;
   }
   case 'I': {
       if (!strcmp(options.vars()->optarg,"D") || !strcmp(options.vars()->optarg,"M") || !strcmp(options.vars()->optarg,"U")) {
           config.input_op=options.vars()->optarg[0];
       } else {
       error("Invalid argument %s for option --input-op: should in [DMU]\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 'O': {
       if (!strcmp(options.vars()->optarg,"O") || !strcmp(options.vars()->optarg,"A")) {
           output_op=options.vars()->optarg[0];
       } else {
           error("Invalid argument %s for option --output-op: should in [OA]\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 1: {
       config.keyboard=get_Keyboard(options.vars()->optarg);
       if (config.keyboard==NULL) {
           error("Invalid argument %s for option --keyboard:\nUse --show-keyboards to see possible values\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 2: {
       print_available_keyboards(U_STDOUT);
       return SUCCESS_RETURN_CODE;
   }
   case 10: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_errors,&foo)) {
           error("Invalid argument %s for --max-errors: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 11: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_INSERT,&foo)) {
           error("Invalid argument %s for --max-insert: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 12: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_SUPPR,&foo)) {
           error("Invalid argument %s for --max-suppr: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 13: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_CHANGE,&foo)) {
           error("Invalid argument %s for --max-change: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 14: {
       if (1!=sscanf(options.vars()->optarg,"%u%c",&config.max_SP_SWAP,&foo)) {
           error("Invalid argument %s for --max-swap: should be an integer >=0\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 20: {
       int* scores=config.score;
       if (N_SPSubOp!=sscanf(options.vars()->optarg,"%d,%d,%d,%d,%d,%d,%d,%d,%d%c",
            scores,scores+1,scores+2,scores+3,scores+4,scores+5,
            scores+6,scores+7,scores+8,&foo)) {
            error("Invalid argument %s for option --scores. See --help-scores\n",options.vars()->optarg);
        return USAGE_ERROR_CODE;
       }
       break;
   }
   case 21: {
       usage_scores();
       return SUCCESS_RETURN_CODE;
   }
   case 22: {
       if (3!=sscanf(options.vars()->optarg,"%u,%u,%u%c",
            &config.min_length1,&config.min_length2,&config.min_length3,&foo)) {
            error("Invalid argument %s for option --min-lengths\n",options.vars()->optarg);
        return USAGE_ERROR_CODE;
       }
       break;
   }
   case 23: {
       if (!strcmp(options.vars()->optarg,"yes")) {
           config.allow_uppercase_initial=1;
       } else if (!strcmp(options.vars()->optarg,"no")) {
           config.allow_uppercase_initial=0;
       } else {
           error("Invalid argument %s for option --upper-initial\n",options.vars()->optarg);
       return USAGE_ERROR_CODE;
       }
       break;
   }
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_SpellCheck[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind==argc) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (mode==0) {
  error("You must use either --snt or --file\n");
  return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

config.n_dics=argc-options.vars()->optind;
config.dics=(Dictionary**)malloc(config.n_dics*sizeof(Dictionary*));
if (config.dics==NULL) {
    alloc_error("main_SpellCheck");
  return ALLOC_ERROR_CODE;
}

for (int i=0;i<config.n_dics;i++) {
    config.dics[i]=new_Dictionary(&vec,argv[i+options.vars()->optind]);
    if (config.dics[i]==NULL) {
        error("Cannot load dictionary %s\n",argv[i+options.vars()->optind]);
    }
}

config.out=U_STDOUT;
config.n_input_lines=0;
config.n_output_lines=0;

if (mode=='s') {
    /* When working with a .snt, we actually want to work on its err file */
    get_snt_path(snt,txt);
    strcat(txt,"err");
    /* the output must be dlf, and we note the number of lines in the existing
     * dlf file, if any */
    get_snt_path(snt,output);
    strcat(output,"dlf.n");
    U_FILE* f=u_fopen(&vec,output,U_READ);
    if (f!=NULL) {
        u_fscanf(f,"%d",&(config.n_output_lines));
        u_fclose(f);
    }
    get_snt_path(snt,output);
    strcat(output,"dlf");
    output_set=1;
    /* and we force the values for -I and -O */
    config.input_op='U';
    output_op='A';
} else {
    /* If mode=='f', we don't have anything to do since we already
     * defined the default output to stdout */
}

if (output_set) {
    if (output_op=='O') {
        config.out=u_fopen(&vec,output,U_WRITE);
    } else {
        config.out=u_fopen(&vec,output,U_APPEND);
    }
    if (config.out==NULL) {
        error("Cannot open output file %s\n",output);
    for (int i=0;i<config.n_dics;i++) {
      free_Dictionary(config.dics[i]);
    }
    free(config.dics);
    return DEFAULT_ERROR_CODE;
    }
}

config.modified_input=NULL;
char modified_input[FILENAME_MAX]="";
if (config.input_op!='D') {
    strcpy(modified_input,txt);
    strcat(modified_input,".tmp");
    config.modified_input=u_fopen(&vec,modified_input,U_WRITE);
    if (config.modified_input==NULL) {
        error("Cannot open tmp file %s\n",modified_input);
    if (config.out!=U_STDOUT) {
      u_fclose(config.out);
    }
    for (int i=0;i<config.n_dics;i++) {
      free_Dictionary(config.dics[i]);
    }
    free(config.dics);
    return DEFAULT_ERROR_CODE;
    }
}

config.in=u_fopen(&vec,txt,U_READ);
if (config.in==NULL) {
    error("Cannot open file %s\n",txt);
  u_fclose(config.modified_input);
  if (config.out!=U_STDOUT) {
    u_fclose(config.out);
  }
  for (int i=0;i<config.n_dics;i++) {
    free_Dictionary(config.dics[i]);
  }
  free(config.dics);
  return DEFAULT_ERROR_CODE;
}

/* We perform spellchecking */
spellcheck(&config);

/* And we clean */

u_fclose(config.in);

if (config.modified_input!=NULL) {
  /* If we used a tmp file because the input file has to be modified,
   * it's now time to actually modify it */
  u_fclose(config.modified_input);
  af_remove(txt);
  af_rename(modified_input,txt);
}

if (config.out!=U_STDOUT) {
  u_fclose(config.out);
}

for (int i=0;i<config.n_dics;i++) {
  free_Dictionary(config.dics[i]);
}
free(config.dics);

/* Finally, we update the dlf.n and err.n files if mode=='s' */
if (mode=='s') {
    get_snt_path(snt,output);
    strcat(output,"err.n");
    U_FILE* f=u_fopen(&vec,output,U_WRITE);
    if (f!=NULL) {
        u_fprintf(f,"%d",config.n_input_lines);
        u_fclose(f);
    }
    if (config.input_op!='D') {
        get_snt_path(snt,output);
        strcat(output,"dlf.n");
        U_FILE* fw=u_fopen(&vec,output,U_WRITE);
        if (fw!=NULL) {
            u_fprintf(fw,"%d",config.n_output_lines);
            u_fclose(fw);
        }
    }
}

return SUCCESS_RETURN_CODE;
}
示例#9
0
int main_RebuildTfst(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

VersatileEncodingConfig vec=VEC_DEFAULT;
int val, index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
int save_statistics=1;
while (EOF!=(val=options.parse_long(argc,argv,optstring_RebuildTfst,lopts_RebuildTfst,&index))) {
   switch (val) {
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case 'S': save_statistics = 0;
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h':
      usage();
      return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n", options.vars()->optopt) :
                         error("Missing argument for option --%s\n", lopts_RebuildTfst[index].name);
     return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n", options.vars()->optopt) :
                         error("Invalid option --%s\n", options.vars()->optarg);
     return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char input_tfst[FILENAME_MAX];
char input_tind[FILENAME_MAX];
strcpy(input_tfst,argv[options.vars()->optind]);
remove_extension(input_tfst,input_tind);
strcat(input_tind,".tind");

u_printf("Loading %s...\n",input_tfst);

Tfst* tfst = open_text_automaton(&vec,input_tfst);
if (tfst==NULL) {
   error("Unable to load %s automaton\n",input_tfst);
   return DEFAULT_ERROR_CODE;
}

char basedir[FILENAME_MAX];
get_path(input_tfst,basedir);
char output_tfst[FILENAME_MAX];
sprintf(output_tfst, "%s.new.tfst",input_tfst);
char output_tind[FILENAME_MAX];
sprintf(output_tind, "%s.new.tind",input_tfst);

U_FILE* f_tfst;
if ((f_tfst = u_fopen(&vec,output_tfst,U_WRITE)) == NULL) {
   error("Unable to open %s for writing\n", output_tfst);
   close_text_automaton(tfst);
   return DEFAULT_ERROR_CODE;
}

U_FILE* f_tind;
if ((f_tind = u_fopen(BINARY,output_tind,U_WRITE)) == NULL) {
   u_fclose(f_tfst);
   close_text_automaton(tfst);
   error("Unable to open %s for writing\n", output_tind);
   return DEFAULT_ERROR_CODE;
}
/* We use this hash table to rebuild files tfst_tags_by_freq/alph.txt */
struct hash_table* form_frequencies=new_hash_table((HASH_FUNCTION)hash_unichar,(EQUAL_FUNCTION)u_equal,
        (FREE_FUNCTION)free,NULL,(KEYCOPY_FUNCTION)keycopy);

u_fprintf(f_tfst,"%010d\n",tfst->N);
for (int i = 1; i <= tfst->N; i++) {
   if ((i % 100) == 0) {
      u_printf("%d/%d sentences rebuilt...\n", i, tfst->N);
   }
   load_sentence(tfst,i);

   char grfname[FILENAME_MAX];
   sprintf(grfname, "%ssentence%d.grf", basedir, i);
   unichar** tags=NULL;
   int n_tags=-1;
   if (fexists(grfname)) {
      /* If there is a .grf for the current sentence, then we must
       * take it into account */
      if (0==pseudo_main_Grf2Fst2(&vec,grfname,0,NULL,1,1,NULL,NULL,0)) {
         /* We proceed only if the graph compilation was a success */
         char fst2name[FILENAME_MAX];
         sprintf(fst2name, "%ssentence%d.fst2", basedir, i);
         struct FST2_free_info fst2_free;
         Fst2* fst2=load_abstract_fst2(&vec,fst2name,0,&fst2_free);
         af_remove(fst2name);
         free_SingleGraph(tfst->automaton,NULL);
         tfst->automaton=create_copy_of_fst2_subgraph(fst2,1);
         tags=create_tfst_tags(fst2,&n_tags);
         free_abstract_Fst2(fst2,&fst2_free);
      } else {
         error("Error: %s is not a valid sentence automaton\n",grfname);
      }
   }
   save_current_sentence(tfst,f_tfst,f_tind,tags,n_tags,form_frequencies);
   if (tags!=NULL) {
      /* If necessary, we free the tags we created */
      for (int count_tags=0;count_tags<n_tags;count_tags++) {
         free(tags[count_tags]);
      }
      free(tags);
   }
}

u_printf("Text automaton rebuilt.\n");

u_fclose(f_tind);
u_fclose(f_tfst);
close_text_automaton(tfst);

/* Finally, we save statistics */
if (save_statistics) {
    char tfst_tags_by_freq[FILENAME_MAX];
    char tfst_tags_by_alph[FILENAME_MAX];
    strcpy(tfst_tags_by_freq, basedir);
    strcat(tfst_tags_by_freq, "tfst_tags_by_freq.txt");
    strcpy(tfst_tags_by_alph, basedir);
    strcat(tfst_tags_by_alph, "tfst_tags_by_alph.txt");
    U_FILE* f_tfst_tags_by_freq = u_fopen(&vec, tfst_tags_by_freq, U_WRITE);
    if (f_tfst_tags_by_freq == NULL) {
        error("Cannot open %s\n", tfst_tags_by_freq);
    }
    U_FILE* f_tfst_tags_by_alph = u_fopen(&vec, tfst_tags_by_alph, U_WRITE);
    if (f_tfst_tags_by_alph == NULL) {
        error("Cannot open %s\n", tfst_tags_by_alph);
    }
    sort_and_save_tfst_stats(form_frequencies, f_tfst_tags_by_freq, f_tfst_tags_by_alph);
    u_fclose(f_tfst_tags_by_freq);
    u_fclose(f_tfst_tags_by_alph);
}
free_hash_table(form_frequencies);

/* make a backup and replace old automaton with new */
char backup_tfst[FILENAME_MAX];
char backup_tind[FILENAME_MAX];
sprintf(backup_tfst,"%s.bck",input_tfst);
sprintf(backup_tind,"%s.bck",input_tind);
/* We remove the existing backup files, if any */
af_remove(backup_tfst);
af_remove(backup_tind);
af_rename(input_tfst,backup_tfst);
af_rename(input_tind,backup_tind);
af_rename(output_tfst,input_tfst);
af_rename(output_tind,input_tind);
u_printf("\nYou can find a backup of the original files in:\n    %s\nand %s\n",
         backup_tfst,backup_tind);

return SUCCESS_RETURN_CODE;
}
示例#10
0
文件: PolyLex.cpp 项目: adri87/Q-A
int main_PolyLex(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}


int language=-1;
char alphabet[FILENAME_MAX]="";
char dictionary[FILENAME_MAX]="";
char output[FILENAME_MAX]="";
char info[FILENAME_MAX]="";
Encoding encoding_output = DEFAULT_ENCODING_OUTPUT;
int bom_output = DEFAULT_BOM_OUTPUT;
int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT;
int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_PolyLex,lopts_PolyLex,&index,vars))) {
   switch(val) {
   case 'D': language=DUTCH; break;
   case 'G': language=GERMAN; break;
   case 'N': language=NORWEGIAN; break;
   case 'R': language=RUSSIAN; break;
   case 'a': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty alphabet file name\n");
             }
             strcpy(alphabet,vars->optarg);
             break;
   case 'd': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty dictionary file name\n");
             }
             strcpy(dictionary,vars->optarg);
             break;
   case 'o': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty output file name\n");
             }
             strcpy(output,vars->optarg);
             break;
   case 'i': if (vars->optarg[0]=='\0') {
                fatal_error("You must specify a non empty information file name\n");
             }
             strcpy(info,vars->optarg);
             break;
   case 'k': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input_encoding argument\n");
             }
             decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg);
             break;
   case 'q': if (vars->optarg[0]=='\0') {
                fatal_error("Empty output_encoding argument\n");
             }
             decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg);
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_PolyLex[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

if (dictionary[0]=='\0') {
   fatal_error("You must specify the .bin dictionary to use\n");
}
if (output[0]=='\0') {
   fatal_error("You must specify the output dictionary file name\n");
}
if (language==-1) {
   fatal_error("You must specify the language\n");
}

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   u_printf("Loading alphabet...\n");
   alph=load_alphabet(alphabet);
   if (alph==NULL) {
      fatal_error("Cannot load alphabet file %s\n",alphabet);
   }
}
char temp[FILENAME_MAX];
struct string_hash* forbiddenWords=NULL;
if (language==DUTCH || language==NORWEGIAN) {
   get_path(dictionary,temp);
   strcat(temp,"ForbiddenWords.txt");
   forbiddenWords=load_key_list(temp,mask_encoding_compatibility_input);
}
u_printf("Loading BIN file...\n");
struct BIN_free_info bin_free;
const unsigned char* bin=load_abstract_BIN_file(dictionary,&bin_free);
if (bin==NULL) {
   error("Cannot load bin file %s\n",dictionary);
   free_alphabet(alph);
   free_string_hash(forbiddenWords);
   return 1;
}
strcpy(temp,dictionary);
temp[strlen(dictionary)-3]='\0';
strcat(temp,"inf");
u_printf("Loading INF file...\n");
struct INF_free_info inf_free;
const struct INF_codes* inf=load_abstract_INF_file(temp,&inf_free);
if (inf==NULL) {
   error("Cannot load inf file %s\n",temp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_string_hash(forbiddenWords);
   return 1;
}
char tmp[FILENAME_MAX];
strcpy(tmp,argv[vars->optind]);
strcat(tmp,".tmp");
U_FILE* words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,argv[vars->optind],U_READ);
if (words==NULL) {
   error("Cannot open word list file %s\n",argv[vars->optind]);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   free_string_hash(forbiddenWords);
   // here we return 0 in order to do not block the preprocessing
   // in the Unitex Java interface, if no dictionary was applied
   // so that there is no "err" file
   return 0;
}
U_FILE* new_unknown_words=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,tmp,U_WRITE);
if (new_unknown_words==NULL) {
   error("Cannot open temporary word list file %s\n",tmp);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   free_string_hash(forbiddenWords);
   return 1;
}

U_FILE* res=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,output,U_APPEND);
if (res==NULL) {
   error("Cannot open result file %s\n",output);
   free_alphabet(alph);
   free_abstract_BIN(bin,&bin_free);
   free_abstract_INF(inf,&inf_free);
   u_fclose(words);
   u_fclose(new_unknown_words);
   free_string_hash(forbiddenWords);
   return 1;
}
U_FILE* debug=NULL;
if (info!=NULL) {
   debug=u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,info,U_WRITE);
   if (debug==NULL) {
      error("Cannot open debug file %s\n",info);
   }
}
struct utags UTAG;

switch(language) {
case DUTCH: analyse_dutch_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case GERMAN: analyse_german_compounds(alph,bin,inf,words,res,debug,new_unknown_words); break;
case NORWEGIAN: analyse_norwegian_unknown_words(alph,bin,inf,words,res,debug,new_unknown_words,forbiddenWords); break;
case RUSSIAN:
   init_russian(&UTAG);
   analyse_compounds(alph,bin,inf,words,res,debug,new_unknown_words,UTAG);
   break;
}

free_alphabet(alph);
free_abstract_BIN(bin,&bin_free);
free_abstract_INF(inf,&inf_free);
u_fclose(words);
u_fclose(new_unknown_words);
free_string_hash(forbiddenWords);
af_remove(argv[vars->optind]);
af_rename(tmp,argv[vars->optind]);
u_fclose(res);
if (debug!=NULL) {
   u_fclose(debug);
}
free_OptVars(vars);
return 0;
}
/**
 * Removes the file designated by 'name' by delegating to af_remove,
 * whose result is returned unchanged.
 */
UNITEX_FUNC int UNITEX_CALL RemoveUnitexFile(const char*name)
{
    const int status = af_remove(name);
    return status;
}
示例#12
0
/**
 * Entry point of the XMLizer program. The input text given as last
 * argument is normalized into "<input without extension>_tmp.snt", the
 * segmentation grammar is applied to that file through main_fst2txt, and
 * the result is passed to xmlize to produce the output file. When no
 * output name is given with -o, it is the input name whose extension is
 * replaced by ".xml".
 *
 * Returns SUCCESS_RETURN_CODE after printing the usage or in
 * verification-only mode, USAGE_ERROR_CODE on invalid arguments,
 * ALLOC_ERROR_CODE if a parameter string cannot be duplicated, and
 * otherwise the value returned by xmlize.
 */
int main_XMLizer(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int output_style=TEI;
char output[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char normalization[FILENAME_MAX]="";
char segmentation[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int convLFtoCRLF=1;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_XMLizer,lopts_XMLizer,&index))) {
   switch(val) {
   case 'x': output_style=XML; break;
   case 't': output_style=TEI; break;
   case 'n': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty normalization grammar name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(normalization,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 's': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty segmentation grammar name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(segmentation,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_XMLizer[index].name);
             return USAGE_ERROR_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case '?': index==-1  ? error("Invalid option -%c\n",options.vars()->optopt) :
                          error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (segmentation[0]=='\0') {
   error("You must specify the segmentation grammar to use\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

/* We compute the names of the two work files from the input name */
char input[FILENAME_MAX];
strcpy(input,argv[options.vars()->optind]);
char snt[FILENAME_MAX];
remove_extension(input,snt);
strcat(snt,"_tmp.snt");
char tmp[FILENAME_MAX];
remove_extension(input,tmp);
strcat(tmp,".tmp");
normalize(input,snt,&vec,KEEP_CARRIAGE_RETURN,convLFtoCRLF,normalization,NULL,1);

struct fst2txt_parameters* p=new_fst2txt_parameters();
p->vec=vec;
p->output_text_file_is_temp=1;
p->input_text_file=strdup(snt);
p->output_text_file=strdup(tmp);
p->fst_file=strdup(segmentation);
p->alphabet_file=strdup(alphabet);
if (p->input_text_file==NULL || p->output_text_file==NULL
    || p->fst_file==NULL || p->alphabet_file==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   /* We must not leave behind the work file requested from normalize */
   af_remove(snt);
   return ALLOC_ERROR_CODE;
}

p->output_policy=MERGE_OUTPUTS;
p->tokenization_policy=WORD_BY_WORD_TOKENIZATION;
p->space_policy=DONT_START_WITH_SPACE;

main_fst2txt(p);

free_fst2txt_parameters(p);

if (output[0]=='\0') {
   /* No -o option: the output name is derived from the input one */
   remove_extension(input,output);
   strcat(output,".xml");
}

int return_value = xmlize(&vec,snt,output,output_style);

af_remove(snt);
af_remove(tmp);

return return_value;
}
示例#13
0
int main_DuplicateFile(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return 0;
}



const char *input_file = NULL;
const char *output_file = NULL;
int do_delete=0;
int do_move=0;

int val,index=-1;
struct OptVars* vars=new_OptVars();
while (EOF!=(val=getopt_long_TS(argc,argv,optstring_DuplicateFile,lopts_DuplicateFile,&index,vars))) {
   switch(val) {
   case 'd': do_delete=1; break;
   case 'i': if (vars->optarg[0]=='\0') {
                fatal_error("Empty input argument\n");
             }
             input_file = vars->optarg; 
             break;
   case 'm': if (vars->optarg[0]=='\0') {
                fatal_error("Empty move argument\n");
             }
             input_file = vars->optarg; 
             do_move=1; 
             break;
   case 'h': usage(); return 0;
   case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt);
             else fatal_error("Missing argument for option --%s\n",lopts_DuplicateFile[index].name);
   case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt);
             else fatal_error("Invalid option --%s\n",vars->optarg);
             break;
   case 'k':
   case 'q': /* ignore -k and -q parameter instead make error */
             break;
   }
   index=-1;
}

if (vars->optind!=argc-1) {
   fatal_error("Invalid arguments: rerun with --help\n");
}

output_file = argv[vars->optind];

if ((input_file==NULL) && (do_delete==0)) {
   fatal_error("You must specify the input_file file\n");
}

if ((input_file!=NULL) && (do_delete==1)) {
   fatal_error("You cannot specify input_file when delete\n");
}
if (output_file==NULL) {
   fatal_error("You must specify the output_file file\n");
}

int result;
if (input_file != NULL) {
    if (do_move == 0) {
        u_printf("copy file %s to %s\n",input_file,output_file);
        result=af_copy(input_file,output_file);
    }
    else
    {
        u_printf("move file %s to %s\n",input_file,output_file);
        result=af_rename(input_file,output_file);
    }
}
else {
    u_printf("remove file %s\n",output_file);
    result=af_remove(output_file);
}
u_printf((result==0) ? "Done.\n" : "Unsucessfull.\n");
return result;
}
示例#14
0
int main_Normalize(int argc,char* const argv[]) {
if (argc==1) {
  usage();
  return SUCCESS_RETURN_CODE;
}
int mode=KEEP_CARRIAGE_RETURN;
int separator_normalization=1;
char rules[FILENAME_MAX]="";
char input_offsets[FILENAME_MAX]="";
char output_offsets[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int convLFtoCRLF=1;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Normalize,lopts_Normalize,&index))) {
   switch(val) {
   case 'l': convLFtoCRLF=0; break;
   case 'n': mode=REMOVE_CARRIAGE_RETURN; break;
   case 'r': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty replacement rule file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(rules,options.vars()->optarg);
             break;
   case 1: separator_normalization=0; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
              error("Empty input_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
              error("Empty output_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty input offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(input_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty output offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(output_offsets,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Normalize[index].name);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
  error("Invalid arguments: rerun with --help\n");
  return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

vector_offset* v_input_offsets=NULL;
vector_offset* v_output_offsets=NULL;
U_FILE* f_output_offsets=NULL;

if (output_offsets[0]!='\0') {
  /* We deal with offsets only if we have to produce output offsets */
  if (input_offsets[0]!='\0') {
    v_input_offsets=load_offsets(&vec,input_offsets);
  }
  f_output_offsets=u_fopen(&vec, output_offsets, U_WRITE);
  if (f_output_offsets==NULL) {
    error("Cannot create offset file %s\n",output_offsets);
    return DEFAULT_ERROR_CODE;
  }
  v_output_offsets=new_vector_offset();
}
char tmp_file[FILENAME_MAX];
get_extension(argv[options.vars()->optind],tmp_file);
if (!strcmp(tmp_file, ".snt")) {
   /* If the file to process has already the .snt extension, we temporary rename it to
   * .snt.normalizing */
  strcpy(tmp_file,argv[options.vars()->optind]);
  strcat(tmp_file,".normalizing");
  af_rename(argv[options.vars()->optind],tmp_file);
} else {
   strcpy(tmp_file,argv[options.vars()->optind]);
}
/* We set the destination file */
char dest_file[FILENAME_MAX];
remove_extension(argv[options.vars()->optind],dest_file);
strcat(dest_file,".snt");
u_printf("Normalizing %s...\n",argv[options.vars()->optind]);

int return_value = normalize(tmp_file,
                             dest_file,
                             &vec,
                             mode,
                             convLFtoCRLF,
                             rules,
                             v_output_offsets,
                             separator_normalization);
u_printf("\n");
/* If we have used a temporary file, we delete it */
if (strcmp(tmp_file,argv[options.vars()->optind])) {
   af_remove(tmp_file);
}
process_offsets(v_input_offsets,v_output_offsets,f_output_offsets);
u_fclose(f_output_offsets);
free_vector_offset(v_input_offsets);
free_vector_offset(v_output_offsets);
u_printf((return_value==SUCCESS_RETURN_CODE) ? "Done.\n" : "Unsuccessfull.\n");

return return_value;
}
示例#15
0
int main_DuplicateFile(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

const char *input_file = NULL;
const char *output_file = NULL;
int do_delete=0;
int do_recursive_delete=0;
int do_move=0;
int do_make_dir=0;
int do_make_dir_parent=0;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_DuplicateFile,lopts_DuplicateFile,&index))) {
   switch(val) {
   case 'a': do_make_dir = 1; break;
   case 'p': do_make_dir_parent = 1; break;
   case 'd': do_delete = 1; break;
   case 'r': do_delete = do_recursive_delete = 1; break;
   case 'i': if (options.vars()->optarg[0]=='\0') {
                error("Empty input argument\n");
                return USAGE_ERROR_CODE;
             }
             input_file = options.vars()->optarg;
             break;
   case 'm': if (options.vars()->optarg[0]=='\0') {
                error("Empty move argument\n");
                return USAGE_ERROR_CODE;
             }
             input_file = options.vars()->optarg;
             do_move=1;
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt):
                         error("Missing argument for option --%s\n",lopts_DuplicateFile[index].name);
             return USAGE_ERROR_CODE;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   case 'k':
   case 'q': /* ignore -k and -q parameter instead to raise an error */
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

output_file = argv[options.vars()->optind];

if ((input_file==NULL) && (do_delete==0) && (do_make_dir==0) && (do_make_dir_parent ==0)) {
   error("You must specify the input_file file\n");
   return USAGE_ERROR_CODE;
}

if ((input_file!=NULL) && (do_delete==1)) {
   error("You cannot specify input_file when delete\n");
   return USAGE_ERROR_CODE;
}
if (output_file==NULL) {
   error("You must specify the output_file file\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

int result = 0;
if (input_file != NULL) {
    if (do_move == 0) {
        u_printf("copy file %s to %s\n",input_file,output_file);
        /* af_copy return 0 if success, -1 with reading problem, 1 writing problem */
        result=af_copy(input_file,output_file);
    } else {
        u_printf("move file %s to %s\n",input_file,output_file);
        result=af_rename(input_file,output_file);
    }
} else if (do_make_dir != 0) {
    u_printf("make dir %s\n", output_file);
    result = mkDirPortable(output_file);
} else if (do_make_dir_parent != 0) {
    u_printf("make dir %s with parent\n", output_file);
    result = mkDirRecursiveIfNeeded(output_file);
} else {
    if (do_recursive_delete == 0) {
        u_printf("remove file %s\n",output_file);
        result=af_remove(output_file);
    } else {
        u_printf("remove folder %s\n", output_file);
        af_remove_folder(output_file);
        result=0;
    }
}
u_printf((result==0) ? "Done.\n" : "Unsucessfull.\n");
return result;
}