Exemplo n.º 1
0
int main_Fst2Txt(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

struct fst2txt_parameters* p=new_fst2txt_parameters();
char in_offsets[FILENAME_MAX]="";
char out_offsets[FILENAME_MAX]="";
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_Fst2Txt,lopts_Fst2Txt,&index))) {
   switch(val) {
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             p->input_text_file=strdup(options.vars()->optarg);
             if (p->input_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text output file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             p->output_text_file=strdup(options.vars()->optarg);
			 p->output_text_file_is_temp=0;
             if (p->output_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;                
             }
             p->alphabet_file=strdup(options.vars()->optarg);
             if (p->alphabet_file==NULL) {
               alloc_error("main_Fst2Txt");
               free_fst2txt_parameters(p);
               return ALLOC_ERROR_CODE;               
             }
             break;
   case 'M': p->output_policy=MERGE_OUTPUTS; break;
   case 'R': p->output_policy=REPLACE_OUTPUTS; break;
   case 'c': p->tokenization_policy=CHAR_BY_CHAR_TOKENIZATION; break;
   case 'w': p->tokenization_policy=WORD_BY_WORD_TOKENIZATION; break;
   case 's': p->space_policy=START_WITH_SPACE; break;
   case 'x': p->space_policy=DONT_START_WITH_SPACE; break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free_fst2txt_parameters(p);
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Fst2Txt[index].name);
             free_fst2txt_parameters(p);
             return USAGE_ERROR_CODE; 
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;                 
             }
             decode_reading_encoding_parameter(&(p->vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE; 
             }
             decode_writing_encoding_parameter(&(p->vec.encoding_output),&(p->vec.bom_output),options.vars()->optarg);
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE; 
             }
             strcpy(in_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE; 
             }
             strcpy(out_offsets,options.vars()->optarg);
             break;
   case 'l': p->convLFtoCRLF=0; break;
   case 'r': p->keepCR = 1; break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_fst2txt_parameters(p);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_fst2txt_parameters(p);
   return USAGE_ERROR_CODE;
}

if (p->input_text_file==NULL) {
   error("You must specify the text file\n");
   free_fst2txt_parameters(p);
   return USAGE_ERROR_CODE;   
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_fst2txt_parameters(p);
  return SUCCESS_RETURN_CODE;
}

if (out_offsets[0]!='\0') {
	/* We deal with offsets only if the program is expected to produce some */
	if (in_offsets[0]!='\0') {
		p->v_in_offsets=load_offsets(&(p->vec),in_offsets);
		if (p->v_in_offsets==NULL) {
			error("Cannot load offset file %s\n",in_offsets);
      free_fst2txt_parameters(p);
      return DEFAULT_ERROR_CODE;      
		}
	} else {
		/* If there is no input offset file, we create an empty offset vector
		 * in order to avoid testing whether the vector is NULL or not */
		p->v_in_offsets=new_vector_offset(1);
	}
	p->f_out_offsets=u_fopen(&(p->vec),out_offsets,U_WRITE);
	if (p->f_out_offsets==NULL) {
		error("Cannot create file %s\n",out_offsets);
    free_fst2txt_parameters(p);
    return DEFAULT_ERROR_CODE;     
	}
}

if (p->output_text_file == NULL) {
	char tmp[FILENAME_MAX];
	remove_extension(p->input_text_file, tmp);
	strcat(tmp, ".tmp");
	p->output_text_file_is_temp=1;
	p->output_text_file = strdup(tmp);
	if (p->output_text_file == NULL) {
		alloc_error("main_Fst2Txt");
		free_fst2txt_parameters(p);
		return ALLOC_ERROR_CODE;
	}
}
p->fst_file=strdup(argv[options.vars()->optind]);
if (p->fst_file==NULL) {
   alloc_error("main_Fst2Txt");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;   
}

int result=main_fst2txt(p);

free_fst2txt_parameters(p);
return result;
}
Exemplo n.º 2
0
int main_XMLizer(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

int output_style=TEI;
char output[FILENAME_MAX]="";
char alphabet[FILENAME_MAX]="";
char normalization[FILENAME_MAX]="";
char segmentation[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int convLFtoCRLF=1;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_XMLizer,lopts_XMLizer,&index))) {
   switch(val) {
   case 'x': output_style=XML; break;
   case 't': output_style=TEI; break;
   case 'n': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty normalization grammar name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(normalization,options.vars()->optarg);
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(output,options.vars()->optarg);
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 's': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty segmentation grammar name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(segmentation,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_XMLizer[index].name);
             return USAGE_ERROR_CODE;                         
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case '?': index==-1  ? error("Invalid option -%c\n",options.vars()->optopt) :
                          error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;             
  
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (segmentation[0]=='\0') {
   error("You must specify the segmentation grammar to use\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char input[FILENAME_MAX];
strcpy(input,argv[options.vars()->optind]);
char snt[FILENAME_MAX];
remove_extension(input,snt);
strcat(snt,"_tmp.snt");
char tmp[FILENAME_MAX];
remove_extension(input,tmp);
strcat(tmp,".tmp");
normalize(input,snt,&vec,KEEP_CARRIAGE_RETURN,convLFtoCRLF,normalization,NULL,1);
struct fst2txt_parameters* p=new_fst2txt_parameters();
p->vec=vec;
p->input_text_file=strdup(snt);
if (p->input_text_file ==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}

p->output_text_file_is_temp=1;
p->output_text_file=strdup(tmp);
if (p->output_text_file==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}
p->fst_file=strdup(segmentation);
if (p->fst_file==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}
p->alphabet_file=strdup(alphabet);
if (p->alphabet_file==NULL) {
   alloc_error("main_XMLizer");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}

p->output_policy=MERGE_OUTPUTS;
p->tokenization_policy=WORD_BY_WORD_TOKENIZATION;
p->space_policy=DONT_START_WITH_SPACE;

main_fst2txt(p);

free_fst2txt_parameters(p);

if (output[0]=='\0') {
  remove_extension(input,output);
	strcat(output,".xml");
}

int return_value = xmlize(&vec,snt,output,output_style);

af_remove(snt);
af_remove(tmp);

return return_value;
}