/**
 * Loads the given offset file. Returns NULL in case of error.
 */
vector_offset* load_offsets(const VersatileEncodingConfig* vec,const char* name) {
    U_FILE* f=u_fopen(vec,name,U_READ);
    if (f==NULL) return NULL;
    int a,b,c,d,n;
    vector_offset* res=new_vector_offset();
    while ((n=u_fscanf(f,"%d%d%d%d",&a,&b,&c,&d))!=EOF) {
        if (n!=4) {
            fatal_error("Corrupted offset file %s\n",name);
        }
        vector_offset_add(res,a,b,c,d);
    }
    u_fclose(f);
    return res;
}
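/*
 * Usage sketch (not part of the original sources): load_offsets() expects a
 * text file whose lines each hold four integers
 * "old_start old_end new_start new_end". The file name below is a
 * hypothetical placeholder.
 */
static void example_load_offsets() {
    VersatileEncodingConfig vec = VEC_DEFAULT;
    vector_offset* offsets = load_offsets(&vec, "foo_offsets.txt"); /* placeholder name */
    if (offsets == NULL) {
        error("Cannot load offset file\n");
        return;
    }
    for (int i = 0; i < offsets->nbelems; i++) {
        Offsets o = offsets->tab[i];
        u_printf("%d %d %d %d\n", o.old_start, o.old_end, o.new_start, o.new_end);
    }
    free_vector_offset(offsets);
}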
/**
 * Converts a list of common offsets into the corresponding list of modified
 * offsets, given the sizes of the old and new files. Returns NULL in case of error.
 */
vector_offset* common_offsets_to_modified(const vector_offset* common_offsets, int old_size, int new_size) {
    if (common_offsets == NULL) {
        return NULL;
    }
    int nb_common_offsets_items = common_offsets->nbelems;
    vector_offset* modified_vector_offset = new_vector_offset(nb_common_offsets_items + 2);
    Offsets latest_common;
    latest_common.old_start = latest_common.old_end = latest_common.new_start = latest_common.new_end = 0;
    for (int i = 0; i < nb_common_offsets_items; i++) {
        Offsets current_common = common_offsets->tab[i];
        if ((current_common.old_end - current_common.old_start) != (current_common.new_end - current_common.new_start)) {
            error("Mismatch in length in common offset");
            free_vector_offset(modified_vector_offset);
            return NULL;
        }
        /* The zone between the end of the previous common block and the start
         * of the current one is a modified zone, if it is not empty */
        Offsets modified_offset;
        modified_offset.old_start = latest_common.old_end;
        modified_offset.new_start = latest_common.new_end;
        modified_offset.old_end = current_common.old_start;
        modified_offset.new_end = current_common.new_start;
        if ((modified_offset.old_end > modified_offset.old_start) || (modified_offset.new_end > modified_offset.new_start)) {
            vector_offset_add(modified_vector_offset, modified_offset);
        }
        latest_common = current_common;
    }
    /* The zone after the last common block, up to the end of both files,
     * is also a modified zone if it is not empty */
    Offsets latest_modified_offset;
    latest_modified_offset.old_start = latest_common.old_end;
    latest_modified_offset.new_start = latest_common.new_end;
    latest_modified_offset.old_end = old_size;
    latest_modified_offset.new_end = new_size;
    if ((latest_modified_offset.old_end > latest_modified_offset.old_start) || (latest_modified_offset.new_end > latest_modified_offset.new_start)) {
        vector_offset_add(modified_vector_offset, latest_modified_offset);
    }
    return modified_vector_offset;
}
vector_offset* common_offsets_to_modifed(const vector_offset* offsets, int old_size, int new_size) {
    int nb_offsets_items = offsets->nbelems;
    vector_offset* inverted_vector_offset = new_vector_offset(nb_offsets_items + 2);
    for (int i = 0; i < nb_offsets_items; i++) {
        Offsets curOffset = offsets->tab[i];
        Offsets prevOffset;
        if (i > 0) {
            prevOffset = offsets->tab[i - 1];
        } else {
            prevOffset.old_end = prevOffset.new_end = 0;
        }
        Offsets DifferentOffset;
        DifferentOffset.old_start = prevOffset.old_end;
        DifferentOffset.old_end = curOffset.old_start;
        DifferentOffset.new_start = prevOffset.new_end;
        DifferentOffset.new_end = curOffset.new_start;
        vector_offset_add(inverted_vector_offset, DifferentOffset);
    }
    Offsets LastDifferentOffset;
    if (nb_offsets_items > 0) {
        LastDifferentOffset.old_start = offsets->tab[nb_offsets_items - 1].old_end;
        LastDifferentOffset.new_start = offsets->tab[nb_offsets_items - 1].new_end;
    } else {
        LastDifferentOffset.old_start = 0;
        LastDifferentOffset.new_start = 0;
    }
    LastDifferentOffset.old_end = old_size;
    LastDifferentOffset.new_end = new_size;
    if ((LastDifferentOffset.old_start != LastDifferentOffset.old_end)
            || (LastDifferentOffset.new_start != LastDifferentOffset.new_end)) {
        vector_offset_add(inverted_vector_offset, LastDifferentOffset);
    }
    return inverted_vector_offset;
}
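/*
 * Worked example (illustrative only, placeholder values): with an old text of
 * size 10 and a new text of size 12, a single common block [2,5) -> [4,7)
 * yields two modified blocks through common_offsets_to_modified(): [0,2) -> [0,4)
 * before it, and [5,10) -> [7,12) after it.
 */
static void example_common_to_modified() {
    vector_offset* common = new_vector_offset(1);
    vector_offset_add(common, 2, 5, 4, 7);   /* common block of length 3 */
    vector_offset* modified = common_offsets_to_modified(common, 10, 12);
    if (modified != NULL) {
        /* expected output: (0,2,0,4) and (5,10,7,12) */
        for (int i = 0; i < modified->nbelems; i++) {
            Offsets o = modified->tab[i];
            u_printf("%d %d %d %d\n", o.old_start, o.old_end, o.new_start, o.new_end);
        }
        free_vector_offset(modified);
    }
    free_vector_offset(common);
}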
int main_Fst2Txt(int argc,char* const argv[]) {
    if (argc==1) {
        usage();
        return SUCCESS_RETURN_CODE;
    }
    struct fst2txt_parameters* p=new_fst2txt_parameters();
    char in_offsets[FILENAME_MAX]="";
    char out_offsets[FILENAME_MAX]="";
    int val,index=-1;
    bool only_verify_arguments = false;
    UnitexGetOpt options;
    while (EOF!=(val=options.parse_long(argc,argv,optstring_Fst2Txt,lopts_Fst2Txt,&index))) {
        switch(val) {
        case 't':
            if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
            }
            p->input_text_file=strdup(options.vars()->optarg);
            if (p->input_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
            }
            break;
        case 'o':
            if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text output file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
            }
            p->output_text_file=strdup(options.vars()->optarg);
            p->output_text_file_is_temp=0;
            if (p->output_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
            }
            break;
        case 'a':
            if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
            }
            p->alphabet_file=strdup(options.vars()->optarg);
            if (p->alphabet_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
            }
            break;
        case 'M': p->output_policy=MERGE_OUTPUTS; break;
        case 'R': p->output_policy=REPLACE_OUTPUTS; break;
        case 'c': p->tokenization_policy=CHAR_BY_CHAR_TOKENIZATION; break;
        case 'w': p->tokenization_policy=WORD_BY_WORD_TOKENIZATION; break;
        case 's': p->space_policy=START_WITH_SPACE; break;
        case 'x': p->space_policy=DONT_START_WITH_SPACE; break;
        case 'V': only_verify_arguments = true; break;
        case 'h':
            usage();
            free_fst2txt_parameters(p);
            return SUCCESS_RETURN_CODE;
        case ':':
            index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt)
                      : error("Missing argument for option --%s\n",lopts_Fst2Txt[index].name);
            free_fst2txt_parameters(p);
            return USAGE_ERROR_CODE;
        case 'k':
            if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
            }
            decode_reading_encoding_parameter(&(p->vec.mask_encoding_compatibility_input),options.vars()->optarg);
            break;
        case 'q':
            if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
            }
            decode_writing_encoding_parameter(&(p->vec.encoding_output),&(p->vec.bom_output),options.vars()->optarg);
            break;
        case '$':
            if (options.vars()->optarg[0]=='\0') {
                error("Empty input_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
            }
            strcpy(in_offsets,options.vars()->optarg);
            break;
        case '@':
            if (options.vars()->optarg[0]=='\0') {
                error("Empty output_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
            }
            strcpy(out_offsets,options.vars()->optarg);
            break;
        case 'l': p->convLFtoCRLF=0; break;
        case 'r': p->keepCR = 1; break;
        case '?':
            index==-1 ? error("Invalid option -%c\n",options.vars()->optopt)
                      : error("Invalid option --%s\n",options.vars()->optarg);
            free_fst2txt_parameters(p);
            return USAGE_ERROR_CODE;
        }
        index=-1;
    }
    if (options.vars()->optind!=argc-1) {
        error("Invalid arguments: rerun with --help\n");
        free_fst2txt_parameters(p);
        return USAGE_ERROR_CODE;
    }
    if (p->input_text_file==NULL) {
        error("You must specify the text file\n");
        free_fst2txt_parameters(p);
        return USAGE_ERROR_CODE;
    }
    if (only_verify_arguments) {
        // freeing all allocated memory
        free_fst2txt_parameters(p);
        return SUCCESS_RETURN_CODE;
    }
    if (out_offsets[0]!='\0') {
        /* We deal with offsets only if the program is expected to produce some */
        if (in_offsets[0]!='\0') {
            p->v_in_offsets=load_offsets(&(p->vec),in_offsets);
            if (p->v_in_offsets==NULL) {
                error("Cannot load offset file %s\n",in_offsets);
                free_fst2txt_parameters(p);
                return DEFAULT_ERROR_CODE;
            }
        } else {
            /* If there is no input offset file, we create an empty offset vector
             * in order to avoid testing whether the vector is NULL or not */
            p->v_in_offsets=new_vector_offset(1);
        }
        p->f_out_offsets=u_fopen(&(p->vec),out_offsets,U_WRITE);
        if (p->f_out_offsets==NULL) {
            error("Cannot create file %s\n",out_offsets);
            free_fst2txt_parameters(p);
            return DEFAULT_ERROR_CODE;
        }
    }
    if (p->output_text_file == NULL) {
        char tmp[FILENAME_MAX];
        remove_extension(p->input_text_file, tmp);
        strcat(tmp, ".tmp");
        p->output_text_file_is_temp=1;
        p->output_text_file = strdup(tmp);
        if (p->output_text_file == NULL) {
            alloc_error("main_Fst2Txt");
            free_fst2txt_parameters(p);
            return ALLOC_ERROR_CODE;
        }
    }
    p->fst_file=strdup(argv[options.vars()->optind]);
    if (p->fst_file==NULL) {
        alloc_error("main_Fst2Txt");
        free_fst2txt_parameters(p);
        return ALLOC_ERROR_CODE;
    }
    int result=main_fst2txt(p);
    free_fst2txt_parameters(p);
    return result;
}
int main_Normalize(int argc,char* const argv[]) {
    if (argc==1) {
        usage();
        return SUCCESS_RETURN_CODE;
    }
    int mode=KEEP_CARRIAGE_RETURN;
    int separator_normalization=1;
    char rules[FILENAME_MAX]="";
    char input_offsets[FILENAME_MAX]="";
    char output_offsets[FILENAME_MAX]="";
    VersatileEncodingConfig vec=VEC_DEFAULT;
    int convLFtoCRLF=1;
    int val,index=-1;
    bool only_verify_arguments = false;
    UnitexGetOpt options;
    while (EOF!=(val=options.parse_long(argc,argv,optstring_Normalize,lopts_Normalize,&index))) {
        switch(val) {
        case 'l': convLFtoCRLF=0; break;
        case 'n': mode=REMOVE_CARRIAGE_RETURN; break;
        case 'r':
            if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty replacement rule file name\n");
                return USAGE_ERROR_CODE;
            }
            strcpy(rules,options.vars()->optarg);
            break;
        case 1: separator_normalization=0; break;
        case 'k':
            if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
            }
            decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
            break;
        case 'q':
            if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
            }
            decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
            break;
        case '$':
            if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty input offset file name\n");
                return USAGE_ERROR_CODE;
            }
            strcpy(input_offsets,options.vars()->optarg);
            break;
        case '@':
            if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty output offset file name\n");
                return USAGE_ERROR_CODE;
            }
            strcpy(output_offsets,options.vars()->optarg);
            break;
        case 'V': only_verify_arguments = true; break;
        case 'h': usage(); return SUCCESS_RETURN_CODE;
        case ':':
            index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt)
                      : error("Missing argument for option --%s\n",lopts_Normalize[index].name);
            return USAGE_ERROR_CODE;
        case '?':
            index==-1 ? error("Invalid option -%c\n",options.vars()->optopt)
                      : error("Invalid option --%s\n",options.vars()->optarg);
            return USAGE_ERROR_CODE;
        }
        index=-1;
    }
    if (options.vars()->optind!=argc-1) {
        error("Invalid arguments: rerun with --help\n");
        return USAGE_ERROR_CODE;
    }
    if (only_verify_arguments) {
        // freeing all allocated memory
        return SUCCESS_RETURN_CODE;
    }
    vector_offset* v_input_offsets=NULL;
    vector_offset* v_output_offsets=NULL;
    U_FILE* f_output_offsets=NULL;
    if (output_offsets[0]!='\0') {
        /* We deal with offsets only if we have to produce output offsets */
        if (input_offsets[0]!='\0') {
            v_input_offsets=load_offsets(&vec,input_offsets);
        }
        f_output_offsets=u_fopen(&vec, output_offsets, U_WRITE);
        if (f_output_offsets==NULL) {
            error("Cannot create offset file %s\n",output_offsets);
            return DEFAULT_ERROR_CODE;
        }
        v_output_offsets=new_vector_offset();
    }
    char tmp_file[FILENAME_MAX];
    get_extension(argv[options.vars()->optind],tmp_file);
    if (!strcmp(tmp_file, ".snt")) {
        /* If the file to process already has the .snt extension, we temporarily rename it to
         * .snt.normalizing */
        strcpy(tmp_file,argv[options.vars()->optind]);
        strcat(tmp_file,".normalizing");
        af_rename(argv[options.vars()->optind],tmp_file);
    } else {
        strcpy(tmp_file,argv[options.vars()->optind]);
    }
    /* We set the destination file */
    char dest_file[FILENAME_MAX];
    remove_extension(argv[options.vars()->optind],dest_file);
    strcat(dest_file,".snt");
    u_printf("Normalizing %s...\n",argv[options.vars()->optind]);
    int return_value = normalize(tmp_file, dest_file, &vec, mode, convLFtoCRLF, rules, v_output_offsets, separator_normalization);
    u_printf("\n");
    /* If we have used a temporary file, we delete it */
    if (strcmp(tmp_file,argv[options.vars()->optind])) {
        af_remove(tmp_file);
    }
    process_offsets(v_input_offsets,v_output_offsets,f_output_offsets);
    u_fclose(f_output_offsets);
    free_vector_offset(v_input_offsets);
    free_vector_offset(v_output_offsets);
    u_printf((return_value==SUCCESS_RETURN_CODE) ? "Done.\n" : "Unsuccessful.\n");
    return return_value;
}
vector_offset* process_common_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets) {
    if ((first_offsets == NULL) || (second_offsets == NULL)) {
        return NULL;
    }
    int first_nb_offsets_items = first_offsets->nbelems;
    int second_nb_offsets_items = second_offsets->nbelems;
    vector_offset* merged_vector_offset = new_vector_offset(first_nb_offsets_items + second_nb_offsets_items + 1);
    int pos_in_first_offsets = 0;
    for (int i = 0; i < second_nb_offsets_items; i++) {
        Offsets common_offset_in_second = second_offsets->tab[i];
        if ((common_offset_in_second.old_end - common_offset_in_second.old_start)
                != (common_offset_in_second.new_end - common_offset_in_second.new_start)) {
            free_vector_offset(merged_vector_offset);
            error("Invalid common offset file");
            return NULL;
        }
        while ((common_offset_in_second.old_end - common_offset_in_second.old_start) != 0) {
            for (;;) {
                if (pos_in_first_offsets == first_nb_offsets_items) {
                    // we have no common part in first file to process
                    return merged_vector_offset;
                }
                if (first_offsets->tab[pos_in_first_offsets].new_end > common_offset_in_second.old_start) {
                    break;
                }
                pos_in_first_offsets++;
            }
            int nb_common = 0;
            Offsets current_common_in_first = first_offsets->tab[pos_in_first_offsets];
            if (current_common_in_first.new_start > common_offset_in_second.old_start) {
                int skip_second = current_common_in_first.new_start - common_offset_in_second.old_start;
                common_offset_in_second.old_start += skip_second;
                common_offset_in_second.new_start += skip_second;
            }
            if (current_common_in_first.new_start < common_offset_in_second.old_start) {
                int skip_first = common_offset_in_second.old_start - current_common_in_first.new_start;
                current_common_in_first.old_start += skip_first;
                current_common_in_first.new_start += skip_first;
            }
            int len_common = offset_min(current_common_in_first.new_end - current_common_in_first.new_start,
                                        common_offset_in_second.old_end - common_offset_in_second.old_start);
            if (len_common > 0) {
                Offsets CommonOffsetToWrite;
                nb_common = len_common;
                int shift_in_first = 0;//common_offset_in_second.old_start >= current_common_in_first.new_start;
                CommonOffsetToWrite.old_start = current_common_in_first.old_start + shift_in_first;
                CommonOffsetToWrite.old_end = CommonOffsetToWrite.old_start + nb_common;
                CommonOffsetToWrite.new_start = common_offset_in_second.new_start;
                CommonOffsetToWrite.new_end = CommonOffsetToWrite.new_start + nb_common;
                vector_offset_add_with_merging(merged_vector_offset, CommonOffsetToWrite);
                common_offset_in_second.old_start += nb_common;
                common_offset_in_second.new_start += nb_common;
            } else {
                break;
            }
        }
    }
    return merged_vector_offset;
}
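/*
 * Worked example (illustrative only, placeholder values): file A (size 10)
 * becomes file B (size 12) by inserting 2 characters at position 3, and B
 * becomes file C (size 11) by deleting the character at B position 8.
 * Composing the two common-offset vectors yields the common offsets between
 * A and C.
 */
static void example_process_common_offsets() {
    vector_offset* a_to_b = new_vector_offset(2);
    vector_offset_add(a_to_b, 0, 3, 0, 3);     /* A[0,3)  == B[0,3)  */
    vector_offset_add(a_to_b, 3, 10, 5, 12);   /* A[3,10) == B[5,12) */

    vector_offset* b_to_c = new_vector_offset(2);
    vector_offset_add(b_to_c, 0, 8, 0, 8);     /* B[0,8)  == C[0,8)  */
    vector_offset_add(b_to_c, 9, 12, 8, 11);   /* B[9,12) == C[8,11) */

    vector_offset* a_to_c = process_common_offsets(a_to_b, b_to_c);
    /* expected result: (0,3,0,3) (3,6,5,8) (7,10,8,11) */
    if (a_to_c != NULL) {
        free_vector_offset(a_to_c);
    }
    free_vector_offset(a_to_b);
    free_vector_offset(b_to_c);
}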
/**
 * Converts offset data with modified zones to a list of common offsets.
 */
vector_offset* modified_offsets_to_common(const vector_offset* offsets, int old_size, int new_size) {
    if ((old_size == -1) && (new_size != -1)) {
        old_size = new_size - global_shift_from_modified_offsets(offsets);
    } else if ((old_size != -1) && (new_size == -1)) {
        new_size = old_size + global_shift_from_modified_offsets(offsets);
    }
    int nb_offsets_items = offsets->nbelems;
    vector_offset* inverted_vector_offset = new_vector_offset(nb_offsets_items + 1);
    for (int i = 0; i < nb_offsets_items; i++) {
        Offsets curOffset = offsets->tab[i];
        Offsets prevOffset;
        if (i > 0) {
            prevOffset = offsets->tab[i - 1];
        } else {
            prevOffset.old_end = prevOffset.new_end = 0;
        }
        Offsets CommonOffset;
        CommonOffset.old_start = prevOffset.old_end;
        CommonOffset.old_end = curOffset.old_start;
        CommonOffset.new_start = prevOffset.new_end;
        CommonOffset.new_end = curOffset.new_start;
        if (DetectCommonIncoherency(old_size, CommonOffset.old_start, CommonOffset.old_end,
                                    new_size, CommonOffset.new_start, CommonOffset.new_end)) {
            free_vector_offset(inverted_vector_offset);
            error("coherency problem on offset file");
            return NULL;
        }
        if (CommonOffset.new_start != CommonOffset.new_end) {
            vector_offset_add(inverted_vector_offset, CommonOffset);
        }
    }
    Offsets LastCommonOffset;
    if (nb_offsets_items > 0) {
        LastCommonOffset.old_start = offsets->tab[nb_offsets_items - 1].old_end;
        LastCommonOffset.new_start = offsets->tab[nb_offsets_items - 1].new_end;
    } else {
        LastCommonOffset.old_start = 0;
        LastCommonOffset.new_start = 0;
    }
    LastCommonOffset.old_end = old_size;
    LastCommonOffset.new_end = new_size;
    if (DetectCommonIncoherency(old_size, LastCommonOffset.old_start, LastCommonOffset.old_end,
                                new_size, LastCommonOffset.new_start, LastCommonOffset.new_end)) {
        free_vector_offset(inverted_vector_offset);
        error("coherency problem on offset file");
        return NULL;
    }
    if (LastCommonOffset.new_start != LastCommonOffset.new_end) {
        vector_offset_add(inverted_vector_offset, LastCommonOffset);
    }
    return inverted_vector_offset;
}
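/*
 * Round-trip sketch (illustrative only, placeholder values): a single
 * modified zone, old [3,4) rewritten as new [3,6) in a text that grows from
 * 10 to 12 characters, expands to the two surrounding common zones; feeding
 * those common zones back through common_offsets_to_modified() recovers the
 * original modified zone.
 */
static void example_modified_common_round_trip() {
    vector_offset* modified = new_vector_offset(1);
    vector_offset_add(modified, 3, 4, 3, 6);   /* old [3,4) -> new [3,6) */

    vector_offset* common = modified_offsets_to_common(modified, 10, 12);
    /* expected: (0,3,0,3) and (4,10,6,12) */

    vector_offset* back = common_offsets_to_modified(common, 10, 12);
    /* expected: (3,4,3,6) again */

    free_vector_offset(back);
    free_vector_offset(common);
    free_vector_offset(modified);
}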