/**
 * Chains two modified-offset arrays:
 *
 *   first_offsets  = file0 => file1
 *   second_offsets = file1 => file2
 *
 * and returns a newly allocated modified-offset array describing
 * file0 => file2. The caller owns the returned vector and must release
 * it with free_vector_offset().
 *
 * NOTE(review): both arguments are dereferenced without a NULL check, so
 * callers must pass non-NULL vectors (the U_FILE* overload guarantees this
 * before delegating here) — confirm before adding new call sites.
 */
vector_offset* process_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets) {
    // We don't know the real sizes of the files involved.
    // We compute plausible sizes from the last positions mentioned in each
    // offset array, corrected by each array's global length shift.
    int global_shift_first = global_shift_from_modified_offsets(first_offsets);
    int global_shift_second = global_shift_from_modified_offsets(second_offsets);
    // Last positions referenced by the first array (in file0 and file1)
    // and by the second array (in file1 and file2); 0 when an array is empty.
    int last_pos_mentionned_file0 = (first_offsets->nbelems > 0) ? first_offsets->tab[first_offsets->nbelems-1].old_end : 0;
    int last_pos_mentionned_file1a = (first_offsets->nbelems > 0) ? first_offsets->tab[first_offsets->nbelems-1].new_end : 0;
    int last_pos_mentionned_file1b = (second_offsets->nbelems > 0) ? second_offsets->tab[second_offsets->nbelems-1].old_end : 0;
    int last_pos_mentionned_file2 = (second_offsets->nbelems > 0) ? second_offsets->tab[second_offsets->nbelems-1].new_end : 0;
    // Lower bounds on the size of the intermediate file1, seen from both sides.
    int size_possible_file_1a = offset_max(last_pos_mentionned_file0 + global_shift_first, last_pos_mentionned_file1a);
    int size_possible_file_1b = offset_max(last_pos_mentionned_file2 - global_shift_second, last_pos_mentionned_file1b);
    int size_possible_file_1 = offset_max(size_possible_file_1a, size_possible_file_1b);
    // Derive consistent sizes for file0 and file2 from the file1 estimate.
    int size_possible_file_0 = size_possible_file_1 - global_shift_first;
    int size_possible_file_2 = size_possible_file_1 + global_shift_second;
    // If we add the same positive integer to size_possible_file_0,
    // size_possible_file_1 and size_possible_file_2, the merged_modified
    // result will be the same, so any consistent over-estimate works.
    // Convert both modified-offset arrays to common-span form, merge the
    // common spans, then convert back to modified form.
    vector_offset* first_offset_common = modified_offsets_to_common(first_offsets, size_possible_file_0, size_possible_file_1);
    vector_offset* second_offset_common = modified_offsets_to_common(second_offsets, size_possible_file_1, size_possible_file_2);
    vector_offset* merged_common = process_common_offsets(first_offset_common, second_offset_common);
    vector_offset* merged_modified = common_offsets_to_modified(merged_common, size_possible_file_0, size_possible_file_2);
    // Intermediate vectors are owned here; only merged_modified escapes.
    free_vector_offset(first_offset_common);
    free_vector_offset(second_offset_common);
    free_vector_offset(merged_common);
    return merged_modified;
}
/** * This function takes two offset arrays: * * old_offsets=original text => input text * new_offsets=input text => output text * * and it computes and prints in the given file the new offset file: * * original text => output text * * If old_offsets is NULL, new offsets are saved in the output file * without any modification. */ void process_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets, U_FILE* f) { if (f == NULL || second_offsets == NULL) return; if (first_offsets == NULL) { save_offsets(f, second_offsets); } else { vector_offset* processed_offset = process_offsets(first_offsets, second_offsets); save_offsets(f, processed_offset); free_vector_offset(processed_offset); } }
/**
 * Converts a list of common (unchanged) spans back into a modified-offset
 * list: the emitted entries are the gaps between consecutive common spans,
 * plus the trailing gap up to (old_size, new_size). Each common span must
 * have equal lengths on the old and new side; otherwise an error is
 * reported and NULL is returned. Returns NULL if the input is NULL.
 * The caller owns the returned vector.
 */
vector_offset* common_offsets_to_modified(const vector_offset* common_offsets, int old_size, int new_size) {
    if (common_offsets == NULL) {
        return NULL;
    }
    int n = common_offsets->nbelems;
    vector_offset* result = new_vector_offset(n + 2);
    /* Cursor: end of the previous common span in each file (starts at 0,0). */
    int prev_old_end = 0;
    int prev_new_end = 0;
    /* Iterate n+1 gaps: one before each common span, plus the trailing one. */
    for (int i = 0; i <= n; i++) {
        int gap_old_end;
        int gap_new_end;
        if (i < n) {
            Offsets common = common_offsets->tab[i];
            /* A common span must be the same length in both files. */
            if ((common.old_end - common.old_start) != (common.new_end - common.new_start)) {
                error("Mismatch in length in common offset");
                free_vector_offset(result);
                return NULL;
            }
            gap_old_end = common.old_start;
            gap_new_end = common.new_start;
        } else {
            gap_old_end = old_size;
            gap_new_end = new_size;
        }
        Offsets modified;
        modified.old_start = prev_old_end;
        modified.new_start = prev_new_end;
        modified.old_end = gap_old_end;
        modified.new_end = gap_new_end;
        /* Only keep gaps that are non-empty on at least one side. */
        if ((modified.old_end > modified.old_start) || (modified.new_end > modified.new_start)) {
            vector_offset_add(result, modified);
        }
        if (i < n) {
            prev_old_end = common_offsets->tab[i].old_end;
            prev_new_end = common_offsets->tab[i].new_end;
        }
    }
    return result;
}
int main_Normalize(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } int mode=KEEP_CARRIAGE_RETURN; int separator_normalization=1; char rules[FILENAME_MAX]=""; char input_offsets[FILENAME_MAX]=""; char output_offsets[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int convLFtoCRLF=1; int val,index=-1; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_Normalize,lopts_Normalize,&index))) { switch(val) { case 'l': convLFtoCRLF=0; break; case 'n': mode=REMOVE_CARRIAGE_RETURN; break; case 'r': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty replacement rule file name\n"); return USAGE_ERROR_CODE; } strcpy(rules,options.vars()->optarg); break; case 1: separator_normalization=0; break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case '$': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty input offset file name\n"); return USAGE_ERROR_CODE; } strcpy(input_offsets,options.vars()->optarg); break; case '@': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty output offset file name\n"); return USAGE_ERROR_CODE; } strcpy(output_offsets,options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_Normalize[index].name); return USAGE_ERROR_CODE; break; case '?': index==-1 ? 
error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; break; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } vector_offset* v_input_offsets=NULL; vector_offset* v_output_offsets=NULL; U_FILE* f_output_offsets=NULL; if (output_offsets[0]!='\0') { /* We deal with offsets only if we have to produce output offsets */ if (input_offsets[0]!='\0') { v_input_offsets=load_offsets(&vec,input_offsets); } f_output_offsets=u_fopen(&vec, output_offsets, U_WRITE); if (f_output_offsets==NULL) { error("Cannot create offset file %s\n",output_offsets); return DEFAULT_ERROR_CODE; } v_output_offsets=new_vector_offset(); } char tmp_file[FILENAME_MAX]; get_extension(argv[options.vars()->optind],tmp_file); if (!strcmp(tmp_file, ".snt")) { /* If the file to process has already the .snt extension, we temporary rename it to * .snt.normalizing */ strcpy(tmp_file,argv[options.vars()->optind]); strcat(tmp_file,".normalizing"); af_rename(argv[options.vars()->optind],tmp_file); } else { strcpy(tmp_file,argv[options.vars()->optind]); } /* We set the destination file */ char dest_file[FILENAME_MAX]; remove_extension(argv[options.vars()->optind],dest_file); strcat(dest_file,".snt"); u_printf("Normalizing %s...\n",argv[options.vars()->optind]); int return_value = normalize(tmp_file, dest_file, &vec, mode, convLFtoCRLF, rules, v_output_offsets, separator_normalization); u_printf("\n"); /* If we have used a temporary file, we delete it */ if (strcmp(tmp_file,argv[options.vars()->optind])) { af_remove(tmp_file); } process_offsets(v_input_offsets,v_output_offsets,f_output_offsets); u_fclose(f_output_offsets); free_vector_offset(v_input_offsets); free_vector_offset(v_output_offsets); u_printf((return_value==SUCCESS_RETURN_CODE) ? 
"Done.\n" : "Unsuccessfull.\n"); return return_value; }
/** * translate a set of offset * ofs is an array of nb_translations items of type offset_translation * Before calling the function, you must fill: * (ofs + #)->position_to_translate with a position (offset) to translate * (ofs + #)->sort_order=# (because array will be sorted on position_to_translate, then back to sort_order */ void translate_offset(offset_translation* ofs, int nb_translations, const vector_offset* offsets, int revert) { if (offsets == NULL) { return; } if ((nb_translations == 0) || (ofs == NULL)) { return; } int i; int sorted_by_position = 1; for (i = 1;i < nb_translations;i++) if (((ofs + i)->position_to_translate) < ((ofs + i - 1)->position_to_translate)) { sorted_by_position = 0; break; } if (!sorted_by_position) { qsort(ofs,nb_translations,sizeof(offset_translation), compare_offset_translation_by_position); } int last_position_in_offsets = revert ? (offsets->tab[offsets->nbelems - 1].new_start + offsets->tab[offsets->nbelems-1].new_end) : (offsets->tab[offsets->nbelems - 1].old_start + offsets->tab[offsets->nbelems-1].old_end); int last_position_to_translate = (ofs + nb_translations - 1)->position_to_translate; int minimal_filesize = offset_max(last_position_in_offsets, last_position_to_translate); vector_offset* common_offsets = modified_offsets_to_common(offsets, -1, minimal_filesize); int pos_common_offsets = 0; for (i = 0; i < nb_translations; i++) { int pos_to_translate = (ofs + i)->position_to_translate; for (;;) { if (pos_common_offsets == common_offsets->nbelems) break; int end_current_common = revert ? (common_offsets->tab[pos_common_offsets].new_end) : (common_offsets->tab[pos_common_offsets].old_end); if (end_current_common > pos_to_translate) break; pos_common_offsets++; } int current_common_start = -1; int current_common_end = -1; if (pos_common_offsets < common_offsets->nbelems) { current_common_start = revert ? 
(common_offsets->tab[pos_common_offsets].new_start) : (common_offsets->tab[pos_common_offsets].old_start); current_common_end = revert ? (common_offsets->tab[pos_common_offsets].new_end) : (common_offsets->tab[pos_common_offsets].old_end); int translate_common_start = revert ? (common_offsets->tab[pos_common_offsets].old_start) : (common_offsets->tab[pos_common_offsets].new_start); if ((pos_to_translate >= current_common_start) && (pos_to_translate < current_common_end)) { (ofs + i)->translated_position = translate_common_start + (pos_to_translate - current_common_start); (ofs + i)->translation_pos_in_common = 1; } else { (ofs + i)->translated_position = translate_common_start + (current_common_end - current_common_start); (ofs + i)->translation_pos_in_common = -1; } } else { int last_pos_translated = 0; if (common_offsets->nbelems > 0) last_pos_translated = revert ? ((common_offsets->tab[common_offsets->nbelems-1].old_start) -1) : ((common_offsets->tab[common_offsets->nbelems-1].new_start) -1); if (last_pos_translated == -1) last_pos_translated = 0; (ofs + i)->translated_position = last_pos_translated; (ofs + i)->translation_pos_in_common = -1; } } free_vector_offset(common_offsets); int sorted_by_sort_order = 1; for (i = 1;i < nb_translations;i++) if (((ofs + i)->sort_order) < ((ofs + i - 1)->sort_order)) { sorted_by_sort_order = 0; break; } if (!sorted_by_sort_order) { qsort(ofs, nb_translations, sizeof(offset_translation), compare_offset_translation_by_sort_order); } }
/**
 * Merges two common-span lists:
 *
 *   first_offsets  = common spans between file0 and file1
 *   second_offsets = common spans between file1 and file2
 *
 * and returns a newly allocated list of the spans common to file0 and
 * file2 (the intersection, expressed in file0/file2 coordinates).
 * Returns NULL if either input is NULL or if second_offsets contains a
 * span whose old and new lengths differ. The caller owns the result.
 */
vector_offset* process_common_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets) {
    if ((first_offsets == NULL) || (second_offsets == NULL)) {
        return NULL;
    }
    int first_nb_offsets_items = first_offsets->nbelems;
    int second_nb_offsets_items = second_offsets->nbelems;
    vector_offset* merged_vector_offset = new_vector_offset(first_nb_offsets_items + second_nb_offsets_items + 1);
    // Two-pointer sweep: i walks second_offsets, pos_in_first_offsets walks
    // first_offsets; both lists are consumed in increasing position order.
    int pos_in_first_offsets = 0;
    for (int i = 0; i < second_nb_offsets_items; i++) {
        Offsets common_offset_in_second = second_offsets->tab[i];
        // A common span must have equal old and new lengths.
        if ((common_offset_in_second.old_end - common_offset_in_second.old_start) != (common_offset_in_second.new_end - common_offset_in_second.new_start)) {
            free_vector_offset(merged_vector_offset);
            error("Invalid common offset file");
            return NULL;
        }
        // Consume the current second-list span piecewise until empty.
        while ((common_offset_in_second.old_end - common_offset_in_second.old_start) != 0) {
            // Advance the first-list cursor to a span that can still overlap.
            for (;;) {
                if (pos_in_first_offsets == first_nb_offsets_items) {
                    // we have no common part in first file to process
                    return merged_vector_offset;
                }
                if (first_offsets->tab[pos_in_first_offsets].new_end > common_offset_in_second.old_start) {
                    break;
                }
                pos_in_first_offsets++;
            }
            int nb_common = 0;
            // Work on a local copy: the stored span must not be mutated.
            Offsets current_common_in_first = first_offsets->tab[pos_in_first_offsets];
            // Align both spans on the same file1 position by trimming the
            // one that starts earlier.
            if (current_common_in_first.new_start > common_offset_in_second.old_start) {
                int skip_second = current_common_in_first.new_start - common_offset_in_second.old_start;
                common_offset_in_second.old_start += skip_second;
                common_offset_in_second.new_start += skip_second;
            }
            if (current_common_in_first.new_start < common_offset_in_second.old_start) {
                int skip_first = common_offset_in_second.old_start - current_common_in_first.new_start;
                current_common_in_first.old_start += skip_first;
                current_common_in_first.new_start += skip_first;
            }
            // Length of the overlap once both spans are aligned on file1.
            int len_common = offset_min(current_common_in_first.new_end - current_common_in_first.new_start, common_offset_in_second.old_end - common_offset_in_second.old_start);
            if (len_common > 0) {
                Offsets CommonOffsetToWrite;
                nb_common = len_common;
                int shift_in_first = 0; // spans are already aligned after the trimming above
                // Emit the overlap in file0 (old_*) / file2 (new_*) coordinates.
                CommonOffsetToWrite.old_start = current_common_in_first.old_start + shift_in_first;
                CommonOffsetToWrite.old_end = CommonOffsetToWrite.old_start + nb_common;
                CommonOffsetToWrite.new_start = common_offset_in_second.new_start;
                CommonOffsetToWrite.new_end = CommonOffsetToWrite.new_start + nb_common;
                vector_offset_add_with_merging(merged_vector_offset, CommonOffsetToWrite);
                // Consume the emitted part of the second-list span.
                common_offset_in_second.old_start += nb_common;
                common_offset_in_second.new_start += nb_common;
            } else {
                break;
            }
        }
    }
    return merged_vector_offset;
}
/**
 * Converts offset data with modified zones into a list of common
 * (unchanged) spans: the emitted entries are the gaps between consecutive
 * modified zones, plus the trailing span up to (old_size, new_size).
 * At most one of old_size/new_size may be -1, in which case it is deduced
 * from the other using the array's global length shift.
 * Returns NULL (after reporting an error) when the offsets are incoherent
 * with the given sizes, or when offsets is NULL. The caller owns the
 * returned vector.
 *
 * Fix vs previous version: guard against a NULL offsets argument, for
 * consistency with the other offset helpers in this file (the old code
 * would have dereferenced NULL in global_shift_from_modified_offsets /
 * offsets->nbelems).
 */
vector_offset* modified_offsets_to_common(const vector_offset* offsets, int old_size, int new_size) {
    if (offsets == NULL) {
        return NULL;
    }
    /* Deduce the missing size, if any, from the global shift. */
    if ((old_size == -1) && (new_size != -1)) {
        old_size = new_size - global_shift_from_modified_offsets(offsets);
    } else if ((old_size != -1) && (new_size == -1)) {
        new_size = old_size + global_shift_from_modified_offsets(offsets);
    }
    int nb_offsets_items = offsets->nbelems;
    vector_offset* inverted_vector_offset = new_vector_offset(nb_offsets_items + 1);
    for (int i = 0; i < nb_offsets_items; i++) {
        Offsets curOffset = offsets->tab[i];
        Offsets prevOffset;
        if (i > 0) {
            prevOffset = offsets->tab[i - 1];
        } else {
            /* Sentinel "previous" zone ending at the start of both files
             * (fully initialized to keep the struct well-defined). */
            prevOffset.old_start = prevOffset.old_end = 0;
            prevOffset.new_start = prevOffset.new_end = 0;
        }
        /* The common span is the gap between two modified zones. */
        Offsets CommonOffset;
        CommonOffset.old_start = prevOffset.old_end;
        CommonOffset.old_end = curOffset.old_start;
        CommonOffset.new_start = prevOffset.new_end;
        CommonOffset.new_end = curOffset.new_start;
        if (DetectCommonIncoherency(old_size, CommonOffset.old_start, CommonOffset.old_end,
                                    new_size, CommonOffset.new_start, CommonOffset.new_end)) {
            free_vector_offset(inverted_vector_offset);
            error("coherency problem on offset file");
            return NULL;
        }
        /* Skip empty spans. */
        if (CommonOffset.new_start != CommonOffset.new_end) {
            vector_offset_add(inverted_vector_offset, CommonOffset);
        }
    }
    /* Trailing common span: from the end of the last modified zone (or the
     * start of the files) up to the file sizes. */
    Offsets LastCommonOffset;
    if (nb_offsets_items > 0) {
        LastCommonOffset.old_start = offsets->tab[nb_offsets_items - 1].old_end;
        LastCommonOffset.new_start = offsets->tab[nb_offsets_items - 1].new_end;
    } else {
        LastCommonOffset.old_start = 0;
        LastCommonOffset.new_start = 0;
    }
    LastCommonOffset.old_end = old_size;
    LastCommonOffset.new_end = new_size;
    if (DetectCommonIncoherency(old_size, LastCommonOffset.old_start, LastCommonOffset.old_end,
                                new_size, LastCommonOffset.new_start, LastCommonOffset.new_end)) {
        free_vector_offset(inverted_vector_offset);
        error("coherency problem on offset file");
        return NULL;
    }
    if (LastCommonOffset.new_start != LastCommonOffset.new_end) {
        vector_offset_add(inverted_vector_offset, LastCommonOffset);
    }
    return inverted_vector_offset;
}