Example No. 1
vector_offset* process_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets) {


	// We don't know the sizes of the two files,
	// so we compute plausible values.
	int global_shift_first = global_shift_from_modified_offsets(first_offsets);
	int global_shift_second = global_shift_from_modified_offsets(second_offsets);

	int last_pos_mentioned_file0 = (first_offsets->nbelems > 0) ? first_offsets->tab[first_offsets->nbelems-1].old_end : 0;
	int last_pos_mentioned_file1a = (first_offsets->nbelems > 0) ? first_offsets->tab[first_offsets->nbelems-1].new_end : 0;
	int last_pos_mentioned_file1b = (second_offsets->nbelems > 0) ? second_offsets->tab[second_offsets->nbelems-1].old_end : 0;
	int last_pos_mentioned_file2 = (second_offsets->nbelems > 0) ? second_offsets->tab[second_offsets->nbelems-1].new_end : 0;

	int size_possible_file_1a = offset_max(last_pos_mentioned_file0 + global_shift_first, last_pos_mentioned_file1a);
	int size_possible_file_1b = offset_max(last_pos_mentioned_file2 - global_shift_second, last_pos_mentioned_file1b);
	int size_possible_file_1 = offset_max(size_possible_file_1a, size_possible_file_1b);

	int size_possible_file_0 = size_possible_file_1 - global_shift_first;
	int size_possible_file_2 = size_possible_file_1 + global_shift_second;
	
	// Note: adding the same positive integer to size_possible_file_0, size_possible_file_1
	// and size_possible_file_2 would leave the merged_modified result unchanged.
	vector_offset* first_offset_common = modified_offsets_to_common(first_offsets, size_possible_file_0, size_possible_file_1);
	vector_offset* second_offset_common = modified_offsets_to_common(second_offsets, size_possible_file_1, size_possible_file_2);
	vector_offset* merged_common = process_common_offsets(first_offset_common, second_offset_common);
	vector_offset* merged_modified = common_offsets_to_modified(merged_common, size_possible_file_0, size_possible_file_2);

	free_vector_offset(first_offset_common);
	free_vector_offset(second_offset_common);
	free_vector_offset(merged_common);

	return merged_modified;
}
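For intuition, here is a minimal usage sketch (not part of the sources above) that composes two offset mappings A => B and B => C into a direct mapping A => C. The helper make_offset is hypothetical, introduced only for this illustration; every other call appears in the examples on this page.

static Offsets make_offset(int old_start, int old_end, int new_start, int new_end) {
	// Hypothetical convenience helper for this sketch only.
	Offsets o;
	o.old_start = old_start; o.old_end = old_end;
	o.new_start = new_start; o.new_end = new_end;
	return o;
}

void demo_compose(void) {
	vector_offset* a_to_b = new_vector_offset();
	vector_offset* b_to_c = new_vector_offset();
	// First pass: bytes [2,4) of A were replaced by [2,3) of B (one byte shorter)
	vector_offset_add(a_to_b, make_offset(2, 4, 2, 3));
	// Second pass: an insertion in C at position 5 of B: [5,5) => [5,7)
	vector_offset_add(b_to_c, make_offset(5, 5, 5, 7));
	vector_offset* a_to_c = process_offsets(a_to_b, b_to_c);
	// a_to_c now maps A directly to C: [2,4) => [2,3), plus the trailing
	// insertion expressed in A coordinates as [6,6) => [5,7)
	free_vector_offset(a_to_b);
	free_vector_offset(b_to_c);
	free_vector_offset(a_to_c);
}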
Example No. 2
/**
* This function takes two offset arrays:
*
* first_offsets:  original text => input text
* second_offsets: input text => output text
*
* and it computes and writes into the given file the composed mapping:
*
* original text => output text
*
* If first_offsets is NULL, second_offsets is saved to the output file
* without any modification.
*/
void process_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets, U_FILE* f) {
if (f == NULL || second_offsets == NULL) return;
if (first_offsets == NULL) {
	save_offsets(f, second_offsets);
}
else {
	vector_offset* processed_offset = process_offsets(first_offsets, second_offsets);
	save_offsets(f, processed_offset);

	free_vector_offset(processed_offset);
}
}
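A minimal caller sketch, assuming only what is visible above (the output file name is a placeholder):

void demo_save_offsets(const vector_offset* old_offs, const vector_offset* new_offs) {
	VersatileEncodingConfig vec = VEC_DEFAULT;
	U_FILE* f = u_fopen(&vec, "out.offsets", U_WRITE); // placeholder name
	if (f == NULL) return;
	process_offsets(old_offs, new_offs, f); // old_offs may be NULL: new_offs is then saved as-is
	u_fclose(f);
}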
Example No. 3
vector_offset* common_offsets_to_modified(const vector_offset* common_offsets, int old_size, int new_size) {
	if (common_offsets == NULL) {
		return NULL;
	}
	int nb_common_offsets_items = common_offsets->nbelems;
	vector_offset* modified_vector_offset = new_vector_offset(nb_common_offsets_items + 2);

	Offsets latest_common;
	latest_common.old_start = latest_common.old_end = latest_common.new_start = latest_common.new_end = 0;
	for (int i = 0; i < nb_common_offsets_items; i++) {
		Offsets current_common = common_offsets->tab[i];

		if ((current_common.old_end - current_common.old_start) !=
			(current_common.new_end - current_common.new_start)) {
			error("Mismatch in length in common offset");
			free_vector_offset(modified_vector_offset);
			return NULL;
		}

		Offsets modified_offset;
		modified_offset.old_start = latest_common.old_end;
		modified_offset.new_start = latest_common.new_end;

		modified_offset.old_end = current_common.old_start;
		modified_offset.new_end = current_common.new_start;

		if ((modified_offset.old_end > modified_offset.old_start) ||
			(modified_offset.new_end > modified_offset.new_start)) {
			vector_offset_add(modified_vector_offset, modified_offset);
		}
		latest_common = current_common;
	}

	Offsets latest_modified_offset;
	latest_modified_offset.old_start = latest_common.old_end;
	latest_modified_offset.new_start = latest_common.new_end;

	latest_modified_offset.old_end = old_size;
	latest_modified_offset.new_end = new_size;

	if ((latest_modified_offset.old_end > latest_modified_offset.old_start) ||
		(latest_modified_offset.new_end > latest_modified_offset.new_start)) {
		vector_offset_add(modified_vector_offset, latest_modified_offset);
	}

	return modified_vector_offset;
}
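A tiny worked example may help (values chosen for illustration; all calls appear above): a single common span with old_size=8 and new_size=9 makes the function reconstruct the two modified spans around it.

void demo_common_to_modified(void) {
	vector_offset* common = new_vector_offset();
	Offsets c;
	c.old_start = 2; c.old_end = 5; c.new_start = 2; c.new_end = 5;
	vector_offset_add(common, c);
	// Expected result: [0,2) => [0,2) (changed text before the common zone)
	// and [5,8) => [5,9) (changed text after it, one byte longer)
	vector_offset* modified = common_offsets_to_modified(common, 8, 9);
	free_vector_offset(common);
	free_vector_offset(modified);
}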
Example No. 4
int main_Normalize(int argc,char* const argv[]) {
if (argc==1) {
  usage();
  return SUCCESS_RETURN_CODE;
}
int mode=KEEP_CARRIAGE_RETURN;
int separator_normalization=1;
char rules[FILENAME_MAX]="";
char input_offsets[FILENAME_MAX]="";
char output_offsets[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int convLFtoCRLF=1;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Normalize,lopts_Normalize,&index))) {
   switch(val) {
   case 'l': convLFtoCRLF=0; break;
   case 'n': mode=REMOVE_CARRIAGE_RETURN; break;
   case 'r': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty replacement rule file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(rules,options.vars()->optarg);
             break;
   case 1: separator_normalization=0; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
              error("Empty input_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
              error("Empty output_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty input offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(input_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty output offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(output_offsets,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Normalize[index].name);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
  error("Invalid arguments: rerun with --help\n");
  return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // nothing has been allocated yet, so we can simply return
  return SUCCESS_RETURN_CODE;
}

vector_offset* v_input_offsets=NULL;
vector_offset* v_output_offsets=NULL;
U_FILE* f_output_offsets=NULL;

if (output_offsets[0]!='\0') {
  /* We deal with offsets only if we have to produce output offsets */
  if (input_offsets[0]!='\0') {
    v_input_offsets=load_offsets(&vec,input_offsets);
  }
  f_output_offsets=u_fopen(&vec, output_offsets, U_WRITE);
  if (f_output_offsets==NULL) {
    error("Cannot create offset file %s\n",output_offsets);
    return DEFAULT_ERROR_CODE;
  }
  v_output_offsets=new_vector_offset();
}
char tmp_file[FILENAME_MAX];
get_extension(argv[options.vars()->optind],tmp_file);
if (!strcmp(tmp_file, ".snt")) {
   /* If the file to process already has the .snt extension, we temporarily
    * rename it to .snt.normalizing */
  strcpy(tmp_file,argv[options.vars()->optind]);
  strcat(tmp_file,".normalizing");
  af_rename(argv[options.vars()->optind],tmp_file);
} else {
   strcpy(tmp_file,argv[options.vars()->optind]);
}
/* We set the destination file */
char dest_file[FILENAME_MAX];
remove_extension(argv[options.vars()->optind],dest_file);
strcat(dest_file,".snt");
u_printf("Normalizing %s...\n",argv[options.vars()->optind]);

int return_value = normalize(tmp_file,
                             dest_file,
                             &vec,
                             mode,
                             convLFtoCRLF,
                             rules,
                             v_output_offsets,
                             separator_normalization);
u_printf("\n");
/* If we have used a temporary file, we delete it */
if (strcmp(tmp_file,argv[options.vars()->optind])) {
   af_remove(tmp_file);
}
process_offsets(v_input_offsets,v_output_offsets,f_output_offsets);
u_fclose(f_output_offsets);
free_vector_offset(v_input_offsets);
free_vector_offset(v_output_offsets);
u_printf((return_value==SUCCESS_RETURN_CODE) ? "Done.\n" : "Unsuccessful.\n");

return return_value;
}
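Stripped of option parsing and file renaming, the offset handling in this main is the general pattern by which Unitex tools chain offset files: load the previous mapping, collect the new one, then compose and save. A distilled sketch, using only calls that appear above (error handling shortened):

int chain_offsets(const VersatileEncodingConfig* vec,
                  const char* prev_offsets_file,  /* original => input */
                  vector_offset* new_offsets,     /* input => output   */
                  const char* out_offsets_file) {
	vector_offset* prev = load_offsets(vec, prev_offsets_file);
	U_FILE* f = u_fopen(vec, out_offsets_file, U_WRITE);
	if (f == NULL) {
		free_vector_offset(prev);
		return DEFAULT_ERROR_CODE;
	}
	process_offsets(prev, new_offsets, f); // composes and saves original => output
	u_fclose(f);
	free_vector_offset(prev);
	return SUCCESS_RETURN_CODE;
}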
Example No. 5
/**
 * Translates a set of offsets.
 * ofs is an array of nb_translations items of type offset_translation.
 * Before calling the function, you must fill, for each item i:
 *   (ofs + i)->position_to_translate with the position (offset) to translate
 *   (ofs + i)->sort_order = i (the array is sorted by position_to_translate,
 *     then sorted back by sort_order before returning)
 */
void translate_offset(offset_translation* ofs, int nb_translations, const vector_offset* offsets, int revert) {
if (offsets == NULL) {
	return;
}
if ((nb_translations == 0) || (ofs == NULL)) {
	return;
}
int i;
int sorted_by_position = 1;
for (i = 1;i < nb_translations;i++) 
	if (((ofs + i)->position_to_translate) < ((ofs + i - 1)->position_to_translate)) {
		sorted_by_position = 0;
		break;
	}
if (!sorted_by_position) {
	qsort(ofs,nb_translations,sizeof(offset_translation), compare_offset_translation_by_position);
}

int last_position_in_offsets = 0;
if (offsets->nbelems > 0) {
	// guard: reading tab[nbelems-1] on an empty vector would be out of bounds
	last_position_in_offsets = revert ?
		(offsets->tab[offsets->nbelems - 1].new_start + offsets->tab[offsets->nbelems - 1].new_end) :
		(offsets->tab[offsets->nbelems - 1].old_start + offsets->tab[offsets->nbelems - 1].old_end);
}
int last_position_to_translate = (ofs + nb_translations - 1)->position_to_translate;
int minimal_filesize = offset_max(last_position_in_offsets, last_position_to_translate);


vector_offset* common_offsets = modified_offsets_to_common(offsets, -1, minimal_filesize);

int pos_common_offsets = 0;
for (i = 0; i < nb_translations; i++) {
	int pos_to_translate = (ofs + i)->position_to_translate;
	for (;;) {
		if (pos_common_offsets == common_offsets->nbelems)
			break;
		int end_current_common = revert ?
			(common_offsets->tab[pos_common_offsets].new_end) :
			(common_offsets->tab[pos_common_offsets].old_end);
		if (end_current_common > pos_to_translate)
			break;
		pos_common_offsets++;
	}

	int current_common_start = -1;
	int current_common_end = -1;

	if (pos_common_offsets < common_offsets->nbelems) {
		current_common_start = revert ?
			(common_offsets->tab[pos_common_offsets].new_start) :
			(common_offsets->tab[pos_common_offsets].old_start);
		current_common_end = revert ?
			(common_offsets->tab[pos_common_offsets].new_end) :
			(common_offsets->tab[pos_common_offsets].old_end);
		int translate_common_start = revert ?
			(common_offsets->tab[pos_common_offsets].old_start) :
			(common_offsets->tab[pos_common_offsets].new_start);

		if ((pos_to_translate >= current_common_start) && (pos_to_translate < current_common_end)) {
			(ofs + i)->translated_position = translate_common_start + (pos_to_translate - current_common_start);
			(ofs + i)->translation_pos_in_common = 1;
		}
		else {
			(ofs + i)->translated_position = translate_common_start + (current_common_end - current_common_start);
			(ofs + i)->translation_pos_in_common = -1;
		}


	} else {
		int last_pos_translated = 0;
		if (common_offsets->nbelems > 0)
			last_pos_translated = revert ?
				((common_offsets->tab[common_offsets->nbelems-1].old_start) -1) :
				((common_offsets->tab[common_offsets->nbelems-1].new_start) -1);
		if (last_pos_translated == -1)
			last_pos_translated = 0;
		(ofs + i)->translated_position = last_pos_translated;
		(ofs + i)->translation_pos_in_common = -1;
	}

}

free_vector_offset(common_offsets);

int sorted_by_sort_order = 1;
for (i = 1;i < nb_translations;i++)
	if (((ofs + i)->sort_order) < ((ofs + i - 1)->sort_order)) {
		sorted_by_sort_order = 0;
		break;
	}
if (!sorted_by_sort_order) {
	qsort(ofs, nb_translations, sizeof(offset_translation), compare_offset_translation_by_sort_order);
}
}
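A minimal usage sketch, assuming offset_translation is a plain struct with exactly the fields used above:

void demo_translate(const vector_offset* offsets) {
	offset_translation tr[2];
	tr[0].position_to_translate = 10; tr[0].sort_order = 0;
	tr[1].position_to_translate = 25; tr[1].sort_order = 1;
	translate_offset(tr, 2, offsets, 0); // revert=0: translate old => new positions
	// tr[i].translated_position now holds the mapped position;
	// tr[i].translation_pos_in_common is 1 if it fell inside a common zone,
	// -1 if it had to be approximated from the nearest common zone
}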
Example No. 6
vector_offset* process_common_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets)
{
	if ((first_offsets == NULL) || (second_offsets == NULL)) {
		return NULL;
	}
	int first_nb_offsets_items = first_offsets->nbelems;
	int second_nb_offsets_items = second_offsets->nbelems;
	vector_offset* merged_vector_offset = new_vector_offset(first_nb_offsets_items + second_nb_offsets_items + 1);

	int pos_in_first_offsets = 0;
	for (int i = 0; i < second_nb_offsets_items; i++)
	{
		Offsets common_offset_in_second = second_offsets->tab[i];
		if ((common_offset_in_second.old_end - common_offset_in_second.old_start) !=
			(common_offset_in_second.new_end - common_offset_in_second.new_start))
		{
			free_vector_offset(merged_vector_offset);
			error("Invalid common offset file");
			return NULL;
		}

		while ((common_offset_in_second.old_end - common_offset_in_second.old_start) != 0) {
			for (;;)
			{
				if (pos_in_first_offsets == first_nb_offsets_items) {
					// we have no common part in first file to process
					return merged_vector_offset;
				}

				if (first_offsets->tab[pos_in_first_offsets].new_end > common_offset_in_second.old_start) {
					break;
				}
				pos_in_first_offsets++;
			}

			int nb_common = 0;
			Offsets current_common_in_first = first_offsets->tab[pos_in_first_offsets];
			if (current_common_in_first.new_start > common_offset_in_second.old_start) {
				int skip_second = current_common_in_first.new_start - common_offset_in_second.old_start;
				common_offset_in_second.old_start += skip_second;
				common_offset_in_second.new_start += skip_second;
			}
			if (current_common_in_first.new_start < common_offset_in_second.old_start) {
				int skip_first = common_offset_in_second.old_start - current_common_in_first.new_start;
				current_common_in_first.old_start += skip_first;
				current_common_in_first.new_start += skip_first;
			}

			int len_common = offset_min(current_common_in_first.new_end - current_common_in_first.new_start,
				common_offset_in_second.old_end - common_offset_in_second.old_start);
			if (len_common > 0) {
				Offsets CommonOffsetToWrite;
				nb_common = len_common;
				int shift_in_first = 0;
				CommonOffsetToWrite.old_start = current_common_in_first.old_start + shift_in_first;
				CommonOffsetToWrite.old_end = CommonOffsetToWrite.old_start + nb_common;
				CommonOffsetToWrite.new_start = common_offset_in_second.new_start;
				CommonOffsetToWrite.new_end = CommonOffsetToWrite.new_start + nb_common;
				vector_offset_add_with_merging(merged_vector_offset, CommonOffsetToWrite);
				common_offset_in_second.old_start += nb_common;
				common_offset_in_second.new_start += nb_common;
			}
			else {
				break;
			}
		}
	}

	return merged_vector_offset;
}
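A worked example (values chosen for illustration): intersecting the common spans of A => B with those of B => C keeps only the text that survived both passes.

void demo_intersect_commons(void) {
	vector_offset* ab = new_vector_offset();
	vector_offset* bc = new_vector_offset();
	Offsets c;
	// bytes [0,6) of A are identical to bytes [0,6) of B
	c.old_start = 0; c.old_end = 6; c.new_start = 0; c.new_end = 6;
	vector_offset_add(ab, c);
	// bytes [2,8) of B are identical to bytes [0,6) of C
	c.old_start = 2; c.old_end = 8; c.new_start = 0; c.new_end = 6;
	vector_offset_add(bc, c);
	// Expected intersection: bytes [2,6) of A == bytes [0,4) of C
	vector_offset* ac = process_common_offsets(ab, bc);
	free_vector_offset(ab);
	free_vector_offset(bc);
	free_vector_offset(ac);
}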
Example No. 7
/**
 * Converts offset data describing modified zones into the list of common
 * (unchanged) zones. If old_size or new_size is -1, the missing size is
 * deduced from the other one using the global shift.
 */
vector_offset* modified_offsets_to_common(const vector_offset* offsets, int old_size, int new_size) {
	if (offsets == NULL) {
		// guard: mirror the NULL checks of the other functions above
		return NULL;
	}
	if ((old_size == -1) && (new_size != -1)) {
		old_size = new_size - global_shift_from_modified_offsets(offsets);
	} else if ((old_size != -1) && (new_size == -1)) {
		new_size = old_size + global_shift_from_modified_offsets(offsets);
	}

	int nb_offsets_items = offsets->nbelems;
	vector_offset* inverted_vector_offset = new_vector_offset(nb_offsets_items + 1);
	for (int i = 0; i < nb_offsets_items; i++) {
		Offsets curOffset = offsets->tab[i];
		Offsets prevOffset;
		if (i > 0) {
			prevOffset = offsets->tab[i - 1];
		}
		else {
			prevOffset.old_end = prevOffset.new_end = 0;
		}

		Offsets CommonOffset;
		CommonOffset.old_start = prevOffset.old_end;
		CommonOffset.old_end = curOffset.old_start;
		CommonOffset.new_start = prevOffset.new_end;
		CommonOffset.new_end = curOffset.new_start;
		if (DetectCommonIncoherency(old_size, CommonOffset.old_start, CommonOffset.old_end,
			new_size, CommonOffset.new_start, CommonOffset.new_end)) {
			free_vector_offset(inverted_vector_offset);
			error("coherency problem on offset file");
			return NULL;
		}

		if (CommonOffset.new_start != CommonOffset.new_end) {
			vector_offset_add(inverted_vector_offset, CommonOffset);
		}
	}

	Offsets LastCommonOffset;
	if (nb_offsets_items > 0) {
		LastCommonOffset.old_start = offsets->tab[nb_offsets_items - 1].old_end;
		LastCommonOffset.new_start = offsets->tab[nb_offsets_items - 1].new_end;
	} else {
		LastCommonOffset.old_start = 0;
		LastCommonOffset.new_start = 0;
	}
	LastCommonOffset.old_end = old_size;
	LastCommonOffset.new_end = new_size;


	if (DetectCommonIncoherency(old_size, LastCommonOffset.old_start, LastCommonOffset.old_end,
		new_size, LastCommonOffset.new_start, LastCommonOffset.new_end)) {
		free_vector_offset(inverted_vector_offset);
		error("coherency problem on offset file");
		return NULL;
	}

	if (LastCommonOffset.new_start != LastCommonOffset.new_end) {
		vector_offset_add(inverted_vector_offset, LastCommonOffset);
	}

	return inverted_vector_offset;
}
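This is the inverse direction of Example No. 3. A tiny worked example: a single modified span [2,4) => [2,3) in a file of old size 10 (the new size, passed as -1, is deduced as 9 from the global shift) yields the common spans on both sides of it.

void demo_modified_to_common(void) {
	vector_offset* modified = new_vector_offset();
	Offsets m;
	m.old_start = 2; m.old_end = 4; m.new_start = 2; m.new_end = 3;
	vector_offset_add(modified, m);
	// Expected common spans: [0,2) => [0,2) and [4,10) => [3,9)
	vector_offset* common = modified_offsets_to_common(modified, 10, -1);
	free_vector_offset(modified);
	free_vector_offset(common);
}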