Example #1
0
/**
 * Reads the offset file 'name' (opened through 'vec' in U_READ mode) and
 * stores every record it contains into a freshly created vector_offset.
 * A record is made of four integers, passed to vector_offset_add() in the
 * order in which they were read.
 *
 * Returns NULL if the file cannot be opened. A record for which fewer than
 * four integers could be read is reported with fatal_error().
 */
vector_offset* load_offsets(const VersatileEncodingConfig* vec,const char* name) {
U_FILE* input=u_fopen(vec,name,U_READ);
if (input==NULL) {
	return NULL;
}
vector_offset* offsets=new_vector_offset();
int v0,v1,v2,v3;
for (;;) {
	const int nb_read=u_fscanf(input,"%d%d%d%d",&v0,&v1,&v2,&v3);
	if (nb_read==EOF) {
		/* Nothing left to read */
		break;
	}
	if (nb_read!=4) {
		/* Incomplete record */
		fatal_error("Corrupted offset file %s\n",name);
	}
	vector_offset_add(offsets,v0,v1,v2,v3);
}
u_fclose(input);
return offsets;
}
Example #2
0
/**
 * Builds the list of zones lying between the given common zones.
 *
 * For each common zone, the gap that separates it from the previous common
 * zone (or from position 0 for the first one) is emitted; a final gap going
 * from the end of the last common zone up to old_size/new_size is emitted as
 * well. A gap is only stored when it is non empty on the old side or on the
 * new side.
 *
 * Returns NULL if common_offsets is NULL, or if one common zone does not have
 * the same length on its old and new sides (an error message is then
 * printed). Otherwise returns a newly created vector_offset.
 */
vector_offset* common_offsets_to_modified(const vector_offset* common_offsets, int old_size, int new_size) {
	if (common_offsets == NULL) {
		return NULL;
	}
	const int nb_common = common_offsets->nbelems;
	vector_offset* gaps = new_vector_offset(nb_common + 2);

	/* End positions of the most recently visited common zone */
	int prev_old_end = 0;
	int prev_new_end = 0;

	for (int idx = 0; idx < nb_common; idx++) {
		const Offsets common = common_offsets->tab[idx];
		const int old_len = common.old_end - common.old_start;
		const int new_len = common.new_end - common.new_start;
		if (old_len != new_len) {
			error("Mismatch in length in common offset");
			free_vector_offset(gaps);
			return NULL;
		}

		/* The gap runs from the end of the previous common zone to the
		 * beginning of the current one */
		Offsets gap;
		gap.old_start = prev_old_end;
		gap.new_start = prev_new_end;
		gap.old_end = common.old_start;
		gap.new_end = common.new_start;
		if (!((gap.old_end <= gap.old_start) && (gap.new_end <= gap.new_start))) {
			vector_offset_add(gaps, gap);
		}

		prev_old_end = common.old_end;
		prev_new_end = common.new_end;
	}

	/* Trailing gap, from the last common zone to the end of both texts */
	Offsets tail;
	tail.old_start = prev_old_end;
	tail.new_start = prev_new_end;
	tail.old_end = old_size;
	tail.new_end = new_size;
	if (!((tail.old_end <= tail.old_start) && (tail.new_end <= tail.new_start))) {
		vector_offset_add(gaps, tail);
	}

	return gaps;
}
Example #3
0
/**
 * Builds the list of zones lying between the zones of 'offsets'.
 *
 * For each entry of 'offsets', the zone going from the end of the previous
 * entry (or from position 0 for the first entry) to the start of the current
 * entry is added to the result. A last zone, going from the end of the last
 * entry (or from 0 if there is no entry) to old_size/new_size, is added when
 * it is non empty on the old side or on the new side.
 *
 * Returns NULL if 'offsets' is NULL; otherwise returns a newly created
 * vector_offset.
 */
vector_offset* common_offsets_to_modifed(const vector_offset* offsets, int old_size, int new_size) {
	if (offsets == NULL) {
		/* Nothing to convert; avoids dereferencing a NULL vector */
		return NULL;
	}

	int nb_offsets_items = offsets->nbelems;
	vector_offset* inverted_vector_offset = new_vector_offset(nb_offsets_items + 2);

	/* End positions of the previously visited entry (0 before the first one) */
	int prev_old_end = 0;
	int prev_new_end = 0;
	for (int i = 0; i < nb_offsets_items; i++) {
		Offsets curOffset = offsets->tab[i];

		Offsets DifferentOffset;
		DifferentOffset.old_start = prev_old_end;
		DifferentOffset.old_end = curOffset.old_start;
		DifferentOffset.new_start = prev_new_end;
		DifferentOffset.new_end = curOffset.new_start;

		/* NOTE(review): zones that are empty on both sides are still added
		 * here, unlike the trailing zone below - confirm whether callers
		 * rely on that */
		vector_offset_add(inverted_vector_offset, DifferentOffset);

		prev_old_end = curOffset.old_end;
		prev_new_end = curOffset.new_end;
	}

	Offsets LastDifferentOffset;
	LastDifferentOffset.old_start = prev_old_end;
	LastDifferentOffset.new_start = prev_new_end;
	LastDifferentOffset.old_end = old_size;
	LastDifferentOffset.new_end = new_size;

	/* The trailing zone is kept when it is non empty on either side. The
	 * second operand must test the "new" positions: testing the "old" ones
	 * twice would drop a trailing zone that only exists in the new text. */
	if ((LastDifferentOffset.old_start != LastDifferentOffset.old_end) ||
		(LastDifferentOffset.new_start != LastDifferentOffset.new_end)) {

		vector_offset_add(inverted_vector_offset, LastDifferentOffset);
	}

	return inverted_vector_offset;
}
Example #4
0
/**
 * Command-line entry point of Fst2Txt.
 *
 * Parses the options into a fst2txt_parameters structure, checks that exactly
 * one non-option argument (the fst2 file) remains and that a text file was
 * given with -t, optionally sets up the input/output offset files, then calls
 * main_fst2txt() and returns its result.
 *
 * Returns SUCCESS_RETURN_CODE when only the usage is printed or when -V
 * (argument verification only) is given, USAGE_ERROR_CODE for invalid
 * arguments (including file names that do not fit in FILENAME_MAX
 * characters), ALLOC_ERROR_CODE when a string duplication fails and
 * DEFAULT_ERROR_CODE when an offset file cannot be loaded or created.
 * The parameter structure is released on every exit path.
 */
int main_Fst2Txt(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

struct fst2txt_parameters* p=new_fst2txt_parameters();
char in_offsets[FILENAME_MAX]="";
char out_offsets[FILENAME_MAX]="";
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;

while (EOF!=(val=options.parse_long(argc,argv,optstring_Fst2Txt,lopts_Fst2Txt,&index))) {
   switch(val) {
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             p->input_text_file=strdup(options.vars()->optarg);
             if (p->input_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 'o': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty text output file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             p->output_text_file=strdup(options.vars()->optarg);
             p->output_text_file_is_temp=0;
             if (p->output_text_file==NULL) {
                alloc_error("main_Fst2Txt");
                free_fst2txt_parameters(p);
                return ALLOC_ERROR_CODE;
             }
             break;
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             p->alphabet_file=strdup(options.vars()->optarg);
             if (p->alphabet_file==NULL) {
               alloc_error("main_Fst2Txt");
               free_fst2txt_parameters(p);
               return ALLOC_ERROR_CODE;
             }
             break;
   case 'M': p->output_policy=MERGE_OUTPUTS; break;
   case 'R': p->output_policy=REPLACE_OUTPUTS; break;
   case 'c': p->tokenization_policy=CHAR_BY_CHAR_TOKENIZATION; break;
   case 'w': p->tokenization_policy=WORD_BY_WORD_TOKENIZATION; break;
   case 's': p->space_policy=START_WITH_SPACE; break;
   case 'x': p->space_policy=DONT_START_WITH_SPACE; break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             free_fst2txt_parameters(p);
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Fst2Txt[index].name);
             free_fst2txt_parameters(p);
             return USAGE_ERROR_CODE;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(p->vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(p->vec.encoding_output),&(p->vec.bom_output),options.vars()->optarg);
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             /* The argument comes from the command line: make sure it fits
              * in the fixed-size buffer before copying it */
             if (strlen(options.vars()->optarg)>=(size_t)FILENAME_MAX) {
                error("Too long input_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             strcpy(in_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             /* Same bound check as for the input offset file name */
             if (strlen(options.vars()->optarg)>=(size_t)FILENAME_MAX) {
                error("Too long output_offsets argument\n");
                free_fst2txt_parameters(p);
                return USAGE_ERROR_CODE;
             }
             strcpy(out_offsets,options.vars()->optarg);
             break;
   case 'l': p->convLFtoCRLF=0; break;
   case 'r': p->keepCR = 1; break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             free_fst2txt_parameters(p);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

/* Exactly one non-option argument is expected: the fst2 file */
if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   free_fst2txt_parameters(p);
   return USAGE_ERROR_CODE;
}

if (p->input_text_file==NULL) {
   error("You must specify the text file\n");
   free_fst2txt_parameters(p);
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  free_fst2txt_parameters(p);
  return SUCCESS_RETURN_CODE;
}

if (out_offsets[0]!='\0') {
	/* We deal with offsets only if the program is expected to produce some */
	if (in_offsets[0]!='\0') {
		p->v_in_offsets=load_offsets(&(p->vec),in_offsets);
		if (p->v_in_offsets==NULL) {
			error("Cannot load offset file %s\n",in_offsets);
			free_fst2txt_parameters(p);
			return DEFAULT_ERROR_CODE;
		}
	} else {
		/* If there is no input offset file, we create an empty offset vector
		 * in order to avoid testing whether the vector is NULL or not */
		p->v_in_offsets=new_vector_offset(1);
	}
	p->f_out_offsets=u_fopen(&(p->vec),out_offsets,U_WRITE);
	if (p->f_out_offsets==NULL) {
		error("Cannot create file %s\n",out_offsets);
		free_fst2txt_parameters(p);
		return DEFAULT_ERROR_CODE;
	}
}

if (p->output_text_file == NULL) {
	/* No output file was given: work on <input without extension>.tmp */
	char tmp[FILENAME_MAX];
	/* remove_extension() writes into tmp; presumably it never produces a
	 * string longer than its input (TODO confirm), so bounding the input
	 * length keeps that write inside the buffer */
	if (strlen(p->input_text_file)>=(size_t)FILENAME_MAX) {
		error("Too long text file name\n");
		free_fst2txt_parameters(p);
		return USAGE_ERROR_CODE;
	}
	remove_extension(p->input_text_file, tmp);
	/* Make sure the ".tmp" suffix still fits */
	if (strlen(tmp)+strlen(".tmp")>=(size_t)FILENAME_MAX) {
		error("Too long text file name\n");
		free_fst2txt_parameters(p);
		return USAGE_ERROR_CODE;
	}
	strcat(tmp, ".tmp");
	p->output_text_file_is_temp=1;
	p->output_text_file = strdup(tmp);
	if (p->output_text_file == NULL) {
		alloc_error("main_Fst2Txt");
		free_fst2txt_parameters(p);
		return ALLOC_ERROR_CODE;
	}
}
p->fst_file=strdup(argv[options.vars()->optind]);
if (p->fst_file==NULL) {
   alloc_error("main_Fst2Txt");
   free_fst2txt_parameters(p);
   return ALLOC_ERROR_CODE;
}

int result=main_fst2txt(p);

free_fst2txt_parameters(p);
return result;
}
Example #5
0
int main_Normalize(int argc,char* const argv[]) {
if (argc==1) {
  usage();
  return SUCCESS_RETURN_CODE;
}
int mode=KEEP_CARRIAGE_RETURN;
int separator_normalization=1;
char rules[FILENAME_MAX]="";
char input_offsets[FILENAME_MAX]="";
char output_offsets[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int convLFtoCRLF=1;
int val,index=-1;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Normalize,lopts_Normalize,&index))) {
   switch(val) {
   case 'l': convLFtoCRLF=0; break;
   case 'n': mode=REMOVE_CARRIAGE_RETURN; break;
   case 'r': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty replacement rule file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(rules,options.vars()->optarg);
             break;
   case 1: separator_normalization=0; break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
              error("Empty input_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
              error("Empty output_encoding argument\n");
              return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;
   case '$': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty input offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(input_offsets,options.vars()->optarg);
             break;
   case '@': if (options.vars()->optarg[0]=='\0') {
              error("You must specify a non empty output offset file name\n");
              return USAGE_ERROR_CODE;
             }
             strcpy(output_offsets,options.vars()->optarg);
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage();
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Normalize[index].name);
             return USAGE_ERROR_CODE;
             break;
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
             break;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
  error("Invalid arguments: rerun with --help\n");
  return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

vector_offset* v_input_offsets=NULL;
vector_offset* v_output_offsets=NULL;
U_FILE* f_output_offsets=NULL;

if (output_offsets[0]!='\0') {
  /* We deal with offsets only if we have to produce output offsets */
  if (input_offsets[0]!='\0') {
    v_input_offsets=load_offsets(&vec,input_offsets);
  }
  f_output_offsets=u_fopen(&vec, output_offsets, U_WRITE);
  if (f_output_offsets==NULL) {
    error("Cannot create offset file %s\n",output_offsets);
    return DEFAULT_ERROR_CODE;
  }
  v_output_offsets=new_vector_offset();
}
char tmp_file[FILENAME_MAX];
get_extension(argv[options.vars()->optind],tmp_file);
if (!strcmp(tmp_file, ".snt")) {
   /* If the file to process has already the .snt extension, we temporary rename it to
   * .snt.normalizing */
  strcpy(tmp_file,argv[options.vars()->optind]);
  strcat(tmp_file,".normalizing");
  af_rename(argv[options.vars()->optind],tmp_file);
} else {
   strcpy(tmp_file,argv[options.vars()->optind]);
}
/* We set the destination file */
char dest_file[FILENAME_MAX];
remove_extension(argv[options.vars()->optind],dest_file);
strcat(dest_file,".snt");
u_printf("Normalizing %s...\n",argv[options.vars()->optind]);

int return_value = normalize(tmp_file,
                             dest_file,
                             &vec,
                             mode,
                             convLFtoCRLF,
                             rules,
                             v_output_offsets,
                             separator_normalization);
u_printf("\n");
/* If we have used a temporary file, we delete it */
if (strcmp(tmp_file,argv[options.vars()->optind])) {
   af_remove(tmp_file);
}
process_offsets(v_input_offsets,v_output_offsets,f_output_offsets);
u_fclose(f_output_offsets);
free_vector_offset(v_input_offsets);
free_vector_offset(v_output_offsets);
u_printf((return_value==SUCCESS_RETURN_CODE) ? "Done.\n" : "Unsuccessfull.\n");

return return_value;
}
Example #6
0
/**
 * Combines two lists of common zones into one.
 *
 * The "new" positions of first_offsets and the "old" positions of
 * second_offsets are compared with each other: for every stretch where a
 * zone of the first list (new side) overlaps a zone of the second list
 * (old side), a zone going from the matching "old" positions of the first
 * list to the matching "new" positions of the second list is added to the
 * result with vector_offset_add_with_merging().
 *
 * Returns NULL if either list is NULL, or if a zone of second_offsets does
 * not have the same length on its old and new sides (an error message is
 * then printed). Otherwise returns a newly created vector_offset; the scan
 * stops as soon as the first list is exhausted.
 */
vector_offset* process_common_offsets(const vector_offset* first_offsets, const vector_offset* second_offsets)
{
	if (first_offsets == NULL || second_offsets == NULL) {
		return NULL;
	}
	const int nb_first = first_offsets->nbelems;
	const int nb_second = second_offsets->nbelems;
	vector_offset* merged = new_vector_offset(nb_first + nb_second + 1);

	/* Index of the current zone in the first list; it only moves forward */
	int first_idx = 0;
	for (int second_idx = 0; second_idx < nb_second; second_idx++) {
		/* Working copy: its start positions advance as parts get consumed */
		Offsets second = second_offsets->tab[second_idx];
		if ((second.old_end - second.old_start) != (second.new_end - second.new_start)) {
			free_vector_offset(merged);
			error("Invalid common offset file");
			return NULL;
		}

		while (second.old_end != second.old_start) {
			/* Skip the zones of the first list that end at or before the
			 * current position in the second zone */
			while ((first_idx < nb_first) &&
				(first_offsets->tab[first_idx].new_end <= second.old_start)) {
				first_idx++;
			}
			if (first_idx == nb_first) {
				// we have no common part in first file to process
				return merged;
			}

			/* Align both zones on the same starting position */
			Offsets first = first_offsets->tab[first_idx];
			const int gap = first.new_start - second.old_start;
			if (gap > 0) {
				/* The first zone starts later: drop the head of the second one */
				second.old_start += gap;
				second.new_start += gap;
			} else if (gap < 0) {
				/* The second zone starts later: drop the head of the first one */
				first.old_start -= gap;
				first.new_start -= gap;
			}

			const int overlap = offset_min(first.new_end - first.new_start,
				second.old_end - second.old_start);
			if (overlap <= 0) {
				/* Nothing left in common for this zone of the second list */
				break;
			}

			Offsets combined;
			combined.old_start = first.old_start;
			combined.old_end = combined.old_start + overlap;
			combined.new_start = second.new_start;
			combined.new_end = combined.new_start + overlap;
			vector_offset_add_with_merging(merged, combined);

			/* Consume the part that has just been written */
			second.old_start += overlap;
			second.new_start += overlap;
		}
	}

	return merged;
}
Example #7
0
/**
 * Converts a list of modified zones into the list of common zones, i.e. the
 * zones lying between two consecutive modified zones, before the first one
 * (starting at position 0) and after the last one (up to old_size/new_size).
 *
 * If exactly one of old_size and new_size is -1, it is deduced from the
 * other one and from global_shift_from_modified_offsets(offsets).
 *
 * Every candidate zone is checked with DetectCommonIncoherency(); if that
 * check fails, an error message is printed and NULL is returned. Zones that
 * are empty on the new side are not stored.
 */
vector_offset* modified_offsets_to_common(const vector_offset* offsets, int old_size, int new_size) {
	if (old_size == -1) {
		if (new_size != -1) {
			old_size = new_size - global_shift_from_modified_offsets(offsets);
		}
	} else if (new_size == -1) {
		new_size = old_size + global_shift_from_modified_offsets(offsets);
	}

	const int nb_modified = offsets->nbelems;
	vector_offset* common_zones = new_vector_offset(nb_modified + 1);

	/* End positions of the previously visited modified zone */
	int prev_old_end = 0;
	int prev_new_end = 0;

	/* The extra iteration (idx == nb_modified) handles the zone located
	 * after the last modified zone, which ends at old_size/new_size */
	for (int idx = 0; idx <= nb_modified; idx++) {
		const bool is_tail = (idx == nb_modified);

		Offsets zone;
		zone.old_start = prev_old_end;
		zone.new_start = prev_new_end;
		if (is_tail) {
			zone.old_end = old_size;
			zone.new_end = new_size;
		} else {
			zone.old_end = offsets->tab[idx].old_start;
			zone.new_end = offsets->tab[idx].new_start;
		}

		if (DetectCommonIncoherency(old_size, zone.old_start, zone.old_end,
			new_size, zone.new_start, zone.new_end)) {
			free_vector_offset(common_zones);
			error("coherency problem on offset file");
			return NULL;
		}

		if (zone.new_start != zone.new_end) {
			vector_offset_add(common_zones, zone);
		}

		if (!is_tail) {
			prev_old_end = offsets->tab[idx].old_end;
			prev_new_end = offsets->tab[idx].new_end;
		}
	}

	return common_zones;
}