Example #1
0
/**
 * Explores the node n, dumps the corresponding lines to the output file,
 * and then frees the node. 'pos' is the current position in the string 's'.
 */
int explore_node(struct sort_tree_node* n, struct sort_infos* inf,
    struct dela_entry* *last) {
  int i, N;
  struct sort_tree_transition* t = NULL;
  struct couple* couple = NULL;
  struct couple* tmp    = NULL;
  if (n == NULL) {
    error("Internal error in explore_node\n");
    return DEFAULT_ERROR_CODE;
  }
  if (n->couples != NULL) {
    /* If the node is a final one, we print the corresponding lines */
    couple = n->couples;
    while (couple != NULL) {
      if (inf->factorize_inflectional_codes) {
        /* We look if the previously printed line, if any, did share
         * the same information. If so, we just append the new inflectional codes.
         * Otherwise, we print the new line.
         *
         * NOTE: in factorize mode, we always ignore duplicates */
        int err;
        struct dela_entry* entry = tokenize_DELAF_line(couple->s,1,&err,0);
        if (entry==NULL) {
          /* We have a non DELAF entry line, like for instance a comment one */
          if (*last!=NULL && *last!=(struct dela_entry*)-1) {
            /* If there was at least one line already printed, then this line
             * awaits for its \n */
            u_fprintf(inf->f_out, "\n");
          }
          /* Then we print the line */
          u_fprintf(inf->f_out, "%S\n",couple->s);
          /* And we reset *last */
          if (*last==(struct dela_entry*)-1) {
            *last=NULL;
          } else if (*last!=NULL) {
            free_dela_entry(*last);
            *last=NULL;
          }
        } else {
          /* So, we have a dic entry. Was there a previous one ? */
          if (*last==NULL || *last==(struct dela_entry*)-1) {
            /* No ? So we print the line, and the current entry becomes *last */
            u_fputs(couple->s, inf->f_out);
            *last=entry;
          } else {
            /* Yes ? We must compare if the codes are compatible */
            if (are_compatible(*last,entry)) {
              /* We look for any code of entry if it was already in *last */
              for (int j=0;j<entry->n_inflectional_codes;j++) {
                if (!dic_entry_contain_inflectional_code(*last,entry->inflectional_codes[j])) {
                  u_fprintf(inf->f_out, ":%S",entry->inflectional_codes[j]);
                  /* We also have to add the newly printed code to *last */
                  (*last)->inflectional_codes[((*last)->n_inflectional_codes)++]=u_strdup(entry->inflectional_codes[j]);
                }
              }
              /* And we must free entry */
              free_dela_entry(entry);
            } else {
              /* If codes are not compatible, we print the \n for the previous
               * line, then the current line that becomes *last */
              u_fprintf(inf->f_out, "\n%S",couple->s);
              free_dela_entry(*last);
              *last=entry;
            }
          }
        }
      } else {
        /* Normal way: we print each line one after the other */
        for (i = 0; i < couple->n; i++) {
          u_fprintf(inf->f_out, "%S\n", couple->s);
          (inf->resulting_line_number)++;
        }
      }
      tmp = couple;
      couple = couple->next;
      free(tmp->s);
      free(tmp);
    }
    n->couples = NULL;
  }
  /* We convert the transition list into a sorted array */
  t = n->transitions;
  N = 0;
  while (t != NULL && N < 0x10000) {
    inf->transitions[N++] = t;
    t = t->next;
  }
  if (N == 0x10000) {
    error("Internal error in explore_node: more than 0x10000 nodes\n");
    free_sort_tree_node(n);
    return DEFAULT_ERROR_CODE;
  }
  if (N > 1)
    quicksort(inf->transitions, 0, N - 1, inf);
  /* After sorting, we copy the result into the transitions of n */
  for (int j = 0; j < N - 1; j++) {
    inf->transitions[j]->next = inf->transitions[j + 1];
  }
  if (N > 0) {
    inf->transitions[N - 1]->next = NULL;
    n->transitions = inf->transitions[0];
  }
  /* Finally, we explore the outgoing transitions */
  t = n->transitions;
  int explore_return_value = SUCCESS_RETURN_CODE;

  while (t != NULL && explore_return_value == SUCCESS_RETURN_CODE) {
    explore_return_value = explore_node(t->node, inf, last);
    if(explore_return_value == SUCCESS_RETURN_CODE) {
      t = t->next;
    }
  }

  /* And we free the node */
  free_sort_tree_node(n);
  return explore_return_value;
}
/**
 * This function saves the current match list in the concordance index file.
 * It is derived from the 'save_matches' from 'Matches.cpp'. At the opposite of
 * 'save_matches', this function is not parameterized by the current position in
 * the text. We assume that this function is called once per sentence automaton, after
 * all matches have been computed.
 *
 * Each match of p->matches is written to p->output on its own line as
 * "start_token.start_char.start_letter end_token.end_char.end_letter",
 * followed, if the match has an output, by a space, the tagging information
 * (in tagging mode), the debug information (in debug mode) and the output
 * itself. Each cell is freed once it has been written.
 *
 * p->number_of_matches is updated for every written match; when ambiguous
 * outputs are allowed, p->number_of_outputs counts every written line while
 * p->number_of_matches only counts lines whose positions differ from those of
 * the previously written match. As soon as p->number_of_matches equals
 * p->search_limit, the remaining matches are freed without being written.
 *
 * On return, p->matches is always NULL.
 *
 * The list is walked with a loop rather than with one recursive call per
 * match, so that the stack usage does not depend on the number of matches.
 */
void save_tfst_matches(struct locate_tfst_infos* p) {
U_FILE* f=p->output;
struct tfst_simple_match_list* l=p->matches;
struct tfst_simple_match_list* ptr;
while (l!=NULL && p->number_of_matches!=p->search_limit) {
	u_fprintf(f,"%d.%d.%d %d.%d.%d",l->m.start_pos_in_token,l->m.start_pos_in_char,
	      l->m.start_pos_in_letter,l->m.end_pos_in_token,
	      l->m.end_pos_in_char,l->m.end_pos_in_letter);
	if (l->output!=NULL) {
		/* If there is an output */
		u_fprintf(f," ");
		if (p->tagging) {
			/* In tagging mode, we add the sentence number as well as
			 * the start and end states in the .tfst of the match */
			u_fprintf(f,"%d %d %d:",p->tfst->current_sentence,l->start,l->end);
		}
		if (p->debug) {
			save_real_output_from_debug(f,p->output_policy,l->output);
		}
		u_fputs(l->output,f);
	}
	u_fprintf(f,"\n");
	if (p->ambiguous_output_policy==ALLOW_AMBIGUOUS_OUTPUTS) {
		(p->number_of_outputs)++;
		/* Several outputs for the same sequence count as a single match */
		if (!(p->start_position_last_printed_match_token == l->m.start_pos_in_token
		      && p->start_position_last_printed_match_char == l->m.start_pos_in_char
		      && p->start_position_last_printed_match_letter == l->m.start_pos_in_letter
		      && p->end_position_last_printed_match_token == l->m.end_pos_in_token
		      && p->end_position_last_printed_match_char == l->m.end_pos_in_char
		      && p->end_position_last_printed_match_letter == l->m.end_pos_in_letter)) {
			(p->number_of_matches)++;
		}
	} else {
		/* If we don't allow ambiguous outputs, we count the matches */
		(p->number_of_matches)++;
	}
	p->start_position_last_printed_match_token=l->m.start_pos_in_token;
	p->end_position_last_printed_match_token=l->m.end_pos_in_token;
	p->start_position_last_printed_match_char=l->m.start_pos_in_char;
	p->end_position_last_printed_match_char=l->m.end_pos_in_char;
	p->start_position_last_printed_match_letter=l->m.start_pos_in_letter;
	p->end_position_last_printed_match_letter=l->m.end_pos_in_letter;
	/* The match has been saved: we free its cell and move on to the next one */
	ptr=l;
	l=l->next;
	free_tfst_simple_match_list(ptr);
	p->matches=l;
}
/* Here, either the list is exhausted or the search limitation has been
 * reached. In the latter case, we must free all the remaining matches */
while (l!=NULL) {
	ptr=l;
	l=l->next;
	free_tfst_simple_match_list(ptr);
}
p->matches=NULL;
}
int main_Untokenize(int argc,char* const argv[]) {
if (argc==1) {
   usage();
   return SUCCESS_RETURN_CODE;
}

char alphabet[FILENAME_MAX]="";
char token_file[FILENAME_MAX]="";
char dynamicSntDir[FILENAME_MAX]="";
VersatileEncodingConfig vec=VEC_DEFAULT;
int val,index=-1;
int range_start,range_stop,use_range;
int token_step_number=0;
range_start=range_stop=use_range=0;
char foo=0;
bool only_verify_arguments = false;
UnitexGetOpt options;
while (EOF!=(val=options.parse_long(argc,argv,optstring_Untokenize,lopts_Untokenize,&index))) {
   switch(val) {
   case 'a': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty alphabet file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(alphabet,options.vars()->optarg);
             break;
   case 'd': if (options.vars()->optarg[0]=='\0') {
                   error("You must specify a non empty snt dir name\n");
                   return USAGE_ERROR_CODE;
                }
                strcpy(dynamicSntDir,options.vars()->optarg);
                break;
   case 't': if (options.vars()->optarg[0]=='\0') {
                error("You must specify a non empty token file name\n");
                return USAGE_ERROR_CODE;
             }
             strcpy(token_file,options.vars()->optarg);
             break;
   case 'k': if (options.vars()->optarg[0]=='\0') {
                error("Empty input_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg);
             break;
   case 'q': if (options.vars()->optarg[0]=='\0') {
                error("Empty output_encoding argument\n");
                return USAGE_ERROR_CODE;
             }
             decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg);
             break;

   case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&token_step_number,&foo) || token_step_number<=0) {
                /* foo is used to check that the search limit is not like "45gjh" */
                error("Invalid token numbering argument: %s\n",options.vars()->optarg);
                return USAGE_ERROR_CODE;
             }
             break;
   case 'r': {
                int param1 = 0;
                int param2 = 0;
                int ret_scan = sscanf(options.vars()->optarg,"%d,%d%c",&param1,&param2,&foo);
                if (ret_scan == 2) {
                    range_start = param1;
                    range_stop  = param2;
                    use_range=1;
                    if (((range_start < -1)) || (range_stop < -1)) {
                        /* foo is used to check that the search limit is not like "45gjh" */
                        error("Invalid stop count argument: %s\n",options.vars()->optarg);
                        return USAGE_ERROR_CODE;
                    }
                }
                else
                    if (1!=sscanf(options.vars()->optarg,"%d%c",&range_start,&foo) || (range_start < -1)) {
                        /* foo is used to check that the search limit is not like "45gjh" */
                        error("Invalid stop count argument: %s\n",options.vars()->optarg);
                        return USAGE_ERROR_CODE;
                    }
                    use_range=1;
             }
             break;
   case 'V': only_verify_arguments = true;
             break;
   case 'h': usage(); 
             return SUCCESS_RETURN_CODE;
   case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) :
                         error("Missing argument for option --%s\n",lopts_Untokenize[index].name);
             return USAGE_ERROR_CODE;            
   case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) :
                         error("Invalid option --%s\n",options.vars()->optarg);
             return USAGE_ERROR_CODE;
   }
   index=-1;
}

if (options.vars()->optind!=argc-1) {
   error("Invalid arguments: rerun with --help\n");
   return USAGE_ERROR_CODE;
}

if (only_verify_arguments) {
  // freeing all allocated memory
  return SUCCESS_RETURN_CODE;
}

char tokens_txt[FILENAME_MAX];
char text_cod[FILENAME_MAX];
char enter_pos[FILENAME_MAX];

if (dynamicSntDir[0]=='\0') {
    get_snt_path(argv[options.vars()->optind],dynamicSntDir);
}

strcpy(text_cod,dynamicSntDir);
strcat(text_cod,"text.cod");
strcpy(enter_pos,dynamicSntDir);
strcat(enter_pos,"enter.pos");
strcpy(tokens_txt,dynamicSntDir);
strcat(tokens_txt,"tokens.txt");

Alphabet* alph=NULL;
if (alphabet[0]!='\0') {
   alph=load_alphabet(&vec,alphabet);
   if (alph==NULL) {
      error("Cannot load alphabet file %s\n",alphabet);
      return DEFAULT_ERROR_CODE;
   }
}

ABSTRACTMAPFILE* af_text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0);
if (af_text_cod==NULL) {
  error("Cannot open file %s\n",text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

ABSTRACTMAPFILE* af_enter_pos=af_open_mapfile(enter_pos,MAPFILE_OPTION_READ,0);
if (af_enter_pos==NULL) {
  error("Cannot open file %s\n",enter_pos);
  af_close_mapfile(af_text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

U_FILE* text = u_fopen(&vec,argv[options.vars()->optind],U_WRITE);
if (text==NULL) {
  error("Cannot create text file %s\n",argv[options.vars()->optind]);
  af_close_mapfile(af_enter_pos);
  af_close_mapfile(af_text_cod);
  free_alphabet(alph);
  return DEFAULT_ERROR_CODE;
}

struct text_tokens* tok=load_text_tokens(&vec,tokens_txt);
u_printf("Untokenizing text...\n");
size_t nb_item = af_get_mapfile_size(af_text_cod)/sizeof(int);
const int* buf=(const int*)af_get_mapfile_pointer(af_text_cod);

size_t nb_item_enter_pos=0;
const int* buf_enter=NULL;

if (af_enter_pos!=NULL) {
    buf_enter=(const int*)af_get_mapfile_pointer(af_enter_pos);
    if (buf_enter!=NULL) {
        nb_item_enter_pos=af_get_mapfile_size(af_enter_pos)/sizeof(int);
    }
}

size_t count_pos=0;
for (size_t i=0;i<nb_item;i++) {
    int is_in_range=1;
    if ((use_range!=0) && (i<(size_t)range_start)) {
        is_in_range=0;
    }
    if ((use_range!=0) && (range_stop!=0) && (i>(size_t)range_stop)) {
        is_in_range=0;
    }
    int is_newline=0;
    if (count_pos<nb_item_enter_pos) {
        if (i==(size_t)(*(buf_enter+count_pos))) {
            is_newline = 1;
            count_pos++;
        }
    }

    if (is_in_range!=0) {
        if (token_step_number != 0)
            if ((i%token_step_number)==0)
                u_fprintf(text,"\n\nToken %d : ", (int)i);

        if (is_newline!=0) {
            u_fprintf(text,"\n", tok->token[*(buf+i)]);
        }
        else {
			u_fputs(tok->token[*(buf+i)], text);
        }
    }
}

af_release_mapfile_pointer(af_text_cod,buf);
af_release_mapfile_pointer(af_enter_pos,buf_enter);
af_close_mapfile(af_enter_pos);
af_close_mapfile(af_text_cod);
free_text_tokens(tok);
u_fclose(text);
free_alphabet(alph);

u_printf("\nDone.\n");
return SUCCESS_RETURN_CODE;
}