/** * Explores the node n, dumps the corresponding lines to the output file, * and then frees the node. 'pos' is the current position in the string 's'. */ int explore_node(struct sort_tree_node* n, struct sort_infos* inf, struct dela_entry* *last) { int i, N; struct sort_tree_transition* t = NULL; struct couple* couple = NULL; struct couple* tmp = NULL; if (n == NULL) { error("Internal error in explore_node\n"); return DEFAULT_ERROR_CODE; } if (n->couples != NULL) { /* If the node is a final one, we print the corresponding lines */ couple = n->couples; while (couple != NULL) { if (inf->factorize_inflectional_codes) { /* We look if the previously printed line, if any, did share * the same information. If so, we just append the new inflectional codes. * Otherwise, we print the new line. * * NOTE: in factorize mode, we always ignore duplicates */ int err; struct dela_entry* entry = tokenize_DELAF_line(couple->s,1,&err,0); if (entry==NULL) { /* We have a non DELAF entry line, like for instance a comment one */ if (*last!=NULL && *last!=(struct dela_entry*)-1) { /* If there was at least one line already printed, then this line * awaits for its \n */ u_fprintf(inf->f_out, "\n"); } /* Then we print the line */ u_fprintf(inf->f_out, "%S\n",couple->s); /* And we reset *last */ if (*last==(struct dela_entry*)-1) { *last=NULL; } else if (*last!=NULL) { free_dela_entry(*last); *last=NULL; } } else { /* So, we have a dic entry. Was there a previous one ? */ if (*last==NULL || *last==(struct dela_entry*)-1) { /* No ? So we print the line, and the current entry becomes *last */ u_fputs(couple->s, inf->f_out); *last=entry; } else { /* Yes ? We must compare if the codes are compatible */ if (are_compatible(*last,entry)) { /* We look for any code of entry if it was already in *last */ for (int j=0;j<entry->n_inflectional_codes;j++) { if (!dic_entry_contain_inflectional_code(*last,entry->inflectional_codes[j])) { u_fprintf(inf->f_out, ":%S",entry->inflectional_codes[j]); /* We also have to add the newly printed code to *last */ (*last)->inflectional_codes[((*last)->n_inflectional_codes)++]=u_strdup(entry->inflectional_codes[j]); } } /* And we must free entry */ free_dela_entry(entry); } else { /* If codes are not compatible, we print the \n for the previous * line, then the current line that becomes *last */ u_fprintf(inf->f_out, "\n%S",couple->s); free_dela_entry(*last); *last=entry; } } } } else { /* Normal way: we print each line one after the other */ for (i = 0; i < couple->n; i++) { u_fprintf(inf->f_out, "%S\n", couple->s); (inf->resulting_line_number)++; } } tmp = couple; couple = couple->next; free(tmp->s); free(tmp); } n->couples = NULL; } /* We convert the transition list into a sorted array */ t = n->transitions; N = 0; while (t != NULL && N < 0x10000) { inf->transitions[N++] = t; t = t->next; } if (N == 0x10000) { error("Internal error in explore_node: more than 0x10000 nodes\n"); free_sort_tree_node(n); return DEFAULT_ERROR_CODE; } if (N > 1) quicksort(inf->transitions, 0, N - 1, inf); /* After sorting, we copy the result into the transitions of n */ for (int j = 0; j < N - 1; j++) { inf->transitions[j]->next = inf->transitions[j + 1]; } if (N > 0) { inf->transitions[N - 1]->next = NULL; n->transitions = inf->transitions[0]; } /* Finally, we explore the outgoing transitions */ t = n->transitions; int explore_return_value = SUCCESS_RETURN_CODE; while (t != NULL && explore_return_value == SUCCESS_RETURN_CODE) { explore_return_value = explore_node(t->node, inf, last); if(explore_return_value == SUCCESS_RETURN_CODE) { t = t->next; } } /* And we free the node */ free_sort_tree_node(n); return explore_return_value; }
/** * This function saves the current match list in the concordance index file. * It is derived from the 'save_matches' from 'Matches.cpp'. At the opposite of * 'save_matches', this function is not parameterized by the current position in * the text. We assume that this function is called once per sentence automaton, after * all matches have been computed. */ void save_tfst_matches(struct locate_tfst_infos* p) { struct tfst_simple_match_list* l=p->matches; struct tfst_simple_match_list* ptr; if (p->number_of_matches==p->search_limit) { /* If we have reached the limit, then we must free all the remaining matches */ while (l!=NULL) { ptr=l; l=l->next; free_tfst_simple_match_list(ptr); } p->matches=NULL; return; } U_FILE* f=p->output; if (l==NULL) return; u_fprintf(f,"%d.%d.%d %d.%d.%d",l->m.start_pos_in_token,l->m.start_pos_in_char, l->m.start_pos_in_letter,l->m.end_pos_in_token, l->m.end_pos_in_char,l->m.end_pos_in_letter); if (l->output!=NULL) { /* If there is an output */ u_fprintf(f," "); if (p->tagging) { /* In tagging mode, we add the sentence number as well as * the start and end states in the .tfst of the match */ u_fprintf(f,"%d %d %d:",p->tfst->current_sentence,l->start,l->end); } if (p->debug) { save_real_output_from_debug(f,p->output_policy,l->output); } u_fputs(l->output,f); } u_fprintf(f,"\n"); if (p->ambiguous_output_policy==ALLOW_AMBIGUOUS_OUTPUTS) { (p->number_of_outputs)++; if (!(p->start_position_last_printed_match_token == l->m.start_pos_in_token && p->start_position_last_printed_match_char == l->m.start_pos_in_char && p->start_position_last_printed_match_letter == l->m.start_pos_in_letter && p->end_position_last_printed_match_token == l->m.end_pos_in_token && p->end_position_last_printed_match_char == l->m.end_pos_in_char && p->end_position_last_printed_match_letter == l->m.end_pos_in_letter)) { (p->number_of_matches)++; } } else { /* If we don't allow ambiguous outputs, we count the matches */ (p->number_of_matches)++; } p->start_position_last_printed_match_token=l->m.start_pos_in_token; p->end_position_last_printed_match_token=l->m.end_pos_in_token; p->start_position_last_printed_match_char=l->m.start_pos_in_char; p->end_position_last_printed_match_char=l->m.end_pos_in_char; p->start_position_last_printed_match_letter=l->m.start_pos_in_letter; p->end_position_last_printed_match_letter=l->m.end_pos_in_letter; if (p->number_of_matches==p->search_limit) { /* If we have reached the search limitation, we free the remaining * matches and return */ while (l!=NULL) { ptr=l; l=l->next; free_tfst_simple_match_list(ptr); } p->matches=NULL; return; } ptr=l->next; free_tfst_simple_match_list(l); p->matches=ptr; save_tfst_matches(p); return; }
int main_Untokenize(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } char alphabet[FILENAME_MAX]=""; char token_file[FILENAME_MAX]=""; char dynamicSntDir[FILENAME_MAX]=""; VersatileEncodingConfig vec=VEC_DEFAULT; int val,index=-1; int range_start,range_stop,use_range; int token_step_number=0; range_start=range_stop=use_range=0; char foo=0; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_Untokenize,lopts_Untokenize,&index))) { switch(val) { case 'a': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty alphabet file name\n"); return USAGE_ERROR_CODE; } strcpy(alphabet,options.vars()->optarg); break; case 'd': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty snt dir name\n"); return USAGE_ERROR_CODE; } strcpy(dynamicSntDir,options.vars()->optarg); break; case 't': if (options.vars()->optarg[0]=='\0') { error("You must specify a non empty token file name\n"); return USAGE_ERROR_CODE; } strcpy(token_file,options.vars()->optarg); break; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; case 'n': if (1!=sscanf(options.vars()->optarg,"%d%c",&token_step_number,&foo) || token_step_number<=0) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid token numbering argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } break; case 'r': { int param1 = 0; int param2 = 0; int ret_scan = sscanf(options.vars()->optarg,"%d,%d%c",¶m1,¶m2,&foo); if (ret_scan == 2) { range_start = param1; range_stop = param2; use_range=1; if (((range_start < -1)) || (range_stop < -1)) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid stop count argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } } else if (1!=sscanf(options.vars()->optarg,"%d%c",&range_start,&foo) || (range_start < -1)) { /* foo is used to check that the search limit is not like "45gjh" */ error("Invalid stop count argument: %s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } use_range=1; } break; case 'V': only_verify_arguments = true; break; case 'h': usage(); return SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_Untokenize[index].name); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); return USAGE_ERROR_CODE; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); return USAGE_ERROR_CODE; } if (only_verify_arguments) { // freeing all allocated memory return SUCCESS_RETURN_CODE; } char tokens_txt[FILENAME_MAX]; char text_cod[FILENAME_MAX]; char enter_pos[FILENAME_MAX]; if (dynamicSntDir[0]=='\0') { get_snt_path(argv[options.vars()->optind],dynamicSntDir); } strcpy(text_cod,dynamicSntDir); strcat(text_cod,"text.cod"); strcpy(enter_pos,dynamicSntDir); strcat(enter_pos,"enter.pos"); strcpy(tokens_txt,dynamicSntDir); strcat(tokens_txt,"tokens.txt"); Alphabet* alph=NULL; if (alphabet[0]!='\0') { alph=load_alphabet(&vec,alphabet); if (alph==NULL) { error("Cannot load alphabet file %s\n",alphabet); return DEFAULT_ERROR_CODE; } } ABSTRACTMAPFILE* af_text_cod=af_open_mapfile(text_cod,MAPFILE_OPTION_READ,0); if (af_text_cod==NULL) { error("Cannot open file %s\n",text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } ABSTRACTMAPFILE* af_enter_pos=af_open_mapfile(enter_pos,MAPFILE_OPTION_READ,0); if (af_enter_pos==NULL) { error("Cannot open file %s\n",enter_pos); af_close_mapfile(af_text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } U_FILE* text = u_fopen(&vec,argv[options.vars()->optind],U_WRITE); if (text==NULL) { error("Cannot create text file %s\n",argv[options.vars()->optind]); af_close_mapfile(af_enter_pos); af_close_mapfile(af_text_cod); free_alphabet(alph); return DEFAULT_ERROR_CODE; } struct text_tokens* tok=load_text_tokens(&vec,tokens_txt); u_printf("Untokenizing text...\n"); size_t nb_item = af_get_mapfile_size(af_text_cod)/sizeof(int); const int* buf=(const int*)af_get_mapfile_pointer(af_text_cod); size_t nb_item_enter_pos=0; const int* buf_enter=NULL; if (af_enter_pos!=NULL) { buf_enter=(const int*)af_get_mapfile_pointer(af_enter_pos); if (buf_enter!=NULL) { nb_item_enter_pos=af_get_mapfile_size(af_enter_pos)/sizeof(int); } } size_t count_pos=0; for (size_t i=0;i<nb_item;i++) { int is_in_range=1; if ((use_range!=0) && (i<(size_t)range_start)) { is_in_range=0; } if ((use_range!=0) && (range_stop!=0) && (i>(size_t)range_stop)) { is_in_range=0; } int is_newline=0; if (count_pos<nb_item_enter_pos) { if (i==(size_t)(*(buf_enter+count_pos))) { is_newline = 1; count_pos++; } } if (is_in_range!=0) { if (token_step_number != 0) if ((i%token_step_number)==0) u_fprintf(text,"\n\nToken %d : ", (int)i); if (is_newline!=0) { u_fprintf(text,"\n", tok->token[*(buf+i)]); } else { u_fputs(tok->token[*(buf+i)], text); } } } af_release_mapfile_pointer(af_text_cod,buf); af_release_mapfile_pointer(af_enter_pos,buf_enter); af_close_mapfile(af_enter_pos); af_close_mapfile(af_text_cod); free_text_tokens(tok); u_fclose(text); free_alphabet(alph); u_printf("\nDone.\n"); return SUCCESS_RETURN_CODE; }