/*
 * Hands `arg` to scan_graph() for `obj`, then stores in obj->data.opaque the
 * result of prepare_diskio_stat() applied to dev_name() of the buffer that
 * scan_graph() returned. The returned buffer is released before leaving.
 */
void parse_diskiograph_arg(struct text_object *obj, const char *arg)
{
	char *device;

	device = scan_graph(obj, arg, 0);
	obj->data.opaque = prepare_diskio_stat(dev_name(device));

	/* free() accepts a null pointer, so no guard is required here. */
	free(device);
}
void parse_text(struct fst2txt_parameters* p) { fill_buffer(p->text_buffer,p->f_input); int debut=p->fst2->initial_states[1]; p->variables=new_Variables(p->fst2->input_variables); int n_blocks=0; u_printf("Block %d",n_blocks); int within_tag=0; while (p->current_origin<p->text_buffer->size) { if (!p->text_buffer->end_of_file && p->current_origin>(p->text_buffer->size-MINIMAL_SIZE_PRELOADED_TEXT)) { /* If must change of block, we update the absolute offset, and we fill the * buffer. */ p->absolute_offset=p->absolute_offset+p->current_origin; fill_buffer(p->text_buffer,p->current_origin,p->f_input); p->current_origin=0; n_blocks++; u_printf("\rBlock %d ",n_blocks); } p->output[0]='\0'; empty(p->stack); p->input_length=0; //memset(p->buffer,0,p->current_origin); if (p->buffer[p->current_origin]=='{') { within_tag=1; } else if (p->buffer[p->current_origin]=='}') { within_tag=0; } else if (!within_tag && (p->buffer[p->current_origin]!=' ' || p->space_policy==START_WITH_SPACE)) { // we don't start a match on a space unichar mot_token_buffer[MOT_BUFFER_TOKEN_SIZE]; scan_graph(0,debut,0,0,NULL,mot_token_buffer,p); } u_fprintf(p->f_output,"%S",p->output); if (p->input_length==0) { // if no input was read, we go on u_fputc(p->buffer[p->current_origin],p->f_output); (p->current_origin)++; } else { // we increase current_origin p->current_origin=p->current_origin+p->input_length; } } u_printf("\r \n"); free_Variables(p->variables); p->variables=NULL; }
void scan_graph(int n_graph, // number of current graph int e, // number of current state int pos, // int depth, struct parsing_info** liste_arrivee, unichar* mot_token_buffer, struct fst2txt_parameters* p,Abstract_allocator prv_alloc_recycle) { Fst2State etat_courant=p->fst2->states[e]; if (depth > MAX_DEPTH) { error( "\n" "Maximal stack size reached in graph %i!\n" "Recognized more than %i tokens starting from:\n" " ", n_graph, MAX_DEPTH); for (int i=0; i<60; i++) { error("%S",p->buffer[p->current_origin+i]); } error("\nSkipping match at this position, trying from next token!\n"); p->output[0] = '\0'; // clear output p->input_length = 0; // reset taille_entree empty(p->stack); // clear output stack if (liste_arrivee != NULL) { while (*liste_arrivee != NULL) { // free list of subgraph matches struct parsing_info* la_tmp=*liste_arrivee; *liste_arrivee=(*liste_arrivee)->next; la_tmp->next=NULL; // to don't free the next item free_parsing_info(la_tmp, prv_alloc_recycle); } } return; // exit(1); // don't exit, try at next position } depth++; if (is_final_state(etat_courant)) { // if we are in a final state p->stack->stack[p->stack->stack_pointer+1]='\0'; if (n_graph == 0) { // in main graph if (pos>=p->input_length/*sommet>u_strlen(output)*/) { // and if the recognized input is longer than the current one, it replaces it u_strcpy(p->output,p->stack->stack); p->input_length=(pos); } } else { // in a subgraph (*liste_arrivee)=insert_if_absent(pos,-1,-1,(*liste_arrivee),p->stack->stack_pointer+1, p->stack->stack,p->variables,NULL,NULL,-1,-1,NULL,-1, prv_alloc_recycle); } } if (pos+p->current_origin==p->text_buffer->size) { // if we are at the end of the text, we return return; } int SOMMET=p->stack->stack_pointer+1; int pos2; /* If there are some letter sequence transitions like %hello, we process them */ if (p->token_tree[e]->transition_array!=NULL) { if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} /* we 
don't keep this line because of problems occur in sentence tokenizing * if the return sequence is defautly considered as a separator like space else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} */ else pos2=pos; int position=0; unichar *token=mot_token_buffer; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || (is_letter(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) { /* If we are in character by character mode */ while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { token[position++]=p->buffer[(pos2++)+p->current_origin]; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION) { break; } } token[position]='\0'; if (position!=0 && (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || !(is_letter(token[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)))) { // we proceed only if we have exactly read the contenu sequence // in both modes MERGE and REPLACE, we process the transduction if any int SOMMET2=p->stack->stack_pointer; Transition* RES=get_matching_tags(token,p->token_tree[e],p->alphabet); Transition* TMP; unichar* mot_token_new_recurse_buffer=NULL; if (RES!=NULL) { // we allocate a new mot_token_buffer for the scan_graph recursin because we need preserve current // token=mot_token_buffer mot_token_new_recurse_buffer=(unichar*)malloc(MOT_BUFFER_TOKEN_SIZE*sizeof(unichar)); if (mot_token_new_recurse_buffer==NULL) { fatal_alloc_error("scan_graph"); } } while (RES!=NULL) { p->stack->stack_pointer=SOMMET2; Fst2Tag etiq=p->fst2->tags[RES->tag_number]; traiter_transduction(p,etiq->output); int longueur=u_strlen(etiq->input); unichar C=token[longueur]; token[longueur]='\0'; if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have 
read push_input_string(p->stack,token,0); } token[longueur]=C; scan_graph(n_graph,RES->state_number,pos2-(position-longueur),depth,liste_arrivee,mot_token_new_recurse_buffer,p); TMP=RES; RES=RES->next; free(TMP); } if (mot_token_new_recurse_buffer!=NULL) { free(mot_token_new_recurse_buffer); } } } } Transition* t=etat_courant->transitions; while (t!=NULL) { p->stack->stack_pointer=SOMMET-1; // we process the transition of the current state int n_etiq=t->tag_number; if (n_etiq<0) { // case of a sub-graph struct parsing_info* liste=NULL; unichar* pile_old; p->stack->stack[p->stack->stack_pointer+1]='\0'; pile_old = u_strdup(p->stack->stack); scan_graph((((unsigned)n_etiq)-1),p->fst2->initial_states[-n_etiq],pos,depth,&liste,mot_token_buffer,p); while (liste!=NULL) { p->stack->stack_pointer=liste->stack_pointer-1; u_strcpy(p->stack->stack,liste->stack); scan_graph(n_graph,t->state_number,liste->position,depth,liste_arrivee,mot_token_buffer,p); struct parsing_info* l_tmp=liste; liste=liste->next; l_tmp->next=NULL; // to don't free the next item free_parsing_info(l_tmp, prv_alloc_recycle); } u_strcpy(p->stack->stack,pile_old); free(pile_old); p->stack->stack_pointer=SOMMET-1; } else { // case of a normal tag Fst2Tag etiq=p->fst2->tags[n_etiq]; unichar* contenu=etiq->input; int contenu_len_possible_match=u_len_possible_match(contenu); if (etiq->type==BEGIN_OUTPUT_VAR_TAG) { fatal_error("Unsupported $|XXX( tags in Fst2Txt\n"); } if (etiq->type==END_OUTPUT_VAR_TAG) { fatal_error("Unsupported $|XXX) tags in Fst2Txt\n"); } if (etiq->type==BEGIN_VAR_TAG) { // case of a $a( variable tag //int old; struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable); if (L==NULL) { fatal_error("Unknown variable: %S\n",etiq->variable); } //old=L->start; if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) { pos2=pos+1; if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' '); } //else if (buffer[pos+origine_courante]==0x0d) 
{pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; L->start_in_tokens=pos2; scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); //L->start=old; } else if (etiq->type==END_VAR_TAG) { // case of a $a) variable tag //int old; struct transduction_variable* L=get_transduction_variable(p->variables,etiq->variable); if (L==NULL) { fatal_error("Unknown variable: %S\n",etiq->variable); } //old=L->end; if (pos>0) L->end_in_tokens=pos-1; else L->end_in_tokens=pos; // BUG: qd changement de buffer, penser au cas start dans ancien buffer et end dans nouveau scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p); //L->end=old; } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MOT_LN5))) { // case of transition by any sequence of letters if (p->buffer[pos+p->current_origin]==' ' && pos+p->current_origin+1<p->text_buffer->size) { pos2=pos+1; if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' '); } //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) { while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0) { // we proceed only if we have read a letter sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_output_string(p->stack,mot); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if 
((contenu_len_possible_match==4) && (!u_trymatch_superfast4(contenu,ETIQ_NB_LN4))) { // case of transition by any sequence of digits if (p->buffer[pos+p->current_origin]==' ') { pos2=pos+1; if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' '); } //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; while (pos2+p->current_origin<p->text_buffer->size && (p->buffer[pos2+p->current_origin]>='0') && (p->buffer[pos2+p->current_origin]<='9')) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0) { // we proceed only if we have read a letter sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_output_string(p->stack,mot); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MAJ_LN5))) { // case of upper case letter sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || ((pos2+p->current_origin)==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) { while (pos2+p->current_origin<p->text_buffer->size && is_upper(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // we proceed only if we have read an upper case letter sequence // which is not 
followed by a lower case letter // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_MIN_LN5))) { // case of lower case letter sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet))) { while (pos2+p->current_origin<p->text_buffer->size && is_lower(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // we proceed only if we have read a lower case letter sequence // which is not followed by an upper case letter // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PRE_LN5))) { // case of a sequence beginning by an upper case letter if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if 
(p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar* mot=mot_token_buffer; int position=0; if (p->tokenization_policy==CHAR_BY_CHAR_TOKENIZATION || (is_upper(p->buffer[pos2+p->current_origin],p->alphabet) && (pos2+p->current_origin==0 || !is_letter(p->buffer[pos2+p->current_origin-1],p->alphabet)))) { while (pos2+p->current_origin<p->text_buffer->size && is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (position!=0 && !is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // we proceed only if we have read a letter sequence // which is not followed by a letter // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } else if ((contenu_len_possible_match==5) && (!u_trymatch_superfast5(contenu,ETIQ_PNC_LN5))) { // case of a punctuation sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; unichar C=p->buffer[pos2+p->current_origin]; if (C==';' || C=='!' || C=='?' 
|| C==':' || C==0xbf || C==0xa1 || C==0x0e4f || C==0x0e5a || C==0x0e5b || C==0x3001 || C==0x3002 || C==0x30fb) { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,C); } scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p); } else { // we consider the case of ... // BUG: if ... appears at the end of the buffer if (C=='.') { if ((pos2+p->current_origin+2)<p->text_buffer->size && p->buffer[pos2+p->current_origin+1]=='.' && p->buffer[pos2+p->current_origin+2]=='.') { traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the ... we have read push(p->stack,C);push(p->stack,C);push(p->stack,C); } scan_graph(n_graph,t->state_number,pos2+3,depth,liste_arrivee,mot_token_buffer,p); } else { // we consider the . 
as a normal punctuation sign traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,C); } scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p); } } } } else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_E_LN3))) { // case of an empty sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p); } else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast3(contenu,ETIQ_CIRC_LN3))) { // case of a new line sequence if (p->buffer[pos+p->current_origin]=='\n') { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,'\n'); } scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,'#')) && (!(etiq->control&RESPECT_CASE_TAG_BIT_MASK))) { // case of a no space condition if (p->buffer[pos+p->current_origin]!=' ') { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); scan_graph(n_graph,t->state_number,pos,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==1) && (!u_trymatch_superfast1(contenu,' '))) { // case of an obligatory space if (p->buffer[pos+p->current_origin]==' ') { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we 
are in MERGE mode, we add to ouput the char we have read push(p->stack,' '); } scan_graph(n_graph,t->state_number,pos+1,depth,liste_arrivee,mot_token_buffer,p); } } else if ((contenu_len_possible_match==3) && (!u_trymatch_superfast5(contenu,ETIQ_L_LN3))) { // case of a single letter if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; if (is_letter(p->buffer[pos2+p->current_origin],p->alphabet)) { // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push(p->stack,p->buffer[pos2+p->current_origin]); } scan_graph(n_graph,t->state_number,pos2+1,depth,liste_arrivee,mot_token_buffer,p); } } else { // case of a normal letter sequence if (p->buffer[pos+p->current_origin]==' ') {pos2=pos+1;if (p->output_policy==MERGE_OUTPUTS) push(p->stack,' ');} //else if (buffer[pos+origine_courante]==0x0d) {pos2=pos+2;if (MODE==MERGE) empiler(0x0a);} else pos2=pos; if (etiq->control&RESPECT_CASE_TAG_BIT_MASK) { // case of exact case match int position=0; while (pos2+p->current_origin<p->text_buffer->size && p->buffer[pos2+p->current_origin]==contenu[position]) { pos2++; position++; } if (contenu[position]=='\0' && position!=0 && !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) { // we proceed only if we have exactly read the contenu sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read 
push_input_string(p->stack,contenu,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } else { // case of variable case match // the letter sequences may have been caught by the arbre_etiquette structure int position=0; unichar* mot=mot_token_buffer; while (pos2+p->current_origin<p->text_buffer->size && is_equal_or_uppercase(contenu[position],p->buffer[pos2+p->current_origin],p->alphabet)) { mot[position++]=p->buffer[(pos2++)+p->current_origin]; } mot[position]='\0'; if (contenu[position]=='\0' && position!=0 && !(is_letter(contenu[position-1],p->alphabet) && is_letter(p->buffer[pos2+p->current_origin],p->alphabet))) { // we proceed only if we have exactly read the contenu sequence // in both modes MERGE and REPLACE, we process the transduction if any traiter_transduction(p,etiq->output); if (p->output_policy==MERGE_OUTPUTS /*|| etiq->transduction==NULL || etiq->transduction[0]=='\0'*/) { // if we are in MERGE mode, we add to ouput the char we have read push_input_string(p->stack,mot,0); } scan_graph(n_graph,t->state_number,pos2,depth,liste_arrivee,mot_token_buffer,p); } } } } t=t->next; } }