/**
 * Adds to the cascade token list the outputs listed in the concordance
 * index of 'text'.
 *
 * For every match read from the concordance index, the match label is
 * tokenized word by word and the resulting tokens are attached to 'list'
 * as the output of transducer 'transducer_id', at the position of the
 * match in the output of transducer 'transducer_id - 1'.
 *
 * @param text           name of the text whose concord.ind file is read
 * @param list           token list to enrich
 * @param transducer_id  identifier of the transducer that produced the matches
 * @param alphabet_name  alphabet file used to tokenize the match labels
 * @param mask_encoding_compatibility_input  encoding mask used to read the
 *                       concordance index
 * @return 'list', the same pointer that was passed in
 */
cassys_tokens_list *add_replaced_text(const char *text, cassys_tokens_list *list,
        int transducer_id, const char *alphabet_name,
        int mask_encoding_compatibility_input) {

    Alphabet *alphabet = load_alphabet(alphabet_name);
    struct snt_files *snt_text_files = new_snt_files(text);
    struct fifo *stage_concord = read_concord_file(snt_text_files->concord_ind,
            mask_encoding_compatibility_input);

    // Performance enhancement: remember where the previous match was found.
    cassys_tokens_list *current_list_position = list;
    long current_token_position = 0;

    while (!is_empty(stage_concord)) {
        locate_pos *l = (locate_pos*) take_ptr(stage_concord);

        struct list_ustring *new_sentence_lu =
                cassys_tokenize_word_by_word(l->label, alphabet);
        cassys_tokens_list *new_sentence_ctl =
                new_list(new_sentence_lu, transducer_id);

        // Performance enhancement:
        // since matches are sorted, the search starts from the last known
        // position in the list. The current token position therefore has to
        // be subtracted from the position of the match in the text.
        cassys_tokens_list *list_position = get_element_at(current_list_position,
                transducer_id - 1,
                l->token_start_offset - current_token_position);

        int replaced_sentence_length = l->token_end_offset - l->token_start_offset + 1;
        int new_sentence_length = length(new_sentence_lu);

        add_output(list_position, new_sentence_ctl, transducer_id,
                replaced_sentence_length, new_sentence_length - 1);

        // Performance enhancement: the next search starts from this match.
        current_list_position = list_position;
        current_token_position = l->token_start_offset;

        free(l->label);
        free(l);
        free_list_ustring(new_sentence_lu);
    }

    free_fifo(stage_concord);
    free_snt_files(snt_text_files);
    free_alphabet(alphabet);

    return list;
}
int main_Extract(int argc,char* const argv[]) { if (argc==1) { usage(); return 0; } int val,index=-1; char extract_matching_units=1; char text_name[FILENAME_MAX]=""; char concord_ind[FILENAME_MAX]=""; char output[FILENAME_MAX]=""; Encoding encoding_output = DEFAULT_ENCODING_OUTPUT; int bom_output = DEFAULT_BOM_OUTPUT; int mask_encoding_compatibility_input = DEFAULT_MASK_ENCODING_COMPATIBILITY_INPUT; struct OptVars* vars=new_OptVars(); while (EOF!=(val=getopt_long_TS(argc,argv,optstring_Extract,lopts_Extract,&index,vars))) { switch(val) { case 'y': extract_matching_units=1; break; case 'n': extract_matching_units=0; break; case 'o': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty output file name\n"); } strcpy(output,vars->optarg); break; case 'i': if (vars->optarg[0]=='\0') { fatal_error("You must specify a non empty concordance file name\n"); } strcpy(concord_ind,vars->optarg); break; case 'h': usage(); return 0; case ':': if (index==-1) fatal_error("Missing argument for option -%c\n",vars->optopt); else fatal_error("Missing argument for option --%s\n",lopts_Extract[index].name); case '?': if (index==-1) fatal_error("Invalid option -%c\n",vars->optopt); else fatal_error("Invalid option --%s\n",vars->optarg); break; case 'k': if (vars->optarg[0]=='\0') { fatal_error("Empty input_encoding argument\n"); } decode_reading_encoding_parameter(&mask_encoding_compatibility_input,vars->optarg); break; case 'q': if (vars->optarg[0]=='\0') { fatal_error("Empty output_encoding argument\n"); } decode_writing_encoding_parameter(&encoding_output,&bom_output,vars->optarg); break; } index=-1; } if (output[0]=='\0') { fatal_error("You must specify the output text file\n"); } if (vars->optind!=argc-1) { fatal_error("Invalid arguments: rerun with --help\n"); } strcpy(text_name,argv[vars->optind]); struct snt_files* snt_files=new_snt_files(text_name); ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0); if (text==NULL) { error("Cannot 
open %s\n",snt_files->text_cod); return 1; } struct text_tokens* tok=load_text_tokens(snt_files->tokens_txt,mask_encoding_compatibility_input); if (tok==NULL) { error("Cannot load token list %s\n",snt_files->tokens_txt); af_close_mapfile(text); return 1; } if (tok->SENTENCE_MARKER==-1) { error("The text does not contain any sentence marker {S}\n"); af_close_mapfile(text); free_text_tokens(tok); return 1; } if (concord_ind[0]=='\0') { char tmp[FILENAME_MAX]; get_extension(text_name,tmp); if (strcmp(tmp,"snt")) { fatal_error("Unable to find the concord.ind file. Please explicit it\n"); } remove_extension(text_name,concord_ind); strcat(concord_ind,"_snt"); strcat(concord_ind,PATH_SEPARATOR_STRING); strcat(concord_ind,"concord.ind"); } U_FILE* concord=u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input,concord_ind,U_READ); if (concord==NULL) { error("Cannot open concordance %s\n",concord_ind); af_close_mapfile(text); free_text_tokens(tok); return 1; } U_FILE* result=u_fopen_creating_versatile_encoding(encoding_output,bom_output,output,U_WRITE); if (result==NULL) { error("Cannot write output file %s\n",output); af_close_mapfile(text); u_fclose(concord); free_text_tokens(tok); return 1; } free_snt_files(snt_files); extract_units(extract_matching_units,text,tok,concord,result); af_close_mapfile(text); u_fclose(concord); u_fclose(result); free_text_tokens(tok); free_OptVars(vars); u_printf("Done.\n"); return 0; }
char* create_labeled_files_and_directory(const char *text, int next_transducer_label,int must_create_directory,int must_copy_file) { char path[FILENAME_MAX]; get_path(text, path); char canonical_text_name[FILENAME_MAX]; remove_path_and_extension(text, canonical_text_name); char extension[FILENAME_MAX]; get_extension(text, extension); char working_directory[FILENAME_MAX]; sprintf(working_directory, "%s%s%s%c", path, canonical_text_name, CASSYS_DIRECTORY_EXTENSION, PATH_SEPARATOR_CHAR); // copy the text label i- to i char old_labeled_text_name[FILENAME_MAX]; sprintf(old_labeled_text_name, "%s%s_%d%s", working_directory, canonical_text_name, next_transducer_label - 1, extension); char new_labeled_text_name[FILENAME_MAX]; sprintf(new_labeled_text_name, "%s%s_%d%s", working_directory, canonical_text_name, next_transducer_label, extension); char new_labeled_snt_directory[FILENAME_MAX]; get_snt_path(new_labeled_text_name, new_labeled_snt_directory); if (must_create_directory != 0) { make_directory(new_labeled_snt_directory); } if (must_copy_file != 0) { copy_file(new_labeled_text_name, old_labeled_text_name); // create snt directory labeled i char old_labeled_snt_directory[FILENAME_MAX]; get_snt_path(old_labeled_text_name, old_labeled_snt_directory); // copy dictionary files in the new snt directory struct snt_files *old_snt_ = new_snt_files(old_labeled_text_name); struct snt_files *new_snt_ = new_snt_files(new_labeled_text_name); if (fexists(old_snt_->dlc)) { copy_file(new_snt_->dlc, old_snt_->dlc); } if (fexists(old_snt_-> dlf)) { copy_file(new_snt_->dlf, old_snt_->dlf); } if (fexists(old_snt_-> err)) { copy_file(new_snt_->err, old_snt_->err); } if (fexists(old_snt_->dlc_n)) { copy_file(new_snt_->dlc_n, old_snt_->dlc_n); } if (fexists(old_snt_->dlf_n)) { copy_file(new_snt_->dlf_n, old_snt_->dlf_n); } if (fexists(old_snt_-> err_n)) { copy_file(new_snt_->err_n, old_snt_->err_n); } if (fexists(old_snt_->stat_dic_n)) { copy_file(new_snt_->stat_dic_n, old_snt_->stat_dic_n); 
} free_snt_files(old_snt_); free_snt_files(new_snt_); } char *labeled_text_name; labeled_text_name = (char*)malloc(sizeof(char)*(strlen(new_labeled_text_name)+1)); if(labeled_text_name == NULL){ perror("malloc\n"); fprintf(stderr,"Impossible to allocate memory\n"); exit(1); } strcpy(labeled_text_name, new_labeled_text_name); return labeled_text_name; }
/** * The main function of the cascade * * */ int cascade(const char* text, int in_place, int must_create_directory, fifo* transducer_list, const char *alphabet, const char*negation_operator, Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input) { launch_tokenize_in_Cassys(text,alphabet,NULL,encoding_output,bom_output,mask_encoding_compatibility_input); //if (in_place == 0) initialize_working_directory(text, must_create_directory); struct snt_files *snt_text_files = new_snt_files(text); struct text_tokens *tokens = NULL; cassys_tokens_list *tokens_list = cassys_load_text(snt_text_files->tokens_txt, snt_text_files->text_cod,&tokens); fprintf(stdout,"Cascade begins\n"); int transducer_number = 1; char *labeled_text_name = NULL; if ((in_place != 0)) labeled_text_name = create_labeled_files_and_directory(text, transducer_number*0, must_create_directory,0); while(!is_empty(transducer_list)){ if ((in_place == 0)) labeled_text_name = create_labeled_files_and_directory(text, transducer_number, must_create_directory,1); /* else { labeled_text_name = strdup(text); }*/ launch_tokenize_in_Cassys(labeled_text_name,alphabet,snt_text_files->tokens_txt,encoding_output,bom_output,mask_encoding_compatibility_input); free_snt_files(snt_text_files); // apply transducer transducer *current_transducer = (transducer*)take_ptr(transducer_list); launch_locate_in_Cassys(labeled_text_name, current_transducer, alphabet, negation_operator,encoding_output,bom_output,mask_encoding_compatibility_input); // generate concordance for this transducer snt_text_files = new_snt_files(labeled_text_name); launch_concord_in_Cassys(labeled_text_name, snt_text_files -> concord_ind, alphabet,encoding_output,bom_output,mask_encoding_compatibility_input); // add_replaced_text(labeled_text_name,tokens_list,transducer_number,alphabet,mask_encoding_compatibility_input); // add protection character in braces when needed 
protect_special_characters(labeled_text_name,encoding_output,bom_output,mask_encoding_compatibility_input); transducer_number++; free(current_transducer -> transducer_file_name); free(current_transducer); if ((in_place == 0)) free(labeled_text_name); } if ((in_place != 0)) free(labeled_text_name); free_snt_files(snt_text_files); construct_cascade_concord(tokens_list,text,transducer_number,encoding_output,bom_output,mask_encoding_compatibility_input); struct snt_files *snt_files = new_snt_files(text); char result_file_name[FILENAME_MAX]; char text_name_without_extension[FILENAME_MAX]; remove_extension(text,text_name_without_extension); sprintf(result_file_name,"%s.csc",text_name_without_extension); copy_file(result_file_name,text); launch_concord_in_Cassys(result_file_name,snt_files->concord_ind,alphabet,encoding_output,bom_output,mask_encoding_compatibility_input); free_cassys_tokens_list(tokens_list); free_snt_files(snt_files); free_text_tokens(tokens); return 0; }
/** * The same than main, but no call to setBufferMode. */ int main_Concord(int argc,char* const argv[]) { if (argc==1) { usage(); return SUCCESS_RETURN_CODE; } int val,index=-1; struct conc_opt* concord_options = new_conc_opt(); char foo; VersatileEncodingConfig vec=VEC_DEFAULT; int ret; char offset_file[FILENAME_MAX]=""; char PRLG[FILENAME_MAX]=""; bool only_verify_arguments = false; UnitexGetOpt options; while (EOF!=(val=options.parse_long(argc,argv,optstring_Concord,lopts_Concord,&index))) { switch(val) { case 'f': if (options.vars()->optarg[0]=='\0') { error("Empty font name argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } concord_options->fontname=strdup(options.vars()->optarg); if (concord_options->fontname==NULL) { alloc_error("main_Concord"); free_conc_opt(concord_options); return ALLOC_ERROR_CODE; } break; case 's': if (1!=sscanf(options.vars()->optarg,"%d%c",&(concord_options->fontsize),&foo)) { /* foo is used to check that the font size is not like "45gjh" */ error("Invalid font size argument: %s\n",options.vars()->optarg); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } break; case 'l': ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->left_context),&foo,&foo); if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->left_context<0) { error("Invalid left context argument: %s\n",options.vars()->optarg); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } if (ret==2) { concord_options->left_context_until_eos=1; } break; case 'r': ret=sscanf(options.vars()->optarg,"%d%c%c",&(concord_options->right_context),&foo,&foo); if (ret==0 || ret==3 || (ret==2 && foo!='s') || concord_options->right_context<0) { error("Invalid right context argument: %s\n",options.vars()->optarg); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } if (ret==2) { concord_options->right_context_until_eos=1; } break; case 'L': concord_options->convLFtoCRLF=0; break; case 0: concord_options->sort_mode=TEXT_ORDER; break; 
case 1: concord_options->sort_mode=LEFT_CENTER; break; case 2: concord_options->sort_mode=LEFT_RIGHT; break; case 3: concord_options->sort_mode=CENTER_LEFT; break; case 4: concord_options->sort_mode=CENTER_RIGHT; break; case 5: concord_options->sort_mode=RIGHT_LEFT; break; case 6: concord_options->sort_mode=RIGHT_CENTER; break; case 7: concord_options->result_mode=DIFF_; break; case 8: concord_options->only_ambiguous=1; break; case 9: { strcpy(PRLG,options.vars()->optarg); char* pos=strchr(PRLG,','); if (pos==NULL || pos==PRLG || *(pos+1)=='\0') { error("Invalid argument for option --PRLG: %s\n",options.vars()->optarg); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } *pos='\0'; strcpy(offset_file,pos+1); break; } case 10: concord_options->only_matches=1; break; case 11: concord_options->result_mode=LEMMATIZE_; break; case 12: concord_options->result_mode=CSV_; break; case 'H': concord_options->result_mode=HTML_; break; case 't': { concord_options->result_mode=TEXT_; if (options.vars()->optarg!=NULL) { strcpy(concord_options->output,options.vars()->optarg); } break; } case 'g': concord_options->result_mode=GLOSSANET_; if (options.vars()->optarg[0]=='\0') { error("Empty glossanet script argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } concord_options->script=strdup(options.vars()->optarg); if (concord_options->script==NULL) { alloc_error("main_Concord"); free_conc_opt(concord_options); return ALLOC_ERROR_CODE; } break; case 'p': concord_options->result_mode=SCRIPT_; if (options.vars()->optarg[0]=='\0') { error("Empty script argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } concord_options->script=strdup(options.vars()->optarg); if (concord_options->script==NULL) { alloc_error("main_Concord"); free_conc_opt(concord_options); return ALLOC_ERROR_CODE; } break; case 'i': concord_options->result_mode=INDEX_; break; case 'u': concord_options->result_mode=UIMA_; if (options.vars()->optarg!=NULL) { 
strcpy(offset_file,options.vars()->optarg); } concord_options->original_file_offsets=1; break; case 'e': concord_options->result_mode=XML_; if (options.vars()->optarg!=NULL) { strcpy(offset_file, options.vars()->optarg); concord_options->original_file_offsets=1; } break; case 'w': concord_options->result_mode=XML_WITH_HEADER_; if (options.vars()->optarg!=NULL) { strcpy(offset_file, options.vars()->optarg); concord_options->original_file_offsets = 1; } break; case '$': if (options.vars()->optarg[0]=='\0') { error("Empty input_offsets argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } strcpy(concord_options->input_offsets,options.vars()->optarg); break; case '@': if (options.vars()->optarg[0]=='\0') { error("Empty output_offsets argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } strcpy(concord_options->output_offsets,options.vars()->optarg); break; case 'A': concord_options->result_mode=AXIS_; break; case 'x': concord_options->result_mode=XALIGN_; break; case 'm': concord_options->result_mode=MERGE_; if (options.vars()->optarg[0]=='\0') { error("Empty output file name argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } strcpy(concord_options->output,options.vars()->optarg); break; case 'a': if (options.vars()->optarg[0]=='\0') { error("Empty alphabet argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } concord_options->sort_alphabet=strdup(options.vars()->optarg); if (concord_options->sort_alphabet==NULL) { alloc_error("main_Concord"); free_conc_opt(concord_options); return ALLOC_ERROR_CODE; } break; case 'T': concord_options->thai_mode=1; break; case 'd': if (options.vars()->optarg[0]=='\0') { error("Empty snt directory argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } strcpy(concord_options->working_directory,options.vars()->optarg); break; case 'V': only_verify_arguments = true; break; case 'h': usage(); free_conc_opt(concord_options); return 
SUCCESS_RETURN_CODE; case ':': index==-1 ? error("Missing argument for option -%c\n",options.vars()->optopt) : error("Missing argument for option --%s\n",lopts_Concord[index].name); free_conc_opt(concord_options); return USAGE_ERROR_CODE; case '?': index==-1 ? error("Invalid option -%c\n",options.vars()->optopt) : error("Invalid option --%s\n",options.vars()->optarg); free_conc_opt(concord_options); return USAGE_ERROR_CODE; case 'k': if (options.vars()->optarg[0]=='\0') { error("Empty input_encoding argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } decode_reading_encoding_parameter(&(vec.mask_encoding_compatibility_input),options.vars()->optarg); break; case 'q': if (options.vars()->optarg[0]=='\0') { error("Empty output_encoding argument\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } decode_writing_encoding_parameter(&(vec.encoding_output),&(vec.bom_output),options.vars()->optarg); break; } index=-1; } if (options.vars()->optind!=argc-1) { error("Invalid arguments: rerun with --help\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } if (concord_options->fontname==NULL || concord_options->fontsize<=0) { if (concord_options->result_mode==HTML_ || concord_options->result_mode==GLOSSANET_) { error("The specified output mode is an HTML file: you must specify font parameters\n"); free_conc_opt(concord_options); return USAGE_ERROR_CODE; } } if (only_verify_arguments) { // freeing all allocated memory free_conc_opt(concord_options); return SUCCESS_RETURN_CODE; } U_FILE* concor=u_fopen(&vec,argv[options.vars()->optind],U_READ); if (concor==NULL) { error("Cannot open concordance index file %s\n",argv[options.vars()->optind]); free_conc_opt(concord_options); return DEFAULT_ERROR_CODE; } if (concord_options->working_directory[0]=='\0') { get_path(argv[options.vars()->optind],concord_options->working_directory); } if (concord_options->only_matches) { concord_options->left_context=0; concord_options->right_context=0; } /* We 
compute the name of the files associated to the text */ struct snt_files* snt_files=new_snt_files_from_path(concord_options->working_directory); ABSTRACTMAPFILE* text=af_open_mapfile(snt_files->text_cod,MAPFILE_OPTION_READ,0); if (text==NULL) { error("Cannot open file %s\n",snt_files->text_cod); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); return DEFAULT_ERROR_CODE; } struct text_tokens* tok=load_text_tokens(&vec,snt_files->tokens_txt); if (tok==NULL) { error("Cannot load text token file %s\n",snt_files->tokens_txt); af_close_mapfile(text); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); return DEFAULT_ERROR_CODE; } U_FILE* f_enter=u_fopen(BINARY,snt_files->enter_pos,U_READ); int n_enter_char=0; int* enter_pos=NULL; /* New lines are encoded in 'enter.pos' files. Those files will disappear in the future */ if (f_enter==NULL) { error("Cannot open file %s\n",snt_files->enter_pos); } else { long size=get_file_size(f_enter); enter_pos=(int*)malloc(size); if (enter_pos==NULL) { alloc_error("main_Concord"); u_fclose(f_enter); free_text_tokens(tok); af_close_mapfile(text); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); return ALLOC_ERROR_CODE; } n_enter_char=(int)fread(enter_pos,sizeof(int),size/sizeof(int),f_enter); if (n_enter_char!=(int)(size/sizeof(int))) { error("Read error on enter.pos file in main_Concord\n"); u_fclose(f_enter); free(enter_pos); free_text_tokens(tok); af_close_mapfile(text); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); return DEFAULT_ERROR_CODE; } u_fclose(f_enter); } if (concord_options->result_mode==INDEX_ || concord_options->result_mode==UIMA_ || concord_options->result_mode==XML_ || concord_options->result_mode==XML_WITH_HEADER_ || concord_options->result_mode==AXIS_) { /* We force some options for index, uima and axis files */ concord_options->left_context=0; concord_options->right_context=0; 
concord_options->sort_mode=TEXT_ORDER; } if (concord_options->only_ambiguous && concord_options->result_mode!=LEMMATIZE_) { /* We force text order when displaying only ambiguous outputs */ concord_options->sort_mode=TEXT_ORDER; } if (concord_options->result_mode==HTML_ || concord_options->result_mode==DIFF_ || concord_options->result_mode==LEMMATIZE_) { /* We need the offset file if and only if we have to produce * an html concordance with positions in .snt file */ concord_options->snt_offsets=load_snt_offsets(snt_files->snt_offsets_pos); if (concord_options->snt_offsets==NULL) { error("Cannot read snt offset file %s\n",snt_files->snt_offsets_pos); free(enter_pos); free_text_tokens(tok); af_close_mapfile(text); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); return DEFAULT_ERROR_CODE; } } if (offset_file[0]!='\0') { concord_options->uima_offsets=load_uima_offsets(&vec,offset_file); if (concord_options->uima_offsets==NULL) { error("Cannot read offset file %s\n",offset_file); free(enter_pos); free_text_tokens(tok); af_close_mapfile(text); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); return DEFAULT_ERROR_CODE; } } if (PRLG[0]!='\0') { concord_options->PRLG_data=load_PRLG_data(&vec,PRLG); if (concord_options->PRLG_data==NULL) { error("Cannot read PRLG file %s\n",PRLG); free(enter_pos); free_text_tokens(tok); af_close_mapfile(text); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); return DEFAULT_ERROR_CODE; } } if (concord_options->result_mode==CSV_) { concord_options->sort_mode=TEXT_ORDER; concord_options->only_matches=1; } /* Once we have set all parameters, we call the function that * will actually create the concordance. 
*/ create_concordance(&vec,concor,text,tok,n_enter_char,enter_pos,concord_options); free(enter_pos); free_text_tokens(tok); af_close_mapfile(text); free_snt_files(snt_files); u_fclose(concor); free_conc_opt(concord_options); u_printf("Done.\n"); return SUCCESS_RETURN_CODE; }