/** * This function reads one concordance line from 'f', and splits its * components into 'left', 'middle' and 'right'. * * IMPORTANT: in order to fix a bug, we may have to reorder some matches. However, we can't * easily reorder lines in concordance files. So, we indicate the length of the * expected match instead of relying on the length of the match that was actually read. * This is why there is the (++total>=60) hack. */ void read_concordance_line(U_FILE* f,unichar* left,unichar* middle,unichar* right,unichar* indices,int expected_match_length) { int i,c; i=0; while ((c=u_fgetc(f))!='\t') { left[i++]=(unichar)c; } left[i]='\0'; int total=i+expected_match_length; i=0; while ((c=u_fgetc(f))!='\t') { middle[i++]=(unichar)c; } middle[i]='\0'; i=0; while ((c=u_fgetc(f))!='\t') { if (++total>=60) { c='\0'; } right[i++]=(unichar)c; } right[i]='\0'; i=0; while ((c=u_fgetc(f))!='\n') { indices[i++]=(unichar)c; } indices[i]='\0'; }
void protect_special_characters(const char *text,Encoding encoding_output,int bom_output,int mask_encoding_compatibility_input){ U_FILE *source; U_FILE *destination; //fprintf(stdout,"protect special character\n"); char temp_name_file[FILENAME_MAX]; char path[FILENAME_MAX]; get_path(text,path); sprintf(temp_name_file,"%stemp",path); source = u_fopen_existing_versatile_encoding(mask_encoding_compatibility_input, text,U_READ); if( source == NULL){ perror("u_fopen\n"); fprintf(stderr,"Cannot open file %s\n",text); exit(1); } destination = u_fopen_versatile_encoding(encoding_output,bom_output,mask_encoding_compatibility_input,temp_name_file,U_WRITE); if( destination == NULL){ perror("u_fopen\n"); fprintf(stderr,"Cannot open file %s\n",temp_name_file); exit(1); } int a; a = u_fgetc(source); while(a!=EOF){ u_fputc((unichar)a,destination); if(a=='{'){ //fprintf(stdout,"opening bracket found\n"); unichar *bracket_string = get_braced_string(source); unichar *protected_bracket_string = protect_braced_string(bracket_string); //u_fprints(protected_bracket_string,destination); u_fprintf(destination,"%S",protected_bracket_string); //u_printf("%S --- ",bracket_string); //u_printf("%S\n",protected_bracket_string); free(bracket_string); free(protected_bracket_string); } a = u_fgetc(source); } u_fclose(source); u_fclose(destination); copy_file(text,temp_name_file); // should delete the 'temp' file }
/** * Reads and processes a line of the Thai text file. */ int read_line_thai(struct sort_infos* inf) { unichar line[LINE_LENGTH]; unichar thai_line[LINE_LENGTH]; int c; int ret = 1; int i = 0; while ((c = u_fgetc(inf->f)) != '\n' && c != EOF && i < LINE_LENGTH) { line[i++] = (unichar) c; } line[i] = '\0'; if (c == EOF) ret = 0; else (inf->number_of_lines)++; if (i == 0) { /* We ignore the empty line */ return ret; } if (i == LINE_LENGTH) { error("Line %d: line too long\n", inf->number_of_lines); return ret; } convert_thai(line, thai_line); get_node_thai(thai_line, 0, inf->root, line, inf); return ret; }
/** * Reads and processes a line of the text file. * Returns 0 if the end of file has been reached; 1 otherwise. */ int read_line(struct sort_infos* inf) { unichar line[LINE_LENGTH]; int c; int ret = 1; int i = 0; while ((c = u_fgetc(inf->f)) != '\n' && c != EOF && i < LINE_LENGTH) { line[i++] = (unichar) c; } line[i] = '\0'; if (c == EOF) ret = 0; else (inf->number_of_lines)++; if (i == 0) { /* We ignore the empty line */ return ret; } if (i == LINE_LENGTH) { /* Too long lines are not taken into account */ error("Line %d: line too long\n", inf->number_of_lines); return ret; } get_node(line, 0, inf->root, inf); return ret; }
/** * This function reads the given char order file. */ void read_char_order(const VersatileEncodingConfig* vec, const char* name, struct sort_infos* inf) { int c; int current_line = 1; U_FILE* f = u_fopen(vec, name, U_READ); if (f == NULL) { error("Cannot open file %s\n", name); return; } unichar current_canonical = '\0'; int current_priority = 0; while ((c = u_fgetc(f)) != EOF) { if (c != '\n') { /* we ignore the \n char */ if (inf->class_numbers[(unichar) c] != 0) { error("Error in %s: char 0x%x appears several times\n", name, c); } else { inf->class_numbers[(unichar) c] = current_line; if (current_canonical == '\0') { current_canonical = (unichar) c; } inf->canonical[(unichar) c] = current_canonical; inf->priority[(unichar) c] = ++current_priority; } } else { current_line++; current_canonical = '\0'; current_priority = 0; } } u_fclose(f); }
/** * Loads a match list. Match lists are supposed to have been * generated by the Locate program. */ struct match_list* load_match_list(U_FILE* f,OutputPolicy *output_policy,unichar *header,Abstract_allocator prv_alloc) { struct match_list* l=NULL; struct match_list* end_of_list=NULL; int start,end,start_char,end_char,start_letter,end_letter; Ustring* line=new_Ustring(); char is_an_output; /* We read the header */ unichar foo=0; if (header==NULL) { header=&foo; } u_fscanf(f,"#%C\n",header); OutputPolicy policy; switch(*header) { case 'D': { policy=DEBUG_OUTPUTS; /* In debug mode, we have to skip the debug header */ int n_graphs; u_fscanf(f,"%d\n",&n_graphs); while ((n_graphs--)>-1) { /* -1, because we also have to skip the #[IMR] line */ readline(line,f); } break; } case 'M': policy=MERGE_OUTPUTS; break; case 'R': case 'T': case 'X': policy=REPLACE_OUTPUTS; break; case 'I': default: policy=IGNORE_OUTPUTS; break; } if (output_policy!=NULL) { (*output_policy)=policy; } while (6==u_fscanf(f,"%d.%d.%d %d.%d.%d",&start,&start_char,&start_letter,&end,&end_char,&end_letter)) { /* We look if there is an output or not, i.e. a space or a new line */ int c=u_fgetc(f); if (c==' ') { /* If we have an output to read */ readline(line,f); /* In debug mode, we have to stop at the char #1 */ int i=-1; while (line->str[++i]!=1 && line->str[i]!='\0') { } line->str[i]='\0'; } is_an_output=(policy!=IGNORE_OUTPUTS); if (l==NULL) { l=new_match(start,end,start_char,end_char,start_letter,end_letter,is_an_output?line->str:NULL,-1,NULL,prv_alloc); end_of_list=l; } else { end_of_list->next=new_match(start,end,start_char,end_char,start_letter,end_letter,is_an_output?line->str:NULL,-1,NULL,prv_alloc); end_of_list=end_of_list->next; } } free_Ustring(line); return l; }
/**
 * \brief \b fgets working with \b U_FILE and storing \b char
 *
 * Needed to process configuration file.
 *
 * Characters are read until a new line or the end of file is met, or until
 * \c n-1 characters have been stored. As with \c fgets, at most \c n bytes
 * are written into \c line, terminating null character included. Unlike
 * \c fgets, the new line is consumed but not stored.
 *
 * When a line is truncated because the buffer is full, the character that
 * was read when the limit was reached is dropped; the rest of the line is
 * left in the file for the next call.
 *
 * @param[out] line the text read
 * @param[in] n size of \c line, i.e. max number of characters stored plus one
 * @param[in] u file descriptor
 *
 * @return NULL if \c line is NULL, if \c n is not positive or if no character
 *         has been read before \c EOF has been encountered, \c line otherwise
 */
char *cassys_fgets(char *line, int n, U_FILE *u) {
	if (line == NULL || n <= 0) {
		/* There is no room to store anything, not even the terminator */
		return NULL;
	}

	int c = u_fgetc(u);
	if (c == EOF) {
		return NULL;
	}

	int i = 0;
	/* n-1 and not n: line[i] must stay inside the buffer for the final '\0' */
	while (c != EOF && c != '\n' && i < n - 1) {
		line[i++] = (char) c;
		c = u_fgetc(u);
	}
	line[i] = '\0';
	return line;
}
/**
 * Consumes one character of 'input' and checks that it is a percent sign
 * (U+0025).
 *
 * '*argConverted' is set to 0 when the character is a percent sign and to -1
 * otherwise. 'info', 'args', 'fmt' and 'fmtConsumed' are not used.
 *
 * Always returns 1.
 */
static int32_t
u_scanf_simple_percent_handler(UFILE             *input,
                               u_scanf_spec_info *info,
                               ufmt_args         *args,
                               const UChar       *fmt,
                               int32_t           *fmtConsumed,
                               int32_t           *argConverted)
{
    /* the next character in the input has to be a percent */
    *argConverted = (u_fgetc(input) == 0x0025) ? 0 : -1;
    return 1;
}
/**
 * Loads an alphabet file and returns the associated 'Alphabet*' structure.
 * 'korean' is passed to new_alphabet(); according to the original
 * documentation, a non null value makes the equivalences between Chinese
 * and Hangul characters be computed.
 *
 * If a structure is already registered for 'filename' (see
 * get_persistent_structure), it is returned without reading the file.
 *
 * Each non empty line of the file has one of the following forms:
 *  - "#AZ": an interval; every character from A to Z gets the case flags
 *    1|2 and is declared equivalent to itself;
 *  - "Aa": the first character gets the flag 1, the second one gets the
 *    flag 2, and add_letter_equivalence(alphabet,a,A) is called
 *    (presumably A is the uppercase variant of a);
 *  - "A": a single character with no case distinction (like in Thai), which
 *    gets both flags and is declared equivalent to itself.
 *
 * The end of file is never taken as a character: a last line "A" without
 * final new line is handled as a single character, and an interval cut by
 * the end of file is an error.
 *
 * Returns NULL if the file cannot be opened or if an interval is invalid.
 */
Alphabet* load_alphabet(const VersatileEncodingConfig* vec,const char* filename,int korean) {
void* a=get_persistent_structure(filename);
if (a!=NULL) {
   return (Alphabet*)a;
}
U_FILE* f=u_fopen(vec,filename,U_READ);
if (f==NULL) {
   return NULL;
}
Alphabet* alphabet=new_alphabet(korean);
int c;
unichar lower,upper;
while ((c=u_fgetc(f))!=EOF) {
   upper=(unichar)c;
   if (upper=='\n') {
      /* We skip empty lines */
      continue;
   }
   if (upper=='#') {
      // we are in the case of an interval #AZ -> [A..Z]
      int first=u_fgetc(f);
      int last=(first==EOF)?EOF:u_fgetc(f);
      if (first==EOF || last==EOF) {
         /* EOF must be tested before the cast to unichar, otherwise it
          * would be taken as a bound of the interval */
         error("Error in alphabet file: unexpected end of file in an interval like #AZ\n");
         free_alphabet(alphabet);
         u_fclose(f);
         return NULL;
      }
      lower=(unichar)first;
      upper=(unichar)last;
      if (lower>upper) {
         error("Error in alphabet file: for an interval like #AZ, A must be before Z\n");
         free_alphabet(alphabet);
         u_fclose(f);
         return NULL;
      }
      for (c=lower;c<=upper;c++) {
         SET_CASE_FLAG_MACRO(c,alphabet,1|2);
         add_letter_equivalence(alphabet,(unichar)c,(unichar)c);
      }
      u_fgetc(f); // reading the \n
   }
   else {
      SET_CASE_FLAG_MACRO(upper,alphabet,1);
      int next=u_fgetc(f);
      if (next!='\n' && next!=EOF) {
         lower=(unichar)next;
         SET_CASE_FLAG_MACRO(lower,alphabet,2);
         u_fgetc(f); // reading the \n
         add_letter_equivalence(alphabet,lower,upper);
      }
      else {
         // we are in the case of a single (no min/maj distinction like in thai)
         SET_CASE_FLAG_MACRO(upper,alphabet,2);
         add_letter_equivalence(alphabet,upper,upper);
      }
   }
}
u_fclose(f);
return alphabet;
}
/**
 * Returns a newly allocated copy of the text that lies between the current
 * position of 'u' and the '}' that closes the brace considered as already
 * opened when the function is called.
 *
 * The text is scanned once to measure it: a character that follows a
 * backslash is never interpreted, a '{' opens a nested level and a '}'
 * closes the innermost one. The file is then moved back to its initial
 * position and exactly the measured number of characters is read again
 * into the result, so the closing '}' itself is left unread.
 *
 * The result is null-terminated and must be released with free().
 * Reaching the end of file before the closing brace, as well as a failure
 * of ftell or fseek, is a fatal error; a failed allocation exits the
 * program with status 1.
 */
unichar *get_braced_string(U_FILE *u){
    long start = ftell(u);
    if (start == -1) {
        perror("ftell\n");
        fatal_error("ftell");
    }

    /* First pass: we measure the braced content */
    int depth = 0;          /* number of nested braces currently opened */
    int count = 0;
    bool escaped = false;
    int ch;
    for (ch = u_fgetc(u); ch != EOF; count++, ch = u_fgetc(u)) {
        unichar current = (unichar)ch;
        if (escaped) {
            /* A protected character has no special meaning */
            escaped = false;
            continue;
        }
        if (current == '\\') {
            escaped = true;
            continue;
        }
        if (current == '}') {
            if (depth == 0) {
                /* This is the brace we were looking for */
                break;
            }
            depth--;
        } else if (current == '{') {
            depth++;
        }
    }

    if (ch == EOF) {
        fatal_error("Unexpected end of file");
    }

    unichar *result = (unichar*)malloc(sizeof(unichar)*(count+1));
    if (result == NULL) {
        perror("malloc\n");
        fprintf(stderr,"Impossible to allocate memory\n");
        exit(1);
    }

    /* Second pass: we go back and copy what has been measured */
    if (fseek(u,start,SEEK_SET) == -1) {
        perror("fseek");
        fatal_error("fseek");
    }

    unichar *out = result;
    for (int remaining = count; remaining > 0; remaining--) {
        *out++ = (unichar)u_fgetc(u);
    }
    *out = '\0';

    return result;
}
void char_by_char_tokenization(U_FILE* f,U_FILE* coded_text,U_FILE* output,Alphabet* alph, vector_ptr* tokens,struct hash_table* hashtable, vector_int* n_occur,vector_int* n_enter_pos, int *SENTENCES,int *TOKENS_TOTAL,int *WORDS_TOTAL, int *DIGITS_TOTAL) { int c; unichar s[MAX_TAG_LENGTH]; int n; char ENTER; int COUNT=0; int current_megabyte=0; c=u_fgetc(f); while (c!=EOF) { COUNT++; if ((COUNT/(1024*512))!=current_megabyte) { current_megabyte++; u_printf("%d megabytes read... \r",(COUNT/(1024*512))); } if (c==' ' || c==0x0d || c==0x0a) { ENTER=0; if (c=='\n') { ENTER=1; } // if the char is a separator, we jump all the separators while ((c=u_fgetc(f))==' ' || c==0x0d || c==0x0a) { if (c=='\n') ENTER=1; COUNT++; } s[0]=' '; s[1]='\0'; n=get_token_number(s,tokens,hashtable,n_occur); /* If there is a \n, we note it */ if (ENTER==1) { vector_int_add(n_enter_pos,*TOKENS_TOTAL); } (*TOKENS_TOTAL)++; fwrite(&n,4,1,coded_text); } else if (c=='{') { s[0]='{'; int z=1; while (z<(MAX_TAG_LENGTH-1) && (c=u_fgetc(f))!='}' && c!='{' && c!='\n') { s[z++]=(unichar)c; COUNT++; } if (c=='\n') { // if the tag contains a return fatal_error("Error: a tag containing a new-line sequence has been found\n"); } if (z==(MAX_TAG_LENGTH-1) || c!='}') { // if the tag has no ending } if (z==(MAX_TAG_LENGTH-1)) {z--;} s[z]='\0'; fatal_error("Error: a tag without ending } has been found:\n==>%S<==\n",s); } s[z]='}'; s[z+1]='\0'; if (!u_strcmp(s,"{S}")) { // if we have found a sentence delimiter (*SENTENCES)++; } else { if (u_strcmp(s,"{STOP}") && !check_tag_token(s)) { // if a tag is incorrect, we exit fatal_error("The text contains an invalid tag. 
Unitex cannot process it."); } } n=get_token_number(s,tokens,hashtable,n_occur); (*TOKENS_TOTAL)++; fwrite(&n,4,1,coded_text); c=u_fgetc(f); } else { s[0]=(unichar)c; s[1]='\0'; n=get_token_number(s,tokens,hashtable,n_occur); (*TOKENS_TOTAL)++; if (is_letter((unichar)c,alph)) (*WORDS_TOTAL)++; else if (c>='0' && c<='9') (*DIGITS_TOTAL)++; fwrite(&n,4,1,coded_text); c=u_fgetc(f); } } for (n=0;n<tokens->nbelems;n++) { u_fprintf(output,"%S\n",tokens->tab[n],output); } }
int xmlize(const VersatileEncodingConfig* vec,const char* fin,const char* fout,int ouput_style) { U_FILE* input = u_fopen(vec, fin, U_READ); if (input == NULL) { error("Input file '%s' not found!\n", fin); return DEFAULT_ERROR_CODE; } U_FILE* output = u_fopen(UTF8, fout, U_WRITE); if (output == NULL) { error("Cannot open output file '%s'!\n", fout); u_fclose(input); return DEFAULT_ERROR_CODE; } else // FIXME(johndoe) put breaks if(ouput_style==XML) { u_fprintf(output, xml_open); } else { u_fprintf(output, tei_open); } int sentence_count = 1; int sentence_count_relative = 1; int paragraph_count = 1; u_fprintf(output, "<p><s id=\"n%d\" xml:id=\"d1p%ds%d\">",sentence_count++,paragraph_count,sentence_count_relative++); int current_state = 0; unichar c; int i; while ((i = u_fgetc(input)) != EOF) { c = (unichar)i; switch (current_state) { case 0: { if ( c == '{') current_state = 1; else if(c == '&') u_fprintf(output, "&"); else if(c == '<') u_fprintf(output, "<"); else if(c == '>') u_fprintf(output, ">"); else u_fputc(c, output); break; } case 1: { if (c == 'S') current_state = 2; else { u_fputc('{', output); u_fputc(c, output); current_state = 0; } break; } case 2: { if (c == '}') current_state = 3; else { u_fputc('{', output); u_fputc('S', output); u_fputc(c, output); current_state = 0; } break; } case 3: { if (c == '{') current_state = 4; else if (c == '\n' || c == ' ' || c == '\t') { u_fputc(c, output); current_state = 3; } else { u_fprintf(output, "</s><s id=\"n%d\" xml:id=\"d1p%ds%d\">",sentence_count++,paragraph_count,sentence_count_relative++); u_fputc(c, output); current_state = 0; } break; } case 4: { if (c == 'S') current_state = 7; else if (c == 'P') current_state = 5; else { u_fputc('{', output); u_fputc(c, output); current_state = 0; } break; } case 5: { if (c == '}') { u_fprintf(output, "</s></p>\n"); paragraph_count++; sentence_count_relative=1; current_state = 6; } else { u_fputc('{', output); u_fputc('P', output); u_fputc(c, output); current_state = 0; 
} break; } case 6: { if (c == '\n' || c == ' ' || c == '\t') u_fputc(c, output); else { u_fprintf(output, "<p><s id=\"n%d\" xml:id=\"d1p%ds%d\">",sentence_count++,paragraph_count,sentence_count_relative++); u_fputc(c, output); current_state = 0; } break; } case 7: { if (c == '}') { current_state = 3; } else { u_fputc('{', output); u_fputc('S', output); u_fputc(c, output); current_state = 0; } break; } } } if (current_state == 3) { //... } else if (current_state == 6) { //... } else { u_fprintf(output, "</s></p>\n"); } if(ouput_style==XML) { u_fprintf(output, xml_close); } else { u_fprintf(output, tei_close); } u_fclose(input); u_fclose(output); u_printf("Done.\n"); return SUCCESS_RETURN_CODE; }
int tei2txt(char *fin, char *fout, const VersatileEncodingConfig* vec) { void* html_ctx = init_HTML_character_context(); if (html_ctx == NULL) { alloc_error("tei2txt"); return ALLOC_ERROR_CODE; } U_FILE* input = u_fopen(vec, fin, U_READ); if (input == NULL) { error("Input file '%s' not found!\n", fin); free_HTML_character_context(html_ctx); return DEFAULT_ERROR_CODE; } U_FILE* output = u_fopen(vec, fout, U_WRITE); if (output == NULL) { error("Cannot open output file '%s'!\n", fout); u_fclose(input); free_HTML_character_context(html_ctx); return DEFAULT_ERROR_CODE; } unichar buffer[5000]; int i, j, k; unichar c; if((i = u_fgetc(input)) != EOF) { c = (unichar)i; for (;;) { while(c != '<' && (i = u_fgetc(input)) != EOF) { c = (unichar)i; } j = 0; while((i = u_fgetc(input)) != EOF && (c = (unichar)i) != ' ' && (c = (unichar)i) != '\t' && (c = (unichar)i) != '\n' && (c = (unichar)i) != '>') { buffer[j++] = c; } buffer[j] = '\0'; if (c!='>') { /* We do this because we can find <body ...> */ while((i = u_fgetc(input)) != EOF && (c = (unichar)i) != '>') {} } //u_printf("Current tag : <%S>\n", buffer); if(!u_strcmp(buffer, body)) { break; } else { buffer[0] = '\0'; } } } else { error("Empty TEI file %s\n", fin); } char schars[11]; int first_sentence=1; int current_state = 0; int inside_sentence=0; while ((i = u_fgetc(input)) != EOF) { c = (unichar)i; switch (current_state) { case 0: { if(c == '<') { current_state = 1; inside_sentence=0; } else if(c == '&') { current_state = 3; } else if (inside_sentence) { u_fputc(c, output); } break; } case 1: { if(c == 's' || c == 'S') { current_state = 2; } else { while((i = u_fgetc(input)) != EOF) { c = (unichar)i; if(c == '>') { break; } } current_state = 0; } break; } case 2: { if(c == ' ' || c == '>') { current_state = 0; inside_sentence=1; if (!first_sentence) { /* We put a {STOP} tag in order to avoid matches that overlap 2 sentences */ u_fprintf(output,"\n{STOP}{S}"); } else { first_sentence=0; } } if(c != '>') { while((i = 
u_fgetc(input)) != EOF) { c = (unichar)i; if(c == '>') { break; } } } break; } case 3: { j = 0; while(c != ';' && (i = u_fgetc(input)) != EOF) { //u_printf("Current S-character: %C\n", c); schars[j++] = (char)c; c = (unichar)i; } schars[j] = '\0'; //u_printf("Current S-chain: %S\n", schars); k = get_HTML_character(html_ctx,schars, 1); switch (k) { case UNKNOWN_CHARACTER: { u_fputc('?', output); break; } case MALFORMED_HTML_CODE: { error("Malformed HTML character declaration &%s;\n", schars); u_fputc('?', output); break; } default: { c = (unichar)k; u_fputc(c, output); break; } } schars[0] = '\0'; current_state = 0; break; } } } u_fclose(output); u_fclose(input); free_HTML_character_context(html_ctx); u_printf("Done.\n"); return SUCCESS_RETURN_CODE; }
U_CFUNC int32_t u_scanf_parse(UFILE *f, const UChar *patternSpecification, va_list ap) { const UChar *alias; int32_t count, converted, argConsumed, cpConsumed; uint16_t handlerNum; ufmt_args args; u_scanf_spec spec; ufmt_type_info info; u_scanf_handler handler; /* alias the pattern */ alias = patternSpecification; /* haven't converted anything yet */ argConsumed = 0; converted = 0; cpConsumed = 0; /* iterate through the pattern */ for(;;) { /* match any characters up to the next '%' */ while(*alias != UP_PERCENT && *alias != 0x0000 && u_fgetc(f) == *alias) { alias++; } /* if we aren't at a '%', or if we're at end of string, break*/ if(*alias != UP_PERCENT || *alias == 0x0000) break; /* parse the specifier */ count = u_scanf_parse_spec(alias, &spec); /* update the pointer in pattern */ alias += count; handlerNum = (uint16_t)(spec.fInfo.fSpec - USCANF_BASE_FMT_HANDLERS); if (handlerNum < USCANF_NUM_FMT_HANDLERS) { /* skip the argument, if necessary */ /* query the info function for argument information */ info = g_u_scanf_infos[ handlerNum ].info; if (info != ufmt_count && u_feof(f)) { break; } else if(spec.fInfo.fSkipArg) { args.ptrValue = NULL; } else { switch(info) { case ufmt_count: /* set the spec's width to the # of items converted */ spec.fInfo.fWidth = cpConsumed; U_FALLTHROUGH; case ufmt_char: case ufmt_uchar: case ufmt_int: case ufmt_string: case ufmt_ustring: case ufmt_pointer: case ufmt_float: case ufmt_double: args.ptrValue = va_arg(ap, void*); break; default: /* else args is ignored */ args.ptrValue = NULL; break; } } /* call the handler function */ handler = g_u_scanf_infos[ handlerNum ].handler; if(handler != 0) { /* reset count to 1 so that += for alias works. 
*/ count = 1; cpConsumed += (*handler)(f, &spec.fInfo, &args, alias, &count, &argConsumed); /* if the handler encountered an error condition, break */ if(argConsumed < 0) { converted = -1; break; } /* add to the # of items converted */ converted += argConsumed; /* update the pointer in pattern */ alias += count-1; } /* else do nothing */ } /* else do nothing */ /* just ignore unknown tags */ }