/*
 * Append one 16-bit character to the shared paragraph buffer.
 *
 * bufptr points to the index of the last element stored in buffer
 * (-1 when nothing has been stored yet); it is advanced before c is
 * written.  When the buffer fills up, the text collected so far is
 * zero-terminated, passed to output_paragraph() and *bufptr is reset
 * to -1 so the next character starts a fresh buffer.
 *
 * The flush is triggered as soon as index PARAGRAPH_BUFFER-2 has been
 * written, so the terminator goes to index PARAGRAPH_BUFFER-1.  The
 * earlier test (> PARAGRAPH_BUFFER-2) fired one element later and stored
 * the terminator at index PARAGRAPH_BUFFER.
 * NOTE(review): this assumes buffer holds exactly PARAGRAPH_BUFFER
 * elements - confirm against its definition.
 */
static void add_to_buffer(int *bufptr, unsigned short int c)
{
	int last = ++(*bufptr);

	buffer[last] = c;
	if (last >= PARAGRAPH_BUFFER - 2) {
		buffer[last + 1] = 0;
		output_paragraph(buffer);
		*bufptr = -1;
	}
}
/*
 * Read characters from f through get_unicode_char until end of input or
 * until offset reaches stop, collect them into paragraphs and hand every
 * paragraph to output_paragraph() together with out, a line buffer and
 * its position.
 *
 * Control characters below 32 that are accepted inside a paragraph:
 *   0x0002 - footnote mark (dropped)
 *   0x0007 - table separator (switches to tabmode)
 *   0x0009 - horizontal tab (stored as is)
 *   0x000B - hard return (stored as 0x000A)
 *   0x000C - page break (stored as is)
 *   0x000D - return, marks the end of a paragraph (stored as 0x000A)
 *   0x001E - IS2, a short dash in Word (stored as '-')
 *   0x001F - soft hyphen in Word (stored as U+00AD)
 *   0x0013 - start of embedded hyperlink (stored as space)
 *   0x0014 - separates hyperlink URL from text (stored as space)
 *   0x0015 - end of embedded hyperlink (stored as space)
 *   0x0001 - dropped while inside a hyperlink
 * Any other control character discards the paragraph collected so far.
 *
 * Returns 0 on success.  Returns -1 with errno set to ENOMEM when the
 * paragraph buffer cannot be allocated, and -1 when get_unicode_char
 * returns -1.
 */
int process_file(FILE *f, long stop, FILE *out,
		get_unicode_char_t get_unicode_char,
		struct ole_params_t *ole_params, struct io_funcs_t *io_funcs)
{
	int bufptr;
	int tabmode = 0;
	long offset = 0;
	int hyperlink_mode = 0;
	unsigned short c;
	unsigned short int *buffer;
	unsigned char read_buf[256];
	int buf_is_unicode = 0;
	char outputbuffer[LINE_BUF_SIZE] = "";
	int bufpos = 0;

	/*
	 * One iteration of the inner loop can store two elements (a pending
	 * table separator plus the character itself), so the last character
	 * may land on index PARAGRAPH_BUFFER-1.  The extra element keeps
	 * the terminator written after the loop inside the allocation.
	 */
	buffer = malloc(sizeof *buffer * (PARAGRAPH_BUFFER + 1));
	if (!buffer) {
		errno = ENOMEM;
		return -1;
	}

	/* Now we are starting to read with get_unicode_char */
	while (!io_funcs->catdoc_eof(f) && offset < stop) {
		bufptr = -1;
		do {
			/*
			 * Keep the result in an int: once narrowed to
			 * unsigned short it can never compare equal to -1,
			 * which made the error return below unreachable.
			 */
			int unichar = get_unicode_char(f, &offset, stop,
					ole_params, read_buf,
					&buf_is_unicode, io_funcs);
			if (unichar == -1) {
				free(buffer);
				return -1;
			}
			c = (unsigned short)unichar;

			if (tabmode) {
				/* previous character was a table separator */
				tabmode = 0;
				if (c == 0x007) {
					buffer[++bufptr] = 0x1E;
					continue;
				} else {
					buffer[++bufptr] = 0x1C;
				}
			}
			if (c < 32) {
				switch (c) {
				case 0x007:
					tabmode = 1;
					break;
				case 0x000D:
				case 0x000B:
					buffer[++bufptr] = 0x000A;
					break;
				case 0x000C:
					buffer[++bufptr] = c;
					break;
				case 0x001E:
					buffer[++bufptr] = '-';
					break;
				case 0x0002:
					break;
				case 0x001F:
					/* translate to Unicode soft hyphen */
					buffer[++bufptr] = 0xAD;
					break;
				case 0x0009:
					buffer[++bufptr] = c;
					break;
				case 0x0013:
					hyperlink_mode = 1;
					buffer[++bufptr] = ' ';
					break;
				case 0x0014:
					hyperlink_mode = 0;
					/* fall through */
				case 0x0015:
					/* just treat hyperlink separators as space */
					buffer[++bufptr] = ' ';
					break;
				case 0x0001:
					if (hyperlink_mode)
						break;
					/* else fall through */
				default:
					/* Any other control char - discard para */
					bufptr = -1;
				}
			} else if (c != 0xfeff) {
				/* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output
				 * anything else */
				buffer[++bufptr] = c;
			}
			/*
			 * buffer[bufptr] is inspected only when something is
			 * stored; an empty buffer (bufptr == -1) used to be
			 * read at index -1 here.
			 */
		} while (bufptr < PARAGRAPH_BUFFER - 2 &&
			 !io_funcs->catdoc_eof(f) &&
			 (bufptr < 0 || buffer[bufptr] != 0x000a));

		if (bufptr > 0) {
			buffer[++bufptr] = 0;
			output_paragraph(buffer, out, outputbuffer, &bufpos);
		}
	}

	free(buffer);
	return 0;
}
int parse_rtf(FILE *f) { int para_mode=0, data_skip_mode=0,i; RTFGroupData *groups=NULL; int group_count=0, group_store=20; // this makes sure we have an mbcs lookup table available (void)read_charset("shiftjis"); // and now set it back to the original (void)read_charset(source_csname); int bufptr=-1; fseek(f,0,SEEK_SET); if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) { perror("Can\'t allocate memory: "); return 1; } groups[0].uc = 1; /* RTF spec says DEfault uc = 1 */ groups[0].mbcs = 0; /* assume not using multibyte characters */ groups[0].codepage = 1252; groups[0].charset = source_charset; while ( !feof(f) ) { int c = fgetc(f); if ( feof( f ) ) break; switch (c) { case '\\': { int code; RTFcommand com; if ((code=getRtfCommand(f, &com)) != 0) break; switch (com.type) { case RTF_SPEC_CHAR: if (com.numarg == '*' && data_skip_mode == 0) { data_skip_mode=group_count; } else if (com.numarg == '\r') { end_paragraph(&bufptr); } else if (com.numarg == '~') { add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */ } else if (com.numarg == '-') { add_to_buffer(&bufptr,0xAD);/* Optional hyphen */ } break; case RTF_EMDASH: add_to_buffer(&bufptr,0x2014);/* EM DASH*/ break; case RTF_ENDASH: add_to_buffer(&bufptr,0x2013);break; case RTF_BULLET: add_to_buffer(&bufptr,0x2022);break; case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break; case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break; case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break; case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break; case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break; case RTF_EMSPACE: case RTF_ENSPACE: add_to_buffer(&bufptr,' ');break; case RTF_CHAR: if (data_skip_mode == 0) { short int *charset = groups[group_count].charset; // check for multibyte characters - filter check on DBCS lead bytes as unicode charset cp932 if (groups[group_count].mbcs && ( ((com.numarg >= 0x81) && (com.numarg <= 0x9f)) || ((com.numarg >= 0xe0) && (com.numarg <= 0xfc)) ) ) { // is next 
char a command lead-in int next_char = fgetc(f); if (next_char == '\\') { next_char = fgetc(f); // is it an escaped character? if (next_char == '\'') { RTFcommand com2; ungetc(next_char,f); next_char=getRtfCommand(f, &com2); if ((next_char != -1) && (com2.type == RTF_CHAR)) { // if a trailing byte in mcbs 2nd byte range if ((com2.numarg >= 0x40) && (com2.numarg <= 0xfc)) // add mbcs char add_to_buffer(&bufptr, rtf_to_unicode((int) (((unsigned char)com.numarg) << 8) | (unsigned char)(com2.numarg), charset)); else { // else add as 2 hich bytes add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); add_to_buffer(&bufptr,rtf_to_unicode(com2.numarg, charset)); } } // screwup in 2nd byte. Add hich char else add_to_buffer(&bufptr,rtf_to_unicode(com.numarg,charset)); } // not a escaped character else { // push back values ungetc(next_char,f); ungetc('\\',f); // add hich char add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); } } // not a command following else { // push back values ungetc(next_char,f); // add hich char add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); } } else add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); } break; case RTF_UC: groups[group_count].uc=com.numarg; break; case RTF_TAB: add_to_buffer(&bufptr,0x0009); break; case RTF_UNICODE_CHAR: if (com.numarg < 0) break; if (data_skip_mode == 0) add_to_buffer(&bufptr,com.numarg); i=groups[group_count].uc; if (i > 0) { while (i--) { c = fgetc(f); // are we reading and skipping a control sequence? 
if (c == '\\') { // bin it (likely a \'xx value) getRtfCommand(f, &com); } } } break; case RTF_PARA: /* *** CHECKME *** if (para_mode > 0) {*/ end_paragraph(&bufptr); /*}*/ para_mode=group_count; break; case RTF_PICT: case RTF_FONTTBL: loadFontTable(f); break; case RTF_F: { RTFFontTableEntry *entry = lookupFontTableEntry(com.numarg); if (entry) { if (!entry->codepage) entry->codepage = charsetCodepage(entry->charset); if (entry->codepage != groups[group_count].codepage) { groups[group_count].codepage = entry->codepage; rtfSetCharset(&groups[group_count]); } } } break; case RTF_INFO: case RTF_COLORTBL: case RTF_STYLESHEET: case RTF_LISTTABLE: case RTF_LISTOVERRIDETABLE: case RTF_RSIDTBL: case RTF_GENERATOR: case RTF_DATAFIELD: if (data_skip_mode == 0){ data_skip_mode=group_count; } break; case RTF_LANG: /* fprintf(stderr, "Selected lang = %d\n",com.numarg); */ groups[group_count].codepage = lcidCodepage(com.numarg); rtfSetCharset(&groups[group_count]); break; case RTF_DEFLANG: case RTF_DEFLANGFE: groups[group_count].codepage = lcidCodepage(com.numarg); rtfSetCharset(&groups[group_count]); break; case RTF_FONT_CHARSET: groups[group_count].codepage = charsetCodepage(com.numarg); rtfSetCharset(&groups[group_count]); break; case RTF_CODEPAGE: groups[group_count].codepage = com.numarg; rtfSetCharset(&groups[group_count]); break; case RTF_PLAIN: groups[group_count].mbcs = 0; groups[group_count].codepage = groups[0].codepage; rtfSetCharset(&groups[group_count]); break; case RTF_LOCH: groups[group_count].mbcs = 1; break; case RTF_HICH: groups[group_count].mbcs = 1; break; case RTF_DBCH: groups[group_count].mbcs = 1; break; case RTF_INDEX: { int current_group = group_count; /* skip all of current group */ do { c = fgetc(f); if (c == '{') group_count++; else if (c == '}') group_count--; } while (group_count >= current_group); ungetc('}',f); } break; default: /* fprintf(stderr, "Unknown command with name %s and arg=%d\n", */ /* com.name, com.numarg); */ ; } break; } case '{': 
group_count++; if (group_count >= group_store ) { group_store+=10; if((groups=(RTFGroupData*)realloc(groups, group_store*sizeof(RTFGroupData))) == NULL ) { perror("Can\'t allocate memory: "); return 1; } } // this looks wrong - removed pending review FIXME // if (para_mode) // add_to_buffer(&bufptr,0x20); groups[group_count]=groups[group_count-1]; break; case '}': group_count--; if(group_count < 0) group_count=0; if(para_mode > 0 && para_mode > group_count) { /*add_to_buffer(&bufptr,0); output_paragraph(buffer); fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr); bufptr=-1;*/ para_mode=0; } if(data_skip_mode > group_count) { data_skip_mode=0; } break; default: if (data_skip_mode == 0) if (c != '\n' && c != '\r') add_to_buffer(&bufptr,rtf_to_unicode(c, groups[group_count].charset)); } } if (bufptr>=0) { add_to_buffer(&bufptr,'\n'); add_to_buffer(&bufptr,0); output_paragraph(buffer); } free(groups); return 0; }
/*
 * Finish the paragraph being collected: append a line feed, zero-terminate
 * the buffer, pass it to output_paragraph() and reset *bufptr to -1.
 *
 * The terminator is stored directly instead of through add_to_buffer().
 * When the terminator happened to fill the buffer, add_to_buffer() flushed
 * the paragraph itself and the output_paragraph() call here emitted the
 * same text a second time.  add_to_buffer() never returns with *bufptr
 * above PARAGRAPH_BUFFER-2, so the store below stays within the first
 * PARAGRAPH_BUFFER elements; after a flush *bufptr is -1 and an empty
 * paragraph is passed on, as before.
 */
static void end_paragraph(int *bufptr)
{
	add_to_buffer(bufptr, 0x000a);
	buffer[*bufptr + 1] = 0;
	output_paragraph(buffer);
	*bufptr = -1;
}
int process_file(FILE *f,long stop) { int bufptr; int tabmode=0; long offset=0; int hyperlink_mode = 0; unsigned short c; /* Now we are starting to read with get_unicode_char */ while (!catdoc_eof(f) && offset<stop) { bufptr = -1; do { int unichar = get_unicode_char(f,&offset,stop); if (unichar < 0) continue; c = unichar; /* Following symbols below 32 are allowed inside paragraph: 0x0002 - footnote mark 0x0007 - table separator (converted to tabmode) 0x0009 - Horizontal tab ( printed as is) 0x000B - hard return 0x000C - page break 0x000D - return - marks an end of paragraph 0x001E - IS2 for some reason means short defis in Word. 0x001F - soft hyphen in Word 0x0013 - start embedded hyperlink 0x0014 - separate hyperlink URL from text 0x0015 - end embedded hyperlink */ if (tabmode) { tabmode=0; if (c==0x007) { buffer[++bufptr]=0x1E; continue; } else { buffer[++bufptr]=0x1C; } } if (c<32) { switch (c) { case 0x007: tabmode = 1; break; case 0x000D: case 0x000B: buffer[++bufptr]=0x000A; break; case 0x000C: buffer[++bufptr]=c; break; case 0x001E: buffer[++bufptr]='-'; break; case 0x0002: break; case 0x001F: buffer[++bufptr]=0xAD;/* translate to Unicode soft hyphen */ break; case 0x0009: buffer[++bufptr]=c; break; case 0x0013: hyperlink_mode=1; buffer[++bufptr]=' '; break; case 0x0014: hyperlink_mode = 0; /*fall through */ case 0x0015: /* just treat hyperlink separators as * space */ buffer[++bufptr]=' '; break; case 0x0001: if (hyperlink_mode) break; /* else fall through */ default: bufptr=-1; /* Any other control char - discard para*/ } } else if (c != 0xfeff) { /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything * else*/ buffer[++bufptr]=c; } } while (bufptr >=0 && bufptr<PARAGRAPH_BUFFER-2 && !catdoc_eof(f) && buffer[bufptr]!=0x000a); if (bufptr>0) { buffer[++bufptr]=0; output_paragraph(buffer); } } return 0; }