/**
 * Make group->charset the conversion table for group->codepage.
 *
 * Does nothing when forced_charset is set.  A table already registered for
 * the codepage (getCharset) is reused; otherwise the charset name is
 * resolved with charset_from_codepage()/check_charset(), the table is loaded
 * with read_charset() and registered with addCharset().  If no table can be
 * obtained, the table returned by getDefaultCharset() is used.
 *
 * The table previously referenced by group->charset is not freed here.
 *
 * @param group  group state: codepage is read, charset is written.
 */
static void rtfSetCharset(RTFGroupData *group)
{
    const char *charset_name;
    char *save_buf;
    short int *cached;

    if (forced_charset)
        return;

    /*
     * Fix: the original returned here without touching group->charset, so
     * a group whose codepage changed to an already-loaded one kept decoding
     * with the table of its previous codepage.
     */
    cached = getCharset(group->codepage);
    if (cached) {
        group->charset = cached;
        return;
    }

    charset_name = charset_from_codepage(group->codepage);
    check_charset(&source_csname, charset_name);

    /*
     * input_buffer is cleared while the table is read and restored
     * afterwards.
     * NOTE(review): presumably this keeps read_charset() from using the
     * document's I/O buffer -- confirm against read_charset().
     */
    save_buf = input_buffer;
    input_buffer = NULL;
    addCharset(read_charset(source_csname), group->codepage);
    input_buffer = save_buf;

    group->charset = getCharset(group->codepage);
    if (!group->charset)
        group->charset = getDefaultCharset();
}
int parse_rtf(FILE *f) { int para_mode=0, data_skip_mode=0,i; RTFGroupData *groups=NULL; int group_count=0, group_store=20; // this makes sure we have an mbcs lookup table available (void)read_charset("shiftjis"); // and now set it back to the original (void)read_charset(source_csname); int bufptr=-1; fseek(f,0,SEEK_SET); if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) { perror("Can\'t allocate memory: "); return 1; } groups[0].uc = 1; /* RTF spec says DEfault uc = 1 */ groups[0].mbcs = 0; /* assume not using multibyte characters */ groups[0].codepage = 1252; groups[0].charset = source_charset; while ( !feof(f) ) { int c = fgetc(f); if ( feof( f ) ) break; switch (c) { case '\\': { int code; RTFcommand com; if ((code=getRtfCommand(f, &com)) != 0) break; switch (com.type) { case RTF_SPEC_CHAR: if (com.numarg == '*' && data_skip_mode == 0) { data_skip_mode=group_count; } else if (com.numarg == '\r') { end_paragraph(&bufptr); } else if (com.numarg == '~') { add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */ } else if (com.numarg == '-') { add_to_buffer(&bufptr,0xAD);/* Optional hyphen */ } break; case RTF_EMDASH: add_to_buffer(&bufptr,0x2014);/* EM DASH*/ break; case RTF_ENDASH: add_to_buffer(&bufptr,0x2013);break; case RTF_BULLET: add_to_buffer(&bufptr,0x2022);break; case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break; case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break; case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break; case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break; case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break; case RTF_EMSPACE: case RTF_ENSPACE: add_to_buffer(&bufptr,' ');break; case RTF_CHAR: if (data_skip_mode == 0) { short int *charset = groups[group_count].charset; // check for multibyte characters - filter check on DBCS lead bytes as unicode charset cp932 if (groups[group_count].mbcs && ( ((com.numarg >= 0x81) && (com.numarg <= 0x9f)) || ((com.numarg >= 0xe0) && (com.numarg <= 0xfc)) ) ) { // is next 
char a command lead-in int next_char = fgetc(f); if (next_char == '\\') { next_char = fgetc(f); // is it an escaped character? if (next_char == '\'') { RTFcommand com2; ungetc(next_char,f); next_char=getRtfCommand(f, &com2); if ((next_char != -1) && (com2.type == RTF_CHAR)) { // if a trailing byte in mcbs 2nd byte range if ((com2.numarg >= 0x40) && (com2.numarg <= 0xfc)) // add mbcs char add_to_buffer(&bufptr, rtf_to_unicode((int) (((unsigned char)com.numarg) << 8) | (unsigned char)(com2.numarg), charset)); else { // else add as 2 hich bytes add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); add_to_buffer(&bufptr,rtf_to_unicode(com2.numarg, charset)); } } // screwup in 2nd byte. Add hich char else add_to_buffer(&bufptr,rtf_to_unicode(com.numarg,charset)); } // not a escaped character else { // push back values ungetc(next_char,f); ungetc('\\',f); // add hich char add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); } } // not a command following else { // push back values ungetc(next_char,f); // add hich char add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); } } else add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset)); } break; case RTF_UC: groups[group_count].uc=com.numarg; break; case RTF_TAB: add_to_buffer(&bufptr,0x0009); break; case RTF_UNICODE_CHAR: if (com.numarg < 0) break; if (data_skip_mode == 0) add_to_buffer(&bufptr,com.numarg); i=groups[group_count].uc; if (i > 0) { while (i--) { c = fgetc(f); // are we reading and skipping a control sequence? 
if (c == '\\') { // bin it (likely a \'xx value) getRtfCommand(f, &com); } } } break; case RTF_PARA: /* *** CHECKME *** if (para_mode > 0) {*/ end_paragraph(&bufptr); /*}*/ para_mode=group_count; break; case RTF_PICT: case RTF_FONTTBL: loadFontTable(f); break; case RTF_F: { RTFFontTableEntry *entry = lookupFontTableEntry(com.numarg); if (entry) { if (!entry->codepage) entry->codepage = charsetCodepage(entry->charset); if (entry->codepage != groups[group_count].codepage) { groups[group_count].codepage = entry->codepage; rtfSetCharset(&groups[group_count]); } } } break; case RTF_INFO: case RTF_COLORTBL: case RTF_STYLESHEET: case RTF_LISTTABLE: case RTF_LISTOVERRIDETABLE: case RTF_RSIDTBL: case RTF_GENERATOR: case RTF_DATAFIELD: if (data_skip_mode == 0){ data_skip_mode=group_count; } break; case RTF_LANG: /* fprintf(stderr, "Selected lang = %d\n",com.numarg); */ groups[group_count].codepage = lcidCodepage(com.numarg); rtfSetCharset(&groups[group_count]); break; case RTF_DEFLANG: case RTF_DEFLANGFE: groups[group_count].codepage = lcidCodepage(com.numarg); rtfSetCharset(&groups[group_count]); break; case RTF_FONT_CHARSET: groups[group_count].codepage = charsetCodepage(com.numarg); rtfSetCharset(&groups[group_count]); break; case RTF_CODEPAGE: groups[group_count].codepage = com.numarg; rtfSetCharset(&groups[group_count]); break; case RTF_PLAIN: groups[group_count].mbcs = 0; groups[group_count].codepage = groups[0].codepage; rtfSetCharset(&groups[group_count]); break; case RTF_LOCH: groups[group_count].mbcs = 1; break; case RTF_HICH: groups[group_count].mbcs = 1; break; case RTF_DBCH: groups[group_count].mbcs = 1; break; case RTF_INDEX: { int current_group = group_count; /* skip all of current group */ do { c = fgetc(f); if (c == '{') group_count++; else if (c == '}') group_count--; } while (group_count >= current_group); ungetc('}',f); } break; default: /* fprintf(stderr, "Unknown command with name %s and arg=%d\n", */ /* com.name, com.numarg); */ ; } break; } case '{': 
group_count++; if (group_count >= group_store ) { group_store+=10; if((groups=(RTFGroupData*)realloc(groups, group_store*sizeof(RTFGroupData))) == NULL ) { perror("Can\'t allocate memory: "); return 1; } } // this looks wrong - removed pending review FIXME // if (para_mode) // add_to_buffer(&bufptr,0x20); groups[group_count]=groups[group_count-1]; break; case '}': group_count--; if(group_count < 0) group_count=0; if(para_mode > 0 && para_mode > group_count) { /*add_to_buffer(&bufptr,0); output_paragraph(buffer); fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr); bufptr=-1;*/ para_mode=0; } if(data_skip_mode > group_count) { data_skip_mode=0; } break; default: if (data_skip_mode == 0) if (c != '\n' && c != '\r') add_to_buffer(&bufptr,rtf_to_unicode(c, groups[group_count].charset)); } } if (bufptr>=0) { add_to_buffer(&bufptr,'\n'); add_to_buffer(&bufptr,0); output_paragraph(buffer); } free(groups); return 0; }
void process_item (int rectype, int reclen, unsigned char *rec) { if (rectype != CONTINUE && prev_rectype == SST) { /* we have accumulated unparsed SST, and now encountered * another record, which indicates that SST is ended */ /* fprintf(stderr,"parse sst!\n");*/ parse_sst(sstBuffer,sstBytes); } switch (rectype) { case FILEPASS: { fprintf(stderr,"File is encrypted\n"); exit(69); break; } case WRITEPROT: /* File is write protected, but we only read it */ break; case 0x42: { if (source_charset) break; codepage=getshort(rec,0); /*fprintf(stderr,"CODEPAGE %d\n",codepage); */ if (codepage!=1200) { const char *cp = charset_from_codepage(codepage); source_charset=read_charset(cp); } break; } case FORMAT: { int format_code; format_code=getshort(rec,0); SetFormatIdxUsed(format_code); /* this debug code prints format string */ /* int i; char *ptr; fprintf(stderr,"Format %x \"",format_code); if (rec[2] == reclen - 3 && rec[3] != 0) { for (i=0,ptr=rec+3;i<rec[2];i++,ptr++) { fputc(*ptr,stderr); } } else { for (i=0,ptr=rec+5;i<rec[2];i++,ptr+=2) { fputc(*ptr,stderr); } } fprintf (stderr,"\"\n"); */ break; } case SST: { /* Just copy SST into buffer, and wait until we get * all CONTINUE records */ /* fprintf(stderr,"SST\n"); */ /* If exists first SST entry, then just drop it and start new*/ if (sstBuffer != NULL) free(sstBuffer); if (sst != NULL) free(sst); sstBuffer=(unsigned char*)malloc(reclen); sstBytes = reclen; if (sstBuffer == NULL ) { perror("SSTptr alloc error! "); exit(1); } memcpy(sstBuffer,rec,reclen); break; } case CONTINUE: { if (prev_rectype != SST) { return; /* to avoid changing of prev_rectype;*/ } sstBuffer=realloc(sstBuffer,sstBytes+reclen); if (sstBuffer == NULL ) { perror("SSTptr realloc error! 
"); exit(1); } memcpy(sstBuffer+sstBytes,rec,reclen); sstBytes+=reclen; return; } case LABEL: { int row,col; unsigned char **pcell; unsigned char *src=(unsigned char *)rec+6; saved_reference=NULL; row = getshort(rec,0); col = getshort(rec,2); /* fprintf(stderr,"LABEL!\n"); */ pcell=allocate(row,col); *pcell=copy_unicode_string(&src); break; } case BLANK: { int row,col;unsigned char **pcell; row = getshort(rec,0); col = getshort(rec,2); pcell=allocate(row,col); *pcell=NULL; break; } case MULBLANK: { int row, startcol,endcol; unsigned char **pcell; row = getshort(rec,0); startcol = getshort(rec,2); endcol=getshort(rec,reclen-2); pcell=allocate(row,endcol); *pcell=NULL; (void)startcol; break; } case CONSTANT_STRING: { int row = getshort(rec,0); int col = getshort(rec,2); unsigned char **pcell; int string_no=getshort(rec,6); if (!sst) { fprintf(stderr,"CONSTANT_STRING before SST parsed\n"); exit(1); } /* fprintf(stderr,"col=%d row=%d no=%d\n",col,row,string_no); */ saved_reference=NULL; pcell=allocate(row,col); if (string_no>=sstsize|| string_no < 0 ) { fprintf(stderr,"string index out of boundary\n"); exit(1); } else if (sst[string_no] !=NULL) { int len; unsigned char *outptr; len=strlen((char *)sst[string_no]); outptr=*pcell=malloc(len+1); strcpy((char *)outptr,(char *)sst[string_no]); } else { *pcell=malloc(1); **pcell = 0; } break; } case 0x03: case 0x103: case 0x303: case NUMBER: { int row,col; unsigned char **pcell; saved_reference=NULL; row = getshort(rec,0)-startrow; col = getshort(rec,2); pcell=allocate(row,col); *pcell=(unsigned char *)strdup(format_double(rec,6,getshort(rec,4))); break; } case INTEGER_CELL: { int row,col; unsigned char **pcell; row = getshort(rec,0)-startrow; col = getshort(rec,2); pcell=allocate(row,col); *pcell=(unsigned char *)strdup(format_int(getshort(rec,7),getshort(rec,4))); break; } case RK: { int row,col,format_code; unsigned char **pcell; saved_reference=NULL; row = getshort(rec,0)-startrow; col = getshort(rec,2); 
pcell=allocate(row,col); format_code = getshort(rec,4); *pcell=(unsigned char *)strdup(format_rk(rec+6,format_code)); break; } case MULRK: { int row,col,startcol,endcol,offset,format_code; unsigned char **pcell; row = getshort(rec,0)-startrow; startcol = getshort(rec,2); endcol = getshort(rec,reclen-2); saved_reference=NULL; for (offset=4,col=startcol;col<=endcol;offset+=6,col++) { pcell=allocate(row,col); format_code=getshort(rec,offset); *pcell=(unsigned char *)strdup(format_rk(rec+offset+2,format_code)); } break; } case FORMULA: { int row,col; unsigned char **pcell; saved_reference=NULL; row = getshort(rec,0)-startrow; col = getshort(rec,2); pcell=allocate(row,col); if (((unsigned char)rec[12]==0xFF)&&(unsigned char)rec[13]==0xFF) { /* not a floating point value */ if (rec[6]==1) { /*boolean*/ char buf[2]="0"; buf[0]+=rec[9]; *pcell=(unsigned char *)strdup(buf); } else if (rec[6]==2) { /*error*/ char buf[6]="ERROR"; *pcell=(unsigned char *)strdup(buf); } else if (rec[6]==0) { saved_reference=pcell; } } else { int format_code=getshort(rec,4); *pcell=(unsigned char *)strdup(format_double(rec,6,format_code)); } break; } case STRING: { unsigned char *src=(unsigned char *)rec; if (!saved_reference) { fprintf(stderr,"String record without preceeding string formula\n"); break; } *saved_reference=copy_unicode_string(&src); break; } case BOF: { if (rowptr) { fprintf(stderr,"BOF when current sheet is not flushed\n"); free_sheet(); } break; } case XF: case 0x43: /*from perl module Spreadsheet::ParseExecel */ { short int formatIndex = getshort(rec,2); /* we are interested only in format index here */ if (formatTableIndex >= formatTableSize) { formatTable=realloc(formatTable, (formatTableSize+=16)*sizeof(short int)); if (!formatTable) { fprintf(stderr,"Out of memory for format table"); exit (1); } } formatTable[formatTableIndex++] = formatIndex; break; } case MS1904: /* Macintosh 1904 date system */ date_shift=24107.0; break; case MSEOF: { if (!rowptr) break; print_sheet(); 
free_sheet(); break; } case ROW: { /* fprintf(stderr,"Row! %d %d %d\n",getshort(rec,0), getshort(rec+2,0),getshort(rec+4,0)); */ break; } case INDEX: { /* fprintf(stderr,"INDEX! %d %d\n", getlong(rec+4,0), getlong(rec+8,0)); */ break; } default: { #if 0 fprintf(stderr,"Unknown record 0x%x\n length %d\n",rectype,reclen); #endif } } prev_rectype=rectype; }
/* * Extracts string from sst and returns mallocked copy of it */ unsigned char *copy_unicode_string (unsigned char **src) { int count=0; int flags = 0; int start_offset=0; int to_skip=0; /* Used to counmt data after end of string */ int offset = 1; /* Variable length of the first field */ int charsize; /* char *realstart=*src; */ unsigned char *dest;/* where to copy string */ unsigned char *s,*d,*c; int i,u,l,len; /* for(i=0;i<20;i++) */ /* fprintf(stderr,"%02x ",(*src)[i]); */ /* fprintf(stderr,"\n"); */ flags = *(*src+1+offset); if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 || flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) { count=**src; flags = *(*src+offset); offset --; if (! ( flags == 0 || flags == 1 || flags == 8 || flags == 9 || flags == 4 || flags == 5 || flags == 0x0c || flags == 0x0d ) ) { /* fprintf(stderr,"Strange flags = %d, returning NULL\n", flags); */ return NULL; } } else { count=getshort(*src,0); } charsize=(flags &0x01) ? 2 : 1; switch (flags & 12 ) { case 0x0c: /* Far East with RichText formating */ to_skip=4*getshort(*src,2+offset)+getlong(*src, 4+offset); start_offset=2+offset+2+4; /* fprintf(stderr,"Far East with RichText formating\n"); */ break; case 0x08: /* With RichText formating */ to_skip=4*getshort(*src,2+offset); start_offset=2+offset+2; /* fprintf(stderr,"With RichText formating %d\n",getshort(*src,2+offset)); */ break; case 0x04: /* Far East */ to_skip=getlong(*src, 2+offset); start_offset=2+offset+4; /* fprintf(stderr,"Far East\n"); */ break; default: to_skip=0; start_offset=2+offset; /* fprintf(stderr,"Default string\n"); */ } /* fprintf(stderr,"count=%d skip=%d start_offset=%d\n", */ /* count, to_skip, start_offset); */ /* а здесь мы копируем строку */ if ( (dest=malloc(count+1)) == NULL ) { perror("Dest string alloc error"); *src+=(to_skip+start_offset+(count*charsize)); exit(0); } *src+=start_offset; len = count; *dest=0;l=0; for (s=*src,d=dest,i=0;i<count;i++,s+=charsize) { /* 
fprintf(stderr,"l=%d len=%d count=%d charsize=%d\n",l,len,count,charsize); */ if ( (charsize == 1 && (*s == 1 || *s == 0)) || (charsize == 2 && (*s == 1 || *s == 0) && *(s+1) != 4)) { /* fprintf(stderr,"extchar (unicode)=%02x %02x\n",*s, *(s+1)); */ charsize=(*s &0x01) ? 2 : 1; if (charsize == 2) s-=1; count++; continue; } if ( charsize == 2 ){ u=(unsigned short)getshort(s,0); c=(unsigned char *)convert_char(u); /* fprintf(stderr,"char=%02x %02x\n", *s, *(s+1)); */ } else { if (!source_charset) { check_charset(&source_csname,source_csname); /* fprintf(stderr,"charset=%s\n",source_csname);*/ source_charset=read_charset(source_csname); } u=(unsigned short)to_unicode(source_charset,(unsigned char)*s); c=(unsigned char *)convert_char(u); } if (c != NULL) { int dl = strlen((char *)c); while (l+dl>=len) { len+=16; dest=realloc(dest,len+1); } d=dest+l; strcpy((char *)d,(char *)c); l+=dl; } } *src=s+to_skip; return dest; }
/** * * * @param argc * @param argv * * @return */ int main(int argc, char *argv[]) { FILE *input; FILE *new_file, *ole_file; char *filename =NULL; short int *tmp_charset; int c; int i; char *tempname; read_config_file(SYSTEMRC); #ifdef USERRC tempname=find_file(strdup(USERRC),getenv("HOME")); if (tempname) { read_config_file(tempname); free(tempname); } #endif #ifdef HAVE_LANGINFO get_locale_charset(); #endif check_charset(&dest_csname,dest_csname); while ((c=getopt(argc,argv,"Vls:d:p:"))!=-1) { switch(c) { case 'l': list_charsets(); exit(0); case 's': check_charset(&source_csname,optarg); source_charset=read_charset(source_csname); break; case 'd': check_charset(&dest_csname,optarg); break; case 'V': printf("Catdoc Version %s\n",CATDOC_VERSION); exit(0); default: help(); exit(1); } } /* If we are using system strftime, we need to set LC_TIME locale * category unless choosen charset is not same as system locale */ #if defined(HAVE_LANGINFO) && defined(HAVE_STRFTIME) && !defined(__TURB0C__) set_time_locale(); #endif /* charset conversion init*/ input_buffer=malloc(FILE_BUFFER); if (strcmp(dest_csname,"utf-8")) { tmp_charset=read_charset(dest_csname); if (!tmp_charset) { fprintf(stderr,"Cannot load target charset %s\n",dest_csname); exit(1); } target_charset=make_reverse_map(tmp_charset); free(tmp_charset); } else { target_charset=NULL; } spec_chars=read_substmap(stradd("ascii",SPEC_EXT)); if (!spec_chars) { fprintf(stderr,"Cannod read substitution map ascii%s\n", SPEC_EXT); exit(1); } replacements=read_substmap(stradd("ascii",REPL_EXT)); if (!replacements) { fprintf(stderr,"Cannod read substitution map ascii%s\n", REPL_EXT); exit(1); } if (optind>=argc) { if (isatty(fileno(stdin))) { help(); exit(0); } do_ppt(stdin,"STDIN"); exit (0); } for (i=optind;i<argc;i++) { filename = argv[i]; input=fopen(filename,"rb"); if (!input) { perror(filename); exit(1); } if ((new_file=ole_init(input, NULL, 0)) != NULL) { set_ole_func(); while((ole_file=ole_readdir(new_file)) != 
NULL) { int res=ole_open(ole_file); /* fprintf(stderr, "name = %s\n", ((oleEntry*)ole_file)->name); */ if (res >= 0) { if (strcasecmp(((oleEntry*)ole_file)->name , "PowerPoint Document") == 0) { do_ppt(ole_file,filename); } } ole_close(ole_file); } set_std_func(); ole_finish(); fclose(new_file); } else { fprintf(stderr, "%s is not OLE file or Error\n", filename); } } return 0; }