/**
 * Walk the record stream of a presentation file.
 *
 * Every record begins with an 8-byte header. The record type is taken
 * with getshort() at header offset 2 and the record length with
 * getulong() at header offset 4; both are handed to process_item()
 * together with the stream. As soon as catdoc_eof() reports the end of
 * the input, a closing DOCUMENT_END item is issued and the function
 * returns. A short header read or a negative length ends processing
 * silently.
 *
 * @param input     stream the records are read from
 * @param filename  not used by this function
 */
void do_ppt(FILE *input,char *filename) {
	enum { HEADER_LEN = 8 };
	unsigned char header[HEADER_LEN];

	for (;;) {
		int type;
		long length;
		int got = catdoc_read(header, 1, HEADER_LEN, input);

		if (catdoc_eof(input)) {
			process_item(DOCUMENT_END,0,input);
			return;
		}
		/* Incomplete header: nothing sensible left to parse. */
		if (got < HEADER_LEN)
			return;

		type = getshort(header,2);
		length = getulong(header,4);
		if (length < 0)
			return;

		process_item(type,length,input);
	}
}
void do_table(FILE *input,char *filename) { long rectype; long reclen,build_year=0,build_rel=0,offset=0; int eof_flag=0; int itemsread=1; date_shift=25569.0; /* Windows 1900 date system */ CleanUpFormatIdxUsed(); while (itemsread) { catdoc_read(rec,2,1,input); biff_version=getshort(rec,0); catdoc_read(rec,2,1,input); reclen=getshort(rec,0); if ( biff_version == 0x0809 || biff_version == 0x0409 || biff_version == 0x0209 || biff_version == 0x0009 ) { if (reclen==8 || reclen==16) { if (biff_version == 0x0809 ) { itemsread=catdoc_read(rec,4,1,input); if (itemsread == 0) break; build_year=getshort(rec+2,0); build_rel=getshort(rec,0); (void) build_rel; if(build_year > 5 ) { catdoc_read(rec,8,1,input); biff_version=8; offset=12; } else { biff_version=7; offset=4; } } else if (biff_version == 0x0209 ) { biff_version=3; offset=2; } else if (biff_version == 0x0409 ) { offset=2; biff_version=4; } else { biff_version=2; } itemsread=catdoc_read(rec,reclen-offset,1,input); break; } else { fprintf(stderr,"%s: Invalid BOF record\n",filename); return; } } else { itemsread=catdoc_read(rec,126,1,input); } } if (catdoc_eof(input)) { fprintf(stderr,"%s: No BOF record found\n",filename); exit(1); } while(itemsread){ unsigned char buffer[2]; itemsread = catdoc_read(buffer, 2, 1, input); if (catdoc_eof(input)) { process_item(MSEOF,0,NULL); return; } if(itemsread == 0) break; rectype=getshort(buffer,0); itemsread = catdoc_read(buffer, 2, 1, input); if(itemsread == 0) break; reclen=getshort(buffer,0); if (reclen && reclen <MAX_MS_RECSIZE &&reclen >0){ itemsread = catdoc_read(rec, 1, reclen, input); rec[reclen] = '\0'; } if(eof_flag) { if (rectype != BOF) { break; } } /* fprintf(stderr,"Rectype 0x%04X reclen=%d\n",rectype, reclen); */ process_item(rectype,reclen,rec); if (rectype == MSEOF) { eof_flag=1; } else { eof_flag=0; } } return; }
int process_file(FILE *f,long stop) { int bufptr; int tabmode=0; long offset=0; int hyperlink_mode = 0; unsigned short c; /* Now we are starting to read with get_unicode_char */ while (!catdoc_eof(f) && offset<stop) { bufptr = -1; do { int unichar = get_unicode_char(f,&offset,stop); if (unichar < 0) continue; c = unichar; /* Following symbols below 32 are allowed inside paragraph: 0x0002 - footnote mark 0x0007 - table separator (converted to tabmode) 0x0009 - Horizontal tab ( printed as is) 0x000B - hard return 0x000C - page break 0x000D - return - marks an end of paragraph 0x001E - IS2 for some reason means short defis in Word. 0x001F - soft hyphen in Word 0x0013 - start embedded hyperlink 0x0014 - separate hyperlink URL from text 0x0015 - end embedded hyperlink */ if (tabmode) { tabmode=0; if (c==0x007) { buffer[++bufptr]=0x1E; continue; } else { buffer[++bufptr]=0x1C; } } if (c<32) { switch (c) { case 0x007: tabmode = 1; break; case 0x000D: case 0x000B: buffer[++bufptr]=0x000A; break; case 0x000C: buffer[++bufptr]=c; break; case 0x001E: buffer[++bufptr]='-'; break; case 0x0002: break; case 0x001F: buffer[++bufptr]=0xAD;/* translate to Unicode soft hyphen */ break; case 0x0009: buffer[++bufptr]=c; break; case 0x0013: hyperlink_mode=1; buffer[++bufptr]=' '; break; case 0x0014: hyperlink_mode = 0; /*fall through */ case 0x0015: /* just treat hyperlink separators as * space */ buffer[++bufptr]=' '; break; case 0x0001: if (hyperlink_mode) break; /* else fall through */ default: bufptr=-1; /* Any other control char - discard para*/ } } else if (c != 0xfeff) { /* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything * else*/ buffer[++bufptr]=c; } } while (bufptr >=0 && bufptr<PARAGRAPH_BUFFER-2 && !catdoc_eof(f) && buffer[bufptr]!=0x000a); if (bufptr>0) { buffer[++bufptr]=0; output_paragraph(buffer); } } return 0; }
void copy_out (FILE *f,char *header) { char *buf=(char *)buffer; int count,i; long offset; if (get_unicode_char == get_word8_char) { /* non-word file and -u specified. Trying to guess which kind of * unicode is used */ if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) { get_unicode_char = get_utf16msb; fputs(convert_char(header[2]<<8|header[3]),output_file); fputs(convert_char(header[4]<<8|header[5]),output_file); fputs(convert_char(header[6]<<8|header[7]),output_file); } else if ((unsigned char)header[0]!=0xFF || (unsigned char)header[1]!=0xFE) { int c,j,d; /* if it is not utf16, assume it is UTF8. We are told -u, * aren't we */ get_unicode_char = get_utf8; i=0; while (i<8) { c=(unsigned char)header[i++]; if (c >=0x80) { if ( c<0xE0) { c=(c & 0x1F); count =1; } else { c=(c & 0xF); count = 2; } for (j=0;j<count;j++) { if (i<7) { d=(unsigned char) header[i++]; } else { d=fgetc(f); } c=c<<6 | (d & 0x3F); } } fputs (convert_char(c),output_file); } } else { get_unicode_char = get_utf16lsb; fputs(convert_char(header[3]<<8|header[2]),output_file); fputs(convert_char(header[5]<<8|header[4]),output_file); fputs(convert_char(header[7]<<8|header[6]),output_file); } while (!catdoc_eof(f)) { i=get_unicode_char(f,&offset,0x7FFFFFFF); if (i!=EOF) fputs(convert_char(i),output_file); } } else { for (i=0;i<8;i++) { fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),output_file); } /* Assuming 8-bit input text */ while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) { for (i=0;i<count;i++) { fputs(convert_char(to_unicode(source_charset, (unsigned char)buf[i])),output_file); } } } }