/********************************************************************* * Determines format of input file and calls parse_word_header or * process_file if * it is word processor file or copy_out if it is plain text file * return not 0 when error ********************************************************************/ int analyze_format(FILE *f) { unsigned char buffer[129]; long offset=0; FILE *new_file, *ole_file; int ret_code=69; if (!signature_check) { /* forced parsing */ /* no autodetect possible. Assume 8-bit if not overriden on * command line */ if (!get_unicode_char) get_unicode_char=get_8bit_char; return process_file(f,LONG_MAX); } catdoc_read(buffer,4,1,f); buffer[4]=0; if (strncmp((char *)&buffer,write_sign,2)==0) { printf("[Windows Write file. Some garbage expected]\n"); get_unicode_char=get_8bit_char; return process_file(f,LONG_MAX); } else if (strncmp((char *)&buffer,rtf_sign,4)==0) { return parse_rtf(f); } else if (strncmp((char *)&buffer, zip_sign,4) == 0) { fprintf(stderr,"This file looks like ZIP archive or Office 2007 " "or later file.\nNot supported by catdoc\n"); exit(1); } else if (strncmp((char *)&buffer,old_word_sign,2)==0) { fread(buffer+4,1,124,f); return parse_word_header(buffer,f,128,0); } fread(buffer+4,1,4,f); if (strncmp((char *)&buffer,ole_sign,8)==0) { if ((new_file=ole_init(f, buffer, 8)) != NULL) { set_ole_func(); while((ole_file=ole_readdir(new_file)) != NULL) { int res=ole_open(ole_file); if (res >= 0) { if (strcmp(((oleEntry*)ole_file)->name , "WordDocument") == 0) { offset=catdoc_read(buffer, 1, 128, ole_file); ret_code=parse_word_header(buffer,ole_file,-offset,offset); } } ole_close(ole_file); } set_std_func(); ole_finish(); } else { fprintf(stderr,"Broken OLE file. Try using -b switch\n"); exit(1); } } else { copy_out(f,(char *)&buffer); return 0; } return ret_code; }
/**
 * Walks a PowerPoint document stream one record at a time.
 *
 * Every record begins with an 8-byte header. The record type is obtained
 * with getshort() at header offset 2 and the record length with getulong()
 * at header offset 4; both are handed to process_item(), which either
 * consumes or skips the payload.
 *
 * Reading stops when the stream reports end-of-file (process_item() is
 * then called once with DOCUMENT_END and length 0), when a header cannot
 * be read completely, or when the decoded length is negative.
 *
 * @param input    stream positioned at the first record header
 * @param filename not used by this function
 */
void do_ppt(FILE *input,char *filename) {
	unsigned char header[8];

	for (;;) {
		int got;
		int type;
		long length;

		got = catdoc_read(header, 1, 8, input);
		if (catdoc_eof(input)) {
			process_item(DOCUMENT_END,0,input);
			return;
		}
		if (got < 8) {
			/* truncated header without EOF indication */
			return;
		}

		type=getshort(header,2);
		length=getulong(header,4);
		if (length < 0) {
			return;
		}
		process_item(type,length,input);
	}
}
/**********************************************************************
 * Reads one character from an MS-Word 97 (or later) text stream.
 *
 * Takes into account the strange situation that unicode and non-unicode
 * 256-byte blocks can be intermixed in a Word file. Whenever *offset is
 * a multiple of 256 a fresh block is read into read_buf and classified:
 * the bytes at even positions are scanned, and if one of them is a
 * space, a carriage return or a punctuation character immediately
 * followed by a zero byte, the block is treated as 16-bit little-endian
 * text (buf_is_unicode is set). Otherwise the block is treated as 8-bit
 * text in source_charset.
 *
 * Parameters:
 *   f       - file to read
 *   offset  - position of the character inside the file (used to find
 *             block boundaries); advanced by 2 for a 16-bit character
 *             and by 1 for an 8-bit character
 *   fileend - bytes at or beyond this position are not examined when
 *             classifying a block
 *
 * Returns the character as a Unicode code point.
 **********************************************************************/
int get_word8_char(FILE *f,long *offset,long fileend) {
	int count,i,u;
	/* Must be unsigned: the value is passed to ispunct(), whose argument
	 * has to be representable as unsigned char (or be EOF). A plain char
	 * holding a byte >= 0x80 may be negative. */
	unsigned char c;

	if ((i=(*offset)%256) ==0) {
		count=catdoc_read(read_buf,1,256,f);
		/* Zero-pad a short block so the pair lookups below stay defined. */
		memset(read_buf+count,0,256-count);
		buf_is_unicode=0;
		if (*offset+(long)count>fileend) {
			count=fileend-*offset;
		}
		/* i advances by two per iteration: once when the candidate low
		 * byte is fetched and once at the bottom of the loop. */
		while (i<count) {
			c=read_buf[i++];
			if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
				buf_is_unicode=1;
				break;
			}
			i++;
		}
		i=0;
	}
	if (buf_is_unicode) {
		u=read_buf[i] | read_buf[i+1]<<8;
		(*offset)+=2;
	} else {
		u=to_unicode(source_charset,read_buf[i]);
		(*offset)++;
	}
	return u;
}
void do_table(FILE *input,char *filename) { long rectype; long reclen,build_year=0,build_rel=0,offset=0; int eof_flag=0; int itemsread=1; date_shift=25569.0; /* Windows 1900 date system */ CleanUpFormatIdxUsed(); while (itemsread) { catdoc_read(rec,2,1,input); biff_version=getshort(rec,0); catdoc_read(rec,2,1,input); reclen=getshort(rec,0); if ( biff_version == 0x0809 || biff_version == 0x0409 || biff_version == 0x0209 || biff_version == 0x0009 ) { if (reclen==8 || reclen==16) { if (biff_version == 0x0809 ) { itemsread=catdoc_read(rec,4,1,input); if (itemsread == 0) break; build_year=getshort(rec+2,0); build_rel=getshort(rec,0); (void) build_rel; if(build_year > 5 ) { catdoc_read(rec,8,1,input); biff_version=8; offset=12; } else { biff_version=7; offset=4; } } else if (biff_version == 0x0209 ) { biff_version=3; offset=2; } else if (biff_version == 0x0409 ) { offset=2; biff_version=4; } else { biff_version=2; } itemsread=catdoc_read(rec,reclen-offset,1,input); break; } else { fprintf(stderr,"%s: Invalid BOF record\n",filename); return; } } else { itemsread=catdoc_read(rec,126,1,input); } } if (catdoc_eof(input)) { fprintf(stderr,"%s: No BOF record found\n",filename); exit(1); } while(itemsread){ unsigned char buffer[2]; itemsread = catdoc_read(buffer, 2, 1, input); if (catdoc_eof(input)) { process_item(MSEOF,0,NULL); return; } if(itemsread == 0) break; rectype=getshort(buffer,0); itemsread = catdoc_read(buffer, 2, 1, input); if(itemsread == 0) break; reclen=getshort(buffer,0); if (reclen && reclen <MAX_MS_RECSIZE &&reclen >0){ itemsread = catdoc_read(rec, 1, reclen, input); rec[reclen] = '\0'; } if(eof_flag) { if (rectype != BOF) { break; } } /* fprintf(stderr,"Rectype 0x%04X reclen=%d\n",rectype, reclen); */ process_item(rectype,reclen,rec); if (rectype == MSEOF) { eof_flag=1; } else { eof_flag=0; } } return; }
/** * * * @param rectype * @param reclen * @param input */ static void process_item (int rectype, long reclen, FILE* input) { int i=0, u; static char buf[2]; switch(rectype) { case DOCUMENT_END: /* fprintf(stderr,"End of document, ended at %ld\n",catdoc_tell(input)); */ catdoc_seek(input, reclen, SEEK_CUR); break; case DOCUMENT: /* fprintf(stderr,"Start of document, reclen=%ld, started at %ld\n", reclen, */ /* catdoc_tell(input)); */ break; case DOCUMENT_ATOM: /* fprintf(stderr,"DocumentAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case SLIDE: /* fprintf(stderr,"Slide, reclen=%ld\n", reclen); */ /* fputs("---------------------------------------\n",stderr); */ break; case SLIDE_ATOM: /* fprintf(stderr,"SlideAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case SLIDE_BASE: /* fprintf(stderr,"SlideBase, reclen=%ld\n", reclen); */ break; case SLIDE_BASE_ATOM: /* fprintf(stderr,"SlideBaseAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case NOTES: /* fprintf(stderr,"Notes, reclen=%ld\n", reclen); */ break; case NOTES_ATOM: /* fprintf(stderr,"NotesAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case HEADERS_FOOTERS: /* fprintf(stderr,"HeadersFooters, reclen=%ld\n", reclen); */ break; case HEADERS_FOOTERS_ATOM: /* fprintf(stderr,"HeadersFootersAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case MAIN_MASTER: /* fprintf(stderr,"MainMaster, reclen=%ld\n", reclen); */ break; case TEXT_BYTES_ATOM: { /* fprintf(stderr,"TextBytes, reclen=%ld\n", reclen); */ for(i=0; i < reclen; i++) { catdoc_read(buf,1,1,input); if((unsigned char)*buf!=0x0d) fputs(convert_char((unsigned char)*buf),stdout); else fputc('\n',stdout); } fputc('\n',stdout); } break; case TEXT_CHARS_ATOM: case CSTRING: { long text_len; /* fprintf(stderr,"CString, reclen=%ld\n", reclen); */ text_len=reclen/2; for(i=0; i < text_len; i++) { catdoc_read(buf,2,1,input); 
u=(unsigned short)getshort(buf,0); if(u!=0x0d) fputs(convert_char(u),stdout); else fputc('\n',stdout); } fputc('\n',stdout); } break; case USER_EDIT_ATOM: /* fprintf(stderr,"UserEditAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case COLOR_SCHEME_ATOM: /* fprintf(stderr,"ColorSchemeAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case PPDRAWING: /* fprintf(stderr,"PPDrawing, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case ENVIRONMENT: /* fprintf(stderr,"Environment, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case SSDOC_INFO_ATOM: /* fprintf(stderr,"SSDocInfoAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case SSSLIDE_INFO_ATOM: /* fprintf(stderr,"SSSlideInfoAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case PROG_TAGS: /* fprintf(stderr,"ProgTags, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case PROG_STRING_TAG: /* fprintf(stderr,"ProgStringTag, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case PROG_BINARY_TAG: /* fprintf(stderr,"ProgBinaryTag, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case LIST: /* fprintf(stderr,"List, reclen=%ld\n", reclen); */ break; case SLIDE_LIST_WITH_TEXT: /* fprintf(stderr,"SlideListWithText, reclen=%ld\n", reclen); */ /* fputs("---------------------------------------\n",stderr); */ break; case PERSIST_PTR_INCREMENTAL_BLOCK: /* fprintf(stderr,"PersistPtrIncrementalBlock, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case EX_OLE_OBJ_STG: /* fprintf(stderr,"ExOleObjStg, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case PPDRAWING_GROUP: /* fprintf(stderr,"PpdrawingGroup, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case EX_OBJ_LIST: /* fprintf(stderr,"ExObjList, reclen=%ld\n", reclen); */ 
catdoc_seek(input, reclen, SEEK_CUR); break; case TX_MASTER_STYLE_ATOM: /* fprintf(stderr,"TxMasterStyleAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case HANDOUT: /* fprintf(stderr,"Handout, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case SLIDE_PERSIST_ATOM: /* fprintf(stderr,"SlidePersistAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case TEXT_HEADER_ATOM: /* fprintf(stderr,"TextHeaderAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case TEXT_SPEC_INFO: /* fprintf(stderr,"TextSpecInfo, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; case STYLE_TEXT_PROP_ATOM: /* fprintf(stderr,"StyleTextPropAtom, reclen=%ld\n", reclen); */ catdoc_seek(input, reclen, SEEK_CUR); break; /* case : fprintf(stderr,", reclen=%ld\n", reclen); catdoc_seek(input, reclen, SEEK_CUR); break;*/ /* case : fprintf(stderr,", reclen=%ld\n", reclen); catdoc_seek(input, reclen, SEEK_CUR); break;*/ default: /* fprintf(stderr,"Default action for rectype=%d reclen=%ld\n", */ /* rectype, reclen); */ catdoc_seek(input, reclen, SEEK_CUR); } }
void copy_out (FILE *f,char *header) { char *buf=(char *)buffer; int count,i; long offset; if (get_unicode_char == get_word8_char) { /* non-word file and -u specified. Trying to guess which kind of * unicode is used */ if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) { get_unicode_char = get_utf16msb; fputs(convert_char(header[2]<<8|header[3]),output_file); fputs(convert_char(header[4]<<8|header[5]),output_file); fputs(convert_char(header[6]<<8|header[7]),output_file); } else if ((unsigned char)header[0]!=0xFF || (unsigned char)header[1]!=0xFE) { int c,j,d; /* if it is not utf16, assume it is UTF8. We are told -u, * aren't we */ get_unicode_char = get_utf8; i=0; while (i<8) { c=(unsigned char)header[i++]; if (c >=0x80) { if ( c<0xE0) { c=(c & 0x1F); count =1; } else { c=(c & 0xF); count = 2; } for (j=0;j<count;j++) { if (i<7) { d=(unsigned char) header[i++]; } else { d=fgetc(f); } c=c<<6 | (d & 0x3F); } } fputs (convert_char(c),output_file); } } else { get_unicode_char = get_utf16lsb; fputs(convert_char(header[3]<<8|header[2]),output_file); fputs(convert_char(header[5]<<8|header[4]),output_file); fputs(convert_char(header[7]<<8|header[6]),output_file); } while (!catdoc_eof(f)) { i=get_unicode_char(f,&offset,0x7FFFFFFF); if (i!=EOF) fputs(convert_char(i),output_file); } } else { for (i=0;i<8;i++) { fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),output_file); } /* Assuming 8-bit input text */ while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) { for (i=0;i<count;i++) { fputs(convert_char(to_unicode(source_charset, (unsigned char)buf[i])),output_file); } } } }