Exemplo n.º 1
0
/*********************************************************************
 * Determines the format of the input file from its leading signature
 * bytes and dispatches to the matching reader:
 *   - signature checking disabled -> process_file(), 8-bit input
 *                                    unless a reader was already chosen
 *   - Windows Write signature     -> process_file() with 8-bit input
 *   - RTF signature               -> parse_rtf()
 *   - ZIP signature               -> error message, exit(1)
 *   - old Word signature          -> parse_word_header() on the first
 *                                    128 bytes
 *   - OLE signature               -> every stream named "WordDocument"
 *                                    goes to parse_word_header()
 *   - anything else               -> copy_out() as plain text
 *
 * Returns the result of the reader that was called, 0 for plain text,
 * and 69 when an OLE container yields no readable "WordDocument"
 * stream.  A broken OLE container terminates the program.
 ********************************************************************/
int analyze_format(FILE *f) {
	/* Zero-filled on purpose: the reads below are not length-checked,
	 * so for a file shorter than the signature the comparisons and
	 * copy_out() would otherwise look at indeterminate bytes. */
	unsigned char buffer[129] = {0};
	long offset=0;
	FILE *new_file, *ole_file;
	int ret_code=69;

	if (!signature_check) {
		/* forced parsing */
		/* no autodetect possible. Assume 8-bit if not overridden on
		 * command line */
		if (!get_unicode_char)
			get_unicode_char=get_8bit_char;
		return process_file(f,LONG_MAX);
	}

	/* The short signatures are decided on the first four bytes. */
	catdoc_read(buffer,4,1,f);
	buffer[4]=0;
	if (strncmp((char *)buffer,write_sign,2)==0) {
		printf("[Windows Write file. Some garbage expected]\n");
		get_unicode_char=get_8bit_char;
		return process_file(f,LONG_MAX);
	} else if (strncmp((char *)buffer,rtf_sign,4)==0) {
		return parse_rtf(f);
	} else if (strncmp((char *)buffer, zip_sign,4) == 0) {
		fprintf(stderr,"This file looks like ZIP archive or Office 2007 "
		"or later file.\nNot supported by catdoc\n");
		exit(1);
	} else if (strncmp((char *)buffer,old_word_sign,2)==0) {
		/* complete the 128-byte header the parser is given */
		fread(buffer+4,1,124,f);
		return parse_word_header(buffer,f,128,0);
	}

	/* The OLE signature is eight bytes long: fetch the other four. */
	fread(buffer+4,1,4,f);
	if (strncmp((char *)buffer,ole_sign,8)!=0) {
		/* No known signature: treat the file as plain text.  The eight
		 * bytes already consumed are passed along as the header. */
		copy_out(f,(char *)buffer);
		return 0;
	}

	new_file=ole_init(f, buffer, 8);
	if (new_file == NULL) {
		fprintf(stderr,"Broken OLE file. Try using -b switch\n");
		exit(1);
	}

	/* Switch the I/O hooks to the OLE variants while walking the
	 * container, and back to the standard ones afterwards. */
	set_ole_func();
	while ((ole_file=ole_readdir(new_file)) != NULL) {
		if (ole_open(ole_file) >= 0 &&
				strcmp(((oleEntry*)ole_file)->name , "WordDocument") == 0) {
			offset=catdoc_read(buffer, 1, 128, ole_file);
			ret_code=parse_word_header(buffer,ole_file,-offset,offset);
		}
		/* closed whether or not it could be opened, as before */
		ole_close(ole_file);
	}
	set_std_func();
	ole_finish();

	return ret_code;
}   
Exemplo n.º 2
0
/**
 * Walks the record stream of a presentation and hands every record
 * to process_item().
 *
 * Each iteration reads an 8-byte record header, takes the record type
 * with getshort(header,2) and the record length with
 * getulong(header,4), and passes both on.  When end of input is
 * reported, a final DOCUMENT_END item with length 0 is processed and
 * the function returns.  A short header read or a negative length
 * also ends the walk.
 *
 * @param input    stream the records are read from
 * @param filename not used by this function
 */
void do_ppt(FILE *input,char *filename) {
	unsigned char header[8];

	for (;;) {
		int type;
		long length;
		int got = catdoc_read(header, 1, 8, input);

		if (catdoc_eof(input)) {
			process_item(DOCUMENT_END,0,input);
			break;
		}
		/* incomplete header: nothing sensible left to decode */
		if (got < 8)
			break;

		type = getshort(header,2);
		length = getulong(header,4);
		if (length < 0)
			break;

		process_item(type,length,input);
	}
}
Exemplo n.º 3
0
/**********************************************************************
 * Returns the next character of an MS-Word 97 (or later) text stream. *
 * Takes into account the strange situation that unicode and           *
 * non-unicode 256-byte blocks can be intermixed in a word file.       *
 *                                                                     *
 * Whenever *offset is a multiple of 256 a fresh block is read into    *
 * read_buf (zero-padded after a short read) and classified: if a      *
 * space, CR or punctuation byte at an even position is followed by a  *
 * zero byte, the block is taken to be 16-bit text, otherwise 8-bit    *
 * text in source_charset.                                             *
 *                                                                     *
 * Parameters:                                                         *
 *                                                                     *
 * f       - file to read                                              *
 * offset  - position of the character inside the file (used to find   *
 *           block boundaries); advanced by 2 for a 16-bit character   *
 *           and by 1 for an 8-bit one                                 *
 * fileend - position past which bytes are ignored when classifying    *
 *           a block                                                   *
 *                                                                     *
 * Returns the character as a unicode value.  End of input is not      *
 * reported here; past the data the zero padding is decoded.           *
 **********************************************************************/
int get_word8_char(FILE *f,long *offset,long fileend) {
	int count,i,u;
	/* unsigned so that bytes >= 0x80 are valid arguments to ispunct();
	 * a negative value there is undefined behaviour */
	unsigned char c;
	if ((i=(*offset)%256) ==0) {
		count=catdoc_read(read_buf,1,256,f);
		memset(read_buf+count,0,256-count);
		buf_is_unicode=0;
		if (*offset+(long)count>fileend) {
			count=fileend-*offset;
		}
		/* inspect byte pairs: i is advanced once when the low byte is
		 * fetched and once more at the end of the iteration */
		while (i<count) {
			c=(unsigned char)read_buf[i++];
			if ((c==0x20|| c==0x0D||ispunct(c))&&i<count&&read_buf[i]==0) {
				buf_is_unicode=1;
				break;
			}
			i++;
		}
		/* the scan is over; decoding starts at the top of the block */
		i=0;
	}
	if (buf_is_unicode) {
		/* little-endian 16-bit unit; the bytes are made unsigned before
		 * they are combined so no sign bits leak into the result */
		u=(unsigned char)read_buf[i] | (unsigned char)read_buf[i+1]<<8;
		(*offset)+=2;
	} else {
		u=to_unicode(source_charset,(unsigned char)read_buf[i]);
		(*offset)++;
	}
	return u;
}  
Exemplo n.º 4
0
/*
 * Reads a spreadsheet record stream: first searches for a BOF record
 * and derives biff_version from it, then passes every following record
 * to process_item().
 *
 * input    - stream the records are read from
 * filename - used in error messages only
 *
 * Prints "Invalid BOF record" and returns when the BOF record has an
 * unexpected length, or is too short for the fields already consumed
 * from it.  Prints "No BOF record found" and exits with status 1 when
 * the input ends during the search.  Resets date_shift and the
 * format-index bookkeeping on entry.
 */
void do_table(FILE *input,char *filename) {    
	long rectype;
	long reclen=0,build_year=0,offset=0;
	int eof_flag=0;
	int itemsread=1;
	date_shift=25569.0; /* Windows 1900 date system */
	CleanUpFormatIdxUsed();

	/* Phase 1: look for a BOF record. */
	while (itemsread) {
		/* Stop at the first failed read instead of decoding whatever
		 * rec still holds from an earlier read. */
		itemsread=catdoc_read(rec,2,1,input);
		if (itemsread == 0)
			break;
		biff_version=getshort(rec,0);
		itemsread=catdoc_read(rec,2,1,input);
		if (itemsread == 0)
			break;
		reclen=getshort(rec,0);

		if ( biff_version != 0x0809 && biff_version != 0x0409 &&
				 biff_version != 0x0209 && biff_version != 0x0009 ) {
			/* not a BOF: skip ahead and try again */
			itemsread=catdoc_read(rec,126,1,input);
			continue;
		}
		if (reclen!=8 && reclen!=16) {
			fprintf(stderr,"%s: Invalid BOF record\n",filename);
			return;
		}

		/* offset counts the record bytes consumed while working out
		 * the version; the remainder is skipped below */
		if (biff_version == 0x0809 ) {
			itemsread=catdoc_read(rec,4,1,input);
			if (itemsread == 0) 
				break;
			build_year=getshort(rec+2,0);
			if(build_year > 5 ) {
				itemsread=catdoc_read(rec,8,1,input);
				if (itemsread == 0)
					break;
				biff_version=8;
				offset=12;
			}
			else {
				biff_version=7;
				offset=4;
			}
		} else if (biff_version == 0x0209 ) {
			biff_version=3;
			offset=2;
		} else if (biff_version == 0x0409 ) {
			offset=2;
			biff_version=4;
		} else {
			biff_version=2;
		}

		/* An 8-byte record from which 12 bytes were taken would make
		 * the remaining size negative; passed on as a read size that
		 * becomes a huge unsigned value and can overrun rec. */
		if (reclen <= offset) {
			fprintf(stderr,"%s: Invalid BOF record\n",filename);
			return;
		}
		itemsread=catdoc_read(rec,reclen-offset,1,input);
		break;
	}
	if (catdoc_eof(input)) {
		fprintf(stderr,"%s: No BOF record found\n",filename);
		exit(1);
	}    

	/* Phase 2: one record per iteration - type, length, payload. */
	while(itemsread){
		unsigned char buffer[2];

		itemsread = catdoc_read(buffer, 2, 1, input);
		if (catdoc_eof(input)) {
			process_item(MSEOF,0,NULL);
			return;
		}
		
		if(itemsread == 0)
			break;

		rectype=getshort(buffer,0);
		itemsread = catdoc_read(buffer, 2, 1, input);
		if(itemsread == 0)
			break;
		reclen=getshort(buffer,0);
		/* NOTE(review): a length outside (0, MAX_MS_RECSIZE) skips the
		 * payload read, so process_item() below sees the previous
		 * contents of rec together with that length - confirm this is
		 * intended. */
		if (reclen > 0 && reclen < MAX_MS_RECSIZE) {
			itemsread = catdoc_read(rec, 1, reclen, input);
			rec[reclen] = '\0';
		}
		/* after an MSEOF record only another BOF keeps the loop going */
		if (eof_flag && rectype != BOF)
			break;
		process_item(rectype,reclen,rec);
		eof_flag = (rectype == MSEOF);
	}
	return;
}
Exemplo n.º 5
0
/**
 * Handles one presentation record whose header has already been read.
 *
 * Three kinds of treatment exist:
 *  - container-style records: nothing is consumed, so the bytes that
 *    follow are read next by the caller;
 *  - text records: the payload is converted with convert_char() and
 *    written to stdout, a 0x0d character becoming a newline, with one
 *    extra newline after the record;
 *  - everything else, including unknown types: the payload is skipped
 *    by seeking reclen bytes forward.
 *
 * @param rectype record type taken from the record header
 * @param reclen  payload length in bytes taken from the record header
 * @param input   stream positioned at the start of the payload
 */
static void process_item (int rectype, long reclen, FILE* input) {
	long i;	/* long, like the lengths it is compared against */
	int u;
	char buf[2];

	switch(rectype) {
	/* Nothing to consume for these record types. */
	case DOCUMENT:
	case SLIDE:
	case SLIDE_BASE:
	case NOTES:
	case HEADERS_FOOTERS:
	case MAIN_MASTER:
	case LIST:
	case SLIDE_LIST_WITH_TEXT:
		break;

	/* One byte per character. */
	case TEXT_BYTES_ATOM:
		for(i=0; i < reclen; i++) {
			/* Stop on a failed read: on a truncated file the loop
			 * would otherwise repeat the stale byte reclen times. */
			if (catdoc_read(buf,1,1,input) != 1)
				break;
			if((unsigned char)*buf!=0x0d)
				fputs(convert_char((unsigned char)*buf),stdout);
			else
				fputc('\n',stdout);
		}
		fputc('\n',stdout);
		break;

	/* Two bytes per character. */
	case TEXT_CHARS_ATOM: 
	case CSTRING: {
			long text_len=reclen/2;

			for(i=0; i < text_len; i++) {
				/* same truncation guard as for the byte text above */
				if (catdoc_read(buf,2,1,input) != 1)
					break;
				u=(unsigned short)getshort(buf,0);
				if(u!=0x0d)
					fputs(convert_char(u),stdout);
				else
					fputc('\n',stdout);
			}
			fputc('\n',stdout);
		}
		break;

	/* Known record types whose payload is skipped; listed explicitly
	 * although they are treated exactly like an unknown type. */
	case DOCUMENT_END:
	case DOCUMENT_ATOM:
	case SLIDE_ATOM:
	case SLIDE_BASE_ATOM:
	case NOTES_ATOM:
	case HEADERS_FOOTERS_ATOM:
	case USER_EDIT_ATOM:
	case COLOR_SCHEME_ATOM:
	case PPDRAWING:
	case ENVIRONMENT:
	case SSDOC_INFO_ATOM:
	case SSSLIDE_INFO_ATOM:
	case PROG_TAGS:
	case PROG_STRING_TAG:
	case PROG_BINARY_TAG:
	case PERSIST_PTR_INCREMENTAL_BLOCK:
	case EX_OLE_OBJ_STG:
	case PPDRAWING_GROUP:
	case EX_OBJ_LIST:
	case TX_MASTER_STYLE_ATOM:
	case HANDOUT:
	case SLIDE_PERSIST_ATOM:
	case TEXT_HEADER_ATOM:
	case TEXT_SPEC_INFO:
	case STYLE_TEXT_PROP_ATOM:
	default:
		catdoc_seek(input, reclen, SEEK_CUR);
		break;
	}
	
}
Exemplo n.º 6
0
void copy_out (FILE *f,char *header) {
	char *buf=(char *)buffer;
	int count,i;
	long offset;

	if (get_unicode_char == get_word8_char) {
		/* non-word file and -u specified. Trying to guess which kind of
		 * unicode is used
		 */
		if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) {
			get_unicode_char = get_utf16msb;
			fputs(convert_char(header[2]<<8|header[3]),output_file);
			fputs(convert_char(header[4]<<8|header[5]),output_file);
			fputs(convert_char(header[6]<<8|header[7]),output_file);
		} else if ((unsigned char)header[0]!=0xFF ||
				(unsigned char)header[1]!=0xFE) {
			int c,j,d;
			/* if it is not utf16, assume it is UTF8. We are told -u,
			 * aren't we */
			get_unicode_char = get_utf8;
			i=0;
			while (i<8) {
				c=(unsigned char)header[i++];		
				if (c >=0x80) {
					if ( c<0xE0) {
						c=(c & 0x1F);
						count =1;
					} else {
						c=(c & 0xF);
						count = 2;
					}
					for (j=0;j<count;j++) {
						if (i<7) {
							d=(unsigned char) header[i++];
						} else {
							d=fgetc(f);
						}
						c=c<<6 | (d & 0x3F);
					}
				}
				fputs (convert_char(c),output_file);
			}
		} else {
			get_unicode_char = get_utf16lsb;
			fputs(convert_char(header[3]<<8|header[2]),output_file);
			fputs(convert_char(header[5]<<8|header[4]),output_file);
			fputs(convert_char(header[7]<<8|header[6]),output_file);
		}	    
		while (!catdoc_eof(f)) {
			i=get_unicode_char(f,&offset,0x7FFFFFFF); 
			if (i!=EOF) fputs(convert_char(i),output_file);
		}    
	} else {
		for (i=0;i<8;i++) {
			fputs(convert_char(to_unicode(source_charset,(unsigned char)header[i])),output_file);
		}			 
		/* Assuming 8-bit input text */
		while ((count = catdoc_read(buf,1,PARAGRAPH_BUFFER,f))) {
			for (i=0;i<count;i++) {
				fputs(convert_char(to_unicode(source_charset,
								(unsigned char)buf[i])),output_file);
			}		       
		}
	} 
}