Пример #1
0
/*
 * Appends one 16-bit character value to the shared paragraph buffer.
 *
 * bufptr - in/out: index of the last element written to `buffer`
 *          (-1 when nothing has been written yet); advanced by this call.
 * c      - value to store.
 *
 * Once the buffer is nearly full it is zero-terminated, passed to
 * output_paragraph() and *bufptr is reset to -1, so the caller can keep
 * appending without checking for space.
 */
static void add_to_buffer(int *bufptr,unsigned short int c) {
	buffer[++(*bufptr)]=c;
	/* Flush while one slot is still free for the terminator.  Testing
	 * "> PARAGRAPH_BUFFER-2" only fired at index PARAGRAPH_BUFFER-1 and
	 * then stored the terminator at index PARAGRAPH_BUFFER.
	 * NOTE(review): assumes buffer holds PARAGRAPH_BUFFER elements - confirm
	 * against its declaration. */
	if (*bufptr >= PARAGRAPH_BUFFER-2) {
		buffer[++(*bufptr)]=0;
		output_paragraph(buffer);
		*bufptr=-1;
	}
}
Пример #2
0
/*
 * Reads characters from f through get_unicode_char and emits the text one
 * paragraph at a time via output_paragraph().
 *
 * f                - input stream, handed to get_unicode_char and
 *                    io_funcs->catdoc_eof.
 * stop             - reading ends once the running offset reaches this value.
 * out              - passed through to output_paragraph().
 * get_unicode_char - callback that returns the next character and advances
 *                    the offset it is given.
 * ole_params       - passed through to get_unicode_char.
 * io_funcs         - I/O callbacks; catdoc_eof is used here, the rest is
 *                    passed through to get_unicode_char.
 *
 * Returns 0 on success, or -1 when the paragraph buffer cannot be allocated
 * (errno is set to ENOMEM) or when get_unicode_char returns -1.
 */
int process_file(FILE *f,long stop, FILE *out, get_unicode_char_t get_unicode_char,
                 struct ole_params_t *ole_params, struct io_funcs_t *io_funcs) {
	int bufptr;                /* index of the last element stored, -1 if none */
	int tabmode=0;             /* set after a 0x0007 table separator */
	long offset=0;
	int hyperlink_mode = 0;    /* set between 0x0013 and 0x0014 */
	unsigned short c;
	unsigned short int *buffer = (unsigned short int*) malloc(sizeof(unsigned short int) * PARAGRAPH_BUFFER);
	unsigned char read_buf[256];
	int buf_is_unicode = 0;
	char outputbuffer[LINE_BUF_SIZE] = "";
	int bufpos = 0;

	if (!buffer) {
		errno = ENOMEM;
		return -1;
	}

	/* Now we are starting to read with get_unicode_char */
	while (!io_funcs->catdoc_eof(f) && offset<stop) {
		bufptr = -1;
		do {
			/* Keep the result in an int first: once it is narrowed to
			 * unsigned short it can never compare equal to -1, which made
			 * the failure return below unreachable. */
			int unichar = get_unicode_char(f,&offset,stop, ole_params, read_buf, &buf_is_unicode, io_funcs);
			if (unichar == -1)
			{
				free(buffer);
				return -1;
			}
			c = (unsigned short)unichar;
			/* Following symbols below 32 are allowed inside paragraph:
			   0x0002 - footnote mark
			   0x0007 - table separator (converted to tabmode)
			   0x0009 - Horizontal tab ( printed as is)
			   0x000B - hard return
			   0x000C - page break
			   0x000D - return - marks an end of paragraph
			   0x001E - IS2 for some reason means short defis in Word.
			   0x001F - soft hyphen in Word
			   0x0013 - start embedded hyperlink
			   0x0014 - separate hyperlink URL from text
			   0x0015 - end embedded hyperlink
			   */
			if (tabmode) {
				tabmode=0;
				if (c==0x007) {
					/* two separators in a row are stored as 0x1E */
					buffer[++bufptr]=0x1E;
					continue;
				} else {
					/* a single separator is stored as 0x1C, then c is
					 * handled normally below (second store this pass) */
					buffer[++bufptr]=0x1C;
				}
			}
			if (c<32) {
				switch (c) {
					case 0x007:
						tabmode = 1;
						break;
					case 0x000D:
					case 0x000B:
						buffer[++bufptr]=0x000A;
						break;
					case 0x000C:
						buffer[++bufptr]=c;
						break;
					case 0x001E:
						buffer[++bufptr]='-';
						break;
					case 0x0002: break;

					case 0x001F:
						buffer[++bufptr]=0xAD;/* translate to Unicode
												 soft hyphen */
						break;
					case 0x0009:
						buffer[++bufptr]=c;
						break;
					case 0x0013:
						hyperlink_mode=1;
						buffer[++bufptr]=' ';
						break;
					case 0x0014:
						hyperlink_mode = 0;
						/*fall through */
					case 0x0015:
						/* just treat hyperlink separators as
						 * space */
						buffer[++bufptr]=' ';
						break;
					case 0x0001: if (hyperlink_mode)
									break;
						/* else fall through */
					default:
						bufptr=-1; /* Any other control char - discard para*/
				}
			} else if (c != 0xfeff) {
				/* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
				 * else*/
				buffer[++bufptr]=c;
			}
			/* bufptr>=0 keeps buffer[-1] from being read after a discard.
			 * The bound is PARAGRAPH_BUFFER-3 because one pass may store two
			 * elements (tabmode marker plus the character) and the
			 * terminator below needs one more slot. */
		} while (bufptr>=0 &&
				 bufptr<PARAGRAPH_BUFFER-3 &&
				 !io_funcs->catdoc_eof(f) &&
				 buffer[bufptr]!=0x000a);
		/* paragraphs holding at most one element are not emitted */
		if (bufptr>0) {
			buffer[++bufptr]=0;
			output_paragraph(buffer, out, outputbuffer, &bufpos);
		}
	}
	free(buffer);

	return 0;
}
Пример #3
0
/*
 * Parses the RTF stream f from its beginning and emits its text paragraph
 * by paragraph.
 *
 * Characters are collected with add_to_buffer(); paragraphs are closed with
 * end_paragraph() and whatever is left at the end goes to
 * output_paragraph().  A per-group state array (uc count, mbcs flag,
 * codepage, charset) is kept; each '{' copies the enclosing group's state
 * and each '}' drops back to it.
 *
 * f - input stream; it is rewound with fseek() before parsing.
 *
 * Returns 0 on success, 1 if the group array cannot be allocated or grown.
 */
int parse_rtf(FILE *f) {
	/* para_mode / data_skip_mode hold the group depth at which they were
	 * switched on; 0 means "off".  While data_skip_mode is set, plain text
	 * and \'xx characters are not added to the buffer. */
	int para_mode=0, data_skip_mode=0,i;
	RTFGroupData *groups=NULL;
	int group_count=0, group_store=20;
	// this makes sure we have an mbcs lookup table available
	(void)read_charset("shiftjis");
	// and now set it back to the original
	(void)read_charset(source_csname);
	int bufptr=-1;
	fseek(f,0,SEEK_SET);
	if((groups=(RTFGroupData*)calloc(group_store,sizeof(RTFGroupData))) == NULL ) {
		perror("Can\'t allocate memory: ");
		return 1;
	}
	groups[0].uc = 1; /* RTF spec says DEfault uc = 1 */
	groups[0].mbcs = 0; /* assume not using multibyte characters */
	groups[0].codepage = 1252;
	groups[0].charset = source_charset;
	while ( !feof(f) ) {
		int c = fgetc(f);
		/* Test the returned value rather than feof(): on a read error
		 * fgetc() returns EOF without setting the end-of-file indicator,
		 * which would otherwise keep this loop spinning forever. */
		if ( c == EOF )
			break;
		switch (c) {
		case '\\': {
			int code;
			RTFcommand com;
			if ((code=getRtfCommand(f, &com)) != 0)
				break;
			switch (com.type) {
			case RTF_SPEC_CHAR:
				if (com.numarg == '*' && data_skip_mode == 0) {
					data_skip_mode=group_count;
				} else if (com.numarg == '\r') {
					end_paragraph(&bufptr);
				} else if (com.numarg == '~') {
					add_to_buffer(&bufptr,0xA0);/* NO-BREAK SPACE */
				} else if (com.numarg == '-') {
					add_to_buffer(&bufptr,0xAD);/* Optional hyphen */
				}
				   break;
			case RTF_EMDASH:
				   add_to_buffer(&bufptr,0x2014);/* EM DASH*/
				   break;
			case RTF_ENDASH: 
				   add_to_buffer(&bufptr,0x2013);break;
			case RTF_BULLET: 
				   add_to_buffer(&bufptr,0x2022);break;
			case RTF_LQUOTE: add_to_buffer(&bufptr,0x2018);break;
			case RTF_RQUOTE: add_to_buffer(&bufptr,0x2019);break;
			case RTF_LDBLQUOTE: add_to_buffer(&bufptr,0x201C);break;
			case RTF_RDBLQUOTE: add_to_buffer(&bufptr,0x201D);break;
			case RTF_ZWNONJOINER: add_to_buffer(&bufptr,0xfeff);break;
			case RTF_EMSPACE:
			case RTF_ENSPACE:
					add_to_buffer(&bufptr,' ');break;
			case RTF_CHAR:
				if (data_skip_mode == 0) {
					short int *charset = groups[group_count].charset;
					// check for multibyte characters - filter check on DBCS lead bytes as unicode charset cp932
					if (groups[group_count].mbcs &&
						(
						((com.numarg >= 0x81) && (com.numarg <= 0x9f)) ||
						((com.numarg >= 0xe0) && (com.numarg <= 0xfc))
						)
					) {
						// is next char a command lead-in
						int next_char = fgetc(f);
						if (next_char == '\\') {
							next_char = fgetc(f);
							// is it an escaped character?
							if (next_char == '\'') {
								RTFcommand com2;
								ungetc(next_char,f);
								next_char=getRtfCommand(f, &com2);
								if ((next_char != -1) && (com2.type == RTF_CHAR)) {
									// if a trailing byte in mcbs 2nd byte range
									if ((com2.numarg >= 0x40) && (com2.numarg <= 0xfc))
										// add mbcs char
										add_to_buffer(&bufptr, rtf_to_unicode((int) (((unsigned char)com.numarg) << 8) | (unsigned char)(com2.numarg), charset));
									else {
										// else add as 2 hich bytes
										add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset));
										add_to_buffer(&bufptr,rtf_to_unicode(com2.numarg, charset));
									}
								}
								// screwup in 2nd byte. Add hich char
								else
									add_to_buffer(&bufptr,rtf_to_unicode(com.numarg,charset));
							}
							// not a escaped character
							else {
								// push back values
								// NOTE(review): two consecutive ungetc() calls; the C
								// standard guarantees only one character of pushback.
								ungetc(next_char,f);
								ungetc('\\',f);
								// add hich char
								add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset));
							}
						}
						// not a command following
						else {
							// push back values
							ungetc(next_char,f);
							// add hich char
							add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset));
						}
					}
					else
						add_to_buffer(&bufptr,rtf_to_unicode(com.numarg, charset));
				}
				break;
			case RTF_UC:
				groups[group_count].uc=com.numarg;
				break;
			case RTF_TAB:
				add_to_buffer(&bufptr,0x0009);
				break;
			case RTF_UNICODE_CHAR:
				// NOTE(review): negative arguments are dropped here and their
				// fallback characters are not skipped - confirm whether
				// getRtfCommand already normalises \u values above 32767.
				if (com.numarg < 0)
					break;
				if (data_skip_mode == 0)
					add_to_buffer(&bufptr,com.numarg);
				// skip the fallback characters that follow the \u keyword;
				// their number is the current group's uc setting
				i=groups[group_count].uc;
				if (i > 0) {
					while (i--) {
						c = fgetc(f);
						// are we reading and skipping a control sequence?
						if (c == '\\') {
							// bin it (likely a \'xx value)
							getRtfCommand(f, &com);
						}
					}
				}
				break;
			case RTF_PARA:
				/* *** CHECKME *** if (para_mode > 0) {*/
					end_paragraph(&bufptr);	
				/*}*/	
				para_mode=group_count;
				break;
			case RTF_PICT:
			case RTF_FONTTBL:
				loadFontTable(f);
				break;
			case RTF_F:
				{
					RTFFontTableEntry *entry = lookupFontTableEntry(com.numarg);
					if (entry) {
						if (!entry->codepage)
							entry->codepage = charsetCodepage(entry->charset);
						if (entry->codepage != groups[group_count].codepage) {
							groups[group_count].codepage = entry->codepage;
							rtfSetCharset(&groups[group_count]);
						}
					}
				}
				break;
			case RTF_INFO:
			case RTF_COLORTBL:
			case RTF_STYLESHEET:
			case RTF_LISTTABLE:
			case RTF_LISTOVERRIDETABLE:
			case RTF_RSIDTBL:
			case RTF_GENERATOR:
			case RTF_DATAFIELD:
				if (data_skip_mode == 0){
					data_skip_mode=group_count;
				}
				break;
			case RTF_LANG:
/* 				fprintf(stderr, "Selected lang = %d\n",com.numarg); */ 
				groups[group_count].codepage = lcidCodepage(com.numarg);
				rtfSetCharset(&groups[group_count]);
				break;
			case RTF_DEFLANG:
			case RTF_DEFLANGFE:
				groups[group_count].codepage = lcidCodepage(com.numarg);
				rtfSetCharset(&groups[group_count]);
				break;
			case RTF_FONT_CHARSET:
				groups[group_count].codepage = charsetCodepage(com.numarg);
				rtfSetCharset(&groups[group_count]);
				break;
			case RTF_CODEPAGE:
				groups[group_count].codepage = com.numarg;
				rtfSetCharset(&groups[group_count]);
				break;
			case RTF_PLAIN:
				groups[group_count].mbcs = 0;
				groups[group_count].codepage = groups[0].codepage;
				rtfSetCharset(&groups[group_count]);
				break;
			case RTF_LOCH:
				groups[group_count].mbcs = 1;
				break;
			case RTF_HICH:
				groups[group_count].mbcs = 1;
				break;
			case RTF_DBCH:
				groups[group_count].mbcs = 1;
				break;
			case RTF_INDEX:
				{
					int current_group = group_count;
					/* skip all of current group */
					do {
						c = fgetc(f);
						/* a truncated file must not trap us here:
						 * at EOF group_count would never change */
						if (c == EOF)
							break;
						if (c == '{')
							group_count++;
						else if (c == '}')
							group_count--;
					} while (group_count >= current_group);
					/* Hand the closing brace back so the '}' case below
					 * runs its mode resets.
					 * NOTE(review): that case decrements group_count a
					 * second time for the same brace - confirm intended. */
					if (c != EOF)
						ungetc('}',f);
				}
				break;
		default:
/*  				fprintf(stderr, "Unknown command with name %s and arg=%d\n",  */
/*  						com.name, com.numarg);  */
			;
			}
			break;
		}
		case '{':
			group_count++;
			if (group_count >= group_store ) {
				RTFGroupData *grown;
				group_store+=10;
				/* keep the old pointer until realloc succeeds so it can
				 * still be released on failure */
				grown=(RTFGroupData*)realloc(groups,
					group_store*sizeof(RTFGroupData));
				if (grown == NULL) {
					perror("Can\'t allocate memory: ");
					free(groups);
					return 1;
				}
				groups=grown;
			}
// this looks wrong - removed pending review FIXME
//			if (para_mode)
//				add_to_buffer(&bufptr,0x20);
			/* a new group starts with a copy of the enclosing group's state */
			groups[group_count]=groups[group_count-1];
			break;
		case '}':
			group_count--;
			if(group_count < 0)
				group_count=0;
			if(para_mode > 0 && para_mode > group_count) {
				/*add_to_buffer(&bufptr,0);
				output_paragraph(buffer);
				fprintf(stderr,"\nGROUP_END para_mode=%d group_count=%d bufptr=%d\n", para_mode,group_count,bufptr);
				bufptr=-1;*/
				para_mode=0;
			}
			/* leaving the group in which skipping began ends the skip */
			if(data_skip_mode > group_count) {
				data_skip_mode=0;
			}
			break;
		default:
			/* plain text: line breaks in the RTF source are not content */
			if (data_skip_mode == 0)
				if (c != '\n' && c != '\r')
					add_to_buffer(&bufptr,rtf_to_unicode(c, groups[group_count].charset));
		}
	}
	/* emit whatever text is still pending as a final paragraph */
	if (bufptr>=0) {
		add_to_buffer(&bufptr,'\n');
		add_to_buffer(&bufptr,0);
		output_paragraph(buffer);
	}	
	free(groups);
	return 0;
}
Пример #4
0
/*
 * Closes the paragraph currently being accumulated and sends it to output.
 *
 * bufptr - in/out: index of the last element written to `buffer`;
 *          it is -1 when this function returns.
 *
 * A line feed (0x000a) and then a zero terminator are appended through
 * add_to_buffer(), `buffer` is handed to output_paragraph(), and the
 * write index is reset.
 */
static void end_paragraph(int *bufptr) {
	static const unsigned short int trailer[2] = { 0x000a, 0 };
	int k;

	for (k = 0; k < 2; k++) {
		add_to_buffer(bufptr, trailer[k]);
	}
	output_paragraph(buffer);
	*bufptr = -1;
}
Пример #5
0
/*
 * Reads characters from f with get_unicode_char() and emits the text one
 * paragraph at a time via output_paragraph(), using the shared `buffer`.
 *
 * f    - input stream, handed to get_unicode_char() and catdoc_eof().
 * stop - reading ends once the running offset reaches this value.
 *
 * Always returns 0.
 */
int process_file(FILE *f,long stop) {
	int bufptr;                /* index of the last element stored, -1 if none */
	int tabmode=0;             /* set after a 0x0007 table separator */
	long offset=0;
	int hyperlink_mode = 0;    /* set between 0x0013 and 0x0014 */
	unsigned short c;
	/* Now we are starting to read with get_unicode_char */
	while (!catdoc_eof(f) && offset<stop) {
		bufptr = -1;
		do {
			int unichar = get_unicode_char(f,&offset,stop);
			/* a negative result is not stored; `continue` re-evaluates the
			 * loop condition below */
			if (unichar < 0)
				continue;
			c = unichar;
			/* Following symbols below 32 are allowed inside paragraph:
			   0x0002 - footnote mark
			   0x0007 - table separator (converted to tabmode)
			   0x0009 - Horizontal tab ( printed as is)
			   0x000B - hard return
			   0x000C - page break
			   0x000D - return - marks an end of paragraph
			   0x001E - IS2 for some reason means short defis in Word.
			   0x001F - soft hyphen in Word
			   0x0013 - start embedded hyperlink
			   0x0014 - separate hyperlink URL from text
			   0x0015 - end embedded hyperlink
			   */
			if (tabmode) {
				tabmode=0;
				if (c==0x007) {
					/* two separators in a row are stored as 0x1E */
					buffer[++bufptr]=0x1E;
					continue;
				} else {
					/* a single separator is stored as 0x1C, then c is
					 * handled normally below (second store this pass) */
					buffer[++bufptr]=0x1C;
				}  
			}   	 
			if (c<32) {
				switch (c) {
					case 0x007:
						tabmode = 1;
						break;
					case 0x000D:
					case 0x000B:
						buffer[++bufptr]=0x000A;
						break;
					case 0x000C:
						buffer[++bufptr]=c;
						break;
					case 0x001E:
						buffer[++bufptr]='-';
						break;
					case 0x0002: break;

					case 0x001F:
								 buffer[++bufptr]=0xAD;/* translate to Unicode
														  soft hyphen */
								 break;						  
					case 0x0009:
								 buffer[++bufptr]=c;
								 break;
					case 0x0013:
								 hyperlink_mode=1;
								 buffer[++bufptr]=' ';
								 break;
					case 0x0014:
								 hyperlink_mode = 0;
								 /*fall through */
					case 0x0015:
								 /* just treat hyperlink separators as
								  * space */
								 buffer[++bufptr]=' ';
								 break;
					case 0x0001: if (hyperlink_mode) 
									 	break;
								 /* else fall through */
					default:
								 bufptr=-1; /* Any other control char - discard para*/
				}
			} else if (c != 0xfeff) {
				/* skip ZERO-WIDTH-UNBREAKABLE-SPACE. Output anything
				 * else*/
				buffer[++bufptr]=c;
			}
			/* The bound is PARAGRAPH_BUFFER-3, not -2: one pass may store
			 * two elements (tabmode marker plus the character) and the
			 * terminator below needs one more slot.
			 * NOTE(review): assumes buffer holds PARAGRAPH_BUFFER elements -
			 * confirm against its declaration. */
		} while (bufptr >=0 && bufptr<PARAGRAPH_BUFFER-3 && !catdoc_eof(f) && buffer[bufptr]!=0x000a);
		/* paragraphs holding at most one element are not emitted */
		if (bufptr>0) {
			buffer[++bufptr]=0;
			output_paragraph(buffer);
		}
	}
	return 0;
}