Ejemplo n.º 1
0
/*!
  Read the dictionary file m_fp as UTF-8 text and add one word per line.

  The file is consumed one character at a time.  Every run of characters
  between line breaks is decoded and handed to addWord(); CR, LF and CRLF
  all count as a single line break, and empty lines add nothing.  Text after
  the last line break is added as a final word.

  A multi-byte sequence that is cut short by the end of the file is dropped
  rather than decoded from leftover bytes of an earlier character.

  \return true when the whole file has been consumed.  Failures while
          buffering a character or in addWord() leave the function early
          through X_ReturnIfFail.
 */
bool XAP_Dictionary::_parseUTF8(void)
{
	UT_GrowBuf gbBlock(1024);
	bool bEatLF = false;
	bool bTruncated = false;
	gchar buf[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00};
	gint len;

	while (!bTruncated && fread(buf, 1, sizeof(gchar), m_fp) > 0)
	{
		switch (buf[0])
		{
		case '\r':
		case '\n':
			// The LF of a CRLF pair was already accounted for by its CR.
			if ((buf[0] == '\n') && bEatLF)
			{
				bEatLF = false;
				break;
			}

			if (buf[0] == '\r')
			{
				bEatLF = true;
			}

			// we interpret either CRLF, CR, or LF as a word delimiter.

			if (gbBlock.getLength() > 0)
			{
				X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength()));
				gbBlock.truncate(0);
			}
			break;

		default:
		{
			bEatLF = false;

			// The lead byte tells how many bytes the sequence occupies;
			// fetch the remaining ones before decoding.
			len = g_utf8_next_char(buf) - buf;
			if (len > 1 &&
			    fread(buf + 1, 1, len - 1, m_fp) != static_cast<size_t>(len - 1))
			{
				// End of file (or a read error) in the middle of a
				// character: buf still holds bytes of a previous
				// character, so decoding it would produce garbage.
				bTruncated = true;
				break;
			}

			UT_UCSChar uc = g_utf8_get_char(buf);
			X_ReturnIfFail(gbBlock.ins(gbBlock.getLength(),reinterpret_cast<UT_GrowBufElement*>(&uc),1));
			break;
		}
		}
	}

	// Emit a last word that was not terminated by a line break.
	if (gbBlock.getLength() > 0)
	{
		X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength()));
	}

	return true;
}
/*!
  Import the text of a PalmDoc database into the document.

  The PDB header is checked for the DOC type/creator tags; anything else
  (including a file too short to hold a header) yields a document with one
  empty paragraph.  Otherwise every text record is read into m_buf,
  uncompressed when record 0 announces version 2, converted byte by byte
  through m_Mbtowc and appended to the document.  CR, LF and CRLF each
  start a new paragraph; null bytes are ignored.

  Records with empty or reversed bounds, or whose data cannot be read, are
  skipped.  A record longer than BUFFER_SIZE is read only up to BUFFER_SIZE
  bytes so that m_buf->buf is never overrun.

  \param pdfp Input positioned anywhere; the function seeks as needed.
  \return UT_OK on completion.  Failures while appending to the document
          leave the function early through X_ReturnNoMemIfError.
 */
UT_Error IE_Imp_PalmDoc::_parseFile(GsfInput * pdfp)
{
	UT_GrowBuf gbBlock(1024);
	bool bEatLF = false;
	bool bEmptyFile = true;
	UT_UCSChar c;
	UT_UCS4Char wc;

	pdb_header	header;
	doc_record0	rec0;
	bool		bCompressed = false;
	int		num_records, rec_num;
	DWord		file_size, offset;

	// A failed header read leaves `header` uninitialised, so it is handled
	// exactly like a header carrying the wrong tags.
	if (!gsf_input_read( pdfp, PDB_HEADER_SIZE, reinterpret_cast<guint8*>(&header)) ||
	    strncmp( header.type,    DOC_TYPE,    sizeof(header.type) ) ||
	    strncmp( header.creator, DOC_CREATOR, sizeof(header.creator) ))
	{
		UT_DEBUGMSG(("This is not a DOC file!\n"));

		// Create an empty paragraph.
		X_ReturnNoMemIfError(appendStrux(PTX_Block, NULL));
		return UT_OK;
	}

	// Record 0 is the DOC header record, not text.
	num_records = _swap_Word( header.numRecords ) - 1;

	gsf_input_seek( pdfp, PDB_HEADER_SIZE, G_SEEK_SET );
	GET_DWord( pdfp, offset );
	gsf_input_seek( pdfp, offset, G_SEEK_SET );

	// Only trust rec0 when it was actually read.
	if ( gsf_input_read( pdfp, sizeof(rec0), reinterpret_cast<guint8*>(&rec0)) &&
	     _swap_Word( rec0.version ) == 2 )
		bCompressed = true;

	gsf_input_seek( pdfp, 0, G_SEEK_END );
	file_size = gsf_input_tell( pdfp );

	for (rec_num = 1; rec_num <= num_records; ++rec_num )
	{
		DWord next_offset;

		// A record extends from its own offset to the next record's
		// offset, or to the end of the file for the last record.
		gsf_input_seek( pdfp, PDB_HEADER_SIZE + PDB_RECORD_HEADER_SIZE * rec_num, G_SEEK_SET);
		GET_DWord( pdfp, offset );
		if( rec_num < num_records )
		{
			gsf_input_seek( pdfp, PDB_HEADER_SIZE + PDB_RECORD_HEADER_SIZE * (rec_num + 1), G_SEEK_SET);
			GET_DWord( pdfp, next_offset );
		}
		else
			next_offset = file_size;

		// Empty or reversed bounds carry no data; subtracting them would
		// also wrap around to a huge length.
		if ( next_offset <= offset )
			continue;

		// Never read more than the buffer that is zero-filled below holds.
		DWord rec_size = next_offset - offset;
		if ( rec_size > static_cast<DWord>(BUFFER_SIZE) )
			rec_size = static_cast<DWord>(BUFFER_SIZE);

		gsf_input_seek( pdfp, offset, G_SEEK_SET );

		// be overly cautious here
		_zero_fill (m_buf->buf, BUFFER_SIZE);
		if ( !gsf_input_read(pdfp, rec_size, m_buf->buf) )
			continue;
		m_buf->position = rec_size;

		if ( bCompressed )
			_uncompress( m_buf );

		m_buf->position = 0;

		while ( (m_buf->position) < (m_buf->len) )
		{
			// don't copy over null chars
			if (m_buf->buf[m_buf->position] == '\0')
			{
				++m_buf->position;
				continue;
			}

			// No character produced for this byte: move on to the next
			// byte instead of feeding the same one again.
			if( !m_Mbtowc.mbtowc( wc, m_buf->buf[m_buf->position] ) )
			{
				++m_buf->position;
				continue;
			}

			c = static_cast<UT_UCSChar>(wc);
			switch (c)
			{
			case static_cast<UT_UCSChar>('\r'):
			case static_cast<UT_UCSChar>('\n'):

				// The LF of a CRLF pair was already handled by its CR.
				if ((c == static_cast<UT_UCSChar>('\n')) && bEatLF)
				{
					bEatLF = false;
					break;
				}

				if (c == static_cast<UT_UCSChar>('\r'))
				{
					bEatLF = true;
				}

				// we interpret either CRLF, CR, or LF as a paragraph break.

				// start a paragraph and emit any text that we
				// have accumulated.
				X_ReturnNoMemIfError(appendStrux(PTX_Block, NULL));
				bEmptyFile = false;
				if (gbBlock.getLength() > 0)
				{
					X_ReturnNoMemIfError(appendSpan(reinterpret_cast<const UT_UCSChar*>(gbBlock.getPointer(0)), gbBlock.getLength()));
					gbBlock.truncate(0);
				}
				break;

			default:
				bEatLF = false;
				X_ReturnNoMemIfError(gbBlock.ins(gbBlock.getLength(),reinterpret_cast<const UT_GrowBufElement *>(&c),1));
				break;
			}

			++m_buf->position;
		}

	}
	if (gbBlock.getLength() > 0 || bEmptyFile)
	{
		// if we have text left over (without final CR/LF),
		// or if we read an empty file,
		// create a paragraph and emit the text now.
		X_ReturnNoMemIfError(appendStrux(PTX_Block, NULL));
		if (gbBlock.getLength() > 0)
			X_ReturnNoMemIfError(appendSpan(reinterpret_cast<const UT_UCSChar *>(gbBlock.getPointer(0)), gbBlock.getLength()));
	}

	return UT_OK;
}
/*!
  Parse stream contents into the document
 \param pStream Stream to import from; must not be null
 \return UT_ERROR when pStream is null, UT_OK once the stream is exhausted.
         Failures while buffering or inserting text leave the function
         early through X_ReturnNoMemIfError.

 Characters are collected until a paragraph break (CR, LF, U+2028 or
 U+2029) is seen; the collected text is then inserted, followed by a new
 block.  A byte order mark is dropped when it is the first character and
 kept unchanged anywhere else.  Control characters that have no place in
 plain text are replaced by '?'.

 This code is used for both files and the clipboard
 */
UT_Error IE_Imp_Text::_parseStream(ImportStream * pStream)
{
	UT_return_val_if_fail(pStream, UT_ERROR);

	bool bFirstChar = true;
	UT_GrowBuf gbBlock(1024);
	UT_UCSChar c;

	// An encoding given as an import property is only used when the caller
	// has not chosen one explicitly.
	if (!m_bExplicitlySetEncoding) {
		std::string prop;

		prop = getProperty ("encoding");
		if (!prop.empty()) {
			_setEncoding (prop.c_str());
		}
	}

	pStream->init(m_szEncoding);

	while (pStream->getChar(c))
	{
		// TODO We should switch fonts when we encounter
		// TODO characters from different scripts
		switch (c)
		{
		case UCS_CR:
		case UCS_LF:
		case UCS_LINESEP:
		case UCS_PARASEP:
			// we interpret either CRLF, CR, or LF as a paragraph break.
			// we also accept U+2028 (line separator) and U+2029 (para separator)
			// especially since these are recommended by Mac OS X.

			// flush out what we have
			if (gbBlock.getLength() > 0)
				X_ReturnNoMemIfError(_insertSpan(gbBlock));
			X_ReturnNoMemIfError(_insertBlock());
			break;

		case UCS_BOM:
			// This is Byte Order Mark at the start of file, where it is
			// dropped.  Elsewhere it is an ordinary (invisible) character
			// and is kept as is; it must not run into the control
			// character cases below, which would turn it into '?'.
			if (!bFirstChar)
			{
				X_ReturnNoMemIfError(gbBlock.append(reinterpret_cast<UT_GrowBufElement*>(&c),1));
			}
			break;

		// if we encounter any of the following characters we will
		// substitute a '?' as they correspond to control characters,
		// though some text files use them for their character representations
		// We do this instead of immediately returning an error
		// (and assuming they have no business in a text file) so we can
		// still show usable text to a user who has one of these files.
		case 0x0000:
		case 0x0001:
		case 0x0002:
		case 0x0003:
		case 0x0004:
		case 0x0005:
		case 0x0006:
		case 0x0007:
		case 0x0008:
		case 0x000e:
		case 0x000f:
		case 0x0010:
		case 0x0011:
		case 0x0012:
		case 0x0013:
		case 0x0014:
		case 0x0015:
		case 0x0016:
		case 0x0017:
		case 0x0018:
		case 0x0019:
		case 0x001a:
		case 0x001b:
		case 0x001c:
		case 0x001d:
		case 0x001e:
		case 0x001f:
			// UT_ASSERT(!(c <= 0x001f));
			c = '?';
			/* return UT_ERROR; // fall through with modified character */
			// fall through

		default:
			X_ReturnNoMemIfError(gbBlock.append(reinterpret_cast<UT_GrowBufElement*>(&c),1));
			break;
		}
		bFirstChar = false;
	}

	// Insert text that was not followed by a paragraph break.
	if (gbBlock.getLength() > 0)
		X_ReturnNoMemIfError(_insertSpan(gbBlock));

	return UT_OK;
}