bool XAP_Dictionary::_parseUTF8(void) { UT_GrowBuf gbBlock(1024); bool bEatLF = false; gchar buf[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; gint len; while (fread(buf, 1, sizeof(gchar), m_fp) > 0) { switch (buf[0]) { case '\r': case '\n': if ((buf[0] == '\n') && bEatLF) { bEatLF = false; break; } if (buf[0] == '\r') { bEatLF = true; } // we interprete either CRLF, CR, or LF as a word delimiter. if (gbBlock.getLength() > 0) { X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength())); gbBlock.truncate(0); } break; default: bEatLF = false; len = g_utf8_next_char(buf) - buf; if (len > 1) { fread (buf + 1, len - 1, sizeof (gchar), m_fp); } UT_UCSChar uc = g_utf8_get_char(buf); X_ReturnIfFail(gbBlock.ins(gbBlock.getLength(),reinterpret_cast<UT_GrowBufElement*>(&uc),1)); break; } } if (gbBlock.getLength() > 0) { X_ReturnIfFail(addWord(reinterpret_cast<UT_UCS4Char*>(gbBlock.getPointer(0)), gbBlock.getLength())); } return true; }
/*!
  Import the text of a PalmDoc database into the document.

  The PDB header is checked for the DOC type/creator; anything else (or a
  header that cannot be read) produces a document with a single empty
  paragraph. Record 0 is read to find out whether the text records are
  compressed (version 2). Every following record is loaded into m_buf,
  uncompressed if necessary, converted byte by byte through m_Mbtowc, and
  split into paragraphs on CR, LF or CRLF.

  \param pdfp Input to read the PalmDoc database from.
  \return UT_OK when parsing finished. The X_ReturnNoMemIfError macro
          (defined elsewhere) is expected to return an error early when
          appending to the document or to the text buffer fails.
*/
UT_Error IE_Imp_PalmDoc::_parseFile(GsfInput * pdfp)
{
	UT_GrowBuf gbBlock(1024);
	bool bEatLF = false;
	bool bEmptyFile = true;
	UT_UCSChar c;
	UT_UCS4Char wc;
	pdb_header header;
	doc_record0 rec0;
	bool bCompressed = false;
	int num_records, rec_num;
	DWord file_size, offset;

	// A failed read leaves 'header' uninitialised, so treat it exactly like
	// a header with the wrong signature.
	const bool bHeaderRead =
		(gsf_input_read( pdfp, PDB_HEADER_SIZE, reinterpret_cast<guint8*>(&header) ) != NULL);

	if (!bHeaderRead
		|| strncmp( header.type, DOC_TYPE, sizeof(header.type) )
		|| strncmp( header.creator, DOC_CREATOR, sizeof(header.creator) ))
	{
		UT_DEBUGMSG(("This is not a DOC file!\n"));

		// Create an empty paragraph.
		X_ReturnNoMemIfError(appendStrux(PTX_Block, NULL));
		return UT_OK;
	}

	// Record 0 carries the document settings; the rest are processed below.
	num_records = _swap_Word( header.numRecords ) - 1;

	gsf_input_seek( pdfp, PDB_HEADER_SIZE, G_SEEK_SET );
	GET_DWord( pdfp, offset );
	gsf_input_seek( pdfp, offset, G_SEEK_SET );

	// Only trust rec0 when it was actually read.
	if ( gsf_input_read( pdfp, sizeof(rec0), reinterpret_cast<guint8*>(&rec0) ) != NULL
		 && _swap_Word( rec0.version ) == 2 )
	{
		bCompressed = true;
	}

	gsf_input_seek( pdfp, 0, G_SEEK_END );
	file_size = gsf_input_tell( pdfp );

	for (rec_num = 1; rec_num <= num_records; ++rec_num )
	{
		DWord next_offset;

		// A record extends from its own offset to the next record's offset,
		// or to the end of the file for the last record.
		gsf_input_seek( pdfp, PDB_HEADER_SIZE + PDB_RECORD_HEADER_SIZE * rec_num, G_SEEK_SET);
		GET_DWord( pdfp, offset );
		if( rec_num < num_records )
		{
			gsf_input_seek( pdfp, PDB_HEADER_SIZE + PDB_RECORD_HEADER_SIZE * (rec_num + 1), G_SEEK_SET);
			GET_DWord( pdfp, next_offset );
		}
		else
		{
			next_offset = file_size;
		}

		// The offsets come straight from the file. Refuse extents that are
		// inverted (the unsigned subtraction would wrap) or that exceed
		// BUFFER_SIZE, the span of m_buf->buf that is zero-filled below and
		// therefore taken as its capacity.
		if (next_offset < offset
			|| (next_offset - offset) > static_cast<DWord>(BUFFER_SIZE))
		{
			UT_DEBUGMSG(("PalmDoc: skipping record %d with invalid extent\n", rec_num));
			continue;
		}
		const DWord rec_size = next_offset - offset;

		gsf_input_seek( pdfp, offset, G_SEEK_SET );

		// be overly cautious here
		_zero_fill (m_buf->buf, BUFFER_SIZE);
		gsf_input_read(pdfp, rec_size, m_buf->buf);
		m_buf->position = rec_size;

		if ( bCompressed )
			_uncompress( m_buf );

		m_buf->position = 0;

		while ( (m_buf->position) < (m_buf->len) )
		{
			// don't copy over null chars
			if (m_buf->buf[m_buf->position] == '\0')
			{
				++m_buf->position;
				continue;
			}

			if( !m_Mbtowc.mbtowc( wc, m_buf->buf[m_buf->position] ) )
			{
				// No character produced for this byte. Move on to the next
				// byte; retrying the same one could loop forever.
				++m_buf->position;
				continue;
			}

			c = static_cast<UT_UCSChar>(wc);
			switch (c)
			{
			case static_cast<UT_UCSChar>('\r'):
			case static_cast<UT_UCSChar>('\n'):
				if ((c == static_cast<UT_UCSChar>('\n')) && bEatLF)
				{
					// LF directly after CR: the pair counts once.
					bEatLF = false;
					break;
				}

				if (c == static_cast<UT_UCSChar>('\r'))
				{
					bEatLF = true;
				}

				// We interpret either CRLF, CR, or LF as a paragraph break.
				// Start a paragraph and emit any text that we have
				// accumulated.
				X_ReturnNoMemIfError(appendStrux(PTX_Block, NULL));
				bEmptyFile = false;
				if (gbBlock.getLength() > 0)
				{
					X_ReturnNoMemIfError(appendSpan(reinterpret_cast<const UT_UCSChar*>(gbBlock.getPointer(0)), gbBlock.getLength()));
					gbBlock.truncate(0);
				}
				break;

			default:
				bEatLF = false;
				X_ReturnNoMemIfError(gbBlock.ins(gbBlock.getLength(), reinterpret_cast<const UT_GrowBufElement *>(&c), 1));
				break;
			}

			++m_buf->position;
		}
	}

	if (gbBlock.getLength() > 0 || bEmptyFile)
	{
		// If we have text left over (without final CR/LF), or if we read an
		// empty file, create a paragraph and emit the text now.
		X_ReturnNoMemIfError(appendStrux(PTX_Block, NULL));
		if (gbBlock.getLength() > 0)
		{
			X_ReturnNoMemIfError(appendSpan(reinterpret_cast<const UT_UCSChar *>(gbBlock.getPointer(0)), gbBlock.getLength()));
		}
	}

	return UT_OK;
}
/*!
  Parse stream contents into the document
 \param pStream Stream to import from
 \return UT_ERROR when pStream is null, otherwise UT_OK once the stream is
         exhausted. The X_ReturnNoMemIfError macro (defined elsewhere) is
         expected to return an error early when an insertion fails.

 This code is used for both files and the clipboard.

 Characters are collected into a buffer until a paragraph separator is
 seen; the buffer is then inserted as a span, followed by a new block.
 Text remaining after the last separator is inserted as a final span.
*/
UT_Error IE_Imp_Text::_parseStream(ImportStream * pStream)
{
	UT_return_val_if_fail(pStream, UT_ERROR);

	bool bFirstChar = true;
	UT_GrowBuf gbBlock(1024);
	UT_UCSChar c;

	// An encoding supplied through the "encoding" property is used only
	// when none has been set explicitly.
	if (!m_bExplicitlySetEncoding)
	{
		std::string prop;
		prop = getProperty ("encoding");
		if (!prop.empty())
		{
			_setEncoding (prop.c_str());
		}
	}

	pStream->init(m_szEncoding);

	while (pStream->getChar(c))
	{
		// TODO We should switch fonts when we encounter
		// TODO characters from different scripts
		switch (c)
		{
		case UCS_CR:
		case UCS_LF:
		case UCS_LINESEP:
		case UCS_PARASEP:
			// We interpret either CRLF, CR, or LF as a paragraph break.
			// We also accept U+2028 (line separator) and U+2029 (para
			// separator), especially since these are recommended by
			// Mac OS X.

			// flush out what we have
			if (gbBlock.getLength() > 0)
			{
				X_ReturnNoMemIfError(_insertSpan(gbBlock));
			}
			X_ReturnNoMemIfError(_insertBlock());
			break;

		case UCS_BOM:
			// This is a Byte Order Mark at the start of the stream and is
			// dropped there. Anywhere else it is an ordinary zero-width
			// character and is kept as it is; it must not fall through
			// into the control-character substitution below.
			if (!bFirstChar)
			{
				X_ReturnNoMemIfError(gbBlock.append(reinterpret_cast<UT_GrowBufElement*>(&c),1));
			}
			break;

		// If we encounter any of the following characters we substitute
		// a '?', as they correspond to control characters, though some
		// text files use them for their character representations.
		// We do this instead of immediately returning an error (and
		// assuming they have no business in a text file) so we can still
		// show usable text to a user who has one of these files.
		case 0x0000: case 0x0001: case 0x0002: case 0x0003:
		case 0x0004: case 0x0005: case 0x0006: case 0x0007:
		case 0x0008:
		case 0x000e: case 0x000f:
		case 0x0010: case 0x0011: case 0x0012: case 0x0013:
		case 0x0014: case 0x0015: case 0x0016: case 0x0017:
		case 0x0018: case 0x0019: case 0x001a: case 0x001b:
		case 0x001c: case 0x001d: case 0x001e: case 0x001f:
			// UT_ASSERT(!(c <= 0x001f));
			c = '?';
			/* return UT_ERROR; */
			// deliberate fall through with the modified character

		default:
			X_ReturnNoMemIfError(gbBlock.append(reinterpret_cast<UT_GrowBufElement*>(&c),1));
			break;
		}

		bFirstChar = false;
	}

	// Emit text that was not terminated by a paragraph separator.
	if (gbBlock.getLength() > 0)
	{
		X_ReturnNoMemIfError(_insertSpan(gbBlock));
	}

	return UT_OK;
}