/*!
  Guess the encoding of a text buffer and make it the importer's encoding
 \param szBuf Buffer to examine
 \param iNumbytes Length of the buffer in bytes
 \return Always UT_OK

 Recognizes UTF-8 and UCS-2 (big and little endian). Anything else falls
 back to an 8-bit default: the native 8-bit encoding when TOOLKIT_WIN is
 defined, ISO-8859-1 otherwise.
 CJK encodings could be added
 */
UT_Error IE_Imp_Text::_recognizeEncoding(const char *szBuf, UT_uint32 iNumbytes)
{
	// UTF-8 is probed first; UCS-2 detection only runs when it does not match.
	if (IE_Imp_Text_Sniffer::_recognizeUTF8(szBuf, iNumbytes))
	{
		_setEncoding("UTF-8");
		return UT_OK;
	}

	const IE_Imp_Text_Sniffer::UCS2_Endian eEndian =
		IE_Imp_Text_Sniffer::_recognizeUCS2(szBuf, iNumbytes, false);

	switch (eEndian)
	{
	case IE_Imp_Text_Sniffer::UE_BigEnd:
		_setEncoding(XAP_EncodingManager::get_instance()->getUCS2BEName());
		break;

	case IE_Imp_Text_Sniffer::UE_LittleEnd:
		_setEncoding(XAP_EncodingManager::get_instance()->getUCS2LEName());
		break;

	default:
		// Neither UTF-8 nor UCS-2: use the platform-dependent 8-bit fallback.
#ifdef TOOLKIT_WIN
		_setEncoding(XAP_EncodingManager::get_instance()->getNative8BitEncodingName());
#else
		_setEncoding("ISO-8859-1");
#endif
		break;
	}

	return UT_OK;
}
// Example #2
void XAP_UnixDialog_Encoding::event_Ok(void)
{
    GtkTreeSelection * selection;
    GtkTreeIter iter;
    GtkTreeModel * model;

    gint row = 0;

    selection = gtk_tree_view_get_selection( GTK_TREE_VIEW(m_listEncodings) );

    // if there is no selection, or the selection's data (GtkListItem widget)
    // is empty, return cancel.  GTK can make this happen.
    if ( !selection ||
            !gtk_tree_selection_get_selected (selection, &model, &iter)
       )
    {
        _setAnswer (XAP_Dialog_Encoding::a_CANCEL);
        return;
    }

    // get the ID of the selected Type
    gtk_tree_model_get (model, &iter, 1, &row, -1);

    if (row >= 0) {
        _setSelectionIndex(static_cast<UT_uint32>(row));
        _setEncoding (_getAllEncodings()[row]);
        _setAnswer (XAP_Dialog_Encoding::a_OK);
    } else {
        UT_ASSERT_NOT_REACHED();
        _setAnswer (XAP_Dialog_Encoding::a_CANCEL);
    }
}
/*!
  Construct text exporter
 \param pDocument Document to export
 \param bEncoded True if the encoding should be treated as user-selectable

 m_bIsEncoded is set when either bEncoded or the AlwaysPromptEncoding
 preference is true. The initial encoding is the document's encoding name
 if it has a non-empty one, otherwise the native encoding.
 */
IE_Exp_Text::IE_Exp_Text(PD_Document * pDocument, bool bEncoded)
	: IE_Exp(pDocument),
	  m_pListener(NULL),
	  m_bIsEncoded(false),
	  m_szEncoding(NULL),
	  m_bExplicitlySetEncoding(false),
	  m_bIs16Bit(false),
	  m_bUnicode(false),
	  m_bBigEndian(false),
	  m_bUseBOM(false)
{
	m_error = UT_OK;

	// The preference lookup leaves the default (false) in place if it
	// does not write a value.
	bool bPromptPref = false;
	XAP_App::getApp()->getPrefsValueBool(AP_PREF_KEY_AlwaysPromptEncoding, &bPromptPref);

	m_bIsEncoded = bPromptPref || bEncoded;

	// Prefer the document's own encoding; fall back to the native one
	// when the document has none (null or empty name).
	const char *szEncodingName = pDocument->getEncodingName();
	const bool bDocHasEncoding = (szEncodingName != NULL) && (*szEncodingName != '\0');
	if (!bDocHasEncoding)
	{
		szEncodingName = XAP_EncodingManager::get_instance()->getNativeEncodingName();
	}

	_setEncoding(szEncodingName);
}
/*!
  Write the document (or the current document range) through a listener
 \return UT_SAVE_CANCELLED if the encoding dialog was needed and
         _doEncodingDialog() returned false, UT_IE_NOMEMORY if no listener
         could be constructed, UT_IE_COULDNOTWRITE if m_error is set after
         the listener has run, UT_OK otherwise
 */
UT_Error IE_Exp_Text::_writeDocument(void)
{
	// The dialog is only consulted when the export is flagged as encoded
	// and no encoding was set explicitly. Don't go on if the user cancels.
	const bool bNeedDialog = m_bIsEncoded && !m_bExplicitlySetEncoding;
	if (bNeedDialog && !_doEncodingDialog(m_szEncoding))
	{
		return UT_SAVE_CANCELLED;
	}

	// TODO If we're going to the clipboard and the OS supports unicode, set encoding.
	// TODO Only supports Windows so far.
	// TODO Should use a finer-grain technique than IsWinNT() since Win98 supports unicode clipboard.
	if (getDocRange())
	{
#ifdef WIN32
		if (UT_IsWinNT())
		{
			_setEncoding(XAP_EncodingManager::get_instance()->getNativeUnicodeEncodingName());
		}
#endif
	}

	m_pListener = _constructListener();
	if (!m_pListener)
	{
		return UT_IE_NOMEMORY;
	}

	// Feed either the whole document or just the requested range to the listener.
	PL_Listener * pListener = static_cast<PL_Listener *>(m_pListener);
	if (!getDocRange())
	{
		getDoc()->tellListener(pListener);
	}
	else
	{
		getDoc()->tellListenerSubset(pListener, getDocRange());
	}
	DELETEP(m_pListener);

	if (m_error)
	{
		return UT_IE_COULDNOTWRITE;
	}
	return UT_OK;
}
/*!
  Create the listener that performs the text export
 \return Newly allocated Text_Listener

 Unless an encoding was set explicitly, a non-empty "encoding" property
 is applied before the listener is built.
 */
PL_Listener * IE_Exp_Text::_constructListener(void)
{
	if (!m_bExplicitlySetEncoding)
	{
		const std::string & sEncodingProp = getProperty ("encoding");
		if (!sEncodingProp.empty())
		{
			_setEncoding (sEncodingProp.c_str());
		}
	}

	// Third constructor argument: whether a document range is set.
	const bool bHasDocRange = (getDocRange() != NULL);

	return new Text_Listener(getDoc(), this, bHasDocRange, m_szEncoding,
							 m_bIs16Bit, m_bUnicode, m_bUseBOM, m_bBigEndian);
}
/*!
  Paste text from a memory buffer at a document position
 \param pDocRange Insertion point; must belong to this importer's document
                  and be empty (m_pos1 == m_pos2)
 \param pData Buffer holding the text
 \param lenData Length of the buffer
 \param szEncoding Encoding of the buffer, or NULL to have it guessed
 \return false if pDocRange is NULL or fails the checks above, true otherwise
 */
bool IE_Imp_Text::pasteFromBuffer(PD_DocumentRange * pDocRange,
								  const unsigned char * pData, UT_uint32 lenData,
								  const char *szEncoding)
{
	// pDocRange is dereferenced by the checks below, so reject NULL first.
	UT_return_val_if_fail(pDocRange, false);
	UT_return_val_if_fail(getDoc() == pDocRange->m_pDoc,false);
	UT_return_val_if_fail(pDocRange->m_pos1 == pDocRange->m_pos2,false);

	// Attempt to guess whether we're pasting 8 bit or unicode text
	if (szEncoding)
		_setEncoding(szEncoding);
	else
		_recognizeEncoding(reinterpret_cast<const char *>(pData), lenData);

	ImportStreamClipboard stream(pData, lenData);
	setClipboard (pDocRange->m_pos1);

	// NOTE(review): the result of _parseStream() is discarded, so this
	// function reports success even if parsing returned an error.
	_parseStream(&stream);
	return true;
}
/*!
  Construct text importer with an optional fixed encoding
 \param pDocument Document to import text into
 \param encoding Encoding name; NULL or empty means none was given

 A non-empty encoding is applied immediately and marked as explicitly set.
 */
IE_Imp_Text::IE_Imp_Text(PD_Document * pDocument, const char * encoding)
  : IE_Imp(pDocument),
    m_szEncoding(NULL),
    m_bExplicitlySetEncoding(false),
    m_bIsEncoded(false),
    m_bIs16Bit(false),
    m_bUseBOM(false),
    m_bBigEndian(false),
    m_bBlockDirectionPending(true),
    m_bFirstBlockData(true),
    m_pBlock(NULL)
{
  // An encoding counts as supplied only if the string is non-empty.
  const bool bHaveEncoding = (encoding != NULL) && (*encoding != '\0');

  m_bIsEncoded = bHaveEncoding;

  if (bHaveEncoding)
    {
      m_bExplicitlySetEncoding = true;
      _setEncoding(encoding);
    }
}
/*!
  Construct text exporter with an optional fixed encoding
 \param pDocument Document to export
 \param encoding Encoding name; NULL or empty means none was given

 A non-empty encoding is applied immediately and marked as explicitly set.
 */
IE_Exp_Text::IE_Exp_Text(PD_Document * pDocument, const char * encoding)
  : IE_Exp(pDocument),
    m_pListener(NULL),
    m_bIsEncoded(false),
    m_szEncoding(NULL),
    m_bExplicitlySetEncoding(false),
    m_bIs16Bit(false),
    m_bUnicode(false),
    m_bBigEndian(false),
    m_bUseBOM(false)
{
  m_error = UT_OK;

  // An encoding counts as supplied only if the string is non-empty.
  const bool bHaveEncoding = (encoding != NULL) && (*encoding != '\0');

  m_bIsEncoded = bHaveEncoding;

  if (bHaveEncoding)
    {
      m_bExplicitlySetEncoding = true;
      _setEncoding(encoding);
    }
}
/*!
  Request file encoding from user
 \param szEncoding Encoding handed to the dialog before it is run
 \return true if the dialog was answered with OK and the chosen encoding
         was applied; false if the dialog could not be run, was not
         answered with OK, or yielded no usable encoding name

 On success the chosen encoding is set on the importer and stored as the
 document's encoding name.

 This function should be identical to the one in ie_Exp_Text
 */
bool IE_Imp_Text::_doEncodingDialog(const char *szEncoding)
{
	// Look up the frame before requesting the dialog, so that bailing out
	// here cannot leave a requested dialog unreleased.
	XAP_Frame * pFrame = XAP_App::getApp()->getLastFocussedFrame();
	UT_return_val_if_fail(pFrame, false);

	XAP_Dialog_Id id = XAP_DIALOG_ID_ENCODING;

	XAP_DialogFactory * pDialogFactory
		= static_cast<XAP_DialogFactory *>(XAP_App::getApp()->getDialogFactory());

	XAP_Dialog_Encoding * pDialog
		= static_cast<XAP_Dialog_Encoding *>(pDialogFactory->requestDialog(id));
	UT_return_val_if_fail(pDialog, false);

	pDialog->setEncoding(szEncoding);

	// run the dialog
	pDialog->runModal(pFrame);

	// extract what they did

	bool bOK = (pDialog->getAnswer() == XAP_Dialog_Encoding::a_OK);
	bool bBadName = false;

	if (bOK)
	{
		// NOTE(review): the buffer is static presumably because
		// _setEncoding() keeps the pointer rather than copying the string;
		// confirm before turning this into a local.
		static gchar szEnc[64];

		const gchar * s = pDialog->getEncoding();

		// Copy only when the name (including its terminator) fits; a
		// truncated encoding name would be wrong, so treat it as a failure.
		if (s && strlen(s) < sizeof(szEnc))
		{
			strcpy(szEnc, s);
			_setEncoding(static_cast<const char *>(szEnc));
			getDoc()->setEncodingName(szEnc);
		}
		else
		{
			bBadName = true;
		}
	}

	// Release the dialog on every path that requested it.
	pDialogFactory->releaseDialog(pDialog);

	// Missing or over-long encoding name: report failure after cleanup.
	UT_return_val_if_fail(!bBadName, false);

	return bOK;
}
/*!
  Construct text importer
 \param pDocument Document to import text into
 \param bEncoded True if we should show encoding dialog

 m_bIsEncoded is set when either bEncoded or the AlwaysPromptEncoding
 preference is true.
 Uses current document's encoding if it is set, otherwise the native
 encoding.
*/
IE_Imp_Text::IE_Imp_Text(PD_Document * pDocument, bool bEncoded)
  : IE_Imp(pDocument),
    m_szEncoding(0),
    m_bExplicitlySetEncoding(false),
    m_bIsEncoded(false),
    m_bIs16Bit(false),
    m_bUseBOM(false),
	m_bBigEndian(false),
	m_bBlockDirectionPending(true),
	m_bFirstBlockData(true),
	m_pBlock(NULL)
{
	// Get encoding dialog prefs setting.
	// Initialized so that a lookup which does not write the out-parameter
	// cannot leave an indeterminate value to be read below.
	bool bAlwaysPrompt = false;
	XAP_App::getApp()->getPrefsValueBool(AP_PREF_KEY_AlwaysPromptEncoding, &bAlwaysPrompt);

	m_bIsEncoded = bAlwaysPrompt || bEncoded;

	// Fall back to the native encoding when the document has no
	// (or an empty) encoding name.
	const char *szEncodingName = pDocument->getEncodingName();
	if (!szEncodingName || !*szEncodingName)
		szEncodingName = XAP_EncodingManager::get_instance()->getNativeEncodingName();

	_setEncoding(szEncodingName);
}
/*!
  Parse stream contents into the document
 \param pStream Stream to import from
 \return UT_ERROR if pStream is NULL, UT_OK once the stream is consumed;
         the X_ReturnNoMemIfError checks return early on failure
         (presumably with an out-of-memory error, going by the macro name)

 Unless an encoding was set explicitly, a non-empty "encoding" property
 overrides the current encoding before the stream is initialised.

 This code is used for both files and the clipboard
 */
UT_Error IE_Imp_Text::_parseStream(ImportStream * pStream)
{
	UT_return_val_if_fail(pStream, UT_ERROR);

	bool bFirstChar = true;
	UT_GrowBuf gbBlock(1024);
	UT_UCSChar c;

	// Declared at function scope so the string handed to _setEncoding()
	// is still alive when the stream is initialised and read below.
	// NOTE(review): if _setEncoding() keeps the pointer, m_szEncoding still
	// dangles once this function returns - confirm and fix at the setter.
	std::string prop;

	if (!m_bExplicitlySetEncoding) {
		prop = getProperty ("encoding");
		if (!prop.empty()) {
			_setEncoding (prop.c_str());
		}
	}

	pStream->init(m_szEncoding);

	while (pStream->getChar(c))
	{
		// TODO We should switch fonts when we encounter
		// TODO characters from different scripts
		switch (c)
		{
		case UCS_CR:
		case UCS_LF:
		case UCS_LINESEP:
		case UCS_PARASEP:
			// we interpret either CRLF, CR, or LF as a paragraph break.
			// we also accept U+2028 (line separator) and U+2029 (para separator)
			// especially since these are recommended by Mac OS X.

			// flush out what we have
			if (gbBlock.getLength() > 0)
				X_ReturnNoMemIfError(_insertSpan(gbBlock));
			X_ReturnNoMemIfError(_insertBlock());
			break;

		case UCS_BOM:
			// This is Byte Order Mark at the start of file, Zero Width
			// No-Break Space elsewhere. Drop the leading mark; keep any
			// later occurrence as an ordinary character. It must not fall
			// into the control-character cases below, which would turn
			// it into '?'.
			if (!bFirstChar)
			{
				X_ReturnNoMemIfError(gbBlock.append(reinterpret_cast<UT_GrowBufElement*>(&c),1));
			}
			break;

		// if we encounter any of the following characters we will
		// substitute a '?' as they correspond to control characters,
		// though some text files use them for their character representations
		// We do this instead of immediately returning an error
		// (and assuming they have no business in a text file) so we can
		// still show usable text to a user who has one of these files.
		case 0x0000:
		case 0x0001:
		case 0x0002:
		case 0x0003:
		case 0x0004:
		case 0x0005:
		case 0x0006:
		case 0x0007:
		case 0x0008:
		case 0x000e:
		case 0x000f:
		case 0x0010:
		case 0x0011:
		case 0x0012:
		case 0x0013:
		case 0x0014:
		case 0x0015:
		case 0x0016:
		case 0x0017:
		case 0x0018:
		case 0x0019:
		case 0x001a:
		case 0x001b:
		case 0x001c:
		case 0x001d:
		case 0x001e:
		case 0x001f:
			// UT_ASSERT(!(c <= 0x001f));
			c = '?';
			/* return UT_ERROR; // fall through with modified character */

		default:
			X_ReturnNoMemIfError(gbBlock.append(reinterpret_cast<UT_GrowBufElement*>(&c),1));
			break;
		}
		bFirstChar = false;
	}

	// flush whatever text follows the last paragraph break
	if (gbBlock.getLength() > 0)
		X_ReturnNoMemIfError(_insertSpan(gbBlock));

	return UT_OK;
}