/** Creates a converter from one charset to another, using iconv.
  *
  * If the iconv descriptor cannot be opened, no exception is raised
  * here: m_desc simply keeps its initial NULL value.
  *
  * @param source charset of the input data
  * @param dest charset of the output data
  */
charsetConverter::charsetConverter(const charset& source, const charset& dest)
	: m_desc(NULL), m_source(source), m_dest(dest)
{
	// iconv_open() expects the target charset first, then the origin charset
	const iconv_t handle = iconv_open(dest.getName().c_str(), source.getName().c_str());

	// (iconv_t) -1 is the value iconv_open() returns on failure
	if (handle == reinterpret_cast <iconv_t>(-1))
		return;

	// Keep a heap-allocated copy of the descriptor
	m_desc = new iconv_t(handle);
}
/** Creates an output stream filter which converts data from one
  * charset to another using ICU, before writing it to another stream.
  *
  * Two ICU converters are opened: m_from for the source charset and
  * m_to for the destination charset. If anything fails after a
  * converter has been opened, it is closed again before the exception
  * leaves the constructor.
  *
  * @param source charset of the data that will be written to this stream
  * @param dest charset of the data forwarded to the underlying stream
  * @param os underlying output stream (must not be NULL, it is
  * dereferenced unconditionally)
  * @param opts conversion options
  * @throws exceptions::charset_conv_error if a converter cannot be
  * opened or configured
  */
charsetFilteredOutputStream_icu::charsetFilteredOutputStream_icu
	(const charset& source, const charset& dest, outputStream* os,
	 const charsetConverterOptions& opts)
	: m_from(NULL), m_to(NULL), m_sourceCharset(source),
	  m_destCharset(dest), m_stream(*os), m_options(opts)
{
	try
	{
		UErrorCode err = U_ZERO_ERROR;

		m_from = ucnv_open(source.getName().c_str(), &err);

		if (!U_SUCCESS(err))
		{
			throw exceptions::charset_conv_error
				("Cannot initialize ICU converter for source charset '"
				 + source.getName() + "' (error code: " + u_errorName(err) + ").");
		}

		m_to = ucnv_open(dest.getName().c_str(), &err);

		if (!U_SUCCESS(err))
		{
			throw exceptions::charset_conv_error
				("Cannot initialize ICU converter for destination charset '"
				 + dest.getName() + "' (error code: " + u_errorName(err) + ").");
		}

		// Tell ICU what to do when encountering an illegal byte sequence
		if (m_options.silentlyReplaceInvalidSequences)
		{
			// Set replacement chars for when converting from Unicode to codepage
			icu::UnicodeString substString(m_options.invalidSequence.c_str());
			ucnv_setSubstString(m_to, substString.getTerminatedBuffer(), -1, &err);

			if (U_FAILURE(err))
				throw exceptions::charset_conv_error("[ICU] Error when setting substitution string.");
		}
		else
		{
			// Tell ICU to stop (and return an error) on illegal byte sequences.
			// The "to Unicode" callback has to be installed on the converter
			// opened for the source charset: it is the one that decodes the
			// input bytes into Unicode.
			ucnv_setToUCallBack
				(m_from, UCNV_TO_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

			if (U_FAILURE(err))
				throw exceptions::charset_conv_error("[ICU] Error when setting ToU callback.");

			// The "from Unicode" callback goes on the destination converter
			ucnv_setFromUCallBack
				(m_to, UCNV_FROM_U_CALLBACK_STOP, UCNV_SUB_STOP_ON_ILLEGAL, NULL, NULL, &err);

			if (U_FAILURE(err))
				throw exceptions::charset_conv_error("[ICU] Error when setting FromU callback.");
		}
	}
	catch (...)
	{
		// The destructor is not run when a constructor throws, so the
		// converters opened so far must be released here
		if (m_to != NULL)
			ucnv_close(m_to);

		if (m_from != NULL)
			ucnv_close(m_from);

		throw;
	}
}
// static wordEncoder::Encoding wordEncoder::guessBestEncoding (const string& buffer, const charset& charset) { // Charset-specific encoding encoding recEncoding; if (charset.getRecommendedEncoding(recEncoding)) { if (recEncoding == encoding(encodingTypes::QUOTED_PRINTABLE)) return ENCODING_QP; else return ENCODING_B64; } // Use Base64 if more than 40% non-ASCII, or Quoted-Printable else (default) const string::size_type asciiCount = utility::stringUtils::countASCIIchars(buffer.begin(), buffer.end()); const string::size_type asciiPercent = (buffer.length() == 0 ? 100 : (100 * asciiCount) / buffer.length()); if (asciiPercent < 60) return ENCODING_B64; else return ENCODING_QP; }
// static bool wordEncoder::isEncodingNeeded (const generationContext& ctx, const string& buffer, const charset& charset, const string& lang) { if (!ctx.getInternationalizedEmailSupport()) { // Charset-specific encoding encoding recEncoding; if (charset.getRecommendedEncoding(recEncoding)) return true; // No encoding is needed if the buffer only contains ASCII chars if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos) return true; } // Force encoding when there are only ASCII chars, but there is // also at least one of '\n' or '\r' (header fields) if (buffer.find_first_of("\n\r") != string::npos) return true; // If any RFC-2047 sequence is found in the buffer, encode it if (buffer.find("=?") != string::npos || buffer.find("?=") != string::npos) return true; // If a language is specified, force encoding if (!lang.empty()) return true; return false; }
/** Creates an output stream filter which converts data from one
  * charset to another using iconv.
  *
  * The count of unconverted bytes starts at zero. If no iconv
  * descriptor can be obtained, the constructor does not report it:
  * m_desc just remains NULL.
  *
  * @param source charset of the data written to this stream
  * @param dest charset expected by the underlying stream
  * @param os underlying output stream
  */
charsetFilteredOutputStream::charsetFilteredOutputStream
	(const charset& source, const charset& dest, outputStream& os)
	: m_desc(NULL), m_sourceCharset(source), m_destCharset(dest),
	  m_stream(os), m_unconvCount(0)
{
	// Note the argument order of iconv_open(): destination, then source
	const iconv_t handle = iconv_open(dest.getName().c_str(), source.getName().c_str());

	const bool opened = (handle != reinterpret_cast <iconv_t>(-1));

	if (opened)
	{
		// Store the descriptor on the heap
		m_desc = new iconv_t(handle);
	}
}
/** Creates a converter from one charset to another, using ICU.
  *
  * Two ICU converters are opened: m_from for the source charset and
  * m_to for the destination charset. If the second one cannot be
  * opened, the first one is closed again before the exception leaves
  * the constructor.
  *
  * @param source charset of the input data
  * @param dest charset of the output data
  * @param opts conversion options
  * @throws exceptions::charset_conv_error if one of the converters
  * cannot be opened
  */
charsetConverter_icu::charsetConverter_icu
	(const charset& source, const charset& dest, const charsetConverterOptions& opts)
	: m_from(NULL), m_to(NULL), m_source(source), m_dest(dest), m_options(opts)
{
	try
	{
		UErrorCode err = U_ZERO_ERROR;

		m_from = ucnv_open(source.getName().c_str(), &err);

		if (!U_SUCCESS(err))
		{
			throw exceptions::charset_conv_error
				("Cannot initialize ICU converter for source charset '"
				 + source.getName() + "' (error code: " + u_errorName(err) + ").");
		}

		m_to = ucnv_open(dest.getName().c_str(), &err);

		if (!U_SUCCESS(err))
		{
			throw exceptions::charset_conv_error
				("Cannot initialize ICU converter for destination charset '"
				 + dest.getName() + "' (error code: " + u_errorName(err) + ").");
		}
	}
	catch (...)
	{
		// The destructor is not run when a constructor throws, so the
		// converters opened so far must be released here
		if (m_to != NULL)
			ucnv_close(m_to);

		if (m_from != NULL)
			ucnv_close(m_from);

		throw;
	}
}
/** Decides which encoding to use for some data, taking the charset
  * of the data into account.
  *
  * For text usage, the encoding recommended by the charset (if there
  * is one) is returned, tagged with the requested usage. In every
  * other case the decision is delegated to decide(data, usage).
  *
  * @param data data for which an encoding must be chosen
  * @param chset charset of the data
  * @param usage how the encoded data is going to be used
  * @return the encoding to use
  */
const encoding encoding::decide
	(ref <const contentHandler> data, const charset& chset, const EncodingUsage usage)
{
	// The charset recommendation is only considered for text
	if (usage != USAGE_TEXT)
		return decide(data, usage);

	encoding recEncoding;

	if (!chset.getRecommendedEncoding(recEncoding))
		return decide(data, usage);

	recEncoding.setUsage(usage);
	return recEncoding;
}
// static bool wordEncoder::isEncodingNeeded(const string& buffer, const charset& charset) { // Charset-specific encoding encoding recEncoding; if (charset.getRecommendedEncoding(recEncoding)) return true; // No encoding is needed if the buffer only contains ASCII chars if (utility::stringUtils::findFirstNonASCIIchar(buffer.begin(), buffer.end()) != string::npos) return true; // Force encoding when there are only ASCII chars, but there is // also at least one of '\n' or '\r' (header fields) if (buffer.find_first_of("\n\r") != string::npos) return true; return false; }
/** Decides which encoding to use for some data, taking the charset
  * of the data into account.
  *
  * Data which is already encoded keeps its current encoding, unless
  * that encoding reports that it should be re-encoded. Otherwise, for
  * text usage, the encoding recommended by the charset (if there is
  * one) is returned, tagged with the requested usage. In every other
  * case the decision is delegated to decide(data, usage).
  *
  * @param data data for which an encoding must be chosen (it is
  * dereferenced unconditionally)
  * @param chset charset of the data
  * @param usage how the encoded data is going to be used
  * @return the encoding to use
  */
const encoding encoding::decide(
	const shared_ptr <const contentHandler>& data,
	const charset& chset,
	const EncodingUsage usage
) {

	// Already-encoded data is left alone when its encoding allows it
	if (data->isEncoded()) {

		if (!data->getEncoding().shouldReencode()) {
			return data->getEncoding();
		}
	}

	// The charset recommendation is only considered for text
	if (usage != USAGE_TEXT) {
		return decide(data, usage);
	}

	encoding recEncoding;

	if (!chset.getRecommendedEncoding(recEncoding)) {
		return decide(data, usage);
	}

	recEncoding.setUsage(usage);
	return recEncoding;
}
void text::createFromString(const string& in, const charset& ch) { size_t asciiCount = 0; size_t asciiPercent = 0; removeAllWords(); // Check whether there is a recommended encoding for this charset. // If so, the whole buffer will be encoded. Else, the number of // 7-bit (ASCII) bytes in the input will be used to determine if // we need to encode the whole buffer. encoding recommendedEnc; const bool alwaysEncode = ch.getRecommendedEncoding(recommendedEnc); if (!alwaysEncode) { asciiCount = utility::stringUtils::countASCIIchars(in.begin(), in.end()); asciiPercent = (in.length() == 0 ? 100 : (100 * asciiCount) / in.length()); } // If there are "too much" non-ASCII chars, encode everything if (alwaysEncode || asciiPercent < 60) // less than 60% ASCII chars { appendWord(make_shared <word>(in, ch)); } // Else, only encode words which need it else { bool is8bit = false; // is the current word 8-bit? bool prevIs8bit = false; // is previous word 8-bit? unsigned int count = 0; // total number of words for (size_t end = in.size(), pos = 0, start = 0 ; ; ) { if (pos == end || parserHelpers::isSpace(in[pos])) { const string chunk(in.begin() + start, in.begin() + pos); if (pos != end) ++pos; if (is8bit) { if (count && prevIs8bit) { // No need to create a new encoded word, just append // the current word to the previous one. shared_ptr <word> w = getWordAt(getWordCount() - 1); w->getBuffer() += " " + chunk; } else { if (count) { shared_ptr <word> w = getWordAt(getWordCount() - 1); w->getBuffer() += ' '; } appendWord(make_shared <word>(chunk, ch)); prevIs8bit = true; ++count; } } else { if (count && !prevIs8bit) { shared_ptr <word> w = getWordAt(getWordCount() - 1); w->getBuffer() += " " + chunk; } else { appendWord(make_shared <word> (chunk, charset(charsets::US_ASCII))); prevIs8bit = false; ++count; } } if (pos == end) break; is8bit = false; start = pos; } else if (!parserHelpers::isAscii(in[pos])) { is8bit = true; ++pos; } else { ++pos; } } } }