void QCharsetDetector::setDeclaredEncoding(const QString &encoding) { Q_D(QCharsetDetector); clearError(); d->_declaredEncoding = encoding; if (d->_declaredEncoding == QLatin1String("GB2312")) d->_declaredEncoding = QLatin1String("GB18030"); ucsdet_setDeclaredEncoding(d->_uCharsetDetector, d->_declaredEncoding.toLatin1().constData(), int32_t(-1), &(d->_status)); if(hasError()) qWarning() << __PRETTY_FUNCTION__ << errorString(); }
void c_EncodingDetector::t_setdeclaredencoding(const String& text) { UErrorCode status = U_ZERO_ERROR; m_declaredencoding = text; ucsdet_setDeclaredEncoding( m_encoding_detector, m_declaredencoding.data(), m_declaredencoding.length(), &status); if (U_FAILURE(status)) { throw Exception( "Could not set encoding detector declared encoding to [%s], error %d (%s)", text.c_str(), status, u_errorName(status)); } }
/**
 * Sets the declared encoding of the wrapped ICU charset detector, passing
 * the caller's string data and length straight through to ICU.
 *
 * @param text encoding name
 * @throws Exception when ICU reports a failure status; the message carries
 *         the requested name, the numeric status and its ICU error name.
 */
void c_EncodingDetector::t_setdeclaredencoding(CStrRef text) {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingDetector, EncodingDetector::setdeclaredencoding);

  UErrorCode icu_status = U_ZERO_ERROR;
  ucsdet_setDeclaredEncoding(m_encoding_detector,
                             text.data(),
                             text.length(),
                             &icu_status);

  if (U_SUCCESS(icu_status)) {
    return;
  }

  throw Exception(
    "Could not set encoding detector declared encoding to [%s], error %d (%s)",
    text.c_str(), icu_status, u_errorName(icu_status));
}
/*
 * Apply a declared encoding to the ICU detector wrapped by +self+.
 *
 * self              - the Ruby object wrapping a UCharsetDetector
 * declared_encoding - a Ruby String (or an object convertible to one) naming
 *                     the encoding; nil leaves everything untouched
 *
 * After the ICU call the status is passed to ensure(), and the value is then
 * handed to UCharsetDetector_set_declared_encoding() for self.
 */
static void set_declared_encoding(VALUE self, VALUE declared_encoding)
{
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector *detector;

    /* nil means "no declared encoding": nothing to do. */
    if (NIL_P(declared_encoding))
        return;

    /* Coerce to a String before the detector is unwrapped. */
    declared_encoding = StringValue(declared_encoding);
    Data_Get_Struct(self, UCharsetDetector, detector);

    ucsdet_setDeclaredEncoding(detector,
                               StringValuePtr(declared_encoding),
                               RSTRING_LEN(declared_encoding),
                               &status);
    ensure(status);

    UCharsetDetector_set_declared_encoding(self, declared_encoding);
}
/*
 * Guess the character encoding of a string.
 *
 * Python arguments:
 *   str      - a String, the content whose encoding should be detected
 *   hint_enc - an optional String (like "UTF-8") given to the charset
 *              detector as its declared encoding
 *
 * Returns a dict:
 *   - {"type": "binary", "confidence": 100} when detect_binary_content()
 *     flags the content;
 *   - otherwise {"type": "text", "encoding": ..., "confidence": ...}, plus
 *     "language" when the detector reports a non-empty one.
 * Returns None when the detector yields no match, and NULL when the
 * arguments cannot be parsed.
 */
PyObject *
charlockholmes_encoding_detect(PyObject *self, PyObject *args)
{
    PyObject *content;
    const char *hint_enc = NULL;
    const UCharsetMatch *best;
    const char *enc_name;
    const char *enc_lang;
    int enc_confidence;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc))
        return NULL;

    if (detect_binary_content(content))
        return Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);

    if (hint_enc != NULL)
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);

    ucsdet_setText(ch_ucd,
                   PyString_AsString(content),
                   (int32_t)PyString_Size(content),
                   &status);

    best = ucsdet_detect(ch_ucd, &status);
    if (!best) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    enc_name = ucsdet_getName(best, &status);
    enc_lang = ucsdet_getLanguage(best, &status);
    enc_confidence = ucsdet_getConfidence(best, &status);

    /* Only report a language when the detector actually produced one. */
    if (!enc_lang || !enc_lang[0])
        return Py_BuildValue("{ss,ss,si}",
                             "type", "text",
                             "encoding", enc_name,
                             "confidence", enc_confidence);

    return Py_BuildValue("{ss,ss,si,ss}",
                         "type", "text",
                         "encoding", enc_name,
                         "confidence", enc_confidence,
                         "language", enc_lang);
}
/*
 * Attempt to detect the encoding of this string, and return
 * a list with all the possible encodings that match it.
 *
 * str      - a String, what you want to detect the encoding of
 * hint_enc - an optional String (like "UTF-8"), the encoding name which will
 *            be used as an additional hint to the charset detector
 *
 * Returns: a list with zero or more dicts, each one of them with encoding,
 *          language (only when the detector reports a non-empty one), type
 *          and confidence entries. Content flagged by detect_binary_content()
 *          yields a one-element list holding
 *          {"type": "binary", "confidence": 100}.
 *          Returns None when the detector produces no match array, and NULL
 *          (with a Python exception set) when argument parsing or building
 *          a result object fails.
 */
PyObject *
charlockholmes_encoding_detect_all(PyObject *self, PyObject *args)
{
    PyObject *lst;
    PyObject *content;
    PyObject *entry;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch **matches;
    const char *mname;
    const char *mlang;
    const char *hint_enc = NULL;
    int mconfidence;
    int i, match_count = 0;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc)) {
        return NULL;
    }

    if (detect_binary_content(content)) {
        lst = PyList_New(1);
        if (!lst)
            return NULL;

        entry = Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);
        if (!entry) {
            /* Never hand back a list with a NULL slot. */
            Py_DECREF(lst);
            return NULL;
        }

        PyList_SET_ITEM(lst, 0, entry);
        return lst;
    }

    if (hint_enc != NULL) {
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);
    }

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    matches = ucsdet_detectAll(ch_ucd, &match_count, &status);

    if (matches) {
        lst = PyList_New(match_count);
        if (!lst)
            return NULL;

        for (i = 0; i < match_count; ++i) {
            mname = ucsdet_getName(matches[i], &status);
            mlang = ucsdet_getLanguage(matches[i], &status);
            mconfidence = ucsdet_getConfidence(matches[i], &status);

            if (mlang && mlang[0])
                entry = Py_BuildValue("{ss,ss,si,ss}",
                                      "type", "text",
                                      "encoding", mname,
                                      "confidence", mconfidence,
                                      "language", mlang);
            else
                entry = Py_BuildValue("{ss,ss,si}",
                                      "type", "text",
                                      "encoding", mname,
                                      "confidence", mconfidence);

            if (!entry) {
                /*
                 * Building the dict failed and an exception is pending.
                 * Dropping the partially filled list releases the entries
                 * stored so far; the error is propagated to the caller.
                 */
                Py_DECREF(lst);
                return NULL;
            }

            /* PyList_SET_ITEM takes over the reference to entry. */
            PyList_SET_ITEM(lst, i, entry);
        }

        return lst;
    }

    Py_INCREF(Py_None);
    return Py_None;
}
int main(int argc, char** argv) { UErrorCode e = U_ZERO_ERROR; std::string filename = argc > 1 ? argv[1] : "main.hs"; std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate); if (!file.is_open()) { std::cerr << "I can't open that file. I hate you too." << std::endl; return 1; } std::string raw; raw.reserve(file.tellg()); file.seekg(0, std::ios::beg); raw.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); file.close(); UCharsetDetector *ucd = ucsdet_open(&e); ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e); ucsdet_setText(ucd, raw.c_str(), raw.size(), &e); const UCharsetMatch *ucm = ucsdet_detect(ucd, &e); if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } std::cout << "Charset detected: " << ucsdet_getName(ucm, &e) << " confidence: " << ucsdet_getConfidence(ucm, &e) << std::endl; if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } UChar *buf = new UChar[raw.size() + 1]; int out = ucsdet_getUChars(ucm, buf, raw.size(), &e); if (U_FAILURE(e)) { std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl; return e; } ucsdet_close(ucd); buf[out] = 0; icu::UnicodeString source(buf); delete [] buf; source.append("\n"); std::cout << "Read:" << std::endl << source << std::endl; dhc::lexer::layout l(source); while (!l.finished()) { dhc::lexer::match_ptr token (l.next()); if (token) { std::cout << token->flatten() << ' '; } else { std::cerr << filename << std::endl; } } std::cout << std::endl; dhc::parser::parser p(source); std::cout << "Created parser" << std::endl; if (!p.finished()) { dhc::lexer::match_ptr token (p.parse()); if (token) { print_tree(token, 0); } else { std::cerr << p.error(filename) << std::endl; } } return 0; }