void QCharsetDetector::setDeclaredEncoding(const QString &encoding)
{
    Q_D(QCharsetDetector);
    clearError();
    d->_declaredEncoding = encoding;
    if (d->_declaredEncoding == QLatin1String("GB2312"))
        d->_declaredEncoding = QLatin1String("GB18030");
    ucsdet_setDeclaredEncoding(d->_uCharsetDetector,
                               d->_declaredEncoding.toLatin1().constData(),
                               int32_t(-1),
                               &(d->_status));
    if(hasError())
        qWarning() << __PRETTY_FUNCTION__ << errorString();
}
Ejemplo n.º 2
0
void c_EncodingDetector::t_setdeclaredencoding(const String& text) {
  UErrorCode status = U_ZERO_ERROR;
  m_declaredencoding = text;
  ucsdet_setDeclaredEncoding(
    m_encoding_detector,
    m_declaredencoding.data(),
    m_declaredencoding.length(),
    &status);
  if (U_FAILURE(status)) {
    throw Exception(
      "Could not set encoding detector declared encoding to [%s], error %d (%s)",
      text.c_str(), status, u_errorName(status));
  }
}
Ejemplo n.º 3
0
/**
 * Passes the declared encoding straight to the ICU charset detector.
 *
 * @param text name of the declared encoding
 * @throws Exception when ICU reports a failure status
 */
void c_EncodingDetector::t_setdeclaredencoding(CStrRef text) {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingDetector, EncodingDetector::setdeclaredencoding);

  UErrorCode icuStatus = U_ZERO_ERROR;
  ucsdet_setDeclaredEncoding(m_encoding_detector,
                             text.data(),
                             text.length(),
                             &icuStatus);

  if (!U_FAILURE(icuStatus)) {
    return;
  }

  throw Exception(
    "Could not set encoding detector declared encoding to [%s], error %d (%s)",
    text.c_str(), icuStatus, u_errorName(icuStatus));
}
Ejemplo n.º 4
0
/*
 * Hand a declared encoding to the ICU detector wrapped by `self`, then
 * record the value on the Ruby object via
 * UCharsetDetector_set_declared_encoding().
 *
 * A nil declared_encoding is ignored entirely. Any other value is converted
 * with StringValue() first. The ICU status is passed to ensure() before the
 * value is recorded (presumably ensure() raises on failure -- confirm
 * against its definition).
 */
static void
set_declared_encoding(VALUE self, VALUE declared_encoding)
{
    UErrorCode icu_status = U_ZERO_ERROR;
    UCharsetDetector *ucd;

    if (NIL_P(declared_encoding))
        return;

    declared_encoding = StringValue(declared_encoding);
    Data_Get_Struct(self, UCharsetDetector, ucd);

    ucsdet_setDeclaredEncoding(ucd, StringValuePtr(declared_encoding), RSTRING_LEN(declared_encoding), &icu_status);
    ensure(icu_status);

    UCharsetDetector_set_declared_encoding(self, declared_encoding);
}
Ejemplo n.º 5
0
/*
 * Detect the most likely encoding of a byte string.
 *
 * Python arguments (parsed with "S|s"):
 *   str      - a String, the content whose encoding should be detected
 *   hint_enc - an optional String (like "UTF-8"); when given it is passed to
 *              the shared detector as the declared encoding
 *
 * Returns:
 *   - {"type": "binary", "confidence": 100} when detect_binary_content()
 *     flags the content
 *   - a dict with "type", "encoding" and "confidence", plus "language" when
 *     ICU reports a non-empty language, for the best match
 *   - None when ICU returns no match
 *   - NULL when argument parsing fails
 */
PyObject *
charlockholmes_encoding_detect(PyObject *self, PyObject *args)
{
    PyObject *content;
    const char *hint_enc = NULL;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch *best;
    const char *enc_name;
    const char *lang_name;
    int confidence;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc))
        return NULL;

    if (detect_binary_content(content))
        return Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);

    if (hint_enc != NULL)
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);

    best = ucsdet_detect(ch_ucd, &status);
    if (!best) {
        /* No match: hand back a new reference to None. */
        Py_INCREF(Py_None);
        return Py_None;
    }

    enc_name = ucsdet_getName(best, &status);
    lang_name = ucsdet_getLanguage(best, &status);
    confidence = ucsdet_getConfidence(best, &status);

    /* The "language" key is only included when ICU reports one. */
    if (lang_name == NULL || lang_name[0] == '\0')
        return Py_BuildValue("{ss,ss,si}",
                "type", "text",
                "encoding", enc_name,
                "confidence", confidence);

    return Py_BuildValue("{ss,ss,si,ss}",
            "type", "text",
            "encoding", enc_name,
            "confidence", confidence,
            "language", lang_name);
}
Ejemplo n.º 6
0
/*
 * Attempt to detect the encoding of this string, and return
 * a list with all the possible encodings that match it.
 *
 * Python arguments (parsed with "S|s"):
 *   str      - a String, the content whose encoding should be detected
 *   hint_enc - an optional String (like "UTF-8"); when given it is passed to
 *              the shared detector as the declared encoding
 *
 * Returns:
 *   - a one-element list holding {"type": "binary", "confidence": 100} when
 *     detect_binary_content() flags the content
 *   - a list with one dict per ICU match; each dict has "type", "encoding"
 *     and "confidence", plus "language" when ICU reports a non-empty one
 *   - None when ICU returns no match array
 *   - NULL (with the Python exception left set) when argument parsing or
 *     building any result object fails
 */
PyObject *
charlockholmes_encoding_detect_all(PyObject *self, PyObject *args)
{
    PyObject *lst;
    PyObject *item;
    PyObject *content;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch **matches;
    const char *mname;
    const char *mlang;
    const char *hint_enc = NULL;
    int mconfidence;
    int i, match_count;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc)) {
        return NULL;
    }

    if (detect_binary_content(content)) {
        lst = PyList_New(1);
        if (!lst)
            return NULL;

        item = Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);
        if (!item) {
            /* Never return a list containing a NULL slot. */
            Py_DECREF(lst);
            return NULL;
        }

        /* PyList_SET_ITEM steals the reference to item. */
        PyList_SET_ITEM(lst, 0, item);
        return lst;
    }

    if (hint_enc != NULL) {
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);
    }

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    matches = ucsdet_detectAll(ch_ucd, &match_count, &status);

    if (matches) {
        lst = PyList_New(match_count);
        if (!lst)
            return NULL;

        for (i = 0; i < match_count; ++i) {
            mname = ucsdet_getName(matches[i], &status);
            mlang = ucsdet_getLanguage(matches[i], &status);
            mconfidence = ucsdet_getConfidence(matches[i], &status);
            if (mlang && mlang[0])
                item = Py_BuildValue("{ss,ss,si,ss}",
                        "type", "text",
                        "encoding", mname,
                        "confidence", mconfidence,
                        "language", mlang);
            else
                item = Py_BuildValue("{ss,ss,si}",
                        "type", "text",
                        "encoding", mname,
                        "confidence", mconfidence);

            if (!item) {
                /*
                 * Building the dict failed and an exception is pending.
                 * Release the partially filled list and propagate the
                 * error instead of returning a list with NULL entries.
                 */
                Py_DECREF(lst);
                return NULL;
            }

            PyList_SET_ITEM(lst, i, item);
        }

        return lst;
    }

    Py_INCREF(Py_None);
    return Py_None;
}
Ejemplo n.º 7
0
/**
 * Reads a source file, detects its charset with ICU, converts it to UTF-16
 * and runs the dhc lexer and parser over the result.
 *
 * The file name is taken from argv[1] and defaults to "main.hs". The whole
 * file is read as raw bytes, ICU is given "UTF-8" as the declared encoding
 * hint, and the best match is used to convert the bytes. The converted text
 * (with a trailing newline appended) is first tokenised with
 * dhc::lexer::layout, printing each token, and then parsed with
 * dhc::parser::parser, printing the resulting tree or the parser's error.
 *
 * @return 0 on success, 1 when the file cannot be opened or no charset
 *         matches, otherwise the failing ICU error code.
 */
int main(int argc, char** argv)
{
    UErrorCode e = U_ZERO_ERROR;
    std::string filename = argc > 1 ? argv[1] : "main.hs";

    // Opened at the end (ios::ate) so tellg() can serve as a size hint.
    std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        std::cerr << "I can't open that file. I hate you too." << std::endl;
        return 1;
    }

    std::string raw;

    // tellg() returns -1 on failure; converted to an unsigned size that
    // would make reserve() throw std::length_error. Only reserve for a
    // real, positive size -- the hint is an optimisation, not a requirement.
    const std::streamoff size_hint = file.tellg();
    if (size_hint > 0)
        raw.reserve(static_cast<std::string::size_type>(size_hint));

    file.seekg(0, std::ios::beg);
    raw.assign((std::istreambuf_iterator<char>(file)),
                 std::istreambuf_iterator<char>());

    file.close();

    // If opening fails, `e` carries the failure and the calls below become
    // no-ops; the U_FAILURE check after ucsdet_detect() reports it.
    UCharsetDetector *ucd = ucsdet_open(&e);

    ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e);
    ucsdet_setText(ucd, raw.c_str(), static_cast<int32_t>(raw.size()), &e);
    const UCharsetMatch *ucm = ucsdet_detect(ucd, &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        ucsdet_close(ucd);
        return e;
    }

    // ucsdet_detect() may return NULL without a failure status when nothing
    // matches. Querying a NULL match yields a NULL name, and streaming a
    // NULL const char* is undefined behaviour, so stop here instead.
    if (ucm == NULL)
    {
        std::cerr << "Charset detection error: no matching charset found" << std::endl;
        ucsdet_close(ucd);
        return 1;
    }

    // Fetch both values before printing so a failure is reported without
    // first emitting a partial line.
    const char *charset_name = ucsdet_getName(ucm, &e);
    const int32_t confidence = ucsdet_getConfidence(ucm, &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        ucsdet_close(ucd);
        return e;
    }

    std::cout << "Charset detected: " << charset_name << " confidence: " << confidence << std::endl;

    // One extra element so the buffer can always be NUL-terminated below.
    UChar *buf = new UChar[raw.size() + 1];

    int out = ucsdet_getUChars(ucm, buf, static_cast<int32_t>(raw.size()), &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl;
        delete [] buf;
        ucsdet_close(ucd);
        return e;
    }

    // The match is not used past this point, so the detector can go.
    ucsdet_close(ucd);

    buf[out] = 0;

    icu::UnicodeString source(buf);
    delete [] buf;

    source.append("\n");
    std::cout << "Read:" << std::endl << source << std::endl;

    dhc::lexer::layout l(source);

    while (!l.finished()) {
        dhc::lexer::match_ptr token (l.next());

        if (token) {
            std::cout << token->flatten() << ' ';
        } else {
            std::cerr << filename << std::endl;
        }
    }

    std::cout << std::endl;

    dhc::parser::parser p(source);

    std::cout << "Created parser" << std::endl;

    if (!p.finished()) {
        dhc::lexer::match_ptr token (p.parse());

        if (token) {
            print_tree(token, 0);
        } else {
            std::cerr << p.error(filename) << std::endl;
        }
    }

    return 0;
}