/*
 * call-seq:
 *   detect(text=nil, declared_encoding=nil)
 *
 * Detect the charset that best matches the input data and return it as a
 * hash with the keys +:encoding+, +:confidence+ and +:language+.
 *
 * Detection only examines the start of the input data, so the charset that
 * is returned may still be unable to handle the complete input.
 *
 * The function will fail if
 * * no charset appears to match the data
 * * no input text has been provided (with +text+ or set with #text= )
 */
static VALUE
UCharsetDetector_detect(int argc, VALUE *argv, VALUE self)
{
    VALUE text;
    VALUE declared_encoding;
    VALUE result;
    UCharsetDetector *detector;
    const UCharsetMatch *best;
    const char *name;
    const char *language;
    int32_t confidence;
    UErrorCode status = U_ZERO_ERROR;

    /* Both arguments are optional; they are handed to the setters as-is. */
    rb_scan_args(argc, argv, "02", &text, &declared_encoding);
    set_text(self, text);
    set_declared_encoding(self, declared_encoding);

    Data_Get_Struct(self, UCharsetDetector, detector);

    /* Every ICU call is followed by ensure() so a failure is reported
     * before the next accessor runs. */
    best = ucsdet_detect(detector, &status);
    ensure(status);

    name = ucsdet_getName(best, &status);
    ensure(status);

    confidence = ucsdet_getConfidence(best, &status);
    ensure(status);

    language = ucsdet_getLanguage(best, &status);
    ensure(status);

    result = rb_hash_new();
    rb_hash_aset(result, ID2SYM(rb_intern("encoding")), rb_str_new2(name));
    rb_hash_aset(result, ID2SYM(rb_intern("confidence")), INT2NUM(confidence));
    rb_hash_aset(result, ID2SYM(rb_intern("language")), rb_str_new2(language));

    return result;
}
/**
 * EncodingMatch::getConfidence()
 *
 * Asks ICU for the confidence of the wrapped charset match and returns it.
 * If ICU reports a failure, the error is raised through the match object's
 * throwException() helper, carrying the numeric code and its ICU name.
 */
static int64_t HHVM_METHOD(EncodingMatch, getConfidence) {
  FETCH_MATCH(data, this_);

  UErrorCode status = U_ZERO_ERROR;
  const auto score = ucsdet_getConfidence(data->match(), &status);
  if (U_SUCCESS(status)) {
    return score;
  }

  data->throwException("Could not get confidence for match, error "
                       "%d (%s)", status, u_errorName(status));
  return score;
}
int64_t c_EncodingMatch::t_getconfidence() { validate(); UErrorCode status = U_ZERO_ERROR; int32_t confidence = ucsdet_getConfidence( m_encoding_match, &status); if (U_FAILURE(status)) { throw Exception( "Could not get confidence for match, error %d (%s)", status, u_errorName(status)); } return confidence; }
int c_EncodingMatch::t_getconfidence() { INSTANCE_METHOD_INJECTION_BUILTIN(EncodingMatch, EncodingMatch::getconfidence); validate(); UErrorCode status = U_ZERO_ERROR; int32_t confidence = ucsdet_getConfidence( m_encoding_match, &status); if (U_FAILURE(status)) { throw Exception( "Could not get confidence for match, error %d (%s)", status, u_errorName(status)); } return confidence; }
/*
 * Attempt to detect the encoding of a string.
 *
 * str      - a String, the content whose encoding should be detected
 * hint_enc - an optional String (like "UTF-8") naming an encoding that is
 *            passed to the charset detector as an additional hint
 *
 * Returns: a dict with "type", "encoding" and "confidence" entries, plus
 *          "language" when the detector reports a non-empty language.
 *          Binary content yields {"type": "binary", "confidence": 100}.
 *          None is returned when the detector produces no match.
 */
PyObject *
charlockholmes_encoding_detect(PyObject *self, PyObject *args)
{
    PyObject *content;
    const char *hint_enc = NULL;
    const UCharsetMatch *best;
    const char *name;
    const char *language;
    int confidence;
    UErrorCode status = U_ZERO_ERROR;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc))
        return NULL;

    /* Binary data is reported directly without consulting ICU. */
    if (detect_binary_content(content))
        return Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);

    if (hint_enc != NULL)
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, strlen(hint_enc), &status);

    ucsdet_setText(ch_ucd, PyString_AsString(content),
                   (int32_t)PyString_Size(content), &status);

    best = ucsdet_detect(ch_ucd, &status);
    if (!best) {
        Py_INCREF(Py_None);
        return Py_None;
    }

    name = ucsdet_getName(best, &status);
    language = ucsdet_getLanguage(best, &status);
    confidence = ucsdet_getConfidence(best, &status);

    /* The language entry is only emitted when ICU supplies one. */
    if (language == NULL || language[0] == '\0')
        return Py_BuildValue("{ss,ss,si}",
                             "type", "text",
                             "encoding", name,
                             "confidence", confidence);

    return Py_BuildValue("{ss,ss,si,ss}",
                         "type", "text",
                         "encoding", name,
                         "confidence", confidence,
                         "language", language);
}
static int _charset_detect(const char *in, size_t len, const char **charset, int *confidence) { UCharsetDetector *csd; const UCharsetMatch *ucm; UErrorCode status = U_ZERO_ERROR; csd = ucsdet_open(&status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error opening character set detector: %s.\n", u_errorName(status)); return -1; } ucsdet_setText(csd, in, len, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error setting text for character set detection: %s.\n", u_errorName(status)); goto error; } ucm = ucsdet_detect(csd, &status); if( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: character set detection failed: %s.\n", u_errorName(status)); goto error; } *confidence = ucsdet_getConfidence(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set confidence: %s.\n", u_errorName(status)); goto error; } *charset = ucsdet_getName(ucm, &status); if ( U_FAILURE(status) ) { prelude_log(PRELUDE_LOG_ERR, "ICU: error retrieving character set name: %s.\n", u_errorName(status)); goto error; } return 0; error: ucsdet_close(csd); return -1; }
/*
 * call-seq:
 *   detect_all(text=nil, declared_encoding=nil)
 *
 * Find every charset match that appears to be consistent with the input and
 * return them as an array of hashes, each with the keys +:encoding+,
 * +:confidence+ and +:language+. The best quality match comes first.
 *
 * Detection only looks at a limited amount of the input byte data, so some
 * of the returned charsets may fail to handle all of the input data.
 *
 * Return an error if
 * * no charset appears to match the data
 * * no input text has been provided (with +text+ or set with #text= )
 */
static VALUE
UCharsetDetector_detect_all(int argc, VALUE *argv, VALUE self)
{
    VALUE text;
    VALUE declared_encoding;
    VALUE results;
    UCharsetDetector *detector;
    const UCharsetMatch **candidates;
    int32_t candidate_count;
    int idx;
    UErrorCode status = U_ZERO_ERROR;

    /* Both arguments are optional; they are handed to the setters as-is. */
    rb_scan_args(argc, argv, "02", &text, &declared_encoding);
    set_text(self, text);
    set_declared_encoding(self, declared_encoding);

    Data_Get_Struct(self, UCharsetDetector, detector);

    candidates = ucsdet_detectAll(detector, &candidate_count, &status);
    ensure(status);

    results = rb_ary_new();

    /* Convert each ICU match, in the order ICU supplied them. */
    for (idx = 0; idx < candidate_count; idx++) {
        const UCharsetMatch *candidate = candidates[idx];
        const char *name;
        const char *language;
        int32_t confidence;
        VALUE entry;

        name = ucsdet_getName(candidate, &status);
        ensure(status);

        confidence = ucsdet_getConfidence(candidate, &status);
        ensure(status);

        language = ucsdet_getLanguage(candidate, &status);
        ensure(status);

        entry = rb_hash_new();
        rb_hash_aset(entry, ID2SYM(rb_intern("encoding")), rb_str_new2(name));
        rb_hash_aset(entry, ID2SYM(rb_intern("confidence")), INT2NUM(confidence));
        rb_hash_aset(entry, ID2SYM(rb_intern("language")), rb_str_new2(language));

        rb_ary_push(results, entry);
    }

    return results;
}
/*
 * Attempt to detect the encoding of this string, and return
 * a list with all the possible encodings that match it.
 *
 * str      - a String, what you want to detect the encoding of
 * hint_enc - an optional String (like "UTF-8"), the encoding name which will
 *            be used as an additional hint to the charset detector
 *
 * Returns: a list with zero or more dicts, each one of them with encoding,
 *          language, type and confidence parameters ("language" is only
 *          present when the detector reports a non-empty one). Binary
 *          content yields a one-element list holding
 *          {"type": "binary", "confidence": 100}. None is returned when the
 *          detector produces no match array. NULL (with a Python exception
 *          set) is returned when argument parsing or building a result
 *          object fails.
 */
PyObject *
charlockholmes_encoding_detect_all(PyObject *self, PyObject *args)
{
    PyObject *lst;
    PyObject *content;
    PyObject *entry;
    UErrorCode status = U_ZERO_ERROR;
    const UCharsetMatch **matches;
    const char *mname;
    const char *mlang;
    const char *hint_enc = NULL;
    int mconfidence;
    int i, match_count;

    if (!PyArg_ParseTuple(args, "S|s", &content, &hint_enc)) {
        return NULL;
    }

    if (detect_binary_content(content)) {
        /* Build the dict first so a failure never leaves a list that
         * contains a NULL slot. */
        entry = Py_BuildValue("{ss,si}", "type", "binary", "confidence", 100);
        if (!entry)
            return NULL;

        lst = PyList_New(1);
        if (!lst) {
            Py_DECREF(entry);
            return NULL;
        }

        PyList_SET_ITEM(lst, 0, entry);   /* steals the reference to entry */
        return lst;
    }

    if (hint_enc != NULL) {
        ucsdet_setDeclaredEncoding(ch_ucd, hint_enc, (int32_t)strlen(hint_enc), &status);
    }

    ucsdet_setText(ch_ucd, PyString_AsString(content), (int32_t)PyString_Size(content), &status);
    matches = ucsdet_detectAll(ch_ucd, &match_count, &status);

    if (matches) {
        lst = PyList_New(match_count);
        if (!lst)
            return NULL;

        for (i = 0; i < match_count; ++i) {
            mname = ucsdet_getName(matches[i], &status);
            mlang = ucsdet_getLanguage(matches[i], &status);
            mconfidence = ucsdet_getConfidence(matches[i], &status);

            if (mlang && mlang[0])
                entry = Py_BuildValue("{ss,ss,si,ss}",
                                      "type", "text",
                                      "encoding", mname,
                                      "confidence", mconfidence,
                                      "language", mlang);
            else
                entry = Py_BuildValue("{ss,ss,si}",
                                      "type", "text",
                                      "encoding", mname,
                                      "confidence", mconfidence);

            if (!entry) {
                /* Propagate the pending exception instead of returning a
                 * list with a NULL item; the list releases the items that
                 * were already stored. */
                Py_DECREF(lst);
                return NULL;
            }

            PyList_SET_ITEM(lst, i, entry);   /* steals the reference */
        }

        return lst;
    }

    Py_INCREF(Py_None);
    return Py_None;
}
/*!
 * Runs ICU charset detection on the detector's current input and returns
 * the matches after some local fine tuning.
 *
 * The list returned by ucsdet_detectAll() is extended with single byte
 * candidates derived from the declared encoding and the declared locale,
 * confidences are adjusted, and every match that is either unknown to
 * _allDetectableCharsets or cannot decode the complete input (checked with
 * text()) is removed. If any confidence was changed the list is re-sorted
 * in descending order.
 *
 * An empty list is returned when ICU reports an error or when no match is
 * left after the fine tuning; in the latter case the status is set to
 * U_CE_NOT_FOUND_ERROR. A warning is logged in both cases.
 */
QList<QCharsetMatch> QCharsetDetector::detectAll()
{
    Q_D(QCharsetDetector);
    clearError();

    // get list of matches from ICU:
    qint32 matchesFound;
    const UCharsetMatch **uCharsetMatch
        = ucsdet_detectAll(d->_uCharsetDetector, &matchesFound, &(d->_status));
    if(hasError()) {
        qWarning() << __PRETTY_FUNCTION__ << errorString();
        return QList<QCharsetMatch>();
    }
    // sometimes the number of matches found by ucsdet_detectAll()
    // maybe 0 (matchesFound == 0) but d->_status has no error. Do not
    // return here with an error if this happens because the fine
    // tuning below may add more matches. Better check whether no
    // matches were found at all *after* the fine tuning.

    // fill list of matches into a QList<QCharsetMatch>:
    QList<QCharsetMatch> qCharsetMatchList;
    for (qint32 i = 0; i < matchesFound; ++i) {
        QCharsetMatch qCharsetMatch;
        qCharsetMatch.setName(
            QString::fromLatin1(ucsdet_getName(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<QCharsetMatch>();
        }
        qCharsetMatch.setConfidence(
            static_cast<qint32>(ucsdet_getConfidence (uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<QCharsetMatch>();
        }
        qCharsetMatch.setLanguage(
            QString::fromLatin1(ucsdet_getLanguage(uCharsetMatch[i], &(d->_status))));
        if(hasError()) {
            qWarning() << __PRETTY_FUNCTION__ << errorString();
            return QList<QCharsetMatch>();
        }
        qCharsetMatchList << qCharsetMatch;
    }

    if(d->_allDetectableCharsets.isEmpty())
        getAllDetectableCharsets();

    // libicu sometimes does not detect single byte encodings at all
    // even if they can encode the input without error. This seems to
    // contradict the documentation on
    // http://icu-project.org/apiref/icu4c/ucsdet_8h.html which says:
    //
    //     A confidence value of ten does have a general meaning - it is
    //     used for charsets that can represent the input data, but for
    //     which there is no other indication that suggests that the
    //     charset is the correct one. Pure 7 bit ASCII data, for example,
    //     is compatible with a great many charsets, most of which will
    //     appear as possible matches with a confidence of 10.
    //
    // But if such a single byte encoding has been set as the declared
    // encoding, it should at least be tried, therefore add it here to
    // the list of matches with the confidence value of 10. If it
    // cannot encode the complete input, the iteration over the list
    // of matches will detect that and remove it again.
    if(!d->_declaredEncoding.isEmpty()
       && (d->_declaredEncoding.startsWith(QLatin1String("ISO-8859-"))
           || d->_declaredEncoding.startsWith(QLatin1String("windows-12"))
           || d->_declaredEncoding.startsWith(QLatin1String("KOI8"))))
        qCharsetMatchList << QCharsetMatch(d->_declaredEncoding, QString(), 10);

    // Similar as for declaredEncoding, when declaredLocale is used
    // and it is a locale where the legacy encoding is a single byte
    // encoding, it should at least be tried, therefore add the legacy
    // single byte encoding for the declared locale here. If it
    // cannot encode the complete input, it will be removed again
    // later. Multibyte encodings like Shift_JIS, EUC-JP, Big5,
    // etc. ... do not need to be added, contrary to the single byte
    // encodings I could find no case where the matches returned by
    // libicu did omit a multibyte encoding when it should have been
    // included.
    if(!d->_declaredLocale.isEmpty()) {
        QString language = d->_declaredLocale.left(2);
        if(language == QLatin1String("ru")) {
            qCharsetMatchList << QCharsetMatch(QLatin1String("KOI8-R"), language, 10);
            qCharsetMatchList << QCharsetMatch(QLatin1String("windows-1251"), language, 10);
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-5"), language, 10);
        }
        else if(language == QLatin1String("tr"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-9"), language, 10);
        else if(language == QLatin1String("el"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-7"), language, 10);
        else if(language == QLatin1String("en")
                || language == QLatin1String("da")
                || language == QLatin1String("de")
                || language == QLatin1String("es")
                || language == QLatin1String("fi")
                || language == QLatin1String("fr")
                || language == QLatin1String("it")
                || language == QLatin1String("nl")
                || language == QLatin1String("no")
                || language == QLatin1String("nn")
                || language == QLatin1String("nb")
                || language == QLatin1String("pt")
                || language == QLatin1String("sv"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-1"), language, 10);
        else if(language == QLatin1String("cs")
                || language == QLatin1String("hu")
                || language == QLatin1String("pl")
                || language == QLatin1String("ro"))
            // Central European languages: their legacy single byte
            // encoding is Latin-2 (ISO-8859-2), not Latin-1.
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-2"), language, 10);
        else if(language == QLatin1String("ar")
                || language == QLatin1String("fa")
                || language == QLatin1String("ur"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-6"), language, 10);
        else if(language == QLatin1String("he"))
            qCharsetMatchList << QCharsetMatch(QLatin1String("ISO-8859-8"), language, 10);
    }

    // iterate over the detected matches and do some fine tuning:
    bool sortNeeded = false;
    qint32 koi8rConfidence = 0;
    qint32 iso88595Confidence = 0;
    qint32 windows1251Confidence = 0;
    QList<QCharsetMatch>::iterator it = qCharsetMatchList.begin();
    while(it != qCharsetMatchList.end()) {
        // Accumulate the confidences of the Russian legacy encodings over
        // all entries seen so far (libicu's own match plus the entry added
        // above for the declared locale).
        if((*it).name() == QLatin1String("KOI8-R"))
            koi8rConfidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-8859-5"))
            iso88595Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("windows-1251"))
            windows1251Confidence += (*it).confidence();
        if((*it).name() == QLatin1String("ISO-2022-JP")) {
            // non-Japanese text in ISO-2022-JP encoding is possible
            // but very unlikely:
            (*it).setLanguage(QLatin1String("ja"));
        }
        if((*it).name() == QLatin1String("UTF-8")
           && (*it).confidence() >= 80 && (*it).confidence() < 99) {
            // Actually libicu currently only returns confidence
            // values of 100, 80, 25, and 10 for UTF-8. A value of 80
            // can mean two things:
            //
            // 1) (hasBOM && numValid > numInvalid*10)
            // 2) (numValid > 0 && numInvalid == 0)
            //
            // If it is case 1), the match will be removed anyway by
            // the check below which tests whether the complete input
            // can be encoded. I.e. we don't need to care about this.
            //
            // If it is case 2) *and* the check below whether the
            // complete input can be encoded does not remove it, we
            // have valid UTF-8 and it is very unlikely that it is
            // anything else, therefore I think the confidence of 80
            // is too low and should be increased.
            // With a confidence of only 80, a longer ASCII text with
            // less than 4 UTF-8 characters will detect as ISO-8859-1
            // which is most certainly wrong.
            (*it).setConfidence(99);
            sortNeeded = true;
        }
        if(!d->_declaredEncoding.isEmpty()
           && (*it).name() == d->_declaredEncoding
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the user has set this to be the
            // declared encoding, it should be preferred over the
            // other encodings which also got confidence 10 (there are
            // often many with confidence 10). Do not increase the
            // confidence too much though in order not to override
            // real evidence that the input does really use something
            // different than the declared encoding.
            (*it).setConfidence(40);
            sortNeeded = true;
        }
        if(!d->_declaredLocale.isEmpty()
           && d->_declaredLocale.startsWith((*it).language())
           && (*it).confidence() == 10) {
            // A confidence value of 10 means the charset can
            // represent the input data, but there is no other
            // indication that suggests that the charset is the
            // correct one. But if the detected language for this
            // charset matches the language declared by the user, this
            // charset should be preferred over the others which also
            // got confidence 10 (there are often many with confidence
            // 10). Do not increase the confidence too much though in
            // order not to override real evidence that the input does
            // really use something different than the declared
            // encoding. Use a slightly lower value than for the
            // declared encoding. Setting the declared encoding
            // is more precise and should have somewhat higher priority
            if (d->_declaredLocale.startsWith(QLatin1String("ru"))) {
                // Treat the Russian setDeclaredLocale("ru") case a
                // bit different than the single byte encodings for
                // other languages: Only increase the weight of
                // Russian encodings if setDeclaredLocale("ru") has
                // been used if libicu has really detected the same
                // Russian encoding as well. libicu usually detects
                // these Russian encodings with very low confidences <
                // 10 for short input. But if we are already pretty
                // sure that it is Russian because of
                // setDeclaredLocale("ru"), then these low confidences
                // detected by libicu seem to be useful to distinguish
                // between the different Russian legacy encodings.
                //
                // If the setDeclaredLocale("ru") has been used, the
                // accumulated confidence for the Russian single byte
                // encoding is 10 (because of setDeclaredLocale("ru"))
                // plus whatever libicu has detected. If libicu has
                // not detected anything, the accumulated confidence
                // is exactly 10 here and there is no way to
                // distinguish between the Russian legacy
                // encodings. Therefore, don't increase the confidence
                // if the accumulated confidence is not > 10.
                //
                // But if libicu has detected something with small
                // confidence, the accumulated confidence is 10 plus
                // something small. In that case, adding something
                // around 20 seems to work reasonably well.
                //
                // I add 20 to the confidence for KOI8-R and
                // ISO-8859-5 but 21 to the confidence for
                // windows-1251 to prefer windows-1251 a little bit
                // over ISO-8859-5.
                if((*it).name() == QLatin1String("KOI8-R")
                   && koi8rConfidence > 10 && koi8rConfidence < 30)
                    (*it).setConfidence(20 + koi8rConfidence);
                else if((*it).name() == QLatin1String("ISO-8859-5")
                        && iso88595Confidence > 10 && iso88595Confidence < 30)
                    (*it).setConfidence(20 + iso88595Confidence);
                else if((*it).name() == QLatin1String("windows-1251")
                        && windows1251Confidence > 10 && windows1251Confidence < 30)
                    (*it).setConfidence(21 + windows1251Confidence);
            }
            else if ((d->_declaredLocale.contains(QLatin1String("TW"))
                      || d->_declaredLocale.contains(QLatin1String("HK"))
                      || d->_declaredLocale.contains(QLatin1String("MO")))
                     && (*it).name() == QLatin1String("Big5")) {
                // Traditional Chinese, Big5 more likely
                (*it).setConfidence(39);
            }
            else if ((d->_declaredLocale.contains(QLatin1String("CN"))
                      || d->_declaredLocale.contains(QLatin1String("SG"))
                      || d->_declaredLocale == QLatin1String("zh"))
                     && (*it).name() == QLatin1String("GB18030")) {
                // Simplified Chinese, GB18030/GB2312 more likely.
                // Simplified Chinese is also assumed if only "zh"
                // is set. If the variant is unknown, simplified
                // Chinese seems a bit more likely. On top of that,
                // the settings application sets only "zh" for
                // simplified Chinese and the translations for
                // simplified Chinese are also in files like
                // "foo_zh.qm" which makes simplified Chinese more
                // likely when only "zh" is set on the device (see
                // also NB#242154).
                (*it).setConfidence(39);
            }
            else {
                (*it).setConfidence(38);
            }
            sortNeeded = true;
        }
        if(!d->_allDetectableCharsets.contains((*it).name())) {
            // remove matches for charsets not supported by QTextCodec
            // then it is probably some weird charset we cannot use anyway
            it = qCharsetMatchList.erase(it);
        }
        else {
            // test whether the complete input text can be encoded
            // using this match, if not remove the match
            clearError();
            text(*it);
            if(hasError()) {
                // qMailLog(Messaging) << __PRETTY_FUNCTION__
                //          << "removing match" << (*it).name()
                //          << "because it cannot encode the complete input"
                //          << errorString();
                it = qCharsetMatchList.erase(it);
                clearError();
            }
            else
                ++it;
        }
    }

    // sort the list of matches again if confidences have been changed:
    if(sortNeeded)
        std::sort(qCharsetMatchList.begin(), qCharsetMatchList.end(),
                  std::greater<QCharsetMatch>());

    if(qCharsetMatchList.isEmpty()) {
        // is there any better status to describe this case?
        d->_status = U_CE_NOT_FOUND_ERROR;
        qWarning() << __PRETTY_FUNCTION__
                   << "number of matches found=0"
                   << errorString();
        return QList<QCharsetMatch>();
    }
    return qCharsetMatchList;
}
bool detectTextEncoding(const char* data, size_t len, const char* hintEncodingName, TextEncoding* detectedEncoding) { *detectedEncoding = TextEncoding(); int matchesCount = 0; UErrorCode status = U_ZERO_ERROR; UCharsetDetector* detector = ucsdet_open(&status); if (U_FAILURE(status)) return false; ucsdet_enableInputFilter(detector, true); ucsdet_setText(detector, data, static_cast<int32_t>(len), &status); if (U_FAILURE(status)) return false; // FIXME: A few things we can do other than improving // the ICU detector itself. // 1. Use ucsdet_detectAll and pick the most likely one given // "the context" (parent-encoding, referrer encoding, etc). // 2. 'Emulate' Firefox/IE's non-Universal detectors (e.g. // Chinese, Japanese, Russian, Korean and Hebrew) by picking the // encoding with a highest confidence among the detector-specific // limited set of candidate encodings. // Below is a partial implementation of the first part of what's outlined // above. const UCharsetMatch** matches = ucsdet_detectAll(detector, &matchesCount, &status); if (U_FAILURE(status)) { ucsdet_close(detector); return false; } const char* encoding = 0; if (hintEncodingName) { TextEncoding hintEncoding(hintEncodingName); // 10 is the minimum confidence value consistent with the codepoint // allocation in a given encoding. The size of a chunk passed to // us varies even for the same html file (apparently depending on // the network load). When we're given a rather short chunk, we // don't have a sufficiently reliable signal other than the fact that // the chunk is consistent with a set of encodings. So, instead of // setting an arbitrary threshold, we have to scan all the encodings // consistent with the data. 
const int32_t kThresold = 10; for (int i = 0; i < matchesCount; ++i) { int32_t confidence = ucsdet_getConfidence(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (confidence < kThresold) break; const char* matchEncoding = ucsdet_getName(matches[i], &status); if (U_FAILURE(status)) { status = U_ZERO_ERROR; continue; } if (TextEncoding(matchEncoding) == hintEncoding) { encoding = hintEncodingName; break; } } } // If no match is found so far, just pick the top match. // This can happen, say, when a parent frame in EUC-JP refers to // a child frame in Shift_JIS and both frames do NOT specify the encoding // making us resort to auto-detection (when it IS turned on). if (!encoding && matchesCount > 0) encoding = ucsdet_getName(matches[0], &status); if (U_SUCCESS(status)) { *detectedEncoding = TextEncoding(encoding); ucsdet_close(detector); return true; } ucsdet_close(detector); return false; }
/*
 * Detect the charset of `buffer` with the ICU charset detector.
 *
 * buffer     - input text; it is converted to a C string and passed to ICU
 *              with STRING_IS_NULL_TERMINATED as its length
 * encoding   - out: name of the best match; "ISO-8859-1" when ICU returns
 *              no match; NULL on the ICU-error path
 * lang       - out: language of the best match; NULL when there is no match
 *              or on the ICU-error path
 * confidence - out: confidence of the best match; 0 when there is no match
 *              or on the ICU-error path
 *
 * Returns the ICU status code accumulated over all ICU calls. A WARNING is
 * reported when no match is found or when ICU signals a failure.
 *
 * NOTE(review): the NULL-match test is evaluated before the U_FAILURE test.
 * If ucsdet_detect() returns NULL whenever the status already signals a
 * failure, the ICU-error path can never be taken - confirm against the ICU
 * documentation.
 */
UErrorCode detect_ICU(const text* buffer, text** encoding, text** lang, int32_t* confidence)
{
    const char* input = text_to_cstring(buffer);
    UErrorCode status = U_ZERO_ERROR;
    UCharsetDetector* detector;
    const UCharsetMatch* best;

    detector = ucsdet_open(&status);

    /* The input is a C string, so ICU determines its length itself. */
    ucsdet_setText(detector, input, STRING_IS_NULL_TERMINATED, &status);

    best = ucsdet_detect(detector, &status);

    if (NULL == best || U_FAILURE(status))
    {
        if (NULL == best)
        {
            /* No match at all: warn and fall back to a default encoding. */
            ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("ICU error: No charset match for \"%s\" - assuming ISO-8859-1.", input)));
            *encoding = cstring_to_text("ISO-8859-1");
        }
        else
        {
            ereport(WARNING,
                (errcode(ERRCODE_EXTERNAL_ROUTINE_EXCEPTION),
                 errmsg("ICU error: %s\n", u_errorName(status))));
            *encoding = NULL;
        }

        *lang = NULL;
        *confidence = 0;

        ucsdet_close(detector);
        pfree((void *) input);
        return status;
    }

    /* Copy the results out before the detector, which owns the match, is
     * closed. */
    *encoding = cstring_to_text(ucsdet_getName(best, &status));
    *lang = cstring_to_text(ucsdet_getLanguage(best, &status));
    *confidence = ucsdet_getConfidence(best, &status);

    ucsdet_close(detector);
    pfree((void *) input);

    return status;
}
int main(int argc, char** argv) { UErrorCode e = U_ZERO_ERROR; std::string filename = argc > 1 ? argv[1] : "main.hs"; std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate); if (!file.is_open()) { std::cerr << "I can't open that file. I hate you too." << std::endl; return 1; } std::string raw; raw.reserve(file.tellg()); file.seekg(0, std::ios::beg); raw.assign((std::istreambuf_iterator<char>(file)), std::istreambuf_iterator<char>()); file.close(); UCharsetDetector *ucd = ucsdet_open(&e); ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e); ucsdet_setText(ucd, raw.c_str(), raw.size(), &e); const UCharsetMatch *ucm = ucsdet_detect(ucd, &e); if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } std::cout << "Charset detected: " << ucsdet_getName(ucm, &e) << " confidence: " << ucsdet_getConfidence(ucm, &e) << std::endl; if (U_FAILURE(e)) { std::cerr << "Charset detection error: " << u_errorName(e) << std::endl; return e; } UChar *buf = new UChar[raw.size() + 1]; int out = ucsdet_getUChars(ucm, buf, raw.size(), &e); if (U_FAILURE(e)) { std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl; return e; } ucsdet_close(ucd); buf[out] = 0; icu::UnicodeString source(buf); delete [] buf; source.append("\n"); std::cout << "Read:" << std::endl << source << std::endl; dhc::lexer::layout l(source); while (!l.finished()) { dhc::lexer::match_ptr token (l.next()); if (token) { std::cout << token->flatten() << ' '; } else { std::cerr << filename << std::endl; } } std::cout << std::endl; dhc::parser::parser p(source); std::cout << "Created parser" << std::endl; if (!p.finished()) { dhc::lexer::match_ptr token (p.parse()); if (token) { print_tree(token, 0); } else { std::cerr << p.error(filename) << std::endl; } } return 0; }