Exemplo n.º 1
0
/**
 * EncodingMatch::getUTF8 - return the text of this charset match as UTF-8.
 *
 * The match text is first fetched as UTF-16 via ucsdet_getUChars() into an
 * icu::UnicodeString, growing the buffer until it fits, and is then converted
 * with u8().
 *
 * Reports failures through data->throwException(): once if the UTF-16 text
 * cannot be fetched, once if the UTF-8 conversion fails.
 */
static String HHVM_METHOD(EncodingMatch, getUTF8) {
  FETCH_MATCH(data, this_);
  UErrorCode error = U_ZERO_ERROR;
  icu::UnicodeString ustr;
  // Start with whatever capacity the empty string already has.
  int32_t ustrSize = ustr.getCapacity();

  do {
    // After an overflow, ustrSize holds the size reported by
    // ucsdet_getUChars(). If that is smaller than the current capacity the
    // buffer would never grow and the loop would not terminate, so bail out.
    if (UNLIKELY(ustrSize < ustr.getCapacity())) {
      // Should never happen
      error = U_INTERNAL_PROGRAM_ERROR;
      break;
    }
    // ICU functions are no-ops when handed a failing status, so reset it
    // before every attempt.
    error = U_ZERO_ERROR;
    UChar* buf = ustr.getBuffer(ustrSize);
    ustrSize = ucsdet_getUChars(data->match(), buf, ustrSize, &error);
    // releaseBuffer() clamps the length to the capacity, so passing the
    // (larger) required size after an overflow is safe.
    ustr.releaseBuffer(ustrSize);
  } while (error == U_BUFFER_OVERFLOW_ERROR);

  if (U_FAILURE(error)) {
    data->throwException("Could not get UTF-8 for match, error %d (%s)",
                         error, u_errorName(error));
  }

  error = U_ZERO_ERROR;
  String ret(u8(ustr, error));
  if (U_FAILURE(error)) {
    // u_errorName() returns a C string, so it must be formatted with %s.
    data->throwException("Error converting buffer to UTF8 %d (%s)",
                         error, u_errorName(error));
  }
  return ret;
}
Exemplo n.º 2
0
/**
 * Return the text of this charset match converted to UTF-8.
 *
 * The text is fetched as UTF-16 with ucsdet_getUChars() into an
 * icu::UnicodeString, retrying with a larger buffer while ICU reports
 * U_BUFFER_OVERFLOW_ERROR, and is then converted to a UTF-8 std::string.
 *
 * Throws Exception if ucsdet_getUChars() fails for any other reason.
 */
String c_EncodingMatch::t_getutf8() {
  INSTANCE_METHOD_INJECTION_BUILTIN(EncodingMatch, EncodingMatch::getutf8);
  validate();

  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeString ustr;
  // Start with whatever capacity the empty string already has.
  int32_t ustrSize = ustr.getCapacity();

  do {
    // ICU functions are no-ops when handed a failing status, so reset it
    // before every attempt.
    status = U_ZERO_ERROR;
    UChar* buf = ustr.getBuffer(ustrSize);
    ustrSize = ucsdet_getUChars(
      m_encoding_match,
      buf,
      ustrSize,
      &status);
    // Hand the exact length back to the string. A bare releaseBuffer() would
    // re-derive the length by scanning for the first NUL and so cut the text
    // short at an embedded U+0000. The length is clamped to the capacity, so
    // passing the (larger) required size after an overflow is safe.
    ustr.releaseBuffer(ustrSize);
  } while (status == U_BUFFER_OVERFLOW_ERROR);

  if (U_FAILURE(status)) {
    throw Exception(
      "Could not get UTF-8 for match, error %d (%s)",
      status, u_errorName(status));
  }
#if HAVE_OLD_LIBICU
  // Older ICU builds take the project's own conversion helper instead of
  // UnicodeString::toUTF8String().
  std::string utf8str (icuStringToUTF8(ustr));
#else
  std::string utf8str;
  ustr.toUTF8String(utf8str);
#endif
  return String(utf8str);
}
Exemplo n.º 3
0
/**
 * Return the text of this charset match converted to UTF-8.
 *
 * The text is fetched as UTF-16 with ucsdet_getUChars() into an
 * icu::UnicodeString, retrying with a larger buffer while ICU reports
 * U_BUFFER_OVERFLOW_ERROR, and is then converted with toUTF8String().
 *
 * Throws Exception if ucsdet_getUChars() fails for any other reason.
 */
String c_EncodingMatch::t_getutf8() {
  validate();

  UErrorCode status = U_ZERO_ERROR;
  icu::UnicodeString ustr;
  // Start with whatever capacity the empty string already has.
  int32_t ustrSize = ustr.getCapacity();

  do {
    // ICU functions are no-ops when handed a failing status, so reset it
    // before every attempt.
    status = U_ZERO_ERROR;
    UChar* buf = ustr.getBuffer(ustrSize);
    ustrSize = ucsdet_getUChars(
      m_encoding_match,
      buf,
      ustrSize,
      &status);
    // Hand the exact length back to the string. A bare releaseBuffer() would
    // re-derive the length by scanning for the first NUL and so cut the text
    // short at an embedded U+0000. The length is clamped to the capacity, so
    // passing the (larger) required size after an overflow is safe.
    ustr.releaseBuffer(ustrSize);
  } while (status == U_BUFFER_OVERFLOW_ERROR);

  if (U_FAILURE(status)) {
    throw Exception(
      "Could not get UTF-8 for match, error %d (%s)",
      status, u_errorName(status));
  }
  std::string utf8str;
  ustr.toUTF8String(utf8str);
  return String(utf8str);
}
Exemplo n.º 4
0
int main(int argc, char** argv)
{
    UErrorCode e = U_ZERO_ERROR;
    std::string filename = argc > 1 ? argv[1] : "main.hs";

    std::ifstream file (filename, std::ios::in | std::ios::binary | std::ios::ate);
    if (!file.is_open()) {
        std::cerr << "I can't open that file. I hate you too." << std::endl;
        return 1;
    }

    std::string raw;
    raw.reserve(file.tellg());

    file.seekg(0, std::ios::beg);
    raw.assign((std::istreambuf_iterator<char>(file)),
                 std::istreambuf_iterator<char>());

    file.close();

    UCharsetDetector *ucd = ucsdet_open(&e);

    ucsdet_setDeclaredEncoding(ucd, "UTF-8", -1, &e);
    ucsdet_setText(ucd, raw.c_str(), raw.size(), &e);
    const UCharsetMatch *ucm = ucsdet_detect(ucd, &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        return e;
    }

    std::cout << "Charset detected: " << ucsdet_getName(ucm, &e) << " confidence: " << ucsdet_getConfidence(ucm, &e) << std::endl;
    if (U_FAILURE(e))
    {
        std::cerr << "Charset detection error: " << u_errorName(e) << std::endl;
        return e;
    }
    
    UChar *buf = new UChar[raw.size() + 1];

    int out = ucsdet_getUChars(ucm, buf, raw.size(), &e);
    if (U_FAILURE(e))
    {
        std::cerr << "Charset conversion error: " << u_errorName(e) << std::endl;
        return e;
    }

    ucsdet_close(ucd);

    buf[out] = 0;

    icu::UnicodeString source(buf);
    delete [] buf;

    source.append("\n");
    std::cout << "Read:" << std::endl << source << std::endl;

    dhc::lexer::layout l(source);

    while (!l.finished()) {
        dhc::lexer::match_ptr token (l.next());

        if (token) {
            std::cout << token->flatten() << ' ';
        } else {
            std::cerr << filename << std::endl;
        }
    }

    std::cout << std::endl;

    dhc::parser::parser p(source);

    std::cout << "Created parser" << std::endl;

    if (!p.finished()) {
        dhc::lexer::match_ptr token (p.parse());

        if (token) {
            print_tree(token, 0);
        } else {
            std::cerr << p.error(filename) << std::endl;
        }
    }

    return 0;
}