int main(int argc, char** argv) { if (argc < 3) { std::cout << "Usage: " << argv[0] << " string1 string2" << std::endl; return 1; } std::cout << "Matching " << argv[1] << " and " << argv[2] << "; " << std::flush; // Convert input to UTF32 int-arrays std::vector<uint32_t> s2, t2; utf8to32(argv[1], s2); utf8to32(argv[2], t2); std::cout << "Levenshtein distance is " << levenshtein(s2, t2) << std::endl; }
/// Reads a query file line by line and appends each line, decoded by
/// utf8to32(), to `queries`.
///
/// @param queries Destination list; existing entries are kept and one entry
///                is appended per line read.
/// @param queryfn Path of the file to read. If it cannot be opened the first
///                std::getline fails and nothing is appended.
void loadQueries(std::vector<std::vector<uint32_t>> &queries, const std::string &queryfn) {
    std::ifstream queryfile(queryfn);

    for (std::string line; std::getline(queryfile, line);) {
        std::vector<uint32_t> query;
        utf8to32(line.data(), query);

        // Hand the decoded buffer over instead of deep-copying it: append an
        // empty slot, then swap the contents in (O(1), no extra allocation).
        queries.emplace_back();
        queries.back().swap(query);
    }
    // queryfile is closed by its destructor.
}
// Unit test for the UTF-8 helpers utf8to32() (three-argument, fixed-buffer
// form) and utf8len(). All checks go through the EXPECT_EQUAL macro, which is
// defined outside this block.
void unicode_unittest() {
    {
        // Eight characters: "abcd", a space, and three non-ASCII letters.
        char* test_string = (char*)"abcd åäö";
        // Only element 0 is set to 0x33; elements 1..8 are zero-initialized.
        char32 dest_string[9] = {0x33};

        // Empty input: the expected return value is 0.
        EXPECT_EQUAL( utf8to32( (char*)"", dest_string, 8 ), 0 );
        // NOTE(review): dest_string[1] is already zero from the initializer, so
        // this check passes whatever utf8to32 writes; index 0 may have been
        // intended -- confirm.
        EXPECT_EQUAL( 0x00, dest_string[1] );

        // Single ASCII character: the expected return value is 1.
        EXPECT_EQUAL( utf8to32( (char*)"a", dest_string, 8 ), 1 );
        EXPECT_EQUAL( 0x61, dest_string[0] );
        EXPECT_EQUAL( 0x00, dest_string[1] );

        // Mixed ASCII / non-ASCII input. The expected return value of 11 equals
        // the UTF-8 byte length of test_string (5 one-byte + 3 two-byte
        // characters) -- presumably the number of input bytes consumed; confirm
        // against utf8to32's definition.
        EXPECT_EQUAL( utf8to32( test_string, dest_string, 8 ), 11 );
        EXPECT_EQUAL( 0x61, dest_string[0] );  // 'a'
        EXPECT_EQUAL( 0x62, dest_string[1] );  // 'b'
        EXPECT_EQUAL( 0x63, dest_string[2] );  // 'c'
        EXPECT_EQUAL( 0x64, dest_string[3] );  // 'd'
        EXPECT_EQUAL( 0x20, dest_string[4] );  // ' '
        EXPECT_EQUAL( 0xE5, dest_string[5] );  // U+00E5
        EXPECT_EQUAL( 0xE4, dest_string[6] );  // U+00E4
        EXPECT_EQUAL( 0xF6, dest_string[7] );  // U+00F6
        // NOTE(review): element 8 is zero from the initializer, so this passes
        // even if utf8to32 never writes a terminator there -- confirm intent.
        EXPECT_EQUAL( 0x00, dest_string[8] );

        // utf8len() is expected to count characters, not bytes: each two-byte
        // character below contributes 1 to the expected length.
        EXPECT_EQUAL( utf8len( test_string ), 8 );
        EXPECT_EQUAL( utf8len( "" ), 0 );
        EXPECT_EQUAL( utf8len( "a" ), 1 );
        EXPECT_EQUAL( utf8len( "ÿ" ), 1 );
        EXPECT_EQUAL( utf8len( "bÿ" ), 2 );
        EXPECT_EQUAL( utf8len( "ÿb" ), 2 );
        EXPECT_EQUAL( utf8len( "2ÿå" ), 3 );
    }

    // Disabled check of the reverse conversion (utf32to8); kept for reference.
    /*{
        char32 utf32_string[4] = {0x41, 0xC1, 0xE5, 0x00};
        char utf8_dest_string[6];
        utf32to8( utf32_string, utf8_dest_string, 5 );
        utf8_dest_string[5] = '\0';
        EXPECT_EQUAL( strcmp( "AÁå", utf8_dest_string ), 0 );
    }*/
}
/// Builds a binary index from a text file.
///
/// Each input line is decoded with utf8to32() and written as one record:
/// a uint32_t element count followed by that many uint32_t values, both as
/// raw in-memory bytes (host byte order).
///
/// @param inputFile Path of the text file to read, one entry per line.
/// @param indexFile Path of the index file to create or overwrite.
void indexFromFile(const char *inputFile, const char *indexFile) {
    std::ifstream file(inputFile);
    // The records are raw bytes, so the stream must be opened in binary mode;
    // a text-mode stream may rewrite 0x0A bytes on platforms that translate
    // line endings, corrupting the index.
    std::ofstream index(indexFile, std::ios::binary);

    for (std::string line; std::getline(file, line);) {
        std::vector<uint32_t> indexed;
        utf8to32(line.data(), indexed);

        // Length of the new string, stored ahead of its data.
        const uint32_t length = static_cast<uint32_t>(indexed.size());
        index.write(reinterpret_cast<const char *>(&length), sizeof(uint32_t));
        index.write(reinterpret_cast<const char *>(indexed.data()),
                    sizeof(uint32_t) * length);
    }
    // Both streams are flushed and closed by their destructors.
}
// Returns the code point decoded at byte offset "pos" of the receiver's
// string form, as a NumberValue. The result is NaN when pos lies outside the
// string or when utf8to32() returns null for the bytes at that offset.
Value* charCodeAt()
{
    const std::string text = getThis()->toString();
    const double pos = getScopeChain()->get("pos")->toInteger();

    // pos counts bytes, not characters. The test is written as a negation so
    // that a NaN pos is rejected as well.
    const bool insideString = (0.0 <= pos && pos < text.length());
    if (!insideString)
    {
        return new NumberValue(NAN);
    }

    u32 codePoint;
    const int offset = static_cast<int>(pos);
    char* const afterChar = utf8to32(text.c_str() + offset, &codePoint);
    if (!afterChar)
    {
        return new NumberValue(NAN);
    }
    return new NumberValue(codePoint);
}
// Note charAt(pos) for esjs returns a string which contains a byte // sequence of a valid UTF-8 character. // i.e., The length of the returned string can be greater than one. Value* charAt() { std::string s = getThis()->toString(); double pos = getScopeChain()->get("pos")->toInteger(); if (0.0 <= pos && pos < s.length()) { u32 utf32; int offset = static_cast<int>(pos); char* next = utf8to32(s.c_str() + offset, &utf32); if (next) { return new StringValue(s.substr(offset, next - (s.c_str() + offset))); } } return new StringValue(""); }
// Returns a new StringValue holding the upper-case form of value's string
// representation. The input is walked one UTF-8 sequence at a time; each
// decoded code point is mapped through utftoupper() and re-encoded with
// utf32to8(). Conversion stops at the terminating NUL or at the first
// sequence for which utf8to32() returns null.
Value* toUpperCase(Value* value)
{
    // Keep a named copy: taking c_str() directly from the toString() call
    // expression would leave a dangling pointer if toString() returns by
    // value, because that temporary dies at the end of the statement.
    const std::string source = value->toString();

    std::string result;
    const char* next = source.c_str();
    while (next && *next)
    {
        u32 utf32;
        next = utf8to32(next, &utf32);
        if (!next)
        {
            // No code point was decoded, so utf32 must not be used; the
            // other callers in this file treat null the same way.
            break;
        }

        // assumes utf32to8 emits at most 4 bytes, leaving room for the
        // terminator written below -- TODO confirm
        char utf8[5];
        char* nextResult = utf32to8(utftoupper(utf32), utf8);
        if (nextResult)
        {
            *nextResult = '\0';
            result += utf8;
        }
    }
    return new StringValue(result);
}
inline void utf8_to_wchar_impl< 4 >( const std::string & in , std::wstring & out )
{
    // Variant selected by the argument 4 -- presumably the 4-byte wchar_t
    // case; confirm against the primary template. Converted units are
    // appended to `out`; existing contents are kept.
    std::string::const_iterator first = in.begin();
    std::string::const_iterator last = in.end();
    std::back_insert_iterator< std::wstring > sink( out );
    utf8to32( first , last , sink );
}