void Convert() { const int BUFFER_SIZE = 1024 * 1024; static bool bufferInitialized = false; static string buffer; static char* bufferBegin; static const char* bufferEnd; static char* bufferPtr; static size_t bufferSizeAvailble; if (!bufferInitialized) { bufferInitialized = true; buffer.resize(BUFFER_SIZE + 1); bufferBegin = const_cast<char*>(buffer.c_str()); bufferEnd = buffer.c_str() + BUFFER_SIZE; bufferPtr = bufferBegin; bufferSizeAvailble = BUFFER_SIZE; } FILE* fin = fopen(inputFileName.Get().c_str(), "r"); if (!fin) { throw FileNotFound(inputFileName.Get()); } FILE* fout = GetOutputStream(); while (!feof(fin)) { size_t length = fread(bufferPtr, sizeof(char), bufferSizeAvailble, fin); bufferPtr[length] = '\0'; size_t remainingLength = 0; string remainingTemp; if (length == bufferSizeAvailble) { // fread may breaks UTF8 character // Find the end of last character char* lastChPtr = bufferBegin; while (lastChPtr < bufferEnd) { size_t nextCharLen = UTF8Util::NextCharLength(lastChPtr); if (lastChPtr + nextCharLen > bufferEnd) { break; } lastChPtr += nextCharLen; } remainingLength = bufferEnd - lastChPtr; if (remainingLength > 0) { remainingTemp = UTF8Util::FromSubstr(lastChPtr, remainingLength); *lastChPtr = '\0'; } } // Perform conversion const string& converted = converter->Convert(buffer); fputs(converted.c_str(), fout); if (!noFlush) { // Flush every line if the output stream is stdout. fflush(fout); } // Reset pointer bufferPtr = bufferBegin + remainingLength; bufferSizeAvailble = BUFFER_SIZE - remainingLength; if (remainingLength > 0) { strncpy(bufferBegin, remainingTemp.c_str(), remainingLength); } } fclose(fout); }
/**
 * Splits text into segments by repeatedly matching a dictionary prefix at
 * the current position.
 *
 * A matched key becomes a segment of its own. Consecutive characters that
 * match nothing are collected and emitted joined together as one segment.
 */
vector<string> MaxMatchSegmentation::Segment(const string& text) {
  vector<string> result;
  vector<string> pendingChars;

  // Emits the collected unmatched characters as a single segment.
  auto flushPending = [&result, &pendingChars]() {
    if (pendingChars.empty()) {
      return;
    }
    result.push_back(UTF8Util::Join(pendingChars));
    pendingChars.clear();
  };

  const char* cursor = text.c_str();
  while (*cursor != '\0') {
    Optional<DictEntry> hit = dict->MatchPrefix(cursor);
    size_t step;
    if (!hit.IsNull()) {
      // Dictionary match: close the pending run, then emit the key.
      flushPending();
      step = hit.Get().key.length();
      result.push_back(hit.Get().key);
    } else {
      // No match: take one character and keep collecting.
      step = UTF8Util::NextCharLength(cursor);
      pendingChars.push_back(UTF8Util::FromSubstr(cursor, step));
    }
    cursor += step;
  }

  flushPending();
  return result;
}
/**
 * Returns the stream conversion output should be written to.
 *
 * Yields stdout when outputFileName is null; otherwise opens that file for
 * writing ("w") and returns the new handle.
 *
 * Throws FileNotWritable when the file cannot be opened.
 */
FILE* GetOutputStream() {
  if (outputFileName.IsNull()) {
    return stdout;
  }

  FILE* stream = fopen(outputFileName.Get().c_str(), "w");
  if (stream) {
    return stream;
  }
  throw FileNotWritable(outputFileName.Get());
}
// Copy constructor: when `other` is valid, builds this object's value from
// other.Get() via Construct(); otherwise constructs nothing.
// NOTE(review): on the not-valid path nothing here assigns valid_ --
// presumably a member initializer elsewhere covers it; confirm.
Optional(Optional const& other) {
  if (!other.valid_) {
    return;
  }
  Construct(other.Get());
}
/**
 * Converts phrase by scanning it left to right.
 *
 * At each position the dictionary is asked for a prefix match. A match
 * contributes the entry's default value and skips the whole key; otherwise
 * the next character is copied through unchanged.
 */
string Conversion::Convert(const string& phrase) const {
  std::ostringstream output;
  const char* cursor = phrase.c_str();
  while (*cursor != '\0') {
    Optional<const DictEntry*> hit = dict->MatchPrefix(cursor);
    size_t consumed;
    if (!hit.IsNull()) {
      // Replace the matched key with the entry's default value.
      consumed = hit.Get()->KeyLength();
      output << hit.Get()->GetDefault();
    } else {
      // Nothing matches here: pass one character through as is.
      consumed = UTF8Util::NextCharLength(cursor);
      output << UTF8Util::FromSubstr(cursor, consumed);
    }
    cursor += consumed;
  }
  return output.str();
}
/**
 * Splits text into a sequence of dictionary entries.
 *
 * At each position the matched dictionary entry is appended when a prefix
 * matches; otherwise a new DictEntry built from the next single character is
 * appended instead.
 */
DictEntryPtrVectorPtr MaxMatchSegmentation::Segment(const string& text) {
  DictEntryPtrVectorPtr result(new DictEntryPtrVector);
  for (const char* cursor = text.c_str(); *cursor != '\0';) {
    Optional<DictEntryPtr> hit = dict->MatchPrefix(cursor);
    size_t step;
    if (!hit.IsNull()) {
      // Matched: reuse the dictionary's entry and skip its whole key.
      step = hit.Get()->key.length();
      result->push_back(DictEntryPtr(hit.Get()));
    } else {
      // Unmatched: wrap the next character in an entry of its own.
      step = UTF8Util::NextCharLength(cursor);
      result->push_back(
          DictEntryPtr(new DictEntry(UTF8Util::FromSubstr(cursor, step))));
    }
    cursor += step;
  }
  return result;
}
// Shared checks for a dictionary: single-prefix lookups (exact key, key
// followed by extra text, multi-byte key, unknown text) and the list returned
// by MatchAllPrefixes.
static void TestDict(DictPtr dict) {
  // The query is exactly a key.
  {
    Optional<DictEntry> entry = dict->MatchPrefix("BYVoid");
    AssertTrue(!entry.IsNull());
    AssertEquals("BYVoid", entry.Get().key);
    AssertEquals("byv", entry.Get().GetDefault());
  }

  // The query starts with a key and continues past it.
  {
    Optional<DictEntry> entry = dict->MatchPrefix("BYVoid123");
    AssertTrue(!entry.IsNull());
    AssertEquals("BYVoid", entry.Get().key);
    AssertEquals("byv", entry.Get().GetDefault());
  }

  // A key made of multi-byte characters.
  {
    Optional<DictEntry> entry = dict->MatchPrefix(utf8("積羽沉舟"));
    AssertTrue(!entry.IsNull());
    AssertEquals(utf8("積羽沉舟"), entry.Get().key);
    AssertEquals(utf8("羣輕折軸"), entry.Get().GetDefault());
  }

  // Text with no matching key yields a null result.
  {
    Optional<DictEntry> entry = dict->MatchPrefix("Unknown");
    AssertTrue(entry.IsNull());
  }

  // All matching prefixes are expected longest first.
  const vector<DictEntry> matches =
      dict->MatchAllPrefixes(utf8("清華大學計算機系"));
  AssertEquals(3, matches.size());

  AssertEquals(utf8("清華大學"), matches.at(0).key);
  AssertEquals("TsinghuaUniversity", matches.at(0).GetDefault());

  AssertEquals(utf8("清華"), matches.at(1).key);
  AssertEquals("Tsinghua", matches.at(1).GetDefault());

  AssertEquals(utf8("清"), matches.at(2).key);
  AssertEquals("Tsing", matches.at(2).GetDefault());
}
// Exercises Optional in sequence: an unset value-Optional, setting it,
// visiting it, taking the value out, initializing construction and clearing,
// reference versus value Optionals, copying a reference-Optional, and an
// Optional obtained from IntHolder. The steps depend on each other's state,
// so their order matters.
void test_optional(){
  using namespace faint;

  // Helper constants for testing the Optional. The two bitmaps have
  // different sizes so the checks below can tell which one was returned.
  const IntSize altSize = IntSize(5,5);
  const IntSize bmpSize = IntSize(10,10);
  const Bitmap alt(altSize);
  const Bitmap bmp(bmpSize);
  VERIFY(alt.GetSize() != bmp.GetSize());

  // Uninitialized optional (not set).
  Optional<Bitmap> optional;
  VERIFY(optional.NotSet());
  VERIFY(!optional.IsSet());
  VERIFY(!optional);
  static_assert(is_true<decltype(has_or(optional))>(), "Optional of value type lacks Or-method");
  // Or() is expected to fall back to the alternative while unset.
  EQUAL(optional.Or(alt).GetSize(), altSize);
  optional.IfSet(FAIL_IF_CALLED());
  optional.Visit(FAIL_IF_CALLED(), FAIL_UNLESS_CALLED());

  // After Set(), every check above is expected to flip.
  optional.Set(bmp);
  VERIFY(!optional.NotSet());
  VERIFY(optional.IsSet());
  VERIFY(optional);
  EQUAL(optional.Or(alt).GetSize(), bmpSize);
  optional.IfSet(FAIL_UNLESS_CALLED());
  optional.Visit(FAIL_UNLESS_CALLED(), FAIL_IF_CALLED());

  // VisitSimple receives a function for the contained value plus a second
  // argument; with a value set, the function's result is expected.
  IntSize sz = optional.VisitSimple(
    [](const Bitmap& bmp){ return bmp.GetSize(); },
    alt.GetSize());
  EQUAL(sz, bmpSize);
  EQUAL(optional.Get().GetSize(), bmpSize);

  // Take the object (clearing the optional)
  Bitmap bmp2 = optional.Take();
  EQUAL(bmp2.GetSize(), bmp.GetSize());
  VERIFY(optional.NotSet());

  // Initializing construction, followed by Clear().
  Optional<Bitmap> optional2(bmp2);
  VERIFY(optional2.IsSet());
  EQUAL(optional2.Get().GetSize(), bmp.GetSize());
  optional2.Clear();
  VERIFY(optional2.NotSet());

  // Reference: writing through Get() must change the referenced variable.
  int i = 7;
  Optional<int&> oi(i);
  oi.Get() = 8;
  EQUAL(i, 8);
  static_assert(is_false<decltype(has_or(oi))>(), "Optional of reference type has Or-method.");

  // Non-reference: writing through Get() must leave the source untouched.
  int j = 7;
  Optional<int> oj(j);
  oj.Get() = 8;
  EQUAL(j, 7);

  // Ensure that copying a reference-Optional
  // does not copy the contained value.
  FailIfCopied f(10);
  Optional<FailIfCopied&> o(f);
  o.Get().value++;
  EQUAL(o.Get().value, 11);
  Optional<FailIfCopied&> o2(o);
  VERIFY(o2.IsSet());
  // An increment through the copy must be visible through the original.
  o2.Get().value++;
  EQUAL(o.Get().value, 12);

  // The increment through h.Get(true) is expected to show up in h.value.
  IntHolder h(12);
  EQUAL(h.value, 12);
  h.Get(true).Get()++;
  EQUAL(h.value, 13);
  EQUAL(h.Get(true).Get(), 13);
}