// Returns the index (in characters of |string|) of the last hyphenation opportunity that
// lies before |beforeIndex|, or 0 when there is none.
//
// |localeIdentifier| selects the libhyphen dictionaries to consult; it is matched
// case-insensitively (ASCII) against availableLocales(). Dictionaries are tried in order
// and the first one that yields a hyphenation point wins.
size_t lastHyphenLocation(StringView string, size_t beforeIndex, const AtomicString& localeIdentifier)
{
    // Web content may specify locales which do not exist or for which we have no
    // dictionary. That is not a programming error, so bail out instead of asserting.
    String lowercaseLocaleIdentifier = AtomicString(localeIdentifier.string().convertToASCIILowercase());
    if (!availableLocales().contains(lowercaseLocaleIdentifier))
        return 0;

    // libhyphen accepts strings in UTF-8 format, but WebCore can only provide StringView
    // which stores either UTF-16 or Latin1 data. This is unfortunate for performance
    // reasons and we should consider switching to a more flexible hyphenation library
    // if it is available.
    CString utf8StringCopy = string.toStringWithoutCopying().utf8();

    // WebCore often passes strings like " wordtohyphenate" to the platform layer. Since
    // libhyphen isn't advanced enough to deal with leading spaces (presumably CoreFoundation
    // can), we should find the appropriate indexes into the string to skip them.
    int32_t leadingSpaceBytes;
    int32_t leadingSpaceCharacters;
    countLeadingSpaces(utf8StringCopy, leadingSpaceBytes, leadingSpaceCharacters);

    // Keep the word length signed: libhyphen takes an int, and the "length - 1" bound used
    // below must not wrap around when the word is empty.
    const int wordByteLength = static_cast<int>(utf8StringCopy.length()) - leadingSpaceBytes;

    // The libhyphen documentation specifies that this array should be 5 bytes longer than
    // the byte length of the input string.
    Vector<char> hyphenArray(wordByteLength + 5);
    char* hyphenArrayData = hyphenArray.data();

    for (const auto& dictionaryPath : availableLocales().get(lowercaseLocaleIdentifier)) {
        RefPtr<HyphenationDictionary> dictionary = TinyLRUCachePolicy<AtomicString, RefPtr<HyphenationDictionary>>::cache().get(AtomicString(dictionaryPath));

        char** replacements = nullptr;
        int* positions = nullptr;
        int* removedCharacterCounts = nullptr;
        const int hyphenationError = hnj_hyphen_hyphenate2(dictionary->libhyphenDictionary(),
            utf8StringCopy.data() + leadingSpaceBytes,
            wordByteLength,
            hyphenArrayData,
            nullptr, /* output parameter for hyphenated word */
            &replacements,
            &positions,
            &removedCharacterCounts);

        // The non-standard hyphenation data is not used here, but libhyphen hands its
        // ownership to the caller, so it has to be released on every path.
        if (replacements) {
            for (int i = 0; i < wordByteLength - 1; i++)
                free(replacements[i]);
            free(replacements);
        }
        free(positions);
        free(removedCharacterCounts);

        // On failure the contents of the hyphen array are not meaningful; try the next dictionary.
        if (hyphenationError)
            continue;

        for (int i = static_cast<int>(beforeIndex) - leadingSpaceCharacters - 2; i >= 0; i--) {
            // libhyphen will put an odd number in hyphenArrayData at all
            // hyphenation points. A number & 1 will be true for odd numbers.
            if (hyphenArrayData[i] & 1)
                return i + 1 + leadingSpaceCharacters;
        }
    }

    return 0;
}
/*!
 * Hyphenates a single word of a text item.
 *
 * \param it     item whose story text receives the hyphenation marks
 * \param text   the word to hyphenate
 * \param firstC position handed to charStyle() and hyphenateWord() for this word
 *
 * Nothing happens when the hyphenator is not usable, when the word already
 * contains a soft hyphen, or when it is shorter than MinWordLen.
 */
void Hyphenator::slotHyphenateWord(PageItem* it, const QString& text, int firstC)
{
	if (!m_usable) //FIXME:av || (!ScMW->Sprachen.contains(it->Language)))
		return;
	if (text.contains(SpecialChars::SHYPHEN))
		return;
	// else if (findException(found, &buffer) it->itemText.hyphenateWord(firstC, found.length(), buffer);
	if (signed(text.length()) < MinWordLen)
		return;

	// Make sure the dictionary for the language at this position is the active one.
	NewDict(it->itemText.charStyle(firstC).language());

	// libhyphen works on the word in the dictionary's own encoding.
	QByteArray encoded = m_codec->fromUnicode(text);
	const char* rawWord = encoded.data();
	const int byteCount = strlen(rawWord);

	// The hyphen vector needs five bytes more than the encoded word.
	const int kPadding = 5;
	char* marks = static_cast<char*>(malloc(byteCount + kPadding));
	if (marks == nullptr)
		return;

	char** replacements = nullptr;
	int* positions = nullptr;
	int* cuts = nullptr;
	const int failed = hnj_hyphen_hyphenate2(m_hdict, rawWord, byteCount, marks, nullptr, &replacements, &positions, &cuts);
	if (failed == 0)
	{
		marks[byteCount] = '\0';
		it->itemText.hyphenateWord(firstC, text.length(), marks);
	}

	// Release everything malloc'ed here or by libhyphen (free(nullptr) is a no-op).
	free(marks);
	if (replacements != nullptr)
	{
		int idx = 0;
		while (idx < byteCount - 1)
		{
			free(replacements[idx]);
			++idx;
		}
		free(replacements);
	}
	free(positions);
	free(cuts);
}
/*!
 * Hyphenates a single word of a text item.
 *
 * \param it     item whose story text receives the hyphenation marks
 * \param text   the word to hyphenate
 * \param firstC position handed to charStyle() and hyphenateWord() for this word
 *
 * The word is left untouched when it already contains a soft hyphen, when it is
 * shorter than the style's minimum word length, when no dictionary could be
 * loaded for the style's language, or when memory runs out.
 */
void Hyphenator::slotHyphenateWord(PageItem* it, const QString& text, int firstC)
{
	if (text.contains(SpecialChars::SHYPHEN))
		return;

	const CharStyle& charStyle = it->itemText.charStyle(firstC);
	if (text.length() < charStyle.hyphenWordMin())
		return;
	if (!loadDict(charStyle.language()))
		return;

	// libhyphen works on the word in the dictionary's own encoding and wants a
	// hyphen vector five bytes longer than that encoded word.
	QByteArray encoded = m_codec->fromUnicode(text);
	char* marks = static_cast<char*>(malloc(encoded.length() + 5));
	if (marks == nullptr)
		return;

	char** replacements = nullptr;
	int* positions = nullptr;
	int* cuts = nullptr;
	// TODO: support non-standard hyphenation, see hnj_hyphen_hyphenate2 docs
	const int failed = hnj_hyphen_hyphenate2(m_hdict, encoded.data(), encoded.length(), marks, nullptr, &replacements, &positions, &cuts);
	if (failed == 0)
	{
		marks[encoded.length()] = '\0';
		it->itemText.hyphenateWord(firstC, text.length(), marks);
	}

	// Release the hyphen vector and whatever libhyphen allocated for us.
	free(marks);
	if (replacements != nullptr)
	{
		int idx = 0;
		while (idx < encoded.length() - 1)
		{
			free(replacements[idx]);
			++idx;
		}
	}
	free(replacements);
	free(positions);
	free(cuts);
}
/*!
 * Hyphenates the text of a text frame, word by word.
 *
 * When the frame has a selection only the selected text is processed,
 * otherwise the whole story text. Words are found with the story's word
 * break iterator and hyphenated with the dictionary for the language of
 * their character style.
 *
 * In automatic mode the computed (or user-defined "special") hyphenation is
 * applied directly. Otherwise the user is asked per word through a HyAsk
 * dialog; answers are remembered for the rest of this run, and rejecting the
 * dialog stops the whole run.
 *
 * \param it the item to hyphenate; ignored if it is not a text frame or is empty
 */
void Hyphenator::slotHyphenate(PageItem* it)
{
	if (!(it->asTextFrame()) || (it->itemText.length() == 0))
		return;
	m_doc->DoDrawing = false;

	QString text = "";
	int startC = 0;
	if (it->itemText.selectionLength() > 0)
	{
		startC = it->itemText.startOfSelection();
		text = it->itemText.text(startC, it->itemText.selectionLength());
	}
	else
	{
		text = it->itemText.text(0, it->itemText.length());
	}

	// Rewrites a hyphen vector from a pattern such as "hy-phen-ate": a '-' marks
	// the preceding character as a break point, any other character clears its
	// own slot. The first and last pattern characters are skipped, exactly like
	// the loops this helper replaces. Writes never go past 'capacity' bytes.
	const auto applyPattern = [](const QString& pattern, char* marks, int capacity)
	{
		int slot = 1;
		for (int i = 1; i < pattern.length() - 1 && slot < capacity; ++i)
		{
			QChar cht = pattern[i];
			if (cht == '-')
				marks[slot - 1] = 1;
			else
			{
				marks[slot] = 0;
				++slot;
			}
		}
	};

	rememberedWords.clear();
	qApp->setOverrideCursor(QCursor(Qt::WaitCursor));
	BreakIterator* bi = StoryText::getWordIterator();
	bi->setText((const UChar*) text.utf16());
	int pos = bi->first();
	while (pos != BreakIterator::DONE)
	{
		int firstC = pos;
		pos = bi->next();
		int lastC = pos;
		int countC = lastC - firstC;
		// 'text' starts at startC inside the story, so every story position is
		// the offset within 'text' plus startC.
		const int storyPos = startC + firstC;
		const CharStyle& style = it->itemText.charStyle(storyPos);
		if (countC > 0 && countC > style.hyphenWordMin() - 1)
		{
			QString word = text.mid(firstC, countC);
			QString wordLower = QLocale(style.language()).toLower(word);
			if (wordLower.contains(SpecialChars::SHYPHEN))
				break;

			bool ok = loadDict(style.language());
			if (!ok)
				continue;

			// libhyphen wants a hyphen vector five bytes longer than the encoded word.
			QByteArray te = m_codec->fromUnicode(wordLower);
			const int bufferSize = te.length() + 5;
			char *buffer = static_cast<char*>(malloc(bufferSize));
			if (buffer == nullptr)
				break;

			char **rep = nullptr;
			int *hyphPos = nullptr;
			int *cut = nullptr;
			// Set when the user rejects the dialog; the loop is left only after
			// the per-word allocations below have been released.
			bool cancelled = false;
			// TODO: support non-standard hyphenation, see hnj_hyphen_hyphenate2 docs
			if (!hnj_hyphen_hyphenate2(m_hdict, te.data(), te.length(), buffer, nullptr, &rep, &hyphPos, &cut))
			{
				buffer[te.length()] = '\0';
				bool hasHyphen = false;
				for (int i = 1; i < wordLower.length()-1; ++i)
				{
					if (buffer[i] & 1)
					{
						hasHyphen = true;
						break;
					}
				}

				// Build the proposal shown to the user, e.g. "hy-phen-ate".
				QString outs = "";
				outs += word[0];
				for (int i = 1; i < wordLower.length()-1; ++i)
				{
					outs += word[i];
					if (buffer[i] & 1)
						outs += "-";
				}
				outs += word.rightRef(1);
				const QString input = outs;

				if (!ignoredWords.contains(word))
				{
					if (!hasHyphen)
						it->itemText.hyphenateWord(storyPos, wordLower.length(), nullptr);
					else if (m_automatic)
					{
						if (specialWords.contains(word))
						{
							outs = specialWords.value(word);
							applyPattern(outs, buffer, bufferSize);
						}
						it->itemText.hyphenateWord(storyPos, wordLower.length(), buffer);
					}
					else
					{
						if (specialWords.contains(word))
						{
							outs = specialWords.value(word);
							applyPattern(outs, buffer, bufferSize);
						}
						if (rememberedWords.contains(input))
						{
							outs = rememberedWords.value(input);
							applyPattern(outs, buffer, bufferSize);
							it->itemText.hyphenateWord(storyPos, wordLower.length(), buffer);
						}
						else
						{
							qApp->changeOverrideCursor(QCursor(Qt::ArrowCursor));
							PrefsContext* prefs = PrefsManager::instance()->prefsFile->getContext("hyhpen_options");
							int xpos = prefs->getInt("Xposition", -9999);
							int ypos = prefs->getInt("Yposition", -9999);
							HyAsk *dia = new HyAsk((QWidget*)parent(), outs);
							if ((xpos != -9999) && (ypos != -9999))
								dia->move(xpos, ypos);
							qApp->processEvents();
							if (dia->exec())
							{
								outs = dia->Wort->text();
								applyPattern(outs, buffer, bufferSize);
								if (!rememberedWords.contains(input))
									rememberedWords.insert(input, outs);
								if (dia->addToIgnoreList->isChecked())
								{
									if (!ignoredWords.contains(word))
										ignoredWords.insert(word);
								}
								if (dia->addToExceptionList->isChecked())
								{
									if (!specialWords.contains(word))
										specialWords.insert(word, outs);
								}
								it->itemText.hyphenateWord(storyPos, wordLower.length(), buffer);
							}
							else
								cancelled = true;
							// Remember the dialog position whether it was accepted or not.
							prefs->set("Xposition", dia->xpos);
							prefs->set("Yposition", dia->ypos);
							delete dia;
							if (!cancelled)
								qApp->changeOverrideCursor(QCursor(Qt::WaitCursor));
						}
					}
				}
			}

			free(buffer);
			if (rep)
			{
				for (int i = 0; i < te.length() - 1; ++i)
					free(rep[i]);
			}
			free(rep);
			free(hyphPos);
			free(cut);

			if (cancelled)
				break;
		}
	}
	qApp->restoreOverrideCursor();
	m_doc->DoDrawing = true;
	rememberedWords.clear();
}
/**
 * Computes hyphenation opportunities for every word in aString.
 *
 * aHyphens is resized to aString.Length() and cleared; on return an entry is
 * true where the hyphenation dictionary allows a break after the character
 * that starts at that UTF-16 offset. A "word" is a maximal run of characters
 * whose general category is letter or mark.
 *
 * @return NS_ERROR_OUT_OF_MEMORY if aHyphens cannot be resized, NS_OK otherwise.
 */
nsresult
nsHyphenator::Hyphenate(const nsAString& aString,
                        nsTArray<bool>& aHyphens)
{
  if (!aHyphens.SetLength(aString.Length())) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  memset(aHyphens.Elements(), false, aHyphens.Length());

  bool inWord = false;
  uint32_t wordStart = 0, wordLimit = 0;
  uint32_t chLen;
  for (uint32_t i = 0; i < aString.Length(); i += chLen) {
    uint32_t ch = aString[i];
    chLen = 1;

    // Combine a surrogate pair into a single code point before classifying it.
    if (NS_IS_HIGH_SURROGATE(ch)) {
      if (i + 1 < aString.Length() && NS_IS_LOW_SURROGATE(aString[i+1])) {
        ch = SURROGATE_TO_UCS4(ch, aString[i+1]);
        chLen = 2;
      } else {
        NS_WARNING("unpaired surrogate found during hyphenation");
      }
    }

    nsIUGenCategory::nsUGenCategory cat = mozilla::unicode::GetGenCategory(ch);
    if (cat == nsIUGenCategory::kLetter || cat == nsIUGenCategory::kMark) {
      if (!inWord) {
        inWord = true;
        wordStart = i;
      }
      wordLimit = i + chLen;
      // Keep accumulating unless this was the last character of the string,
      // in which case the word has to be processed right now.
      if (i + chLen < aString.Length()) {
        continue;
      }
    }

    if (inWord) {
      const PRUnichar *begin = aString.BeginReading();
      NS_ConvertUTF16toUTF8 utf8(begin + wordStart,
                                 wordLimit - wordStart);
      // libhyphen requires the hyphen buffer to be 5 bytes longer than the word.
      nsAutoTArray<char,200> utf8hyphens;
      utf8hyphens.SetLength(utf8.Length() + 5);
      char **rep = nullptr;
      int *pos = nullptr;
      int *cut = nullptr;
      int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict,
                                      utf8.BeginReading(), utf8.Length(),
                                      utf8hyphens.Elements(), nullptr,
                                      &rep, &pos, &cut);
      if (!err) {
        // Surprisingly, hnj_hyphen_hyphenate2 converts the 'hyphens' buffer
        // from utf8 code unit indexing (which would match the utf8 input
        // string directly) to Unicode character indexing.
        // We then need to convert this to utf16 code unit offsets for Gecko.
        const char *hyphPtr = utf8hyphens.Elements();
        const PRUnichar *cur = begin + wordStart;
        const PRUnichar *end = begin + wordLimit;
        while (cur < end) {
          if (*hyphPtr & 0x01) {
            aHyphens[cur - begin] = true;
          }
          cur++;
          // Step over the second half of a surrogate pair: it belongs to the
          // same character and has no entry of its own in the hyphen buffer.
          if (cur < end && NS_IS_LOW_SURROGATE(*cur) &&
              NS_IS_HIGH_SURROGATE(*(cur-1)))
          {
            cur++;
          }
          hyphPtr++;
        }
      }

      // libhyphen allocates these arrays on demand (non-standard hyphenation)
      // and leaves them to the caller; the data is not used here, but it
      // still has to be released.
      if (rep) {
        for (uint32_t r = 0; r + 1 < utf8.Length(); ++r) {
          free(rep[r]);
        }
        free(rep);
      }
      free(pos);
      free(cut);
    }

    inWord = false;
  }

  return NS_OK;
}
/*!
 * Hyphenates the text of a text frame, word by word.
 *
 * When the frame has a selection only the selected text is processed,
 * otherwise the whole story text. Words are located with the "\\w" and
 * whitespace/non-word regular expressions below; words that touch an existing
 * soft hyphen are skipped, and only words of at least MinWordLen characters
 * are hyphenated.
 *
 * In automatic mode the computed (or user-defined "special") hyphenation is
 * applied directly. Otherwise the user is asked per word through a HyAsk
 * dialog; answers are remembered for the rest of this run, and rejecting the
 * dialog stops the whole run.
 *
 * \param it the item to hyphenate; ignored if the hyphenator is not usable or
 *           the item is not a text frame or is empty
 */
void Hyphenator::slotHyphenate(PageItem* it)
{
	if ((!m_usable) || !(it->asTextFrame()) || (it->itemText.length() == 0))
		return;
	m_doc->DoDrawing = false;
	const char *word;
	char *buffer;
	const int BORDER = 2;
	QString text = "";
	QByteArray te;

	int startC = 0;
	if (it->itemText.lengthOfSelection() > 0)
	{
		startC = it->itemText.startOfSelection();
		text = it->itemText.text(startC, it->itemText.lengthOfSelection());
	}
	else
	{
		text = it->itemText.text(0, it->itemText.length());
	}

	// Rewrites a hyphen vector from a pattern such as "hy-phen-ate": a '-' marks
	// the preceding character as a break point, any other character clears its
	// own slot. The first and last pattern characters are skipped, exactly like
	// the loops this helper replaces. Writes never go past 'capacity' bytes.
	const auto applyPattern = [](const QString& pattern, char* marks, int capacity)
	{
		int slot = 1;
		for (int i = 1; i < pattern.length() - 1 && slot < capacity; ++i)
		{
			QChar cht = pattern[i];
			if (cht == '-')
				marks[slot - 1] = 1;
			else
			{
				marks[slot] = 0;
				++slot;
			}
		}
	};

	int firstC = 0;
	int lastC = 0;
	int Ccount = 0;
	QString found = "";
	QString found2 = "";
	rememberedWords.clear();
	//uint maxC = it->itemText.length() - 1;
	qApp->setOverrideCursor(QCursor(Qt::WaitCursor));
	QRegExp wordBoundary("\\w");
	QRegExp whiteSpace("\\s|\\W|\\d|\\n|\\r|\\t");
	while ((firstC+Ccount < signed(text.length())) && (firstC != -1) && (lastC < signed(text.length())))
	{
		firstC = text.indexOf(wordBoundary, firstC+Ccount);
		if (firstC < 0)
			break;
		// Skip fragments that follow an already present soft hyphen.
		if (firstC > 0 && text.at(firstC-1) == SpecialChars::SHYPHEN)
		{
			Ccount = 1;
			continue;
		}
		lastC = text.indexOf(whiteSpace, firstC);
		if (lastC < 0)
			lastC = signed(text.length());
		Ccount = lastC - firstC;
		// Skip fragments that end in an already present soft hyphen.
		if (lastC < signed(text.length()) && text.at(lastC) == SpecialChars::SHYPHEN)
		{
			++Ccount;
			continue;
		}
		if (Ccount > MinWordLen-1)
		{
			// 'text' starts at startC inside the story, so every story position
			// is the offset within 'text' plus startC.
			const int storyPos = startC + firstC;
			found = text.mid(firstC, Ccount).toLower();
			found2 = text.mid(firstC, Ccount);
			if (found.contains(SpecialChars::SHYPHEN))
				break;

			NewDict(it->itemText.charStyle(storyPos).language());

			te = m_codec->fromUnicode( found );
			word = te.data();
			int wordlen = strlen(word);
			// libhyphen wants a hyphen vector five bytes longer than the encoded word.
			const int bufferSize = wordlen+BORDER+3;
			buffer = static_cast<char*>(malloc(bufferSize));
			if (buffer == nullptr)
				break;
			char ** rep = nullptr;
			int * pos = nullptr;
			int * cut = nullptr;
			// Set when the user rejects the dialog; the loop is left only after
			// the per-word allocations below have been released.
			bool cancelled = false;
			if (!hnj_hyphen_hyphenate2(m_hdict, word, wordlen, buffer, nullptr, &rep, &pos, &cut))
			{
				buffer[wordlen] = '\0';
				bool hasHyphen = false;
				for (int i = 1; i < found.length()-1; ++i)
				{
					if (buffer[i] & 1)
					{
						hasHyphen = true;
						break;
					}
				}

				// Build the proposal shown to the user, e.g. "hy-phen-ate".
				QString outs = "";
				outs += found2[0];
				for (int i = 1; i < found.length()-1; ++i)
				{
					outs += found2[i];
					if (buffer[i] & 1)
						outs += "-";
				}
				outs += found2.right(1);
				const QString input = outs;

				if (!ignoredWords.contains(found2))
				{
					if (!hasHyphen)
						it->itemText.hyphenateWord(storyPos, found.length(), nullptr);
					else if (Automatic)
					{
						if (specialWords.contains(found2))
						{
							outs = specialWords.value(found2);
							applyPattern(outs, buffer, bufferSize);
						}
						it->itemText.hyphenateWord(storyPos, found.length(), buffer);
					}
					else
					{
						if (specialWords.contains(found2))
						{
							outs = specialWords.value(found2);
							applyPattern(outs, buffer, bufferSize);
						}
						if (rememberedWords.contains(input))
						{
							outs = rememberedWords.value(input);
							applyPattern(outs, buffer, bufferSize);
							it->itemText.hyphenateWord(storyPos, found.length(), buffer);
						}
						else
						{
							qApp->changeOverrideCursor(QCursor(Qt::ArrowCursor));
							PrefsContext* prefs = PrefsManager::instance()->prefsFile->getContext("hyhpen_options");
							int xpos = prefs->getInt("Xposition", -9999);
							int ypos = prefs->getInt("Yposition", -9999);
							HyAsk *dia = new HyAsk((QWidget*)parent(), outs);
							if ((xpos != -9999) && (ypos != -9999))
								dia->move(xpos, ypos);
							qApp->processEvents();
							if (dia->exec())
							{
								outs = dia->Wort->text();
								applyPattern(outs, buffer, bufferSize);
								if (!rememberedWords.contains(input))
									rememberedWords.insert(input, outs);
								if (dia->addToIgnoreList->isChecked())
								{
									if (!ignoredWords.contains(found2))
										ignoredWords.insert(found2);
								}
								if (dia->addToExceptionList->isChecked())
								{
									if (!specialWords.contains(found2))
										specialWords.insert(found2, outs);
								}
								it->itemText.hyphenateWord(storyPos, found.length(), buffer);
							}
							else
								cancelled = true;
							// Remember the dialog position whether it was accepted or not.
							prefs->set("Xposition", dia->xpos);
							prefs->set("Yposition", dia->ypos);
							delete dia;
							if (!cancelled)
								qApp->changeOverrideCursor(QCursor(Qt::WaitCursor));
						}
					}
				}
			}

			free(buffer);
			if (rep)
			{
				for (int i = 0; i < wordlen - 1; ++i)
					free(rep[i]);
				free(rep);
			}
			free(pos);
			free(cut);
			buffer = nullptr;

			if (cancelled)
				break;
		}
		// Guarantee forward progress when the match was empty.
		if (Ccount == 0)
			Ccount++;
	}
	qApp->restoreOverrideCursor();
	m_doc->DoDrawing = true;
	rememberedWords.clear();
}
/**
 * Computes hyphenation opportunities for every word in aString.
 *
 * aHyphens is resized to aString.Length() and cleared; on return an entry is
 * PR_TRUE at the UTF-16 offset of the last code unit of each character after
 * which the hyphenation dictionary allows a break. A "word" is a maximal run
 * of code units whose general category is letter or mark.
 *
 * @return NS_ERROR_OUT_OF_MEMORY if aHyphens cannot be resized, NS_OK otherwise.
 */
nsresult
nsHyphenator::Hyphenate(const nsAString& aString,
                        nsTArray<PRPackedBool>& aHyphens)
{
  if (!aHyphens.SetLength(aString.Length())) {
    return NS_ERROR_OUT_OF_MEMORY;
  }
  memset(aHyphens.Elements(), PR_FALSE, aHyphens.Length());

  PRBool inWord = PR_FALSE;
  PRUint32 wordStart = 0, wordLimit = 0;
  for (PRUint32 i = 0; i < aString.Length(); i++) {
    PRUnichar ch = aString[i];

    nsIUGenCategory::nsUGenCategory cat = mCategories->Get(ch);
    if (cat == nsIUGenCategory::kLetter || cat == nsIUGenCategory::kMark) {
      if (!inWord) {
        inWord = PR_TRUE;
        wordStart = i;
      }
      wordLimit = i + 1;
      // Keep accumulating unless this was the last code unit of the string,
      // in which case the word has to be processed right now.
      if (i < aString.Length() - 1) {
        continue;
      }
    }

    if (inWord) {
      NS_ConvertUTF16toUTF8 utf8(aString.BeginReading() + wordStart,
                                 wordLimit - wordStart);
      // libhyphen requires the hyphen buffer to be 5 bytes longer than the word.
      nsAutoTArray<char,200> utf8hyphens;
      utf8hyphens.SetLength(utf8.Length() + 5);
      char **rep = nsnull;
      int *pos = nsnull;
      int *cut = nsnull;
      int err = hnj_hyphen_hyphenate2((HyphenDict*)mDict,
                                      utf8.BeginReading(), utf8.Length(),
                                      utf8hyphens.Elements(), nsnull,
                                      &rep, &pos, &cut);
      if (!err) {
        // For UTF-8 dictionaries hnj_hyphen_hyphenate2 returns the hyphen
        // buffer indexed by Unicode character, not by utf8 code unit, so it
        // must be read with a per-character index rather than the byte offset.
        PRUint32 utf16offset = wordStart;
        PRUint32 charIndex = 0;
        const char *cp = utf8.BeginReading();
        while (cp < utf8.EndReading()) {
          if (UTF8traits::isASCII(*cp)) { // single-byte utf8 char
            cp++;
            utf16offset++;
          } else if (UTF8traits::is2byte(*cp)) { // 2-byte sequence
            cp += 2;
            utf16offset++;
          } else if (UTF8traits::is3byte(*cp)) { // 3-byte sequence
            cp += 3;
            utf16offset++;
          } else { // must be a 4-byte sequence (no need to check validity,
                   // as this was just created with NS_ConvertUTF16toUTF8)
            NS_ASSERTION(UTF8traits::is4byte(*cp), "unexpected utf8 byte");
            cp += 4;
            utf16offset += 2;
          }
          NS_ASSERTION(cp <= utf8.EndReading(), "incomplete utf8 string?");
          NS_ASSERTION(utf16offset <= aString.Length(), "length mismatch?");
          if (utf8hyphens[charIndex] & 0x01) {
            aHyphens[utf16offset - 1] = PR_TRUE;
          }
          ++charIndex;
        }
      }

      // libhyphen allocates these arrays on demand (non-standard hyphenation)
      // and leaves them to the caller; the data is not used here, but it
      // still has to be released.
      if (rep) {
        for (PRUint32 r = 0; r + 1 < utf8.Length(); ++r) {
          free(rep[r]);
        }
        free(rep);
      }
      free(pos);
      free(cut);
    }

    inWord = PR_FALSE;
  }

  return NS_OK;
}