/**
 * @short Decode a header in the RFC 2047 format into a unicode string
 *
 * Scans @p str for encoded words of the form =?charset?encoding?text?= (optionally wrapped in
 * double quotes), passes each of them to decodeWord() and concatenates the decoded results with
 * the raw text found around them. Raw text is always interpreted as UTF-8.
 *
 * A whitespace-only run of raw text directly in front of an encoded word is dropped. This covers
 * the separator between two adjacent encoded words, and also a whitespace-only prefix in front of
 * the very first encoded word.
 *
 * @param str raw header value
 * @return the decoded header value
 */
QString decodeWordSequence(const QByteArray& str)
{
    QRegExp whitespace("^\\s+$");

    QString out;

    // Any idea why this isn't matching?
    //QRegExp encodedWord("\\b=\\?\\S+\\?\\S+\\?\\S*\\?=\\b");
    QRegExp encodedWord("\"?=\\?(\\S+)\\?(\\S+)\\?(.*)\\?=\"?");

    // Use minimal (non-greedy) matching so that sequences which do not have whitespace between
    // two encoded words are still split correctly; by default greedy matching is performed.
    // E.g. "Sm=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=sbord" would match
    // "=?ISO-8859-1?B?9g==?=rg=?ISO-8859-1?B?5Q==?=" as a single encoded word when greedy.
    // With minimal=true, "=?ISO-8859-1?B?9g==?=" is the first encoded word and
    // "=?ISO-8859-1?B?5Q==?=" the second.
    // -- assuming there are no nested encodings, will there be?
    encodedWord.setMinimal(true);

    // Match against a Latin-1 view of the input: every byte becomes exactly one QChar, so the
    // offsets reported by the regular expression are valid byte offsets into str. Matching on a
    // UTF-8 decoded string would shift the offsets as soon as a multi-byte sequence shows up.
    const QString haystack = QString::fromLatin1(str.constData(), str.size());

    int pos = 0;
    int lastPos = 0;

    while ((pos = encodedWord.indexIn(haystack, pos)) != -1) {
        const int endPos = pos + encodedWord.matchedLength();

        // Raw text between the previous encoded word (or the start) and this one; decoded the
        // same way as the trailing remainder below
        const QString preceding = QString::fromUtf8(str.mid(lastPos, pos - lastPos));
        const QString decoded = decodeWord(str.mid(pos, endPos - pos), encodedWord.cap(1).toLatin1(),
                                           encodedWord.cap(2).toUpper().toLatin1(), encodedWord.cap(3).toLatin1());

        // If there is only whitespace between two encoded words, it should not be included
        if (!whitespace.exactMatch(preceding))
            out.append(preceding);

        out.append(decoded);

        pos = endPos;
        lastPos = pos;
    }

    // Copy anything left
    out.append(QString::fromUtf8(str.mid(lastPos)));

    return out;
}
/**
 * Computes the offsets at which @p input should be wrapped so that lines are roughly
 * @p maxCharsPerLine characters long.
 *
 * Each returned value is an index into @p input where a new line starts. Breaks are placed at the
 * last word boundary reported by a clone of the shared ICU word break iterator; when no such
 * boundary lies past the current line start, the line is broken at the look-ahead position
 * instead. Whitespace, control characters and non-spacing marks are never used as a line start.
 *
 * @param input text to wrap
 * @param maxCharsPerLine desired line length, must be positive
 * @return line-start offsets; a single 0 when no break was produced, when the break iterator
 *         could not be cloned, or when @p maxCharsPerLine is not positive
 */
OSMAND_CORE_API QVector<int> OSMAND_CORE_CALL OsmAnd::ICU::getTextWrapping(const QString& input, const int maxCharsPerLine)
{
    assert(maxCharsPerLine > 0);

    QVector<int> result;

    // The assertion above is compiled out when NDEBUG is defined. A zero width would make the
    // hard-break path below push the same offset forever, and a negative one would index in
    // front of the string, so treat both as "no wrapping".
    if(maxCharsPerLine <= 0)
        return (result << 0);

    // clone() is called without a status argument, so nothing in this function updates this
    // value; it is kept only because the log message below prints it
    const UErrorCode icuError = U_ZERO_ERROR;

    // Create break iterator
    const auto pBreakIterator = g_pIcuWordBreakIterator->clone();
    if(pBreakIterator == nullptr)
    {
        LogPrintf(LogSeverityLevel::Error, "ICU error: %d", icuError);
        return (result << 0);
    }

    // Set text for breaking
    pBreakIterator->setText(UnicodeString(reinterpret_cast<const UChar*>(input.unicode()), input.length()));

    // Characters a line must not start with: whitespace, control characters, non-spacing marks
    const auto isSkippable = [](const QChar character) -> bool
    {
        const auto c = static_cast<UChar>(character.unicode());
        return u_isspace(c) || u_charType(c) == U_CONTROL_CHAR || u_charType(c) == U_NON_SPACING_MARK;
    };

    const auto inputLength = input.length();
    auto cursor = 0;
    while(cursor < inputLength)
    {
        // Stop once the remainder fits into a single line. Written as a subtraction so that a
        // very large maxCharsPerLine can not overflow the addition below.
        if(maxCharsPerLine >= inputLength - cursor)
            break;

        // Get next desired breaking position
        auto lookAheadCursor = cursor + maxCharsPerLine;

        // If the look-ahead cursor points to a skippable character, move forward until a
        // valuable character (or the end of input) is found
        while(lookAheadCursor < inputLength && isSkippable(input[lookAheadCursor]))
            lookAheadCursor++;

        // Now locate last legal word-break at or before the look-ahead cursor
        const auto lastBreak = pBreakIterator->preceding(lookAheadCursor + 1);

        // If last legal word-break wasn't found since current cursor, perform a hard-break
        if(lastBreak <= cursor)
        {
            result.push_back(lookAheadCursor);
            cursor = lookAheadCursor;
            continue;
        }

        // Otherwise a legal word-break was found, so move there, find the next valuable
        // character and place the line start there
        cursor = lastBreak;
        while(cursor < inputLength && isSkippable(input[cursor]))
            cursor++;
        result.push_back(cursor);
    }

    // Text that needs no wrapping is reported as a single line starting at offset 0
    if(result.isEmpty())
        result.push_back(0);

    delete pBreakIterator;

    return result;
}
//-------------------------------------------------------------------------------
//
//  checkDictionary       This function handles all processing of characters in
//                        the "dictionary" set. It will determine the appropriate
//                        course of action, and possibly set up a cache in the
//                        process.
//
//                        NOTE: the dictionary-based implementation is currently
//                        compiled out by the "#if 1" below. The active code simply
//                        returns the proposed break unchanged: startPos when
//                        reverse is true, endPos otherwise. The full implementation
//                        is preserved in the "#else" branch.
//
//  startPos, endPos      The range being examined; one of the two is the proposed
//                        break, depending on the direction.
//  reverse               TRUE when iterating backwards (proposed break is startPos),
//                        FALSE when iterating forwards (proposed break is endPos).
//
//-------------------------------------------------------------------------------
int32_t BreakIterator::checkDictionary(int32_t startPos,
                            int32_t endPos,
                            UBool reverse) {
#if 1
    // Dictionary processing is disabled: accept the proposed break as-is.
    return reverse ? startPos : endPos;
#else
    // Disabled implementation, kept for reference; nothing below is compiled.

    // Reset the old break cache first.
    uint32_t dictionaryCount = fDictionaryCharCount;
    reset();

    // Nothing to refine when at most one dictionary character was seen or the
    // range is at most one position wide.
    if (dictionaryCount <= 1 || (endPos - startPos) <= 1) {
        return (reverse ? startPos : endPos);
    }

    // Starting from the starting point, scan towards the proposed result,
    // looking for the first dictionary character (which may be the one
    // we're on, if we're starting in the middle of a range).
    utext_setNativeIndex(fText, reverse ? endPos : startPos);
    if (reverse) {
        UTEXT_PREVIOUS32(fText);
    }

    int32_t rangeStart = startPos;
    int32_t rangeEnd = endPos;

    uint16_t    category;
    int32_t     current;
    UErrorCode  status = U_ZERO_ERROR;
    UStack      breaks(status);
    int32_t     foundBreakCount = 0;
    UChar32     c = utext_current32(fText);

    UTRIE_GET16(&fData->fTrie, c, category);

    // Is the character we're starting on a dictionary character? If so, we
    // need to back up to include the entire run; otherwise the results of
    // the break algorithm will differ depending on where we start. Since
    // the result is cached and there is typically a non-dictionary break
    // within a small number of words, there should be little performance impact.
    // (Bit 0x4000 of the category is the flag tested throughout this function
    // to recognize a dictionary character.)
    if (category & 0x4000) {
        if (reverse) {
            // Extend the range forwards to the end of the dictionary run
            do {
                utext_next32(fText);          // TODO:  recast to work directly with postincrement.
                c = utext_current32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            } while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            rangeEnd = (int32_t)UTEXT_GETNATIVEINDEX(fText);
            if (c == U_SENTINEL) {
                // c = fText->last32();
                //   TODO:  why was this if needed?
                c = UTEXT_PREVIOUS32(fText);
            }
            else {
                c = UTEXT_PREVIOUS32(fText);
            }
        }
        else {
            // Extend the range backwards to the start of the dictionary run
            do {
                c = UTEXT_PREVIOUS32(fText);
                UTRIE_GET16(&fData->fTrie, c, category);
            }
            while (c != U_SENTINEL && (category & 0x4000));
            // Back up to the last dictionary character
            if (c == U_SENTINEL) {
                // c = fText->first32();
                c = utext_current32(fText);
            }
            else {
                utext_next32(fText);
                c = utext_current32(fText);
            }
            rangeStart = (int32_t)UTEXT_GETNATIVEINDEX(fText);;
        }
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // Loop through the text, looking for ranges of dictionary characters.
    // For each span, find the appropriate break engine, and ask it to find
    // any breaks within the span.
    // Note: we always do this in the forward direction, so that the break
    // cache is built in the right order.
    if (reverse) {
        utext_setNativeIndex(fText, rangeStart);
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }
    while(U_SUCCESS(status)) {
        // Skip forward over non-dictionary characters inside the range
        while((current = (int32_t)UTEXT_GETNATIVEINDEX(fText)) < rangeEnd && (category & 0x4000) == 0) {
            utext_next32(fText);           // TODO:  tweak for post-increment operation
            c = utext_current32(fText);
            UTRIE_GET16(&fData->fTrie, c, category);
        }
        if (current >= rangeEnd) {
            break;
        }

        // We now have a dictionary character. Get the appropriate language object
        // to deal with it.
        const LanguageBreakEngine *lbe = getLanguageBreakEngine(c);

        // Ask the language object if there are any breaks. It will leave the text
        // pointer on the other side of its range, ready to search for the next one.
        if (lbe != NULL) {
            foundBreakCount += lbe->findBreaks(fText, rangeStart, rangeEnd, FALSE, fBreakType, breaks);
        }

        // Reload the loop variables for the next go-round
        c = utext_current32(fText);
        UTRIE_GET16(&fData->fTrie, c, category);
    }

    // If we found breaks, build a new break cache. The first and last entries must
    // be the original starting and ending position.
    if (foundBreakCount > 0) {
        int32_t totalBreaks = foundBreakCount;
        if (startPos < breaks.elementAti(0)) {
            totalBreaks += 1;
        }
        if (endPos > breaks.peeki()) {
            totalBreaks += 1;
        }
        fCachedBreakPositions = (int32_t *)uprv_malloc(totalBreaks * sizeof(int32_t));
        if (fCachedBreakPositions != NULL) {
            int32_t out = 0;
            fNumCachedBreakPositions = totalBreaks;
            if (startPos < breaks.elementAti(0)) {
                fCachedBreakPositions[out++] = startPos;
            }
            for (int32_t i = 0; i < foundBreakCount; ++i) {
                fCachedBreakPositions[out++] = breaks.elementAti(i);
            }
            if (endPos > fCachedBreakPositions[out-1]) {
                fCachedBreakPositions[out] = endPos;
            }
            // If there are breaks, then by definition, we are replacing the original
            // proposed break by one of the breaks we found. Use following() and
            // preceding() to do the work. They should never recurse in this case.
            if (reverse) {
                return preceding(endPos - 1);
            }
            else {
                return following(startPos);
            }
        }
        // If the allocation failed, just fall through to the "no breaks found" case.
    }

    // If we get here, there were no language-based breaks. Set the text pointer
    // to the original proposed break.
    utext_setNativeIndex(fText, reverse ? startPos : endPos);
    return (reverse ? startPos : endPos);
#endif
}