void NS_GetComplexLineBreaks(const PRUnichar* aText, uint32_t aLength, uint8_t* aBreakBefore) { NS_ASSERTION(aText, "aText shouldn't be null"); memset(aBreakBefore, 0, aLength * sizeof(uint8_t)); CFStringRef str = ::CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, reinterpret_cast<const UniChar*>(aText), aLength, kCFAllocatorNull); if (!str) { return; } CFStringTokenizerRef st = ::CFStringTokenizerCreate(kCFAllocatorDefault, str, ::CFRangeMake(0, aLength), kCFStringTokenizerUnitLineBreak, NULL); if (!st) { ::CFRelease(str); return; } CFStringTokenizerTokenType tt = ::CFStringTokenizerAdvanceToNextToken(st); while (tt != kCFStringTokenizerTokenNone) { CFRange r = ::CFStringTokenizerGetCurrentTokenRange(st); if (r.location != 0) { // Ignore leading edge aBreakBefore[r.location] = true; } tt = CFStringTokenizerAdvanceToNextToken(st); } ::CFRelease(st); ::CFRelease(str); }
/**
 * Split a text into a list of strings. Reduce the text to a CFStringRef of
 * hashes where each Unicode character represents one token (or the text
 * found between two tokens).
 *
 * The text is walked with a CFStringTokenizer created with tokenizerOptions.
 * Each token range, each gap in front of a token, and any text left over
 * after the last token is passed to diff_mungeTokenForRange() together with
 * chars, tokenHash and tokenArray, so the ranges handed to that helper cover
 * the whole text exactly once and in order.
 * NOTE(review): diff_mungeTokenForRange() is defined elsewhere; presumably it
 * appends one character per range to chars and records the substring in
 * tokenArray/tokenHash - confirm against its definition.
 *
 * If the tokenizer cannot be created, the whole text is handed to the helper
 * as a single range.
 *
 * @param text CFString to encode.
 * @param tokenArray CFMutableArray of unique strings.
 * @param tokenHash Map of strings to indices.
 * @param tokenizerOptions Options passed unchanged to CFStringTokenizerCreate;
 *     they decide whether a token is a word, sentence, paragraph or line.
 * @return Encoded CFStringRef. It is created in this function and not
 *     released here, so the caller is responsible for releasing it.
 */
CFStringRef diff_tokensToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash, CFOptionFlags tokenizerOptions) {
  CFMutableStringRef chars = CFStringCreateMutable(kCFAllocatorDefault, 0);

  CFIndex textLength = CFStringGetLength(text);

  //CFLocaleRef currentLocale = CFLocaleCopyCurrent();

  CFRange tokenizerRange = CFRangeMake(0, textLength);

  CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, text, tokenizerRange, tokenizerOptions, NULL);

  //CFRelease(currentLocale);

  // End (exclusive) of the part of the text that has been encoded so far.
  CFIndex prevTokenRangeMax = 0;

  // CFRelease() must not be called with NULL, and the tokenizer functions
  // need a valid tokenizer, so skip the walk entirely if creation failed.
  if (tokenizer != NULL) {
    // Set tokenizer to the start of the string.
    CFStringTokenizerTokenType tokenType = CFStringTokenizerGoToTokenAtIndex(tokenizer, 0);

    // Walk the text, pulling out a substring for each token (or boundary between tokens).
    // A token is either a word, sentence, paragraph or line depending on what tokenizerOptions is set to.
    while (tokenType != kCFStringTokenizerTokenNone) {
      CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);

      if (tokenRange.location > prevTokenRangeMax) {
        // This probably is a bug in the tokenizer: for some reason, gaps in the tokenization can appear.
        // One particular example is the tokenizer skipping a line feed ('\n') directly after a string of Chinese characters
        CFRange gapRange = CFRangeMake(prevTokenRangeMax, (tokenRange.location - prevTokenRangeMax));
        diff_mungeTokenForRange(text, gapRange, chars, tokenHash, tokenArray);
      }

      diff_mungeTokenForRange(text, tokenRange, chars, tokenHash, tokenArray);

      prevTokenRangeMax = (tokenRange.location + tokenRange.length);

      tokenType = CFStringTokenizerAdvanceToNextToken(tokenizer);
    }

    CFRelease(tokenizer);
  }

  if (prevTokenRangeMax < textLength) {
    // The same kind of gap can occur after the last token (or the tokenizer
    // may have produced no tokens at all). Encode the remaining text too, so
    // that nothing from the input is lost.
    CFRange trailingRange = CFRangeMake(prevTokenRangeMax, (textLength - prevTokenRangeMax));
    diff_mungeTokenForRange(text, trailingRange, chars, tokenHash, tokenArray);
  }

  return chars;
}
/**
 * Encode one range of the text: create the substring for the range, pass it
 * to diff_mungeHelper() and release the substring again.
 */
static void diff_encodeRangeAsToken(CFStringRef text, CFRange range, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash, CFMutableStringRef chars) {
  CFStringRef token = diff_CFStringCreateSubstring(text, range.location, range.length);
  diff_mungeHelper(token, tokenArray, tokenHash, chars);
  CFRelease(token);
}

/**
 * Split a text into a list of strings. Reduce the text to a CFStringRef of
 * hashes where each Unicode character represents one token (or the text
 * found between two tokens).
 *
 * The text is walked with a CFStringTokenizer created with tokenizerOptions.
 * A substring is created for each token range, for each gap in front of a
 * token, and for any text left over after the last token; each substring is
 * passed to diff_mungeHelper() together with tokenArray, tokenHash and chars.
 * The substrings therefore cover the whole text exactly once and in order.
 * NOTE(review): diff_mungeHelper() is defined elsewhere; presumably it
 * appends one character per substring to chars and records the substring in
 * tokenArray/tokenHash - confirm against its definition.
 *
 * If the tokenizer cannot be created, the whole text is encoded as a single
 * substring.
 *
 * @param text CFString to encode.
 * @param tokenArray CFMutableArray of unique strings.
 * @param tokenHash Map of strings to indices.
 * @param tokenizerOptions Options passed unchanged to CFStringTokenizerCreate;
 *     they decide whether a token is a word, sentence, paragraph or line.
 * @return Encoded CFStringRef. It is created in this function and not
 *     released here, so the caller is responsible for releasing it.
 */
CFStringRef diff_tokensToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash, CFOptionFlags tokenizerOptions) {
  CFMutableStringRef chars = CFStringCreateMutable(kCFAllocatorDefault, 0);

  CFIndex textLength = CFStringGetLength(text);

  //CFLocaleRef currentLocale = CFLocaleCopyCurrent();

  CFRange tokenizerRange = CFRangeMake(0, textLength);

  CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, text, tokenizerRange, tokenizerOptions, NULL);

  //CFRelease(currentLocale);

  // End (exclusive) of the part of the text that has been encoded so far.
  CFIndex encodedEnd = 0;

  // CFRelease() must not be called with NULL, and the tokenizer functions
  // need a valid tokenizer, so skip the walk entirely if creation failed.
  if (tokenizer != NULL) {
    // Set tokenizer to the start of the string.
    CFStringTokenizerTokenType tokenType = CFStringTokenizerGoToTokenAtIndex(tokenizer, 0);

    // Walk the text, pulling out a substring for each token (or boundary between tokens).
    // A token is either a word, sentence, paragraph or line depending on what tokenizerOptions is set to.
    while (tokenType != kCFStringTokenizerTokenNone) {
      CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);

      if (tokenRange.location > encodedEnd) {
        // The tokenizer skipped some text in front of this token. Encode the
        // skipped text as a token of its own so that it is not lost.
        CFRange gapRange = CFRangeMake(encodedEnd, (tokenRange.location - encodedEnd));
        diff_encodeRangeAsToken(text, gapRange, tokenArray, tokenHash, chars);
      }

      diff_encodeRangeAsToken(text, tokenRange, tokenArray, tokenHash, chars);

      encodedEnd = (tokenRange.location + tokenRange.length);

      tokenType = CFStringTokenizerAdvanceToNextToken(tokenizer);
    }

    CFRelease(tokenizer);
  }

  if (encodedEnd < textLength) {
    // Text after the last token (or the whole text, if there were no tokens)
    // still has to be encoded.
    CFRange trailingRange = CFRangeMake(encodedEnd, (textLength - encodedEnd));
    diff_encodeRangeAsToken(text, trailingRange, tokenArray, tokenHash, chars);
  }

  return chars;
}