void
NS_GetComplexLineBreaks(const PRUnichar* aText, uint32_t aLength,
                        uint8_t* aBreakBefore)
{
  NS_ASSERTION(aText, "aText shouldn't be null");

  memset(aBreakBefore, 0, aLength * sizeof(uint8_t));

  CFStringRef str = ::CFStringCreateWithCharactersNoCopy(kCFAllocatorDefault, reinterpret_cast<const UniChar*>(aText), aLength, kCFAllocatorNull);
  if (!str) {
    return;
  }

  CFStringTokenizerRef st = ::CFStringTokenizerCreate(kCFAllocatorDefault, str, ::CFRangeMake(0, aLength), kCFStringTokenizerUnitLineBreak, NULL);
  if (!st) {
    ::CFRelease(str);
    return;
  }

  CFStringTokenizerTokenType tt = ::CFStringTokenizerAdvanceToNextToken(st);
  while (tt != kCFStringTokenizerTokenNone) {
    CFRange r = ::CFStringTokenizerGetCurrentTokenRange(st);
    if (r.location != 0) { // Ignore leading edge
      aBreakBefore[r.location] = true;
    }
    tt = CFStringTokenizerAdvanceToNextToken(st);
  }

  ::CFRelease(st);
  ::CFRelease(str);
}
/**
 * Split a text into a list of strings.   Reduce the texts to a CFStringRef of
 * hashes where each Unicode character represents one token (or boundary between tokens).
 *
 * Every part of the text is encoded: stretches the tokenizer skips between
 * tokens, and any remainder after the last token, are encoded as tokens of
 * their own. If no tokenizer can be created, or it reports no token at
 * index 0, the whole text is encoded as a single token.
 *
 * @param text CFString to encode.
 * @param tokenArray CFMutableArray of unique strings.
 * @param tokenHash Map of strings to indices.
 * @param tokenizerOptions Options passed to CFStringTokenizerCreate; they select
 *     whether a token is a word, sentence, paragraph or line.
 * @return Encoded CFStringRef. It is created here and not released, so the
 *     caller is responsible for releasing it.
 */
CFStringRef diff_tokensToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash, CFOptionFlags tokenizerOptions) {
  
  CFMutableStringRef chars = CFStringCreateMutable(kCFAllocatorDefault, 0);
  
  CFIndex textLength = CFStringGetLength(text);
  
  CFRange tokenizerRange = CFRangeMake(0, textLength);
  
  CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, text, tokenizerRange, tokenizerOptions, NULL);
  
  // End (exclusive) of the part of the text that has been encoded so far.
  CFIndex prevTokenRangeMax = 0;
  
  // Guard against a failed tokenizer creation: the tokenizer calls and
  // CFRelease() below must not be handed NULL.
  if (tokenizer != NULL) {
    // Set tokenizer to the start of the string.
    CFStringTokenizerTokenType tokenType = CFStringTokenizerGoToTokenAtIndex(tokenizer, 0);
    
    // Walk the text, pulling out a substring for each token (or boundary between tokens).
    // A token is either a word, sentence, paragraph or line depending on what tokenizerOptions is set to.
    while (tokenType != kCFStringTokenizerTokenNone) {
      CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);
      
      if (tokenRange.location > prevTokenRangeMax) {
        // This probably is a bug in the tokenizer: for some reason, gaps in the tokenization can appear.
        // One particular example is the tokenizer skipping a line feed ('\n') directly after a string of Chinese characters
        CFRange gapRange = CFRangeMake(prevTokenRangeMax, (tokenRange.location - prevTokenRangeMax));
        diff_mungeTokenForRange(text, gapRange, chars, tokenHash, tokenArray);
      }
      
      diff_mungeTokenForRange(text, tokenRange, chars, tokenHash, tokenArray);
      
      prevTokenRangeMax = (tokenRange.location + tokenRange.length);
      
      tokenType = CFStringTokenizerAdvanceToNextToken(tokenizer);
    }
    
    CFRelease(tokenizer);
  }
  
  if (prevTokenRangeMax < textLength) {
    // The same kind of gap can occur after the last token (or cover the whole
    // text when no token was produced). Encode the remainder so that the
    // encoded string still accounts for every character of the input.
    CFRange tailRange = CFRangeMake(prevTokenRangeMax, (textLength - prevTokenRangeMax));
    diff_mungeTokenForRange(text, tailRange, chars, tokenHash, tokenArray);
  }
  
  return chars;
  
}
/**
 * Split a text into a list of strings.   Reduce the texts to a CFStringRef of
 * hashes where each Unicode character represents one token (or boundary between tokens).
 *
 * Every part of the text is encoded: stretches the tokenizer skips between
 * tokens, and any remainder after the last token, are encoded as tokens of
 * their own. If no tokenizer can be created, or it reports no token at
 * index 0, the whole text is encoded as a single token.
 *
 * @param text CFString to encode.
 * @param tokenArray CFMutableArray of unique strings.
 * @param tokenHash Map of strings to indices.
 * @param tokenizerOptions Options passed to CFStringTokenizerCreate; they select
 *     whether a token is a word, sentence, paragraph or line.
 * @return Encoded CFStringRef. It is created here and not released, so the
 *     caller is responsible for releasing it.
 */
CFStringRef diff_tokensToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash, CFOptionFlags tokenizerOptions) {
  
  CFMutableStringRef chars = CFStringCreateMutable(kCFAllocatorDefault, 0);
  
  CFIndex textLength = CFStringGetLength(text);
  
  CFRange tokenizerRange = CFRangeMake(0, textLength);
  
  CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, text, tokenizerRange, tokenizerOptions, NULL);
  
  // End (exclusive) of the part of the text that has been encoded so far.
  CFIndex encodedEnd = 0;
  
  // Guard against a failed tokenizer creation: the tokenizer calls and
  // CFRelease() below must not be handed NULL.
  if (tokenizer != NULL) {
    // Set tokenizer to the start of the string.
    CFStringTokenizerTokenType mask = CFStringTokenizerGoToTokenAtIndex(tokenizer, 0);
    
    // Walk the text, pulling out a substring for each token (or boundary between tokens).
    // A token is either a word, sentence, paragraph or line depending on what tokenizerOptions is set to.
    while (mask != kCFStringTokenizerTokenNone) {
      CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);
      CFStringRef token;
      
      if (tokenRange.location > encodedEnd) {
        // The tokenizer skipped some characters before this token. Encode
        // them as their own token so that no part of the input is lost.
        CFStringRef gap = diff_CFStringCreateSubstring(text, encodedEnd, (tokenRange.location - encodedEnd));
        diff_mungeHelper(gap, tokenArray, tokenHash, chars);
        CFRelease(gap);
      }
      
      token = diff_CFStringCreateSubstring(text, tokenRange.location, tokenRange.length);
      diff_mungeHelper(token, tokenArray, tokenHash, chars);
      CFRelease(token);
      
      encodedEnd = (tokenRange.location + tokenRange.length);
      
      mask = CFStringTokenizerAdvanceToNextToken(tokenizer);
    }
    
    CFRelease(tokenizer);
  }
  
  if (encodedEnd < textLength) {
    // Text left over after the last token (or the whole text when no token
    // was produced) is encoded as one final token.
    CFStringRef tail = diff_CFStringCreateSubstring(text, encodedEnd, (textLength - encodedEnd));
    diff_mungeHelper(tail, tokenArray, tokenHash, chars);
    CFRelease(tail);
  }
  
  return chars;
  
}