/**
 * Split a text into tokens and reduce it to a CFStringRef of hashes, where
 * each Unicode character represents one token (or boundary between tokens).
 *
 * Every part of the text is handed to diff_mungeTokenForRange(): the ranges
 * reported by the tokenizer, any gap the tokenizer leaves between two tokens,
 * and any remainder after the last token. If no tokenizer can be created, or
 * it reports no token at index 0, the whole text is munged as a single range
 * so that no input is silently dropped.
 *
 * @param text CFString to encode.
 * @param tokenArray CFMutableArray of unique strings.
 * @param tokenHash Map of strings to indices.
 * @param tokenizerOptions Options passed unchanged to CFStringTokenizerCreate();
 *        they select what a token is (word, sentence, paragraph or line).
 * @return Encoded CFStringRef. The string is created here and not released,
 *         so the caller owns it and must CFRelease() it.
 */
CFStringRef diff_tokensToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash, CFOptionFlags tokenizerOptions) {
  
  CFMutableStringRef chars = CFStringCreateMutable(kCFAllocatorDefault, 0);
  
  CFIndex textLength = CFStringGetLength(text);
  CFRange tokenizerRange = CFRangeMake(0, textLength);
  
  // No locale is supplied (NULL) to the tokenizer.
  CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, text, tokenizerRange, tokenizerOptions, NULL);
  
  // Set tokenizer to the start of the string. A missing tokenizer is treated
  // like "no tokens", which lets the remainder handling below take over.
  CFStringTokenizerTokenType tokenType = kCFStringTokenizerTokenNone;
  if (tokenizer != NULL) {
    tokenType = CFStringTokenizerGoToTokenAtIndex(tokenizer, 0);
  }
  
  // Walk the text, munging a range for each token (or boundary between tokens).
  // prevTokenRangeMax is the index just past the last range munged so far.
  CFIndex prevTokenRangeMax = 0;
  while (tokenType != kCFStringTokenizerTokenNone) {
    CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);
    
    if (tokenRange.location > prevTokenRangeMax) {
      // This probably is a bug in the tokenizer: for some reason, gaps in the tokenization can appear.
      // One particular example is the tokenizer skipping a line feed ('\n') directly after a string of Chinese characters.
      CFRange gapRange = CFRangeMake(prevTokenRangeMax, (tokenRange.location - prevTokenRangeMax));
      diff_mungeTokenForRange(text, gapRange, chars, tokenHash, tokenArray);
    }
    
    diff_mungeTokenForRange(text, tokenRange, chars, tokenHash, tokenArray);
    prevTokenRangeMax = (tokenRange.location + tokenRange.length);
    
    tokenType = CFStringTokenizerAdvanceToNextToken(tokenizer);
  }
  
  // The same kind of gap can occur after the last token; without this the
  // tail of the text would be missing from the encoded result.
  if (prevTokenRangeMax < textLength) {
    CFRange tailRange = CFRangeMake(prevTokenRangeMax, (textLength - prevTokenRangeMax));
    diff_mungeTokenForRange(text, tailRange, chars, tokenHash, tokenArray);
  }
  
  // CFRelease() must not be called with NULL.
  if (tokenizer != NULL) {
    CFRelease(tokenizer);
  }
  
  return chars;
  
}
/**
 * Split a text into a list of strings.   Reduce the texts to a CFStringRef of
 * hashes where where each Unicode character represents one token (or boundary between tokens).
 * @param text CFString to encode.
 * @param lineArray CFMutableArray of unique strings.
 * @param lineHash Map of strings to indices.
 * @return Encoded CFStringRef.
 */
CFStringRef diff_tokensToCharsMungeCFStringCreate(CFStringRef text, CFMutableArrayRef tokenArray, CFMutableDictionaryRef tokenHash, CFOptionFlags tokenizerOptions) {
  
  CFStringRef token;
  CFMutableStringRef chars = CFStringCreateMutable(kCFAllocatorDefault, 0);
  
  CFIndex textLength = CFStringGetLength(text);
  
  //CFLocaleRef currentLocale = CFLocaleCopyCurrent();
  
  CFRange tokenizerRange = CFRangeMake(0, textLength);
  
  CFStringTokenizerRef tokenizer = CFStringTokenizerCreate(kCFAllocatorDefault, text, tokenizerRange, tokenizerOptions, NULL);
  
  //CFRelease(currentLocale);
  
  // Set tokenizer to the start of the string. 
  CFStringTokenizerTokenType mask = CFStringTokenizerGoToTokenAtIndex(tokenizer, 0);
  
  // Walk the text, pulling out a substring for each token (or boundary between tokens). 
  // A token is either a word, sentence, paragraph or line depending on what tokenizerOptions is set to. 
  CFRange tokenRange;
  while (mask != kCFStringTokenizerTokenNone) {
    tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);
    
    token = diff_CFStringCreateSubstring(text, tokenRange.location, tokenRange.length);
    diff_mungeHelper(token, tokenArray, tokenHash, chars);
    CFRelease(token);
    
    mask = CFStringTokenizerAdvanceToNextToken(tokenizer);
  }
  
  CFRelease(tokenizer);
  
  return chars;
  
}