Example #1
1
//----------------------------------------------------------------------------------------
//
//   ubrk_openRules      open a break iterator from a set of break rules.
//                       Invokes the rule builder.
//
//----------------------------------------------------------------------------------------
U_CAPI UBreakIterator* U_EXPORT2
ubrk_openRules(  const UChar        *rules,
                       int32_t       rulesLength,
                 const UChar        *text,
                       int32_t       textLength,
                       UParseError  *parseErr,
                       UErrorCode   *status)  {

    if (status == NULL || U_FAILURE(*status)){
        return 0;
    }

    BreakIterator *result = 0;
    UnicodeString ruleString(rules, rulesLength);
    result = RBBIRuleBuilder::createRuleBasedBreakIterator(ruleString, *parseErr, *status);
    if(U_FAILURE(*status)) {
        return 0;
    }

    if (text != NULL) {
        UCharCharacterIterator *iter = 0;
        iter = new UCharCharacterIterator(text, textLength);
        if(iter == 0) {
            *status = U_MEMORY_ALLOCATION_ERROR;
            delete result;
            return 0;
        }
        result->adoptText(iter);
    }
    return (UBreakIterator *)result;
}
Example #2
0
void ubrk_refreshUText(UBreakIterator *bi,
                       UText          *text,
                       UErrorCode     *status)
{
    BreakIterator *bii = reinterpret_cast<BreakIterator *>(bi);
    bii->refreshInputText(text, *status);
}
Example #3
0
/*
 * Return the next break, counting words and spaces.
 */
int32_t SpaceBreakIterator::next()
{
    if (fDone) {
        return BreakIterator::DONE;
    }
    
    int32_t nextBreak;
    do {
        nextBreak = fBreakIter->next();
        
        if (nextBreak == BreakIterator::DONE) {
            fDone = TRUE;
            return BreakIterator::DONE;
        }
    }
    while(nextBreak > 0 && fComplexContext.contains(fText[nextBreak-1])
            && fComplexContext.contains(fText[nextBreak]));
    
   int32_t result = nextBreak - fSpaceCount;
    
    if (nextBreak < fTextCount) {
        if (fText[nextBreak] == 0x0020 /*Unicode::isSpaceChar(fText[nextBreak])*/) {
            fSpaceCount += fBreakIter->next() - nextBreak;
        }
    }
    
    fWordCount += 1;

    return result;
}
U_NAMESPACE_USE

/* functions available in the common library (for unistr_case.cpp) */

/* public API functions */

U_CAPI int32_t U_EXPORT2
u_strToTitle(UChar *dest, int32_t destCapacity,
             const UChar *src, int32_t srcLength,
             UBreakIterator *titleIter,
             const char *locale,
             UErrorCode *pErrorCode) {
    LocalPointer<BreakIterator> ownedIter;
    BreakIterator *iter;
    if(titleIter!=NULL) {
        iter=reinterpret_cast<BreakIterator *>(titleIter);
    } else {
        iter=BreakIterator::createWordInstance(Locale(locale), *pErrorCode);
        ownedIter.adoptInstead(iter);
    }
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    UnicodeString s(srcLength<0, src, srcLength);
    iter->setText(s);
    return ustrcase_mapWithOverlap(
        ustrcase_getCaseLocale(locale), 0, iter,
        dest, destCapacity,
        src, srcLength,
        ustrcase_internalToTitle, *pErrorCode);
}
Example #5
0
U_DRAFT void U_EXPORT2
ubrk_setUText(UBreakIterator *bi,
             UText          *text,
             UErrorCode     *status)
{
    BreakIterator *brit = (BreakIterator *)bi;
    brit->setText(text, *status);
}
Example #6
0
static void _breakiterator_rewind(zend_object_iterator *iter)
{
	BreakIterator *biter = _breakiter_prolog(iter);
	zoi_with_current *zoi_iter = (zoi_with_current*)iter;

	int32_t pos = biter->first();
	ZVAL_LONG(&zoi_iter->current, (zend_long)pos);
}
Example #7
0
//----------------------------------------------------------------------------------------
//
//    ubrk_open      Create a canned type of break iterator based on type (word, line, etc.)
//                   and locale.
//
//----------------------------------------------------------------------------------------
U_CAPI UBreakIterator* U_EXPORT2
ubrk_open(UBreakIteratorType type,
      const char *locale,
      const UChar *text,
      int32_t textLength,
      UErrorCode *status)
{

  if(U_FAILURE(*status)) return 0;

  BreakIterator *result = 0;

  switch(type) {

  case UBRK_CHARACTER:
    result = BreakIterator::createCharacterInstance(Locale(locale), *status);
    break;

  case UBRK_WORD:
    result = BreakIterator::createWordInstance(Locale(locale), *status);
    break;

  case UBRK_LINE:
    result = BreakIterator::createLineInstance(Locale(locale), *status);
    break;

  case UBRK_SENTENCE:
    result = BreakIterator::createSentenceInstance(Locale(locale), *status);
    break;

  case UBRK_TITLE:
    result = BreakIterator::createTitleInstance(Locale(locale), *status);
    break;
  }

  // check for allocation error
  if (U_FAILURE(*status)) {
     return 0;
  }
  if(result == 0) {
    *status = U_MEMORY_ALLOCATION_ERROR;
    return 0;
  }

  if (text != NULL) {
      UCharCharacterIterator *iter = 0;
      iter = new UCharCharacterIterator(text, textLength);
      if(iter == 0) {
          *status = U_MEMORY_ALLOCATION_ERROR;
          delete result;
          return 0;
      }
      result->adoptText(iter);
  }

  return (UBreakIterator*)result;
}
Example #8
0
/*
 * This method does the acutal break comparison and reports the results.
 * It uses a SpaceBreakIterator to iterate over the text with spaces,
 * and a word instance of a Thai BreakIterator to iterate over the text
 * without spaces.
 */
UBool ThaiWordbreakTest::compareWordBreaks(const UChar *spaces, int32_t spaceCount,
                                           const UChar *noSpaces, int32_t noSpaceCount)
{
    UBool result = TRUE;
    Locale thai("th");
    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(noSpaces, noSpaceCount);
    UErrorCode status = U_ZERO_ERROR;
    
    BreakIterator *breakIter = BreakIterator::createWordInstance(thai, status);
    breakIter->adoptText(noSpaceIter);
    
    SpaceBreakIterator spaceIter(spaces, spaceCount);
    
    int32_t nextBreak = 0;
    int32_t nextSpaceBreak = 0;
    int32_t iterCount = 0;
    
    while (TRUE) {
        nextSpaceBreak = spaceIter.next();
        nextBreak = breakIter->next();
        
        if (nextSpaceBreak == BreakIterator::DONE || nextBreak == BreakIterator::DONE) {
            if (nextBreak != BreakIterator::DONE) {
                fprintf(stderr, "break iterator didn't end.\n");
            } else if (nextSpaceBreak != BreakIterator::DONE) {
                fprintf(stderr, "premature break iterator end.\n");
            }
            
            break;
        }
        
        while (nextSpaceBreak != nextBreak &&
               nextSpaceBreak != BreakIterator::DONE && nextBreak != BreakIterator::DONE) {
            if (nextSpaceBreak < nextBreak) {
                breakNotFound(nextSpaceBreak);
                result = FALSE;
                nextSpaceBreak = spaceIter.next();
            } else if (nextSpaceBreak > nextBreak) {
                foundInvalidBreak(nextBreak);
                result = FALSE;
                nextBreak = breakIter->next();
            }
        }
        
        if (fVerbose) {
            printf("%d   %d\n", nextSpaceBreak, nextBreak);
        }
    }
        
   
    fWordCount = spaceIter.getWordCount();
    
    delete breakIter;

    return result;
}
Example #9
0
/* Print each element in reverse order: */
void printEachBackward( BreakIterator& boundary)
{
    int32_t end = boundary.last();
    for (int32_t start = boundary.previous();
         start != BreakIterator::DONE;
         end = start, start = boundary.previous())
    {
        printTextRange( boundary, start, end );
    }
}
Example #10
0
/* Print each element in order: */
void printEachForward( BreakIterator& boundary)
{
    int32_t start = boundary.first();
    for (int32_t end = boundary.next();
         end != BreakIterator::DONE;
         start = end, end = boundary.next())
    {
        printTextRange( boundary, start, end );
    }
}
Example #11
0
/* Creating and using text boundaries */
int main( void )
{
    puts("ICU Break Iterator Sample Program\n");
    puts("C++ Break Iteration\n");
    BreakIterator* boundary;
    UnicodeString stringToExamine("Aaa bbb ccc. Ddd eee fff.");
    printf("Examining: ");
    printUnicodeString(stringToExamine);
    puts("");

    //print each sentence in forward and reverse order
    UErrorCode status = U_ZERO_ERROR;
    boundary = BreakIterator::createSentenceInstance(
        Locale::getUS(), status );
    if (U_FAILURE(status)) {
        printf("failed to create sentence break iterator.  status = %s", 
            u_errorName(status));
        exit(1);
    }

    boundary->setText(stringToExamine);
    puts("\n Sentence Boundaries... ");
    puts("----- forward: -----------");
    printEachForward(*boundary);
    puts("----- backward: ----------");
    printEachBackward(*boundary);
    delete boundary;

    //print each word in order
    printf("\n Word Boundaries... \n");
    boundary = BreakIterator::createWordInstance(
        Locale::getUS(), status);
    boundary->setText(stringToExamine);
    puts("----- forward: -----------");
    printEachForward(*boundary);
    //print first element
    puts("----- first: -------------");
    printFirst(*boundary);
    //print last element
    puts("----- last: --------------");
    printLast(*boundary);
    //print word at charpos 10
    puts("----- at pos 10: ---------");
    printAt(*boundary, 10 );

    delete boundary;

    puts("\nEnd C++ Break Iteration");

    // Call the C version
    return c_main();
}
Example #12
0
U_CAPI void U_EXPORT2
ubrk_close(UBreakIterator *bi)
{
    BreakIterator *ubi = (BreakIterator*) bi;
    if (ubi) {
        if (ubi->isBufferClone()) {
            ubi->~BreakIterator();
            *(uint32_t *)ubi = 0xdeadbeef;
        } else {
            delete ubi;
        }
    }
}
Example #13
0
U_CAPI void U_EXPORT2
ubrk_setText(UBreakIterator* bi,
             const UChar*    text,
             int32_t         textLength,
             UErrorCode*     status)
{
    BreakIterator *brit = (BreakIterator *)bi;
    UText  ut = UTEXT_INITIALIZER;
    utext_openUChars(&ut, text, textLength, status);
    brit->setText(&ut, *status);
    // A stack allocated UText wrapping a UCHar * string
    //   can be dumped without explicitly closing it.
}
Example #14
0
static void _breakiterator_move_forward(zend_object_iterator *iter)
{
	BreakIterator *biter = _breakiter_prolog(iter);
	zoi_with_current *zoi_iter = (zoi_with_current*)iter;

	iter->funcs->invalidate_current(iter);

	if (biter == NULL) {
		return;
	}

	int32_t pos = biter->next();
	if (pos != BreakIterator::DONE) {
		ZVAL_LONG(&zoi_iter->current, (zend_long)pos);
	} //else we've reached the end of the enum, nothing more is required
}
Example #15
0
/*
 * Generate a text file with spaces in it from a file without.
 */
int generateFile(const UChar *chars, int32_t length) {
    Locale root("");
    UCharCharacterIterator *noSpaceIter = new UCharCharacterIterator(chars, length);
    UErrorCode status = U_ZERO_ERROR;
    
    UnicodeSet complexContext(UNICODE_STRING_SIMPLE("[:LineBreak=SA:]"), status);
    BreakIterator *breakIter = BreakIterator::createWordInstance(root, status);
    breakIter->adoptText(noSpaceIter);
    char outbuf[1024];
    int32_t strlength;
    UChar bom = 0xFEFF;
    
    printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &bom, 1, &status));
    int32_t prevbreak = 0;
    while (U_SUCCESS(status)) {
        int32_t nextbreak = breakIter->next();
        if (nextbreak == BreakIterator::DONE) {
            break;
        }
        printf("%s", u_strToUTF8(outbuf, sizeof(outbuf), &strlength, &chars[prevbreak],
                                    nextbreak-prevbreak, &status));
        if (nextbreak > 0 && complexContext.contains(chars[nextbreak-1])
            && complexContext.contains(chars[nextbreak])) {
            printf(" ");
        }
        prevbreak = nextbreak;
    }
    
    if (U_FAILURE(status)) {
        fprintf(stderr, "generate failed: %s\n", u_errorName(status));
        return status;
    }
    else {
        return 0;
    }
}
Example #16
0
void printTextRange( BreakIterator& iterator, 
                    int32_t start, int32_t end )
{
    CharacterIterator *strIter = iterator.getText().clone();
    UnicodeString  s;
    strIter->getText(s);

    printf(" %ld %ld\t", (long)start, (long)end);
    printUnicodeString(UnicodeString(s, 0, start));
    printf("|");
    printUnicodeString(UnicodeString(s, start, end-start));
    printf("|");
    printUnicodeString(UnicodeString(s, end));
    puts("");
    delete strIter;
}
Example #17
0
void RBBIAPITest::doBoundaryTest(BreakIterator& bi, UnicodeString& text, int32_t *boundaries){
     logln((UnicodeString)"testIsBoundary():");
        int32_t p = 0;
        UBool isB;
        for (int32_t i = 0; i < text.length(); i++) {
            isB = bi.isBoundary(i);
            logln((UnicodeString)"bi.isBoundary(" + i + ") -> " + isB);

            if (i == boundaries[p]) {
                if (!isB)
                    errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected true, got false");
                p++;
            }
            else {
                if (isB)
                    errln((UnicodeString)"Wrong result from isBoundary() for " + i + (UnicodeString)": expected false, got true");
            }
        }
}
void GetWordBoundaryPositions(const FunctionCallbackInfo<Value>& args) {
    Isolate* isolate = Isolate::GetCurrent();
    HandleScope scope(isolate);

    if (args.Length() != 2) {
        isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "must supply locale and text")));
        return;
    }

    if (!args[0]->IsString()) {
        isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "text is not specified")));
        return;
    }

    if (!args[1]->IsString()) {
        isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "locale is not specified")));
        return;
    }

    // convert v8 locale to ICU
    String::Utf8Value locale(args[1]->ToString());
    const char* country = strtok(*locale, "_"), *language = strtok(NULL, "_");
    Locale icuLocale(language, country);

    // create the BreakIterator instance
    UErrorCode err = U_ZERO_ERROR;
    BreakIterator *iterator = BreakIterator::createWordInstance(icuLocale, err);
    if (U_FAILURE(err)) {
        ErrorCode errCode;
        errCode.set(err);
        isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, errCode.errorName())));
        return;
    }

    // Convert v8 text to ICU Unicode value
    Local<String> textStr = args[0]->ToString();
    String::Utf8Value textValue(textStr);
    UnicodeString uTextValue(*textValue, "UTF-8");
    if (uTextValue.isBogus()) {
        isolate->ThrowException(Exception::TypeError(String::NewFromUtf8(isolate, "unable to create unicode string")));
        return;
    }
    iterator->setText(uTextValue);

    // populate boundaries
    Local<Array> results = Array::New(isolate);
    int32_t arrayPosition = 0;
    int32_t currentBoundary = iterator->first();
    int32_t previousBoundary = 0;

    while (currentBoundary != BreakIterator::DONE) {
        if (currentBoundary > 0) {
            Local<Object> boundaryResult = Object::New(isolate);
            boundaryResult->Set(String::NewFromUtf8(isolate, "start"), Number::New(isolate, previousBoundary));
            boundaryResult->Set(String::NewFromUtf8(isolate, "end"), Number::New(isolate, currentBoundary));

            results->Set(arrayPosition++, boundaryResult);
        }

        previousBoundary = currentBoundary;
        currentBoundary = iterator->next();
    }

    // cleanup
    delete iterator;

    args.GetReturnValue().Set(results);
}
Example #19
0
void Hyphenator::slotHyphenate(PageItem* it)
{
	if (!(it->asTextFrame()) || (it->itemText.length() == 0))
		return;
	m_doc->DoDrawing = false;

	QString text = "";

	int startC = 0;
	if (it->itemText.selectionLength() > 0)
	{
		startC = it->itemText.startOfSelection();
		text = it->itemText.text(startC, it->itemText.selectionLength());
	}
	else {
		text = it->itemText.text(0, it->itemText.length());
	}

	rememberedWords.clear();
	qApp->setOverrideCursor(QCursor(Qt::WaitCursor));

	BreakIterator* bi = StoryText::getWordIterator();
	bi->setText((const UChar*) text.utf16());
	int pos = bi->first();
	while (pos != BreakIterator::DONE)
	{
		int firstC = pos;
		pos = bi->next();
		int lastC = pos;
		int countC = lastC - firstC;

		const CharStyle& style = it->itemText.charStyle(firstC);
		if (countC > 0 && countC > style.hyphenWordMin() - 1)
		{
			QString word = text.mid(firstC, countC);
			QString wordLower = QLocale(style.language()).toLower(word);
			if (wordLower.contains(SpecialChars::SHYPHEN))
				break;

			bool ok = loadDict(style.language());
			if (!ok)
				continue;

			QByteArray te = m_codec->fromUnicode(wordLower);
			char *buffer = static_cast<char*>(malloc(te.length() + 5));
			if (buffer == nullptr)
				break;

			char **rep = nullptr;
			int *pos = nullptr;
			int *cut = nullptr;
			// TODO: support non-standard hyphenation, see hnj_hyphen_hyphenate2 docs
			if (!hnj_hyphen_hyphenate2(m_hdict, te.data(), te.length(), buffer, nullptr, &rep, &pos, &cut))
			{
	  			int i = 0;
				buffer[te.length()] = '\0';
				bool hasHyphen = false;
				for (i = 1; i < wordLower.length()-1; ++i)
				{
					if(buffer[i] & 1)
					{
						hasHyphen = true;
						break;
					}
				}
				QString outs = "";
				QString input = "";
				outs += word[0];
				for (i = 1; i < wordLower.length()-1; ++i)
				{
					outs += word[i];
					if(buffer[i] & 1)
						outs += "-";
				}
				outs += word.rightRef(1);
				input = outs;
				if (!ignoredWords.contains(word))
				{
					if (!hasHyphen)
						it->itemText.hyphenateWord(startC + firstC, wordLower.length(), nullptr);
					else if (m_automatic)
					{
						if (specialWords.contains(word))
						{
							outs = specialWords.value(word);
							uint ii = 1;
							for (i = 1; i < outs.length()-1; ++i)
							{
								QChar cht = outs[i];
								if (cht == '-')
									buffer[ii-1] = 1;
								else
								{
									buffer[ii] = 0;
									++ii;
								}
							}
						}
						it->itemText.hyphenateWord(startC + firstC, wordLower.length(), buffer);
					}
					else
					{
						if (specialWords.contains(word))
						{
							outs = specialWords.value(word);
							uint ii = 1;
							for (i = 1; i < outs.length()-1; ++i)
							{
								QChar cht = outs[i];
								if (cht == '-')
									buffer[ii-1] = 1;
								else
								{
									buffer[ii] = 0;
									++ii;
								}
							}
						}
						if (rememberedWords.contains(input))
						{
							outs = rememberedWords.value(input);
							uint ii = 1;
							for (i = 1; i < outs.length()-1; ++i)
							{
								QChar cht = outs[i];
								if (cht == '-')
									buffer[ii-1] = 1;
								else
								{
									buffer[ii] = 0;
									++ii;
								}
							}
							it->itemText.hyphenateWord(firstC, wordLower.length(), buffer);
						}
						else
						{
							qApp->changeOverrideCursor(QCursor(Qt::ArrowCursor));
							PrefsContext* prefs = PrefsManager::instance()->prefsFile->getContext("hyhpen_options");
							int xpos = prefs->getInt("Xposition", -9999);
							int ypos = prefs->getInt("Yposition", -9999);
							HyAsk *dia = new HyAsk((QWidget*)parent(), outs);
							if ((xpos != -9999) && (ypos != -9999))
								dia->move(xpos, ypos);
							qApp->processEvents();
							if (dia->exec())
							{
								outs = dia->Wort->text();
								uint ii = 1;
								for (i = 1; i < outs.length()-1; ++i)
								{
									QChar cht = outs[i];
									if (cht == '-')
										buffer[ii-1] = 1;
									else
									{
										buffer[ii] = 0;
										++ii;
									}
								}
								if (!rememberedWords.contains(input))
									rememberedWords.insert(input, outs);
								if (dia->addToIgnoreList->isChecked())
								{
									if (!ignoredWords.contains(word))
										ignoredWords.insert(word);
								}
								if (dia->addToExceptionList->isChecked())
								{
									if (!specialWords.contains(word))
										specialWords.insert(word, outs);
								}
								it->itemText.hyphenateWord(firstC, wordLower.length(), buffer);
							}
							else
							{
								free(buffer);
								buffer = nullptr;
								prefs->set("Xposition", dia->xpos);
								prefs->set("Yposition", dia->ypos);
								delete dia;
								break;
							}
							prefs->set("Xposition", dia->xpos);
							prefs->set("Yposition", dia->ypos);
							delete dia;
							qApp->changeOverrideCursor(QCursor(Qt::WaitCursor));
						}
					}
				}
			}
			free(buffer);
			if (rep)
			{
				for (int i = 0; i < te.length() - 1; ++i)
					free(rep[i]);
			}
			free(rep);
			free(pos);
			free(cut);
		}
	}
	qApp->restoreOverrideCursor();
	m_doc->DoDrawing = true;
	rememberedWords.clear();
}
Example #20
0
TRI_vector_string_t* Utf8Helper::getWords (const char* const text,
                                           const size_t textLength,
                                           const size_t minimalLength,
                                           const size_t maximalLength,
                                           bool lowerCase) {
  TRI_vector_string_t* words;
  UErrorCode status = U_ZERO_ERROR;
  UnicodeString word;

  if (textLength == 0) {
    // input text is empty
    return NULL;
  }

  if (textLength < minimalLength) {
    // input text is shorter than required minimum length
    return NULL;
  }

  size_t textUtf16Length = 0;
  UChar* textUtf16 = NULL;

  if (lowerCase) {
    // lower case string
    int32_t lowerLength = 0;
    char* lower = tolower(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, lowerLength);

    if (lower == NULL) {
      // out of memory
      return NULL;
    }

    if (lowerLength == 0) {
      TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
      return NULL;
    }

    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, lower, lowerLength, &textUtf16Length);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, lower);
  }
  else {
    textUtf16 = TRI_Utf8ToUChar(TRI_UNKNOWN_MEM_ZONE, text, (int32_t) textLength, &textUtf16Length);
  }

  if (textUtf16 == NULL) {
    return NULL;
  }

  ULocDataLocaleType type = ULOC_VALID_LOCALE;
  const Locale& locale = _coll->getLocale(type, status);

  if (U_FAILURE(status)) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    LOG_ERROR("error in Collator::getLocale(...): %s", u_errorName(status));
    return NULL;
  }

  UChar* tempUtf16 = (UChar *) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, (textUtf16Length + 1) * sizeof(UChar), false);

  if (tempUtf16 == NULL) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    return NULL;
  }

  words = (TRI_vector_string_t*) TRI_Allocate(TRI_UNKNOWN_MEM_ZONE, sizeof(TRI_vector_string_t), false);

  if (words == NULL) {
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
    TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);
    return NULL;
  }

  // estimate an initial vector size. this is not accurate, but setting the initial size to some
  // value in the correct order of magnitude will save a lot of vector reallocations later
  size_t initialWordCount = textLength / (2 * (minimalLength + 1));
  if (initialWordCount < 32) {
    // alloc at least 32 pointers (= 256b)
    initialWordCount = 32;
  }
  else if (initialWordCount > 8192) {
    // alloc at most 8192 pointers (= 64kb)
    initialWordCount = 8192;
  }

  TRI_InitVectorString2(words, TRI_UNKNOWN_MEM_ZONE, initialWordCount);

  BreakIterator* wordIterator = BreakIterator::createWordInstance(locale, status);
  UnicodeString utext(textUtf16);

  wordIterator->setText(utext);
  int32_t start = wordIterator->first();
  for(int32_t end = wordIterator->next(); end != BreakIterator::DONE;
    start = end, end = wordIterator->next()) {

    size_t tempUtf16Length = (size_t) (end - start);
    // end - start = word length
    if (tempUtf16Length >= minimalLength) {
      size_t chunkLength = tempUtf16Length;
      if (chunkLength > maximalLength) {
        chunkLength = maximalLength;
      }
      utext.extractBetween(start, (int32_t) (start + chunkLength), tempUtf16, 0);

      size_t utf8WordLength;
      char* utf8Word = TRI_UCharToUtf8(TRI_UNKNOWN_MEM_ZONE, tempUtf16, chunkLength, &utf8WordLength);
      if (utf8Word != 0) {
        TRI_PushBackVectorString(words, utf8Word);
      }
    }
  }

  delete wordIterator;

  TRI_Free(TRI_UNKNOWN_MEM_ZONE, textUtf16);
  TRI_Free(TRI_UNKNOWN_MEM_ZONE, tempUtf16);

  if (words->_length == 0) {
    // no words found
    TRI_FreeVectorString(TRI_UNKNOWN_MEM_ZONE, words);
    return NULL;
  }

  return words;
}
Example #21
0
/* Print the first element */
void printFirst(BreakIterator& boundary)
{
    int32_t start = boundary.first();
    int32_t end = boundary.next();
    printTextRange( boundary, start, end );
}
Example #22
0
//
//  TestRuleStatus
//      Test word break rule status constants.
//
void RBBIAPITest::TestRuleStatus() {
     UChar str[30];
     //no longer test Han or hiragana breaking here: ruleStatusVec would return nothing
     // changed UBRK_WORD_KANA to UBRK_WORD_IDEO
     u_unescape("plain word 123.45 \\u30a1\\u30a2 ",
              // 012345678901234567  8      9    0      
              //                     Katakana      
                str, 30);
     UnicodeString testString1(str);
     int32_t bounds1[] = {0, 5, 6, 10, 11, 17, 18, 20, 21};
     int32_t tag_lo[]  = {UBRK_WORD_NONE,     UBRK_WORD_LETTER, UBRK_WORD_NONE,    UBRK_WORD_LETTER,
                          UBRK_WORD_NONE,     UBRK_WORD_NUMBER, UBRK_WORD_NONE,
                          UBRK_WORD_IDEO,     UBRK_WORD_NONE};

     int32_t tag_hi[]  = {UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT, UBRK_WORD_NONE_LIMIT, UBRK_WORD_LETTER_LIMIT,
                          UBRK_WORD_NONE_LIMIT, UBRK_WORD_NUMBER_LIMIT, UBRK_WORD_NONE_LIMIT,
                          UBRK_WORD_IDEO_LIMIT, UBRK_WORD_NONE_LIMIT};

     UErrorCode status=U_ZERO_ERROR;

     BreakIterator *bi = BreakIterator::createWordInstance(Locale::getEnglish(), status);
     if(U_FAILURE(status)) {
         errcheckln(status, "Fail : in construction - %s", u_errorName(status));
     } else {
         bi->setText(testString1);
         // First test that the breaks are in the right spots.
         doBoundaryTest(*bi, testString1, bounds1);

         // Then go back and check tag values
         int32_t i = 0;
         int32_t pos, tag;
         for (pos = bi->first(); pos != BreakIterator::DONE; pos = bi->next(), i++) {
             if (pos != bounds1[i]) {
                 errln("FAIL: unexpected word break at postion %d", pos);
                 break;
             }
             tag = bi->getRuleStatus();
             if (tag < tag_lo[i] || tag >= tag_hi[i]) {
                 errln("FAIL: incorrect tag value %d at position %d", tag, pos);
                 break;
             }

             // Check that we get the same tag values from getRuleStatusVec()
             int32_t vec[10];
             int t = bi->getRuleStatusVec(vec, 10, status);
             TEST_ASSERT_SUCCESS(status);
             TEST_ASSERT(t==1);
             TEST_ASSERT(vec[0] == tag);
         }
     }
     delete bi;

     // Now test line break status.  This test mostly is to confirm that the status constants
     //                              are correctly declared in the header.
     testString1 =   "test line. \n";
     // break type    s    s     h

     bi = BreakIterator::createLineInstance(Locale::getEnglish(), status);
     if(U_FAILURE(status)) {
         errcheckln(status, "failed to create word break iterator. - %s", u_errorName(status));
     } else {
         int32_t i = 0;
         int32_t pos, tag;
         UBool   success;

         bi->setText(testString1);
         pos = bi->current();
         tag = bi->getRuleStatus();
         for (i=0; i<3; i++) {
             switch (i) {
             case 0:
                 success = pos==0  && tag==UBRK_LINE_SOFT; break;
             case 1:
                 success = pos==5  && tag==UBRK_LINE_SOFT; break;
             case 2:
                 success = pos==12 && tag==UBRK_LINE_HARD; break;
             default:
                 success = FALSE; break;
             }
             if (success == FALSE) {
                 errln("Fail: incorrect word break status or position.  i=%d, pos=%d, tag=%d",
                     i, pos, tag);
                 break;
             }
             pos = bi->next();
             tag = bi->getRuleStatus();
         }
         if (UBRK_LINE_SOFT >= UBRK_LINE_SOFT_LIMIT ||
             UBRK_LINE_HARD >= UBRK_LINE_HARD_LIMIT ||
             (UBRK_LINE_HARD > UBRK_LINE_SOFT && UBRK_LINE_HARD < UBRK_LINE_SOFT_LIMIT)) {
             errln("UBRK_LINE_* constants from header are inconsistent.");
         }
     }
     delete bi;

}
Example #23
0
/* Print the last element */
void printLast(BreakIterator& boundary)
{
    int32_t end = boundary.last();
    int32_t start = boundary.previous();
    printTextRange( boundary, start, end );
}
Example #24
0
/* Print the element at a specified position */
void printAt(BreakIterator &boundary, int32_t pos )
{
    int32_t end = boundary.following(pos);
    int32_t start = boundary.previous();
    printTextRange( boundary, start, end );
}
Example #25
0
// In the Unicode string characters are always stored in logical order.
// This makes line breaking easy. One word is added to the current line at a time. Once the line is too long
// we either go back one step or inset the line break at the current position (depending on "wrap_before" setting).
// At the end everything that is left over is added as the final line.
void text_layout::break_line(text_line & line, double wrap_width, unsigned text_ratio, bool wrap_before)
{
    shape_text(line);
    if (!wrap_width || line.width() < wrap_width)
    {
        add_line(line);
        return;

    }
    if (text_ratio)
    {
        double wrap_at;
        double string_width = line.width();
        double string_height = line.line_height();
        for (double i = 1.0; ((wrap_at = string_width/i)/(string_height*i)) > text_ratio && (string_width/i) > wrap_width; i += 1.0) ;
        wrap_width = wrap_at;
    }

    mapnik::value_unicode_string const& text = itemizer_.text();
    Locale locale; // TODO: Is the default constructor correct?
    UErrorCode status = U_ZERO_ERROR;
    BreakIterator *breakitr = BreakIterator::createLineInstance(locale, status);

    // Not breaking the text if an error occurs is probably the best thing we can do.
    // https://github.com/mapnik/mapnik/issues/2072
    if (!U_SUCCESS(status))
    {
        add_line(line);
        MAPNIK_LOG_ERROR(text_layout) << " could not create BreakIterator: " << u_errorName(status);
        return;
    }

    breakitr->setText(text);

    double current_line_length = 0;
    int last_break_position = static_cast<int>(line.first_char());
    for (unsigned i=line.first_char(); i < line.last_char(); ++i)
    {
        // TODO: character_spacing
        std::map<unsigned, double>::const_iterator width_itr = width_map_.find(i);
        if (width_itr != width_map_.end())
        {
            current_line_length += width_itr->second;
        }
        if (current_line_length <= wrap_width) continue;

        int break_position = wrap_before ? breakitr->preceding(i) : breakitr->following(i);
        // following() returns a break position after the last word. So DONE should only be returned
        // when calling preceding.
        if (break_position <= last_break_position || break_position == static_cast<int>(BreakIterator::DONE))
        {
            // A single word is longer than the maximum line width.
            // Violate line width requirement and choose next break position
            break_position = breakitr->following(i);
            if (break_position == static_cast<int>(BreakIterator::DONE))
            {
                break_position = line.last_char();
                MAPNIK_LOG_ERROR(text_layout) << "Unexpected result in break_line. Trying to recover...\n";
            }
        }
        // Break iterator operates on the whole string, while we only look at one line. So we need to
        // clamp break values.
        if (break_position < static_cast<int>(line.first_char()))
        {
            break_position = line.first_char();
        }
        if (break_position > static_cast<int>(line.last_char()))
        {
            break_position = line.last_char();
        }

        text_line new_line(last_break_position, break_position);
        clear_cluster_widths(last_break_position, break_position);
        shape_text(new_line);
        add_line(new_line);
        last_break_position = break_position;
        i = break_position - 1;
        current_line_length = 0;
    }
    if (last_break_position == static_cast<int>(line.first_char()))
    {
        // No line breaks => no reshaping required
        add_line(line);
    }
    else if (last_break_position != static_cast<int>(line.last_char()))
    {
        text_line new_line(last_break_position, line.last_char());
        clear_cluster_widths(last_break_position, line.last_char());
        shape_text(new_line);
        add_line(new_line);
    }
}
Example #26
0
/** Word wrap text
 *
 * @param str character vector
 * @param width single integer
 * @param cost_exponent single double
 * @param indent single integer
 * @param exdent single integer
 * @param prefix single string
 * @param initial single string
 * @param locale locale identifier or NULL for default locale
 * @param use_length single logical value
 *
 * @return list
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
 *          single function for wrap_greedy and wrap_dynamic
 *          (dispatch inside);
 *          use BreakIterator
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new args: indent, exdent, prefix, initial
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
 *    #133 allow width <= 0
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
 *    don't trim so many white spaces at the end of each word (normalize arg does that)
 *    #139: allow a "whitespace" break iterator
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
 *    `use_length` arg added
 *
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
 *    BIGSKIP: no more CHARSXP on out on "" input
 */
SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
   SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
   SEXP use_length, SEXP locale)
{
   bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
   bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");

   int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
   if (width_val <= 0) width_val = 0;

   int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
   if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent");

   int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
   if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent");


   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   Locale loc = Locale::createFromName(qloc);
   PROTECT(str     = stri_prepare_arg_string(str, "str"));
   PROTECT(prefix  = stri_prepare_arg_string_1(prefix, "prefix"));
   PROTECT(initial = stri_prepare_arg_string_1(initial, "initial"));

   BreakIterator* briter = NULL;
   UText* str_text = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   UErrorCode status = U_ZERO_ERROR;
   briter = BreakIterator::createLineInstance(loc, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriContainerUTF8 prefix_cont(prefix, 1);
   StriContainerUTF8 initial_cont(initial, 1);


   // prepare indent/exdent/prefix/initial stuff:
   // 1st line, 1st para (i==0, u==0): initial+indent
   // nth line, 1st para (i==0, u> 0): prefix +exdent
   // 1st line, nth para (i> 0, u==0): prefix +indent
   // nth line, nth para (i> 0, u> 0): prefix +exdent
   StriWrapLineStart ii(initial_cont.get(0), indent_val);
   StriWrapLineStart pi(prefix_cont.get(0), indent_val);
   StriWrapLineStart pe(prefix_cont.get(0), exdent_val);


   status = U_ZERO_ERROR;
   //Unicode Newline Guidelines - Unicode Technical Report #13
   UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_linebreaks.freeze();

   status = U_ZERO_ERROR;
   UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_whitespaces.freeze();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      status = U_ZERO_ERROR;
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      briter->setText(str_text, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // all right, first let's generate a list of places at which we may do line breaks
      deque< R_len_t > occurrences_list; // this could be an R_len_t queue
      R_len_t match = briter->first();
      while (match != BreakIterator::DONE) {

         if (!whitespace_only_val)
            occurrences_list.push_back(match);
         else {
            if (match > 0 && match < str_cur_n) {
               UChar32 c;
               U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
               if (uset_whitespaces.contains(c))
                  occurrences_list.push_back(match);
            }
            else
               occurrences_list.push_back(match);
         }

         match = briter->next();
      }

      R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
      if (noccurrences <= 1) { // no match (1 boundary == 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
         continue;
      }

      // the number of "words" is:
      R_len_t nwords = noccurrences - 1;

      // convert occurrences_list to a vector
      // in order to obtain end positions (in a string) of each "words",
      // noting that occurrences_list.at(0) == 0
#ifndef NDEBUG
      if (occurrences_list.at(0) != 0)
         throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
#endif

      std::vector<R_len_t> end_pos_orig(nwords);
      deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
      for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
         end_pos_orig[j] = (*iter); // this is a UTF-8 index
      }


      // now:
      // we'll get the total widths/number of code points in each "word"
      std::vector<R_len_t> widths_orig(nwords);
      // we'll get the total widths/number of code points without trailing whitespaces
      std::vector<R_len_t> widths_trim(nwords);
      // we'll get the end positions without trailing whitespaces
      std::vector<R_len_t> end_pos_trim(nwords);
      // detect line endings (fail on a match)

      UChar32 c = 0;
      R_len_t j = 0;
      R_len_t cur_block = 0;
      R_len_t cur_width_orig = 0;
      R_len_t cur_width_trim = 0;
      R_len_t cur_count_orig = 0;
      R_len_t cur_count_trim = 0;
      R_len_t cur_end_pos_trim = 0;
      while (j < str_cur_n) {
         R_len_t jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);

         if (uset_linebreaks.contains(c))
            throw StriException(MSG__NEWLINE_FOUND);

         cur_width_orig += stri__width_char(c);
         ++cur_count_orig;
         if (uset_whitespaces.contains(c)) {
// OLD: trim all white spaces from the end:
//            ++cur_count_trim;
//           [we have the normalize arg for that]

// NEW: trim just one white space at the end:
            cur_width_trim = stri__width_char(c);
            cur_count_trim = 1;
            cur_end_pos_trim = jlast;
         }
         else {
            cur_width_trim = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }

         if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
            // we'll start a new block in a moment
            if (use_length_val) {
               widths_orig[cur_block] = cur_count_orig;
               widths_trim[cur_block] = cur_count_orig-cur_count_trim;
            }
            else {
               widths_orig[cur_block] = cur_width_orig;
               widths_trim[cur_block] = cur_width_orig-cur_width_trim;
            }
            end_pos_trim[cur_block] = cur_end_pos_trim;
            cur_block++;
            cur_width_orig = 0;
            cur_width_trim = 0;
            cur_count_orig = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }
      }

      // do wrap
      std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
      if (exponent_val <= 0.0) {
         stri__wrap_greedy(wrap_after, nwords, width_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }
      else {
         stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }

      // wrap_after.size() line breaks => wrap_after.size()+1 lines
      R_len_t nlines = (R_len_t)wrap_after.size()+1;
      R_len_t last_pos = 0;
      SEXP ans;
      STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
      deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
      for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
         R_len_t wrap_after_cur = *iter_wrap;
         R_len_t cur_pos = end_pos_trim[wrap_after_cur];

         std::string cs;
         if (i == 0 && u == 0)     cs = ii.str;
         else if (i > 0 && u == 0) cs = pi.str;
         else                      cs = pe.str;
         cs.append(str_cur_s+last_pos, cur_pos-last_pos);
         SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

         last_pos = end_pos_orig[wrap_after_cur];
      }

      // last line goes here:
      std::string cs;
      if (i == 0 && nlines-1 == 0)     cs = ii.str;
      else if (i > 0 && nlines-1 == 0) cs = pi.str;
      else                             cs = pe.str;
      cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
      SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   if (briter) { delete briter; briter = NULL; }
   if (str_text) { utext_close(str_text); str_text = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (briter) { delete briter; briter = NULL; }
      if (str_text) { utext_close(str_text); str_text = NULL; }
   })
}