C++ (Cpp) utext_openUTF8 Examples

Example #1

0

Show file

File: ucasemap_titlecase_brkiter.cpp Project: cyrusimap/icu4c

U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToTitle(UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    UText utext=UTEXT_INITIALIZER;
    utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
    if(csm->iter==NULL) {
        csm->iter=BreakIterator::createWordInstance(Locale(csm->locale), *pErrorCode);
    }
    if (U_FAILURE(*pErrorCode)) {
        return 0;
    }
    csm->iter->setText(&utext, *pErrorCode);
    int32_t length=ucasemap_mapUTF8(
            csm->caseLocale, csm->options, csm->iter,
            (uint8_t *)dest, destCapacity,
            (const uint8_t *)src, srcLength,
            ucasemap_internalUTF8ToTitle, pErrorCode);
    utext_close(&utext);
    return length;
}

Example #2

0

Show file

File: segmenter.cpp Project: MGKhKhD/meta

    /**
     * Generic segmentation method that operates on the substring between
     * the given indices, using the given strategy for segmenting that
     * substring.
     *
     * @param first The index of the beginning of the string to work on
     * @param last The index of the end of the string to work on
     * @param type The type of segmentation to perform
     * @return a vector of segments (whose meaning depends on `type`)
     */
    std::vector<segment> segments(int32_t first, int32_t last,
                                  segment_t type) const
    {
        std::vector<segment> results;
        icu::BreakIterator* iter;
        if (type == segment_t::SENTENCES)
            iter = sentence_iter_.get();
        else if (type == segment_t::WORDS)
            iter = word_iter_.get();
        else
            throw std::runtime_error{"Unknown segmentation type"};

        auto status = U_ZERO_ERROR;
        UText utxt = UTEXT_INITIALIZER;
        utext_openUTF8(&utxt, text_.data() + first, last - first, &status);
        if (!U_SUCCESS(status))
        {
            std::string err = "Failed to open UText: ";
            err += u_errorName(status);
            throw std::runtime_error{err};
        }

        iter->setText(&utxt, status);
        if (!U_SUCCESS(status))
        {
            utext_close(&utxt);
            std::string err = "Failed to setText: ";
            err += u_errorName(status);
            throw std::runtime_error{err};
        }

        auto start = iter->first();
        auto end = iter->next();
        while (end != icu::BreakIterator::DONE)
        {
            results.emplace_back(first + start, first + end);
            start = end;
            end = iter->next();
        }
        utext_close(&utxt);
        return results;
    }

Example #3

0

Show file

File: cbiapts.c Project: Epictetus/build-couchdb

/*
 *  static void TestBreakIteratorUText(void);
 *
 *         Test that ubrk_setUText() is present and works for a simple case.
 */
static void TestBreakIteratorUText(void) {
    const char *UTF8Str = "\x41\xc3\x85\x5A\x20\x41\x52\x69\x6E\x67";  /* c3 85 is utf-8 for A with a ring on top */
                      /*   0  1   2 34567890  */

    UErrorCode      status = U_ZERO_ERROR;
    UBreakIterator *bi     = NULL;
    int32_t         pos    = 0;


    UText *ut = utext_openUTF8(NULL, UTF8Str, -1, &status);
    TEST_ASSERT_SUCCESS(status);

    bi = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status);
    if (U_FAILURE(status)) {
        log_err_status(status, "Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));
        return;
    }

    ubrk_setUText(bi, ut, &status);
    if (U_FAILURE(status)) {
        log_err("Failure at file %s, line %d, error = %s\n", __FILE__, __LINE__, u_errorName(status));
        return;
    }

    pos = ubrk_first(bi);
    TEST_ASSERT(pos == 0);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == 4);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == 5);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == 10);

    pos = ubrk_next(bi);
    TEST_ASSERT(pos == UBRK_DONE);
    ubrk_close(bi);
    utext_close(ut);
}

Example #4

0

Show file

File: ucasemap_titlecase_brkiter.cpp Project: ONLYOFFICE/core

U_CAPI int32_t U_EXPORT2
ucasemap_utf8ToTitle(UCaseMap *csm,
                     char *dest, int32_t destCapacity,
                     const char *src, int32_t srcLength,
                     UErrorCode *pErrorCode) {
    UText utext=UTEXT_INITIALIZER;
    utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(csm->iter==NULL) {
        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
                            NULL, 0,
                            pErrorCode);
    }
    ubrk_setUText(csm->iter, &utext, pErrorCode);
    int32_t length=ucasemap_mapUTF8(csm,
                   (uint8_t *)dest, destCapacity,
                   (const uint8_t *)src, srcLength,
                   ucasemap_internalUTF8ToTitle, pErrorCode);
    utext_close(&utext);
    return length;
}

Example #5

0

Show file

File: ucasemap.c Project: CucasLoon/in-the-box

/*
 * Internal titlecasing function.
 */
static int32_t
_toTitle(UCaseMap *csm,
         uint8_t *dest, int32_t destCapacity,
         const uint8_t *src, UCaseContext *csc,
         int32_t srcLength,
         UErrorCode *pErrorCode) {
    UText utext=UTEXT_INITIALIZER;
    const UChar *s;
    UChar32 c;
    int32_t prev, titleStart, titleLimit, idx, destIndex, length;
    UBool isFirstIndex;

    utext_openUTF8(&utext, (const char *)src, srcLength, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        return 0;
    }
    if(csm->iter==NULL) {
        csm->iter=ubrk_open(UBRK_WORD, csm->locale,
                            NULL, 0,
                            pErrorCode);
    }
    ubrk_setUText(csm->iter, &utext, pErrorCode);
    if(U_FAILURE(*pErrorCode)) {
        utext_close(&utext);
        return 0;
    }

    /* set up local variables */
    destIndex=0;
    prev=0;
    isFirstIndex=TRUE;

    /* titlecasing loop */
    while(prev<srcLength) {
        /* find next index where to titlecase */
        if(isFirstIndex) {
            isFirstIndex=FALSE;
            idx=ubrk_first(csm->iter);
        } else {
            idx=ubrk_next(csm->iter);
        }
        if(idx==UBRK_DONE || idx>srcLength) {
            idx=srcLength;
        }

        /*
         * Unicode 4 & 5 section 3.13 Default Case Operations:
         *
         * R3  toTitlecase(X): Find the word boundaries based on Unicode Standard Annex
         * #29, "Text Boundaries." Between each pair of word boundaries, find the first
         * cased character F. If F exists, map F to default_title(F); then map each
         * subsequent character C to default_lower(C).
         *
         * In this implementation, segment [prev..index[ into 3 parts:
         * a) uncased characters (copy as-is) [prev..titleStart[
         * b) first case letter (titlecase)         [titleStart..titleLimit[
         * c) subsequent characters (lowercase)                 [titleLimit..index[
         */
        if(prev<idx) {
            /* find and copy uncased characters [prev..titleStart[ */
            titleStart=titleLimit=prev;
            U8_NEXT(src, titleLimit, idx, c);
            if((csm->options&U_TITLECASE_NO_BREAK_ADJUSTMENT)==0 && UCASE_NONE==ucase_getType(csm->csp, c)) {
                /* Adjust the titlecasing index (titleStart) to the next cased character. */
                for(;;) {
                    titleStart=titleLimit;
                    if(titleLimit==idx) {
                        /*
                         * only uncased characters in [prev..index[
                         * stop with titleStart==titleLimit==index
                         */
                        break;
                    }
                    U8_NEXT(src, titleLimit, idx, c);
                    if(UCASE_NONE!=ucase_getType(csm->csp, c)) {
                        break; /* cased letter at [titleStart..titleLimit[ */
                    }
                }
                length=titleStart-prev;
                if(length>0) {
                    if((destIndex+length)<=destCapacity) {
                        uprv_memcpy(dest+destIndex, src+prev, length);
                    }
                    destIndex+=length;
                }
            }

            if(titleStart<titleLimit) {
                /* titlecase c which is from [titleStart..titleLimit[ */
                csc->cpStart=titleStart;
                csc->cpLimit=titleLimit;
                c=ucase_toFullTitle(csm->csp, c, utf8_caseContextIterator, csc, &s, csm->locale, &csm->locCache);
                destIndex=appendResult(dest, destIndex, destCapacity, c, s);

                
                /* Special case Dutch IJ titlecasing */
                if ( titleStart+1 < idx && 
                     ucase_getCaseLocale(csm->locale,&csm->locCache) == UCASE_LOC_DUTCH &&
                     ( src[titleStart] == 0x0049 || src[titleStart] == 0x0069 ) &&
                     ( src[titleStart+1] == 0x004A || src[titleStart+1] == 0x006A )) { 
                            c=0x004A;
                            destIndex=appendResult(dest, destIndex, destCapacity, c, s);
                            titleLimit++;
                }
                /* lowercase [titleLimit..index[ */
                if(titleLimit<idx) {
                    if((csm->options&U_TITLECASE_NO_LOWERCASE)==0) {
                        /* Normal operation: Lowercase the rest of the word. */
                        destIndex+=
                            _caseMap(
                                csm, ucase_toFullLower,
                                dest+destIndex, destCapacity-destIndex,
                                src, csc,
                                titleLimit, idx,
                                pErrorCode);
                    } else {
                        /* Optionally just copy the rest of the word unchanged. */
                        length=idx-titleLimit;
                        if((destIndex+length)<=destCapacity) {
                            uprv_memcpy(dest+destIndex, src+titleLimit, length);
                        }
                        destIndex+=length;
                    }
                }
            }
        }

        prev=idx;
    }

    if(destIndex>destCapacity) {
        *pErrorCode=U_BUFFER_OVERFLOW_ERROR;
    }
    utext_close(&utext);
    return destIndex;
}

Example #6

0

Show file

File: rbbiapts.cpp Project: Distrotech/icu

void RBBIAPITest::TestGetSetAdoptText()
{
    logln((UnicodeString)"Testing getText setText ");
    IcuTestErrorCode status(*this, "TestGetSetAdoptText");
    UnicodeString str1="first string.";
    UnicodeString str2="Second string.";
    LocalPointer<RuleBasedBreakIterator> charIter1((RuleBasedBreakIterator*)RuleBasedBreakIterator::createCharacterInstance(Locale::getDefault(), status));
    LocalPointer<RuleBasedBreakIterator> wordIter1((RuleBasedBreakIterator*)RuleBasedBreakIterator::createWordInstance(Locale::getDefault(), status));
    if(status.isFailure()){
        errcheckln(status, "Fail : in construction - %s", status.errorName());
            return;
    }


    CharacterIterator* text1= new StringCharacterIterator(str1);
    CharacterIterator* text1Clone = text1->clone();
    CharacterIterator* text2= new StringCharacterIterator(str2);
    CharacterIterator* text3= new StringCharacterIterator(str2, 3, 10, 3); //  "ond str"

    wordIter1->setText(str1);
    CharacterIterator *tci = &wordIter1->getText();
    UnicodeString      tstr;
    tci->getText(tstr);
    TEST_ASSERT(tstr == str1);
    if(wordIter1->current() != 0)
        errln((UnicodeString)"ERROR:1 setText did not set the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");

    wordIter1->next(2);

    wordIter1->setText(str2);
    if(wordIter1->current() != 0)
        errln((UnicodeString)"ERROR:2 setText did not reset the iteration position to the beginning of the text, it is" + wordIter1->current() + (UnicodeString)"\n");


    charIter1->adoptText(text1Clone);
    TEST_ASSERT(wordIter1->getText() != charIter1->getText());
    tci = &wordIter1->getText();
    tci->getText(tstr);
    TEST_ASSERT(tstr == str2);
    tci = &charIter1->getText();
    tci->getText(tstr);
    TEST_ASSERT(tstr == str1);


    LocalPointer<RuleBasedBreakIterator> rb((RuleBasedBreakIterator*)wordIter1->clone());
    rb->adoptText(text1);
    if(rb->getText() != *text1)
        errln((UnicodeString)"ERROR:1 error in adoptText ");
    rb->adoptText(text2);
    if(rb->getText() != *text2)
        errln((UnicodeString)"ERROR:2 error in adoptText ");

    // Adopt where iterator range is less than the entire orignal source string.
    //   (With the change of the break engine to working with UText internally,
    //    CharacterIterators starting at positions other than zero are not supported)
    rb->adoptText(text3);
    TEST_ASSERT(rb->preceding(2) == 0);
    TEST_ASSERT(rb->following(11) == BreakIterator::DONE);
    //if(rb->preceding(2) != 3) {
    //    errln((UnicodeString)"ERROR:3 error in adoptText ");
    //}
    //if(rb->following(11) != BreakIterator::DONE) {
    //    errln((UnicodeString)"ERROR:4 error in adoptText ");
    //}

    // UText API
    //
    //   Quick test to see if UText is working at all.
    //
    const char *s1 = "\x68\x65\x6C\x6C\x6F\x20\x77\x6F\x72\x6C\x64"; /* "hello world" in UTF-8 */
    const char *s2 = "\x73\x65\x65\x20\x79\x61"; /* "see ya" in UTF-8 */
    //                012345678901

    status.reset();
    LocalUTextPointer ut(utext_openUTF8(NULL, s1, -1, status));
    wordIter1->setText(ut.getAlias(), status);
    TEST_ASSERT_SUCCESS(status);

    int32_t pos;
    pos = wordIter1->first();
    TEST_ASSERT(pos==0);
    pos = wordIter1->next();
    TEST_ASSERT(pos==5);
    pos = wordIter1->next();
    TEST_ASSERT(pos==6);
    pos = wordIter1->next();
    TEST_ASSERT(pos==11);
    pos = wordIter1->next();
    TEST_ASSERT(pos==UBRK_DONE);

    status.reset();
    LocalUTextPointer ut2(utext_openUTF8(NULL, s2, -1, status));
    TEST_ASSERT_SUCCESS(status);
    wordIter1->setText(ut2.getAlias(), status);
    TEST_ASSERT_SUCCESS(status);

    pos = wordIter1->first();
    TEST_ASSERT(pos==0);
    pos = wordIter1->next();
    TEST_ASSERT(pos==3);
    pos = wordIter1->next();
    TEST_ASSERT(pos==4);

    pos = wordIter1->last();
    TEST_ASSERT(pos==6);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==4);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==3);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==0);
    pos = wordIter1->previous();
    TEST_ASSERT(pos==UBRK_DONE);

    status.reset();
    UnicodeString sEmpty;
    LocalUTextPointer gut2(utext_openUnicodeString(NULL, &sEmpty, status));
    wordIter1->getUText(gut2.getAlias(), status);
    TEST_ASSERT_SUCCESS(status);
    status.reset();
}

Example #7

0

Show file

File: utexttst.c Project: icu-project/icu4c

static void TestAPI(void) {
    UErrorCode      status = U_ZERO_ERROR;
    UBool           gFailed = FALSE;
    (void)gFailed;   /* Suppress set but not used warning. */

    /* Open    */
    {
        UText           utLoc = UTEXT_INITIALIZER;
        const char *    cString = "\x61\x62\x63\x64";
        UChar           uString[]  = {0x41, 0x42, 0x43, 0};
        UText          *uta;
        UText          *utb;
        UChar           c;

        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        c = utext_next32(uta);
        TEST_ASSERT(c == 0x41);
        utb = utext_close(uta); 
        TEST_ASSERT(utb == NULL);

        uta = utext_openUTF8(&utLoc, cString, -1, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(uta == &utLoc);

        uta = utext_close(&utLoc);
        TEST_ASSERT(uta == &utLoc);
    }

    /* utext_clone()  */
    {
        UChar   uString[]  = {0x41, 0x42, 0x43, 0};
        int64_t len;
        UText   *uta;
        UText   *utb;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);
        utb = utext_clone(NULL, uta, FALSE, FALSE, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(utb != NULL);
        TEST_ASSERT(utb != uta);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        utext_close(uta);
        utext_close(utb);
    }

    /* basic access functions  */
    {
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UText     *uta;
        UChar32   c;
        int64_t   len;
        UBool     b;
        int64_t   i;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_ASSERT(uta!=NULL);
        TEST_SUCCESS(status);
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==TRUE);
        len = utext_nativeLength(uta);
        TEST_ASSERT(len == u_strlen(uString));
        b = utext_isLengthExpensive(uta);
        TEST_ASSERT(b==FALSE);

        c = utext_char32At(uta, 0);
        TEST_ASSERT(c==uString[0]);
        
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = utext_previous32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[0]);

        c = utext_next32From(uta, 1);
        TEST_ASSERT(c==uString[1]);
        c = utext_next32From(uta, u_strlen(uString));
        TEST_ASSERT(c==U_SENTINEL);

        c = utext_previous32From(uta, 2);
        TEST_ASSERT(c==uString[1]);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i == 1);

        utext_setNativeIndex(uta, 0);
        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==1);

        b = utext_moveIndex32(uta, u_strlen(uString)-1);
        TEST_ASSERT(b==TRUE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        b = utext_moveIndex32(uta, 1);
        TEST_ASSERT(b==FALSE);
        i = utext_getNativeIndex(uta);
        TEST_ASSERT(i==u_strlen(uString));

        utext_setNativeIndex(uta, 0);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c==uString[0]);
        c = utext_current32(uta);
        TEST_ASSERT(c==uString[1]);

        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==uString[0]);
        c = UTEXT_PREVIOUS32(uta);
        TEST_ASSERT(c==U_SENTINEL);


        utext_close(uta);
    }

    {
        /*
         * UText opened on a NULL string with zero length
         */
        UText    *uta;
        UChar32   c;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);

        uta = utext_openUTF8(NULL, NULL, 0, &status);
        TEST_SUCCESS(status);
        c = UTEXT_NEXT32(uta);
        TEST_ASSERT(c == U_SENTINEL);
        utext_close(uta);
    }


    {
        /*
         * extract
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UChar     buf[100];
        int32_t   i;
        /* Test pinning of input bounds */
        UChar     uString2[]  = {0x41, 0x42, 0x43, 0x44, 0x45,
                                 0x46, 0x47, 0x48, 0x49, 0x4A, 0};
        UChar *   uString2Ptr = uString2 + 5;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        i = utext_extract(uta, 0, 100, NULL, 0, &status);
        TEST_ASSERT(status==U_BUFFER_OVERFLOW_ERROR);
        TEST_ASSERT(i == u_strlen(uString));

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, 0, 100, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString));
        i = u_strcmp(uString, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);

        /* Test pinning of input bounds */
        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString2Ptr, -1, &status);
        TEST_SUCCESS(status);

        status = U_ZERO_ERROR;
        memset(buf, 0, sizeof(buf));
        i = utext_extract(uta, -3, 20, buf, 100, &status);
        TEST_SUCCESS(status);
        TEST_ASSERT(i == u_strlen(uString2Ptr));
        i = u_strcmp(uString2Ptr, buf);
        TEST_ASSERT(i == 0);
        utext_close(uta);
    }

    {
        /*
         *  Copy, Replace, isWritable
         *    Can't create an editable UText from plain C, so all we
         *    can easily do is check that errors returned.
         */
        UText     *uta;
        UChar     uString[]  = {0x41, 0x42, 0x43, 0};
        UBool     b;

        status = U_ZERO_ERROR;
        uta = utext_openUChars(NULL, uString, -1, &status);
        TEST_SUCCESS(status);

        b = utext_isWritable(uta);
        TEST_ASSERT(b == FALSE);

        b = utext_hasMetaData(uta);
        TEST_ASSERT(b == FALSE);

        utext_replace(uta,
                      0, 1,     /* start, limit */
                      uString, -1,  /* replacement, replacement length */
                      &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);


        utext_copy(uta,
                   0, 1,         /* start, limit      */
                   2,            /* destination index */
                   FALSE,        /* move flag         */
                   &status);
        TEST_ASSERT(status == U_NO_WRITE_PERMISSION);

        utext_close(uta);
    }


}

Example #8

0

Show file

File: stri_wrap.cpp Project: mmcnulty0531/stringi

/** Word wrap text
 *
 * @param str character vector
 * @param width single integer
 * @param cost_exponent single double
 * @param indent single integer
 * @param exdent single integer
 * @param prefix single string
 * @param initial single string
 * @param locale locale identifier or NULL for default locale
 * @param use_length single logical value
 *
 * @return list
 *
 * @version 0.1-?? (Bartek Tartanus)
 *
 * @version 0.2-2 (Marek Gagolewski, 2014-04-27)
 *          single function for wrap_greedy and wrap_dynamic
 *          (dispatch inside);
 *          use BreakIterator
 *
 * @version 0.3-1 (Marek Gagolewski, 2014-11-04)
 *    Issue #112: str_prepare_arg* retvals were not PROTECTed from gc
 *
 * @version 0.4-1 (Marek Gagolewski, 2014-12-06)
 *    new args: indent, exdent, prefix, initial
 *
 * @version 0.5-1 (Marek Gagolewski, 2014-12-19)
 *    #133 allow width <= 0
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-02-28)
 *    don't trim so many white spaces at the end of each word (normalize arg does that)
 *    #139: allow a "whitespace" break iterator
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-04-23)
 *    `use_length` arg added
 *
 *
 * @version 0.5-1 (Marek Gagolewski, 2015-06-09)
 *    BIGSKIP: no more CHARSXP on out on "" input
 */
SEXP stri_wrap(SEXP str, SEXP width, SEXP cost_exponent,
   SEXP indent, SEXP exdent, SEXP prefix, SEXP initial, SEXP whitespace_only,
   SEXP use_length, SEXP locale)
{
   bool use_length_val      = stri__prepare_arg_logical_1_notNA(use_length, "use_length");
   double exponent_val      = stri__prepare_arg_double_1_notNA(cost_exponent, "cost_exponent");
   bool whitespace_only_val = stri__prepare_arg_logical_1_notNA(whitespace_only, "whitespace_only");

   int width_val = stri__prepare_arg_integer_1_notNA(width, "width");
   if (width_val <= 0) width_val = 0;

   int indent_val = stri__prepare_arg_integer_1_notNA(indent, "indent");
   if (indent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "indent");

   int exdent_val = stri__prepare_arg_integer_1_notNA(exdent, "exdent");
   if (exdent_val < 0) Rf_error(MSG__EXPECTED_POSITIVE, "exdent");


   const char* qloc = stri__prepare_arg_locale(locale, "locale", true); /* this is R_alloc'ed */
   Locale loc = Locale::createFromName(qloc);
   PROTECT(str     = stri_prepare_arg_string(str, "str"));
   PROTECT(prefix  = stri_prepare_arg_string_1(prefix, "prefix"));
   PROTECT(initial = stri_prepare_arg_string_1(initial, "initial"));

   BreakIterator* briter = NULL;
   UText* str_text = NULL;

   STRI__ERROR_HANDLER_BEGIN(3)
   UErrorCode status = U_ZERO_ERROR;
   briter = BreakIterator::createLineInstance(loc, status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

   R_len_t str_length = LENGTH(str);
   StriContainerUTF8_indexable str_cont(str, str_length);
   StriContainerUTF8 prefix_cont(prefix, 1);
   StriContainerUTF8 initial_cont(initial, 1);


   // prepare indent/exdent/prefix/initial stuff:
   // 1st line, 1st para (i==0, u==0): initial+indent
   // nth line, 1st para (i==0, u> 0): prefix +exdent
   // 1st line, nth para (i> 0, u==0): prefix +indent
   // nth line, nth para (i> 0, u> 0): prefix +exdent
   StriWrapLineStart ii(initial_cont.get(0), indent_val);
   StriWrapLineStart pi(prefix_cont.get(0), indent_val);
   StriWrapLineStart pe(prefix_cont.get(0), exdent_val);


   status = U_ZERO_ERROR;
   //Unicode Newline Guidelines - Unicode Technical Report #13
   UnicodeSet uset_linebreaks(UnicodeString::fromUTF8("[\\u000A-\\u000D\\u0085\\u2028\\u2029]"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_linebreaks.freeze();

   status = U_ZERO_ERROR;
   UnicodeSet uset_whitespaces(UnicodeString::fromUTF8("\\p{White_space}"), status);
   STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})
   uset_whitespaces.freeze();

   SEXP ret;
   STRI__PROTECT(ret = Rf_allocVector(VECSXP, str_length));
   for (R_len_t i = 0; i < str_length; ++i)
   {
      if (str_cont.isNA(i) || prefix_cont.isNA(0) || initial_cont.isNA(0)) {
         SET_VECTOR_ELT(ret, i, stri__vector_NA_strings(1));
         continue;
      }

      status = U_ZERO_ERROR;
      const char* str_cur_s = str_cont.get(i).c_str();
      R_len_t str_cur_n = str_cont.get(i).length();
      str_text = utext_openUTF8(str_text, str_cur_s, str_cont.get(i).length(), &status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      status = U_ZERO_ERROR;
      briter->setText(str_text, status);
      STRI__CHECKICUSTATUS_THROW(status, {/* do nothing special on err */})

      // all right, first let's generate a list of places at which we may do line breaks
      deque< R_len_t > occurrences_list; // this could be an R_len_t queue
      R_len_t match = briter->first();
      while (match != BreakIterator::DONE) {

         if (!whitespace_only_val)
            occurrences_list.push_back(match);
         else {
            if (match > 0 && match < str_cur_n) {
               UChar32 c;
               U8_GET((const uint8_t*)str_cur_s, 0, match-1, str_cur_n, c);
               if (uset_whitespaces.contains(c))
                  occurrences_list.push_back(match);
            }
            else
               occurrences_list.push_back(match);
         }

         match = briter->next();
      }

      R_len_t noccurrences = (R_len_t)occurrences_list.size(); // number of boundaries
      if (noccurrences <= 1) { // no match (1 boundary == 0)
         SET_VECTOR_ELT(ret, i, Rf_ScalarString(str_cont.toR(i)));
         continue;
      }

      // the number of "words" is:
      R_len_t nwords = noccurrences - 1;

      // convert occurrences_list to a vector
      // in order to obtain end positions (in a string) of each "words",
      // noting that occurrences_list.at(0) == 0
#ifndef NDEBUG
      if (occurrences_list.at(0) != 0)
         throw StriException("NDEBUG: stri_wrap: (occurrences_list.at(0) != 0)");
#endif

      std::vector<R_len_t> end_pos_orig(nwords);
      deque<R_len_t>::iterator iter = ++(occurrences_list.begin());
      for (R_len_t j = 0; iter != occurrences_list.end(); ++iter, ++j) {
         end_pos_orig[j] = (*iter); // this is a UTF-8 index
      }


      // now:
      // we'll get the total widths/number of code points in each "word"
      std::vector<R_len_t> widths_orig(nwords);
      // we'll get the total widths/number of code points without trailing whitespaces
      std::vector<R_len_t> widths_trim(nwords);
      // we'll get the end positions without trailing whitespaces
      std::vector<R_len_t> end_pos_trim(nwords);
      // detect line endings (fail on a match)

      UChar32 c = 0;
      R_len_t j = 0;
      R_len_t cur_block = 0;
      R_len_t cur_width_orig = 0;
      R_len_t cur_width_trim = 0;
      R_len_t cur_count_orig = 0;
      R_len_t cur_count_trim = 0;
      R_len_t cur_end_pos_trim = 0;
      while (j < str_cur_n) {
         R_len_t jlast = j;
         U8_NEXT(str_cur_s, j, str_cur_n, c);
         if (c < 0) // invalid utf-8 sequence
            throw StriException(MSG__INVALID_UTF8);

         if (uset_linebreaks.contains(c))
            throw StriException(MSG__NEWLINE_FOUND);

         cur_width_orig += stri__width_char(c);
         ++cur_count_orig;
         if (uset_whitespaces.contains(c)) {
// OLD: trim all white spaces from the end:
//            ++cur_count_trim;
//           [we have the normalize arg for that]

// NEW: trim just one white space at the end:
            cur_width_trim = stri__width_char(c);
            cur_count_trim = 1;
            cur_end_pos_trim = jlast;
         }
         else {
            cur_width_trim = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }

         if (j >= str_cur_n || end_pos_orig[cur_block] <= j) {
            // we'll start a new block in a moment
            if (use_length_val) {
               widths_orig[cur_block] = cur_count_orig;
               widths_trim[cur_block] = cur_count_orig-cur_count_trim;
            }
            else {
               widths_orig[cur_block] = cur_width_orig;
               widths_trim[cur_block] = cur_width_orig-cur_width_trim;
            }
            end_pos_trim[cur_block] = cur_end_pos_trim;
            cur_block++;
            cur_width_orig = 0;
            cur_width_trim = 0;
            cur_count_orig = 0;
            cur_count_trim = 0;
            cur_end_pos_trim = j;
         }
      }

      // do wrap
      std::deque<R_len_t> wrap_after; // wrap line after which word in {0..nwords-1}?
      if (exponent_val <= 0.0) {
         stri__wrap_greedy(wrap_after, nwords, width_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }
      else {
         stri__wrap_dynamic(wrap_after, nwords, width_val, exponent_val,
            widths_orig, widths_trim,
               (use_length_val)?((i==0)?ii.count:pi.count):((i==0)?ii.width:pi.width),
               (use_length_val)?pe.count:pe.width);
      }

      // wrap_after.size() line breaks => wrap_after.size()+1 lines
      R_len_t nlines = (R_len_t)wrap_after.size()+1;
      R_len_t last_pos = 0;
      SEXP ans;
      STRI__PROTECT(ans = Rf_allocVector(STRSXP, nlines));
      deque<R_len_t>::iterator iter_wrap = wrap_after.begin();
      for (R_len_t u = 0; iter_wrap != wrap_after.end(); ++iter_wrap, ++u) {
         R_len_t wrap_after_cur = *iter_wrap;
         R_len_t cur_pos = end_pos_trim[wrap_after_cur];

         std::string cs;
         if (i == 0 && u == 0)     cs = ii.str;
         else if (i > 0 && u == 0) cs = pi.str;
         else                      cs = pe.str;
         cs.append(str_cur_s+last_pos, cur_pos-last_pos);
         SET_STRING_ELT(ans, u, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

         last_pos = end_pos_orig[wrap_after_cur];
      }

      // last line goes here:
      std::string cs;
      if (i == 0 && nlines-1 == 0)     cs = ii.str;
      else if (i > 0 && nlines-1 == 0) cs = pi.str;
      else                             cs = pe.str;
      cs.append(str_cur_s+last_pos, end_pos_trim[nwords-1]-last_pos);
      SET_STRING_ELT(ans, nlines-1, Rf_mkCharLenCE(cs.c_str(), cs.size(), CE_UTF8));

      SET_VECTOR_ELT(ret, i, ans);
      STRI__UNPROTECT(1);
   }

   if (briter) { delete briter; briter = NULL; }
   if (str_text) { utext_close(str_text); str_text = NULL; }
   STRI__UNPROTECT_ALL
   return ret;
   STRI__ERROR_HANDLER_END({
      if (briter) { delete briter; briter = NULL; }
      if (str_text) { utext_close(str_text); str_text = NULL; }
   })
}