Ejemplo n.º 1
0
ERL_NIF_TERM split(ErlNifEnv* env, int argc, const ERL_NIF_TERM argv[])
{
    ErlNifBinary in;
    cloner* ptr; 
    UBreakIterator* iter; 
    int len = -1, last, pos, is_valid;
    UErrorCode status = U_ZERO_ERROR;
    ERL_NIF_TERM head, tail;
    UChar* bin; 
    UChar* text;
    

    if (argc != 2)
        return enif_make_badarg(env);

    /* Last argument must be a binary */
    if (!(enif_inspect_binary(env, argv[1], &in)
       && enif_get_resource(env, argv[0], iterator_type, (void**) &ptr))) {
        return enif_make_badarg(env);
    }    
    iter = (UBreakIterator*) cloner_get(ptr);
    CHECK_RES(env, iter);

    if (iter == NULL) {
        return enif_make_badarg(env);
    }


    text = (UChar*) in.data;

    ubrk_setText(iter, text, TO_ULEN(in.size), &status);
    CHECK(env, status);
    
    tail = enif_make_list(env, 0);
    pos = (int) ubrk_last(iter);

    while (pos) {
        last = pos;
        is_valid = is_valid_elem(ptr, iter);

        /* get the next elem. */
        pos = (int) ubrk_previous(iter);

        if (pos == UBRK_DONE)
            pos = 0;

        if (is_valid) /* Is the old element valid? */
        {
            len = FROM_ULEN(last - pos);

            bin = (UChar*) enif_make_new_binary(env, len, &head);
            memcpy(bin, 
                (const char*) (text + pos), 
                len);
            tail = enif_make_list_cell(env, head, tail);
        }
    };

    return tail;
}
Ejemplo n.º 2
0
/* Print each element in reverse order: */
void print_each_backward( UBreakIterator* boundary, UChar* str) {
  int32_t start;
  int32_t end = ubrk_last(boundary);
  for (start = ubrk_previous(boundary); start != UBRK_DONE;  end = start,
	 start = ubrk_previous(boundary)) {
    print_text_range( str, start, end );
  }
}
Ejemplo n.º 3
0
void findWordBoundary(const UChar* chars, int len, int position, int* start, int* end)
{
    UBreakIterator* it = wordBreakIterator(chars, len);
    *end = ubrk_following(it, position);
    if (*end < 0)
        *end = ubrk_last(it);
    *start = ubrk_previous(it);
}
Ejemplo n.º 4
0
ERL_NIF_TERM split_index(ErlNifEnv* env, int argc, 
    const ERL_NIF_TERM argv[])
{
    ErlNifBinary in;
    cloner* ptr; 
    UBreakIterator* iter; 
    int pos;
    UErrorCode status = U_ZERO_ERROR;
    ERL_NIF_TERM head, tail;
    

    if (argc != 2)
        return enif_make_badarg(env);

    /* Last argument must be a binary */
    if (!(enif_inspect_binary(env, argv[1], &in)
       && enif_get_resource(env, argv[0], iterator_type, (void**) &ptr))) {
        return enif_make_badarg(env);
    }    
    iter = (UBreakIterator*) cloner_get(ptr);
    CHECK_RES(env, iter);

    if (iter == NULL) {
        return enif_make_badarg(env);
    }

    ubrk_setText(iter, 
        (UChar*) in.data, 
        TO_ULEN(in.size), 
        &status);
    CHECK(env, status);
    
    tail = enif_make_list(env, 0);
    pos = (int) ubrk_last(iter);

    while ((pos != UBRK_DONE) && (pos != 0)) {
        if (is_valid_elem(ptr, iter))
        {
            head = enif_make_int(env, pos);
            tail = enif_make_list_cell(env, head, tail);
        }

        /* get the next elem. */
        pos = (int) ubrk_previous(iter);
    };

    return tail;
}
Ejemplo n.º 5
0
/* {{{ 	grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
UChar *
grapheme_get_haystack_offset(UBreakIterator* bi, UChar *uhaystack, int32_t uhaystack_len, int32_t offset)
{
	UErrorCode		status;
	int32_t pos;
	int32_t (*iter_op)(UBreakIterator* bi);
	int iter_incr;

	if ( NULL != bi ) {
		status = U_ZERO_ERROR;
		ubrk_setText (bi, uhaystack, uhaystack_len, &status);
	}

	if ( 0 == offset ) {
		return uhaystack;
	}
	
	if ( offset < 0 ) {
		iter_op = ubrk_previous;
		ubrk_last(bi); /* one past the end */
		iter_incr = 1;
	}
	else {
		iter_op = ubrk_next;
		iter_incr = -1;
	}
	
	pos = 0;
	
	while ( pos != UBRK_DONE && offset != 0 ) {
	
		pos = iter_op(bi);
		
		if ( UBRK_DONE != pos ) {
			offset += iter_incr;
		}
	}

	if ( offset != 0 ) {
		return NULL;
	}
	
	return uhaystack + pos;
}
Ejemplo n.º 6
0
/* {{{ 	grapheme_get_haystack_offset - bump the haystack pointer based on the grapheme count offset */
int32_t grapheme_get_haystack_offset(UBreakIterator* bi, int32_t offset)
{
	int32_t pos;
	int32_t (*iter_op)(UBreakIterator* bi);
	int iter_incr;

	if ( 0 == offset ) {
		return 0;
	}
	
	if ( offset < 0 ) {
		iter_op = ubrk_previous;
		ubrk_last(bi); /* one past the end */
		iter_incr = 1;
	}
	else {
		iter_op = ubrk_next;
		iter_incr = -1;
	}
	
	pos = 0;
	
	while ( pos != UBRK_DONE && offset != 0 ) {
	
		pos = iter_op(bi);
		
		if ( UBRK_DONE != pos ) {
			offset += iter_incr;
		}
	}

	if ( offset != 0 ) {
		return -1;
	}
	
	return pos;
}
Ejemplo n.º 7
0
/* Print last element */
void printLast(UBreakIterator* boundary, UChar* str) {
  int32_t start;
  int32_t end = ubrk_last(boundary);
  start = ubrk_previous(boundary);
  printTextRange(str, start, end );
}
static jint lastImpl(JNIEnv*, jclass, jint address) {
    return ubrk_last(breakIterator(address));
}
Ejemplo n.º 9
0
/* {{{ grapheme_strrpos_utf16 - strrpos using utf16 */
int
grapheme_strrpos_utf16(unsigned char *haystack, int32_t haystack_len, unsigned char*needle, int32_t needle_len, int32_t offset, int f_ignore_case TSRMLS_DC)
{
    UChar *uhaystack, *puhaystack, *uhaystack_end, *uneedle;
    int32_t uhaystack_len, uneedle_len;
    UErrorCode status;
    unsigned char u_break_iterator_buffer[U_BRK_SAFECLONE_BUFFERSIZE];
    UBreakIterator* bi = NULL;
    int ret_pos, pos;

    /* convert the strings to UTF-16. */
    uhaystack = NULL;
    uhaystack_len = 0;
    status = U_ZERO_ERROR;
    intl_convert_utf8_to_utf16(&uhaystack, &uhaystack_len, (char *) haystack, haystack_len, &status );

    if ( U_FAILURE( status ) ) {
        /* Set global error code. */
        intl_error_set_code( NULL, status TSRMLS_CC );

        /* Set error messages. */
        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
        efree( uhaystack );
        return -1;
    }

    if ( f_ignore_case ) {
        grapheme_intl_case_fold(&uhaystack, &uhaystack, &uhaystack_len, &status );
    }

    /* get a pointer to the haystack taking into account the offset */
    bi = NULL;
    status = U_ZERO_ERROR;
    bi = grapheme_get_break_iterator(u_break_iterator_buffer, &status TSRMLS_CC );

    puhaystack = grapheme_get_haystack_offset(bi, uhaystack, uhaystack_len, offset);

    if ( NULL == puhaystack ) {
        intl_error_set( NULL, U_ILLEGAL_ARGUMENT_ERROR, "grapheme_strpos: Offset not contained in string", 1 TSRMLS_CC );
        efree( uhaystack );
        ubrk_close (bi);
        return -1;
    }

    uneedle = NULL;
    uneedle_len = 0;
    status = U_ZERO_ERROR;
    intl_convert_utf8_to_utf16(&uneedle, &uneedle_len, (char *) needle, needle_len, &status );

    if ( U_FAILURE( status ) ) {
        /* Set global error code. */
        intl_error_set_code( NULL, status TSRMLS_CC );

        /* Set error messages. */
        intl_error_set_custom_msg( NULL, "Error converting input string to UTF-16", 0 TSRMLS_CC );
        efree( uhaystack );
        efree( uneedle );
        ubrk_close (bi);
        return -1;
    }

    if ( f_ignore_case ) {
        grapheme_intl_case_fold(&uneedle, &uneedle, &uneedle_len, &status );
    }

    ret_pos = -1;   /* -1 represents 'not found' */

    /* back up until there's needle_len characters to compare */

    uhaystack_end = uhaystack + uhaystack_len;
    pos = ubrk_last(bi);
    puhaystack = uhaystack + pos;

    while ( uhaystack_end - puhaystack < uneedle_len ) {

        pos = ubrk_previous(bi);

        if ( UBRK_DONE == pos ) {
            break;
        }

        puhaystack = uhaystack + pos;
    }

    /* is there enough haystack left to hold the needle? */
    if ( ( uhaystack_end - puhaystack ) < uneedle_len ) {
        /* not enough, not found */
        goto exit;
    }

    while ( UBRK_DONE != pos ) {

        if (!u_memcmp(uneedle, puhaystack, uneedle_len)) {  /* needle_len - 1 in zend memnstr? */

            /* does the grapheme in the haystack end at the same place as the last grapheme in the needle? */

            if ( ubrk_isBoundary(bi, pos + uneedle_len) ) {

                /* found it, get grapheme count offset */
                ret_pos = grapheme_count_graphemes(bi, uhaystack, pos);
                break;
            }

            /* set position back */
            ubrk_isBoundary(bi, pos);
        }

        pos = ubrk_previous(bi);
        puhaystack = uhaystack + pos;
    }

exit:
    efree( uhaystack );
    efree( uneedle );
    ubrk_close (bi);

    return ret_pos;
}
int textBreakLast(TextBreakIterator* iterator)
{
    return ubrk_last(reinterpret_cast<UBreakIterator*>(iterator));
}
Ejemplo n.º 11
0
static void TestBreakIteratorCAPI()
{
    UErrorCode status = U_ZERO_ERROR;
    UBreakIterator *word, *sentence, *line, *character, *b, *bogus;
    int32_t start,pos,end,to;
    int32_t i;
    int32_t count = 0;

    UChar text[50];

    /* Note:  the adjacent "" are concatenating strings, not adding a \" to the
       string, which is probably what whoever wrote this intended.  Don't fix,
       because it would throw off the hard coded break positions in the following
       tests. */
    u_uastrcpy(text, "He's from Africa. ""Mr. Livingston, I presume?"" Yeah");


/*test ubrk_open()*/
    log_verbose("\nTesting BreakIterator open functions\n");
                                            
    /* Use french for fun */
    word         = ubrk_open(UBRK_WORD, "en_US", text, u_strlen(text), &status);
    if(status == U_FILE_ACCESS_ERROR) {
        log_data_err("Check your data - it doesn't seem to be around\n");
        return;
    } else if(U_FAILURE(status)){
        log_err_status(status, "FAIL: Error in ubrk_open() for word breakiterator: %s\n", myErrorName(status));
    }
    else{
        log_verbose("PASS: Successfully opened  word breakiterator\n");
    }
    
    sentence     = ubrk_open(UBRK_SENTENCE, "en_US", text, u_strlen(text), &status);
    if(U_FAILURE(status)){
        log_err_status(status, "FAIL: Error in ubrk_open() for sentence breakiterator: %s\n", myErrorName(status));
        return;
    }
    else{
        log_verbose("PASS: Successfully opened  sentence breakiterator\n");
    }
    
    line         = ubrk_open(UBRK_LINE, "en_US", text, u_strlen(text), &status);
    if(U_FAILURE(status)){
        log_err("FAIL: Error in ubrk_open() for line breakiterator: %s\n", myErrorName(status));
        return;
    }
    else{
        log_verbose("PASS: Successfully opened  line breakiterator\n");
    }
    
    character     = ubrk_open(UBRK_CHARACTER, "en_US", text, u_strlen(text), &status);
    if(U_FAILURE(status)){
        log_err("FAIL: Error in ubrk_open() for character breakiterator: %s\n", myErrorName(status));
        return;
    }
    else{
        log_verbose("PASS: Successfully opened  character breakiterator\n");
    }
    /*trying to open an illegal iterator*/
    bogus     = ubrk_open((UBreakIteratorType)5, "en_US", text, u_strlen(text), &status);
    if(U_SUCCESS(status)){
        log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n");
    }
    if(U_FAILURE(status)){
        if(status != U_ILLEGAL_ARGUMENT_ERROR){
            log_err("FAIL: Error in ubrk_open() for BOGUS breakiterator. Expected U_ILLEGAL_ARGUMENT_ERROR\n Got %s\n", myErrorName(status));
        }
    }
    status=U_ZERO_ERROR;


/* ======= Test ubrk_countAvialable() and ubrk_getAvialable() */

    log_verbose("\nTesting ubrk_countAvailable() and ubrk_getAvailable()\n");
    count=ubrk_countAvailable();
    /* use something sensible w/o hardcoding the count */
    if(count < 0){
        log_err("FAIL: Error in ubrk_countAvialable() returned %d\n", count);
    }
    else{
        log_verbose("PASS: ubrk_countAvialable() successful returned %d\n", count);
    }
    for(i=0;i<count;i++)
    {
        log_verbose("%s\n", ubrk_getAvailable(i)); 
        if (ubrk_getAvailable(i) == 0)
            log_err("No locale for which breakiterator is applicable\n");
        else 
            log_verbose("A locale %s for which breakiterator is applicable\n",ubrk_getAvailable(i));
    }

/*========Test ubrk_first(), ubrk_last()...... and other functions*/

    log_verbose("\nTesting the functions for word\n");
    start = ubrk_first(word);
    if(start!=0)
        log_err("error ubrk_start(word) did not return 0\n");
    log_verbose("first (word = %d\n", (int32_t)start);
       pos=ubrk_next(word);
    if(pos!=4)
        log_err("error ubrk_next(word) did not return 4\n");
    log_verbose("next (word = %d\n", (int32_t)pos);
    pos=ubrk_following(word, 4);
    if(pos!=5)
        log_err("error ubrl_following(word,4) did not return 6\n");
    log_verbose("next (word = %d\n", (int32_t)pos);
    end=ubrk_last(word);
    if(end!=49)
        log_err("error ubrk_last(word) did not return 49\n");
    log_verbose("last (word = %d\n", (int32_t)end);
    
    pos=ubrk_previous(word);
    log_verbose("%d   %d\n", end, pos);
     
    pos=ubrk_previous(word);
    log_verbose("%d \n", pos);

    if (ubrk_isBoundary(word, 2) != FALSE) {
        log_err("error ubrk_isBoundary(word, 2) did not return FALSE\n");
    }
    pos=ubrk_current(word);
    if (pos != 4) {
        log_err("error ubrk_current() != 4 after ubrk_isBoundary(word, 2)\n");
    }
    if (ubrk_isBoundary(word, 4) != TRUE) {
        log_err("error ubrk_isBoundary(word, 4) did not return TRUE\n");
    }


    
    log_verbose("\nTesting the functions for character\n");
    ubrk_first(character);
    pos = ubrk_following(character, 5);
    if(pos!=6)
       log_err("error ubrk_following(character,5) did not return 6\n");
    log_verbose("Following (character,5) = %d\n", (int32_t)pos);
    pos=ubrk_following(character, 18);
    if(pos!=19)
       log_err("error ubrk_following(character,18) did not return 19\n");
    log_verbose("Followingcharacter,18) = %d\n", (int32_t)pos);
    pos=ubrk_preceding(character, 22);
    if(pos!=21)
       log_err("error ubrk_preceding(character,22) did not return 21\n");
    log_verbose("preceding(character,22) = %d\n", (int32_t)pos);
    

    log_verbose("\nTesting the functions for line\n");
    pos=ubrk_first(line);
    if(pos != 0)
        log_err("error ubrk_first(line) returned %d, expected 0\n", (int32_t)pos);
    pos = ubrk_next(line);
    pos=ubrk_following(line, 18);
    if(pos!=22)
        log_err("error ubrk_following(line) did not return 22\n");
    log_verbose("following (line) = %d\n", (int32_t)pos);

    
    log_verbose("\nTesting the functions for sentence\n");
    ubrk_first(sentence);
    pos = ubrk_current(sentence);
    log_verbose("Current(sentence) = %d\n", (int32_t)pos);
       pos = ubrk_last(sentence);
    if(pos!=49)
        log_err("error ubrk_last for sentence did not return 49\n");
    log_verbose("Last (sentence) = %d\n", (int32_t)pos);
    ubrk_first(sentence);
    to = ubrk_following( sentence, 0 );
    if (to == 0) log_err("ubrk_following returned 0\n");
    to = ubrk_preceding( sentence, to );
    if (to != 0) log_err("ubrk_preceding didn't return 0\n");
    if (ubrk_first(sentence)!=ubrk_current(sentence)) {
        log_err("error in ubrk_first() or ubrk_current()\n");
    }
    
 
    /*---- */
    /*Testing ubrk_open and ubrk_close()*/
   log_verbose("\nTesting open and close for us locale\n");
    b = ubrk_open(UBRK_WORD, "fr_FR", text, u_strlen(text), &status);
    if (U_FAILURE(status)) {
        log_err("ubrk_open for word returned NULL: %s\n", myErrorName(status));
    }
    ubrk_close(b);

    /* Test setText and setUText */
    {
        UChar s1[] = {0x41, 0x42, 0x20, 0};
        UChar s2[] = {0x41, 0x42, 0x43, 0x44, 0x45, 0};
        UText *ut = NULL;
        UBreakIterator *bb;
        int j;

        log_verbose("\nTesting ubrk_setText() and ubrk_setUText()\n");
        status = U_ZERO_ERROR;
        bb = ubrk_open(UBRK_WORD, "en_US", NULL, 0, &status);
        TEST_ASSERT_SUCCESS(status);
        ubrk_setText(bb, s1, -1, &status);
        TEST_ASSERT_SUCCESS(status);
        ubrk_first(bb);
        j = ubrk_next(bb);
        TEST_ASSERT(j == 2);
        ut = utext_openUChars(ut, s2, -1, &status);
        ubrk_setUText(bb, ut, &status);
        TEST_ASSERT_SUCCESS(status);
        j = ubrk_next(bb);
        TEST_ASSERT(j == 5);

        ubrk_close(bb);
        utext_close(ut);
    }

    ubrk_close(word);
    ubrk_close(sentence);
    ubrk_close(line);
    ubrk_close(character);
}
Ejemplo n.º 12
0
/*
 * imp: common/ubrk.cpp
 * hdr: common/unicode/ubrk.h
 * @stable ICU 2.0
 * #if !UCONFIG_NO_BREAK_ITERATION
 * (don't actually conditionalize this, if the underlying library is not
 * built with break iteration, we want to fail at build time, not runtime)
 */
U_CAPI int32_t U_EXPORT2
ubrk_last_4_0(UBreakIterator *bi)
{
    return ubrk_last(bi);
}
Ejemplo n.º 13
0
static void TestBreakIteratorSuppressions(void) {
    const TestBISuppressionsItem * itemPtr;
    
    for (itemPtr = testBISuppressionsItems; itemPtr->locale != NULL; itemPtr++) {
        UChar textU[kTextULenMax];
        int32_t textULen = u_unescape(itemPtr->text, textU, kTextULenMax);
        UErrorCode status = U_ZERO_ERROR;
        UBreakIterator *bi = ubrk_open(UBRK_SENTENCE, itemPtr->locale, textU, textULen, &status);
        log_verbose("#%d: %s\n", (itemPtr-testBISuppressionsItems), itemPtr->locale);
        if (U_SUCCESS(status)) {
            int32_t offset, start;
            const int32_t * expOffsetPtr;
            const int32_t * expOffsetStart;

            expOffsetStart = expOffsetPtr = itemPtr->expFwdOffsets;
            ubrk_first(bi);
            for (; (offset = ubrk_next(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
                if (offset != *expOffsetPtr) {
                    log_err("FAIL: ubrk_next loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset);
                }
            }
            if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
                log_err("FAIL: ubrk_next loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", itemPtr->locale, offset, *expOffsetPtr);
            }

            expOffsetStart = expOffsetPtr = itemPtr->expFwdOffsets;
            start = ubrk_first(bi) + 1;
            for (; (offset = ubrk_following(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
                if (offset != *expOffsetPtr) {
                    log_err("FAIL: ubrk_following(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset);
                }
                start = *expOffsetPtr + 1;
            }
            if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
                log_err("FAIL: ubrk_following(%d) loc \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr);
            }

            expOffsetStart = expOffsetPtr = itemPtr->expRevOffsets;
            offset = ubrk_last(bi);
            log_verbose("___ @%d ubrk_last\n", offset);
            if(offset == 0) {
              log_err("FAIL: ubrk_last loc \"%s\" unexpected %d\n", itemPtr->locale, offset);
            }
            for (; (offset = ubrk_previous(bi)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
                if (offset != *expOffsetPtr) {
                    log_err("FAIL: ubrk_previous loc \"%s\", expected %d, got %d\n", itemPtr->locale, *expOffsetPtr, offset);
                } else {
                    log_verbose("[%d] @%d ubrk_previous()\n", (expOffsetPtr - expOffsetStart), offset);
                }
            }
            if (offset != UBRK_DONE || *expOffsetPtr >= 0) {
                log_err("FAIL: ubrk_previous loc \"%s\", expected UBRK_DONE & expOffset[%d] -1, got %d and %d\n", itemPtr->locale,
                        expOffsetPtr - expOffsetStart,
                        offset, *expOffsetPtr);
            }

            expOffsetStart = expOffsetPtr = itemPtr->expRevOffsets;
            start = ubrk_last(bi) - 1;
            for (; (offset = ubrk_preceding(bi, start)) != UBRK_DONE && *expOffsetPtr >= 0; expOffsetPtr++) {
                if (offset != *expOffsetPtr) {
                    log_err("FAIL: ubrk_preceding(%d) loc \"%s\", expected %d, got %d\n", start, itemPtr->locale, *expOffsetPtr, offset);
                }
                start = *expOffsetPtr - 1;
            }
            if (start >=0 && (offset != UBRK_DONE || *expOffsetPtr >= 0)) {
                log_err("FAIL: ubrk_preceding loc(%d) \"%s\", expected UBRK_DONE & expOffset -1, got %d and %d\n", start, itemPtr->locale, offset, *expOffsetPtr);
            }

            ubrk_close(bi);
        } else {
            log_data_err("FAIL: ubrk_open(UBRK_SENTENCE, \"%s\", ...) status %s (Are you missing data?)\n", itemPtr->locale, u_errorName(status));
        }
    }
}
Ejemplo n.º 14
0
int32_t __hs_ubrk_last(UBreakIterator *bi)
{
    return ubrk_last(bi);
}