Example #1
0
static void grapheme_composition(MVMThreadContext *tc, MVMNormalizer *n, MVMint32 from, MVMint32 to) {
    if (to - from >= 2) {
        MVMint32 starterish = from;
        MVMint32 insert_pos = from;
        MVMint32 pos        = from;
        while (pos < to) {
            MVMint32 next_pos = pos + 1;
            if (next_pos == to || should_break(tc, n->buffer[pos], n->buffer[next_pos])) {
                /* Last in buffer or next code point is a non-starter; turn
                 * sequence into a synthetic. */
                MVMGrapheme32 g = MVM_nfg_codes_to_grapheme(tc, n->buffer + starterish, next_pos - starterish);
                if (n->translate_newlines && g == MVM_nfg_crlf_grapheme(tc))
                    g = '\n';
                n->buffer[insert_pos++] = g;

                /* The next code point is our new starterish (harmless if we
                 * are already at the end of the buffer). */
                starterish = next_pos;
            }
            pos++;
        }
        memmove(n->buffer + insert_pos, n->buffer + to, (n->buffer_end - to) * sizeof(MVMCodepoint));
        n->buffer_end -= to - insert_pos;
    }
}
Example #2
0
/* Returns non-zero if the result of concatenating the two strings will freely
 * leave us in NFG without any further effort. */
MVMint32 MVM_nfg_is_concat_stable(MVMThreadContext *tc, MVMString *a, MVMString *b) {
    MVMGrapheme32 last_a;
    MVMGrapheme32 first_b;
    MVMGrapheme32 crlf;

    /* If either string is empty, we're good. */
    if (a->body.num_graphs == 0 || b->body.num_graphs == 0)
        return 1;

    /* Get first and last graphemes of the strings. */
    last_a  = MVM_string_get_grapheme_at_nocheck(tc, a, a->body.num_graphs - 1);
    first_b = MVM_string_get_grapheme_at_nocheck(tc, b, 0);
    /* Put the case where we are adding a lf or crlf line ending */
    if (first_b == '\n')
        /* If we see \r + \n we need to renormalize. Otherwise we're good */
        return last_a == '\r' ? 0 : 1;

    crlf = MVM_nfg_crlf_grapheme(tc);
    /* As a control code we are always going to break if we see one of these.
     * Check first_b for speeding up line endings */
    if (first_b == crlf || last_a == crlf)
        return 0;
    /* If either is synthetic other than "\r\n", assume we'll have to re-normalize
     * (this is an over-estimate, most likely). Note if you optimize this that it
     * serves as a guard for what follows.
     * TODO get the last codepoint of last_a and first codepoint of first_b and call
     * MVM_unicode_normalize_should_break */
    if (last_a < 0 || first_b < 0)
        return 0;

    /* If both less than the first significant char for NFC we are good */
    if (last_a < MVM_NORMALIZE_FIRST_SIG_NFC && first_b < MVM_NORMALIZE_FIRST_SIG_NFC) {
        return 1;
    }
    else {
        /* Check if the two codepoints would be joined during normalization.
         * Returns 1 if they would break and thus is safe under concat, or 0 if
         * they would be joined. */
        MVMNormalizer norm;
        int rtrn;
        MVM_unicode_normalizer_init(tc, &norm, MVM_NORMALIZE_NFG);
        /* Since we are only looking at two codepoints, we don't know what came
         * before. Because of special rules with Regional Indicators, pretend
         * the previous codepoint was a regional indicator. This will return the
         * special value of 2 from MVM_unicode_normalize_should_break and trigger
         * re_nfg if last_a and first_b are both regional indicators and we will
         * never break NFG regardless of what the codepoint before last_a is. */
        norm.regional_indicator = 1;
        rtrn = MVM_unicode_normalize_should_break(tc, last_a, first_b, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
        /* If both CCC are non-zero then it may need to be reordered. For now return 0.
         * This can be optimized. */
        if (MVM_unicode_relative_ccc(tc, last_a) != 0 && MVM_unicode_relative_ccc(tc, first_b) != 0)
            return 0;
        return rtrn;
    }
}
Example #3
0
/* Sets a decode stream separator to its default value. */
void MVM_string_decode_stream_sep_default(MVMThreadContext *tc, MVMDecodeStreamSeparators *sep_spec) {
    sep_spec->num_seps = 2;
    sep_spec->sep_lengths = MVM_malloc(sep_spec->num_seps * sizeof(MVMint32));
    sep_spec->sep_graphemes = MVM_malloc(sep_spec->num_seps * sizeof(MVMGrapheme32));

    sep_spec->sep_lengths[0] = 1;
    sep_spec->sep_graphemes[0] = '\n';

    sep_spec->sep_lengths[1] = 1;
    sep_spec->sep_graphemes[1] = MVM_nfg_crlf_grapheme(tc);
}
Example #4
0
/* Decodes the specified number of bytes of latin1 into an NFG string,
 * creating a result of the specified type. The type must have the MVMString
 * REPR. */
MVMString * MVM_string_latin1_decode(MVMThreadContext *tc, const MVMObject *result_type,
                                     char *latin1_c, size_t bytes) {
    MVMuint8  *latin1 = (MVMuint8 *)latin1_c;
    MVMString *result = (MVMString *)REPR(result_type)->allocate(tc, STABLE(result_type));
    size_t i, result_graphs;

    result->body.storage_type    = MVM_STRING_GRAPHEME_32;
    result->body.storage.blob_32 = MVM_malloc(sizeof(MVMint32) * bytes);

    result_graphs = 0;
    for (i = 0; i < bytes; i++) {
        if (latin1[i] == '\r' && i + 1 < bytes && latin1[i + 1] == '\n') {
            result->body.storage.blob_32[result_graphs++] = MVM_nfg_crlf_grapheme(tc);
            i++;
        }
        else {
            result->body.storage.blob_32[result_graphs++] = latin1[i];
        }
    }
    result->body.num_graphs = result_graphs;

    return result;
}
Example #5
0
/* Decodes using a decodestream. Decodes as far as it can with the input
 * buffers, or until a stopper is reached. */
MVMuint32 MVM_string_latin1_decodestream(MVMThreadContext *tc, MVMDecodeStream *ds,
                                    const MVMint32 *stopper_chars,
                                    MVMDecodeStreamSeparators *seps) {
    MVMint32 count = 0, total = 0;
    MVMint32 bufsize;
    MVMGrapheme32 *buffer;
    MVMDecodeStreamBytes *cur_bytes;
    MVMDecodeStreamBytes *last_accept_bytes = ds->bytes_head;
    MVMint32 last_accept_pos, last_was_cr;
    MVMuint32 reached_stopper;

    /* If there's no buffers, we're done. */
    if (!ds->bytes_head)
        return 0;
    last_accept_pos = ds->bytes_head_pos;

    /* If we're asked for zero chars, also done. */
    if (stopper_chars && *stopper_chars == 0)
        return 1;

    /* Take length of head buffer as initial guess. */
    bufsize = ds->bytes_head->length;
    buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));

    /* Decode each of the buffers. */
    cur_bytes = ds->bytes_head;
    last_was_cr = 0;
    reached_stopper = 0;
    while (cur_bytes) {
        /* Process this buffer. */
        MVMint32  pos = cur_bytes == ds->bytes_head ? ds->bytes_head_pos : 0;
        unsigned char *bytes = (unsigned char *)cur_bytes->bytes;
        while (pos < cur_bytes->length) {
            MVMCodepoint codepoint = bytes[pos++];
            MVMGrapheme32 graph;
            if (last_was_cr) {
                if (codepoint == '\n') {
                    graph = MVM_nfg_crlf_grapheme(tc);
                }
                else {
                    graph = '\r';
                    pos--;
                }
                last_was_cr = 0;
            }
            else if (codepoint == '\r') {
                last_was_cr = 1;
                continue;
            }
            else {
                graph = codepoint;
            }
            if (count == bufsize) {
                /* We filled the buffer. Attach this one to the buffers
                 * linked list, and continue with a new one. */
                MVM_string_decodestream_add_chars(tc, ds, buffer, bufsize);
                buffer = MVM_malloc(bufsize * sizeof(MVMGrapheme32));
                count = 0;
            }
            buffer[count++] = graph;
            last_accept_bytes = cur_bytes;
            last_accept_pos = pos;
            total++;
            if (stopper_chars && *stopper_chars == total) {
                reached_stopper = 1;
                goto done;
            }
            if (MVM_string_decode_stream_maybe_sep(tc, seps, codepoint)) {
                reached_stopper = 1;
                goto done;
            }
        }
        cur_bytes = cur_bytes->next;
    }
  done:

    /* Attach what we successfully parsed as a result buffer, and trim away
     * what we chewed through. */
    if (count) {
        MVM_string_decodestream_add_chars(tc, ds, buffer, count);
    }
    else {
        MVM_free(buffer);
    }
    MVM_string_decodestream_discard_to(tc, ds, last_accept_bytes, last_accept_pos);

    return reached_stopper;
}