/* Coerces the string s to a floating point number.
 *
 * Returns 0 when the first get_cp call returns non-zero (presumably an empty
 * string) or when nothing but whitespace is found. Otherwise returns the value
 * produced by parse_real; anything other than whitespace left after the number
 * is reported through parse_error as "trailing characters". */
MVMnum64 MVM_coerce_s_n(MVMThreadContext *tc, MVMString *s) {
    MVMCodepointIter iter;
    MVMCodepoint     cur;
    MVMnum64         value;

    MVM_string_ci_init(tc, &iter, s, 0, 0);

    /* Nothing to parse if the very first fetch comes back non-zero. */
    if (get_cp(tc, &iter, &cur))
        return 0;

    skip_whitespace(tc, &iter, &cur);

    /* Do we have only whitespace? */
    if (!MVM_string_ci_has_more(tc, &iter) && cur == END_OF_NUM)
        return 0;

    value = parse_real(tc, &iter, &cur, s);

    /* Trailing whitespace is allowed; anything else after it is an error. */
    skip_whitespace(tc, &iter, &cur);
    if (MVM_string_ci_has_more(tc, &iter) || cur != END_OF_NUM)
        parse_error(tc, s, "trailing characters");

    return value;
}
/* Takes an NFG string and populates the array out, which must be a 32-bit
 * integer array, with codepoints normalized according to the specified
 * normalization form.
 *
 * The codepoints are collected into a freshly MVM_malloc'd buffer which is
 * then installed directly as the array body's slots.
 * NOTE(review): the previous slots pointer of out is overwritten without being
 * freed in this function - confirm callers always pass an empty array. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepoint     *cps;
    MVMint64          used, capacity;
    MVMCodepointIter  iter;
    MVMArray         *target;

    /* The output must be a native 32-bit integer array. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Start with one slot per grapheme; grown on demand below. */
    capacity = s->body.num_graphs;
    cps      = MVM_malloc(capacity * sizeof(MVMCodepoint));
    used     = 0;

    MVM_string_ci_init(tc, &iter, s);

    if (form == MVM_NORMALIZE_NFC) {
        /* NFG is constructed out of NFC, so the iterator's output can be
         * copied across as-is. */
        while (MVM_string_ci_has_more(tc, &iter)) {
            maybe_grow_result(&cps, &capacity, used + 1);
            cps[used] = MVM_string_ci_get_codepoint(tc, &iter);
            used++;
        }
    }
    else {
        /* Any other form has to be pushed through a normalizer. */
        MVMNormalizer norm;
        MVMint32      pending;

        MVM_unicode_normalizer_init(tc, &norm, form);

        while (MVM_string_ci_has_more(tc, &iter)) {
            MVMCodepoint first;

            pending = MVM_unicode_normalizer_process_codepoint(tc, &norm,
                MVM_string_ci_get_codepoint(tc, &iter), &first);
            if (!pending)
                continue;

            /* The first ready codepoint is handed back directly; the
             * remaining pending - 1 are fetched from the normalizer. */
            maybe_grow_result(&cps, &capacity, used + pending);
            cps[used++] = first;
            for (pending--; pending > 0; pending--)
                cps[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }

        /* Signal end of input and drain whatever is still available. */
        MVM_unicode_normalizer_eof(tc, &norm);
        pending = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&cps, &capacity, used + pending);
        for (; pending; pending--)
            cps[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);

        MVM_unicode_normalizer_cleanup(tc, &norm);
    }

    /* Put result into array body. */
    target                 = (MVMArray *)out;
    target->body.slots.u32 = cps;
    target->body.start     = 0;
    target->body.elems     = used;
}
/* Encodes the specified substring to latin-1. Anything outside of latin-1 range
 * will become a ?. The result string is NULL terminated, but the specified
 * size is the non-null part.
 *
 * tc          - thread context, used for throwing exceptions.
 * str         - the string to encode.
 * output_size - if non-NULL, receives the number of bytes written, not
 *               counting the terminating NULL.
 * start       - index of the first grapheme; must be in 0..graphs(str).
 * length      - number of graphemes, or -1 for "through the end of str".
 *
 * Returns a buffer obtained from MVM_malloc / MVM_realloc (presumably to be
 * released by the caller with MVM_free). Throws an adhoc exception when start
 * or length is out of range. */
char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length) {
    /* Latin-1 is a single byte encoding, so each grapheme normally becomes a
     * single byte. */
    MVMuint32       startu    = (MVMuint32)start;
    MVMStringIndex  strgraphs = MVM_string_graphs(tc, str);
    MVMuint32       lengthu   = (MVMuint32)(length == -1 ? strgraphs - startu : length);
    MVMuint8       *result;
    size_t          result_alloc;
    size_t          written;

    /* must check start first since it's used in the length check */
    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length < -1 || start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    result_alloc = lengthu;
    result       = MVM_malloc(result_alloc + 1);

    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy the requested window. The range
         * checks above guarantee startu + lengthu stays inside the blob. */
        memcpy(result, str->body.storage.blob_ascii + startu, lengthu);
        written = lengthu;
    }
    else {
        /* NOTE(review): the codepoint iterator is not given start/length, so
         * this branch encodes every codepoint of str. The buffer therefore has
         * to grow on demand instead of trusting the lengthu estimate, which
         * would otherwise be overrun. */
        MVMCodepointIter ci;
        written = 0;
        MVM_string_ci_init(tc, &ci, str);
        while (MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
            if (written == result_alloc) {
                result_alloc += 8;
                result = MVM_realloc(result, result_alloc + 1);
            }
            result[written++] = (ord >= 0 && ord <= 255) ? (MVMuint8)ord : '?';
        }
    }

    /* One spare byte is always kept for the terminator. */
    result[written] = 0;
    if (output_size)
        *output_size = written;
    return (char *)result;
}
/* Encodes the specified substring to latin-1. Anything outside of latin-1 range * will become a ?. The result string is NULL terminated, but the specified * size is the non-null part. */ char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) { /* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we * may have to resize as we go. */ MVMuint32 startu = (MVMuint32)start; MVMStringIndex strgraphs = MVM_string_graphs(tc, str); MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length); MVMuint8 *result; size_t result_alloc; MVMuint8 *repl_bytes = NULL; MVMuint64 repl_length; /* must check start first since it's used in the length check */ if (start < 0 || start > strgraphs) MVM_exception_throw_adhoc(tc, "start out of range"); if (length < -1 || start + lengthu > strgraphs) MVM_exception_throw_adhoc(tc, "length out of range"); if (replacement) repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc, replacement, &repl_length, 0, -1, NULL, translate_newlines); result_alloc = lengthu; result = MVM_malloc(result_alloc + 1); if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) { /* No encoding needed; directly copy. 
*/ memcpy(result, str->body.storage.blob_ascii, lengthu); result[lengthu] = 0; if (output_size) *output_size = lengthu; } else { MVMuint32 i = 0; MVMCodepointIter ci; MVM_string_ci_init(tc, &ci, str, translate_newlines); while (MVM_string_ci_has_more(tc, &ci)) { MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci); if (i == result_alloc) { result_alloc += 8; result = MVM_realloc(result, result_alloc + 1); } if (ord >= 0 && ord <= 255) { result[i] = (MVMuint8)ord; i++; } else if (replacement) { if (repl_length >= result_alloc || i >= result_alloc - repl_length) { result_alloc += repl_length; result = MVM_realloc(result, result_alloc + 1); } memcpy(result + i, repl_bytes, repl_length); i += repl_length; } else { MVM_free(result); MVM_free(repl_bytes); MVM_exception_throw_adhoc(tc, "Error encoding Latin-1 string: could not encode codepoint %d", ord); } } result[i] = 0; if (output_size) *output_size = i; } MVM_free(repl_bytes); return (char *)result; }
/* MVM_unicode_string_compare implements the Unicode Collation Algorthm */ MVMint64 MVM_unicode_string_compare(MVMThreadContext *tc, MVMString *a, MVMString *b, MVMint64 collation_mode, MVMint64 lang_mode, MVMint64 country_mode) { MVMStringIndex alen, blen; /* Iteration variables */ MVMCodepointIter a_ci, b_ci; MVMGrapheme32 ai, bi; /* Set it all to 0 to start with. We alter this based on the collation_mode later on */ level_eval level_eval_settings = { { {0,0,0}, {0,0,0}, {0,0,0}, {0,0,0} } }; /* The default level_eval settings, used between two non-equal levels */ union level_eval_u2 level_eval_default = { {-1, 0, 1} }; /* Collation stacks */ collation_stack stack_a; collation_stack stack_b; ring_buffer buf_a, buf_b; /* This value stores what the return value would be if the strings were compared * by codepoint. This is used to break collation value ties */ MVMint64 compare_by_cp_rtrn = 0; MVMint64 pos_a = 0, pos_b = 0, i = 0, rtrn = 0; MVMint16 grab_a_done = 0, grab_b_done = 0; /* From 0 to 2 for primary, secondary, tertiary levels */ MVMint16 level_a = 0, level_b = 0; MVMint64 skipped_a = 0, skipped_b = 0; /* This code sets up level_eval_settings based on the collation_mode */ #define setmodeup(mode, level, Less, Same, More) {\ if (collation_mode & mode) {\ level_eval_settings.a[level].a2[0] += Less;\ level_eval_settings.a[level].a2[1] += Same;\ level_eval_settings.a[level].a2[2] += More;\ }\ } /* Primary */ setmodeup(MVM_COLLATION_PRIMARY_POSITIVE, 0, -1, 0, 1); setmodeup(MVM_COLLATION_PRIMARY_NEGATIVE, 0, 1, 0, -1); /* Secondary */ setmodeup(MVM_COLLATION_SECONDARY_POSITIVE, 1, -1, 0, 1); setmodeup(MVM_COLLATION_SECONDARY_NEGATIVE, 1, 1, 0, -1); /* Tertiary */ setmodeup(MVM_COLLATION_TERTIARY_POSITIVE, 2, -1, 0, 1); setmodeup(MVM_COLLATION_TERTIARY_NEGATIVE, 2, 1, 0, -1); /* Quaternary */ setmodeup(MVM_COLLATION_QUATERNARY_POSITIVE, 3, -1, 0, 1); setmodeup(MVM_COLLATION_QUATERNARY_NEGATIVE, 3, 1, 0, -1); DEBUG_COLLATION_MODE_PRINT(level_eval_settings); 
init_stack(tc, &stack_a); init_stack(tc, &stack_b); MVM_string_check_arg(tc, a, "compare"); MVM_string_check_arg(tc, b, "compare"); /* Simple cases when one or both are zero length. */ alen = MVM_string_graphs_nocheck(tc, a); blen = MVM_string_graphs_nocheck(tc, b); if (alen == 0 || blen == 0) return collation_return_by_quaternary(tc, &level_eval_settings, alen, blen, 0); /* Initialize a codepoint iterator * For now we decompose utf8-c8 synthetics. Eventually we may want to pass * them back and choose some way to generate sorting info for them, similar * to how Unassigned codepoints are dealt with */ MVMROOT(tc, a_ci, { MVM_string_ci_init(tc, &a_ci, a, 0, 0); MVMROOT(tc, b_ci, { MVM_string_ci_init(tc, &b_ci, b, 0, 0); }); });