/* Converts the NFG string s into a flat sequence of codepoints in the
 * requested normalization form and installs that sequence as the storage of
 * out.
 *
 *   tc   - current thread context
 *   s    - the NFG string to read codepoints from
 *   form - the normalization form the output should be in
 *   out  - target array; it is checked with assert_codepoint_array and must
 *          be a native array of 32-bit integers
 *
 * On return out's slots pointer refers to a freshly MVM_malloc'd buffer, its
 * start is 0 and its elems is the number of codepoints written. This function
 * does not release whatever storage out referred to beforehand. */
void MVM_unicode_string_to_codepoints(MVMThreadContext *tc, MVMString *s, MVMNormalization form, MVMObject *out) {
    MVMCodepoint     *cps;
    MVMint64          used, capacity;
    MVMCodepointIter  iter;
    MVMArray         *target;

    /* Reject unsuitable output arrays before allocating anything. */
    assert_codepoint_array(tc, out, "Normalization output must be native array of 32-bit integers");

    /* Start with one slot per grapheme; maybe_grow_result enlarges on demand. */
    capacity = s->body.num_graphs;
    cps      = MVM_malloc(capacity * sizeof(MVMCodepoint));
    used     = 0;

    MVM_string_ci_init(tc, &iter, s);

    if (form != MVM_NORMALIZE_NFC) {
        /* Any form other than NFC has to go through a normalizer. */
        MVMNormalizer norm;
        MVMint32      pending;

        MVM_unicode_normalizer_init(tc, &norm, form);
        while (MVM_string_ci_has_more(tc, &iter)) {
            MVMCodepoint first;
            pending = MVM_unicode_normalizer_process_codepoint(tc, &norm,
                MVM_string_ci_get_codepoint(tc, &iter), &first);
            if (!pending)
                continue;

            /* The first ready codepoint comes back through the out
             * parameter; the remaining pending - 1 are fetched one by one. */
            maybe_grow_result(&cps, &capacity, used + pending);
            cps[used++] = first;
            for (pending--; pending > 0; pending--)
                cps[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        }

        /* Signal end of input and drain what the normalizer still holds. */
        MVM_unicode_normalizer_eof(tc, &norm);
        pending = MVM_unicode_normalizer_available(tc, &norm);
        maybe_grow_result(&cps, &capacity, used + pending);
        for (; pending; pending--)
            cps[used++] = MVM_unicode_normalizer_get_codepoint(tc, &norm);
        MVM_unicode_normalizer_cleanup(tc, &norm);
    }
    else {
        /* NFG is built on top of NFC, so for NFC the iterator output can be
         * copied across unchanged. */
        while (MVM_string_ci_has_more(tc, &iter)) {
            maybe_grow_result(&cps, &capacity, used + 1);
            cps[used] = MVM_string_ci_get_codepoint(tc, &iter);
            used++;
        }
    }

    /* Hand the buffer over to the array body. */
    target = (MVMArray *)out;
    target->body.slots.u32 = cps;
    target->body.start     = 0;
    target->body.elems     = used;
}
/* Fetches the next codepoint from the iterator ci into *cp.
 *
 * Returns 0 when a codepoint was read and stored in *cp.
 * Returns 1 when the iterator has nothing left; *cp is then set to the
 * END_OF_NUM sentinel. */
static int get_cp(MVMThreadContext *tc, MVMCodepointIter *ci, MVMCodepoint *cp) {
    if (!MVM_string_ci_has_more(tc, ci)) {
        *cp = END_OF_NUM; /* FIXME pick a safe value */
        return 1;
    }
    *cp = MVM_string_ci_get_codepoint(tc, ci);
    return 0;
}
/* Encodes the specified substring to latin-1. Anything outside of latin-1 range
 * will become a ?. The result string is NULL terminated, but the specified
 * size is the non-null part.
 *
 *   tc          - current thread context
 *   str         - string to encode
 *   output_size - if non-NULL, receives the number of bytes before the
 *                 terminating NUL (always the substring length)
 *   start       - offset of the first element to encode; must be within
 *                 0..graphs(str)
 *   length      - number of elements to encode, or -1 for "up to the end"
 *
 * Throws an adhoc exception ("start out of range" / "length out of range")
 * when the requested window does not fit inside the string.
 *
 * Returns a buffer obtained from MVM_malloc holding length + 1 bytes. */
char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length) {
    /* Latin-1 is a single byte encoding, so each grapheme will just become
     * a single byte. */
    MVMuint32 startu = (MVMuint32)start;
    MVMStringIndex strgraphs = MVM_string_graphs(tc, str);
    MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length);
    MVMuint8 *result;

    /* must check start first since it's used in the length check */
    if (start < 0 || start > strgraphs)
        MVM_exception_throw_adhoc(tc, "start out of range");
    if (length < -1 || start + lengthu > strgraphs)
        MVM_exception_throw_adhoc(tc, "length out of range");

    result = MVM_malloc(lengthu + 1);
    if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) {
        /* No encoding needed; directly copy the requested window. The range
         * checks above guarantee startu + lengthu stays inside the blob. */
        memcpy(result, str->body.storage.blob_ascii + startu, lengthu);
        result[lengthu] = 0;
    }
    else {
        MVMuint32 skipped = 0;
        MVMuint32 i = 0;
        MVMCodepointIter ci;
        MVM_string_ci_init(tc, &ci, str);

        /* Discard everything in front of the requested window.
         * NOTE(review): the iterator hands out codepoints, so start is
         * applied per codepoint here; that matches grapheme offsets only as
         * long as each grapheme is a single codepoint - confirm this is
         * acceptable for callers that pass a non-zero start. */
        while (skipped < startu && MVM_string_ci_has_more(tc, &ci)) {
            MVM_string_ci_get_codepoint(tc, &ci);
            skipped++;
        }

        /* Never write more than the lengthu bytes that were allocated; the
         * iterator keeps going to the end of the string (and may produce
         * more codepoints than graphemes), which used to run past the end
         * of the buffer. */
        while (i < lengthu && MVM_string_ci_has_more(tc, &ci)) {
            MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci);
            if (ord >= 0 && ord <= 255)
                result[i] = (MVMuint8)ord;
            else
                result[i] = '?';
            i++;
        }
        result[i] = 0;
    }
    if (output_size)
        *output_size = lengthu;
    return (char *)result;
}
/* Encodes the specified substring to latin-1. Anything outside of latin-1 range * will become a ?. The result string is NULL terminated, but the specified * size is the non-null part. */ char * MVM_string_latin1_encode_substr(MVMThreadContext *tc, MVMString *str, MVMuint64 *output_size, MVMint64 start, MVMint64 length, MVMString *replacement, MVMint32 translate_newlines) { /* Latin-1 is a single byte encoding, but \r\n is a 2-byte grapheme, so we * may have to resize as we go. */ MVMuint32 startu = (MVMuint32)start; MVMStringIndex strgraphs = MVM_string_graphs(tc, str); MVMuint32 lengthu = (MVMuint32)(length == -1 ? strgraphs - startu : length); MVMuint8 *result; size_t result_alloc; MVMuint8 *repl_bytes = NULL; MVMuint64 repl_length; /* must check start first since it's used in the length check */ if (start < 0 || start > strgraphs) MVM_exception_throw_adhoc(tc, "start out of range"); if (length < -1 || start + lengthu > strgraphs) MVM_exception_throw_adhoc(tc, "length out of range"); if (replacement) repl_bytes = (MVMuint8 *) MVM_string_latin1_encode_substr(tc, replacement, &repl_length, 0, -1, NULL, translate_newlines); result_alloc = lengthu; result = MVM_malloc(result_alloc + 1); if (str->body.storage_type == MVM_STRING_GRAPHEME_ASCII) { /* No encoding needed; directly copy. 
*/ memcpy(result, str->body.storage.blob_ascii, lengthu); result[lengthu] = 0; if (output_size) *output_size = lengthu; } else { MVMuint32 i = 0; MVMCodepointIter ci; MVM_string_ci_init(tc, &ci, str, translate_newlines); while (MVM_string_ci_has_more(tc, &ci)) { MVMCodepoint ord = MVM_string_ci_get_codepoint(tc, &ci); if (i == result_alloc) { result_alloc += 8; result = MVM_realloc(result, result_alloc + 1); } if (ord >= 0 && ord <= 255) { result[i] = (MVMuint8)ord; i++; } else if (replacement) { if (repl_length >= result_alloc || i >= result_alloc - repl_length) { result_alloc += repl_length; result = MVM_realloc(result, result_alloc + 1); } memcpy(result + i, repl_bytes, repl_length); i += repl_length; } else { MVM_free(result); MVM_free(repl_bytes); MVM_exception_throw_adhoc(tc, "Error encoding Latin-1 string: could not encode codepoint %d", ord); } } result[i] = 0; if (output_size) *output_size = i; } MVM_free(repl_bytes); return (char *)result; }
/* Pushes collation data for one or more codepoints onto stack.
 *
 *   tc       - current thread context
 *   stack    - collation stack that receives the keys
 *   ci       - codepoint iterator further codepoints are pulled from
 *   cp_maybe - codepoints that were already read; only used when cp_num > 0
 *   cp_num   - number of valid entries in cp_maybe, or 0 to make this
 *              function read the first codepoint from ci itself
 *   name     - passed through unchanged to the key-pushing helpers
 *
 * Returns the number of codepoints processed, including those handled by the
 * recursive call made for leftover buffered codepoints.
 *
 * The local buffer holds 10 codepoints; the lookup loop never fills it beyond
 * index 9. NOTE(review): cp_maybe is copied without a bounds check, so
 * callers are assumed to pass cp_num <= 10 - confirm. */
static MVMint64 collation_push_cp (MVMThreadContext *tc, collation_stack *stack, MVMCodepointIter *ci, int *cp_maybe, int cp_num, char *name) {
    MVMCodepoint cps[10];
    MVMint64 num_cps_processed = 0;
    int query = -1;
    /* If supplied a cp_num of 0 we need to grab the codepoint from the
     * codepoint iterator. Otherwise the values we were passed are the
     * codepoints we should process */
    if (cp_num == 0) {
        cps[0] = MVM_string_ci_get_codepoint(tc, ci);
        cp_num = 1;
    }
    else {
        MVMint32 i;
        for (i = 0; i < cp_num; i++) {
            cps[i] = cp_maybe[i];
        }
    }
    query = get_main_node(tc, cps[0], 0, starter_main_nodes_elems);
    if (query != -1) {
        DEBUG_PRINT_SUB_NODE(main_nodes[query]);
        /* If there are no sub_node_elems that means we don't need to look at
         * the next codepoint, we are already at the correct node
         * If there's no more codepoints in the iterator we also are done here */
        if (main_nodes[query].sub_node_elems < 1 || (cp_num < 2 && !MVM_string_ci_has_more(tc, ci))) {
            collation_add_keys_from_node(tc, NULL, stack, ci, name, cps[0], &main_nodes[query]);
            num_cps_processed++;
        }
        /* Otherwise we need to check the next codepoint(s) (0 < sub_node_elems) */
        else {
            MVMint64 last_good_i = 0, last_good_result = -1;
            MVMint64 i, result = query;
            DEBUG_PRINT_SUB_NODE(main_nodes[query]);
            /* i is advanced at the top of the body, so cps[i] is at most
             * cps[9]. */
            for (i = 0; result != -1 && MVM_string_ci_has_more(tc, ci) && i < 9;) {
                i++;
                /* Only grab a codepoint if it doesn't already exist in the array */
                if (cp_num <= i) {
                    cps[i] = MVM_string_ci_get_codepoint(tc, ci);
                    cp_num++;
                }
                result = find_next_node(tc, main_nodes[result], cps[i]);
                /* If we got something other than -1 and it has collation elements
                 * store the value so we know how far is valid */
                if (result != -1 && main_nodes[result].collation_key_elems != 0) {
                    last_good_i = i;
                    last_good_result = result;
                }
                if (result != -1)
                    DEBUG_PRINT_SUB_NODE(main_nodes[result]);
            }
            /* If there is no last_good_result we should return a value from main_nodes */
            DEBUG_PRINT_SUB_NODE( (last_good_result == -1 ? main_nodes[query] : main_nodes[last_good_result]) );
            /* If the terminal_subnode can't be processed then that means it will push the starter codepoint ( cp[0] )'s value onto
             * the stack, and we must set last_good_i to 0 since it didn't work out */
            if (!collation_add_keys_from_node(tc, (last_good_result == -1 ? NULL : &main_nodes[last_good_result]), stack, ci, name, cps[0], &main_nodes[query])) {
                /* If we get 0 from collation_add_keys_from_node then we only processed
                 * a single codepoint so set last_good_i to 0 */
                last_good_i = 0;
            }
            num_cps_processed = last_good_i + 1;
        }
    }
    else {
        /* No main node for the first codepoint: push it on its own. The
         * helper's return value is not needed here. */
        collation_push_MVM_values(tc, cps[0], stack, ci, name);
        num_cps_processed = 1;
    }
    /* If any buffered codepoints were not consumed, call collation_push_cp
     * on the remaining ones */
    if (num_cps_processed < cp_num) {
        return num_cps_processed + collation_push_cp(tc, stack, ci, cps + num_cps_processed, cp_num - num_cps_processed, name);
    }
    return num_cps_processed;
}