*/	REBSER *Decode_UTF_String(REBYTE *bp, REBCNT len, REBINT utf, REBFLG ccr)
/*
**		Decode a run of input bytes into a freshly made string series.
**
**		bp  - first input byte.
**		len - number of input bytes.
**		utf - requested encoding: 0 or 8 selects the UTF-8 decoder,
**		      +/-16 the UTF-16 decoder, +/-32 the UTF-32 decoder.
**		      The special value -1 asks What_UTF() to pick the encoding
**		      from a BOM; a recognized BOM is skipped before decoding.
**		ccr - handed unchanged to the decoder (presumably controls CR/LF
**		      conversion; R3-alpha hard-coded it, see
**		      https://github.com/rebol/rebol-issues/issues/2336).
**
**		Returns the new series, or NULL when utf is none of the values
**		listed above.
**
***********************************************************************/
{
	REBSER *scratch = BUF_UTF8;	// shared decode buffer, Unicode width
	REBSER *result;
	REBINT count;
	REBUNI *out;

	if (utf == -1) {
		// Let the BOM decide, then step over it (3, 2 or 4 bytes).
		// A result of 0 falls through to the UTF-8 decoder below.
		utf = What_UTF(bp, len);
		switch (utf) {
		case 8:
			bp += 3;
			len -= 3;
			break;
		case 16:
		case -16:
			bp += 2;
			len -= 2;
			break;
		case 32:
		case -32:
			bp += 4;
			len -= 4;
			break;
		default:
			break;
		}
	}

	switch (utf) {
	case 0:
	case 8:
		out = (REBUNI*)Reset_Buffer(scratch, len);
		count = Decode_UTF8(out, bp, len, ccr);
		break;

	case 16:
	case -16:
		out = (REBUNI*)Reset_Buffer(scratch, len/2 + 1);
		// NOTE(review): the sign of utf is forwarded as a flag,
		// presumably the byte order -- confirm against Decode_UTF16.
		count = Decode_UTF16(out, bp, len, utf < 0, ccr);
		break;

	case 32:
	case -32:
		out = (REBUNI*)Reset_Buffer(scratch, len/4 + 1);
		count = Decode_UTF32(out, bp, len, utf < 0, ccr);
		break;

	default:
		return NULL;	// unsupported encoding request
	}

	// A negative count from the decoder yields a byte-wide result of
	// that magnitude (presumably meaning every character fits in a
	// byte); otherwise the result is a Unicode-wide series.
	if (count < 0) {
		count = -count;
		result = Make_Binary(count);
		Append_Uni_Bytes(result, UNI_HEAD(scratch), count);
		return result;
	}

	result = Make_Unicode(count);
	Append_Uni_Uni(result, UNI_HEAD(scratch), count);
	return result;
}
// Build the series backing a MAKE or TO of a string type from `arg`.
//
// `make` is TRUE for MAKE and FALSE for TO; it only matters for numeric
// arguments, which are treated as a size when making. The argument kinds
// are tested in a fixed order, and the first match decides the result:
//
//   integer!/decimal! (MAKE only) -> Make_Binary() sized by the number
//   binary!                       -> bytes decoded as UTF-8
//   other any-string/binary       -> copy of the series from its index
//   any-word                      -> molded value
//   char!                         -> one-character series
//   anything else                 -> tight FORM of the value
//
static REBSER *make_string(REBVAL *arg, REBOOL make)
{
	// MAKE <type> 123
	if (make && (IS_INTEGER(arg) || IS_DECIMAL(arg)))
		return Make_Binary(Int32s(arg, 0));

	// MAKE/TO <type> <binary!>
	if (IS_BINARY(arg)) {
		REBYTE *bytes = VAL_BIN_DATA(arg);
		REBCNT count = VAL_LEN(arg);
		REBINT bom = What_UTF(bytes, count);

		if (bom == 8) {
			// Step over the 3-byte UTF-8 BOM.
			bytes += 3;
			count -= 3;
		}
		else if (bom != 0) {
			// A BOM for any other encoding is rejected.
			Trap0(RE_BAD_DECODE);
		}

		return Decode_UTF_String(bytes, count, 8); // UTF-8
	}

	// MAKE/TO <type> <any-string>
	if (ANY_BINSTR(arg))
		return Copy_String(VAL_SERIES(arg), VAL_INDEX(arg), VAL_LEN(arg));

	// MAKE/TO <type> <any-word>
	if (ANY_WORD(arg))
		return Copy_Mold_Value(arg, TRUE);

	// MAKE/TO <type> #"A"
	if (IS_CHAR(arg)) {
		REBSER *one;

		// Code points above 0xff need a Unicode-wide series.
		if (VAL_CHAR(arg) > 0xff)
			one = Make_Unicode(2);
		else
			one = Make_Binary(2);

		Append_Byte(one, VAL_CHAR(arg));
		return one;
	}

	// MAKE/TO <type> <any-value>
	return Copy_Form_Value(arg, 1<<MOPT_TIGHT);
}
// // Decode_UTF_String: C // // Do all the details to decode a string. // Input is a byte series. Len is len of input. // The utf is 0, 8, +/-16 // A special -1 means use the BOM, if present, or UTF-8 otherwise. // // Returns the decoded string or NULL for unsupported encodings. // REBSER *Decode_UTF_String(REBYTE *bp, REBCNT len, REBINT utf) { REBSER *ser = BUF_UTF8; // buffer is Unicode width REBSER *dst; REBINT size; if (utf == -1) { // Try to detect UTF encoding from a BOM. Returns 0 if no BOM present. utf = What_UTF(bp, len); if (utf != 0) { if (utf == 8) bp += 3, len -= 3; else if (utf == -16 || utf == 16) bp += 2, len -= 2; else return NULL; } } if (utf == 0 || utf == 8) { size = Decode_UTF8_May_Fail( cast(REBUNI*, Reset_Buffer(ser, len)), bp, len, TRUE ); }
static REBSER *MAKE_TO_String_Common(const REBVAL *arg) { REBSER *ser = 0; // MAKE/TO <type> <binary!> if (IS_BINARY(arg)) { REBYTE *bp = VAL_BIN_AT(arg); REBCNT len = VAL_LEN_AT(arg); switch (What_UTF(bp, len)) { case 0: break; case 8: // UTF-8 encoded bp += 3; len -= 3; break; default: fail (Error(RE_BAD_UTF8)); } ser = Decode_UTF_String(bp, len, 8); // UTF-8 } // MAKE/TO <type> <any-string> else if (ANY_BINSTR(arg)) { ser = Copy_String_Slimming(VAL_SERIES(arg), VAL_INDEX(arg), VAL_LEN_AT(arg)); } // MAKE/TO <type> <any-word> else if (ANY_WORD(arg)) { ser = Copy_Mold_Value(arg, 0 /* opts... MOPT_0? */); } // MAKE/TO <type> #"A" else if (IS_CHAR(arg)) { ser = (VAL_CHAR(arg) > 0xff) ? Make_Unicode(2) : Make_Binary(2); Append_Codepoint_Raw(ser, VAL_CHAR(arg)); } else ser = Copy_Form_Value(arg, 1 << MOPT_TIGHT); return ser; }