/**
 * @internal
 * Scan a UTF8 buffer and work out how many bytes it would occupy once
 * re-encoded with the platform multibyte encoding (as reported by wctomb).
 *
 * @param[in] buf    the UTF8 bytes to examine
 * @param[in] nbytes number of bytes in buf
 *
 * @return 0 if the buffer holds only ASCII bytes, or if an illegal UTF8
 * sequence is met; otherwise the length the buffer would have in the
 * platform encoding
 *
 * @note this relies on the assumption that wide chars are Unicode.
 * If not, the platform will need different support for this
 */
static IDATA
walkUTF8String (const U_8 * buf, IDATA nbytes)
{
  const U_8 *const limit = buf + nbytes;
  const U_8 *pos = buf;
  IDATA platformLength = 0;
  int sawNonAscii = 0;

  /* put wctomb back into its initial shift state */
  wctomb (NULL, 0);

  while (pos < limit)
    {
      if ((*pos & 0x80) == 0)
        {
          /* plain ASCII: one byte in, one byte out */
          platformLength += 1;
          pos += 1;
        }
      else
        {
          char scratch[MB_CUR_MAX];
          U_16 codeUnit;
          int encodedLength;
          U_32 consumed = decodeUTF8CharN (pos, &codeUnit, limit - pos);

          if (consumed == 0)
            {
              /* malformed input: give up on converting this string */
              return 0;
            }
          pos += consumed;

          /* measure the platform encoding of this character; a character
           * that cannot be encoded is counted as a single byte */
          encodedLength = wctomb (scratch, (wchar_t) codeUnit);
          platformLength += (encodedLength == -1) ? 1 : encodedLength;

          sawNonAscii = 1;
        }
    }

  if (!sawNonAscii)
    {
      return 0;
    }
  return platformLength;
}
/**
 * Convert a buffer of UTF8 text into a newly allocated, NUL-terminated
 * buffer in the console output code page.
 *
 * If the input contains any non-ASCII bytes it is decoded to UTF-16
 * (expanding each LF to CR LF) and then converted with WideCharToMultiByte
 * using the code page returned by GetConsoleOutputCP. If the input is pure
 * ASCII, or the conversion cannot be performed (invalid UTF8, allocation
 * failure, WideCharToMultiByte failure), the input bytes are copied
 * unchanged.
 *
 * NOTE(review): LF is expanded to CR LF only on the non-ASCII path; pure
 * ASCII input is copied verbatim. This matches the previous observable
 * behaviour (the newline count used to be forced to zero, which disabled
 * the ASCII-only CRLF branch) -- confirm that the asymmetry is intended.
 *
 * @param[in] portLibrary The port library
 * @param[in] buf buffer of text to be converted.
 * @param[in] nbytes size of buffer of text to be converted.
 *
 * @return a buffer obtained from portLibrary->mem_allocate_memory holding
 * the converted text followed by a NUL byte, or NULL if no buffer could be
 * allocated. Presumably the caller releases it with mem_free_memory.
 */
char *VMCALL
hybuf_write_text (struct HyPortLibrary * portLibrary, const char *buf,
                  IDATA nbytes)
{
  IDATA i;
  IDATA newlines = 0;
  int highchars = 0;
  char *outBuf = NULL;

  /* scan the buffer for any characters which need to be converted */
  for (i = 0; i < nbytes; i++)
    {
      if (buf[i] == '\n')
        {
          newlines += 1;
        }
      else if ((U_8) buf[i] & 0x80)
        {
          highchars += 1;
        }
    }

  /* if there are any non-ASCII chars, convert to Unicode and then to the local code page */
  if (highchars)
    {
      /* Worst case is one UTF-16 unit per input byte, plus one extra unit
       * for the CR inserted before every LF. The real newline count must be
       * used here, otherwise the loop below writes past the allocation. */
      IDATA wideBytes = (nbytes + newlines) * (IDATA) sizeof (U_16);
      U_16 *wBuf = portLibrary->mem_allocate_memory (portLibrary, wideBytes);

      if (wBuf)
        {
          const U_8 *in = (const U_8 *) buf;
          const U_8 *end = in + nbytes;
          U_16 *out = wBuf;

          while (in < end)
            {
              if (*in == '\n')
                {
                  *out++ = (U_16) '\r';
                  *out++ = (U_16) '\n';
                  in += 1;
                }
              else
                {
                  U_32 numberU8Consumed = decodeUTF8CharN (in, out, end - in);
                  if (numberU8Consumed == 0)
                    {
                      /* invalid UTF8: flag the error so the raw bytes are
                       * returned instead of a partially converted buffer */
                      in = NULL;
                      break;
                    }
                  /* only keep the slot once it really holds a character */
                  out += 1;
                  in += numberU8Consumed;
                }
            }

          /* in is NULL if a decoding error occurred */
          if (in)
            {
              UINT codePage = GetConsoleOutputCP ();
              IDATA wLen = out - wBuf;
              /* first call only measures the required output size */
              IDATA mbLen =
                WideCharToMultiByte (codePage, 0, wBuf, wLen, NULL, 0, NULL,
                                     NULL);
              if (mbLen > 0)
                {
                  outBuf =
                    portLibrary->mem_allocate_memory (portLibrary, mbLen + 1);
                  /* if we couldn't allocate the buffer, fall through and
                   * return the data the way it was */
                  if (outBuf)
                    {
                      WideCharToMultiByte (codePage, 0, wBuf, wLen, outBuf,
                                           mbLen, NULL, NULL);
                      outBuf[mbLen] = '\0';
                    }
                }
            }
          portLibrary->mem_free_memory (portLibrary, wBuf);
        }
    }

  /* nothing was converted: hand back a NUL-terminated copy of the input */
  if (outBuf == NULL)
    {
      outBuf = portLibrary->mem_allocate_memory (portLibrary, nbytes + 1);
      if (outBuf)
        {
          memcpy (outBuf, buf, nbytes);
          outBuf[nbytes] = '\0';
        }
    }
  return outBuf;
}
/**
 * Output the buffer onto the stream as text. The buffer is a UTF8-encoded
 * array of chars. It is converted to the console output code page before
 * being written.
 *
 * Input containing non-ASCII bytes is decoded to UTF-16 (each LF becoming
 * CR LF) and converted with WideCharToMultiByte; pure ASCII input only has
 * its LFs expanded to CR LF. Small conversions use on-stack buffers, larger
 * ones use portLibrary->mem_allocate_memory. If the text cannot be
 * converted (invalid UTF8, allocation failure, WideCharToMultiByte
 * failure) the original bytes are written unchanged.
 *
 * @param[in] portLibrary The port library
 * @param[in] fd the file descriptor.
 * @param[in] buf buffer of text to be output.
 * @param[in] nbytes size of buffer of text to be output.
 *
 * @return 0 if file_write reported writing every byte of the (possibly
 * converted) text, otherwise the value returned by file_write.
 */
IDATA VMCALL
hyfile_write_text (struct HyPortLibrary * portLibrary, IDATA fd,
                   const char *buf, IDATA nbytes)
{
  IDATA result;
  IDATA i;
  int newlines = 0, highchars = 0;
  char stackBuf[512];
  char *newBuf = stackBuf;
  IDATA newLen;

  /* scan the buffer for any characters which need to be converted */
  for (i = 0; i < nbytes; i++)
    {
      if (buf[i] == '\n')
        {
          newlines += 1;
        }
      else if ((U_8) buf[i] & 0x80)
        {
          highchars += 1;
        }
    }

  /* if there are any non-ASCII chars, convert to Unicode and then to the local code page */
  if (highchars)
    {
      U_16 wStackBuf[512];
      U_16 *wBuf = wStackBuf;

      /* worst case in bytes: one UTF-16 unit per input byte plus an extra
       * CR for every LF */
      newLen = (nbytes + newlines) * 2;
      if (newLen > sizeof (wStackBuf))
        {
          wBuf = portLibrary->mem_allocate_memory (portLibrary, newLen);
        }
      if (wBuf)
        {
          const U_8 *in = (const U_8 *) buf;
          const U_8 *end = in + nbytes;
          U_16 *out = wBuf;

          while (in < end)
            {
              if (*in == '\n')
                {
                  *out++ = (U_16) '\r';
                  *out++ = (U_16) '\n';
                  in += 1;
                }
              else
                {
                  U_32 numberU8Consumed = decodeUTF8CharN (in, out, end - in);
                  if (numberU8Consumed == 0)
                    {
                      /* invalid UTF8: flag the error so the original bytes
                       * are written instead of a partial conversion */
                      in = NULL;
                      break;
                    }
                  /* only keep the slot once it really holds a character */
                  out += 1;
                  in += numberU8Consumed;
                }
            }

          /* in is NULL if a decoding error occurred */
          if (in)
            {
              UINT codePage = GetConsoleOutputCP ();
              IDATA wLen = out - wBuf;
              /* first call only measures the required output size */
              IDATA mbLen =
                WideCharToMultiByte (codePage, 0, wBuf, wLen, NULL, 0, NULL,
                                     NULL);
              if (mbLen > 0)
                {
                  if (mbLen > sizeof (stackBuf))
                    {
                      newBuf =
                        portLibrary->mem_allocate_memory (portLibrary, mbLen);
                      /* if we couldn't allocate the buffer, just output the data the way it was */
                    }
                  if (newBuf)
                    {
                      WideCharToMultiByte (codePage, 0, wBuf, wLen, newBuf,
                                           mbLen, NULL, NULL);
                      buf = newBuf;
                      nbytes = mbLen;
                    }
                }
            }
          if (wBuf != wStackBuf)
            {
              portLibrary->mem_free_memory (portLibrary, wBuf);
            }
        }
    }
  else if (newlines)
    {
      /* change any LFs to CRLFs */
      newLen = nbytes + newlines;
      if (newLen > sizeof (stackBuf))
        {
          newBuf = portLibrary->mem_allocate_memory (portLibrary, newLen);
          /* if we couldn't allocate the buffer, just output the data the way it was */
        }
      if (newBuf)
        {
          char *cursor = newBuf;
          for (i = 0; i < nbytes; i++)
            {
              if (buf[i] == '\n')
                {
                  *cursor++ = '\r';
                }
              *cursor++ = buf[i];
            }
          buf = newBuf;
          nbytes = newLen;
        }
    }

  result = portLibrary->file_write (portLibrary, fd, (void *) buf, nbytes);

  /* release the output buffer only if it came from the heap */
  if (newBuf != stackBuf && newBuf != NULL)
    {
      portLibrary->mem_free_memory (portLibrary, newBuf);
    }

  return (result == nbytes) ? 0 : result;
}
/**
 * Convert a text buffer with iconv and write the converted bytes to fd.
 *
 * A converter is requested from iconv_get with the code sets
 * (nl_langinfo(CODESET), "UTF-8"), or ("IBM1047", "ISO8859-1") when OMRZTPF
 * is defined -- presumably in (to, from) order as with iconv_open; confirm.
 * If no converter is available the input bytes are written unchanged.
 *
 * Conversion starts in a 512-byte stack buffer; growBuffer is called when
 * iconv reports that the output space is exhausted. Input that iconv rejects
 * with EILSEQ is replaced by a "\uXXXX" escape built from the decoded UTF-8
 * character, or by '?' when decodeUTF8CharN cannot decode it.
 *
 * @param[in] portLibrary The port library
 * @param[in] fd the file descriptor to write to
 * @param[in] buf the text to convert and write
 * @param[in] nbytes number of bytes in buf
 *
 * @return 0 if file_write reported writing every byte it was given,
 * otherwise the value returned by file_write.
 */
static intptr_t
file_write_using_iconv(struct OMRPortLibrary *portLibrary, intptr_t fd, const char *buf, intptr_t nbytes)
{
	intptr_t result = 0;
	char stackBuf[512];
	char *bufStart = NULL; /* start of the current output buffer (stackBuf or a replacement set by growBuffer) */
	uintptr_t outBufLen = sizeof(stackBuf);
	iconv_t converter = J9VM_INVALID_ICONV_DESCRIPTOR;
	size_t inbytesleft = 0;
	size_t outbytesleft = 0;
	char *inbuf = NULL;
	char *outbuf = NULL; /* write cursor inside the output buffer, advanced by iconv */
	intptr_t bytesToWrite = 0;

#ifdef J9ZOS390
	/* LIR 1280 (z/OS only) - every failed call to iconv_open() is recorded on the operator console, so don't retry */
	if (FALSE == PPG_file_text_iconv_open_failed) {
		/* iconv_get is not an a2e function, so we need to pass it honest-to-goodness EBCDIC strings */
#pragma convlit(suspend)
#endif

#ifndef OMRZTPF
		converter = iconv_get(portLibrary, J9FILETEXT_ICONV_DESCRIPTOR, nl_langinfo(CODESET), "UTF-8");
#else
		converter = iconv_get(portLibrary, J9FILETEXT_ICONV_DESCRIPTOR, "IBM1047", "ISO8859-1" );
#endif

#ifdef J9ZOS390
#pragma convlit(resume)
		if (J9VM_INVALID_ICONV_DESCRIPTOR == converter) {
			/* remember the failure so that later calls skip iconv_get */
			PPG_file_text_iconv_open_failed = TRUE;
		}
	}
#endif

	if (J9VM_INVALID_ICONV_DESCRIPTOR == converter) {
		/* no converter available for this code set. Just dump the UTF-8 chars */
		result = portLibrary->file_write(portLibrary, fd, (void *)buf, nbytes);
		return (result == nbytes) ? 0 : result;
	}

	inbuf = (char *)buf; /* for some reason this argument isn't const */
	outbuf = bufStart = stackBuf;
	inbytesleft = nbytes;
	outbytesleft = sizeof(stackBuf);

	/* each pass resumes the conversion where the previous iconv call stopped; the loop ends when iconv succeeds */
	while ((size_t)-1 == iconv(converter, &inbuf, &inbytesleft, &outbuf, &outbytesleft)) {
		/* capture errno before any other call can overwrite it */
		int tmp_errno = errno;

		if (inbytesleft == 0) {
			/* all input has been consumed, nothing more to convert */
			break;
		}

		if ((outbytesleft == 0) || (tmp_errno == E2BIG)) {
			/* input conversion stopped due to lack of space in the output buffer */
			if (growBuffer(portLibrary, stackBuf, &bufStart, &outbuf, &outbytesleft, &outBufLen) < 0) {
				/* failed to grow buffer, just output what we've got so far */
				break;
			}
		} else if (tmp_errno == EILSEQ) {
			/* input conversion stopped due to an input byte that does not belong to the input code set */
			const char *unicodeFormat = "\\u%04x";
			/*
			 * NOTE(review): the format above yields 6 visible characters and the
			 * buffer below is 6 bytes. Whether str_printf reserves one of those
			 * bytes for a terminating NUL (truncating the escape) is not visible
			 * here -- confirm against the str_printf contract.
			 */
#define J9FILETEXT_ESCAPE_STR_SIZE 6 /* max size of unicode format */
			char escapedStr[J9FILETEXT_ESCAPE_STR_SIZE];
			char *escapedStrStart = escapedStr;
			uint16_t unicodeC = 0;
			size_t escapedLength = 0;
			size_t utf8Length = decodeUTF8CharN((const uint8_t *)inbuf, &unicodeC, inbytesleft);

			if (utf8Length == 0) {
				/* invalid encoding, including 4-byte UTF-8 */
				/* skip a single input byte and emit '?' in its place */
				utf8Length = 1;
				escapedLength = 1;
				escapedStr[0] = '?';
			} else {
				escapedLength = portLibrary->str_printf(portLibrary, escapedStr, J9FILETEXT_ESCAPE_STR_SIZE, unicodeFormat, (uintptr_t)unicodeC);
			}
			/* step over the offending input before converting the replacement text */
			inbytesleft -= utf8Length;
			inbuf += utf8Length;

			if ((size_t)-1 == iconv(converter, &escapedStrStart, &escapedLength, &outbuf, &outbytesleft)) {
				/* not handling EILSEQ here because:
				 * 1. we can't do much if iconv() fails to convert ascii.
				 * 2. inbuf and inbytesleft have been explicitly updated so the while loop will get terminated after converting the rest of the characters.
				 */
				tmp_errno = errno;

				/* if the remaining outbuf is too small, then grow it before storing Unicode string representation */
				/*
				 * NOTE(review): after a successful growBuffer the escape text is not
				 * passed to iconv again in this block; the next loop iteration
				 * converts from inbuf. Verify whether the unconverted part of the
				 * escape is dropped in that case.
				 */
				if (tmp_errno == E2BIG) {
					if (growBuffer(portLibrary, stackBuf, &bufStart, &outbuf, &outbytesleft, &outBufLen) < 0) {
						/* failed to grow buffer, just output what we've got so far */
						break;
					}
				}
			}
		} else {
			/* input conversion stopped due to an incomplete character or shift sequence at the end of the input buffer */
			break;
		}
	}

	iconv_free(portLibrary, J9FILETEXT_ICONV_DESCRIPTOR, converter);

	/* CMVC 152575 - the converted string is not necessarily the same length in bytes as the original string */
	bytesToWrite = outbuf - bufStart;
	result = portLibrary->file_write(portLibrary, fd, (void *)bufStart, bytesToWrite);

	/* the output buffer is released only when it is no longer the stack buffer */
	if (bufStart != stackBuf) {
		portLibrary->mem_free_memory(portLibrary, bufStart);
	}

	return (result == bytesToWrite) ? 0 : result;
}
/**
 * Decode a single UTF8 character.
 *
 * Thin wrapper around decodeUTF8CharN that allows the decoder to read up
 * to three bytes from input and stores the decoded value through result.
 *
 * @param[in] input The UTF8 bytes to decode
 * @param[in,out] result receives the decoded unicode character
 *
 * @return whatever decodeUTF8CharN returns: the number of UTF8 bytes
 * consumed (1,2,3) on success, 0 on failure
 */
U_32
decodeUTF8Char (const U_8 * input, U_16 * result)
{
  /* a UTF8 character handled here never needs more than this many bytes */
  enum { MAX_UTF8_BYTES_PER_CHAR = 3 };

  return decodeUTF8CharN (input, result, MAX_UTF8_BYTES_PER_CHAR);
}