// This function handles the case where the file is open in UTF-8 text mode and // the translated UTF-16 form of the text has a different number of characters // than the original UTF-8 text (remember: when reading a file in UTF-8 mode, the // lowio library converts the UTF-8 to UTF-16). static __int64 __cdecl common_ftell_translated_utf8_nolock( __crt_stdio_stream const stream, __int64 const lowio_position ) throw() { int const fh = _fileno(stream.public_stream()); // If the buffer has been exhausted, then the current lowio position is also // the current stdio position: if (stream->_cnt == 0) return lowio_position; __int64 const current_buffer_position = (stream->_ptr - stream->_base) / static_cast<__int64>(sizeof(wchar_t)); // Otherwise, we have to re-read the buffer, in binary mode, so that we can // analyze the original UTF-8 text to compute the actual position in the // file. To do this, we seek the lowio pointer back to the beginning of // the stdio buffer, re-read the buffer, then seek the lowio pointer back // to its original location: __int64 const base_buffer_position = _lseeki64(fh, _startpos(fh), SEEK_SET); if (base_buffer_position != _startpos(fh)) return -1; DWORD bytes_read; char raw_buffer[_INTERNAL_BUFSIZ]; if (!ReadFile(reinterpret_cast<HANDLE>(_osfhnd(fh)), raw_buffer, _INTERNAL_BUFSIZ, &bytes_read, nullptr)) return -1; // Seek back to where we were, to ensure the stdio stream is left in a // consistent state (and "unmodified" from before the call): if (_lseeki64(fh, lowio_position, SEEK_SET) < 0) return -1; // This should not normally happen: we should always read enough bytes: if (current_buffer_position > static_cast<__int64>(bytes_read)) return -1; // Scan the raw, untranslated buffer to find the current position, updating // the file pointer to account for newline translation in the buffer: char const* const raw_first = raw_buffer; #pragma warning(disable:__WARNING_UNUSED_POINTER_ASSIGNMENT) // 28930 char const* const raw_last = raw_buffer + bytes_read; char const* raw_it = raw_first; for (__int64 i = 0; i != current_buffer_position && raw_it < raw_last; ++i, ++raw_it) { if (*raw_it == CR) { if (raw_it < raw_last - 1 && *(raw_it + 1) == LF) ++raw_it; } else { raw_it += _utf8_no_of_trailbytes(*raw_it); } } return base_buffer_position + (raw_it - raw_first); }
/* now define version that doesn't lock/unlock, validate fh */ int __cdecl _read_nolock ( int fh, void *inputbuf, unsigned cnt ) { int bytes_read; /* number of bytes read */ char *buffer; /* buffer to read to */ int os_read; /* bytes read on OS call */ char *p, *q; /* pointers into buffer */ wchar_t *pu, *qu; /* wchar_t pointers into buffer for UTF16 */ char peekchr; /* peek-ahead character */ wchar_t wpeekchr; /* peek-ahead wchar_t */ __int64 filepos; /* file position after seek */ ULONG dosretval; /* o.s. return value */ char tmode; /* textmode - ANSI/UTF-8/UTF-16 */ void *buf; /* buffer to read to */ int retval = -2; /* return value */ unsigned inputsize = cnt; /* validate fh */ _CHECK_FH_CLEAR_OSSERR_RETURN( fh, EBADF, -1 ); _VALIDATE_CLEAR_OSSERR_RETURN((fh >= 0 && (unsigned)fh < (unsigned)_nhandle), EBADF, -1); _VALIDATE_CLEAR_OSSERR_RETURN((_osfile(fh) & FOPEN), EBADF, -1); bytes_read = 0; /* nothing read yet */ if (cnt == 0 || (_osfile(fh) & FEOFLAG)) { /* nothing to read or at EOF, so return 0 read */ return 0; } _VALIDATE_CLEAR_OSSERR_RETURN( (inputbuf != NULL), EINVAL, -1 ); tmode = _textmode(fh); switch(tmode) { case __IOINFO_TM_UTF8 : /* For a UTF-8 file, we need 2 buffers, because after reading we need to convert it into UNICODE - MultiByteToWideChar doesn't do in-place conversions. */ /* MultiByte To WideChar conversion may double the size of the buffer required & hence we divide cnt by 2 */ /* * Since we are reading UTF8 stream, cnt bytes read may vary * from cnt wchar_t characters to cnt/4 wchar_t characters. For * this reason if we need to read cnt characters, we will * allocate MBCS buffer of cnt. In case cnt is 0, we will * have 4 as minimum value. This will make sure we don't * overflow for reading from pipe case. * * * In this case the numbers of wchar_t characters that we can * read is cnt/2. This means that the buffer size that we will * require is cnt/2. */ /* For UTF8 we want the count to be an even number */ _VALIDATE_CLEAR_OSSERR_RETURN(((cnt & 1) == 0), EINVAL, -1); cnt = (cnt/2) < 4 ? 4 : (cnt/2); buf = _malloc_crt(cnt); if(!buf) { errno = ENOMEM; _doserrno = E_nomem; return -1; } break; case __IOINFO_TM_UTF16LE : /* For UTF16 the count always needs to be an even number */ _VALIDATE_CLEAR_OSSERR_RETURN(((cnt & 1) == 0), EINVAL, -1); cnt &= (~1); /* Fall Through to default */ default : /* For non-UTF8 files, we need only 1 buffer - make buf point to the users input buffer */ buf = inputbuf; } buffer = buf; if ((_osfile(fh) & (FPIPE|FDEV)) && _pipech(fh) != LF && cnt != 0) { /* a pipe/device and pipe lookahead non-empty: read the lookahead * char */ *buffer++ = _pipech(fh); ++bytes_read; --cnt; _pipech(fh) = LF; /* mark as empty */ /* For UTF16, there maybe one more look ahead char. For UTF8, there maybe 2 more look ahead chars */ if((tmode != __IOINFO_TM_ANSI) && (_pipech2(fh)[0] != LF) && cnt != 0) { *buffer++ = _pipech2(fh)[0]; ++bytes_read; --cnt; _pipech2(fh)[0] = LF; /* mark as empty */ if((tmode == __IOINFO_TM_UTF8) && (_pipech2(fh)[1] != LF) && cnt != 0) { *buffer++ = _pipech2(fh)[1]; ++bytes_read; --cnt; _pipech2(fh)[1] = LF; /* mark as empty */ } } } /* read the data */ if ( !ReadFile( (HANDLE)_osfhnd(fh), buffer, cnt, (LPDWORD)&os_read, NULL ) || os_read < 0 || (size_t)os_read > cnt) { /* ReadFile has reported an error. recognize two special cases. * * 1. map ERROR_ACCESS_DENIED to EBADF * * 2. just return 0 if ERROR_BROKEN_PIPE has occurred. it * means the handle is a read-handle on a pipe for which * all write-handles have been closed and all data has been * read. */ if ( (dosretval = GetLastError()) == ERROR_ACCESS_DENIED ) { /* wrong read/write mode should return EBADF, not EACCES */ errno = EBADF; _doserrno = dosretval; retval = -1; goto error_return; } else if ( dosretval == ERROR_BROKEN_PIPE ) { retval = 0; goto error_return; } else { _dosmaperr(dosretval); retval = -1; goto error_return; } } bytes_read += os_read; /* update bytes read */ if (_osfile(fh) & FTEXT) { /* now must translate CR-LFs to LFs in the buffer */ /* For ANSI & UTF8, we read byte by byte. For UTF16, we need to read 2 bytes (wchar_t's) at a time */ if(tmode != __IOINFO_TM_UTF16LE) { /* set CRLF flag to indicate LF at beginning of buffer */ if ( (os_read != 0) && (*(char *)buf == LF) ) _osfile(fh) |= FCRLF; else _osfile(fh) &= ~FCRLF; /* convert chars in the buffer: p is src, q is dest */ p = q = buf; while (p < (char *)buf + bytes_read) { if (*p == CTRLZ) { /* if fh is not a device, set ctrl-z flag */ if ( !(_osfile(fh) & FDEV) ) _osfile(fh) |= FEOFLAG; else *q++ = *p++; break; /* stop translating */ } else if (*p != CR) *q++ = *p++; else { /* *p is CR, so must check next char for LF */ if (p < (char *)buf + bytes_read - 1) { if (*(p+1) == LF) { p += 2; *q++ = LF; /* convert CR-LF to LF */ } else *q++ = *p++; /* store char normally */ } else { /* This is the hard part. We found a CR at end of buffer. We must peek ahead to see if next char is an LF. */ ++p; dosretval = 0; if ( !ReadFile( (HANDLE)_osfhnd(fh), &peekchr, 1, (LPDWORD)&os_read, NULL ) ) dosretval = GetLastError(); if (dosretval != 0 || os_read == 0) { /* couldn't read ahead, store CR */ *q++ = CR; } else { /* * peekchr now has the extra character -- we now * have several possibilities: * * 1. disk file and char is not LF; just seek * back and copy CR * 2. disk file and char is LF; seek back and * discard CR * 3. disk file, char is LF but this is a * one-byte read: store LF, don't seek back * 4. pipe/device and char is LF; store LF. * 5. pipe/device and char isn't LF, store CR * and put char in pipe lookahead buffer. */ if (_osfile(fh) & (FDEV|FPIPE)) { /* non-seekable device */ if (peekchr == LF) *q++ = LF; else { *q++ = CR; _pipech(fh) = peekchr; } } else { /* disk file */ if (q == buf && peekchr == LF) { /* nothing read yet; must make some progress */ *q++ = LF; } else { /* seek back */ filepos = _lseeki64_nolock(fh, -1i64, FILE_CURRENT); if (peekchr != LF) *q++ = CR; } } } } } } /* we now change bytes_read to reflect the true number of chars in the buffer */ bytes_read = (int)(q - (char *)buf); if((tmode == __IOINFO_TM_UTF8) && (bytes_read != 0)) { /* UTF8 reads need to be converted into UTF16 */ --q; /* q has gone beyond the last char */ /* * If the last byte is a standalone UTF-8 char. We then * take the whole buffer. Otherwise we skip back till we * come to a lead byte. If the leadbyte forms a complete * UTF-8 character will the remaining part of the buffer, * then again we take the whole buffer. If not, we skip to * one byte which should be the final trail byte of the * previous UTF-8 char or a standalone UTF-8 character */ if(_utf8_is_independent(*q)) { ++q; /* * Final byte is standalone, we reset q, because we * will now consider the full buffer which we have read */ } else { int ctr = 1; int cnt_trailbytes; while(!_utf8_is_leadbyte(*q) && ctr <= 4 && q >= (char *)buf) { --q; ++ctr; } cnt_trailbytes = _utf8_no_of_trailbytes(*q); if(cnt_trailbytes == 0) { /* * Should have exited the while by finding a lead * byte else, the file has incorrect UTF-8 chars */ errno = EILSEQ; retval = -1; goto error_return; } if(cnt_trailbytes + 1 == ctr) { /* * The leadbyte + the remaining bytes form a full * set */ q += ctr; } else { /* Seek back */ if (_osfile(fh) & (FDEV|FPIPE)) { /* * non-seekable device. Put the extra chars in * _pipech & _pipech2. We would have a maximum * of 3 extra chars */ _pipech(fh) = *q; ++q; if(ctr >= 2) { _pipech2(fh)[0] = *q; ++q; } if(ctr == 3) { _pipech2(fh)[1] = *q; ++q; } /* * We need to point q back to beyond whatever * we actually took in. */ q -= ctr; } else { /* We have read extra chars, so we seek back */ filepos = _lseeki64_nolock(fh, -ctr, FILE_CURRENT); } } } bytes_read = (int)(q - (char *)buf); bytes_read = MultiByteToWideChar( CP_UTF8, 0, buf, bytes_read, inputbuf, inputsize/2); if(!bytes_read) { _dosmaperr(GetLastError()); retval = -1; goto error_return; } /* MultiByteToWideChar returns no of wchar_t's. Double it */ bytes_read = bytes_read*2; } } else { /* set CRLF flag to indicate LF at beginning of buffer */ if ( (os_read != 0) && (*(wchar_t *)buf == LF) ) _osfile(fh) |= FCRLF; else _osfile(fh) &= ~FCRLF; /* convert chars in the buffer: pu is src, qu is dest */ pu = qu = (wchar_t *)buf; while ((char *)pu < (char *)buf + bytes_read) { if (*pu == CTRLZ) { /* if fh is not a device, set ctrl-z flag */ if ( !(_osfile(fh) & FDEV) ) _osfile(fh) |= FEOFLAG; else *qu++ = *pu++; break; /* stop translating */ } else if (*pu != CR) *qu++ = *pu++; else { /* *pu is CR, so must check next wchar_t for LF */ if ((char *)pu < (char *)buf + bytes_read - 2) { if (*(pu+1) == LF) { pu += 2; *qu++ = LF; /* convert CR-LF to LF */ } else *qu++ = *pu++; /* store char normally */ } else { /* This is the hard part. We found a CR at end of buffer. We must peek ahead to see if next wchar_t is an LF. */ ++pu; dosretval = 0; if ( !ReadFile( (HANDLE)_osfhnd(fh), &wpeekchr, 2, (LPDWORD)&os_read, NULL ) ) dosretval = GetLastError(); if (dosretval != 0 || os_read == 0) { /* couldn't read ahead, store CR */ *qu++ = CR; } else { /* * peekchr now has the extra character -- we * now have several possibilities: * 1. wchar_t is not LF; just seek back and * copy CR * 2. wchar_t is LF; seek back and discard CR * 3. disk file, wchar_t is LF but this is a * one-byte read: store LF, don't seek back. */ if (_osfile(fh) & (FDEV|FPIPE)) { /* non-seekable device */ if (wpeekchr == LF) *qu++ = LF; else { char * pwpeekchr = (char *)&wpeekchr; *qu++ = CR; _pipech(fh) = *pwpeekchr; ++pwpeekchr; _pipech2(fh)[0] = *pwpeekchr; _pipech2(fh)[1] = LF; /* Mark as empty */ } } else { if ((char *)qu == buf && wpeekchr == LF) { /* nothing read yet; must make some progress */ *qu++ = LF; } else { /* seek back */ filepos = _lseeki64_nolock(fh, -2, FILE_CURRENT); if (wpeekchr != LF) *qu++ = CR; } } } } } } /* we now change bytes_read to reflect the true number of chars in the buffer */ bytes_read = (int)((char *)qu - (char *)buf); } } error_return: if(buf != inputbuf) { free(buf); } return (retval == -2) ? bytes_read : retval ; }