Beispiel #1
0
static void get_utf8(int i, FILE *fp)
{
    long u = 0, j;
    int i2 = EOF, i3 = EOF, i4 = EOF;

    switch (UTF8length(i)) {
    case 2:
        i2 = getcUTF8(fp); if (i2 == EOF) break;
        u = UTF8BtoUCS(i, i2);
        break;
    case 3:
        i2 = getcUTF8(fp); if (i2 == EOF) break;
        i3 = getcUTF8(fp); if (i3 == EOF) break;
        u = UTF8CtoUCS(i, i2, i3);
        if (u == U_BOM) return; /* just ignore */
        if (u == U_VOICED      && combin_voiced_sound(false)) return;
        if (u == U_SEMI_VOICED && combin_voiced_sound(true))  return;
        break;
    case 4:
        i2 = getcUTF8(fp); if (i2 == EOF) break;
        i3 = getcUTF8(fp); if (i3 == EOF) break;
        i4 = getcUTF8(fp); if (i4 == EOF) break;
        u = UTF8DtoUCS(i, i2, i3, i4);
        break;
    default:
        u = U_REPLACEMENT_CHARACTER;
        break;
    }

    j = toBUFF(fromUCS(u));
    if (j == 0) { /* can't represent (typically umlaut o in EUC) */
        write_hex(i);
        if (i2 != EOF) write_hex(i2);
        if (i3 != EOF) write_hex(i3);
        if (i4 != EOF) write_hex(i4);
    } else {
        write_multibyte(j);
    }
}
Beispiel #2
0
/* input line with encoding conversion */
long input_line2(FILE *fp, unsigned char *buff, long pos,
                 const long buffsize, int *lastchar)
{
    long i;
    static boolean injis = false;
    const int fd = fileno(fp);

    if (infile_enc[fd] == ENC_UNKNOWN) { /* just after opened */
        ungetbuff[fd].size = 0;
        if (isUTF8Nstream(fp)) infile_enc[fd] = ENC_UTF8;
        else                   infile_enc[fd] = get_file_enc();
    }
    buffer = buff;
    first = last = pos;

    while (last < buffsize-30 && (i=getc4(fp)) != EOF && i!='\n' && i!='\r') {
        /* 30 is enough large size for one char */
        /* attention: 4 times of write_hex() eats 16byte */
#ifdef WIN32
        if (i == 0x1a && first == last &&
            fd == fileno(stdin) && _isatty(fd)) { /* Ctrl+Z on console */
                i = EOF;
                break;
        } else
#endif
        if (i == ESC) {
            if ((i=getc4(fp)) == '$') { /* ESC '$' (Kanji-in) */
                i = getc4(fp);
                if (i == '@' || i == 'B') {
                    injis = true;
                } else {               /* broken Kanji-in */
                    buffer[last++] = ESC;
                    buffer[last++] = '$';
                    if (is_tail(&i, fp)) break;
                    buffer[last++] = i;
                }
            } else if (i == '(') {     /* ESC '(' (Kanji-out) */
                i = getc4(fp);
                if (i == 'J' || i == 'B' || i == 'H') {
                    injis = false;
                } else {               /* broken Kanji-out */
                    buffer[last++] = ESC;
                    buffer[last++] = '(';
                    if (is_tail(&i, fp)) break;
                    buffer[last++] = i;
                }
            } else { /* broken ESC */
                buffer[last++] = ESC;
                if (is_tail(&i, fp)) break;
                buffer[last++] = i;
            }
        } else { /* rather than ESC */
            if (injis) { /* in JIS */
                long j = getc4(fp);
                if (is_tail(&j, fp)) {
                    buffer[last++] = i;
                    i = j;
                    break;
                } else { /* JIS encoding */
                    i = fromJIS(HILO(i,j));
                    if (i == 0) i = fromUCS(U_REPLACEMENT_CHARACTER);
                    write_multibyte(toBUFF(i));
                }
            } else {  /* normal */
                if        (infile_enc[fd] == ENC_SJIS && isSJISkanji1(i)) {
                    get_sjis(i, fp);
                } else if (infile_enc[fd] == ENC_EUC  && isEUCkanji1(i)) {
                    get_euc(i, fp);
                } else if (infile_enc[fd] == ENC_UTF8 && UTF8length(i) > 1) {
                    get_utf8(i, fp);
                } else {
                    buffer[last++] = i;
                }
            }
        }
    }

    buffer[last] = '\0';
    if (i == EOF || i == '\n' || i == '\r') injis = false;
    if (lastchar != NULL) *lastchar = i;

    return last;
}
Beispiel #3
0
/* putc() with code conversion */
int putc2(int c, FILE *fp)
{
    static int num[NOFILE];
        /* 0    : not in Kanji
           1..4 : in JIS Kanji and num[] bytes are in store[][]
           -1   : in JIS Kanji and store[][] is empty */
    static unsigned char store[NOFILE][4];
    const int fd = fileno(fp);
    int ret = c, output_enc;

#ifdef WIN32
    if ((fp == stdout || fp == stderr) && (_isatty(fd) || !prior_file_enc)) {
        if (sjisterminal) {
            if (is_internalUPTEX())
                output_enc = ENC_UTF8;
            else
                output_enc = ENC_SJIS;
        } else
#else
    if ((fp == stdout || fp == stderr) && !prior_file_enc) {
#endif

        output_enc = get_terminal_enc();
    } else
        output_enc = get_file_enc();

    if (num[fd] > 0) {        /* multi-byte char */
        if (is_internalUPTEX() && iskanji1(c)) { /* error */
            ret = flush(store[fd], num[fd], fp);
            num[fd] = 0;
        }
        store[fd][num[fd]] = c;
        num[fd]++;
        if (multistrlen(store[fd], num[fd], 0) == num[fd]) {
            long i = fromBUFF(store[fd], num[fd], 0);
            ret = put_multibyte(toENC(i, output_enc), fp);
            num[fd] = -1;
        } else if ((is_internalUPTEX() && num[fd] == 4) ||
                   (!is_internalUPTEX() && num[fd] == 2)) { /* error */
            ret = flush(store[fd], num[fd], fp);
            num[fd] = -1;
        }
    } else if (iskanji1(c)) { /* first multi-byte char */
        if (num[fd] == 0 && output_enc == ENC_JIS) {
            ret = put_multibyte(KANJI_IN, fp);
        }
        store[fd][0] = c;
        num[fd] = 1;
    } else {                  /* ASCII */
        if (num[fd] < 0 && output_enc == ENC_JIS) {
            put_multibyte(KANJI_OUT, fp);
        }
        ret = putc(c, fp);
        num[fd] = 0;
    }
    return ret;
}

/* fputs() with code conversion */
int fputs2(const char *s, FILE *fp)
{
    while (*s != '\0') {
        int ret = putc2((unsigned char)*s, fp);
        if (ret == EOF) return EOF;
        s++;
    }
    return 1;
}


static struct unget_st {
    int size;
    int buff[4];
} ungetbuff[NOFILE];

static int getc4(FILE *fp)
{
    struct unget_st *p = &ungetbuff[fileno(fp)];

    if (p->size == 0)
#ifdef WIN32
    {
        const int fd = fileno(fp);
        HANDLE hStdin;
        DWORD ret;
        wchar_t wc[2];
        long c;
        static wchar_t wcbuf = L'\0';

        if (!(fd == fileno(stdin) && _isatty(fd) && is_internalUPTEX()))
            return getc(fp);

        hStdin = GetStdHandle(STD_INPUT_HANDLE);
        if (wcbuf) {
            wc[0] = wcbuf;
            wcbuf = L'\0';
        }
        else if (ReadConsoleW(hStdin, wc, 1, &ret, NULL) == 0)
            return EOF;
        if (0xd800<=wc[0] && wc[0]<0xdc00) {
            if (ReadConsoleW(hStdin, wc+1, 1, &ret, NULL) == 0)
                return EOF;
            if (0xdc00<=wc[1] && wc[1]<0xe000) {
                c = UTF16StoUTF32(wc[0], wc[1]);
            } else {
                wcbuf = wc[1];
                c = U_REPLACEMENT_CHARACTER;  /* illegal upper surrogate pair */
            }
        } else if (0xdc00<=wc[0] && wc[0]<0xe000) {
            c = U_REPLACEMENT_CHARACTER;      /* illegal lower surrogate pair */
        } else {
            c = wc[0];
        }
        c = UCStoUTF8(c);
        /* always */       p->buff[p->size++]=BYTE4(c);
        if (BYTE3(c) != 0) p->buff[p->size++]=BYTE3(c);
        if (BYTE2(c) != 0) p->buff[p->size++]=BYTE2(c);
        if (BYTE1(c) != 0) p->buff[p->size++]=BYTE1(c);
    }
#else
        return getc(fp);
#endif
    return p->buff[--p->size];
}

static int ungetc4(int c, FILE *fp)
{
    struct unget_st *p = &ungetbuff[fileno(fp)];

    if (p->size >= 4) return EOF;
    return p->buff[p->size++] = c;
}


static unsigned char *buffer;
static long first, last;
static boolean combin_voiced_sound(boolean semi)
{
    int i, mblen;

    mblen = is_internalUPTEX() ? 3 : 2;
    if (last-mblen < first) return false;
    if (multistrlen(buffer,last,last-mblen) != mblen) return false;
    i = toUCS(fromBUFF(buffer,last,last-mblen));
    i = get_voiced_sound(i, semi);
    if (i == 0) return false;
    i = toBUFF(fromUCS(i));
    if (BYTE2(i) != 0) buffer[last-3] = BYTE2(i);
    /* always */       buffer[last-2] = BYTE3(i);
    /* always */       buffer[last-1] = BYTE4(i);
    return true;
}
Beispiel #4
0
/* putc() with code conversion */
int putc2(int c, FILE *fp)
{
    static int num[NOFILE];
        /* 0    : not in Kanji
           1..4 : in JIS Kanji and num[] bytes are in store[][]
           -1   : in JIS Kanji and store[][] is empty */
    static unsigned char store[NOFILE][4];
    const int fd = fileno(fp);
    int ret = c, output_enc;

#ifdef WIN32
    if ((fp == stdout || fp == stderr) && (_isatty(fd) || !prior_file_enc)) {
        if (sjisterminal) {
            if (is_internalUPTEX())
                output_enc = ENC_UTF8;
            else
                output_enc = ENC_SJIS;
        } else
#else
    if ((fp == stdout || fp == stderr) && !prior_file_enc) {
#endif

        output_enc = get_terminal_enc();
    } else
        output_enc = get_file_enc();

    if (num[fd] > 0) {        /* multi-byte char */
        if (is_internalUPTEX() && iskanji1(c)) { /* error */
            ret = flush(store[fd], num[fd], fp);
            num[fd] = 0;
        }
        store[fd][num[fd]] = c;
        num[fd]++;
        if (multistrlen(store[fd], num[fd], 0) == num[fd]) {
            long i = fromBUFF(store[fd], num[fd], 0);
            ret = put_multibyte(toENC(i, output_enc), fp);
            num[fd] = -1;
        } else if ((is_internalUPTEX() && num[fd] == 4) ||
                   (!is_internalUPTEX() && num[fd] == 2)) { /* error */
            ret = flush(store[fd], num[fd], fp);
            num[fd] = -1;
        }
    } else if (iskanji1(c)) { /* first multi-byte char */
        if (num[fd] == 0 && output_enc == ENC_JIS) {
            ret = put_multibyte(KANJI_IN, fp);
        }
        store[fd][0] = c;
        num[fd] = 1;
    } else {                  /* ASCII */
        if (num[fd] < 0 && output_enc == ENC_JIS) {
            put_multibyte(KANJI_OUT, fp);
        }
        ret = putc(c, fp);
        num[fd] = 0;
    }
    return ret;
}

/* fputs() with code conversion */
int fputs2(const char *s, FILE *fp)
{
    while (*s != '\0') {
        int ret = putc2((unsigned char)*s, fp);
        if (ret == EOF) return EOF;
        s++;
    }
    return 1;
}


static struct unget_st {
    int size;
    int buff[4];
} ungetbuff[NOFILE];

static int getc4(FILE *fp)
{
    struct unget_st *p = &ungetbuff[fileno(fp)];

    if (p->size == 0) return getc(fp);
    return p->buff[--p->size];
}

static int ungetc4(int c, FILE *fp)
{
    struct unget_st *p = &ungetbuff[fileno(fp)];

    if (p->size >= 4) return EOF;
    return p->buff[p->size++] = c;
}


static unsigned char *buffer;
static long first, last;
static boolean combin_voiced_sound(boolean semi)
{
    int i;

    if (last-2 < first) return false;
    if (multistrlen(buffer,last,last-2) != 2) return false;
    i = toUCS(fromBUFF(buffer,last,last-2));
    i = get_voiced_sound(i, semi);
    if (i == 0) return false;
    i = toBUFF(fromUCS(i));
    buffer[last-2] = HI(i);
    buffer[last-1] = LO(i);
    return true;
}