예제 #1
0
// Writes one UTF-16 code unit to fo, joining surrogate pairs into a single
// numeric character reference.
//
//   s   the code unit to write. Zero or a negative value (e.g. -1) may be
//       passed to flush a high surrogate that is still pending.
//   fo  stream that receives the output.
//
// A high surrogate (0xD800-0xDBFF) is not written immediately; it is kept in
// a function-local static until the next call shows whether a low surrogate
// (0xDC00-0xDFFF) follows. Because of that static state the function is not
// reentrant and the caller must flush once at the end of the input.
//
// Any other unit is passed to getEightBit() (defined elsewhere; presumably it
// returns the byte for s in the 8-bit target encoding, or 0 when there is
// none -- confirm against its definition). A zero result for a non-zero s is
// written as "&#x...;", anything else is written as a single byte.
void surrogate(int s,FILE * fo)
    {
    static int prev = 0; // pending high surrogate, 0 when there is none
    if(prev == 0)
        {
        if(s < 0)
            return; // flush request with nothing pending: nothing to write
        if((s & 0xFC00) == 0xD800) // first word of a surrogate pair
            {
            prev = s;
            }
        else
            {
            int eight = getEightBit(s);
            if(eight == 0 && s != 0)
                fprintf(fo,"&#x%x;",(unsigned int)s);
            else
                fputc(eight,fo);
            }
        }
    else
        {
        if((s & 0xFC00) == 0xDC00) // second word of a surrogate pair
            {
            // Combine the two 10-bit halves into a supplementary code point.
            s = (s & 0x3ff) + ((prev & 0x3ff) << 10) + 0x10000;
            fprintf(fo,"&#x%x;",(unsigned int)s);
            prev = 0;
            }
        else
            {
            // Assume it is UCS-2, not UTF-16: write the unpaired unit as is.
            fprintf(fo,"&#x%x;",(unsigned int)prev);
            if(s > 0 && (s & 0xFC00) == 0xD800)
                {
                // Another high surrogate: keep it pending so that a low
                // surrogate arriving next still forms a proper pair.
                prev = s;
                }
            else
                {
                // Zero or a negative s only flushes prev and writes nothing more.
                if(s > 0)
                    fprintf(fo,"&#x%x;",(unsigned int)s);
                prev = 0;
                }
            }
        }
    }
예제 #2
0
// EF BB BF = 1110 1111  1011 1011 1011 1111 = 1111 11 1011 11 1111 = 11111110 11111111 = FEFF = Byte Order Mark
// Added by many Windows programs to UTF8 files.
// Helper for UTF8(): decodes the byte stream fi as UTF-8 and writes the
// converted text to out. Returns false as soon as a byte sequence is found
// that is not well-formed UTF-8, true when the end of fi is reached.
//
// Each multibyte character is passed to getEightBit() (defined elsewhere;
// presumably the byte for that character in the 8-bit target encoding, or 0
// when there is none -- confirm against its definition). A zero result is
// written as "&#x...;", anything else as a single byte.
static bool UTF8decodeStream(FILE * fi,FILE * out)
    {
    // minValue[n] is the smallest value that really needs n bytes in UTF-8.
    // A decoded value below it means the sequence is overlong.
    static const int minValue[7] = {0,0,0x80,0x800,0x10000,0x200000,0x4000000};
    int k[6];
    while((k[0] = fgetc(fi)) != EOF)
        {
        switch(k[0] & 0xc0)
            {
            case 0xc0: // 11bbbbbb: start of multibyte
                {
                if((k[0] & 0xfe) == 0xfe)
                    return false; // 0xFE and 0xFF are never lead bytes
                int i = 1;
                while((k[0] << i) & 0x80) // Read the second, third and fourth
                            // highest bits and read a byte for every bit == 1
                    {
                    k[i] = fgetc(fi);
                    // EOF (-1) also fails this test, so a truncated sequence
                    // is rejected here as well.
                    if((k[i++] & 0xc0) != 0x80) // 10bbbbbb
                        return false;
                    }
                int length = i; // total number of bytes in this sequence
                int K = ((k[0] << i) & 0xff) // shift high-bits out of byte
                        << (5 * i - 6); // shift to high position
                int I = --i;
                while(i > 0)
                    {
                    // The last continuation byte holds the lowest six bits.
                    K |= (k[i] & 0x3f) << ((I - i) * 6);
                    --i;
                    }
                if(K < minValue[length])
                    return false; // overlong UTF-8 sequence
                int eight = getEightBit(K);
                if(eight == 0 && K != 0)
                    fprintf(out,"&#x%x;",(unsigned int)K);
                else
                    fputc(eight,out);
                break;
                }
            case 0x80: // 10bbbbbb: continuation byte without a lead byte
                // Not UTF-8
                return false;
            default: // 0bbbbbbb: plain 7-bit character, copied unchanged
                fputc(k[0],out);
            }
        }
    return true;
    }

// Converts the UTF-8 stream fi and writes the result to fo.
//
// The converted text is first collected in a temporary file and copied to fo
// only after all of fi has been read successfully, so nothing is written to
// fo when the input turns out not to be UTF-8.
//
//   fi  input stream, read from its current position.
//   fo  output stream.
//
// Returns true when fi was well-formed UTF-8 and has been written to fo.
// Returns false when fi is not valid UTF-8 (fi is then left wherever decoding
// stopped) or when the temporary file could not be created (nothing has been
// read from fi in that case).
bool UTF8(FILE * fi,FILE * fo)
    {
    FILE * fto = tmpfile();
    if(fto == NULL)
        return false;
    bool ok = UTF8decodeStream(fi,fto);
    if(ok)
        {
        int c;
        rewind(fto);
        while((c = fgetc(fto)) != EOF)
            fputc(c,fo);
        }
    fclose(fto); // closing a tmpfile() stream also removes the file
    return ok;
    }
예제 #3
0
// Passes s through getEightBit() and clears the `ascii` flag when the result
// lies outside the 7-bit range 0..127. The flag is never set here, only
// cleared. Both `ascii` and getEightBit() are defined elsewhere in the file.
void getbyte(int s)
    {
    const int converted = getEightBit(s);
    const int sevenBit = (0 <= converted && converted <= 127);
    if(!sevenBit)
        {
        ascii = false;
        }
    }