示例#1
0
/*
 * Entry point: inspects the leading bytes of the input to decide how the
 * rest of the stream is analysed (UTF-16 LE/BE, UTF-8, or plain 8-bit).
 *
 * argv[1] (optional) input file, opened in binary mode; stdin when absent.
 * argv[2] (optional) encoding name to test for; must be one of the names in
 *         valid[]. When given, the outcome is printed as "[result]\tfile",
 *         where result is set to 1 by the branches below that accept the
 *         requested name. When absent, findings are printed as text and
 *         report() is written to stderr.
 *
 * Returns 0 after processing the input, 1 when the second argument is not a
 * known name, the input cannot be opened, or no temporary file is available.
 */
int main(int argc,char * argv[])
    {
    int b1;
    int b2;
    int b3 = 0;
    FILE * fi = stdin;
    if(argc > 1)
        {
        fi = fopen(argv[1],"rb");
        }
    if(argc > 2)
        {
        static const char * const valid[] =
            {"LE"
            ,"BE"
            ,"UCS-2"
            ,"UCS-2-LE"
            ,"UCS-2-BE"
            ,"BIN"
            ,"UTF-16"
            ,"UTF-16-LE"
            ,"UTF-16-BE"
            ,"ASCII"
            ,"ISO"
            ,"UTF-8"
            ,"UTF-8-BOM"
            };
        const size_t nvalid = sizeof(valid)/sizeof(valid[0]);
        size_t i;
        type = argv[2];
        for(i = 0;i < nvalid;++i)
            if(!strcmp(type,valid[i]))
                break;
        if(i == nvalid)
            {
            printf("second argument must be one of:\n");
            for(i = 0;i < nvalid;++i)
                printf("%s\n",valid[i]);
            // Do not leave the input open on this early exit.
            if(fi && fi != stdin)
                fclose(fi);
            return 1;
            }
        }
    if(!fi)
        {
        // fi can only be NULL when fopen(argv[1]) was attempted and failed.
        printf("Cannot open input %s\n",argv[1]);
        return 1;
        }
    makeDuples();
    b1 = fgetc(fi);
    b2 = fgetc(fi);
    // NOTE(review): if the input is a single NUL byte, b2 is EOF and the
    // "b1 == 0 || b2 == 0" branch below passes a negative value to
    // surrogate(); left as in the original.

    if(b1 == 0xff && b2 == 0xfe)
        {
        if(type[0])
            {
            le = true;
            if(!strcmp(type,"LE"))
                result = 1;
            }
        else
            {
            printf("Byte order: little endian\n");
            }
        littleendian(fi);
        }
    else if(b1 == 0xfe && b2 == 0xff)
        {
        bigendian(fi,true);
        }
    else if(b1 == 0 || b2 == 0) // No BOM, but still looks like UTF-16
        { // assume big endian
        surrogate((b1 << 8) + b2);
        bigendian(fi,false);
        }
    else
        {
        b3 = fgetc(fi);
        if(b3 == 0)// No BOM, but still looks like UTF-16
            { // assume big endian
            surrogate((b1 << 8) + b2);
            surrogate((b3 << 8) + fgetc(fi));
            bigendian(fi,false);
            }
        else
            {
            // The input may be stdin and cannot be rewound, so the data is
            // buffered in a temporary file that can be read more than once.
            FILE * ftempInput = tmpfile();
            int k;
            bool zeroFound = false;
            if(!ftempInput)
                {
                fprintf(stderr,"Cannot create temporary file\n");
                if(fi != stdin)
                    fclose(fi);
                deleteDuples(); // undo makeDuples(), as the normal exit does
                return 1;
                }
            if(b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) // BOM found, probably UTF8
                {
                ;// remove BOM: keep b1..b3 non-zero as the "BOM seen" marker
                }
            else
                {
                // Inputs shorter than three bytes leave EOF in some of
                // b1..b3; writing EOF with fputc would store a 0xFF byte
                // that was never in the input.
                if(b1 != EOF)
                    fputc(b1,ftempInput);
                if(b2 != EOF)
                    fputc(b2,ftempInput);
                if(b3 != EOF)
                    fputc(b3,ftempInput);
                b1 = b2 = b3 = 0; // zero means "no UTF-8 BOM" from here on
                }

            while((k = fgetc(fi)) != EOF)
                {
                if(k == 0)
                    zeroFound = true;
                fputc(k,ftempInput);
                }
            rewind(ftempInput);
            if(zeroFound)
                {
                // A zero byte further on: treat as big endian UTF-16.
                if(b1 && b2 && b3)
                    {
                    // The three withheld BOM bytes plus the next byte are
                    // fed in as two 16-bit units.
                    surrogate((b1 << 8) + b2);
                    surrogate((b3 << 8) + fgetc(ftempInput));
                    }
                bigendian(ftempInput,false);
                }
            else
                {
                bool bom = false;
                if(b1 && b2 && b3)
                    {
                    if(type[0])
                        {
                        bom = true;
                        }
                    else
                        printf("BOM found, but not UTF-16. (UTF-8 file created in Windows?)\n");
                    }
                if(UTF8(ftempInput))
                    {
                    utf8 = true;
                    if(ascii)
                        {
                        if(type[0])
                            {
                            if(  (!bom && (!strcmp(type,"ASCII") || !strcmp(type,"ISO")))
                              || !strcmp(type,"UTF-8")
                              || (bom && !strcmp(type,"UTF-8-BOM"))
                              )
                                result = 1;
                            }
                        else
                            {
                            printf("encoding: ASCII (subset of UTF-8 and all ISO-8859 encodings)\n");
                            }
                        }
                    else
                        {
                        if(type[0])
                            {
                            if(!strcmp(type,"UTF-8") || (bom && !strcmp(type,"UTF-8-BOM")))
                                result = 1;
                            }
                        else
                            {
                            printf("encoding: UTF-8\n");
                            }
                        }
                    }
                else
                    {
                    // No rewind here: reading continues from the position
                    // UTF8() left the stream at.
                    while((k = fgetc(ftempInput)) != EOF)
                        getbyte(k);
                    if(type[0])
                        {
                        if(!bom && (!strcmp(type,"ASCII") || !strcmp(type,"ISO")))
                            result = 1;
                        }
                    else
                        {
                        printf("encoding: 8-bits\n");
                        }
                    }
                }
            fclose(ftempInput); // also removes the temporary file
            }
        }
    if(fi != stdin)
        fclose(fi);
    if(!type[0])
        fprintf(stderr,"%s",report());
    if(!utf8 && ascii)
        if(!type[0])
            printf("File could have been encoded in ASCII!\n");
    deleteDuples();
    if(type[0])
        {
        // type is only non-empty here when argv[2] was given, so argv[1] exists.
        printf("[%d]\t%s\n",result,argv[1]);
        }
    return 0;
    }
示例#2
0
/*
 * Entry point: reads the input, looks at its leading bytes for a byte order
 * mark (or zero bytes that suggest UTF-16) and hands the input and output
 * streams to the matching helper.
 *
 * argv[1] (optional) input file, opened in binary mode; stdin when absent.
 * argv[2] (optional) output file, opened in binary mode; stdout when absent.
 *
 * report() is written to stderr before exit.
 * Returns 0 after processing, 1 when a file cannot be opened or no temporary
 * file is available.
 */
int main(int argc,char * argv[])
    {
    int b1;
    int b2;
    int b3 = 0;
    FILE * fi = stdin;
    FILE * fo = stdout;
    if(argc > 1)
        {
        fi = fopen(argv[1],"rb");
        if(argc > 2)
            fo = fopen(argv[2],"wb");
        }
    if(!fi || !fo)
        {
        // A NULL stream implies the matching argv entry exists, because the
        // defaults (stdin/stdout) are only replaced by fopen results.
        if(!fi)
            fprintf(stderr,"Cannot open input %s\n",argv[1]);
        if(!fo)
            fprintf(stderr,"Cannot open output %s\n",argv[2]);
        // Close whichever stream did open before giving up.
        if(fi && fi != stdin)
            fclose(fi);
        if(fo && fo != stdout)
            fclose(fo);
        return 1;
        }
    makeDuples();
    b1 = fgetc(fi);
    b2 = fgetc(fi);
    // NOTE(review): if the input is a single NUL byte, b2 is EOF and the
    // "b1 == 0 || b2 == 0" branch below passes a negative value to
    // surrogate(); left as in the original.
    if(b1 == 0xff && b2 == 0xfe)
        littleendian(fi,fo);
    else if(b1 == 0xfe && b2 == 0xff)
        bigendian(fi,fo);
    else if(b1 == 0 || b2 == 0) // No BOM, but still looks like UTF-16
        { // assume big endian
        surrogate((b1 << 8) + b2,fo);
        bigendian(fi,fo);
        }
    else
        {
        b3 = fgetc(fi);
        if(b3 == 0)// No BOM, but still looks like UTF-16
            { // assume big endian
            surrogate((b1 << 8) + b2,fo);
            surrogate((b3 << 8) + fgetc(fi),fo);
            bigendian(fi,fo);
            }
        else
            {
            // The input may be stdin and cannot be rewound, so the data is
            // buffered in a temporary file that can be read more than once.
            FILE * ftempInput = tmpfile();
            int k;
            bool zeroFound = false;
            if(!ftempInput)
                {
                fprintf(stderr,"Cannot create temporary file\n");
                if(fi != stdin)
                    fclose(fi);
                if(fo != stdout)
                    fclose(fo);
                deleteDuples(); // undo makeDuples(), as the normal exit does
                return 1;
                }
            if(b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) // BOM found, probably UTF8
                {
                ; // remove BOM: keep b1..b3 non-zero as the "BOM seen" marker
                }
            else
                {
                // Inputs shorter than three bytes leave EOF in some of
                // b1..b3; writing EOF with fputc would store a 0xFF byte
                // that was never in the input and would end up in the output.
                if(b1 != EOF)
                    fputc(b1,ftempInput);
                if(b2 != EOF)
                    fputc(b2,ftempInput);
                if(b3 != EOF)
                    fputc(b3,ftempInput);
                b1 = b2 = b3 = 0; // zero means "no UTF-8 BOM" from here on
                }

            while((k = fgetc(fi)) != EOF)
                {
                if(k == 0)
                    zeroFound = true;
                fputc(k,ftempInput);
                }
            rewind(ftempInput);
            if(zeroFound)
                {
                // A zero byte further on: treat as big endian UTF-16.
                if(b1 && b2 && b3)
                    {
                    // The three withheld BOM bytes plus the next byte are
                    // fed in as two 16-bit units.
                    surrogate((b1 << 8) + b2,fo);
                    surrogate((b3 << 8) + fgetc(ftempInput),fo);
                    }
                bigendian(ftempInput,fo);
                }
            else if(!UTF8(ftempInput,fo))
                {
                // Not handled as UTF-8: pass the bytes through unchanged.
                rewind(ftempInput);
                if(b1 && b2 && b3) // "BOM" found, but not in UTF8 file!
                    { // write "BOM"
                    fputc(b1,fo);
                    fputc(b2,fo);
                    fputc(b3,fo);
                    }
                copy(ftempInput,fo);
                }
            fclose(ftempInput); // also removes the temporary file
            }
        }
    if(fi != stdin)
        fclose(fi);
    if(fo != stdout)
        fclose(fo);
    fprintf(stderr,"%s",report());
    deleteDuples();
    return 0;
    }