int main(int argc,char * argv[]) { int b1,b2,b3 = 0; FILE * fi = stdin; if(argc > 1) { fi = fopen(argv[1],"rb"); } if(argc > 2) { type = argv[2]; char * valid[] = {"LE" ,"BE" ,"UCS-2" ,"UCS-2-LE" ,"UCS-2-BE" ,"BIN" ,"UTF-16" ,"UTF-16-LE" ,"UTF-16-BE" ,"ASCII" ,"ISO" ,"UTF-8" ,"UTF-8-BOM" }; int i; for(i = sizeof(valid)/sizeof(valid[0]);--i >= 0;) if(!strcmp(type,valid[i])) break; if(i < 0) { printf("second argument must be one of:\n"); for(i = 0;i < sizeof(valid)/sizeof(valid[0]);++i) printf("%s\n",valid[i]); return 1; } } if(!fi) { printf("Cannot open input %s\n",argv[1]); return 1; } makeDuples(); b1 = fgetc(fi); b2 = fgetc(fi); if(b1 == 0xff && b2 == 0xfe) { if(type[0]) { le = true; if(!strcmp(type,"LE")) result = 1; } else { printf("Byte order: little endian\n"); } littleendian(fi); } else if(b1 == 0xfe && b2 == 0xff) { bigendian(fi,true); } else if(b1 == 0 || b2 == 0) // No BOM, but still looks like UTF-16 { // assume big endian surrogate((b1 << 8) + b2); bigendian(fi,false); } else { FILE * ftempInput = tmpfile(); b3 = fgetc(fi); if(b3 == 0)// No BOM, but still looks like UTF-16 { // assume big endian surrogate((b1 << 8) + b2); surrogate((b3 << 8) + fgetc(fi)); bigendian(fi,false); } else { if(b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) // BOM found, probably UTF8 { ;// remove BOM } else { fputc(b1,ftempInput); fputc(b2,ftempInput); fputc(b3,ftempInput); b1 = b2 = b3 = 0; } int k; bool zeroFound = false; while((k = fgetc(fi)) != EOF) { if(k == 0) zeroFound = true; fputc(k,ftempInput); } rewind(ftempInput); if(zeroFound) { if(b1 && b2 && b3) { surrogate((b1 << 8) + b2); surrogate((b3 << 8) + fgetc(ftempInput)); } bigendian(ftempInput,false); } else { bool bom = false; if(b1 && b2 && b3) { if(type[0]) { bom = true; } else printf("BOM found, but not UTF-16. 
(UTF-8 file created in Windows?)\n"); } if(UTF8(ftempInput)) { utf8 = true; if(ascii) { if(type[0]) { if( !bom && (!strcmp(type,"ASCII") || !strcmp(type,"ISO")) || !strcmp(type,"UTF-8") || bom && !strcmp(type,"UTF-8-BOM") ) result = 1; } else { printf("encoding: ASCII (subset of UTF-8 and all ISO-8859 encodings)\n"); } } else { if(type[0]) { if(!strcmp(type,"UTF-8") || bom && !strcmp(type,"UTF-8-BOM")) result = 1; } else { printf("encoding: UTF-8\n"); } } } else { int c = 0; while((k = fgetc(ftempInput)) != EOF) getbyte(k); if(type[0]) { if(!bom && (!strcmp(type,"ASCII") || !strcmp(type,"ISO"))) result = 1; } else { printf("encoding: 8-bits\n"); } } } } } if(fi != stdin) fclose(fi); if(!type[0]) fprintf(stderr,"%s",report()); if(!utf8 && ascii) if(!type[0]) printf("File could have been encoded in ASCII!\n"); deleteDuples(); if(type[0]) { printf("[%d]\t%s\n",result,argv[1]); } return 0; }
/*
 * Entry point: read an input, detect its encoding from the leading bytes and
 * content, and pass it on to the output through the matching routine.
 *
 * Usage: prog [input [output]]
 *   argv[1]  input path, opened in binary mode; stdin is read when absent.
 *   argv[2]  output path, opened in binary mode; stdout is used when absent.
 *
 * Detection order:
 *   1. FF FE            -> littleendian(fi, fo)
 *   2. FE FF            -> bigendian(fi, fo)
 *   3. a zero in the first two or three bytes -> treated as BOM-less
 *      big-endian UTF-16; the bytes already consumed are fed to surrogate().
 *   4. otherwise the rest of the input is spooled to a temporary file (minus
 *      a leading EF BB BF), so that it can be re-read:
 *        - any zero byte       -> bigendian() on the spooled data
 *        - UTF8() returns zero -> the data (with the three "BOM" bytes put
 *          back, if they were removed) is passed to copy().
 *
 * The text returned by report() is always written to stderr at the end.
 *
 * Returns 0 on success; 1 when a file cannot be opened, the temporary file
 * cannot be created, or closing the output file fails.
 */
int main(int argc, char *argv[])
{
    int b1, b2, b3 = 0;
    int status = 0;
    FILE *fi = stdin;
    FILE *fo = stdout;

    if (argc > 1) {
        fi = fopen(argv[1], "rb");
        if (!fi) {
            /* Bail out before touching the output, so that an existing
             * output file is not truncated when there is nothing to read. */
            return 1;
        }
        if (argc > 2) {
            fo = fopen(argv[2], "wb");
            if (!fo) {
                fclose(fi);
                return 1;
            }
        }
    }

    makeDuples();

    b1 = fgetc(fi);
    b2 = fgetc(fi);

    if (b1 == 0xff && b2 == 0xfe) {
        /* UTF-16 little-endian BOM. */
        littleendian(fi, fo);
    } else if (b1 == 0xfe && b2 == 0xff) {
        /* UTF-16 big-endian BOM. */
        bigendian(fi, fo);
    } else if (b1 == 0 || b2 == 0) {
        /* No BOM, but still looks like UTF-16: assume big endian. */
        surrogate((b1 << 8) + b2, fo);
        bigendian(fi, fo);
    } else {
        b3 = fgetc(fi);
        if (b3 == 0) {
            /* No BOM, but still looks like UTF-16: assume big endian. */
            surrogate((b1 << 8) + b2, fo);
            surrogate((b3 << 8) + fgetc(fi), fo);
            bigendian(fi, fo);
        } else {
            /* EF BB BF is the UTF-8 encoded byte order mark. */
            const bool utf8Bom = (b1 == 0xEF && b2 == 0xBB && b3 == 0xBF);
            bool zeroFound = false;
            int k;
            /* The temporary file is only needed on this path, so it is
             * created here and closed again before leaving the branch. */
            FILE *ftempInput = tmpfile();

            if (!ftempInput) {
                fprintf(stderr, "Cannot create temporary file\n");
                if (fi != stdin) {
                    fclose(fi);
                }
                if (fo != stdout) {
                    fclose(fo);
                }
                deleteDuples();
                return 1;
            }

            if (!utf8Bom) {
                /* Put the look-ahead bytes back in front of the data.  For
                 * inputs shorter than three bytes some of them are EOF;
                 * fputc() would store those as 0xFF, and copy() would then
                 * emit bytes that were never in the input. */
                if (b1 != EOF) {
                    fputc(b1, ftempInput);
                }
                if (b2 != EOF) {
                    fputc(b2, ftempInput);
                }
                if (b3 != EOF) {
                    fputc(b3, ftempInput);
                }
            }
            /* else: the BOM is dropped from the spooled data. */

            while ((k = fgetc(fi)) != EOF) {
                if (k == 0) {
                    zeroFound = true;
                }
                fputc(k, ftempInput);
            }
            rewind(ftempInput);

            if (zeroFound) {
                /* A zero byte later on: treat everything as big-endian
                 * UTF-16, including the three bytes that looked like a
                 * UTF-8 BOM (they were not spooled, so feed them here). */
                if (utf8Bom) {
                    surrogate((b1 << 8) + b2, fo);
                    surrogate((b3 << 8) + fgetc(ftempInput), fo);
                }
                bigendian(ftempInput, fo);
            } else if (!UTF8(ftempInput, fo)) {
                rewind(ftempInput);
                if (utf8Bom) {
                    /* "BOM" found, but not in a UTF-8 file: the three bytes
                     * are ordinary data, so write them back out. */
                    fputc(b1, fo);
                    fputc(b2, fo);
                    fputc(b3, fo);
                }
                copy(ftempInput, fo);
            }

            fclose(ftempInput);
        }
    }

    if (fi != stdin) {
        fclose(fi);
    }
    if (fo != stdout && fclose(fo) != 0) {
        /* fclose() flushes buffered output; a failure means the output
         * file is incomplete, so do not report success. */
        status = 1;
    }
    fprintf(stderr, "%s", report());
    deleteDuples();
    return status;
}