/*
 * Write one 16-bit code unit of a UTF-16 / UCS-2 stream to fo.
 *
 * s   the code unit to write; a value <= 0 passed while a high surrogate
 *     is pending only flushes that pending unit.
 * fo  output stream.
 *
 * A high (leading) surrogate, 0xD800-0xDBFF, is not written immediately.
 * It is remembered in a function-level static until the next call:
 *   - if the next unit is a low (trailing) surrogate, 0xDC00-0xDFFF, the
 *     pair is combined into a single code point >= 0x10000 and written as
 *     a hexadecimal numeric character reference "&#x...;";
 *   - otherwise the input is assumed to be UCS-2 rather than UTF-16: the
 *     remembered unit is written on its own as "&#x...;" and the new unit
 *     is then handled exactly like any other unit (so it may itself start
 *     a new surrogate pair, or be written as a plain byte).
 *
 * Every other unit is handed to getEightBit() (defined elsewhere in this
 * file). If that returns 0 for a non-zero unit, the unit is written as
 * "&#x...;"; in all other cases the returned value is written as one byte.
 *
 * Because the pending unit lives in a static variable, the function keeps
 * state for only one stream at a time.
 */
void surrogate(int s, FILE * fo)
{
    static int prev = 0; /* pending high surrogate, 0 when none */

    if (prev != 0) {
        int high = prev;
        prev = 0;

        if ((s & 0xFC00) == 0xDC00) {
            /* Valid pair: rebuild the supplementary-plane code point. */
            int codepoint = (s & 0x3ff) + ((high & 0x3ff) << 10) + 0x10000;
            fprintf(fo, "&#x%x;", codepoint);
            return;
        }

        /* Unpaired high surrogate: assume UCS-2 and emit it as it is. */
        fprintf(fo, "&#x%x;", high);

        /* A zero or negative s is only a request to empty the pending unit. */
        if (s <= 0)
            return;

        /* Otherwise fall through and treat s as a fresh unit. */
    }

    if ((s & 0xFC00) == 0xD800) {
        /* First half of a surrogate pair: wait for the second half. */
        prev = s;
        return;
    }

    int eight = getEightBit(s);
    if (eight == 0 && s != 0)
        fprintf(fo, "&#x%x;", s);
    else
        fputc(eight, fo);
}
// The UTF-8 byte sequence EF BB BF = 1110 1111  1011 1011  1011 1111
// carries the payload bits 1111 | 111011 | 111111 = 1111 1110 1111 1111
// = FEFF, the Byte Order Mark.
// It is added by many Windows programs to UTF-8 files.

/*
 * Decode the whole of fi as UTF-8 and write the converted text to fto.
 *
 * Bytes whose top two bits are 00 or 01 are copied unchanged. A lead byte
 * (top bits 11) announces one continuation byte for every further leading
 * 1 bit; lead and continuation payload bits are assembled into the code
 * point K. K is handed to getEightBit() (defined elsewhere in this file):
 * a zero result makes K appear as "&#x...;", any other result is written
 * as a single byte.
 *
 * Returns false as soon as the input is found not to be UTF-8:
 *   - a lead byte 0xFE or 0xFF,
 *   - a missing or malformed continuation byte (including end of file in
 *     the middle of a sequence),
 *   - a continuation byte where a lead byte was expected,
 *   - a multi-byte sequence that decodes to a value <= 0x7F (overlong).
 * Returns true when end of file is reached without any of these.
 */
static bool UTF8decode(FILE * fi, FILE * fto)
{
    int k[6]; // k[0] is the lead byte, k[1]..k[5] the continuation bytes

    while ((k[0] = fgetc(fi)) != EOF) {
        switch (k[0] & 0xc0) {
        case 0xc0: { // 11bbbbbb: start of a multi-byte sequence
            if ((k[0] & 0xfe) == 0xfe)
                return false;

            // Inspect the second, third, ... highest bits of the lead byte
            // and read one continuation byte for every bit that is 1.
            int i = 1;
            while ((k[0] << i) & 0x80) {
                k[i] = fgetc(fi);
                // Must be 10bbbbbb. EOF (-1) fails this test as well.
                if ((k[i++] & 0xc0) != 0x80)
                    return false;
            }

            // i is now the total sequence length. Shift the length marker
            // out of the lead byte, then move what remains to the top of
            // the code point.
            int K = ((k[0] << i) & 0xff) << (5 * i - 6);

            // Append six payload bits per continuation byte; the last
            // byte supplies the lowest bits.
            int I = --i;
            while (i > 0) {
                K |= (k[i] & 0x3f) << ((I - i) * 6);
                --i;
            }

            if (K <= 0x7f)
                return false; // overlong UTF-8 sequence

            int eight = getEightBit(K);
            if (eight == 0) // K is known to be non-zero here
                fprintf(fto, "&#x%x;", K);
            else
                fputc(eight, fto);
            break;
        }
        case 0x80: // 10bbbbbb: continuation byte without a lead byte
            return false; // Not UTF-8
        default: // 00bbbbbb or 01bbbbbb: copied as it is
            fputc(k[0], fto);
            break;
        }
    }
    return true;
}

/*
 * Convert the UTF-8 stream fi and write the result to fo.
 *
 * The converted text is first collected in a temporary file and copied to
 * fo only after all of fi has been accepted, so nothing is written to fo
 * when the input turns out not to be UTF-8.
 *
 * Returns true if fi was valid UTF-8 and its conversion has been written
 * to fo; false if fi is not UTF-8 or the temporary file could not be
 * created. The temporary file is closed on every path.
 */
bool UTF8(FILE * fi, FILE * fo)
{
    FILE * fto = tmpfile();
    if (fto == NULL)
        return false;

    bool valid = UTF8decode(fi, fto);
    if (valid) {
        int c;
        rewind(fto);
        while ((c = fgetc(fto)) != EOF)
            fputc(c, fo);
    }

    fclose(fto); // also removes the temporary file
    return valid;
}
/*
 * Look up s with getEightBit() and clear the file-level `ascii` flag when
 * the result lies outside the range 0..127. The flag is never set to true
 * here, so once cleared it stays cleared as far as this function goes.
 */
void getbyte(int s)
{
    const int mapped = getEightBit(s);
    const bool sevenBit = (mapped >= 0 && mapped <= 127);

    if (!sevenBit)
        ascii = false;
}