int file_buffer(RMagic *ms, int fd, const char *inname, const void *buf, size_t nb) { int mime, m = 0; if (!ms) return -1; mime = ms->flags & R_MAGIC_MIME; if (nb == 0) { if ((!mime || (mime & R_MAGIC_MIME_TYPE)) && file_printf(ms, mime ? "application/x-empty" : "empty") == -1) return -1; return 1; } else if (nb == 1) { if ((!mime || (mime & R_MAGIC_MIME_TYPE)) && file_printf(ms, mime ? "application/octet-stream" : "very short file (no magic)") == -1) return -1; return 1; } #if 0 /* try compression stuff */ if ((ms->flags & R_MAGIC_NO_CHECK_COMPRESS) != 0 || (m = file_zmagic(ms, fd, inname, buf, nb)) == 0) { #endif /* Check if we have a tar file */ if ((ms->flags & R_MAGIC_NO_CHECK_TAR) != 0 || (m = file_is_tar(ms, buf, nb)) == 0) { /* try tests in /etc/magic (or surrogate magic file) */ if ((ms->flags & R_MAGIC_NO_CHECK_SOFT) != 0 || (m = file_softmagic(ms, buf, nb, BINTEST)) == 0) { /* try known keywords, check whether it is ASCII */ if ((ms->flags & R_MAGIC_NO_CHECK_ASCII) != 0 || (m = file_ascmagic(ms, buf, nb)) == 0) { /* abandon hope, all ye who remain here */ if ((!mime || (mime & R_MAGIC_MIME_TYPE))) { // if (mime) file_printf (ms, "application/octet-stream"); return -1; } m = 1; } } } #if 0 } #endif return m; }
int file_ascmagic(RMagic *ms, const ut8 *buf, size_t nbytes) { return 0; size_t i; ut8 *nbuf = NULL, *utf8_buf = NULL, *utf8_end; unichar *ubuf = NULL; size_t ulen, mlen; const struct names *p; int rv = -1; int mime = ms->flags & R_MAGIC_MIME; const char *code = NULL; const char *code_mime = NULL; const char *type = NULL; const char *subtype = NULL; const char *subtype_mime = NULL; int has_escapes = 0; int has_backspace = 0; int seen_cr = 0; int n_crlf = 0; int n_lf = 0; int n_cr = 0; int n_nel = 0; size_t last_line_end = (size_t)-1; int has_long_lines = 0; /* * Undo the NUL-termination kindly provided by process() * but leave at least one byte to look at */ while (nbytes > 1 && buf[nbytes - 1] == '\0') nbytes--; if (!(nbuf = calloc(1, (nbytes + 1) * sizeof(nbuf[0])))) goto done; if (!(ubuf = calloc(1, (nbytes + 1) * sizeof(ubuf[0])))) goto done; /* * Then try to determine whether it's any character code we can * identify. Each of these tests, if it succeeds, will leave * the text converted into one-unichar-per-character Unicode in * ubuf, and the number of characters converted in ulen. */ if (looks_ascii(buf, nbytes, ubuf, &ulen)) { code = "ASCII"; code_mime = "us-ascii"; type = "text"; } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) { code = "UTF-8 Unicode (with BOM)"; code_mime = "utf-8"; type = "text"; } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) { code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else code = "Big-endian UTF-16 Unicode"; type = "character data"; code_mime = "utf-16"; /* is this defined? */ } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { if (!memcmp (buf, "\xff\xff\xff\xff", 4)) { // uninitialized memory is not iso-8859!! goto done; } code = "ISO-8859"; type = "text"; code_mime = "iso-8859-1"; } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { code = "Non-ISO extended-ASCII"; type = "text"; code_mime = "unknown"; } else { from_ebcdic(buf, nbytes, nbuf); if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { code = "EBCDIC"; type = "character data"; code_mime = "ebcdic"; } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) { code = "International EBCDIC"; type = "character data"; code_mime = "ebcdic"; } else { rv = 0; goto done; /* doesn't look like text at all */ } } if (nbytes <= 1) { rv = 0; goto done; } /* Convert ubuf to UTF-8 and try text soft magic */ /* If original was ASCII or UTF-8, could use nbuf instead of re-converting. */ /* malloc size is a conservative overestimate; could be re-converting improved, or at least realloced after re-converting conversion. */ mlen = ulen * 6; if (!(utf8_buf = malloc(mlen))) { file_oomem(ms, mlen); goto done; } if (!(utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))) goto done; if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) { rv = 1; goto done; } /* look for tokens from names.h - this is expensive! */ if ((ms->flags & R_MAGIC_NO_CHECK_TOKENS) != 0) goto subtype_identified; i = 0; while (i < ulen) { size_t end; /* skip past any leading space */ while (i < ulen && ISSPC(ubuf[i])) i++; if (i >= ulen) break; /* find the next whitespace */ for (end = i + 1; end < nbytes; end++) if (ISSPC(ubuf[end])) break; /* compare the word thus isolated against the token list */ for (p = names; p < names + NNAMES; p++) { if (ascmatch((const ut8 *)p->name, ubuf + i, end - i)) { subtype = types[p->type].human; subtype_mime = types[p->type].mime; goto subtype_identified; } } i = end; } subtype_identified: /* Now try to discover other details about the file. */ for (i = 0; i < ulen; i++) { if (ubuf[i] == '\n') { if (seen_cr) n_crlf++; else n_lf++; last_line_end = i; } else if (seen_cr) n_cr++; seen_cr = (ubuf[i] == '\r'); if (seen_cr) last_line_end = i; if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ n_nel++; last_line_end = i; } /* If this line is _longer_ than MAXLINELEN, remember it. */ if (i > last_line_end + MAXLINELEN) has_long_lines = 1; if (ubuf[i] == '\033') has_escapes = 1; if (ubuf[i] == '\b') has_backspace = 1; } /* Beware, if the data has been truncated, the final CR could have been followed by a LF. If we have HOWMANY bytes, it indicates that the data might have been truncated, probably even before this function was called. */ if (seen_cr && nbytes < HOWMANY) n_cr++; if (mime) { if (mime & R_MAGIC_MIME_TYPE) { if (subtype_mime) { if (file_printf(ms, subtype_mime) == -1) goto done; } else { if (file_printf(ms, "text/plain") == -1) goto done; } } if ((mime == 0 || mime == R_MAGIC_MIME) && code_mime) { if ((mime & R_MAGIC_MIME_TYPE) && file_printf(ms, " charset=") == -1) goto done; if (file_printf(ms, code_mime) == -1) goto done; } if (mime == R_MAGIC_MIME_ENCODING) if (file_printf(ms, "binary") == -1){ rv = 1; goto done; } } else { if (file_printf(ms, code) == -1) goto done; if (subtype) { if (file_printf(ms, " ") == -1) goto done; if (file_printf(ms, subtype) == -1) goto done; } if (file_printf(ms, " ") == -1) goto done; if (file_printf(ms, type) == -1) goto done; if (has_long_lines) if (file_printf(ms, ", with very long lines") == -1) goto done; /* * Only report line terminators if we find one other than LF, * or if we find none at all. */ if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { if (file_printf(ms, ", with") == -1) goto done; if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { if (file_printf(ms, " no") == -1) goto done; } else { if (n_crlf) { if (file_printf(ms, " CRLF") == -1) goto done; if (n_cr || n_lf || n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_cr) { if (file_printf(ms, " CR") == -1) goto done; if (n_lf || n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_lf) { if (file_printf(ms, " LF") == -1) goto done; if (n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_nel) if (file_printf(ms, " NEL") == -1) goto done; } if (file_printf(ms, " line terminators") == -1) goto done; } if (has_escapes) if (file_printf(ms, ", with escape sequences") == -1) goto done; if (has_backspace) if (file_printf(ms, ", with overstriking") == -1) goto done; } rv = 1; done: free (nbuf); free (ubuf); free (utf8_buf); return rv; }