/** * maps a file to our address space * and returns it the calling function. */ void load_file(URL *U, char *file) { FILE *fp; size_t len = 0; struct stat st; char *filename; char postdata[POST_BUF]; unichar ubuf[POST_BUF+1]; size_t ulen; filename = trim(file); memset(postdata, 0, POST_BUF); if((lstat(filename, &st) == 0) || (errno != ENOENT)){ len = st.st_size; if((fp = fopen(filename, "r")) == NULL){ joe_error("could not open file: %s", filename); return; } if((fread(postdata, 1, len, fp )) == len){ if(looks_ascii((unsigned char*)postdata,len,ubuf,&ulen)) trim(postdata); else if(looks_utf8((unsigned char*)postdata,len,ubuf,&ulen)) trim(postdata); } else { joe_error( "unable to read file: %s", filename ); } fclose(fp); } if(strlen(postdata) > 0){ U->conttype = xstrdup(get_content_type(filename)); U->postlen = strlen(postdata); U->postdata = malloc(U->postlen); memcpy(U->postdata, postdata, U->postlen); U->postdata[U->postlen] = 0; } return; }
int file_ascmagic(RMagic *ms, const ut8 *buf, size_t nbytes) { return 0; size_t i; ut8 *nbuf = NULL, *utf8_buf = NULL, *utf8_end; unichar *ubuf = NULL; size_t ulen, mlen; const struct names *p; int rv = -1; int mime = ms->flags & R_MAGIC_MIME; const char *code = NULL; const char *code_mime = NULL; const char *type = NULL; const char *subtype = NULL; const char *subtype_mime = NULL; int has_escapes = 0; int has_backspace = 0; int seen_cr = 0; int n_crlf = 0; int n_lf = 0; int n_cr = 0; int n_nel = 0; size_t last_line_end = (size_t)-1; int has_long_lines = 0; /* * Undo the NUL-termination kindly provided by process() * but leave at least one byte to look at */ while (nbytes > 1 && buf[nbytes - 1] == '\0') nbytes--; if (!(nbuf = calloc(1, (nbytes + 1) * sizeof(nbuf[0])))) goto done; if (!(ubuf = calloc(1, (nbytes + 1) * sizeof(ubuf[0])))) goto done; /* * Then try to determine whether it's any character code we can * identify. Each of these tests, if it succeeds, will leave * the text converted into one-unichar-per-character Unicode in * ubuf, and the number of characters converted in ulen. */ if (looks_ascii(buf, nbytes, ubuf, &ulen)) { code = "ASCII"; code_mime = "us-ascii"; type = "text"; } else if (looks_utf8_with_BOM(buf, nbytes, ubuf, &ulen) > 0) { code = "UTF-8 Unicode (with BOM)"; code_mime = "utf-8"; type = "text"; } else if (file_looks_utf8(buf, nbytes, ubuf, &ulen) > 1) { code = "UTF-8 Unicode"; code_mime = "utf-8"; type = "text"; } else if ((i = looks_ucs16(buf, nbytes, ubuf, &ulen)) != 0) { if (i == 1) code = "Little-endian UTF-16 Unicode"; else code = "Big-endian UTF-16 Unicode"; type = "character data"; code_mime = "utf-16"; /* is this defined? */ } else if (looks_latin1(buf, nbytes, ubuf, &ulen)) { if (!memcmp (buf, "\xff\xff\xff\xff", 4)) { // uninitialized memory is not iso-8859!! goto done; } code = "ISO-8859"; type = "text"; code_mime = "iso-8859-1"; } else if (looks_extended(buf, nbytes, ubuf, &ulen)) { code = "Non-ISO extended-ASCII"; type = "text"; code_mime = "unknown"; } else { from_ebcdic(buf, nbytes, nbuf); if (looks_ascii(nbuf, nbytes, ubuf, &ulen)) { code = "EBCDIC"; type = "character data"; code_mime = "ebcdic"; } else if (looks_latin1(nbuf, nbytes, ubuf, &ulen)) { code = "International EBCDIC"; type = "character data"; code_mime = "ebcdic"; } else { rv = 0; goto done; /* doesn't look like text at all */ } } if (nbytes <= 1) { rv = 0; goto done; } /* Convert ubuf to UTF-8 and try text soft magic */ /* If original was ASCII or UTF-8, could use nbuf instead of re-converting. */ /* malloc size is a conservative overestimate; could be re-converting improved, or at least realloced after re-converting conversion. */ mlen = ulen * 6; if (!(utf8_buf = malloc(mlen))) { file_oomem(ms, mlen); goto done; } if (!(utf8_end = encode_utf8(utf8_buf, mlen, ubuf, ulen))) goto done; if (file_softmagic(ms, utf8_buf, utf8_end - utf8_buf, TEXTTEST) != 0) { rv = 1; goto done; } /* look for tokens from names.h - this is expensive! */ if ((ms->flags & R_MAGIC_NO_CHECK_TOKENS) != 0) goto subtype_identified; i = 0; while (i < ulen) { size_t end; /* skip past any leading space */ while (i < ulen && ISSPC(ubuf[i])) i++; if (i >= ulen) break; /* find the next whitespace */ for (end = i + 1; end < nbytes; end++) if (ISSPC(ubuf[end])) break; /* compare the word thus isolated against the token list */ for (p = names; p < names + NNAMES; p++) { if (ascmatch((const ut8 *)p->name, ubuf + i, end - i)) { subtype = types[p->type].human; subtype_mime = types[p->type].mime; goto subtype_identified; } } i = end; } subtype_identified: /* Now try to discover other details about the file. */ for (i = 0; i < ulen; i++) { if (ubuf[i] == '\n') { if (seen_cr) n_crlf++; else n_lf++; last_line_end = i; } else if (seen_cr) n_cr++; seen_cr = (ubuf[i] == '\r'); if (seen_cr) last_line_end = i; if (ubuf[i] == 0x85) { /* X3.64/ECMA-43 "next line" character */ n_nel++; last_line_end = i; } /* If this line is _longer_ than MAXLINELEN, remember it. */ if (i > last_line_end + MAXLINELEN) has_long_lines = 1; if (ubuf[i] == '\033') has_escapes = 1; if (ubuf[i] == '\b') has_backspace = 1; } /* Beware, if the data has been truncated, the final CR could have been followed by a LF. If we have HOWMANY bytes, it indicates that the data might have been truncated, probably even before this function was called. */ if (seen_cr && nbytes < HOWMANY) n_cr++; if (mime) { if (mime & R_MAGIC_MIME_TYPE) { if (subtype_mime) { if (file_printf(ms, subtype_mime) == -1) goto done; } else { if (file_printf(ms, "text/plain") == -1) goto done; } } if ((mime == 0 || mime == R_MAGIC_MIME) && code_mime) { if ((mime & R_MAGIC_MIME_TYPE) && file_printf(ms, " charset=") == -1) goto done; if (file_printf(ms, code_mime) == -1) goto done; } if (mime == R_MAGIC_MIME_ENCODING) if (file_printf(ms, "binary") == -1){ rv = 1; goto done; } } else { if (file_printf(ms, code) == -1) goto done; if (subtype) { if (file_printf(ms, " ") == -1) goto done; if (file_printf(ms, subtype) == -1) goto done; } if (file_printf(ms, " ") == -1) goto done; if (file_printf(ms, type) == -1) goto done; if (has_long_lines) if (file_printf(ms, ", with very long lines") == -1) goto done; /* * Only report line terminators if we find one other than LF, * or if we find none at all. */ if ((n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) || (n_crlf != 0 || n_cr != 0 || n_nel != 0)) { if (file_printf(ms, ", with") == -1) goto done; if (n_crlf == 0 && n_cr == 0 && n_nel == 0 && n_lf == 0) { if (file_printf(ms, " no") == -1) goto done; } else { if (n_crlf) { if (file_printf(ms, " CRLF") == -1) goto done; if (n_cr || n_lf || n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_cr) { if (file_printf(ms, " CR") == -1) goto done; if (n_lf || n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_lf) { if (file_printf(ms, " LF") == -1) goto done; if (n_nel) if (file_printf(ms, ",") == -1) goto done; } if (n_nel) if (file_printf(ms, " NEL") == -1) goto done; } if (file_printf(ms, " line terminators") == -1) goto done; } if (has_escapes) if (file_printf(ms, ", with escape sequences") == -1) goto done; if (has_backspace) if (file_printf(ms, ", with overstriking") == -1) goto done; } rv = 1; done: free (nbuf); free (ubuf); free (utf8_buf); return rv; }