/* * Sanitize and enhance an HTML message for display. * Also convert weird character sets to UTF-8 if necessary. * Also fixup img src="cid:..." type inline images to fetch the image * */ void output_html(const char *supplied_charset, int treat_as_wiki, int msgnum, StrBuf *Source, StrBuf *Target) { char buf[SIZ]; char *msg; char *ptr; char *msgstart; char *msgend; StrBuf *converted_msg; int buffer_length = 1; int line_length = 0; int content_length = 0; char new_window[SIZ]; int brak = 0; int alevel = 0; int scriptlevel = 0; int script_start_pos = (-1); int i; int linklen; char charset[128]; StrBuf *BodyArea = NULL; #ifdef HAVE_ICONV iconv_t ic = (iconv_t)(-1) ; char *ibuf; /* Buffer of characters to be converted */ char *obuf; /* Buffer for converted characters */ size_t ibuflen; /* Length of input buffer */ size_t obuflen; /* Length of output buffer */ char *osav; /* Saved pointer to output buffer */ #endif if (Target == NULL) Target = WC->WBuf; safestrncpy(charset, supplied_charset, sizeof charset); msg = strdup(""); sprintf(new_window, "<a target=\"%s\" href=", TARGET); if (Source == NULL) while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) { line_length = strlen(buf); buffer_length = content_length + line_length + 2; ptr = realloc(msg, buffer_length); if (ptr == NULL) { StrBufAppendPrintf(Target, "<b>"); StrBufAppendPrintf(Target, _("realloc() error! couldn't get %d bytes: %s"), buffer_length + 1, strerror(errno)); StrBufAppendPrintf(Target, "</b><br><br>\n"); while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) { /** flush */ } free(msg); return; } msg = ptr; strcpy(&msg[content_length], buf); content_length += line_length; strcpy(&msg[content_length], "\n"); content_length += 1; } else { content_length = StrLength(Source); free(msg); msg = (char*) ChrPtr(Source);/* TODO: remove cast */ buffer_length = content_length; } /** Do a first pass to isolate the message body */ ptr = msg + 1; msgstart = msg; msgend = &msg[content_length]; while (ptr < msgend) { /** Advance to next tag */ ptr = strchr(ptr, '<'); if ((ptr == NULL) || (ptr >= msgend)) break; ++ptr; if ((ptr == NULL) || (ptr >= msgend)) break; /* * Look for META tags. Some messages (particularly in * Asian locales) illegally declare a message's character * set in the HTML instead of in the MIME headers. This * is wrong but we have to work around it anyway. */ if (!strncasecmp(ptr, "META", 4)) { char *meta_start; char *meta_end; int meta_length; char *meta; char *meta_http_equiv; char *meta_content; char *spaceptr; meta_start = &ptr[4]; meta_end = strchr(ptr, '>'); if ((meta_end != NULL) && (meta_end <= msgend)) { meta_length = meta_end - meta_start + 1; meta = malloc(meta_length + 1); safestrncpy(meta, meta_start, meta_length); meta[meta_length] = 0; striplt(meta); if (!strncasecmp(meta, "HTTP-EQUIV=", 11)) { meta_http_equiv = strdup(&meta[11]); spaceptr = strchr(meta_http_equiv, ' '); if (spaceptr != NULL) { *spaceptr = 0; meta_content = strdup(++spaceptr); if (!strncasecmp(meta_content, "content=", 8)) { strcpy(meta_content, &meta_content[8]); stripquotes(meta_http_equiv); stripquotes(meta_content); extract_charset_from_meta(charset, meta_http_equiv, meta_content); } free(meta_content); } free(meta_http_equiv); } free(meta); } } /* * Any of these tags cause everything up to and including * the tag to be removed. */ if ( (!strncasecmp(ptr, "HTML", 4)) ||(!strncasecmp(ptr, "HEAD", 4)) ||(!strncasecmp(ptr, "/HEAD", 5)) ||(!strncasecmp(ptr, "BODY", 4)) ) { char *pBody = NULL; if (!strncasecmp(ptr, "BODY", 4)) { pBody = ptr; } ptr = strchr(ptr, '>'); if ((ptr == NULL) || (ptr >= msgend)) break; if ((pBody != NULL) && (ptr - pBody > 4)) { char* src; char *cid_start, *cid_end; *ptr = '\0'; pBody += 4; while ((isspace(*pBody)) && (pBody < ptr)) pBody ++; BodyArea = NewStrBufPlain(NULL, ptr - pBody); if (pBody < ptr) { src = strstr(pBody, "cid:"); if (src) { cid_start = src + 4; cid_end = cid_start; while ((*cid_end != '"') && !isspace(*cid_end) && (cid_end < ptr)) cid_end ++; /* copy tag and attributes up to src="cid: */ StrBufAppendBufPlain(BodyArea, pBody, src - pBody, 0); /* add in /webcit/mimepart/<msgno>/CID/ trailing / stops dumb URL filters getting excited */ StrBufAppendPrintf(BodyArea, "/webcit/mimepart/%d/",msgnum); StrBufAppendBufPlain(BodyArea, cid_start, cid_end - cid_start, 0); if (ptr - cid_end > 0) StrBufAppendBufPlain(BodyArea, cid_end + 1, ptr - cid_end, 0); } else StrBufAppendBufPlain(BodyArea, pBody, ptr - pBody, 0); } *ptr = '>'; } ++ptr; if ((ptr == NULL) || (ptr >= msgend)) break; msgstart = ptr; } /* * Any of these tags cause everything including and following * the tag to be removed. */ if ( (!strncasecmp(ptr, "/HTML", 5)) ||(!strncasecmp(ptr, "/BODY", 5)) ) { --ptr; msgend = ptr; strcpy(ptr, ""); } ++ptr; } if (msgstart > msg) { strcpy(msg, msgstart); } /* Now go through the message, parsing tags as necessary. */ converted_msg = NewStrBufPlain(NULL, content_length + 8192); /** Convert foreign character sets to UTF-8 if necessary. */ #ifdef HAVE_ICONV if ( (strcasecmp(charset, "us-ascii")) && (strcasecmp(charset, "UTF-8")) && (strcasecmp(charset, "")) ) { syslog(LOG_DEBUG, "Converting %s to UTF-8\n", charset); ctdl_iconv_open("UTF-8", charset, &ic); if (ic == (iconv_t)(-1) ) { syslog(LOG_WARNING, "%s:%d iconv_open() failed: %s\n", __FILE__, __LINE__, strerror(errno)); } } if (Source == NULL) { if (ic != (iconv_t)(-1) ) { ibuf = msg; ibuflen = content_length; obuflen = content_length + (content_length / 2) ; obuf = (char *) malloc(obuflen); osav = obuf; iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen); content_length = content_length + (content_length / 2) - obuflen; osav[content_length] = 0; free(msg); msg = osav; iconv_close(ic); } } else { if (ic != (iconv_t)(-1) ) { StrBuf *Buf = NewStrBufPlain(NULL, StrLength(Source) + 8096);; StrBufConvert(Source, Buf, &ic); FreeStrBuf(&Buf); iconv_close(ic); msg = (char*)ChrPtr(Source); /* TODO: get rid of this. */ } } #endif /* * At this point, the message has been stripped down to * only the content inside the <BODY></BODY> tags, and has * been converted to UTF-8 if it was originally in a foreign * character set. The text is also guaranteed to be null * terminated now. */ if (converted_msg == NULL) { StrBufAppendPrintf(Target, "Error %d: %s<br>%s:%d", errno, strerror(errno), __FILE__, __LINE__); goto BAIL; } if (BodyArea != NULL) { StrBufAppendBufPlain(converted_msg, HKEY("<table "), 0); StrBufAppendBuf(converted_msg, BodyArea, 0); StrBufAppendBufPlain(converted_msg, HKEY(" width=\"100%\"><tr><td>"), 0); } ptr = msg; msgend = strchr(msg, 0); while (ptr < msgend) { /** Try to sanitize the html of any rogue scripts */ if (!strncasecmp(ptr, "<script", 7)) { if (scriptlevel == 0) { script_start_pos = StrLength(converted_msg); } ++scriptlevel; } if (!strncasecmp(ptr, "</script", 8)) { --scriptlevel; } /** * Change mailto: links to WebCit mail, by replacing the * link with one that points back to our mail room. Due to * the way we parse URL's, it'll even handle mailto: links * that have "?subject=" in them. */ if (!strncasecmp(ptr, "<a href=\"mailto:", 16)) { content_length += 64; StrBufAppendPrintf(converted_msg, "<a href=\"display_enter?force_room=_MAIL_?recp="); ptr = &ptr[16]; ++alevel; ++brak; } /** Make external links open in a separate window */ else if (!strncasecmp(ptr, "<a href=\"", 9)) { ++alevel; ++brak; if ( ((strchr(ptr, ':') < strchr(ptr, '/'))) && ((strchr(ptr, '/') < strchr(ptr, '>'))) ) { /* open external links to new window */ StrBufAppendPrintf(converted_msg, new_window); ptr = &ptr[8]; } else if ( (treat_as_wiki) && (strncasecmp(ptr, "<a href=\"wiki?", 14)) && (strncasecmp(ptr, "<a href=\"dotgoto?", 17)) && (strncasecmp(ptr, "<a href=\"knrooms?", 17)) ) { content_length += 64; StrBufAppendPrintf(converted_msg, "<a href=\"wiki?go="); StrBufUrlescAppend(converted_msg, WC->CurRoom.name, NULL); StrBufAppendPrintf(converted_msg, "?page="); ptr = &ptr[9]; } else { StrBufAppendPrintf(converted_msg, "<a href=\""); ptr = &ptr[9]; } } /** Fixup <img src="cid:... ...> to fetch the mime part */ else if (!strncasecmp(ptr, "<img ", 5)) { char *cid_start, *cid_end; char* tag_end=strchr(ptr,'>'); char* src; /* FIXME - handle this situation (maybe someone opened an <img cid... * and then ended the message) */ if (!tag_end) { syslog(LOG_DEBUG, "tag_end is null and ptr is:\n"); syslog(LOG_DEBUG, "%s\n", ptr); syslog(LOG_DEBUG, "Theoretical bytes remaining: %d\n", (int)(msgend - ptr)); } src=strstr(ptr, "src=\"cid:"); ++brak; if (src && isspace(*(src-1)) && tag_end && (cid_start=strchr(src,':')) && (cid_end=strchr(cid_start,'"')) && (cid_end < tag_end) ) { /* copy tag and attributes up to src="cid: */ StrBufAppendBufPlain(converted_msg, ptr, src - ptr, 0); cid_start++; /* add in /webcit/mimepart/<msgno>/CID/ trailing / stops dumb URL filters getting excited */ StrBufAppendPrintf(converted_msg, " src=\"/webcit/mimepart/%d/",msgnum); StrBufAppendBufPlain(converted_msg, cid_start, cid_end - cid_start, 0); StrBufAppendBufPlain(converted_msg, "/\"", -1, 0); ptr = cid_end+1; } StrBufAppendBufPlain(converted_msg, ptr, tag_end - ptr, 0); ptr = tag_end; } /** * Turn anything that looks like a URL into a real link, as long * as it's not inside a tag already */ else if ( (brak == 0) && (alevel == 0) && ( (!strncasecmp(ptr, "http://", 7)) || (!strncasecmp(ptr, "https://", 8)))) { /** Find the end of the link */ int strlenptr; linklen = 0; strlenptr = strlen(ptr); for (i=0; i<=strlenptr; ++i) { if ((ptr[i]==0) ||(isspace(ptr[i])) ||(ptr[i]==10) ||(ptr[i]==13) ||(ptr[i]=='(') ||(ptr[i]==')') ||(ptr[i]=='<') ||(ptr[i]=='>') ||(ptr[i]=='[') ||(ptr[i]==']') ||(ptr[i]=='"') ||(ptr[i]=='\'') ) linklen = i; /* did s.b. send us an entity? */ if (ptr[i] == '&') { if ((ptr[i+2] ==';') || (ptr[i+3] ==';') || (ptr[i+5] ==';') || (ptr[i+6] ==';') || (ptr[i+7] ==';')) linklen = i; } if (linklen > 0) break; } if (linklen > 0) { char *ltreviewptr; char *nbspreviewptr; char linkedchar; int len; len = linklen; linkedchar = ptr[len]; ptr[len] = '\0'; /* spot for some subject strings tinymce tends to give us. */ ltreviewptr = strchr(ptr, '<'); if (ltreviewptr != NULL) { *ltreviewptr = '\0'; linklen = ltreviewptr - ptr; } nbspreviewptr = strstr(ptr, " "); if (nbspreviewptr != NULL) { /* nbspreviewptr = '\0'; */ linklen = nbspreviewptr - ptr; } if (ltreviewptr != 0) *ltreviewptr = '<'; ptr[len] = linkedchar; content_length += (32 + linklen); StrBufAppendPrintf(converted_msg, "%s\"", new_window); StrBufAppendBufPlain(converted_msg, ptr, linklen, 0); StrBufAppendPrintf(converted_msg, "\">"); StrBufAppendBufPlain(converted_msg, ptr, linklen, 0); ptr += linklen; StrBufAppendPrintf(converted_msg, "</A>"); } } else { StrBufAppendBufPlain(converted_msg, ptr, 1, 0); ptr++; } if ((ptr >= msg) && (ptr <= msgend)) { /* * We need to know when we're inside a tag, * so we don't turn things that look like URL's into * links, when they're already links - or image sources. */ if ((ptr > msg) && (*(ptr-1) == '<')) { ++brak; } if ((ptr > msg) && (*(ptr-1) == '>')) { --brak; if ((scriptlevel == 0) && (script_start_pos >= 0)) { StrBufCutRight(converted_msg, StrLength(converted_msg) - script_start_pos); script_start_pos = (-1); } } if (!strncasecmp(ptr, "</A>", 3)) --alevel; } } if (BodyArea != NULL) { StrBufAppendBufPlain(converted_msg, HKEY("</td></tr></table>"), 0); FreeStrBuf(&BodyArea); } /** uncomment these two lines to override conversion */ /** memcpy(converted_msg, msg, content_length); */ /** output_length = content_length; */ /** Output our big pile of markup */ StrBufAppendBuf(Target, converted_msg, 0); BAIL: /** A little trailing vertical whitespace... */ StrBufAppendPrintf(Target, "<br><br>\n"); /** Now give back the memory */ FreeStrBuf(&converted_msg); if ((msg != NULL) && (Source == NULL)) free(msg); }
int ReadHttpSubject(ParsedHttpHdrs *Hdr, StrBuf *Line, StrBuf *Buf) { const char *Args; void *vLine, *vHandler; const char *Pos = NULL; Hdr->HR.ReqLine = Line; /* The requesttype... GET, POST... */ StrBufExtract_token(Buf, Hdr->HR.ReqLine, 0, ' '); if (GetHash(HttpReqTypes, SKEY(Buf), &vLine) && (vLine != NULL)) { Hdr->HR.eReqType = *(long*)vLine; } else { Hdr->HR.eReqType = eGET; return 1; } StrBufCutLeft(Hdr->HR.ReqLine, StrLength(Buf) + 1); /* the HTTP Version... */ StrBufExtract_token(Buf, Hdr->HR.ReqLine, 1, ' '); StrBufCutRight(Hdr->HR.ReqLine, StrLength(Buf) + 1); if (StrLength(Buf) == 0) { Hdr->HR.eReqType = eGET; return 1; } StrBufAppendBuf(Hdr->this_page, Hdr->HR.ReqLine, 0); /* chop Filename / query arguments */ Args = strchr(ChrPtr(Hdr->HR.ReqLine), '?'); if (Args == NULL) /* whe're not that picky about params... TODO: this will spoil '&' in filenames.*/ Args = strchr(ChrPtr(Hdr->HR.ReqLine), '&'); if (Args != NULL) { Args ++; /* skip the ? */ StrBufPlain(Hdr->PlainArgs, Args, StrLength(Hdr->HR.ReqLine) - (Args - ChrPtr(Hdr->HR.ReqLine))); StrBufCutAt(Hdr->HR.ReqLine, 0, Args - 1); } /* don't parse them yet, maybe we don't even care... */ /* now lookup what we are going to do with this... */ /* skip first slash */ StrBufExtract_NextToken(Buf, Hdr->HR.ReqLine, &Pos, '/'); do { StrBufExtract_NextToken(Buf, Hdr->HR.ReqLine, &Pos, '/'); GetHash(HandlerHash, SKEY(Buf), &vHandler), Hdr->HR.Handler = (WebcitHandler*) vHandler; if (Hdr->HR.Handler == NULL) break; /* * If the request is prefixed by "/webcit" then chop that off. This * allows a front end web server to forward all /webcit requests to us * while still using the same web server port for other things. */ if ((Hdr->HR.Handler->Flags & URLNAMESPACE) != 0) continue; break; } while (1); /* remove the handlername from the URL */ if ((Pos != NULL) && (Pos != StrBufNOTNULL)){ StrBufCutLeft(Hdr->HR.ReqLine, Pos - ChrPtr(Hdr->HR.ReqLine)); } if (Hdr->HR.Handler != NULL) { if ((Hdr->HR.Handler->Flags & BOGUS) != 0) { return 1; } Hdr->HR.DontNeedAuth = ( ((Hdr->HR.Handler->Flags & ISSTATIC) != 0) || ((Hdr->HR.Handler->Flags & ANONYMOUS) != 0) ); } else { /* If this is a "flat" request for the root, display the configured landing page. */ int return_value; StrBuf *NewLine = NewStrBuf(); Hdr->HR.DontNeedAuth = 1; StrBufAppendPrintf(NewLine, "GET /landing?go=%s?failvisibly=1 HTTP/1.0", ChrPtr(Buf)); if (verbose) syslog(LOG_DEBUG, "Replacing with: %s", ChrPtr(NewLine)); return_value = ReadHttpSubject(Hdr, NewLine, Buf); FreeStrBuf(&NewLine); return return_value; } return 0; }