Beispiel #1
0
/*
 * Sanitize and enhance an HTML message for display.
 * Also convert weird character sets to UTF-8 if necessary.
 * Also fixup img src="cid:..." type inline images to fetch the image
 *
 */
void output_html(const char *supplied_charset, int treat_as_wiki, int msgnum, StrBuf *Source, StrBuf *Target) {
	char buf[SIZ];
	char *msg;
	char *ptr;
	char *msgstart;
	char *msgend;
	StrBuf *converted_msg;
	int buffer_length = 1;
	int line_length = 0;
	int content_length = 0;
	char new_window[SIZ];
	int brak = 0;
	int alevel = 0;
	int scriptlevel = 0;
	int script_start_pos = (-1);
	int i;
	int linklen;
	char charset[128];
	StrBuf *BodyArea = NULL;
#ifdef HAVE_ICONV
	iconv_t ic = (iconv_t)(-1) ;
	char *ibuf;                   /* Buffer of characters to be converted */
	char *obuf;                   /* Buffer for converted characters      */
	size_t ibuflen;               /* Length of input buffer               */
	size_t obuflen;               /* Length of output buffer              */
	char *osav;                   /* Saved pointer to output buffer       */
#endif
	if (Target == NULL)
		Target = WC->WBuf;

	safestrncpy(charset, supplied_charset, sizeof charset);
	msg = strdup("");
	sprintf(new_window, "<a target=\"%s\" href=", TARGET);

	if (Source == NULL) while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) {
		line_length = strlen(buf);
		buffer_length = content_length + line_length + 2;
		ptr = realloc(msg, buffer_length);
		if (ptr == NULL) {
			StrBufAppendPrintf(Target, "<b>");
			StrBufAppendPrintf(Target, _("realloc() error! couldn't get %d bytes: %s"),
					buffer_length + 1,
					strerror(errno));
			StrBufAppendPrintf(Target, "</b><br><br>\n");
			while (serv_getln(buf, sizeof buf), strcmp(buf, "000")) {
				/** flush */
			}
			free(msg);
			return;
		}
		msg = ptr;
		strcpy(&msg[content_length], buf);
		content_length += line_length;
		strcpy(&msg[content_length], "\n");
		content_length += 1;
	}
	else {
		content_length = StrLength(Source);
		free(msg);
		msg = (char*) ChrPtr(Source);/* TODO: remove cast */
		buffer_length = content_length;
	}

	/** Do a first pass to isolate the message body */
	ptr = msg + 1;
	msgstart = msg;
	msgend = &msg[content_length];

	while (ptr < msgend) {

		/** Advance to next tag */
		ptr = strchr(ptr, '<');
		if ((ptr == NULL) || (ptr >= msgend)) break;
		++ptr;
		if ((ptr == NULL) || (ptr >= msgend)) break;

		/*
		 *  Look for META tags.  Some messages (particularly in
		 *  Asian locales) illegally declare a message's character
		 *  set in the HTML instead of in the MIME headers.  This
		 *  is wrong but we have to work around it anyway.
		 */
		if (!strncasecmp(ptr, "META", 4)) {

			char *meta_start;
			char *meta_end;
			int meta_length;
			char *meta;
			char *meta_http_equiv;
			char *meta_content;
			char *spaceptr;

			meta_start = &ptr[4];
			meta_end = strchr(ptr, '>');
			if ((meta_end != NULL) && (meta_end <= msgend)) {
				meta_length = meta_end - meta_start + 1;
				meta = malloc(meta_length + 1);
				safestrncpy(meta, meta_start, meta_length);
				meta[meta_length] = 0;
				striplt(meta);
				if (!strncasecmp(meta, "HTTP-EQUIV=", 11)) {
					meta_http_equiv = strdup(&meta[11]);
					spaceptr = strchr(meta_http_equiv, ' ');
					if (spaceptr != NULL) {
						*spaceptr = 0;
						meta_content = strdup(++spaceptr);
						if (!strncasecmp(meta_content, "content=", 8)) {
							strcpy(meta_content, &meta_content[8]);
							stripquotes(meta_http_equiv);
							stripquotes(meta_content);
							extract_charset_from_meta(charset,
									meta_http_equiv, meta_content);
						}
						free(meta_content);
					}
					free(meta_http_equiv);
				}
				free(meta);
			}
		}

		/*
		 * Any of these tags cause everything up to and including
		 * the tag to be removed.
		 */	
		if ( (!strncasecmp(ptr, "HTML", 4))
				||(!strncasecmp(ptr, "HEAD", 4))
				||(!strncasecmp(ptr, "/HEAD", 5))
				||(!strncasecmp(ptr, "BODY", 4)) ) {
			char *pBody = NULL;

			if (!strncasecmp(ptr, "BODY", 4)) {
				pBody = ptr;
			}
			ptr = strchr(ptr, '>');
			if ((ptr == NULL) || (ptr >= msgend)) break;
			if ((pBody != NULL) && (ptr - pBody > 4)) {
				char* src;
				char *cid_start, *cid_end;

				*ptr = '\0';
				pBody += 4; 
				while ((isspace(*pBody)) && (pBody < ptr))
					pBody ++;
				BodyArea = NewStrBufPlain(NULL,  ptr - pBody);

				if (pBody < ptr) {
					src = strstr(pBody, "cid:");
					if (src) {
						cid_start = src + 4;
						cid_end = cid_start;
						while ((*cid_end != '"') && 
								!isspace(*cid_end) &&
								(cid_end < ptr))
							cid_end ++;

						/* copy tag and attributes up to src="cid: */
						StrBufAppendBufPlain(BodyArea, pBody, src - pBody, 0);

						/* add in /webcit/mimepart/<msgno>/CID/ 
						   trailing / stops dumb URL filters getting excited */
						StrBufAppendPrintf(BodyArea,
								"/webcit/mimepart/%d/",msgnum);
						StrBufAppendBufPlain(BodyArea, cid_start, cid_end - cid_start, 0);

						if (ptr - cid_end > 0)
							StrBufAppendBufPlain(BodyArea, 
									cid_end + 1, 
									ptr - cid_end, 0);
					}
					else 
						StrBufAppendBufPlain(BodyArea, pBody, ptr - pBody, 0);
				}
				*ptr = '>';
			}
			++ptr;
			if ((ptr == NULL) || (ptr >= msgend)) break;
			msgstart = ptr;
		}

		/*
		 * Any of these tags cause everything including and following
		 * the tag to be removed.
		 */
		if ( (!strncasecmp(ptr, "/HTML", 5))
				||(!strncasecmp(ptr, "/BODY", 5)) ) {
			--ptr;
			msgend = ptr;
			strcpy(ptr, "");

		}

		++ptr;
	}
	if (msgstart > msg) {
		strcpy(msg, msgstart);
	}

	/* Now go through the message, parsing tags as necessary. */
	converted_msg = NewStrBufPlain(NULL, content_length + 8192);


	/** Convert foreign character sets to UTF-8 if necessary. */
#ifdef HAVE_ICONV
	if ( (strcasecmp(charset, "us-ascii"))
			&& (strcasecmp(charset, "UTF-8"))
			&& (strcasecmp(charset, ""))
	   ) {
		syslog(LOG_DEBUG, "Converting %s to UTF-8\n", charset);
		ctdl_iconv_open("UTF-8", charset, &ic);
		if (ic == (iconv_t)(-1) ) {
			syslog(LOG_WARNING, "%s:%d iconv_open() failed: %s\n",
					__FILE__, __LINE__, strerror(errno));
		}
	}
	if  (Source == NULL) {
		if (ic != (iconv_t)(-1) ) {
			ibuf = msg;
			ibuflen = content_length;
			obuflen = content_length + (content_length / 2) ;
			obuf = (char *) malloc(obuflen);
			osav = obuf;
			iconv(ic, &ibuf, &ibuflen, &obuf, &obuflen);
			content_length = content_length + (content_length / 2) - obuflen;
			osav[content_length] = 0;
			free(msg);
			msg = osav;
			iconv_close(ic);
		}
	}
	else {
		if (ic != (iconv_t)(-1) ) {
			StrBuf *Buf = NewStrBufPlain(NULL, StrLength(Source) + 8096);;
			StrBufConvert(Source, Buf, &ic);
			FreeStrBuf(&Buf);
			iconv_close(ic);
			msg = (char*)ChrPtr(Source); /* TODO: get rid of this. */
		}
	}

#endif

	/*
	 *	At this point, the message has been stripped down to
	 *	only the content inside the <BODY></BODY> tags, and has
	 *	been converted to UTF-8 if it was originally in a foreign
	 *	character set.  The text is also guaranteed to be null
	 *	terminated now.
	 */

	if (converted_msg == NULL) {
		StrBufAppendPrintf(Target, "Error %d: %s<br>%s:%d", errno, strerror(errno), __FILE__, __LINE__);
		goto BAIL;
	}

	if (BodyArea != NULL) {
		StrBufAppendBufPlain(converted_msg, HKEY("<table "), 0);  
		StrBufAppendBuf(converted_msg, BodyArea, 0);
		StrBufAppendBufPlain(converted_msg, HKEY(" width=\"100%\"><tr><td>"), 0);
	}
	ptr = msg;
	msgend = strchr(msg, 0);
	while (ptr < msgend) {

		/** Try to sanitize the html of any rogue scripts */
		if (!strncasecmp(ptr, "<script", 7)) {
			if (scriptlevel == 0) {
				script_start_pos = StrLength(converted_msg);
			}
			++scriptlevel;
		}
		if (!strncasecmp(ptr, "</script", 8)) {
			--scriptlevel;
		}

		/**
		 * Change mailto: links to WebCit mail, by replacing the
		 * link with one that points back to our mail room.  Due to
		 * the way we parse URL's, it'll even handle mailto: links
		 * that have "?subject=" in them.
		 */
		if (!strncasecmp(ptr, "<a href=\"mailto:", 16)) {
			content_length += 64;
			StrBufAppendPrintf(converted_msg,
					"<a href=\"display_enter?force_room=_MAIL_?recp=");
			ptr = &ptr[16];
			++alevel;
			++brak;
		}
		/** Make external links open in a separate window */
		else if (!strncasecmp(ptr, "<a href=\"", 9)) {
			++alevel;
			++brak;
			if ( ((strchr(ptr, ':') < strchr(ptr, '/')))
					&&  ((strchr(ptr, '/') < strchr(ptr, '>'))) 
			   ) {
				/* open external links to new window */
				StrBufAppendPrintf(converted_msg, new_window);
				ptr = &ptr[8];
			}
			else if (
				(treat_as_wiki)
				&& (strncasecmp(ptr, "<a href=\"wiki?", 14))
				&& (strncasecmp(ptr, "<a href=\"dotgoto?", 17))
				&& (strncasecmp(ptr, "<a href=\"knrooms?", 17))
			) {
				content_length += 64;
				StrBufAppendPrintf(converted_msg, "<a href=\"wiki?go=");
				StrBufUrlescAppend(converted_msg, WC->CurRoom.name, NULL);
				StrBufAppendPrintf(converted_msg, "?page=");
				ptr = &ptr[9];
			}
			else {
				StrBufAppendPrintf(converted_msg, "<a href=\"");
				ptr = &ptr[9];
			}
		}
		/** Fixup <img src="cid:... ...> to fetch the mime part */
		else if (!strncasecmp(ptr, "<img ", 5)) {
			char *cid_start, *cid_end;
			char* tag_end=strchr(ptr,'>');
			char* src;
			/* FIXME - handle this situation (maybe someone opened an <img cid... 
			 * and then ended the message)
			 */
			if (!tag_end) {
				syslog(LOG_DEBUG, "tag_end is null and ptr is:\n");
				syslog(LOG_DEBUG, "%s\n", ptr);
				syslog(LOG_DEBUG, "Theoretical bytes remaining: %d\n", (int)(msgend - ptr));
			}

			src=strstr(ptr, "src=\"cid:");
			++brak;

			if (src
			    && isspace(*(src-1))
				&& tag_end
				&& (cid_start=strchr(src,':'))
				&& (cid_end=strchr(cid_start,'"'))
				&& (cid_end < tag_end)
			) {
				/* copy tag and attributes up to src="cid: */
				StrBufAppendBufPlain(converted_msg, ptr, src - ptr, 0);
				cid_start++;

				/* add in /webcit/mimepart/<msgno>/CID/ 
				   trailing / stops dumb URL filters getting excited */
				StrBufAppendPrintf(converted_msg,
						" src=\"/webcit/mimepart/%d/",msgnum);
				StrBufAppendBufPlain(converted_msg, cid_start, cid_end - cid_start, 0);
				StrBufAppendBufPlain(converted_msg, "/\"", -1, 0);

				ptr = cid_end+1;
			}
			StrBufAppendBufPlain(converted_msg, ptr, tag_end - ptr, 0);
			ptr = tag_end;
		}

		/**
		 * Turn anything that looks like a URL into a real link, as long
		 * as it's not inside a tag already
		 */
		else if ( (brak == 0) && (alevel == 0) &&
			  ( (!strncasecmp(ptr, "http://", 7)) ||
			    (!strncasecmp(ptr, "https://", 8)))) {
			/** Find the end of the link */
			int strlenptr;
			linklen = 0;
				
			strlenptr = strlen(ptr);
			for (i=0; i<=strlenptr; ++i) {
				if ((ptr[i]==0)
				    ||(isspace(ptr[i]))
				    ||(ptr[i]==10)
				    ||(ptr[i]==13)
				    ||(ptr[i]=='(')
				    ||(ptr[i]==')')
				    ||(ptr[i]=='<')
				    ||(ptr[i]=='>')
				    ||(ptr[i]=='[')
				    ||(ptr[i]==']')
				    ||(ptr[i]=='"')
				    ||(ptr[i]=='\'')
					) linklen = i;
				/* did s.b. send us an entity? */
				if (ptr[i] == '&') {
					if ((ptr[i+2] ==';') ||
					    (ptr[i+3] ==';') ||
					    (ptr[i+5] ==';') ||
					    (ptr[i+6] ==';') ||
					    (ptr[i+7] ==';'))
						linklen = i;
				}
				if (linklen > 0) break;
			}
			if (linklen > 0) {
				char *ltreviewptr;
				char *nbspreviewptr;
				char linkedchar;
				int len;
					
				len = linklen;
				linkedchar = ptr[len];
				ptr[len] = '\0';
				/* spot for some subject strings tinymce tends to give us. */
				ltreviewptr = strchr(ptr, '<');
				if (ltreviewptr != NULL) {
					*ltreviewptr = '\0';
					linklen = ltreviewptr - ptr;
				}

				nbspreviewptr = strstr(ptr, "&nbsp;");
				if (nbspreviewptr != NULL) {
					/* nbspreviewptr = '\0'; */
					linklen = nbspreviewptr - ptr;
				}
				if (ltreviewptr != 0)
					*ltreviewptr = '<';

				ptr[len] = linkedchar;

				content_length += (32 + linklen);
				StrBufAppendPrintf(converted_msg, "%s\"", new_window);
				StrBufAppendBufPlain(converted_msg, ptr, linklen, 0);
				StrBufAppendPrintf(converted_msg, "\">");
				StrBufAppendBufPlain(converted_msg, ptr, linklen, 0);
				ptr += linklen;
				StrBufAppendPrintf(converted_msg, "</A>");
			}
		}
		else {
			StrBufAppendBufPlain(converted_msg, ptr, 1, 0);
			ptr++;
		}


		if ((ptr >= msg) && (ptr <= msgend)) {
			/*
			 * We need to know when we're inside a tag,
			 * so we don't turn things that look like URL's into
			 * links, when they're already links - or image sources.
			 */
			if ((ptr > msg) && (*(ptr-1) == '<')) {
				++brak;
			}
			if ((ptr > msg) && (*(ptr-1) == '>')) {
				--brak;
				if ((scriptlevel == 0) && (script_start_pos >= 0)) {
					StrBufCutRight(converted_msg, StrLength(converted_msg) - script_start_pos);
					script_start_pos = (-1);
				}
			}
			if (!strncasecmp(ptr, "</A>", 3)) --alevel;
		}
	}

	if (BodyArea != NULL) {
		StrBufAppendBufPlain(converted_msg, HKEY("</td></tr></table>"), 0);  
		FreeStrBuf(&BodyArea);
	}

	/**	uncomment these two lines to override conversion	*/
	/**	memcpy(converted_msg, msg, content_length);		*/
	/**	output_length = content_length;				*/

	/** Output our big pile of markup */
	StrBufAppendBuf(Target, converted_msg, 0);

BAIL:	/** A little trailing vertical whitespace... */
	StrBufAppendPrintf(Target, "<br><br>\n");

	/** Now give back the memory */
	FreeStrBuf(&converted_msg);
	if ((msg != NULL) && (Source == NULL)) free(msg);
}
Beispiel #2
0
int ReadHttpSubject(ParsedHttpHdrs *Hdr, StrBuf *Line, StrBuf *Buf)
{
	const char *Args;
	void *vLine, *vHandler;
	const char *Pos = NULL;

	Hdr->HR.ReqLine = Line;
	/* The requesttype... GET, POST... */
	StrBufExtract_token(Buf, Hdr->HR.ReqLine, 0, ' ');
	if (GetHash(HttpReqTypes, SKEY(Buf), &vLine) &&
	    (vLine != NULL))
	{
		Hdr->HR.eReqType = *(long*)vLine;
	}
	else {
		Hdr->HR.eReqType = eGET;
		return 1;
	}
	StrBufCutLeft(Hdr->HR.ReqLine, StrLength(Buf) + 1);

	/* the HTTP Version... */
	StrBufExtract_token(Buf, Hdr->HR.ReqLine, 1, ' ');
	StrBufCutRight(Hdr->HR.ReqLine, StrLength(Buf) + 1);
	
	if (StrLength(Buf) == 0) {
		Hdr->HR.eReqType = eGET;
		return 1;
	}

	StrBufAppendBuf(Hdr->this_page, Hdr->HR.ReqLine, 0);

	/* chop Filename / query arguments */
	Args = strchr(ChrPtr(Hdr->HR.ReqLine), '?');
	if (Args == NULL) /* whe're not that picky about params... TODO: this will spoil '&' in filenames.*/
		Args = strchr(ChrPtr(Hdr->HR.ReqLine), '&');
	if (Args != NULL) {
		Args ++; /* skip the ? */
		StrBufPlain(Hdr->PlainArgs, 
			    Args, 
			    StrLength(Hdr->HR.ReqLine) -
			    (Args - ChrPtr(Hdr->HR.ReqLine)));
		StrBufCutAt(Hdr->HR.ReqLine, 0, Args - 1);
	} /* don't parse them yet, maybe we don't even care... */
	
	/* now lookup what we are going to do with this... */
	/* skip first slash */
	StrBufExtract_NextToken(Buf, Hdr->HR.ReqLine, &Pos, '/');
	do {
		StrBufExtract_NextToken(Buf, Hdr->HR.ReqLine, &Pos, '/');

		GetHash(HandlerHash, SKEY(Buf), &vHandler),
		Hdr->HR.Handler = (WebcitHandler*) vHandler;
		if (Hdr->HR.Handler == NULL)
			break;
		/*
		 * If the request is prefixed by "/webcit" then chop that off.  This
		 * allows a front end web server to forward all /webcit requests to us
		 * while still using the same web server port for other things.
		 */
		if ((Hdr->HR.Handler->Flags & URLNAMESPACE) != 0)
			continue;
		break;
	} while (1);
	/* remove the handlername from the URL */
	if ((Pos != NULL) && (Pos != StrBufNOTNULL)){
		StrBufCutLeft(Hdr->HR.ReqLine, 
			      Pos - ChrPtr(Hdr->HR.ReqLine));
	}

	if (Hdr->HR.Handler != NULL) {
		if ((Hdr->HR.Handler->Flags & BOGUS) != 0) {
			return 1;
		}
		Hdr->HR.DontNeedAuth = (
			((Hdr->HR.Handler->Flags & ISSTATIC) != 0) ||
			((Hdr->HR.Handler->Flags & ANONYMOUS) != 0)
		);
	}
	else {
		/* If this is a "flat" request for the root, display the configured landing page. */
		int return_value;
		StrBuf *NewLine = NewStrBuf();
		Hdr->HR.DontNeedAuth = 1;
		StrBufAppendPrintf(NewLine, "GET /landing?go=%s?failvisibly=1 HTTP/1.0", ChrPtr(Buf));
		if (verbose) syslog(LOG_DEBUG, "Replacing with: %s", ChrPtr(NewLine));
		return_value = ReadHttpSubject(Hdr, NewLine, Buf);
		FreeStrBuf(&NewLine);
		return return_value;
	}

	return 0;
}