void mozTXTToHTMLConv::ScanHTML(nsString& aInString, uint32_t whattodo, nsString &aOutString) { // some common variables we were recalculating // every time inside the for loop... int32_t lengthOfInString = aInString.Length(); const char16_t * uniBuffer = aInString.get(); #ifdef DEBUG_BenB_Perf PRTime parsing_start = PR_IntervalNow(); #endif // Look for simple entities not included in a tags and scan them. // Skip all tags ("<[...]>") and content in an a link tag ("<a [...]</a>"), // comment tag ("<!--[...]-->"), style tag, script tag or head tag. // Unescape the rest (text between tags) and pass it to ScanTXT. for (int32_t i = 0; i < lengthOfInString;) { if (aInString[i] == '<') // html tag { int32_t start = i; if (Substring(aInString, i + 1, 2).LowerCaseEqualsASCII("a ")) // if a tag, skip until </a>. // Make sure there's a space after, not to match "abbr". { i = aInString.Find("</a>", true, i); if (i == kNotFound) i = lengthOfInString; else i += 4; } else if (Substring(aInString, i + 1, 3).LowerCaseEqualsASCII("!--")) // if out-commended code, skip until --> { i = aInString.Find("-->", false, i); if (i == kNotFound) i = lengthOfInString; else i += 3; } else if (Substring(aInString, i + 1, 5).LowerCaseEqualsASCII("style") && (aInString.CharAt(i + 6) == ' ' || aInString.CharAt(i + 6) == '>')) // if style tag, skip until </style> { i = aInString.Find("</style>", true, i); if (i == kNotFound) i = lengthOfInString; else i += 8; } else if (Substring(aInString, i + 1, 6).LowerCaseEqualsASCII("script") && (aInString.CharAt(i + 7) == ' ' || aInString.CharAt(i + 7) == '>')) // if script tag, skip until </script> { i = aInString.Find("</script>", true, i); if (i == kNotFound) i = lengthOfInString; else i += 9; } else if (Substring(aInString, i + 1, 4).LowerCaseEqualsASCII("head") && (aInString.CharAt(i + 5) == ' ' || aInString.CharAt(i + 5) == '>')) // if head tag, skip until </head> // Make sure not to match <header>. { i = aInString.Find("</head>", true, i); if (i == kNotFound) i = lengthOfInString; else i += 7; } else // just skip tag (attributes etc.) { i = aInString.FindChar('>', i); if (i == kNotFound) i = lengthOfInString; else i++; } aOutString.Append(&uniBuffer[start], i - start); } else { uint32_t start = uint32_t(i); i = aInString.FindChar('<', i); if (i == kNotFound) i = lengthOfInString; nsString tempString; tempString.SetCapacity(uint32_t((uint32_t(i) - start) * growthRate)); UnescapeStr(uniBuffer, start, uint32_t(i) - start, tempString); ScanTXT(tempString.get(), tempString.Length(), whattodo, aOutString); } } #ifdef DEBUG_BenB_Perf printf("ScanHTML time: %d ms\n", PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start)); #endif }
void mozTXTToHTMLConv::ScanHTML(nsString& aInString, PRUint32 whattodo, nsString &aOutString) { // some common variables we were recalculating // every time inside the for loop... PRInt32 lengthOfInString = aInString.Length(); const PRUnichar * uniBuffer = aInString.get(); #ifdef DEBUG_BenB_Perf PRTime parsing_start = PR_IntervalNow(); #endif // Look for simple entities not included in a tags and scan them. /* Skip all tags ("<[...]>") and content in an a tag ("<a[...]</a>") or in a tag ("<!--[...]-->"). Unescape the rest (text between tags) and pass it to ScanTXT. */ for (PRInt32 i = 0; i < lengthOfInString;) { if (aInString[i] == '<') // html tag { PRUint32 start = PRUint32(i); if (nsCRT::ToLower((char)aInString[PRUint32(i) + 1]) == 'a') // if a tag, skip until </a> { i = aInString.Find("</a>", true, i); if (i == kNotFound) i = lengthOfInString; else i += 4; } else if (aInString[PRUint32(i) + 1] == '!' && aInString[PRUint32(i) + 2] == '-' && aInString[PRUint32(i) + 3] == '-') //if out-commended code, skip until --> { i = aInString.Find("-->", false, i); if (i == kNotFound) i = lengthOfInString; else i += 3; } else // just skip tag (attributes etc.) { i = aInString.FindChar('>', i); if (i == kNotFound) i = lengthOfInString; else i++; } aOutString.Append(&uniBuffer[start], PRUint32(i) - start); } else { PRUint32 start = PRUint32(i); i = aInString.FindChar('<', i); if (i == kNotFound) i = lengthOfInString; nsString tempString; tempString.SetCapacity(PRUint32((PRUint32(i) - start) * growthRate)); UnescapeStr(uniBuffer, start, PRUint32(i) - start, tempString); ScanTXT(tempString.get(), tempString.Length(), whattodo, aOutString); } } #ifdef DEBUG_BenB_Perf printf("ScanHTML time: %d ms\n", PR_IntervalToMilliseconds(PR_IntervalNow() - parsing_start)); #endif }
// Interpret a line TABLE *ParseTableLine(char *line, char *prefix, UINT prefix_size, LIST *replace_list) { UINT i, len; UINT len_name; UINT string_start; char *name; char *name2; UINT name2_size; wchar_t *unistr; char *str; UINT unistr_size, str_size; TABLE *t; // Validate arguments if (line == NULL || prefix == NULL) { return NULL; } TrimLeft(line); // No line len = StrLen(line); if (len == 0) { return NULL; } // Comment if (line[0] == '#' || (line[0] == '/' && line[1] == '/')) { return NULL; } // Search to the end position of the name len_name = 0; for (i = 0;;i++) { if (line[i] == 0) { // There is only one token return NULL; } if (line[i] == ' ' || line[i] == '\t') { break; } len_name++; } name = Malloc(len_name + 1); StrCpy(name, len_name + 1, line); string_start = len_name; for (i = len_name;i < len;i++) { if (line[i] != ' ' && line[i] != '\t') { break; } string_start++; } if (i == len) { Free(name); return NULL; } // Unescape UnescapeStr(&line[string_start]); // Convert to Unicode unistr_size = CalcUtf8ToUni(&line[string_start], StrLen(&line[string_start])); if (unistr_size == 0) { Free(name); return NULL; } unistr = Malloc(unistr_size); Utf8ToUni(unistr, unistr_size, &line[string_start], StrLen(&line[string_start])); if (UniInChar(unistr, L'$')) { // Replace the replacement string wchar_t *tmp; UINT tmp_size = (UniStrSize(unistr) + 1024) * 2; UINT i; tmp = Malloc(tmp_size); UniStrCpy(tmp, tmp_size, unistr); for (i = 0; i < LIST_NUM(replace_list);i++) { TABLE *r = LIST_DATA(replace_list, i); UniReplaceStrEx(tmp, tmp_size, tmp, (wchar_t *)r->name, r->unistr, false); } unistr = CopyUniStr(tmp); Free(tmp); } // Convert to ANSI str_size = CalcUniToStr(unistr); if (str_size == 0) { str_size = 1; str = Malloc(1); str[0] = 0; } else { str = Malloc(str_size); UniToStr(str, str_size, unistr); } if (StrCmpi(name, "PREFIX") == 0) { // Prefix is specified StrCpy(prefix, prefix_size, str); Trim(prefix); if (StrCmpi(prefix, "$") == 0 || StrCmpi(prefix, "NULL") == 0) { prefix[0] = 0; } Free(name); Free(str); Free(unistr); return NULL; } name2_size = StrLen(name) + StrLen(prefix) + 2; name2 = ZeroMalloc(name2_size); if (prefix[0] != 0) { StrCat(name2, name2_size, prefix); StrCat(name2, name2_size, "@"); } StrCat(name2, name2_size, name); Free(name); // Create a TABLE t = Malloc(sizeof(TABLE)); StrUpper(name2); t->name = name2; t->str = str; t->unistr = unistr; return t; }