/*
 * Add `word` to the word hash table, creating the table on first use.
 *
 *   table_ptr  table to add to; its hash_array is NULL until the first call.
 *   word       NUL-terminated word to store (copied into the new entry).
 *   hash_size  number of buckets to allocate when the table is created;
 *              0 selects HASHSIZE.  Once the table exists this argument is
 *              ignored and the size recorded in the table is used instead.
 *
 * Returns the entry for `word`: the existing one if is_word_in_hash_table()
 * finds it, otherwise a newly allocated entry linked at the head of its
 * bucket.  All memory comes from the table's own memory zone.
 */
struct swline *add_word_to_hash_table( WORD_HASH_TABLE *table_ptr, char *word, int hash_size)
{
    struct swline **hash_array = table_ptr->hash_array;
    unsigned hashval;
    struct swline *sp;
    int len;

    /* Create the array if it doesn't exist */
    if ( !hash_array )
    {
        int ttl_bytes;

        if ( !hash_size )
            hash_size = HASHSIZE;

        ttl_bytes = sizeof(struct swline *) * hash_size;

        table_ptr->mem_zone = (void *) Mem_ZoneCreate("Word Hash Zone", 0, 0);

        hash_array = (struct swline **) Mem_ZoneAlloc( (MEM_ZONE *)table_ptr->mem_zone, ttl_bytes );
        memset( hash_array, 0, ttl_bytes );

        table_ptr->hash_array = hash_array;
        table_ptr->hash_size = hash_size;
        table_ptr->count = 0;
    }
    else if ( (sp = is_word_in_hash_table( *table_ptr, word )) )
        return sp;

    /*
     * Always hash with the size the array was allocated with.  The caller's
     * hash_size is only defaulted on the creation path above, so on later
     * calls it may be 0 or differ from the real bucket count, which would
     * select the wrong bucket or index outside hash_array.
     */
    hashval = string_hash(word, table_ptr->hash_size);

    /* Create a new entry */
    len = strlen(word);
    /* NOTE(review): assumes struct swline ends in a one-char `line` array,
     * so sizeof(struct swline) + len leaves room for the NUL -- confirm. */
    sp = (struct swline *) Mem_ZoneAlloc((MEM_ZONE *)table_ptr->mem_zone, sizeof(struct swline) + len);
    memcpy(sp->line, word, len + 1);

    /* Add word to head of list */
    sp->next = hash_array[hashval];
    hash_array[hashval] = sp;

    table_ptr->count++;

    return sp;
}
/*
 * Return the text of the document's <title> element, or an empty string.
 *
 * A one-byte empty string is always allocated from the per-document
 * temporary zone first; it is returned when `buffer` is NULL or when
 * parsetag() finds no title within the first TITLETOPLINES lines.
 * The result is therefore never NULL.
 */
char *parseHTMLtitle(SWISH *sw, char *buffer)
{
    char *fallback = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, 1);
    char *found = NULL;

    fallback[0] = '\0';

    if (buffer)
        found = parsetag(sw, "title", buffer, TITLETOPLINES, CASE_SENSITIVE_OFF);

    return found ? found : fallback;
}
/*
 * Index the words of one HTML document held in `buffer`.
 *
 * The title is taken from parseHTMLtitle() (entity-converted) and checked
 * with isoktitle(); if that check fails the function returns -2 before any
 * word is indexed.  When fprop->stordesc is set a summary is built with
 * parseHtmlSummary().  Title and summary are passed to addCommonProperties().
 *
 * The buffer is then walked tag by tag:
 *   - text in front of a tag is entity-converted and passed to indexstring();
 *   - a tag starting with '!' that contains "META" and "START"/"END" opens or
 *     closes a metaname (and may also be stored as a document property);
 *   - any other tag containing "META", "NAME" and "CONTENT" goes to
 *     parseMetaData();
 *   - a tag starting with '!' goes to parsecomment() when sw->indexComments
 *     is set;
 *   - everything else updates `structure` through getstructure().
 *
 * `buffer` is modified in place: the '<' and '>' delimiting each processed
 * tag are overwritten with NUL bytes.
 *
 * Returns the sum of the word counts reported by indexstring(),
 * parseMetaData() and parsecomment(), or -2 as described above.
 */
int countwords_HTML(SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
{
    int ftotalwords;
    int *metaID;                /* stack of currently open metaname IDs */
    int metaIDlen;              /* allocated capacity of metaID, in ints */
    int position;               /* Position of word in file */
    int currentmetanames;       /* number of entries in use in metaID */
    char *p, *newp, *tag, *endtag;
    int structure;
    FileRec *thisFileEntry = fi;
    struct metaEntry *metaNameEntry;
    IndexFILE *indexf = sw->indexlist;
    struct MOD_Index *idx = sw->Index;
    char *Content = NULL, *Name = NULL, *summary = NULL;
    char *title = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)parseHTMLtitle(sw, buffer));

    if (!isoktitle(sw, title))
        return -2;

    if (fprop->stordesc)
        summary = parseHtmlSummary(buffer, fprop->stordesc->field, fprop->stordesc->size, sw);

    addCommonProperties( sw, fprop, fi, title, summary, 0 );

    /* Init meta info */
    metaID = (int *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,(metaIDlen = 16) * sizeof(int));
    currentmetanames = 0;
    ftotalwords = 0;
    structure = IN_FILE;
    metaID[0] = 1;              /* slot 0 holds ID 1 whenever no metaname is open */
    position = 1;

    for (p = buffer; p && *p;)
    {
        /* Look for non escaped '<' */
        /* NOTE(review): only the first '<' is examined.  If it is preceded
         * by a backslash the else branch at the bottom runs, which indexes
         * the whole remainder as text and ends the loop. */
        if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
        {
            /* Index up to the tag */
            *tag++ = '\0';
            newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);

            /* Text is always indexed under at least one ID (metaID[0]) */
            if ( ! currentmetanames )
                currentmetanames++;

            ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);

            /* Now let us look for a not escaped '>' */
            for (endtag = tag;;)
                if ((endtag = strchr(endtag, '>')))
                {
                    if (*(endtag - 1) != '\\')
                        break;
                    else
                        endtag++;
                }
                else
                    break;

            if (endtag)
            {
                /* Terminate the tag text; endtag now points just past '>' */
                *endtag++ = '\0';

                if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
                {
                    /* Check for META TAG TYPE 1 */
                    structure |= IN_META;

                    if (lstrstr(tag, "START"))
                    {
                        char *parsed_tag;

                        if ( (metaNameEntry = getHTMLMeta(indexf, tag, sw, NULL, &parsed_tag, fprop->real_path)))
                        {
                            /* realloc memory if needed */
                            /* (a new, twice as large array is taken from the
                             * zone and the old contents are copied over) */
                            if (currentmetanames == metaIDlen)
                            {
                                int *newbuf = (int *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, metaIDlen * 2 * sizeof(int));
                                memcpy((char *)newbuf,(char *)metaID,metaIDlen * sizeof(int));
                                metaID = newbuf;
                                metaIDlen *= 2;
                            }

                            /* add metaname to array of current metanames */
                            metaID[currentmetanames] = metaNameEntry->metaID;

                            /* Bump position for all metanames unless metaname in dontbumppositionOnmetatags */
                            if (!isDontBumpMetaName(sw->dontbumpstarttagslist, metaNameEntry->metaName))
                                position++;

                            currentmetanames++;

                            p = endtag;

                            /* If it is also a property store it until a < is found */
                            if ((metaNameEntry = getPropNameByName(&indexf->header, parsed_tag)))
                            {
                                /* Temporarily cut the text at the next '<' */
                                if ((endtag = strchr(p, '<')))
                                    *endtag = '\0';

                                p = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);

                                remove_newlines(p);  /** why isn't this just done for the entire doc? */

                                if (!addDocProperty(&thisFileEntry->docProperties, metaNameEntry, (unsigned char *)p, strlen(p), 0))
                                    progwarn("property '%s' not added for document '%s'\n", metaNameEntry->metaName, fprop->real_path);

                                /* Restore the '<' so the main loop sees the
                                 * next tag; the same text is scanned again
                                 * from p on the next iteration */
                                if (endtag)
                                    *endtag = '<';

                                continue;
                            }
                        }
                    }
                    else if (lstrstr(tag, "END"))
                    {
                        /* this will close the last metaname */
                        if (currentmetanames)
                        {
                            currentmetanames--;
                            if (!currentmetanames)
                                metaID[0] = 1;
                        }
                    }

                    p = endtag;
                }

                /* Check for META TAG TYPE 2 */
                else if ((tag[0] != '!') && lstrstr(tag, "META") && (Name = lstrstr(tag, "NAME")) && (Content = lstrstr(tag, "CONTENT")))
                {
                    ftotalwords += parseMetaData(sw, indexf, tag, idx->filenum, structure, Name, Content, thisFileEntry, &position, fprop->real_path);
                    p = endtag;
                }

                /* Check for COMMENT */
                else if ((tag[0] == '!') && sw->indexComments)
                {
                    ftotalwords += parsecomment(sw, tag, idx->filenum, structure, 1, &position);
                    p = endtag;
                }

                /* Default: Continue */
                else
                {
                    structure = getstructure(tag, structure);
                    p = endtag;
                }
            }
            else
                p = tag;        /* tag not closed: continue */
        }
        else
        {
            /* No more '<' */
            newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);

            if ( ! currentmetanames )
                currentmetanames++;

            ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);

            /* Ends the loop */
            p = NULL;
        }
    }

    return ftotalwords;
}
/*
 * Gets the content between "<parsetag>" and "</parsetag>" from buffer,
 * limiting the scan to the first max_lines lines (0 means all lines).
 *
 *   sw              supplies the per-document temporary zone used for all
 *                   allocations made here.
 *   parsetag        element name without angle brackets, e.g. "title".
 *   buffer          NUL-terminated document text; it is not modified.
 *   max_lines       stop after this many '\n' characters outside tags;
 *                   0 disables the limit.
 *   case_sensitive  non-zero matches tags with strstr(), zero with lstrstr().
 *
 * Text is collected from the point where a tag containing "<parsetag>" is
 * seen up to the first tag containing "</parsetag>"; characters inside any
 * other tag in between are dropped.  Before returning, newlines become
 * spaces, leading and trailing whitespace and double quotes are stripped,
 * and remaining double quotes are turned into single quotes.
 *
 * Returns the cleaned-up content, or NULL when the closing tag is not found
 * (end of buffer or line limit reached) or the content ends up empty.
 */
static char *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive)
{
    register int c, d;
    register char *p, *r;
    char   *tag = NULL;         /* scratch buffer for the tag being read */
    int     tagbuflen = 0;      /* capacity of tag, excluding the NUL    */
    int     totaltaglen;
    char   *content;
    int     lencontent;         /* capacity of content, excluding the NUL */
    int     curlencontent;
    int     namelen;
    int     i, j, lines, status;
    char   *begintag;
    char   *endtag;
    char   *newbuf;
    /* Left unprototyped so that both strstr and lstrstr can be assigned */
    char   *(*f_strstr) ();

    if (case_sensitive)
        f_strstr = strstr;
    else
        f_strstr = lstrstr;

    /* "<name>" needs namelen + 3 bytes, "</name>" needs namelen + 4 */
    namelen = strlen(parsetag);
    begintag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, namelen + 3);
    endtag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, namelen + 4);
    sprintf(begintag, "<%s>", parsetag);
    sprintf(endtag, "</%s>", parsetag);

    lencontent = MAXSTRLEN;
    content = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 1);

    lines = 0;
    status = NO_TAG;
    p = content;
    *p = '\0';

    for (r = buffer;;)
    {
        c = *r++;

        if (c == '\n')
        {
            lines++;
            if (max_lines && lines == max_lines)
                break;
        }

        if (!c)
            return NULL;

        switch (c)
        {
        case '<':
            /*
             * The tag buffer is allocated once and reused for every tag.
             * Nothing in this function releases a zone allocation, so a
             * fresh MAXSTRLEN-sized buffer per '<' would pile up in the
             * per-document zone for the whole scan.
             */
            if (!tag)
            {
                tagbuflen = MAXSTRLEN;
                tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, tagbuflen + 1);
            }

            totaltaglen = 0;
            tag[totaltaglen++] = '<';

            /* Collect until find '>' */
            while (1)
            {
                d = *r++;
                if (!d)
                    return NULL;

                if (totaltaglen == tagbuflen)
                {
                    /* Grow by 200 bytes, keeping what was read so far */
                    newbuf = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, tagbuflen + 200 + 1);
                    memcpy(newbuf, tag, totaltaglen);
                    tag = newbuf;
                    tagbuflen += 200;
                }

                tag[totaltaglen++] = d;

                if (d == '>')
                {
                    tag[totaltaglen] = '\0';
                    break;
                }
            }

            if (f_strstr(tag, endtag))
            {
                *p = '\0';

                /* newlines to spaces */
                for (i = 0; content[i]; i++)
                    if (content[i] == '\n')
                        content[i] = ' ';

                /* skip over initial spaces and quotes */
                for (i = 0; isspace((int) ((unsigned char) content[i])) || content[i] == '\"'; i++)
                    ;

                /* shift buffer to left */
                for (j = 0; content[i]; j++)
                    content[j] = content[i++];
                content[j] = '\0';

                /* remove trailing spaces and quotes */
                for (j = (int) strlen(content) - 1;
                     (j >= 0) && (isspace((int) ((unsigned char) content[j])) || content[j] == '\"');
                     j--)
                    content[j] = '\0';

                /* replace double quotes with single quotes -- why? */
                for (j = 0; content[j]; j++)
                    if (content[j] == '\"')
                        content[j] = '\'';

                if (*content)
                    return content;

                return NULL;
            }
            else if (f_strstr(tag, begintag))
            {
                status = TAG_FOUND;
            }
            break;

        default:
            if (status == TAG_FOUND)
            {
                curlencontent = p - content;

                if (curlencontent == lencontent)
                {
                    /* Content buffer is full: double it */
                    newbuf = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (lencontent * 2) + 1);
                    memcpy(newbuf, content, curlencontent);
                    lencontent *= 2;
                    content = newbuf;
                    p = content + curlencontent;
                }

                *p = c;
                p++;
            }
        }
    }

    /* Line limit reached without seeing the closing tag */
    return NULL;
}
/*
 * Build a summary (description) string for an HTML document.
 *
 *   buffer  NUL-terminated document text.  Tag terminators are temporarily
 *           overwritten while scanning and put back before returning.
 *   field   name of the meta tag / element that holds the summary, or NULL.
 *   size    maximum summary length in bytes; 0 means no limit.
 *   sw      supplies the per-document temporary zone and entity conversion.
 *
 * Sources are tried in this order:
 *   1. field == NULL and size != 0: everything after "</title>" (or the
 *      whole buffer when there is no title).
 *   2. A tag starting with '!' that contains "META", "START" and a "NAME"
 *      matching `field`; the summary is the text up to the next such tag
 *      containing "END".
 *   3. A tag containing "META", "NAME" and "CONTENT" where `field` appears
 *      between NAME and CONTENT; the summary is the CONTENT value.
 *   4. The content of an element named `field`, via parsetag().
 *   5. Everything from the '>' that closes "<body" (or from after
 *      "</title>", or from the start of the buffer).
 *
 * The result has newlines and tags removed, HTML entities converted and is
 * truncated to `size` bytes when size is non-zero.
 */
static char *parseHtmlSummary(char *buffer, char *field, int size, SWISH * sw)
{
    char *p, *q, *tag, *endtag, c = '\0';
    char *summary, *beginsum, *endsum, *tmp, *tmp2, *tmp3;
    int found, lensummary;

    /* Get the summary if no metaname/field is given */
    if (!field && size)
    {
        /* Jump title if it exists */
        if ((p = lstrstr(buffer, "</title>")))
            p += 8;
        else
            p = buffer;

        /* This path copies from just after the title; unlike the final
         * fallback below it does not skip ahead to <body>. */
        summary = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, strlen(p) + 1);
        strcpy(summary, p);
        remove_newlines(summary);

        /* $$$$ Todo: remove tag and content of scripts, css, java, embeddedobjects, comments, etc */
        remove_tags(summary);

        summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);

        /* 2001-03-13 rasc copy only <size> bytes of string */
        if ((int) strlen(summary) > size)
            summary[size] = '\0';

        return summary;
    }

    summary = NULL;
    found = 0;
    beginsum = NULL;
    endsum = NULL;

    /*
     * Scan the meta tags for `field`.  Without a field name there is nothing
     * to match, and passing a NULL needle to lstrstr() must be avoided, so
     * the scan is skipped entirely in that case.
     */
    p = field ? buffer : NULL;

    while (p && *p)
    {
        /* Look for non escaped '<' */
        if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
        {
            tag++;

            /* Look for the matching non escaped '>' */
            for (endtag = tag;;)
            {
                if (!(endtag = strchr(endtag, '>')))
                    break;
                if (*(endtag - 1) != '\\')
                    break;
                endtag++;
            }

            if (endtag)
            {
                /* Terminate the tag text; the saved character is restored
                 * on every way out of this iteration */
                c = *endtag;
                *endtag++ = '\0';

                if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
                {
                    /* Check for META TAG TYPE 1 */
                    if (lstrstr(tag, "START"))
                    {
                        if ((tmp = lstrstr(tag, "NAME")))
                        {
                            tmp += 4;
                            if (lstrstr(tmp, field))
                            {
                                beginsum = endtag;
                                found = 1;
                            }
                        }
                        p = endtag;
                    }
                    else if (lstrstr(tag, "END"))
                    {
                        if (!found)
                        {
                            p = endtag;
                        }
                        else
                        {
                            /* Summary ends at the '<' of this END tag */
                            endsum = tag - 1;
                            *(endtag - 1) = c;
                            break;
                        }
                    }
                }

                /* Check for META TAG TYPE 2 */
                else if ((tag[0] != '!') && lstrstr(tag, "META") && (tmp = lstrstr(tag, "NAME")) && (tmp2 = lstrstr(tag, "CONTENT")))
                {
                    tmp += 4;
                    tmp3 = lstrstr(tmp, field);

                    /* The field name must appear before CONTENT */
                    if (tmp3 && tmp3 < tmp2)
                    {
                        tmp2 += 7;
                        if ((tmp = strchr(tmp2, '=')))
                        {
                            for (++tmp; isspace((int) ((unsigned char) *tmp)); tmp++)
                                ;

                            if (*tmp == '\"')
                            {
                                /* Quoted value: ends at the last quote in
                                 * the tag, or at the end of the tag when no
                                 * later quote is found */
                                beginsum = tmp + 1;
                                for (tmp = endtag - 1; tmp > beginsum; tmp--)
                                    if (*tmp == '\"')
                                        break;
                                if (tmp == beginsum)
                                    endsum = endtag - 1;
                                else
                                    endsum = tmp;
                            }
                            else
                            {
                                /* Unquoted value runs to the end of the tag */
                                beginsum = tmp;
                                endsum = endtag - 1;
                            }

                            found = 1;
                            *(endtag - 1) = c;
                            break;
                        }
                    }
                    p = endtag;
                }

                /* Default: Continue */
                else
                {
                    p = endtag;
                }
            }
            else
                p = NULL;       /* tag not closed ->END */

            if (endtag)
                *(endtag - 1) = c;
        }
        else
        {
            /* No more '<' */
            p = NULL;
        }
    }

    if (found && beginsum && endsum && endsum > beginsum)
    {
        lensummary = endsum - beginsum;
        summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lensummary + 1);
        memcpy(summary, beginsum, lensummary);
        summary[lensummary] = '\0';
    }

    /* If field is set and no metaname is found, let us search */
    /* for something like <field>bla bla </field> */
    if (!summary && field)
    {
        summary = parsetag(sw, field, buffer, 0, CASE_SENSITIVE_OFF);
    }

    /* Finally check for something after title (if exists) and */
    /* after <body> (if exists) */
    if (!summary)
    {
        /* Jump title if it exists */
        if ((p = lstrstr(buffer, "</title>")))
            p += 8;
        else
            p = buffer;

        /* Let us try to find <body> */
        if ((q = lstrstr(p, "<body")))
        {
            q = strchr(q, '>');
            /* An unterminated "<body" has no '>' to start from; fall back
             * to the text after the title rather than dereferencing NULL */
            if (!q)
                q = p;
        }
        else
            q = p;

        summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, strlen(q) + 1);
        strcpy(summary, q);
    }

    if (summary)
    {
        remove_newlines(summary);
        remove_tags(summary);
        summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);
    }

    if (summary && size && ((int) strlen(summary)) > size)
        summary[size] = '\0';

    return summary;
}