Пример #1
0
struct swline *add_word_to_hash_table( WORD_HASH_TABLE *table_ptr, char *word, int hash_size)
{
    struct swline **hash_array = table_ptr->hash_array;
    unsigned hashval;
    struct swline *sp;
    int len;

    /* Create the array if it doesn't exist */
    if ( !hash_array )
    {
        int ttl_bytes = sizeof(struct swline *) * (hash_size = (hash_size ? hash_size : HASHSIZE));
       
        table_ptr->mem_zone = (void *) Mem_ZoneCreate("Word Hash Zone", 0, 0); 
        //hash_array = (struct swline  **)emalloc( ttl_bytes );
        hash_array = (struct swline  **) Mem_ZoneAlloc( (MEM_ZONE *)table_ptr->mem_zone, ttl_bytes );
        memset( hash_array, 0, ttl_bytes );
        table_ptr->hash_array = hash_array;
        table_ptr->hash_size = hash_size;
        table_ptr->count = 0;
    }
    else
        if ( (sp = is_word_in_hash_table( *table_ptr, word )) )
            return sp;

    hashval = string_hash(word,hash_size);

    /* Create a new entry */            
    len = strlen(word);
    sp = (struct swline *) Mem_ZoneAlloc((MEM_ZONE *)table_ptr->mem_zone, sizeof(struct swline) + len);

    memcpy(sp->line,word,len + 1);

    /* Add word to head of list */
    
    sp->next = hash_array[hashval];
    hash_array[hashval] = sp;

    table_ptr->count++;

    return sp;
}
Пример #2
0
char   *parseHTMLtitle(SWISH *sw, char *buffer)
{
    char   *title;
    char   *empty_title;

    empty_title = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone,1);
    *empty_title = '\0';

    if (!buffer)
        return empty_title;

    if ((title = parsetag(sw, "title", buffer, TITLETOPLINES, CASE_SENSITIVE_OFF)))
        return title;

    return empty_title;
}
Пример #3
0
int countwords_HTML(SWISH *sw, FileProp *fprop, FileRec *fi, char *buffer)
{
    int     ftotalwords;
    int    *metaID;
    int     metaIDlen;
    int     position;           /* Position of word in file */
    int     currentmetanames;
    char   *p,
           *newp,
           *tag,
           *endtag;
    int     structure;
    FileRec *thisFileEntry = fi;
    struct metaEntry *metaNameEntry;
    IndexFILE *indexf = sw->indexlist;
    struct MOD_Index *idx = sw->Index;
    char   *Content = NULL,
           *Name = NULL,
           *summary = NULL;
    char   *title = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)parseHTMLtitle(sw, buffer));

    if (!isoktitle(sw, title))
        return -2;


    if (fprop->stordesc)
        summary = parseHtmlSummary(buffer, fprop->stordesc->field, fprop->stordesc->size, sw);

    addCommonProperties( sw, fprop, fi, title, summary, 0 );

    /* Init meta info */
    metaID = (int *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,(metaIDlen = 16) * sizeof(int));

    currentmetanames = 0;
    ftotalwords = 0;
    structure = IN_FILE;
    metaID[0] = 1;
    position = 1;
    

    for (p = buffer; p && *p;)
    {

        /* Look for non escaped '<' */
        if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
        {
            /* Index up to the tag */
            *tag++ = '\0';

            newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);

            if ( ! currentmetanames )
                currentmetanames++;
            ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);

            /* Now let us look for a not escaped '>' */
            for (endtag = tag;;)
                if ((endtag = strchr(endtag, '>')))
                {
                    if (*(endtag - 1) != '\\')
                        break;
                    else
                        endtag++;
                }
                else
                    break;


            if (endtag)
            {
                *endtag++ = '\0';

                if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
                {
                    /* Check for META TAG TYPE 1 */
                    structure |= IN_META;
                    if (lstrstr(tag, "START"))
                    {
                        char   *parsed_tag;

                        if (
                            (metaNameEntry =
                             getHTMLMeta(indexf, tag, sw, NULL, &parsed_tag, fprop->real_path)))
                        {
                            /* realloc memory if needed */
                            if (currentmetanames == metaIDlen)
                            {
                                int *newbuf = (int *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, metaIDlen * 2 * sizeof(int));
                                memcpy((char *)newbuf,(char *)metaID,metaIDlen * sizeof(int));
                                metaID = newbuf;
                                metaIDlen *= 2;
                            }

                            /* add metaname to array of current metanames */
                            metaID[currentmetanames] = metaNameEntry->metaID;

                            /* Bump position for all metanames unless metaname in dontbumppositionOnmetatags */
                            if (!isDontBumpMetaName(sw->dontbumpstarttagslist, metaNameEntry->metaName))
                                position++;

                            currentmetanames++;


                            p = endtag;

                            /* If it is also a property store it until a < is found */
                            if ((metaNameEntry = getPropNameByName(&indexf->header, parsed_tag)))
                            {
                                if ((endtag = strchr(p, '<')))
                                    *endtag = '\0';


                                p = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);

                                remove_newlines(p);  /** why isn't this just done for the entire doc? */

                                if (!addDocProperty(&thisFileEntry->docProperties, metaNameEntry, (unsigned char *)p, strlen(p), 0))
                                    progwarn("property '%s' not added for document '%s'\n", metaNameEntry->metaName, fprop->real_path);


                                if (endtag)
                                    *endtag = '<';

                                continue;
                            }
                        }

                    }

                    else if (lstrstr(tag, "END"))
                    {
                        /* this will close the last metaname */
                        if (currentmetanames)
                        {
                            currentmetanames--;
                            if (!currentmetanames)
                                metaID[0] = 1;
                        }
                    }

                    p = endtag;
                }

                /* Check for META TAG TYPE 2 */
                else if ((tag[0] != '!') && lstrstr(tag, "META") && (Name = lstrstr(tag, "NAME")) && (Content = lstrstr(tag, "CONTENT")))
                {
                    ftotalwords += parseMetaData(sw, indexf, tag, idx->filenum, structure, Name, Content, thisFileEntry, &position, fprop->real_path);
                    p = endtag;
                }               /*  Check for COMMENT */

                else if ((tag[0] == '!') && sw->indexComments)
                {
                    ftotalwords += parsecomment(sw, tag, idx->filenum, structure, 1, &position);
                    p = endtag;
                }               /* Default: Continue */

                else
                {
                    structure = getstructure(tag, structure);
                    p = endtag;
                }
            }
            else
                p = tag;        /* tag not closed: continue */
        }

        else
        {                       /* No more '<' */

            newp = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)p);

            if ( ! currentmetanames )
                currentmetanames++;
            ftotalwords += indexstring(sw, newp, idx->filenum, structure, currentmetanames, metaID, &position);

            p = NULL;
        }
    }
    return ftotalwords;
}
Пример #4
0
/* Gets the content between "<parsetag>" and "</parsetag>" from buffer
limiting the scan to the first max_lines lines (0 means all lines) */
static char   *parsetag(SWISH *sw, char *parsetag, char *buffer, int max_lines, int case_sensitive)
{
    register int c,
            d;
    register char *p,
           *r;
    char   *tag;
    int     lencontent;
    char   *content;
    int     i,
            j,
            lines,
            status,
            tagbuflen,
            totaltaglen,
            curlencontent;
    char   *begintag;
    char   *endtag;
    char   *newbuf;
    char   *(*f_strstr) ();



    if (case_sensitive)
        f_strstr = strstr;
    else
        f_strstr = lstrstr;

    lencontent = strlen(parsetag);
    begintag = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 3);
    endtag = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lencontent + 4);
    sprintf(begintag, "<%s>", parsetag);
    sprintf(endtag, "</%s>", parsetag);

    tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, 1);
    tag[0] = '\0';

    content = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (lencontent = MAXSTRLEN) + 1);
    lines = 0;
    status = NO_TAG;
    p = content;
    *p = '\0';


    for (r = buffer;;)
    {
        c = *r++;
        if (c == '\n')
        {
            lines++;
            if (max_lines && lines == max_lines)
                break;
        }
        if (!c)
            return NULL;

        switch (c)
        {
        case '<':
            tag = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, (tagbuflen = MAXSTRLEN) + 1);
            totaltaglen = 0;
            tag[totaltaglen++] = '<';

            /* Collect until find '>' */
            while (1)
            {
                d = *r++;
                if (!d)
                    return NULL;
                if (totaltaglen == tagbuflen)
                {
                    newbuf = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone, tagbuflen + 200 + 1);
		            memcpy(newbuf,tag,tagbuflen + 1);
                    tag = newbuf;
                    tagbuflen += 200;
                }
                tag[totaltaglen++] = d;
                if (d == '>')
                {
                    tag[totaltaglen] = '\0';
                    break;
                }
            }


            if (f_strstr(tag, endtag))
            {
                status = TAG_CLOSE;
                *p = '\0';

                /* nulls to spaces */
                for (i = 0; content[i]; i++)
                    if (content[i] == '\n')
                        content[i] = ' ';

                /* skip over initial spaces and quotes */
                for (i = 0; isspace((int) ((unsigned char) content[i])) || content[i] == '\"'; i++)
                    ;

                /* shift buffer to left */
                for (j = 0; content[i]; j++)
                    content[j] = content[i++];

                content[j] = '\0';


                /* remove trailing spaces, nulls, quotes */
                for (j = strlen(content) - 1; ( j >= 0 ) && ( isspace((int) ((unsigned char) content[j])) || content[j] == '\0' || content[j] == '\"'); j--)
                    content[j] = '\0';

                /* replace double quotes with single quotes -- why? */
                for (j = 0; content[j]; j++)
                    if (content[j] == '\"')
                        content[j] = '\'';

                if (*content)
                    return (content);
                else
                    return NULL;
            }
            else if (f_strstr(tag, begintag))
            {
                status = TAG_FOUND;
            }
            break;
        default:
            if (status == TAG_FOUND)
            {
                curlencontent = p - content;
                if (curlencontent == lencontent)
                {
                    newbuf = Mem_ZoneAlloc(sw->Index->perDocTmpZone,(lencontent * 2) + 1);
                    memcpy(newbuf,content,lencontent + 1);
                    lencontent *= 2;
                    content = newbuf;
                    p = content + curlencontent;
                }
                *p = c;
                p++;
            }
        }
    }
    return NULL;
}
Пример #5
0
static char *parseHtmlSummary(char *buffer, char *field, int size, SWISH * sw)
{
    char   *p,
           *q,
           *tag,
           *endtag,
            c = '\0';
    char   *summary,
           *beginsum,
           *endsum,
           *tmp,
           *tmp2,
           *tmp3;
    int     found,
            lensummary;

    /* Get the summary if no metaname/field is given */
    if (!field && size)
    {
        /* Jump title if it exists */
        if ((p = lstrstr(buffer, "</title>")))
        {
            p += 8;
        }
        else
            p = buffer;
        /* Let us try to find <body> */
        if ((q = lstrstr(p, "<body")))
        {
            q = strchr(q, '>');
        }
        else
            q = p;
        summary = (char *) Mem_ZoneAlloc(sw->Index->perDocTmpZone,strlen(p)+1);
        strcpy(summary,p);
        remove_newlines(summary);

//$$$$ Todo: remove tag and content of scripts, css, java, embeddedobjects, comments, etc  

        remove_tags(summary);

        summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);


        /* use only the required memory -save those not used */
        /* 2001-03-13 rasc  copy only <size> bytes of string */
        if((int) strlen(summary) > size)
            summary[size]='\0';
        return summary;
    }

    for (p = buffer, summary = NULL, found = 0, beginsum = NULL, endsum = NULL; p && *p;)
    {
        if ((tag = strchr(p, '<')) && ((tag == p) || (*(tag - 1) != '\\')))
        {                       /* Look for non escaped '<' */
            tag++;
            for (endtag = tag;;)
                if ((endtag = strchr(endtag, '>')))
                {
                    if (*(endtag - 1) != '\\')
                        break;
                    else
                        endtag++;
                }
                else
                    break;
            if (endtag)
            {
                c = *endtag;
                *endtag++ = '\0';
                if ((tag[0] == '!') && lstrstr(tag, "META") && (lstrstr(tag, "START") || lstrstr(tag, "END")))
                {               /* Check for META TAG TYPE 1 */
                    if (lstrstr(tag, "START"))
                    {
                        if ((tmp = lstrstr(tag, "NAME")))
                        {
                            tmp += 4;
                            if (lstrstr(tmp, field))
                            {
                                beginsum = endtag;
                                found = 1;
                            }
                            p = endtag;
                        }
                        else
                            p = endtag;
                    }
                    else if (lstrstr(tag, "END"))
                    {
                        if (!found)
                        {
                            p = endtag;
                        }
                        else
                        {
                            endsum = tag - 1;
                            *(endtag - 1) = c;
                            break;
                        }
                    }
                }               /* Check for META TAG TYPE 2 */
                else if ((tag[0] != '!') && lstrstr(tag, "META") && (tmp = lstrstr(tag, "NAME")) && (tmp2 = lstrstr(tag, "CONTENT")))
                {
                    tmp += 4;
                    tmp3 = lstrstr(tmp, field);
                    if (tmp3 && tmp3 < tmp2)
                    {
                        tmp2 += 7;
                        if ((tmp = strchr(tmp2, '=')))
                        {
                            for (++tmp; isspace((int) ((unsigned char) *tmp)); tmp++);
                            if (*tmp == '\"')
                            {
                                beginsum = tmp + 1;
                                for (tmp = endtag - 1; tmp > beginsum; tmp--)
                                    if (*tmp == '\"')
                                        break;
                                if (tmp == beginsum)
                                    endsum = endtag - 1;
                                else
                                    endsum = tmp;
                            }
                            else
                            {
                                beginsum = tmp;
                                endsum = endtag - 1;
                            }
                            found = 1;
                            *(endtag - 1) = c;
                            break;

                        }
                    }
                    p = endtag;
                }               /* Default: Continue */
                else
                {
                    p = endtag;
                }
            }
            else
                p = NULL;       /* tag not closed ->END */
            if (endtag)
                *(endtag - 1) = c;
        }
        else
        {                       /* No more '<' */
            p = NULL;
        }
    }
    if (found && beginsum && endsum && endsum > beginsum)
    {
        lensummary = endsum - beginsum;
        summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone, lensummary + 1);
        memcpy(summary, beginsum, lensummary);
        summary[lensummary] = '\0';
    }
    /* If field is set an no metaname is found, let us search */
    /* for something like <field>bla bla </field> */
    if (!summary && field)
    {
        summary = parsetag(sw, field, buffer, 0, CASE_SENSITIVE_OFF);
    }
    /* Finally check for something after title (if exists) and */
    /* after <body> (if exists) */

    if (!summary)
    {
        /* Jump title if it exists */
        if ((p = lstrstr(buffer, "</title>")))
        {
            p += 8;
        }
        else
            p = buffer;

        /* Let us try to find <body> */
        if ((q = lstrstr(p, "<body")))
        {
            q = strchr(q, '>');
        }
        else
            q = p;

        summary = (char *)Mem_ZoneAlloc(sw->Index->perDocTmpZone,strlen(q) + 1);
        strcpy(summary,q);
    }

    if (summary)
    {
        remove_newlines(summary);
        remove_tags(summary);
        summary = (char *)sw_ConvHTMLEntities2ISO(sw, (unsigned char *)summary);
    }

    if (summary && size && ((int) strlen(summary)) > size)
        summary[size] = '\0';
    return summary;
}