void HttpRequest::parseFieldsMultipart ( char *s , long slen ) {

	// should be NULL terminated since we replaced &'s w/ 0's in set()
	char *send   = s + slen ;
	// reset field count
	long n = m_numFields;

 loop:
	// watch out for overflow
	if ( n >= MAX_CGI_PARMS ) {
		log("http: Received more than %li CGI parms. "
		    "Truncating.",(long)MAX_CGI_PARMS);
		return;
	}
	
	s = strncasestr ( s , "\r\nContent-Disposition:", send - s );
	if ( ! s ) return;

	// get the line end
	s += 2;
	char *lineEnd = strstr ( s , "\r\n" );
	if ( ! lineEnd ) return;

	// get the name
	char *name = strncasestr ( s , "name=\"" , lineEnd - s );
	if ( ! name ) goto loop;

	// point to name
	s = name + 6;
	// set the nth field name in this cgi string
	m_fields[n] = s;

	// point to = sign, use this for multiparts though
	char *equal = strstr ( s , "\"\r\n\r\n" );
	// try next field if none here
	if ( ! equal ) goto loop;
	// set field len
	m_fieldLens [ n ] = equal - s;
	// set = to \0 so getField() returns NULL terminated field name
	*equal = '\0';
	// point to field value
	s = equal + 5;
	// set value (may be \0)
	m_fieldValues [ n ] = s;
	// force to \0 at end
	char *vend = strstr ( s , "\r\n----------"); // 29 -'s then a #
	if ( ! vend ) return;
	// null terminate the value as well
	*vend = '\0';
	// count the number of field/value pairs we get
	n++;
	// remember it
	m_numFields = n;
	// point to next field
	goto loop;

}
Пример #2
0
/*
 * Determines the location at which we want to inject our JavaScript
 */
static char *
find_pointer(char *data, size_t data_len) {
	struct _findp {
		char *what;
		/* 
		 * -1 before,
		 * +1 after
		 * 0 do not inject!
		 */
		int before;
	} findp[] = {
		{ "<head>", 1},
		{ "<body", -1},
		{ "<html>", 1},
		{ "<?xml", 0 },
		{ NULL, 0 }
	};

	struct _findp *where;

	for (where = &findp[0]; where->what != NULL; ++where) {
		char *p = strncasestr(data, where->what, data_len);
		if (p != NULL) {
			if (where->before == 1)
				p += strlen(where->what);
			else if (where->before == 0)
				return (NULL);

			return (p);
		}
	}

	return (data);
}
Пример #3
0
char *strstr_href(char *line,gmx_bool *bInHREF,int *i_dat,int n_dat,char **dat)
{
  char *start,*found,*href=NULL;
  gmx_bool bIn;
  int i;

  found=NULL;
  *i_dat=-1;
  bIn=*bInHREF;
  start=line;
  do {
    if (bIn) {
      while (strlen(start) && (strncasecmp(start,"</a",3) != 0))
	start++;
      if (strlen(start)>0) {
	start+=3;
	bIn=FALSE;
      }
    }
    else {
      href=strncasestr(start,"<a href");
      if (href)
	bIn=TRUE;
      i=0;
      while((i<n_dat) && !found) {
	found=strncasestr(start,dat[i]);
	if (found) {
	  if (href && (found>href))
	    found=NULL;
	  else {
	    if (((found!=start) && isword(found[-1])) || 
		isword(found[strlen(dat[i])])) 
	      found=NULL;
	    else
	      *i_dat=i;
	  }
	  i++;
	}
      }
    }
  } while (strlen(start) && !found && href);
  *bInHREF=bIn;

  return found;
}
static int is_html(const char* url)
{
	size_t len=strlen(url);
	if(!len)
		return 0;
	if(strnstr(url,".php",len) != NULL)
		return 1;
	if(strnstr(url,".jsp",len) != NULL)
		return 1;
	if(strnstr(url,".html",len) != NULL)
		return 1;
	if(strnstr(url,".aspx",len) != NULL)
		return 1;
	if(strnstr(url,".shtml",len) != NULL)
		return 1;
	if(strnstr(url,".htm",len) != NULL)
		return 1;
	if(strncasestr(url,".jpg",len) != NULL)
		return 0;
	if(strncasestr(url,".png",len) != NULL)
		return 0;
	if(strncasestr(url,".gif",len) != NULL)
		return 0;
	if(strncasestr(url,".js",len) != NULL)
		return 0;
	if(strncasestr(url,".css",len) != NULL)
		return 0;
	if(strncasestr(url,".xml",len) != NULL)
		return 0;
	if(strncasestr(url,".swf",len) != NULL)
		return 0;
	return 1;
}
// . parse cgi fields contained in s
// . s points to the stuff immediately after the ?
// . we should have already replaced all &'s in s with /0's
// . we also replace the last \r with a \0
void HttpRequest::parseFields ( char *s , int32_t slen ) {

	// . are we a multipart/form-data?
	// . many of form tags for event submission forms are this
	//   <form enctype="multipart/form-data" ...>
	char *cd = strncasestr ( s , "\r\nContent-Disposition:", slen );
	if ( cd ) {
		parseFieldsMultipart ( s , slen );
		return;
	}

	// should be NULL terminated since we replaced &'s w/ 0's in set()
	char *send   = s + slen ;
	// reset field count
	int32_t n = m_numFields;
	while ( s && s < send ) {
		// watch out for overflow
		if ( n >= MAX_CGI_PARMS ) {
			log("http: Received more than %" PRId32" CGI parms. "
			    "Truncating.",(int32_t)MAX_CGI_PARMS);
			break;
		}
		// set the nth field name in this cgi string
		m_fields [ n ] = s;
		// point to = sign
		char *equal = strchr ( s , '=' );
		// try next field if none here
		if ( ! equal ) { s += strlen ( s ) + 1; continue; }
		// if no equal sign, maybe it is one of diffbot's valueless
		// fields, so support that now
		if ( ! equal ) { 
			// just set value to NULL
			char *end = strchr(s,'&');
			int32_t len = end - s;
			if ( ! end ) len = strlen(s);
			m_fieldLens[n] = len;
			s[len] = '\0';
			m_fieldValues[n] = NULL;
			n++;
			// skip over the '&' too
			s += len + 1; 
			continue; 
		}
		// set field len
		m_fieldLens [ n ] = equal - s;
		// set = to \0 so getField() returns NULL terminated field name
		*equal = '\0';
		// set value (may be \0)
		m_fieldValues [ n ] = equal + 1;
		// count the number of field/value pairs we get
		n++;
		//	skip:
		// point to next field
		s = equal + 1 + strlen ( equal + 1 ) + 1 ;
	}
	m_numFields = n;
}
Пример #6
0
END_TEST

START_TEST (_strncasestr)
{
    char *p;
    char *string    = "This is a testing string";
    int   stringlen = strlen(string);

    p = strncasestr (string, "iS A", stringlen);
    ch_assert (p != NULL);
}
Пример #7
0
int
main(int argc, char * argv[])
{
    if(argc!=3)
    {
        printf("usage: %s haystack find\n", argv[0]);
        return 0;
    }

    printf("Find %s in %s, result: %s\n", argv[1], argv[2],
           strncasestr(argv[1], argv[2], strlen(argv[1])));
    return 0;
}
// . parse cgi fields contained in s
// . s points to the stuff immediately after the ?
// . we should have already replaced all &'s in s with /0's
// . we also replace the last \r with a \0
void HttpRequest::parseFields ( char *s , long slen ) {

	// . are we a multipart/form-data?
	// . many of form tags for event submission forms are this
	//   <form enctype="multipart/form-data" ...>
	char *cd = strncasestr ( s , "\r\nContent-Disposition:", slen );
	if ( cd ) {
		parseFieldsMultipart ( s , slen );
		return;
	}

	// should be NULL terminated since we replaced &'s w/ 0's in set()
	char *send   = s + slen ;
	// reset field count
	long n = m_numFields;
	while ( s && s < send ) {
		// watch out for overflow
		if ( n >= MAX_CGI_PARMS ) {
			log("http: Received more than %li CGI parms. "
			    "Truncating.",(long)MAX_CGI_PARMS);
			break;
		}
		// set the nth field name in this cgi string
		m_fields [ n ] = s;
		// point to = sign
		char *equal = strchr ( s , '=' );
		// try next field if none here
		if ( ! equal ) { s += gbstrlen ( s ) + 1; continue; }
		// set field len
		m_fieldLens [ n ] = equal - s;
		// set = to \0 so getField() returns NULL terminated field name
		*equal = '\0';
		// set value (may be \0)
		m_fieldValues [ n ] = equal + 1;
		// count the number of field/value pairs we get
		n++;
		//	skip:
		// point to next field
		s = equal + 1 + gbstrlen ( equal + 1 ) + 1 ;
	}
	m_numFields = n;
}
Пример #9
0
bool Summary::verifySummary( char *titleBuf, int32_t titleBufLen ) {
	if ( m_summaryLen > 0 ) {
		// trim elipsis
		if ( ( titleBufLen > 4 ) && ( memcmp( (titleBuf + titleBufLen - 4), " ...", 4 ) == 0 ) ) {
			titleBufLen -= 4;
		}

		// verify that it's not the same with title
		if ( strncasestr( m_summary, titleBuf, m_summaryLen, titleBufLen ) ) {
			m_summaryLen = 0;
			m_summary[0] = '\0';

			return false;
		}

		m_summaryExcerptLen[0] = m_summaryLen;
		m_numExcerpts = 1;
		m_displayLen = m_summaryLen;

		return true;
	}

	return false;
}
Пример #10
0
/* Scan code for an assert macro, returning the start of the macro if found or
 * NULL if not. The out param type returns the assertion type, and afterp is
 * set to point immediately after the macro's opening paren.
 */
static char *
find_macro(struct TestFile *tf, enum MacroType *type, int *need_array_it)
{
    size_t len = tf->next_line_pos - tf->read_pos;
    char *assert_pos;

    assert_pos = strncasestr(tf->read_pos, len, "assert_", 7);
    if (assert_pos) {
        if (!need_array_it) {
            fail(tf, assert_pos, "assertions not allowed here");
        }
        char *s = assert_pos + 7;
        size_t rest_len = tf->next_line_pos - s;
        if (rest_len > 6 && !strncasecmp(s, "array_", 6)) { // accept array_
            s += 6;
            rest_len -= 6;
            if (rest_len > 10 && !strncasecmp(s, "equal_with", 10)) {
                *need_array_it = 1;
                tf->next_pos = s + 10;
                *type = ASSERT_ARRAY_EQUAL_WITH;
            } else if (rest_len > 5 && !strncasecmp(s, "equal", 5)) {
                *need_array_it = 1;
                tf->next_pos = s + 5;
                *type = ASSERT_ARRAY_EQUAL;
            } else {
                assert_pos = NULL; // not an assert macro
            }
        } else if (rest_len >  4 && !strncasecmp(s, "true",        4)) {
            tf->next_pos = s + 4;
            *type = ASSERT_TRUE;
        } else if (rest_len >  5 && !strncasecmp(s, "false",       5)) {
            tf->next_pos = s + 5;
            *type = ASSERT_FALSE;
        } else if (rest_len > 10 && !strncasecmp(s, "equal_with", 10)) {
            tf->next_pos = s + 10;
            *type = ASSERT_EQUAL_WITH;
        } else if (rest_len >  5 && !strncasecmp(s, "equal",       5)) {
            tf->next_pos = s + 5;
            *type = ASSERT_EQUAL;
        } else if (rest_len >  9 && !strncasecmp(s, "not_equal",   9)) {
            tf->next_pos = s + 9;
            *type = ASSERT_NOT_EQUAL;
        } else {
            assert_pos = NULL; // not an assert macro
        }
    } else {
        assert_pos = strncasestr(tf->read_pos, len, "flunk",  5);
        if (assert_pos) {
            tf->next_pos = assert_pos + 5;
            *type = FLUNK;
        } else {
            assert_pos = NULL; // no assertions found
        }
    }

    if (assert_pos) {
        // make sure not commented out
        char *s = assert_pos;
        while (--s >= tf->line_pos) {
            if (*s == '!')
                return NULL;
        }

        // find open paren
        while (tf->next_pos < tf->next_line_pos) {
            switch (*tf->next_pos) {
            case ' ':
            case '\t':
                tf->next_pos++;
                break;
            case '(':
                tf->next_pos++;
                tf->read_pos = assert_pos;
                return assert_pos;
            default:
                fail(tf, tf->next_pos, "expected '('");
                return NULL;
            }
        }
    }
    return NULL;
}
Пример #11
0
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
			     Sections *sections , XmlDoc *xd ) {
	// not valid for now
	m_thumbnailValid = false;
	// reset our array of image node candidates
	m_numImages = 0;
	// flag it
	m_setCalled = true;
	// strange...
	if ( m_imgReply ) { char *xx=NULL;*xx=0; }
	// save this
	m_xml       = xml;
	m_pageUrl   = pageUrl;

	//
	// first add any open graph candidate.
	// basically they page telling us the best image straight up.
	//

	int32_t node2 = -1;
	int32_t startNode = 0;

	// . field can be stuff like "summary","description","keywords",...
	// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
	// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
	// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
 ogimgloop:
	char ubuf[2000];
	int32_t ulen = xml->getMetaContent( ubuf, 1999, "og:image", 8, "property", startNode, &node2 );

	// update this in case goto ogimgloop is called
	startNode = node2 + 1;
	// see section below for explanation of what we are storing here...
	if ( node2 >= 0 ) {
		// save it
		m_imageNodes[m_numImages] = node2;
		Query q;
		if ( ulen > MAX_URL_LEN ) goto ogimgloop;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, ubuf, ulen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
		// for looking it up on disk to see if unique or not
		char buf[2000];
		// if we don't put in quotes it expands '|' into
		// the "PiiPe" operator in Query.cpp
		snprintf ( buf , 1999, "gbimage:\"%s\"",iu.getUrl());
		// TODO: make sure this is a no-split termid storage thingy
		// in Msg14.cpp
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// sanity test
		if ( q.getNumTerms() != 1 ) { char *xx=0;*xx=0; }
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// try to get more graph images if we have some room
		if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
	}
	


	//m_pageSite  = pageSite;
	// scan the words
	int32_t       nw     = words->getNumWords();
	nodeid_t  *tids   = words->getTagIds();
	int64_t *wids   = words->getWordIds();
	//int32_t      *scores = scoresArg->m_scores;
	Section **sp = NULL; 
	if ( sections ) sp = sections->m_sectionPtrs;
	// not if we don't have any identified sections
	if ( sections && sections->m_numSections <= 0 ) sp = NULL;
	// the positive scored window
	int32_t firstPosScore = -1;
	int32_t lastPosScore  = -1;
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
	// find positive scoring window
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		if ( wids[i]   != 0 ) continue;
		// set first positive scoring guy
		if ( firstPosScore == -1 ) firstPosScore = i;
		// keep track of last guy
		lastPosScore = i;
	}
	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
	// . pedal firstPosScore back until we hit a section boundary
	// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
	char tc[512];
	memset ( tc , 0 , 512 );
	int32_t a = firstPosScore;
	for ( ; a >= 0 ; a-- ) {
		// get the tid
		nodeid_t tid = tids[a];
		// remove back bit, if any
		tid &= BACKBITCOMP;
		// skip if not a tag, or a generic xml tag
		if ( tid <= 1 ) continue;
		// mark it
		if ( words->isBackTag(a) ) tc[tid] |= 0x02;
		else                       tc[tid] |= 0x01;
		// continue if not a full front/back pair
		if ( tc[tid] != 0x03 ) continue;
		// continue if not a "section" type tag (see Scores.cpp)
		if ( tid != TAG_DIV      &&
		     tid != TAG_TEXTAREA &&
                     tid != TAG_TR       &&
                     tid != TAG_TD       &&
                     tid != TAG_TABLE      ) 
			continue;
		// ok we should stop now
		break;
	}		
	// min is 0
	if ( a < 0 ) a = 0;

	// now look for the image urls within this window
	for ( int32_t i = a ; i < lastPosScore ; i++ ) {
		// skip if not <img> tag
		if (tids[i] != TAG_IMG ) continue;
		// get the node num into Xml.cpp::m_nodes[] array
		int32_t nn = words->getNodes()[i];
		// check width to rule out small decorating imgs
		int32_t width = xml->getLong(nn,nn+1,"width", -1 );
		if ( width != -1 && width < 50 ) continue;
		// same with height
		int32_t height = xml->getLong(nn,nn+1, "height", -1 );
		if ( height != -1 && height < 50 ) continue;
		// get the url of the image
		int32_t  srcLen;
		char *src = xml->getString(nn,"src",&srcLen);
		// skip if none
		if ( srcLen <= 2 ) continue;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, src, srcLen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) continue;
		// skip if not from same domain as page url
		//int32_t dlen = pageUrl->getDomainLen();
		//if ( iu.getDomainLen() != dlen ) continue;
		//if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue
		// get the full url
		char *u    = iu.getUrl();
		int32_t  ulen = iu.getUrlLen();
		// skip common crap
		if ( strncasestr(u,ulen,"logo"           ) ) continue;
		if ( strncasestr(u,ulen,"comment"        ) ) continue;
		if ( strncasestr(u,ulen,"print"          ) ) continue;
		if ( strncasestr(u,ulen,"subscribe"      ) ) continue;
		if ( strncasestr(u,ulen,"header"         ) ) continue;
		if ( strncasestr(u,ulen,"footer"         ) ) continue;
		if ( strncasestr(u,ulen,"menu"           ) ) continue;
		if ( strncasestr(u,ulen,"button"         ) ) continue;
		if ( strncasestr(u,ulen,"banner"         ) ) continue;
		if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
		if ( strncasestr(u,ulen,"ads.webfeat."   ) ) continue;
		if ( strncasestr(u,ulen,"xads.zedo."     ) ) continue;

		// save it
		m_imageNodes[m_numImages] = nn;

		// before we lookup the image url to see if it is unique we
		// must first make sure that we have an adequate number of
		// permalinks from this same site with this same hop count.
		// we need at least 10 before we extract image thumbnails.
		char buf[2000];
		// set the query
		Query q;

		// if we do have 10 or more, then we lookup the image url to
		// make sure it is indeed unique
		sprintf ( buf , "gbimage:\"%s\"",u);
		// TODO: make sure this is a no-split termid storage thingy
		// in Msg14.cpp
		if ( ! q.set2 ( buf , langUnknown , false ) )
			// return true with g_errno set on error
			return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);

		// advance the counter
		m_numImages++;

		// break if full
		if ( m_numImages >= MAX_IMAGES ) break;
	}
}
Пример #12
0
/*
 station list file format:

 [playlist]
 numberofentries=1
 File1=http://67.159.5.47:8110
 Title1=(#1 - 7/500) The Dominican.net Radio
 */
struct station_list* station_list_create(const char* fn)
{
	int fd;
	rt_uint32_t length, index;
	struct station_list* list;
	char  *line, *pos, prefix[8];

	list = (struct station_list*)rt_malloc(sizeof(struct station_list));
	if (list == RT_NULL) goto _return0; /* no memory */
	list->count = 0; list->items = RT_NULL;
	
#define LINE_BUFFER_SIZE	128
	line = rt_malloc(LINE_BUFFER_SIZE);
	if (line == RT_NULL)
		goto _return1; /* no memory */

	fd = open(fn, O_RDONLY, 0); 
	if (fd < 0) goto _return2; /* open file failed */

	length = read_line(fd, line, LINE_BUFFER_SIZE);
	pos = strncasestr(line, "[playlist]");
	if (pos == RT_NULL) 
	{
		station_list_destroy(list);
		list = RT_NULL;
		goto _return2;
	}

	length = read_line(fd, line, LINE_BUFFER_SIZE);
	pos = strncasestr(line, "numberofentries=");
	if (pos != RT_NULL)
	{
		list->count = (int)strtol(pos + strlen("numberofentries="), RT_NULL, 10);
		if (list->count > 0)
		{
			list->items = (struct station_item*) rt_malloc (sizeof(struct station_item) * list->count);
			rt_memset(list->items, 0, sizeof(struct station_item) * list->count);
		}
	}
	else
	{
		station_list_destroy(list);
		list = RT_NULL;
		goto _return2;
	}

	if (list->items == RT_NULL)
	{
		station_list_destroy(list);
		list = RT_NULL;
		goto _return2;
	}

	index = 0;
	while (index < list->count)
	{
		length = read_line(fd, line, LINE_BUFFER_SIZE);
		if (length > 0)
		{
			rt_snprintf(prefix, sizeof(prefix), "File%d", index + 1);
			pos = strncasestr(line, prefix);
			if (pos != RT_NULL)
				strncpy(list->items[index].url, pos + strlen(prefix) + 1, 128);

			rt_snprintf(prefix, sizeof(prefix), "Title%d", index + 1);
			pos = strncasestr(line, prefix);
			if (pos != RT_NULL)
			{
				strncpy(list->items[index].title, pos + strlen(prefix) + 1, 40);
				index ++;
			}
		}
		else break;
	}

_return2:
	close(fd);
_return1:
	rt_free(line);

_return0:
	return list;
}
Пример #13
0
// . *it is the image type
void getImageInfo ( char *buf , long bufSize , 
		    long *dx , long *dy , long *it ) {

	// default to zeroes
	*dx = 0;
	*dy = 0;

	char *strPtr;
	// get the dimensions of the image
	if( (strPtr = strncasestr( buf, 20, "Exif"  )) ) {
		log(LOG_DEBUG, "image: Image Link: ");
		log(LOG_DEBUG, "image: We currently do not handle EXIF image "
		     "types." );
		// try the nextone
		return;
	}
	else if( (strPtr = strncasestr( buf, 20, "GIF"  )) ) {
		if ( it ) *it = CT_GIF;
		log( LOG_DEBUG, "image: GIF INFORMATION:" );
		if( bufSize > 9 ) {
			*dx = ((unsigned long)buf[7]) << 8;
			*dx += (unsigned char)buf[6];
			*dy = ((unsigned long)buf[9]) << 8;
			*dy += (unsigned char)buf[8];
		}
	}
	else if( (strPtr = strncasestr( buf, 20, "JFIF" )) ) {
		if ( it ) *it = CT_JPG;
		log( LOG_DEBUG, "image: JPEG INFORMATION:" );
		long i;
		for( i = 0; i < bufSize; i++ ) {
			if( bufSize < i+8 )
				break;
			if( (unsigned char)buf[i] != 0xFF ) continue;
			if( (unsigned char)buf[i+1] == 0xC0 ){
				*dy = ((unsigned long)buf[i+5]) << 8;
				*dy += (unsigned char)buf[i+6];
				*dx = ((unsigned long)buf[i+7]) << 8;
				*dx += (unsigned char)buf[i+8];
				break;
			}
			else if( (unsigned char) buf[i+1] == 0xC2 ) {
				*dy = ((unsigned long)buf[i+5]) << 8;
				*dy += (unsigned char)buf[i+6];
				*dx = ((unsigned long)buf[i+7]) << 8;
				*dx += (unsigned char)buf[i+8];
				break;
			}
		}
	}
	else if( (strPtr = strncasestr( buf, 20, "PNG" )) ) {
		if ( it ) *it = CT_PNG;
		log( LOG_DEBUG, "image: PNG INFORMATION:" );
		if( bufSize > 25 ) {
			*dx=(unsigned long)(*(unsigned long *)&buf[16]);
			*dy=(unsigned long)(*(unsigned long *)&buf[20]);
			// these are in network order
			*dx = ntohl(*dx);
			*dy = ntohl(*dy);
		}
	}
	else if( (strPtr = strncasestr( buf, 20, "MM" )) ) {
		if ( it ) *it = CT_TIFF;
		log( LOG_DEBUG, "image: TIFF INFORMATION:" );
		long startCnt = (unsigned long)buf[7]+4;
		for( long i = startCnt; i < bufSize; i += 12 ) {
			if( bufSize < i+10 )
				break;
			if( buf[i] != 0x01 ) continue;
			if( buf[i+1] == 0x01 )
				*dy = (unsigned long)
					(*(unsigned short *)&buf[i+8]);
			else if( buf[i+1] == 0x00 )
				*dx = (unsigned long)
					(*(unsigned short *)&buf[i+8]);
		}
	}
	else if( (strPtr = strncasestr( buf, 20, "II" )) ) {
		if ( it ) *it = CT_TIFF;
		log( LOG_DEBUG, "image: TIFF INFORMATION:" );
		long startCnt = (unsigned long)buf[7]+4;
		for( long i = startCnt; i < bufSize; i += 12 ) {
			if( bufSize < i+10 )
				break;
			if( buf[i] == 0x01 && buf[i+1] == 0x01 )
				*dy = (unsigned long)
					(*(unsigned short *)&buf[i+8]);
			if( buf[i] == 0x00 && buf[i+1] == 0x01 )
				*dx = (unsigned long)
					(*(unsigned short *)&buf[i+8]);
		}
	}
	else if( (strPtr = strncasestr( buf, 20, "BM" )) ) {
		if ( it ) *it = CT_BMP;
		log( LOG_DEBUG, "image: BMP INFORMATION:" );
		if( bufSize > 27 ) {
			*dx=(unsigned long)(*(unsigned long *)&buf[18]);
			*dy=(unsigned long)(*(unsigned long *)&buf[22]);
		}
	}
	else 
		log( LOG_DEBUG, "image: Image Corrupted? No type found in "
		     "data." );
}
Пример #14
0
// returns false and sets g_errno on error
bool Summary::setSummary ( Xml *xml, Words *words, Sections *sections, Pos *pos, Query *q, int32_t maxSummaryLen,
						   int32_t maxNumLines, int32_t numDisplayLines, int32_t maxNumCharsPerLine, Url *f,
                           Matches *matches, char *titleBuf, int32_t titleBufLen ) {
	m_numDisplayLines = numDisplayLines;
	m_displayLen      = 0;

	// assume we got maxnumlines of summary
	if ( (maxNumCharsPerLine + 6) * maxNumLines > maxSummaryLen ) {
		if ( maxNumCharsPerLine < 10 ) {
			maxNumCharsPerLine = 10;
		}

		static char s_flag = 1;
		if ( s_flag ) {
			s_flag = 0;
			log("query: Warning. "
			    "Max summary excerpt length decreased to "
			    "%" PRId32" chars because max summary excerpts and "
			    "max summary length are too big.",
			    maxNumCharsPerLine);
		}
	}

	// . sanity check
	// . summary must fit in m_summary[]
	// . leave room for tailing \0
	if ( maxSummaryLen >= MAX_SUMMARY_LEN ) {
		g_errno = EBUFTOOSMALL;
		return log("query: Summary too big to hold in buffer of %" PRId32" bytes.",(int32_t)MAX_SUMMARY_LEN);
	}

	// do not overrun the final*[] buffers
	if ( maxNumLines > 256 ) { 
		g_errno = EBUFTOOSMALL; 
		return log("query: More than 256 summary lines requested.");
	}

	// Nothing to match...print beginning of content as summary
	if ( matches->m_numMatches == 0 && maxNumLines > 0 ) {
		return getDefaultSummary ( xml, words, sections, pos, maxSummaryLen );
	}

	int32_t need1 = q->m_numWords * sizeof(float);
	m_wordWeightSize = need1;
	if ( need1 < 128 ) {
		m_wordWeights = (float *)m_tmpWordWeightsBuf;
	} else {
		m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
	}

	if ( ! m_wordWeights ) {
		return false;
	}

	/// @todo ALC fix word weights
	/// non-working logic is removed in commit 5eacee9063861e859b54ec62035a600aa8af25df

	// . compute our word weights wrt each query. words which are more rare
	//   have a higher weight. We use this to weight the terms importance
	//   when generating the summary.
	// . used by the proximity algo
	// . used in setSummaryScores() for scoring summaries


	for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
		m_wordWeights[i] = 1.0;
	}

	// convenience
	m_maxNumCharsPerLine = maxNumCharsPerLine;
	m_q = q;

	// set the max excerpt len to the max summary excerpt len
	int32_t maxExcerptLen = m_maxNumCharsPerLine;

	int32_t lastNumFinal = 0;
	int32_t maxLoops = 1024;

	// if just computing absScore2...
	if ( maxNumLines <= 0 ) {
		return true;
	}

	char *p = m_summary;
	char *pend = m_summary + maxSummaryLen;

	m_numExcerpts = 0;

	int32_t need2 = (1+1+1) * m_q->m_numWords;
	m_buf4Size = need2;
	if ( need2 < 128 ) {
		m_buf4 = m_tmpBuf4;
	} else {
		m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
	}

	if ( ! m_buf4 ) {
		return false;
	}

	char *x = m_buf4;
	char *retired = x;
	x += m_q->m_numWords;
	char *maxGotIt = x;
	x += m_q->m_numWords;
	char *gotIt = x;

	// . the "maxGotIt" count vector accumulates into "retired"
	// . that is how we keep track of what query words we used for previous
	//   summary excerpts so we try to get diversified excerpts with 
	//   different query terms/words in them
	//char retired  [ MAX_QUERY_WORDS ];
	memset ( retired, 0, m_q->m_numWords * sizeof(char) );

	// some query words are already matched in the title
	for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
		if ( matches->m_qwordFlags[i] & MF_TITLEGEN ) {
			retired [ i ] = 1;
		}
	}

	bool hadEllipsis = false;

	// 
	// Loop over all words that match a query term. The matching words
	// could be from any one of the 3 Words arrays above. Find the
	// highest scoring window around each term. And then find the highest
	// of those over all the matching terms.
	//
	int32_t numFinal;
	for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ) {
		if ( numFinal == m_numDisplayLines ) {
			m_displayLen = p - m_summary;
		}

		// reset these at the top of each loop
		Match     *maxm;
		int64_t  maxScore = 0;
		int32_t       maxa = 0;
		int32_t       maxb = 0;
		int32_t       maxi  = -1;
		int32_t       lasta = -1;

		if(lastNumFinal == numFinal) {
			if(maxLoops-- <= 0) {
				log(LOG_WARN, "query: got infinite loop bug, query is %s url is %s", m_q->m_orig, f->getUrl());
				break;
			}
		}
		lastNumFinal = numFinal;

		// loop through all the matches and see which is best
		for ( int32_t i = 0 ; i < matches->m_numMatches ; i++ ) {
			int32_t       a , b;
			// reset lasta if we changed words class
			if ( i > 0 && matches->m_matches[i-1].m_words != matches->m_matches[i].m_words ) {
				lasta = -1;
			}

			// only use matches in title, etc.
			mf_t flags = matches->m_matches[i].m_flags;

			bool skip = true;
			if ( flags & MF_METASUMM ) {
				skip = false;
			}
			if ( flags & MF_METADESC ) {
				skip = false;
			}
			if ( flags & MF_BODY     ) {
				skip = false;
			}
			if ( flags & MF_RSSDESC  ) {
				skip = false;
			}

			if ( skip ) {
				continue;
			}

			// ask him for the query words he matched
			//char gotIt [ MAX_QUERY_WORDS ];
			// clear it for him
			memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );

			// . get score of best window around this match
			// . do not allow left post of window to be <= lasta to
			//   avoid repeating the same window.
			int64_t score = getBestWindow (matches, i, &lasta, &a, &b, gotIt, retired, maxExcerptLen);
			
			// USE THIS BUF BELOW TO DEBUG THE ABOVE CODE. 
			// PRINTS OUT THE SUMMARY
			/*
			//if ( score >=12000 ) {
			char buf[10*1024];
			char *xp = buf;
			if ( i == 0 )
				log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
			sprintf(xp, "score=%08" PRId32" a=%05" PRId32" b=%05" PRId32" ",
				(int32_t)score,(int32_t)a,(int32_t)b);
			xp += strlen(xp);
			for ( int32_t j = a; j < b; j++ ){
				//int32_t s = scores->m_scores[j];
				int32_t s = 0;
				if ( s < 0 ) continue;
				char e = 1;
				int32_t len = words->getWordLen(j);
				for(int32_t k=0;k<len;k +=e){
					char c = words->m_words[j][k];
					//if ( is_binary( c ) ) continue;
					*xp = c;
					xp++;
				}
				//p += strlen(p);
				if ( s == 0 ) continue;
				sprintf ( xp ,"(%" PRId32")",s);
				xp += strlen(xp);
			}
			log (LOG_WARN,"query: summary: %s", buf);
			//}
			*/

			// prints out the best window with the score
			/*
			char buf[MAX_SUMMARY_LEN];
			  char *bufPtr = buf;
			  char *bufPtrEnd = p + MAX_SUMMARY_LEN;
			  if ( i == 0 )
			  log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
			  int32_t len = 0;
			  Words *ww  = matches->m_matches[i].m_words;
			  //Sections *ss = matches->m_matches[i].m_sections;
			  //if ( ss->m_numSections <= 0 ) ss = NULL;
			  //len=pos->filter(bufPtr, bufPtrEnd, ww, a, b, NULL);
			  //log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			  //score);
			  log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			  score);
			*/

			// skip if was in title or something
			if ( score <= 0 ) {
				continue;
			}

			// skip if not a winner
			if ( maxi >= 0 && score <= maxScore ) {
				continue;
			}

			// we got a new winner
			maxi     = i;
			maxa     = a;
			maxb     = b;
			maxScore = score;

			// save this too
			gbmemcpy ( maxGotIt , gotIt , m_q->m_numWords );

		}
	
		// retire the query words in the winning summary

		
		//log( LOG_WARN,"summary: took %" PRId64" ms to finish getbestwindo",
		//    gettimeofdayInMilliseconds() - stget );


		// all done if no winner was made
		if ( maxi == -1 || maxa == -1 || maxb == -1) {
			break;
		}

		// who is the winning match?
		maxm = &matches->m_matches[maxi];
		Words *ww = maxm->m_words;

		// we now use "m_swbits" for the summary bits since they are
		// of size sizeof(swbit_t), a int16_t at this point
		swbit_t *bb = maxm->m_bits->m_swbits;

		// this should be impossible
		if ( maxa > ww->getNumWords() || maxb > ww->getNumWords() ) {
			log ( LOG_WARN,"query: summary starts or ends after "
			      "document is over! maxa=%" PRId32" maxb=%" PRId32" nw=%" PRId32,
			      maxa, maxb, ww->getNumWords() );
			maxa = ww->getNumWords() - 1;
			maxb = ww->getNumWords();
		}

		// assume we do not preceed with ellipsis "..."
		bool needEllipsis = true;
		
		const char *c = ww->getWord(maxa)+0;

		// rule of thumb, don't use ellipsis if the first letter is capital, or a non letter
		// is punct word before us pair acrossable? if so then we probably are not the start of a sentence.
		// or if into the sample and previous excerpt had an ellipsis do not bother using one for us.
		if ( !is_alpha_utf8(c) || is_upper_utf8(c) ||
		     (bb[maxa] & D_STARTS_SENTENCE) ||
		     (p > m_summary && hadEllipsis)) {
			needEllipsis = false;
		}

		if ( needEllipsis ) {
			// break out if no room for "..."
			if ( p + 4 + 2 > pend ) {
				break;
			}

			// space first?
			if ( p > m_summary ) {
				*p++ = ' ';
			}

			memcpy ( p, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// separate summary excerpts with a single space.
		if ( p > m_summary ) {
			if ( p + 2 > pend ) {
				break;
			}

			*p++ = ' ';
		}

		// assume we need a trailing ellipsis
		needEllipsis = true;

		// so next excerpt does not need to have an ellipsis if we 
		// have one at the end of this excerpt
		hadEllipsis = needEllipsis;

		// start with quote?
		if ( (bb[maxa] & D_IN_QUOTES) && p + 1 < pend ) {
			// preceed with quote
			*p++ = '\"';
		}
	
		// . filter the words into p
		// . removes back to back spaces
		// . converts html entities
		// . filters in stores words in [a,b) interval
		int32_t len = pos->filter( ww, maxa, maxb, false, p, pend, xml->getVersion() );

		// break out if did not fit
		if ( len == 0 ) {
			break;
		}

		// don't consider it if it is a substring of the title
		if ( len == titleBufLen && strncasestr(titleBuf, p, titleBufLen, len) ) {
			// don't consider this one
			numFinal--;
			goto skip;
		}
	
		// don't consider it if the length wasn't anything nice
		if ( len < 5 ){
			numFinal--;
			goto skip;
		}

		// otherwise, keep going
		p += len;

		// now we just indicate which query terms we got
		for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
			// do not breach
			if ( retired[i] >= 100 ) {
				continue;
			}
			retired [ i ] += maxGotIt [ i ];
		}
	
		// add all the scores of the excerpts to the doc summary score.
		// zero out scores of the winning sample so we don't get them 
		// again. use negative one billion to ensure that we don't get
		// them again
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it as used
			bb[j] |= D_USED;
		}

		// if we ended on punct that can be paired across we need
		// to add an ellipsis
		if ( needEllipsis ) {
			if ( p + 4 + 2 > pend ) {
				break;
			}
			memcpy ( p, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// try to put in a small summary excerpt if we have atleast
		// half of the normal excerpt length left
		if ( maxExcerptLen == m_maxNumCharsPerLine && len <= ( m_maxNumCharsPerLine / 2 + 1 ) ) {
			maxExcerptLen = m_maxNumCharsPerLine / 2;

			// don't count it in the finals since we try to get a small excerpt
			numFinal--;
		} else if ( m_numExcerpts < MAX_SUMMARY_EXCERPTS && m_numExcerpts >= 0 ) {
			m_summaryExcerptLen[m_numExcerpts] = p - m_summary;
			m_numExcerpts++;

			// also reset maxExcerptLen
			maxExcerptLen = m_maxNumCharsPerLine;
		}
	
	skip:
		// zero out the scores so they will not be used in others
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it
			bb[j] |= D_USED;
		}
	}

	if ( numFinal <= m_numDisplayLines ) {
		m_displayLen = p - m_summary;
	}

	// free the mem we used if we allocated it
	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
		m_buf4 = NULL;
	}

	// If we still didn't find a summary, get the default summary
	if ( p == m_summary ) {
		bool status = getDefaultSummary ( xml, words, sections, pos, maxSummaryLen );
		if ( m_numDisplayLines > 0 ) {
			m_displayLen = m_summaryLen;
		}
		
		return status;
	}

	// if we don't find a summary, theres no need to NULL terminate
	*p++ = '\0';

	// set length
	m_summaryLen = p - m_summary;

	if ( m_summaryLen > 50000 ) { g_process.shutdownAbort(true); }

	return true;
}
Пример #15
0
/*
**  convert buffer from bristled format to plain format
*/
char *ePerl_Bristled2Plain(char *cpBuf)
{
    char *rc;
    char *cpOutBuf = NULL;
    char *cpOut = NULL;
    char *cps, *cpe;
    char *cps2, *cpe2;
    int nBuf;
    char *cpEND;
    int n;

    if (strlen(cpBuf) == 0) {
        /* make sure we return a buffer which the caller can free() */
        cpOutBuf = (char *)malloc(sizeof(char) * 1);
        *cpOutBuf = NUL;
        return cpOutBuf;
    }

    nBuf = strlen(cpBuf);
    cpEND = cpBuf+nBuf;

    /* allocate memory for the Perl code */
    n = sizeof(char) * nBuf * 10;
    if (nBuf < 1024)
        n = 16384;
    if ((cpOutBuf = (char *)malloc(n)) == NULL) {
        ePerl_SetError("Cannot allocate %d bytes of memory", n);
        CU(NULL);
    }
    cpOut = cpOutBuf;

    /* now step through the file and convert it to legal Perl code.
       This is a bit complicated because we have to make sure that
       we parse the correct delimiters while the delimiter
       characters could also occur inside the Perl code! */
    cps = cpBuf;
    while (cps < cpEND) {

        if (ePerl_case_sensitive_delimiters)
            cpe = strnstr(cps, ePerl_begin_delimiter, cpEND-cps);
        else
            cpe = strncasestr(cps, ePerl_begin_delimiter, cpEND-cps);
        if (cpe == NULL) {

            /* there are no more ePerl blocks, so
               just encapsulate the remaining contents into
               Perl print constructs */

            if (cps < cpEND) {
                cps2 = cps;
                /* first, do all complete lines */
                while (cps2 < cpEND && (cpe2 = strnchr(cps2, '\n', cpEND-cps2)) != NULL) {
                    if (ePerl_line_continuation && cps < cpe2 && *(cpe2-1) == '\\') {
                        if (cpe2-1-cps2 > 0) {
                            cpOut = ePerl_fprintf(cpOut, "print \"");
                            cpOut = ePerl_Efwrite(cps2, cpe2-1-cps2, 1, cpOut);
                            cpOut = ePerl_fprintf(cpOut, "\";");
                        }
                        cpOut = ePerl_fprintf(cpOut, "\n");
                    }
                    else {
                        cpOut = ePerl_fprintf(cpOut, "print \"");
                        cpOut = ePerl_Efwrite(cps2, cpe2-cps2, 1, cpOut);
                        cpOut = ePerl_fprintf(cpOut, "\\n\";\n");
                    }
                    cps2 = cpe2+1;
                }
                /* then do the remainder which is not
                   finished by a newline */
                if (cpEND > cps2) {
                    cpOut = ePerl_fprintf(cpOut, "print \"");
                    cpOut = ePerl_Efwrite(cps2, cpEND-cps2, 1, cpOut);
                    cpOut = ePerl_fprintf(cpOut, "\";");
                }
            }
            break; /* and break the whole processing step */

        }
        else {

            /* Ok, there is at least one more ePerl block */

            /* first, encapsulate the content from current pos
               up to the begin of the ePerl block as print statements */
            if (cps < cpe) {
                cps2 = cps;
                while ((cpe2 = strnchr(cps2, '\n', cpe-cps2)) != NULL) {
                    if (ePerl_line_continuation && cps < cpe2 && *(cpe2-1) == '\\') {
                        if (cpe2-1-cps2 > 0) {
                            cpOut = ePerl_fprintf(cpOut, "print \"");
                            cpOut = ePerl_Efwrite(cps2, cpe2-1-cps2, 1, cpOut);
                            cpOut = ePerl_fprintf(cpOut, "\";");
                        }
                        cpOut = ePerl_fprintf(cpOut, "\n");
                    }
                    else {
                        cpOut = ePerl_fprintf(cpOut, "print \"");
                        cpOut = ePerl_Efwrite(cps2, cpe2-cps2, 1, cpOut);
                        cpOut = ePerl_fprintf(cpOut, "\\n\";\n");
                    }
                    cps2 = cpe2+1;
                }
                if (cpe > cps2) {
                    cpOut = ePerl_fprintf(cpOut, "print \"");
                    cpOut = ePerl_Efwrite(cps2, cpe-cps2, 1, cpOut);
                    cpOut = ePerl_fprintf(cpOut, "\";");
                }
            }

            /* just output a leading space to make
               the -x display more readable. */
            if (cpOut > cpOutBuf && *(cpOut-1) != '\n') 
                cpOut = ePerl_fprintf(cpOut, " ");

            /* skip the start delimiter */
            cps = cpe+strlen(ePerl_begin_delimiter);

            /* recognize the 'print' shortcut with '=',
             * e.g. <:=$var:>
             */
            if (*cps == '=') {
                cpOut = ePerl_fprintf(cpOut, "print ");
                cps++;
            }

            /* skip all following whitespaces.
               Be careful: we could skip newlines too, but then the
               error output will give wrong line numbers!!! */
            while (cps < cpEND) {
                if (*cps != ' ' && *cps != '\t')
                    break;
                cps++;
            }
            cpe = cps;

            /* move forward to end of ePerl block. */
            if (ePerl_case_sensitive_delimiters)
                cpe = strnstr(cpe, ePerl_end_delimiter, cpEND-cpe);
            else
                cpe = strncasestr(cpe, ePerl_end_delimiter, cpEND-cpe);
            if (cpe == NULL) {
                ePerl_SetError("Missing end delimiter");
                CU(NULL);
            }

            /* step again backward over whitespaces */
            for (cpe2 = cpe; 
                 cpe2 > cps && (*(cpe2-1) == ' ' || *(cpe2-1) == '\t' || *(cpe2-1) == '\n');
                 cpe2--)
                ;
            
            /* pass through the ePerl block without changes! */
            if (cpe2 > cps) { 
                if (ePerl_convert_entities == TRUE)
                    cpOut = ePerl_Cfwrite(cps, cpe2-cps, 1, cpOut);
                else
                    cpOut = ePerl_fwrite(cps, cpe2-cps, 1, cpOut);

                /* be smart and automatically add a semicolon
                   if not provided at the end of the ePerl block.
                   But know the continuation indicator "_". */
                if ((*(cpe2-1) != ';') &&
                    (*(cpe2-1) != '_')   )
                    cpOut = ePerl_fprintf(cpOut, ";");
                if (*(cpe2-1) == '_') 
                    cpOut = cpOut - 1;
            }

            /* end preserve newlines for correct line numbers */
            for ( ; cpe2 <= cpe; cpe2++)
                if (*cpe2 == '\n')
                    cpOut = ePerl_fprintf(cpOut, "\n");

            /* output a trailing space to make
               the -x display more readable when
               no newlines have finished the block. */
            if (cpOut > cpOutBuf && *(cpOut-1) != '\n') 
                cpOut = ePerl_fprintf(cpOut, " ");

            /* and adjust the current position to the first character
               after the end delimiter */
            cps = cpe+strlen(ePerl_end_delimiter);

            /* finally just one more feature: when an end delimiter
               is directly followed by ``//'' this discards all
               data up to and including the following newline */
            if (cps < cpEND-2 && *cps == '/' && *(cps+1) == '/') {
                /* skip characters */
                cps += 2;
                for ( ; cps < cpEND && *cps != '\n'; cps++)
                    ;
                if (cps < cpEND)
                    cps++;
                /* but preserve the newline in the script */
                cpOut = ePerl_fprintf(cpOut, "\n");
            }
        }
    }
    RETURN_WVAL(cpOutBuf);

    CUS:
    if (cpOutBuf) 
        free(cpOutBuf);
    RETURN_EXRC;
}
Пример #16
0
// returns false and sets g_errno on error
bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, Query *query,
                       LinkInfo *linkInfo, Url *firstUrl, const char *filteredRootTitleBuf, int32_t filteredRootTitleBufSize,
                       uint8_t contentType, uint8_t langId, int32_t niceness ) {
	// make Msg20.cpp faster if it is just has
	// Msg20Request::m_setForLinkInfo set to true, no need to extricate a title.
	if ( maxTitleLen <= 0 ) {
		return true;
	}

	m_niceness = niceness;
	m_maxTitleLen = maxTitleLen;

	// if this is too big the "first line" algo can be huge!!!
	// and really slow everything way down with a huge title candidate
	int32_t maxTitleWords = 128;

	// assume no title
	reset();

	int32_t NW = words->getNumWords();

	//
	// now get all the candidates
	//

	// . allow up to 100 title CANDIDATES
	// . "as" is the word # of the first word in the candidate
	// . "bs" is the word # of the last word IN the candidate PLUS ONE
	int32_t n = 0;
	int32_t as[MAX_TIT_CANDIDATES];
	int32_t bs[MAX_TIT_CANDIDATES];
	float scores[MAX_TIT_CANDIDATES];
	Words *cptrs[MAX_TIT_CANDIDATES];
	int32_t types[MAX_TIT_CANDIDATES];
	int32_t parent[MAX_TIT_CANDIDATES];

	// record the scoring algos effects
	float  baseScore        [MAX_TIT_CANDIDATES];
	float  noCapsBoost      [MAX_TIT_CANDIDATES];
	float  qtermsBoost      [MAX_TIT_CANDIDATES];
	float  inCommonCandBoost[MAX_TIT_CANDIDATES];

	// reset these
	for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) {
		// assume no parent
		parent[i] = -1;
	}

	// xml and words class for each link info, rss item
	Xml   tx[MAX_TIT_CANDIDATES];
	Words tw[MAX_TIT_CANDIDATES];
	int32_t  ti = 0;

	// restrict how many link texts and rss blobs we check for titles
	// because title recs like www.google.com have hundreds and can
	// really slow things down to like 50ms for title generation
	int32_t kcount = 0;
	int32_t rcount = 0;

	//int64_t x = gettimeofdayInMilliseconds();

	// . get every link text
	// . TODO: repeat for linkInfo2, the imported link text
	for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
		// breathe
		QUICKPOLL(m_niceness);
		// fast skip check for link text
		if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue;
		// fast skip check for rss item
		if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue;

		// set Url
		Url u;
		u.set( k->getUrl(), k->size_urlBuf );

		// is it the same host as us?
		bool sh = true;

		// skip if not from same host and should be
		if ( firstUrl->getHostLen() != u.getHostLen() ) {
			sh = false;
		}

		// skip if not from same host and should be
		if ( strncmp( firstUrl->getHost(), u.getHost(), u.getHostLen() ) ) {
			sh = false;
		}

		// get the link text
		if ( k->size_linkText >= 3 ) {
			char *p    = k->getLinkText();
			int32_t  plen = k->size_linkText - 1;
			if ( ! verifyUtf8 ( p , plen ) ) {
				log("title: set4 bad link text from url=%s", k->getUrl());
				continue;
			}

			// now the words.
			if ( !tw[ti].set( k->getLinkText(), k->size_linkText - 1, true, 0 ) ) {
				return false;
			}

			// set the bookends, it is the whole thing
			cptrs   [n] = &tw[ti];
			as      [n] = 0;
			bs      [n] = tw[ti].getNumWords();
			// score higher if same host
			if ( sh ) scores[n] = 1.05;
			// do not count so high if remote!
			else      scores[n] = 0.80;
			// set the type
			if ( sh ) types [n] = TT_LINKTEXTLOCAL;
			else      types [n] = TT_LINKTEXTREMOTE;
			// another candidate
			n++;
			// use xml and words
			ti++;
			// break out if too many already. save some for below.
			if ( n + 30 >= MAX_TIT_CANDIDATES ) break;
		}
		// get the rss item
		if ( k->size_rssItem <= 10 ) continue;
		// . returns false and sets g_errno on error
		// . use a 0 for niceness
		if ( ! k->setXmlFromRSS ( &tx[ti] , 0 ) ) return false;
		// get the word range
		int32_t tslen;
		bool isHtmlEnc;
		char *ts = tx[ti].getRSSTitle ( &tslen , &isHtmlEnc );
		// skip if not in the rss
		if ( ! ts ) continue;
		// skip if empty
		if ( tslen <= 0 ) continue;
		// now set words to that
		if ( !tw[ti].set( ts, tslen, true, 0 ) ) {
			return false;
		}

		// point to that
		cptrs   [n] = &tw[ti];
		as      [n] = 0;
		bs      [n] = tw[ti].getNumWords();
		// increment since we are using it
		ti++;
		// base score for rss title
		if ( sh ) scores[n] = 5.0;
		// if not same host, treat like link text
		else      scores[n] = 2.0;
		// set the type
		if ( sh ) types [n] = TT_RSSITEMLOCAL;
		else      types [n] = TT_RSSITEMREMOTE;
		// advance
		n++;
		// break out if too many already. save some for below.
		if ( n + 30 >= MAX_TIT_CANDIDATES ) break;
	}

	//logf(LOG_DEBUG,"title: took1=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	// . set the flags array
	// . indicates what words are in title candidates already, but
	//   that is set below
	// . up here we set words that are not allowed to be in candidates,
	//   like words that are in a link that is not a self link
	// . alloc for it
	char *flags = NULL;
	char localBuf[10000];

	int32_t  need = words->getNumWords();
	if ( need <= 10000 ) {
		flags = (char *)localBuf;
	} else {
		flags = (char *)mmalloc(need,"TITLEflags");
	}

	if ( ! flags ) {
		return false;
	}

	// clear it
	memset ( flags , 0 , need );

	// check tags in body
	nodeid_t *tids = words->getTagIds();

	// scan to set link text flags
	// loop over all "words" in the html body
	char inLink   = false;
	char selfLink = false;
	for ( int32_t i = 0 ; i < NW ; i++ ) {
		// breathe
		QUICKPOLL(m_niceness);

		// if in a link that is not self link, cannot be in a candidate
		if ( inLink && ! selfLink ) {
			flags[i] |= 0x02;
		}

		// out of a link
		if ( tids[i] == (TAG_A | BACKBIT) ) {
			inLink = false;
		}

		// if not start of <a> tag, skip it
		if ( tids[i] != TAG_A ) {
			continue;
		}

		// flag it
		inLink = true;

		// get the node in the xml
		int32_t xn = words->getNodes()[i];

		// is it a self link?
		int32_t len;
		char *link = xml->getString(xn,"href",&len);

		// . set the url class to this
		// . TODO: use the base url in the doc
		Url u;
		u.set( link, len, true, false );

		// compare
		selfLink = u.equals ( firstUrl );

		// skip if not selfLink
		if ( ! selfLink ) {
			continue;
		}

		// if it is a selflink , check for an "onClick" tag in the
		// anchor tag to fix that Mixx issue for:
		// http://www.npr.org/templates/story/story.php?storyId=5417137

		int32_t  oclen;
		char *oc = xml->getString(xn,"onclick",&oclen);

		if ( ! oc ) {
			oc = xml->getString(xn,"onClick",&oclen);
		}

		// assume not a self link if we see that...
		if ( oc ) {
			selfLink = false;
		}

		// if this <a href> link has a "title" attribute, use that
		// instead! that thing is solid gold.
		int32_t  atlen;
		char *atitle = xml->getString(xn,"title",&atlen);

		// stop and use that, this thing is gold!
		if ( ! atitle || atlen <= 0 ) {
			continue;
		}

		// craziness? ignore it...
		if ( atlen > 400 ) {
			continue;
		}

		// if it contains permanent, permalink or share, ignore it!
		if ( strncasestr ( atitle, "permalink", atlen ) ||
		     strncasestr ( atitle,"permanent", atlen) ||
		     strncasestr ( atitle,"share", atlen) ) {
			continue;
		}

		// do not count the link text as viable
		selfLink = false;

		// aw, dammit
		if ( ti >= MAX_TIT_CANDIDATES ) {
			continue;
		}

		// other dammit
		if ( n >= MAX_TIT_CANDIDATES ) {
			break;
		}

		// ok, process it
		if ( ! tw[ti].set ( atitle, atlen, true, 0 )) {
			return false;
		}

		// set the bookends, it is the whole thing
		cptrs   [n] = &tw[ti];
		as      [n] = 0;
		bs      [n] = tw[ti].getNumWords();
		scores  [n] = 3.0; // not ALWAYS solid gold!
		types   [n] = TT_TITLEATT;

		// we are using the words class
		ti++;

		// advance
		n++;

		// break out if too many already. save some for below.
		if ( n + 20 >= MAX_TIT_CANDIDATES ) {
			break;
		}
	}

	//logf(LOG_DEBUG,"title: took2=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	//int64_t *wids = WW->getWordIds();
	// . find the last positive scoring guy
	// . do not consider title candidates after "r" if "r" is non-zero
	// . FIXES http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/

	// the candidate # of the title tag
	int32_t tti = -1;

	// allow up to 4 tags from each type
	char table[512];

	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }

	// clear table counts
	memset ( table , 0 , 512 );

	// the first word
	char *wstart = NULL;
	if ( NW > 0 ) {
		wstart = words->getWord(0);
	}

	// loop over all "words" in the html body
	for ( int32_t i = 0 ; i < NW ; i++ ) {
		// come back up here if we encounter another "title-ish" tag
		// within our first alleged "title-ish" tag
	subloop:
		// stop after 30k of text
		if ( words->getWord(i) - wstart > 200000 ) {
			break; // 1106
		}

		// get the tag id minus the back tag bit
		nodeid_t tid = tids[i] & BACKBITCOMP;


		// pen up and pen down for these comment like tags
		if ( tid == TAG_SCRIPT || tid == TAG_STYLE ) {
			// ignore "titles" in script or style tags
			if ( ! (tids[i] & BACKBIT) ) {
				continue;
			}
		}

		/// @todo ALC we should allow more tags than just title/link
		// skip if not a good tag.
		if (tid != TAG_TITLE && tid != TAG_A) {
			continue;
		}

		// must NOT be a back tag
		if ( tids[i] & BACKBIT ) {
			continue;
		}

		// skip if we hit our limit
		if ( table[tid] >= 4 ) {
			continue;
		}

		// skip over tag/word #i
		i++;

		// no words in links, unless it is a self link
		if ( i < NW && (flags[i] & 0x02) ) {
			continue;
		}

		// the start should be here
		int32_t start = -1;

		// do not go too far
		int32_t max = i + 200;

		// find the corresponding back tag for it
		for (  ; i < NW && i < max ; i++ ) {
			// hey we got it, BUT we got no alnum word first
			// so the thing was empty, so loop back to subloop
			if ( (tids[i] & BACKBITCOMP) == tid  &&   
			     (tids[i] & BACKBIT    ) && 
			     start == -1 ) {
				goto subloop;
			}

			// if we hit another title-ish tag, loop back up
			if ( (tids[i] & BACKBITCOMP) == TAG_TITLE || (tids[i] & BACKBITCOMP) == TAG_A ) {
				// if no alnum text, restart at the top
				if ( start == -1 ) {
					goto subloop;
				}

				// otherwise, break out and see if title works
				break;
			}

			// if we hit a breaking tag...
			if ( isBreakingTagId ( tids[i] & BACKBITCOMP ) &&
			     // do not consider <span> tags breaking for 
			     // our purposes. i saw a <h1><span> setup before.
			     tids[i] != TAG_SPAN ) {
				break;
			}

			// skip if not alnum word
			if ( ! words->isAlnum(i) ) {
				continue;
			}

			// if we hit an alnum word, break out
			if ( start == -1 ) {
				start = i;
			}
		}

		// if no start was found, must have had a 0 score in there
		if ( start == -1 ) {
			continue;
		}

		// if we exhausted the doc, we are done
		if ( i >= NW ) {
			break;
		}

		// skip if way too big!
		if ( i >= max ) {
			continue;
		}

		// if was too long do not consider a title
		if ( i - start > 300 ) {
			continue;
		}

		// . skip if too many bytes
		// . this does not include the length of word #i, but #(i-1)
		if ( words->getStringSize ( start , i ) > 1000 ) {
			continue;
		}

		// when using pdftohtml, the title tag is the filename when PDF property does not have title tag
		if ( tid == TAG_TITLE && contentType == CT_PDF ) {
			// skip if title == '/in.[0-9]*'
			char* title_start = words->getWord(start);
			char* title_end = words->getWord(i);
			size_t title_size = title_end - title_start;
			const char* result = strnstr( title_start, "/in.", title_size );
			if (result != NULL) {
				char* endp = NULL;
				// do some further verification to avoid screwing up title
				if ((strtoll(result + 4, &endp, 10) > 0) && (endp == title_end)) {
					continue;
				}
			}
		}

		// count it
		table[tid]++;

		// max it out if we are positive scoring. stop after the
		// first positive scoring guy in a section. this might
		// hurt the "Hamlet" thing though...

		// store a point to the title tag guy. Msg20.cpp needs this
		// because the zak's proximity algo uses it in Summary.cpp
		// and in Msg20.cpp

		// only get the first one! often the 2nd on is in an iframe!! which we now expand into here.
		if ( tid == TAG_TITLE && m_titleTagStart == -1 ) {
			m_titleTagStart = start;
			m_titleTagEnd   = i;

			// save the candidate # because we always use this
			// as the title if we are a root
			if ( tti < 0 ) {
				tti = n;
			}
		}

		// point to words class of the body that was passed in to us
		cptrs[n] = words;
		as[n] = start;
		bs[n] = i;
		if ( tid == TAG_B ) {
			types[n] = TT_BOLDTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_H1 ) {
			types[n] = TT_HTAG;
			scores[n] = 1.8;
		} else if ( tid == TAG_H2 ) {
			types[n] = TT_HTAG;
			scores[n] = 1.7;
		} else if ( tid == TAG_H3 ) {
			types[n] = TT_HTAG;
			scores[n] = 1.6;
		} else if ( tid == TAG_TITLE ) {
			types[n] = TT_TITLETAG;
			scores[n] = 3.0;
		} else if ( tid == TAG_DIV ) {
			types[n] = TT_DIVTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_TD ) {
			types[n] = TT_TDTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_P ) {
			types[n] = TT_PTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_FONT ) {
			types[n] = TT_FONTTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_A ) {
			types[n] = TT_ATAG;
			// . self link is very powerful BUT
			//   http://www.npr.org/templates/story/story.php?storyId=5417137
			//   doesn't use it right! so use
			//   1.3 instead of 3.0. that has an "onClick" thing in the
			//   <a> tag, so check for that!
			// this was bad for
			// http://www.spiritualwoman.net/?cat=191
			// so i am demoting from 3.0 to 1.5
			scores[n] = 1.5;
		}

		// count it
		n++;

		// start loop over at tag #i, for loop does an i++, so negate
		// that so this will work
		i--;

		// break out if too many already. save some for below.
		if ( n + 10 >= MAX_TIT_CANDIDATES ) {
			break;
		}
	}

	//logf(LOG_DEBUG,"title: took3=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	// to handle text documents, throw in the first line of text
	// as a title candidate, just make the score really low
	bool textDoc = (contentType == CT_UNKNOWN || contentType == CT_TEXT);

	if (textDoc) {
		// make "i" point to first alphabetical word in the document
		int32_t i ;

		for ( i = 0 ; i < NW && !words->isAlpha(i) ; i++);

		// if we got a first alphabetical word, then assume that to be the start of our title
		if ( i < NW && n < MAX_TIT_CANDIDATES ) {
			// first word in title is "t0"
			int32_t t0 = i;
			// find end of first line
			int32_t numWords = 0;

			// set i to the end now. we MUST find a \n to terminate the
			// title, otherwise we will not have a valid title
			while (i < NW && numWords < maxTitleWords && (words->isAlnum(i) || !words->hasChar(i, '\n'))) {
				if(words->isAlnum(i)) {
					numWords++;
				}

				++i;
			}

			// "t1" is the end
			int32_t t1 = -1;

			// we must have found our \n in order to set "t1"
			if (i <= NW && numWords < maxTitleWords ) {
				t1 = i;
			}

			// set the ptrs
			cptrs   [n] =  words;

			// this is the last resort i guess...
			scores  [n] =  0.5;
			types   [n] =  TT_FIRSTLINE;
			as      [n] =  t0;
			bs      [n] =  t1;

			// add it as a candidate if t0 and t1 were valid
			if (t0 >= 0 && t1 > t0) {
				n++;
			}
		}
	}

	//logf(LOG_DEBUG,"title: took4=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	{
		// now add the last url path to contain underscores or hyphens
		char *pstart = firstUrl->getPath();

		// get first url
		Url *fu = firstUrl;

		// start at the end
		char *p = fu->getUrl() + fu->getUrlLen();

		// end pointer
		char *pend = NULL;

		// come up here for each path component
		while ( p >= pstart ) {
			// save end
			pend = p;

			// skip over /
			if ( *p == '/' ) {
				p--;
			}

			// now go back to next /
			int32_t count = 0;
			for ( ; p >= pstart && *p !='/' ; p-- ) {
				if ( *p == '_' || *p == '-' ) {
					count++;
				}
			}

			// did we get it?
			if ( count > 0 ) {
				break;
			}
		}

		// did we get any?
		if ( p > pstart && n < MAX_TIT_CANDIDATES ) {
			// now set words to that
			if ( ! tw[ti].set ( p, (pend - p), true, 0 )) {
				return false;
			}

			// point to that
			cptrs   [n] = &tw[ti];
			as      [n] = 0;
			bs      [n] = tw[ti].getNumWords();
			scores  [n] = 1.0;
			types   [n] = TT_URLPATH;

			// increment since we are using it
			ti++;

			// advance
			n++;
		}
	}

	// save old n
	int32_t oldn = n;

	// . do not split titles if we are a root url maps.yahoo.com was getting "Maps" for the title
	if ( firstUrl->isRoot() ) {
		oldn = -2;
	}

	// point to list of \0 separated titles
	const char *rootTitleBuf    = NULL;
	const char *rootTitleBufEnd = NULL;

	// get the root title if we are not root!
	if (filteredRootTitleBuf) {
#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(filteredRootTitleBuf,filteredRootTitleBufSize);
#endif
		// point to list of \0 separated titles
		rootTitleBuf    = filteredRootTitleBuf;
		rootTitleBufEnd =  filteredRootTitleBuf + filteredRootTitleBufSize;
	}

	{
		Matches m;
		if ( rootTitleBuf && query ) {
			m.setQuery ( query );
		}

		// convert into an array
		int32_t nr = 0;
		const char *pr = rootTitleBuf;
		const char *rootTitles[20];
		int32_t  rootTitleLens[20];

		// loop over each root title segment
		for ( ; pr && pr < rootTitleBufEnd ; pr += strnlen(pr,rootTitleBufEnd-pr) + 1 ) {
			// if we had a query...
			if ( query ) {
				// reset it
				m.reset();

				// see if root title segment has query terms in it
				m.addMatches ( const_cast<char*>(pr), strnlen(pr,rootTitleBufEnd-pr), MF_TITLEGEN, m_niceness );

				// if matches query, do NOT add it, we only add it for
				// removing from the title of the page...
				if ( m.getNumMatches() ) {
					continue;
				}
			}
			// point to it. it should start with an alnum already
			// since it is the "filtered" list of root titles...
			// if not, fix it in xmldoc then.
			rootTitles   [nr] = pr;
			rootTitleLens[nr] = gbstrlen(pr);
			// advance
			nr++;
			// no breaching
			if ( nr >= 20 ) break;
		}

		// now split up candidates in children candidates by tokenizing
		// using :, | and - as delimters.
		// the hyphen must have a space on at least one side, so "cd-rom" does
		// not create a pair of tokens...
		// FIX: for the title:
		// Best Careers 2009: Librarian - US News and World Report
		// we need to recognize "Best Careers 2009: Librarian" as a subtitle
		// otherwise we don't get it as the title. so my question is are we
		// going to have to do all the permutations at some point? for now
		// let's just add in pairs...
		for ( int32_t i = 0 ; i < oldn && n + 3 < MAX_TIT_CANDIDATES ; i++ ) {
			// stop if no root title segments
			if ( nr <= 0 ) break;
			// get the word info
			Words *w = cptrs[i];
			int32_t   a = as[i];
			int32_t   b = bs[i];
			// init
			int32_t lasta = a;
			char prev  = false;
			// char length in bytes
			//int32_t charlen = 1;
			// see how many we add
			int32_t added = 0;
			char *skipTo = NULL;
			bool qualified = true;
			// . scan the words looking for a token
			// . sometimes the candidates end in ": " so put in "k < b-1"
			// . made this from k<b-1 to k<b to fix
			//   "Hot Tub Time Machine (2010) - IMDb" to strip IMDb
			for ( int32_t k = a ; k < b && n + 3 < MAX_TIT_CANDIDATES; k++){
				// get word
				char *wp = w->getWord(k);
				// skip if not alnum
				if ( ! w->isAlnum(k) ) {
					// in order for next alnum word to
					// qualify for "clipping" if it matches
					// the root title, there has to be more
					// than just spaces here, some punct.
					// otherwise title
					// "T. D. Jakes: Biography from Answers.com"
					// becomes
					// "T. D. Jakes: Biography from"
					qualified=isWordQualified(wp,w->getWordLen(k));
					continue;
				}
				// gotta be qualified!
				if ( ! qualified ) continue;
				// skip if in root title
				if ( skipTo && wp < skipTo ) continue;
				// does this match any root page title segments?
				int32_t j;
				for ( j = 0 ; j < nr ; j++ ) {
					// . compare to root title
					// . break out if we matched!
					if ( ! strncmp( wp, rootTitles[j], rootTitleLens[j] ) ) {
						break;
					}
				}

				// if we did not match a root title segment,
				// keep on chugging
				if ( j >= nr ) continue;
				// . we got a root title match!
				// . skip over
				skipTo = wp + rootTitleLens[j];
				// must land on qualified punct then!!
				int32_t e = k+1;
				for ( ; e<b && w->getWord(e)<skipTo ; e++ );
				// ok, word #e must be a qualified punct
				if ( e<b &&
				     ! isWordQualified(w->getWord(e),w->getWordLen(e)))
					// assume no match then!!
					continue;
				// if we had a previous guy, reset the end of the
				// previous candidate
				if ( prev ) {
					bs[n-2] = k;
					bs[n-1] = k;
				}
				// . ok, we got two more candidates
				// . well, only one more if this is not the 1st time
				if ( ! prev ) {
					cptrs   [n] = cptrs   [i];
					scores  [n] = scores  [i];
					types   [n] = types   [i];
					as      [n] = lasta;
					bs      [n] = k;
					parent  [n] = i;
					n++;
					added++;
				}
				// the 2nd one
				cptrs   [n] = cptrs   [i];
				scores  [n] = scores  [i];
				types   [n] = types   [i];
				as      [n] = e + 1;
				bs      [n] = bs      [i];
				parent  [n] = i;
				n++;
				added++;

				// now add in the last pair as a whole token
				cptrs   [n] = cptrs   [i];
				scores  [n] = scores  [i];
				types   [n] = types   [i];
				as      [n] = lasta;
				bs      [n] = bs      [i];
				parent  [n] = i;
				n++;
				added++;

				// nuke the current candidate then since it got
				// split up to not contain the root title...
				//cptrs[i] = NULL;

				// update this
				lasta = k+1;

				// if we encounter another delimeter we will have to revise bs[n-1], so note that
				prev = true;
			}

			// nuke the current candidate then since it got
			// split up to not contain the root title...
			if ( added ) {
				scores[i] = 0.001;
				//cptrs[i] = NULL;
			}

			// erase the pair if that there was only one token
			if ( added == 3 ) n--;
		}
	}

	for ( int32_t i = 0 ; i < n ; i++ ) baseScore[i] = scores[i];
	
	//
	// . now punish by 0.85 for every lower case non-stop word it has
	// . reward by 1.1 if has a non-stopword in the query
	//
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// point to the words
		Words *w = cptrs[i];

		// skip if got nuked above
		if ( ! w ) {
			continue;
		}

		// the word ptrs
		char **wptrs = w->getWordPtrs();

		// skip if empty
		if ( w->getNumWords() <= 0 ) {
			continue;
		}

		// get the word boundaries
		int32_t a = as[i];
		int32_t b = bs[i];

		// record the boosts
		float ncb = 1.0;
		float qtb = 1.0;

		// a flag
		char uncapped = false;

		// scan the words in this title candidate
		for ( int32_t j = a ; j < b ; j++ ) {
			// skip stop words
			if ( w->isQueryStopWord( j, langId ) ) {
				continue;
			}

			// punish if uncapitalized non-stopword
			if ( ! w->isCapitalized(j) ) {
				uncapped = true;
			}

			// skip if no query
			if ( ! query ) {
				continue;
			}

			int64_t wid = w->getWordId(j);

			// reward if in the query
			if ( query->getWordNum(wid) >= 0 ) {
				qtb       *= 1.5;
				scores[i] *= 1.5;
			}
		}

		// . only punish once if missing a capitalized word hurts us for:
		//   http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
		if ( uncapped ) {
			ncb *= 1.00;
			scores[i] *= 1.00;
		}

		// punish if a http:// title thingy
		char *s = wptrs[a];
		int32_t size = w->getStringSize(a,b);
		if ( size > 9 && memcmp("http://", s, 7) == 0 ) {
			ncb *= .10;
		}
		if ( size > 14 && memcmp("h\0t\0t\0p\0:\0/\0/", s, 14) == 0 ) {
			ncb *= .10;
		}

		// set these guys
		scores[i] *= ncb;

		noCapsBoost[i]  = ncb;
		qtermsBoost[i]  = qtb;
	}

	// . now compare each candidate to the other candidates
	// . give a boost if matches
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// point to the words
		Words *w1 = cptrs[i];

		// skip if got nuked above
		if ( ! w1 ) {
			continue;
		}

		int32_t a1 = as[i];
		int32_t b1 = bs[i];

		// reset some flags
		char localFlag1 = 0;
		char localFlag2 = 0;

		// record the boost
		float iccb = 1.0;

		// total boost
		float total = 1.0;

		// to each other candidate
		for ( int32_t j = 0 ; j < n ; j++ ) {
			// not to ourselves
			if ( j == i ) {
				continue;
			}

			// or our derivatives
			if ( parent[j] == i ) {
				continue;
			}

			// or derivates to their parent
			if ( parent[i] == j ) {
				continue;
			}

			// only check parents now. do not check kids.
			// this was only for when doing percent contained
			// not getSimilarity() per se
			//if ( parent[j] != -1 ) continue;

			// TODO: do not accumulate boosts from a parent
			// and its kids, subtitles...
			//
			// do not compare type X to type Y
			if ( types[i] == TT_TITLETAG ) {
				if ( types[j] == TT_TITLETAG ) {
					continue;
				}
			}

			// do not compare a div candidate to another div cand
			// http://friendfeed.com/foxiewire?start=30
			// likewise, a TD to another TD
			// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/match/351681.html
			// ... etc.
			if ( types[i] == TT_BOLDTAG ||
			     types[i] == TT_HTAG    ||
			     types[i] == TT_DIVTAG  ||
			     types[i] == TT_TDTAG   ||
			     types[i] == TT_FONTTAG    ) {
				if ( types[j] == types[i] ) continue;
			}
			// . do not compare one kid to another kid
			// . i.e. if we got "x | y" as a title and "x | z"
			//   as a link text, it will emphasize "x" too much
			//   http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
			if ( parent[j] != -1 && parent[i] != -1 ) continue;

			// . body type tags are mostly mutually exclusive
			// . for the legacy.com url mentioned below, we have
			//   good stuff in <td> tags, so this hurts us...
			// . but for the sake of 
			//   http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/
			//   i put bold tags back

			if ( types[i] == TT_LINKTEXTLOCAL ) {
				if ( types[j] == TT_LINKTEXTLOCAL ) continue;
			}
			if ( types[i] == TT_RSSITEMLOCAL ) {
				if ( types[j] == TT_RSSITEMLOCAL ) continue;
			}

			// only compare to one local link text for each i
			if ( types[j] == TT_LINKTEXTLOCAL && localFlag1 ) {
				continue;
			}
			if ( types[j] == TT_RSSITEMLOCAL  && localFlag2 ) {
				continue;
			}
			if ( types[j] == TT_LINKTEXTLOCAL ) {
				localFlag1 = 1;
			}
			if ( types[j] == TT_RSSITEMLOCAL  ) {
				localFlag2 = 1;
			}

			// not link title attr to link title attr either
			// fixes http://www.spiritualwoman.net/?cat=191
			if ( types[i] == TT_TITLEATT &&
			     types[j] == TT_TITLEATT )
				continue;

			// get our words
			Words *w2 = cptrs[j];

			// skip if got nuked above
			if ( ! w2 ) continue;
			int32_t   a2 = as   [j];
			int32_t   b2 = bs   [j];

			// how similar is title #i to title #j ?
			float fp = getSimilarity ( w2 , a2 , b2 , w1 , a1 , b1 );

			// error?
			if ( fp == -1.0 ) return false;

			// custom boosting...
			float boost = 1.0;
			if      ( fp >= .95 ) boost = 3.0;
			else if ( fp >= .90 ) boost = 2.0;
			else if ( fp >= .85 ) boost = 1.5;
			else if ( fp >= .80 ) boost = 1.4;
			else if ( fp >= .75 ) boost = 1.3;
			else if ( fp >= .70 ) boost = 1.2;
			else if ( fp >= .60 ) boost = 1.1;
			else if ( fp >= .50 ) boost = 1.08;
			else if ( fp >= .40 ) boost = 1.04;

			// limit total
			total *= boost;
			if ( total > 100.0 ) break;
			// if you are matching the url path, that is pretty 
			// good so give more!
			// actually, that would hurt:
			// http://michellemalkin.com/2008/12/29/gag-worthy/

			// custom boosting!
			if ( fp > 0.0 && g_conf.m_logDebugTitle )
				logf(LOG_DEBUG,"title: i=%" PRId32" j=%" PRId32" fp=%.02f "
				     "b=%.02f", i,j,fp,boost);
			// apply it
			scores[i] *= boost;

			iccb      *= boost;
		}

		inCommonCandBoost[i] = iccb;
	}

	//logf(LOG_DEBUG,"title: took7=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();


	// loop over all n candidates
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// skip if not in the document body
		if ( cptrs[i] != words ) continue;
		// point to the words
		int32_t       a1    = as   [i];
		int32_t       b1    = bs   [i];

		// . loop through this candidates words
		// . TODO: use memset here?
		for ( int32_t j = a1 ; j <= b1 && j < NW ; j++ ) {
			// flag it
			flags[j] |= 0x01;
		}
	}

	// free our stuff
	if ( flags!=localBuf ) {
		mfree (flags, need, "TITLEflags");
	}

	// now get the highest scoring candidate title
	float max    = -1.0;
	int32_t  winner = -1;
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// skip if got nuked
		if ( ! cptrs[i] ) {
			continue;
		}

		if ( winner != -1 && scores[i] <= max ) {
			continue;
		}

		// url path's cannot be titles in and of themselves
		if ( types[i] == TT_URLPATH ) {
			continue;
		}

		// skip if empty basically, like if title was exact
		// copy of root, then the whole thing got nuked and
		// some empty string added, where a > b
		if ( as[i] >= bs[i] ) {
			continue;
		}

		// got one
		max = scores[i];

		// save it
		winner = i;
	}

	// if we are a root, always pick the title tag as the title
	if ( oldn == -2 && tti >= 0 ) {
		winner = tti;
	}

	// if no winner, all done. no title
	if ( winner == -1 ) {
		// last resort use file name
		if ((contentType == CT_PDF) && (firstUrl->getFilenameLen() != 0)) {
			Words w;
			w.set(firstUrl->getFilename(), firstUrl->getFilenameLen(), true);
			if (!copyTitle(&w, 0, w.getNumWords())) {
				return false;
			}
		}
		return true;
	}

	// point to the words class of the winner
	Words *w = cptrs[winner];
	// skip if got nuked above
	if ( ! w ) { char *xx=NULL;*xx=0; }

	// need to make our own Pos class if title not from body
	Pos  tp;
	if ( w != words ) {
		// set "Scores" ptr to NULL. we assume all are positive scores
		if ( ! tp.set ( w ) ) {
			return false;
		}
	}

	// the string ranges from word #a up to and including word #b
	int32_t a = as[winner];
	int32_t b = bs[winner];
	// sanity check
	if ( a < 0 || b > w->getNumWords() ) { char*xx=NULL;*xx=0; }

	// save the title
	if ( ! copyTitle(w, a, b) ) {
		return false;
	}

	/*
	// debug logging
	SafeBuf sb;
	SafeBuf *pbuf = &sb;

	log("title: candidates for %s",xd->getFirstUrl()->getUrl() );

	pbuf->safePrintf("<div stype=\"border:1px solid black\">");
	pbuf->safePrintf("<b>***Finding Title***</b><br>\n");

	pbuf->safePrintf("<table cellpadding=5 border=2><tr>"
			 "<td colspan=20><center><b>Title Generation</b>"
			 "</center></td>"
			 "</tr>\n<tr>"
			 "<td>#</td>"
			 "<td>type</td>"
			 "<td>parent</td>"
			 "<td>base score</td>"
			 "<td>format penalty</td>"
			 "<td>query term boost</td>"
			 "<td>candidate intersection boost</td>"
			 "<td>FINAL SCORE</td>"
			 "<td>title</td>"
			 "</tr>\n" );
			 

	// print out all candidates
	for ( int32_t i = 0 ; i < n ; i++ ) {
		char *ts = "unknown";
		if ( types[i] == TT_LINKTEXTLOCAL  ) ts = "local inlink text";
		if ( types[i] == TT_LINKTEXTREMOTE ) ts = "remote inlink text";
		if ( types[i] == TT_RSSITEMLOCAL   ) ts = "local rss title";
		if ( types[i] == TT_RSSITEMREMOTE  ) ts = "remote rss title";
		if ( types[i] == TT_BOLDTAG        ) ts = "bold tag";
		if ( types[i] == TT_HTAG           ) ts = "header tag";
		if ( types[i] == TT_TITLETAG       ) ts = "title tag";
		if ( types[i] == TT_FIRSTLINE      ) ts = "first line in text";
		if ( types[i] == TT_FONTTAG        ) ts = "font tag";
		if ( types[i] == TT_ATAG           ) ts = "anchor tag";
		if ( types[i] == TT_DIVTAG         ) ts = "div tag";
		if ( types[i] == TT_TDTAG          ) ts = "td tag";
		if ( types[i] == TT_PTAG           ) ts = "p tag";
		if ( types[i] == TT_URLPATH        ) ts = "url path";
		if ( types[i] == TT_TITLEATT       ) ts = "title attribute";
		// get the title
		pbuf->safePrintf(
				 "<tr>"
				 "<td>#%" PRId32"</td>"
				 "<td><nobr>%s</nobr></td>"
				 "<td>%" PRId32"</td>"
				 "<td>%0.2f</td>" // baseScore
				 "<td>%0.2f</td>"
				 "<td>%0.2f</td>"
				 "<td>%0.2f</td>"
				 "<td>%0.2f</td>"
				 "<td>",
				 i,
				 ts ,
				 parent[i],
				 baseScore[i],
				 noCapsBoost[i],
				 qtermsBoost[i],
				 inCommonCandBoost[i],
				 scores[i]);
		// ptrs
		Words *w = cptrs[i];
		int32_t   a = as[i];
		int32_t   b = bs[i];
		// skip if no words
		if ( w->getNumWords() <= 0 ) continue;
		// the word ptrs
		char **wptrs = w->getWordPtrs();
		// string ptrs
		char *ptr  = wptrs[a];//w->getWord(a);
		int32_t  size = w->getStringSize(a,b);
		// it is utf8
		pbuf->safeMemcpy ( ptr , size );
		// end the line
		pbuf->safePrintf("</td></tr>\n");
	}

	pbuf->safePrintf("</table>\n<br>\n");

	// log these for now
	log("title: %s",sb.getBufStart());
	*/

	return true;

}
void HttpRequest::parseFieldsMultipart ( char *s , int32_t slen ) {

	// should be NULL terminated since we replaced &'s w/ 0's in set()
	char *send   = s + slen ;
	// reset field count
	int32_t n = m_numFields;

 loop:
	// watch out for overflow
	if ( n >= MAX_CGI_PARMS ) {
		log("http: Received more than %" PRId32" CGI parms. "
		    "Truncating.",(int32_t)MAX_CGI_PARMS);
		return;
	}
	
	s = strncasestr ( s , "\r\nContent-Disposition:", send - s );
	if ( ! s ) return;

	// get the line end
	s += 2;
	char *lineEnd = strstr ( s , "\r\n" );
	if ( ! lineEnd ) return;

	// get the name
	char *name = strncasestr ( s , "name=\"" , lineEnd - s );
	if ( ! name ) goto loop;

	// point to name
	s = name + 6;
	// set the nth field name in this cgi string
	m_fields[n] = s;

	// point to = sign, use this for multiparts though
	char *equal = strstr ( s , "\"\r\n\r\n" );
	// for uploading a file it looks like
	// Content-Disposition: form-data; name=\"file\"; filename=\"poo.txt\"\r\nContent-Type: text/plain\r\n\r\nsomething here\n=====\nagain we do it...
	char *equal2 = strstr ( s , "\"" );

	// so if we had that then we had an uploaded file
	bool uploadedFile = false;
	if ( equal2 && equal && equal2 <  equal ) {
		uploadedFile = true;
		equal = equal2;
	}
	// try next field if none here
	if ( ! equal ) goto loop;
	// set field len
	m_fieldLens [ n ] = equal - s;
	// point to field value
	s = equal + 5;
	// unless we had an uploaded file, then skip more
	if ( uploadedFile ) {
		char *fileStart = strstr(equal,"\r\n\r\n");
		if ( fileStart ) fileStart += 4;
		s = fileStart;
	}
	// set = to \0 so getField() returns NULL terminated field name
	*equal = '\0';
	// set value (may be \0)
	m_fieldValues [ n ] = s;
	// force to \0 at end
	char *vend = strstr ( s , "\r\n----------"); // 29 -'s then a #
	if ( ! vend ) return;
	// null terminate the value as well
	*vend = '\0';
	// count the number of field/value pairs we get
	n++;
	// remember it
	m_numFields = n;
	// point to next field
	goto loop;

}
Пример #18
0
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
			     Sections *sections , XmlDoc *xd ) {
	// not valid for now
	m_thumbnailValid = false;
	// reset our array of image node candidates
	m_numImages = 0;
	// flag it
	m_setCalled = true;
	// strange...
	if ( m_imgReply ) { char *xx=NULL;*xx=0; }
	// save this
	m_xml       = xml;
	m_pageUrl   = pageUrl;

	// if we are a diffbot json reply, trust that diffbot got the
	// best candidate, and just use that
	if ( xd->m_isDiffbotJSONObject ) return;

	//m_pageSite  = pageSite;
	// scan the words
	long       nw     = words->getNumWords();
	nodeid_t  *tids   = words->getTagIds();
	long long *wids   = words->getWordIds();
	//long      *scores = scoresArg->m_scores;
	Section **sp = NULL; 
	if ( sections ) sp = sections->m_sectionPtrs;
	// not if we don't have any identified sections
	if ( sections && sections->m_numSections <= 0 ) sp = NULL;
	// the positive scored window
	long firstPosScore = -1;
	long lastPosScore  = -1;
	long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE;
	// find positive scoring window
	for ( long i = 0 ; i < nw ; i++ ) {
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		if ( wids[i]   != 0 ) continue;
		// set first positive scoring guy
		if ( firstPosScore == -1 ) firstPosScore = i;
		// keep track of last guy
		lastPosScore = i;
	}
	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
	// . pedal firstPosScore back until we hit a section boundary
	// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
	char tc[512];
	memset ( tc , 0 , 512 );
	long a = firstPosScore;
	for ( ; a >= 0 ; a-- ) {
		// get the tid
		nodeid_t tid = tids[a];
		// remove back bit, if any
		tid &= BACKBITCOMP;
		// skip if not a tag, or a generic xml tag
		if ( tid <= 1 ) continue;
		// mark it
		if ( words->isBackTag(a) ) tc[tid] |= 0x02;
		else                       tc[tid] |= 0x01;
		// continue if not a full front/back pair
		if ( tc[tid] != 0x03 ) continue;
		// continue if not a "section" type tag (see Scores.cpp)
		if ( tid != TAG_DIV      &&
		     tid != TAG_TEXTAREA &&
                     tid != TAG_TR       &&
                     tid != TAG_TD       &&
                     tid != TAG_TABLE      ) 
			continue;
		// ok we should stop now
		break;
	}		
	// min is 0
	if ( a < 0 ) a = 0;

	// now look for the image urls within this window
	for ( long i = a ; i < lastPosScore ; i++ ) {
		// skip if not <img> tag
		if (tids[i] != TAG_IMG ) continue;
		// get the node num into Xml.cpp::m_nodes[] array
		long nn = words->m_nodes[i];
		// check width to rule out small decorating imgs
		long width = xml->getLong(nn,nn+1,"width", -1 );
		if ( width != -1 && width < 50 ) continue;
		// same with height
		long height = xml->getLong(nn,nn+1, "height", -1 );
		if ( height != -1 && height < 50 ) continue;
		// get the url of the image
		long  srcLen;
		char *src = xml->getString(nn,"src",&srcLen);
		// skip if none
		if ( srcLen <= 2 ) continue;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set ( pageUrl , src , srcLen ); 
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) continue;
		// skip if not from same domain as page url
		//long dlen = pageUrl->getDomainLen();
		//if ( iu.getDomainLen() != dlen ) continue;
		//if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue
		// get the full url
		char *u    = iu.getUrl();
		long  ulen = iu.getUrlLen();
		// skip common crap
		if ( strncasestr(u,ulen,"logo"           ) ) continue;
		if ( strncasestr(u,ulen,"comment"        ) ) continue;
		if ( strncasestr(u,ulen,"print"          ) ) continue;
		if ( strncasestr(u,ulen,"subscribe"      ) ) continue;
		if ( strncasestr(u,ulen,"header"         ) ) continue;
		if ( strncasestr(u,ulen,"footer"         ) ) continue;
		if ( strncasestr(u,ulen,"menu"           ) ) continue;
		if ( strncasestr(u,ulen,"button"         ) ) continue;
		if ( strncasestr(u,ulen,"banner"         ) ) continue;
		if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
		if ( strncasestr(u,ulen,"ads.webfeat."   ) ) continue;
		if ( strncasestr(u,ulen,"xads.zedo."     ) ) continue;

		// save it
		m_imageNodes[m_numImages] = nn;

		// before we lookup the image url to see if it is unique we
		// must first make sure that we have an adequate number of
		// permalinks from this same site with this same hop count.
		// we need at least 10 before we extract image thumbnails.
		char buf[2000];
		// set the query
		Query q;

		// if we do have 10 or more, then we lookup the image url to
		// make sure it is indeed unique
		sprintf ( buf , "gbimage:%s",u);
		// TODO: make sure this is a no-split termid storage thingy
		// in Msg14.cpp
		if ( ! q.set2 ( buf , langUnknown , false ) )
			// return true with g_errno set on error
			return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);

		// advance the counter
		m_numImages++;

		// break if full
		if ( m_numImages >= MAX_IMAGES ) break;
	}
}
Пример #19
0
static bool atom_field_verify(struct atom * atom, FSAF * fp,
                              size_t start, size_t end)
{
	size_t len = end - start;
	struct fsaf_read_rv r = fsaf_read(fp, start, len);
	assert(r.len == len);
	switch (atom->mode) {
	case M_EXACT:
		if (len != atom->patlen) return false;
		if (atom->ignore_case) {
			return strncasecmp(atom->pat, r.b, len) == 0;
		} else {
			return strncmp(atom->pat, r.b, len) == 0;
		}
	case M_SUBSTR: {
#if 0
		if (atom->ignore_case) {
			return strncasestr(r.b, atom->pat, len);
		} else {
			return strnstr(r.b, atom->pat, len);
		}
#else
		bool rv;
		char * s = strndup(r.b, len);
		if (s == 0) fatal_enomem(0);
		if (atom->ignore_case) {
			rv = strcasestr(s, atom->pat) != 0;
		} else {
			rv = strstr(s, atom->pat) != 0;
		}
		free(s);
		return rv;
#endif
	}
	case M_REGEX: case M_EREGEX: {
		char * s = strndup(r.b, len);
		if (s == 0) fatal_enomem(0);
		int regex_errcode = regexec(&atom->regex, s, 0, 0, 0);
		free(s);
		if (regex_errcode == 0 || regex_errcode == REG_NOMATCH) {
			return (regex_errcode == 0);
		}
		/* Error handling be here. */
		assert(regex_errcode != 0 && regex_errcode != REG_NOMATCH);
		s = get_regerror (regex_errcode, &atom->regex);
		if (s == 0) { enomem (0); return false; }
		message(L_IMPORTANT, 0, "%s", s);
		free(s);
		return false;
	}
	case M_VER_EQ:case M_VER_LT:case M_VER_LE:case M_VER_GT:case M_VER_GE:
		;
		char *pats = strndup(atom->pat, atom->patlen);
		char *cands = strndup(r.b, len);
		struct versionrevision pat, cand;
		if (!parse_version(&pat, pats, atom->patlen)) {
			free(pats);
			free(cands);
			return false;
		}
		if (!parse_version(&cand, cands, len)) {
			free(pats);
			free(cands);
			return false;
		}
		int res = versioncompare(&cand, &pat);
		free(pats);
		free(cands);
		switch (atom->mode) {
		case M_VER_EQ:
			return res == 0;
		case M_VER_LT:
			return res < 0;
		case M_VER_LE:
			return res <= 0;
		case M_VER_GT:
			return res > 0;
		case M_VER_GE:
			return res >= 0;
		default:
			assert(0);
		}
	}
	assert(0);
}