// . parse the CGI name/value pairs out of a multipart/form-data body
// . "s" must be NUL terminated; the buffer is modified IN PLACE (closing
//   quotes and part boundaries are overwritten with '\0' so m_fields[] /
//   m_fieldValues[] can point straight into it)
// . appends to m_fields[]/m_fieldLens[]/m_fieldValues[] starting at the
//   current m_numFields, capped at MAX_CGI_PARMS
void HttpRequest::parseFieldsMultipart ( char *s , long slen ) {
	// should be NULL terminated since we replaced &'s w/ 0's in set()
	char *send = s + slen ;
	// reset field count
	long n = m_numFields;
 loop:
	// watch out for overflow
	if ( n >= MAX_CGI_PARMS ) {
		log("http: Received more than %li CGI parms. "
		    "Truncating.",(long)MAX_CGI_PARMS);
		return;
	}
	// advance to the next part header (case-insensitive, bounded scan)
	s = strncasestr ( s , "\r\nContent-Disposition:", send - s );
	if ( ! s ) return;
	// skip the "\r\n" we matched so s sits on the header itself
	s += 2;
	// get the line end
	char *lineEnd = strstr ( s , "\r\n" );
	if ( ! lineEnd ) return;
	// get the name="..." attribute, searching this header line only
	char *name = strncasestr ( s , "name=\"" , lineEnd - s );
	if ( ! name ) goto loop;
	// point past name=" to the field name itself
	s = name + 6;
	// set the nth field name in this cgi string
	m_fields[n] = s;
	// the closing quote is followed by the blank line ("\r\n\r\n") that
	// terminates the part headers; the value starts right after it
	char *equal = strstr ( s , "\"\r\n\r\n" );
	// try next field if none here
	if ( ! equal ) goto loop;
	// set field len
	m_fieldLens [ n ] = equal - s;
	// set = to \0 so getField() returns NULL terminated field name
	*equal = '\0';
	// point to field value
	s = equal + 5;
	// set value (may be \0)
	m_fieldValues [ n ] = s;
	// the value runs up to the next part boundary; force to \0 at end
	char *vend = strstr ( s , "\r\n----------"); // 29 -'s then a #
	// NOTE(review): a final part with no trailing boundary is dropped here
	if ( ! vend ) return;
	// null terminate the value as well
	*vend = '\0';
	// count the number of field/value pairs we get
	n++;
	// remember it
	m_numFields = n;
	// point to next field
	goto loop;
}
/*
 * Determines the location at which we want to inject our JavaScript.
 * Returns a pointer into "data" at the chosen insertion point, NULL when
 * injection must be suppressed (XML documents), or the start of the
 * buffer when no known marker is present.
 */
static char *
find_pointer(char *data, size_t data_len)
{
	/* marker table: inject after the tag (+1), before it (-1),
	 * or not at all (0) */
	static const struct {
		char *tag;
		int   mode;
	} markers[] = {
		{ "<head>",  1 },
		{ "<body",  -1 },
		{ "<html>",  1 },
		{ "<?xml",   0 },
	};
	size_t i;

	for (i = 0; i < sizeof(markers) / sizeof(markers[0]); i++) {
		char *hit = strncasestr(data, markers[i].tag, data_len);

		if (hit == NULL)
			continue;
		if (markers[i].mode == 0)
			return (NULL);	/* XML document: do not inject */
		if (markers[i].mode == 1)
			hit += strlen(markers[i].tag);
		return (hit);
	}

	/* no marker found: inject at the very beginning */
	return (data);
}
/* Case-insensitively search "line" for the first occurrence of any of the
 * n_dat strings in dat[], skipping matches that fall inside an existing
 * <a href> anchor or inside a larger word. *bInHREF carries the
 * "currently inside an anchor" state across calls; *i_dat receives the
 * index of the matched dat[] entry (-1 if none). Returns the match
 * position or NULL. */
char *strstr_href(char *line,gmx_bool *bInHREF,int *i_dat,int n_dat,char **dat)
{
  char *start,*found,*href=NULL;
  gmx_bool bIn;
  int i;

  found=NULL;
  *i_dat=-1;
  bIn=*bInHREF;
  start=line;
  do {
    if (bIn) {
      /* inside an anchor: skip forward to the closing </a> */
      while (strlen(start) && (strncasecmp(start,"</a",3) != 0))
        start++;
      if (strlen(start)>0) {
        start+=3;
        bIn=FALSE;
      }
    } else {
      href=strncasestr(start,"<a href");
      if (href)
        bIn=TRUE;
      i=0;
      while((i<n_dat) && !found) {
        found=strncasestr(start,dat[i]);
        if (found) {
          if (href && (found>href))
            /* match lies inside the upcoming anchor: reject it */
            found=NULL;
          else {
            /* reject matches embedded in a larger word */
            if (((found!=start) && isword(found[-1])) ||
                isword(found[strlen(dat[i])]))
              found=NULL;
            else
              *i_dat=i;
          }
        }
        /* FIX: advance unconditionally; the original incremented i only
         * when a candidate match was found, so a dat[i] that never
         * occurs in "start" left i stuck and this loop spun forever */
        i++;
      }
    }
  } while (strlen(start) && !found && href);
  *bInHREF=bIn;

  return found;
}
/*
 * Heuristically decide whether a URL refers to an HTML page.
 * Returns 1 for known dynamic/HTML extensions, 0 for known static asset
 * extensions, and defaults to 1 for anything unrecognized.
 */
static int is_html(const char* url)
{
	/* extensions that positively identify an HTML page */
	static const char *html_ext[] = {
		".php", ".jsp", ".html", ".aspx", ".shtml", ".htm",
	};
	/* extensions of static assets that are definitely not HTML */
	static const char *asset_ext[] = {
		".jpg", ".png", ".gif", ".js", ".css", ".xml", ".swf",
	};
	size_t len = strlen(url);
	size_t i;

	if (!len)
		return 0;

	/* FIX: the original used case-sensitive strnstr() for the HTML
	 * extensions but case-insensitive strncasestr() for the asset
	 * list, so e.g. "page.PHP?img=.jpg" was misclassified as a
	 * static asset; match both lists case-insensitively */
	for (i = 0; i < sizeof(html_ext) / sizeof(html_ext[0]); i++)
		if (strncasestr(url, html_ext[i], len) != NULL)
			return 1;

	for (i = 0; i < sizeof(asset_ext) / sizeof(asset_ext[0]); i++)
		if (strncasestr(url, asset_ext[i], len) != NULL)
			return 0;

	/* unknown extension (or none): assume HTML */
	return 1;
}
// . parse cgi fields contained in s // . s points to the stuff immediately after the ? // . we should have already replaced all &'s in s with /0's // . we also replace the last \r with a \0 void HttpRequest::parseFields ( char *s , int32_t slen ) { // . are we a multipart/form-data? // . many of form tags for event submission forms are this // <form enctype="multipart/form-data" ...> char *cd = strncasestr ( s , "\r\nContent-Disposition:", slen ); if ( cd ) { parseFieldsMultipart ( s , slen ); return; } // should be NULL terminated since we replaced &'s w/ 0's in set() char *send = s + slen ; // reset field count int32_t n = m_numFields; while ( s && s < send ) { // watch out for overflow if ( n >= MAX_CGI_PARMS ) { log("http: Received more than %" PRId32" CGI parms. " "Truncating.",(int32_t)MAX_CGI_PARMS); break; } // set the nth field name in this cgi string m_fields [ n ] = s; // point to = sign char *equal = strchr ( s , '=' ); // try next field if none here if ( ! equal ) { s += strlen ( s ) + 1; continue; } // if no equal sign, maybe it is one of diffbot's valueless // fields, so support that now if ( ! equal ) { // just set value to NULL char *end = strchr(s,'&'); int32_t len = end - s; if ( ! end ) len = strlen(s); m_fieldLens[n] = len; s[len] = '\0'; m_fieldValues[n] = NULL; n++; // skip over the '&' too s += len + 1; continue; } // set field len m_fieldLens [ n ] = equal - s; // set = to \0 so getField() returns NULL terminated field name *equal = '\0'; // set value (may be \0) m_fieldValues [ n ] = equal + 1; // count the number of field/value pairs we get n++; // skip: // point to next field s = equal + 1 + strlen ( equal + 1 ) + 1 ; } m_numFields = n; }
END_TEST

/* strncasestr() must locate a needle regardless of letter case. */
START_TEST (_strncasestr)
{
  char *haystack = "This is a testing string";
  char *hit;

  hit = strncasestr (haystack, "iS A", strlen (haystack));
  ch_assert (hit != NULL);
}
/* Tiny driver: search argv[1] (haystack) for argv[2] (needle) with
 * strncasestr and print the tail of the haystack from the match. */
int main(int argc, char * argv[])
{
	char *hit;

	if(argc!=3) {
		printf("usage: %s haystack find\n", argv[0]);
		return 0;
	}
	hit = strncasestr(argv[1], argv[2], strlen(argv[1]));
	/* FIX: the message printed "Find <haystack> in <needle>" even though
	 * usage declares argv[1] the haystack; also passing a NULL
	 * strncasestr() result to %s is undefined behavior */
	printf("Find %s in %s, result: %s\n",
	       argv[2], argv[1], hit ? hit : "(not found)");
	return 0;
}
// . parse cgi fields contained in s // . s points to the stuff immediately after the ? // . we should have already replaced all &'s in s with /0's // . we also replace the last \r with a \0 void HttpRequest::parseFields ( char *s , long slen ) { // . are we a multipart/form-data? // . many of form tags for event submission forms are this // <form enctype="multipart/form-data" ...> char *cd = strncasestr ( s , "\r\nContent-Disposition:", slen ); if ( cd ) { parseFieldsMultipart ( s , slen ); return; } // should be NULL terminated since we replaced &'s w/ 0's in set() char *send = s + slen ; // reset field count long n = m_numFields; while ( s && s < send ) { // watch out for overflow if ( n >= MAX_CGI_PARMS ) { log("http: Received more than %li CGI parms. " "Truncating.",(long)MAX_CGI_PARMS); break; } // set the nth field name in this cgi string m_fields [ n ] = s; // point to = sign char *equal = strchr ( s , '=' ); // try next field if none here if ( ! equal ) { s += gbstrlen ( s ) + 1; continue; } // set field len m_fieldLens [ n ] = equal - s; // set = to \0 so getField() returns NULL terminated field name *equal = '\0'; // set value (may be \0) m_fieldValues [ n ] = equal + 1; // count the number of field/value pairs we get n++; // skip: // point to next field s = equal + 1 + gbstrlen ( equal + 1 ) + 1 ; } m_numFields = n; }
bool Summary::verifySummary( char *titleBuf, int32_t titleBufLen ) { if ( m_summaryLen > 0 ) { // trim elipsis if ( ( titleBufLen > 4 ) && ( memcmp( (titleBuf + titleBufLen - 4), " ...", 4 ) == 0 ) ) { titleBufLen -= 4; } // verify that it's not the same with title if ( strncasestr( m_summary, titleBuf, m_summaryLen, titleBufLen ) ) { m_summaryLen = 0; m_summary[0] = '\0'; return false; } m_summaryExcerptLen[0] = m_summaryLen; m_numExcerpts = 1; m_displayLen = m_summaryLen; return true; } return false; }
/* Scan code for an assert macro, returning the start of the macro if found or
 * NULL if not. The out param type returns the assertion type, and afterp is
 * set to point immediately after the macro's opening paren.
 * Scans only the current line window [tf->read_pos, tf->next_line_pos);
 * on success tf->read_pos is rewound to the macro start and tf->next_pos
 * points just past the '('. Note strncasestr here takes
 * (haystack, haystack_len, needle, needle_len). */
static char *
find_macro(struct TestFile *tf, enum MacroType *type, int *need_array_it)
{
    size_t len = tf->next_line_pos - tf->read_pos;
    char *assert_pos;

    assert_pos = strncasestr(tf->read_pos, len, "assert_", 7);
    if (assert_pos) {
        /* a NULL need_array_it means assertions are illegal in this
         * context; fail() reports it (NOTE(review): execution appears to
         * continue afterwards -- confirm fail() semantics) */
        if (!need_array_it) {
            fail(tf, assert_pos, "assertions not allowed here");
        }
        /* classify the suffix after "assert_" by longest-prefix match */
        char *s = assert_pos + 7;
        size_t rest_len = tf->next_line_pos - s;
        if (rest_len > 6 && !strncasecmp(s, "array_", 6)) {
            // accept array_
            s += 6;
            rest_len -= 6;
            /* "equal_with" must be tested before plain "equal" */
            if (rest_len > 10 && !strncasecmp(s, "equal_with", 10)) {
                *need_array_it = 1;
                tf->next_pos = s + 10;
                *type = ASSERT_ARRAY_EQUAL_WITH;
            } else if (rest_len > 5 && !strncasecmp(s, "equal", 5)) {
                *need_array_it = 1;
                tf->next_pos = s + 5;
                *type = ASSERT_ARRAY_EQUAL;
            } else {
                assert_pos = NULL; // not an assert macro
            }
        } else if (rest_len > 4 && !strncasecmp(s, "true", 4)) {
            tf->next_pos = s + 4;
            *type = ASSERT_TRUE;
        } else if (rest_len > 5 && !strncasecmp(s, "false", 5)) {
            tf->next_pos = s + 5;
            *type = ASSERT_FALSE;
        } else if (rest_len > 10 && !strncasecmp(s, "equal_with", 10)) {
            tf->next_pos = s + 10;
            *type = ASSERT_EQUAL_WITH;
        } else if (rest_len > 5 && !strncasecmp(s, "equal", 5)) {
            tf->next_pos = s + 5;
            *type = ASSERT_EQUAL;
        } else if (rest_len > 9 && !strncasecmp(s, "not_equal", 9)) {
            tf->next_pos = s + 9;
            *type = ASSERT_NOT_EQUAL;
        } else {
            assert_pos = NULL; // not an assert macro
        }
    } else {
        /* no assert_*: look for a bare "flunk" */
        assert_pos = strncasestr(tf->read_pos, len, "flunk", 5);
        if (assert_pos) {
            tf->next_pos = assert_pos + 5;
            *type = FLUNK;
        } else {
            assert_pos = NULL; // no assertions found
        }
    }
    if (assert_pos) {
        // make sure not commented out: a '!' earlier on the line marks a
        // comment (NOTE(review): presumably Fortran-style source)
        char *s = assert_pos;
        while (--s >= tf->line_pos) {
            if (*s == '!')
                return NULL;
        }
        // find open paren, tolerating blanks/tabs before it
        while (tf->next_pos < tf->next_line_pos) {
            switch (*tf->next_pos) {
            case ' ':
            case '\t':
                tf->next_pos++;
                break;
            case '(':
                tf->next_pos++;
                tf->read_pos = assert_pos;
                return assert_pos;
            default:
                fail(tf, tf->next_pos, "expected '('");
                return NULL;
            }
        }
    }
    return NULL;
}
// . gather candidate thumbnail images for this document into
//   m_imageNodes[]/m_termIds[] (up to MAX_IMAGES)
// . og:image meta tags are taken first, then <img> tags inside the
//   "positive scoring" text window, skipping tiny images and urls that
//   look like chrome (logos, banners, ad servers, ...)
// . each candidate also gets a gbimage: termid so callers can check the
//   image url for uniqueness on disk
// . "xd" is currently unused here
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
			     Sections *sections , XmlDoc *xd ) {
	// not valid for now
	m_thumbnailValid = false;
	// reset our array of image node candidates
	m_numImages = 0;
	// flag it
	m_setCalled = true;
	// strange... (deliberate crash if a reply is already pending)
	if ( m_imgReply ) { char *xx=NULL;*xx=0; }
	// save this
	m_xml = xml;
	m_pageUrl = pageUrl;
	//
	// first add any open graph candidate.
	// basically they page telling us the best image straight up.
	//
	int32_t node2 = -1;
	int32_t startNode = 0;
	// . field can be stuff like "summary","description","keywords",...
	// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
	// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
	// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
 ogimgloop:
	char ubuf[2000];
	int32_t ulen = xml->getMetaContent( ubuf, 1999, "og:image", 8,
					    "property", startNode, &node2 );
	// update this in case goto ogimgloop is called
	startNode = node2 + 1;
	// see section below for explanation of what we are storing here...
	if ( node2 >= 0 ) {
		// save it
		m_imageNodes[m_numImages] = node2;
		Query q;
		// oversized url: try the next og:image tag
		if ( ulen > MAX_URL_LEN ) goto ogimgloop;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, ubuf, ulen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
		// for looking it up on disk to see if unique or not
		char buf[2000];
		// if we don't put in quotes it expands '|' into
		// the "PiiPe" operator in Query.cpp
		snprintf ( buf , 1999, "gbimage:\"%s\"",iu.getUrl());
		// TODO: make sure this is a no-split termid storage thingy
		// in Msg14.cpp
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// sanity test (deliberate crash)
		if ( q.getNumTerms() != 1 ) { char *xx=0;*xx=0; }
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// try to get more graph images if we have some room
		if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
	}
	//m_pageSite = pageSite;
	// scan the words
	int32_t nw = words->getNumWords();
	nodeid_t *tids = words->getTagIds();
	int64_t *wids = words->getWordIds();
	//int32_t *scores = scoresArg->m_scores;
	Section **sp = NULL;
	if ( sections ) sp = sections->m_sectionPtrs;
	// not if we don't have any identified sections
	if ( sections && sections->m_numSections <= 0 ) sp = NULL;
	// the positive scored window: [firstPosScore, lastPosScore] spans
	// the alnum words that are not in script/style/select sections
	int32_t firstPosScore = -1;
	int32_t lastPosScore = -1;
	int32_t badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT;
	// find positive scoring window
	for ( int32_t i = 0 ; i < nw ; i++ ) {
		// skip if in bad section
		if ( sp && (sp[i]->m_flags & badFlags) ) continue;
		if ( wids[i] != 0 ) continue;
		// set first positive scoring guy
		if ( firstPosScore == -1 ) firstPosScore = i;
		// keep track of last guy
		lastPosScore = i;
	}
	// sanity check: tc[] below is indexed by tag id, max 512 entries
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
	// . pedal firstPosScore back until we hit a section boundary
	// . i.e. stop once we hit a front/back tag pair, like <div> and </div>
	char tc[512];
	memset ( tc , 0 , 512 );
	int32_t a = firstPosScore;
	for ( ; a >= 0 ; a-- ) {
		// get the tid
		nodeid_t tid = tids[a];
		// remove back bit, if any
		tid &= BACKBITCOMP;
		// skip if not a tag, or a generic xml tag
		if ( tid <= 1 ) continue;
		// mark it: bit 0x01 = front tag seen, 0x02 = back tag seen
		if ( words->isBackTag(a) ) tc[tid] |= 0x02;
		else tc[tid] |= 0x01;
		// continue if not a full front/back pair
		if ( tc[tid] != 0x03 ) continue;
		// continue if not a "section" type tag (see Scores.cpp)
		if ( tid != TAG_DIV &&
		     tid != TAG_TEXTAREA &&
		     tid != TAG_TR &&
		     tid != TAG_TD &&
		     tid != TAG_TABLE ) continue;
		// ok we should stop now
		break;
	}
	// min is 0
	if ( a < 0 ) a = 0;
	// now look for the image urls within this window
	for ( int32_t i = a ; i < lastPosScore ; i++ ) {
		// skip if not <img> tag
		if (tids[i] != TAG_IMG ) continue;
		// get the node num into Xml.cpp::m_nodes[] array
		int32_t nn = words->getNodes()[i];
		// check width to rule out small decorating imgs
		int32_t width = xml->getLong(nn,nn+1,"width", -1 );
		if ( width != -1 && width < 50 ) continue;
		// same with height
		int32_t height = xml->getLong(nn,nn+1, "height", -1 );
		if ( height != -1 && height < 50 ) continue;
		// get the url of the image
		int32_t srcLen;
		char *src = xml->getString(nn,"src",&srcLen);
		// skip if none
		if ( srcLen <= 2 ) continue;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set( pageUrl, src, srcLen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) continue;
		// skip if not from same domain as page url
		//int32_t dlen = pageUrl->getDomainLen();
		//if ( iu.getDomainLen() != dlen ) continue;
		//if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue
		// get the full url
		char *u = iu.getUrl();
		int32_t ulen = iu.getUrlLen();
		// skip common crap: urls that look like page chrome or ads
		if ( strncasestr(u,ulen,"logo"           ) ) continue;
		if ( strncasestr(u,ulen,"comment"        ) ) continue;
		if ( strncasestr(u,ulen,"print"          ) ) continue;
		if ( strncasestr(u,ulen,"subscribe"      ) ) continue;
		if ( strncasestr(u,ulen,"header"         ) ) continue;
		if ( strncasestr(u,ulen,"footer"         ) ) continue;
		if ( strncasestr(u,ulen,"menu"           ) ) continue;
		if ( strncasestr(u,ulen,"button"         ) ) continue;
		if ( strncasestr(u,ulen,"banner"         ) ) continue;
		if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue;
		if ( strncasestr(u,ulen,"ads.webfeat."   ) ) continue;
		if ( strncasestr(u,ulen,"xads.zedo."     ) ) continue;
		// save it
		m_imageNodes[m_numImages] = nn;
		// before we lookup the image url to see if it is unique we
		// must first make sure that we have an adequate number of
		// permalinks from this same site with this same hop count.
		// we need at least 10 before we extract image thumbnails.
		char buf[2000];
		// set the query
		Query q;
		// if we do have 10 or more, then we lookup the image url to
		// make sure it is indeed unique
		sprintf ( buf , "gbimage:\"%s\"",u);
		// TODO: make sure this is a no-split termid storage thingy
		// in Msg14.cpp
		if ( ! q.set2 ( buf , langUnknown , false ) )
			// return true with g_errno set on error
			return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// break if full
		if ( m_numImages >= MAX_IMAGES ) break;
	}
}
/* station list file format:
[playlist]
numberofentries=1
File1=http://67.159.5.47:8110
Title1=(#1 - 7/500) The Dominican.net Radio
*/
/* Parse a .pls-style station list file into a freshly allocated
 * struct station_list. Returns RT_NULL on allocation/parse failure.
 * NOTE(review): when open() fails the list is still returned with
 * count==0 (and close() is then called on a negative fd) -- callers
 * must tolerate an empty list; confirm whether RT_NULL was intended. */
struct station_list* station_list_create(const char* fn)
{
    int fd;
    rt_uint32_t length, index;
    struct station_list* list;
    char *line, *pos, prefix[8];

    list = (struct station_list*)rt_malloc(sizeof(struct station_list));
    if (list == RT_NULL) goto _return0; /* no memory */
    list->count = 0;
    list->items = RT_NULL;

#define LINE_BUFFER_SIZE 128
    line = rt_malloc(LINE_BUFFER_SIZE);
    if (line == RT_NULL) goto _return1; /* no memory */

    fd = open(fn, O_RDONLY, 0);
    if (fd < 0) goto _return2; /* open file failed */

    /* header line must be "[playlist]" */
    length = read_line(fd, line, LINE_BUFFER_SIZE);
    pos = strncasestr(line, "[playlist]");
    if (pos == RT_NULL)
    {
        station_list_destroy(list);
        list = RT_NULL;
        goto _return2;
    }

    /* second line carries the entry count */
    length = read_line(fd, line, LINE_BUFFER_SIZE);
    pos = strncasestr(line, "numberofentries=");
    if (pos != RT_NULL)
    {
        list->count = (int)strtol(pos + strlen("numberofentries="), RT_NULL, 10);
        if (list->count > 0)
        {
            list->items = (struct station_item*) rt_malloc (sizeof(struct station_item) * list->count);
            rt_memset(list->items, 0, sizeof(struct station_item) * list->count);
        }
    }
    else
    {
        station_list_destroy(list);
        list = RT_NULL;
        goto _return2;
    }
    if (list->items == RT_NULL)
    {
        station_list_destroy(list);
        list = RT_NULL;
        goto _return2;
    }

    /* read FileN=/TitleN= pairs; index only advances on a Title line
     * (NOTE(review): index is unsigned while count is int -- the
     * comparison promotes count; also strncpy() here does not guarantee
     * NUL termination of url/title if the line fills the buffer, and the
     * "+ 1" skips the '=' after the prefix -- confirm field sizes) */
    index = 0;
    while (index < list->count)
    {
        length = read_line(fd, line, LINE_BUFFER_SIZE);
        if (length > 0)
        {
            rt_snprintf(prefix, sizeof(prefix), "File%d", index + 1);
            pos = strncasestr(line, prefix);
            if (pos != RT_NULL)
                strncpy(list->items[index].url, pos + strlen(prefix) + 1, 128);

            rt_snprintf(prefix, sizeof(prefix), "Title%d", index + 1);
            pos = strncasestr(line, prefix);
            if (pos != RT_NULL)
            {
                strncpy(list->items[index].title, pos + strlen(prefix) + 1, 40);
                index ++;
            }
        }
        else break;
    }

_return2:
    close(fd);
_return1:
    rt_free(line);
_return0:
    return list;
}
// . *it is the image type void getImageInfo ( char *buf , long bufSize , long *dx , long *dy , long *it ) { // default to zeroes *dx = 0; *dy = 0; char *strPtr; // get the dimensions of the image if( (strPtr = strncasestr( buf, 20, "Exif" )) ) { log(LOG_DEBUG, "image: Image Link: "); log(LOG_DEBUG, "image: We currently do not handle EXIF image " "types." ); // try the nextone return; } else if( (strPtr = strncasestr( buf, 20, "GIF" )) ) { if ( it ) *it = CT_GIF; log( LOG_DEBUG, "image: GIF INFORMATION:" ); if( bufSize > 9 ) { *dx = ((unsigned long)buf[7]) << 8; *dx += (unsigned char)buf[6]; *dy = ((unsigned long)buf[9]) << 8; *dy += (unsigned char)buf[8]; } } else if( (strPtr = strncasestr( buf, 20, "JFIF" )) ) { if ( it ) *it = CT_JPG; log( LOG_DEBUG, "image: JPEG INFORMATION:" ); long i; for( i = 0; i < bufSize; i++ ) { if( bufSize < i+8 ) break; if( (unsigned char)buf[i] != 0xFF ) continue; if( (unsigned char)buf[i+1] == 0xC0 ){ *dy = ((unsigned long)buf[i+5]) << 8; *dy += (unsigned char)buf[i+6]; *dx = ((unsigned long)buf[i+7]) << 8; *dx += (unsigned char)buf[i+8]; break; } else if( (unsigned char) buf[i+1] == 0xC2 ) { *dy = ((unsigned long)buf[i+5]) << 8; *dy += (unsigned char)buf[i+6]; *dx = ((unsigned long)buf[i+7]) << 8; *dx += (unsigned char)buf[i+8]; break; } } } else if( (strPtr = strncasestr( buf, 20, "PNG" )) ) { if ( it ) *it = CT_PNG; log( LOG_DEBUG, "image: PNG INFORMATION:" ); if( bufSize > 25 ) { *dx=(unsigned long)(*(unsigned long *)&buf[16]); *dy=(unsigned long)(*(unsigned long *)&buf[20]); // these are in network order *dx = ntohl(*dx); *dy = ntohl(*dy); } } else if( (strPtr = strncasestr( buf, 20, "MM" )) ) { if ( it ) *it = CT_TIFF; log( LOG_DEBUG, "image: TIFF INFORMATION:" ); long startCnt = (unsigned long)buf[7]+4; for( long i = startCnt; i < bufSize; i += 12 ) { if( bufSize < i+10 ) break; if( buf[i] != 0x01 ) continue; if( buf[i+1] == 0x01 ) *dy = (unsigned long) (*(unsigned short *)&buf[i+8]); else if( buf[i+1] == 0x00 ) *dx = 
(unsigned long) (*(unsigned short *)&buf[i+8]); } } else if( (strPtr = strncasestr( buf, 20, "II" )) ) { if ( it ) *it = CT_TIFF; log( LOG_DEBUG, "image: TIFF INFORMATION:" ); long startCnt = (unsigned long)buf[7]+4; for( long i = startCnt; i < bufSize; i += 12 ) { if( bufSize < i+10 ) break; if( buf[i] == 0x01 && buf[i+1] == 0x01 ) *dy = (unsigned long) (*(unsigned short *)&buf[i+8]); if( buf[i] == 0x00 && buf[i+1] == 0x01 ) *dx = (unsigned long) (*(unsigned short *)&buf[i+8]); } } else if( (strPtr = strncasestr( buf, 20, "BM" )) ) { if ( it ) *it = CT_BMP; log( LOG_DEBUG, "image: BMP INFORMATION:" ); if( bufSize > 27 ) { *dx=(unsigned long)(*(unsigned long *)&buf[18]); *dy=(unsigned long)(*(unsigned long *)&buf[22]); } } else log( LOG_DEBUG, "image: Image Corrupted? No type found in " "data." ); }
// returns false and sets g_errno on error
// . build the query-highlighted summary for a document by repeatedly
//   picking the best-scoring excerpt "window" around the query-term
//   matches, appending each to m_summary separated by spaces/ellipses
// . fills m_summaryExcerptLen[]/m_numExcerpts/m_displayLen/m_summaryLen;
//   falls back to getDefaultSummary() when nothing usable matched
bool Summary::setSummary ( Xml *xml, Words *words, Sections *sections,
			   Pos *pos, Query *q, int32_t maxSummaryLen,
			   int32_t maxNumLines, int32_t numDisplayLines,
			   int32_t maxNumCharsPerLine, Url *f,
			   Matches *matches, char *titleBuf,
			   int32_t titleBufLen ) {
	m_numDisplayLines = numDisplayLines;
	m_displayLen = 0;

	// assume we got maxnumlines of summary
	if ( (maxNumCharsPerLine + 6) * maxNumLines > maxSummaryLen ) {
		if ( maxNumCharsPerLine < 10 ) {
			maxNumCharsPerLine = 10;
		}

		// warn only once per process
		static char s_flag = 1;
		if ( s_flag ) {
			s_flag = 0;
			log("query: Warning. "
			    "Max summary excerpt length decreased to "
			    "%" PRId32" chars because max summary excerpts and "
			    "max summary length are too big.",
			    maxNumCharsPerLine);
		}
	}

	// . sanity check
	// . summary must fit in m_summary[]
	// . leave room for tailing \0
	if ( maxSummaryLen >= MAX_SUMMARY_LEN ) {
		g_errno = EBUFTOOSMALL;
		return log("query: Summary too big to hold in buffer of %" PRId32" bytes.",(int32_t)MAX_SUMMARY_LEN);
	}

	// do not overrun the final*[] buffers
	if ( maxNumLines > 256 ) {
		g_errno = EBUFTOOSMALL;
		return log("query: More than 256 summary lines requested.");
	}

	// Nothing to match...print beginning of content as summary
	if ( matches->m_numMatches == 0 && maxNumLines > 0 ) {
		return getDefaultSummary ( xml, words, sections, pos,
					   maxSummaryLen );
	}

	// per-query-word weight array; small queries use the inline buffer
	int32_t need1 = q->m_numWords * sizeof(float);
	m_wordWeightSize = need1;
	if ( need1 < 128 ) {
		m_wordWeights = (float *)m_tmpWordWeightsBuf;
	} else {
		m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
	}
	if ( ! m_wordWeights ) {
		return false;
	}

	/// @todo ALC fix word weights
	/// non-working logic is removed in commit 5eacee9063861e859b54ec62035a600aa8af25df

	// . compute our word weights wrt each query. words which are more rare
	//   have a higher weight. We use this to weight the terms importance
	//   when generating the summary.
	// . used by the proximity algo
	// . used in setSummaryScores() for scoring summaries
	// (currently all weights are flat 1.0 -- see @todo above)
	for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
		m_wordWeights[i] = 1.0;
	}

	// convenience
	m_maxNumCharsPerLine = maxNumCharsPerLine;
	m_q = q;

	// set the max excerpt len to the max summary excerpt len
	int32_t maxExcerptLen = m_maxNumCharsPerLine;

	// infinite-loop guard state for the excerpt loop below
	int32_t lastNumFinal = 0;
	int32_t maxLoops = 1024;

	// if just computing absScore2...
	if ( maxNumLines <= 0 ) {
		return true;
	}

	// output cursor into m_summary
	char *p = m_summary;
	char *pend = m_summary + maxSummaryLen;

	m_numExcerpts = 0;

	// three char-vectors of m_numWords each: retired / maxGotIt / gotIt
	int32_t need2 = (1+1+1) * m_q->m_numWords;
	m_buf4Size = need2;
	if ( need2 < 128 ) {
		m_buf4 = m_tmpBuf4;
	} else {
		m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
	}
	if ( ! m_buf4 ) {
		return false;
	}
	char *x = m_buf4;
	char *retired = x;
	x += m_q->m_numWords;
	char *maxGotIt = x;
	x += m_q->m_numWords;
	char *gotIt = x;

	// . the "maxGotIt" count vector accumulates into "retired"
	// . that is how we keep track of what query words we used for previous
	//   summary excerpts so we try to get diversified excerpts with
	//   different query terms/words in them
	memset ( retired, 0, m_q->m_numWords * sizeof(char) );

	// some query words are already matched in the title
	for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
		if ( matches->m_qwordFlags[i] & MF_TITLEGEN ) {
			retired [ i ] = 1;
		}
	}

	bool hadEllipsis = false;

	//
	// Loop over all words that match a query term. The matching words
	// could be from any one of the 3 Words arrays above. Find the
	// highest scoring window around each term. And then find the highest
	// of those over all the matching terms.
	//
	int32_t numFinal;
	for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ) {
		if ( numFinal == m_numDisplayLines ) {
			m_displayLen = p - m_summary;
		}

		// reset these at the top of each loop
		Match *maxm;
		int64_t maxScore = 0;
		int32_t maxa = 0;
		int32_t maxb = 0;
		int32_t maxi = -1;
		int32_t lasta = -1;

		// bail if numFinal stopped advancing too many times
		// (the goto skip paths below decrement numFinal)
		if(lastNumFinal == numFinal) {
			if(maxLoops-- <= 0) {
				log(LOG_WARN, "query: got infinite loop bug, query is %s url is %s", m_q->m_orig, f->getUrl());
				break;
			}
		}
		lastNumFinal = numFinal;

		// loop through all the matches and see which is best
		for ( int32_t i = 0 ; i < matches->m_numMatches ; i++ ) {
			int32_t a , b;
			// reset lasta if we changed words class
			if ( i > 0 && matches->m_matches[i-1].m_words !=
				      matches->m_matches[i].m_words ) {
				lasta = -1;
			}

			// only use matches in body/meta/rss, not title etc.
			mf_t flags = matches->m_matches[i].m_flags;
			bool skip = true;
			if ( flags & MF_METASUMM ) {
				skip = false;
			}
			if ( flags & MF_METADESC ) {
				skip = false;
			}
			if ( flags & MF_BODY ) {
				skip = false;
			}
			if ( flags & MF_RSSDESC ) {
				skip = false;
			}
			if ( skip ) {
				continue;
			}

			// ask him for the query words he matched;
			// clear the per-window vector for him
			memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );

			// . get score of best window around this match
			// . do not allow left post of window to be <= lasta to
			//   avoid repeating the same window.
			int64_t score = getBestWindow (matches, i, &lasta,
						       &a, &b, gotIt, retired,
						       maxExcerptLen);

			// (two large commented-out debug dumps of the window
			// and its score were removed here)

			// skip if was in title or something
			if ( score <= 0 ) {
				continue;
			}

			// skip if not a winner
			if ( maxi >= 0 && score <= maxScore ) {
				continue;
			}

			// we got a new winner
			maxi = i;
			maxa = a;
			maxb = b;
			maxScore = score;

			// save this too
			gbmemcpy ( maxGotIt , gotIt , m_q->m_numWords );
		}

		// all done if no winner was made
		if ( maxi == -1 || maxa == -1 || maxb == -1) {
			break;
		}

		// who is the winning match?
		maxm = &matches->m_matches[maxi];
		Words *ww = maxm->m_words;

		// we now use "m_swbits" for the summary bits since they are
		// of size sizeof(swbit_t), a int16_t at this point
		swbit_t *bb = maxm->m_bits->m_swbits;

		// this should be impossible
		if ( maxa > ww->getNumWords() || maxb > ww->getNumWords() ) {
			log ( LOG_WARN,"query: summary starts or ends after "
			      "document is over! maxa=%" PRId32" maxb=%" PRId32" nw=%" PRId32,
			      maxa, maxb, ww->getNumWords() );
			maxa = ww->getNumWords() - 1;
			maxb = ww->getNumWords();
		}

		// assume we do not preceed with ellipsis "..."
		bool needEllipsis = true;
		const char *c = ww->getWord(maxa)+0;

		// rule of thumb, don't use ellipsis if the first letter is
		// capital, or a non letter is punct word before us pair
		// acrossable? if so then we probably are not the start of a
		// sentence. or if into the sample and previous excerpt had an
		// ellipsis do not bother using one for us.
		if ( !is_alpha_utf8(c) || is_upper_utf8(c) ||
		     (bb[maxa] & D_STARTS_SENTENCE) ||
		     (p > m_summary && hadEllipsis)) {
			needEllipsis = false;
		}

		if ( needEllipsis ) {
			// break out if no room for "..."
			if ( p + 4 + 2 > pend ) {
				break;
			}
			// space first?
			if ( p > m_summary ) {
				*p++ = ' ';
			}
			memcpy ( p, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// separate summary excerpts with a single space.
		if ( p > m_summary ) {
			if ( p + 2 > pend ) {
				break;
			}
			*p++ = ' ';
		}

		// assume we need a trailing ellipsis
		needEllipsis = true;
		// so next excerpt does not need to have an ellipsis if we
		// have one at the end of this excerpt
		hadEllipsis = needEllipsis;

		// start with quote?
		if ( (bb[maxa] & D_IN_QUOTES) && p + 1 < pend ) {
			// preceed with quote
			*p++ = '\"';
		}

		// . filter the words into p
		// . removes back to back spaces
		// . converts html entities
		// . filters in stores words in [a,b) interval
		int32_t len = pos->filter( ww, maxa, maxb, false, p, pend,
					   xml->getVersion() );

		// break out if did not fit
		if ( len == 0 ) {
			break;
		}

		// don't consider it if it is a substring of the title
		if ( len == titleBufLen &&
		     strncasestr(titleBuf, p, titleBufLen, len) ) {
			// don't consider this one
			numFinal--;
			goto skip;
		}

		// don't consider it if the length wasn't anything nice
		if ( len < 5 ){
			numFinal--;
			goto skip;
		}

		// otherwise, keep going
		p += len;

		// now we just indicate which query terms we got
		for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
			// do not breach
			if ( retired[i] >= 100 ) {
				continue;
			}
			retired [ i ] += maxGotIt [ i ];
		}

		// add all the scores of the excerpts to the doc summary score.
		// zero out scores of the winning sample so we don't get them
		// again. use negative one billion to ensure that we don't get
		// them again
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it as used
			bb[j] |= D_USED;
		}

		// if we ended on punct that can be paired across we need
		// to add an ellipsis
		if ( needEllipsis ) {
			if ( p + 4 + 2 > pend ) {
				break;
			}
			memcpy ( p, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// try to put in a small summary excerpt if we have atleast
		// half of the normal excerpt length left
		if ( maxExcerptLen == m_maxNumCharsPerLine &&
		     len <= ( m_maxNumCharsPerLine / 2 + 1 ) ) {
			maxExcerptLen = m_maxNumCharsPerLine / 2;

			// don't count it in the finals since we try to get a
			// small excerpt
			numFinal--;
		} else if ( m_numExcerpts < MAX_SUMMARY_EXCERPTS &&
			    m_numExcerpts >= 0 ) {
			m_summaryExcerptLen[m_numExcerpts] = p - m_summary;
			m_numExcerpts++;

			// also reset maxExcerptLen
			maxExcerptLen = m_maxNumCharsPerLine;
		}

	skip:
		// zero out the scores so they will not be used in others
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it
			bb[j] |= D_USED;
		}
	}

	if ( numFinal <= m_numDisplayLines ) {
		m_displayLen = p - m_summary;
	}

	// free the mem we used if we allocated it
	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
		m_buf4 = NULL;
	}

	// If we still didn't find a summary, get the default summary
	if ( p == m_summary ) {
		bool status = getDefaultSummary ( xml, words, sections, pos,
						  maxSummaryLen );
		if ( m_numDisplayLines > 0 ) {
			m_displayLen = m_summaryLen;
		}
		return status;
	}

	// if we don't find a summary, theres no need to NULL terminate
	*p++ = '\0';

	// set length
	m_summaryLen = p - m_summary;

	if ( m_summaryLen > 50000 ) { g_process.shutdownAbort(true); }

	return true;
}
/*
** convert buffer from bristled format to plain format
**
** Translates an ePerl "bristled" source buffer (text/HTML with embedded
** ePerl code between ePerl_begin_delimiter and ePerl_end_delimiter) into
** a plain Perl script:
**   - plain text outside the delimiters becomes `print "...";' statements,
**     one per input line (ePerl_Efwrite presumably escapes the text for a
**     double-quoted Perl string -- confirm in its definition);
**   - code inside the delimiters is passed through (optionally entity-
**     decoded via ePerl_Cfwrite), with a `;' auto-appended when missing;
**   - newlines are preserved so Perl error messages keep correct line
**     numbers.
**
** Returns a malloc()'d, NUL-terminated buffer the CALLER must free(),
** or NULL on error (allocation failure / missing end delimiter; the
** error text is recorded via ePerl_SetError).
**
** NOTE(review): CU()/CUS/RETURN_WVAL/RETURN_EXRC appear to be the usual
** "cleanup and return" macro set (CU jumps to the CUS label, RETURN_EXRC
** presumably returns `rc`) -- confirm against their definitions; `rc` is
** only referenced through those macros.
*/
char *ePerl_Bristled2Plain(char *cpBuf)
{
    char *rc;                    /* used by the CU/RETURN_EXRC macros */
    char *cpOutBuf = NULL;       /* output buffer (returned to caller) */
    char *cpOut = NULL;          /* write cursor into cpOutBuf */
    char *cps, *cpe;             /* current pos / delimiter pos in input */
    char *cps2, *cpe2;           /* inner cursors for per-line processing */
    int nBuf;
    char *cpEND;                 /* one past the last input byte */
    int n;

    if (strlen(cpBuf) == 0) {
        /* make sure we return a buffer which the caller can free() */
        cpOutBuf = (char *)malloc(sizeof(char) * 1);
        *cpOutBuf = NUL;
        return cpOutBuf;
    }

    nBuf = strlen(cpBuf);
    cpEND = cpBuf+nBuf;

    /* allocate memory for the Perl code */
    /* NOTE(review): worst-case growth assumed to be 10x the input (every
       byte escaped/wrapped in print statements), with a 16KB floor for
       small inputs -- no overflow check on nBuf*10. */
    n = sizeof(char) * nBuf * 10;
    if (nBuf < 1024)
        n = 16384;
    if ((cpOutBuf = (char *)malloc(n)) == NULL) {
        ePerl_SetError("Cannot allocate %d bytes of memory", n);
        CU(NULL);
    }
    cpOut = cpOutBuf;

    /* now step through the file and convert it to legal Perl code.
       This is a bit complicated because we have to make sure that
       we parse the correct delimiters while the delimiter characters
       could also occur inside the Perl code! */
    cps = cpBuf;
    while (cps < cpEND) {
        /* locate the next begin delimiter (case sensitivity is a
           runtime option) */
        if (ePerl_case_sensitive_delimiters)
            cpe = strnstr(cps, ePerl_begin_delimiter, cpEND-cps);
        else
            cpe = strncasestr(cps, ePerl_begin_delimiter, cpEND-cps);
        if (cpe == NULL) {
            /* there are no more ePerl blocks, so just encapsulate the
               remaining contents into Perl print constructs */
            if (cps < cpEND) {
                cps2 = cps;
                /* first, do all complete lines */
                while (cps2 < cpEND && (cpe2 = strnchr(cps2, '\n', cpEND-cps2)) != NULL) {
                    if (ePerl_line_continuation && cps < cpe2 && *(cpe2-1) == '\\') {
                        /* trailing backslash: emit the line without the
                           backslash and WITHOUT a "\n" in the printed
                           string (line-continuation semantics) */
                        if (cpe2-1-cps2 > 0) {
                            cpOut = ePerl_fprintf(cpOut, "print \"");
                            cpOut = ePerl_Efwrite(cps2, cpe2-1-cps2, 1, cpOut);
                            cpOut = ePerl_fprintf(cpOut, "\";");
                        }
                        cpOut = ePerl_fprintf(cpOut, "\n");
                    }
                    else {
                        cpOut = ePerl_fprintf(cpOut, "print \"");
                        cpOut = ePerl_Efwrite(cps2, cpe2-cps2, 1, cpOut);
                        cpOut = ePerl_fprintf(cpOut, "\\n\";\n");
                    }
                    cps2 = cpe2+1;
                }
                /* then do the remainder which is not finished by a newline */
                if (cpEND > cps2) {
                    cpOut = ePerl_fprintf(cpOut, "print \"");
                    cpOut = ePerl_Efwrite(cps2, cpEND-cps2, 1, cpOut);
                    cpOut = ePerl_fprintf(cpOut, "\";");
                }
            }
            break; /* and break the whole processing step */
        }
        else {
            /* Ok, there is at least one more ePerl block */

            /* first, encapsulate the content from current pos up to the
               begin of the ePerl block as print statements */
            if (cps < cpe) {
                cps2 = cps;
                while ((cpe2 = strnchr(cps2, '\n', cpe-cps2)) != NULL) {
                    if (ePerl_line_continuation && cps < cpe2 && *(cpe2-1) == '\\') {
                        if (cpe2-1-cps2 > 0) {
                            cpOut = ePerl_fprintf(cpOut, "print \"");
                            cpOut = ePerl_Efwrite(cps2, cpe2-1-cps2, 1, cpOut);
                            cpOut = ePerl_fprintf(cpOut, "\";");
                        }
                        cpOut = ePerl_fprintf(cpOut, "\n");
                    }
                    else {
                        cpOut = ePerl_fprintf(cpOut, "print \"");
                        cpOut = ePerl_Efwrite(cps2, cpe2-cps2, 1, cpOut);
                        cpOut = ePerl_fprintf(cpOut, "\\n\";\n");
                    }
                    cps2 = cpe2+1;
                }
                if (cpe > cps2) {
                    cpOut = ePerl_fprintf(cpOut, "print \"");
                    cpOut = ePerl_Efwrite(cps2, cpe-cps2, 1, cpOut);
                    cpOut = ePerl_fprintf(cpOut, "\";");
                }
            }

            /* just output a leading space to make the -x display more
               readable. */
            if (cpOut > cpOutBuf && *(cpOut-1) != '\n')
                cpOut = ePerl_fprintf(cpOut, " ");

            /* skip the start delimiter */
            cps = cpe+strlen(ePerl_begin_delimiter);

            /* recognize the 'print' shortcut with '=',
             * e.g. <:=$var:> */
            if (*cps == '=') {
                cpOut = ePerl_fprintf(cpOut, "print ");
                cps++;
            }

            /* skip all following whitespaces. Be careful: we could skip
               newlines too, but then the error output will give wrong
               line numbers!!! */
            while (cps < cpEND) {
                if (*cps != ' ' && *cps != '\t')
                    break;
                cps++;
            }

            cpe = cps;
            /* move forward to end of ePerl block. */
            if (ePerl_case_sensitive_delimiters)
                cpe = strnstr(cpe, ePerl_end_delimiter, cpEND-cpe);
            else
                cpe = strncasestr(cpe, ePerl_end_delimiter, cpEND-cpe);
            if (cpe == NULL) {
                ePerl_SetError("Missing end delimiter");
                CU(NULL);
            }

            /* step again backward over whitespaces */
            for (cpe2 = cpe; cpe2 > cps && (*(cpe2-1) == ' ' || *(cpe2-1) == '\t' || *(cpe2-1) == '\n'); cpe2--)
                ;

            /* pass through the ePerl block without changes! */
            if (cpe2 > cps) {
                if (ePerl_convert_entities == TRUE)
                    cpOut = ePerl_Cfwrite(cps, cpe2-cps, 1, cpOut);
                else
                    cpOut = ePerl_fwrite(cps, cpe2-cps, 1, cpOut);
                /* be smart and automatically add a semicolon if not
                   provided at the end of the ePerl block. But know the
                   continuation indicator "_". */
                if ((*(cpe2-1) != ';') && (*(cpe2-1) != '_') )
                    cpOut = ePerl_fprintf(cpOut, ";");
                /* a trailing "_" joins this block with the next one:
                   back the cursor up over it so it is not emitted */
                if (*(cpe2-1) == '_')
                    cpOut = cpOut - 1;
            }
            /* end preserve newlines for correct line numbers */
            for ( ; cpe2 <= cpe; cpe2++)
                if (*cpe2 == '\n')
                    cpOut = ePerl_fprintf(cpOut, "\n");

            /* output a trailing space to make the -x display more
               readable when no newlines have finished the block. */
            if (cpOut > cpOutBuf && *(cpOut-1) != '\n')
                cpOut = ePerl_fprintf(cpOut, " ");

            /* and adjust the current position to the first character
               after the end delimiter */
            cps = cpe+strlen(ePerl_end_delimiter);

            /* finally just one more feature: when an end delimiter is
               directly followed by ``//'' this discards all data up to
               and including the following newline */
            if (cps < cpEND-2 && *cps == '/' && *(cps+1) == '/') {
                /* skip characters */
                cps += 2;
                for ( ; cps < cpEND && *cps != '\n'; cps++)
                    ;
                if (cps < cpEND)
                    cps++;
                /* but preserve the newline in the script */
                cpOut = ePerl_fprintf(cpOut, "\n");
            }
        }
    }

    RETURN_WVAL(cpOutBuf);

    CUS:
    if (cpOutBuf)
        free(cpOutBuf);
    RETURN_EXRC;
}
// returns false and sets g_errno on error
// . picks the best title for a document and stores it via copyTitle()
// . candidate sources (each tagged with a TT_* type and a base score):
//   inbound link text, RSS item titles, <a title="..."> attributes,
//   <title>/<a>/header-tag contents from the body, the first text line
//   (text/unknown content types only), and the last url path component
//   containing '_' or '-'
// . candidates are then split on root-title matches, boosted/punished
//   (query-term hits, "http://" prefixes, inter-candidate similarity)
//   and the highest scorer wins; roots always use the <title> tag
// @param filteredRootTitleBuf  \0-separated list of root page title
//        segments (may be NULL); used to strip site names from titles
// @param maxTitleLen  <= 0 means "caller does not want a title" (fast path)
bool Title::setTitle ( Xml *xml, Words *words, int32_t maxTitleLen, Query *query,
                       LinkInfo *linkInfo, Url *firstUrl, const char *filteredRootTitleBuf,
                       int32_t filteredRootTitleBufSize, uint8_t contentType, uint8_t langId,
                       int32_t niceness ) {
	// make Msg20.cpp faster if it is just has
	// Msg20Request::m_setForLinkInfo set to true, no need to extricate a title.
	if ( maxTitleLen <= 0 ) {
		return true;
	}

	m_niceness = niceness;
	m_maxTitleLen = maxTitleLen;

	// if this is too big the "first line" algo can be huge!!!
	// and really slow everything way down with a huge title candidate
	int32_t maxTitleWords = 128;

	// assume no title
	reset();

	int32_t NW = words->getNumWords();

	//
	// now get all the candidates
	//

	// . allow up to 100 title CANDIDATES
	// . "as" is the word # of the first word in the candidate
	// . "bs" is the word # of the last word IN the candidate PLUS ONE
	int32_t n = 0;
	int32_t as[MAX_TIT_CANDIDATES];
	int32_t bs[MAX_TIT_CANDIDATES];
	float scores[MAX_TIT_CANDIDATES];
	Words *cptrs[MAX_TIT_CANDIDATES];
	int32_t types[MAX_TIT_CANDIDATES];
	int32_t parent[MAX_TIT_CANDIDATES];

	// record the scoring algos effects
	float baseScore [MAX_TIT_CANDIDATES];
	float noCapsBoost [MAX_TIT_CANDIDATES];
	float qtermsBoost [MAX_TIT_CANDIDATES];
	float inCommonCandBoost[MAX_TIT_CANDIDATES];

	// reset these
	for ( int32_t i = 0 ; i < MAX_TIT_CANDIDATES ; i++ ) {
		// assume no parent
		parent[i] = -1;
	}

	// xml and words class for each link info, rss item
	Xml tx[MAX_TIT_CANDIDATES];
	Words tw[MAX_TIT_CANDIDATES];

	// "ti" indexes the next free tx[]/tw[] slot (candidates that need
	// their own Words/Xml object rather than pointing into the body)
	int32_t ti = 0;

	// restrict how many link texts and rss blobs we check for titles
	// because title recs like www.google.com have hundreds and can
	// really slow things down to like 50ms for title generation
	int32_t kcount = 0;
	int32_t rcount = 0;

	//int64_t x = gettimeofdayInMilliseconds();

	// . get every link text
	// . TODO: repeat for linkInfo2, the imported link text
	for ( Inlink *k = NULL; linkInfo && (k = linkInfo->getNextInlink(k)) ; ) {
		// breathe
		QUICKPOLL(m_niceness);

		// fast skip check for link text
		if ( k->size_linkText >= 3 && ++kcount >= 20 ) continue;

		// fast skip check for rss item
		if ( k->size_rssItem > 10 && ++rcount >= 20 ) continue;

		// set Url
		Url u;
		u.set( k->getUrl(), k->size_urlBuf );

		// is it the same host as us?
		bool sh = true;

		// skip if not from same host and should be
		if ( firstUrl->getHostLen() != u.getHostLen() ) {
			sh = false;
		}

		// skip if not from same host and should be
		if ( strncmp( firstUrl->getHost(), u.getHost(), u.getHostLen() ) ) {
			sh = false;
		}

		// get the link text
		if ( k->size_linkText >= 3 ) {
			char *p = k->getLinkText();
			int32_t plen = k->size_linkText - 1;
			if ( ! verifyUtf8 ( p , plen ) ) {
				log("title: set4 bad link text from url=%s", k->getUrl());
				continue;
			}

			// now the words.
			if ( !tw[ti].set( k->getLinkText(), k->size_linkText - 1, true, 0 ) ) {
				return false;
			}

			// set the bookends, it is the whole thing
			cptrs [n] = &tw[ti];
			as [n] = 0;
			bs [n] = tw[ti].getNumWords();
			// score higher if same host
			if ( sh ) scores[n] = 1.05;
			// do not count so high if remote!
			else scores[n] = 0.80;
			// set the type
			if ( sh ) types [n] = TT_LINKTEXTLOCAL;
			else types [n] = TT_LINKTEXTREMOTE;
			// another candidate
			n++;
			// use xml and words
			ti++;
			// break out if too many already. save some for below.
			if ( n + 30 >= MAX_TIT_CANDIDATES ) break;
		}

		// get the rss item
		if ( k->size_rssItem <= 10 ) continue;

		// . returns false and sets g_errno on error
		// . use a 0 for niceness
		if ( ! k->setXmlFromRSS ( &tx[ti] , 0 ) ) return false;

		// get the word range
		int32_t tslen;
		bool isHtmlEnc;
		char *ts = tx[ti].getRSSTitle ( &tslen , &isHtmlEnc );

		// skip if not in the rss
		if ( ! ts ) continue;

		// skip if empty
		if ( tslen <= 0 ) continue;

		// now set words to that
		if ( !tw[ti].set( ts, tslen, true, 0 ) ) {
			return false;
		}

		// point to that
		cptrs [n] = &tw[ti];
		as [n] = 0;
		bs [n] = tw[ti].getNumWords();
		// increment since we are using it
		ti++;
		// base score for rss title
		if ( sh ) scores[n] = 5.0;
		// if not same host, treat like link text
		else scores[n] = 2.0;
		// set the type
		if ( sh ) types [n] = TT_RSSITEMLOCAL;
		else types [n] = TT_RSSITEMREMOTE;
		// advance
		n++;
		// break out if too many already. save some for below.
		if ( n + 30 >= MAX_TIT_CANDIDATES ) break;
	}

	//logf(LOG_DEBUG,"title: took1=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	// . set the flags array
	// . indicates what words are in title candidates already, but
	//   that is set below
	// . up here we set words that are not allowed to be in candidates,
	//   like words that are in a link that is not a self link
	// . alloc for it
	// . flag bit 0x02 = word is inside a non-self link (excluded);
	//   bit 0x01 = word is part of a body candidate (set near the end)
	char *flags = NULL;
	char localBuf[10000];

	int32_t need = words->getNumWords();
	if ( need <= 10000 ) {
		flags = (char *)localBuf;
	} else {
		flags = (char *)mmalloc(need,"TITLEflags");
	}

	if ( ! flags ) {
		return false;
	}

	// clear it
	memset ( flags , 0 , need );

	// check tags in body
	nodeid_t *tids = words->getTagIds();

	// scan to set link text flags
	// loop over all "words" in the html body
	char inLink = false;
	char selfLink = false;
	for ( int32_t i = 0 ; i < NW ; i++ ) {
		// breathe
		QUICKPOLL(m_niceness);

		// if in a link that is not self link, cannot be in a candidate
		if ( inLink && ! selfLink ) {
			flags[i] |= 0x02;
		}

		// out of a link
		if ( tids[i] == (TAG_A | BACKBIT) ) {
			inLink = false;
		}

		// if not start of <a> tag, skip it
		if ( tids[i] != TAG_A ) {
			continue;
		}

		// flag it
		inLink = true;

		// get the node in the xml
		int32_t xn = words->getNodes()[i];

		// is it a self link?
		int32_t len;
		char *link = xml->getString(xn,"href",&len);

		// . set the url class to this
		// . TODO: use the base url in the doc
		Url u;
		u.set( link, len, true, false );

		// compare
		selfLink = u.equals ( firstUrl );

		// skip if not selfLink
		if ( ! selfLink ) {
			continue;
		}

		// if it is a selflink , check for an "onClick" tag in the
		// anchor tag to fix that Mixx issue for:
		// http://www.npr.org/templates/story/story.php?storyId=5417137
		int32_t oclen;
		char *oc = xml->getString(xn,"onclick",&oclen);

		if ( ! oc ) {
			oc = xml->getString(xn,"onClick",&oclen);
		}

		// assume not a self link if we see that...
		if ( oc ) {
			selfLink = false;
		}

		// if this <a href> link has a "title" attribute, use that
		// instead! that thing is solid gold.
		int32_t atlen;
		char *atitle = xml->getString(xn,"title",&atlen);

		// stop and use that, this thing is gold!
		if ( ! atitle || atlen <= 0 ) {
			continue;
		}

		// craziness? ignore it...
		if ( atlen > 400 ) {
			continue;
		}

		// if it contains permanent, permalink or share, ignore it!
		if ( strncasestr ( atitle, "permalink", atlen ) ||
		     strncasestr ( atitle,"permanent", atlen) ||
		     strncasestr ( atitle,"share", atlen) ) {
			continue;
		}

		// do not count the link text as viable
		selfLink = false;

		// aw, dammit
		if ( ti >= MAX_TIT_CANDIDATES ) {
			continue;
		}

		// other dammit
		if ( n >= MAX_TIT_CANDIDATES ) {
			break;
		}

		// ok, process it
		if ( ! tw[ti].set ( atitle, atlen, true, 0 )) {
			return false;
		}

		// set the bookends, it is the whole thing
		cptrs [n] = &tw[ti];
		as [n] = 0;
		bs [n] = tw[ti].getNumWords();
		scores [n] = 3.0; // not ALWAYS solid gold!
		types [n] = TT_TITLEATT;

		// we are using the words class
		ti++;

		// advance
		n++;

		// break out if too many already. save some for below.
		if ( n + 20 >= MAX_TIT_CANDIDATES ) {
			break;
		}
	}

	//logf(LOG_DEBUG,"title: took2=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();
	//int64_t *wids = WW->getWordIds();

	// . find the last positive scoring guy
	// . do not consider title candidates after "r" if "r" is non-zero
	// . FIXES http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/

	// the candidate # of the title tag
	int32_t tti = -1;

	// allow up to 4 tags from each type
	char table[512];

	// sanity check
	if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }

	// clear table counts
	memset ( table , 0 , 512 );

	// the first word
	char *wstart = NULL;
	if ( NW > 0 ) {
		wstart = words->getWord(0);
	}

	// loop over all "words" in the html body
	for ( int32_t i = 0 ; i < NW ; i++ ) {
		// come back up here if we encounter another "title-ish" tag
		// within our first alleged "title-ish" tag
	subloop:
		// stop after 30k of text
		// NOTE(review): constant is 200000 bytes, not 30k -- comment
		// and code disagree; verify which is intended
		if ( words->getWord(i) - wstart > 200000 ) {
			break; // 1106
		}

		// get the tag id minus the back tag bit
		nodeid_t tid = tids[i] & BACKBITCOMP;

		// pen up and pen down for these comment like tags
		if ( tid == TAG_SCRIPT || tid == TAG_STYLE ) {
			// ignore "titles" in script or style tags
			if ( ! (tids[i] & BACKBIT) ) {
				continue;
			}
		}

		/// @todo ALC we should allow more tags than just title/link

		// skip if not a good tag.
		if (tid != TAG_TITLE && tid != TAG_A) {
			continue;
		}

		// must NOT be a back tag
		if ( tids[i] & BACKBIT ) {
			continue;
		}

		// skip if we hit our limit
		if ( table[tid] >= 4 ) {
			continue;
		}

		// skip over tag/word #i
		i++;

		// no words in links, unless it is a self link
		if ( i < NW && (flags[i] & 0x02) ) {
			continue;
		}

		// the start should be here
		int32_t start = -1;

		// do not go too far
		int32_t max = i + 200;

		// find the corresponding back tag for it
		for ( ; i < NW && i < max ; i++ ) {
			// hey we got it, BUT we got no alnum word first
			// so the thing was empty, so loop back to subloop
			if ( (tids[i] & BACKBITCOMP) == tid &&
			     (tids[i] & BACKBIT ) &&
			     start == -1 ) {
				goto subloop;
			}

			// if we hit another title-ish tag, loop back up
			if ( (tids[i] & BACKBITCOMP) == TAG_TITLE ||
			     (tids[i] & BACKBITCOMP) == TAG_A ) {
				// if no alnum text, restart at the top
				if ( start == -1 ) {
					goto subloop;
				}
				// otherwise, break out and see if title works
				break;
			}

			// if we hit a breaking tag...
			if ( isBreakingTagId ( tids[i] & BACKBITCOMP ) &&
			     // do not consider <span> tags breaking for
			     // our purposes. i saw a <h1><span> setup before.
			     tids[i] != TAG_SPAN ) {
				break;
			}

			// skip if not alnum word
			if ( ! words->isAlnum(i) ) {
				continue;
			}

			// if we hit an alnum word, break out
			if ( start == -1 ) {
				start = i;
			}
		}

		// if no start was found, must have had a 0 score in there
		if ( start == -1 ) {
			continue;
		}

		// if we exhausted the doc, we are done
		if ( i >= NW ) {
			break;
		}

		// skip if way too big!
		if ( i >= max ) {
			continue;
		}

		// if was too long do not consider a title
		if ( i - start > 300 ) {
			continue;
		}

		// . skip if too many bytes
		// . this does not include the length of word #i, but #(i-1)
		if ( words->getStringSize ( start , i ) > 1000 ) {
			continue;
		}

		// when using pdftohtml, the title tag is the filename when PDF property does not have title tag
		if ( tid == TAG_TITLE && contentType == CT_PDF ) {
			// skip if title == '/in.[0-9]*'
			char* title_start = words->getWord(start);
			char* title_end = words->getWord(i);
			size_t title_size = title_end - title_start;
			const char* result = strnstr( title_start, "/in.", title_size );
			if (result != NULL) {
				char* endp = NULL;
				// do some further verification to avoid screwing up title
				if ((strtoll(result + 4, &endp, 10) > 0) && (endp == title_end)) {
					continue;
				}
			}
		}

		// count it
		table[tid]++;

		// max it out if we are positive scoring. stop after the
		// first positive scoring guy in a section. this might
		// hurt the "Hamlet" thing though...

		// store a point to the title tag guy. Msg20.cpp needs this
		// because the zak's proximity algo uses it in Summary.cpp
		// and in Msg20.cpp

		// only get the first one! often the 2nd on is in an iframe!! which we now expand into here.
		if ( tid == TAG_TITLE && m_titleTagStart == -1 ) {
			m_titleTagStart = start;
			m_titleTagEnd = i;

			// save the candidate # because we always use this
			// as the title if we are a root
			if ( tti < 0 ) {
				tti = n;
			}
		}

		// point to words class of the body that was passed in to us
		cptrs[n] = words;
		as[n] = start;
		bs[n] = i;
		if ( tid == TAG_B ) {
			types[n] = TT_BOLDTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_H1 ) {
			types[n] = TT_HTAG;
			scores[n] = 1.8;
		} else if ( tid == TAG_H2 ) {
			types[n] = TT_HTAG;
			scores[n] = 1.7;
		} else if ( tid == TAG_H3 ) {
			types[n] = TT_HTAG;
			scores[n] = 1.6;
		} else if ( tid == TAG_TITLE ) {
			types[n] = TT_TITLETAG;
			scores[n] = 3.0;
		} else if ( tid == TAG_DIV ) {
			types[n] = TT_DIVTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_TD ) {
			types[n] = TT_TDTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_P ) {
			types[n] = TT_PTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_FONT ) {
			types[n] = TT_FONTTAG;
			scores[n] = 1.0;
		} else if ( tid == TAG_A ) {
			types[n] = TT_ATAG;
			// . self link is very powerful BUT
			//   http://www.npr.org/templates/story/story.php?storyId=5417137
			//   doesn't use it right! so use
			//   1.3 instead of 3.0. that has an "onClick" thing in the
			//   <a> tag, so check for that!
			// this was bad for
			// http://www.spiritualwoman.net/?cat=191
			// so i am demoting from 3.0 to 1.5
			scores[n] = 1.5;
		}

		// count it
		n++;

		// start loop over at tag #i, for loop does an i++, so negate
		// that so this will work
		i--;

		// break out if too many already. save some for below.
		if ( n + 10 >= MAX_TIT_CANDIDATES ) {
			break;
		}
	}

	//logf(LOG_DEBUG,"title: took3=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	// to handle text documents, throw in the first line of text
	// as a title candidate, just make the score really low
	bool textDoc = (contentType == CT_UNKNOWN || contentType == CT_TEXT);

	if (textDoc) {
		// make "i" point to first alphabetical word in the document
		int32_t i ;

		for ( i = 0 ; i < NW && !words->isAlpha(i) ; i++);

		// if we got a first alphabetical word, then assume that to be the start of our title
		if ( i < NW && n < MAX_TIT_CANDIDATES ) {
			// first word in title is "t0"
			int32_t t0 = i;
			// find end of first line
			int32_t numWords = 0;

			// set i to the end now. we MUST find a \n to terminate the
			// title, otherwise we will not have a valid title
			while (i < NW && numWords < maxTitleWords && (words->isAlnum(i) || !words->hasChar(i, '\n'))) {
				if(words->isAlnum(i)) {
					numWords++;
				}

				++i;
			}

			// "t1" is the end
			int32_t t1 = -1;

			// we must have found our \n in order to set "t1"
			// NOTE(review): i <= NW also accepts the no-\n case
			// where the scan ran off the end of the doc -- verify
			// against the "MUST find a \n" comment above
			if (i <= NW && numWords < maxTitleWords ) {
				t1 = i;
			}

			// set the ptrs
			cptrs [n] = words;

			// this is the last resort i guess...
			scores [n] = 0.5;
			types [n] = TT_FIRSTLINE;
			as [n] = t0;
			bs [n] = t1;

			// add it as a candidate if t0 and t1 were valid
			if (t0 >= 0 && t1 > t0) {
				n++;
			}
		}
	}

	//logf(LOG_DEBUG,"title: took4=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	{
		// now add the last url path to contain underscores or hyphens
		char *pstart = firstUrl->getPath();

		// get first url
		Url *fu = firstUrl;

		// start at the end
		char *p = fu->getUrl() + fu->getUrlLen();

		// end pointer
		char *pend = NULL;

		// come up here for each path component
		while ( p >= pstart ) {
			// save end
			pend = p;

			// skip over /
			if ( *p == '/' ) {
				p--;
			}

			// now go back to next /
			int32_t count = 0;
			for ( ; p >= pstart && *p !='/' ; p-- ) {
				if ( *p == '_' || *p == '-' ) {
					count++;
				}
			}

			// did we get it?
			if ( count > 0 ) {
				break;
			}
		}

		// did we get any?
		if ( p > pstart && n < MAX_TIT_CANDIDATES ) {
			// now set words to that
			if ( ! tw[ti].set ( p, (pend - p), true, 0 )) {
				return false;
			}

			// point to that
			cptrs [n] = &tw[ti];
			as [n] = 0;
			bs [n] = tw[ti].getNumWords();
			scores [n] = 1.0;
			types [n] = TT_URLPATH;

			// increment since we are using it
			ti++;

			// advance
			n++;
		}
	}

	// save old n
	int32_t oldn = n;

	// . do not split titles if we are a root url maps.yahoo.com was getting "Maps" for the title
	if ( firstUrl->isRoot() ) {
		oldn = -2;
	}

	// point to list of \0 separated titles
	const char *rootTitleBuf = NULL;
	const char *rootTitleBufEnd = NULL;

	// get the root title if we are not root!
	if (filteredRootTitleBuf) {
#ifdef _VALGRIND_
		VALGRIND_CHECK_MEM_IS_DEFINED(filteredRootTitleBuf,filteredRootTitleBufSize);
#endif
		// point to list of \0 separated titles
		rootTitleBuf = filteredRootTitleBuf;
		rootTitleBufEnd = filteredRootTitleBuf + filteredRootTitleBufSize;
	}

	{
		Matches m;

		if ( rootTitleBuf && query ) {
			m.setQuery ( query );
		}

		// convert into an array
		int32_t nr = 0;
		const char *pr = rootTitleBuf;
		const char *rootTitles[20];
		int32_t rootTitleLens[20];

		// loop over each root title segment
		for ( ; pr && pr < rootTitleBufEnd ; pr += strnlen(pr,rootTitleBufEnd-pr) + 1 ) {
			// if we had a query...
			if ( query ) {
				// reset it
				m.reset();

				// see if root title segment has query terms in it
				m.addMatches ( const_cast<char*>(pr), strnlen(pr,rootTitleBufEnd-pr), MF_TITLEGEN, m_niceness );

				// if matches query, do NOT add it, we only add it for
				// removing from the title of the page...
				if ( m.getNumMatches() ) {
					continue;
				}
			}
			// point to it. it should start with an alnum already
			// since it is the "filtered" list of root titles...
			// if not, fix it in xmldoc then.
			rootTitles [nr] = pr;
			rootTitleLens[nr] = gbstrlen(pr);

			// advance
			nr++;

			// no breaching
			if ( nr >= 20 ) break;
		}

		// now split up candidates in children candidates by tokenizing
		// using :, | and - as delimters.
		// the hyphen must have a space on at least one side, so "cd-rom" does
		// not create a pair of tokens...
		// FIX: for the title:
		// Best Careers 2009: Librarian - US News and World Report
		// we need to recognize "Best Careers 2009: Librarian" as a subtitle
		// otherwise we don't get it as the title. so my question is are we
		// going to have to do all the permutations at some point? for now
		// let's just add in pairs...
		for ( int32_t i = 0 ; i < oldn && n + 3 < MAX_TIT_CANDIDATES ; i++ ) {
			// stop if no root title segments
			if ( nr <= 0 ) break;

			// get the word info
			Words *w = cptrs[i];
			int32_t a = as[i];
			int32_t b = bs[i];

			// init
			int32_t lasta = a;
			char prev = false;

			// char length in bytes
			//int32_t charlen = 1;

			// see how many we add
			int32_t added = 0;
			char *skipTo = NULL;
			bool qualified = true;

			// . scan the words looking for a token
			// . sometimes the candidates end in ": " so put in "k < b-1"
			// . made this from k<b-1 to k<b to fix
			//   "Hot Tub Time Machine (2010) - IMDb" to strip IMDb
			for ( int32_t k = a ; k < b && n + 3 < MAX_TIT_CANDIDATES; k++){
				// get word
				char *wp = w->getWord(k);

				// skip if not alnum
				if ( ! w->isAlnum(k) ) {
					// in order for next alnum word to
					// qualify for "clipping" if it matches
					// the root title, there has to be more
					// than just spaces here, some punct.
					// otherwise title
					// "T. D. Jakes: Biography from Answers.com"
					// becomes
					// "T. D. Jakes: Biography from"
					qualified=isWordQualified(wp,w->getWordLen(k));
					continue;
				}

				// gotta be qualified!
				if ( ! qualified ) continue;

				// skip if in root title
				if ( skipTo && wp < skipTo ) continue;

				// does this match any root page title segments?
				int32_t j;

				for ( j = 0 ; j < nr ; j++ ) {
					// . compare to root title
					// . break out if we matched!
					if ( ! strncmp( wp, rootTitles[j], rootTitleLens[j] ) ) {
						break;
					}
				}

				// if we did not match a root title segment,
				// keep on chugging
				if ( j >= nr ) continue;

				// . we got a root title match!
				// . skip over
				skipTo = wp + rootTitleLens[j];

				// must land on qualified punct then!!
				int32_t e = k+1;
				for ( ; e<b && w->getWord(e)<skipTo ; e++ );

				// ok, word #e must be a qualified punct
				if ( e<b &&
				     ! isWordQualified(w->getWord(e),w->getWordLen(e)))
					// assume no match then!!
					continue;

				// if we had a previous guy, reset the end of the
				// previous candidate
				if ( prev ) {
					bs[n-2] = k;
					bs[n-1] = k;
				}

				// . ok, we got two more candidates
				// . well, only one more if this is not the 1st time
				if ( ! prev ) {
					cptrs [n] = cptrs [i];
					scores [n] = scores [i];
					types [n] = types [i];
					as [n] = lasta;
					bs [n] = k;
					parent [n] = i;
					n++;
					added++;
				}

				// the 2nd one
				cptrs [n] = cptrs [i];
				scores [n] = scores [i];
				types [n] = types [i];
				as [n] = e + 1;
				bs [n] = bs [i];
				parent [n] = i;
				n++;
				added++;

				// now add in the last pair as a whole token
				cptrs [n] = cptrs [i];
				scores [n] = scores [i];
				types [n] = types [i];
				as [n] = lasta;
				bs [n] = bs [i];
				parent [n] = i;
				n++;
				added++;

				// nuke the current candidate then since it got
				// split up to not contain the root title...
				//cptrs[i] = NULL;

				// update this
				lasta = k+1;

				// if we encounter another delimeter we will have to revise bs[n-1], so note that
				prev = true;
			}

			// nuke the current candidate then since it got
			// split up to not contain the root title...
			if ( added ) {
				scores[i] = 0.001;
				//cptrs[i] = NULL;
			}

			// erase the pair if that there was only one token
			if ( added == 3 ) n--;
		}
	}

	// snapshot scores before the boost/punish passes below
	for ( int32_t i = 0 ; i < n ; i++ ) baseScore[i] = scores[i];

	//
	// . now punish by 0.85 for every lower case non-stop word it has
	// . reward by 1.1 if has a non-stopword in the query
	//
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// point to the words
		Words *w = cptrs[i];

		// skip if got nuked above
		if ( ! w ) {
			continue;
		}

		// the word ptrs
		char **wptrs = w->getWordPtrs();

		// skip if empty
		if ( w->getNumWords() <= 0 ) {
			continue;
		}

		// get the word boundaries
		int32_t a = as[i];
		int32_t b = bs[i];

		// record the boosts
		float ncb = 1.0;
		float qtb = 1.0;

		// a flag
		char uncapped = false;

		// scan the words in this title candidate
		for ( int32_t j = a ; j < b ; j++ ) {
			// skip stop words
			if ( w->isQueryStopWord( j, langId ) ) {
				continue;
			}

			// punish if uncapitalized non-stopword
			if ( ! w->isCapitalized(j) ) {
				uncapped = true;
			}

			// skip if no query
			if ( ! query ) {
				continue;
			}

			int64_t wid = w->getWordId(j);

			// reward if in the query
			if ( query->getWordNum(wid) >= 0 ) {
				qtb *= 1.5;
				scores[i] *= 1.5;
			}
		}

		// . only punish once if missing a capitalized word hurts us for:
		//   http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
		// NOTE(review): the 1.00 multipliers make this a no-op --
		// the punishment appears to have been disabled on purpose
		if ( uncapped ) {
			ncb *= 1.00;
			scores[i] *= 1.00;
		}

		// punish if a http:// title thingy
		char *s = wptrs[a];
		int32_t size = w->getStringSize(a,b);
		if ( size > 9 && memcmp("http://", s, 7) == 0 ) {
			ncb *= .10;
		}
		// same check for UCS-2/UTF-16-ish "http://" with NUL bytes
		if ( size > 14 && memcmp("h\0t\0t\0p\0:\0/\0/", s, 14) == 0 ) {
			ncb *= .10;
		}

		// set these guys
		scores[i] *= ncb;

		noCapsBoost[i] = ncb;
		qtermsBoost[i] = qtb;
	}

	// . now compare each candidate to the other candidates
	// . give a boost if matches
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// point to the words
		Words *w1 = cptrs[i];

		// skip if got nuked above
		if ( ! w1 ) {
			continue;
		}

		int32_t a1 = as[i];
		int32_t b1 = bs[i];

		// reset some flags
		char localFlag1 = 0;
		char localFlag2 = 0;

		// record the boost
		float iccb = 1.0;

		// total boost
		float total = 1.0;

		// to each other candidate
		for ( int32_t j = 0 ; j < n ; j++ ) {
			// not to ourselves
			if ( j == i ) {
				continue;
			}

			// or our derivatives
			if ( parent[j] == i ) {
				continue;
			}

			// or derivates to their parent
			if ( parent[i] == j ) {
				continue;
			}

			// only check parents now. do not check kids.
			// this was only for when doing percent contained
			// not getSimilarity() per se
			//if ( parent[j] != -1 ) continue;

			// TODO: do not accumulate boosts from a parent
			// and its kids, subtitles...
			//
			// do not compare type X to type Y
			if ( types[i] == TT_TITLETAG ) {
				if ( types[j] == TT_TITLETAG ) {
					continue;
				}
			}

			// do not compare a div candidate to another div cand
			// http://friendfeed.com/foxiewire?start=30
			// likewise, a TD to another TD
			// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/match/351681.html
			// ... etc.
			if ( types[i] == TT_BOLDTAG ||
			     types[i] == TT_HTAG ||
			     types[i] == TT_DIVTAG ||
			     types[i] == TT_TDTAG ||
			     types[i] == TT_FONTTAG ) {
				if ( types[j] == types[i] ) continue;
			}

			// . do not compare one kid to another kid
			// . i.e. if we got "x | y" as a title and "x | z"
			//   as a link text, it will emphasize "x" too much
			//   http://content-uk.cricinfo.com/ausvrsa2008_09/engine/current/match/351682.html
			if ( parent[j] != -1 && parent[i] != -1 ) continue;

			// . body type tags are mostly mutually exclusive
			// . for the legacy.com url mentioned below, we have
			//   good stuff in <td> tags, so this hurts us...
			// . but for the sake of
			//   http://larvatusprodeo.net/2009/01/07/partisanship-politics-and-participation/
			//   i put bold tags back
			if ( types[i] == TT_LINKTEXTLOCAL ) {
				if ( types[j] == TT_LINKTEXTLOCAL ) continue;
			}
			if ( types[i] == TT_RSSITEMLOCAL ) {
				if ( types[j] == TT_RSSITEMLOCAL ) continue;
			}

			// only compare to one local link text for each i
			if ( types[j] == TT_LINKTEXTLOCAL && localFlag1 ) {
				continue;
			}
			if ( types[j] == TT_RSSITEMLOCAL && localFlag2 ) {
				continue;
			}
			if ( types[j] == TT_LINKTEXTLOCAL ) {
				localFlag1 = 1;
			}
			if ( types[j] == TT_RSSITEMLOCAL ) {
				localFlag2 = 1;
			}

			// not link title attr to link title attr either
			// fixes http://www.spiritualwoman.net/?cat=191
			if ( types[i] == TT_TITLEATT &&
			     types[j] == TT_TITLEATT )
				continue;

			// get our words
			Words *w2 = cptrs[j];

			// skip if got nuked above
			if ( ! w2 ) continue;

			int32_t a2 = as [j];
			int32_t b2 = bs [j];

			// how similar is title #i to title #j ?
			float fp = getSimilarity ( w2 , a2 , b2 , w1 , a1 , b1 );

			// error?
			if ( fp == -1.0 ) return false;

			// custom boosting...
			float boost = 1.0;
			if ( fp >= .95 ) boost = 3.0;
			else if ( fp >= .90 ) boost = 2.0;
			else if ( fp >= .85 ) boost = 1.5;
			else if ( fp >= .80 ) boost = 1.4;
			else if ( fp >= .75 ) boost = 1.3;
			else if ( fp >= .70 ) boost = 1.2;
			else if ( fp >= .60 ) boost = 1.1;
			else if ( fp >= .50 ) boost = 1.08;
			else if ( fp >= .40 ) boost = 1.04;

			// limit total
			total *= boost;
			if ( total > 100.0 ) break;

			// if you are matching the url path, that is pretty
			// good so give more!
			// actually, that would hurt:
			// http://michellemalkin.com/2008/12/29/gag-worthy/

			// custom boosting!
			if ( fp > 0.0 && g_conf.m_logDebugTitle )
				logf(LOG_DEBUG,"title: i=%" PRId32" j=%" PRId32" fp=%.02f "
				     "b=%.02f", i,j,fp,boost);

			// apply it
			scores[i] *= boost;

			iccb *= boost;
		}

		inCommonCandBoost[i] = iccb;
	}

	//logf(LOG_DEBUG,"title: took7=%" PRId64,gettimeofdayInMilliseconds()-x);
	//x = gettimeofdayInMilliseconds();

	// loop over all n candidates
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// skip if not in the document body
		if ( cptrs[i] != words ) continue;

		// point to the words
		int32_t a1 = as [i];
		int32_t b1 = bs [i];

		// . loop through this candidates words
		// . TODO: use memset here?
		for ( int32_t j = a1 ; j <= b1 && j < NW ; j++ ) {
			// flag it
			flags[j] |= 0x01;
		}
	}

	// free our stuff
	if ( flags!=localBuf ) {
		mfree (flags, need, "TITLEflags");
	}

	// now get the highest scoring candidate title
	float max = -1.0;
	int32_t winner = -1;

	for ( int32_t i = 0 ; i < n ; i++ ) {
		// skip if got nuked
		if ( ! cptrs[i] ) {
			continue;
		}

		if ( winner != -1 && scores[i] <= max ) {
			continue;
		}

		// url path's cannot be titles in and of themselves
		if ( types[i] == TT_URLPATH ) {
			continue;
		}

		// skip if empty basically, like if title was exact
		// copy of root, then the whole thing got nuked and
		// some empty string added, where a > b
		if ( as[i] >= bs[i] ) {
			continue;
		}

		// got one
		max = scores[i];

		// save it
		winner = i;
	}

	// if we are a root, always pick the title tag as the title
	if ( oldn == -2 && tti >= 0 ) {
		winner = tti;
	}

	// if no winner, all done. no title
	if ( winner == -1 ) {
		// last resort use file name
		if ((contentType == CT_PDF) && (firstUrl->getFilenameLen() != 0)) {
			Words w;
			w.set(firstUrl->getFilename(), firstUrl->getFilenameLen(), true);
			if (!copyTitle(&w, 0, w.getNumWords())) {
				return false;
			}
		}

		return true;
	}

	// point to the words class of the winner
	Words *w = cptrs[winner];

	// skip if got nuked above
	if ( ! w ) { char *xx=NULL;*xx=0; }

	// need to make our own Pos class if title not from body
	Pos tp;

	if ( w != words ) {
		// set "Scores" ptr to NULL. we assume all are positive scores
		if ( ! tp.set ( w ) ) {
			return false;
		}
	}

	// the string ranges from word #a up to and including word #b
	int32_t a = as[winner];
	int32_t b = bs[winner];

	// sanity check
	if ( a < 0 || b > w->getNumWords() ) { char*xx=NULL;*xx=0; }

	// save the title
	if ( ! copyTitle(w, a, b) ) {
		return false;
	}

	/*
	// debug logging
	SafeBuf sb;
	SafeBuf *pbuf = &sb;

	log("title: candidates for %s",xd->getFirstUrl()->getUrl() );

	pbuf->safePrintf("<div stype=\"border:1px solid black\">");
	pbuf->safePrintf("<b>***Finding Title***</b><br>\n");

	pbuf->safePrintf("<table cellpadding=5 border=2><tr>"
			 "<td colspan=20><center><b>Title Generation</b>"
			 "</center></td>"
			 "</tr>\n<tr>"
			 "<td>#</td>"
			 "<td>type</td>"
			 "<td>parent</td>"
			 "<td>base score</td>"
			 "<td>format penalty</td>"
			 "<td>query term boost</td>"
			 "<td>candidate intersection boost</td>"
			 "<td>FINAL SCORE</td>"
			 "<td>title</td>"
			 "</tr>\n" );

	// print out all candidates
	for ( int32_t i = 0 ; i < n ; i++ ) {
		char *ts = "unknown";
		if ( types[i] == TT_LINKTEXTLOCAL  ) ts = "local inlink text";
		if ( types[i] == TT_LINKTEXTREMOTE ) ts = "remote inlink text";
		if ( types[i] == TT_RSSITEMLOCAL   ) ts = "local rss title";
		if ( types[i] == TT_RSSITEMREMOTE  ) ts = "remote rss title";
		if ( types[i] == TT_BOLDTAG        ) ts = "bold tag";
		if ( types[i] == TT_HTAG           ) ts = "header tag";
		if ( types[i] == TT_TITLETAG       ) ts = "title tag";
		if ( types[i] == TT_FIRSTLINE      ) ts = "first line in text";
		if ( types[i] == TT_FONTTAG        ) ts = "font tag";
		if ( types[i] == TT_ATAG           ) ts = "anchor tag";
		if ( types[i] == TT_DIVTAG         ) ts = "div tag";
		if ( types[i] == TT_TDTAG          ) ts = "td tag";
		if ( types[i] == TT_PTAG           ) ts = "p tag";
		if ( types[i] == TT_URLPATH        ) ts = "url path";
		if ( types[i] == TT_TITLEATT       ) ts = "title attribute";

		// get the title
		pbuf->safePrintf(
				 "<tr>"
				 "<td>#%" PRId32"</td>"
				 "<td><nobr>%s</nobr></td>"
				 "<td>%" PRId32"</td>"
				 "<td>%0.2f</td>" // baseScore
				 "<td>%0.2f</td>"
				 "<td>%0.2f</td>"
				 "<td>%0.2f</td>"
				 "<td>%0.2f</td>"
				 "<td>",
				 i,
				 ts ,
				 parent[i],
				 baseScore[i],
				 noCapsBoost[i],
				 qtermsBoost[i],
				 inCommonCandBoost[i],
				 scores[i]);

		// ptrs
		Words *w = cptrs[i];
		int32_t   a = as[i];
		int32_t   b = bs[i];

		// skip if no words
		if ( w->getNumWords() <= 0 ) continue;

		// the word ptrs
		char **wptrs = w->getWordPtrs();

		// string ptrs
		char *ptr  = wptrs[a];//w->getWord(a);
		int32_t  size = w->getStringSize(a,b);

		// it is utf8
		pbuf->safeMemcpy ( ptr , size );

		// end the line
		pbuf->safePrintf("</td></tr>\n");
	}

	pbuf->safePrintf("</table>\n<br>\n");

	// log these for now
	log("title: %s",sb.getBufStart());
	*/

	return true;
}
void HttpRequest::parseFieldsMultipart ( char *s , int32_t slen ) { // should be NULL terminated since we replaced &'s w/ 0's in set() char *send = s + slen ; // reset field count int32_t n = m_numFields; loop: // watch out for overflow if ( n >= MAX_CGI_PARMS ) { log("http: Received more than %" PRId32" CGI parms. " "Truncating.",(int32_t)MAX_CGI_PARMS); return; } s = strncasestr ( s , "\r\nContent-Disposition:", send - s ); if ( ! s ) return; // get the line end s += 2; char *lineEnd = strstr ( s , "\r\n" ); if ( ! lineEnd ) return; // get the name char *name = strncasestr ( s , "name=\"" , lineEnd - s ); if ( ! name ) goto loop; // point to name s = name + 6; // set the nth field name in this cgi string m_fields[n] = s; // point to = sign, use this for multiparts though char *equal = strstr ( s , "\"\r\n\r\n" ); // for uploading a file it looks like // Content-Disposition: form-data; name=\"file\"; filename=\"poo.txt\"\r\nContent-Type: text/plain\r\n\r\nsomething here\n=====\nagain we do it... char *equal2 = strstr ( s , "\"" ); // so if we had that then we had an uploaded file bool uploadedFile = false; if ( equal2 && equal && equal2 < equal ) { uploadedFile = true; equal = equal2; } // try next field if none here if ( ! equal ) goto loop; // set field len m_fieldLens [ n ] = equal - s; // point to field value s = equal + 5; // unless we had an uploaded file, then skip more if ( uploadedFile ) { char *fileStart = strstr(equal,"\r\n\r\n"); if ( fileStart ) fileStart += 4; s = fileStart; } // set = to \0 so getField() returns NULL terminated field name *equal = '\0'; // set value (may be \0) m_fieldValues [ n ] = s; // force to \0 at end char *vend = strstr ( s , "\r\n----------"); // 29 -'s then a # if ( ! vend ) return; // null terminate the value as well *vend = '\0'; // count the number of field/value pairs we get n++; // remember it m_numFields = n; // point to next field goto loop; }
void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml , Sections *sections , XmlDoc *xd ) { // not valid for now m_thumbnailValid = false; // reset our array of image node candidates m_numImages = 0; // flag it m_setCalled = true; // strange... if ( m_imgReply ) { char *xx=NULL;*xx=0; } // save this m_xml = xml; m_pageUrl = pageUrl; // if we are a diffbot json reply, trust that diffbot got the // best candidate, and just use that if ( xd->m_isDiffbotJSONObject ) return; //m_pageSite = pageSite; // scan the words long nw = words->getNumWords(); nodeid_t *tids = words->getTagIds(); long long *wids = words->getWordIds(); //long *scores = scoresArg->m_scores; Section **sp = NULL; if ( sections ) sp = sections->m_sectionPtrs; // not if we don't have any identified sections if ( sections && sections->m_numSections <= 0 ) sp = NULL; // the positive scored window long firstPosScore = -1; long lastPosScore = -1; long badFlags = SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_MARQUEE; // find positive scoring window for ( long i = 0 ; i < nw ; i++ ) { // skip if in bad section if ( sp && (sp[i]->m_flags & badFlags) ) continue; if ( wids[i] != 0 ) continue; // set first positive scoring guy if ( firstPosScore == -1 ) firstPosScore = i; // keep track of last guy lastPosScore = i; } // sanity check if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; } // . pedal firstPosScore back until we hit a section boundary // . i.e. 
stop once we hit a front/back tag pair, like <div> and </div> char tc[512]; memset ( tc , 0 , 512 ); long a = firstPosScore; for ( ; a >= 0 ; a-- ) { // get the tid nodeid_t tid = tids[a]; // remove back bit, if any tid &= BACKBITCOMP; // skip if not a tag, or a generic xml tag if ( tid <= 1 ) continue; // mark it if ( words->isBackTag(a) ) tc[tid] |= 0x02; else tc[tid] |= 0x01; // continue if not a full front/back pair if ( tc[tid] != 0x03 ) continue; // continue if not a "section" type tag (see Scores.cpp) if ( tid != TAG_DIV && tid != TAG_TEXTAREA && tid != TAG_TR && tid != TAG_TD && tid != TAG_TABLE ) continue; // ok we should stop now break; } // min is 0 if ( a < 0 ) a = 0; // now look for the image urls within this window for ( long i = a ; i < lastPosScore ; i++ ) { // skip if not <img> tag if (tids[i] != TAG_IMG ) continue; // get the node num into Xml.cpp::m_nodes[] array long nn = words->m_nodes[i]; // check width to rule out small decorating imgs long width = xml->getLong(nn,nn+1,"width", -1 ); if ( width != -1 && width < 50 ) continue; // same with height long height = xml->getLong(nn,nn+1, "height", -1 ); if ( height != -1 && height < 50 ) continue; // get the url of the image long srcLen; char *src = xml->getString(nn,"src",&srcLen); // skip if none if ( srcLen <= 2 ) continue; // set it to the full url Url iu; // use "pageUrl" as the baseUrl iu.set ( pageUrl , src , srcLen ); // skip if invalid domain or TLD if ( iu.getDomainLen() <= 0 ) continue; // skip if not from same domain as page url //long dlen = pageUrl->getDomainLen(); //if ( iu.getDomainLen() != dlen ) continue; //if(strncmp(iu.getDomain(),pageUrl->getDomain(),dlen))continue // get the full url char *u = iu.getUrl(); long ulen = iu.getUrlLen(); // skip common crap if ( strncasestr(u,ulen,"logo" ) ) continue; if ( strncasestr(u,ulen,"comment" ) ) continue; if ( strncasestr(u,ulen,"print" ) ) continue; if ( strncasestr(u,ulen,"subscribe" ) ) continue; if ( strncasestr(u,ulen,"header" ) ) 
continue; if ( strncasestr(u,ulen,"footer" ) ) continue; if ( strncasestr(u,ulen,"menu" ) ) continue; if ( strncasestr(u,ulen,"button" ) ) continue; if ( strncasestr(u,ulen,"banner" ) ) continue; if ( strncasestr(u,ulen,"ad.doubleclick.") ) continue; if ( strncasestr(u,ulen,"ads.webfeat." ) ) continue; if ( strncasestr(u,ulen,"xads.zedo." ) ) continue; // save it m_imageNodes[m_numImages] = nn; // before we lookup the image url to see if it is unique we // must first make sure that we have an adequate number of // permalinks from this same site with this same hop count. // we need at least 10 before we extract image thumbnails. char buf[2000]; // set the query Query q; // if we do have 10 or more, then we lookup the image url to // make sure it is indeed unique sprintf ( buf , "gbimage:%s",u); // TODO: make sure this is a no-split termid storage thingy // in Msg14.cpp if ( ! q.set2 ( buf , langUnknown , false ) ) // return true with g_errno set on error return; // store the termid m_termIds[m_numImages] = q.getTermId(0); // advance the counter m_numImages++; // break if full if ( m_numImages >= MAX_IMAGES ) break; } }
/* Check whether the field bytes [start,end) of the paragraph in "fp" match
 * the predicate "atom".  Dispatches on atom->mode: exact string compare,
 * substring search, (extended) regex match, or a dpkg-style version
 * comparison against atom->pat.  Returns true on match, false otherwise
 * (including on regex error, after logging it). */
static bool atom_field_verify(struct atom * atom, FSAF * fp,
			      size_t start, size_t end)
{
	size_t len = end - start;
	/* pull the raw field bytes out of the mapped file; r.b is NOT
	 * NUL-terminated, hence the strndup()s below */
	struct fsaf_read_rv r = fsaf_read(fp, start, len);
	assert(r.len == len);
	switch (atom->mode) {
	case M_EXACT:
		/* exact match requires equal lengths first */
		if (len != atom->patlen) return false;
		if (atom->ignore_case) {
			return strncasecmp(atom->pat, r.b, len) == 0;
		} else {
			return strncmp(atom->pat, r.b, len) == 0;
		}
	case M_SUBSTR: {
#if 0
		/* disabled: strncasestr/strnstr are not portably available */
		if (atom->ignore_case) {
			return strncasestr(r.b, atom->pat, len);
		} else {
			return strnstr(r.b, atom->pat, len);
		}
#else
		/* copy to a NUL-terminated buffer so the plain str*str
		 * functions can be used; note this truncates at any
		 * embedded NUL in the field data */
		bool rv;
		char * s = strndup(r.b, len);
		if (s == 0) fatal_enomem(0);
		if (atom->ignore_case) {
			rv = strcasestr(s, atom->pat) != 0;
		} else {
			rv = strstr(s, atom->pat) != 0;
		}
		free(s);
		return rv;
#endif
	}
	case M_REGEX: case M_EREGEX: {
		/* regexec needs a NUL-terminated subject too */
		char * s = strndup(r.b, len);
		if (s == 0) fatal_enomem(0);
		int regex_errcode = regexec(&atom->regex, s, 0, 0, 0);
		free(s);
		if (regex_errcode == 0 || regex_errcode == REG_NOMATCH) {
			return (regex_errcode == 0);
		}
		/* Error handling be here. */
		assert(regex_errcode != 0 && regex_errcode != REG_NOMATCH);
		/* format the regex error, log it, and treat as no-match */
		s = get_regerror (regex_errcode, &atom->regex);
		if (s == 0) { enomem (0); return false; }
		message(L_IMPORTANT, 0, "%s", s);
		free(s);
		return false;
	}
	case M_VER_EQ:case M_VER_LT:case M_VER_LE:case M_VER_GT:case M_VER_GE:
		;
		/* parse both the pattern and the field value as package
		 * versions; an unparseable version never matches */
		char *pats = strndup(atom->pat, atom->patlen);
		char *cands = strndup(r.b, len);
		struct versionrevision pat, cand;
		if (!parse_version(&pat, pats, atom->patlen)) {
			free(pats); free(cands);
			return false;
		}
		if (!parse_version(&cand, cands, len)) {
			free(pats); free(cands);
			return false;
		}
		int res = versioncompare(&cand, &pat);
		free(pats); free(cands);
		/* map the three-way comparison result onto the operator */
		switch (atom->mode) {
		case M_VER_EQ: return res == 0;
		case M_VER_LT: return res < 0;
		case M_VER_LE: return res <= 0;
		case M_VER_GT: return res > 0;
		case M_VER_GE: return res >= 0;
		default: assert(0);
		}
	}
	/* every mode above returns; reaching here means a bad atom->mode */
	assert(0);
}