C++ (Cpp) is_alpha_utf8示例

编程语言: C++ (Cpp)

方法/功能: is_alpha_utf8

hotexamples.com的示例: 2

C++ (Cpp) is_alpha_utf8 - 已找到2个示例。这些是从开源项目中提取的最受好评的is_alpha_utf8现实C++ (Cpp)示例。您可以评价示例，以帮助我们提高示例质量。

示例#1

显示文件

文件： fctypes.cpp 项目： privacore/open-source-search-engine

// . returns true if any character in the utf8 range [s,send) is alphabetic
// . single byte characters are tested with is_alpha_a(), multi-byte
//   characters are handed to is_alpha_utf8()
// . returns false for an empty range
bool has_alpha_utf8 ( char *s , char *send ) {
	while ( s < send ) {
		// how many bytes does the character at "s" take up?
		const char charLen = getUtf8CharSize ( s );
		// pick the ascii or the utf8 test based on that size
		const bool alpha = ( charLen == 1 ) ? is_alpha_a ( *s ) : is_alpha_utf8 ( s );
		if ( alpha ) {
			return true;
		}
		// advance to the start of the next character
		s += charLen;
	}
	return false;
}

示例#2

显示文件

文件： Summary.cpp 项目： exename/open-source-search-engine

// . build the query-biased summary of a document into m_summary[]
// . repeatedly asks getBestWindow() for the highest scoring window around each
//   query term match in "matches", appends the winning window to m_summary
//   as an excerpt and marks its words as used, until "maxNumLines" excerpts
//   were counted, no window scores above zero, or the buffer has no room left
// . falls back to getDefaultSummary() if there are no matches at all or if
//   nothing was written to m_summary
// . "xml" supplies the version handed to pos->filter(); "xml", "words" and
//   "sections" are otherwise only forwarded to getDefaultSummary()
// . "pos" does the actual filtering of the winning words into the buffer
// . "q" is the query; its m_numWords sizes the per query word scratch vectors
// . "maxSummaryLen" is the number of bytes of m_summary[] we may fill
// . "numDisplayLines" is how many leading excerpts count towards m_displayLen
// . "maxNumCharsPerLine" is the maximum excerpt length passed to
//   getBestWindow()
// . "f" is only used to print the url in the infinite loop warning
// . "titleBuf"/"titleBufLen" are used to reject an excerpt that has the same
//   length as the title and is found in it by strncasestr()
// . returns false and sets g_errno to EBUFTOOSMALL if maxSummaryLen or
//   maxNumLines are too big (the return value is the one log() hands back,
//   presumably false)
// . also returns false if we could not get one of the scratch buffers
// . otherwise returns true, or whatever getDefaultSummary() returned
bool Summary::setSummary ( Xml *xml, Words *words, Sections *sections, Pos *pos, Query *q, int32_t maxSummaryLen,
						   int32_t maxNumLines, int32_t numDisplayLines, int32_t maxNumCharsPerLine, Url *f,
                           Matches *matches, char *titleBuf, int32_t titleBufLen ) {
	m_numDisplayLines = numDisplayLines;
	m_displayLen      = 0;

	// . the requested excerpts (plus 6 bytes of overhead per excerpt)
	//   would not fit into maxSummaryLen
	// . NOTE(review): the warning below says the excerpt length was
	//   "decreased", but the only adjustment visible here raises it to a
	//   minimum of 10. confirm whether a reduction step is missing.
	if ( (maxNumCharsPerLine + 6) * maxNumLines > maxSummaryLen ) {
		if ( maxNumCharsPerLine < 10 ) {
			maxNumCharsPerLine = 10;
		}

		// function-local static so the warning is only logged the first
		// time we get here
		static char s_flag = 1;
		if ( s_flag ) {
			s_flag = 0;
			log("query: Warning. "
			    "Max summary excerpt length decreased to "
			    "%" PRId32" chars because max summary excerpts and "
			    "max summary length are too big.",
			    maxNumCharsPerLine);
		}
	}

	// . sanity check
	// . summary must fit in m_summary[]
	// . leave room for trailing \0
	if ( maxSummaryLen >= MAX_SUMMARY_LEN ) {
		g_errno = EBUFTOOSMALL;
		return log("query: Summary too big to hold in buffer of %" PRId32" bytes.",(int32_t)MAX_SUMMARY_LEN);
	}

	// do not overrun the final*[] buffers
	if ( maxNumLines > 256 ) { 
		g_errno = EBUFTOOSMALL; 
		return log("query: More than 256 summary lines requested.");
	}

	// Nothing to match...print beginning of content as summary
	if ( matches->m_numMatches == 0 && maxNumLines > 0 ) {
		return getDefaultSummary ( xml, words, sections, pos, maxSummaryLen );
	}

	// . one float weight per query word
	// . use m_tmpWordWeightsBuf if we need less than 128 bytes, otherwise
	//   allocate. m_wordWeightSize remembers the size we asked for.
	// . this function does not free m_wordWeights, presumably that is
	//   done elsewhere in the class
	int32_t need1 = q->m_numWords * sizeof(float);
	m_wordWeightSize = need1;
	if ( need1 < 128 ) {
		m_wordWeights = (float *)m_tmpWordWeightsBuf;
	} else {
		m_wordWeights = (float *)mmalloc ( need1 , "wwsum" );
	}

	// no buffer, so mmalloc() must have failed (g_errno is not set here,
	// presumably mmalloc() does that)
	if ( ! m_wordWeights ) {
		return false;
	}

	/// @todo ALC fix word weights
	/// non-working logic is removed in commit 5eacee9063861e859b54ec62035a600aa8af25df

	// . compute our word weights wrt each query. words which are more rare
	//   have a higher weight. We use this to weight the terms importance
	//   when generating the summary.
	// . used by the proximity algo
	// . used in setSummaryScores() for scoring summaries
	// . per the @todo above, every query word currently just gets the
	//   same weight of 1.0


	for ( int32_t i = 0 ; i < q->m_numWords; i++ ) {
		m_wordWeights[i] = 1.0;
	}

	// convenience
	m_maxNumCharsPerLine = maxNumCharsPerLine;
	m_q = q;

	// set the max excerpt len to the max summary excerpt len
	int32_t maxExcerptLen = m_maxNumCharsPerLine;

	// . guard against the main loop below never advancing numFinal
	// . lastNumFinal is the value of numFinal on the previous pass and
	//   maxLoops is how many non-advancing passes we still tolerate
	int32_t lastNumFinal = 0;
	int32_t maxLoops = 1024;

	// no excerpts were requested, so we are done (m_summary is not touched)
	if ( maxNumLines <= 0 ) {
		return true;
	}

	// . "p" is the write cursor into m_summary
	// . "pend" is the end of the part of m_summary we are allowed to fill
	char *p = m_summary;
	char *pend = m_summary + maxSummaryLen;

	m_numExcerpts = 0;

	// . scratch space for three vectors of one byte per query word:
	//   "retired", "maxGotIt" and "gotIt"
	// . use m_tmpBuf4 if we need less than 128 bytes, otherwise allocate
	int32_t need2 = (1+1+1) * m_q->m_numWords;
	m_buf4Size = need2;
	if ( need2 < 128 ) {
		m_buf4 = m_tmpBuf4;
	} else {
		m_buf4 = (char *)mmalloc ( need2 , "stkbuf" );
	}

	// no buffer, so mmalloc() must have failed
	if ( ! m_buf4 ) {
		return false;
	}

	// carve the three vectors out of m_buf4, back to back
	char *x = m_buf4;
	char *retired = x;
	x += m_q->m_numWords;
	char *maxGotIt = x;
	x += m_q->m_numWords;
	char *gotIt = x;

	// . the "maxGotIt" count vector accumulates into "retired"
	// . that is how we keep track of what query words we used for previous
	//   summary excerpts so we try to get diversified excerpts with 
	//   different query terms/words in them
	//char retired  [ MAX_QUERY_WORDS ];
	memset ( retired, 0, m_q->m_numWords * sizeof(char) );

	// some query words are already matched in the title
	for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
		if ( matches->m_qwordFlags[i] & MF_TITLEGEN ) {
			retired [ i ] = 1;
		}
	}

	// did the excerpt before this one end in an ellipsis?
	bool hadEllipsis = false;

	// 
	// Loop over all words that match a query term. The matching words
	// could be from any one of the Words classes referenced by the
	// matches. Find the highest scoring window around each term. And then
	// find the highest of those over all the matching terms.
	//
	// "numFinal" counts the excerpts accepted so far. A pass that does not
	// want to be counted does numFinal-- to cancel the loop's increment.
	//
	int32_t numFinal;
	for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ) {
		// we have all the excerpts that are displayed, so remember how
		// many bytes of m_summary they take up
		if ( numFinal == m_numDisplayLines ) {
			m_displayLen = p - m_summary;
		}

		// reset these at the top of each loop
		// . maxm           = the winning match, set once we have a winner
		// . maxScore       = score of the best window so far
		// . [maxa,maxb)    = word interval of the best window so far
		// . maxi           = index into matches->m_matches of the winner,
		//                    -1 means no winner yet
		// . lasta          = passed to getBestWindow() by address so it
		//                    does not repeat the same window
		Match     *maxm;
		int64_t  maxScore = 0;
		int32_t       maxa = 0;
		int32_t       maxb = 0;
		int32_t       maxi  = -1;
		int32_t       lasta = -1;

		// . numFinal did not advance since the previous pass (this is
		//   also true on the very first pass)
		// . give up once that happened more than maxLoops times
		if(lastNumFinal == numFinal) {
			if(maxLoops-- <= 0) {
				log(LOG_WARN, "query: got infinite loop bug, query is %s url is %s", m_q->m_orig, f->getUrl());
				break;
			}
		}
		lastNumFinal = numFinal;

		// loop through all the matches and see which is best
		for ( int32_t i = 0 ; i < matches->m_numMatches ; i++ ) {
			// word interval of the window around this match, set by
			// getBestWindow()
			int32_t       a , b;
			// reset lasta if we changed words class
			if ( i > 0 && matches->m_matches[i-1].m_words != matches->m_matches[i].m_words ) {
				lasta = -1;
			}

			// only use matches that carry one of the meta summary, meta
			// description, body or rss description flags
			mf_t flags = matches->m_matches[i].m_flags;

			bool skip = true;
			if ( flags & MF_METASUMM ) {
				skip = false;
			}
			if ( flags & MF_METADESC ) {
				skip = false;
			}
			if ( flags & MF_BODY     ) {
				skip = false;
			}
			if ( flags & MF_RSSDESC  ) {
				skip = false;
			}

			if ( skip ) {
				continue;
			}

			// ask him for the query words he matched
			//char gotIt [ MAX_QUERY_WORDS ];
			// clear it for him
			memset ( gotIt, 0, m_q->m_numWords * sizeof(char) );

			// . get score of best window around this match
			// . do not allow left post of window to be <= lasta to
			//   avoid repeating the same window.
			int64_t score = getBestWindow (matches, i, &lasta, &a, &b, gotIt, retired, maxExcerptLen);
			
			// USE THIS BUF BELOW TO DEBUG THE ABOVE CODE. 
			// PRINTS OUT THE SUMMARY
			/*
			//if ( score >=12000 ) {
			char buf[10*1024];
			char *xp = buf;
			if ( i == 0 )
				log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
			sprintf(xp, "score=%08" PRId32" a=%05" PRId32" b=%05" PRId32" ",
				(int32_t)score,(int32_t)a,(int32_t)b);
			xp += strlen(xp);
			for ( int32_t j = a; j < b; j++ ){
				//int32_t s = scores->m_scores[j];
				int32_t s = 0;
				if ( s < 0 ) continue;
				char e = 1;
				int32_t len = words->getWordLen(j);
				for(int32_t k=0;k<len;k +=e){
					char c = words->m_words[j][k];
					//if ( is_binary( c ) ) continue;
					*xp = c;
					xp++;
				}
				//p += strlen(p);
				if ( s == 0 ) continue;
				sprintf ( xp ,"(%" PRId32")",s);
				xp += strlen(xp);
			}
			log (LOG_WARN,"query: summary: %s", buf);
			//}
			*/

			// prints out the best window with the score
			/*
			char buf[MAX_SUMMARY_LEN];
			  char *bufPtr = buf;
			  char *bufPtrEnd = p + MAX_SUMMARY_LEN;
			  if ( i == 0 )
			  log (LOG_WARN,"=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=");
			  int32_t len = 0;
			  Words *ww  = matches->m_matches[i].m_words;
			  //Sections *ss = matches->m_matches[i].m_sections;
			  //if ( ss->m_numSections <= 0 ) ss = NULL;
			  //len=pos->filter(bufPtr, bufPtrEnd, ww, a, b, NULL);
			  //log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			  //score);
			  log(LOG_WARN,"summary: %" PRId32") %s - %" PRId64,i,bufPtr,
			  score);
			*/

			// skip if was in title or something
			if ( score <= 0 ) {
				continue;
			}

			// skip if not a winner. on ties the earlier match wins.
			if ( maxi >= 0 && score <= maxScore ) {
				continue;
			}

			// we got a new winner
			maxi     = i;
			maxa     = a;
			maxb     = b;
			maxScore = score;

			// save the query words it matched too, they get added to
			// "retired" below if this window stays the winner
			gbmemcpy ( maxGotIt , gotIt , m_q->m_numWords );

		}
	
		// retire the query words in the winning summary

		
		//log( LOG_WARN,"summary: took %" PRId64" ms to finish getbestwindo",
		//    gettimeofdayInMilliseconds() - stget );


		// all done if no winner was made
		if ( maxi == -1 || maxa == -1 || maxb == -1) {
			break;
		}

		// who is the winning match?
		maxm = &matches->m_matches[maxi];
		Words *ww = maxm->m_words;

		// we now use "m_swbits" for the summary bits since they are
		// of size sizeof(swbit_t), a int16_t at this point
		swbit_t *bb = maxm->m_bits->m_swbits;

		// . this should be impossible
		// . if it happens anyway, clamp the window to the last word
		// . NOTE(review): the check is ">" so maxa == getNumWords() is
		//   let through and then used as an index below. confirm whether
		//   getBestWindow() can return that.
		if ( maxa > ww->getNumWords() || maxb > ww->getNumWords() ) {
			log ( LOG_WARN,"query: summary starts or ends after "
			      "document is over! maxa=%" PRId32" maxb=%" PRId32" nw=%" PRId32,
			      maxa, maxb, ww->getNumWords() );
			maxa = ww->getNumWords() - 1;
			maxb = ww->getNumWords();
		}

		// assume we need to precede the excerpt with an ellipsis "..."
		bool needEllipsis = true;
		
		// first character of the first word in the winning window
		const char *c = ww->getWord(maxa)+0;

		// rule of thumb, don't use ellipsis if the first letter is capital, or a non letter
		// or if the first word is flagged as starting a sentence.
		// or if into the sample and previous excerpt had an ellipsis do not bother using one for us.
		if ( !is_alpha_utf8(c) || is_upper_utf8(c) ||
		     (bb[maxa] & D_STARTS_SENTENCE) ||
		     (p > m_summary && hadEllipsis)) {
			needEllipsis = false;
		}

		// . NOTE(review): from here until "p += len" below "p" is
		//   advanced for the ellipsis, spaces and quote before we know
		//   whether the excerpt is kept. if the excerpt is rejected via
		//   "goto skip" those bytes stay in m_summary. confirm that is
		//   intended.
		if ( needEllipsis ) {
			// break out if no room for "..."
			if ( p + 4 + 2 > pend ) {
				break;
			}

			// space first?
			if ( p > m_summary ) {
				*p++ = ' ';
			}

			// 3 byte utf8 ellipsis followed by a space
			memcpy ( p, "\342\200\246 ", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// separate summary excerpts with a single space.
		if ( p > m_summary ) {
			if ( p + 2 > pend ) {
				break;
			}

			*p++ = ' ';
		}

		// . assume we need a trailing ellipsis
		// . nothing below clears this again, so every stored excerpt
		//   gets a trailing ellipsis
		needEllipsis = true;

		// so next excerpt does not need to have an ellipsis if we 
		// have one at the end of this excerpt
		hadEllipsis = needEllipsis;

		// start with quote?
		if ( (bb[maxa] & D_IN_QUOTES) && p + 1 < pend ) {
			// precede with quote
			*p++ = '\"';
		}
	
		// . filter the words into p
		// . removes back to back spaces
		// . converts html entities
		// . filters in stores words in [a,b) interval
		// . "len" is presumably the number of bytes stored at p, we
		//   advance p by it below
		int32_t len = pos->filter( ww, maxa, maxb, false, p, pend, xml->getVersion() );

		// break out if did not fit
		if ( len == 0 ) {
			break;
		}

		// . don't consider it if it matches the title
		// . note that this only triggers if the excerpt is exactly as
		//   long as the title
		if ( len == titleBufLen && strncasestr(titleBuf, p, titleBufLen, len) ) {
			// don't consider this one, and cancel the loop's numFinal++
			numFinal--;
			goto skip;
		}
	
		// don't consider it if the length wasn't anything nice
		if ( len < 5 ){
			// cancel the loop's numFinal++
			numFinal--;
			goto skip;
		}

		// otherwise, keep the excerpt by moving the cursor past it
		p += len;

		// now we just indicate which query terms we got
		for ( int32_t i = 0 ; i < m_q->m_numWords ; i++ ) {
			// do not breach, "retired" is a vector of chars
			if ( retired[i] >= 100 ) {
				continue;
			}
			retired [ i ] += maxGotIt [ i ];
		}
	
		// flag the words of the winning window as used so we don't get
		// them again (the same is done again at the "skip" label below)
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it as used
			bb[j] |= D_USED;
		}

		// . add the trailing ellipsis
		// . needEllipsis is always true here, see above
		if ( needEllipsis ) {
			if ( p + 4 + 2 > pend ) {
				break;
			}
			// space followed by the 3 byte utf8 ellipsis
			memcpy ( p, " \342\200\246", 4 ); //horizontal ellipsis, code point 0x2026
			p += 4;
		}

		// . if this excerpt used at most about half of the full excerpt
		//   length then ask for a half length excerpt on the next pass
		// . otherwise record where this excerpt ends in m_summary
		if ( maxExcerptLen == m_maxNumCharsPerLine && len <= ( m_maxNumCharsPerLine / 2 + 1 ) ) {
			maxExcerptLen = m_maxNumCharsPerLine / 2;

			// don't count it in the finals since we try to get a small excerpt
			numFinal--;
		} else if ( m_numExcerpts < MAX_SUMMARY_EXCERPTS && m_numExcerpts >= 0 ) {
			m_summaryExcerptLen[m_numExcerpts] = p - m_summary;
			m_numExcerpts++;

			// also reset maxExcerptLen
			maxExcerptLen = m_maxNumCharsPerLine;
		}
	
	skip:
		// . rejected excerpts jump here, accepted ones fall through
		// . flag the words of the window so they will not be used in others
		for ( int32_t j = maxa ; j < maxb ; j++ ) {
			// mark it
			bb[j] |= D_USED;
		}
	}

	// we did not get more excerpts than are displayed, so everything we
	// stored counts towards the display length
	if ( numFinal <= m_numDisplayLines ) {
		m_displayLen = p - m_summary;
	}

	// free the mem we used if we allocated it
	if ( m_buf4 && m_buf4 != m_tmpBuf4 ) {
		mfree ( m_buf4 , m_buf4Size , "ssstkb" );
		m_buf4 = NULL;
	}

	// If we still didn't find a summary, get the default summary
	if ( p == m_summary ) {
		bool status = getDefaultSummary ( xml, words, sections, pos, maxSummaryLen );
		// m_summaryLen was presumably set by getDefaultSummary()
		if ( m_numDisplayLines > 0 ) {
			m_displayLen = m_summaryLen;
		}
		
		return status;
	}

	// we have a summary, so NUL terminate it
	*p++ = '\0';

	// set length, this includes the terminating \0 stored above
	m_summaryLen = p - m_summary;

	// sanity check, abort if the summary got unreasonably long
	if ( m_summaryLen > 50000 ) { g_process.shutdownAbort(true); }

	return true;
}