Пример #1
0
void UrlBreak ( Split_t & tBest, const char * sWord )
{
	const int iLen = strlen(sWord);

	tBest.m_Pos.Resize(0);

	// current partial splits
	// begin with an empty one
	CSphVector<Split_t> dSplits;
	dSplits.Add();

	// our best guess so far
	// begin with a trivial baseline one (ie. no splits at all)
	Prob_t p = g_LM.GetProb ( sWord, iLen );
	tBest.m_Pos.Add ( iLen );
	tBest.m_fProb = p.m_fProb;
	tBest.m_bAllDict = tBest.m_bAnyDict = p.m_bDict;

	if ( iLen>=DICT_COMPOUND_MIN && tBest.m_bAllDict )
	{
		static const float THRESH = logf ( DICT_COMPOUND_THRESH );
		if ( tBest.m_fProb<=THRESH )
			tBest.m_fProb *= DICT_COMPOUND_COEFF;
	}

	// work the current splits
	CSphVector<Split_t> dSplits2;
	while ( dSplits.GetLength() )
	{
		int iWorkedSplits = 0;
		float fPrevBest = tBest.m_fProb;

		ARRAY_FOREACH ( iSplit, dSplits )
		{
			Split_t & s = dSplits[iSplit];

			// filter out splits that were added before (!) a new best guess on the previous iteration
			if ( dSplits[iSplit] < tBest )
				continue;
			iWorkedSplits++;

			int iLast = 0;
			if ( s.m_Pos.GetLength() )
				iLast = s.m_Pos.Last();

			for ( int i=1+iLast; i<iLen; i++ )
			{
				// consider a split at position i
				// it generates a word candidate [iLast,i) and a tail [i,iLen)
				// let's score those
				Prob_t tCand = g_LM.GetProb ( sWord+iLast, i-iLast );
				Prob_t tTail = g_LM.GetProb ( sWord+i, iLen-i );

				// if the current best is all-keywords, the new candidates must be, too
				if ( tBest.m_bAllDict && !tCand.m_bDict )
					continue;

				// compute partial and full split candidates generated by the current guess
				Split_t tPartial = s;
				tPartial.AddSplitPos ( tCand, i );

				Split_t tFull = tPartial;
				tFull.AddSplitPos ( tTail, iLen );

				// check if the full one is our new best full one
				bool bNewBest = false;
				if ( tBest < tFull )
				{
					// FIXME? we do this even when the new split is *not* all-keywords,
					// but the old best split was; is this ever a problem?
					tBest = tFull;
//					tBest.Dump ( sWord, "new-best" );
					bNewBest = true;
				}

				// check if the resulting partial split is worth scanning further
				if ( tBest < tPartial )
				{
					dSplits2.Add ( tPartial );
//					dSplits2.Last().Dump ( sWord, "scan-partial" );
				}
			}
		}

		// damage control!
		// if we just processed over 100K candidate splits and got no improvement
		// lets assume that our chances of getting one are kinda low and bail
		if ( iWorkedSplits>=100000 && tBest.m_fProb>=fPrevBest )
			break;

		// keep going
		dSplits.SwapData ( dSplits2 );
		dSplits2.Resize ( 0 );
	}