void UrlBreak ( Split_t & tBest, const char * sWord ) { const int iLen = strlen(sWord); tBest.m_Pos.Resize(0); // current partial splits // begin with an empty one CSphVector<Split_t> dSplits; dSplits.Add(); // our best guess so far // begin with a trivial baseline one (ie. no splits at all) Prob_t p = g_LM.GetProb ( sWord, iLen ); tBest.m_Pos.Add ( iLen ); tBest.m_fProb = p.m_fProb; tBest.m_bAllDict = tBest.m_bAnyDict = p.m_bDict; if ( iLen>=DICT_COMPOUND_MIN && tBest.m_bAllDict ) { static const float THRESH = logf ( DICT_COMPOUND_THRESH ); if ( tBest.m_fProb<=THRESH ) tBest.m_fProb *= DICT_COMPOUND_COEFF; } // work the current splits CSphVector<Split_t> dSplits2; while ( dSplits.GetLength() ) { int iWorkedSplits = 0; float fPrevBest = tBest.m_fProb; ARRAY_FOREACH ( iSplit, dSplits ) { Split_t & s = dSplits[iSplit]; // filter out splits that were added before (!) a new best guess on the previous iteration if ( dSplits[iSplit] < tBest ) continue; iWorkedSplits++; int iLast = 0; if ( s.m_Pos.GetLength() ) iLast = s.m_Pos.Last(); for ( int i=1+iLast; i<iLen; i++ ) { // consider a split at position i // it generates a word candidate [iLast,i) and a tail [i,iLen) // let's score those Prob_t tCand = g_LM.GetProb ( sWord+iLast, i-iLast ); Prob_t tTail = g_LM.GetProb ( sWord+i, iLen-i ); // if the current best is all-keywords, the new candidates must be, too if ( tBest.m_bAllDict && !tCand.m_bDict ) continue; // compute partial and full split candidates generated by the current guess Split_t tPartial = s; tPartial.AddSplitPos ( tCand, i ); Split_t tFull = tPartial; tFull.AddSplitPos ( tTail, iLen ); // check if the full one is our new best full one bool bNewBest = false; if ( tBest < tFull ) { // FIXME? we do this even when the new split is *not* all-keywords, // but the old best split was; is this ever a problem? tBest = tFull; // tBest.Dump ( sWord, "new-best" ); bNewBest = true; } // check if the resulting partial split is worth scanning further if ( tBest < tPartial ) { dSplits2.Add ( tPartial ); // dSplits2.Last().Dump ( sWord, "scan-partial" ); } } } // damage control! // if we just processed over 100K candidate splits and got no improvement // lets assume that our chances of getting one are kinda low and bail if ( iWorkedSplits>=100000 && tBest.m_fProb>=fPrevBest ) break; // keep going dSplits.SwapData ( dSplits2 ); dSplits2.Resize ( 0 ); }