/// Applies this affix rule to sWord, rewriting the word in place.
///
/// A rule with an empty condition succeeds without touching the word.
/// An empty word never matches. Otherwise the word length is stored in
/// m_iWordLen (the strip/append helpers read it), the rule condition is
/// checked unless it is the single "." wildcard, and finally the strip/append
/// step for the rule direction (suffix or prefix) is performed.
///
/// Returns true when the rule was applied (or had no condition), false when
/// the condition check or the strip/append step rejected the word.
bool CISpellAffixRule::Apply ( CSphString & sWord )
{
	// nothing to match against: report success, word stays as is
	if ( m_sCondition.IsEmpty () )
		return true;

	if ( sWord.IsEmpty () )
		return false;

	// remembered for the strip/append helpers below
	m_iWordLen = strlen ( sWord.cstr () );

	const bool bSuffixRule = ( m_eRule==RULE_SUFFIXES );
	const bool bMatchAny = ( m_sCondition=="." );

	// a lone "." condition skips the condition check entirely
	if ( !bMatchAny )
	{
		const bool bCondOk = bSuffixRule ? CheckSuffix ( sWord ) : CheckPrefix ( sWord );
		if ( !bCondOk )
			return false;
	}

	const bool bRewritten = bSuffixRule ? StripAppendSuffix ( sWord ) : StripAppendPrefix ( sWord );
	return bRewritten;
}
/// Rewrites sWord in place for a prefix rule: the rule's strip string is removed
/// from the start of the word and the rule's append string is put in front of
/// what remains.
///
/// Reads m_iWordLen, which Apply() sets to the length of sWord before calling.
///
/// Returns false and leaves sWord untouched when the word does not start with
/// the strip string, or when the rewritten word would not fit into the
/// fixed-size scratch buffer. Returns true after sWord has been replaced.
bool CISpellAffixRule::StripAppendPrefix ( CSphString & sWord ) const
{
	static char szTmp [MAX_STR_LENGTH];

	if ( !m_sStrip.IsEmpty () )
	{
		// the strip string has to sit at the very beginning of the word
		const char * Pos = strstr ( sWord.cstr (), m_sStrip.cstr () );
		if ( Pos!=sWord.cstr() )
			return false;
	}

	// characters of the original word that survive the strip, and the final length
	const int iKeepLen = (int)( m_iWordLen - m_iStripLen );
	const int iNewLen = iKeepLen + (int)m_iAppendLen;

	// refuse to build a result that would overrun szTmp (terminator included);
	// a negative length would otherwise turn into a huge copy count
	if ( iKeepLen<0 || iNewLen<0 || iNewLen>=(int)sizeof(szTmp) )
		return false;

	// bounded copy: never writes more than the m_iAppendLen bytes accounted for above
	if ( !m_sAppend.IsEmpty () )
		strncpy ( szTmp, m_sAppend.cstr(), m_iAppendLen );

	strncpy ( szTmp + m_iAppendLen, sWord.cstr () + m_iStripLen, iKeepLen );
	szTmp [iNewLen] = '\0';

	sWord = szTmp;
	return true;
}
/// Rewrites sWord in place for a suffix rule: the rule's strip string is removed
/// from the end of the word and the rule's append string is added after what
/// remains.
///
/// Reads m_iWordLen, which Apply() sets to the length of sWord before calling.
///
/// Returns false and leaves sWord untouched when the word does not end with the
/// strip string, or when the rewritten word would not fit into the fixed-size
/// scratch buffer. Returns true after sWord has been replaced.
bool CISpellAffixRule::StripAppendSuffix ( CSphString & sWord ) const
{
	static char szTmp [MAX_STR_LENGTH];

	if ( !m_sStrip.IsEmpty () )
	{
		if ( m_iWordLen < m_iStripLen )
			return false;

		// the tail of the word has to be exactly the strip string
		if ( strncmp ( sWord.cstr () + m_iWordLen - m_iStripLen, m_sStrip.cstr (), m_iStripLen ) )
			return false;
	}

	// characters of the original word that survive the strip, and the final length
	const int iKeepLen = (int)( m_iWordLen - m_iStripLen );
	const int iNewLen = iKeepLen + (int)m_iAppendLen;

	// refuse to build a result that would overrun szTmp (terminator included);
	// a negative length would otherwise turn into a huge copy count
	if ( iKeepLen<0 || iNewLen<0 || iNewLen>=(int)sizeof(szTmp) )
		return false;

	strncpy ( szTmp, sWord.cstr (), iKeepLen );
	szTmp [iKeepLen] = '\0';

	if ( !m_sAppend.IsEmpty () )
	{
		// bounded copy at a known offset instead of an unbounded strcat
		strncpy ( szTmp + iKeepLen, m_sAppend.cstr (), m_iAppendLen );
		szTmp [iNewLen] = '\0';
	}

	sWord = szTmp;
	return true;
}
/// Pulls documents from pSrc one by one and adds them to pIndex, timing the
/// commits along the way and printing a summary at the end.
///
/// - A commit is issued whenever the source's total document count is a
///   multiple of COMMIT_STEP, and once more when the source yields a zero
///   document id, which also ends the loop.
/// - Any iterate-document / iterate-hits failure terminates via sphDie().
/// - Progress is printed every 100 documents.
/// - DoSearch() is run a single time (guarded by a function-local static),
///   once commits*COMMIT_STEP reaches 5000.
/// - The processed volume in MB is added to g_fTotalMB.
void DoIndexing ( CSphSource * pSrc, ISphRtIndex * pIndex )
{
	CSphString sError;
	CSphVector<DWORD> dMvas;

	const int64_t tmStart = sphMicroTimer ();
	int64_t tmAvgCommit = 0; // running sum while looping; turned into the average on exit
	int64_t tmMaxCommit = 0;
	int iCommits = 0;

	while ( true )
	{
		if ( !pSrc->IterateDocument ( sError ) )
			sphDie ( "iterate-document failed: %s", sError.cstr() );

		ISphHits * pHits = pSrc->IterateHits ( sError );
		if ( !sError.IsEmpty() )
			sphDie ( "iterate-hits failed: %s", sError.cstr() );

		// a zero document id is not indexed
		if ( pSrc->m_tDocInfo.m_iDocID )
			pIndex->AddDocument ( pHits, pSrc->m_tDocInfo, NULL, dMvas, sError );

		const bool bStepReached = ( pSrc->GetStats().m_iTotalDocuments % COMMIT_STEP )==0;
		const bool bSourceDone = !pSrc->m_tDocInfo.m_iDocID;

		if ( bStepReached || bSourceDone )
		{
			const int64_t tmBefore = sphMicroTimer();
			pIndex->Commit ();
			const int64_t tmCommit = sphMicroTimer() - tmBefore;

			++iCommits;
			tmAvgCommit += tmCommit;
			if ( tmMaxCommit<tmCommit )
				tmMaxCommit = tmCommit;

			if ( bSourceDone )
			{
				// final commit done; finish the average and leave the loop
				tmAvgCommit /= iCommits;
				break;
			}
		}

		if ( ( pSrc->GetStats().m_iTotalDocuments % 100 )==0 )
			printf ( "%d docs\r", (int)pSrc->GetStats().m_iTotalDocuments );

		// run the search pass exactly once across all calls
		static bool bSearchPending = true;
		if ( bSearchPending && iCommits*COMMIT_STEP>=5000 )
		{
			printf ( "\n" );
			DoSearch ( pIndex );
			bSearchPending = false;
		}
	}

	pSrc->Disconnect();

	const int64_t tmEnd = sphMicroTimer ();
	const int64_t tmElapsed = tmEnd-tmStart;
	float fTotalMB = (float)pSrc->GetStats().m_iTotalBytes/1000000.0f;

	printf ( "commit-step %d, %d docs, %d bytes, %d.%03d sec, %.2f MB/sec\n",
		COMMIT_STEP,
		(int)pSrc->GetStats().m_iTotalDocuments,
		(int)pSrc->GetStats().m_iTotalBytes,
		(int)(tmElapsed/1000000), (int)((tmElapsed%1000000)/1000),
		fTotalMB*1000000.0f/tmElapsed );

	printf ( "commit-docs %d, avg %d.%03d msec, max %d.%03d msec\n",
		COMMIT_STEP,
		(int)(tmAvgCommit/1000), (int)(tmAvgCommit%1000),
		(int)(tmMaxCommit/1000), (int)(tmMaxCommit%1000) );

	g_fTotalMB += fTotalMB;
}
void CISpellAffix::LoadLocale () { if ( m_bUseDictConversion ) printf ( "Using dictionary-defined character set\n" ); else if ( !m_sCharsetFile.IsEmpty () ) { FILE * pFile = fopen ( m_sCharsetFile.cstr (), "rt" ); if ( pFile ) { printf ( "Using charater set from '%s'\n", m_sCharsetFile.cstr () ); const int MAX_CHARSET_LENGTH = 4096; char szBuffer [MAX_CHARSET_LENGTH]; char * szResult = fgets ( szBuffer, MAX_CHARSET_LENGTH, pFile ); if ( szResult ) { CSphVector<CSphRemapRange> dRemaps; if ( sphParseCharset ( szBuffer, dRemaps ) ) { m_bUseLowerCaser = true; m_LowerCaser.AddRemaps ( dRemaps, 0 ); } else { printf ( "Failed to parse charset from '%s'\n", m_sCharsetFile.cstr() ); } } else { printf ( "Failed to read charset from '%s'\n", m_sCharsetFile.cstr() ); } fclose ( pFile ); } else { printf ( "Failed to open '%s'\n", m_sCharsetFile.cstr() ); } } else { if ( !m_sLocale.IsEmpty () ) { char dLocaleC[256], dLocaleUser[256]; setlocale ( LC_ALL, "C" ); for ( int i=0; i<256; i++ ) dLocaleC[i] = (char) tolower(i); char * szLocale = setlocale ( LC_CTYPE, m_sLocale.cstr() ); if ( szLocale ) { printf ( "Using user-defined locale (locale=%s)\n", m_sLocale.cstr() ); for ( int i=0; i<256; i++ ) dLocaleUser[i] = (char) tolower(i); if ( !memcmp ( dLocaleC, dLocaleUser, 256 ) ) printf ( "WARNING: user-defined locale provides the same case conversion as the default \"C\" locale\n" ); } else printf ( "WARNING: could not set user-defined locale for case conversions (locale=%s)\n", m_sLocale.cstr() ); } else printf ( "WARNING: no character set specified\n" ); } }