示例#1
0
/// Apply this affix rule to sWord in place.
/// Returns true when the rule has no condition (the word is left untouched), or when
/// the condition check passes and the strip/append step succeeds; false otherwise.
bool CISpellAffixRule::Apply ( CSphString & sWord )
{
	// a rule without a condition accepts any word as is
	if ( m_sCondition.IsEmpty () )
		return true;

	// nothing to transform
	if ( sWord.IsEmpty () )
		return false;

	// the Check*/StripAppend* helpers read the word length from this member
	m_iWordLen = strlen ( sWord.cstr () );

	// a lone "." condition skips the condition check entirely
	const bool bNeedCheck = !( m_sCondition=="." );

	if ( m_eRule==RULE_SUFFIXES )
		return ( !bNeedCheck || CheckSuffix ( sWord ) ) && StripAppendSuffix ( sWord );

	return ( !bNeedCheck || CheckPrefix ( sWord ) ) && StripAppendPrefix ( sWord );
}
示例#2
0
文件: indextool.cpp 项目: eJon/common
/// Read all of stdin, run it through an HTML stripper configured with the given
/// indexed attributes and removed elements, and print the result to stdout.
/// Dies via sphDie() if the stripper cannot be configured.
void StripStdin ( const char * sIndexAttrs, const char * sRemoveElements )
{
	CSphString sError;
	CSphHTMLStripper tStripper ( true );

	// configure; the second setter is only attempted when the first one succeeded
	bool bConfigured = tStripper.SetIndexedAttrs ( sIndexAttrs, sError );
	if ( bConfigured )
		bConfigured = tStripper.SetRemovedElements ( sRemoveElements, sError );
	if ( !bConfigured )
		sphDie ( "failed to configure stripper: %s", sError.cstr() );

	// slurp stdin in 1K chunks
	CSphVector<BYTE> dBuffer;
	for ( ;; )
	{
		if ( feof(stdin) )
			break;

		char sChunk[1024];
		int iRead = fread ( sChunk, 1, sizeof(sChunk), stdin );
		if ( iRead==0 )
			break;

		int iOldLen = dBuffer.GetLength();
		dBuffer.Resize ( iOldLen+iRead );
		memcpy ( &dBuffer[iOldLen], sChunk, iRead );
	}

	// zero-terminate so the buffer can be handled as a C string
	dBuffer.Add ( 0 );

	tStripper.Strip ( &dBuffer[0] );
	fprintf ( stdout, "dumping stripped results...\n%s\n", &dBuffer[0] );
}
示例#3
0
void ExcerptGen_c::AddJunk ( int iStart, int iLength )
{
    int iChunkStart = iStart;

    for ( int i = iStart; i < iStart+iLength; i++ )
        if ( sphIsSpace ( m_sBuffer.cstr () [i] ) != sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) )
        {
            m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
            Token_t & tLast = m_dTokens.Last ();
            tLast.m_eType   = TOK_SPACE;
            tLast.m_iStart	= iChunkStart;
            tLast.m_iLengthBytes = i - iChunkStart;
            tLast.m_iWordID = 0;
            tLast.m_uWords = 0;

            iChunkStart = i;
        }

    m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
    Token_t & tLast = m_dTokens.Last ();
    tLast.m_eType   = TOK_SPACE;
    tLast.m_iStart	= iChunkStart;
    tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
    tLast.m_iWordID = 0;
    tLast.m_uWords = 0;
}
示例#4
0
文件: testrt.cpp 项目: frankee/csft
/// Prepare a MySQL source for iteration: Setup(), then Connect(), then IterateStart().
/// Any failing step is reported through sphDie().
void SetupIndexing ( CSphSource_MySQL * pSrc, const CSphSourceParams_MySQL & tParams )
{
    CSphString sError;

    const bool bSetupOk = pSrc->Setup ( tParams );
    if ( !bSetupOk )
        sphDie ( "setup failed" );

    const bool bConnected = pSrc->Connect ( sError );
    if ( !bConnected )
        sphDie ( "connect failed: %s", sError.cstr() );

    const bool bStarted = pSrc->IterateStart ( sError );
    if ( !bStarted )
        sphDie ( "iterate-start failed: %s", sError.cstr() );
}
示例#5
0
bool sphConfTokenizer ( const CSphConfigSection & hIndex, CSphTokenizerSettings & tSettings, CSphString & sError )
{
	// charset_type
	CSphScopedPtr<ISphTokenizer> pTokenizer ( NULL );

	if ( !hIndex("charset_type") || hIndex["charset_type"]=="sbcs" )
	{
		tSettings.m_iType = TOKENIZER_SBCS;
	}
	else if ( hIndex["charset_type"]=="utf-8" )
	{
		tSettings.m_iType = hIndex("ngram_chars") ? TOKENIZER_NGRAM : TOKENIZER_UTF8;
	}
	else
	{
		sError.SetSprintf ( "unknown charset type '%s'", hIndex["charset_type"].cstr() );
		return false;
	}

	tSettings.m_sCaseFolding	= hIndex.GetStr ( "charset_table" );
	tSettings.m_iMinWordLen		= Max ( hIndex.GetInt ( "min_word_len" ), 0 );
	tSettings.m_sNgramChars		= hIndex.GetStr ( "ngram_chars" );
	tSettings.m_iNgramLen		= Max ( hIndex.GetInt ( "ngram_len" ), 0 );
	tSettings.m_sSynonymsFile	= hIndex.GetStr ( "exceptions" ); // new option name
	if ( tSettings.m_sSynonymsFile.IsEmpty() )
		tSettings.m_sSynonymsFile = hIndex.GetStr ( "synonyms" ); // deprecated option name
	tSettings.m_sIgnoreChars	= hIndex.GetStr ( "ignore_chars" );

	// phrase boundaries
	int iBoundaryStep = Max ( hIndex.GetInt ( "phrase_boundary_step" ), -1 );
	if ( iBoundaryStep!=0 )
		tSettings.m_sBoundary = hIndex.GetStr ( "phrase_boundary" );

	return true;
}
示例#6
0
/// Remove the plugin registered under (eType, sName) from the global plugin hash.
/// When it was the last hashed plugin of its library, the library is dropped from
/// the library hash as well. Returns false (with sError set) if no such plugin exists
/// or the build has no dlopen() support.
bool sphPluginDrop ( PluginType_e eType, const char * sName, CSphString & sError )
{
#if !HAVE_DLOPEN
	sError = "no dlopen(), no plugins";
	return false;
#else
	CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex );

	PluginKey_t tKey ( eType, sName );
	PluginDesc_c ** ppEntry = g_hPlugins(tKey);
	PluginDesc_c * pPlugin = ppEntry ? *ppEntry : NULL;
	if ( !pPlugin )
	{
		sError.SetSprintf ( "plugin '%s' does not exist", sName );
		return false;
	}

	// grab the library pointer before the plugin reference is given up
	PluginLib_c * pLib = pPlugin->GetLib();

	Verify ( g_hPlugins.Delete(tKey) );
	pPlugin->Release();

	// other hashed plugins still use this library? then we are done
	if ( --pLib->m_iHashedPlugins!=0 )
		return true;

	// that was the last one; unregister the library too
	g_hPluginLibs.Delete ( pLib->GetName() );
	pLib->Release();
	return true;
#endif // HAVE_DLOPEN
}
示例#7
0
/// Split a colon-separated plugin spec string into dParams.
/// An empty spec (no parts) and a 3-part spec are accepted as is; a 2-part spec gets
/// an empty third part appended; 1 part or more than 3 parts is an error.
bool sphPluginParseSpec ( const CSphString & sParams, CSphVector<CSphString> & dParams, CSphString & sError )
{
	dParams.Resize ( 0 );
	sphSplit ( dParams, sParams.cstr(), ":" );

	const int iParts = dParams.GetLength();

	if ( iParts==0 || iParts==3 )
		return true;

	if ( iParts==2 )
	{
		// pad the missing third part with an empty string
		dParams.Add ( "" );
		return true;
	}

	if ( iParts==1 )
		sError = "filter name required in spec string; example: \"plugins.so:myfilter\"";
	else
		sError = "too many parts in spec string; must be in \"plugins.so:myfilter:options\" format";

	return false;
}
示例#8
0
/// Match the rule condition against the end of sWord, walking both right to left.
/// Condition syntax handled here: '.' accepts any character, "[...]" is checked via
/// IsInSet(), anything else must match literally. Matching stops as soon as either
/// the condition or the word runs out; reaching that point means success.
bool CISpellAffixRule::CheckSuffix ( const CSphString & sWord ) const
{
	const char * szCond = m_sCondition.cstr();
	const char * szWord = sWord.cstr();

	int iCond = m_iCondLen-1;
	for ( int iChar=m_iWordLen-1; iCond>=0 && iChar>=0; --iChar )
	{
		const char cCond = szCond[iCond];

		// wildcard: consume one condition char, no comparison
		if ( cCond=='.' )
		{
			--iCond;
			continue;
		}

		// plain literal
		if ( cCond!=']' )
		{
			if ( cCond!=szWord[iChar] )
				return false;
			--iCond;
			continue;
		}

		// character class; scan backwards for the opening bracket
		int iOpen = iCond;
		while ( iOpen>=0 && szCond[iOpen]!='[' )
			--iOpen;

		// unbalanced ']'
		if ( iOpen<0 )
			return false;

		if ( !IsInSet ( szWord[iChar], m_sCondition.SubString ( iOpen + 1, iCond - iOpen - 1 ).cstr () ) )
			return false;

		// continue with whatever precedes the class
		iCond = iOpen - 1;
	}
	return true;
}
示例#9
0
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary )
{
	int iChunkStart = iStart;
	int iSaved = 0;

	for ( int i = iStart; i < iStart+iLength; i++ ){
		const char* buf_ptr = NULL;
		if(m_bUtf8){
			buf_ptr = m_sBufferUTF8.cstr ();
		}else{
			buf_ptr = m_sBuffer.cstr ();
		}
		if ( sphIsSpace (  buf_ptr[i] ) != sphIsSpace ( buf_ptr[iChunkStart] ) )
		{
			m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
			Token_t & tLast = m_dTokens.Last ();
			tLast.m_eType   = TOK_SPACE;
			tLast.m_iStart	= iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;

			iChunkStart = i;
			iSaved += tLast.m_iLengthBytes;

			if ( iBoundary != -1 && iSaved > iBoundary - iStart )
			{
				AddBoundary();
				iBoundary = -1;
			}
		}
	}

	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType   = TOK_SPACE;
	tLast.m_iStart	= iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	if ( iBoundary != -1 ) AddBoundary();
}
void ExcerptGen_c::AddJunk ( int iStart, int iLength, int iBoundary )
{
	assert ( iLength>0 );
	assert ( iLength<=m_sBuffer.Length() );
	assert ( iStart+iLength<=m_sBuffer.Length() );

	int iChunkStart = iStart;
	int iSaved = 0;

	for ( int i = iStart; i < iStart+iLength; i++ )
		if ( sphIsSpace ( m_sBuffer.cstr () [i] )!=sphIsSpace ( m_sBuffer.cstr () [iChunkStart] ) )
		{
			Token_t & tLast = m_dTokens.Add();
			tLast.m_eType = TOK_SPACE;
			tLast.m_iStart = iChunkStart;
			tLast.m_iLengthBytes = i - iChunkStart;
			tLast.m_iWordID = 0;
			tLast.m_uWords = 0;
			tLast.m_uPosition = 0;

			iChunkStart = i;
			iSaved += tLast.m_iLengthBytes;

			if ( iBoundary!=-1 && iSaved > ( iBoundary-iStart ) )
			{
				AddBoundary();
				iBoundary = -1;
			}
		}

	Token_t & tLast = m_dTokens.Add();
	tLast.m_eType = TOK_SPACE;
	tLast.m_iStart = iChunkStart;
	tLast.m_iLengthBytes = iStart + iLength - iChunkStart;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;
	tLast.m_uPosition = 0;

	if ( iBoundary!=-1 )
		AddBoundary();
}
示例#11
0
/// Match the rule condition against the beginning of sWord, walking both left to right.
/// Condition syntax handled here (kept in line with CheckSuffix): '.' accepts any
/// character, "[...]" is checked via IsInSet(), anything else must match literally.
/// Matching stops as soon as either the condition or the word runs out; reaching
/// that point means success.
/// @return false on the first mismatch or on an unterminated '[' class
bool CISpellAffixRule::CheckPrefix ( const CSphString & sWord ) const
{
	const char * szCond = m_sCondition.cstr();
	const char * szWord = sWord.cstr();

	int iCondI = 0;
	for ( int i = 0; iCondI < m_iCondLen && i < m_iWordLen; ++i )
	{
		const char cCond = szCond[iCondI];

		if ( cCond=='.' )
		{
			// wildcard: consume one condition char, no comparison (same as CheckSuffix)
			++iCondI;

		} else if ( cCond!='[' )
		{
			// plain literal
			if ( cCond!=szWord[i] )
				return false;

			++iCondI;

		} else
		{
			// character class; look for the closing bracket
			int iRangeEnd = -1;
			for ( int j=iCondI; j<m_iCondLen && iRangeEnd==-1; ++j )
				if ( szCond[j]==']' )
					iRangeEnd = j;

			// unbalanced '['
			if ( iRangeEnd==-1 )
				return false;

			if ( !IsInSet ( szWord[i], m_sCondition.SubString ( iCondI + 1, iRangeEnd - iCondI - 1 ).cstr () ) )
				return false;

			// step past the whole class; without this every following word
			// character would be tested against the very same class
			iCondI = iRangeEnd + 1;
		}
	}

	return true;
}
示例#12
0
/// Resolve a table of symbols from a loaded library into a plugin descriptor.
/// The table is walked until an entry with a negative m_iOffsetOf. For each entry
/// the symbol name is sName plus "_" plus the postfix (or just sName when the
/// postfix is empty), and the dlsym() result is stored at pDesc + m_iOffsetOf.
/// Fails on the first missing symbol that is flagged as required.
static bool PluginLoadSymbols ( void * pDesc, const SymbolDesc_t * pSymbol, void * pHandle, const char * sName, CSphString & sError )
{
#if !HAVE_DLOPEN
	sError = "no dlopen(), no plugins";
	return false;
#else
	CSphString sSymbol;
	for ( ; pSymbol->m_iOffsetOf>=0; pSymbol++ )
	{
		sSymbol.SetSprintf ( pSymbol->m_sPostfix[0] ? "%s_%s" : "%s%s", sName, pSymbol->m_sPostfix );

		// slot inside the descriptor that receives the function pointer
		void ** ppSlot = (void**)((BYTE*)pDesc + pSymbol->m_iOffsetOf);
		*ppSlot = dlsym ( pHandle, sSymbol.cstr() );

		// resolved, or optional anyway
		if ( *ppSlot || !pSymbol->m_bRequired )
			continue;

		sError.SetSprintf ( "symbol %s() not found", sSymbol.cstr() );
		return false;
	}
	return true;
#endif // HAVE_DLOPEN
}
示例#13
0
/// Replace the leading m_sStrip of sWord with m_sAppend, in place.
/// Relies on m_iWordLen having been set to the length of sWord (Apply() does that),
/// and assumes m_iStripLen/m_iAppendLen are the lengths of m_sStrip/m_sAppend.
/// @return false when sWord does not start with m_sStrip, or when the result
///         would not fit into the MAX_STR_LENGTH scratch buffer
bool CISpellAffixRule::StripAppendPrefix ( CSphString & sWord ) const
{
	// shared scratch buffer; makes this function non-reentrant
	static char szTmp [MAX_STR_LENGTH];

	if ( !m_sStrip.IsEmpty () )
	{
		// the strip string must sit at the very start of the word
		const char * Pos = strstr ( sWord.cstr (), m_sStrip.cstr () );
		if ( Pos!=sWord.cstr() )
			return false;
	}

	// refuse anything that would overflow the scratch buffer (terminator included)
	// or ask strncpy for a negative amount of bytes
	const int iTailLen = m_iWordLen - m_iStripLen;
	if ( iTailLen<0 || iTailLen + m_iAppendLen>=(int)MAX_STR_LENGTH )
		return false;

	if ( !m_sAppend.IsEmpty () )
		strcpy ( szTmp, m_sAppend.cstr() ); // NOLINT

	// the rest of the word goes right after the appended prefix
	strncpy ( szTmp + m_iAppendLen, sWord.cstr () + m_iStripLen, iTailLen );
	szTmp [iTailLen + m_iAppendLen] = '\0';

	sWord = szTmp;

	return true;
}
示例#14
0
/// Replace the trailing m_sStrip of sWord with m_sAppend, in place.
/// Relies on m_iWordLen having been set to the length of sWord (Apply() does that),
/// and assumes m_iStripLen/m_iAppendLen are the lengths of m_sStrip/m_sAppend.
/// @return false when sWord does not end with m_sStrip, or when the result
///         would not fit into the MAX_STR_LENGTH scratch buffer
bool CISpellAffixRule::StripAppendSuffix ( CSphString & sWord ) const
{
	// shared scratch buffer; makes this function non-reentrant
	static char szTmp [ MAX_STR_LENGTH];

	if ( !m_sStrip.IsEmpty () )
	{
		if ( m_iWordLen < m_iStripLen )
			return false;

		// the strip string must sit at the very end of the word
		if ( strncmp ( sWord.cstr () + m_iWordLen - m_iStripLen, m_sStrip.cstr (), m_iStripLen ) )
			return false;
	}

	// refuse anything that would overflow the scratch buffer (terminator included)
	// or ask strncpy for a negative amount of bytes
	const int iStemLen = m_iWordLen - m_iStripLen;
	if ( iStemLen<0 || iStemLen + m_iAppendLen>=(int)MAX_STR_LENGTH )
		return false;

	strncpy ( szTmp, sWord.cstr (), iStemLen );
	szTmp [iStemLen] = '\0';

	if ( !m_sAppend.IsEmpty () )
		strcat ( szTmp, m_sAppend.cstr () ); // NOLINT

	sWord = szTmp;

	return true;
}
示例#15
0
/// Load a plugin library from the plugin directory and verify its UDF interface version.
/// @param sLibName      library file name, relative to g_sPluginDir
/// @param sError        receives the failure reason
/// @param bLinuxReload  load through a temporarily renamed copy of the file, so that
///                      dlopen() does not hand back previously cached content
/// @return a new PluginLib_c wrapping the dlopen() handle, or NULL on failure
///         (the handle is closed and the file name restored on every failure path)
static PluginLib_c * LoadPluginLibrary ( const char * sLibName, CSphString & sError, bool bLinuxReload=false )
{
	CSphString sTmpfile;
	CSphString sLibfile;
	sLibfile.SetSprintf ( "%s/%s", g_sPluginDir.cstr(), sLibName );

	// dlopen caches the old file content, even if file was updated
	// let's reload library from the temporary file to invalidate the cache
	if ( bLinuxReload )
	{
		sTmpfile.SetSprintf ( "%s/%s.%u", g_sPluginDir.cstr(), sLibName, sphRand() );
		if ( ::rename ( sLibfile.cstr(), sTmpfile.cstr() ) )
		{
			sError.SetSprintf ( "failed to rename file (src=%s, dst=%s, errno=%d, error=%s)", sLibfile.cstr(), sTmpfile.cstr(), errno, strerror(errno) );
			return NULL;
		}
	}

	void * pHandle = dlopen ( bLinuxReload ? sTmpfile.cstr() : sLibfile.cstr(), RTLD_LAZY | RTLD_LOCAL );
	if ( !pHandle )
	{
		const char * sDlerror = dlerror();
		sError.SetSprintf ( "dlopen() failed: %s", sDlerror ? sDlerror : "(null)" );

		// put the library back under its real name, otherwise it stays orphaned
		// under the temporary one; the dlopen() failure remains the reported error
		if ( bLinuxReload && ::rename ( sTmpfile.cstr(), sLibfile.cstr() ) )
			sphLogDebug ( "failed to rename file back (src=%s, dst=%s, errno=%d, error=%s)", sTmpfile.cstr(), sLibfile.cstr(), errno, strerror(errno) );

		return NULL;
	}
	sphLogDebug ( "dlopen(%s)=%p", bLinuxReload ? sTmpfile.cstr() : sLibfile.cstr(), pHandle );

	// rename file back to the original name
	if ( bLinuxReload )
	{
		if ( ::rename ( sTmpfile.cstr(), sLibfile.cstr() ) )
		{
			sError.SetSprintf ( "failed to rename file (src=%s, dst=%s, errno=%d, error=%s)", sTmpfile.cstr(), sLibfile.cstr(), errno, strerror(errno) );
			dlclose ( pHandle ); // do not leak the handle on this failure path
			return NULL;
		}
	}

	// the version symbol is named after the file name up to its first dot
	CSphString sBasename = sLibName;
	const char * pDot = strchr ( sBasename.cstr(), '.' );
	if ( pDot )
		sBasename = sBasename.SubString ( 0, pDot-sBasename.cstr() );

	CSphString sTmp;
	PluginVer_fn fnVer = (PluginVer_fn) dlsym ( pHandle, sTmp.SetSprintf ( "%s_ver", sBasename.cstr() ).cstr() );
	if ( !fnVer )
	{
		sError.SetSprintf ( "symbol '%s_ver' not found in '%s': update your UDF implementation", sBasename.cstr(), sLibName );
		dlclose ( pHandle );
		return NULL;
	}

	// libraries reporting a version below SPH_UDF_VERSION are rejected
	if ( fnVer() < SPH_UDF_VERSION )
	{
		sError.SetSprintf ( "library '%s' was compiled using an older version of sphinxudf.h; it needs to be recompiled", sLibName );
		dlclose ( pHandle );
		return NULL;
	}

	return new PluginLib_c ( pHandle, sLibName );
}
示例#16
0
/// Reload every registered plugin that came from library sName.
/// The part visible here collects the affected plugins, loads a fresh copy of the
/// library, and creates a new descriptor with freshly resolved symbols per plugin.
/// NOTE(review): this snippet is cut off after the symbol-loading loop; how the
/// old and new descriptors are swapped (and the partial-failure cleanup) is not shown.
bool sphPluginReload ( const char * sName, CSphString & sError )
{
#if !HAVE_DLOPEN
	sError = "no dlopen(), no plugins";
	return false;
#else
	// find all plugins from the given library
	CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex );

	// parallel arrays: dKeys[i] is the hash key of dPlugins[i]
	CSphVector<PluginKey_t> dKeys;
	CSphVector<PluginDesc_c*> dPlugins;

	g_hPlugins.IterateStart();
	while ( g_hPlugins.IterateNext() )
	{
		PluginDesc_c * v = g_hPlugins.IterateGet();
		if ( v->GetLibName()==sName )
		{
			dKeys.Add ( g_hPlugins.IterateGetKey() );
			dPlugins.Add ( g_hPlugins.IterateGet() );
		}
	}

	// no plugins loaded? oops
	if ( dPlugins.GetLength()==0 )
	{
		sError.SetSprintf ( "no active plugins loaded from %s", sName );
		return false;
	}

	// load new library and check every plugin
	// (non-Windows builds request the rename-based reload path)
#if !USE_WINDOWS
	PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError, true );
#else
	PluginLib_c * pNewLib = LoadPluginLibrary ( sName, sError );
#endif
	if ( !pNewLib )
		return false;

	// load all plugins
	CSphVector<PluginDesc_c*> dNewPlugins;
	ARRAY_FOREACH ( i, dPlugins )
	{
		// create a descriptor of the same type, bound to the new library
		PluginDesc_c * pDesc = NULL;
		const SymbolDesc_t * pSym = NULL;
		switch ( dKeys[i].m_eType )
		{
			case PLUGIN_RANKER:					pDesc = new PluginRanker_c ( pNewLib ); pSym = g_dSymbolsRanker; break;
			case PLUGIN_INDEX_TOKEN_FILTER:		pDesc = new PluginTokenFilter_c ( pNewLib ); pSym = g_dSymbolsTokenFilter; break;
			case PLUGIN_QUERY_TOKEN_FILTER:		pDesc = new PluginQueryTokenFilter_c ( pNewLib ); pSym = g_dSymbolsQueryTokenFilter; break;
			case PLUGIN_FUNCTION:				pDesc = new PluginUDF_c ( pNewLib, dPlugins[i]->GetUdfRetType() ); pSym = g_dSymbolsUDF; break;
			default:
				sphDie ( "INTERNAL ERROR: unknown plugin type %d in sphPluginReload()", (int)dKeys[i].m_eType );
				return false;
		}

		// stop at the first plugin whose symbols cannot be resolved; dNewPlugins
		// then ends up shorter than dPlugins
		if ( !PluginLoadSymbols ( pDesc, pSym, pNewLib->GetHandle(), dKeys[i].m_sName.cstr(), sError ) )
		{
			pDesc->Release();
			break;
		}

		dNewPlugins.Add ( pDesc );
	}
示例#17
0
文件: testrt.cpp 项目: frankee/csft
/// Feed every document from pSrc into pIndex. A commit is issued whenever the
/// running document count is a multiple of COMMIT_STEP, and once more when a zero
/// document id is seen, which also ends the loop. Prints progress, throughput and
/// commit timings, runs one DoSearch() midway, and adds the megabytes read to g_fTotalMB.
void DoIndexing ( CSphSource * pSrc, ISphRtIndex * pIndex )
{
    CSphString sError;
    CSphVector<DWORD> dMvas;

    int64_t tmStart = sphMicroTimer ();
    int64_t tmAvgCommit = 0; // accumulates commit times; becomes the average on exit
    int64_t tmMaxCommit = 0;
    int iCommits = 0;

    while ( true )
    {
        if ( !pSrc->IterateDocument ( sError ) )
            sphDie ( "iterate-document failed: %s", sError.cstr() );

        ISphHits * pHits = pSrc->IterateHits ( sError );
        if ( !sError.IsEmpty() )
            sphDie ( "iterate-hits failed: %s", sError.cstr() );

        // only documents with a non-zero id are added
        if ( pSrc->m_tDocInfo.m_iDocID )
            pIndex->AddDocument ( pHits, pSrc->m_tDocInfo, NULL, dMvas, sError );

        if ( ( pSrc->GetStats().m_iTotalDocuments % COMMIT_STEP )==0 || !pSrc->m_tDocInfo.m_iDocID )
        {
            // time the commit
            int64_t tmCommit = sphMicroTimer();
            pIndex->Commit ();
            tmCommit = sphMicroTimer()-tmCommit;

            ++iCommits;
            tmAvgCommit += tmCommit;
            tmMaxCommit = Max ( tmMaxCommit, tmCommit );

            // zero id: that was the final commit
            if ( !pSrc->m_tDocInfo.m_iDocID )
            {
                tmAvgCommit /= iCommits;
                break;
            }
        }

        // progress report every 100 documents
        if ( ( pSrc->GetStats().m_iTotalDocuments % 100 )==0 )
            printf ( "%d docs\r", (int)pSrc->GetStats().m_iTotalDocuments );

        // one-off search once enough commits have happened
        static bool bOnce = true;
        if ( bOnce && iCommits*COMMIT_STEP>=5000 )
        {
            printf ( "\n" );
            DoSearch ( pIndex );
            bOnce = false;
        }
    }

    pSrc->Disconnect();

    int64_t tmEnd = sphMicroTimer ();
    const int64_t tmElapsed = tmEnd-tmStart;
    float fTotalMB = (float)pSrc->GetStats().m_iTotalBytes/1000000.0f;

    printf ( "commit-step %d, %d docs, %d bytes, %d.%03d sec, %.2f MB/sec\n",
             COMMIT_STEP,
             (int)pSrc->GetStats().m_iTotalDocuments,
             (int)pSrc->GetStats().m_iTotalBytes,
             (int)(tmElapsed/1000000), (int)((tmElapsed%1000000)/1000),
             fTotalMB*1000000.0f/tmElapsed );
    printf ( "commit-docs %d, avg %d.%03d msec, max %d.%03d msec\n", COMMIT_STEP,
             (int)(tmAvgCommit/1000), (int)(tmAvgCommit%1000),
             (int)(tmMaxCommit/1000), (int)(tmMaxCommit%1000) );

    g_fTotalMB += fTotalMB;
}
示例#18
0
文件: testrt.cpp 项目: frankee/csft
/// RT index test driver: builds two sources over the odd and even ids of the same
/// table, indexes them into one RT index from two threads, runs a search, then
/// deletes the index and prints timing (and, optionally, memory) statistics.
int main ()
{
    // threads should be initialized before memory allocations
    char cTopOfMainStack;
    sphThreadInit();
    MemorizeStack ( &cTopOfMainStack );

    CSphString sError;
    CSphDictSettings tDictSettings;

    // first source: even ids
    ISphTokenizer * pTok = sphCreateUTF8Tokenizer();
    CSphDict * pDict = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "rt1" );
    CSphSource * pSrc = SpawnSource ( "SELECT id, channel_id, UNIX_TIMESTAMP(published) published, title, UNCOMPRESS(content) content FROM posting WHERE id<=10000 AND id%2=0", pTok, pDict );

    // second source: odd ids
    // NOTE(review): pDict2 is created with pTok, not pTok2 — looks like a copy-paste slip; confirm
    ISphTokenizer * pTok2 = sphCreateUTF8Tokenizer();
    CSphDict * pDict2 = sphCreateDictionaryCRC ( tDictSettings, pTok, sError, "rt2" );
    CSphSource * pSrc2 = SpawnSource ( "SELECT id, channel_id, UNIX_TIMESTAMP(published) published, title, UNCOMPRESS(content) content FROM posting WHERE id<=10000 AND id%2=1", pTok2, pDict2 );

    CSphSchema tSrcSchema;
    if ( !pSrc->UpdateSchema ( &tSrcSchema, sError ) )
        sphDie ( "update-schema failed: %s", sError.cstr() );

    // index schema: same fields, every source attribute re-added with the flag set to false
    CSphSchema tSchema; // source schema must be all dynamic attrs; but index ones must be static
    tSchema.m_dFields = tSrcSchema.m_dFields;
    for ( int i=0; i<tSrcSchema.GetAttrsCount(); i++ )
        tSchema.AddAttr ( tSrcSchema.GetAttr(i), false );

    // RT subsystem setup, binlog replay, then index creation
    CSphConfigSection tRTConfig;
    sphRTInit();
    sphRTConfigure ( tRTConfig, true );
    SmallStringHash_T< CSphIndex * > dTemp;
    sphReplayBinlog ( dTemp, 0 );
    ISphRtIndex * pIndex = sphCreateIndexRT ( tSchema, "testrt", 32*1024*1024, "data/dump", false );
    pIndex->SetTokenizer ( pTok ); // index will own this pair from now on
    pIndex->SetDictionary ( pDict );
    // NOTE(review): the message uses pIndex->GetLastError(), not the sError passed to Prealloc
    if ( !pIndex->Prealloc ( false, false, sError ) )
        sphDie ( "prealloc failed: %s", pIndex->GetLastError().cstr() );
    g_pIndex = pIndex;

    // initial indexing
    int64_t tmStart = sphMicroTimer();

    // one indexing thread per source; wait for both
    SphThread_t t1, t2;
    sphThreadCreate ( &t1, IndexingThread, pSrc );
    sphThreadCreate ( &t2, IndexingThread, pSrc2 );
    sphThreadJoin ( &t1 );
    sphThreadJoin ( &t2 );

    // disabled code; tParams is not declared anywhere in this function
#if 0
    // update
    tParams.m_sQuery = "SELECT id, channel_id, UNIX_TIMESTAMP(published) published, title, UNCOMPRESS(content) content FROM rt2 WHERE id<=10000";
    SetupIndexing ( pSrc, tParams );
    DoIndexing ( pSrc, pIndex );
#endif

    // search
    DoSearch ( pIndex );

    // shutdown index (should cause dump)
    int64_t tmShutdown = sphMicroTimer();

#if SPH_ALLOCS_PROFILER
    printf ( "pre-shutdown allocs=%d, bytes="INT64_FMT"\n", sphAllocsCount(), sphAllocBytes() );
#endif
    // NOTE(review): g_pIndex was set to this same pointer above and is not reset here
    SafeDelete ( pIndex );
#if SPH_ALLOCS_PROFILER
    printf ( "post-shutdown allocs=%d, bytes="INT64_FMT"\n", sphAllocsCount(), sphAllocBytes() );
#endif

    int64_t tmEnd = sphMicroTimer();
    printf ( "shutdown done in %d.%03d sec\n", (int)((tmEnd-tmShutdown)/1000000), (int)(((tmEnd-tmShutdown)%1000000)/1000) );
    printf ( "total with shutdown %d.%03d sec, %.2f MB/sec\n",
             (int)((tmEnd-tmStart)/1000000), (int)(((tmEnd-tmStart)%1000000)/1000),
             g_fTotalMB*1000000.0f/(tmEnd-tmStart) );

#if SPH_DEBUG_LEAKS || SPH_ALLOCS_PROFILER
    sphAllocsStats();
#endif
#if USE_WINDOWS
    // report peak memory usage of this process
    PROCESS_MEMORY_COUNTERS pmc;
    HANDLE hProcess = OpenProcess ( PROCESS_QUERY_INFORMATION | PROCESS_VM_READ, FALSE, GetCurrentProcessId() );
    if ( hProcess && GetProcessMemoryInfo ( hProcess, &pmc, sizeof(pmc)) )
    {
        printf ( "--- peak-wss=%d, peak-pagefile=%d\n", (int)pmc.PeakWorkingSetSize, (int)pmc.PeakPagefileUsage );
    }
#endif

    // NOTE(review): pIndex already went through SafeDelete above; presumably SafeDelete
    // nulls the pointer, which would make this second call a no-op — confirm
    SafeDelete ( pIndex );
    sphRTDone ();
}
示例#19
0
/// Build an excerpt for q.m_sSource highlighting the words from q.m_sWords.
/// The part visible here (1) tokenizes the query words into m_dWords/dKeywords,
/// (2) tokenizes the source into m_dTokens, inserting junk tokens for the gaps
/// between words and marking in m_uWords which keywords each word token matches,
/// and (3) computes per-token code point lengths.
/// NOTE(review): this snippet is cut off after step (3); the actual excerpt
/// assembly and the return value are not shown.
char * ExcerptGen_c::BuildExcerpt ( const ExcerptQuery_t & q, CSphDict * pDict, ISphTokenizer * pTokenizer )
{
	m_dTokens.Reserve ( 1024 );
	m_sBuffer = q.m_sSource;

	const bool bUtf8 = pTokenizer->IsUtf8();
	m_bUtf8 = bUtf8;
	// tokenize query words
	int iWordsLength = strlen ( q.m_sWords.cstr() );

	// dKwBuffer holds the zero-terminated keyword texts back to back;
	// Keyword_t::m_iWord is the offset of a keyword within it
	CSphVector<char> dKwBuffer ( iWordsLength );
	CSphVector<Keyword_t> dKeywords;
	dKeywords.Reserve ( MAX_HIGHLIGHT_WORDS );

	BYTE * sWord;
	int iKwIndex = 0;

	pTokenizer->SetBuffer ( (BYTE*)q.m_sWords.cstr(), iWordsLength );
	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		// tokens for which the dictionary returns a zero id are skipped
		SphWordID_t iWord = pDict->GetWordID ( sWord );
		if ( iWord )
		{
			m_dWords.Resize ( m_dWords.GetLength () + 1 );
			Token_t & tLast = m_dWords.Last ();
			tLast.m_eType = TOK_WORD;
			tLast.m_iWordID = iWord;
			tLast.m_iLengthBytes = strlen ( (const char *)sWord );
			tLast.m_iLengthCP = bUtf8 ? sphUTF8Len ( (const char *)sWord ) : tLast.m_iLengthBytes;

			// store keyword
			dKeywords.Resize( dKeywords.GetLength() + 1 );
			Keyword_t & kwLast = dKeywords.Last ();

			// find stars
			// (a '*' right after the token and/or right before it in the query buffer)
			bool bStarBack = *pTokenizer->GetTokenEnd() == '*';
			bool bStarFront = ( pTokenizer->GetTokenStart() != pTokenizer->GetBufferPtr() ) &&
				pTokenizer->GetTokenStart()[-1] == '*';
			kwLast.m_uStar = ( bStarFront ? STAR_FRONT : 0 ) | ( bStarBack ? STAR_BACK : 0 );

			// store token
			// (+1 byte for the terminating zero written by strcpy)
			const int iEndIndex = iKwIndex + tLast.m_iLengthBytes + 1;
			dKwBuffer.Resize ( iEndIndex );
			kwLast.m_iWord = iKwIndex;
			strcpy ( &dKwBuffer [ iKwIndex ], (const char *)sWord );
			iKwIndex = iEndIndex;

			// cap the number of keywords
			if ( m_dWords.GetLength() == MAX_HIGHLIGHT_WORDS )
				break;
		}
	}

	// tokenize document
	pTokenizer->SetBuffer ( (BYTE*)q.m_sSource.cstr (), strlen ( q.m_sSource.cstr () ) );

	const char * pStartPtr = pTokenizer->GetBufferPtr ();
	const char * pLastTokenEnd = pStartPtr;

	//assign utf-8
	m_sBufferUTF8 = pStartPtr;

	while ( ( sWord = pTokenizer->GetToken() ) != NULL )
	{
		const char * pTokenStart = pTokenizer->GetTokenStart ();

		// emit the gap between the previous token end and this token as junk
		// NOTE(review): the test compares against pStartPtr, not pLastTokenEnd, so AddJunk
		// may be called with a zero-length gap for back-to-back tokens — confirm intent
		if ( pTokenStart != pStartPtr )
			AddJunk ( pLastTokenEnd - pStartPtr,
					  pTokenStart - pLastTokenEnd,
					  pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1 );

		SphWordID_t iWord = pDict->GetWordID ( sWord );

		pLastTokenEnd = pTokenizer->GetTokenEnd ();

		// tokens with a zero word id are stored as TOK_SPACE
		m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
		Token_t & tLast = m_dTokens.Last ();
		tLast.m_eType	= iWord ? TOK_WORD : TOK_SPACE;
		tLast.m_iStart  = pTokenStart - pStartPtr;
		tLast.m_iLengthBytes = pLastTokenEnd - pTokenStart;
		tLast.m_iWordID = iWord;
		tLast.m_uWords = 0;

		// fill word mask
		if ( iWord )
		{
			bool bMatch = false;
			int iOffset;

			ARRAY_FOREACH ( nWord, m_dWords )
			{
				const char * keyword = &dKwBuffer [ dKeywords[nWord].m_iWord ];
				const Token_t & token = m_dWords[nWord];

				// NOTE(review): bMatch is not reset per keyword; if m_uStar holds a value
				// outside these four cases, the previous keyword's result is reused
				switch ( dKeywords[nWord].m_uStar )
				{
				case STAR_NONE:
					// exact match by word id
					bMatch = iWord == token.m_iWordID;
					break;

				case STAR_FRONT:
					// keyword must match the tail of the token
					iOffset = tLast.m_iLengthBytes - token.m_iLengthBytes;
					bMatch = (iOffset >= 0) &&
						( memcmp( keyword, sWord + iOffset, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BACK:
					// keyword must match the head of the token
					bMatch = ( tLast.m_iLengthBytes >= token.m_iLengthBytes ) &&
						( memcmp( keyword, sWord, token.m_iLengthBytes ) == 0 );
					break;

				case STAR_BOTH:
					// keyword may occur anywhere in the token
					bMatch = strstr( (const char *)sWord, keyword ) != NULL;
					break;
				}

				// one mask bit per keyword index
				if ( bMatch )
					tLast.m_uWords |= (1UL << nWord);
			}
		}
	}

	// last space if any
	if ( pLastTokenEnd != pTokenizer->GetBufferEnd () )
	{
		int iOffset = pTokenizer->GetBoundary() ? pTokenizer->GetBoundaryOffset() : -1;
		AddJunk ( pLastTokenEnd - pStartPtr, pTokenizer->GetBufferEnd () - pLastTokenEnd, iOffset );
	}
	
	// terminating TOK_NONE token
	m_dTokens.Resize ( m_dTokens.GetLength () + 1 );
	Token_t & tLast = m_dTokens.Last ();
	tLast.m_eType   = TOK_NONE;
	tLast.m_iStart  = 0;
	tLast.m_iLengthBytes = 0;
	tLast.m_iWordID = 0;
	tLast.m_uWords = 0;

	// sum token lengths
	// (resets weights, fills in code point lengths, totals them in iSourceCodes)
	int iSourceCodes = 0;
	ARRAY_FOREACH ( i, m_dTokens )
	{
		m_dTokens [i].m_iWeight = 0;

		if ( m_dTokens [i].m_iLengthBytes )
		{
			if ( bUtf8 )
			{
				//int iLen = sphUTF8Len ( m_sBuffer.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				int iLen = sphUTF8Len ( m_sBufferUTF8.SubString ( m_dTokens[i].m_iStart, m_dTokens[i].m_iLengthBytes ).cstr() );
				m_dTokens[i].m_iLengthCP = iLen;
			}
			else
				m_dTokens[i].m_iLengthCP = m_dTokens[i].m_iLengthBytes;
			iSourceCodes += m_dTokens[i].m_iLengthCP;
		}
		else
			m_dTokens [i].m_iLengthCP = 0;
	}
示例#20
0
	/// build a key from a plugin type and a name; ToLower() is applied to the stored name
	PluginKey_t ( PluginType_e eType, const char * sName ) : m_eType ( eType ), m_sName ( sName )
	{
		m_sName.ToLower();
	}
示例#21
0
/// Pick the case conversion source, in priority order: the dictionary-defined
/// character set, then a charset file (its first line is parsed into lowercaser
/// remaps), then a user-defined locale. Every outcome is reported on stdout.
void CISpellAffix::LoadLocale ()
{
	// 1) dictionary-defined conversion wins
	if ( m_bUseDictConversion )
	{
		printf ( "Using dictionary-defined character set\n" );
		return;
	}

	// 2) charset file
	if ( !m_sCharsetFile.IsEmpty () )
	{
		FILE * pFile = fopen ( m_sCharsetFile.cstr (), "rt" );
		if ( !pFile )
		{
			printf ( "Failed to open '%s'\n", m_sCharsetFile.cstr() );
			return;
		}

		printf ( "Using charater set from '%s'\n", m_sCharsetFile.cstr () );

		const int MAX_CHARSET_LENGTH = 4096;
		char szBuffer [MAX_CHARSET_LENGTH];

		// only a single line is read from the file
		if ( !fgets ( szBuffer, MAX_CHARSET_LENGTH, pFile ) )
		{
			printf ( "Failed to read charset from '%s'\n", m_sCharsetFile.cstr() );
		} else
		{
			CSphVector<CSphRemapRange> dRemaps;
			if ( !sphParseCharset ( szBuffer, dRemaps ) )
			{
				printf ( "Failed to parse charset from '%s'\n", m_sCharsetFile.cstr() );
			} else
			{
				m_bUseLowerCaser = true;
				m_LowerCaser.AddRemaps ( dRemaps, 0 );
			}
		}

		fclose ( pFile );
		return;
	}

	// 3) user-defined locale
	if ( m_sLocale.IsEmpty () )
	{
		printf ( "WARNING: no character set specified\n" );
		return;
	}

	char dLocaleC[256], dLocaleUser[256];

	// snapshot of tolower() under the "C" locale, for comparison below
	setlocale ( LC_ALL, "C" );
	for ( int iChar=0; iChar<256; iChar++ )
		dLocaleC[iChar] = (char) tolower(iChar);

	if ( !setlocale ( LC_CTYPE, m_sLocale.cstr() ) )
	{
		printf ( "WARNING: could not set user-defined locale for case conversions (locale=%s)\n", m_sLocale.cstr() );
		return;
	}

	printf ( "Using user-defined locale (locale=%s)\n", m_sLocale.cstr() );

	for ( int iChar=0; iChar<256; iChar++ )
		dLocaleUser[iChar] = (char) tolower(iChar);

	// identical tables mean the locale changes nothing about case conversion
	if ( memcmp ( dLocaleC, dLocaleUser, 256 )==0 )
		printf ( "WARNING: user-defined locale provides the same case conversion as the default \"C\" locale\n" );
}
示例#22
0
/// Register a new plugin of the given type under sName, loading its library
/// from the plugin directory when that library is not in the library hash yet.
/// @param szLib        library file name; path delimiters are rejected
/// @param eType        plugin type; selects the descriptor class and symbol table
/// @param sName        plugin name; for rankers it must not match a built-in ranker name
/// @param eUDFRetType  return type, only used for PLUGIN_FUNCTION
/// @param sError       receives the failure reason
/// @return true when the plugin was added to the global plugin hash
bool sphPluginCreate ( const char * szLib, PluginType_e eType, const char * sName, ESphAttr eUDFRetType, CSphString & sError )
{
#if !HAVE_DLOPEN
	sError = "no dlopen(), no plugins";
	return false;
#else
	if ( !g_bPluginsEnabled )
	{
		sError = "plugin support disabled (requires a valid plugin_dir)";
		return false;
	}

	// validate library name
	// (the braces below are the body of the `if`; the indentation is misleading)
	for ( const char * p = szLib; *p; p++ )
		if ( *p=='/' || *p=='\\' )
	{
		sError = "restricted character (path delimiter) in a library file name";
		return false;
	}

	CSphString sLib = szLib;
	sLib.ToLower();

	// FIXME? preregister known rankers instead?
	// case-insensitive comparison against every built-in ranker name
	if ( eType==PLUGIN_RANKER )
	{
		for ( int i=0; i<SPH_RANK_TOTAL; i++ )
		{
			const char * r = sphGetRankerName ( ESphRankMode(i) );
			if ( r && strcasecmp ( sName, r )==0 )
			{
				sError.SetSprintf ( "%s is a reserved ranker name", r );
				return false;
			}
		}
	}

	// from here, we need a lock (we intend to update the plugin hash)
	CSphScopedLock<CSphMutex> tLock ( g_tPluginMutex );

	// validate function name
	PluginKey_t k ( eType, sName );
	if ( g_hPlugins(k) )
	{
		sError.SetSprintf ( "plugin '%s' already exists", k.m_sName.cstr() );
		return false;
	}

	// lookup or load library
	// (on both paths this function ends up holding one reference to pLib;
	// the explicit Release() calls below give it up)
	PluginLib_c * pLib = NULL;
	if ( g_hPluginLibs ( sLib ) )
	{
		pLib = g_hPluginLibs [ sLib ];
		pLib->AddRef();
	} else
	{
		pLib = LoadPluginLibrary ( sLib.cstr(), sError );
		if ( !pLib )
			return false;
	}
	assert ( pLib->GetHandle() );

	// create the descriptor and pick the matching symbol table
	PluginDesc_c * pPlugin = NULL;
	const SymbolDesc_t * pSym = NULL;
	switch ( eType )
	{
		case PLUGIN_RANKER:					pPlugin = new PluginRanker_c ( pLib ); pSym = g_dSymbolsRanker; break;
		case PLUGIN_INDEX_TOKEN_FILTER:		pPlugin = new PluginTokenFilter_c ( pLib ); pSym = g_dSymbolsTokenFilter; break;
		case PLUGIN_QUERY_TOKEN_FILTER:		pPlugin = new PluginQueryTokenFilter_c ( pLib ); pSym = g_dSymbolsQueryTokenFilter; break;
		case PLUGIN_FUNCTION:				pPlugin = new PluginUDF_c ( pLib, eUDFRetType ); pSym = g_dSymbolsUDF; break;
		default:
			sError.SetSprintf ( "INTERNAL ERROR: unknown plugin type %d in CreatePlugin()", (int)eType );
			pLib->Release();
			return false;
	}

	// release the refcount that this very function is holding
	// or in other words, transfer the refcount to newly created plugin instance (it does its own addref)
	pLib->Release();

	// pLib is still used below; per the comment above, the plugin's own reference keeps it alive
	if ( !PluginLoadSymbols ( pPlugin, pSym, pLib->GetHandle(), k.m_sName.cstr(), sError ) )
	{
		// NOTE(review): sError.cstr() is passed into SetSprintf on sError itself;
		// safe only if SetSprintf formats before replacing the old buffer — confirm
		sError.SetSprintf ( "%s in %s", sError.cstr(), sLib.cstr() );
		pPlugin->Release();
		return false;
	}

	// add library if needed
	if ( !g_hPluginLibs ( sLib ) )
	{
		Verify ( g_hPluginLibs.Add ( pLib, pLib->GetName() ) );
		pLib->AddRef(); // the hash reference
	}

	// add function
	// (m_iHashedPlugins counts the hashed plugins of this library; sphPluginDrop decrements it)
	Verify ( g_hPlugins.Add ( pPlugin, k ) );
	pPlugin->GetLib()->m_iHashedPlugins++;
	return true;
#endif // HAVE_DLOPEN
}