Exemplo n.º 1
0
bool CISpellAffix::Load ( const char * szFilename )
{
    if ( !szFilename )
        return false;

    m_dRules.Reset ();
    memset ( m_dCharset, 0, sizeof ( m_dCharset ) );
    m_bFirstCaseConv = true;
    m_bUseLowerCaser = false;
    m_bUseDictConversion = false;
    m_LowerCaser.Reset ();

    FILE * pFile = fopen ( szFilename, "rt" );
    if ( !pFile )
        return false;

    bool bResult = false;
    AffixFormat_e eFormat = DetectFormat ( pFile );
    if ( eFormat==AFFIX_FORMAT_UNKNOWN )
        printf ( "Failed to detect affix file format\n" );
    else
    {
        fseek ( pFile, SEEK_SET, 0 );
        printf ( "Using %s affix file format\n", AffixFormatName[eFormat] );
        switch ( eFormat )
        {
        case AFFIX_FORMAT_MYSPELL:
            bResult = LoadMySpell ( pFile );
            break;
        case AFFIX_FORMAT_ISPELL:
            bResult = LoadISpell ( pFile );
            break;
        case AFFIX_FORMAT_UNKNOWN:
            break;
        }
    }
    fclose ( pFile );

    bool bHaveCrossPrefix = false;
    for ( int i = 0; i < m_dRules.GetLength () && !bHaveCrossPrefix; i++ )
        if ( m_dRules[i].IsPrefix() && m_dRules[i].IsCrossProduct() )
            bHaveCrossPrefix = true;

    bool bHaveCrossSuffix = false;
    for ( int i = 0; i < m_dRules.GetLength () && !bHaveCrossSuffix; i++ )
        if ( !m_dRules[i].IsPrefix() && m_dRules[i].IsCrossProduct() )
            bHaveCrossSuffix = true;

    m_bCheckCrosses = bHaveCrossPrefix && bHaveCrossSuffix;

    return bResult;
}
Exemplo n.º 2
0
char CISpellAffix::ToLowerCase ( char cChar )
{
	if ( m_bFirstCaseConv )
	{
		LoadLocale ();
		m_bFirstCaseConv = false;
	}

	// dictionary conversion
	if ( m_bUseDictConversion )
		return m_dCharset [(BYTE) cChar] ? m_dCharset [(BYTE) cChar] : cChar;

	// user-defined character mapping
	if ( m_bUseLowerCaser )
	{
		char cResult = (char)m_LowerCaser.ToLower ( (BYTE) cChar );
		return cResult ? cResult : cChar;
	}

	// user-specified code page conversion
	return (char)tolower ( (BYTE)cChar ); // workaround for systems (eg. FreeBSD) which default to signed char. marvelous!
}
Exemplo n.º 3
0
void CharsetFold ( CSphIndex * pIndex, FILE * fp )
{
	CSphVector<BYTE> sBuf1 ( 16384 );
	CSphVector<BYTE> sBuf2 ( 16384 );

	bool bUtf = pIndex->GetTokenizer()->IsUtf8();
	if ( !bUtf )
		sphDie ( "sorry, --fold vs SBCS is not supported just yet" );

	CSphLowercaser tLC = pIndex->GetTokenizer()->GetLowercaser();

#if USE_WINDOWS
	setmode ( fileno(stdout), O_BINARY );
#endif

	int iBuf1 = 0; // how many leftover bytes from previous iteration
	while ( !feof(fp) )
	{
		int iGot = fread ( sBuf1.Begin()+iBuf1, 1, sBuf1.GetLength()-iBuf1, fp );
		if ( iGot<0 )
			sphDie ( "read error: %s", strerror(errno) );

		if ( iGot==0 )
			if ( feof(fp) )
				if ( iBuf1==0 )
					break;


		const BYTE * pIn = sBuf1.Begin();
		const BYTE * pInMax = pIn + iBuf1 + iGot;

		if ( pIn==pInMax && feof(fp) )
			break;

		// tricky bit
		// on full buffer, and not an eof, terminate a bit early
		// to avoid codepoint vs buffer boundary issue
		if ( ( iBuf1+iGot )==sBuf1.GetLength() && iGot!=0 )
			pInMax -= 16;

		// do folding
		BYTE * pOut = sBuf2.Begin();
		BYTE * pOutMax = pOut + sBuf2.GetLength() - 16;
		while ( pIn < pInMax )
		{
			int iCode = sphUTF8Decode ( pIn );
			if ( iCode==0 )
				pIn++; // decoder does not do that!
			assert ( iCode>=0 );

			if ( iCode!=0x09 && iCode!=0x0A && iCode!=0x0D )
			{
				iCode = tLC.ToLower ( iCode ) & 0xffffffUL;
				if ( !iCode )
					iCode = 0x20;
			}

			pOut += sphUTF8Encode ( pOut, iCode );
			if ( pOut>=pOutMax )
			{
				fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout );
				pOut = sBuf2.Begin();
			}
		}
		fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout );

		// now move around leftovers
		BYTE * pRealEnd = sBuf1.Begin() + iBuf1 + iGot;
		if ( pIn < pRealEnd )
		{
			iBuf1 = pRealEnd - pIn;
			memmove ( sBuf1.Begin(), pIn, iBuf1 );
		}
	}
}
Exemplo n.º 4
0
void CISpellAffix::LoadLocale ()
{
	if ( m_bUseDictConversion )
		printf ( "Using dictionary-defined character set\n" );
	else
		if ( !m_sCharsetFile.IsEmpty () )
		{
			FILE * pFile = fopen ( m_sCharsetFile.cstr (), "rt" );
			if ( pFile )
			{
				printf ( "Using charater set from '%s'\n", m_sCharsetFile.cstr () );

				const int MAX_CHARSET_LENGTH = 4096;
				char szBuffer [MAX_CHARSET_LENGTH];

				char * szResult = fgets ( szBuffer, MAX_CHARSET_LENGTH, pFile );
				if ( szResult )
				{
					CSphVector<CSphRemapRange> dRemaps;
					if ( sphParseCharset ( szBuffer, dRemaps ) )
					{
						m_bUseLowerCaser = true;
						m_LowerCaser.AddRemaps ( dRemaps, 0 );
					} else
					{
						printf ( "Failed to parse charset from '%s'\n", m_sCharsetFile.cstr() );
					}
				} else
				{
					printf ( "Failed to read charset from '%s'\n", m_sCharsetFile.cstr() );
				}

				fclose ( pFile );

			} else
			{
				printf ( "Failed to open '%s'\n", m_sCharsetFile.cstr() );
			}

		} else
		{
			if ( !m_sLocale.IsEmpty () )
			{
				char dLocaleC[256], dLocaleUser[256];

				setlocale ( LC_ALL, "C" );
				for ( int i=0; i<256; i++ )
					dLocaleC[i] = (char) tolower(i);

				char * szLocale = setlocale ( LC_CTYPE, m_sLocale.cstr() );
				if ( szLocale )
				{
					printf ( "Using user-defined locale (locale=%s)\n", m_sLocale.cstr() );

					for ( int i=0; i<256; i++ )
						dLocaleUser[i] = (char) tolower(i);

					if ( !memcmp ( dLocaleC, dLocaleUser, 256 ) )
						printf ( "WARNING: user-defined locale provides the same case conversion as the default \"C\" locale\n" );
				} else
					printf ( "WARNING: could not set user-defined locale for case conversions (locale=%s)\n", m_sLocale.cstr() );
			} else
				printf ( "WARNING: no character set specified\n" );
		}
}