bool CISpellAffix::Load ( const char * szFilename ) { if ( !szFilename ) return false; m_dRules.Reset (); memset ( m_dCharset, 0, sizeof ( m_dCharset ) ); m_bFirstCaseConv = true; m_bUseLowerCaser = false; m_bUseDictConversion = false; m_LowerCaser.Reset (); FILE * pFile = fopen ( szFilename, "rt" ); if ( !pFile ) return false; bool bResult = false; AffixFormat_e eFormat = DetectFormat ( pFile ); if ( eFormat==AFFIX_FORMAT_UNKNOWN ) printf ( "Failed to detect affix file format\n" ); else { fseek ( pFile, SEEK_SET, 0 ); printf ( "Using %s affix file format\n", AffixFormatName[eFormat] ); switch ( eFormat ) { case AFFIX_FORMAT_MYSPELL: bResult = LoadMySpell ( pFile ); break; case AFFIX_FORMAT_ISPELL: bResult = LoadISpell ( pFile ); break; case AFFIX_FORMAT_UNKNOWN: break; } } fclose ( pFile ); bool bHaveCrossPrefix = false; for ( int i = 0; i < m_dRules.GetLength () && !bHaveCrossPrefix; i++ ) if ( m_dRules[i].IsPrefix() && m_dRules[i].IsCrossProduct() ) bHaveCrossPrefix = true; bool bHaveCrossSuffix = false; for ( int i = 0; i < m_dRules.GetLength () && !bHaveCrossSuffix; i++ ) if ( !m_dRules[i].IsPrefix() && m_dRules[i].IsCrossProduct() ) bHaveCrossSuffix = true; m_bCheckCrosses = bHaveCrossPrefix && bHaveCrossSuffix; return bResult; }
char CISpellAffix::ToLowerCase ( char cChar ) { if ( m_bFirstCaseConv ) { LoadLocale (); m_bFirstCaseConv = false; } // dictionary conversion if ( m_bUseDictConversion ) return m_dCharset [(BYTE) cChar] ? m_dCharset [(BYTE) cChar] : cChar; // user-defined character mapping if ( m_bUseLowerCaser ) { char cResult = (char)m_LowerCaser.ToLower ( (BYTE) cChar ); return cResult ? cResult : cChar; } // user-specified code page conversion return (char)tolower ( (BYTE)cChar ); // workaround for systems (eg. FreeBSD) which default to signed char. marvelous! }
void CharsetFold ( CSphIndex * pIndex, FILE * fp ) { CSphVector<BYTE> sBuf1 ( 16384 ); CSphVector<BYTE> sBuf2 ( 16384 ); bool bUtf = pIndex->GetTokenizer()->IsUtf8(); if ( !bUtf ) sphDie ( "sorry, --fold vs SBCS is not supported just yet" ); CSphLowercaser tLC = pIndex->GetTokenizer()->GetLowercaser(); #if USE_WINDOWS setmode ( fileno(stdout), O_BINARY ); #endif int iBuf1 = 0; // how many leftover bytes from previous iteration while ( !feof(fp) ) { int iGot = fread ( sBuf1.Begin()+iBuf1, 1, sBuf1.GetLength()-iBuf1, fp ); if ( iGot<0 ) sphDie ( "read error: %s", strerror(errno) ); if ( iGot==0 ) if ( feof(fp) ) if ( iBuf1==0 ) break; const BYTE * pIn = sBuf1.Begin(); const BYTE * pInMax = pIn + iBuf1 + iGot; if ( pIn==pInMax && feof(fp) ) break; // tricky bit // on full buffer, and not an eof, terminate a bit early // to avoid codepoint vs buffer boundary issue if ( ( iBuf1+iGot )==sBuf1.GetLength() && iGot!=0 ) pInMax -= 16; // do folding BYTE * pOut = sBuf2.Begin(); BYTE * pOutMax = pOut + sBuf2.GetLength() - 16; while ( pIn < pInMax ) { int iCode = sphUTF8Decode ( pIn ); if ( iCode==0 ) pIn++; // decoder does not do that! assert ( iCode>=0 ); if ( iCode!=0x09 && iCode!=0x0A && iCode!=0x0D ) { iCode = tLC.ToLower ( iCode ) & 0xffffffUL; if ( !iCode ) iCode = 0x20; } pOut += sphUTF8Encode ( pOut, iCode ); if ( pOut>=pOutMax ) { fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout ); pOut = sBuf2.Begin(); } } fwrite ( sBuf2.Begin(), 1, pOut-sBuf2.Begin(), stdout ); // now move around leftovers BYTE * pRealEnd = sBuf1.Begin() + iBuf1 + iGot; if ( pIn < pRealEnd ) { iBuf1 = pRealEnd - pIn; memmove ( sBuf1.Begin(), pIn, iBuf1 ); } } }
void CISpellAffix::LoadLocale () { if ( m_bUseDictConversion ) printf ( "Using dictionary-defined character set\n" ); else if ( !m_sCharsetFile.IsEmpty () ) { FILE * pFile = fopen ( m_sCharsetFile.cstr (), "rt" ); if ( pFile ) { printf ( "Using charater set from '%s'\n", m_sCharsetFile.cstr () ); const int MAX_CHARSET_LENGTH = 4096; char szBuffer [MAX_CHARSET_LENGTH]; char * szResult = fgets ( szBuffer, MAX_CHARSET_LENGTH, pFile ); if ( szResult ) { CSphVector<CSphRemapRange> dRemaps; if ( sphParseCharset ( szBuffer, dRemaps ) ) { m_bUseLowerCaser = true; m_LowerCaser.AddRemaps ( dRemaps, 0 ); } else { printf ( "Failed to parse charset from '%s'\n", m_sCharsetFile.cstr() ); } } else { printf ( "Failed to read charset from '%s'\n", m_sCharsetFile.cstr() ); } fclose ( pFile ); } else { printf ( "Failed to open '%s'\n", m_sCharsetFile.cstr() ); } } else { if ( !m_sLocale.IsEmpty () ) { char dLocaleC[256], dLocaleUser[256]; setlocale ( LC_ALL, "C" ); for ( int i=0; i<256; i++ ) dLocaleC[i] = (char) tolower(i); char * szLocale = setlocale ( LC_CTYPE, m_sLocale.cstr() ); if ( szLocale ) { printf ( "Using user-defined locale (locale=%s)\n", m_sLocale.cstr() ); for ( int i=0; i<256; i++ ) dLocaleUser[i] = (char) tolower(i); if ( !memcmp ( dLocaleC, dLocaleUser, 256 ) ) printf ( "WARNING: user-defined locale provides the same case conversion as the default \"C\" locale\n" ); } else printf ( "WARNING: could not set user-defined locale for case conversions (locale=%s)\n", m_sLocale.cstr() ); } else printf ( "WARNING: no character set specified\n" ); } }