// Removes suffixes from the end of the buffer, one per pass, until the
// buffer is no longer than 3 characters or no suffix rule matches.
//
// Rules are tried in this order on every pass:
//   - "nd"        (only if length + substCount > 5)  -> drop 2 chars
//   - "em", "er"  (only if length + substCount > 4)  -> drop 2 chars
//   - "e", "s", "n", "t"                             -> drop 1 char
// substCount is added to the current length when evaluating the
// thresholds of the two-letter rules.
void GermanStemmer::strip(StringBuffer& buffer) {
    while ( buffer.length() > 3 ) {
        const size_t end = buffer.length();
        const size_t tailStart = end - 2;

        // Two-letter suffix "nd".
        if ( ( end + substCount > 5 )
             && buffer.substringEquals( tailStart, end, _T("nd"), 2 ) ) {
            buffer.deleteChars( tailStart, end );
            continue;
        }

        // Two-letter suffixes "em" and "er" share the same threshold.
        if ( ( end + substCount > 4 )
             && ( buffer.substringEquals( tailStart, end, _T("em"), 2 )
                  || buffer.substringEquals( tailStart, end, _T("er"), 2 ) ) ) {
            buffer.deleteChars( tailStart, end );
            continue;
        }

        // Single-letter suffixes.
        switch ( buffer.charAt( end - 1 ) ) {
            case _T('e'):
            case _T('s'):
            case _T('n'):
            case _T('t'):   // "t" occurs only as suffix of verbs.
                buffer.deleteCharAt( end - 1 );
                break;
            default:
                // Nothing matched: stripping is finished.
                return;
        }
    }
}
// Looks for the first occurrence of "gege" in the buffer and, if found,
// deletes its first two characters (leaving a single "ge"). Only the
// first occurrence is handled. Buffers of 4 characters or fewer are left
// untouched.
void GermanStemmer::removeParticleDenotion(StringBuffer& buffer) {
    if ( buffer.length() <= 4 ) {
        return;
    }

    // pos + 3 < length is the same bound as pos < length - 3; the guard
    // above guarantees the length is at least 5 here.
    size_t pos = 0;
    while ( pos + 3 < buffer.length() ) {
        if ( buffer.substringEquals( pos, pos + 4, _T("gege"), 4 ) ) {
            buffer.deleteChars( pos, pos + 2 );
            return;
        }
        ++pos;
    }
}
void GermanStemmer::substitute(StringBuffer& buffer) { substCount = 0; for ( size_t i = 0; i < buffer.length(); i++ ) { #ifdef _UCS2 TCHAR c = buffer.charAt(i); #else unsigned char c = buffer.charAt(i); #endif // Replace the second char of a pair of the equal characters with an asterisk if ( i > 0 && c == buffer.charAt ( i - 1 ) ) { buffer.setCharAt( i, _T('*') ); } // Substitute Umlauts. else if ( c == 0xe4 ) { buffer.setCharAt( i, _T('a') ); } else if ( c == 0xf6 ) { buffer.setCharAt( i, _T('o') ); } else if ( c == 0xfc ) { buffer.setCharAt( i, _T('u') ); } // Fix bug so that 'ß' at the end of a word is replaced. else if ( c == 0xdf ) { buffer.setCharAt( i, _T('s') ); buffer.insert( i + 1, _T('s') ); substCount++; } // Take care that at least one character is left left side from the current one if ( i < buffer.length() - 1 ) { // Masking several common character combinations with an token if ( ( i < buffer.length() - 2 ) && c == _T('s') && buffer.charAt( i + 1 ) == _T('c') && buffer.charAt( i + 2 ) == _T('h') ) { buffer.setCharAt( i, _T('$') ); buffer.deleteChars( i + 1, i + 3 ); substCount =+ 2; } else if ( c == _T('c') && buffer.charAt( i + 1 ) == _T('h') ) { buffer.setCharAt( i, 0xa7 ); // section sign in UTF-16 buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('e') && buffer.charAt( i + 1 ) == _T('i') ) { buffer.setCharAt( i, _T('%') ); buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('e') ) { buffer.setCharAt( i, _T('&') ); buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('g') ) { buffer.setCharAt( i, _T('#') ); buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('s') && buffer.charAt( i + 1 ) == _T('t') ) { buffer.setCharAt( i, _T('!') ); buffer.deleteCharAt( i + 1 ); substCount++; } } } }
// Variant of substitute() that compares against character literals and
// re-reads the buffer at the current position for every check.
//
// Rewrites the buffer in place before suffix stripping:
//   - the second of two equal adjacent characters becomes '*'
//   - umlauts become their base vowel, sharp s becomes "ss"
//   - the combinations "sch", "ch", "ei", "ie", "ig", "st" are each
//     replaced by a single placeholder character
// substCount is reset to 0 on entry and increased for the sharp-s
// replacement and for every combination that is collapsed.
//
// NOTE(review): another definition of GermanStemmer::substitute with the
// same signature (using numeric character codes) appears in this file;
// presumably only one of the two is meant to be compiled - confirm.
// NOTE(review): the non-ASCII character literals below depend on how the
// compiler interprets the source file's encoding - confirm.
void GermanStemmer::substitute(StringBuffer& buffer) {
    substCount = 0;

    for ( size_t c = 0; c < buffer.length(); c++ ) {
        // Replace the second char of a pair of equal characters with an asterisk.
        if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
            buffer.setCharAt( c, _T('*') );
        }
        // Substitute umlauts.
        else if ( buffer.charAt( c ) == _T('ä') ) {
            buffer.setCharAt( c, _T('a') );
        }
        else if ( buffer.charAt( c ) == _T('ö') ) {
            buffer.setCharAt( c, _T('o') );
        }
        else if ( buffer.charAt( c ) == _T('ü') ) {
            buffer.setCharAt( c, _T('u') );
        }
        // Sharp s is replaced by "ss", also at the end of a word.
        else if ( buffer.charAt( c ) == _T('ß') ) {
            buffer.setCharAt( c, _T('s') );
            buffer.insert( c + 1, _T('s') );
            substCount++;
        }

        // Make sure at least one character follows the current one.
        if ( c < buffer.length() - 1 ) {
            // Mask several common character combinations with a token.
            if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == _T('s') &&
                 buffer.charAt( c + 1 ) == _T('c') && buffer.charAt( c + 2 ) == _T('h') ) {
                buffer.setCharAt( c, _T('$') );
                buffer.deleteChars( c + 1, c + 3 );
                // Two characters were removed. This was "=+ 2", which
                // assigned +2 and discarded the count accumulated so far.
                substCount += 2;
            }
            else if ( buffer.charAt( c ) == _T('c') && buffer.charAt( c + 1 ) == _T('h') ) {
                buffer.setCharAt( c, _T('§') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('e') && buffer.charAt( c + 1 ) == _T('i') ) {
                buffer.setCharAt( c, _T('%') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('i') && buffer.charAt( c + 1 ) == _T('e') ) {
                buffer.setCharAt( c, _T('&') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('i') && buffer.charAt( c + 1 ) == _T('g') ) {
                buffer.setCharAt( c, _T('#') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('s') && buffer.charAt( c + 1 ) == _T('t') ) {
                buffer.setCharAt( c, _T('!') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
        }
    }
}