/**
 * Strips known suffixes from the end of the buffer, one per pass, until no
 * rule applies or the buffer is three characters long or shorter.
 *
 * Rules, tried in this order on every pass:
 *   - "nd"          removed when length + substCount > 5
 *   - "em", "er"    removed when length + substCount > 4
 *   - 'e', 's', 'n', 't' as the last character are removed unconditionally
 *
 * @param buffer the term to shorten in place
 */
void GermanStemmer::strip(StringBuffer& buffer) {
    for ( ;; ) {
        // Never shorten a term that is already three characters or fewer.
        if ( buffer.length() <= 3 ) {
            return;
        }

        // Two-character suffixes; "nd" requires a longer term than "em"/"er".
        bool dropPair = ( buffer.length() + substCount > 5 )
            && buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("nd"), 2 );
        if ( !dropPair && ( buffer.length() + substCount > 4 ) ) {
            dropPair = buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("em"), 2 )
                || buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("er"), 2 );
        }
        if ( dropPair ) {
            buffer.deleteChars( buffer.length() - 2, buffer.length() );
            continue;
        }

        // Single-character suffixes.
        switch ( buffer.charAt( buffer.length() - 1 ) ) {
            case _T('e'):
            case _T('s'):
            case _T('n'):
            case _T('t'):   // "t" occurs only as suffix of verbs.
                buffer.deleteCharAt( buffer.length() - 1 );
                break;      // leaves the switch; the loop runs another pass
            default:
                // No rule matched: stripping is finished.
                return;
        }
    }
}
/**
 * Applies two extra clean-up steps to an already stripped term.
 *
 *   1. If the term is longer than five characters and ends in "erin*", the
 *      trailing '*' is removed and strip() is run again.
 *   2. If the last character is 'z' it is replaced by 'x'.
 *
 * An empty buffer is left untouched.
 *
 * @param buffer the term to adjust in place
 */
void GermanStemmer::optimize(StringBuffer& buffer) {
    // Additional step for female plurals of professions and inhabitants.
    if ( buffer.length() > 5
        && buffer.substringEquals( buffer.length() - 5, buffer.length(), _T("erin*"), 5 ) ) {
        buffer.deleteCharAt( buffer.length() - 1 );
        strip( buffer );
    }

    // Additional step for irregular plural nouns like "Matrizen -> Matrix".
    // The length check prevents indexing position (length - 1) of an empty
    // buffer, which would be an out-of-range access.
    if ( buffer.length() > 0 && buffer.charAt( buffer.length() - 1 ) == _T('z') ) {
        buffer.setCharAt( buffer.length() - 1, _T('x') );
    }
}
/**
 * Depth-first enumeration of every string obtained by choosing one letter
 * per digit of {@code digits}, starting at position {@code start}.
 *
 * Each finished combination (one letter for every digit) is appended to
 * {@code result}. {@code s} holds the prefix built so far and is restored to
 * its incoming content before the method returns.
 *
 * @param digits string of digit characters to expand
 * @param result list that receives every completed combination
 * @param s      working buffer holding the letters chosen for digits before {@code start}
 * @param start  index of the digit to expand next
 * @param map    letters available for each digit value
 */
public static void DFS(String digits, List<String> result, StringBuffer s, int start, HashMap<Integer, String> map) {
    // All digits consumed: the buffer holds one complete combination.
    if (start == digits.length()) {
        result.add(s.toString());
        return;
    }

    final String letters = map.get(digits.charAt(start) - '0');
    final int prefixLength = s.length();
    for (char letter : letters.toCharArray()) {
        s.append(letter);
        DFS(digits, result, s, start + 1, map);
        // Backtrack: drop the letter just tried.
        s.setLength(prefixLength);
    }
}
void GermanStemmer::substitute(StringBuffer& buffer) { substCount = 0; for ( size_t i = 0; i < buffer.length(); i++ ) { #ifdef _UCS2 TCHAR c = buffer.charAt(i); #else unsigned char c = buffer.charAt(i); #endif // Replace the second char of a pair of the equal characters with an asterisk if ( i > 0 && c == buffer.charAt ( i - 1 ) ) { buffer.setCharAt( i, _T('*') ); } // Substitute Umlauts. else if ( c == 0xe4 ) { buffer.setCharAt( i, _T('a') ); } else if ( c == 0xf6 ) { buffer.setCharAt( i, _T('o') ); } else if ( c == 0xfc ) { buffer.setCharAt( i, _T('u') ); } // Fix bug so that 'ß' at the end of a word is replaced. else if ( c == 0xdf ) { buffer.setCharAt( i, _T('s') ); buffer.insert( i + 1, _T('s') ); substCount++; } // Take care that at least one character is left left side from the current one if ( i < buffer.length() - 1 ) { // Masking several common character combinations with an token if ( ( i < buffer.length() - 2 ) && c == _T('s') && buffer.charAt( i + 1 ) == _T('c') && buffer.charAt( i + 2 ) == _T('h') ) { buffer.setCharAt( i, _T('$') ); buffer.deleteChars( i + 1, i + 3 ); substCount =+ 2; } else if ( c == _T('c') && buffer.charAt( i + 1 ) == _T('h') ) { buffer.setCharAt( i, 0xa7 ); // section sign in UTF-16 buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('e') && buffer.charAt( i + 1 ) == _T('i') ) { buffer.setCharAt( i, _T('%') ); buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('e') ) { buffer.setCharAt( i, _T('&') ); buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('g') ) { buffer.setCharAt( i, _T('#') ); buffer.deleteCharAt( i + 1 ); substCount++; } else if ( c == _T('s') && buffer.charAt( i + 1 ) == _T('t') ) { buffer.setCharAt( i, _T('!') ); buffer.deleteCharAt( i + 1 ); substCount++; } } } }
/**
 * Rewrites the buffer in place so that later steps can treat certain
 * character groups as single characters.
 *
 *   - the second of two equal adjacent characters becomes '*'
 *   - 'ä' / 'ö' / 'ü' become 'a' / 'o' / 'u'
 *   - 'ß' becomes "ss"
 *   - "sch" -> '$', "ch" -> '§', "ei" -> '%', "ie" -> '&',
 *     "ig" -> '#', "st" -> '!'
 *
 * substCount is reset to 0 on entry, incremented by 1 for every 'ß'
 * expansion and every two-letter combination, and by 2 for every "sch".
 *
 * NOTE(review): the non-ASCII character literals below depend on the source
 * file encoding and on the width of TCHAR — confirm against the build setup.
 *
 * @param buffer the term to rewrite in place
 */
void GermanStemmer::substitute(StringBuffer& buffer) {
    substCount = 0;

    for ( size_t c = 0; c < buffer.length(); c++ ) {
        // Replace the second char of a pair of the equal characters with an asterisk
        if ( c > 0 && buffer.charAt( c ) == buffer.charAt( c - 1 ) ) {
            buffer.setCharAt( c, _T('*') );
        }
        // Substitute Umlauts.
        else if ( buffer.charAt( c ) == _T('ä') ) {
            buffer.setCharAt( c, _T('a') );
        }
        else if ( buffer.charAt( c ) == _T('ö') ) {
            buffer.setCharAt( c, _T('o') );
        }
        else if ( buffer.charAt( c ) == _T('ü') ) {
            buffer.setCharAt( c, _T('u') );
        }
        // Fix bug so that 'ß' at the end of a word is replaced.
        else if ( buffer.charAt( c ) == _T('ß') ) {
            buffer.setCharAt( c, _T('s') );
            buffer.insert( c + 1, _T('s') );
            substCount++;
        }

        // Take care that at least one character remains to the right of the current one
        if ( c < buffer.length() - 1 ) {
            // Masking several common character combinations with a token
            if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == _T('s')
                && buffer.charAt( c + 1 ) == _T('c') && buffer.charAt( c + 2 ) == _T('h') ) {
                buffer.setCharAt( c, _T('$') );
                buffer.deleteChars( c + 1, c + 3 );
                // Two characters were removed; accumulate rather than assign
                // (the previous "=+ 2" overwrote the running count with 2).
                substCount += 2;
            }
            else if ( buffer.charAt( c ) == _T('c') && buffer.charAt( c + 1 ) == _T('h') ) {
                buffer.setCharAt( c, _T('§') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('e') && buffer.charAt( c + 1 ) == _T('i') ) {
                buffer.setCharAt( c, _T('%') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('i') && buffer.charAt( c + 1 ) == _T('e') ) {
                buffer.setCharAt( c, _T('&') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('i') && buffer.charAt( c + 1 ) == _T('g') ) {
                buffer.setCharAt( c, _T('#') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
            else if ( buffer.charAt( c ) == _T('s') && buffer.charAt( c + 1 ) == _T('t') ) {
                buffer.setCharAt( c, _T('!') );
                buffer.deleteCharAt( c + 1 );
                substCount++;
            }
        }
    }
}