// Repeatedly removes one known suffix from the end of the buffer.
//
// Each pass removes at most one suffix: "nd", "em" or "er" (each only when
// buffer.length() + substCount exceeds a minimum), or a single trailing
// 'e', 's', 'n' or 't'. The loop ends once the buffer has three characters
// or fewer, or when no suffix matches.
//
// buffer: the term being stemmed; modified in place.
void GermanStemmer::strip(StringBuffer& buffer)
{
    while ( buffer.length() > 3 ) {
        // "nd" needs buffer.length() + substCount to be greater than 5.
        if ( ( buffer.length() + substCount > 5 ) &&
                buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("nd"), 2 ) )
        {
            buffer.deleteChars( buffer.length() - 2, buffer.length() );
            continue;
        }

        // "em" and "er" share the same, slightly lower, threshold.
        if ( ( buffer.length() + substCount > 4 ) &&
                ( buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("em"), 2 ) ||
                  buffer.substringEquals( buffer.length() - 2, buffer.length(), _T("er"), 2 ) ) )
        {
            buffer.deleteChars( buffer.length() - 2, buffer.length() );
            continue;
        }

        // Single-character suffixes are removed without a length threshold.
        switch ( buffer.charAt( buffer.length() - 1 ) ) {
            case _T('e'):
            case _T('s'):
            case _T('n'):
            case _T('t'): // "t" occurs only as suffix of verbs.
                buffer.deleteCharAt( buffer.length() - 1 );
                break;
            default:
                // Nothing left to strip.
                return;
        }
    }
}
// Removes the first two characters of the first "gege" sequence found in the
// buffer, turning "gege" into "ge". Buffers of four characters or fewer are
// left untouched, and at most one occurrence is handled per call.
//
// buffer: the term being stemmed; modified in place.
void GermanStemmer::removeParticleDenotion(StringBuffer& buffer) {
    if ( buffer.length() <= 4 ) {
        return;
    }

    // Slide a four-character window over the buffer.
    for ( size_t pos = 0; pos < buffer.length() - 3; ++pos ) {
        if ( !buffer.substringEquals( pos, pos + 4, _T("gege"), 4 ) ) {
            continue;
        }
        buffer.deleteChars( pos, pos + 2 );
        break;
    }
}
// Rewrites the buffer in place before suffix stripping:
//   - the second of two equal adjacent characters becomes '*';
//   - 0xe4, 0xf6, 0xfc (a/o/u umlauts) become 'a', 'o', 'u';
//   - 0xdf (sharp s) becomes "ss";
//   - the combinations "sch", "ch", "ei", "ie", "ig", "st" are each collapsed
//     into a single placeholder character ('$', 0xa7, '%', '&', '#', '!').
//
// substCount is reset to 0 on entry, then increased by 1 for every sharp-s
// expansion and every collapsed two-character combination, and by 2 for
// every collapsed "sch".
//
// buffer: the term being stemmed; modified in place.
void GermanStemmer::substitute(StringBuffer& buffer) {
    substCount = 0;

    for ( size_t i = 0; i < buffer.length(); i++ ) {
        // c is read once, before any replacement below.
        // NOTE(review): the combination checks further down therefore compare
        // against the pre-substitution character, not against what
        // buffer.charAt(i) holds after the first if-chain -- confirm that
        // this is intended.
#ifdef _UCS2
        TCHAR c = buffer.charAt(i);
#else
        unsigned char c = buffer.charAt(i);
#endif
        // Replace the second char of a pair of the equal characters with an asterisk
        if ( i > 0 && c == buffer.charAt ( i - 1 )  ) {
            buffer.setCharAt( i, _T('*') );
        }
        // Substitute Umlauts.
        else if ( c  == 0xe4 ) {
            buffer.setCharAt( i, _T('a') );
        }
        else if ( c == 0xf6 ) {
            buffer.setCharAt( i, _T('o') );
        }
        else if ( c == 0xfc ) {
            buffer.setCharAt( i, _T('u') );
        }
        // Sharp s (0xdf) is expanded to "ss", also at the end of a word.
        else if ( c == 0xdf ) {
            buffer.setCharAt( i, _T('s') );
            buffer.insert( i + 1, _T('s') );
            substCount++;
        }
        // Make sure at least one character remains to the right of the current one
        if ( i < buffer.length() - 1 ) {
            // Mask several common character combinations with a token
            if ( ( i < buffer.length() - 2 ) && c == _T('s') &&
                    buffer.charAt( i + 1 ) == _T('c') && buffer.charAt( i + 2 ) == _T('h') )
            {
                buffer.setCharAt( i, _T('$') );
                buffer.deleteChars( i + 1, i + 3 );
                // Two characters were removed; add to the running count.
                // (This used to read "=+ 2", which overwrote the count with 2.)
                substCount += 2;
            }
            else if ( c == _T('c') && buffer.charAt( i + 1 ) == _T('h') ) {
                buffer.setCharAt( i, 0xa7 ); // section sign (U+00A7)
                buffer.deleteCharAt( i + 1 );
                substCount++;
            }
            else if ( c == _T('e') && buffer.charAt( i + 1 ) == _T('i') ) {
                buffer.setCharAt( i, _T('%') );
                buffer.deleteCharAt( i + 1 );
                substCount++;
            }
            else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('e') ) {
                buffer.setCharAt( i, _T('&') );
                buffer.deleteCharAt( i + 1 );
                substCount++;
            }
            else if ( c == _T('i') && buffer.charAt( i + 1 ) == _T('g') ) {
                buffer.setCharAt( i, _T('#') );
                buffer.deleteCharAt( i + 1 );
                substCount++;
            }
            else if ( c == _T('s') && buffer.charAt( i + 1 ) == _T('t') ) {
                buffer.setCharAt( i, _T('!') );
                buffer.deleteCharAt( i + 1 );
                substCount++;
            }
        }
    }
}
// Example #4
// 0
    // Rewrites the buffer in place before suffix stripping:
    //   - the second of two equal adjacent characters becomes '*';
    //   - the umlauts become 'a', 'o', 'u' and sharp s becomes "ss";
    //   - the combinations "sch", "ch", "ei", "ie", "ig", "st" are each
    //     collapsed into a single placeholder character.
    //
    // substCount is reset to 0 on entry, then increased by 1 for every
    // sharp-s expansion and every collapsed two-character combination, and
    // by 2 for every collapsed "sch".
    //
    // buffer: the term being stemmed; modified in place.
    void GermanStemmer::substitute(StringBuffer& buffer) {
      substCount = 0;

      for ( size_t c = 0; c < buffer.length(); c++ ) {
        // Replace the second char of a pair of the equal characters with an asterisk
        if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 )  ) {
          buffer.setCharAt( c, _T('*') );
        }
        // Substitute Umlauts.
        else if ( buffer.charAt( c ) == _T('ä') ) {
          buffer.setCharAt( c, _T('a') );
        }
        else if ( buffer.charAt( c ) == _T('ö') ) {
          buffer.setCharAt( c, _T('o') );
        }
        else if ( buffer.charAt( c ) == _T('ü') ) {
          buffer.setCharAt( c, _T('u') );
        }
        // Sharp s is expanded to "ss", also at the end of a word.
        else if ( buffer.charAt( c ) == _T('ß') ) {
          buffer.setCharAt( c, _T('s') );
          buffer.insert( c + 1, _T('s') );
          substCount++;
        }
        // Make sure at least one character remains to the right of the current one
        if ( c < buffer.length() - 1 ) {
          // Mask several common character combinations with a token
          if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == _T('s') &&
            buffer.charAt( c + 1 ) == _T('c') && buffer.charAt( c + 2 ) == _T('h') )
          {
            buffer.setCharAt( c, _T('$') );
            buffer.deleteChars( c + 1, c + 3 );
            // Two characters were removed; add to the running count.
            // (This used to read "=+ 2", which overwrote the count with 2.)
            substCount += 2;
          }
          else if ( buffer.charAt( c ) == _T('c') && buffer.charAt( c + 1 ) == _T('h') ) {
            buffer.setCharAt( c, _T('§') );
            buffer.deleteCharAt( c + 1 );
            substCount++;
          }
          else if ( buffer.charAt( c ) == _T('e') && buffer.charAt( c + 1 ) == _T('i') ) {
            buffer.setCharAt( c, _T('%') );
            buffer.deleteCharAt( c + 1 );
            substCount++;
          }
          else if ( buffer.charAt( c ) == _T('i') && buffer.charAt( c + 1 ) == _T('e') ) {
            buffer.setCharAt( c, _T('&') );
            buffer.deleteCharAt( c + 1 );
            substCount++;
          }
          else if ( buffer.charAt( c ) == _T('i') && buffer.charAt( c + 1 ) == _T('g') ) {
            buffer.setCharAt( c, _T('#') );
            buffer.deleteCharAt( c + 1 );
            substCount++;
          }
          else if ( buffer.charAt( c ) == _T('s') && buffer.charAt( c + 1 ) == _T('t') ) {
            buffer.setCharAt( c, _T('!') );
            buffer.deleteCharAt( c + 1 );
            substCount++;
          }
        }
      }
    }