Exemple #1
0
/* Report whether key P is stored in the tree rooted at T.
   Walks down from T, going left when P is smaller than the current
   node's key and right otherwise.  Returns 1 when a node whose key
   equals P is reached, 0 when the walk falls off the tree (including
   when T itself is NULL).  */
int
trie_contains (trie_t *t, unsigned long p)
{
  while (t != NULL)
    {
      if (p == t->p)
        return 1;
      t = (p < t->p) ? t->left : t->right;
    }
  return 0;
}
Exemple #2
0
/*
 * Registers 'word' in the 'deligatured_words' trie, keyed by its
 * deligatured spelling (the string returned by mark_ligatures()).
 * Words for which word_ligatures() reports no ligatures are ignored.
 *
 * When a *different* source word is already stored under the same
 * deligatured key, a warning is printed to stderr and the existing
 * entry is replaced by one whose word is "{existing|new}"; the user is
 * expected to edit that entry by hand in the generated file.
 *
 * On success the new trie entry's LigatureData holds the position/id
 * arrays obtained from word_ligatures().  If the entry cannot be added
 * (e.g. the identical word was already registered, so trie_add()
 * returns 0), the freshly built LigatureData is released through
 * _free_ligature_data_callback2() -- the same callback used for
 * entries stored in this trie -- rather than being leaked.
 */
void add_deligatured_word( unsigned char *word, Trie *deligatured_words ) {

  int *ligature_positions;
  int *ligature_ids;
  int ligature_count = word_ligatures( word, &ligature_positions, &ligature_ids );
  if ( ligature_count <= 0 ) {
    return;
  }

  unsigned char *deligatured_word = mark_ligatures( word,
                                                    ligature_positions,
                                                    ligature_ids,
                                                    ligature_count );
  unsigned char *merged_word = NULL;
  LigatureData *lig_data = NULL;
  int replace_existing = 0;

  // warn user when 2 "real" words map to the same deligatured word
  Trie *node = trie_contains( deligatured_words, deligatured_word );
  if ( node ) {
    unsigned char *existing_word = ( (LigatureData*) node->data )->word;
    // TODO: after a 2nd "real" source word is added to
    // node->data->word, reoccurring source words will be repeated
    // in our concatenated list; not a big deal, since this entry
    // will have to be edited by the user anyway, but we could do
    // better...
    if ( strcmp( (char*) word, (char*) existing_word ) != 0 ) {
      // the existing node will be replaced with one whose "real" word
      // is a concatenation of both words; the user will be required
      // to edit the result in the generated file
      fprintf( stderr,
               "warning: deligatured word '%s' has multiple source words: %s, %s; EDIT FILE!\n",
               deligatured_word,
               word,
               existing_word );
      // '{' + existing + '|' + word + '}' + terminating NUL
      size_t merged_size = strlen( (char*) existing_word ) +
                           strlen( (char*) word ) + 4;
      merged_word = malloc( merged_size );
      if ( !merged_word ) {
        goto out_of_memory;
      }
      snprintf( (char*) merged_word, merged_size, "{%s|%s}", existing_word, word );
      word = merged_word;
      replace_existing = 1;
    }
  }

  // build the complete replacement before touching the trie, so that
  // an allocation failure cannot lose the existing entry
  lig_data = malloc( sizeof *lig_data );
  if ( !lig_data ) {
    goto out_of_memory;
  }
  lig_data->word = (unsigned char*) strdup( (char*) word );
  if ( !lig_data->word ) {
    goto out_of_memory;
  }
  lig_data->ligature_positions = ligature_positions;
  lig_data->ligature_ids = ligature_ids;
  lig_data->ligature_count = ligature_count;

  if ( replace_existing ) {
    // 'existing_word' belongs to the entry being removed; it was
    // already copied into 'merged_word' above
    trie_remove( deligatured_words, deligatured_word, _free_ligature_data_callback2 );
  }
  if ( !trie_add( deligatured_words, deligatured_word, lig_data ) ) {
    // the trie did not take the entry (duplicate or invalid word), so
    // nothing else references lig_data: release it ourselves
    _free_ligature_data_callback2( lig_data );
  }
  free( deligatured_word );
  free( merged_word );
  return;

out_of_memory:
  fprintf( stderr, "error: out of memory while adding '%s'\n", deligatured_word );
  free( lig_data );
  free( merged_word );
  free( deligatured_word );
  // NOTE(review): assumes the arrays handed back by word_ligatures()
  // are malloc'd, like the other buffers in this function -- confirm
  free( ligature_positions );
  free( ligature_ids );
}
Exemple #3
0
/* "Re-ligatures" the specified deligatured word by finding the
   corresponding "real" word in the global 'deligatured_words' trie.
   Modifies the 'word' argument in place; 'n' is the capacity of the
   'word' buffer in bytes and must be large enough to hold the "real"
   word (a longer real word is cut off at n-1 characters and
   null-terminated).  Returns the difference in length between the
   real word and the deligatured word (the real word is expected to be
   the longer one), or 0 when the word is not found or 'n' is not
   positive, in which case 'word' is left untouched.  THIS FUNCTION
   CAN ONLY BE CALLED AFTER load_ligatures has been called to
   initialize our global 'deligatured_words' variable.
*/
int religature_word( unsigned char *word, int n ) {
  if ( n <= 0 ) {
    // no room even for the terminator; writing word[n-1] would land
    // outside the buffer
    return 0;
  }
  Trie *node = trie_contains( deligatured_words, word );
  if ( !node ) {
    return 0;
  }
  int deligatured_word_len = (int) strlen( (char*) word );
  unsigned char *real_word = (unsigned char*) node->data;
  strncpy( (char*) word, (char*) real_word, n );
  word[n-1] = 0; // ensure null-termination
  return (int) strlen( (char*) real_word ) - deligatured_word_len;
}
Exemple #4
0
/*
 * Logically deletes the word 's' from trie 't'.
 *
 * The node found for 's' is kept in place and merely stops being
 * flagged as the end of a string, so the trie is never compacted by
 * this call (rebuild the trie if compactness matters).  When both a
 * 'free_data_callback' and node data are present, the callback is
 * invoked on the data and the node's data pointer is cleared.
 *
 * Returns 1 if 's' was found, 0 otherwise.
 */
int trie_remove( Trie *t,
                 unsigned char* s,
                 void (*free_data_callback) (void*) ) {
  Trie *target = trie_contains( t, s );
  if ( target == NULL ) {
    return 0;
  }

  target->is_end_of_string = 0;
  if ( free_data_callback != NULL && target->data != NULL ) {
    free_data_callback( target->data );
    target->data = NULL;
  }
  return 1;
}
Exemple #5
0
/* Mapping lookup (D[key]).
 *
 * Returns a new reference to the value stored under py_key.  Returns
 * NULL with TypeError set when the key is not a string (or, on
 * Python 3, cannot be encoded as ASCII), and NULL with KeyError set
 * when the trie holds no value for the key.
 */
static PyObject *
trie_subscript(trieobject *mp, PyObject *py_key)
{
    const char *key;
    PyObject *py_value;
#ifdef IS_PY3K
    PyObject* bytes;
#endif

    /* Make sure key is a string. */
#ifdef IS_PY3K
    if(!PyUnicode_Check(py_key)) {
#else
    if(!PyString_Check(py_key)) {
#endif
        PyErr_SetString(PyExc_TypeError, "key must be a string");
        return NULL;
    }
#ifdef IS_PY3K
    /* The encoded bytes object is a new reference that owns the
       buffer 'key' points into; keep it alive until we are done with
       'key' and release it before returning. */
    bytes = PyUnicode_AsASCIIString(py_key);
    if(!bytes) {
        PyErr_SetString(PyExc_TypeError, "key must be an ASCII string");
        return NULL;
    }
    key = PyBytes_AsString(bytes);
#else
    key = PyString_AS_STRING(py_key);
#endif
    py_value = Trie_get(mp->trie, key);
    if(py_value == NULL)
        PyErr_SetString(PyExc_KeyError, key);
    else
        Py_INCREF(py_value);
#ifdef IS_PY3K
    Py_DECREF(bytes);
#endif
    return py_value;
}

/* Mapping assignment / deletion (D[key] = value, del D[key]).
 *
 * A NULL py_value requests deletion; the Trie API has no delete, so
 * the slot is overwritten with NULL.  Returns 0 on success and -1 with
 * an exception set on failure: TypeError for a non-string / non-ASCII
 * key, KeyError when deleting a missing key, AssertionError when
 * Trie_set reports an error.
 *
 * The trie owns one reference to every stored value.  The reference to
 * a replaced value is dropped only after the trie no longer points at
 * it, and only when the operation succeeded.
 */
static int
trie_ass_sub(trieobject *mp, PyObject *py_key, PyObject *py_value)
{
    int result = -1;
    const char *key;
    PyObject *py_prev;
#ifdef IS_PY3K
    PyObject* bytes;
#endif

    /* Make sure key is a string. */
#ifdef IS_PY3K
    if(!PyUnicode_Check(py_key)) {
#else
    if(!PyString_Check(py_key)) {
#endif
        PyErr_SetString(PyExc_TypeError, "key must be a string");
        return -1;
    }
#ifdef IS_PY3K
    bytes = PyUnicode_AsASCIIString(py_key);
    if(!bytes) {
        PyErr_SetString(PyExc_TypeError, "key must be an ASCII string");
        return -1;
    }
    key = PyBytes_AsString(bytes);
#else
    key = PyString_AS_STRING(py_key);
#endif

    /* Remember whatever is currently stored at that key.  Its
       reference is released at the end, once it has actually been
       replaced. */
    py_prev = Trie_get(mp->trie, key);

    /* The client wants to delete a key from a dictionary.  The Trie
       API doesn't support this, so I will just overwrite it with
       NULL. */
    if(!py_value) {
        /* If the key doesn't exist, raise a KeyError. */
        if(!py_prev)
            PyErr_SetString(PyExc_KeyError, key);
        else if(Trie_set(mp->trie, key, NULL))
            PyErr_SetString(PyExc_AssertionError, "error setting trie");
        else
            result = 0;
    }
    /* The client wants to set a key in the dictionary. */
    else {
        Py_INCREF(py_value);
        if(Trie_set(mp->trie, key, py_value)) {
            /* The trie did not take the value: give back the
               reference acquired for it. */
            Py_DECREF(py_value);
            PyErr_SetString(PyExc_AssertionError, "error setting trie");
        }
        else
            result = 0;
    }
#ifdef IS_PY3K
    Py_DECREF(bytes);
#endif
    /* Dropping the old value may run arbitrary code (its destructor),
       so do it last, when the trie no longer refers to it. */
    if(result == 0) {
        Py_XDECREF(py_prev);
    }
    return result;
}

/* Membership test (key in D).
 *
 * Returns the result of Trie_has_key() for the key, or -1 with
 * TypeError set when py_key is not a string (or, on Python 3, cannot
 * be encoded as ASCII).
 */
static int trie_contains(trieobject *mp, PyObject* py_key)
{
    const char *ascii_key;
    int found;
#ifdef IS_PY3K
    PyObject* encoded;
    const int key_is_string = PyUnicode_Check(py_key);
#else
    const int key_is_string = PyString_Check(py_key);
#endif

    if(!key_is_string) {
        PyErr_SetString(PyExc_TypeError, "key must be a string");
        return -1;
    }

#ifdef IS_PY3K
    /* 'encoded' owns the buffer behind 'ascii_key'; release it only
       after the lookup has finished. */
    encoded = PyUnicode_AsASCIIString(py_key);
    if(encoded == NULL) {
        PyErr_SetString(PyExc_TypeError, "key must be an ASCII string");
        return -1;
    }
    ascii_key = PyBytes_AsString(encoded);
    found = Trie_has_key(mp->trie, ascii_key);
    Py_DECREF(encoded);
#else
    ascii_key = PyString_AS_STRING(py_key);
    found = Trie_has_key(mp->trie, ascii_key);
#endif
    return found;
}

/* Docstring text for the has_key method (presumably referenced from
   the type's method table, which is not visible here). */
static char has_key__doc__[] =
    "D.has_key(k) -> 1 if D has a key k, else 0";

/* D.has_key(k): wraps trie_contains() and boxes its result as a
 * Python integer.  Returns NULL (leaving the exception raised by
 * trie_contains in place) when the key is rejected.
 */
static PyObject *
trie_has_key(trieobject *mp, PyObject *py_key)
{
    const int found = trie_contains(mp, py_key);

    if (found == -1) {
        return NULL;
    }
#ifdef IS_PY3K
    return PyLong_FromLong((long)found);
#else
    return PyInt_FromLong((long)found);
#endif
}
Exemple #6
0
/* Add a word (and an optional associated data pointer) to the trie.
 *
 * 't' is the root of the trie, 's' the word to add, and 'data' an
 * optional user pointer stored on the node that terminates the word.
 * 's', the word, will be duplicated by the trie (presumably inside
 * trie_new_child(); the caller therefore keeps ownership of its own
 * string -- TODO confirm).  If 'data' is not null, be sure to write
 * and specify a callback function for the 'free_data_callback'
 * argument of trie_free(), so that this "user" data is freed.
 *
 * Returns 1 when the word was added.  Returns 0, leaving the trie
 * unmodified, when the word is rejected by _is_valid_word() or is
 * already present according to trie_contains().
 *
 * Nodes carry a multi-character 'prefix' (a "folded path"); adding a
 * word that diverges in the middle of such a prefix splits the node
 * in two, as described in the comments below.
 */
int trie_add( Trie *t, unsigned char* s, void *data ) {
  // 'last' tracks the parent under which a new child may be created
  Trie *last = t;
  // scratch copy of the prefix of the child currently being examined
  // (appears to be filled in by trie_child(); assumes it never needs
  // more than MAX_WORD_LEN + 1 bytes -- TODO confirm)
  unsigned char prefix[MAX_WORD_LEN + 1];
  prefix[0] = 0;
  // check validity of word before (potentially) making any changes to
  // the trie, which might otherwise leave it with a partially-added
  // path (because an invalid char is found later in the word)
  if ( !_is_valid_word( s ) ) {
    return 0;
  }
  // (we could make this duplication check while we add, but this is
  // more readable)
  if ( trie_contains( t, s ) ) {
    return 0;
  }
  do {
    // terminal cases: either the whole word has been consumed (mark
    // the current node), or there is no node for the remainder yet
    // (create one under 'last' holding the rest of 's')
    if ( *s == 0 || t == NULL ) {
      if ( t == NULL ) {
        // it's now time to allocate our new child node
        t = trie_new_child( last, s );
      }
      t->is_end_of_string = 1;
      t->data = data;
      return 1;
    } 
    else {
      last = t;
      // look up the child selected by the next character of 's'
      Trie *child = trie_child( t, s[0], prefix );
      if ( !child ) {
        // cause logic, above, to create a new child
        t = NULL;
      } else {
        // n = length of the common head of 'prefix' and 's'
        int n = 0;
        while ( prefix[n] && s[n] && prefix[n] == s[n] ) {
          ++n;
        }
        // if 's' is a prefix of child->prefix, or if 's' differs from
        // prefix at some point, then we must split the folded-path
        // child into two nodes
        if ( prefix[n] != 0 ) {
          // insert new intermediate node, breaking up a folded path
          // (the slot index must be taken before the child's prefix is
          // shortened below)
          int old_child_index = _char_to_index( child->prefix[0] );
          // set the child's prefix to the tail of the original prefix
          memmove( child->prefix, child->prefix + n, strlen( child->prefix + n ) + 1 );
          // orphan the child (for later re-parenting)
          t->children[old_child_index] = NULL;
          // insert the new intermediate child, assigning it the head
          // of the original prefix; 't' will now represent the new,
          // intermediate child
          prefix[n] = 0;
          t = trie_new_child( t, prefix );
          // make the new intermediate child the parent of the
          // original child
          _trie_alloc_children_array( t );
          t->children[_char_to_index( child->prefix[0] )] = child;
          // we now resume the normal loop logic: if 's' is exhausted
          // after skipping the common head, the intermediate node is
          // marked as the end of the word and we return; otherwise
          // the next iterations hang the remainder of 's' below the
          // intermediate node (presumably as a brand-new child, since
          // its only child starts with a different character)
        } else {
          // the whole child prefix matched: descend into the child
          t = child;
        }
        // skip the part of 's' covered by the matched prefix head
        s += n;
      }
    }
  } while ( 1 );
  // not reached: every exit from the loop above is a return
  return 1;
}