/*
 * Reports whether the key 'p' is present in the binary search tree rooted
 * at 't'.
 *
 * Returns 1 when a node whose key equals 'p' is found, and 0 otherwise
 * (including when 't' is NULL).
 */
int trie_contains (trie_t *t, unsigned long p)
{
  trie_t *cursor = t;

  /* smaller keys live in the left subtree, larger keys in the right one */
  while (cursor != NULL)
    {
      if (p == cursor->p)
        return 1;
      cursor = (p < cursor->p) ? cursor->left : cursor->right;
    }

  return 0;
}
/*
 * Records 'word' in the 'deligatured_words' trie, keyed by its deligatured
 * spelling (as produced by mark_ligatures()). Words for which
 * word_ligatures() reports no ligatures are ignored.
 *
 * The data attached to the trie node is a heap-allocated LigatureData that
 * holds a private copy of the "real" word plus the ligature position/id
 * arrays returned by word_ligatures().
 *
 * When a *different* "real" word already maps to the same deligatured
 * spelling, a warning is written to stderr and the existing entry is
 * replaced by one whose word is "{existing|new}"; the user is expected to
 * edit that entry in the generated file.
 *
 * If trie_add() refuses the entry (it returns 0, e.g. because this very
 * word was registered before), the freshly built LigatureData is released
 * through the same callback the trie uses for node data, so nothing leaks.
 */
void add_deligatured_word( unsigned char *word, Trie *deligatured_words )
{
  int *ligature_positions;
  int *ligature_ids;
  int ligature_count = word_ligatures( word, &ligature_positions, &ligature_ids );

  // printf( "%s (%d)\n", word, ligature_count );
  if ( ligature_count <= 0 ) {
    return;
  }

  unsigned char *deligatured_word =
    mark_ligatures( word, ligature_positions, ligature_ids, ligature_count );
  /* printf( "%s ==> %s\n", word, deligatured_word ); */

  unsigned char *buf = NULL;      // "{existing|new}" text, only on a clash
  LigatureData *lig_data = NULL;
  int replace_existing = 0;

  // warn user when 2 "real" words map to the same deligatured word
  Trie *node = trie_contains( deligatured_words, deligatured_word );
  if ( node ) {
    unsigned char *existing_word = ( (LigatureData*) node->data )->word;

    // TODO: after a 2nd "real" source word is added to
    // node->data->word, reoccurring source words will be repeated
    // in our concatenated list; not a big deal, since this entry
    // will have to be edited by the user anyway, but we could do
    // better...
    if ( strcmp( (const char*) word, (const char*) existing_word ) ) {
      // the real words are different: the existing node will be replaced
      // with one whose "real" word is a concatenation of both words; the
      // user will be required to edit the result in the generated file
      fprintf( stderr, "warning: deligatured word '%s' has multiple source words: %s, %s; EDIT FILE!\n", deligatured_word, word, existing_word );

      // '{' + existing + '|' + word + '}' + terminating NUL
      size_t buf_size = strlen( (const char*) existing_word )
                      + strlen( (const char*) word ) + 4;
      buf = (unsigned char*) malloc( buf_size );
      if ( !buf ) {
        goto out_of_memory;
      }
      snprintf( (char*) buf, buf_size, "{%s|%s}",
                (const char*) existing_word, (const char*) word );
      word = buf;
      replace_existing = 1;
    }
  }

  // build the replacement completely *before* touching the trie, so that
  // an allocation failure cannot cost us the existing entry
  lig_data = (LigatureData*) malloc( sizeof( LigatureData ) );
  if ( !lig_data ) {
    goto out_of_memory;
  }
  lig_data->word = (unsigned char*) strdup( (const char*) word );
  if ( !lig_data->word ) {
    free( lig_data );
    goto out_of_memory;
  }
  lig_data->ligature_positions = ligature_positions;
  lig_data->ligature_ids = ligature_ids;
  lig_data->ligature_count = ligature_count;

  if ( replace_existing ) {
    // 'existing_word' is owned by the node data and dies here; it has
    // already been copied into 'buf' above
    trie_remove( deligatured_words, deligatured_word, _free_ligature_data_callback2 );
  }

  if ( !trie_add( deligatured_words, deligatured_word, lig_data ) ) {
    // the trie did not take ownership (duplicate or rejected word):
    // dispose of the data the same way the trie itself would
    _free_ligature_data_callback2( lig_data );
  }

  free( deligatured_word );
  free( buf );
  return;

out_of_memory:
  // NOTE(review): the arrays returned by word_ligatures() are not released
  // on this path because their allocator is not visible from here.
  fprintf( stderr, "error: out of memory while recording deligatured word '%s'\n", deligatured_word );
  free( deligatured_word );
  free( buf );
}
/*
  "Re-ligatures" the specified deligatured word by finding the
  corresponding "real" word in the global 'deligatured_words' trie.

  word  in/out buffer holding the NUL-terminated deligatured word; on a
        match it is overwritten with the "real" word, truncated if
        necessary so that at most n-1 characters plus the terminating NUL
        are written.
  n     capacity of 'word' in bytes.

  Returns the length of the real word minus the length of the deligatured
  word (the real word is expected to be the longer one). The difference is
  computed from the full real word even when the copy had to be truncated.
  Returns 0 when the word is not in the trie, when the matching node
  carries no data, or when 'word' is NULL / 'n' is not positive; 'word' is
  left untouched in those cases.

  THIS FUNCTION CAN ONLY BE CALLED AFTER load_ligatures has been called to
  initialize our global 'deligatured_words' variable.

  NOTE(review): add_deligatured_word() stores a LigatureData* as node data,
  whereas this function reads node->data as a plain string. Confirm which
  representation the trie holds at this point; if it is LigatureData, the
  real word is its 'word' member.
*/
int religature_word( unsigned char *word, int n )
{
  // a non-positive capacity would make the n-1 arithmetic below wrap
  if ( word == NULL || n <= 0 ) {
    return 0;
  }

  Trie *node = trie_contains( deligatured_words, word );
  if ( node == NULL || node->data == NULL ) {
    return 0;
  }

  const unsigned char *real_word = (const unsigned char*) node->data;
  size_t deligatured_word_len = strlen( (const char*) word );
  size_t real_word_len = strlen( (const char*) real_word );

  // bounded copy that always leaves room for the terminator
  size_t capacity = (size_t) n - 1;
  size_t copy_len = real_word_len < capacity ? real_word_len : capacity;
  memcpy( word, real_word, copy_len );
  word[copy_len] = 0;

  return (int) real_word_len - (int) deligatured_word_len;
}
/*
 * Logically deletes the word 's' from the trie 't'.
 *
 * The matching node is only unmarked as an end-of-string; it is neither
 * unlinked nor merged with its neighbours, so a trie that has seen many
 * removals may be less compact than a freshly built one (rebuild the trie
 * if that matters).
 *
 * When the node carries data and 'free_data_callback' is non-NULL, the
 * callback is invoked on that data and the node's data pointer is reset to
 * NULL. Without a callback the data pointer is left as it is.
 *
 * Returns 1 if the word was found (and removed), 0 otherwise.
 */
int trie_remove( Trie *t, unsigned char* s, void (*free_data_callback) (void*) )
{
  Trie *match = trie_contains( t, s );

  if ( !match ) {
    return 0;
  }

  match->is_end_of_string = 0;

  if ( free_data_callback && match->data ) {
    free_data_callback( match->data );
    match->data = NULL;
  }

  return 1;
}
/* Resolve a Python key object to a NUL-terminated C string.
 *
 * On success the C string is returned. Under Python 3 the string lives
 * inside a temporary bytes object, which is handed back through
 * '*py_owner' as a new reference; the caller must release it with
 * Py_XDECREF() once it has finished using the key. Under Python 2 the
 * string is borrowed from 'py_key' itself and '*py_owner' is set to NULL.
 *
 * On failure NULL is returned with a Python exception set (TypeError for a
 * non-string or, under Python 3, a non-ASCII key) and '*py_owner' is NULL.
 */
static const char *
trie_key_as_cstring(PyObject *py_key, PyObject **py_owner)
{
    *py_owner = NULL;

#ifdef IS_PY3K
    if(!PyUnicode_Check(py_key)) {
        PyErr_SetString(PyExc_TypeError, "key must be a string");
        return NULL;
    }
    {
        const char *key;
        PyObject *bytes = PyUnicode_AsASCIIString(py_key);
        if(!bytes) {
            PyErr_SetString(PyExc_TypeError, "key must be an ASCII string");
            return NULL;
        }
        key = PyBytes_AsString(bytes);
        if(!key) {
            Py_DECREF(bytes);
            return NULL;
        }
        *py_owner = bytes;
        return key;
    }
#else
    if(!PyString_Check(py_key)) {
        PyErr_SetString(PyExc_TypeError, "key must be a string");
        return NULL;
    }
    return PyString_AS_STRING(py_key);
#endif
}

/* Mapping lookup (D[key]).
 *
 * Returns a new reference to the stored value, or NULL with KeyError set
 * when the key is absent (TypeError when the key is not a usable string).
 */
static PyObject *
trie_subscript(trieobject *mp, PyObject *py_key)
{
    const char *key;
    PyObject *py_owner;
    PyObject *py_value;

    key = trie_key_as_cstring(py_key, &py_owner);
    if(!key)
        return NULL;

    py_value = Trie_get(mp->trie, key);
    if(py_value == NULL)
        PyErr_SetString(PyExc_KeyError, key);
    else
        Py_INCREF(py_value);

    /* 'key' points into py_owner, so release it only after the last use. */
    Py_XDECREF(py_owner);
    return py_value;
}

/* Mapping assignment / deletion (D[key] = value, del D[key]).
 *
 * 'py_value' is NULL when the client wants to delete the key. The Trie
 * API doesn't support deletion, so the entry is overwritten with NULL.
 *
 * Returns 0 on success and -1 with an exception set on failure.
 */
static int
trie_ass_sub(trieobject *mp, PyObject *py_key, PyObject *py_value)
{
    int result = -1;
    const char *key;
    PyObject *py_owner;
    PyObject *py_prev;

    key = trie_key_as_cstring(py_key, &py_owner);
    if(!key)
        return -1;

    /* Check to see whether something already exists at that key.  The
       reference the trie holds on it is dropped only *after* the slot has
       been successfully overwritten: releasing it first would leave the
       trie pointing at an under-counted object if Trie_set fails, and
       would be a use-after-free when the same object is assigned again. */
    py_prev = Trie_get(mp->trie, key);

    if(!py_value) {
        /* If the key doesn't exist, raise a KeyError. */
        if(!py_prev)
            PyErr_SetString(PyExc_KeyError, key);
        else if(Trie_set(mp->trie, key, NULL))
            PyErr_SetString(PyExc_AssertionError, "error setting trie");
        else {
            Py_DECREF(py_prev);
            result = 0;
        }
    }
    /* The client wants to set a key in the dictionary. */
    else {
        Py_INCREF(py_value);
        if(Trie_set(mp->trie, key, py_value)) {
            /* The trie did not take the new reference. */
            Py_DECREF(py_value);
            PyErr_SetString(PyExc_AssertionError, "error setting trie");
        }
        else {
            Py_XDECREF(py_prev);
            result = 0;
        }
    }

    Py_XDECREF(py_owner);
    return result;
}

/* Membership test (key in D).
 *
 * Returns the result of Trie_has_key(), or -1 with TypeError set when the
 * key is not a usable string.
 */
static int trie_contains(trieobject *mp, PyObject* py_key)
{
    int result;
    const char *key;
    PyObject *py_owner;

    key = trie_key_as_cstring(py_key, &py_owner);
    if(!key)
        return -1;

    result = Trie_has_key(mp->trie, key);

    Py_XDECREF(py_owner);
    return result;
}

static char has_key__doc__[] =
"D.has_key(k) -> 1 if D has a key k, else 0";

/* Python-level has_key(): wraps trie_contains() and returns its result as
 * a Python integer, or NULL when trie_contains() reported an error.
 */
static PyObject *
trie_has_key(trieobject *mp, PyObject *py_key)
{
    int has_key = trie_contains(mp, py_key);
    if (has_key==-1)
        return NULL;
#ifdef IS_PY3K
    return PyLong_FromLong((long)has_key);
#else
    return PyInt_FromLong((long)has_key);
#endif
}
/* Add a word (and an optional associated data) to the trie.
 *
 * t     trie to add to.
 * s     the word. According to the original note it will be duplicated,
 *       so the caller keeps ownership of the string it passes in.
 *       NOTE(review): the original comment read "the trie does assume
 *       memory ownership"; presumably "does not" was meant -- confirm
 *       against trie_new_child().
 * data  optional user data stored on the word's final node. If 'data' is
 *       not null, be sure to write and specify a callback function for
 *       the 'free_data_callback' argument of trie_free(), so that this
 *       "user" data is freed.
 *
 * Returns 1 when the word was added. Returns 0, leaving the trie
 * unmodified, when _is_valid_word() rejects the word or when the word
 * already exists in the trie.
 */
int trie_add( Trie *t, unsigned char* s, void *data ) {
  // parent under which a missing child will be created
  Trie *last = t;
  // scratch buffer handed to trie_child(); presumably receives the
  // matched child's folded-path prefix
  unsigned char prefix[MAX_WORD_LEN + 1];
  prefix[0] = 0;

  // check validity of word before (potentially) making any changes to
  // the trie, which might otherwise leave it with a partially-added
  // path (because an invalid char is found later in the word)
  if ( !_is_valid_word( s ) ) {
    return 0;
  }

  // (we could make this duplication check while we add, but this is
  // more readable)
  if ( trie_contains( t, s ) ) {
    return 0;
  }

  do {
    // either the whole word has been consumed (mark the current node) or
    // the previous iteration found no child for the remaining characters
    if ( *s == 0 || t == NULL ) {
      if ( t == NULL ) {
        // it's now time to allocate our new child node
        t = trie_new_child( last, s );
      }
      t->is_end_of_string = 1;
      t->data = data;
      return 1;
    }
    else {
      last = t;
      Trie *child = trie_child( t, s[0], prefix );
      if ( !child ) {
        // cause logic, above, to create a new child
        t = NULL;
      }
      else {
        // n = length of the common head of 'prefix' and 's'
        int n = 0;
        while ( prefix[n] && s[n] && prefix[n] == s[n] ) {
          ++n;
        }

        // if 's' is a prefix of child->prefix, or if 's' differs from
        // prefix at some point, then we must split the folded-path
        // child into two nodes
        if ( prefix[n] != 0 ) {
          // insert new intermediate node, breaking up a folded path
          int old_child_index = _char_to_index( child->prefix[0] );

          // set the child's prefix to the tail of the original prefix
          // (memmove because source and destination overlap)
          memmove( child->prefix, child->prefix + n, strlen( child->prefix + n ) + 1 );

          // orphan the child (for later re-parenting)
          t->children[old_child_index] = NULL;

          // insert the new intermediate child, assigning it the head
          // of the original prefix; 't' will now represent the new,
          // intermediate child
          prefix[n] = 0;
          t = trie_new_child( t, prefix );

          // make the new intermediate child the parent of the
          // original child
          _trie_alloc_children_array( t );
          t->children[_char_to_index( child->prefix[0] )] = child;

          // we now resume the normal loop logic, which will finish up
          // the initialization of our new intermediate child and then
          // return...
        }
        else {
          // the child's whole prefix matched: descend into it
          t = child;
        }
        // skip the characters of 's' that were matched at this level
        s += n;
      }
    }
  } while ( 1 );

  // not reached: the loop above only exits through its return statement
  return 1;
}