/**
 * call-seq:
 *    parser.execute(req_hash, data, start) -> Integer
 *
 * Parses the String +data+ beginning at offset +start+, with +req_hash+
 * installed as the parser's data slot, and returns (as a Fixnum) the value
 * reported by httpclient_parser_nread once the parse call has finished.
 *
 * +req_hash+ must be a Hash, +data+ a String and +start+ a Fixnum.
 *
 * Raises HttpClientParserError when:
 *  - +start+ is negative,
 *  - +start+ is at or beyond the end of +data+,
 *  - the parser reports an error after consuming the buffer.
 *
 * To continue a partial parse, pass the whole buffer read so far (old data
 * with the new data appended) together with the offset returned by the
 * previous call.
 */
VALUE HttpClientParser_execute(VALUE self, VALUE req_hash, VALUE data, VALUE start)
{
    httpclient_parser *http = NULL;
    int from = 0;
    char *dptr = NULL;
    long dlen = 0;

    REQUIRE_TYPE(req_hash, T_HASH);
    REQUIRE_TYPE(data, T_STRING);
    REQUIRE_TYPE(start, T_FIXNUM);

    DATA_GET(self, httpclient_parser, http);

    from = FIX2INT(start);
    dptr = RSTRING(data)->ptr;
    dlen = RSTRING(data)->len;

    /* A negative offset would slip past the upper-bound check below and be
     * handed to the parser as a position before the start of the buffer. */
    if(from < 0) {
        rb_raise(eHttpClientParserError, "Requested start is before data buffer start.");
    }

    if(from >= dlen) {
        rb_raise(eHttpClientParserError, "Requested start is after data buffer end.");
    }

    /* The parser gets the hash through its data slot. */
    http->data = (void *)req_hash;
    httpclient_parser_execute(http, dptr, dlen, from);

    if(httpclient_parser_has_error(http)) {
        rb_raise(eHttpClientParserError, "Invalid HTTP format, parsing fails.");
    }

    return INT2FIX(httpclient_parser_nread(http));
}
/**
 * call-seq:
 *    document.addword(normal, asis) -> document
 *
 * Adds a single word to the document by passing its normalized form
 * (+normal+) and its as-is form (+asis+) to oddocaddword. Both arguments
 * must be Strings. Returns the receiver.
 */
VALUE Document_addword(VALUE self, VALUE normal, VALUE asis)
{
    ODDOC *oddoc = NULL;
    char *normal_text = NULL;
    char *asis_text = NULL;

    DATA_GET(self, ODDOC, oddoc);
    REQUIRE_TYPE(normal, T_STRING);
    REQUIRE_TYPE(asis, T_STRING);

    /* Only touch the string internals once both types are confirmed. */
    normal_text = RSTRING(normal)->ptr;
    asis_text = RSTRING(asis)->ptr;

    oddocaddword(oddoc, normal_text, asis_text);

    return self;
}
/**
 * call-seq:
 *    Odeum::settuning(ibnum, idnum, cbnum, csiz) -> nil
 *
 * Forwards four Fixnum tuning values to odsettuning:
 *
 *   ibnum - number of buckets for inverted indexes (noted default 32749)
 *   idnum - division number of the inverted index (noted default 7)
 *   cbnum - number of buckets for dirty buffers (noted default 262139)
 *   csiz  - maximum bytes of memory for dirty buffers (noted default 8388608)
 *
 * The setting is global for all Indexes. Do not change it while any index
 * is open; the effect of doing so is unknown.
 */
VALUE Odeum_settuning(VALUE self, VALUE ibnum, VALUE idnum, VALUE cbnum, VALUE csiz)
{
    int index_buckets = 0;
    int index_divisions = 0;
    int buffer_buckets = 0;
    int buffer_bytes = 0;

    REQUIRE_TYPE(ibnum, T_FIXNUM);
    REQUIRE_TYPE(idnum, T_FIXNUM);
    REQUIRE_TYPE(cbnum, T_FIXNUM);
    REQUIRE_TYPE(csiz, T_FIXNUM);

    index_buckets = FIX2INT(ibnum);
    index_divisions = FIX2INT(idnum);
    buffer_buckets = FIX2INT(cbnum);
    buffer_bytes = FIX2INT(csiz);

    odsettuning(index_buckets, index_divisions, buffer_buckets, buffer_bytes);

    return Qnil;
}
/**
 * call-seq:
 *    doc[attr] = value
 *
 * Stores a meta-data attribute on the document by passing +name+ and
 * +value+ to oddocaddattr. Both must be Strings. Returns the receiver.
 */
VALUE Document_addattr(VALUE self, VALUE name, VALUE value)
{
    ODDOC *oddoc = NULL;
    char *attr_name = NULL;
    char *attr_value = NULL;

    DATA_GET(self, ODDOC, oddoc);
    REQUIRE_TYPE(name, T_STRING);
    REQUIRE_TYPE(value, T_STRING);

    attr_name = RSTRING(name)->ptr;
    attr_value = RSTRING(value)->ptr;

    oddocaddattr(oddoc, attr_name, attr_value);

    return self;
}
/**
 * call-seq:
 *    Odeum::merge(new_name, other_databases) -> true/false
 *
 * Merges the databases named in +other_databases+ (an Array) into a new
 * database called +new_name+ (a String) via odmerge. When two or more
 * documents share a URI, the first one is adopted and the rest are ignored.
 *
 * Returns false when odmerge reports FALSE, true otherwise.
 */
VALUE Odeum_merge(VALUE self, VALUE name, VALUE elemnames)
{
    CBLIST *sources = NULL;
    int merged = 0;

    REQUIRE_TYPE(name, T_STRING);
    REQUIRE_TYPE(elemnames, T_ARRAY);

    /* The temporary list is only needed for the duration of the merge. */
    sources = array_2_CBLIST(elemnames);
    merged = odmerge(RSTRING(name)->ptr, sources);
    cblistclose(sources);

    if(merged == FALSE) {
        return Qfalse;
    }
    return Qtrue;
}
/**
 * call-seq:
 *    Index.new(name, mode) -> Index
 *
 * Opens the index called +name+ with odopen using the Fixnum +mode+ and
 * stores the resulting handle in the receiver. The name is the basis for a
 * local directory that holds the document database.
 *
 * Modes that may be combined:
 *
 * - Odeum::OWRITER -- open as a writer.
 * - Odeum::OREADER -- open read-only.
 * - Odeum::OCREAT  -- or'd into OWRITER to create the database if missing.
 * - Odeum::ONOLOCK -- open without locking the directory.
 *
 * OWRITER takes an exclusive lock on the database directory, OREADER a
 * shared one. A thread blocks until the lock is obtained; this has not been
 * tested with Ruby's in-process threads.
 *
 * Raises StandardError when odopen returns NULL.
 */
VALUE Index_initialize(VALUE self, VALUE name, VALUE mode)
{
    void *handle = NULL;

    REQUIRE_TYPE(self, T_DATA);
    REQUIRE_TYPE(name, T_STRING);
    REQUIRE_TYPE(mode, T_FIXNUM);

    handle = odopen(RSTRING(name)->ptr, FIX2INT(mode));

    /* The slot is written even on failure, exactly as before. */
    DATA_PTR(self) = handle;

    if(handle == NULL) {
        /* The specific cause of the failure is not looked up here. */
        rb_raise(rb_eStandardError, "Failed to open requested database.");
    }

    return self;
}
/**
 * call-seq:
 *    Index::setcharclass(space, delim, glue) -> nil
 *
 * Redefines which characters count as SPACE, DELIM and GLUE for this index
 * by handing the three Strings to odsetcharclass. This changes how text is
 * split in Document::add_content when content should be indexed differently.
 */
VALUE Index_setcharclass(VALUE self, VALUE spacechars, VALUE delimchars, VALUE gluechars)
{
    ODEUM *odeum = NULL;
    char *space_set = NULL;
    char *delim_set = NULL;
    char *glue_set = NULL;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(spacechars, T_STRING);
    REQUIRE_TYPE(delimchars, T_STRING);
    REQUIRE_TYPE(gluechars, T_STRING);

    space_set = RSTRING(spacechars)->ptr;
    delim_set = RSTRING(delimchars)->ptr;
    glue_set = RSTRING(gluechars)->ptr;

    odsetcharclass(odeum, space_set, delim_set, glue_set);

    return Qnil;
}
/**
 * call-seq:
 *    index.get_id_by_uri(uri) -> Fixnum
 *
 * Looks up the document with the given +uri+ (a String) through
 * odgetidbyuri and returns only the integer that call produces, as a Fixnum.
 */
VALUE Index_get_id_by_uri(VALUE self, VALUE uri)
{
    ODEUM *odeum = NULL;
    int doc_id = 0;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(uri, T_STRING);

    doc_id = odgetidbyuri(odeum, RSTRING(uri)->ptr);

    return INT2FIX(doc_id);
}
/**
 * call-seq:
 *    Document.new uri -> Document
 *
 * When +uri+ is a String, opens a new ODDOC for it with oddocopen and
 * stores it in the receiver. When +uri+ is nil nothing is opened; the
 * Ruby/Odeum library uses that path internally so the ODDOC can be assigned
 * from outside. Ruby callers should always supply a uri.
 */
VALUE Document_initialize(VALUE self, VALUE uri)
{
    /* Internal callers pass nil and attach the ODDOC themselves. */
    if(NIL_P(uri)) {
        return self;
    }

    REQUIRE_TYPE(uri, T_STRING);
    DATA_PTR(self) = oddocopen(RSTRING(uri)->ptr);

    return self;
}
/**
 * call-seq:
 *    Odeum::normalizeword(asis) -> normal
 *
 * Runs the as-is word (for example one produced by breaktext) through
 * odnormalizeword and returns the result as a new Ruby String. The
 * normalization strips punctuation and spacing and lowercases the word, so
 * a word made only of removed characters comes back empty. Check for that
 * before searching.
 */
VALUE Odeum_normalizeword(VALUE self, VALUE asis)
{
    char *normalized = NULL;
    VALUE res_str;

    REQUIRE_TYPE(asis, T_STRING);

    normalized = odnormalizeword(RSTRING(asis)->ptr);

    /* Copy into a Ruby String first, then release the C buffer. */
    res_str = rb_str_new2(normalized);
    free(normalized);

    return res_str;
}
/**
 * call-seq:
 *    Odeum::breaktext(text) -> [word1, word2, word3]
 *
 * Splits +text+ (a String) into words with odbreaktext and returns them as
 * a Ruby Array. Words are separated by space characters and delimiters such
 * as period and comma. The conversion from the Odeum list to Ruby types
 * copies data; StringScanner is a more flexible alternative.
 */
VALUE Odeum_breaktext(VALUE self, VALUE text)
{
    CBLIST *words = NULL;
    VALUE list;

    REQUIRE_TYPE(text, T_STRING);

    words = odbreaktext(RSTRING(text)->ptr);

    /* Convert before closing: the array is built from the list contents. */
    list = CBLIST_2_array(words);
    cblistclose(words);

    return list;
}
/**
 * call-seq:
 *    index.check(id) -> true/false
 *
 * Asks odcheck whether a document with the Fixnum +id+ is in the database.
 * Returns false when odcheck reports FALSE, true otherwise.
 */
VALUE Index_check(VALUE self, VALUE id)
{
    ODEUM *odeum = NULL;
    int present = 0;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(id, T_FIXNUM);

    present = odcheck(odeum, FIX2INT(id));

    if(present == FALSE) {
        return Qfalse;
    }
    return Qtrue;
}
/**
 * call-seq:
 *    index.search_doc_count(word) -> Fixnum
 *
 * Returns the number of documents matching +word+ (a String) as reported
 * by odsearchdnum. A word that matches nothing yields -1.
 */
VALUE Index_search_doc_count(VALUE self, VALUE word)
{
    ODEUM *odeum = NULL;
    int doc_count = 0;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(word, T_STRING);

    doc_count = odsearchdnum(odeum, RSTRING(word)->ptr);

    return INT2FIX(doc_count);
}
/**
 * call-seq:
 *    document[name] -> String
 *
 * Fetches the meta-data attribute called +name+ (a String) with
 * oddocgetattr. Returns nil when that call yields NULL, otherwise a new
 * Ruby String holding the value.
 */
VALUE Document_getattr(VALUE self, VALUE name)
{
    ODDOC *oddoc = NULL;
    const char *attr = NULL;

    DATA_GET(self, ODDOC, oddoc);
    REQUIRE_TYPE(name, T_STRING);

    attr = oddocgetattr(oddoc, RSTRING(name)->ptr);
    if(attr == NULL) {
        return Qnil;
    }

    return rb_str_new2(attr);
}
/**
 * call-seq:
 *    index.delete(uri) -> true/false
 *
 * Removes the document identified by +uri+ (a String) through odout. The
 * Index must have been opened as a writer. Returns false when odout reports
 * FALSE (for instance when no such document exists), true otherwise.
 */
VALUE Index_delete(VALUE self, VALUE uri)
{
    ODEUM *odeum = NULL;
    int removed = 0;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(uri, T_STRING);

    removed = odout(odeum, RSTRING(uri)->ptr);

    if(removed == FALSE) {
        return Qfalse;
    }
    return Qtrue;
}
/**
 * call-seq:
 *    index.search(word, max) -> ResultSet
 *
 * Searches the index for documents mentioning +word+ (a String), passing
 * the Fixnum +max+ through to odsearch, and wraps the outcome in a
 * ResultSet that can be iterated or indexed with ResultSet#[].
 *
 * A failed search raises StandardError. An empty result is NOT a failure;
 * it comes back as a ResultSet with nothing in it.
 *
 * To get plain [id, score] pairs instead, call to_a on the result straight
 * away: index.search(word, max).to_a
 */
VALUE Index_search(VALUE self, VALUE word, VALUE max)
{
    ODEUM *odeum = NULL;
    ODPAIR *pairs = NULL;
    int num_returned = 0;
    char *term = NULL;
    int limit = 0;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(word, T_STRING);
    REQUIRE_TYPE(max, T_FIXNUM);

    term = RSTRING(word)->ptr;
    limit = FIX2INT(max);

    pairs = odsearch(odeum, term, limit, &num_returned);
    if(pairs == NULL) {
        /* A NULL pair list is treated as a failed search. */
        rb_raise(rb_eStandardError, "Search failure.");
    }

    /* Plain searches carry no error list, hence the NULL. */
    return ResultSet_create(pairs, num_returned, NULL);
}
/**
 * call-seq:
 *    index.get(uri) -> Document
 *
 * Retrieves the document stored under +uri+ (a String) with odget.
 * Returns nil when odget yields NULL, otherwise a Document wrapping the
 * retrieved ODDOC.
 */
VALUE Index_get(VALUE self, VALUE uri)
{
    ODEUM *odeum = NULL;
    ODDOC *found = NULL;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(uri, T_STRING);

    found = odget(odeum, RSTRING(uri)->ptr);

    return found == NULL ? Qnil : Document_create(found);
}
/**
 * call-seq:
 *    index.get_by_id(id) -> Document
 *
 * Retrieves the document with the Fixnum +id+ using odgetbyid. Returns nil
 * when that document is not there (odgetbyid yields NULL), otherwise a
 * Document wrapping the retrieved ODDOC.
 */
VALUE Index_get_by_id(VALUE self, VALUE id)
{
    ODEUM *odeum = NULL;
    ODDOC *found = NULL;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(id, T_FIXNUM);

    found = odgetbyid(odeum, FIX2INT(id));

    return found == NULL ? Qnil : Document_create(found);
}
/**
 * call-seq:
 *    document.scores(max, index) -> { word => score, word => score, ...}
 *
 * Returns the normalized words of the document with their scores, as
 * produced by oddocscores for the Fixnum +max+ and the given index.
 * Oddly, the scores are Strings, though they contain decimal numbers.
 */
VALUE Document_scores(VALUE self, VALUE max, VALUE odeum_obj)
{
    ODDOC *oddoc = NULL;
    ODEUM *odeum = NULL;
    CBMAP *score_map = NULL;
    VALUE hash;

    DATA_GET(self, ODDOC, oddoc);
    DATA_GET(odeum_obj, ODEUM, odeum);
    REQUIRE_TYPE(max, T_FIXNUM);

    score_map = oddocscores(oddoc, FIX2INT(max), odeum);

    /* Build the Ruby Hash before the C map is closed. */
    hash = CBMAP_2_hash(score_map);
    cbmapclose(score_map);

    return hash;
}
/**
 * call-seq:
 *    index.put(doc, wmax, over) -> true/false
 *
 * Stores the Document +doc+ in the Index through odput, indexing at most
 * +wmax+ (a Fixnum) words. Any +over+ other than nil or false asks for an
 * existing document to be overwritten; with nil/false, putting a document
 * that already exists makes the method return false as an error.
 *
 * Returns false when odput reports FALSE, true otherwise.
 */
VALUE Index_put(VALUE self, VALUE doc, VALUE wmax, VALUE over)
{
    ODEUM *odeum = NULL;
    ODDOC *oddoc = NULL;
    int overwrite = 0;
    int stored = 0;

    DATA_GET(self, ODEUM, odeum);
    DATA_GET(doc, ODDOC, oddoc);
    REQUIRE_TYPE(wmax, T_FIXNUM);

    /* Ruby truthiness: everything except nil and false counts as true. */
    overwrite = (over != Qnil && over != Qfalse);

    stored = odput(odeum, oddoc, FIX2INT(wmax), overwrite);

    if(stored == FALSE) {
        return Qfalse;
    }
    return Qtrue;
}
/**
 * call-seq:
 *    rs.marshal_load(data) -> nil
 *
 * Called by Marshal.load to re-construct a ResultSet. +data+ must be an
 * Array whose first three entries are:
 *
 *   [0] Fixnum - stored into the result set's length
 *   [1] Fixnum - stored into the result set's index
 *   [2] String - raw bytes copied into a freshly malloc'd data buffer
 *
 * The receiver must not already own a data buffer (checked with assert).
 * A type error is raised through REQUIRE_TYPE when +data+ or one of its
 * entries has the wrong type.
 */
VALUE ResultSet_marshal_load(VALUE self, VALUE data)
{
    ResultSetData *rs = NULL;
    VALUE len;
    VALUE ind;
    VALUE pairs;
    long nbytes = 0;

    DATA_GET(self, ResultSetData, rs);
    assert(rs->data == NULL);

    /* rb_ary_entry is only valid on an Array, so check before indexing. */
    REQUIRE_TYPE(data, T_ARRAY);

    len = rb_ary_entry(data, 0);
    REQUIRE_TYPE(len, T_FIXNUM);
    ind = rb_ary_entry(data, 1);
    REQUIRE_TYPE(ind, T_FIXNUM);
    pairs = rb_ary_entry(data, 2);
    REQUIRE_TYPE(pairs, T_STRING);

    nbytes = RSTRING(pairs)->len;

    /* NOTE(review): length and index are taken as-is; they are not
     * cross-checked against the byte size of pairs -- confirm the element
     * size of rs->data and validate if this can see untrusted input. */
    rs->length = FIX2INT(len);
    rs->index = FIX2INT(ind);

    /* malloc(0) is allowed to return NULL, which would make an empty pairs
     * string look like an allocation failure; always ask for >= 1 byte. */
    rs->data = malloc(nbytes > 0 ? nbytes : 1);
    RAISE_NOT_NULL(rs->data);
    memcpy(rs->data, RSTRING(pairs)->ptr, nbytes);

    return Qnil;
}
/**
 * call-seq:
 *    document.add_word_list(asis) -> document
 *
 * Takes an Array of "as-is" words, normalizes each one with
 * odnormalizeword, and adds the normalized/as-is pair to the document with
 * oddocaddword.
 *
 * Every entry must be a String; a type error is raised through
 * REQUIRE_TYPE on the first entry that is not. Iteration ends at the first
 * nil entry, which is also how the end of the array is detected.
 *
 * Returns the receiver.
 */
VALUE Document_add_word_list(VALUE self, VALUE asis)
{
    VALUE str;
    int i = 0;
    ODDOC *oddoc = NULL;

    DATA_GET(self, ODDOC, oddoc);
    REQUIRE_TYPE(asis, T_ARRAY);

    for(i = 0; (str = rb_ary_entry(asis, i)) != Qnil; i++) {
        char *result = NULL;

        /* RSTRING() on a non-String entry would read unrelated memory. */
        REQUIRE_TYPE(str, T_STRING);

        result = odnormalizeword(RSTRING(str)->ptr);
        oddocaddword(oddoc, result, RSTRING(str)->ptr);
        free(result);
    }

    return self;
}
/** * call-seq: * rnd.seed -> rnd * * Seeds the global ArcFour random generator with the given seed. The same seeds * should produce the exact same stream of random data so that you can get * large amounts of randomness but replay possible interactions using just * an initial key. * * This function also doubles as the FuzzRnd.initialize method since they * do nearly the same thing. * * Taken from http://www.mozilla.org/projects/security/pki/nss/draft-kaukonen-cipher-arcfour-03.txt * sample code, but compared with the output of the ArcFour implementation in * the Phelix test code to make sure it is the same initialization. The main * difference is that this init takes an arbitrary keysize while the original * Phelix ArcFour only took a 32bit key. * * Returns itself so you can seed and then get data easily. */ VALUE FuzzRnd_seed(VALUE self, VALUE data) { unsigned int t, u; unsigned int keyindex; unsigned int stateindex; unsigned char *state; unsigned int counter; char *key = NULL; size_t key_len = 0; REQUIRE_TYPE(data, T_STRING); #if defined(RUBY_1_9_x) VALUE str = StringValue(data); key = RSTRING_PTR(str); key_len = RSTRING_LEN(str);; #elif defined(RUBY_1_8_x) key = RSTRING(data)->ptr; key_len = RSTRING(data)->len; #else #error unsupported RUBY_VERSION #endif state = ArcFour.sbox; ArcFour.i = 0; ArcFour.j = 0; for (counter = 0; counter < 256; counter++) state[counter] = counter; keyindex = 0; stateindex = 0; for (counter = 0; counter < 256; counter++) { t = state[counter]; stateindex = (stateindex + key[keyindex] + t) & 0xff; u = state[stateindex]; state[stateindex] = t; state[counter] = u; if (++keyindex >= key_len) keyindex = 0; } return self; }
/**
 * call-seq:
 *    index.query(query) -> ResultSet
 *
 * A basic query language for Odeum. It accepts boolean expressions of
 * search terms combined with '&', '|' and '!', with parentheses for
 * sub-expressions. '!' implements NOTAND, so "this AND NOT that" is written
 * "this ! that". Consecutive words have an implicit '&' between them.
 *
 * Example: "Zed & shaw ! (frank blank)", where (frank blank) is read as
 * (frank & blank).
 *
 * The query String is handed to odquery together with a fresh error list.
 * On success the pairs, their count and the error list are wrapped in the
 * same kind of ResultSet that Index.search returns.
 *
 * Raises StandardError when odquery returns NULL.
 */
VALUE Index_query(VALUE self, VALUE word)
{
    CBLIST *errors = NULL;
    ODPAIR *pairs = NULL;
    int num_returned = 0;
    ODEUM *odeum = NULL;

    DATA_GET(self, ODEUM, odeum);
    REQUIRE_TYPE(word, T_STRING);

    errors = cblistopen();
    pairs = odquery(odeum, RSTRING(word)->ptr, &num_returned, errors);

    if(pairs == NULL) {
        /* rb_raise does not return, so the error list has to be released
         * here or it is leaked on every failed query. */
        cblistclose(errors);
        rb_raise(rb_eStandardError, "Query failure.");
    }

    /* The error list is passed on to the ResultSet (presumably it takes
     * ownership -- confirm in ResultSet_create). */
    return ResultSet_create(pairs, num_returned, errors);
}
/**
 * call-seq:
 *    document.add_content(index, content) -> document
 *
 * Breaks +content+ (a String) into words with odanalyzetext and adds each
 * as-is word, paired with its normalized form, to the document. This is the
 * usual way to fill a Document; Document.addword adds one word at a time
 * and Document.add_word_list adds a list of words.
 *
 * The Index is required because it holds the SPACE, DELIM and GLUE
 * character classes configured with Index::setcharclass, which decide how
 * the text is split. By default the splitting matches Odeum::breaktext.
 *
 * NOTE(review): earlier documentation says words that normalize to an
 * empty string are not added, but nothing here filters them -- every pair
 * is passed to oddocaddword. Presumably oddocaddword handles empty
 * normalized forms itself; confirm.
 *
 * Returns the receiver.
 */
VALUE Document_add_content(VALUE self, VALUE index, VALUE content)
{
    ODDOC *oddoc = NULL;
    ODEUM *odeum = NULL;
    CBLIST *raw_list = NULL;
    CBLIST *normal_list = NULL;
    int total = 0;
    int pos = 0;

    DATA_GET(self,ODDOC, oddoc);
    DATA_GET(index,ODEUM, odeum);
    REQUIRE_TYPE(content, T_STRING);

    /* odanalyzetext fills both lists; entry N of one pairs with entry N of
     * the other. */
    raw_list = cblistopen();
    normal_list = cblistopen();
    odanalyzetext(odeum, RSTRING(content)->ptr, raw_list, normal_list);

    total = cblistnum(raw_list);
    while(pos < total) {
        int raw_len = 0;
        int normal_len = 0;
        const char *raw = cblistval(raw_list, pos, &raw_len);
        const char *normal = cblistval(normal_list, pos, &normal_len);

        oddocaddword(oddoc, normal, raw);
        pos++;
    }

    cblistclose(raw_list);
    cblistclose(normal_list);

    return self;
}
/** * call-seq: * rnd.seed -> rnd * * Returns a String of random bytes of length that you can use * for generating randomness. It uses the ArcFour cipher to * make the randomness, so the same seeds produce the same * random bits, and the randomness is reasonably high quality. * * Don't use this for secure random generation. It probably would * work if you seeded from a /dev/random that worked, but don't * blame me if you get hacked. * * The main motiviation for using ArcFour without automated reseed * is to produce lots of random bytes quickly, make them high enough * quality for good random tests, and to make sure that we can replay * possible sequences if there's a sequence that we want to test. */ VALUE FuzzRnd_data(VALUE self, VALUE length) { unsigned int n; unsigned char a,b; size_t len = 0; VALUE data; char *p = NULL; REQUIRE_TYPE(length, T_FIXNUM); len = FIX2INT(length); data = rb_str_buf_new(len); rb_str_resize(data, len); #if defined(RUBY_1_9_x) VALUE str_data = StringValue(data); p = RSTRING_PTR(str_data); #elif defined(RUBY_1_8_x) p = RSTRING(data)->ptr; #else #error unsupported RUBY_VERSION #endif for (n=0;n<len;n++) /* run the ArcFour algorithm as long as it needs */ { ArcFour.i++; a = ArcFour.sbox[ArcFour.i]; ArcFour.j = (unsigned char) (ArcFour.j + a); /* avoid MSVC picky compiler warning */ b = ArcFour.sbox[ArcFour.j]; ArcFour.sbox[ArcFour.i] = b; ArcFour.sbox[ArcFour.j] = a; p[n] = ArcFour.sbox[(a+b) & 0xFF]; } return str_data; }
/**
 * call-seq:
 *    Odeum::remove(name) -> true/false
 *
 * Removes the database directory called +name+ (a String), and everything
 * in it, through odremove. Returns false when odremove reports FALSE, true
 * otherwise.
 */
VALUE Odeum_remove(VALUE self, VALUE name)
{
    int removed = 0;

    REQUIRE_TYPE(name, T_STRING);

    removed = odremove(RSTRING(name)->ptr);

    if(removed == FALSE) {
        return Qfalse;
    }
    return Qtrue;
}