/*
 * Python binding: stem a single word.
 *
 * Parses one string argument from `args`, runs stem() over a private copy of
 * it, truncates the copy just past the index stem() returns, and hands the
 * result back as a new Python string.
 *
 * Returns NULL with a Python exception set when argument parsing fails or
 * when the stemmer or the working copy cannot be allocated (MemoryError).
 */
static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
{
    const char *str;
    char *result;
    PyObject *ret = NULL;
    struct stemmer *z;
    size_t len;
    int end;

    if (!PyArg_ParseTuple(args, "s", &str)) {
        return NULL;
    }

    z = create_stemmer();
    if (!z) {
        PyErr_NoMemory();
        return NULL;
    }

    /* str is const and the word is truncated in place, so work on a copy. */
    result = strdup(str);
    if (!result) {
        PyErr_NoMemory();
        goto out;
    }

    /*
     * stem() takes the zero-based index of the last character. For an empty
     * string that index would be strlen() - 1, which wraps around as size_t
     * before being narrowed to int; skip the call instead, since an empty
     * word is already its own stem.
     */
    len = strlen(result);
    if (len > 0) {
        end = stem(z, result, (int)len - 1);
        result[end + 1] = '\0';
    }

    ret = Py_BuildValue("s", result);
    free(result);

out:
    free_stemmer(z);
    return ret;
}
/* Tokenize a given sds, setting a term to zero-length sds if it's
 * a stopword. A total number of tokens and total number of nonstopwords
 * will be returned.
 *
 * s            - input string, split on single spaces
 * len          - out: number of tokens in the returned array
 * nonstopwords - out: number of tokens that survived trimming and the
 *                stopword check
 *
 * Each token is trimmed of the characters in `puncs`, lowercased and, when
 * it is not empty and not a stopword, stemmed. Returns the token array, or
 * NULL if splitting fails or an allocation fails; in the failure cases
 * nothing is left for the caller to free. */
static sds *sds_tokenize(sds s, int *len, int *nonstopwords) {
    int i, l, k;
    sds *terms;
    struct stemmer *stmer;

    *nonstopwords = 0;
    terms = sdssplitlen(s, sdslen(s), " ", 1, len);
    if (!terms) return NULL;

    stmer = create_stemmer();
    if (!stmer) {
        /* Without a stemmer stem() would be handed a NULL pointer. */
        sdsfreesplitres(terms, *len);
        return NULL;
    }

    for (i = 0; i < *len; i++) {
        sds stemmed = NULL, term = terms[i];

        term = sdstrim(term, puncs);
        l = sdslen(term);
        sdstolower(term);
        if (l == 0 || rr_stopwords_check(term)) {
            sdsclear(term);
            continue;
        }
        *nonstopwords += 1;
        /* note that the third argument is a zero-based index */
        k = stem(stmer, term, l-1);
        if (k < l-1) {
            /* The stem is shorter than the term: replace the term by a
             * fresh sds holding only the first k+1 bytes. */
            stemmed = sdsnewlen(term, k+1);
            if (!stemmed) {
                /* Do not store NULL into the array; give everything back. */
                free_stemmer(stmer);
                sdsfreesplitres(terms, *len);
                return NULL;
            }
            sdsfree(term);
            terms[i] = stemmed;
        }
    }

    free_stemmer(stmer);
    return terms;
}
/*
 * OCaml stub: dispose of a stemmer handle.
 *
 * v_stem is reinterpreted as a pointer to a struct stemmer_t. The wrapped
 * stemmer, the handle's buffer and finally the handle itself are released,
 * in that order. The handle must not be used again after this call.
 *
 * Returns unit.
 */
CAMLprim value caml_stemmer_porter2_close(value v_stem)
{
    CAMLparam1(v_stem);
    struct stemmer_t *handle = (struct stemmer_t *)v_stem;

    /* Members first: they are unreachable once the handle itself is freed. */
    free_stemmer(handle->st);
    free(handle->buf);
    free(handle);

    CAMLreturn(Val_unit);
}
/*
 * Program entry point: run stemfile() over every file named on the command
 * line.
 *
 * A single stemmer and the file-scope work buffer `s` (initially i_max + 1
 * bytes) are set up once and shared across all files. The process exits
 * with status 1 if either allocation fails or a file cannot be opened;
 * otherwise it returns 0 after all files have been processed.
 */
int main(int argc, char * argv[])
{
    int i;
    struct stemmer * z = create_stemmer();

    if (z == NULL) {
        fprintf(stderr, "Out of memory\n");
        exit(1);
    }

    s = malloc(i_max + 1);
    if (s == NULL) {
        fprintf(stderr, "Out of memory\n");
        free_stemmer(z);
        exit(1);
    }

    for (i = 1; i < argc; i++) {
        FILE * f = fopen(argv[i], "r");

        if (f == NULL) {
            fprintf(stderr, "File %s not found\n", argv[i]);
            exit(1);
        }
        stemfile(z, f);
        /* Close each input once processed so descriptors are not leaked
         * when many files are given. */
        fclose(f);
    }

    free(s);
    free_stemmer(z);
    return 0;
}
/**
 * filters, transforms, and indexes file using ngrams to the index
 *
 * inFileName   - name of the XML file to process
 * wikiIndex    - the judy arrays to store the index of the wiki in
 * articleCount - running article counter: its current value becomes the
 *                number of the next article and it is incremented once per
 *                article read
 *
 * For every article returned by getArticle() the body has its markup
 * removed, is stemmed, converted to an ngram presence table and merged into
 * wikiIndex. When writeFiles is set, each title is also appended as one line
 * to "title_file".
 *
 * Exits the process with status 1 if the stemmer cannot be created, or if
 * the title file or the XML input cannot be opened.
 */
void indexWiki(char* inFileName, Pvoid_t *wikiIndex, int* articleCount)
{
    //-------------------- initialization --------------------//
    bool articleIndex[lastNgram] = {0}; // boolean array of what trigrams are in an article

    struct stemmer * currentStemmer = create_stemmer();
    if (NULL == currentStemmer) {
        fprintf(stderr, "Error creating stemmer\n");
        exit(1);
    }

    // file for writing the titles to
    FILE* titleFile = NULL;
    if (writeFiles) {
        titleFile = fopen("title_file", "w");
        if (NULL == titleFile) {
            fprintf(stderr, "Error open title file: %m\n");
            exit(1);
        }
    }

    // initializes the libxml library
    LIBXML_TEST_VERSION

    xmlTextReaderPtr wikiReader; // the reader for the document
    wikiReader = xmlReaderForFile(inFileName, NULL, XML_PARSE_RECOVER+XML_PARSE_NOWARNING+XML_PARSE_NOERROR+XML_PARSE_HUGE);
    if (NULL == wikiReader) {
        fprintf(stderr, "Error opening XML wiki: %m\n");
        exit(1);
    }

    // for progress bar
    int percent = 0;
    long fileSize = getFileSize(inFileName);

    // initialization for currentArticle and its components
    article currentArticle;
    currentArticle.title = g_string_sized_new(256);
    currentArticle.body = g_string_sized_new(786432); // 768*1024

    //-------------------- index the wiki --------------------//
    optionalPrint ("%s", "Adding collection to index.\n");
    optionalPrint ("%d", (int)(fileSize / 1048576));
    optionalPrint (" MB in file\n");
    displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);

    // prime the loop: empty both buffers and advance the reader twice
    currentArticle.title->len = 0;
    currentArticle.body->len = 0;
    xmlTextReaderRead(wikiReader); // at a <page> tag, drop in
    xmlTextReaderRead(wikiReader); // at a <page> tag, drop in

    // reads from xml file until file is finished, adds articles to index, and writes titles to file
    // processes one article per iteration
    while (getArticle (wikiReader, &currentArticle)) {
        currentArticle.articleNumber = *articleCount;
        *articleCount = *articleCount + 1;

        // filter / transform text
        removeMarkup(currentArticle.body);
        stemText(currentArticle.body, currentStemmer); //ngramming.h

        // index the text
        indexText(currentArticle.body, articleIndex); //ngramming.h
        addToIndex(articleIndex, wikiIndex, currentArticle.articleNumber);

        // adds titles to title file
        if (writeFiles) {
            fprintf(titleFile, "%s\n", currentArticle.title->str);
        }

        // re-prime the loop
        currentArticle.title->len = 0;
        currentArticle.body->len = 0;
        displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);
    }

    optionalPrint ("\n%s", "Finished indexing. \n");
    // the argument is cast to (signed) long, so the conversion must be %ld
    optionalPrint ("%ld", (long)(xmlTextReaderByteConsumed(wikiReader)/1048576));
    optionalPrint ("MB consumed\n");
    optionalPrint ("%d %s %d %s", *articleCount, "articles found", (int) currentArticle.body->allocated_len, "length allocated for article body\n");

    // clean up of structures needed to process wiki
    if (writeFiles) {
        fclose (titleFile);
    }
    free_stemmer(currentStemmer);
    xmlFreeTextReader(wikiReader);
    xmlCleanupParser();

    // NOTE(review): currentArticle.title and currentArticle.body are
    // deliberately not released with g_string_free(): the original author
    // observed an allocator failure when doing so. TODO: find the cause
    // and free both strings here.
}