예제 #1
0
static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
{
    const char *str;
    char *result;
    PyObject *ret;
    struct stemmer *z;
    int end;

    if (!PyArg_ParseTuple(args, "s", &str)) {
        return NULL;
    }

    z = create_stemmer();
    if (!z) {
        PyErr_NoMemory();
        return NULL;
    }

    result = strdup(str);
    if (!result) {
        free_stemmer(z);
        PyErr_NoMemory();
        return NULL;
    }

    end = stem(z, result, strlen(result) - 1);
    result[end + 1] = '\0';

    ret = Py_BuildValue("s", result);

    free(result);
    free_stemmer(z);

    return ret;
}
예제 #2
0
/* Tokenize a given sds, setting a term to zero-length sds if it's
 * a stopword. A total number of tokens and total number of nonstopwords
 * will be returned */
static sds *sds_tokenize(sds s, int *len, int *nonstopwords) {
    int i, l, k;
    sds *terms;
    struct stemmer *stmer;

    *nonstopwords = 0;
    terms = sdssplitlen(s, sdslen(s), " ", 1, len);
    if (!terms) return NULL;
    stmer = create_stemmer();
    for (i = 0; i < *len; i++) {
        sds stemmed = NULL, term = terms[i];
        term = sdstrim(term, puncs);
        l = sdslen(term);
        sdstolower(term);
        if (l == 0 || rr_stopwords_check(term)) {
            sdsclear(term);
            continue;
        }
        *nonstopwords += 1;
        /* note that the third argument is a zero-based index */
        k = stem(stmer, term, l-1);
        if (k < l-1) {
            stemmed = sdsnewlen(term, k+1);
            sdsfree(term);
            terms[i] = stemmed;
        }
    }

    free_stemmer(stmer);
    return terms;
}
예제 #3
0
CAMLprim value caml_stemmer_porter2_close(value v_stem)
{
  CAMLparam1(v_stem);
  struct stemmer_t* val = (struct stemmer_t*)v_stem;
  free_stemmer(val->st);
  free(val->buf);
  free(val);
  CAMLreturn(Val_unit);
}
예제 #4
0
int main(int argc, char * argv[])
{  int i;
	
	struct stemmer * z = create_stemmer();
	
	s = (char *) malloc(i_max + 1);
	for (i = 1; i < argc; i++)
	{  FILE * f = fopen(argv[i],"r");
		if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); exit(1); }
		stemfile(z, f);
	}
	free(s);
	
	free_stemmer(z);
	
	return 0;
}
예제 #5
0
/**
 * filters, transforms, and indexes file using ngrams to the index
 * 
 * file name - name of file to process
 * wikiindex - the judy arrays to store the index of the wiki in
 */
void indexWiki(char* inFileName, Pvoid_t *wikiIndex, int* articleCount) {
	
	//-------------------- initialization --------------------//
	bool articleIndex[lastNgram] = {0}; // boolean array of what trigrams are in an article
	struct stemmer * currentStemmer = create_stemmer();
	
	// file for writing the titles to
	FILE* titleFile = NULL;
	if (writeFiles) {
		titleFile = fopen("title_file", "w");
		if (NULL == titleFile) {
			fprintf(stderr, "Error open title file: %m\n");
			exit(1);
		}
	}
	
	// initializes the libxml library
	LIBXML_TEST_VERSION
	xmlTextReaderPtr wikiReader; //the reader for the document
	wikiReader = xmlReaderForFile(inFileName, NULL, XML_PARSE_RECOVER+XML_PARSE_NOWARNING+XML_PARSE_NOERROR+XML_PARSE_HUGE);
	if (NULL == wikiReader) {
		//fprintf(stderr, "%s %s\n", "Failed to open ", wikiFileName);
		fprintf(stderr, "Error opening XML wiki: %m\n");
		exit(1);
	}

	// for progress bar
	int percent = 0;
	long fileSize = getFileSize(inFileName);
	
	// initialization for currentArticle and its componens 
	article currentArticle;	
	currentArticle.title = g_string_sized_new(256);
	currentArticle.body  = g_string_sized_new(786432); //768*1024

	//-------------------- index the wiki --------------------//
	optionalPrint ("%s", "Adding collection to index.\n");
	optionalPrint ("%d", (int)(fileSize / 1048576));
	optionalPrint (" MB in file\n");
	displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);
	 
	//prime the loop
	currentArticle.title->len = 0;
	currentArticle.body->len  = 0;
	xmlTextReaderRead(wikiReader);// at a <page> tag, drop in
	xmlTextReaderRead(wikiReader);// at a <page> tag, drop in
	 
	// reads from xml file until file is finished, adds articles to index, and writes tittles to file
	// processes one article per iteration
	while (getArticle (wikiReader, &currentArticle)) {
		currentArticle.articleNumber = *articleCount;
		*articleCount = *articleCount + 1;
		// filter / transform text
		removeMarkup(currentArticle.body);
		stemText(currentArticle.body, currentStemmer); //ngramming.h
		// index the text
		indexText(currentArticle.body, articleIndex); //ngramming.h
		addToIndex(articleIndex, wikiIndex, currentArticle.articleNumber);
		//adds titles to title file
		if (writeFiles) {fprintf(titleFile, "%s\n", currentArticle.title->str);}
		//re-prime the loop
		currentArticle.title->len = 0;
		currentArticle.body->len  = 0;
		displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);
	}
	optionalPrint ("\n%s", "Finished indexing. \n");
	optionalPrint ("%lu", (long)(xmlTextReaderByteConsumed(wikiReader)/1048576));
	optionalPrint ("MB consumed\n");
	
	
	optionalPrint ("%d %s %d %s", *articleCount, "articles found", (int) currentArticle.body->allocated_len, "length allocated for article body\n");
	// clean up of structures needed to process wiki
	if (writeFiles) {fclose (titleFile);}
	free_stemmer(currentStemmer);
	xmlFreeTextReader(wikiReader);
	xmlCleanupParser();
	//g_string_free(currentArticle.title, TRUE);
	//g_string_free(currentArticle.body, TRUE); //malloc fail if this is uncommented ?!
}