Beispiel #1
0
/*
 * Python-callable wrapper around the Porter stemmer.
 *
 * Parses a single string argument, stems a private heap copy of it and
 * returns the stem as a new Python string object.
 *
 * Returns NULL with a Python exception set when argument parsing fails or
 * when the stemmer or the working copy cannot be allocated.
 */
static PyObject* jellyfish_porter_stem(PyObject *self, PyObject *args)
{
    const char *input;
    char *buf;
    struct stemmer *stemmer_ctx;
    PyObject *py_stem;
    int last;

    if (!PyArg_ParseTuple(args, "s", &input))
        return NULL;

    stemmer_ctx = create_stemmer();
    if (stemmer_ctx == NULL)
        return PyErr_NoMemory();   /* sets MemoryError and yields NULL */

    /* stem() works on the buffer it is given, so operate on a copy rather
     * than on the storage owned by the argument object. */
    buf = strdup(input);
    if (buf == NULL) {
        free_stemmer(stemmer_ctx);
        return PyErr_NoMemory();
    }

    /* The third argument is the index of the last character; the value
     * returned is used as the index of the last character of the stem. */
    last = stem(stemmer_ctx, buf, strlen(buf) - 1);
    buf[last + 1] = '\0';

    py_stem = Py_BuildValue("s", buf);

    free_stemmer(stemmer_ctx);
    free(buf);

    return py_stem;
}
Beispiel #2
0
/* Tokenize a given sds on single spaces, then trim, lowercase and stem
 * every token in place.
 *
 * A token that is empty after trimming, or that is a stopword, is cleared
 * to a zero-length sds (it keeps its slot in the array).
 *
 * s            - the text to split.
 * len          - out: total number of tokens in the returned array.
 * nonstopwords - out: number of tokens that were neither empty nor
 *                stopwords.
 *
 * Returns the token array produced by sdssplitlen(), or NULL when the
 * split fails or the stemmer cannot be allocated (in the latter case *len
 * is set to 0). If a stemmed copy of a token cannot be allocated, that
 * token is left unstemmed rather than replaced by NULL. */
static sds *sds_tokenize(sds s, int *len, int *nonstopwords) {
    int i, l, k;
    sds *terms;
    struct stemmer *stmer;

    *nonstopwords = 0;

    /* Allocate the stemmer before splitting, so that a failure here leaves
     * nothing else to release. */
    stmer = create_stemmer();
    if (!stmer) {
        *len = 0;
        return NULL;
    }

    terms = sdssplitlen(s, sdslen(s), " ", 1, len);
    if (!terms) {
        free_stemmer(stmer);
        return NULL;
    }

    for (i = 0; i < *len; i++) {
        sds stemmed = NULL, term;

        /* Keep the array slot in sync with whatever pointer sdstrim
         * hands back. */
        term = sdstrim(terms[i], puncs);
        terms[i] = term;

        l = sdslen(term);
        sdstolower(term);
        if (l == 0 || rr_stopwords_check(term)) {
            sdsclear(term);
            continue;
        }
        *nonstopwords += 1;
        /* note that the third argument is a zero-based index */
        k = stem(stmer, term, l-1);
        if (k < l-1) {
            stemmed = sdsnewlen(term, k+1);
            /* On allocation failure keep the valid, unstemmed term; never
             * store a NULL entry that callers would dereference. */
            if (!stemmed) continue;
            sdsfree(term);
            terms[i] = stemmed;
        }
    }

    free_stemmer(stmer);
    return terms;
}
Beispiel #3
0
/*
 * OCaml stub: builds a fresh stemmer handle.
 *
 * The handle is a zero-filled stemmer_t holding a newly created stemmer and
 * a zero-filled 32-byte scratch buffer whose size is recorded in `len`.
 * The malloc'ed pointer is returned cast directly to an OCaml value; it is
 * not allocated through the OCaml runtime.
 *
 * NOTE(review): none of the allocations (malloc, create_stemmer) are
 * checked for failure here - confirm how out-of-memory should be reported
 * to the OCaml side.
 */
CAMLprim value caml_stemmer_porter2_init(value v_unit)
{
  CAMLparam1(v_unit);
  enum { INITIAL_BUF_LEN = 32 };
  stemmer_t *handle;

  handle = malloc(sizeof *handle);
  memset(handle, 0, sizeof *handle);

  handle->st = create_stemmer();

  /* Scratch buffer starts out zeroed, with its capacity kept alongside. */
  handle->len = INITIAL_BUF_LEN;
  handle->buf = malloc(INITIAL_BUF_LEN);
  memset(handle->buf, 0, INITIAL_BUF_LEN);

  CAMLreturn((value)handle);
}
Beispiel #4
0
int load_data(int WORDS, struct stemmer **stem_list, FILE *f) {
  static int a_max = WORDS;
  int a_size = 0;
  while (a_size < WORDS) {
    int ch = getc(f);
    if (ch == EOF) return a_size;
    char *s = (char *)sirius_malloc(i_max + 1);
    if (LETTER(ch)) {
      int i = 0;
      while (TRUE) {
        if (i == i_max) {
          i_max += INC;
          void *_realloc = NULL;
          if ((_realloc = realloc(s, i_max + 1)) == NULL) {
            fprintf(stderr, "realloc() failed!\n");
            return -ENOMEM; 
          }
          s = (char*)_realloc;
        }
        ch = tolower(ch); /* forces lower case */

        s[i] = ch;
        i++;
        ch = getc(f);
        if (!LETTER(ch)) {
          ungetc(ch, f);
          break;
        }
      }
      struct stemmer *z = create_stemmer();
      z->b = s;
      z->k = i - 1;
      stem_list[a_size] = z;
      if (a_size == a_max) {
        a_max += A_INC;
        void *_realloc = NULL;
        if ((_realloc = realloc(stem_list, a_max)) == NULL) {
            fprintf(stderr, "realloc() failed!\n");
            return -ENOMEM; 
        }
        stem_list = (struct stemmer **)_realloc;
      }
      a_size += 1;
    }
  }
  return a_size;
}
Beispiel #5
0
/*
 * Stems every file named on the command line, in order.
 *
 * Allocates the stemmer and the file-level word buffer `s` (i_max + 1
 * bytes), runs stemfile() over each file, and releases everything before
 * returning.
 *
 * Returns 0 on success, 1 if an allocation fails or a file cannot be opened
 * (processing stops at the first file that cannot be opened).
 */
int main(int argc, char * argv[])
{  int i;
	int status = 0;

	struct stemmer * z = create_stemmer();
	if (z == NULL) { fprintf(stderr,"Out of memory\n"); return 1; }

	s = (char *) malloc(i_max + 1);
	if (s == NULL)
	{  fprintf(stderr,"Out of memory\n");
		free_stemmer(z);
		return 1;
	}

	for (i = 1; i < argc; i++)
	{  FILE * f = fopen(argv[i],"r");
		if (f == 0) { fprintf(stderr,"File %s not found\n",argv[i]); status = 1; break; }
		stemfile(z, f);
		/* Close each input once processed so handles do not pile up across
		   many files. NOTE(review): assumes stemfile() leaves the stream
		   open - confirm. */
		fclose(f);
	}
	free(s);

	free_stemmer(z);

	return status;
}
Beispiel #6
0
/**
 * Reads a wiki XML dump article by article and adds each article to the
 * n-gram index.
 *
 * For every article delivered by getArticle() the body has its markup
 * removed, is stemmed, is turned into a per-article n-gram presence table
 * and is merged into the index under the article's number. When the global
 * writeFiles is set, each title is also written, one per line, to
 * "title_file". A progress bar is updated after every article.
 *
 * inFileName   - name of the XML file to process
 * wikiIndex    - the judy arrays the index is stored in (handed to addToIndex)
 * articleCount - in/out: number given to the next article; incremented
 *                once per article processed
 *
 * Terminates the process with status 1 if the title file or the XML reader
 * cannot be opened.
 */
void indexWiki(char* inFileName, Pvoid_t *wikiIndex, int* articleCount) {

	enum { BYTES_PER_MB = 1048576 };

	//-------------------- initialization --------------------//
	bool articleIndex[lastNgram] = {0}; // which n-grams occur in the current article
	struct stemmer *wordStemmer = create_stemmer();

	// optional output file receiving one title per line
	FILE *titlesOut = NULL;
	if (writeFiles) {
		titlesOut = fopen("title_file", "w");
		if (titlesOut == NULL) {
			fprintf(stderr, "Error open title file: %m\n");
			exit(1);
		}
	}

	// initialize libxml and open a reader on the input document
	LIBXML_TEST_VERSION
	xmlTextReaderPtr reader = xmlReaderForFile(
		inFileName, NULL,
		XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_HUGE);
	if (reader == NULL) {
		fprintf(stderr, "Error opening XML wiki: %m\n");
		exit(1);
	}

	// progress bar state
	int progressPct = 0;
	long totalBytes = getFileSize(inFileName);

	// reusable buffers for the article currently being processed
	article current;
	current.title = g_string_sized_new(256);
	current.body  = g_string_sized_new(786432); // 768*1024

	//-------------------- index the wiki --------------------//
	optionalPrint ("%s", "Adding collection to index.\n");
	optionalPrint ("%d", (int)(totalBytes / BYTES_PER_MB));
	optionalPrint (" MB in file\n");
	displayProgressBar (xmlTextReaderByteConsumed(reader), totalBytes, &progressPct);

	// prime the loop: empty both buffers and advance the reader twice
	// (per the original author, this drops in at a <page> tag)
	current.title->len = 0;
	current.body->len  = 0;
	xmlTextReaderRead(reader);
	xmlTextReaderRead(reader);

	// one article per iteration, until getArticle reports there are no more
	while (getArticle (reader, &current)) {
		current.articleNumber = (*articleCount)++;

		// filter / transform the text
		removeMarkup(current.body);
		stemText(current.body, wordStemmer); // ngramming.h

		// index the text
		indexText(current.body, articleIndex); // ngramming.h
		addToIndex(articleIndex, wikiIndex, current.articleNumber);

		// record the title
		if (writeFiles) {
			fprintf(titlesOut, "%s\n", current.title->str);
		}

		// reset the buffers for the next article
		current.title->len = 0;
		current.body->len  = 0;
		displayProgressBar (xmlTextReaderByteConsumed(reader), totalBytes, &progressPct);
	}

	optionalPrint ("\n%s", "Finished indexing. \n");
	optionalPrint ("%lu", (long)(xmlTextReaderByteConsumed(reader)/BYTES_PER_MB));
	optionalPrint ("MB consumed\n");

	optionalPrint ("%d %s %d %s", *articleCount, "articles found", (int) current.body->allocated_len, "length allocated for article body\n");

	// release everything that was needed to process the wiki
	if (writeFiles) {
		fclose (titlesOut);
	}
	free_stemmer(wordStemmer);
	xmlFreeTextReader(reader);
	xmlCleanupParser();
	// NOTE(review): the title and body GStrings are deliberately not freed
	// here; the original author observed a malloc failure when they were
	// released with g_string_free(..., TRUE). Cause not visible from here.
}