Example #1
char* de::getNounString(noun* n)
{
	if (n->data < 0) n->data = 0;
	char* buffer = (char*) malloc(BUFFER_SIZE * 10);
	buffer[0] = 0;
	int cas = n->data;
	if (n->reflex)
	{
		// reflexive usage fixes the case to 3
		cas = 3;
		n->data = 3;
	}
	bool doArticle = true;
	if (n->prepos != 0)
	{
		strcat(buffer, getPreposObject(n->prepos, n, &cas, &doArticle));
	}
	strcat(buffer, getArticle(n->id, n->plural ? 1 : 0, n->num, n->data, n->typ));
	for (int i = 0; i < 16; i++) {
		if (n->adj[i] != 0)
			strcat(buffer, getAdjective(n->adj[i], n->id, n->plural, n->typ, n->data));
	}
	strcat(buffer, getNoun(n->id, n->plural, n->data));
	if (n->usegenitive)
	{
		n->genitivenoun->data = 3;
		// Force definite article if not yet defined.
		if (n->genitivenoun->typ == 0)
			n->genitivenoun->typ = -1;
		strcat(buffer, getNounString(n->genitivenoun));
	}
	if (n->useRClause)
	{
		strcat(buffer, ", ");
		// Keep the std::string alive for the whole block: calling c_str()
		// on the temporary returned by createSubClause() would leave a
		// dangling pointer by the time it is concatenated below.
		std::string rClauseString = n->rClause->createSubClause();
		int cas2 = 0;
		if (n->rClauseObj != 0)
		{
			cas2 = n->rClause->obj[n->rClauseObj - 1][0].data;
		}
		else cas2 = n->rClause->s[0].data;
		strcat(buffer, getArticle(n->id, n->plural ? 1 : 0, n->num, cas2, -1));
		strcat(buffer, rClauseString.c_str());
		strcat(buffer, ", ");
	}
	return buffer;
}
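A minimal calling sketch for the example above, assuming `getNounString` is reachable as written; `buildNoun()` is a hypothetical caller-side helper, not part of the example. The one firm requirement visible in the code is that the returned buffer is malloc'd, so the caller must free it:

// Hypothetical caller; buildNoun() is an assumed helper, not shown above.
noun* n = buildNoun();
char* s = de::getNounString(n);  // returns a heap buffer owned by the caller
printf("%s\n", s);
free(s);                         // getNounString() allocates with malloc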
Example #2
/* retrieve article from ID */
retrieved_article *retrievearticle_1_svc(article_request *request, struct svc_req *rqstp) {
    /* RPC service routines return a pointer to storage that must outlive
       the call, hence the static result; this also lets us release the
       buffer from the previous request before reusing it. */
    static retrieved_article ra;

    struct paperitem *item = getArticle(request->articleID);

    if (item == NULL) {
        ra.size = -1;
        return &ra;
    }

    free(ra.data.data_val); /* NULL on the first call; free(NULL) is a no-op */
    ra.data.data_val = malloc(item->dataSize);
    memcpy(ra.data.data_val, item->data, item->dataSize);

    ra.data.data_len = item->dataSize;
    return &ra;
}
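The `_svc` suffix follows the rpcgen server-stub naming convention, so the matching client stub would be `retrievearticle_1`. A hedged client-side sketch; `ARTICLE_PROG` and `ARTICLE_VERS` are assumed constants from a protocol (.x) file that is not shown, and the host and article ID are placeholders:

// Hypothetical client; assumes rpcgen generated retrievearticle_1()
// and the ARTICLE_PROG/ARTICLE_VERS constants from a .x file.
CLIENT *clnt = clnt_create("localhost", ARTICLE_PROG, ARTICLE_VERS, "tcp");
if (clnt == NULL) {
    clnt_pcreateerror("localhost");
    exit(1);
}
article_request req;
req.articleID = 42;
retrieved_article *res = retrievearticle_1(&req, clnt);
if (res != NULL && res->size != -1) {
    fwrite(res->data.data_val, 1, res->data.data_len, stdout);
}
clnt_destroy(clnt);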
Example #3
/**
 * Filters, transforms, and indexes a file, adding its ngrams to the index.
 * 
 * inFileName   - name of the file to process
 * wikiIndex    - the Judy arrays in which to store the index of the wiki
 * articleCount - running count of indexed articles, updated as articles are read
 */
void indexWiki(char* inFileName, Pvoid_t *wikiIndex, int* articleCount) {
	
	//-------------------- initialization --------------------//
	bool articleIndex[lastNgram] = {0}; // boolean array of which ngrams occur in the current article
	struct stemmer * currentStemmer = create_stemmer();
	
	// file for writing the titles to
	FILE* titleFile = NULL;
	if (writeFiles) {
		titleFile = fopen("title_file", "w");
		if (NULL == titleFile) {
			fprintf(stderr, "Error open title file: %m\n");
			exit(1);
		}
	}
	
	// initializes the libxml library
	LIBXML_TEST_VERSION
	xmlTextReaderPtr wikiReader; //the reader for the document
	wikiReader = xmlReaderForFile(inFileName, NULL, XML_PARSE_RECOVER | XML_PARSE_NOWARNING | XML_PARSE_NOERROR | XML_PARSE_HUGE);
	if (NULL == wikiReader) {
		//fprintf(stderr, "%s %s\n", "Failed to open ", wikiFileName);
		fprintf(stderr, "Error opening XML wiki: %m\n");
		exit(1);
	}

	// for progress bar
	int percent = 0;
	long fileSize = getFileSize(inFileName);
	
	// initialization for currentArticle and its components
	article currentArticle;	
	currentArticle.title = g_string_sized_new(256);
	currentArticle.body  = g_string_sized_new(786432); //768*1024

	//-------------------- index the wiki --------------------//
	optionalPrint ("%s", "Adding collection to index.\n");
	optionalPrint ("%d", (int)(fileSize / 1048576));
	optionalPrint (" MB in file\n");
	displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);
	 
	//prime the loop
	currentArticle.title->len = 0;
	currentArticle.body->len  = 0;
	xmlTextReaderRead(wikiReader); // first read steps into the document root
	xmlTextReaderRead(wikiReader); // second read lands on the first <page> tag
	 
	// reads from the XML file until it is finished, adds each article to the index, and writes titles to the title file
	// processes one article per iteration
	while (getArticle (wikiReader, &currentArticle)) {
		currentArticle.articleNumber = *articleCount;
		*articleCount = *articleCount + 1;
		// filter / transform text
		removeMarkup(currentArticle.body);
		stemText(currentArticle.body, currentStemmer); //ngramming.h
		// index the text
		indexText(currentArticle.body, articleIndex); //ngramming.h
		addToIndex(articleIndex, wikiIndex, currentArticle.articleNumber);
		//adds titles to title file
		if (writeFiles) {fprintf(titleFile, "%s\n", currentArticle.title->str);}
		//re-prime the loop
		currentArticle.title->len = 0;
		currentArticle.body->len  = 0;
		displayProgressBar (xmlTextReaderByteConsumed(wikiReader), fileSize, &percent);
	}
	optionalPrint ("\n%s", "Finished indexing. \n");
	optionalPrint ("%lu", (long)(xmlTextReaderByteConsumed(wikiReader)/1048576));
	optionalPrint ("MB consumed\n");
	
	
	optionalPrint ("%d %s %d %s", *articleCount, "articles found", (int) currentArticle.body->allocated_len, "length allocated for article body\n");
	// clean up of structures needed to process wiki
	if (writeFiles) {fclose (titleFile);}
	free_stemmer(currentStemmer);
	xmlFreeTextReader(wikiReader);
	xmlCleanupParser();
	//g_string_free(currentArticle.title, TRUE);
	//g_string_free(currentArticle.body, TRUE); //malloc fail if this is uncommented ?!
}
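A minimal driver sketch for `indexWiki`, assuming the globals the example relies on (`writeFiles`, `lastNgram`, `optionalPrint`, and the helpers from ngramming.h) are defined elsewhere. The Judy handle starts as a NULL `Pvoid_t`, per the Judy convention, and the dump file name is a placeholder:

#include <Judy.h>

int main(void) {
	Pvoid_t wikiIndex = (Pvoid_t) NULL;   // empty Judy array, populated by indexWiki
	int articleCount = 0;
	char dumpPath[] = "dump.xml";         // placeholder for a MediaWiki XML dump
	indexWiki(dumpPath, &wikiIndex, &articleCount);
	// query wikiIndex here, then release it (e.g. with JLFA / JudyLFreeArray,
	// depending on which Judy flavor addToIndex uses)
	return 0;
}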