/**
     * Return an array holding the number of
     * words in each document.
     * This method allocates the array, but it is the
     * caller's responsibility to free it.
     */
    int* word_count() {
      // One slot per document; ownership of the buffer passes to the caller.
      const int total = num_docs();
      int* const counts = (int*)ferrum::MALLOC(sizeof(int)*total);
      int idx = 0;
      while(idx < total) {
        // Record the word count reported by the idx-th document.
        counts[idx] = documents[idx]->num_words();
        ++idx;
      }
      return counts;
    }
Example #2
0
/*
 * Entry point: print a banner with the sample-document count, then run the
 * benchmark with run(1) and, if that succeeded, again with run(0).
 * The exit status is the first non-zero result from run(), or 0.
 */
int
main(void)
{
    int status;

    printf("-- speed tests determine parsing throughput given %d different sample documents --\n",
           num_docs());

    printf("With UTF8 validation:\n");
    status = run(1);

    if (status == 0) {
        /* The second pass is skipped when the first one reported a failure. */
        printf("Without UTF8 validation:\n");
        status = run(0);
    }

    return status;
}
Example #3
0
std::vector<doc_id> disk_index::docs() const
{
    std::vector<doc_id> ret(num_docs());
    std::iota(ret.begin(), ret.end(), 0_did);
    return ret;
}
Example #4
0
/*
 * Repeatedly parse the sample documents until at least PARSE_TIME_SECS has
 * elapsed (as measured by mygettime()), then print the observed throughput.
 *
 * validate_utf8: non-zero leaves yajl's string validation on; zero sets the
 *                yajl_dont_validate_strings option on every parser.
 *
 * Returns 0 on success, or 1 if a parser could not be allocated or a
 * document failed to parse (the yajl error text is written to stderr).
 */
static int
run(int validate_utf8)
{
    long long times = 0;
    double starttime;
    unsigned long long sumsize = 0;

    starttime = mygettime();

    for (;;) {
        int i;
        {
            double now = mygettime();
            if (now - starttime >= PARSE_TIME_SECS) break;
        }

        /* parse in batches of 100 so the clock is read once per batch */
        for (i = 0; i < 100; i++) {
            /* allocate a parser */
            yajl_handle hand = yajl_alloc(NULL, NULL, NULL);
            yajl_status stat;
            const char ** d;

            if (hand == NULL) {
                fprintf(stderr, "unable to allocate yajl parser\n");
                return 1;
            }

            yajl_config(hand, yajl_dont_validate_strings, validate_utf8 ? 0 : 1);

            /* feed the document chunk by chunk; d stops on the failing
             * chunk, or on the NULL terminator when every chunk parsed */
            for (d = get_doc(times % num_docs()); *d; d++) {
                size_t size = strlen(*d);
                sumsize += size;
                stat = yajl_parse(hand, (unsigned char *) *d, size);
                if (stat != yajl_status_ok) break;
            }

            stat = yajl_complete_parse(hand);

            if (stat != yajl_status_ok) {
                unsigned char * str =
                    yajl_get_error(hand, 1,
                                   (unsigned char *) *d,
                                   (*d ? strlen(*d) : 0));
                fprintf(stderr, "%s", (const char *) str);
                yajl_free_error(hand, str);
                /* release the parser on the failure path as well */
                yajl_free(hand);
                return 1;
            }
            yajl_free(hand);
            times++;
        }
    }

    /* parsed doc 'times' times */
    {
        double throughput;
        double now;
        const char * all_units[] = { "B/s", "KB/s", "MB/s", (char *) 0 };
        const char ** units = all_units;

        now = mygettime();

        throughput = sumsize / (now - starttime);

        /* scale to the largest unit available in all_units */
        while (*(units + 1) && throughput > 1024) {
            throughput /= 1024;
            units++;
        }

        printf("Parsing speed: %g %s\n", throughput, *units);
    }

    return 0;
}
Example #5
0
/* Append a copy (docID and freq) of every DocNode in src to dst. */
static void unionize_copy_docs(WordNode * src, WordNode * dst)
{
	DocNode * doc;

	for (doc = src->head; doc != NULL; doc = doc->next) {
		add_doc(doc->docID, doc->freq, dst);
	}
}

/*
 * Build a new list holding the union of the documents in tmp_list and list_2.
 *
 * A document present in both lists is added once, with the larger of the two
 * frequencies. Documents from tmp_list are added first, in order, followed by
 * the documents that appear only in list_2.
 *
 * Either argument may be NULL. Every non-NULL argument is released with
 * free_list() before returning, so callers must not use it afterwards.
 *
 * Returns the newly created list from init_list(); it is empty when both
 * arguments are NULL.
 */
WordNode * unionize(WordNode * tmp_list, WordNode * list_2)
{
	WordNode * unioned = init_list();
	DocNode * doc;

	if (!tmp_list && !list_2) {
		return unioned; // return empty list
	}

	if (!tmp_list || !list_2) {
		// only one list is present: the union is just a copy of it
		WordNode * only = tmp_list ? tmp_list : list_2;

		unionize_copy_docs(only, unioned);
		free_list(only);
		return unioned;
	}

	// every document of tmp_list goes in; shared ones take the max frequency
	for (doc = tmp_list->head; doc != NULL; doc = doc->next) {
		DocNode * shared = get_index(doc->docID, list_2);
		int freq = doc->freq;

		if (shared) {
			freq = MAX((shared->freq), (doc->freq));
		}
		add_doc(doc->docID, freq, unioned);
	}

	// documents of list_2 are added only if tmp_list did not already supply
	// them. Membership is checked directly against tmp_list, so no scratch
	// array is needed: nothing to leak, and no zero-filled slot can be
	// mistaken for docID 0.
	for (doc = list_2->head; doc != NULL; doc = doc->next) {
		if (!get_index(doc->docID, tmp_list)) {
			add_doc(doc->docID, doc->freq, unioned);
		}
	}

	free_list(tmp_list);
	free_list(list_2);

	return unioned;
}