/**
 * Build an array holding, for each document, the value reported by
 * that document's num_words().
 *
 * The array has num_docs() entries and is allocated with
 * ferrum::MALLOC. Ownership passes to the caller, who is
 * responsible for freeing it.
 */
int* word_count() {
  const int doc_total = num_docs();
  int* counts = (int*)ferrum::MALLOC(sizeof(int) * doc_total);
  int idx = 0;
  while (idx < doc_total) {
    // One slot per document, in the same order as `documents`.
    counts[idx] = documents[idx]->num_words();
    ++idx;
  }
  return counts;
}
/*
 * Entry point of the speed test.
 *
 * Prints a banner with the number of sample documents, then runs the
 * measurement with UTF8 validation enabled and, only if that pass
 * succeeded, again with validation disabled.
 *
 * Returns the status of the last pass that was run (0 on success).
 */
int
main(void)
{
    int status;

    printf("-- speed tests determine parsing throughput given %d different sample documents --\n",
           num_docs());

    printf("With UTF8 validation:\n");
    status = run(1);

    /* skip the second pass when the first one already failed */
    if (status == 0) {
        printf("Without UTF8 validation:\n");
        status = run(0);
    }

    return status;
}
std::vector<doc_id> disk_index::docs() const { std::vector<doc_id> ret(num_docs()); std::iota(ret.begin(), ret.end(), 0_did); return ret; }
static int run(int validate_utf8) { long long times = 0; double starttime; unsigned long long sumsize = 0; starttime = mygettime(); /* allocate a parser */ for (;;) { int i; { double now = mygettime(); if (now - starttime >= PARSE_TIME_SECS) break; } for (i = 0; i < 100; i++) { yajl_handle hand = yajl_alloc(NULL, NULL, NULL); yajl_status stat; const char ** d; yajl_config(hand, yajl_dont_validate_strings, validate_utf8 ? 0 : 1); for (d = get_doc(times % num_docs()); *d; d++) { size_t size = strlen(*d); sumsize += size; stat = yajl_parse(hand, (unsigned char *) *d, size); if (stat != yajl_status_ok) break; } stat = yajl_complete_parse(hand); if (stat != yajl_status_ok) { unsigned char * str = yajl_get_error(hand, 1, (unsigned char *) *d, (*d ? strlen(*d) : 0)); fprintf(stderr, "%s", (const char *) str); yajl_free_error(hand, str); return 1; } yajl_free(hand); times++; } } /* parsed doc 'times' times */ { double throughput; double now; const char * all_units[] = { "B/s", "KB/s", "MB/s", (char *) 0 }; const char ** units = all_units; now = mygettime(); throughput = sumsize / (now - starttime); while (*(units + 1) && throughput > 1024) { throughput /= 1024; units++; } printf("Parsing speed: %g %s\n", throughput, *units); } return 0; }
/*
 * Append a copy of every document entry of src to dst using add_doc().
 * src must not be NULL.
 */
static void unionize_copy_docs(WordNode * src, WordNode * dst) {
    DocNode * doc;

    for (doc = src->head; doc != NULL; doc = doc->next) {
        add_doc(doc->docID, doc->freq, dst);
    }
}

/*
 * Build the union of two document lists.
 *
 * The result is a new list obtained from init_list(). Every docID that
 * appears in either input is added to it once per occurrence in the
 * list it came from; a docID present in both inputs is added a single
 * time with the larger of the two frequencies.
 *
 * Either argument may be NULL. Every non-NULL input list is released
 * with free_list() before returning, so the caller must not use
 * tmp_list or list_2 afterwards.
 *
 * Returns the newly built list (empty when both inputs are NULL).
 */
WordNode * unionize(WordNode * tmp_list, WordNode * list_2) {
    WordNode * unioned = init_list();
    DocNode * doc;

    if (!tmp_list && !list_2) {
        return unioned; /* nothing to merge: return the empty list */
    }

    if (!tmp_list) {
        /* copy rather than alias, because the input is freed below */
        unionize_copy_docs(list_2, unioned);
        free_list(list_2);
        return unioned;
    }

    if (!list_2) {
        /* copy rather than alias, because the input is freed below */
        unionize_copy_docs(tmp_list, unioned);
        free_list(tmp_list);
        return unioned;
    }

    /* first pass: every entry of tmp_list goes into the result */
    for (doc = tmp_list->head; doc != NULL; doc = doc->next) {
        DocNode * shared = get_index(doc->docID, list_2);

        if (shared) {
            /* present in both lists: keep the larger frequency */
            int newfreq = MAX((shared->freq), (doc->freq));
            add_doc(doc->docID, newfreq, unioned);
        } else {
            add_doc(doc->docID, doc->freq, unioned);
        }
    }

    /*
     * Second pass: add the entries of list_2 that the first pass did
     * not already cover. Membership is checked directly against
     * tmp_list, so no scratch buffer is needed: nothing to leak, and no
     * zero-filled slots that could make docID 0 look already seen.
     */
    for (doc = list_2->head; doc != NULL; doc = doc->next) {
        if (get_index(doc->docID, tmp_list) == NULL) {
            add_doc(doc->docID, doc->freq, unioned);
        }
    }

    free_list(tmp_list);
    free_list(list_2);
    return unioned;
}