/* __len__ slot: number of elements stored in the wrapped Judy1 set. */
static Py_ssize_t PyJudyIntSet_len(PyObject* set)
{
	Word_t n;

	/* Count set bits over the full key range; the end index -1 wraps to
	 * the maximum Word_t (2**31-1 on 32-bit systems, 2**63-1 on 64-bit). */
	J1C(n, ((PyJudyIntSet*)set)->s, 0, -1);

	return (Py_ssize_t)n;
}
/* Copies every id accumulated in `builder` into `set` (sorted ascending,
 * since Judy1 iteration is in key order), leaving the builder's working
 * set intact.  Also transfers the builder's precomputed hash. */
void csp_id_set_build_and_keep(struct csp_id_set *set, struct csp_id_set_builder *builder)
{
    Word_t element_count;
    size_t pos = 0;
    csp_id current;
    int rc;

    /* Make sure the `ids` array can hold everything in the working set. */
    J1C(element_count, builder->working_set, 0, -1);
    set->count = element_count;
    csp_id_set_ensure_size(set);

    /* Walk the Judy1 set from the lowest id upward, copying as we go. */
    current = 0;
    for (J1F(rc, builder->working_set, current); rc;
         J1N(rc, builder->working_set, current)) {
        set->ids[pos++] = current;
    }

    set->hash = builder->hash;
}
/**
 * Writes per-ngram statistics to "ngramStatsFile.bin" (when writeFiles is set)
 * and reports the observed IDF range.
 *
 * For each ngram it computes:
 *   - IDF   = log2(articleCount / documentFrequency), 0 when the ngram is unused
 *   - count = number of documents the ngram appears in
 *   - offset= cumulative count of all article numbers preceding this ngram's
 *             block, i.e. its record offset into the index file
 *
 * wikiIndex    array of Judy1 sets, one per ngram, indexed 0..lastNgram-1
 * articleCount total number of articles indexed (used as the IDF numerator)
 */
void writeNgramStats(Pvoid_t *wikiIndex, int articleCount)
{
    Word_t totalIndexes = 0;
    Word_t indexes = 0;
    float highestIDF = 0.0;
    float lowestIDF = log2f((float) articleCount); /* max possible IDF */
    int highestIDFNgram = 0;
    int lowestIDFNgram = 0;
    ngramStats currentStats = {0, 0, 0.0};
    FILE *ngramStatsFile = NULL;

    if (writeFiles) {
        ngramStatsFile = fopen("ngramStatsFile.bin", "wb");
        if (NULL == ngramStatsFile) {
            /* %m is a glibc extension that prints strerror(errno) */
            fprintf(stderr, "Error opening NGram Stats File: %m\n");
            exit(1);
        }
    }

    /* Calculates the IDF and the cumulative article count preceding each
     * ngram's block; writes one ngramStats record per ngram. */
    for (Word_t currentNgram = 0; currentNgram < lastNgram; currentNgram++) {
        J1C(indexes, wikiIndex[currentNgram], 0, -1); /* document frequency */

        if (indexes) { /* avoid a divide by zero */
            currentStats.IDF = log2f((float) articleCount / (float) indexes);
            if (currentStats.IDF > highestIDF) {
                highestIDF = currentStats.IDF;
                highestIDFNgram = currentNgram;
            } else if (currentStats.IDF < lowestIDF) {
                lowestIDF = currentStats.IDF;
                lowestIDFNgram = currentNgram;
            }
        } else {
            currentStats.IDF = 0.0;
        }

        currentStats.count = (int) indexes;
        currentStats.offset = (int) totalIndexes;

        if (writeFiles) {
            /* BUG FIX: was the garbled token `¤tStats` — a mojibake of
             * `&currentStats` (the "&curr" was eaten by an HTML-entity
             * conversion), which did not compile. */
            fwrite(&currentStats, sizeof(ngramStats), 1, ngramStatsFile);
        }

        totalIndexes = totalIndexes + indexes;
    }

    if (writeFiles) {
        fclose(ngramStatsFile);
    }

    //intToNgram(lowestIDFNgram, char_ngram);
    //intToNgram(highestIDFNgram, char_ngram);
    optionalPrint("%d %s", highestIDFNgram, "is the ngram with the highest IDF\n");
    optionalPrint("%d %s", lowestIDFNgram, "is the ngram with the lowest IDF\n");
    /* BUG FIX: this string literal was split across a raw newline in the
     * original source (a C syntax error); reconstructed as one literal. */
    optionalPrint("%.4f %.3f %s", log2f(1.0), log2f((float) articleCount),
                  "possible range of IDF\n");
    optionalPrint("%.4f %.3f %s", lowestIDF, highestIDF, "actual range of IDF\n");
}
/**
 * Frees every Judy1 array making up the index, then prints aggregate
 * statistics: actual memory used versus the uncompressed size the same
 * data would occupy at 4 bytes per stored index.
 */
void freeIndex(Pvoid_t *wikiIndex)
{
    Word_t bytesFreed = 0;
    Word_t grandTotalIndexes = 0;
    Word_t arrayIndexes = 0;
    Word_t grandTotalSize = 0;
    Word_t arraySize = 0;

    /* One Judy1 array per ngram: tally its population and footprint,
     * then release it. */
    for (int ngram = 0; ngram < lastNgram; ngram++) {
        J1C(arrayIndexes, wikiIndex[ngram], 0, -1);   /* population */
        grandTotalIndexes = grandTotalIndexes + arrayIndexes;
        J1MU(arraySize, wikiIndex[ngram]);            /* memory usage */
        grandTotalSize = grandTotalSize + arraySize;
        J1FA(bytesFreed, wikiIndex[ngram]);           /* free the array */
    }

    optionalPrint("%d %s", (int)(grandTotalSize / 1048576), "MB of memory used\n");
    optionalPrint("%d", (int)((grandTotalIndexes * 4) / 1048576));
    optionalPrint(" MB if not compressed\n");
    optionalPrint("Index freed.\n");
}
/*
 * Builds a Judy1 set of ixeme ids from the INVA section whose entry length
 * falls in (min_f, max_f], skipping the reserved META_FREQUENT and
 * TOKEN_FREQUENT id ranges.  Records the resulting set's smallest and
 * largest ids in fw_layers[p] and returns the set (caller owns it).
 */
static Pvoid_t ixemes_freq_range(int p, int min_f, int max_f)
{
        Pvoid_t ix = NULL;
        Word_t xid;
        int i, tst;

        for (i = 0; i < dex_section[INVA].nof_entries; i++){
                xid = dex_section[INVA].toc[i].val;
                /* reserved "frequent" id ranges never participate in a layer */
                if (xid >= XID_META_FREQUENT_F && xid <= XID_META_FREQUENT_L)
                        continue;
                if (xid <= XID_TOKEN_FREQUENT_L && xid >= XID_TOKEN_FREQUENT_F)
                        continue;
                const inva_e *e = (const inva_e*)fetch_item(INVA, i);
                /* keep ids whose entry length is within (min_f, max_f] */
                if (e->len > min_f && e->len <= max_f){
                        J1S(tst, ix, xid);
                }
        }
        /* first set bit searching up from 0 -> layer's minimum xid */
        xid = 0;
        J1F(tst, ix, xid);
        fw_layers[p].min_xid = xid;
        /* last set bit searching down from ~0 (Word_t is unsigned, so -1
         * wraps to the maximum index) -> layer's maximum xid */
        xid = -1;
        J1L(tst, ix, xid);
        fw_layers[p].max_xid = xid;
        J1C(tst, ix, 0, -1);  /* total population, reported below */
        dub_msg("Layer %u: number of xids %u min %u max %u",
                p, tst, fw_layers[p].min_xid, fw_layers[p].max_xid);
        return ix;
}
/*
 * Round-trip test: builds a traildb at argv[1] with NUM_TRAILS random uuids
 * (NUM_EVENTS events each), reopens it, and verifies that the stored uuids
 * come back sorted, complete, and without duplicates.  A Judy1 set keyed on
 * each uuid's first 8 bytes tracks which uuids were inserted.
 */
int main(int argc, char **argv)
{
    /* NOTE(review): fields/lengths are never assigned; as statics they are
     * zero-initialized (NULL), matching the num_ofields == 0 passed to
     * tdb_cons_open below — presumably a zero-field schema; confirm against
     * test_cons_settings. */
    static const char **fields;
    static uint64_t *lengths;
    dsfmt_t state;
    Pvoid_t uuids = NULL;  /* Judy1 set of uuid half-words (first 8 bytes) */
    tdb_cons* c = tdb_cons_init();
    test_cons_settings(c);
    uint64_t i, j;
    __uint128_t prev_uuid = 0;
    Word_t key;
    int tst;

    assert(tdb_cons_open(c, argv[1], fields, 0) == 0);

    /* fixed seed so the generated uuids are reproducible across runs */
    dsfmt_init_gen_rand(&state, 2489);

    for (i = 0; i < NUM_TRAILS; i++){
        uint8_t uuid[16];
        gen_random_uuid(uuid, &state);
        /* track the uuid by its first half-word; a collision would make the
         * later completeness check ambiguous, so bail out if one occurs */
        memcpy(&key, uuid, 8);
        J1S(tst, uuids, key);
        if (!tst){
            printf("half-word collision! change random seed!\n");
            return -1;
        }
        for (j = 0; j < NUM_EVENTS; j++)
            tdb_cons_add(c, uuid, i * 100 + j, fields, lengths);
    }

    /* every trail contributed exactly one distinct key */
    J1C(key, uuids, 0, -1);
    assert(key == NUM_TRAILS);

    assert(tdb_cons_finalize(c) == 0);
    tdb_cons_close(c);

    tdb* t = tdb_init();
    assert(tdb_open(t, argv[1]) == 0);

    assert(tdb_num_trails(t) == NUM_TRAILS);
    assert(tdb_num_events(t) == NUM_TRAILS * NUM_EVENTS);

    for (i = 0; i < NUM_TRAILS; i++){
        __uint128_t this_uuid;
        /* uuids must be monotonically increasing */
        memcpy(&this_uuid, tdb_get_uuid(t, i), 16);
        assert(this_uuid > prev_uuid);
        prev_uuid = this_uuid;
        /* remove this uuid from the uuid set and make sure it exists */
        memcpy(&key, &this_uuid, 8);
        J1U(tst, uuids, key);
        assert(tst == 1);
    }

    /* make sure we retrieved all uuids (the set must now be empty) */
    J1C(key, uuids, 0, -1);
    assert(key == 0);
    return 0;
}
/* Returns the number of entries currently stored in the table's Judy1 set. */
int jtableP_count(jtableP *table)
{
	int population;

	/* count set bits across the entire index range */
	J1C(population, table->t, 0, -1);

	return population;
}
/* index, called from absmi.c
 *
 * Dispatches a user-defined-index search for predicate `p`.  With a single
 * indexed argument the matching clauses are collected directly; with several,
 * each argument's matches are gathered into a Judy1 set and intersected.
 *
 * Returns:
 *   NULL (yap fallback)             No usable indexing available
 *   Yap_FAILCODE() (fail)           No result found
 *   Yap_ClauseListToClause(cl)      1 solution found
 *   Yap_ClauseListCode(cl)          2+ solutions found
 */
yamop *
Yap_udi_search(PredEntry *p)
{
	int r;
	struct ClauseList clauselist;
	UdiPArg parg;
	UdiInfo info;

	/* find our structure; bail out (fallback) if this predicate has no
	 * udi info or no indexed arguments */
	HASH_FIND_UdiInfo(UdiControlBlocks,p,info);
	if (!info || utarray_len(info->args) == 0)
		return NULL;

	if (utarray_len(info->args) == 1){ //simple case no intersection needed
		struct si_callback_h c;

		c.cl = Yap_ClauseListInit(&clauselist);
		c.clauselist = info->clauselist;
		c.pred = info->p;
		if (!c.cl)
			return NULL;

		parg = (UdiPArg) utarray_eltptr(info->args,0);
		/* the index's search callback appends matches to c.cl */
		r = parg->control->search(parg->idxstr, parg->arg,
		                          si_callback, (void *) &c);
		Yap_ClauseListClose(c.cl);
		if (r == -1) { /* index could not prune: fall back to Yap */
			Yap_ClauseListDestroy(c.cl);
			return NULL;
		}
		if (Yap_ClauseListCount(c.cl) == 0) {
			Yap_ClauseListDestroy(c.cl);
			return Yap_FAILCODE();
		}
	} else {//intersection needed using Judy1
#ifdef USE_JUDY
		/*TODO: do more tests to this algorithm*/
		int i;
		Pvoid_t tmp = (Pvoid_t) NULL;     /* current arg's match set */
		Pvoid_t result = (Pvoid_t) NULL;  /* running intersection */
		Word_t count = 0L;
		Word_t idx_r = 0L;                /* cursor into result */
		Word_t idx_tmp = 0L;              /* cursor into tmp */
		int rc = 0;                       /* # of args that pruned */
		yamop **x;

		/*
		 * I will start with the simplest approach:
		 * for each index create a set and intersect it with the next.
		 *
		 * In the future it could pay to sort according to index type
		 * to improve the intersection part.
		 */
		for (i = 0; i < utarray_len(info->args) ; i++) {
			parg = (UdiPArg) utarray_eltptr(info->args,i);
			/* j1_callback inserts clause numbers into tmp */
			r = parg->control->search(parg->idxstr, parg->arg,
			                          j1_callback, &tmp);
			if (r == -1) /*this arg does not prune search*/
				continue;
			rc ++;

			J1C(count, result, 0, -1);
			if (r == 0) /* this arg gave 0 results -> FAIL*/
			{
				if (count > 0) // clear previous result if they exists
					J1FA(count, result);
				return Yap_FAILCODE();
			}

			if (count == 0) // first result_set
			{
				/* adopt tmp as the initial result */
				result = tmp;
				tmp = (Pvoid_t) NULL;
			}
			else /*intersection*/
			{
				/* merge-walk both sorted sets, unsetting every
				 * result bit that has no counterpart in tmp */
				idx_tmp = 0L;
				idx_r = 0L;
				J1F(count, result, idx_r);
				//succeeds one time at least
				assert(count > 0);
				J1F(count, tmp, idx_tmp);
				//succeeds one time at least
				assert(count > 0);
				while (count) {
					/* result entries below the tmp cursor
					 * cannot be in the intersection */
					while (idx_r < idx_tmp) {
						J1U(count, result, idx_r); //does not belong
						J1N(count, result, idx_r); //next
						if (! count)
							break; //end result set
					}
					if(idx_r == idx_tmp) { /* common element: keep it */
						J1N(count, result, idx_r); //next
						if (! count)
							break; //end result set
						J1N(count, tmp, idx_tmp); //next tmp
						//if (! count) break; //end tmp set will break while
					} else // (idx_r > idx_tmp)
					{
						idx_tmp = idx_r; // fast forward
						J1F(count, tmp, idx_tmp); // first starting in idx_r
						//if (! count) break; //end tmp set will break while
					}
				}
				/* tmp is exhausted: remove any result entries
				 * beyond the last tmp element */
				J1F(count, result, idx_r); // first starting in idx_r
				//clear up the rest
				while (idx_r > idx_tmp && count) //result has more setted values
				{
					J1U(count, result, idx_r); //does not belong
					J1N(count, result, idx_r); //next
				}
				J1FA(count, tmp); //free tmp
			}
		}

		if (rc == 0) /*no search performed*/
			return NULL;

		J1C(count, result, 0, -1);
		if (count == 0) { /*result set empty -> FAIL */
			J1FA(count, result);
			return Yap_FAILCODE();
		}

		/*convert Juddy1 to clauselist (set keys are 1-based clause numbers)*/
		Yap_ClauseListInit(&clauselist);
		idx_r = 0L;
		J1F(count, result, idx_r);
		while (count) {
			x = (yamop **) utarray_eltptr(info->clauselist, idx_r - 1);
			Yap_ClauseListExtend( &clauselist, *x, info->p);
			J1N(count, result, idx_r);
		}
		/* J1FA leaves the number of freed bytes in count, reported below */
		J1FA(count,result);
		fprintf(stderr,"J1 used space %ld bytes for %d clausules\n",
			count, Yap_ClauseListCount(&clauselist));
		Yap_ClauseListClose(&clauselist);
#else
		fprintf(stderr,"Without libJudy only one argument indexed is allowed."
			"Falling back to Yap Indexing\n");
		return NULL; //NO Judy Available
#endif
	}

	if (Yap_ClauseListCount(&clauselist) == 1)
		return Yap_ClauseListToClause(&clauselist);
	return Yap_ClauseListCode(&clauselist);
}