/* * The routines @emph{ALIGN_synced} and @emph{ALIGN_ordered} allow to * simply query the alignment status of the two head columns of two * BATs. */ int ALIGNsynced(BAT *b1, BAT *b2) { BATcheck(b1, "ALIGNsynced: bat 1 required", 0); BATcheck(b2, "ALIGNsynced: bat 2 required", 0); /* first try to prove head columns are not in sync */ if (BATcount(b1) != BATcount(b2)) return 0; if (ATOMtype(BAThtype(b1)) != ATOMtype(BAThtype(b2))) return 0; if (BAThvoid(b1) && BAThvoid(b2)) return (b1->hseqbase == b2->hseqbase); /* then try that they are */ if (b1->batCacheid == b2->batCacheid) return 1; /* same bat. trivial case */ if (BATcount(b1) == 0) return 1; /* empty bats of same type. trivial case */ if (b1->halign && b1->halign == b2->halign) return 1; /* columns marked as equal by algorithmics */ if (VIEWparentcol(b1) && ALIGNsynced(BBPcache(VIEWhparent(b1)), b2)) return 1; /* view on same bat --- left recursive def.. */ if (VIEWparentcol(b2) && ALIGNsynced(b1, BBPcache(VIEWhparent(b2)))) return 1; /* view on same bat --- right recursive def.. */ return 0; /* we simply don't know */ }
inline int findGDKtype(int type) { if (type == TYPE_any || type== TYPE_void) return TYPE_void; if (isaBatType(type)) return TYPE_bat; return ATOMtype(type); }
ptr ATOMnil(int t) { const void *src = ATOMnilptr(t); size_t len = ATOMlen(ATOMtype(t), src); ptr dst = GDKmalloc(len); if (dst) memcpy(dst, src, len); return dst; }
log_bid ebat2real(log_bid b, oid ibase) { /* make a copy of b */ BAT *o = temp_descriptor(b); BAT *c = BATcopy(o, TYPE_void, ATOMtype(o->ttype), TRUE); log_bid r; BATseqbase(c, ibase ); c->H->dense = 1; r = temp_create(c); bat_destroy(c); bat_destroy(o); return r; }
/* * The prime routine for the BAT layer is to create a new hash index. * Its argument is the element type and the maximum number of BUNs be * stored under the hash function. */ BAT * BAThash(BAT *b, BUN masksize) { BAT *o = NULL; lng t0,t1; (void) t0; (void) t1; if (VIEWhparent(b)) { bat p = VIEWhparent(b); o = b; b = BATdescriptor(p); if (!ALIGNsynced(o, b) || BUNfirst(o) != BUNfirst(b)) { BBPunfix(b->batCacheid); b = o; o = NULL; } } MT_lock_set(&GDKhashLock(ABS(b->batCacheid)), "BAThash"); if (b->H->hash == NULL) { unsigned int tpe = ATOMstorage(b->htype); BUN cnt = BATcount(b); BUN mask; BUN p = BUNfirst(b), q = BUNlast(b), r; Hash *h = NULL; Heap *hp = NULL; str nme = BBP_physical(b->batCacheid); BATiter bi = bat_iterator(b); ALGODEBUG fprintf(stderr, "#BAThash: create hash(" BUNFMT ");\n", BATcount(b)); /* cnt = 0, hopefully there is a proper capacity from * which we can derive enough information */ if (!cnt) cnt = BATcapacity(b); if (b->htype == TYPE_void) { if (b->hseqbase == oid_nil) { MT_lock_unset(&GDKhashLock(ABS(b->batCacheid)), "BAThash"); ALGODEBUG fprintf(stderr, "#BAThash: cannot create hash-table on void-NIL column.\n"); return NULL; } ALGODEBUG fprintf(stderr, "#BAThash: creating hash-table on void column..\n"); tpe = TYPE_void; } /* determine hash mask size p = first; then no dynamic * scheme */ if (masksize > 0) { mask = HASHmask(masksize); } else if (ATOMsize(ATOMstorage(tpe)) == 1) { mask = (1 << 8); } else if (ATOMsize(ATOMstorage(tpe)) == 2) { mask = (1 << 12); } else if (b->hkey) { mask = HASHmask(cnt); } else { /* dynamic hash: we start with * HASHmask(cnt/64); if there are too many * collisions we try HASHmask(cnt/16), then * HASHmask(cnt/4), and finally * HASHmask(cnt). */ mask = HASHmask(cnt >> 6); p += (cnt >> 2); /* try out on first 25% of b */ if (p > q) p = q; } if (mask < 1024) mask = 1024; t0 = GDKusec(); do { BUN nslots = mask >> 3; /* 1/8 full is too full */ r = BUNfirst(b); if (hp) { HEAPfree(hp); GDKfree(hp); } if (h) { ALGODEBUG fprintf(stderr, "#BAThash: retry hash construction\n"); GDKfree(h); } /* create the hash structures */ hp = (Heap *) GDKzalloc(sizeof(Heap)); if (hp && (hp->filename = GDKmalloc(strlen(nme) + 12)) != NULL) sprintf(hp->filename, "%s.%chash", nme, b->batCacheid > 0 ? 'h' : 't'); if (hp == NULL || hp->filename == NULL || (h = HASHnew(hp, ATOMtype(b->htype), BATcapacity(b), mask)) == NULL) { MT_lock_unset(&GDKhashLock(ABS(b->batCacheid)), "BAThash"); if (hp != NULL) { GDKfree(hp->filename); GDKfree(hp); } return NULL; } switch (tpe) { case TYPE_bte: starthash(bte); break; case TYPE_sht: starthash(sht); break; case TYPE_int: case TYPE_flt: starthash(int); break; case TYPE_dbl: case TYPE_lng: starthash(lng); break; default: for (; r < p; r++) { ptr v = BUNhead(bi, r); BUN c = (BUN) heap_hash_any(b->H->vheap, h, v); if ( HASHget(h,c) == HASHnil(h) && nslots-- == 0) break; /* mask too full */ HASHputlink(h,r, HASHget(h,c)); HASHput(h,c, r); } break; } } while (r < p && mask < cnt && (mask <<= 2)); /* finish the hashtable with the current mask */ p = r; switch (tpe) { case TYPE_bte: finishhash(bte); break; case TYPE_sht: finishhash(sht); break; case TYPE_int: case TYPE_flt: finishhash(int); break; case TYPE_dbl: case TYPE_lng: finishhash(lng); break; default: for (; p < q; p++) { ptr v = BUNhead(bi, p); BUN c = (BUN) heap_hash_any(b->H->vheap, h, v); HASHputlink(h,p, HASHget(h,c)); HASHput(h,c,p); } break; } b->H->hash = h; t1 = GDKusec(); ALGODEBUG fprintf(stderr, "#BAThash: hash construction "LLFMT" usec\n", t1-t0); ALGODEBUG HASHcollisions(b,b->H->hash); }
/* * The prime routine for the BAT layer is to create a new hash index. * Its argument is the element type and the maximum number of BUNs be * stored under the hash function. */ gdk_return BAThash(BAT *b, BUN masksize) { BAT *o = NULL; lng t0 = 0, t1 = 0; if (BATcheckhash(b)) { if (o != NULL) { o->T->hash = b->T->hash; BBPunfix(b->batCacheid); } return GDK_SUCCEED; } MT_lock_set(&GDKhashLock(abs(b->batCacheid)), "BAThash"); if (b->T->hash == NULL) { unsigned int tpe = ATOMbasetype(b->ttype); BUN cnt = BATcount(b); BUN mask, maxmask = 0; BUN p = BUNfirst(b), q = BUNlast(b), r; Hash *h = NULL; Heap *hp; const char *nme = BBP_physical(b->batCacheid); const char *ext = b->batCacheid > 0 ? "thash" : "hhash"; BATiter bi = bat_iterator(b); #ifdef PERSISTENTHASH int fd; #endif ALGODEBUG fprintf(stderr, "#BAThash: create hash(" BUNFMT ");\n", BATcount(b)); if ((hp = GDKzalloc(sizeof(*hp))) == NULL || (hp->farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) < 0 || (hp->filename = GDKmalloc(strlen(nme) + 12)) == NULL) { MT_lock_unset(&GDKhashLock(abs(b->batCacheid)), "BAThash"); GDKfree(hp); return GDK_FAIL; } sprintf(hp->filename, "%s.%s", nme, ext); /* cnt = 0, hopefully there is a proper capacity from * which we can derive enough information */ if (!cnt) cnt = BATcapacity(b); if (b->ttype == TYPE_void) { if (b->tseqbase == oid_nil) { MT_lock_unset(&GDKhashLock(abs(b->batCacheid)), "BAThash"); ALGODEBUG fprintf(stderr, "#BAThash: cannot create hash-table on void-NIL column.\n"); GDKfree(hp->filename); GDKfree(hp); return GDK_FAIL; } ALGODEBUG fprintf(stderr, "#BAThash: creating hash-table on void column..\n"); tpe = TYPE_void; } /* determine hash mask size p = first; then no dynamic * scheme */ if (masksize > 0) { mask = HASHmask(masksize); } else if (ATOMsize(tpe) == 1) { mask = (1 << 8); } else if (ATOMsize(tpe) == 2) { mask = (1 << 16); } else if (b->tkey) { mask = HASHmask(cnt); } else { /* dynamic hash: we start with * HASHmask(cnt)/64; if there are too many * collisions we try HASHmask(cnt)/16, then * HASHmask(cnt)/4, and finally * HASHmask(cnt). */ maxmask = HASHmask(cnt); mask = maxmask >> 6; p += (cnt >> 2); /* try out on first 25% of b */ if (p > q) p = q; } t0 = GDKusec(); do { BUN nslots = mask >> 3; /* 1/8 full is too full */ r = BUNfirst(b); if (h) { char *fnme; bte farmid; ALGODEBUG fprintf(stderr, "#BAThash: retry hash construction\n"); fnme = GDKstrdup(hp->filename); farmid = hp->farmid; HEAPfree(hp, 1); memset(hp, 0, sizeof(*hp)); hp->filename = fnme; hp->farmid = farmid; GDKfree(h); h = NULL; } /* create the hash structures */ if ((h = HASHnew(hp, ATOMtype(b->ttype), BATcapacity(b), mask, BATcount(b))) == NULL) { MT_lock_unset(&GDKhashLock(abs(b->batCacheid)), "BAThash"); GDKfree(hp->filename); GDKfree(hp); return GDK_FAIL; } switch (tpe) { case TYPE_bte: starthash(bte); break; case TYPE_sht: starthash(sht); break; case TYPE_int: case TYPE_flt: #if SIZEOF_OID == SIZEOF_INT case TYPE_oid: #endif #if SIZEOF_WRD == SIZEOF_INT case TYPE_wrd: #endif starthash(int); break; case TYPE_dbl: case TYPE_lng: #if SIZEOF_OID == SIZEOF_LNG case TYPE_oid: #endif #if SIZEOF_WRD == SIZEOF_LNG case TYPE_wrd: #endif starthash(lng); break; #ifdef HAVE_HGE case TYPE_hge: starthash(hge); break; #endif default: for (; r < p; r++) { ptr v = BUNtail(bi, r); BUN c = (BUN) heap_hash_any(b->T->vheap, h, v); if (HASHget(h, c) == HASHnil(h) && nslots-- == 0) break; /* mask too full */ HASHputlink(h, r, HASHget(h, c)); HASHput(h, c, r); } break; } } while (r < p && mask < maxmask && (mask <<= 2)); /* finish the hashtable with the current mask */ p = r; switch (tpe) { case TYPE_bte: finishhash(bte); break; case TYPE_sht: finishhash(sht); break; case TYPE_int: case TYPE_flt: #if SIZEOF_OID == SIZEOF_INT case TYPE_oid: #endif #if SIZEOF_WRD == SIZEOF_INT case TYPE_wrd: #endif finishhash(int); break; case TYPE_dbl: case TYPE_lng: #if SIZEOF_OID == SIZEOF_LNG case TYPE_oid: #endif #if SIZEOF_WRD == SIZEOF_LNG case TYPE_wrd: #endif finishhash(lng); break; #ifdef HAVE_HGE case TYPE_hge: finishhash(hge); break; #endif default: for (; p < q; p++) { ptr v = BUNtail(bi, p); BUN c = (BUN) heap_hash_any(b->T->vheap, h, v); HASHputlink(h, p, HASHget(h, c)); HASHput(h, c, p); } break; } #ifdef PERSISTENTHASH if ((BBP_status(b->batCacheid) & BBPEXISTING) && b->batInserted == b->batCount && HEAPsave(hp, nme, ext) == GDK_SUCCEED && (fd = GDKfdlocate(hp->farmid, nme, "rb+", ext)) >= 0) { ALGODEBUG fprintf(stderr, "#BAThash: persisting hash %d\n", b->batCacheid); ((size_t *) hp->base)[0] |= 1 << 24; if (write(fd, hp->base, SIZEOF_SIZE_T) < 0) perror("write hash"); if (!(GDKdebug & FORCEMITOMASK)) { #if defined(NATIVE_WIN32) _commit(fd); #elif defined(HAVE_FDATASYNC) fdatasync(fd); #elif defined(HAVE_FSYNC) fsync(fd); #endif } close(fd); } else ALGODEBUG fprintf(stderr, "#BAThash: NOT persisting hash %d\n", b->batCacheid); #endif b->T->hash = h; t1 = GDKusec(); ALGODEBUG fprintf(stderr, "#BAThash: hash construction " LLFMT " usec\n", t1 - t0); ALGODEBUG HASHcollisions(b, b->T->hash); }
/* return TRUE if we have a hash on the tail, even if we need to read * one from disk */ int BATcheckhash(BAT *b) { int ret; lng t; t = GDKusec(); MT_lock_set(&GDKhashLock(abs(b->batCacheid)), "BATcheckhash"); t = GDKusec() - t; // use or ignore a persistent hash #ifdef PERSISTENTHASH if (b->T->hash == NULL) { Hash *h; Heap *hp; const char *nme = BBP_physical(b->batCacheid); const char *ext = b->batCacheid > 0 ? "thash" : "hhash"; int fd; if ((hp = GDKzalloc(sizeof(*hp))) != NULL && (hp->farmid = BBPselectfarm(b->batRole, b->ttype, hashheap)) >= 0 && (hp->filename = GDKmalloc(strlen(nme) + 12)) != NULL) { sprintf(hp->filename, "%s.%s", nme, ext); /* check whether a persisted hash can be found */ if ((fd = GDKfdlocate(hp->farmid, nme, "rb+", ext)) >= 0) { size_t hdata[HASH_HEADER_SIZE]; struct stat st; if ((h = GDKmalloc(sizeof(*h))) != NULL && read(fd, hdata, sizeof(hdata)) == sizeof(hdata) && hdata[0] == (((size_t) 1 << 24) | HASH_VERSION) && hdata[4] == (size_t) BATcount(b) && fstat(fd, &st) == 0 && st.st_size >= (off_t) (hp->size = hp->free = (hdata[1] + hdata[2]) * hdata[3] + HASH_HEADER_SIZE * SIZEOF_SIZE_T) && HEAPload(hp, nme, ext, 0) == GDK_SUCCEED) { h->lim = (BUN) hdata[1]; h->type = ATOMtype(b->ttype); h->mask = (BUN) (hdata[2] - 1); h->heap = hp; h->width = (int) hdata[3]; switch (h->width) { case BUN2: h->nil = (BUN) BUN2_NONE; break; case BUN4: h->nil = (BUN) BUN4_NONE; break; #ifdef BUN8 case BUN8: h->nil = (BUN) BUN8_NONE; break; #endif default: assert(0); } h->Link = hp->base + HASH_HEADER_SIZE * SIZEOF_SIZE_T; h->Hash = (void *) ((char *) h->Link + h->lim * h->width); close(fd); b->T->hash = h; ALGODEBUG fprintf(stderr, "#BATcheckhash: reusing persisted hash %d\n", b->batCacheid); MT_lock_unset(&GDKhashLock(abs(b->batCacheid)), "BATcheckhash"); return 1; } GDKfree(h); close(fd); /* unlink unusable file */ GDKunlink(hp->farmid, BATDIR, nme, ext); } GDKfree(hp->filename); } GDKfree(hp); } #endif ret = b->T->hash != NULL; MT_lock_unset(&GDKhashLock(abs(b->batCacheid)), "BATcheckhash"); ALGODEBUG if (ret) fprintf(stderr, "#BATcheckhash: already has hash %d, waited " LLFMT " usec\n", b->batCacheid, t); return ret; }
int type() const { return ATOMtype(get()->T.type); }