/*
 * Prepare the caller-supplied iterator storage for a scan across every key of
 * `btr`, ascending when `asc` is true and descending otherwise.
 *
 * The tree's minimum and maximum keys are written into w->wf.alow and
 * w->wf.ahigh. When `w` is NULL a function-local cswc_t is used instead.
 *
 * Returns the iterator on success. Returns NULL when the tree is empty, when
 * the min/max key cannot be assigned, or when building the seek key or
 * decoding the first entry fails (the iterator is released on those last two
 * paths).
 */
btSIter *btSetFullRangeIter(btSIter *iter, bt *btr, bool asc, cswc_t *w) {
    cswc_t W; // fallback holder for the bounds handed to setHigh()

    if (!btr->root || !btr->numkeys) {
        return NULL;
    }
    if (!w) {
        w = &W;
    }

    ai_obj *key_lo = &w->wf.alow;
    ai_obj *key_hi = &w->wf.ahigh;
    if (!assignMinKey(btr, key_lo)) {
        return NULL;
    }
    if (!assignMaxKey(btr, key_hi)) {
        return NULL;
    }

    /* Where the walk starts and where it ends depends on the direction. */
    ai_obj *seek_from = asc ? key_lo : key_hi;
    ai_obj *seek_upto = asc ? key_hi : key_lo;

    btk_t  btk;
    bool   med;
    uint32 ksize;

    /* `siter` is not declared here; presumably SETITER8R introduces it. */
    SETITER8R(iter, btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev);
    siter->scan = 1;
    setHigh(siter, seek_upto, btr->s.ktype);

    char *seek_key = createBTKey(seek_from, &med, &ksize, btr, &btk); //DEST 030
    if (!seek_key) {
        goto frangeiter_err;
    }

    bt_n *hit_node = NULL;
    int   hit_slot = -1;
    uchar *first_rec = setIter(btr, seek_key, siter, seek_from,
                               &hit_node, &hit_slot, asc);
    destroyBTKey(seek_key, med); /* DESTROYED 030 */

    if (!first_rec && siter->missed) {
        return siter; //IILMISS
    }
    if (!streamToBTEntry(first_rec, siter, hit_node, hit_slot)) {
        goto frangeiter_err;
    }
    if (btr->dirty_left) {
        siter->missed = 1; // FULL means 100% FULL
    }
    return siter;

frangeiter_err:
    btReleaseRangeIterator(siter);
    return NULL;
}
/* * Internal function which adds digests to the defrag_list * Mallocs the nodes of defrag_list * Returns : * -1 : Error * number of digests found : success * */ static long build_defrag_list_from_nbtr(as_namespace *ns, ai_obj *acol, bt *nbtr, ulong nofst, ulong *limit, uint64_t * tot_found, cf_ll *gc_list) { int error = -1; btEntry *nbe; // STEP 1: go thru a portion of the nbtr and find to-be-deleted-PKs // TODO: a range query may be smarter then using the Xth Iterator btSIter *nbi = (nofst ? btGetFullXthIter(nbtr, nofst, 1, NULL, 0) : btGetFullRangeIter(nbtr, 1, NULL)); if (!nbi) { return error; } long found = 0; long processed = 0; while ((nbe = btRangeNext(nbi, 1))) { ai_obj *akey = nbe->key; int ret = as_sindex_can_defrag_record(ns, (cf_digest *) (&akey->y)); if (ret == AS_SINDEX_GC_SKIP_ITERATION) { *limit = 0; break; } else if (ret == AS_SINDEX_GC_OK) { bool create = (cf_ll_size(gc_list) == 0) ? true : false; objs_to_defrag_arr *dt; if (!create) { cf_ll_element * ele = cf_ll_get_tail(gc_list); dt = ((ll_sindex_gc_element*)ele)->objs_to_defrag; if (dt->num == SINDEX_GC_NUM_OBJS_PER_ARR) { create = true; } } if (create) { dt = as_sindex_gc_get_defrag_arr(); if (!dt) { *tot_found += found; return -1; } ll_sindex_gc_element * node; node = cf_malloc(sizeof(ll_sindex_gc_element)); node->objs_to_defrag = dt; cf_ll_append(gc_list, (cf_ll_element *)node); } cloneDigestFromai_obj(&(dt->acol_digs[dt->num].dig), akey); ai_objClone(&(dt->acol_digs[dt->num].acol), acol); dt->num += 1; found++; } processed++; (*limit)--; if (*limit == 0) break; } btReleaseRangeIterator(nbi); *tot_found += found; return processed; }
/*
 * Feed the keys of one nbtr to btree_addsinglerec() for the query in `qctx`.
 *
 * With `fullrng` set the whole nbtr is walked. Otherwise the walk starts at
 * the digest stored in qctx->bdig, and an entry equal to that start key is
 * skipped. The walk stops once qctx->n_bdigs equals qctx->bsize; at that point
 * `ikey` (when non-NULL) is cloned into qctx->bkey and the current key is
 * saved into qctx->bdig.
 *
 * Returns 0 on success (also when no iterator could be set up, which only
 * logs a warning), -1 when btree_addsinglerec() reports a failure.
 */
static int add_recs_from_nbtr(as_sindex_metadata *imd, ai_obj *ikey, bt *nbtr, as_sindex_qctx *qctx, bool fullrng) {
    ai_obj start_key, end_key;
    init_ai_obj(&start_key);
    init_ai_obj(&end_key);

    btSIter  iter_mem;
    btSIter *it;
    if (fullrng) {
        it = btSetFullRangeIter(&iter_mem, nbtr, 1, NULL);
    }
    else {
        // search from LAST batches end-point
        init_ai_objFromDigest(&start_key, &qctx->bdig);
        assignMaxKey(nbtr, &end_key);
        it = btSetRangeIter(&iter_mem, nbtr, &start_key, &end_key, 1);
    }

    if (!it) {
        cf_warning(AS_QUERY, "Could not find nbtr iterator.. skipping !!");
        return 0;
    }

    int status = 0;
    for (btEntry *ent = btRangeNext(it, 1); ent; ent = btRangeNext(it, 1)) {
        ai_obj *cur = ent->key;

        // FIRST can be REPEAT (last batch)
        if (!fullrng && ai_objEQ(&start_key, cur)) {
            continue;
        }

        if (btree_addsinglerec(imd, ikey, (cf_digest *)&cur->y, qctx->recl,
                               &qctx->n_bdigs, qctx->can_partition_query,
                               qctx->partitions_pre_reserved)) {
            status = -1;
            break;
        }

        // Batch is full: save the position for the next call.
        if (qctx->n_bdigs == qctx->bsize) {
            if (ikey) {
                ai_objClone(qctx->bkey, ikey);
            }
            cloneDigestFromai_obj(&qctx->bdig, cur);
            break;
        }
    }

    btReleaseRangeIterator(it);
    return status;
}
/*
 * Prepare the caller-supplied iterator storage for a walk over the keys of
 * `btr` between `alow` and `ahigh`, ascending when `asc` is true and
 * descending otherwise.
 *
 * Returns the iterator on success. Returns NULL when the tree is empty, or
 * when building the seek key or decoding the first entry fails (the iterator
 * is released on those two paths).
 */
btSIter *btSetRangeIter(btSIter * iter, bt *btr, ai_obj *alow, ai_obj *ahigh, bool asc) {
    if (!btr->root || !btr->numkeys) {
        return NULL;
    }

    /* Where the walk starts and where it ends depends on the direction. */
    ai_obj *seek_from = asc ? alow : ahigh;
    ai_obj *seek_upto = asc ? ahigh : alow;

    btk_t  btk;
    bool   med;
    uint32 ksize;

    /* `siter` is not declared here; presumably SETITER8R introduces it. */
    SETITER8R(iter, btr, asc, iter_leaf, iter_leaf_rev, iter_node, iter_node_rev);
    setHigh(siter, seek_upto, btr->s.ktype);

    char *seek_key = createBTKey(seek_from, &med, &ksize, btr, &btk); //D032
    if (!seek_key) {
        goto rangeiter_err;
    }

    bt_n *hit_node = NULL;
    int   hit_slot = -1;
    uchar *first_rec = setIter(btr, seek_key, siter, seek_from,
                               &hit_node, &hit_slot, asc);
    destroyBTKey(seek_key, med); /* DESTROYED 032 */

    if (!streamToBTEntry(first_rec, siter, hit_node, hit_slot)) {
        goto rangeiter_err;
    }
    return siter;

rangeiter_err:
    btReleaseRangeIterator(siter);
    return NULL;
}
// Iterate through the btree and cleanup local array // if it is btree it will be cleaned up by Aerospike Index // call for dropIndex static int ai_cleanup(bt *ibtr) { if (!ibtr) { return 0; } btSIter stack_bi; btEntry *be; btSIter *bi = btSetFullRangeIter(&stack_bi, ibtr, 1, NULL); if (bi) { while ((be = btRangeNext(bi, 1))) { ai_nbtr *anbtr = be->val; if (anbtr) { if (!anbtr->is_btree) { ai_arr_destroy(anbtr->u.arr); } } } btReleaseRangeIterator(bi); } return 0; }
/*
 * Run the join described by `jb` and send the result to client `c`.
 *
 * Phases, in order:
 *   1. Reject joins with more than one ORDER BY column.
 *   2. Sort the queried columns into queried-index order (csort_order) and
 *      remember whether anything moved (`reordered`).
 *   3. For every join index, collect the matching rows: index 0 goes into the
 *      join B-tree (jbtr), the others into the per-index value sets (rset[]),
 *      via joinAddColsFromInd(). A range query (w->low set) iterates a key
 *      range; otherwise the IN() list (w->inl) is iterated.
 *   4. Unless one side is empty, walk jbtr and build each joined row with
 *      buildJRowReply(); with ORDER BY the rows are sorted and sent afterwards.
 *   5. Release everything allocated along the way and finalise the length
 *      header of the reply.
 *
 * `card` and `lenobj` are not declared in this function; presumably the
 * LEN_OBJ macro introduces them.
 */
void joinGeneric(redisClient *c, jb_t *jb) {
    if (jb->w.nob > 1) {
        addReply(c, shared.join_m_obc);
        return;
    }
    Order_by         = jb->w.nob;
    Order_by_col_val = NULL;

    /* sort queried-columns to queried-indices */
    jqo_t o_csort_order[MAX_COLUMN_PER_TABLE];
    jqo_t csort_order [MAX_COLUMN_PER_TABLE];
    for (int i = 0; i < jb->qcols; i++) {
        for (int j = 0; j < jb->n_ind; j++) {
            if (jb->j_tbls[i] == Index[server.dbid][jb->j_indxs[j]].table) {
                csort_order[i].t = jb->j_tbls[i];
                csort_order[i].i = j;
                csort_order[i].c = jb->j_cols[i];
                csort_order[i].n = i;
            }
        }
    }
    memcpy(&o_csort_order, &csort_order, sizeof(jqo_t) * jb->qcols);
    qsort(&csort_order, jb->qcols, sizeof(jqo_t), cmp_jqo);

    /* reorder queried-columns to queried-indices, will sort @ output time */
    bool reordered = 0;
    for (int i = 0; i < jb->qcols; i++) {
        if (jb->j_tbls[i] != csort_order[i].t ||
            jb->j_cols[i] != csort_order[i].c) {
            reordered = 1;
            jb->j_tbls[i] = csort_order[i].t;
            jb->j_cols[i] = csort_order[i].c;
        }
    }

    cswc_t *w = &jb->w;    /* makes coding more compact */
    w->tmatch = w->obt[0]; /* HACK: initOBsort needs w->tmatch */
    list *ll = initOBsort(Order_by, w);

    uchar pk1type = Tbl[server.dbid]
                       [Index[server.dbid][jb->j_indxs[0]].table].col_type[0];
    bt *jbtr = createJoinResultSet(pk1type);

    /* rset[0] is never used: index 0 lives in jbtr */
    robj *rset[MAX_JOIN_INDXS];
    for (int i = 1; i < jb->n_ind; i++) {
        rset[i] = createValSetObject();
    }

    int j_ind_len [MAX_JOIN_INDXS];
    int jind_ncols[MAX_JOIN_INDXS];

    join_add_cols_t jc; /* these dont change in the loop below */
    jc.qcols      = jb->qcols;
    jc.j_tbls     = jb->j_tbls;
    jc.j_cols     = jb->j_cols;
    jc.jind_ncols = jind_ncols;
    jc.j_ind_len  = j_ind_len;
    jc.jbtr       = jbtr;

    for (int i = 0; i < jb->n_ind; i++) { /* iterate join indices */
        btEntry *be, *nbe;
        j_ind_len[i] = 0;
        jc.index     = i;
        jc.itable    = Index[server.dbid][jb->j_indxs[i]].table;
        robj *btt    = lookupKeyRead(c->db, Tbl[server.dbid][jc.itable].name);
        jc.btr       = (bt *)btt->ptr;
        jc.virt      = Index[server.dbid][jb->j_indxs[i]].virt;

        if (w->low) { /* RANGE QUERY */
            if (jc.virt) { /* PK */
                btSIter *bi = btGetRangeIterator(jc.btr, w->low, w->high);
                while ((be = btRangeNext(bi)) != NULL) {
                    jc.ajk  = be->key;
                    jc.rrow = be->val;
                    joinAddColsFromInd(&jc, rset, w);
                }
                btReleaseRangeIterator(bi);
            } else { /* FK */
                robj *ind  = Index[server.dbid][jb->j_indxs[i]].obj;
                robj *ibtt = lookupKey(c->db, ind);
                bt   *ibtr = (bt *)ibtt->ptr;
                btSIter *bi = btGetRangeIterator(ibtr, w->low, w->high);
                while ((be = btRangeNext(bi)) != NULL) {
                    jc.ajk   = be->key;
                    bt *nbtr = be->val;
                    btSIter *nbi = btGetFullRangeIterator(nbtr);
                    while ((nbe = btRangeNext(nbi)) != NULL) {
                        jc.rrow = btFindVal(jc.btr, nbe->key);
                        joinAddColsFromInd(&jc, rset, w);
                    }
                    btReleaseRangeIterator(nbi);
                }
                btReleaseRangeIterator(bi);
            }
        } else { /* IN() QUERY */
            listNode *ln;
            listIter *li = listGetIterator(w->inl, AL_START_HEAD);
            if (jc.virt) {
                while ((ln = listNext(li)) != NULL) {
                    jc.ajk  = ln->value;
                    jc.rrow = btFindVal(jc.btr, jc.ajk);
                    if (jc.rrow) joinAddColsFromInd(&jc, rset, w);
                }
            } else {
                btSIter *nbi;
                robj *ind  = Index[server.dbid][jb->j_indxs[i]].obj;
                robj *ibtt = lookupKey(c->db, ind);
                bt   *ibtr = (bt *)ibtt->ptr;
                while ((ln = listNext(li)) != NULL) {
                    jc.ajk   = ln->value;
                    bt *nbtr = btIndFindVal(ibtr, jc.ajk);
                    if (nbtr) {
                        nbi = btGetFullRangeIterator(nbtr);
                        while ((nbe = btRangeNext(nbi)) != NULL) {
                            jc.rrow = btFindVal(jc.btr, nbe->key);
                            joinAddColsFromInd(&jc, rset, w);
                        }
                        btReleaseRangeIterator(nbi);
                    }
                }
            }
            listReleaseIterator(li);
        }
    }

    /* cant join if one table had ZERO rows */
    bool one_empty = 0;
    if (jbtr->numkeys == 0) one_empty = 1;
    else {
        for (int i = 1; i < jb->n_ind; i++) {
            if (dictSize((dict *)rset[i]->ptr) == 0) {
                one_empty = 1;
                break;
            }
        }
    }

    LEN_OBJ

    bool        err   = 0;
    long        sent  = 0;
    btIterator *bi    = NULL; /* B4 GOTO */
    char       *reply = NULL; /* B4 GOTO */

    if (!one_empty) {
        int reply_size = 0;
        for (int i = 0; i < jb->n_ind; i++) { // get maxlen possbl 4 joined row
            reply_size += j_ind_len[i] + 1;
        }
        /* NOTE(review): the malloc() result is used unchecked, as before. */
        reply = malloc(reply_size); /* freed after while() loop */

        build_jrow_reply_t bjr; /* none of these change during a join */
        bzero(&bjr, sizeof(build_jrow_reply_t));
        bjr.j.c           = c;
        bjr.j.jind_ncols  = jind_ncols;
        bjr.j.reply       = reply;
        bjr.j.csort_order = csort_order;
        bjr.j.reordered   = reordered;
        bjr.j.qcols       = jb->qcols;
        bjr.n_ind         = jb->n_ind;
        bjr.card          = &card;
        bjr.j.obt         = w->obt[0];
        bjr.j.obc         = w->obc[0];
        bjr.j_indxs       = jb->j_indxs;
        bjr.j.ll          = ll;
        bjr.j.cstar       = jb->cstar;

        joinRowEntry *be;
        bi = btGetJoinFullRangeIterator(jbtr, pk1type);
        while ((be = btJoinRangeNext(bi, pk1type)) != NULL) { /* iter BT */
            listNode *ln;
            bjr.jk        = be->key;
            list     *jll = (list *)be->val;
            listIter *li  = listGetIterator(jll, AL_START_HEAD);
            while ((ln = listNext(li)) != NULL) { /* iter LIST */
                char *first_entry;
                char *item = ln->value;
                if (bjr.j.obt == Index[server.dbid][bjr.j_indxs[0]].table) {
                    obsl_t *ob       = (obsl_t *)item;
                    Order_by_col_val = ob->keys[0];
                    first_entry      = (char *)ob->row;
                } else {
                    first_entry      = item;
                }
                /* unpack the (pointer, length) pairs of the first index */
                for (int j = 0; j < jind_ncols[0]; j++) {
                    Rcols[0][j]  = (char **)first_entry;
                    first_entry += PTR_SIZE;
                    memcpy(&Rc_lens[0][j], first_entry, UINT_SIZE);
                    first_entry += UINT_SIZE;
                }

                if (!buildJRowReply(&bjr, 1, rset)) {
                    err = 1;
                    /* release the list iterator before bailing out; the
                       goto would otherwise skip it and leak the iterator */
                    listReleaseIterator(li);
                    goto join_end;
                }
            }
            listReleaseIterator(li);
        }

        if (Order_by) {
            sent = sortJoinOrderByAndReply(c, &bjr, w);
            if (sent == -1) err = 1;
            releaseOBsort(ll);
        }
    }
    /* NOTE(review): `ll` is only released on the path above; it is not
       released when one side is empty or after `goto join_end` -- confirm
       whether initOBsort() allocates in those cases. */

join_end:
    if (bi) btReleaseJoinRangeIterator(bi);
    free(reply); /* free(NULL) is a no-op */

    /* free joinRowEntry malloc from joinAddColsFromInd() */
    bool is_ob = (w->obt[0] == Index[server.dbid][jb->j_indxs[0]].table);
    btJoinRelease(jbtr, jind_ncols[0], is_ob, freeListOfIndRow);

    /* free joinRowEntry malloc from joinAddColsFromInd() */
    dictEntry *de;
    for (int i = 1; i < jb->n_ind; i++) {
        dict *set   = rset[i]->ptr;
        bool  is_ob = (w->obt[0] == Index[server.dbid][jb->j_indxs[i]].table);
        dictIterator *di = dictGetIterator(set);
        while ((de = dictNext(di)) != NULL) {
            robj *val  = dictGetEntryVal(de);
            dict *iset = val->ptr;
            freeDictOfIndRow(iset, jind_ncols[i], is_ob);
        }
        dictReleaseIterator(di);
    }

    for (int i = 1; i < jb->n_ind; i++) {
        decrRefCount(rset[i]);
    }

    if (err) return;

    /* finalise the reply's length header */
    if (w->lim != -1 && sent < card) card = sent;
    if (jb->cstar) {
        lenobj->ptr = sdscatprintf(sdsempty(), ":%ld\r\n", card);
    } else {
        lenobj->ptr = sdscatprintf(sdsempty(), "*%ld\r\n", card);
        if (w->ovar) incrOffsetVar(c, w, card);
    }
}
/*
 * Collect records for a numeric range query into the query context.
 *
 * The index btree of the partition selected by qctx->pimd_idx is walked from a
 * start key up to `endk`. The start key is `begk` when qctx->new_ibtr is set,
 * otherwise the key saved in qctx->bkey. Each entry's value is either a
 * nested btree or an array and is handed to add_recs_from_nbtr() or
 * add_recs_from_arr() respectively. The walk ends once qctx->n_bdigs reaches
 * qctx->bsize.
 *
 * Returns 0 on success (also when no iterator is obtained), -1 on failure.
 */
static int get_numeric_range_recl(as_sindex_metadata *imd, uint64_t begk, uint64_t endk, as_sindex_qctx *qctx) {
    ai_obj start_key;
    init_ai_objLong(&start_key, qctx->new_ibtr ? begk : qctx->bkey->l);
    ai_obj end_key;
    init_ai_objLong(&end_key, endk);

    as_sindex_pmetadata *part = &imd->pimd[qctx->pimd_idx];
    bool whole_nbtr = qctx->new_ibtr;

    btSIter *it = btGetRangeIter(part->ibtr, &start_key, &end_key, 1);
    if (!it) {
        return 0;
    }

    int status = 0;
    for (btEntry *ent = btRangeNext(it, 1); ent; ent = btRangeNext(it, 1)) {
        ai_obj  *cur_key = ent->key;
        ai_nbtr *holder  = ent->val;

        if (!holder) {
            status = -1;
            break;
        }

        // Work out which nbtr to resume with. If the key used last time has
        // vanished, carry on with the next key. If the key still exists but
        // its last entry was already delivered in the previous batch, move
        // on to the next nbtr.
        if (!whole_nbtr) {
            if (!ai_objEQ(&start_key, cur_key)) {
                whole_nbtr = 1; // bkey disappeared
            }
            else if (qctx->nbtr_done) {
                qctx->nbtr_done = false;
                // Moving to the next key, so its full range must be searched.
                whole_nbtr = 1;
                continue;
            }
        }

        if (holder->is_btree) {
            if (add_recs_from_nbtr(imd, cur_key, holder->u.nbtr, qctx, whole_nbtr)) {
                status = -1;
                break;
            }
        }
        else {
            if (add_recs_from_arr(imd, cur_key, holder->u.arr, qctx)) {
                status = -1;
                break;
            }
        }

        // add_recs_from_arr() returns the entire array and does not honour
        // the batch limit, hence >= rather than ==.
        if (qctx->n_bdigs >= qctx->bsize) {
            break;
        }

        // The last key could not fill the batch: the next key starts on its
        // full range, and its nbtr is not done yet.
        whole_nbtr = 1;
        qctx->nbtr_done = false;
    }

    btReleaseRangeIterator(it);
    return status;
}
/*
 * Aerospike Index interface to build a defrag_list.
 *
 * Walks pimd->ibtr from `icol` (or from the tree's minimum key when
 * icol->empty is set) up to the tree's maximum key and hands every entry to
 * build_defrag_list_from_nbtr() or build_defrag_list_from_arr().
 *
 * Parameters:
 *   imd, pimd     - index metadata and the partition whose tree is scanned
 *   icol          - in/out resume position: the column value last worked on
 *   nofst         - in/out offset inside that column value's sub-structure
 *   limit         - budget of entries to process in this call
 *   tot_processed - incremented by the number of entries processed
 *   tot_found     - incremented (by the helpers) by the digests collected
 *   gc_list       - list receiving the digests to defrag
 *
 * Returns :
 *      AS_SINDEX_DONE     ---> The current pimd has been scanned completely for defragging
 *      AS_SINDEX_CONTINUE ---> Current pimd still may have some candidate digest to be defragged
 *      AS_SINDEX_ERR      ---> Error. Abort this pimd. Also returned when the
 *                              tree is missing or empty, when the key bounds
 *                              cannot be assigned, when no iterator is
 *                              obtained, or when an entry has no value.
 *      AS_SINDEX_ERR_NO_MEMORY ---> get_iname_from_imd() returned NULL
 *
 * Notes : Requires a proper offset value from the caller. The range iterator
 *         created here is released before returning.
 */
int ai_btree_build_defrag_list(as_sindex_metadata *imd, as_sindex_pmetadata *pimd, ai_obj *icol, long *nofst, long limit, uint64_t * tot_processed, uint64_t * tot_found, cf_ll *gc_list) {
    int ret = AS_SINDEX_ERR;

    if (!pimd || !imd) {
        return ret;
    }

    as_namespace *ns = imd->si->ns;
    if (!ns) {
        ns = as_namespace_get_byname((char *)imd->ns_name);
    }

    char *iname = get_iname_from_imd(imd);
    if (!iname) {
        ret = AS_SINDEX_ERR_NO_MEMORY;
        return ret;
    }

    // pimd was already checked above; only the tree needs checking here.
    if (!pimd->ibtr || !pimd->ibtr->numkeys) {
        goto END;
    }

    // Entry is range query, FROM previous icol TO maxKey(ibtr).
    // A failed key assignment would leave the bound unusable, so bail out
    // rather than iterating with it.
    if (icol->empty) {
        if (!assignMinKey(pimd->ibtr, icol)) { // init first call
            goto END;
        }
    }
    ai_obj iH;
    if (!assignMaxKey(pimd->ibtr, &iH)) {
        goto END;
    }

    btEntry *be = NULL;
    btSIter *bi = btGetRangeIter(pimd->ibtr, icol, &iH, 1);
    if (!bi) {
        goto END;
    }

    while ( true ) {
        be = btRangeNext(bi, 1);
        if (!be) {
            ret = AS_SINDEX_DONE;
            break;
        }

        ai_obj  *acol      = be->key;
        ai_nbtr *anbtr     = be->val;
        long     processed = 0;

        if (!anbtr) {
            break; // ret is still AS_SINDEX_ERR
        }

        // NOTE(review): &limit is a long * while build_defrag_list_from_nbtr()
        // declares a ulong * parameter -- confirm the intended type.
        if (anbtr->is_btree) {
            processed = build_defrag_list_from_nbtr(ns, acol, anbtr->u.nbtr, *nofst, &limit, tot_found, gc_list);
        } else {
            processed = build_defrag_list_from_arr(ns, acol, anbtr->u.arr, *nofst, &limit, tot_found, gc_list);
        }

        if (processed < 0) { // error .. abort everything.
            cf_detail(AS_SINDEX, "build_defrag_list returns an error. Aborting defrag on current pimd");
            ret = AS_SINDEX_ERR;
            break;
        }
        *tot_processed += processed;

        // This tree may have some more digest to defrag
        if (limit == 0) {
            *nofst = *nofst + processed;
            ai_objClone(icol, acol);
            cf_detail(AS_SINDEX, "Current pimd may need more iteration of defragging.");
            ret = AS_SINDEX_CONTINUE;
            break;
        }

        // We have finished this tree. Yet we have not reached our limit to defrag.
        // Goes to next iteration
        *nofst = 0;
        ai_objClone(icol, acol);
    }

    btReleaseRangeIterator(bi);

END:
    cf_free(iname);
    return ret;
}
/* SYNTAX
   1.) SCANSELECT * FROM tbl
   2.) SCANSELECT * FROM tbl ORDER_BY_CLAUSE
   3.) SCANSELECT * FROM tbl WHERE clause [ORDER_BY_CLAUSE]

   Handler for the SCANSELECT command: parses the client's arguments, rejects
   the unsupported forms (joins, indexed columns, STORE, COUNT(*) with
   ORDER BY), then walks the table's B-tree and replies with the rows accepted
   by condSelectReply().

   argv[1..3] are handed to parseSelectReply(); argv[4] is either the word
   introducing the WHERE clause or the start of an ORDER BY / STORE clause;
   argv[5] is the WHERE clause text.

   Every exit after init_check_sql_where_clause() goes through tscan_end,
   which releases the sort list and the where-clause state. */
void tscanCommand(redisClient *c) {
    int  cmatchs[MAX_COLUMN_PER_TABLE];
    bool nowc   =  0; /* NO WHERE CLAUSE */
    bool cstar  =  0;
    int  qcols  =  0;
    int  tmatch = -1;
    bool join   =  0;
    sds  where  = (c->argc > 4) ? c->argv[4]->ptr : NULL;
    sds  wc     = (c->argc > 5) ? c->argv[5]->ptr : NULL;
    /* an argument that is present but empty is a syntax error */
    if ((where && !*where) || (wc && !*wc)) {
        addReply(c, shared.scanselectsyntax);
        return;
    }
    /* fills nowc, tmatch, cmatchs, qcols, join and cstar; a zero return ends
       the command here (presumably the error reply has already been sent) */
    if (!parseSelectReply(c, 1, &nowc, &tmatch, cmatchs, &qcols, &join,
                          &cstar, c->argv[1]->ptr, c->argv[2]->ptr,
                          c->argv[3]->ptr, where)) return;
    if (join) {
        addReply(c, shared.scan_join);
        return;
    }
    /* a WHERE keyword without a clause behind it */
    if (!nowc && !wc) {
        addReply(c, shared.scanselectsyntax);
        return;
    }

    cswc_t  w;
    list   *ll = NULL; /* B4 GOTO */
    init_check_sql_where_clause(&w, tmatch, wc); /* on error: GOTO tscan_end */

    if (nowc && c->argc > 4) { /* ORDER BY or STORE w/o WHERE CLAUSE */
        if (!strncasecmp(where, "ORDER ", 6) ||
            !strncasecmp(where, "STORE ", 6)) {
            if (!parseWCAddtlSQL(c, c->argv[4]->ptr, &w)) goto tscan_end;
            if (w.lvr) {
                /* give w.lvr its own sds copy of the leftover text */
                w.lvr = sdsnewlen(w.lvr, strlen(w.lvr));
                if (!leftoverParsingReply(c, w.lvr)) goto tscan_end;
            }
            if (w.wtype > SQL_STORE_LOOKUP_MASK) { /* STORE after ORDER BY */
                addReply(c, shared.scan_store);
                goto tscan_end;
            }
        }
    }
    if (nowc && !w.nob && c->argc > 4) { /* argv[4] parse error */
        w.lvr = sdsdup(where);
        leftoverParsingReply(c, w.lvr);
        goto tscan_end;
    }

    if (!nowc && !w.nob) { /* WhereClause exists and no ORDER BY */
        parseWCReply(c, &w, SQL_SCANSELECT, 1);
        if (w.wtype == SQL_ERR_LOOKUP) goto tscan_end;
        if (!leftoverParsingReply(c, w.lvr)) goto tscan_end;
        if (w.imatch != -1) { /* disallow SCANSELECT on indexed columns */
            addReply(c, shared.scan_on_index);
            goto tscan_end;
        }
        if (w.wtype > SQL_STORE_LOOKUP_MASK) { /* no SCAN STOREs (for now) */
            addReply(c, shared.scan_store);
            goto tscan_end;
        }
    }

    if (cstar && w.nob) { /* SCANSELECT COUNT(*) ORDER BY -> stupid */
        addReply(c, shared.orderby_count);
        goto tscan_end;
    }

    /* NOTE(review): the lookup result is dereferenced without a NULL check --
       presumably the table was validated during parsing; confirm. */
    robj *btt = lookupKeyRead(c->db, Tbl[server.dbid][w.tmatch].name);
    bt   *btr = (bt *)btt->ptr;
    if (cstar && nowc) { /* SCANSELECT COUNT(*) FROM tbl */
        /* no filtering needed: the key count is the answer */
        addReplyLongLong(c, (long long)btr->numkeys);
        goto tscan_end;
    }

    // TODO on "fk_lim" iterate on FK (not PK)
    //if (w.nob) w.imatch = find_index(w.tmatch, w.obc);
    fr_t fr;
    qr_t q;
    setQueued(&w, &q);
    ll = initOBsort(q.qed, &w);
    init_filter_row(&fr, c, btr, &w, &q, qcols, cmatchs, nowc, ll, cstar,
                    OBY_FREE_ROBJ);
    //dumpW(&w, w.wtype);
    /* `card` and `lenobj` used below are not declared in this function;
       presumably the LEN_OBJ macro introduces them */
    LEN_OBJ
    btEntry *be;
    long     sent  =  0;
    long     loops = -1; /* pre-incremented, so the first row is loop 0 */
    /* with q.pk_lo set the iterator itself starts at offset w.ofst */
    btSIter *bi    = q.pk_lo ? btGetFullIteratorXth(btr, w.ofst):
                               btGetFullRangeIterator(btr);
    while ((be = btRangeNext(bi)) != NULL) {
        loops++;
        if (q.pk_lim) {
            /* offset not applied by the iterator: skip the leading rows */
            if (!q.pk_lo && w.ofst != -1 && loops < w.ofst) continue;
            sent++;
            if (w.lim == card) break; /* ORDRBY PK LIM */
        }
        condSelectReply(&fr, be->key, be->val, &card);
    }
    btReleaseRangeIterator(bi);

    /* rows were queued for sorting instead of sent: sort and send them now */
    if (q.qed && card) opSelectOnSort(c, ll, &w, fr.ofree, &sent);

    /* finalise the reply's length header */
    if (w.lim != -1 && sent < card) card = sent;
    if (cstar) lenobj->ptr = sdscatprintf(sdsempty(), ":%ld\r\n", card);
    else       lenobj->ptr = sdscatprintf(sdsempty(), "*%ld\r\n", card);

    if (w.ovar) incrOffsetVar(c, &w, card);

tscan_end:
    /* reached with ll == NULL on the early error paths */
    releaseOBsort(ll);
    destroy_check_sql_where_clause(&w);
}