/*
 * __wt_curjoin_join --
 *     Add a new join to a join cursor.
 *
 *     The reference cursor is either merged into the entry that already
 *     tracks the same index, or placed in a newly created entry. A reference
 *     cursor whose URI starts with "join:" is attached to the entry as a
 *     nested join; any other cursor becomes an endpoint (a comparison) of
 *     the entry.
 *
 *     "flags" carries WT_CURJOIN_ENTRY_* settings and "range" carries the
 *     WT_CURJOIN_END_* comparison bits for the new endpoint. "count",
 *     "bloom_bit_count" and "bloom_hash_count" are recorded on the entry.
 *
 *     Returns zero on success. EINVAL is returned, with a message, when the
 *     request conflicts with what has already been joined; failures reported
 *     by the helpers called here are returned as well.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
    WT_CURSOR_INDEX *icursor;
    WT_CURSOR_JOIN *subjoin;
    WT_CURSOR_JOIN_ENDPOINT *endp;
    WT_CURSOR_JOIN_ENTRY *cand, *je;
    size_t fmtsize;
    uint8_t endrange;
    u_int inspos, plainpos, pos;
    bool found_inspos, is_nested, range_eq, range_gt, range_lt, seen_plain;

    je = NULL;
    found_inspos = seen_plain = false;
    inspos = plainpos = 0; /* -Wuninitialized */

    /* Classify the requested comparison once, it never changes below. */
    range_eq = (range == WT_CURJOIN_END_EQ);
    range_gt = ((range & WT_CURJOIN_END_GT) != 0);
    range_lt = ((range & WT_CURJOIN_END_LT) != 0);

    /*
     * The first join decides whether the cursor is a disjunction; every
     * later join has to ask for the same operation.
     */
    if (cjoin->entries_next == 0) {
        if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION)) {
            F_SET(cjoin, WT_CURJOIN_DISJUNCTION);
        }
    } else if (LF_ISSET(WT_CURJOIN_ENTRY_DISJUNCTION)) {
        if (!F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
            WT_RET_MSG(session, EINVAL,
                "operation=or does not match previous operation=and");
        }
    } else if (F_ISSET(cjoin, WT_CURJOIN_DISJUNCTION)) {
        WT_RET_MSG(session, EINVAL,
            "operation=and does not match previous operation=or");
    }

    is_nested = WT_PREFIX_MATCH(ref_cursor->uri, "join:");
    if (is_nested) {
        if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM)) {
            WT_RET_MSG(session, EINVAL,
                "Bloom filters cannot be used with subjoins");
        }
    } else {
        /*
         * Look for an entry on the same index that is not a subjoin. While
         * scanning, remember the first entry past slot zero that has no
         * Bloom filter: a new Bloom entry would be placed there.
         */
        for (pos = 0; pos < cjoin->entries_next; pos++) {
            cand = &cjoin->entries[pos];
            if (cand->index == idx && cand->subjoin == NULL) {
                je = cand;
                break;
            }
            if (!seen_plain && pos > 0 &&
                !F_ISSET(cand, WT_CURJOIN_ENTRY_BLOOM)) {
                seen_plain = true;
                plainpos = pos;
            }
        }
    }

    if (je != NULL) {
        /* An entry for this index exists: the new join is merged into it. */
        if (count != 0 && je->count != 0 && je->count != count) {
            WT_RET_MSG(session, EINVAL,
                "count=%" PRIu64 " does not match "
                "previous count=%" PRIu64 " for this index",
                count, je->count);
        }
        if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
            F_MASK(je, WT_CURJOIN_ENTRY_BLOOM)) {
            WT_RET_MSG(session, EINVAL,
                "join has incompatible strategy "
                "values for the same index");
        }
        if (LF_MASK(WT_CURJOIN_ENTRY_FALSE_POSITIVES) !=
            F_MASK(je, WT_CURJOIN_ENTRY_FALSE_POSITIVES)) {
            WT_RET_MSG(session, EINVAL,
                "join has incompatible bloom_false_positives "
                "values for the same index");
        }

        /*
         * Validate the new comparison against the endpoints already held
         * by this entry. The accepted shapes are:
         * - one or more "eq" (disjunction only)
         * - exactly one "eq" (conjunction)
         * - exactly one of "gt"/"ge" (conjunction or disjunction)
         * - exactly one of "lt"/"le" (conjunction or disjunction)
         * - one of "gt"/"ge" together with one of "lt"/"le"
         *   (currently restricted to conjunction).
         *
         * Other expressible combinations either make no sense
         * (X == 3 AND X == 5) or are reducible (X < 7 AND X < 9). Cases
         * such as (X < 7 OR X > 15) or (X == 4 OR X > 15) are meaningful
         * but not handled yet.
         */
        for (pos = 0; pos < je->ends_next; pos++) {
            endp = &je->ends[pos];
            endrange = WT_CURJOIN_END_RANGE(endp);

            if ((F_ISSET(endp, WT_CURJOIN_END_GT) &&
                (range_gt || range_eq)) ||
                (F_ISSET(endp, WT_CURJOIN_END_LT) &&
                (range_lt || range_eq)) ||
                (endrange == WT_CURJOIN_END_EQ &&
                (range_lt || range_gt))) {
                WT_RET_MSG(session, EINVAL,
                    "join has overlapping ranges");
            }
            if (range_eq && endrange == WT_CURJOIN_END_EQ &&
                !F_ISSET(je, WT_CURJOIN_ENTRY_DISJUNCTION)) {
                WT_RET_MSG(session, EINVAL,
                    "compare=eq can only be combined "
                    "using operation=or");
            }

            /*
             * Endpoints are kept ordered: "gt"/"ge" first, then any
             * number of "eq", then "lt"/"le". Only the first suitable
             * slot is recorded.
             */
            if (!found_inspos &&
                (range_gt ||
                (range_eq && endrange != WT_CURJOIN_END_EQ &&
                !F_ISSET(endp, WT_CURJOIN_END_GT)))) {
                inspos = pos;
                found_inspos = true;
            }
        }

        /* Every check passed, fold the new configuration in. */
        je->count = count;
        je->bloom_bit_count = WT_MAX(je->bloom_bit_count, bloom_bit_count);
        je->bloom_hash_count =
            WT_MAX(je->bloom_hash_count, bloom_hash_count);
    } else {
        /* No entry for this index yet: make room for one more. */
        WT_RET(__wt_realloc_def(session, &cjoin->entries_allocated,
            cjoin->entries_next + 1, &cjoin->entries));

        je = &cjoin->entries[cjoin->entries_next];
        if (seen_plain && LF_ISSET(WT_CURJOIN_ENTRY_BLOOM)) {
            /*
             * Keep the list ordered so that, after the first entry, the
             * Bloom filtered entries come before the non-Bloom ones. Once
             * the filters are built, membership checks through Bloom are
             * faster and need less I/O, so they should be asked first.
             * Shift the non-Bloom tail up by one slot and take the
             * cleared slot for the new entry.
             */
            je = &cjoin->entries[plainpos];
            memmove(je + 1, je,
                (cjoin->entries_next - plainpos) * sizeof(*je));
            memset(je, 0, sizeof(*je));
        }

        je->index = idx;
        je->flags = flags;
        je->count = count;
        je->bloom_bit_count = bloom_bit_count;
        je->bloom_hash_count = bloom_hash_count;
        ++cjoin->entries_next;
    }

    /* A nested join is linked to the entry and needs no endpoint. */
    if (is_nested) {
        subjoin = (WT_CURSOR_JOIN *)ref_cursor;
        je->subjoin = subjoin;
        subjoin->parent = cjoin;
        return (0);
    }

    /* Without a recorded slot the new endpoint goes at the end. */
    if (!found_inspos)
        inspos = je->ends_next;
    WT_RET(__curjoin_insert_endpoint(session, je, inspos, &endp));
    endp->cursor = ref_cursor;
    F_SET(endp, range);

    /* Nothing more to do if the main cursor exists or there is no index. */
    if (je->main != NULL || idx == NULL)
        return (0);

    /* Open the main file with a projection of the indexed columns. */
    WT_RET(__curjoin_open_main(session, cjoin, je));

    /*
     * When index keys are repacked to remove the primary key, a trailing
     * 'u' must never be transformed. Appending the no-op padding "0x" to
     * the key format forces that; the buffer holds the format, the two
     * padding characters and the terminating nul.
     */
    icursor = (WT_CURSOR_INDEX *)ref_cursor;
    fmtsize = strlen(icursor->iface.key_format) + 3;
    WT_RET(__wt_calloc(session, fmtsize, 1, &je->repack_format));
    WT_RET(__wt_snprintf(
        je->repack_format, fmtsize, "%s0x", icursor->iface.key_format));

    return (0);
}
/*
 * __wt_curjoin_join --
 *     Add a new join to a join cursor.
 *
 *     The reference cursor becomes a new endpoint (a comparison) of the
 *     entry that tracks "idx"; the entry is created if none exists yet.
 *     "flags" carries WT_CURJOIN_ENTRY_* settings and "range" the
 *     WT_CURJOIN_END_* comparison bits. "count", "bloom_bit_count" and
 *     "bloom_hash_count" are recorded on the entry.
 *
 *     When the entry has an index but no main cursor yet, a cursor is opened
 *     with the "raw" configuration on a URI made of the table name followed
 *     by the index's column configuration.
 *
 *     Returns zero on success. EINVAL is reported, with a message, when the
 *     request conflicts with what was already joined on the same index;
 *     failures from the allocation and cursor-open helpers are returned too.
 */
int
__wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin,
    WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range,
    uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count)
{
    WT_CURSOR_JOIN_ENTRY *entry;
    WT_DECL_RET;
    WT_CURSOR_JOIN_ENDPOINT *end, *newend;
    bool hasins, needbloom, range_eq;
    u_int i, ins, nonbloom;
    const char *raw_cfg[] = { WT_CONFIG_BASE(
        session, WT_SESSION_open_cursor), "raw", NULL };
    char *main_uri;
    size_t namesize, newsize;

    entry = NULL;
    hasins = needbloom = false;
    ins = 0; /* -Wuninitialized */
    main_uri = NULL;
    nonbloom = 0; /* -Wuninitialized */

    /*
     * Look for an entry already tracking this index. While scanning,
     * remember the first entry past slot zero that has no Bloom filter: a
     * new Bloom entry would be placed there.
     */
    for (i = 0; i < cjoin->entries_next; i++) {
        if (cjoin->entries[i].index == idx) {
            entry = &cjoin->entries[i];
            break;
        }
        if (!needbloom && i > 0 &&
            !F_ISSET(&cjoin->entries[i], WT_CURJOIN_ENTRY_BLOOM)) {
            needbloom = true;
            nonbloom = i;
        }
    }

    if (entry == NULL) {
        WT_ERR(__wt_realloc_def(session, &cjoin->entries_allocated,
            cjoin->entries_next + 1, &cjoin->entries));
        if (LF_ISSET(WT_CURJOIN_ENTRY_BLOOM) && needbloom) {
            /*
             * Reorder the list so that after the first entry,
             * the Bloom filtered entries come next, followed by
             * the non-Bloom entries. Once the Bloom filters
             * are built, determining membership via Bloom is
             * faster than without Bloom, so we can answer
             * membership questions more quickly, and with less
             * I/O, with the Bloom entries first.
             */
            entry = &cjoin->entries[nonbloom];
            memmove(entry + 1, entry,
                (cjoin->entries_next - nonbloom) *
                sizeof(WT_CURSOR_JOIN_ENTRY));
            memset(entry, 0, sizeof(WT_CURSOR_JOIN_ENTRY));
        } else
            entry = &cjoin->entries[cjoin->entries_next];
        entry->index = idx;
        entry->flags = flags;
        entry->count = count;
        entry->bloom_bit_count = bloom_bit_count;
        entry->bloom_hash_count = bloom_hash_count;
        ++cjoin->entries_next;
    } else {
        /* Merge the join into an existing entry for this index */
        if (count != 0 && entry->count != 0 && entry->count != count)
            WT_ERR_MSG(session, EINVAL,
                "count=%" PRIu64 " does not match "
                "previous count=%" PRIu64 " for this index",
                count, entry->count);
        if (LF_MASK(WT_CURJOIN_ENTRY_BLOOM) !=
            F_MASK(entry, WT_CURJOIN_ENTRY_BLOOM))
            WT_ERR_MSG(session, EINVAL,
                "join has incompatible strategy "
                "values for the same index");

        /*
         * Check against other comparisons (we call them endpoints)
         * already set up for this index.
         * We allow either:
         * - one or more "eq" (with disjunction)
         * - exactly one "eq" (with conjunction)
         * - exactly one of "gt" or "ge" (conjunction or disjunction)
         * - exactly one of "lt" or "le" (conjunction or disjunction)
         * - one of "gt"/"ge" along with one of "lt"/"le"
         *   (currently restricted to conjunction).
         *
         * Some other combinations, although expressible either do
         * not make sense (X == 3 AND X == 5) or are reducible (X <
         * 7 AND X < 9). Other specific cases of (X < 7 OR X > 15)
         * or (X == 4 OR X > 15) make sense but we don't handle yet.
         */
        /* The requested comparison is fixed, classify it once. */
        range_eq = (range == WT_CURJOIN_END_EQ);
        for (i = 0; i < entry->ends_next; i++) {
            end = &entry->ends[i];
            if ((F_ISSET(end, WT_CURJOIN_END_GT) &&
                ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) ||
                (F_ISSET(end, WT_CURJOIN_END_LT) &&
                ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) ||
                (end->flags == WT_CURJOIN_END_EQ &&
                (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT))
                != 0))
                WT_ERR_MSG(session, EINVAL,
                    "join has overlapping ranges");
            if (range_eq &&
                end->flags == WT_CURJOIN_END_EQ &&
                !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION))
                WT_ERR_MSG(session, EINVAL,
                    "compare=eq can only be combined "
                    "using operation=or");

            /*
             * Sort "gt"/"ge" to the front, followed by any number
             * of "eq", and finally "lt"/"le". Only the first
             * suitable slot is recorded.
             */
            if (!hasins &&
                ((range & WT_CURJOIN_END_GT) != 0 ||
                (range_eq && !F_ISSET(end, WT_CURJOIN_END_GT)))) {
                ins = i;
                hasins = true;
            }
        }
        /* All checks completed, merge any new configuration now */
        entry->count = count;
        entry->bloom_bit_count =
            WT_MAX(entry->bloom_bit_count, bloom_bit_count);
        entry->bloom_hash_count =
            WT_MAX(entry->bloom_hash_count, bloom_hash_count);
    }

    /*
     * Grow the endpoint array by one and open a cleared slot at the chosen
     * position; without a recorded position the endpoint is appended.
     */
    WT_ERR(__wt_realloc_def(session, &entry->ends_allocated,
        entry->ends_next + 1, &entry->ends));
    if (!hasins)
        ins = entry->ends_next;
    newend = &entry->ends[ins];
    memmove(newend + 1, newend,
        (entry->ends_next - ins) * sizeof(WT_CURSOR_JOIN_ENDPOINT));
    memset(newend, 0, sizeof(WT_CURSOR_JOIN_ENDPOINT));
    entry->ends_next++;
    newend->cursor = ref_cursor;
    F_SET(newend, range);

    /* Open the main file with a projection of the indexed columns. */
    if (entry->main == NULL && entry->index != NULL) {
        /*
         * The URI is the table name followed by the index's column
         * configuration; the buffer is sized for both plus the nul, so
         * the formatted string cannot be truncated.
         */
        namesize = strlen(cjoin->table->name);
        newsize = namesize + entry->index->colconf.len + 1;
        WT_ERR(__wt_calloc(session, 1, newsize, &main_uri));
        snprintf(main_uri, newsize, "%s%.*s",
            cjoin->table->name, (int)entry->index->colconf.len,
            entry->index->colconf.str);
        WT_ERR(__wt_open_cursor(session, main_uri,
            (WT_CURSOR *)cjoin, raw_cfg, &entry->main));
    }

err:
    /* The URI is only needed to open the cursor, release it on all paths. */
    if (main_uri != NULL)
        __wt_free(session, main_uri);
    return (ret);
}