/* * _bt_mkscankey_nodata * Build an insertion scan key that contains 3-way comparator routines * appropriate to the key datatypes, but no comparison data. The * comparison data ultimately used must match the key datatypes. * * The result cannot be used with _bt_compare(), unless comparison * data is first stored into the key entries. Currently this * routine is only called by nbtsort.c and tuplesort.c, which have * their own comparison routines. */ ScanKey _bt_mkscankey_nodata(Relation rel) { ScanKey skey; int natts; int16 *indoption; int i; natts = RelationGetNumberOfAttributes(rel); indoption = rel->rd_indoption; skey = (ScanKey) palloc(natts * sizeof(ScanKeyData)); for (i = 0; i < natts; i++) { FmgrInfo *procinfo; int flags; /* * We can use the cached (default) support procs since no cross-type * comparison can be needed. */ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); flags = SK_ISNULL | (indoption[i] << SK_BT_INDOPTION_SHIFT); ScanKeyEntryInitializeWithInfo(&skey[i], flags, (AttrNumber) (i + 1), InvalidStrategy, InvalidOid, procinfo, (Datum) 0); } return skey; }
/* * Given a WITH(...) clause and no other column encoding directives -- such as * in the case of CREATE TABLE WITH () AS SELECT -- fill in the column encoding * catalog entries for that relation. */ void AddDefaultRelationAttributeOptions(Relation rel, List *options) { Datum opts; AttrNumber attno; List *ce; /* only supported on AOCO at this stage */ if (true) return; ce = form_default_storage_directive(options); if (!ce) ce = default_column_encoding_clause(); ce = transformStorageEncodingClause(ce); opts = transformRelOptions(PointerGetDatum(NULL), ce, true, false); for (attno = 1; attno <= RelationGetNumberOfAttributes(rel); attno++) add_attribute_encoding_entry(RelationGetRelid(rel), attno, opts); CommandCounterIncrement(); }
/* Returns an array of block sizes -- one entry for each user column in rel. */ uint32 * RelationGetColumnBlocksize(Relation rel) { uint32 *bz = palloc(RelationGetNumberOfAttributes(rel) * sizeof(uint32)); StdRdOptions **opts = RelationGetAttributeOptions(rel); int i; for (i = 0; i < RelationGetNumberOfAttributes(rel); i++) { if (opts[i] == NULL) bz[i] = DEFAULT_APPENDONLY_BLOCK_SIZE; else bz[i] = opts[i]->blocksize; } return bz; }
static void gp_statistics_estimate_reltuples_relpages_ao_cs(Relation rel, float4 *reltuples, float4 *relpages) { AOCSFileSegInfo **aocsInfo = NULL; int nsegs = 0; double totalBytes = 0; AppendOnlyEntry *aoEntry; int64 hidden_tupcount; AppendOnlyVisimap visimap; /** * Ensure that the right kind of relation with the right type of storage is passed to us. */ Assert(rel->rd_rel->relkind == RELKIND_RELATION); Assert(RelationIsAoCols(rel)); *reltuples = 0.0; *relpages = 0.0; /* get table level statistics from the pg_aoseg table */ aoEntry = GetAppendOnlyEntry(RelationGetRelid(rel), SnapshotNow); aocsInfo = GetAllAOCSFileSegInfo(rel, aoEntry, SnapshotNow, &nsegs); if (aocsInfo) { int i = 0; int j = 0; for(i = 0; i < nsegs; i++) { for(j = 0; j < RelationGetNumberOfAttributes(rel); j++) { AOCSVPInfoEntry *e = getAOCSVPEntry(aocsInfo[i], j); Assert(e); totalBytes += e->eof_uncompressed; } /* Do not include tuples from an awaiting drop segment file */ if (aocsInfo[i]->state != AOSEG_STATE_AWAITING_DROP) { *reltuples += aocsInfo[i]->total_tupcount; } } /** * The planner doesn't understand AO's blocks, so need this method to try to fudge up a number for * the planner. */ *relpages = RelationGuessNumberOfBlocks(totalBytes); } AppendOnlyVisimap_Init(&visimap, aoEntry->visimaprelid, aoEntry->visimapidxid, AccessShareLock, SnapshotNow); hidden_tupcount = AppendOnlyVisimap_GetRelationHiddenTupleCount(&visimap); AppendOnlyVisimap_Finish(&visimap, AccessShareLock); (*reltuples) -= hidden_tupcount; pfree(aoEntry); return; }
/** * Drops a segment file. * */ static void AOCSCompaction_DropSegmentFile(Relation aorel, int segno) { ItemPointerData persistentTid; int64 persistentSerialNum; int pseudoSegNo; int col; Assert(RelationIsAoCols(aorel)); for (col = 0; col < RelationGetNumberOfAttributes(aorel); col++) { pseudoSegNo = (col * AOTupleId_MultiplierSegmentFileNum) + segno; if (!ReadGpRelationNode( aorel->rd_rel->reltablespace, aorel->rd_rel->relfilenode, pseudoSegNo, &persistentTid, &persistentSerialNum)) { /* There is nothing to drop */ return; } elogif(Debug_appendonly_print_compaction, LOG, "Drop segment file: " "segno %d", pseudoSegNo); MirroredFileSysObj_ScheduleDropAppendOnlyFile( &aorel->rd_node, pseudoSegNo, RelationGetRelationName(aorel), &persistentTid, persistentSerialNum); DeleteGpRelationNodeTuple(aorel, pseudoSegNo); } }
/* * _bt_mkscankey * Build an insertion scan key that contains comparison data from itup * as well as comparator routines appropriate to the key datatypes. * * The result is intended for use with _bt_compare(). */ ScanKey _bt_mkscankey(Relation rel, IndexTuple itup) { ScanKey skey; TupleDesc itupdesc; int natts; int16 *indoption; int i; itupdesc = RelationGetDescr(rel); natts = RelationGetNumberOfAttributes(rel); indoption = rel->rd_indoption; skey = (ScanKey) palloc(natts * sizeof(ScanKeyData)); for (i = 0; i < natts; i++) { FmgrInfo *procinfo; Datum arg; bool null; int flags; /* * We can use the cached (default) support procs since no cross-type * comparison can be needed. */ procinfo = index_getprocinfo(rel, i + 1, BTORDER_PROC); arg = index_getattr(itup, i + 1, itupdesc, &null); flags = (null ? SK_ISNULL : 0) | (indoption[i] << SK_BT_INDOPTION_SHIFT); ScanKeyEntryInitializeWithInfo(&skey[i], flags, (AttrNumber) (i + 1), InvalidStrategy, InvalidOid, rel->rd_indcollation[i], procinfo, arg); } return skey; }
/* * Drops a segment file. * * Actually, we just truncate the segfile to 0 bytes, to reclaim the space. * Before GPDB 6, we used to remove the file, but with WAL replication, we * no longer have a convenient function to remove a single segment of a * relation. An empty file is as almost as good as a non-existent file. If * the relation is dropped later, the code in mdunlink() will remove all * segments, including any empty ones we've left behind. */ static void AOCSCompaction_DropSegmentFile(Relation aorel, int segno) { int col; Assert(RelationIsAoCols(aorel)); for (col = 0; col < RelationGetNumberOfAttributes(aorel); col++) { char filenamepath[MAXPGPATH]; int pseudoSegNo; File fd; /* Open and truncate the relation segfile */ MakeAOSegmentFileName(aorel, segno, col, &pseudoSegNo, filenamepath); elogif(Debug_appendonly_print_compaction, LOG, "Drop segment file: " "segno %d", pseudoSegNo); fd = OpenAOSegmentFile(aorel, filenamepath, pseudoSegNo, 0); if (fd >= 0) { TruncateAOSegmentFile(fd, aorel, pseudoSegNo, 0); CloseAOSegmentFile(fd); } else { /* * The file we were about to drop/truncate didn't exist. That's normal, * for example, if a column is added with ALTER TABLE ADD COLUMN. */ elog(DEBUG1, "could not truncate segfile %s, because it does not exist", filenamepath); } } }
/* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. */ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { BTPageState *state = NULL; bool merge = (btspool2 != NULL); IndexTuple itup, itup2 = NULL; bool load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey = NULL; SortSupport sortKeys; if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge * btspool and btspool2. */ /* the preparation of merge */ itup = tuplesort_getindextuple(btspool->sortstate, true); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); indexScanKey = _bt_mkscankey_nodata(wstate->index); /* Prepare SortSupport data for each column */ sortKeys = (SortSupport) palloc0(keysz * sizeof(SortSupportData)); for (i = 0; i < keysz; i++) { SortSupport sortKey = sortKeys + i; ScanKey scanKey = indexScanKey + i; int16 strategy; sortKey->ssup_cxt = CurrentMemoryContext; sortKey->ssup_collation = scanKey->sk_collation; sortKey->ssup_nulls_first = (scanKey->sk_flags & SK_BT_NULLS_FIRST) != 0; sortKey->ssup_attno = scanKey->sk_attno; /* Abbreviation is not supported here */ sortKey->abbreviate = false; AssertState(sortKey->ssup_attno != 0); strategy = (scanKey->sk_flags & SK_BT_DESC) != 0 ? BTGreaterStrategyNumber : BTLessStrategyNumber; PrepareSortSupportFromIndexRel(wstate->index, strategy, sortKey); } _bt_freeskey(indexScanKey); for (;;) { load1 = true; /* load BTSpool next ? */ if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { for (i = 1; i <= keysz; i++) { SortSupport entry; Datum attrDatum1, attrDatum2; bool isNull1, isNull2; int32 compare; entry = sortKeys + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); compare = ApplySortComparator(attrDatum1, isNull1, attrDatum2, isNull2, entry); if (compare > 0) { load1 = false; break; } else if (compare < 0) break; } } else load1 = false; /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { _bt_buildadd(wstate, state, itup); itup = tuplesort_getindextuple(btspool->sortstate, true); } else { _bt_buildadd(wstate, state, itup2); itup2 = tuplesort_getindextuple(btspool2->sortstate, true); } } pfree(sortKeys); } else { /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); _bt_buildadd(wstate, state, itup); } } /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index is WAL-logged, we must fsync it down to disk before it's * safe to commit the transaction. (For a non-WAL-logged index we don't * care since the index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } }
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; /* * Allocate per-column info arrays. To save a few palloc cycles * we allocate all the Oid-type arrays in one request. Note that * the opfamily array needs an extra, terminating zero at the end. * We pre-zero the ordering info in case the index is unordered. */ info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->opfamily = (Oid *) palloc0(sizeof(Oid) * (4 * ncolumns + 1)); info->opcintype = info->opfamily + (ncolumns + 1); info->fwdsortop = info->opcintype + ncolumns; info->revsortop = info->fwdsortop + ncolumns; info->nulls_first = (bool *) palloc0(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering operators associated with the index, if any. * We expect that all ordering-capable indexes use btree's * strategy numbers for the ordering operators. */ if (indexRelation->rd_am->amcanorder) { int nstrat = indexRelation->rd_am->amstrategies; for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; int fwdstrat; int revstrat; if (opt & INDOPTION_DESC) { fwdstrat = BTGreaterStrategyNumber; revstrat = BTLessStrategyNumber; } else { fwdstrat = BTLessStrategyNumber; revstrat = BTGreaterStrategyNumber; } /* * Index AM must have a fixed set of strategies for it to * make sense to specify amcanorder, so we need not allow * the case amstrategies == 0. */ if (fwdstrat > 0) { Assert(fwdstrat <= nstrat); info->fwdsortop[i] = indexRelation->rd_operator[i * nstrat + fwdstrat - 1]; } if (revstrat > 0) { Assert(revstrat <= nstrat); info->revsortop[i] = indexRelation->rd_operator[i * nstrat + revstrat - 1]; } info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); /* Temporary and unlogged relations are inaccessible during recovery. */ if (!RelationNeedsWAL(relation) && RecoveryInProgress()) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("cannot access temporary or unlogged relations during recovery"))); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); rel->reltablespace = RelationGetForm(relation)->reltablespace; Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples, &rel->allvisfrac); /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } /* * If the index is valid, but cannot yet be used, ignore it; but * mark the plan we are generating as transient. See * src/backend/access/heap/README.HOT for discussion. */ if (index->indcheckxmin && !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRelation->rd_indextuple->t_data), TransactionXmin)) { root->glob->transientPlan = true; index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->reltablespace = RelationGetForm(indexRelation)->reltablespace; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->indexcollations = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->opcintype = (Oid *) palloc(sizeof(Oid) * ncolumns); for (i = 0; i < ncolumns; i++) { info->indexkeys[i] = index->indkey.values[i]; info->indexcollations[i] = indexRelation->rd_indcollation[i]; info->opfamily[i] = indexRelation->rd_opfamily[i]; info->opcintype[i] = indexRelation->rd_opcintype[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->canreturn = index_can_return(indexRelation); info->amcanorderbyop = indexRelation->rd_am->amcanorderbyop; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; info->amsearcharray = indexRelation->rd_am->amsearcharray; info->amsearchnulls = indexRelation->rd_am->amsearchnulls; info->amhasgettuple = OidIsValid(indexRelation->rd_am->amgettuple); info->amhasgetbitmap = OidIsValid(indexRelation->rd_am->amgetbitmap); /* * Fetch the ordering information for the index, if any. */ if (info->relam == BTREE_AM_OID) { /* * If it's a btree index, we can use its opfamily OIDs * directly as the sort ordering opfamily OIDs. */ Assert(indexRelation->rd_am->amcanorder); info->sortopfamily = info->opfamily; info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; } } else if (indexRelation->rd_am->amcanorder) { /* * Otherwise, identify the corresponding btree opfamilies by * trying to map this index's "<" operators into btree. Since * "<" uniquely defines the behavior of a sort order, this is * a sufficient test. * * XXX This method is rather slow and also requires the * undesirable assumption that the other index AM numbers its * strategies the same as btree. It'd be better to have a way * to explicitly declare the corresponding btree opfamily for * each opfamily of the other index type. But given the lack * of current or foreseeable amcanorder index types, it's not * worth expending more effort on now. */ info->sortopfamily = (Oid *) palloc(sizeof(Oid) * ncolumns); info->reverse_sort = (bool *) palloc(sizeof(bool) * ncolumns); info->nulls_first = (bool *) palloc(sizeof(bool) * ncolumns); for (i = 0; i < ncolumns; i++) { int16 opt = indexRelation->rd_indoption[i]; Oid ltopr; Oid btopfamily; Oid btopcintype; int16 btstrategy; info->reverse_sort[i] = (opt & INDOPTION_DESC) != 0; info->nulls_first[i] = (opt & INDOPTION_NULLS_FIRST) != 0; ltopr = get_opfamily_member(info->opfamily[i], info->opcintype[i], info->opcintype[i], BTLessStrategyNumber); if (OidIsValid(ltopr) && get_ordering_op_properties(ltopr, &btopfamily, &btopcintype, &btstrategy) && btopcintype == info->opcintype[i] && btstrategy == BTLessStrategyNumber) { /* Successful mapping */ info->sortopfamily[i] = btopfamily; } else { /* Fail ... quietly treat index as unordered */ info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; break; } } } else { info->sortopfamily = NULL; info->reverse_sort = NULL; info->nulls_first = NULL; } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); /* Build targetlist using the completed indexprs data */ info->indextlist = build_index_tlist(root, info, relation); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; info->immediate = index->indimmediate; info->hypothetical = false; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { double allvisfrac; /* dummy */ estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples, &allvisfrac); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation, NoLock); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. * * If inhparent is true, all we need to do is set up the attr arrays: * the RelOptInfo actually represents the appendrel formed by an inheritance * tree, and so the parent rel's physical size and index information isn't * important for it. */ void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; bool needs_longlock; /* * We need not lock the relation since it was already locked, either by * the rewriter or when expand_inherited_rtentry() added it to the query's * rangetable. */ relation = heap_open(relationObjectId, NoLock); needs_longlock = rel_needs_long_lock(relationObjectId); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * CDB: Get partitioning key info for distributed relation. */ rel->cdbpolicy = RelationGetPartitioningKey(relation); /* * Estimate relation size --- unless it's an inheritance parent, in which * case the size will be computed later in set_append_rel_pathlist, and we * must leave it zero for now to avoid bollixing the total_table_pages * calculation. */ if (!inhparent) { cdb_estimate_rel_size ( rel, relation, relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples, &rel->cdb_default_stats_used ); } /* * Make list of indexes. Ignore indexes on system catalogs if told to. * Don't bother with indexes for an inheritance parent, either. */ if (inhparent || (IgnoreSystemIndexes && IsSystemClass(relation->rd_rel))) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; LOCKMODE lmode; /* Warn if indexed table needs ANALYZE. */ if (rel->cdb_default_stats_used) cdb_default_stats_warning_for_table(relation->rd_id); indexoidlist = RelationGetIndexList(relation); /* * For each index, we get the same type of lock that the executor will * need, and do not release it. This saves a couple of trips to the * shared lock manager while not creating any real loss of * concurrency, because no schema changes could be happening on the * index while we hold lock on the parent rel, and neither lock type * blocks any other kind of index operation. */ if (rel->relid == root->parse->resultRelation) lmode = RowExclusiveLock; else lmode = AccessShareLock; foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; int16 amorderstrategy; /* * Extract info from the relation descriptor for the index. */ indexRelation = index_open(indexoid, lmode); index = indexRelation->rd_index; /* * Ignore invalid indexes, since they can't safely be used for * queries. Note that this is OK because the data structure we * are constructing is only used by the planner --- the executor * still needs to insert into "invalid" indexes! */ if (!index->indisvalid) { index_close(indexRelation, NoLock); continue; } info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; /* * Need to make classlist and ordering arrays large enough to put * a terminating 0 at the end of each one. */ info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->classlist = (Oid *) palloc0(sizeof(Oid) * (ncolumns + 1)); info->ordering = (Oid *) palloc0(sizeof(Oid) * (ncolumns + 1)); for (i = 0; i < ncolumns; i++) { info->classlist[i] = indexRelation->rd_indclass->values[i]; info->indexkeys[i] = index->indkey.values[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; /* * Fetch the ordering operators associated with the index, if any. */ amorderstrategy = indexRelation->rd_am->amorderstrategy; if (amorderstrategy != 0) { int oprindex = amorderstrategy - 1; for (i = 0; i < ncolumns; i++) { info->ordering[i] = indexRelation->rd_operator[oprindex]; oprindex += indexRelation->rd_am->amstrategies; } } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ cdb_estimate_rel_size(rel, relation, indexRelation, NULL, &info->pages, &info->tuples, &info->cdb_default_stats_used); if (!info->indpred || info->tuples > rel->tuples) info->tuples = rel->tuples; if (info->cdb_default_stats_used && !rel->cdb_default_stats_used) cdb_default_stats_warning_for_index(relation->rd_id, indexoid); index_close(indexRelation, needs_longlock ? NoLock : lmode); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }
/* * _bt_mergeload - Merge two streams of index tuples into new index files. */ static void _bt_mergeload(Spooler *self, BTWriteState *wstate, BTSpool *btspool, BTReader *btspool2, Relation heapRel) { BTPageState *state = NULL; IndexTuple itup, itup2; bool should_free = false; TupleDesc tupdes = RelationGetDescr(wstate->index); int keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey; ON_DUPLICATE on_duplicate = self->on_duplicate; Assert(btspool != NULL); /* the preparation of merge */ itup = BTSpoolGetNextItem(btspool, NULL, &should_free); itup2 = BTReaderGetNextItem(btspool2); indexScanKey = _bt_mkscankey_nodata(wstate->index); for (;;) { bool load1 = true; /* load BTSpool next ? */ bool hasnull; int32 compare; if (self->dup_old + self->dup_new > self->max_dup_errors) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Maximum duplicate error count exceeded"))); if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { compare = compare_indextuple(itup, itup2, indexScanKey, keysz, tupdes, &hasnull); if (compare == 0 && !hasnull && btspool->isunique) { ItemPointerData t_tid2; /* * t_tid is update by heap_is_visible(), because use it for an * index, t_tid backup */ ItemPointerCopy(&itup2->t_tid, &t_tid2); /* The tuple pointed by the old index should not be visible. */ if (!heap_is_visible(heapRel, &itup->t_tid)) { itup = BTSpoolGetNextItem(btspool, itup, &should_free); } else if (!heap_is_visible(heapRel, &itup2->t_tid)) { itup2 = BTReaderGetNextItem(btspool2); } else { if (on_duplicate == ON_DUPLICATE_KEEP_NEW) { self->dup_old++; remove_duplicate(self, heapRel, itup2, RelationGetRelationName(wstate->index)); itup2 = BTReaderGetNextItem(btspool2); } else { ItemPointerCopy(&t_tid2, &itup2->t_tid); self->dup_new++; remove_duplicate(self, heapRel, itup, RelationGetRelationName(wstate->index)); itup = BTSpoolGetNextItem(btspool, itup, &should_free); } } continue; } else if (compare > 0) load1 = false; } else load1 = false; BULKLOAD_PROFILE(&prof_merge_unique); /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { IndexTuple next_itup = NULL; bool next_should_free = false; for (;;) { /* get next item */ next_itup = BTSpoolGetNextItem(btspool, next_itup, &next_should_free); if (!btspool->isunique || next_itup == NULL) break; compare = compare_indextuple(itup, next_itup, indexScanKey, keysz, tupdes, &hasnull); if (compare < 0 || hasnull) break; if (compare > 0) { /* shouldn't happen */ elog(ERROR, "faild in tuplesort_performsort"); } /* * If tupple is deleted by other unique indexes, not visible */ if (!heap_is_visible(heapRel, &next_itup->t_tid)) { continue; } if (!heap_is_visible(heapRel, &itup->t_tid)) { if (should_free) pfree(itup); itup = next_itup; should_free = next_should_free; next_should_free = false; continue; } /* not unique between input files */ self->dup_new++; remove_duplicate(self, heapRel, next_itup, RelationGetRelationName(wstate->index)); if (self->dup_old + self->dup_new > self->max_dup_errors) ereport(ERROR, (errcode(ERRCODE_INTERNAL_ERROR), errmsg("Maximum duplicate error count exceeded"))); } _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); itup = next_itup; should_free = next_should_free; } else { _bt_buildadd(wstate, state, itup2); itup2 = BTReaderGetNextItem(btspool2); } BULKLOAD_PROFILE(&prof_merge_insert); } _bt_freeskey(indexScanKey); /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index isn't temp, we must fsync it down to disk before it's safe * to commit the transaction. (For a temp index we don't care since the * index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (!RELATION_IS_LOCAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } BULKLOAD_PROFILE(&prof_merge_term); }
Datum gistrescan(PG_FUNCTION_ARGS) { IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey key = (ScanKey) PG_GETARG_POINTER(1); ScanKey orderbys = (ScanKey) PG_GETARG_POINTER(3); /* nkeys and norderbys arguments are ignored */ GISTScanOpaque so = (GISTScanOpaque) scan->opaque; bool first_time; int i; MemoryContext oldCxt; /* rescan an existing indexscan --- reset state */ /* * The first time through, we create the search queue in the scanCxt. * Subsequent times through, we create the queue in a separate queueCxt, * which is created on the second call and reset on later calls. Thus, in * the common case where a scan is only rescan'd once, we just put the * queue in scanCxt and don't pay the overhead of making a second memory * context. If we do rescan more than once, the first RBTree is just left * for dead until end of scan; this small wastage seems worth the savings * in the common case. */ if (so->queue == NULL) { /* first time through */ Assert(so->queueCxt == so->giststate->scanCxt); first_time = true; } else if (so->queueCxt == so->giststate->scanCxt) { /* second time through */ so->queueCxt = AllocSetContextCreate(so->giststate->scanCxt, "GiST queue context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); first_time = false; } else { /* third or later time through */ MemoryContextReset(so->queueCxt); first_time = false; } /* * If we're doing an index-only scan, on the first call, also initialize * a tuple descriptor to represent the returned index tuples and create a * memory context to hold them during the scan. */ if (scan->xs_want_itup && !scan->xs_itupdesc) { int natts; int attno; /* * The storage type of the index can be different from the original * datatype being indexed, so we cannot just grab the index's tuple * descriptor. Instead, construct a descriptor with the original data * types. */ natts = RelationGetNumberOfAttributes(scan->indexRelation); so->giststate->fetchTupdesc = CreateTemplateTupleDesc(natts, false); for (attno = 1; attno <= natts; attno++) { TupleDescInitEntry(so->giststate->fetchTupdesc, attno, NULL, scan->indexRelation->rd_opcintype[attno - 1], -1, 0); } scan->xs_itupdesc = so->giststate->fetchTupdesc; so->pageDataCxt = AllocSetContextCreate(so->giststate->scanCxt, "GiST page data context", ALLOCSET_DEFAULT_MINSIZE, ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); } /* create new, empty RBTree for search queue */ oldCxt = MemoryContextSwitchTo(so->queueCxt); so->queue = pairingheap_allocate(pairingheap_GISTSearchItem_cmp, scan); MemoryContextSwitchTo(oldCxt); so->firstCall = true; /* Update scan key, if a new one is given */ if (key && scan->numberOfKeys > 0) { void **fn_extras = NULL; /* * If this isn't the first time through, preserve the fn_extra * pointers, so that if the consistentFns are using them to cache * data, that data is not leaked across a rescan. */ if (!first_time) { fn_extras = (void **) palloc(scan->numberOfKeys * sizeof(void *)); for (i = 0; i < scan->numberOfKeys; i++) fn_extras[i] = scan->keyData[i].sk_func.fn_extra; } memmove(scan->keyData, key, scan->numberOfKeys * sizeof(ScanKeyData)); /* * Modify the scan key so that the Consistent method is called for all * comparisons. The original operator is passed to the Consistent * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field. * * Next, if any of keys is a NULL and that key is not marked with * SK_SEARCHNULL/SK_SEARCHNOTNULL then nothing can be found (ie, we * assume all indexable operators are strict). */ so->qual_ok = true; for (i = 0; i < scan->numberOfKeys; i++) { ScanKey skey = scan->keyData + i; fmgr_info_copy(&(skey->sk_func), &(so->giststate->consistentFn[skey->sk_attno - 1]), so->giststate->scanCxt); /* Restore prior fn_extra pointers, if not first time */ if (!first_time) skey->sk_func.fn_extra = fn_extras[i]; if (skey->sk_flags & SK_ISNULL) { if (!(skey->sk_flags & (SK_SEARCHNULL | SK_SEARCHNOTNULL))) so->qual_ok = false; } } if (!first_time) pfree(fn_extras); } /* Update order-by key, if a new one is given */ if (orderbys && scan->numberOfOrderBys > 0) { void **fn_extras = NULL; /* As above, preserve fn_extra if not first time through */ if (!first_time) { fn_extras = (void **) palloc(scan->numberOfOrderBys * sizeof(void *)); for (i = 0; i < scan->numberOfOrderBys; i++) fn_extras[i] = scan->orderByData[i].sk_func.fn_extra; } memmove(scan->orderByData, orderbys, scan->numberOfOrderBys * sizeof(ScanKeyData)); so->orderByTypes = (Oid *) palloc(scan->numberOfOrderBys * sizeof(Oid)); /* * Modify the order-by key so that the Distance method is called for * all comparisons. The original operator is passed to the Distance * function in the form of its strategy number, which is available * from the sk_strategy field, and its subtype from the sk_subtype * field. */ for (i = 0; i < scan->numberOfOrderBys; i++) { ScanKey skey = scan->orderByData + i; FmgrInfo *finfo = &(so->giststate->distanceFn[skey->sk_attno - 1]); /* Check we actually have a distance function ... */ if (!OidIsValid(finfo->fn_oid)) elog(ERROR, "missing support function %d for attribute %d of index \"%s\"", GIST_DISTANCE_PROC, skey->sk_attno, RelationGetRelationName(scan->indexRelation)); /* * Look up the datatype returned by the original ordering operator. * GiST always uses a float8 for the distance function, but the * ordering operator could be anything else. * * XXX: The distance function is only allowed to be lossy if the * ordering operator's result type is float4 or float8. Otherwise * we don't know how to return the distance to the executor. But * we cannot check that here, as we won't know if the distance * function is lossy until it returns *recheck = true for the * first time. */ so->orderByTypes[i] = get_func_rettype(skey->sk_func.fn_oid); fmgr_info_copy(&(skey->sk_func), finfo, so->giststate->scanCxt); /* Restore prior fn_extra pointers, if not first time */ if (!first_time) skey->sk_func.fn_extra = fn_extras[i]; } if (!first_time) pfree(fn_extras); } PG_RETURN_VOID(); }
/* * Setup a ScanKey for a search in the relation 'rel' for a tuple 'key' that * is setup to match 'rel' (*NOT* idxrel!). * * Returns whether any column contains NULLs. * * This is not generic routine, it expects the idxrel to be replication * identity of a rel and meet all limitations associated with that. */ static bool build_replindex_scan_key(ScanKey skey, Relation rel, Relation idxrel, TupleTableSlot *searchslot) { int attoff; bool isnull; Datum indclassDatum; oidvector *opclass; int2vector *indkey = &idxrel->rd_index->indkey; bool hasnulls = false; Assert(RelationGetReplicaIndex(rel) == RelationGetRelid(idxrel)); indclassDatum = SysCacheGetAttr(INDEXRELID, idxrel->rd_indextuple, Anum_pg_index_indclass, &isnull); Assert(!isnull); opclass = (oidvector *) DatumGetPointer(indclassDatum); /* Build scankey for every attribute in the index. */ for (attoff = 0; attoff < RelationGetNumberOfAttributes(idxrel); attoff++) { Oid operator; Oid opfamily; RegProcedure regop; int pkattno = attoff + 1; int mainattno = indkey->values[attoff]; Oid optype = get_opclass_input_type(opclass->values[attoff]); /* * Load the operator info. We need this to get the equality operator * function for the scan key. */ opfamily = get_opclass_family(opclass->values[attoff]); operator = get_opfamily_member(opfamily, optype, optype, BTEqualStrategyNumber); if (!OidIsValid(operator)) elog(ERROR, "could not find member %d(%u,%u) of opfamily %u", BTEqualStrategyNumber, optype, optype, opfamily); regop = get_opcode(operator); /* Initialize the scankey. */ ScanKeyInit(&skey[attoff], pkattno, BTEqualStrategyNumber, regop, searchslot->tts_values[mainattno - 1]); /* Check for null value. */ if (searchslot->tts_isnull[mainattno - 1]) { hasnulls = true; skey[attoff].sk_flags |= SK_ISNULL; } } return hasnulls; }
/* * Search the relation 'rel' for tuple using the index. * * If a matching tuple is found, lock it with lockmode, fill the slot with its * contents, and return true. Return false otherwise. */ bool RelationFindReplTupleByIndex(Relation rel, Oid idxoid, LockTupleMode lockmode, TupleTableSlot *searchslot, TupleTableSlot *outslot) { HeapTuple scantuple; ScanKeyData skey[INDEX_MAX_KEYS]; IndexScanDesc scan; SnapshotData snap; TransactionId xwait; Relation idxrel; bool found; /* Open the index. */ idxrel = index_open(idxoid, RowExclusiveLock); /* Start an index scan. */ InitDirtySnapshot(snap); scan = index_beginscan(rel, idxrel, &snap, RelationGetNumberOfAttributes(idxrel), 0); /* Build scan key. */ build_replindex_scan_key(skey, rel, idxrel, searchslot); retry: found = false; index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0); /* Try to find the tuple */ if ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL) { found = true; ExecStoreTuple(scantuple, outslot, InvalidBuffer, false); ExecMaterializeSlot(outslot); xwait = TransactionIdIsValid(snap.xmin) ? snap.xmin : snap.xmax; /* * If the tuple is locked, wait for locking transaction to finish and * retry. */ if (TransactionIdIsValid(xwait)) { XactLockTableWait(xwait, NULL, NULL, XLTW_None); goto retry; } } /* Found tuple, try to lock it in the lockmode. */ if (found) { Buffer buf; HeapUpdateFailureData hufd; HTSU_Result res; HeapTupleData locktup; ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self); PushActiveSnapshot(GetLatestSnapshot()); res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false), lockmode, LockWaitBlock, false /* don't follow updates */ , &buf, &hufd); /* the tuple slot already has the buffer pinned */ ReleaseBuffer(buf); PopActiveSnapshot(); switch (res) { case HeapTupleMayBeUpdated: break; case HeapTupleUpdated: /* XXX: Improve handling here */ ereport(LOG, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("concurrent update, retrying"))); goto retry; case HeapTupleInvisible: elog(ERROR, "attempted to lock invisible tuple"); default: elog(ERROR, "unexpected heap_lock_tuple status: %u", res); break; } } index_endscan(scan); /* Don't release lock until commit. */ index_close(idxrel, NoLock); return found; }
/* * Read tuples in correct sort order from tuplesort, and load them into * btree leaves. */ static void _bt_load(BTWriteState *wstate, BTSpool *btspool, BTSpool *btspool2) { BTPageState *state = NULL; bool merge = (btspool2 != NULL); IndexTuple itup, itup2 = NULL; bool should_free, should_free2, load1; TupleDesc tupdes = RelationGetDescr(wstate->index); int i, keysz = RelationGetNumberOfAttributes(wstate->index); ScanKey indexScanKey = NULL; if (merge) { /* * Another BTSpool for dead tuples exists. Now we have to merge * btspool and btspool2. */ /* the preparation of merge */ itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free); itup2 = tuplesort_getindextuple(btspool2->sortstate, true, &should_free2); indexScanKey = _bt_mkscankey_nodata(wstate->index); for (;;) { load1 = true; /* load BTSpool next ? */ if (itup2 == NULL) { if (itup == NULL) break; } else if (itup != NULL) { for (i = 1; i <= keysz; i++) { ScanKey entry; Datum attrDatum1, attrDatum2; bool isNull1, isNull2; int32 compare; entry = indexScanKey + i - 1; attrDatum1 = index_getattr(itup, i, tupdes, &isNull1); attrDatum2 = index_getattr(itup2, i, tupdes, &isNull2); if (isNull1) { if (isNull2) compare = 0; /* NULL "=" NULL */ else if (entry->sk_flags & SK_BT_NULLS_FIRST) compare = -1; /* NULL "<" NOT_NULL */ else compare = 1; /* NULL ">" NOT_NULL */ } else if (isNull2) { if (entry->sk_flags & SK_BT_NULLS_FIRST) compare = 1; /* NOT_NULL ">" NULL */ else compare = -1; /* NOT_NULL "<" NULL */ } else { compare = DatumGetInt32(FunctionCall2Coll(&entry->sk_func, entry->sk_collation, attrDatum1, attrDatum2)); if (entry->sk_flags & SK_BT_DESC) compare = -compare; } if (compare > 0) { load1 = false; break; } else if (compare < 0) break; } } else load1 = false; /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); if (load1) { _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free); } else { _bt_buildadd(wstate, state, itup2); if (should_free2) pfree(itup2); itup2 = tuplesort_getindextuple(btspool2->sortstate, true, &should_free2); } } _bt_freeskey(indexScanKey); } else { /* merge is unnecessary */ while ((itup = tuplesort_getindextuple(btspool->sortstate, true, &should_free)) != NULL) { /* When we see first tuple, create first index page */ if (state == NULL) state = _bt_pagestate(wstate, 0); _bt_buildadd(wstate, state, itup); if (should_free) pfree(itup); } } /* Close down final pages and write the metapage */ _bt_uppershutdown(wstate, state); /* * If the index is WAL-logged, we must fsync it down to disk before it's * safe to commit the transaction. (For a non-WAL-logged index we don't * care since the index will be uninteresting after a crash anyway.) * * It's obvious that we must do this when not WAL-logging the build. It's * less obvious that we have to do it even if we did WAL-log the index * pages. The reason is that since we're building outside shared buffers, * a CHECKPOINT occurring during the build has no way to flush the * previously written data to disk (indeed it won't know the index even * exists). A crash later on would replay WAL from the checkpoint, * therefore it wouldn't replay our earlier WAL entries. If we do not * fsync those pages here, they might still not be on disk when the crash * occurs. */ if (RelationNeedsWAL(wstate->index)) { RelationOpenSmgr(wstate->index); smgrimmedsync(wstate->index->rd_smgr, MAIN_FORKNUM); } }
/* checks the individual attributes of the tuple */ uint32 check_index_tuple_attributes(Relation rel, PageHeader header, int block, int i, char *buffer) { IndexTuple tuple; uint32 nerrs = 0; int j, off; bits8 * bitmap; BTPageOpaque opaque; ereport(DEBUG2,(errmsg("[%d:%d] checking attributes for the tuple", block, i))); /* get the index tuple and info about the page */ tuple = (IndexTuple)(buffer + header->pd_linp[i].lp_off); opaque = (BTPageOpaque)(buffer + header->pd_special); /* current attribute offset - always starts at (buffer + off) */ off = header->pd_linp[i].lp_off + IndexInfoFindDataOffset(tuple->t_info); ereport(DEBUG3,(errmsg("[%d:%d] tuple has %d attributes", block, (i+1), RelationGetNumberOfAttributes(rel)))); bitmap = (bits8*)(buffer + header->pd_linp[i].lp_off + sizeof(IndexTupleData)); /* TODO This is mostly copy'n'paste from check_heap_tuple_attributes, so maybe it could be refactored to share the code. */ /* For left-most tuples on non-leaf pages, there are no data actually (see src/backend/access/nbtree/README, last paragraph in section "Notes About Data Representation") Use P_LEFTMOST/P_ISLEAF to identify such cases (for the leftmost item only) and set len = 0. */ if (P_LEFTMOST(opaque) && (! P_ISLEAF(opaque)) && (i == 0)) { ereport(DEBUG3, (errmsg("[%d:%d] leftmost tuple on non-leaf block => no data, skipping", block, i))); return nerrs; } /* check all the index attributes */ for (j = 0; j < rel->rd_att->natts; j++) { /* default length of the attribute */ int len = rel->rd_att->attrs[j]->attlen; /* copy from src/backend/commands/analyze.c */ bool is_varlena = (!rel->rd_att->attrs[j]->attbyval && len == -1); bool is_varwidth = (!rel->rd_att->attrs[j]->attbyval && len < 0); /* thus it's "len = -2" */ /* if the attribute is marked as NULL (in the tuple header), skip to the next attribute */ if (IndexTupleHasNulls(tuple) && att_isnull(j, bitmap)) { ereport(DEBUG3, (errmsg("[%d:%d] attribute '%s' is NULL (skipping)", block, (i+1), rel->rd_att->attrs[j]->attname.data))); continue; } /* fix the alignment (see src/include/access/tupmacs.h) */ off = att_align_pointer(off, rel->rd_att->attrs[j]->attalign, rel->rd_att->attrs[j]->attlen, buffer+off); if (is_varlena) { /* other interesting macros (see postgres.h) - should do something about those ... VARATT_IS_COMPRESSED(PTR) VARATT_IS_4B_C(PTR) VARATT_IS_EXTERNAL(PTR) VARATT_IS_1B_E(PTR) VARATT_IS_SHORT(PTR) VARATT_IS_1B(PTR) VARATT_IS_EXTENDED(PTR) (!VARATT_IS_4B_U(PTR)) */ len = VARSIZE_ANY(buffer + off); if (len < 0) { ereport(WARNING, (errmsg("[%d:%d] attribute '%s' has negative length < 0 (%d)", block, (i+1), rel->rd_att->attrs[j]->attname.data, len))); ++nerrs; break; } if (VARATT_IS_COMPRESSED(buffer + off)) { /* the raw length should be less than 1G (and positive) */ if ((VARRAWSIZE_4B_C(buffer + off) < 0) || (VARRAWSIZE_4B_C(buffer + off) > 1024*1024)) { ereport(WARNING, (errmsg("[%d:%d] attribute '%s' has invalid length %d (should be between 0 and 1G)", block, (i+1), rel->rd_att->attrs[j]->attname.data, VARRAWSIZE_4B_C(buffer + off)))); ++nerrs; /* no break here, this does not break the page structure - we may check the other attributes */ } } /* FIXME Check if the varlena value may be detoasted. */ } else if (is_varwidth) { /* get the C-string length (at most to the end of tuple), +1 as it does not include '\0' at the end */ /* if the string is not properly terminated, then this returns 'remaining space + 1' so it's detected */ len = strnlen(buffer + off, header->pd_linp[i].lp_off + len + header->pd_linp[i].lp_len - off) + 1; } /* Check if the length makes sense (is not negative and does not overflow * the tuple end, stop validating the other rows (we don't know where to * continue anyway). */ if (off + len > (header->pd_linp[i].lp_off + header->pd_linp[i].lp_len)) { ereport(WARNING, (errmsg("[%d:%d] attribute '%s' (off=%d len=%d) overflows tuple end (off=%d, len=%d)", block, (i+1), rel->rd_att->attrs[j]->attname.data, off, len, header->pd_linp[i].lp_off, header->pd_linp[i].lp_len))); ++nerrs; break; } /* skip to the next attribute */ off += len; ereport(DEBUG3,(errmsg("[%d:%d] attribute '%s' len=%d", block, (i+1), rel->rd_att->attrs[j]->attname.data, len))); } ereport(DEBUG3,(errmsg("[%d:%d] last attribute ends at %d, tuple ends at %d", block, (i+1), off, header->pd_linp[i].lp_off + header->pd_linp[i].lp_len))); /* after the last attribute, the offset should be exactly the same as the end of the tuple */ if (MAXALIGN(off) != header->pd_linp[i].lp_off + header->pd_linp[i].lp_len) { ereport(WARNING, (errmsg("[%d:%d] the last attribute ends at %d but the tuple ends at %d", block, (i+1), off, header->pd_linp[i].lp_off + header->pd_linp[i].lp_len))); ++nerrs; } return nerrs; }
/* * Assumes that the segment file lock is already held. * Assumes that the segment file should be compacted. */ static bool AOCSSegmentFileFullCompaction(Relation aorel, AOCSInsertDesc insertDesc, AOCSFileSegInfo *fsinfo, Snapshot snapshot) { const char *relname; AppendOnlyVisimap visiMap; AOCSScanDesc scanDesc; TupleDesc tupDesc; TupleTableSlot *slot; int compact_segno; int64 movedTupleCount = 0; ResultRelInfo *resultRelInfo; MemTupleBinding *mt_bind; EState *estate; bool *proj; int i; AOTupleId *aoTupleId; int64 tupleCount = 0; int64 tuplePerPage = INT_MAX; Assert(Gp_role == GP_ROLE_EXECUTE || Gp_role == GP_ROLE_UTILITY); Assert(RelationIsAoCols(aorel)); Assert(insertDesc); compact_segno = fsinfo->segno; if (fsinfo->varblockcount > 0) { tuplePerPage = fsinfo->total_tupcount / fsinfo->varblockcount; } relname = RelationGetRelationName(aorel); AppendOnlyVisimap_Init(&visiMap, aorel->rd_appendonly->visimaprelid, aorel->rd_appendonly->visimapidxid, ShareLock, snapshot); elogif(Debug_appendonly_print_compaction, LOG, "Compact AO segfile %d, relation %sd", compact_segno, relname); proj = palloc0(sizeof(bool) * RelationGetNumberOfAttributes(aorel)); for (i = 0; i < RelationGetNumberOfAttributes(aorel); ++i) { proj[i] = true; } scanDesc = aocs_beginrangescan(aorel, snapshot, snapshot, &compact_segno, 1, NULL, proj); tupDesc = RelationGetDescr(aorel); slot = MakeSingleTupleTableSlot(tupDesc); mt_bind = create_memtuple_binding(tupDesc); /* * We need a ResultRelInfo and an EState so we can use the regular * executor's index-entry-making machinery. */ estate = CreateExecutorState(); resultRelInfo = makeNode(ResultRelInfo); resultRelInfo->ri_RangeTableIndex = 1; /* dummy */ resultRelInfo->ri_RelationDesc = aorel; resultRelInfo->ri_TrigDesc = NULL; /* we don't fire triggers */ ExecOpenIndices(resultRelInfo); estate->es_result_relations = resultRelInfo; estate->es_num_result_relations = 1; estate->es_result_relation_info = resultRelInfo; while (aocs_getnext(scanDesc, ForwardScanDirection, slot)) { CHECK_FOR_INTERRUPTS(); aoTupleId = (AOTupleId *) slot_get_ctid(slot); if (AppendOnlyVisimap_IsVisible(&scanDesc->visibilityMap, aoTupleId)) { AOCSMoveTuple(slot, insertDesc, resultRelInfo, estate); movedTupleCount++; } else { /* Tuple is invisible and needs to be dropped */ AppendOnlyThrowAwayTuple(aorel, slot, mt_bind); } /* * Check for vacuum delay point after approximatly a var block */ tupleCount++; if (VacuumCostActive && tupleCount % tuplePerPage == 0) { vacuum_delay_point(); } } SetAOCSFileSegInfoState(aorel, compact_segno, AOSEG_STATE_AWAITING_DROP); AppendOnlyVisimap_DeleteSegmentFile(&visiMap, compact_segno); /* Delete all mini pages of the segment files if block directory exists */ if (OidIsValid(aorel->rd_appendonly->blkdirrelid)) { AppendOnlyBlockDirectory_DeleteSegmentFile(aorel, snapshot, compact_segno, 0); } elogif(Debug_appendonly_print_compaction, LOG, "Finished compaction: " "AO segfile %d, relation %s, moved tuple count " INT64_FORMAT, compact_segno, relname, movedTupleCount); AppendOnlyVisimap_Finish(&visiMap, NoLock); ExecCloseIndices(resultRelInfo); FreeExecutorState(estate); ExecDropSingleTupleTableSlot(slot); destroy_memtuple_binding(mt_bind); aocs_endscan(scanDesc); pfree(proj); return true; }
/* * get_relation_info - * Retrieves catalog information for a given relation. * * Given the Oid of the relation, return the following info into fields * of the RelOptInfo struct: * * min_attr lowest valid AttrNumber * max_attr highest valid AttrNumber * indexlist list of IndexOptInfos for relation's indexes * pages number of pages * tuples number of tuples * * Also, initialize the attr_needed[] and attr_widths[] arrays. In most * cases these are left as zeroes, but sometimes we need to compute attr * widths here, and we may as well cache the results for costsize.c. */ void get_relation_info(Oid relationObjectId, RelOptInfo *rel) { Index varno = rel->relid; Relation relation; bool hasindex; List *indexinfos = NIL; /* * Normally, we can assume the rewriter already acquired at least * AccessShareLock on each relation used in the query. However this will * not be the case for relations added to the query because they are * inheritance children of some relation mentioned explicitly. For them, * this is the first access during the parse/rewrite/plan pipeline, and so * we need to obtain and keep a suitable lock. * * XXX really, a suitable lock is RowShareLock if the relation is an * UPDATE/DELETE target, and AccessShareLock otherwise. However we cannot * easily tell here which to get, so for the moment just get * AccessShareLock always. The executor will get the right lock when it * runs, which means there is a very small chance of deadlock trying to * upgrade our lock. */ if (rel->reloptkind == RELOPT_BASEREL) relation = heap_open(relationObjectId, NoLock); else relation = heap_open(relationObjectId, AccessShareLock); rel->min_attr = FirstLowInvalidHeapAttributeNumber + 1; rel->max_attr = RelationGetNumberOfAttributes(relation); Assert(rel->max_attr >= rel->min_attr); rel->attr_needed = (Relids *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids)); rel->attr_widths = (int32 *) palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32)); /* * Estimate relation size. */ estimate_rel_size(relation, rel->attr_widths - rel->min_attr, &rel->pages, &rel->tuples); /* * Make list of indexes. Ignore indexes on system catalogs if told to. */ if (IsIgnoringSystemIndexes() && IsSystemClass(relation->rd_rel)) hasindex = false; else hasindex = relation->rd_rel->relhasindex; if (hasindex) { List *indexoidlist; ListCell *l; indexoidlist = RelationGetIndexList(relation); foreach(l, indexoidlist) { Oid indexoid = lfirst_oid(l); Relation indexRelation; Form_pg_index index; IndexOptInfo *info; int ncolumns; int i; int16 amorderstrategy; /* * Extract info from the relation descriptor for the index. * * Note that we take no lock on the index; we assume our lock on * the parent table will protect the index's schema information. * When and if the executor actually uses the index, it will take * a lock as needed to protect the access to the index contents. */ indexRelation = index_open(indexoid); index = indexRelation->rd_index; info = makeNode(IndexOptInfo); info->indexoid = index->indexrelid; info->rel = rel; info->ncolumns = ncolumns = index->indnatts; /* * Need to make classlist and ordering arrays large enough to put * a terminating 0 at the end of each one. */ info->indexkeys = (int *) palloc(sizeof(int) * ncolumns); info->classlist = (Oid *) palloc0(sizeof(Oid) * (ncolumns + 1)); info->ordering = (Oid *) palloc0(sizeof(Oid) * (ncolumns + 1)); for (i = 0; i < ncolumns; i++) { info->classlist[i] = indexRelation->rd_indclass->values[i]; info->indexkeys[i] = index->indkey.values[i]; } info->relam = indexRelation->rd_rel->relam; info->amcostestimate = indexRelation->rd_am->amcostestimate; info->amoptionalkey = indexRelation->rd_am->amoptionalkey; /* * Fetch the ordering operators associated with the index, if any. */ amorderstrategy = indexRelation->rd_am->amorderstrategy; if (amorderstrategy != 0) { int oprindex = amorderstrategy - 1; for (i = 0; i < ncolumns; i++) { info->ordering[i] = indexRelation->rd_operator[oprindex]; oprindex += indexRelation->rd_am->amstrategies; } } /* * Fetch the index expressions and predicate, if any. We must * modify the copies we obtain from the relcache to have the * correct varno for the parent relation, so that they match up * correctly against qual clauses. */ info->indexprs = RelationGetIndexExpressions(indexRelation); info->indpred = RelationGetIndexPredicate(indexRelation); if (info->indexprs && varno != 1) ChangeVarNodes((Node *) info->indexprs, 1, varno, 0); if (info->indpred && varno != 1) ChangeVarNodes((Node *) info->indpred, 1, varno, 0); info->predOK = false; /* set later in indxpath.c */ info->unique = index->indisunique; /* * Estimate the index size. If it's not a partial index, we lock * the number-of-tuples estimate to equal the parent table; if it * is partial then we have to use the same methods as we would for * a table, except we can be sure that the index is not larger * than the table. */ if (info->indpred == NIL) { info->pages = RelationGetNumberOfBlocks(indexRelation); info->tuples = rel->tuples; } else { estimate_rel_size(indexRelation, NULL, &info->pages, &info->tuples); if (info->tuples > rel->tuples) info->tuples = rel->tuples; } index_close(indexRelation); indexinfos = lcons(info, indexinfos); } list_free(indexoidlist); }