/* ---------------------------------------------------------------- * ExecEndBitmapIndexScan * ---------------------------------------------------------------- */ void ExecEndBitmapIndexScan(BitmapIndexScanState *node) { Relation indexRelationDesc; IndexScanDesc indexScanDesc; IndexScanDesc odIndexScanDesc; /* * extract information from the node */ indexRelationDesc = node->biss_RelationDesc; indexScanDesc = node->biss_ScanDesc; odIndexScanDesc = node->odbiss_ScanDesc; /* * Free the exprcontext ... now dead code, see ExecFreeExprContext */ #ifdef NOT_USED if (node->biss_RuntimeContext) FreeExprContext(node->biss_RuntimeContext); #endif /* * close the index scans and then the index relation */ index_endscan(indexScanDesc); if (odIndexScanDesc != NULL) { index_endscan(odIndexScanDesc); node->odbiss_ScanDesc = NULL; } index_close(indexRelationDesc); }
/* * Release resources for one part (this includes closing the index and * the relation). */ static inline void CleanupOnePartition(IndexScanState *indexState) { Assert(NULL != indexState); /* Reset index state and release locks. */ ExecClearTuple(indexState->ss.ps.ps_ResultTupleSlot); ExecClearTuple(indexState->ss.ss_ScanTupleSlot); if ((indexState->ss.scan_state & SCAN_SCAN) != 0) { Assert(indexState->iss_ScanDesc != NULL); Assert(indexState->iss_RelationDesc != NULL); Assert(indexState->ss.ss_currentRelation != NULL); index_endscan(indexState->iss_ScanDesc); indexState->iss_ScanDesc = NULL; index_close(indexState->iss_RelationDesc, NoLock); indexState->iss_RelationDesc = NULL; ExecCloseScanRelation(indexState->ss.ss_currentRelation); indexState->ss.ss_currentRelation = NULL; } indexState->ss.scan_state = SCAN_INIT; }
/* * systable_endscan_ordered --- close scan, release resources */ void systable_endscan_ordered(SysScanDesc sysscan) { Assert(sysscan->irel); index_endscan(sysscan->iscan); pfree(sysscan); }
/* * _bitmap_cleanup_buildstate() -- clean up the build state after * inserting all rows in the heap into the bitmap index. */ void _bitmap_cleanup_buildstate(Relation index, BMBuildState *bmstate) { /* write out remaining tids in bmstate->bm_tidLocsBuffer */ BMTidBuildBuf *tidLocsBuffer = bmstate->bm_tidLocsBuffer; _bitmap_write_alltids(index, tidLocsBuffer, bmstate->use_wal); pfree(bmstate->bm_tidLocsBuffer); if (cur_bmbuild) { MemoryContextDelete(cur_bmbuild->tmpcxt); MemoryContextDelete(cur_bmbuild->hash_cxt); pfree(cur_bmbuild->hash_funcs); pfree(cur_bmbuild->eq_funcs); pfree(cur_bmbuild); cur_bmbuild = NULL; } else { /* * We might have built an index on a non-hashable data type, in * which case we will have searched the btree manually. Free associated * memory. */ index_endscan(bmstate->bm_lov_scanDesc); pfree(bmstate->bm_lov_scanKeys); } _bitmap_close_lov_heapandindex(bmstate->bm_lov_heap, bmstate->bm_lov_index, RowExclusiveLock); }
/* * GetNewOidWithIndex * Guts of GetNewOid: use the supplied index * * This is exported separately because there are cases where we want to use * an index that will not be recognized by RelationGetOidIndex: TOAST tables * and pg_largeobject have indexes that are usable, but have multiple columns * and are on ordinary columns rather than a true OID column. This code * will work anyway, so long as the OID is the index's first column. * * Caller must have a suitable lock on the relation. */ Oid GetNewOidWithIndex(Relation relation, Relation indexrel) { Oid newOid; SnapshotData SnapshotDirty; IndexScanDesc scan; ScanKeyData key; bool collides; InitDirtySnapshot(SnapshotDirty); /* Generate new OIDs until we find one not in the table */ do { CHECK_FOR_INTERRUPTS(); newOid = GetNewObjectId(); ScanKeyInit(&key, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(newOid)); /* see notes above about using SnapshotDirty */ scan = index_beginscan(relation, indexrel, &SnapshotDirty, 1, &key); collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection)); index_endscan(scan); } while (collides); return newOid; }
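/*
 * A hedged sketch (not part of the source above) of how a wrapper such as
 * GetNewOid might drive GetNewOidWithIndex(): look up the OID index via the
 * relcache, open it, run the collision loop, and close the index again.
 * The bootstrap fallback and the lock level are assumptions based on the
 * comments above; the function name is illustrative.
 */
static Oid
get_new_oid_sketch(Relation relation)
{
	Oid			oidIndex;
	Relation	indexrel;
	Oid			newOid;

	/* The relcache caches the identity of the OID index for us */
	oidIndex = RelationGetOidIndex(relation);
	if (!OidIsValid(oidIndex))
	{
		/* no OID index (e.g. bootstrap): the counter value is the best we can do */
		return GetNewObjectId();
	}

	indexrel = index_open(oidIndex, AccessShareLock);
	newOid = GetNewOidWithIndex(relation, indexrel);
	index_close(indexrel, AccessShareLock);

	return newOid;
}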
/* ---------------------------------------------------------------- * ExecEndBitmapIndexScan * ---------------------------------------------------------------- */ void ExecEndBitmapIndexScan(BitmapIndexScanState *node) { Relation indexRelationDesc; IndexScanDesc indexScanDesc; /* * extract information from the node */ indexRelationDesc = node->biss_RelationDesc; indexScanDesc = node->biss_ScanDesc; /* * Free the exprcontext ... now dead code, see ExecFreeExprContext */ #ifdef NOT_USED if (node->biss_RuntimeContext) FreeExprContext(node->biss_RuntimeContext, true); #endif /* * close the index relation (no-op if we didn't open it) */ if (indexScanDesc) index_endscan(indexScanDesc); if (indexRelationDesc) index_close(indexRelationDesc, NoLock); }
/* * Determine size of a large object * * NOTE: LOs can contain gaps, just like Unix files. We actually return * the offset of the last byte + 1. */ static uint32 inv_getsize(LargeObjectDesc *obj_desc) { bool found = false; uint32 lastbyte = 0; ScanKeyData skey[1]; IndexScanDesc sd; HeapTuple tuple; Assert(PointerIsValid(obj_desc)); open_lo_relation(); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); sd = index_beginscan(lo_heap_r, lo_index_r, obj_desc->snapshot, 1, skey); /* * Because the pg_largeobject index is on both loid and pageno, but we * constrain only loid, a backwards scan should visit all pages of the * large object in reverse pageno order. So, it's sufficient to examine * the first valid tuple (== last valid page). */ while ((tuple = index_getnext(sd, BackwardScanDirection)) != NULL) { Form_pg_largeobject data; bytea *datafield; bool pfreeit; found = true; if (HeapTupleHasNulls(tuple)) /* paranoia */ elog(ERROR, "null field found in pg_largeobject"); data = (Form_pg_largeobject) GETSTRUCT(tuple); datafield = &(data->data); /* see note at top of file */ pfreeit = false; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((struct varlena *) datafield); pfreeit = true; } lastbyte = data->pageno * LOBLKSIZE + getbytealen(datafield); if (pfreeit) pfree(datafield); break; } index_endscan(sd); if (!found) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("large object %u does not exist", obj_desc->id))); return lastbyte; }
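/*
 * Hedged sketch of the classic consumer of inv_getsize(): seeking relative
 * to the end of a large object. The real inv_seek() also handles SEEK_SET
 * and SEEK_CUR; only the SEEK_END arm is sketched here, and the arithmetic
 * assumes the uint32 offset field used above.
 */
static uint32
inv_seek_end_sketch(LargeObjectDesc *obj_desc, int32 offset)
{
	Assert(PointerIsValid(obj_desc));

	/* inv_getsize() returns the offset of the last byte + 1 */
	obj_desc->offset = inv_getsize(obj_desc) + offset;
	return obj_desc->offset;
}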
/* * Free the index scan descriptor. */ static inline void freeScanDesc(IndexScanState *indexstate) { if (indexstate->iss_ScanDesc != NULL) { index_endscan(indexstate->iss_ScanDesc); indexstate->iss_ScanDesc = NULL; } }
/* * systable_endscan_ordered --- close scan, release resources */ void systable_endscan_ordered(SysScanDesc sysscan) { Assert(sysscan->irel); index_endscan(sysscan->iscan); if (sysscan->snapshot) UnregisterSnapshot(sysscan->snapshot); pfree(sysscan); }
/* * Ends an index scan over the visimap store. */ void AppendOnlyVisimapStore_EndScan( AppendOnlyVisimapStore *visiMapStore, IndexScanDesc indexScan) { Assert(visiMapStore); Assert(indexScan); index_endscan(indexScan); }
/* * systable_endscan --- close scan, release resources * * Note that it's still up to the caller to close the heap relation. */ void systable_endscan(SysScanDesc sysscan) { if (sysscan->irel) { index_endscan(sysscan->iscan); index_close(sysscan->irel, AccessShareLock); } else heap_endscan(sysscan->scan); pfree(sysscan); }
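/*
 * For reference, a hedged sketch of the scan lifecycle that
 * systable_endscan() closes out: ScanKeyInit, systable_beginscan,
 * systable_getnext, systable_endscan. The catalog, index OID, and scan key
 * here are illustrative assumptions; note the heap_copytuple() before the
 * scan ends, since the returned tuple is only valid while the scan is open.
 */
static HeapTuple
fetch_catalog_tuple_sketch(Relation catrel, Oid indexId, Oid keyval)
{
	ScanKeyData key;
	SysScanDesc scan;
	HeapTuple	tup;

	ScanKeyInit(&key,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(keyval));

	scan = systable_beginscan(catrel, indexId, true,
							  SnapshotNow, 1, &key);
	tup = systable_getnext(scan);
	if (HeapTupleIsValid(tup))
		tup = heap_copytuple(tup);
	systable_endscan(scan);

	return tup;
}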
/* ---------- * toast_delete_datum - * * Delete a single external stored value. * ---------- */ static void toast_delete_datum(Relation rel __attribute__((unused)), Datum value) { varattrib *attr = (varattrib *) DatumGetPointer(value); Relation toastrel; Relation toastidx; ScanKeyData toastkey; IndexScanDesc toastscan; HeapTuple toasttup; if (!VARATT_IS_EXTERNAL(attr)) return; /* * Open the toast relation and its index */ toastrel = heap_open(attr->va_external.va_toastrelid, RowExclusiveLock); toastidx = index_open(toastrel->rd_rel->reltoastidxid, RowExclusiveLock); /* * Setup a scan key to fetch from the index by va_valueid (we don't * particularly care whether we see them in sequence or not) */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(attr->va_external.va_valueid)); /* * Find all the chunks by index. */ toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((toasttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, delete it */ simple_heap_delete(toastrel, &toasttup->t_self); } /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx, RowExclusiveLock); heap_close(toastrel, RowExclusiveLock); }
/* ---------- * toast_delete_datum - * * Delete a single external stored value. * ---------- */ static void toast_delete_datum(Relation rel, Datum value) { varattrib *attr = (varattrib *) DatumGetPointer(value); Relation toastrel; Relation toastidx; ScanKeyData toastkey; IndexScanDesc toastscan; HeapTuple toasttup; if (!VARATT_IS_EXTERNAL(attr)) return; /* * Open the toast relation and its index */ toastrel = heap_open(attr->va_content.va_external.va_toastrelid, RowExclusiveLock); toastidx = index_open(toastrel->rd_rel->reltoastidxid); /* * Setup a scan key to fetch from the index by va_valueid (we don't * particularly care whether we see them in sequence or not) */ ScanKeyEntryInitialize(&toastkey, (bits16) 0, (AttrNumber) 1, (RegProcedure) F_OIDEQ, ObjectIdGetDatum(attr->va_content.va_external.va_valueid)); /* * Find the chunks by index */ toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((toasttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, delete it */ simple_heap_delete(toastrel, &toasttup->t_self); } /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx); heap_close(toastrel, RowExclusiveLock); }
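/*
 * Hedged sketch of the caller pattern around toast_delete_datum(): walk the
 * tuple's varlena attributes and delete any externally stored value. The
 * helper name is hypothetical and the attribute iteration follows the
 * pre-PostgreSQL-10 TupleDesc layout used by the snippets above.
 */
static void
toast_delete_externals_sketch(Relation rel, HeapTuple tuple)
{
	TupleDesc	tupleDesc = rel->rd_att;
	int			numAttrs = tupleDesc->natts;
	int			i;

	for (i = 0; i < numAttrs; i++)
	{
		Datum		value;
		bool		isnull;

		/* only varlena attributes can be stored externally */
		if (tupleDesc->attrs[i]->attlen != -1)
			continue;

		value = heap_getattr(tuple, i + 1, tupleDesc, &isnull);
		if (!isnull && VARATT_IS_EXTERNAL((varattrib *) DatumGetPointer(value)))
			toast_delete_datum(rel, value);
	}
}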
/* ---------------------------------------------------------------- * ExecEndIndexOnlyScan * ---------------------------------------------------------------- */ void ExecEndIndexOnlyScan(IndexOnlyScanState *node) { Relation indexRelationDesc; IndexScanDesc indexScanDesc; Relation relation; /* * extract information from the node */ indexRelationDesc = node->ioss_RelationDesc; indexScanDesc = node->ioss_ScanDesc; relation = node->ss.ss_currentRelation; /* Release VM buffer pin, if any. */ if (node->ioss_VMBuffer != InvalidBuffer) { ReleaseBuffer(node->ioss_VMBuffer); node->ioss_VMBuffer = InvalidBuffer; } /* * Free the exprcontext(s) ... now dead code, see ExecFreeExprContext */ #ifdef NOT_USED ExecFreeExprContext(&node->ss.ps); if (node->ioss_RuntimeContext) FreeExprContext(node->ioss_RuntimeContext, true); #endif /* * clear out tuple table slots */ ExecClearTuple(node->ss.ps.ps_ResultTupleSlot); ExecClearTuple(node->ss.ss_ScanTupleSlot); /* * close the index relation (no-op if we didn't open it) */ if (indexScanDesc) index_endscan(indexScanDesc); if (indexRelationDesc) index_close(indexRelationDesc, NoLock); /* * close the heap relation. */ ExecCloseScanRelation(relation); }
/* * GetNewOidWithIndex * Guts of GetNewOid: use the supplied index * * This is exported separately because there are cases where we want to use * an index that will not be recognized by RelationGetOidIndex: TOAST tables * and pg_largeobject have indexes that are usable, but have multiple columns * and are on ordinary columns rather than a true OID column. This code * will work anyway, so long as the OID is the index's first column. * * Caller must have a suitable lock on the relation. */ Oid GetNewOidWithIndex(Relation relation, Relation indexrel) { Oid newOid; IndexScanDesc scan; ScanKeyData key; bool collides; /* Generate new OIDs until we find one not in the table */ do { CHECK_FOR_INTERRUPTS(); newOid = GetNewObjectId(); ScanKeyInit(&key, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(newOid)); /* see notes above about using SnapshotDirty */ scan = index_beginscan(relation, indexrel, SnapshotDirty, 1, &key); collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection)); index_endscan(scan); } while (collides); if (IsSystemNamespace(RelationGetNamespace(relation))) { if (Gp_role == GP_ROLE_EXECUTE) { if (relation->rd_id != 2604 /* pg_attrdef */ && relation->rd_id != 2606 /* pg_constraint */ && relation->rd_id != 2615 /* pg_namespace */) elog(DEBUG1,"Allocating Oid %u with index on relid %u %s in EXECUTE mode",newOid,relation->rd_id, RelationGetRelationName(relation)); else elog(DEBUG4,"Allocating Oid %u with index on relid %u %s in EXECUTE mode",newOid,relation->rd_id, RelationGetRelationName(relation)); } if (Gp_role == GP_ROLE_DISPATCH) { elog(DEBUG5,"Allocating Oid %u with index on relid %u %s in DISPATCH mode",newOid,relation->rd_id, RelationGetRelationName(relation)); } } return newOid; }
/* ---------------------------------------------------------------- * ExecEndIndexScan * ---------------------------------------------------------------- */ void ExecEndIndexScan(index_ss *node) { struct relation* indexRelationDesc; struct index_scan* indexScanDesc; struct relation* relation; /* * extract information from the node */ indexRelationDesc = node->iss_RelationDesc; indexScanDesc = node->iss_ScanDesc; relation = node->ss.ss_currentRelation; /* * Free the exprcontext(s) ... now dead code, see exec_free_expr_ctx */ #ifdef NOT_USED exec_free_expr_ctx(&node->ss.ps); if (node->iss_RuntimeContext) free_expr_ctx(node->iss_RuntimeContext, true); #endif /* * clear out tuple table slots */ exec_clear_tuple(node->ss.ps.ps_ResultTupleSlot); exec_clear_tuple(node->ss.ss_ScanTupleSlot); /* * close the index relation (no-op if we didn't open it) */ if (indexScanDesc) index_endscan(indexScanDesc); if (indexRelationDesc) index_close(indexRelationDesc, NO_LOCK); /* * close the heap relation. */ ExecCloseScanRelation(relation); }
/* * AppendOnlyBlockDirectory_DeleteSegmentFile * * Deletes all block directory entries for given segment file of an * append-only relation. */ void AppendOnlyBlockDirectory_DeleteSegmentFile( AppendOnlyEntry *aoEntry, Snapshot snapshot, int segno, int columnGroupNo) { Assert(aoEntry); Assert(OidIsValid(aoEntry->blkdirrelid)); Assert(OidIsValid(aoEntry->blkdiridxid)); Relation blkdirRel = heap_open(aoEntry->blkdirrelid, RowExclusiveLock); Relation blkdirIdx = index_open(aoEntry->blkdiridxid, RowExclusiveLock); ScanKeyData scanKey; ScanKeyInit(&scanKey, 1, /* segno */ BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(segno)); IndexScanDesc indexScan = index_beginscan( blkdirRel, blkdirIdx, snapshot, 1, &scanKey); HeapTuple tuple = NULL; while ((tuple = index_getnext(indexScan, ForwardScanDirection)) != NULL) { simple_heap_delete(blkdirRel, &tuple->t_self); } index_endscan(indexScan); index_close(blkdirIdx, RowExclusiveLock); heap_close(blkdirRel, RowExclusiveLock); }
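/*
 * The function above is one instance of a recurring scan-and-delete idiom.
 * A hedged, generic rendering (names illustrative, not from the source):
 * given an open heap, one of its indexes, and a scan key, delete every
 * matching tuple through the index.
 */
static void
delete_matching_tuples_sketch(Relation heapRel, Relation idxRel,
							  Snapshot snapshot, ScanKey key, int nkeys)
{
	IndexScanDesc scan;
	HeapTuple	tuple;

	scan = index_beginscan(heapRel, idxRel, snapshot, nkeys, key);
	while ((tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
		simple_heap_delete(heapRel, &tuple->t_self);
	index_endscan(scan);
}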
/* * Bulk deletion of all index entries pointing to a set of heap tuples. * The set of target tuples is specified via a callback routine that tells * whether any given heap tuple (identified by ItemPointer) is being deleted. * * Result: a palloc'd struct containing statistical info for VACUUM displays. */ Datum rtbulkdelete(PG_FUNCTION_ARGS) { Relation rel = (Relation) PG_GETARG_POINTER(0); IndexBulkDeleteCallback callback = (IndexBulkDeleteCallback) PG_GETARG_POINTER(1); void *callback_state = (void *) PG_GETARG_POINTER(2); IndexBulkDeleteResult *result; BlockNumber num_pages; double tuples_removed; double num_index_tuples; IndexScanDesc iscan; tuples_removed = 0; num_index_tuples = 0; /* * Since rtree is not marked "amconcurrent" in pg_am, caller should have * acquired exclusive lock on index relation. We need no locking here. */ /* * XXX generic implementation --- should be improved! */ /* walk through the entire index */ iscan = index_beginscan(NULL, rel, SnapshotAny, 0, NULL); /* including killed tuples */ iscan->ignore_killed_tuples = false; while (index_getnext_indexitem(iscan, ForwardScanDirection)) { vacuum_delay_point(); if (callback(&iscan->xs_ctup.t_self, callback_state)) { ItemPointerData indextup = iscan->currentItemData; BlockNumber blkno; OffsetNumber offnum; Buffer buf; Page page; blkno = ItemPointerGetBlockNumber(&indextup); offnum = ItemPointerGetOffsetNumber(&indextup); /* adjust any scans that will be affected by this deletion */ /* (namely, my own scan) */ rtadjscans(rel, RTOP_DEL, blkno, offnum); /* delete the index tuple */ buf = ReadBuffer(rel, blkno); page = BufferGetPage(buf); PageIndexTupleDelete(page, offnum); WriteBuffer(buf); tuples_removed += 1; } else num_index_tuples += 1; } index_endscan(iscan); /* return statistics */ num_pages = RelationGetNumberOfBlocks(rel); result = (IndexBulkDeleteResult *) palloc0(sizeof(IndexBulkDeleteResult)); result->num_pages = num_pages; result->num_index_tuples = num_index_tuples; result->tuples_removed = tuples_removed; PG_RETURN_POINTER(result); }
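/*
 * Hedged sketch of the IndexBulkDeleteCallback consumed above. VACUUM's
 * real callback checks the TID against its array of dead tuples; this
 * stand-in, with a hypothetical state struct and a linear search, only
 * shows the expected contract: return true iff the heap tuple at *itemptr
 * is being deleted.
 */
typedef struct DeadTidStateSketch
{
	ItemPointerData *dead_tids;	/* array of dead heap TIDs */
	int			num_dead;
} DeadTidStateSketch;

static bool
dead_tid_callback_sketch(ItemPointer itemptr, void *state)
{
	DeadTidStateSketch *dt = (DeadTidStateSketch *) state;
	int			i;

	/* a production version would binary-search a sorted array */
	for (i = 0; i < dt->num_dead; i++)
	{
		if (ItemPointerEquals(itemptr, &dt->dead_tids[i]))
			return true;
	}
	return false;
}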
/* * AppendOnlyBlockDirectory_GetEntry * * Find a directory entry for the given AOTupleId in the block directory. * If such an entry is found, return true. Otherwise, return false. * * The range for directoryEntry is assigned accordingly in this function. * * The block directory for the appendonly table should exist before calling * this function. */ bool AppendOnlyBlockDirectory_GetEntry( AppendOnlyBlockDirectory *blockDirectory, AOTupleId *aoTupleId, int columnGroupNo, AppendOnlyBlockDirectoryEntry *directoryEntry) { int segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId); int64 rowNum = AOTupleIdGet_rowNum(aoTupleId); int i; Relation blkdirRel = blockDirectory->blkdirRel; Relation blkdirIdx = blockDirectory->blkdirIdx; int numScanKeys = blockDirectory->numScanKeys; ScanKey scanKeys = blockDirectory->scanKeys; TupleDesc heapTupleDesc; FileSegInfo *fsInfo = NULL; IndexScanDesc idxScanDesc; HeapTuple tuple = NULL; MinipagePerColumnGroup *minipageInfo = &blockDirectory->minipages[columnGroupNo]; int entry_no = -1; int tmpGroupNo; if (blkdirRel == NULL || blkdirIdx == NULL) { Assert(RelationIsValid(blockDirectory->aoRel)); ereport(ERROR, (errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("Block directory for append-only relation '%s' does not exist", RelationGetRelationName(blockDirectory->aoRel)))); return false; } ereportif(Debug_appendonly_print_blockdirectory, LOG, (errmsg("Append-only block directory get entry: " "(columnGroupNo, segmentFileNum, rowNum) = " "(%d, %d, " INT64_FORMAT ")", columnGroupNo, segmentFileNum, rowNum))); /* * If the segment file number is the same as * blockDirectory->currentSegmentFileNum, the in-memory minipage * may contain such an entry. We search the in-memory minipage * first. If such an entry can not be found, we search for the * appropriate minipage by using the block directory btree index. */ if (segmentFileNum == blockDirectory->currentSegmentFileNum && minipageInfo->numMinipageEntries > 0) { Assert(blockDirectory->currentSegmentFileInfo != NULL); MinipageEntry *firstentry = &minipageInfo->minipage->entry[0]; if (rowNum >= firstentry->firstRowNum) { /* * Check if the existing minipage contains the requested * rowNum. If so, just get it. */ entry_no = find_minipage_entry(minipageInfo->minipage, minipageInfo->numMinipageEntries, rowNum); if (entry_no != -1) { return set_directoryentry_range(blockDirectory, columnGroupNo, entry_no, directoryEntry); } /* * The given rowNum may point to a tuple that does not exist * in the AO table any more, either because of cancellation of * an insert, or due to crashes during an insert. If this is * the case, rowNum is smaller than the highest entry in * the in-memory minipage entry. */ else { MinipageEntry *entry = &minipageInfo->minipage->entry[minipageInfo->numMinipageEntries - 1]; if (rowNum < entry->firstRowNum + entry->rowCount - 1) return false; } } } for (i = 0; i < blockDirectory->totalSegfiles; i++) { fsInfo = blockDirectory->segmentFileInfo[i]; if (!blockDirectory->isAOCol && segmentFileNum == fsInfo->segno) break; else if (blockDirectory->isAOCol && segmentFileNum == ((AOCSFileSegInfo*)fsInfo)->segno) break; } Assert(fsInfo != NULL); /* * Search the btree index to find the minipage that contains * the rowNum. We find the minipages for all column groups, since * currently we will need to access all columns at the same time. 
*/ heapTupleDesc = RelationGetDescr(blkdirRel); Assert(numScanKeys == 3); for (tmpGroupNo = 0; tmpGroupNo < blockDirectory->numColumnGroups; tmpGroupNo++) { if (blockDirectory->proj && !blockDirectory->proj[tmpGroupNo]) { /* Ignore columns that are not projected. */ continue; } /* Setup the scan keys for the scan. */ Assert(scanKeys != NULL); scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum); scanKeys[1].sk_argument = Int32GetDatum(tmpGroupNo); scanKeys[2].sk_argument = Int64GetDatum(rowNum); idxScanDesc = index_beginscan(blkdirRel, blkdirIdx, blockDirectory->appendOnlyMetaDataSnapshot, numScanKeys, scanKeys); tuple = index_getnext(idxScanDesc, BackwardScanDirection); if (tuple != NULL) { /* * MPP-17061: we need to update currentSegmentFileNum * & currentSegmentFileInfo at the same time when we * load the minipage for the block directory entry we * found, otherwise we would risk having inconsistency * between currentSegmentFileNum/currentSegmentFileInfo * and minipage contents, which would cause wrong block * header offset being returned in following block * directory entry look up. */ blockDirectory->currentSegmentFileNum = segmentFileNum; blockDirectory->currentSegmentFileInfo = fsInfo; extract_minipage(blockDirectory, tuple, heapTupleDesc, tmpGroupNo); } else { /* MPP-17061: index look up failed, row is invisible */ index_endscan(idxScanDesc); return false; } index_endscan(idxScanDesc); } { MinipagePerColumnGroup *minipageInfo; minipageInfo = &blockDirectory->minipages[columnGroupNo]; /* * Perform a binary search over the minipage to find * the entry about the AO block. */ entry_no = find_minipage_entry(minipageInfo->minipage, minipageInfo->numMinipageEntries, rowNum); /* If there are no entries, return false. */ if (entry_no == -1 && minipageInfo->numMinipageEntries == 0) return false; if (entry_no == -1) { /* * Since the last few blocks may not be logged in the block * directory, we always use the last entry. */ entry_no = minipageInfo->numMinipageEntries - 1; } return set_directoryentry_range(blockDirectory, columnGroupNo, entry_no, directoryEntry); } return false; }
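/*
 * Hedged usage sketch for AppendOnlyBlockDirectory_GetEntry(): ask whether
 * the block directory covers a given AOTupleId. Only the boolean result is
 * consumed here, so no assumptions are made about the entry's fields beyond
 * its type; the wrapper name is illustrative.
 */
static bool
ao_tuple_has_directory_entry_sketch(AppendOnlyBlockDirectory *blockDirectory,
									AOTupleId *aoTupleId, int columnGroupNo)
{
	AppendOnlyBlockDirectoryEntry entry;

	if (!AppendOnlyBlockDirectory_GetEntry(blockDirectory, aoTupleId,
										   columnGroupNo, &entry))
		return false;			/* row invisible, or not logged */

	/* a real caller would now use the range recorded in 'entry' */
	return true;
}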
/* ---------- * toast_fetch_datum_slice - * * Reconstruct a segment of a varattrib from the chunks saved * in the toast relation * ---------- */ static varattrib * toast_fetch_datum_slice(varattrib *attr, int32 sliceoffset, int32 length) { Relation toastrel; Relation toastidx; ScanKeyData toastkey[3]; int nscankeys; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 attrsize; int32 residx; int32 nextidx; int numchunks; int startchunk; int endchunk; int32 startoffset; int32 endoffset; int totalchunks; Pointer chunk; bool isnull; int32 chunksize; int32 chcpystrt; int32 chcpyend; attrsize = attr->va_content.va_external.va_extsize; totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; if (sliceoffset >= attrsize) { sliceoffset = 0; length = 0; } if (((sliceoffset + length) > attrsize) || length < 0) length = attrsize - sliceoffset; result = (varattrib *) palloc(length + VARHDRSZ); VARATT_SIZEP(result) = length + VARHDRSZ; if (VARATT_IS_COMPRESSED(attr)) VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED; if (length == 0) return (result); /* Can save a lot of work at this point! */ startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE; endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE; numchunks = (endchunk - startchunk) + 1; startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE; endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE; /* * Open the toast relation and its index */ toastrel = heap_open(attr->va_content.va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid); /* * Setup a scan key to fetch from the index. This is either two keys * or three depending on the number of chunks. */ ScanKeyEntryInitialize(&toastkey[0], (bits16) 0, (AttrNumber) 1, (RegProcedure) F_OIDEQ, ObjectIdGetDatum(attr->va_content.va_external.va_valueid)); /* * Now dependent on the number of chunks: */ if (numchunks == 1) { ScanKeyEntryInitialize(&toastkey[1], (bits16) 0, (AttrNumber) 2, (RegProcedure) F_INT4EQ, Int32GetDatum(startchunk)); nscankeys = 2; } else { ScanKeyEntryInitialize(&toastkey[1], (bits16) 0, (AttrNumber) 2, (RegProcedure) F_INT4GE, Int32GetDatum(startchunk)); ScanKeyEntryInitialize(&toastkey[2], (bits16) 0, (AttrNumber) 2, (RegProcedure) F_INT4LE, Int32GetDatum(endchunk)); nscankeys = 3; } /* * Read the chunks by index * * The index is on (valueid, chunkidx) so they will come in order */ nextidx = startchunk; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, nscankeys, toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); chunksize = VARATT_SIZE(chunk) - VARHDRSZ; /* * Some checks on the data we've found */ if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk)) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, attr->va_content.va_external.va_valueid); if (residx < totalchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx, attr->va_content.va_external.va_valueid); } else { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx,
attr->va_content.va_external.va_valueid); } /* * Copy the data into proper place in our result */ chcpystrt = 0; chcpyend = chunksize - 1; if (residx == startchunk) chcpystrt = startoffset; if (residx == endchunk) chcpyend = endoffset; memcpy(((char *) VARATT_DATA(result)) + (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, VARATT_DATA(chunk) + chcpystrt, (chcpyend - chcpystrt) + 1); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != (endchunk + 1)) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, attr->va_content.va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx); heap_close(toastrel, AccessShareLock); return result; }
/* ---------- * toast_fetch_datum - * * Reconstruct an in memory varattrib from the chunks saved * in the toast relation * ---------- */ static varattrib * toast_fetch_datum(varattrib *attr) { Relation toastrel; Relation toastidx; ScanKeyData toastkey; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 ressize; int32 residx, nextidx; int32 numchunks; Pointer chunk; bool isnull; int32 chunksize; ressize = attr->va_content.va_external.va_extsize; numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; result = (varattrib *) palloc(ressize + VARHDRSZ); VARATT_SIZEP(result) = ressize + VARHDRSZ; if (VARATT_IS_COMPRESSED(attr)) VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED; /* * Open the toast relation and its index */ toastrel = heap_open(attr->va_content.va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid); /* * Setup a scan key to fetch from the index by va_valueid */ ScanKeyEntryInitialize(&toastkey, (bits16) 0, (AttrNumber) 1, (RegProcedure) F_OIDEQ, ObjectIdGetDatum(attr->va_content.va_external.va_valueid)); /* * Read the chunks by index * * Note that because the index is actually on (valueid, chunkidx) we will * see the chunks in chunkidx order, even though we didn't explicitly * ask for it. */ nextidx = 0; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); chunksize = VARATT_SIZE(chunk) - VARHDRSZ; /* * Some checks on the data we've found */ if (residx != nextidx) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, attr->va_content.va_external.va_valueid); if (residx < numchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx, attr->va_content.va_external.va_valueid); } else if (residx < numchunks) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx, attr->va_content.va_external.va_valueid); } else elog(ERROR, "unexpected chunk number %d for toast value %u", residx, attr->va_content.va_external.va_valueid); /* * Copy the data into proper place in our result */ memcpy(((char *) VARATT_DATA(result)) + residx * TOAST_MAX_CHUNK_SIZE, VARATT_DATA(chunk), chunksize); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != numchunks) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, attr->va_content.va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx); heap_close(toastrel, AccessShareLock); return result; }
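/*
 * Hedged sketch of where toast_fetch_datum() sits in detoasting,
 * simplified from this era's heap_tuple_untoast_attr(): an external
 * attribute is reassembled chunk by chunk; if the stored payload was
 * compressed, the fetched result still carries the compression flag and
 * would then be run through pglz decompression (omitted here).
 */
static varattrib *
detoast_external_sketch(varattrib *attr)
{
	if (VARATT_IS_EXTERNAL(attr))
	{
		varattrib  *result = toast_fetch_datum(attr);

		/* a compressed payload would be decompressed at this point */
		return result;
	}
	return attr;
}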
void inv_truncate(LargeObjectDesc *obj_desc, int len) { int32 pageno = (int32) (len / LOBLKSIZE); int off; ScanKeyData skey[2]; IndexScanDesc sd; HeapTuple oldtuple; Form_pg_largeobject olddata; struct { bytea hdr; char data[LOBLKSIZE]; /* make struct big enough */ int32 align_it; /* ensure struct is aligned well enough */ } workbuf; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; bool nulls[Natts_pg_largeobject]; bool replace[Natts_pg_largeobject]; CatalogIndexState indstate; Assert(PointerIsValid(obj_desc)); /* enforce writability because snapshot is probably wrong otherwise */ if ((obj_desc->flags & IFS_WRLOCK) == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("large object %u was not opened for writing", obj_desc->id))); open_lo_relation(); indstate = CatalogOpenIndexes(lo_heap_r); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = index_beginscan(lo_heap_r, lo_index_r, obj_desc->snapshot, 2, skey); /* * If possible, get the page the truncation point is in. The truncation * point may be beyond the end of the LO or in a hole. */ olddata = NULL; if ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL) { if (HeapTupleHasNulls(oldtuple)) /* paranoia */ elog(ERROR, "null field found in pg_largeobject"); olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); Assert(olddata->pageno >= pageno); } /* * If we found the page of the truncation point we need to truncate the * data in it. Otherwise if we're in a hole, we need to create a page to * mark the end of data. */ if (olddata != NULL && olddata->pageno == pageno) { /* First, load old data into workbuf */ bytea *datafield = &(olddata->data); /* see note at top of * file */ bool pfreeit = false; int pagelen; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((struct varlena *) datafield); pfreeit = true; } pagelen = getbytealen(datafield); Assert(pagelen <= LOBLKSIZE); memcpy(workb, VARDATA(datafield), pagelen); if (pfreeit) pfree(datafield); /* * Fill any hole */ off = len % LOBLKSIZE; if (off > pagelen) MemSet(workb + pagelen, 0, off - pagelen); /* compute length of new page */ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ); /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); memset(replace, false, sizeof(replace)); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); replace[Anum_pg_largeobject_data - 1] = true; newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r), values, nulls, replace); simple_heap_update(lo_heap_r, &newtup->t_self, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } else { /* * If the first page we found was after the truncation point, we're in * a hole that we'll fill, but we need to delete the later page. */ if (olddata != NULL && olddata->pageno > pageno) simple_heap_delete(lo_heap_r, &oldtuple->t_self); /* * Write a brand new page. 
* * Fill the hole up to the truncation point */ off = len % LOBLKSIZE; if (off > 0) MemSet(workb, 0, off); /* compute length of new page */ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ); /* * Form and insert new tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls); simple_heap_insert(lo_heap_r, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } /* * Delete any pages after the truncation point */ while ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL) { simple_heap_delete(lo_heap_r, &oldtuple->t_self); } index_endscan(sd); CatalogCloseIndexes(indstate); /* * Advance command counter so that tuple updates will be seen by later * large-object operations in this transaction. */ CommandCounterIncrement(); }
int inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes) { int nwritten = 0; int n; int off; int len; int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); ScanKeyData skey[2]; IndexScanDesc sd; HeapTuple oldtuple; Form_pg_largeobject olddata; bool neednextpage; bytea *datafield; bool pfreeit; struct { bytea hdr; char data[LOBLKSIZE]; /* make struct big enough */ int32 align_it; /* ensure struct is aligned well enough */ } workbuf; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; bool nulls[Natts_pg_largeobject]; bool replace[Natts_pg_largeobject]; CatalogIndexState indstate; Assert(PointerIsValid(obj_desc)); Assert(buf != NULL); /* enforce writability because snapshot is probably wrong otherwise */ if ((obj_desc->flags & IFS_WRLOCK) == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("large object %u was not opened for writing", obj_desc->id))); if (nbytes <= 0) return 0; open_lo_relation(); indstate = CatalogOpenIndexes(lo_heap_r); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = index_beginscan(lo_heap_r, lo_index_r, obj_desc->snapshot, 2, skey); oldtuple = NULL; olddata = NULL; neednextpage = true; while (nwritten < nbytes) { /* * If possible, get next pre-existing page of the LO. We assume the * indexscan will deliver these in order --- but there may be holes. */ if (neednextpage) { if ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL) { if (HeapTupleHasNulls(oldtuple)) /* paranoia */ elog(ERROR, "null field found in pg_largeobject"); olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); Assert(olddata->pageno >= pageno); } neednextpage = false; } /* * If we have a pre-existing page, see if it is the page we want to * write, or a later one. */ if (olddata != NULL && olddata->pageno == pageno) { /* * Update an existing page with fresh data. * * First, load old data into workbuf */ datafield = &(olddata->data); /* see note at top of file */ pfreeit = false; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((struct varlena *) datafield); pfreeit = true; } len = getbytealen(datafield); Assert(len <= LOBLKSIZE); memcpy(workb, VARDATA(datafield), len); if (pfreeit) pfree(datafield); /* * Fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > len) MemSet(workb + len, 0, off - len); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; off += n; /* compute valid length of new page */ len = (len >= off) ? len : off; SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ); /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); memset(replace, false, sizeof(replace)); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); replace[Anum_pg_largeobject_data - 1] = true; newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r), values, nulls, replace); simple_heap_update(lo_heap_r, &newtup->t_self, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); /* * We're done with this old page. */ oldtuple = NULL; olddata = NULL; neednextpage = true; } else { /* * Write a brand new page. 
* * First, fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > 0) MemSet(workb, 0, off); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; /* compute valid length of new page */ len = off + n; SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ); /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls); simple_heap_insert(lo_heap_r, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } pageno++; } index_endscan(sd); CatalogCloseIndexes(indstate); /* * Advance command counter so that my tuple updates will be seen by later * large-object operations in this transaction. */ CommandCounterIncrement(); return nwritten; }
bool CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared, Relation pg_class) { RelFileNode rnode; char *rpath; int fd; bool collides; if (pg_class) { Oid oidIndex; Relation indexrel; IndexScanDesc scan; ScanKeyData key; Assert(!IsBootstrapProcessingMode()); Assert(pg_class->rd_rel->relhasoids); /* The relcache will cache the identity of the OID index for us */ oidIndex = RelationGetOidIndex(pg_class); Assert(OidIsValid(oidIndex)); indexrel = index_open(oidIndex, AccessShareLock); ScanKeyInit(&key, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(newOid)); scan = index_beginscan(pg_class, indexrel, SnapshotDirty, 1, &key); collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection)); index_endscan(scan); index_close(indexrel, AccessShareLock); if (collides) elog(ERROR, "relfilenode %u already in use in \"pg_class\"", newOid); } /* This should match RelationInitPhysicalAddr */ rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace; rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId; rnode.relNode = newOid; /* Check for existing file of same name */ rpath = relpath(rnode); fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0); if (fd >= 0) { /* definite collision */ gp_retry_close(fd); collides = true; } else collides = false; pfree(rpath); if (collides && !relisshared) elog(ERROR, "oid %u already in use", newOid); /* make sure the OID counter is advanced past newOid */ while (GetNewObjectId() < newOid) ; return !collides; }
int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes) { int nread = 0; int n; int off; int len; int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); uint32 pageoff; ScanKeyData skey[2]; IndexScanDesc sd; HeapTuple tuple; Assert(PointerIsValid(obj_desc)); Assert(buf != NULL); if (nbytes <= 0) return 0; open_lo_relation(); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = index_beginscan(lo_heap_r, lo_index_r, SnapshotNow, 2, skey); while ((tuple = index_getnext(sd, ForwardScanDirection)) != NULL) { Form_pg_largeobject data; bytea *datafield; bool pfreeit; data = (Form_pg_largeobject) GETSTRUCT(tuple); /* * We assume the indexscan will deliver pages in order. However, * there may be missing pages if the LO contains unwritten * "holes". We want missing sections to read out as zeroes. */ pageoff = ((uint32) data->pageno) * LOBLKSIZE; if (pageoff > obj_desc->offset) { n = pageoff - obj_desc->offset; n = (n <= (nbytes - nread)) ? n : (nbytes - nread); MemSet(buf + nread, 0, n); nread += n; obj_desc->offset += n; } if (nread < nbytes) { Assert(obj_desc->offset >= pageoff); off = (int) (obj_desc->offset - pageoff); Assert(off >= 0 && off < LOBLKSIZE); datafield = &(data->data); pfreeit = false; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((varattrib *) datafield); pfreeit = true; } len = getbytealen(datafield); if (len > off) { n = len - off; n = (n <= (nbytes - nread)) ? n : (nbytes - nread); memcpy(buf + nread, VARDATA(datafield) + off, n); nread += n; obj_desc->offset += n; } if (pfreeit) pfree(datafield); } if (nread >= nbytes) break; } index_endscan(sd); return nread; }
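/*
 * Hedged caller sketch for inv_read(): slurp an entire large object into a
 * palloc'd buffer, sizing it with inv_getsize() (a static helper in the
 * same file, per the snippet above). Since inv_read() zero-fills holes,
 * the caller needs no special handling for sparse objects.
 */
static char *
read_whole_lo_sketch(LargeObjectDesc *obj_desc, int *nbytes_out)
{
	int			total = (int) inv_getsize(obj_desc);
	char	   *buf = palloc(total);
	int			got = 0;

	obj_desc->offset = 0;
	while (got < total)
	{
		int			n = inv_read(obj_desc, buf + got, total - got);

		if (n <= 0)
			break;				/* defensive; not expected to happen */
		got += n;
	}
	*nbytes_out = got;
	return buf;
}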
int inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes) { int nwritten = 0; int n; int off; int len; int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); ScanKeyData skey[2]; IndexScanDesc sd; HeapTuple oldtuple; Form_pg_largeobject olddata; bool neednextpage; bytea *datafield; bool pfreeit; struct { bytea hdr; char data[LOBLKSIZE]; } workbuf; char *workb = VARATT_DATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; char nulls[Natts_pg_largeobject]; char replace[Natts_pg_largeobject]; CatalogIndexState indstate; Assert(PointerIsValid(obj_desc)); Assert(buf != NULL); if (nbytes <= 0) return 0; open_lo_relation(); indstate = CatalogOpenIndexes(lo_heap_r); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = index_beginscan(lo_heap_r, lo_index_r, SnapshotNow, 2, skey); oldtuple = NULL; olddata = NULL; neednextpage = true; while (nwritten < nbytes) { /* * If possible, get next pre-existing page of the LO. We assume * the indexscan will deliver these in order --- but there may be * holes. */ if (neednextpage) { if ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL) { olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); Assert(olddata->pageno >= pageno); } neednextpage = false; } /* * If we have a pre-existing page, see if it is the page we want * to write, or a later one. */ if (olddata != NULL && olddata->pageno == pageno) { /* * Update an existing page with fresh data. * * First, load old data into workbuf */ datafield = &(olddata->data); pfreeit = false; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((varattrib *) datafield); pfreeit = true; } len = getbytealen(datafield); Assert(len <= LOBLKSIZE); memcpy(workb, VARDATA(datafield), len); if (pfreeit) pfree(datafield); /* * Fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > len) MemSet(workb + len, 0, off - len); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; off += n; /* compute valid length of new page */ len = (len >= off) ? len : off; VARATT_SIZEP(&workbuf.hdr) = len + VARHDRSZ; /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, ' ', sizeof(nulls)); memset(replace, ' ', sizeof(replace)); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); replace[Anum_pg_largeobject_data - 1] = 'r'; newtup = heap_modifytuple(oldtuple, lo_heap_r, values, nulls, replace); simple_heap_update(lo_heap_r, &newtup->t_self, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); /* * We're done with this old page. */ oldtuple = NULL; olddata = NULL; neednextpage = true; } else { /* * Write a brand new page. * * First, fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > 0) MemSet(workb, 0, off); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? 
n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; /* compute valid length of new page */ len = off + n; VARATT_SIZEP(&workbuf.hdr) = len + VARHDRSZ; /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, ' ', sizeof(nulls)); values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); newtup = heap_formtuple(lo_heap_r->rd_att, values, nulls); simple_heap_insert(lo_heap_r, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } pageno++; } index_endscan(sd); CatalogCloseIndexes(indstate); /* * Advance command counter so that my tuple updates will be seen by * later large-object operations in this transaction. */ CommandCounterIncrement(); return nwritten; }
/* * GetNewSequenceRelationOid * Get a sequence relation Oid and verify it is valid against * the pg_class relation by doing an index lookup. The caller * should have a suitable lock on pg_class. */ Oid GetNewSequenceRelationOid(Relation relation) { Oid newOid; Oid oidIndex; Relation indexrel; SnapshotData SnapshotDirty; IndexScanDesc scan; ScanKeyData key; bool collides; RelFileNode rnode; char *rpath; int fd; /* This should match RelationInitPhysicalAddr */ rnode.spcNode = relation->rd_rel->reltablespace ? relation->rd_rel->reltablespace : MyDatabaseTableSpace; rnode.dbNode = relation->rd_rel->relisshared ? InvalidOid : MyDatabaseId; /* We should only be using pg_class */ Assert(RelationGetRelid(relation) == RelationRelationId); /* The relcache will cache the identity of the OID index for us */ oidIndex = RelationGetOidIndex(relation); /* Use the index to find a nonconflicting OID */ indexrel = index_open(oidIndex, AccessShareLock); InitDirtySnapshot(SnapshotDirty); /* Generate new sequence relation OIDs until we find one not in the table */ do { CHECK_FOR_INTERRUPTS(); newOid = GetNewSequenceRelationObjectId(); ScanKeyInit(&key, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(newOid)); /* see notes above about using SnapshotDirty */ scan = index_beginscan(relation, indexrel, &SnapshotDirty, 1, &key); collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection)); index_endscan(scan); if (!collides) { /* Check for existing file of same name */ rnode.relNode = newOid; rpath = relpath(rnode); fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0); if (fd >= 0) { /* definite collision */ gp_retry_close(fd); collides = true; } else { /* * Here we have a little bit of a dilemma: if errno is something * other than ENOENT, should we declare a collision and loop? In * particular one might think this advisable for, say, EPERM. * However there really shouldn't be any unreadable files in a * tablespace directory, and if the EPERM is actually complaining * that we can't read the directory itself, we'd be in an infinite * loop. In practice it seems best to go ahead regardless of the * errno. If there is a colliding file we will get an smgr * failure when we attempt to create the new relation file. */ collides = false; } pfree(rpath); } /* * Also check that the OID hasn't been pre-assigned for a different * relation. * * We're a bit sloppy between OIDs and relfilenodes here; it would be * OK to use a value that's been reserved for use as a type or * relation OID here, as long as the relfilenode is free. But there's * no harm in skipping over those too, so we don't bother to * distinguish them. */ if (!collides && !IsOidAcceptable(newOid)) collides = true; } while (collides); index_close(indexrel, AccessShareLock); return newOid; }
/* * Search the relation 'rel' for tuple using the index. * * If a matching tuple is found, lock it with lockmode, fill the slot with its * contents, and return true. Return false otherwise. */ bool RelationFindReplTupleByIndex(Relation rel, Oid idxoid, LockTupleMode lockmode, TupleTableSlot *searchslot, TupleTableSlot *outslot) { HeapTuple scantuple; ScanKeyData skey[INDEX_MAX_KEYS]; IndexScanDesc scan; SnapshotData snap; TransactionId xwait; Relation idxrel; bool found; /* Open the index. */ idxrel = index_open(idxoid, RowExclusiveLock); /* Start an index scan. */ InitDirtySnapshot(snap); scan = index_beginscan(rel, idxrel, &snap, RelationGetNumberOfAttributes(idxrel), 0); /* Build scan key. */ build_replindex_scan_key(skey, rel, idxrel, searchslot); retry: found = false; index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0); /* Try to find the tuple */ if ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL) { found = true; ExecStoreTuple(scantuple, outslot, InvalidBuffer, false); ExecMaterializeSlot(outslot); xwait = TransactionIdIsValid(snap.xmin) ? snap.xmin : snap.xmax; /* * If the tuple is locked, wait for locking transaction to finish and * retry. */ if (TransactionIdIsValid(xwait)) { XactLockTableWait(xwait, NULL, NULL, XLTW_None); goto retry; } } /* Found tuple, try to lock it in the lockmode. */ if (found) { Buffer buf; HeapUpdateFailureData hufd; HTSU_Result res; HeapTupleData locktup; ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self); PushActiveSnapshot(GetLatestSnapshot()); res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false), lockmode, LockWaitBlock, false /* don't follow updates */ , &buf, &hufd); /* the tuple slot already has the buffer pinned */ ReleaseBuffer(buf); PopActiveSnapshot(); switch (res) { case HeapTupleMayBeUpdated: break; case HeapTupleUpdated: /* XXX: Improve handling here */ ereport(LOG, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("concurrent update, retrying"))); goto retry; case HeapTupleInvisible: elog(ERROR, "attempted to lock invisible tuple"); default: elog(ERROR, "unexpected heap_lock_tuple status: %u", res); break; } } index_endscan(scan); /* Don't release lock until commit. */ index_close(idxrel, NoLock); return found; }
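/*
 * Hedged sketch of a caller, loosely modeled on logical-replication update
 * handling: locate and lock the old tuple via its replica-identity index
 * before applying a change. Slot setup and the reaction to a miss are
 * illustrative assumptions.
 */
static void
apply_update_sketch(Relation rel, Oid idxoid,
					TupleTableSlot *remoteslot, TupleTableSlot *localslot)
{
	bool		found;

	found = RelationFindReplTupleByIndex(rel, idxoid, LockTupleExclusive,
										 remoteslot, localslot);
	if (!found)
	{
		elog(DEBUG1, "tuple to update not found, skipping");
		return;
	}

	/* on success, localslot holds the locked current version of the tuple */
}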
/* ---------- * toast_fetch_datum - * * Reconstruct an in memory Datum from the chunks saved * in the toast relation * ---------- */ static struct varlena * toast_fetch_datum(struct varlena *attr) { Relation toastrel; Relation toastidx; ScanKeyData toastkey; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 ressize; int32 residx, nextidx; int32 numchunks; Pointer chunk; bool isnull; int32 chunksize; void *chunkdata; ressize = ((varattrib *)attr)->va_external.va_extsize; numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; result = (varattrib *) palloc(ressize + VARHDRSZ); SET_VARSIZE(result, ressize + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(attr)) VARATT_SET_COMPRESSED(result); /* * Open the toast relation and its index */ toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock); /* * Setup a scan key to fetch from the index by va_valueid */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid)); /* * Read the chunks by index * * Note that because the index is actually on (valueid, chunkidx) we will * see the chunks in chunkidx order, even though we didn't explicitly ask * for it. */ nextidx = 0; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); if (VARATT_IS_SHORT(chunk)) { chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; chunkdata = VARDATA_SHORT(chunk); } else if (!VARATT_IS_EXTENDED(chunk)) { chunksize = VARSIZE(chunk) - VARHDRSZ; chunkdata = VARDATA(chunk); } else { elog(ERROR, "found toasted toast chunk?"); chunksize = 0; /* shut compiler up */ chunkdata = NULL; } /* * Some checks on the data we've found */ if (residx != nextidx) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, ((varattrib *)attr)->va_external.va_valueid); if (residx < numchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d of %d for toast value %u (expected %d)", chunksize, residx, numchunks-1, ((varattrib *)attr)->va_external.va_valueid, (int)TOAST_MAX_CHUNK_SIZE); } else if (residx == numchunks-1) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) elog(ERROR, "unexpected chunk size %d in final chunk %d for toast value %u (expected %d)", chunksize, residx, ((varattrib *)attr)->va_external.va_valueid, ressize - residx*(int)TOAST_MAX_CHUNK_SIZE); } else elog(ERROR, "unexpected chunk number %d for toast value %u (expected in %d..%d)", residx, ((varattrib *)attr)->va_external.va_valueid, 0, numchunks-1); /* * Copy the data into proper place in our result */ memcpy(((char *) VARDATA(result)) + residx * TOAST_MAX_CHUNK_SIZE, chunkdata, chunksize); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != numchunks) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, ((varattrib *)attr)->va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx, AccessShareLock); heap_close(toastrel, AccessShareLock); return (struct varlena *)result; }
/* ---------- * toast_fetch_datum_slice - * * Reconstruct a segment of a Datum from the chunks saved * in the toast relation * ---------- */ static struct varlena * toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 length) { Relation toastrel; Relation toastidx; ScanKeyData toastkey[3]; int nscankeys; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 attrsize; int32 residx; int32 nextidx; int numchunks; int startchunk; int endchunk; int32 startoffset; int32 endoffset; int totalchunks; Pointer chunk; bool isnull; int32 chunksize; int32 chcpystrt; int32 chcpyend; attrsize = ((varattrib *)attr)->va_external.va_extsize; totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; if (sliceoffset >= attrsize) { sliceoffset = 0; length = 0; } if (((sliceoffset + length) > attrsize) || length < 0) length = attrsize - sliceoffset; result = (varattrib *) palloc(length + VARHDRSZ); SET_VARSIZE(result, length + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(attr)) VARATT_SET_COMPRESSED(result); if (length == 0) return (struct varlena *)result; /* Can save a lot of work at this point! */ startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE; endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE; numchunks = (endchunk - startchunk) + 1; startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE; endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE; /* * Open the toast relation and its index */ toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock); /* * Setup a scan key to fetch from the index. This is either two keys or * three depending on the number of chunks. */ ScanKeyInit(&toastkey[0], (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid)); /* * Use equality condition for one chunk, a range condition otherwise: */ if (numchunks == 1) { ScanKeyInit(&toastkey[1], (AttrNumber) 2, BTEqualStrategyNumber, F_INT4EQ, Int32GetDatum(startchunk)); nscankeys = 2; } else { ScanKeyInit(&toastkey[1], (AttrNumber) 2, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(startchunk)); ScanKeyInit(&toastkey[2], (AttrNumber) 2, BTLessEqualStrategyNumber, F_INT4LE, Int32GetDatum(endchunk)); nscankeys = 3; } /* * Read the chunks by index * * The index is on (valueid, chunkidx) so they will come in order */ nextidx = startchunk; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, nscankeys, toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); if (VARATT_IS_SHORT((varattrib *)chunk)) chunksize = VARSIZE_SHORT((varattrib *)chunk) - VARHDRSZ_SHORT; else if (!VARATT_IS_EXTENDED((varattrib *)chunk)) chunksize = VARSIZE((varattrib *)chunk) - VARHDRSZ; else { elog(ERROR, "found toasted toast chunk?"); chunksize = 0; /* shut compiler up */ } /* * Some checks on the data we've found */ if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk)) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, ((varattrib *)attr)->va_external.va_valueid); if (residx < totalchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d of %d for toast value %u when fetching slice (expected %d)", chunksize, residx, totalchunks-1, ((varattrib *)attr)->va_external.va_valueid, (int)TOAST_MAX_CHUNK_SIZE); } else if (residx == totalchunks-1) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize) elog(ERROR, "unexpected chunk size %d in final chunk %d for toast value %u when fetching slice (expected %d)", chunksize, residx, ((varattrib *)attr)->va_external.va_valueid, attrsize - residx * (int)TOAST_MAX_CHUNK_SIZE); } else { elog(ERROR, "unexpected chunk"); } /* * Copy the data into proper place in our result */ chcpystrt = 0; chcpyend = chunksize - 1; if (residx == startchunk) chcpystrt = startoffset; if (residx == endchunk) chcpyend = endoffset; memcpy(((char *) VARDATA(result)) + (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, VARDATA((varattrib *)chunk) + chcpystrt, (chcpyend - chcpystrt) + 1); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != (endchunk + 1)) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, ((varattrib *)attr)->va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx, AccessShareLock); heap_close(toastrel, AccessShareLock); return (struct varlena *)result; }
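/*
 * Hedged sketch of the public face of toast_fetch_datum_slice(), following
 * the shape of this era's heap_tuple_untoast_attr_slice(): slicing straight
 * out of the toast table only works for uncompressed external values, so a
 * compressed datum must fall back to detoasting the whole value first (the
 * fallback is elided here).
 */
static struct varlena *
untoast_slice_sketch(struct varlena *attr, int32 sliceoffset, int32 length)
{
	if (VARATT_IS_EXTERNAL(attr) && !VARATT_EXTERNAL_IS_COMPRESSED(attr))
		return toast_fetch_datum_slice(attr, sliceoffset, length);

	/* fallback: detoast fully, then copy out the slice (not shown) */
	elog(ERROR, "untoast_slice_sketch: fallback path not implemented");
	return NULL;				/* keep compiler quiet */
}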