/*
 * AppendOnlyVisimapStore_GetNextTuple
 *
 * Advances a visimap store index scan by one step in the requested
 * direction and hands back whatever index_getnext() produced.
 *
 * The tuple is returned as-is; decoding it correctly is the caller's
 * responsibility.
 */
static HeapTuple
AppendOnlyVisimapStore_GetNextTuple(
	AppendOnlyVisimapStore *visiMapStore,
	IndexScanDesc indexScan,
	ScanDirection scanDirection)
{
	HeapTuple	nextTuple;

	/* Sanity-check the store and the scan before touching the index. */
	Assert(visiMapStore);
	Assert(RelationIsValid(visiMapStore->visimapRelation));
	Assert(RelationIsValid(visiMapStore->visimapIndex));
	Assert(indexScan);

	nextTuple = index_getnext(indexScan, scanDirection);

	return nextTuple;
}
/*
 * systable_getnext_ordered
 *		Fetch the next tuple of an ordered catalog scan, moving in the
 *		given direction.
 *
 * The scan must be index-driven (sysscan->irel is asserted).  The result of
 * index_getnext() is passed through unchanged, except that a tuple flagged
 * for recheck raises an ERROR because lossy index conditions are not
 * supported for catalog scans.
 */
HeapTuple
systable_getnext_ordered(SysScanDesc sysscan, ScanDirection direction)
{
	HeapTuple	nextTuple;

	Assert(sysscan->irel);

	nextTuple = index_getnext(sysscan->iscan, direction);

	/* Nothing fetched: hand the empty result straight back. */
	if (!nextTuple)
		return nextTuple;

	/* See notes in systable_getnext */
	if (sysscan->iscan->xs_recheck)
		elog(ERROR, "system catalog scans with lossy index conditions are not implemented");

	return nextTuple;
}
/* ----------
 * toast_delete_datum -
 *
 *	Remove every chunk of one externally stored value from its toast
 *	relation.  A value that is not external is left untouched.
 *
 *	rel is unused; value is the datum whose external storage is deleted.
 * ----------
 */
static void
toast_delete_datum(Relation rel __attribute__((unused)), Datum value)
{
	varattrib  *extAttr = (varattrib *) DatumGetPointer(value);
	Relation	chunkRel;
	Relation	chunkIdx;
	ScanKeyData valueKey;
	IndexScanDesc chunkScan;
	HeapTuple	chunkTup;

	/* Not stored out of line, so there is nothing to delete. */
	if (!VARATT_IS_EXTERNAL(extAttr))
		return;

	/* Open the toast relation first, then the index it names. */
	chunkRel = heap_open(extAttr->va_external.va_toastrelid, RowExclusiveLock);
	chunkIdx = index_open(chunkRel->rd_rel->reltoastidxid, RowExclusiveLock);

	/*
	 * Match on the first index column being equal to va_valueid.  The order
	 * in which the chunks come back does not matter for deletion.
	 */
	ScanKeyInit(&valueKey,
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(extAttr->va_external.va_valueid));

	/* Visit every matching chunk through the index and delete it. */
	chunkScan = index_beginscan(chunkRel, chunkIdx,
								SnapshotToast, 1, &valueKey);
	for (;;)
	{
		chunkTup = index_getnext(chunkScan, ForwardScanDirection);
		if (chunkTup == NULL)
			break;

		simple_heap_delete(chunkRel, &chunkTup->t_self);
	}

	/* Finish the scan, then close the index and the toast relation. */
	index_endscan(chunkScan);
	index_close(chunkIdx, RowExclusiveLock);
	heap_close(chunkRel, RowExclusiveLock);
}
/* ----------
 * toast_delete_datum -
 *
 *	Remove every chunk of one externally stored value from its toast
 *	relation.  A value that is not external is left untouched.
 *
 *	rel is not referenced here; value is the datum whose external
 *	storage is deleted.
 * ----------
 */
static void
toast_delete_datum(Relation rel, Datum value)
{
	varattrib  *extAttr = (varattrib *) DatumGetPointer(value);
	Relation	chunkRel;
	Relation	chunkIdx;
	ScanKeyData valueKey;
	IndexScanDesc chunkScan;
	HeapTuple	chunkTup;

	/* Not stored out of line, so there is nothing to delete. */
	if (!VARATT_IS_EXTERNAL(extAttr))
		return;

	/* Open the toast relation first, then the index it names. */
	chunkRel = heap_open(extAttr->va_content.va_external.va_toastrelid,
						 RowExclusiveLock);
	chunkIdx = index_open(chunkRel->rd_rel->reltoastidxid);

	/*
	 * Match on the first index column being equal to va_valueid.  The order
	 * in which the chunks come back does not matter for deletion.
	 */
	ScanKeyEntryInitialize(&valueKey,
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
						   ObjectIdGetDatum(extAttr->va_content.va_external.va_valueid));

	/* Visit every matching chunk through the index and delete it. */
	chunkScan = index_beginscan(chunkRel, chunkIdx,
								SnapshotToast, 1, &valueKey);
	for (;;)
	{
		chunkTup = index_getnext(chunkScan, ForwardScanDirection);
		if (chunkTup == NULL)
			break;

		simple_heap_delete(chunkRel, &chunkTup->t_self);
	}

	/* Finish the scan, then close the index and the toast relation. */
	index_endscan(chunkScan);
	index_close(chunkIdx);
	heap_close(chunkRel, RowExclusiveLock);
}
/*
 * GetNewOidWithIndex
 *		Guts of GetNewOid: use the supplied index
 *
 * This is exported separately because there are cases where we want to use
 * an index that will not be recognized by RelationGetOidIndex: TOAST tables
 * and pg_largeobject have indexes that are usable, but have multiple columns
 * and are on ordinary columns rather than a true OID column.  This code
 * will work anyway, so long as the OID is the index's first column.
 *
 * Candidate OIDs are drawn from GetNewObjectId() until an equality probe of
 * the index on its first column finds no tuple.  For relations in the system
 * namespace the chosen OID is additionally reported at a debug level that
 * depends on Gp_role.
 *
 * Caller must have a suitable lock on the relation.
 */
Oid
GetNewOidWithIndex(Relation relation, Relation indexrel)
{
	Oid			candidate;
	bool		inUse;

	/* Keep drawing OIDs until the index shows no tuple with that value. */
	for (;;)
	{
		IndexScanDesc probe;
		ScanKeyData oidKey;

		CHECK_FOR_INTERRUPTS();

		candidate = GetNewObjectId();

		ScanKeyInit(&oidKey,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(candidate));

		/* see notes above about using SnapshotDirty */
		probe = index_beginscan(relation, indexrel,
								SnapshotDirty, 1, &oidKey);
		inUse = HeapTupleIsValid(index_getnext(probe, ForwardScanDirection));
		index_endscan(probe);

		if (!inUse)
			break;
	}

	/* Only allocations for system-namespace relations are logged. */
	if (!IsSystemNamespace(RelationGetNamespace(relation)))
		return candidate;

	if (Gp_role == GP_ROLE_EXECUTE)
	{
		/* These three relations are reported at the quieter DEBUG4 level. */
		bool		quietRelation =
			(relation->rd_id == 2604 /* pg_attrdef */ ||
			 relation->rd_id == 2606 /* pg_constraint */ ||
			 relation->rd_id == 2615 /* pg_namespace */);

		if (quietRelation)
			elog(DEBUG4,"Allocating Oid %u with index on relid %u %s in EXECUTE mode",candidate,relation->rd_id, RelationGetRelationName(relation));
		else
			elog(DEBUG1,"Allocating Oid %u with index on relid %u %s in EXECUTE mode",candidate,relation->rd_id, RelationGetRelationName(relation));
	}

	if (Gp_role == GP_ROLE_DISPATCH)
	{
		elog(DEBUG5,"Allocating Oid %u with index on relid %u %s in DISPATCH mode",candidate,relation->rd_id, RelationGetRelationName(relation));
	}

	return candidate;
}
/*
 * AppendOnlyBlockDirectory_DeleteSegmentFile
 *
 * Deletes all block directory entries for the given segment file of an
 * append-only relation.
 *
 * The block directory index is scanned with a single equality key on its
 * first column (segno), using the supplied snapshot, and every tuple found
 * is deleted from the block directory relation.  columnGroupNo is not
 * consulted by this function.
 */
void
AppendOnlyBlockDirectory_DeleteSegmentFile(
	AppendOnlyEntry *aoEntry,
	Snapshot snapshot,
	int segno,
	int columnGroupNo)
{
	Relation	dirRel;
	Relation	dirIdx;
	ScanKeyData segnoKey;
	IndexScanDesc dirScan;
	HeapTuple	dirTuple;

	Assert(aoEntry);
	Assert(OidIsValid(aoEntry->blkdirrelid));
	Assert(OidIsValid(aoEntry->blkdiridxid));

	dirRel = heap_open(aoEntry->blkdirrelid, RowExclusiveLock);
	dirIdx = index_open(aoEntry->blkdiridxid, RowExclusiveLock);

	ScanKeyInit(&segnoKey,
				1,				/* segno */
				BTEqualStrategyNumber,
				F_INT4EQ,
				Int32GetDatum(segno));

	dirScan = index_beginscan(dirRel, dirIdx, snapshot, 1, &segnoKey);

	/* Delete each entry the scan returns until it is exhausted. */
	for (;;)
	{
		dirTuple = index_getnext(dirScan, ForwardScanDirection);
		if (dirTuple == NULL)
			break;

		simple_heap_delete(dirRel, &dirTuple->t_self);
	}

	index_endscan(dirScan);
	index_close(dirIdx, RowExclusiveLock);
	heap_close(dirRel, RowExclusiveLock);
}
/*
 * rebuildheap
 *		Copy the tuples of the old heap into the new heap, visiting them in
 *		the order delivered by a forward scan of the old index.
 *
 * OIDNewHeap	- OID of the heap that receives the tuples
 * OIDOldHeap	- OID of the heap the tuples are fetched from
 * OIDOldIndex	- OID of the index on the old heap that drives the scan
 *
 * All three relations are opened here and closed again before returning.
 */
void
rebuildheap(Oid OIDNewHeap, Oid OIDOldHeap, Oid OIDOldIndex)
{
	Relation	LocalNewHeap,
				LocalOldHeap,
				LocalOldIndex;
	IndexScanDesc ScanDesc;
	RetrieveIndexResult ScanResult;
	ItemPointer HeapTid;
	HeapTuple	LocalHeapTuple;
	Buffer		LocalBuffer;

	/*
	 * Open the relations I need. Scan through the OldHeap on the OldIndex and
	 * insert each tuple into the NewHeap.
	 */
	LocalNewHeap = (Relation) heap_open(OIDNewHeap);
	LocalOldHeap = (Relation) heap_open(OIDOldHeap);
	LocalOldIndex = (Relation) index_open(OIDOldIndex);

	ScanDesc = index_beginscan(LocalOldIndex, false, 0, (ScanKey) NULL);

	while ((ScanResult = index_getnext(ScanDesc, ForwardScanDirection)) != NULL)
	{
		/* Fetch the heap tuple the index entry points at and copy it over. */
		HeapTid = &ScanResult->heap_iptr;
		LocalHeapTuple = heap_fetch(LocalOldHeap, 0, HeapTid, &LocalBuffer);
		(void) heap_insert(LocalNewHeap, LocalHeapTuple);

		/* The scan result is freed and the buffer released per tuple. */
		pfree(ScanResult);
		ReleaseBuffer(LocalBuffer);
	}

	/*
	 * End the scan before closing the index it runs on; without this the
	 * scan started by index_beginscan() above is never shut down.
	 */
	index_endscan(ScanDesc);

	index_close(LocalOldIndex);
	heap_close(LocalOldHeap);
	heap_close(LocalNewHeap);
}
/*
 * systable_getnext --- get next tuple in a heap-or-index scan
 *
 * Returns NULL if no more tuples available.
 *
 * Note that returned tuple is a reference to data in a disk buffer;
 * it must not be modified, and should be presumed inaccessible after
 * next getnext() or endscan() call.
 *
 * The scan always moves forward.  When sysscan->irel is set the index scan
 * is used, otherwise the heap scan is.
 */
HeapTuple
systable_getnext(SysScanDesc sysscan)
{
	HeapTuple	nextTuple;

	/* No index attached: this is a plain heap scan. */
	if (!sysscan->irel)
		return heap_getnext(sysscan->scan, ForwardScanDirection);

	nextTuple = index_getnext(sysscan->iscan, ForwardScanDirection);

	/*
	 * We currently don't need to support lossy index operators for any
	 * system catalog scan.  It could be done here, using the scan keys to
	 * drive the operator calls, if we arranged to save the heap attnums
	 * during systable_beginscan(); this is practical because we still
	 * wouldn't need to support indexes on expressions.
	 */
	if (nextTuple && sysscan->iscan->xs_recheck)
		elog(ERROR, "system catalog scans with lossy index conditions are not implemented");

	return nextTuple;
}
/* ----------
 * toast_fetch_datum_slice -
 *
 *	Reconstruct a segment of a varattrib from the chunks saved
 *	in the toast relation
 *
 *	attr		- external varattrib naming the toast relation and value
 *	sliceoffset - byte offset of the first byte wanted
 *	length		- number of bytes wanted; a negative value, or one that
 *				  would run past the end of the value, means "through the
 *				  end of the value"
 *
 *	Returns a freshly palloc'd varattrib holding the requested bytes.  An
 *	offset at or beyond the end of the value yields an empty result.
 *	Raises ERROR when the chunks found do not match what the stored size
 *	implies (wrong chunk number, wrong chunk size, missing chunk).
 * ----------
 */
static varattrib *
toast_fetch_datum_slice(varattrib *attr, int32 sliceoffset, int32 length)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey[3];
	int			nscankeys;
	IndexScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	varattrib  *result;
	int32		attrsize;
	int32		residx;
	int32		nextidx;
	int			numchunks;
	int			startchunk;
	int			endchunk;
	int32		startoffset;
	int32		endoffset;
	int			totalchunks;
	Pointer		chunk;
	bool		isnull;
	int32		chunksize;
	int32		chcpystrt;
	int32		chcpyend;

	attrsize = attr->va_content.va_external.va_extsize;
	totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	/* A start position at or past the end selects nothing. */
	if (sliceoffset >= attrsize)
	{
		sliceoffset = 0;
		length = 0;
	}

	/*
	 * Clamp the request to the bytes actually available.  The bound is
	 * tested as "length > attrsize - sliceoffset" rather than
	 * "sliceoffset + length > attrsize": the sum overflows int32 for a very
	 * large length, which would skip the clamp and leave a bogus size for
	 * palloc below.  The subtraction cannot overflow for a non-negative
	 * sliceoffset, which is below attrsize at this point.
	 */
	if (length < 0 || length > attrsize - sliceoffset)
		length = attrsize - sliceoffset;

	result = (varattrib *) palloc(length + VARHDRSZ);
	VARATT_SIZEP(result) = length + VARHDRSZ;

	if (VARATT_IS_COMPRESSED(attr))
		VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED;

	if (length == 0)
		return result;			/* Can save a lot of work at this point! */

	/* Chunk numbers and in-chunk byte offsets bounding the slice. */
	startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
	endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
	numchunks = (endchunk - startchunk) + 1;

	startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
	endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(attr->va_content.va_external.va_toastrelid,
						 AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid);

	/*
	 * Setup a scan key to fetch from the index. This is either two keys
	 * or three depending on the number of chunks.
	 */
	ScanKeyEntryInitialize(&toastkey[0],
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
						   ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

	/*
	 * Now dependent on number of chunks: equality on the chunk number for a
	 * single chunk, a >= / <= range otherwise.
	 */
	if (numchunks == 1)
	{
		ScanKeyEntryInitialize(&toastkey[1],
							   (bits16) 0,
							   (AttrNumber) 2,
							   (RegProcedure) F_INT4EQ,
							   Int32GetDatum(startchunk));
		nscankeys = 2;
	}
	else
	{
		ScanKeyEntryInitialize(&toastkey[1],
							   (bits16) 0,
							   (AttrNumber) 2,
							   (RegProcedure) F_INT4GE,
							   Int32GetDatum(startchunk));
		ScanKeyEntryInitialize(&toastkey[2],
							   (bits16) 0,
							   (AttrNumber) 2,
							   (RegProcedure) F_INT4LE,
							   Int32GetDatum(endchunk));
		nscankeys = 3;
	}

	/*
	 * Read the chunks by index
	 *
	 * The index is on (valueid, chunkidx) so they will come in order
	 */
	nextidx = startchunk;

	toastscan = index_beginscan(toastrel, toastidx, SnapshotToast,
								nscankeys, toastkey);
	while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);
		chunksize = VARATT_SIZE(chunk) - VARHDRSZ;

		/*
		 * Some checks on the data we've found: chunks must arrive in
		 * sequence within the requested range, every chunk but the value's
		 * last must be full-sized, and the last must end exactly at attrsize.
		 */
		if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 residx, nextidx,
				 attr->va_content.va_external.va_valueid);
		if (residx < totalchunks - 1)
		{
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 chunksize, residx,
					 attr->va_content.va_external.va_valueid);
		}
		else
		{
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 chunksize, residx,
					 attr->va_content.va_external.va_valueid);
		}

		/*
		 * Copy the data into proper place in our result.  Only the first
		 * and last chunks of the slice may be copied partially.
		 */
		chcpystrt = 0;
		chcpyend = chunksize - 1;
		if (residx == startchunk)
			chcpystrt = startoffset;
		if (residx == endchunk)
			chcpyend = endoffset;

		memcpy(((char *) VARATT_DATA(result)) +
			   (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
			   VARATT_DATA(chunk) + chcpystrt,
			   (chcpyend - chcpystrt) + 1);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != (endchunk + 1))
		elog(ERROR, "missing chunk number %d for toast value %u",
			 nextidx,
			 attr->va_content.va_external.va_valueid);

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx);
	heap_close(toastrel, AccessShareLock);

	return result;
}
/* ----------
 * toast_fetch_datum -
 *
 *	Rebuild a complete in-memory varattrib from the chunks stored for it
 *	in the toast relation.
 *
 *	attr is the external varattrib naming the toast relation, the value id
 *	and the stored size.  The result is freshly palloc'd.  Raises ERROR if
 *	the chunks found are out of sequence, have an unexpected size, or if
 *	some are missing.
 * ----------
 */
static varattrib *
toast_fetch_datum(varattrib *attr)
{
	Relation	chunkRel;
	Relation	chunkIdx;
	ScanKeyData valueKey;
	IndexScanDesc chunkScan;
	HeapTuple	chunkTup;
	TupleDesc	chunkDesc;
	varattrib  *result;
	int32		totalSize;
	int32		expectedChunks;
	int32		expectedSeq;

	totalSize = attr->va_content.va_external.va_extsize;
	expectedChunks = ((totalSize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	/* Allocate the result up front; the compressed flag carries over. */
	result = (varattrib *) palloc(totalSize + VARHDRSZ);
	VARATT_SIZEP(result) = totalSize + VARHDRSZ;
	if (VARATT_IS_COMPRESSED(attr))
		VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED;

	/* Open the toast relation, then the index it names. */
	chunkRel = heap_open(attr->va_content.va_external.va_toastrelid,
						 AccessShareLock);
	chunkDesc = chunkRel->rd_att;
	chunkIdx = index_open(chunkRel->rd_rel->reltoastidxid);

	/* One scan key: first index column equals va_valueid. */
	ScanKeyEntryInitialize(&valueKey,
						   (bits16) 0,
						   (AttrNumber) 1,
						   (RegProcedure) F_OIDEQ,
						   ObjectIdGetDatum(attr->va_content.va_external.va_valueid));

	/*
	 * Read the chunks by index
	 *
	 * Note that because the index is actually on (valueid, chunkidx) we will
	 * see the chunks in chunkidx order, even though we didn't explicitly
	 * ask for it.
	 */
	expectedSeq = 0;

	chunkScan = index_beginscan(chunkRel, chunkIdx,
								SnapshotToast, 1, &valueKey);
	for (;;)
	{
		int32		seq;
		Pointer		data;
		int32		dataLen;
		bool		isnull;

		chunkTup = index_getnext(chunkScan, ForwardScanDirection);
		if (chunkTup == NULL)
			break;

		/* Column 2 is the chunk sequence number, column 3 the chunk data. */
		seq = DatumGetInt32(heap_getattr(chunkTup, 2, chunkDesc, &isnull));
		Assert(!isnull);
		data = DatumGetPointer(heap_getattr(chunkTup, 3, chunkDesc, &isnull));
		Assert(!isnull);
		dataLen = VARATT_SIZE(data) - VARHDRSZ;

		/* Chunks must arrive strictly in sequence. */
		if (seq != expectedSeq)
		{
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 seq, expectedSeq,
				 attr->va_content.va_external.va_valueid);
		}

		/*
		 * Size checks: a chunk number beyond the last is an error, every
		 * chunk before the last must be full-sized, and the last chunk must
		 * end exactly at the stored size.
		 */
		if (seq >= expectedChunks)
		{
			elog(ERROR, "unexpected chunk number %d for toast value %u",
				 seq,
				 attr->va_content.va_external.va_valueid);
		}
		else if (seq < expectedChunks - 1)
		{
			if (dataLen != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 dataLen, seq,
					 attr->va_content.va_external.va_valueid);
		}
		else
		{
			if ((seq * TOAST_MAX_CHUNK_SIZE + dataLen) != totalSize)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u",
					 dataLen, seq,
					 attr->va_content.va_external.va_valueid);
		}

		/* Drop the chunk's bytes into its slot within the result. */
		memcpy(((char *) VARATT_DATA(result)) + seq * TOAST_MAX_CHUNK_SIZE,
			   VARATT_DATA(data),
			   dataLen);

		expectedSeq++;
	}

	/* Every expected chunk must have been seen. */
	if (expectedSeq != expectedChunks)
	{
		elog(ERROR, "missing chunk number %d for toast value %u",
			 expectedSeq,
			 attr->va_content.va_external.va_valueid);
	}

	/* Finish the scan, then close the index and the toast relation. */
	index_endscan(chunkScan);
	index_close(chunkIdx);
	heap_close(chunkRel, AccessShareLock);

	return result;
}
/*
 * load_last_minipage
 *
 * Search through the block directory btree to find the last row that
 * contains the last minipage, and load it through extract_minipage().
 *
 * The three pre-built scan keys are filled with the current segment file
 * number, columnGroupNo and lastSequence (a lastSequence of 0 is treated
 * as 1), and the first tuple of a backward index scan is taken.  The work
 * is done in blockDirectory->memoryContext; the caller's context is
 * restored before returning.
 */
static void
load_last_minipage(AppendOnlyBlockDirectory *blockDirectory,
				   int64 lastSequence,
				   int columnGroupNo)
{
	Relation	dirRel = blockDirectory->blkdirRel;
	Relation	dirIdx = blockDirectory->blkdirIdx;
	int			nKeys = blockDirectory->numScanKeys;
	ScanKey		keys = blockDirectory->scanKeys;
	TupleDesc	dirTupDesc;
	IndexScanDesc dirScan;
	HeapTuple	lastTuple;
	MemoryContext callerCxt;

#ifdef USE_ASSERT_CHECKING
	StrategyNumber *strategyNumbers = blockDirectory->strategyNumbers;
#endif   /* USE_ASSERT_CHECKING */

	/* The block directory must be fully set up before we get here. */
	Assert(blockDirectory->aoRel != NULL);
	Assert(blockDirectory->blkdirRel != NULL);
	Assert(blockDirectory->blkdirIdx != NULL);
	Assert(nKeys == 3);
	Assert(blockDirectory->currentSegmentFileInfo != NULL);
	Assert(keys != NULL);
	Assert(strategyNumbers != NULL);

	callerCxt = MemoryContextSwitchTo(blockDirectory->memoryContext);

	dirTupDesc = RelationGetDescr(dirRel);

	/* Setup the scan keys for the scan. */
	if (lastSequence == 0)
		lastSequence = 1;

	keys[0].sk_argument =
		Int32GetDatum(blockDirectory->currentSegmentFileNum);
	keys[1].sk_argument = Int32GetDatum(columnGroupNo);
	keys[2].sk_argument = Int64GetDatum(lastSequence);

	/*
	 * Search the btree to find the entry in the block directory
	 * that contains the last minipage.
	 */
	dirScan = index_beginscan(dirRel, dirIdx,
							  blockDirectory->appendOnlyMetaDataSnapshot,
							  nKeys, keys);

	lastTuple = index_getnext(dirScan, BackwardScanDirection);
	if (lastTuple != NULL)
		extract_minipage(blockDirectory, lastTuple, dirTupDesc, columnGroupNo);

	index_endscan(dirScan);

	MemoryContextSwitchTo(callerCxt);

	ereportif(Debug_appendonly_print_blockdirectory, LOG,
			  (errmsg("Append-only block directory load last minipage: "
					  "(columnGroupNo, lastSequence, nEntries) = (%d, " INT64_FORMAT ", %u)",
					  columnGroupNo, lastSequence,
					  blockDirectory->minipages[columnGroupNo].numMinipageEntries)));
}
/*
 * CheckNewRelFileNodeIsOk
 *		Check whether newOid can be used as a relfilenode.
 *
 * newOid		 - the OID proposed as relfilenode
 * reltablespace - tablespace OID; zero selects MyDatabaseTableSpace
 * relisshared	 - true for a shared relation (dbNode becomes InvalidOid)
 * pg_class		 - open pg_class relation, or NULL to skip the catalog probe
 *
 * When pg_class is supplied, its OID index is probed for newOid and an
 * ERROR is raised if a tuple is found.  After that the file path the
 * relfilenode maps to is opened read-only; an existing file is a collision,
 * which raises an ERROR unless the relation is shared.
 *
 * Before returning, the OID counter is advanced until GetNewObjectId()
 * yields a value that is not below newOid.
 *
 * Returns true when no collision was found, false otherwise (reachable only
 * for a shared relation whose file already exists).
 */
bool
CheckNewRelFileNodeIsOk(Oid newOid, Oid reltablespace, bool relisshared,
						Relation pg_class)
{
	RelFileNode rnode;
	char	   *rpath;
	int			fd;
	bool		collides;

	if (pg_class)
	{
		Oid			oidIndex;
		Relation	indexrel;
		IndexScanDesc scan;
		ScanKeyData key;

		Assert(!IsBootstrapProcessingMode());
		Assert(pg_class->rd_rel->relhasoids);

		/* The relcache will cache the identity of the OID index for us */
		oidIndex = RelationGetOidIndex(pg_class);

		Assert(OidIsValid(oidIndex));

		indexrel = index_open(oidIndex, AccessShareLock);

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		scan = index_beginscan(pg_class, indexrel, SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);

		index_close(indexrel, AccessShareLock);

		/* Oid is unsigned, so it is printed with %u rather than %d. */
		if (collides)
			elog(ERROR, "relfilenode %u already in use in \"pg_class\"",
				 newOid);
	}

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = reltablespace ? reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relisshared ? InvalidOid : MyDatabaseId;
	rnode.relNode = newOid;

	/* Check for existing file of same name */
	rpath = relpath(rnode);
	fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

	if (fd >= 0)
	{
		/* definite collision */
		gp_retry_close(fd);
		collides = true;
	}
	else
		collides = false;

	pfree(rpath);

	/* Oid is unsigned, so it is printed with %u rather than %d. */
	if (collides && !relisshared)
		elog(ERROR, "oid %u already in use", newOid);

	/* Advance the OID counter until it hands out a value >= newOid. */
	while (GetNewObjectId() < newOid)
		;

	return !collides;
}
/* * AppendOnlyBlockDirectory_GetEntry * * Find a directory entry for the given AOTupleId in the block directory. * If such an entry is found, return true. Otherwise, return false. * * The range for directoryEntry is assigned accordingly in this function. * * The block directory for the appendonly table should exist before calling * this function. */ bool AppendOnlyBlockDirectory_GetEntry( AppendOnlyBlockDirectory *blockDirectory, AOTupleId *aoTupleId, int columnGroupNo, AppendOnlyBlockDirectoryEntry *directoryEntry) { int segmentFileNum = AOTupleIdGet_segmentFileNum(aoTupleId); int64 rowNum = AOTupleIdGet_rowNum(aoTupleId); int i; Relation blkdirRel = blockDirectory->blkdirRel; Relation blkdirIdx = blockDirectory->blkdirIdx; int numScanKeys = blockDirectory->numScanKeys; ScanKey scanKeys = blockDirectory->scanKeys; TupleDesc heapTupleDesc; FileSegInfo *fsInfo = NULL; IndexScanDesc idxScanDesc; HeapTuple tuple = NULL; MinipagePerColumnGroup *minipageInfo = &blockDirectory->minipages[columnGroupNo]; int entry_no = -1; int tmpGroupNo; if (blkdirRel == NULL || blkdirIdx == NULL) { Assert(RelationIsValid(blockDirectory->aoRel)); ereport(ERROR, (errcode(ERRCODE_GP_INTERNAL_ERROR), errmsg("Block directory for append-only relation '%s' does not exist", RelationGetRelationName(blockDirectory->aoRel)))); return false; } ereportif(Debug_appendonly_print_blockdirectory, LOG, (errmsg("Append-only block directory get entry: " "(columnGroupNo, segmentFileNum, rowNum) = " "(%d, %d, " INT64_FORMAT ")", columnGroupNo, segmentFileNum, rowNum))); /* * If the segment file number is the same as * blockDirectory->currentSegmentFileNum, the in-memory minipage * may contain such an entry. We search the in-memory minipage * first. If such an entry can not be found, we search for the * appropriate minipage by using the block directory btree index. 
*/ if (segmentFileNum == blockDirectory->currentSegmentFileNum && minipageInfo->numMinipageEntries > 0) { Assert(blockDirectory->currentSegmentFileInfo != NULL); MinipageEntry *firstentry = &minipageInfo->minipage->entry[0]; if (rowNum >= firstentry->firstRowNum) { /* * Check if the existing minipage contains the requested * rowNum. If so, just get it. */ entry_no = find_minipage_entry(minipageInfo->minipage, minipageInfo->numMinipageEntries, rowNum); if (entry_no != -1) { return set_directoryentry_range(blockDirectory, columnGroupNo, entry_no, directoryEntry); } /* * The given rowNum may point to a tuple that does not exist * in the AO table any more, either because of cancellation of * an insert, or due to crashes during an insert. If this is * the case, rowNum is smaller than the highest entry in * the in-memory minipage entry. */ else { MinipageEntry *entry = &minipageInfo->minipage->entry[minipageInfo->numMinipageEntries - 1]; if (rowNum < entry->firstRowNum + entry->rowCount - 1) return false; } } } for (i = 0; i < blockDirectory->totalSegfiles; i++) { fsInfo = blockDirectory->segmentFileInfo[i]; if (!blockDirectory->isAOCol && segmentFileNum == fsInfo->segno) break; else if (blockDirectory->isAOCol && segmentFileNum == ((AOCSFileSegInfo*)fsInfo)->segno) break; } Assert(fsInfo != NULL); /* * Search the btree index to find the minipage that contains * the rowNum. We find the minipages for all column groups, since * currently we will need to access all columns at the same time. */ heapTupleDesc = RelationGetDescr(blkdirRel); Assert(numScanKeys == 3); for (tmpGroupNo = 0; tmpGroupNo < blockDirectory->numColumnGroups; tmpGroupNo++) { if (blockDirectory->proj && !blockDirectory->proj[tmpGroupNo]) { /* Ignore columns that are not projected. */ continue; } /* Setup the scan keys for the scan. 
*/ Assert(scanKeys != NULL); scanKeys[0].sk_argument = Int32GetDatum(segmentFileNum); scanKeys[1].sk_argument = Int32GetDatum(tmpGroupNo); scanKeys[2].sk_argument = Int64GetDatum(rowNum); idxScanDesc = index_beginscan(blkdirRel, blkdirIdx, blockDirectory->appendOnlyMetaDataSnapshot, numScanKeys, scanKeys); tuple = index_getnext(idxScanDesc, BackwardScanDirection); if (tuple != NULL) { /* * MPP-17061: we need to update currentSegmentFileNum * & currentSegmentFileInfo at the same time when we * load the minipage for the block directory entry we * found, otherwise we would risk having inconsistency * between currentSegmentFileNum/currentSegmentFileInfo * and minipage contents, which would cause wrong block * header offset being returned in following block * directory entry look up. */ blockDirectory->currentSegmentFileNum = segmentFileNum; blockDirectory->currentSegmentFileInfo = fsInfo; extract_minipage(blockDirectory, tuple, heapTupleDesc, tmpGroupNo); } else { /* MPP-17061: index look up failed, row is invisible */ index_endscan(idxScanDesc); return false; } index_endscan(idxScanDesc); } { MinipagePerColumnGroup *minipageInfo; minipageInfo = &blockDirectory->minipages[columnGroupNo]; /* * Perform a binary search over the minipage to find * the entry about the AO block. */ entry_no = find_minipage_entry(minipageInfo->minipage, minipageInfo->numMinipageEntries, rowNum); /* If there are no entries, return false. */ if (entry_no == -1 && minipageInfo->numMinipageEntries == 0) return false; if (entry_no == -1) { /* * Since the last few blocks may not be logged in the block * directory, we always use the last entry. */ entry_no = minipageInfo->numMinipageEntries - 1; } return set_directoryentry_range(blockDirectory, columnGroupNo, entry_no, directoryEntry); } return false; }
/* ----------
 * toast_fetch_datum_slice -
 *
 *	Reconstruct a segment of a Datum from the chunks saved
 *	in the toast relation
 *
 *	attr		- external varlena naming the toast relation and value
 *	sliceoffset - byte offset of the first byte wanted
 *	length		- number of bytes wanted; a negative value, or one that
 *				  would run past the end of the value, means "through the
 *				  end of the value"
 *
 *	Returns a freshly palloc'd varlena holding the requested bytes.  An
 *	offset at or beyond the end of the value yields an empty result.
 *	Raises ERROR when the chunks found do not match what the stored size
 *	implies (wrong chunk number, wrong chunk size, missing chunk) or when
 *	a chunk is itself toasted.
 * ----------
 */
static struct varlena *
toast_fetch_datum_slice(struct varlena *attr, int32 sliceoffset, int32 length)
{
	Relation	toastrel;
	Relation	toastidx;
	ScanKeyData toastkey[3];
	int			nscankeys;
	IndexScanDesc toastscan;
	HeapTuple	ttup;
	TupleDesc	toasttupDesc;
	varattrib  *result;
	int32		attrsize;
	int32		residx;
	int32		nextidx;
	int			numchunks;
	int			startchunk;
	int			endchunk;
	int32		startoffset;
	int32		endoffset;
	int			totalchunks;
	Pointer		chunk;
	bool		isnull;
	int32		chunksize;
	int32		chcpystrt;
	int32		chcpyend;

	attrsize = ((varattrib *)attr)->va_external.va_extsize;
	totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1;

	/* A start position at or past the end selects nothing. */
	if (sliceoffset >= attrsize)
	{
		sliceoffset = 0;
		length = 0;
	}

	/*
	 * Clamp the request to the bytes actually available.  The bound is
	 * tested as "length > attrsize - sliceoffset" rather than
	 * "sliceoffset + length > attrsize": the sum overflows int32 for a very
	 * large length, which would skip the clamp and leave a bogus size for
	 * palloc below.  The subtraction cannot overflow for a non-negative
	 * sliceoffset, which is below attrsize at this point.
	 */
	if (length < 0 || length > attrsize - sliceoffset)
		length = attrsize - sliceoffset;

	result = (varattrib *) palloc(length + VARHDRSZ);
	SET_VARSIZE(result, length + VARHDRSZ);

	if (VARATT_EXTERNAL_IS_COMPRESSED(attr))
		VARATT_SET_COMPRESSED(result);

	if (length == 0)
		return (struct varlena *)result;	/* Can save a lot of work at this point! */

	/* Chunk numbers and in-chunk byte offsets bounding the slice. */
	startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE;
	endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE;
	numchunks = (endchunk - startchunk) + 1;

	startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE;
	endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE;

	/*
	 * Open the toast relation and its index
	 */
	toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock);
	toasttupDesc = toastrel->rd_att;
	toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock);

	/*
	 * Setup a scan key to fetch from the index. This is either two keys or
	 * three depending on the number of chunks.
	 */
	ScanKeyInit(&toastkey[0],
				(AttrNumber) 1,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid));

	/*
	 * Use equality condition for one chunk, a range condition otherwise:
	 */
	if (numchunks == 1)
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTEqualStrategyNumber, F_INT4EQ,
					Int32GetDatum(startchunk));
		nscankeys = 2;
	}
	else
	{
		ScanKeyInit(&toastkey[1],
					(AttrNumber) 2,
					BTGreaterEqualStrategyNumber, F_INT4GE,
					Int32GetDatum(startchunk));
		ScanKeyInit(&toastkey[2],
					(AttrNumber) 2,
					BTLessEqualStrategyNumber, F_INT4LE,
					Int32GetDatum(endchunk));
		nscankeys = 3;
	}

	/*
	 * Read the chunks by index
	 *
	 * The index is on (valueid, chunkidx) so they will come in order
	 */
	nextidx = startchunk;

	toastscan = index_beginscan(toastrel, toastidx,
								SnapshotToast, nscankeys, toastkey);
	while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL)
	{
		/*
		 * Have a chunk, extract the sequence number and the data
		 */
		residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull));
		Assert(!isnull);
		chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull));
		Assert(!isnull);

		/* The chunk may carry a short or a regular header, never be toasted. */
		if (VARATT_IS_SHORT((varattrib *)chunk))
			chunksize = VARSIZE_SHORT((varattrib *)chunk) - VARHDRSZ_SHORT;
		else if (!VARATT_IS_EXTENDED((varattrib *)chunk))
			chunksize = VARSIZE((varattrib *)chunk) - VARHDRSZ;
		else
		{
			elog(ERROR, "found toasted toast chunk?");
			chunksize = 0;		/* shut compiler up */
		}

		/*
		 * Some checks on the data we've found: chunks must arrive in
		 * sequence within the requested range, every chunk but the value's
		 * last must be full-sized, and the last must end exactly at attrsize.
		 */
		if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk))
			elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u",
				 residx, nextidx,
				 ((varattrib *)attr)->va_external.va_valueid);
		if (residx < totalchunks - 1)
		{
			if (chunksize != TOAST_MAX_CHUNK_SIZE)
				elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u of %d when fetching slice (expected %d)",
					 chunksize, residx,
					 ((varattrib *)attr)->va_external.va_valueid,
					 totalchunks-1,
					 (int)TOAST_MAX_CHUNK_SIZE);
		}
		else if (residx == totalchunks-1)
		{
			if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize)
				elog(ERROR, "unexpected chunk size %d in chunk %d for final toast value %u when fetching slice (expected %d)",
					 chunksize, residx,
					 ((varattrib *)attr)->va_external.va_valueid,
					 attrsize - residx * (int)TOAST_MAX_CHUNK_SIZE);
		}
		else
		{
			elog(ERROR, "unexpected chunk");
		}

		/*
		 * Copy the data into proper place in our result.  Only the first
		 * and last chunks of the slice may be copied partially.
		 */
		chcpystrt = 0;
		chcpyend = chunksize - 1;
		if (residx == startchunk)
			chcpystrt = startoffset;
		if (residx == endchunk)
			chcpyend = endoffset;

		memcpy(((char *) VARDATA(result)) +
			   (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt,
			   VARDATA((varattrib *)chunk) + chcpystrt,
			   (chcpyend - chcpystrt) + 1);

		nextidx++;
	}

	/*
	 * Final checks that we successfully fetched the datum
	 */
	if (nextidx != (endchunk + 1))
		elog(ERROR, "missing chunk number %d for toast value %u",
			 nextidx,
			 ((varattrib *)attr)->va_external.va_valueid);

	/*
	 * End scan and close relations
	 */
	index_endscan(toastscan);
	index_close(toastidx, AccessShareLock);
	heap_close(toastrel, AccessShareLock);

	return (struct varlena *)result;
}
/* ----------------------------------------------------------------
 *		IndexNext
 *
 *		Retrieve a tuple from the IndexScan node's currentRelation
 *		using the index specified in the IndexScanState information.
 *
 *		Returns the scan tuple slot, which holds the fetched tuple or is
 *		cleared when the scan has nothing (more) to return.
 * ----------------------------------------------------------------
 */
TupleTableSlot *
IndexNext(IndexScanState *node)
{
	EState	   *estate;
	ExprContext *econtext;
	ScanDirection direction;
	IndexScanDesc scandesc;
	IndexScan  *indexPlan;
	Index		scanrelid;
	HeapTuple	fetched;
	TupleTableSlot *slot;

	/*
	 * extract necessary information from index scan node
	 */
	estate = node->ss.ps.state;
	direction = estate->es_direction;

	initScanDesc(node);

	indexPlan = (IndexScan *) node->ss.ps.plan;

	/* flip direction if this is an overall backward scan */
	if (ScanDirectionIsBackward(indexPlan->indexorderdir))
	{
		if (ScanDirectionIsForward(direction))
			direction = BackwardScanDirection;
		else if (ScanDirectionIsBackward(direction))
			direction = ForwardScanDirection;
	}

	scandesc = node->iss_ScanDesc;
	econtext = node->ss.ps.ps_ExprContext;
	slot = node->ss.ss_ScanTupleSlot;
	scanrelid = indexPlan->scan.scanrelid;

	/*
	 * Check if we are evaluating PlanQual for tuple of this relation.
	 * Additional checking is not good, but no other way for now. We could
	 * introduce new nodes for this case and handle IndexScan --> NewNode
	 * switching in Init/ReScan plan...
	 */
	if (estate->es_evTuple != NULL &&
		estate->es_evTuple[scanrelid - 1] != NULL)
	{
		/* The PlanQual tuple was already handed out on an earlier call. */
		if (estate->es_evTupleNull[scanrelid - 1])
		{
			if (!node->ss.ps.delayEagerFree)
				ExecEagerFreeIndexScan(node);

			return ExecClearTuple(slot);
		}

		ExecStoreGenericTuple(estate->es_evTuple[scanrelid - 1], slot, false);

		/* Does the tuple meet the indexqual condition? */
		econtext->ecxt_scantuple = slot;

		ResetExprContext(econtext);

		if (!ExecQual(node->indexqualorig, econtext, false))
		{
			if (!node->ss.ps.delayEagerFree)
				ExecEagerFreeIndexScan(node);

			ExecClearTuple(slot);		/* would not be returned by scan */
		}

		/* Flag for the next call that no more tuples */
		estate->es_evTupleNull[scanrelid - 1] = true;

		Gpmon_M_Incr_Rows_Out(GpmonPktFromIndexScanState(node));
		CheckSendPlanStateGpmonPkt(&node->ss.ps);

		return slot;
	}

	/*
	 * ok, now that we have what we need, fetch the next tuple.
	 */
	fetched = index_getnext(scandesc, direction);

	if (fetched == NULL)
	{
		/*
		 * if we get here it means the index scan failed so we are at the
		 * end of the scan..
		 */
		if (!node->ss.ps.delayEagerFree)
			ExecEagerFreeIndexScan(node);

		return ExecClearTuple(slot);
	}

	/*
	 * Store the scanned tuple in the scan tuple slot of the scan state.
	 * Note: we pass 'false' because tuples returned by amgetnext are
	 * pointers onto disk pages and must not be pfree()'d.
	 */
	ExecStoreHeapTuple(fetched,				/* tuple to store */
					   slot,				/* slot to store in */
					   scandesc->xs_cbuf,	/* buffer containing tuple */
					   false);				/* don't pfree */

	Gpmon_M_Incr_Rows_Out(GpmonPktFromIndexScanState(node));
	CheckSendPlanStateGpmonPkt(&node->ss.ps);

	return slot;
}
/*
 * inv_write
 *		Write nbytes bytes from buf into the large object at its current
 *		offset, one LOBLKSIZE page at a time.
 *
 * Existing pages are updated in place; pages that do not exist yet are
 * inserted, with any gap before the written data zero-filled.  The
 * descriptor's offset is advanced by the amount written.
 *
 * Returns the number of bytes written (0 when nbytes <= 0).  Raises an ERROR
 * if the descriptor was not opened for writing.
 */
int
inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes)
{
	int			written = 0;
	int32		pageno = (int32) (obj_desc->offset / LOBLKSIZE);
	ScanKeyData skey[2];
	IndexScanDesc scan;
	HeapTuple	existing = NULL;
	Form_pg_largeobject existingform = NULL;
	bool		fetchnext = true;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];	/* make struct big enough */
		int32		align_it;	/* ensure struct is aligned well enough */
	}			pagebuf;
	char	   *payload = VARDATA(&pagebuf.hdr);
	HeapTuple	newtup;
	Datum		values[Natts_pg_largeobject];
	bool		nulls[Natts_pg_largeobject];
	bool		replace[Natts_pg_largeobject];
	CatalogIndexState indstate;

	Assert(PointerIsValid(obj_desc));
	Assert(buf != NULL);

	/* enforce writability because snapshot is probably wrong otherwise */
	if ((obj_desc->flags & IFS_WRLOCK) == 0)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("large object %u was not opened for writing",
						obj_desc->id)));

	if (nbytes <= 0)
		return 0;

	open_lo_relation();

	indstate = CatalogOpenIndexes(lo_heap_r);

	/* scan this object's pages starting at the first page to be written */
	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	scan = index_beginscan(lo_heap_r, lo_index_r,
						   obj_desc->snapshot, 2, skey);

	for (; written < nbytes; pageno++)
	{
		int			pageoff;
		int			chunk;
		int			pagelen;

		/*
		 * Pull the next pre-existing page if the previous one was consumed.
		 * The index scan is assumed to deliver pages in order, possibly with
		 * holes between them.
		 */
		if (fetchnext)
		{
			existing = index_getnext(scan, ForwardScanDirection);
			if (existing != NULL)
			{
				if (HeapTupleHasNulls(existing))	/* paranoia */
					elog(ERROR, "null field found in pg_largeobject");
				existingform = (Form_pg_largeobject) GETSTRUCT(existing);
				Assert(existingform->pageno >= pageno);
			}
			fetchnext = false;
		}

		/* where inside the page we start, and how much of buf fits there */
		pageoff = (int) (obj_desc->offset % LOBLKSIZE);
		chunk = LOBLKSIZE - pageoff;
		if (chunk > nbytes - written)
			chunk = nbytes - written;

		if (existingform != NULL && existingform->pageno == pageno)
		{
			/*
			 * The page already exists: start from its old contents.
			 */
			bytea	   *olddata = &(existingform->data);	/* see note at top
															 * of file */
			bool		detoasted = false;
			int			endoff;

			if (VARATT_IS_EXTENDED(olddata))
			{
				olddata = (bytea *)
					heap_tuple_untoast_attr((struct varlena *) olddata);
				detoasted = true;
			}
			pagelen = getbytealen(olddata);
			Assert(pagelen <= LOBLKSIZE);
			memcpy(payload, VARDATA(olddata), pagelen);
			if (detoasted)
				pfree(olddata);

			/* zero the gap between old end-of-data and the write position */
			if (pageoff > pagelen)
				MemSet(payload + pagelen, 0, pageoff - pagelen);

			/* overlay the new bytes */
			memcpy(payload + pageoff, buf + written, chunk);
			written += chunk;
			obj_desc->offset += chunk;

			/* the page is as long as the further of old end and new end */
			endoff = pageoff + chunk;
			if (endoff > pagelen)
				pagelen = endoff;
			SET_VARSIZE(&pagebuf.hdr, pagelen + VARHDRSZ);

			/* build the replacement tuple: only the data column changes */
			memset(values, 0, sizeof(values));
			memset(nulls, false, sizeof(nulls));
			memset(replace, false, sizeof(replace));
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
			replace[Anum_pg_largeobject_data - 1] = true;
			newtup = heap_modify_tuple(existing, RelationGetDescr(lo_heap_r),
									   values, nulls, replace);
			simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);

			/* this old page is used up; fetch another next time around */
			existing = NULL;
			existingform = NULL;
			fetchnext = true;
		}
		else
		{
			/*
			 * No page with this number yet: create one, zero-filling
			 * anything before the write position.
			 */
			if (pageoff > 0)
				MemSet(payload, 0, pageoff);

			memcpy(payload + pageoff, buf + written, chunk);
			written += chunk;
			obj_desc->offset += chunk;

			/* valid length of the new page */
			pagelen = pageoff + chunk;
			SET_VARSIZE(&pagebuf.hdr, pagelen + VARHDRSZ);

			/* build and insert the new tuple */
			memset(values, 0, sizeof(values));
			memset(nulls, false, sizeof(nulls));
			values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
			values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
			newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
			simple_heap_insert(lo_heap_r, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);
		}
	}

	index_endscan(scan);

	CatalogCloseIndexes(indstate);

	/*
	 * Advance command counter so that my tuple updates will be seen by later
	 * large-object operations in this transaction.
	 */
	CommandCounterIncrement();

	return written;
}
/* ---------- * toast_fetch_datum - * * Reconstruct an in memory Datum from the chunks saved * in the toast relation * ---------- */ static struct varlena * toast_fetch_datum(struct varlena *attr) { Relation toastrel; Relation toastidx; ScanKeyData toastkey; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 ressize; int32 residx, nextidx; int32 numchunks; Pointer chunk; bool isnull; int32 chunksize; void *chunkdata; ressize = ((varattrib *)attr)->va_external.va_extsize; numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; result = (varattrib *) palloc(ressize + VARHDRSZ); SET_VARSIZE(result, ressize + VARHDRSZ); if (VARATT_EXTERNAL_IS_COMPRESSED(attr)) VARATT_SET_COMPRESSED(result); /* * Open the toast relation and its index */ toastrel = heap_open(((varattrib *)attr)->va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid, AccessShareLock); /* * Setup a scan key to fetch from the index by va_valueid */ ScanKeyInit(&toastkey, (AttrNumber) 1, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(((varattrib *)attr)->va_external.va_valueid)); /* * Read the chunks by index * * Note that because the index is actually on (valueid, chunkidx) we will * see the chunks in chunkidx order, even though we didn't explicitly ask * for it. 
*/ nextidx = 0; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(fastgetattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(fastgetattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); if (VARATT_IS_SHORT(chunk)) { chunksize = VARSIZE_SHORT(chunk) - VARHDRSZ_SHORT; chunkdata = VARDATA_SHORT(chunk); } else if (!VARATT_IS_EXTENDED(chunk)) { chunksize = VARSIZE(chunk) - VARHDRSZ; chunkdata = VARDATA(chunk); } else { elog(ERROR, "found toasted toast chunk?"); chunksize = 0; /* shut compiler up */ chunkdata = NULL; } /* * Some checks on the data we've found */ if (residx != nextidx) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, ((varattrib *)attr)->va_external.va_valueid); if (residx < numchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d of %d for toast value %u (expected %d)", chunksize, residx, ((varattrib *)attr)->va_external.va_valueid, numchunks-1, (int)TOAST_MAX_CHUNK_SIZE); } else if (residx == numchunks-1) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) elog(ERROR, "unexpected chunk size %d in final chunk %d for toast value %u (expected %d)", chunksize, residx, ((varattrib *)attr)->va_external.va_valueid, ressize - residx*(int)TOAST_MAX_CHUNK_SIZE); } else elog(ERROR, "unexpected chunk number %d for toast value %u (expected in %d..%d)", residx, ((varattrib *)attr)->va_external.va_valueid, 0, numchunks-1); /* * Copy the data into proper place in our result */ memcpy(((char *) VARDATA(result)) + residx * TOAST_MAX_CHUNK_SIZE, chunkdata, chunksize); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != numchunks) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, ((varattrib 
*)attr)->va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx, AccessShareLock); heap_close(toastrel, AccessShareLock); return (struct varlena *)result; }
/*
 * inv_truncate
 *		Cut (or extend) the large object so that its data ends at byte
 *		offset len.
 *
 * The page containing the truncation point is rewritten with length
 * len % LOBLKSIZE, or created if it does not exist, so that it marks the end
 * of data.  All later pages are deleted.
 *
 * Raises an ERROR if the descriptor was not opened for writing.
 */
void
inv_truncate(LargeObjectDesc *obj_desc, int len)
{
	int32		pageno = (int32) (len / LOBLKSIZE);
	int			cutoff = len % LOBLKSIZE;
	ScanKeyData skey[2];
	IndexScanDesc scan;
	HeapTuple	found;
	Form_pg_largeobject foundform = NULL;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];	/* make struct big enough */
		int32		align_it;	/* ensure struct is aligned well enough */
	}			pagebuf;
	char	   *payload = VARDATA(&pagebuf.hdr);
	HeapTuple	newtup;
	Datum		values[Natts_pg_largeobject];
	bool		nulls[Natts_pg_largeobject];
	bool		replace[Natts_pg_largeobject];
	CatalogIndexState indstate;

	Assert(PointerIsValid(obj_desc));

	/* enforce writability because snapshot is probably wrong otherwise */
	if ((obj_desc->flags & IFS_WRLOCK) == 0)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("large object %u was not opened for writing",
						obj_desc->id)));

	open_lo_relation();

	indstate = CatalogOpenIndexes(lo_heap_r);

	/* scan this object's pages from the truncation page onwards */
	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	scan = index_beginscan(lo_heap_r, lo_index_r,
						   obj_desc->snapshot, 2, skey);

	/*
	 * Look for the page holding the truncation point.  It may be absent: the
	 * point can lie past the end of the object or inside a hole.
	 */
	found = index_getnext(scan, ForwardScanDirection);
	if (found != NULL)
	{
		if (HeapTupleHasNulls(found))	/* paranoia */
			elog(ERROR, "null field found in pg_largeobject");
		foundform = (Form_pg_largeobject) GETSTRUCT(found);
		Assert(foundform->pageno >= pageno);
	}

	if (foundform != NULL && foundform->pageno == pageno)
	{
		/*
		 * The truncation page exists: rewrite it with the shortened (or
		 * zero-extended) contents.
		 */
		bytea	   *olddata = &(foundform->data);	/* see note at top of
													 * file */
		bool		detoasted = false;
		int			oldlen;

		if (VARATT_IS_EXTENDED(olddata))
		{
			olddata = (bytea *)
				heap_tuple_untoast_attr((struct varlena *) olddata);
			detoasted = true;
		}
		oldlen = getbytealen(olddata);
		Assert(oldlen <= LOBLKSIZE);
		memcpy(payload, VARDATA(olddata), oldlen);
		if (detoasted)
			pfree(olddata);

		/* zero-fill if the new end lies beyond the old end of the page */
		if (cutoff > oldlen)
			MemSet(payload + oldlen, 0, cutoff - oldlen);

		/* length of the rewritten page */
		SET_VARSIZE(&pagebuf.hdr, cutoff + VARHDRSZ);

		/* build the replacement tuple: only the data column changes */
		memset(values, 0, sizeof(values));
		memset(nulls, false, sizeof(nulls));
		memset(replace, false, sizeof(replace));
		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
		replace[Anum_pg_largeobject_data - 1] = true;
		newtup = heap_modify_tuple(found, RelationGetDescr(lo_heap_r),
								   values, nulls, replace);
		simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
		CatalogIndexInsert(indstate, newtup);
		heap_freetuple(newtup);
	}
	else
	{
		/*
		 * The truncation point is in a hole (or past the end).  A page found
		 * after the truncation point has to go before we fill the hole.
		 */
		if (foundform != NULL && foundform->pageno > pageno)
			simple_heap_delete(lo_heap_r, &found->t_self);

		/* new page: zeroes up to the truncation point */
		if (cutoff > 0)
			MemSet(payload, 0, cutoff);

		/* length of the new page */
		SET_VARSIZE(&pagebuf.hdr, cutoff + VARHDRSZ);

		/* build and insert the new tuple */
		memset(values, 0, sizeof(values));
		memset(nulls, false, sizeof(nulls));
		values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
		values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
		values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
		newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls);
		simple_heap_insert(lo_heap_r, newtup);
		CatalogIndexInsert(indstate, newtup);
		heap_freetuple(newtup);
	}

	/*
	 * Remove every remaining page the scan still returns; those lie after
	 * the truncation point.
	 */
	for (;;)
	{
		found = index_getnext(scan, ForwardScanDirection);
		if (found == NULL)
			break;
		simple_heap_delete(lo_heap_r, &found->t_self);
	}

	index_endscan(scan);

	CatalogCloseIndexes(indstate);

	/*
	 * Advance command counter so that tuple updates will be seen by later
	 * large-object operations in this transaction.
	 */
	CommandCounterIncrement();
}
/* * Search the relation 'rel' for tuple using the index. * * If a matching tuple is found, lock it with lockmode, fill the slot with its * contents, and return true. Return false otherwise. */ bool RelationFindReplTupleByIndex(Relation rel, Oid idxoid, LockTupleMode lockmode, TupleTableSlot *searchslot, TupleTableSlot *outslot) { HeapTuple scantuple; ScanKeyData skey[INDEX_MAX_KEYS]; IndexScanDesc scan; SnapshotData snap; TransactionId xwait; Relation idxrel; bool found; /* Open the index. */ idxrel = index_open(idxoid, RowExclusiveLock); /* Start an index scan. */ InitDirtySnapshot(snap); scan = index_beginscan(rel, idxrel, &snap, RelationGetNumberOfAttributes(idxrel), 0); /* Build scan key. */ build_replindex_scan_key(skey, rel, idxrel, searchslot); retry: found = false; index_rescan(scan, skey, RelationGetNumberOfAttributes(idxrel), NULL, 0); /* Try to find the tuple */ if ((scantuple = index_getnext(scan, ForwardScanDirection)) != NULL) { found = true; ExecStoreTuple(scantuple, outslot, InvalidBuffer, false); ExecMaterializeSlot(outslot); xwait = TransactionIdIsValid(snap.xmin) ? snap.xmin : snap.xmax; /* * If the tuple is locked, wait for locking transaction to finish and * retry. */ if (TransactionIdIsValid(xwait)) { XactLockTableWait(xwait, NULL, NULL, XLTW_None); goto retry; } } /* Found tuple, try to lock it in the lockmode. 
*/ if (found) { Buffer buf; HeapUpdateFailureData hufd; HTSU_Result res; HeapTupleData locktup; ItemPointerCopy(&outslot->tts_tuple->t_self, &locktup.t_self); PushActiveSnapshot(GetLatestSnapshot()); res = heap_lock_tuple(rel, &locktup, GetCurrentCommandId(false), lockmode, LockWaitBlock, false /* don't follow updates */ , &buf, &hufd); /* the tuple slot already has the buffer pinned */ ReleaseBuffer(buf); PopActiveSnapshot(); switch (res) { case HeapTupleMayBeUpdated: break; case HeapTupleUpdated: /* XXX: Improve handling here */ ereport(LOG, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("concurrent update, retrying"))); goto retry; case HeapTupleInvisible: elog(ERROR, "attempted to lock invisible tuple"); default: elog(ERROR, "unexpected heap_lock_tuple status: %u", res); break; } } index_endscan(scan); /* Don't release lock until commit. */ index_close(idxrel, NoLock); return found; }
/*
 * GetNewSequenceRelationOid
 *		Get a sequence relation Oid and verify it is valid against
 *		the pg_class relation by doing an index lookup.  The caller
 *		should have a suitable lock on pg_class.
 *
 * Candidate OIDs come from GetNewSequenceRelationObjectId().  A candidate is
 * rejected when the pg_class OID index already has an entry for it, when a
 * relation file of that name already exists in the target tablespace and
 * database, or when IsOidAcceptable() refuses it.  The loop repeats until a
 * candidate passes all three checks, and that OID is returned.
 */
Oid
GetNewSequenceRelationOid(Relation relation)
{
	Oid			newOid;
	Oid			oidIndex;
	Relation	indexrel;
	SnapshotData SnapshotDirty;
	IndexScanDesc scan;
	ScanKeyData key;
	bool		collides;
	RelFileNode rnode;
	char	   *rpath;
	int			fd;

	/* This should match RelationInitPhysicalAddr */
	rnode.spcNode = relation->rd_rel->reltablespace ? relation->rd_rel->reltablespace : MyDatabaseTableSpace;
	rnode.dbNode = relation->rd_rel->relisshared ? InvalidOid : MyDatabaseId;

	/* We should only be using pg_class */
	Assert(RelationGetRelid(relation) == RelationRelationId);

	/* The relcache will cache the identity of the OID index for us */
	oidIndex = RelationGetOidIndex(relation);

	/* Use the OID index to find a nonconflicting OID */
	indexrel = index_open(oidIndex, AccessShareLock);

	InitDirtySnapshot(SnapshotDirty);

	/* Generate new sequence relation OIDs until we find one not in the table */
	do
	{
		CHECK_FOR_INTERRUPTS();

		newOid = GetNewSequenceRelationObjectId();

		ScanKeyInit(&key,
					(AttrNumber) 1,
					BTEqualStrategyNumber, F_OIDEQ,
					ObjectIdGetDatum(newOid));

		/* see notes above about using SnapshotDirty */
		scan = index_beginscan(relation, indexrel,
							   &SnapshotDirty, 1, &key);

		collides = HeapTupleIsValid(index_getnext(scan, ForwardScanDirection));

		index_endscan(scan);

		if (!collides)
		{
			/*
			 * Check for existing file of same name.  The candidate must be
			 * stored in rnode first; otherwise the path would be built from
			 * an uninitialized relNode.
			 */
			rnode.relNode = newOid;
			rpath = relpath(rnode);
			fd = BasicOpenFile(rpath, O_RDONLY | PG_BINARY, 0);

			if (fd >= 0)
			{
				/* definite collision */
				gp_retry_close(fd);
				collides = true;
			}
			else
			{
				/*
				 * Here we have a little bit of a dilemma: if errno is something
				 * other than ENOENT, should we declare a collision and loop? In
				 * particular one might think this advisable for, say, EPERM.
				 * However there really shouldn't be any unreadable files in a
				 * tablespace directory, and if the EPERM is actually complaining
				 * that we can't read the directory itself, we'd be in an infinite
				 * loop.  In practice it seems best to go ahead regardless of the
				 * errno.  If there is a colliding file we will get an smgr
				 * failure when we attempt to create the new relation file.
				 */
				collides = false;
			}

			/* release the path string so repeated iterations don't pile up */
			pfree(rpath);
		}

		/*
		 * Also check that the OID hasn't been pre-assigned for a different
		 * relation.
		 *
		 * We're a bit sloppy between OIDs and relfilenodes here; it would be
		 * OK to use a value that's been reserved for use as a type or
		 * relation OID here, as long as the relfilenode is free. But there's
		 * no harm in skipping over those too, so we don't bother to
		 * distinguish them.
		 */
		if (!collides && !IsOidAcceptable(newOid))
			collides = true;

	} while (collides);

	index_close(indexrel, AccessShareLock);

	return newOid;
}
/*
 * inv_write
 *		Write nbytes bytes from buf into the large object at its current
 *		offset, one LOBLKSIZE page at a time.
 *
 * Existing pages are updated in place; pages that do not exist yet are
 * inserted, with any gap before the written data zero-filled.  The
 * descriptor's offset is advanced by the amount written.
 *
 * Returns the number of bytes written (0 when nbytes <= 0).
 */
int
inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
{
	int			written = 0;
	int32		pageno = (int32) (obj_desc->offset / LOBLKSIZE);
	ScanKeyData skey[2];
	IndexScanDesc scan;
	HeapTuple	existing = NULL;
	Form_pg_largeobject existingform = NULL;
	bool		fetchnext = true;
	struct
	{
		bytea		hdr;
		char		data[LOBLKSIZE];
	}			pagebuf;
	char	   *payload = VARATT_DATA(&pagebuf.hdr);
	HeapTuple	newtup;
	Datum		values[Natts_pg_largeobject];
	char		nulls[Natts_pg_largeobject];
	char		replace[Natts_pg_largeobject];
	CatalogIndexState indstate;

	Assert(PointerIsValid(obj_desc));
	Assert(buf != NULL);

	if (nbytes <= 0)
		return 0;

	open_lo_relation();

	indstate = CatalogOpenIndexes(lo_heap_r);

	/* scan this object's pages starting at the first page to be written */
	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	scan = index_beginscan(lo_heap_r, lo_index_r,
						   SnapshotNow, 2, skey);

	for (; written < nbytes; pageno++)
	{
		int			pageoff;
		int			chunk;
		int			pagelen;

		/*
		 * Pull the next pre-existing page if the previous one was consumed.
		 * The index scan is assumed to deliver pages in order, possibly with
		 * holes between them.
		 */
		if (fetchnext)
		{
			existing = index_getnext(scan, ForwardScanDirection);
			if (existing != NULL)
			{
				existingform = (Form_pg_largeobject) GETSTRUCT(existing);
				Assert(existingform->pageno >= pageno);
			}
			fetchnext = false;
		}

		/* where inside the page we start, and how much of buf fits there */
		pageoff = (int) (obj_desc->offset % LOBLKSIZE);
		chunk = LOBLKSIZE - pageoff;
		if (chunk > nbytes - written)
			chunk = nbytes - written;

		if (existingform != NULL && existingform->pageno == pageno)
		{
			/*
			 * The page already exists: start from its old contents.
			 */
			bytea	   *olddata = &(existingform->data);
			bool		detoasted = false;
			int			endoff;

			if (VARATT_IS_EXTENDED(olddata))
			{
				olddata = (bytea *)
					heap_tuple_untoast_attr((varattrib *) olddata);
				detoasted = true;
			}
			pagelen = getbytealen(olddata);
			Assert(pagelen <= LOBLKSIZE);
			memcpy(payload, VARDATA(olddata), pagelen);
			if (detoasted)
				pfree(olddata);

			/* zero the gap between old end-of-data and the write position */
			if (pageoff > pagelen)
				MemSet(payload + pagelen, 0, pageoff - pagelen);

			/* overlay the new bytes */
			memcpy(payload + pageoff, buf + written, chunk);
			written += chunk;
			obj_desc->offset += chunk;

			/* the page is as long as the further of old end and new end */
			endoff = pageoff + chunk;
			if (endoff > pagelen)
				pagelen = endoff;
			VARATT_SIZEP(&pagebuf.hdr) = pagelen + VARHDRSZ;

			/* build the replacement tuple: only the data column changes */
			memset(values, 0, sizeof(values));
			memset(nulls, ' ', sizeof(nulls));
			memset(replace, ' ', sizeof(replace));
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
			replace[Anum_pg_largeobject_data - 1] = 'r';
			newtup = heap_modifytuple(existing, lo_heap_r,
									  values, nulls, replace);
			simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);

			/* this old page is used up; fetch another next time around */
			existing = NULL;
			existingform = NULL;
			fetchnext = true;
		}
		else
		{
			/*
			 * No page with this number yet: create one, zero-filling
			 * anything before the write position.
			 */
			if (pageoff > 0)
				MemSet(payload, 0, pageoff);

			memcpy(payload + pageoff, buf + written, chunk);
			written += chunk;
			obj_desc->offset += chunk;

			/* valid length of the new page */
			pagelen = pageoff + chunk;
			VARATT_SIZEP(&pagebuf.hdr) = pagelen + VARHDRSZ;

			/* build and insert the new tuple */
			memset(values, 0, sizeof(values));
			memset(nulls, ' ', sizeof(nulls));
			values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
			values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&pagebuf);
			newtup = heap_formtuple(lo_heap_r->rd_att, values, nulls);
			simple_heap_insert(lo_heap_r, newtup);
			CatalogIndexInsert(indstate, newtup);
			heap_freetuple(newtup);
		}
	}

	index_endscan(scan);

	CatalogCloseIndexes(indstate);

	/*
	 * Advance command counter so that my tuple updates will be seen by
	 * later large-object operations in this transaction.
	 */
	CommandCounterIncrement();

	return written;
}
/*
 * inv_read
 *		Read up to nbytes bytes of the large object into buf, starting at
 *		the descriptor's current offset, and advance the offset by the
 *		amount read.
 *
 * Gaps between stored pages are returned as zeroes.  Returns the number of
 * bytes placed in buf (0 when nbytes <= 0); this may be less than nbytes
 * when the scan runs out of pages.
 */
int
inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
{
	int			nread = 0;
	int32		pageno = (int32) (obj_desc->offset / LOBLKSIZE);
	uint32		pagestart;
	ScanKeyData skey[2];
	IndexScanDesc scan;
	HeapTuple	tuple;

	Assert(PointerIsValid(obj_desc));
	Assert(buf != NULL);

	if (nbytes <= 0)
		return 0;

	open_lo_relation();

	/* scan this object's pages starting at the page holding the offset */
	ScanKeyInit(&skey[0],
				Anum_pg_largeobject_loid,
				BTEqualStrategyNumber, F_OIDEQ,
				ObjectIdGetDatum(obj_desc->id));

	ScanKeyInit(&skey[1],
				Anum_pg_largeobject_pageno,
				BTGreaterEqualStrategyNumber, F_INT4GE,
				Int32GetDatum(pageno));

	scan = index_beginscan(lo_heap_r, lo_index_r,
						   SnapshotNow, 2, skey);

	/* stop as soon as the request is satisfied or the pages run out */
	while (nread < nbytes &&
		   (tuple = index_getnext(scan, ForwardScanDirection)) != NULL)
	{
		Form_pg_largeobject page = (Form_pg_largeobject) GETSTRUCT(tuple);
		bytea	   *payload;
		bool		detoasted = false;
		int			inpage;
		int			avail;
		int			take;

		/*
		 * Pages are assumed to arrive in order, but some may be missing if
		 * the object has unwritten "holes".  A hole in front of this page
		 * reads out as zeroes.
		 */
		pagestart = ((uint32) page->pageno) * LOBLKSIZE;
		if (pagestart > obj_desc->offset)
		{
			take = pagestart - obj_desc->offset;
			if (take > nbytes - nread)
				take = nbytes - nread;
			MemSet(buf + nread, 0, take);
			nread += take;
			obj_desc->offset += take;
		}

		/* the hole alone may have satisfied the request */
		if (nread >= nbytes)
			break;

		Assert(obj_desc->offset >= pagestart);
		inpage = (int) (obj_desc->offset - pagestart);
		Assert(inpage >= 0 && inpage < LOBLKSIZE);

		payload = &(page->data);
		if (VARATT_IS_EXTENDED(payload))
		{
			payload = (bytea *)
				heap_tuple_untoast_attr((varattrib *) payload);
			detoasted = true;
		}

		/* copy whatever this page holds at or after the current offset */
		avail = getbytealen(payload);
		if (avail > inpage)
		{
			take = avail - inpage;
			if (take > nbytes - nread)
				take = nbytes - nread;
			memcpy(buf + nread, VARDATA(payload) + inpage, take);
			nread += take;
			obj_desc->offset += take;
		}

		if (detoasted)
			pfree(payload);
	}

	index_endscan(scan);

	return nread;
}
/* ----------------------------------------------------------------
 *		IndexNext
 *
 *		Return the next tuple of an IndexScan node in the node's scan
 *		tuple slot, or a cleared slot once the index is exhausted.
 *		Tuples from a lossy index match are rechecked against the
 *		original index quals and skipped if they fail.
 * ----------------------------------------------------------------
 */
static TupleTableSlot *
IndexNext(IndexScanState *node)
{
	EState	   *estate = node->ss.ps.state;
	ScanDirection direction = estate->es_direction;
	IndexScan  *plan = (IndexScan *) node->ss.ps.plan;
	IndexScanDesc scandesc = node->iss_ScanDesc;
	ExprContext *econtext = node->ss.ps.ps_ExprContext;
	TupleTableSlot *slot = node->ss.ss_ScanTupleSlot;

	/* an overall backward scan inverts whatever direction was requested */
	if (ScanDirectionIsBackward(plan->indexorderdir))
	{
		if (ScanDirectionIsForward(direction))
			direction = BackwardScanDirection;
		else if (ScanDirectionIsBackward(direction))
			direction = ForwardScanDirection;
	}

	/*
	 * Keep pulling tuples until one qualifies or the index runs dry.
	 */
	for (;;)
	{
		HeapTuple	tuple = index_getnext(scandesc, direction);

		if (tuple == NULL)
			break;

		/*
		 * Put the tuple into the scan slot.  'false' is passed because
		 * tuples returned by amgetnext are pointers onto disk pages and must
		 * not be pfree()'d.
		 */
		ExecStoreTuple(tuple,	/* tuple to store */
					   slot,	/* slot to store in */
					   scandesc->xs_cbuf,	/* buffer containing tuple */
					   false);	/* don't pfree */

		/* an exact index match needs no further checking */
		if (!scandesc->xs_recheck)
			return slot;

		/* lossy match: re-evaluate the index quals on the real tuple */
		econtext->ecxt_scantuple = slot;
		ResetExprContext(econtext);
		if (ExecQual(node->indexqualorig, econtext, false))
			return slot;

		/* recheck failed, so ask the index for another one */
	}

	/*
	 * The index returned nothing more, so we are at the end of the scan.
	 */
	return ExecClearTuple(slot);
}
/* ----------------------------------------------------------------
 *		IndexNext
 *
 *		Return the next tuple of an index_scan_sc node in the node's scan
 *		tuple slot, or a cleared slot once the index is exhausted.
 *		Tuples from a lossy index match are rechecked against the
 *		original index quals and skipped if they fail.
 * ----------------------------------------------------------------
 */
static struct tupslot *
IndexNext(index_ss *node)
{
	exec_state_n *estate = node->ss.ps.state;
	enum scandir direction = estate->es_direction;
	index_scan_sc *plan = (index_scan_sc *) node->ss.ps.plan;
	struct index_scan *scandesc = node->iss_ScanDesc;
	expr_ctx_n *econtext = node->ss.ps.ps_ExprContext;
	struct tupslot *slot = node->ss.ss_ScanTupleSlot;

	/* an overall backward scan inverts whatever direction was requested */
	if (SCANDIR_BACKWARD(plan->indexorderdir))
	{
		if (SCANDIR_FORWARD(direction))
			direction = BACKWARD_SCANDIR;
		else if (SCANDIR_BACKWARD(direction))
			direction = FORWARD_SCANDIR;
	}

	/*
	 * Keep pulling tuples until one qualifies or the index runs dry.
	 */
	for (;;)
	{
		struct heap_tuple *tuple = index_getnext(scandesc, direction);

		if (tuple == NULL)
			break;

		/*
		 * Put the tuple into the scan slot.  'false' is passed because
		 * tuples returned by amgetnext are pointers onto disk pages and must
		 * not be pfree()'d.
		 */
		exec_store_tuple(tuple, /* tuple to store */
						 slot,	/* slot to store in */
						 scandesc->xs_cbuf, /* buffer containing tuple */
						 false);	/* don't pfree */

		/* an exact index match needs no further checking */
		if (!scandesc->xs_recheck)
			return slot;

		/* lossy match: re-evaluate the index quals on the real tuple */
		econtext->ecxt_scantuple = slot;
		ResetExprContext(econtext);
		if (exec_qual(node->indexqualorig, econtext, false))
			return slot;

		/* recheck failed, so ask the index for another one */
	}

	/*
	 * The index returned nothing more, so we are at the end of the scan.
	 */
	return exec_clear_tuple(slot);
}