/* * Extract all item values from a BRIN index page * * Usage: SELECT * FROM brin_page_items(get_raw_page('idx', 1), 'idx'::regclass); */ Datum brin_page_items(PG_FUNCTION_ARGS) { brin_page_state *state; FuncCallContext *fctx; if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), (errmsg("must be superuser to use raw page functions")))); if (SRF_IS_FIRSTCALL()) { bytea *raw_page = PG_GETARG_BYTEA_P(0); Oid indexRelid = PG_GETARG_OID(1); Page page; TupleDesc tupdesc; MemoryContext mctx; Relation indexRel; AttrNumber attno; /* minimally verify the page we got */ page = verify_brin_page(raw_page, BRIN_PAGETYPE_REGULAR, "regular"); /* create a function context for cross-call persistence */ fctx = SRF_FIRSTCALL_INIT(); /* switch to memory context appropriate for multiple function calls */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); indexRel = index_open(indexRelid, AccessShareLock); state = palloc(offsetof(brin_page_state, columns) + sizeof(brin_column_state) * RelationGetDescr(indexRel)->natts); state->bdesc = brin_build_desc(indexRel); state->page = page; state->offset = FirstOffsetNumber; state->unusedItem = false; state->done = false; state->dtup = NULL; /* * Initialize output functions for all indexed datatypes; simplifies * calling them later. */ for (attno = 1; attno <= state->bdesc->bd_tupdesc->natts; attno++) { Oid output; bool isVarlena; BrinOpcInfo *opcinfo; int i; brin_column_state *column; opcinfo = state->bdesc->bd_info[attno - 1]; column = palloc(offsetof(brin_column_state, outputFn) + sizeof(FmgrInfo) * opcinfo->oi_nstored); column->nstored = opcinfo->oi_nstored; for (i = 0; i < opcinfo->oi_nstored; i++) { getTypeOutputInfo(opcinfo->oi_typids[i], &output, &isVarlena); fmgr_info(output, &column->outputFn[i]); } state->columns[attno - 1] = column; } index_close(indexRel, AccessShareLock); fctx->user_fctx = state; fctx->tuple_desc = BlessTupleDesc(tupdesc); MemoryContextSwitchTo(mctx); } fctx = SRF_PERCALL_SETUP(); state = fctx->user_fctx; if (!state->done) { HeapTuple result; Datum values[7]; bool nulls[7]; /* * This loop is called once for every attribute of every tuple in the * page. At the start of a tuple, we get a NULL dtup; that's our * signal for obtaining and decoding the next one. If that's not the * case, we output the next attribute. */ if (state->dtup == NULL) { BrinTuple *tup; MemoryContext mctx; ItemId itemId; /* deformed tuple must live across calls */ mctx = MemoryContextSwitchTo(fctx->multi_call_memory_ctx); /* verify item status: if there's no data, we can't decode */ itemId = PageGetItemId(state->page, state->offset); if (ItemIdIsUsed(itemId)) { tup = (BrinTuple *) PageGetItem(state->page, PageGetItemId(state->page, state->offset)); state->dtup = brin_deform_tuple(state->bdesc, tup); state->attno = 1; state->unusedItem = false; } else state->unusedItem = true; MemoryContextSwitchTo(mctx); } else state->attno++; MemSet(nulls, 0, sizeof(nulls)); if (state->unusedItem) { values[0] = UInt16GetDatum(state->offset); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; } else { int att = state->attno - 1; values[0] = UInt16GetDatum(state->offset); values[1] = UInt32GetDatum(state->dtup->bt_blkno); values[2] = UInt16GetDatum(state->attno); values[3] = BoolGetDatum(state->dtup->bt_columns[att].bv_allnulls); values[4] = BoolGetDatum(state->dtup->bt_columns[att].bv_hasnulls); values[5] = BoolGetDatum(state->dtup->bt_placeholder); if (!state->dtup->bt_columns[att].bv_allnulls) { BrinValues *bvalues = &state->dtup->bt_columns[att]; StringInfoData s; bool first; int i; initStringInfo(&s); appendStringInfoChar(&s, '{'); first = true; for (i = 0; i < state->columns[att]->nstored; i++) { char *val; if (!first) appendStringInfoString(&s, " .. "); first = false; val = OutputFunctionCall(&state->columns[att]->outputFn[i], bvalues->bv_values[i]); appendStringInfoString(&s, val); pfree(val); } appendStringInfoChar(&s, '}'); values[6] = CStringGetTextDatum(s.data); pfree(s.data); } else { nulls[6] = true; } } result = heap_form_tuple(fctx->tuple_desc, values, nulls); /* * If the item was unused, jump straight to the next one; otherwise, * the only cleanup needed here is to set our signal to go to the next * tuple in the following iteration, by freeing the current one. */ if (state->unusedItem) state->offset = OffsetNumberNext(state->offset); else if (state->attno >= state->bdesc->bd_tupdesc->natts) { pfree(state->dtup); state->dtup = NULL; state->offset = OffsetNumberNext(state->offset); } /* * If we're beyond the end of the page, set flag to end the function in * the following iteration. */ if (state->offset > PageGetMaxOffsetNumber(state->page)) state->done = true; SRF_RETURN_NEXT(fctx, HeapTupleGetDatum(result)); } brin_free_desc(state->bdesc); SRF_RETURN_DONE(fctx); }
/* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the initial buckets, and the initial bitmap page. * * The initial number of buckets is dependent on num_tuples, an estimate * of the number of tuples to be loaded into the index initially. The * chosen number of buckets is returned. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ uint32 _hash_metapinit(Relation rel, double num_tuples, ForkNumber forkNum) { HashMetaPage metap; HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; Page pg; int32 data_width; int32 item_width; int32 ffactor; double dnumbuckets; uint32 num_buckets; uint32 log2_num_buckets; uint32 i; /* safety check */ if (RelationGetNumberOfBlocksInFork(rel, forkNum) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly since the index datatype (i.e. uint32 hash key) is fixed-width. */ data_width = sizeof(uint32); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. We round up the result to the * next power of 2, however, and always force at least 2 bucket pages. The * upper limit is determined by considerations explained in * _hash_expandtable(). */ dnumbuckets = num_tuples / ffactor; if (dnumbuckets <= 2.0) num_buckets = 2; else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets); log2_num_buckets = _hash_log2(num_buckets); Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); /* * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. */ metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, forkNum); pg = BufferGetPage(metabuf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; metap = HashPageGetMeta(pg); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = HashGetMaxBitmapSize(pg); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); /* * Label the index with its primary hash support function's OID. This is * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); /* * We initialize the index with N buckets, 0 .. N-1, occupying physical * blocks 1 to N. The first freespace bitmap page is in block N+1. Since * N is a power of 2, we can set the masks this way: */ metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; metap->hashm_highmask = (num_buckets << 1) - 1; MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ metap->hashm_spares[log2_num_buckets] = 1; metap->hashm_ovflpoint = log2_num_buckets; metap->hashm_firstfree = 0; /* * Release buffer lock on the metapage while we initialize buckets. * Otherwise, we'll be in interrupt holdoff and the CHECK_FOR_INTERRUPTS * won't accomplish anything. It's a bad idea to hold buffer locks for * long intervals in any case, since that can block the bgwriter. */ _hash_chgbufaccess(rel, metabuf, HASH_WRITE, HASH_NOLOCK); /* * Initialize the first N buckets */ for (i = 0; i < num_buckets; i++) { /* Allow interrupts, in case N is huge */ CHECK_FOR_INTERRUPTS(); buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), forkNum); pg = BufferGetPage(buf); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; _hash_wrtbuf(rel, buf); } /* Now reacquire buffer lock on metapage */ _hash_chgbufaccess(rel, metabuf, HASH_NOLOCK, HASH_WRITE); /* * Initialize first bitmap page */ _hash_initbitmap(rel, metap, num_buckets + 1, forkNum); /* all done */ _hash_wrtbuf(rel, metabuf); return num_buckets; }
/* * visibilitymap_truncate - truncate the visibility map * * The caller must hold AccessExclusiveLock on the relation, to ensure that * other backends receive the smgr invalidation event that this function sends * before they access the VM again. * * nheapblocks is the new size of the heap. */ void visibilitymap_truncate(Relation rel, BlockNumber nheapblocks) { BlockNumber newnblocks; /* last remaining block, byte, and bit */ BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks); uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks); uint8 truncBit = HEAPBLK_TO_MAPBIT(nheapblocks); #ifdef TRACE_VISIBILITYMAP elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks); #endif RelationOpenSmgr(rel); /* * If no visibility map has been created yet for this relation, there's * nothing to truncate. */ if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM)) return; /* * Unless the new size is exactly at a visibility map page boundary, the * tail bits in the last remaining map page, representing truncated heap * blocks, need to be cleared. This is not only tidy, but also necessary * because we don't get a chance to clear the bits if the heap is extended * again. */ if (truncByte != 0 || truncBit != 0) { Buffer mapBuffer; Page page; char *map; newnblocks = truncBlock + 1; mapBuffer = vm_readbuf(rel, truncBlock, false); if (!BufferIsValid(mapBuffer)) { /* nothing to do, the file was already smaller */ return; } page = BufferGetPage(mapBuffer); map = PageGetContents(page); LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE); /* Clear out the unwanted bytes. */ MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1)); /* * Mask out the unwanted bits of the last remaining byte. * * ((1 << 0) - 1) = 00000000 ((1 << 1) - 1) = 00000001 ... ((1 << 6) - * 1) = 00111111 ((1 << 7) - 1) = 01111111 */ map[truncByte] &= (1 << truncBit) - 1; MarkBufferDirty(mapBuffer); UnlockReleaseBuffer(mapBuffer); } else newnblocks = truncBlock; if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks) { /* nothing to do, the file was already smaller than requested size */ return; } /* Truncate the unused VM pages, and send smgr inval message */ smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks); /* * We might as well update the local smgr_vm_nblocks setting. smgrtruncate * sent an smgr cache inval message, which will cause other backends to * invalidate their copy of smgr_vm_nblocks, and this one too at the next * command boundary. But this ensures it isn't outright wrong until then. */ if (rel->rd_smgr) rel->rd_smgr->smgr_vm_nblocks = newnblocks; }
/* * pg_lock_status - produce a view with one row per held or awaited lock mode */ Datum pg_lock_status(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; PG_Lock_Status *mystatus; LockData *lockData; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext oldcontext; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* build tupdesc for result tuples */ /* this had better match pg_locks view in system_views.sql */ tupdesc = CreateTemplateTupleDesc(16, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "locktype", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "relation", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "page", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 5, "tuple", INT2OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 6, "transactionid", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 7, "classid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 8, "objid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 9, "objsubid", INT2OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 10, "transaction", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 11, "pid", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 12, "mode", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 13, "granted", BOOLOID, -1, 0); /* * These next columns are specific to GPDB */ TupleDescInitEntry(tupdesc, (AttrNumber) 14, "mppSessionId", INT4OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 15, "mppIsWriter", BOOLOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 16, "gp_segment_id", INT4OID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* * Collect all the locking information that we will format and send * out as a result set. */ mystatus = (PG_Lock_Status *) palloc(sizeof(PG_Lock_Status)); funcctx->user_fctx = (void *) mystatus; mystatus->lockData = GetLockStatusData(); mystatus->currIdx = 0; mystatus->numSegLocks = 0; mystatus->numsegresults = 0; mystatus->segresults = NULL; /* * Seeing the locks just from the masterDB isn't enough to know what is locked, * or if there is a deadlock. That's because the segDBs also take locks. * Some locks show up only on the master, some only on the segDBs, and some on both. * * So, let's collect the lock information from all the segDBs. Sure, this means * there are a lot more rows coming back from pg_locks than before, since most locks * on the segDBs happen across all the segDBs at the same time. But not always, * so let's play it safe and get them all. */ if (Gp_role == GP_ROLE_DISPATCH) { int resultCount = 0; struct pg_result **results = NULL; StringInfoData buffer; StringInfoData errbuf; int i; initStringInfo(&buffer); /* * This query has to match the tupledesc we just made above. */ appendStringInfo(&buffer, "SELECT * FROM pg_lock_status() L " " (locktype text, database oid, relation oid, page int4, tuple int2," " transactionid xid, classid oid, objid oid, objsubid int2," " transaction xid, pid int4, mode text, granted boolean, " " mppSessionId int4, mppIsWriter boolean, gp_segment_id int4) "); initStringInfo(&errbuf); /* * Why dispatch something here, rather than do a UNION ALL in pg_locks view, and * a join to gp_dist_random('gp_id')? There are several important reasons. * * The union all method is much slower, and requires taking locks on gp_id. * More importantly, applications such as pgAdmin do queries of this view that * involve a correlated subqueries joining to other catalog tables, * which works if we do it this way, but fails * if the view includes the union all. That completely breaks the server status * display in pgAdmin. * * Why dispatch this way, rather than via SPI? There are several advantages. * First, it's easy to get "writer gang is busy" errors if we use SPI. * * Second, this should be much faster, as it doesn't require setting up * the interconnect, and doesn't need to touch any actual data tables to be * able to get the gp_segment_id. * * The downside is we get n result sets, where n == number of segDBs. * * It would be better yet if we sent a plan tree rather than a text string, * so the segDBs don't need to parse it. That would also avoid taking any relation locks * on the segDB to get this info (normally need to get an accessShareLock on pg_locks on the segDB * to make sure it doesn't go away during parsing). But the only safe way I know to do this * is to hand-build the plan tree, and I'm to lazy to do it right now. It's just a matter of * building a function scan node, and filling it in with our result set info (from the tupledesc). * * One thing to note: it's OK to join pg_locks with any catalog table or master-only table, * but joining to a distributed table will result in "writer gang busy: possible attempt to * execute volatile function in unsupported context" errors, because * the scan of the distributed table might already be running on the writer gang * when we want to dispatch this. * * This could be fixed by allocating a reader gang and dispatching to that, but the cost * of setting up a new gang is high, and I've never seen anyone need to join this to a * distributed table. * */ results = cdbdisp_dispatchRMCommand(buffer.data, true, &errbuf, &resultCount); if (errbuf.len > 0) ereport(ERROR, (errmsg("pg_lock internal error (gathered %d results from cmd '%s')", resultCount, buffer.data), errdetail("%s", errbuf.data))); /* * I don't think resultCount can ever be zero if errbuf isn't set. * But check to be sure. */ if (resultCount == 0) elog(ERROR, "pg_locks didn't get back any data from the segDBs"); for (i = 0; i < resultCount; i++) { /* * Any error here should have propagated into errbuf, so we shouldn't * ever see anything other that tuples_ok here. But, check to be * sure. */ if (PQresultStatus(results[i]) != PGRES_TUPLES_OK) { elog(ERROR,"pg_locks: resultStatus not tuples_Ok"); } else { /* * numSegLocks needs to be the total size we are returning to * the application. At the start of this loop, it has the count * for the masterDB locks. Add each of the segDB lock counts. */ mystatus->numSegLocks += PQntuples(results[i]); } } pfree(errbuf.data); mystatus->numsegresults = resultCount; /* * cdbdisp_dispatchRMCommand copies the result sets into our memory, which * will still exist on the subsequent calls. */ mystatus->segresults = results; MemoryContextSwitchTo(oldcontext); } } funcctx = SRF_PERCALL_SETUP(); mystatus = (PG_Lock_Status *) funcctx->user_fctx; lockData = mystatus->lockData; /* * This loop returns all the local lock data from the segment we are running on. */ while (mystatus->currIdx < lockData->nelements) { PROCLOCK *proclock; LOCK *lock; PGPROC *proc; bool granted; LOCKMODE mode = 0; const char *locktypename; char tnbuf[32]; Datum values[16]; bool nulls[16]; HeapTuple tuple; Datum result; proclock = &(lockData->proclocks[mystatus->currIdx]); lock = &(lockData->locks[mystatus->currIdx]); proc = &(lockData->procs[mystatus->currIdx]); /* * Look to see if there are any held lock modes in this PROCLOCK. If * so, report, and destructively modify lockData so we don't report * again. */ granted = false; if (proclock->holdMask) { for (mode = 0; mode < MAX_LOCKMODES; mode++) { if (proclock->holdMask & LOCKBIT_ON(mode)) { granted = true; proclock->holdMask &= LOCKBIT_OFF(mode); break; } } } /* * If no (more) held modes to report, see if PROC is waiting for a * lock on this lock. */ if (!granted) { if (proc->waitLock == proclock->tag.myLock) { /* Yes, so report it with proper mode */ mode = proc->waitLockMode; /* * We are now done with this PROCLOCK, so advance pointer to * continue with next one on next call. */ mystatus->currIdx++; } else { /* * Okay, we've displayed all the locks associated with this * PROCLOCK, proceed to the next one. */ mystatus->currIdx++; continue; } } /* * Form tuple with appropriate data. */ MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); if (lock->tag.locktag_type <= LOCKTAG_ADVISORY) locktypename = LockTagTypeNames[lock->tag.locktag_type]; else { snprintf(tnbuf, sizeof(tnbuf), "unknown %d", (int) lock->tag.locktag_type); locktypename = tnbuf; } values[0] = CStringGetTextDatum(locktypename); switch (lock->tag.locktag_type) { case LOCKTAG_RELATION: case LOCKTAG_RELATION_EXTEND: case LOCKTAG_RELATION_RESYNCHRONIZE: values[1] = ObjectIdGetDatum(lock->tag.locktag_field1); values[2] = ObjectIdGetDatum(lock->tag.locktag_field2); nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; break; case LOCKTAG_PAGE: values[1] = ObjectIdGetDatum(lock->tag.locktag_field1); values[2] = ObjectIdGetDatum(lock->tag.locktag_field2); values[3] = UInt32GetDatum(lock->tag.locktag_field3); nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; break; case LOCKTAG_TUPLE: values[1] = ObjectIdGetDatum(lock->tag.locktag_field1); values[2] = ObjectIdGetDatum(lock->tag.locktag_field2); values[3] = UInt32GetDatum(lock->tag.locktag_field3); values[4] = UInt16GetDatum(lock->tag.locktag_field4); nulls[5] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; break; case LOCKTAG_TRANSACTION: values[5] = TransactionIdGetDatum(lock->tag.locktag_field1); nulls[1] = true; nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[6] = true; nulls[7] = true; nulls[8] = true; break; case LOCKTAG_RELATION_APPENDONLY_SEGMENT_FILE: values[1] = ObjectIdGetDatum(lock->tag.locktag_field1); values[2] = ObjectIdGetDatum(lock->tag.locktag_field2); values[7] = ObjectIdGetDatum(lock->tag.locktag_field3); nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[8] = true; break; case LOCKTAG_RESOURCE_QUEUE: values[1] = ObjectIdGetDatum(proc->databaseId); values[7] = ObjectIdGetDatum(lock->tag.locktag_field1); nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; nulls[6] = true; nulls[8] = true; break; case LOCKTAG_OBJECT: case LOCKTAG_USERLOCK: case LOCKTAG_ADVISORY: default: /* treat unknown locktags like OBJECT */ values[1] = ObjectIdGetDatum(lock->tag.locktag_field1); values[6] = ObjectIdGetDatum(lock->tag.locktag_field2); values[7] = ObjectIdGetDatum(lock->tag.locktag_field3); values[8] = Int16GetDatum(lock->tag.locktag_field4); nulls[2] = true; nulls[3] = true; nulls[4] = true; nulls[5] = true; break; } values[9] = TransactionIdGetDatum(proc->xid); if (proc->pid != 0) values[10] = Int32GetDatum(proc->pid); else nulls[10] = true; values[11] = DirectFunctionCall1(textin, CStringGetDatum((char *) GetLockmodeName(LOCK_LOCKMETHOD(*lock), mode))); values[12] = BoolGetDatum(granted); values[13] = Int32GetDatum(proc->mppSessionId); values[14] = Int32GetDatum(proc->mppIsWriter); values[15] = Int32GetDatum(Gp_segment); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } /* * This loop only executes on the masterDB and only in dispatch mode, because that * is the only time we dispatched to the segDBs. */ while (mystatus->currIdx >= lockData->nelements && mystatus->currIdx < lockData->nelements + mystatus->numSegLocks) { HeapTuple tuple; Datum result; Datum values[16]; bool nulls[16]; int i; int whichresultset = 0; int whichelement = mystatus->currIdx - lockData->nelements; int whichrow = whichelement; Assert(Gp_role == GP_ROLE_DISPATCH); /* * Because we have one result set per segDB (rather than one big result set with everything), * we need to figure out which result set we are on, and which row within that result set * we are returning. * * So, we walk through all the result sets and all the rows in each one, in order. */ while(whichrow >= PQntuples(mystatus->segresults[whichresultset])) { whichrow -= PQntuples(mystatus->segresults[whichresultset]); whichresultset++; if (whichresultset >= mystatus->numsegresults) break; } /* * If this condition is true, we have already sent everything back, * and we just want to do the SRF_RETURN_DONE */ if (whichresultset >= mystatus->numsegresults) break; mystatus->currIdx++; /* * Form tuple with appropriate data we got from the segDBs */ MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); /* * For each column, extract out the value (which comes out in text). * Convert it to the appropriate datatype to match our tupledesc, * and put that in values. * The columns look like this (from select statement earlier): * * " (locktype text, database oid, relation oid, page int4, tuple int2," * " transactionid xid, classid oid, objid oid, objsubid int2," * " transaction xid, pid int4, mode text, granted boolean, " * " mppSessionId int4, mppIsWriter boolean, gp_segment_id int4) ," */ values[0] = CStringGetTextDatum(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 0)); values[1] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 1))); values[2] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 2))); values[3] = UInt32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 3))); values[4] = UInt16GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 4))); values[5] = TransactionIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 5))); values[6] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 6))); values[7] = ObjectIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 7))); values[8] = UInt16GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 8))); values[9] = TransactionIdGetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow, 9))); values[10] = UInt32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow,10))); values[11] = CStringGetTextDatum(PQgetvalue(mystatus->segresults[whichresultset], whichrow,11)); values[12] = BoolGetDatum(strncmp(PQgetvalue(mystatus->segresults[whichresultset], whichrow,12),"t",1)==0); values[13] = Int32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow,13))); values[14] = BoolGetDatum(strncmp(PQgetvalue(mystatus->segresults[whichresultset], whichrow,14),"t",1)==0); values[15] = Int32GetDatum(atoi(PQgetvalue(mystatus->segresults[whichresultset], whichrow,15))); /* * Copy the null info over. It should all match properly. */ for (i=0; i<16; i++) { nulls[i] = PQgetisnull(mystatus->segresults[whichresultset], whichrow, i); } tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } /* * if we dispatched to the segDBs, free up the memory holding the result sets. * Otherwise we might leak this memory each time we got called (does it automatically * get freed by the pool being deleted? Probably, but this is safer). */ if (mystatus->segresults != NULL) { int i; for (i = 0; i < mystatus->numsegresults; i++) PQclear(mystatus->segresults[i]); free(mystatus->segresults); } SRF_RETURN_DONE(funcctx); }
/* * Create a table space * * Only superusers can create a tablespace. This seems a reasonable restriction * since we're determining the system layout and, anyway, we probably have * root if we're doing this kind of activity */ Oid CreateTableSpace(CreateTableSpaceStmt *stmt) { #ifdef HAVE_SYMLINK Relation rel; Datum values[Natts_pg_tablespace]; bool nulls[Natts_pg_tablespace]; HeapTuple tuple; Oid tablespaceoid; char *location; Oid ownerId; /* Must be super user */ if (!superuser()) ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), errmsg("permission denied to create tablespace \"%s\"", stmt->tablespacename), errhint("Must be superuser to create a tablespace."))); /* However, the eventual owner of the tablespace need not be */ if (stmt->owner) ownerId = get_role_oid(stmt->owner, false); else ownerId = GetUserId(); /* Unix-ify the offered path, and strip any trailing slashes */ location = pstrdup(stmt->location); canonicalize_path(location); /* disallow quotes, else CREATE DATABASE would be at risk */ if (strchr(location, '\'')) ereport(ERROR, (errcode(ERRCODE_INVALID_NAME), errmsg("tablespace location cannot contain single quotes"))); /* * Allowing relative paths seems risky * * this also helps us ensure that location is not empty or whitespace */ if (!is_absolute_path(location)) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location must be an absolute path"))); /* * Check that location isn't too long. Remember that we're going to append * 'PG_XXX/<dboid>/<relid>.<nnn>'. FYI, we never actually reference the * whole path, but mkdir() uses the first two parts. */ if (strlen(location) + 1 + strlen(TABLESPACE_VERSION_DIRECTORY) + 1 + OIDCHARS + 1 + OIDCHARS + 1 + OIDCHARS > MAXPGPATH) ereport(ERROR, (errcode(ERRCODE_INVALID_OBJECT_DEFINITION), errmsg("tablespace location \"%s\" is too long", location))); /* * Disallow creation of tablespaces named "pg_xxx"; we reserve this * namespace for system purposes. */ if (!allowSystemTableMods && IsReservedName(stmt->tablespacename)) ereport(ERROR, (errcode(ERRCODE_RESERVED_NAME), errmsg("unacceptable tablespace name \"%s\"", stmt->tablespacename), errdetail("The prefix \"pg_\" is reserved for system tablespaces."))); /* * Check that there is no other tablespace by this name. (The unique * index would catch this anyway, but might as well give a friendlier * message.) */ if (OidIsValid(get_tablespace_oid(stmt->tablespacename, true))) ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("tablespace \"%s\" already exists", stmt->tablespacename))); /* * Insert tuple into pg_tablespace. The purpose of doing this first is to * lock the proposed tablename against other would-be creators. The * insertion will roll back if we find problems below. */ rel = heap_open(TableSpaceRelationId, RowExclusiveLock); MemSet(nulls, false, sizeof(nulls)); values[Anum_pg_tablespace_spcname - 1] = DirectFunctionCall1(namein, CStringGetDatum(stmt->tablespacename)); values[Anum_pg_tablespace_spcowner - 1] = ObjectIdGetDatum(ownerId); nulls[Anum_pg_tablespace_spcacl - 1] = true; nulls[Anum_pg_tablespace_spcoptions - 1] = true; tuple = heap_form_tuple(rel->rd_att, values, nulls); tablespaceoid = simple_heap_insert(rel, tuple); CatalogUpdateIndexes(rel, tuple); heap_freetuple(tuple); /* Record dependency on owner */ recordDependencyOnOwner(TableSpaceRelationId, tablespaceoid, ownerId); /* Post creation hook for new tablespace */ InvokeObjectPostCreateHook(TableSpaceRelationId, tablespaceoid, 0); create_tablespace_directories(location, tablespaceoid); /* Record the filesystem change in XLOG */ { xl_tblspc_create_rec xlrec; XLogRecData rdata[2]; xlrec.ts_id = tablespaceoid; rdata[0].data = (char *) &xlrec; rdata[0].len = offsetof(xl_tblspc_create_rec, ts_path); rdata[0].buffer = InvalidBuffer; rdata[0].next = &(rdata[1]); rdata[1].data = (char *) location; rdata[1].len = strlen(location) + 1; rdata[1].buffer = InvalidBuffer; rdata[1].next = NULL; (void) XLogInsert(RM_TBLSPC_ID, XLOG_TBLSPC_CREATE, rdata); } /* * Force synchronous commit, to minimize the window between creating the * symlink on-disk and marking the transaction committed. It's not great * that there is any window at all, but definitely we don't want to make * it larger than necessary. */ ForceSyncCommit(); pfree(location); /* We keep the lock on pg_tablespace until commit */ heap_close(rel, NoLock); #else /* !HAVE_SYMLINK */ ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("tablespaces are not supported on this platform"))); #endif /* HAVE_SYMLINK */ return tablespaceoid; }
/* * MarkAsPreparing * Reserve the GID for the given transaction. * * Internally, this creates a gxact struct and puts it into the active array. * NOTE: this is also used when reloading a gxact after a crash; so avoid * assuming that we can use very much backend context. */ GlobalTransaction MarkAsPreparing(TransactionId xid, const char *gid, TimestampTz prepared_at, Oid owner, Oid databaseid) { GlobalTransaction gxact; int i; if (strlen(gid) >= GIDSIZE) ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("transaction identifier \"%s\" is too long", gid))); LWLockAcquire(TwoPhaseStateLock, LW_EXCLUSIVE); /* * First, find and recycle any gxacts that failed during prepare. We do * this partly to ensure we don't mistakenly say their GIDs are still * reserved, and partly so we don't fail on out-of-slots unnecessarily. */ for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; if (!gxact->valid && !TransactionIdIsActive(gxact->locking_xid)) { /* It's dead Jim ... remove from the active array */ TwoPhaseState->numPrepXacts--; TwoPhaseState->prepXacts[i] = TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts]; /* and put it back in the freelist */ gxact->proc.links.next = TwoPhaseState->freeGXacts; TwoPhaseState->freeGXacts = MAKE_OFFSET(gxact); /* Back up index count too, so we don't miss scanning one */ i--; } } /* Check for conflicting GID */ for (i = 0; i < TwoPhaseState->numPrepXacts; i++) { gxact = TwoPhaseState->prepXacts[i]; if (strcmp(gxact->gid, gid) == 0) { ereport(ERROR, (errcode(ERRCODE_DUPLICATE_OBJECT), errmsg("transaction identifier \"%s\" is already in use", gid))); } } /* Get a free gxact from the freelist */ if (TwoPhaseState->freeGXacts == INVALID_OFFSET) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("maximum number of prepared transactions reached"), errhint("Increase max_prepared_transactions (currently %d).", max_prepared_xacts))); gxact = (GlobalTransaction) MAKE_PTR(TwoPhaseState->freeGXacts); TwoPhaseState->freeGXacts = gxact->proc.links.next; /* Initialize it */ MemSet(&gxact->proc, 0, sizeof(PGPROC)); SHMQueueElemInit(&(gxact->proc.links)); gxact->proc.waitStatus = STATUS_OK; gxact->proc.xid = xid; gxact->proc.xmin = InvalidTransactionId; gxact->proc.pid = 0; gxact->proc.databaseId = databaseid; gxact->proc.roleId = owner; gxact->proc.inVacuum = false; gxact->proc.lwWaiting = false; gxact->proc.lwExclusive = false; gxact->proc.lwWaitLink = NULL; gxact->proc.waitLock = NULL; gxact->proc.waitProcLock = NULL; for (i = 0; i < NUM_LOCK_PARTITIONS; i++) SHMQueueInit(&(gxact->proc.myProcLocks[i])); /* subxid data must be filled later by GXactLoadSubxactData */ gxact->proc.subxids.overflowed = false; gxact->proc.subxids.nxids = 0; gxact->prepared_at = prepared_at; /* initialize LSN to 0 (start of WAL) */ gxact->prepare_lsn.xlogid = 0; gxact->prepare_lsn.xrecoff = 0; gxact->owner = owner; gxact->locking_xid = xid; gxact->valid = false; strcpy(gxact->gid, gid); /* And insert it into the active array */ Assert(TwoPhaseState->numPrepXacts < max_prepared_xacts); TwoPhaseState->prepXacts[TwoPhaseState->numPrepXacts++] = gxact; LWLockRelease(TwoPhaseStateLock); return gxact; }
Datum hstore_from_record(PG_FUNCTION_ARGS) { HeapTupleHeader rec; int4 buflen; HStore *out; Pairs *pairs; Oid tupType; int32 tupTypmod; TupleDesc tupdesc; HeapTupleData tuple; RecordIOData *my_extra; int ncolumns; int i, j; Datum *values; bool *nulls; if (PG_ARGISNULL(0)) { Oid argtype = get_fn_expr_argtype(fcinfo->flinfo, 0); /* * have no tuple to look at, so the only source of type info is the * argtype. The lookup_rowtype_tupdesc call below will error out if we * don't have a known composite type oid here. */ tupType = argtype; tupTypmod = -1; rec = NULL; } else { rec = PG_GETARG_HEAPTUPLEHEADER(0); /* Extract type info from the tuple itself */ tupType = HeapTupleHeaderGetTypeId(rec); tupTypmod = HeapTupleHeaderGetTypMod(rec); } tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); ncolumns = tupdesc->natts; /* * We arrange to look up the needed I/O info just once per series of * calls, assuming the record type doesn't change underneath us. */ my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra; if (my_extra == NULL || my_extra->ncolumns != ncolumns) { fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, sizeof(RecordIOData) - sizeof(ColumnIOData) + ncolumns * sizeof(ColumnIOData)); my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra; my_extra->record_type = InvalidOid; my_extra->record_typmod = 0; } if (my_extra->record_type != tupType || my_extra->record_typmod != tupTypmod) { MemSet(my_extra, 0, sizeof(RecordIOData) - sizeof(ColumnIOData) + ncolumns * sizeof(ColumnIOData)); my_extra->record_type = tupType; my_extra->record_typmod = tupTypmod; my_extra->ncolumns = ncolumns; } pairs = palloc(ncolumns * sizeof(Pairs)); if (rec) { /* Build a temporary HeapTuple control structure */ tuple.t_len = HeapTupleHeaderGetDatumLength(rec); ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; tuple.t_data = rec; values = (Datum *) palloc(ncolumns * sizeof(Datum)); nulls = (bool *) palloc(ncolumns * sizeof(bool)); /* Break down the tuple into fields */ heap_deform_tuple(&tuple, tupdesc, values, nulls); } else { values = NULL; nulls = NULL; } for (i = 0, j = 0; i < ncolumns; ++i) { ColumnIOData *column_info = &my_extra->columns[i]; Oid column_type = tupdesc->attrs[i]->atttypid; char *value; /* Ignore dropped columns in datatype */ if (tupdesc->attrs[i]->attisdropped) continue; pairs[j].key = NameStr(tupdesc->attrs[i]->attname); pairs[j].keylen = hstoreCheckKeyLen(strlen(NameStr(tupdesc->attrs[i]->attname))); if (!nulls || nulls[i]) { pairs[j].val = NULL; pairs[j].vallen = 4; pairs[j].isnull = true; pairs[j].needfree = false; ++j; continue; } /* * Convert the column value to text */ if (column_info->column_type != column_type) { bool typIsVarlena; getTypeOutputInfo(column_type, &column_info->typiofunc, &typIsVarlena); fmgr_info_cxt(column_info->typiofunc, &column_info->proc, fcinfo->flinfo->fn_mcxt); column_info->column_type = column_type; } value = OutputFunctionCall(&column_info->proc, values[i]); pairs[j].val = value; pairs[j].vallen = hstoreCheckValLen(strlen(value)); pairs[j].isnull = false; pairs[j].needfree = false; ++j; } ncolumns = hstoreUniquePairs(pairs, j, &buflen); out = hstorePairs(pairs, ncolumns, buflen); ReleaseTupleDesc(tupdesc); PG_RETURN_POINTER(out); }
int inv_write(LargeObjectDesc *obj_desc, const char *buf, int nbytes) { int nwritten = 0; int n; int off; int len; int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); ScanKeyData skey[2]; SysScanDesc sd; HeapTuple oldtuple; Form_pg_largeobject olddata; bool neednextpage; bytea *datafield; bool pfreeit; struct { bytea hdr; char data[LOBLKSIZE]; /* make struct big enough */ int32 align_it; /* ensure struct is aligned well enough */ } workbuf; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; bool nulls[Natts_pg_largeobject]; bool replace[Natts_pg_largeobject]; CatalogIndexState indstate; Assert(PointerIsValid(obj_desc)); Assert(buf != NULL); /* enforce writability because snapshot is probably wrong otherwise */ if ((obj_desc->flags & IFS_WRLOCK) == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("large object %u was not opened for writing", obj_desc->id))); /* check existence of the target largeobject */ if (!LargeObjectExists(obj_desc->id)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("large object %u was already dropped", obj_desc->id))); if (nbytes <= 0) return 0; open_lo_relation(); indstate = CatalogOpenIndexes(lo_heap_r); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = systable_beginscan_ordered(lo_heap_r, lo_index_r, obj_desc->snapshot, 2, skey); oldtuple = NULL; olddata = NULL; neednextpage = true; while (nwritten < nbytes) { /* * If possible, get next pre-existing page of the LO. We expect the * indexscan will deliver these in order --- but there may be holes. */ if (neednextpage) { if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) { if (HeapTupleHasNulls(oldtuple)) /* paranoia */ elog(ERROR, "null field found in pg_largeobject"); olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); Assert(olddata->pageno >= pageno); } neednextpage = false; } /* * If we have a pre-existing page, see if it is the page we want to * write, or a later one. */ if (olddata != NULL && olddata->pageno == pageno) { /* * Update an existing page with fresh data. * * First, load old data into workbuf */ datafield = &(olddata->data); /* see note at top of file */ pfreeit = false; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((struct varlena *) datafield); pfreeit = true; } len = getbytealen(datafield); Assert(len <= LOBLKSIZE); memcpy(workb, VARDATA(datafield), len); if (pfreeit) pfree(datafield); /* * Fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > len) MemSet(workb + len, 0, off - len); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; off += n; /* compute valid length of new page */ len = (len >= off) ? len : off; SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ); /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); memset(replace, false, sizeof(replace)); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); replace[Anum_pg_largeobject_data - 1] = true; newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r), values, nulls, replace); simple_heap_update(lo_heap_r, &newtup->t_self, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); /* * We're done with this old page. */ oldtuple = NULL; olddata = NULL; neednextpage = true; } else { /* * Write a brand new page. * * First, fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > 0) MemSet(workb, 0, off); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; /* compute valid length of new page */ len = off + n; SET_VARSIZE(&workbuf.hdr, len + VARHDRSZ); /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls); simple_heap_insert(lo_heap_r, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } pageno++; } systable_endscan_ordered(sd); CatalogCloseIndexes(indstate); /* * Advance command counter so that my tuple updates will be seen by later * large-object operations in this transaction. */ CommandCounterIncrement(); return nwritten; }
void inv_truncate(LargeObjectDesc *obj_desc, int len) { int32 pageno = (int32) (len / LOBLKSIZE); int off; ScanKeyData skey[2]; SysScanDesc sd; HeapTuple oldtuple; Form_pg_largeobject olddata; struct { bytea hdr; char data[LOBLKSIZE]; /* make struct big enough */ int32 align_it; /* ensure struct is aligned well enough */ } workbuf; char *workb = VARDATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; bool nulls[Natts_pg_largeobject]; bool replace[Natts_pg_largeobject]; CatalogIndexState indstate; Assert(PointerIsValid(obj_desc)); /* enforce writability because snapshot is probably wrong otherwise */ if ((obj_desc->flags & IFS_WRLOCK) == 0) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), errmsg("large object %u was not opened for writing", obj_desc->id))); /* check existence of the target largeobject */ if (!LargeObjectExists(obj_desc->id)) ereport(ERROR, (errcode(ERRCODE_UNDEFINED_OBJECT), errmsg("large object %u was already dropped", obj_desc->id))); open_lo_relation(); indstate = CatalogOpenIndexes(lo_heap_r); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = systable_beginscan_ordered(lo_heap_r, lo_index_r, obj_desc->snapshot, 2, skey); /* * If possible, get the page the truncation point is in. The truncation * point may be beyond the end of the LO or in a hole. */ olddata = NULL; if ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) { if (HeapTupleHasNulls(oldtuple)) /* paranoia */ elog(ERROR, "null field found in pg_largeobject"); olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); Assert(olddata->pageno >= pageno); } /* * If we found the page of the truncation point we need to truncate the * data in it. Otherwise if we're in a hole, we need to create a page to * mark the end of data. */ if (olddata != NULL && olddata->pageno == pageno) { /* First, load old data into workbuf */ bytea *datafield = &(olddata->data); /* see note at top of * file */ bool pfreeit = false; int pagelen; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((struct varlena *) datafield); pfreeit = true; } pagelen = getbytealen(datafield); Assert(pagelen <= LOBLKSIZE); memcpy(workb, VARDATA(datafield), pagelen); if (pfreeit) pfree(datafield); /* * Fill any hole */ off = len % LOBLKSIZE; if (off > pagelen) MemSet(workb + pagelen, 0, off - pagelen); /* compute length of new page */ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ); /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); memset(replace, false, sizeof(replace)); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); replace[Anum_pg_largeobject_data - 1] = true; newtup = heap_modify_tuple(oldtuple, RelationGetDescr(lo_heap_r), values, nulls, replace); simple_heap_update(lo_heap_r, &newtup->t_self, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } else { /* * If the first page we found was after the truncation point, we're in * a hole that we'll fill, but we need to delete the later page. */ if (olddata != NULL && olddata->pageno > pageno) simple_heap_delete(lo_heap_r, &oldtuple->t_self); /* * Write a brand new page. * * Fill the hole up to the truncation point */ off = len % LOBLKSIZE; if (off > 0) MemSet(workb, 0, off); /* compute length of new page */ SET_VARSIZE(&workbuf.hdr, off + VARHDRSZ); /* * Form and insert new tuple */ memset(values, 0, sizeof(values)); memset(nulls, false, sizeof(nulls)); values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); newtup = heap_form_tuple(lo_heap_r->rd_att, values, nulls); simple_heap_insert(lo_heap_r, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } /* * Delete any pages after the truncation point */ while ((oldtuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) { simple_heap_delete(lo_heap_r, &oldtuple->t_self); } systable_endscan_ordered(sd); CatalogCloseIndexes(indstate); /* * Advance command counter so that tuple updates will be seen by later * large-object operations in this transaction. */ CommandCounterIncrement(); }
bool BootpTx(void){ int i; char *txPktBuf, *rxPktBuf; long delay; // delay : 1초 delay후 실패면 재시도. time_t now; txPktBuf = PktBuf; rxPktBuf = PktBuf; bootpState = BOOTP_CONTINUE; protocol = PROT_BOOTP; // make boop packet. MemSet(txPktBuf, 0, MAX_PKT_SIZE); SetBootpHeader((char *)(txPktBuf+ETHER_HDR_SIZE+IP_HDR_SIZE+UDP_HDR_SIZE)); SetUdpHeader((char *)(txPktBuf+ETHER_HDR_SIZE+IP_HDR_SIZE), bootps, bootpc, BOOTP_HDR_SIZE); SetIPHeader((char *)(txPktBuf+ETHER_HDR_SIZE), noIP, broadcastIP, UDP_HDR_SIZE+BOOTP_HDR_SIZE); SetEtherHeader(txPktBuf, broadcastEther, PROT_IP); //if (!NetInit()) return false; // view info. printf("Our Ethernet address : "); PrintEthAddr(clientEther); printf(".\n"); printf("Sending bootp packet...\n"); // bootp operation. for (i=0; i<TIMEOUT; i++){ // transmit bootp packet to host. printf("."); if (!TxPacket(txPktBuf, ETHER_HDR_SIZE+IP_HDR_SIZE+UDP_HDR_SIZE+BOOTP_HDR_SIZE)) break; // receive bootp packet from host. delay = GetTime(&now)+HZ; while (GetTime(&now)<delay && bootpState==BOOTP_CONTINUE){ RxPacket(rxPktBuf); if (bootpState==BOOTP_SUCCESS) break; } if (bootpState==BOOTP_SUCCESS) break; } printf("\n"); protocol = NOPROTOCOL; if (bootpState==BOOTP_SUCCESS){ printf("Bootp Packet received.\n"); printf("\tHost (server) Ethernet : "); PrintEthAddr(hostEther); printf("\n"); printf("\tHost (server) IP : "); PrintIPAddr(hostIP); printf("\n"); printf("\tClient (target) Ethernet : "); PrintEthAddr(clientEther); printf("\n"); printf("\tClient (target) IP : "); PrintIPAddr(clientIP); printf("\n"); printf("\n"); return true; } else { printf("Bootp packet is not received.\n\n"); return false; } return true; } // BootpTx.
int inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes) { int nread = 0; int n; int off; int len; int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); uint32 pageoff; ScanKeyData skey[2]; SysScanDesc sd; HeapTuple tuple; Assert(PointerIsValid(obj_desc)); Assert(buf != NULL); if (nbytes <= 0) return 0; open_lo_relation(); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = systable_beginscan_ordered(lo_heap_r, lo_index_r, obj_desc->snapshot, 2, skey); while ((tuple = systable_getnext_ordered(sd, ForwardScanDirection)) != NULL) { Form_pg_largeobject data; bytea *datafield; bool pfreeit; if (HeapTupleHasNulls(tuple)) /* paranoia */ elog(ERROR, "null field found in pg_largeobject"); data = (Form_pg_largeobject) GETSTRUCT(tuple); /* * We expect the indexscan will deliver pages in order. However, * there may be missing pages if the LO contains unwritten "holes". We * want missing sections to read out as zeroes. */ pageoff = ((uint32) data->pageno) * LOBLKSIZE; if (pageoff > obj_desc->offset) { n = pageoff - obj_desc->offset; n = (n <= (nbytes - nread)) ? n : (nbytes - nread); MemSet(buf + nread, 0, n); nread += n; obj_desc->offset += n; } if (nread < nbytes) { Assert(obj_desc->offset >= pageoff); off = (int) (obj_desc->offset - pageoff); Assert(off >= 0 && off < LOBLKSIZE); datafield = &(data->data); /* see note at top of file */ pfreeit = false; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((struct varlena *) datafield); pfreeit = true; } len = getbytealen(datafield); if (len > off) { n = len - off; n = (n <= (nbytes - nread)) ? n : (nbytes - nread); memcpy(buf + nread, VARDATA(datafield) + off, n); nread += n; obj_desc->offset += n; } if (pfreeit) pfree(datafield); } if (nread >= nbytes) break; } systable_endscan_ordered(sd); return nread; }
/* * _hash_metapinit() -- Initialize the metadata page of a hash index, * the two buckets that we begin with and the initial * bitmap page. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ void _hash_metapinit(Relation rel) { MIRROREDLOCK_BUFMGR_DECLARE; HashMetaPage metap; HashPageOpaque pageopaque; Buffer metabuf; Buffer buf; Page pg; int32 data_width; int32 item_width; int32 ffactor; uint16 i; /* safety check */ if (RelationGetNumberOfBlocks(rel) != 0) elog(ERROR, "cannot initialize non-empty hash index \"%s\"", RelationGetRelationName(rel)); /* * Determine the target fill factor (in tuples per bucket) for this index. * The idea is to make the fill factor correspond to pages about as full * as the user-settable fillfactor parameter says. We can compute it * exactly if the index datatype is fixed-width, but for var-width there's * some guessing involved. */ data_width = get_typavgwidth(RelationGetDescr(rel)->attrs[0]->atttypid, RelationGetDescr(rel)->attrs[0]->atttypmod); item_width = MAXALIGN(sizeof(IndexTupleData)) + MAXALIGN(data_width) + sizeof(ItemIdData); /* include the line pointer */ ffactor = RelationGetTargetPageUsage(rel, HASH_DEFAULT_FILLFACTOR) / item_width; /* keep to a sane range */ if (ffactor < 10) ffactor = 10; /* * We initialize the metapage, the first two bucket pages, and the * first bitmap page in sequence, using _hash_getnewbuf to cause * smgrextend() calls to occur. This ensures that the smgr level * has the right idea of the physical index length. */ // -------- MirroredLock ---------- MIRROREDLOCK_BUFMGR_LOCK; metabuf = _hash_getnewbuf(rel, HASH_METAPAGE, HASH_WRITE); pg = BufferGetPage(metabuf); _hash_pageinit(pg, BufferGetPageSize(metabuf)); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_filler = HASHO_FILL; metap = (HashMetaPage) pg; metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = BufferGetPageSize(metabuf); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= (metap->hashm_bsize - (MAXALIGN(sizeof(PageHeaderData)) + MAXALIGN(sizeof(HashPageOpaqueData))))) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); /* * We initialize the index with two buckets, 0 and 1, occupying physical * blocks 1 and 2. The first freespace bitmap page is in block 3. */ metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */ metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */ MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */ metap->hashm_ovflpoint = 1; metap->hashm_firstfree = 0; /* * Initialize the first two buckets */ for (i = 0; i <= 1; i++) { buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i), HASH_WRITE); pg = BufferGetPage(buf); _hash_pageinit(pg, BufferGetPageSize(buf)); pageopaque = (HashPageOpaque) PageGetSpecialPointer(pg); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = i; pageopaque->hasho_flag = LH_BUCKET_PAGE; pageopaque->hasho_filler = HASHO_FILL; _hash_wrtbuf(rel, buf); } /* * Initialize first bitmap page */ _hash_initbitmap(rel, metap, 3); /* all done */ _hash_wrtbuf(rel, metabuf); MIRROREDLOCK_BUFMGR_UNLOCK; // -------- MirroredLock ---------- }
/* * parseInput subroutine to read a 'B' or 'D' (row data) message. * We add another tuple to the existing PGresult structure. * Returns: 0 if completed message, EOF if error or not enough data yet. * * Note that if we run out of data, we have to suspend and reprocess * the message after more data is received. We keep a partially constructed * tuple in conn->curTuple, and avoid reallocating already-allocated storage. */ static int getAnotherTuple(PGconn *conn, bool binary) { PGresult *result = conn->result; int nfields = result->numAttributes; PGresAttValue *tup; /* the backend sends us a bitmap of which attributes are null */ char std_bitmap[64]; /* used unless it doesn't fit */ char *bitmap = std_bitmap; int i; size_t nbytes; /* the number of bytes in bitmap */ char bmap; /* One byte of the bitmap */ int bitmap_index; /* Its index */ int bitcnt; /* number of bits examined in current byte */ int vlen; /* length of the current field value */ result->binary = binary; /* Allocate tuple space if first time for this data message */ if (conn->curTuple == NULL) { conn->curTuple = (PGresAttValue *) pqResultAlloc(result, nfields * sizeof(PGresAttValue), TRUE); if (conn->curTuple == NULL) goto outOfMemory; MemSet(conn->curTuple, 0, nfields * sizeof(PGresAttValue)); /* * If it's binary, fix the column format indicators. We assume the * backend will consistently send either B or D, not a mix. */ if (binary) { for (i = 0; i < nfields; i++) result->attDescs[i].format = 1; } } tup = conn->curTuple; /* Get the null-value bitmap */ nbytes = (nfields + BITS_PER_BYTE - 1) / BITS_PER_BYTE; /* malloc() only for unusually large field counts... */ if (nbytes > sizeof(std_bitmap)) { bitmap = (char *) malloc(nbytes); if (!bitmap) goto outOfMemory; } if (pqGetnchar(bitmap, nbytes, conn)) goto EOFexit; /* Scan the fields */ bitmap_index = 0; bmap = bitmap[bitmap_index]; bitcnt = 0; for (i = 0; i < nfields; i++) { if (!(bmap & 0200)) { /* if the field value is absent, make it a null string */ tup[i].value = result->null_field; tup[i].len = NULL_LEN; } else { /* get the value length (the first four bytes are for length) */ if (pqGetInt(&vlen, 4, conn)) goto EOFexit; if (!binary) vlen = vlen - 4; if (vlen < 0) vlen = 0; if (tup[i].value == NULL) { tup[i].value = (char *) pqResultAlloc(result, vlen + 1, binary); if (tup[i].value == NULL) goto outOfMemory; } tup[i].len = vlen; /* read in the value */ if (vlen > 0) if (pqGetnchar((char *) (tup[i].value), vlen, conn)) goto EOFexit; /* we have to terminate this ourselves */ tup[i].value[vlen] = '\0'; } /* advance the bitmap stuff */ bitcnt++; if (bitcnt == BITS_PER_BYTE) { bitmap_index++; bmap = bitmap[bitmap_index]; bitcnt = 0; } else bmap <<= 1; } /* Success! Store the completed tuple in the result */ if (!pqAddTuple(result, tup)) goto outOfMemory; /* and reset for a new message */ conn->curTuple = NULL; if (bitmap != std_bitmap) free(bitmap); return 0; outOfMemory: /* Replace partially constructed result with an error result */ /* * we do NOT use pqSaveErrorResult() here, because of the likelihood that * there's not enough memory to concatenate messages... */ pqClearAsyncResult(conn); printfPQExpBuffer(&conn->errorMessage, libpq_gettext("out of memory for query result\n")); /* * XXX: if PQmakeEmptyPGresult() fails, there's probably not much we can * do to recover... */ conn->result = PQmakeEmptyPGresult(conn, PGRES_FATAL_ERROR); conn->asyncStatus = PGASYNC_READY; /* Discard the failed message --- good idea? */ conn->inStart = conn->inEnd; EOFexit: if (bitmap != NULL && bitmap != std_bitmap) free(bitmap); return EOF; }
/* * parseInput subroutine to read a 'T' (row descriptions) message. * We build a PGresult structure containing the attribute data. * Returns: 0 if completed message, EOF if not enough data yet. * * Note that if we run out of data, we have to release the partially * constructed PGresult, and rebuild it again next time. Fortunately, * that shouldn't happen often, since 'T' messages usually fit in a packet. */ static int getRowDescriptions(PGconn *conn) { PGresult *result = NULL; int nfields; int i; result = PQmakeEmptyPGresult(conn, PGRES_TUPLES_OK); if (!result) goto failure; /* parseInput already read the 'T' label. */ /* the next two bytes are the number of fields */ if (pqGetInt(&(result->numAttributes), 2, conn)) goto failure; nfields = result->numAttributes; /* allocate space for the attribute descriptors */ if (nfields > 0) { result->attDescs = (PGresAttDesc *) pqResultAlloc(result, nfields * sizeof(PGresAttDesc), TRUE); if (!result->attDescs) goto failure; MemSet(result->attDescs, 0, nfields * sizeof(PGresAttDesc)); } /* get type info */ for (i = 0; i < nfields; i++) { int typid; int typlen; int atttypmod; if (pqGets(&conn->workBuffer, conn) || pqGetInt(&typid, 4, conn) || pqGetInt(&typlen, 2, conn) || pqGetInt(&atttypmod, 4, conn)) goto failure; /* * Since pqGetInt treats 2-byte integers as unsigned, we need to * coerce the result to signed form. */ typlen = (int) ((int16) typlen); result->attDescs[i].name = pqResultStrdup(result, conn->workBuffer.data); if (!result->attDescs[i].name) goto failure; result->attDescs[i].tableid = 0; result->attDescs[i].columnid = 0; result->attDescs[i].format = 0; result->attDescs[i].typid = typid; result->attDescs[i].typlen = typlen; result->attDescs[i].atttypmod = atttypmod; } /* Success! */ conn->result = result; return 0; failure: if (result) PQclear(result); return EOF; }
/* * get_next_id * * Gets the smallest possible id to assign to the next continuous view. * We keep this minimal so that we can minimize the size of bitmaps used * to tag stream buffer events with. */ static Oid get_next_id(Relation rel) { HeapScanDesc scandesc; HeapTuple tup; List *ids_list = NIL; int num_ids; Assert(MAX_CQS % 32 == 0); scandesc = heap_beginscan_catalog(rel, 0, NULL); while ((tup = heap_getnext(scandesc, ForwardScanDirection)) != NULL) { Form_pipeline_query row = (Form_pipeline_query) GETSTRUCT(tup); ids_list = lappend_oid(ids_list, row->id); } heap_endscan(scandesc); num_ids = list_length(ids_list); if (num_ids) { Oid ids[num_ids]; int counts_per_combiner[continuous_query_num_combiners]; int i = 0; Oid max; ListCell *lc; int j; int target_combiner; List *potential_ids; MemSet(counts_per_combiner, 0, sizeof(counts_per_combiner)); foreach(lc, ids_list) { ids[i] = lfirst_oid(lc); counts_per_combiner[ids[i] % continuous_query_num_combiners] += 1; i++; } qsort(ids, num_ids, sizeof(Oid), &compare_oid); if (num_ids == MAX_CQS - 1) /* -1 because 0 is an invalid id */ ereport(ERROR, (errcode(ERRCODE_TOO_MANY_CONTINUOUS_VIEWS), errmsg("maximum number of continuous views exceeded"), errhint("Please drop a existing continuous view before trying to create a new one."))); max = ids[num_ids - 1]; Assert(max >= num_ids); /* * FIXME(usmanm): We do some randomization of ID generation here to make sure that CQs that * are created and dropped in quick succession don't read an event that was not for them. */ /* * Collect any unused ids in [1, max]. */ list_free(ids_list); ids_list = NIL; for (i = 1, j = 0; j < num_ids; i++) { if (ids[j] > i) ids_list = lappend_oid(ids_list, (Oid) i); else j++; } /* * Add all IDs between max and the next multiple of 32. */ j = Min((max / 32 + 1) * 32, MAX_CQS); for (i = max + 1; i < j; i++) ids_list = lappend_oid(ids_list, (Oid) i); /* * Less than 16 options? Throw in some more. */ if (list_length(ids_list) < 16 && j < MAX_CQS) for (i = j; i < j + 32; i++) ids_list = lappend_oid(ids_list, (Oid) i); /* * Figure out the target combiner (one with least IDs allocated) and try to allocate * an ID that belongs to it. */ target_combiner = 0; for (i = 0; i < continuous_query_num_combiners; i++) if (counts_per_combiner[i] < counts_per_combiner[target_combiner]) target_combiner = i; potential_ids = NIL; foreach(lc, ids_list) { Oid id = lfirst_oid(lc); if (id % continuous_query_num_combiners == target_combiner) potential_ids = lappend_oid(potential_ids, id); }
/* * _hash_init_metabuffer() -- Initialize the metadata page of a hash index. */ void _hash_init_metabuffer(Buffer buf, double num_tuples, RegProcedure procid, uint16 ffactor, bool initpage) { HashMetaPage metap; HashPageOpaque pageopaque; Page page; double dnumbuckets; uint32 num_buckets; uint32 spare_index; uint32 i; /* * Choose the number of initial bucket pages to match the fill factor * given the estimated number of tuples. We round up the result to the * total number of buckets which has to be allocated before using its * _hashm_spare element. However always force at least 2 bucket pages. The * upper limit is determined by considerations explained in * _hash_expandtable(). */ dnumbuckets = num_tuples / ffactor; if (dnumbuckets <= 2.0) num_buckets = 2; else if (dnumbuckets >= (double) 0x40000000) num_buckets = 0x40000000; else num_buckets = _hash_get_totalbuckets(_hash_spareindex(dnumbuckets)); spare_index = _hash_spareindex(num_buckets); Assert(spare_index < HASH_MAX_SPLITPOINTS); page = BufferGetPage(buf); if (initpage) _hash_pageinit(page, BufferGetPageSize(buf)); pageopaque = (HashPageOpaque) PageGetSpecialPointer(page); pageopaque->hasho_prevblkno = InvalidBlockNumber; pageopaque->hasho_nextblkno = InvalidBlockNumber; pageopaque->hasho_bucket = -1; pageopaque->hasho_flag = LH_META_PAGE; pageopaque->hasho_page_id = HASHO_PAGE_ID; metap = HashPageGetMeta(page); metap->hashm_magic = HASH_MAGIC; metap->hashm_version = HASH_VERSION; metap->hashm_ntuples = 0; metap->hashm_nmaps = 0; metap->hashm_ffactor = ffactor; metap->hashm_bsize = HashGetMaxBitmapSize(page); /* find largest bitmap array size that will fit in page size */ for (i = _hash_log2(metap->hashm_bsize); i > 0; --i) { if ((1 << i) <= metap->hashm_bsize) break; } Assert(i > 0); metap->hashm_bmsize = 1 << i; metap->hashm_bmshift = i + BYTE_TO_BIT; Assert((1 << BMPG_SHIFT(metap)) == (BMPG_MASK(metap) + 1)); /* * Label the index with its primary hash support function's OID. This is * pretty useless for normal operation (in fact, hashm_procid is not used * anywhere), but it might be handy for forensic purposes so we keep it. */ metap->hashm_procid = procid; /* * We initialize the index with N buckets, 0 .. N-1, occupying physical * blocks 1 to N. The first freespace bitmap page is in block N+1. */ metap->hashm_maxbucket = num_buckets - 1; /* * Set highmask as next immediate ((2 ^ x) - 1), which should be * sufficient to cover num_buckets. */ metap->hashm_highmask = (1 << (_hash_log2(num_buckets + 1))) - 1; metap->hashm_lowmask = (metap->hashm_highmask >> 1); MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); /* Set up mapping for one spare page after the initial splitpoints */ metap->hashm_spares[spare_index] = 1; metap->hashm_ovflpoint = spare_index; metap->hashm_firstfree = 0; /* * Set pd_lower just past the end of the metadata. This is to log full * page image of metapage in xloginsert.c. */ ((PageHeader) page)->pd_lower = ((char *) metap + sizeof(HashMetaPageData)) - (char *) page; }
/* * pgdatabasev - produce a view of gp_transaction_log that combines * information from the local clog and the distributed log. */ Datum gp_transaction_log(PG_FUNCTION_ARGS) { typedef struct Context { TransactionId indexXid; } Context; FuncCallContext *funcctx; Context *context; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext oldcontext; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * switch to memory context appropriate for multiple function * calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* build tupdesc for result tuples */ /* this had better match gp_distributed_xacts view in system_views.sql */ tupdesc = CreateTemplateTupleDesc(4, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "segment_id", INT2OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "dbid", INT2OID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "transaction", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "status", TEXTOID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* * Collect all the locking information that we will format and send * out as a result set. */ context = (Context *) palloc(sizeof(Context)); funcctx->user_fctx = (void *) context; context->indexXid = ShmemVariableCache->nextXid; // Start with last possible + 1. funcctx->user_fctx = (void *) context; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); context = (Context *) funcctx->user_fctx; /* * Go backwards until we don't find a clog log page */ while (true) { XidStatus status; char *statusStr = NULL; Datum values[4]; bool nulls[4]; HeapTuple tuple; Datum result; if (context->indexXid < FirstNormalTransactionId) break; if (!CLOGScanForPrevStatus(&context->indexXid, &status)) break; /* * Form tuple with appropriate data. */ MemSet(values, 0, sizeof(values)); MemSet(nulls, false, sizeof(nulls)); values[0] = Int16GetDatum((int16)Gp_segment); values[1] = Int16GetDatum((int16)GpIdentity.dbid); values[2] = TransactionIdGetDatum(context->indexXid); if (status == TRANSACTION_STATUS_IN_PROGRESS) statusStr = "InProgress"; else if (status == TRANSACTION_STATUS_COMMITTED) statusStr = "Committed"; else if (status == TRANSACTION_STATUS_ABORTED) statusStr = "Aborted"; else if (status == TRANSACTION_STATUS_SUB_COMMITTED) statusStr = "SubXactCommitted"; else elog(ERROR, "Unexpected transaction status %d", status); values[3] = DirectFunctionCall1(textin, CStringGetDatum(statusStr)); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } SRF_RETURN_DONE(funcctx); }
inline static void XLogRecPtr_Zero(XLogRecPtr *xlogLoc) { MemSet(xlogLoc, 0, sizeof(XLogRecPtr)); }
/* * pg_prepared_xact * Produce a view with one row per prepared transaction. * * This function is here so we don't have to export the * GlobalTransactionData struct definition. */ Datum pg_prepared_xact(PG_FUNCTION_ARGS) { FuncCallContext *funcctx; Working_State *status; if (SRF_IS_FIRSTCALL()) { TupleDesc tupdesc; MemoryContext oldcontext; /* create a function context for cross-call persistence */ funcctx = SRF_FIRSTCALL_INIT(); /* * Switch to memory context appropriate for multiple function calls */ oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); /* build tupdesc for result tuples */ /* this had better match pg_prepared_xacts view in system_views.sql */ tupdesc = CreateTemplateTupleDesc(5, false); TupleDescInitEntry(tupdesc, (AttrNumber) 1, "transaction", XIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 2, "gid", TEXTOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 3, "prepared", TIMESTAMPTZOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 4, "ownerid", OIDOID, -1, 0); TupleDescInitEntry(tupdesc, (AttrNumber) 5, "dbid", OIDOID, -1, 0); funcctx->tuple_desc = BlessTupleDesc(tupdesc); /* * Collect all the 2PC status information that we will format and send * out as a result set. */ status = (Working_State *) palloc(sizeof(Working_State)); funcctx->user_fctx = (void *) status; status->ngxacts = GetPreparedTransactionList(&status->array); status->currIdx = 0; MemoryContextSwitchTo(oldcontext); } funcctx = SRF_PERCALL_SETUP(); status = (Working_State *) funcctx->user_fctx; while (status->array != NULL && status->currIdx < status->ngxacts) { GlobalTransaction gxact = &status->array[status->currIdx++]; Datum values[5]; bool nulls[5]; HeapTuple tuple; Datum result; if (!gxact->valid) continue; /* * Form tuple with appropriate data. */ MemSet(values, 0, sizeof(values)); MemSet(nulls, 0, sizeof(nulls)); values[0] = TransactionIdGetDatum(gxact->proc.xid); values[1] = DirectFunctionCall1(textin, CStringGetDatum(gxact->gid)); values[2] = TimestampTzGetDatum(gxact->prepared_at); values[3] = ObjectIdGetDatum(gxact->owner); values[4] = ObjectIdGetDatum(gxact->proc.databaseId); tuple = heap_form_tuple(funcctx->tuple_desc, values, nulls); result = HeapTupleGetDatum(tuple); SRF_RETURN_NEXT(funcctx, result); } SRF_RETURN_DONE(funcctx); }
/* * Initialize AppendOnlyStorageRead. * * The AppendOnlyStorageRead data structure is initialized * once for a read "session" and can be used to read * Append-Only Storage Blocks from 1 or more segment files. * * The current file to read to is opened with the * AppendOnlyStorageRead_OpenFile routine. */ void AppendOnlyStorageRead_Init( AppendOnlyStorageRead *storageRead, /* The data structure to initialize. */ MemoryContext memoryContext, /* * The memory context to use for buffers and * other memory needs. When NULL, the * current memory context is used. */ int32 maxBufferLen, /* * The maximum Append-Only Storage Block * length including all storage headers. */ char *relationName, /* * Name of the relation to use in system * logging and error messages. */ char *title, /* * A phrase that better describes the purpose of the this open. * * The caller manages the storage for this. */ AppendOnlyStorageAttributes *storageAttributes) /* * The Append-Only Storage Attributes * from relation creation. */ { int relationNameLen; uint8 *memory; int32 memoryLen; MemoryContext oldMemoryContext; Assert(storageRead != NULL); // UNDONE: Range check maxBufferLen Assert(relationName != NULL); Assert(storageAttributes != NULL); // UNDONE: Range check fields in storageAttributes MemSet(storageRead, 0, sizeof(AppendOnlyStorageRead)); storageRead->maxBufferLen = maxBufferLen; if (memoryContext == NULL) storageRead->memoryContext = CurrentMemoryContext; else storageRead->memoryContext = memoryContext; oldMemoryContext = MemoryContextSwitchTo(storageRead->memoryContext); memcpy( &storageRead->storageAttributes, storageAttributes, sizeof(AppendOnlyStorageAttributes)); relationNameLen = strlen(relationName); storageRead->relationName = (char *) palloc(relationNameLen + 1); memcpy(storageRead->relationName, relationName, relationNameLen + 1); storageRead->title = title; storageRead->minimumHeaderLen = AppendOnlyStorageFormat_RegularHeaderLenNeeded( storageRead->storageAttributes.checksum); /* * Initialize BufferedRead. */ storageRead->largeReadLen = 2 * storageRead->maxBufferLen; memoryLen = BufferedReadMemoryLen( storageRead->maxBufferLen, storageRead->largeReadLen); Assert(CurrentMemoryContext == storageRead->memoryContext); memory = (uint8*)palloc(memoryLen); BufferedReadInit(&storageRead->bufferedRead, memory, memoryLen, storageRead->maxBufferLen, storageRead->largeReadLen, relationName); elogif(Debug_appendonly_print_scan || Debug_appendonly_print_read_block, LOG, "Append-Only Storage Read initialize for table '%s' " "(compression = %s, compression level %d, maximum buffer length %d, large read length %d)", storageRead->relationName, (storageRead->storageAttributes.compress ? "true" : "false"), storageRead->storageAttributes.compressLevel, storageRead->maxBufferLen, storageRead->largeReadLen); storageRead->file = -1; MemoryContextSwitchTo(oldMemoryContext); storageRead->isActive = true; }
Datum hstore_populate_record(PG_FUNCTION_ARGS) { Oid argtype = get_fn_expr_argtype(fcinfo->flinfo, 0); HStore *hs; HEntry *entries; char *ptr; HeapTupleHeader rec; Oid tupType; int32 tupTypmod; TupleDesc tupdesc; HeapTupleData tuple; HeapTuple rettuple; RecordIOData *my_extra; int ncolumns; int i; Datum *values; bool *nulls; if (!type_is_rowtype(argtype)) ereport(ERROR, (errcode(ERRCODE_DATATYPE_MISMATCH), errmsg("first argument must be a rowtype"))); if (PG_ARGISNULL(0)) { if (PG_ARGISNULL(1)) PG_RETURN_NULL(); rec = NULL; /* * have no tuple to look at, so the only source of type info is the * argtype. The lookup_rowtype_tupdesc call below will error out if we * don't have a known composite type oid here. */ tupType = argtype; tupTypmod = -1; } else { rec = PG_GETARG_HEAPTUPLEHEADER(0); if (PG_ARGISNULL(1)) PG_RETURN_POINTER(rec); /* Extract type info from the tuple itself */ tupType = HeapTupleHeaderGetTypeId(rec); tupTypmod = HeapTupleHeaderGetTypMod(rec); } hs = PG_GETARG_HS(1); entries = ARRPTR(hs); ptr = STRPTR(hs); /* * if the input hstore is empty, we can only skip the rest if we were * passed in a non-null record, since otherwise there may be issues with * domain nulls. */ if (HS_COUNT(hs) == 0 && rec) PG_RETURN_POINTER(rec); tupdesc = lookup_rowtype_tupdesc(tupType, tupTypmod); ncolumns = tupdesc->natts; if (rec) { /* Build a temporary HeapTuple control structure */ tuple.t_len = HeapTupleHeaderGetDatumLength(rec); ItemPointerSetInvalid(&(tuple.t_self)); tuple.t_tableOid = InvalidOid; tuple.t_data = rec; } /* * We arrange to look up the needed I/O info just once per series of * calls, assuming the record type doesn't change underneath us. */ my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra; if (my_extra == NULL || my_extra->ncolumns != ncolumns) { fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt, sizeof(RecordIOData) - sizeof(ColumnIOData) + ncolumns * sizeof(ColumnIOData)); my_extra = (RecordIOData *) fcinfo->flinfo->fn_extra; my_extra->record_type = InvalidOid; my_extra->record_typmod = 0; } if (my_extra->record_type != tupType || my_extra->record_typmod != tupTypmod) { MemSet(my_extra, 0, sizeof(RecordIOData) - sizeof(ColumnIOData) + ncolumns * sizeof(ColumnIOData)); my_extra->record_type = tupType; my_extra->record_typmod = tupTypmod; my_extra->ncolumns = ncolumns; } values = (Datum *) palloc(ncolumns * sizeof(Datum)); nulls = (bool *) palloc(ncolumns * sizeof(bool)); if (rec) { /* Break down the tuple into fields */ heap_deform_tuple(&tuple, tupdesc, values, nulls); } else { for (i = 0; i < ncolumns; ++i) { values[i] = (Datum) 0; nulls[i] = true; } } for (i = 0; i < ncolumns; ++i) { ColumnIOData *column_info = &my_extra->columns[i]; Oid column_type = tupdesc->attrs[i]->atttypid; char *value; int idx; int vallen; /* Ignore dropped columns in datatype */ if (tupdesc->attrs[i]->attisdropped) { nulls[i] = true; continue; } idx = hstoreFindKey(hs, 0, NameStr(tupdesc->attrs[i]->attname), strlen(NameStr(tupdesc->attrs[i]->attname))); /* * we can't just skip here if the key wasn't found since we might have * a domain to deal with. If we were passed in a non-null record * datum, we assume that the existing values are valid (if they're * not, then it's not our fault), but if we were passed in a null, * then every field which we don't populate needs to be run through * the input function just in case it's a domain type. */ if (idx < 0 && rec) continue; /* * Prepare to convert the column value from text */ if (column_info->column_type != column_type) { getTypeInputInfo(column_type, &column_info->typiofunc, &column_info->typioparam); fmgr_info_cxt(column_info->typiofunc, &column_info->proc, fcinfo->flinfo->fn_mcxt); column_info->column_type = column_type; } if (idx < 0 || HS_VALISNULL(entries, idx)) { /* * need InputFunctionCall to happen even for nulls, so that domain * checks are done */ values[i] = InputFunctionCall(&column_info->proc, NULL, column_info->typioparam, tupdesc->attrs[i]->atttypmod); nulls[i] = true; } else { vallen = HS_VALLEN(entries, idx); value = palloc(1 + vallen); memcpy(value, HS_VAL(entries, ptr, idx), vallen); value[vallen] = 0; values[i] = InputFunctionCall(&column_info->proc, value, column_info->typioparam, tupdesc->attrs[i]->atttypmod); nulls[i] = false; } } rettuple = heap_form_tuple(tupdesc, values, nulls); ReleaseTupleDesc(tupdesc); PG_RETURN_DATUM(HeapTupleGetDatum(rettuple)); }
/* * Schedule alarm for the next active timeout, if any * * We assume the caller has obtained the current time, or a close-enough * approximation. */ static void schedule_alarm(TimestampTz now) { if (num_active_timeouts > 0) { struct itimerval timeval; long secs; int usecs; MemSet(&timeval, 0, sizeof(struct itimerval)); /* Get the time remaining till the nearest pending timeout */ TimestampDifference(now, active_timeouts[0]->fin_time, &secs, &usecs); /* * It's possible that the difference is less than a microsecond; * ensure we don't cancel, rather than set, the interrupt. */ if (secs == 0 && usecs == 0) usecs = 1; timeval.it_value.tv_sec = secs; timeval.it_value.tv_usec = usecs; /* * We must enable the signal handler before calling setitimer(); if we * did it in the other order, we'd have a race condition wherein the * interrupt could occur before we can set alarm_enabled, so that the * signal handler would fail to do anything. * * Because we didn't bother to reset the timer in disable_alarm(), * it's possible that a previously-set interrupt will fire between * enable_alarm() and setitimer(). This is safe, however. There are * two possible outcomes: * * 1. The signal handler finds nothing to do (because the nearest * timeout event is still in the future). It will re-set the timer * and return. Then we'll overwrite the timer value with a new one. * This will mean that the timer fires a little later than we * intended, but only by the amount of time it takes for the signal * handler to do nothing useful, which shouldn't be much. * * 2. The signal handler executes and removes one or more timeout * events. When it returns, either the queue is now empty or the * frontmost event is later than the one we looked at above. So we'll * overwrite the timer value with one that is too soon (plus or minus * the signal handler's execution time), causing a useless interrupt * to occur. But the handler will then re-set the timer and * everything will still work as expected. * * Since these cases are of very low probability (the window here * being quite narrow), it's not worth adding cycles to the mainline * code to prevent occasional wasted interrupts. */ enable_alarm(); /* Set the alarm timer */ if (setitimer(ITIMER_REAL, &timeval, NULL) != 0) elog(FATAL, "could not enable SIGALRM timer: %m"); } }
/* * Map a relation's (tablespace, filenode) to a relation's oid and cache the * result. * * Returns InvalidOid if no relation matching the criteria could be found. */ Oid RelidByRelfilenode(Oid reltablespace, Oid relfilenode) { RelfilenodeMapKey key; RelfilenodeMapEntry *entry; bool found; SysScanDesc scandesc; Relation relation; HeapTuple ntp; ScanKeyData skey[2]; Oid relid; if (RelfilenodeMapHash == NULL) InitializeRelfilenodeMap(); /* pg_class will show 0 when the value is actually MyDatabaseTableSpace */ if (reltablespace == MyDatabaseTableSpace) reltablespace = 0; MemSet(&key, 0, sizeof(key)); key.reltablespace = reltablespace; key.relfilenode = relfilenode; /* * Check cache and return entry if one is found. Even if no target * relation can be found later on we store the negative match and return a * InvalidOid from cache. That's not really necessary for performance * since querying invalid values isn't supposed to be a frequent thing, * but it's basically free. */ entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_FIND, &found); if (found) return entry->relid; /* ok, no previous cache entry, do it the hard way */ /* initialize empty/negative cache entry before doing the actual lookups */ relid = InvalidOid; if (reltablespace == GLOBALTABLESPACE_OID) { /* * Ok, shared table, check relmapper. */ relid = RelationMapFilenodeToOid(relfilenode, true); } else { /* * Not a shared table, could either be a plain relation or a * non-shared, nailed one, like e.g. pg_class. */ /* check for plain relations by looking in pg_class */ relation = heap_open(RelationRelationId, AccessShareLock); /* copy scankey to local copy, it will be modified during the scan */ memcpy(skey, relfilenode_skey, sizeof(skey)); /* set scan arguments */ skey[0].sk_argument = ObjectIdGetDatum(reltablespace); skey[1].sk_argument = ObjectIdGetDatum(relfilenode); scandesc = systable_beginscan(relation, ClassTblspcRelfilenodeIndexId, true, NULL, 2, skey); found = false; while (HeapTupleIsValid(ntp = systable_getnext(scandesc))) { if (found) elog(ERROR, "unexpected duplicate for tablespace %u, relfilenode %u", reltablespace, relfilenode); found = true; #ifdef USE_ASSERT_CHECKING { bool isnull; Oid check; check = fastgetattr(ntp, Anum_pg_class_reltablespace, RelationGetDescr(relation), &isnull); Assert(!isnull && check == reltablespace); check = fastgetattr(ntp, Anum_pg_class_relfilenode, RelationGetDescr(relation), &isnull); Assert(!isnull && check == relfilenode); } #endif relid = HeapTupleGetOid(ntp); } systable_endscan(scandesc); heap_close(relation, AccessShareLock); /* check for tables that are mapped but not shared */ if (!found) relid = RelationMapFilenodeToOid(relfilenode, false); } /* * Only enter entry into cache now, our opening of pg_class could have * caused cache invalidations to be executed which would have deleted a * new entry if we had entered it above. */ entry = hash_search(RelfilenodeMapHash, (void *) &key, HASH_ENTER, &found); if (found) elog(ERROR, "corrupted hashtable"); entry->relid = relid; return relid; }
void PersistentEndXactRec_Init( PersistentEndXactRecObjects *objects) { MemSet(objects, 0, sizeof(PersistentEndXactRecObjects)); }
/* * load up the categories hash table */ static HTAB * load_categories_hash(char *cats_sql, MemoryContext per_query_ctx) { HTAB *crosstab_hash; HASHCTL ctl; int ret; int proc; MemoryContext SPIcontext; /* initialize the category hash table */ MemSet(&ctl, 0, sizeof(ctl)); ctl.keysize = MAX_CATNAME_LEN; ctl.entrysize = sizeof(crosstab_HashEnt); ctl.hcxt = per_query_ctx; /* * use INIT_CATS, defined above as a guess of how many hash table entries * to create, initially */ crosstab_hash = hash_create("crosstab hash", INIT_CATS, &ctl, HASH_ELEM | HASH_CONTEXT); /* Connect to SPI manager */ if ((ret = SPI_connect()) < 0) /* internal error */ elog(ERROR, "load_categories_hash: SPI_connect returned %d", ret); /* Retrieve the category name rows */ ret = SPI_execute(cats_sql, true, 0); proc = SPI_processed; /* Check for qualifying tuples */ if ((ret == SPI_OK_SELECT) && (proc > 0)) { SPITupleTable *spi_tuptable = SPI_tuptable; TupleDesc spi_tupdesc = spi_tuptable->tupdesc; int i; /* * The provided categories SQL query must always return one column: * category - the label or identifier for each column */ if (spi_tupdesc->natts != 1) ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), errmsg("provided \"categories\" SQL must " \ "return 1 column of at least one row"))); for (i = 0; i < proc; i++) { crosstab_cat_desc *catdesc; char *catname; HeapTuple spi_tuple; /* get the next sql result tuple */ spi_tuple = spi_tuptable->vals[i]; /* get the category from the current sql result tuple */ catname = SPI_getvalue(spi_tuple, spi_tupdesc, 1); SPIcontext = MemoryContextSwitchTo(per_query_ctx); catdesc = (crosstab_cat_desc *) palloc(sizeof(crosstab_cat_desc)); catdesc->catname = catname; catdesc->attidx = i; /* Add the proc description block to the hashtable */ crosstab_HashTableInsert(crosstab_hash, catdesc); MemoryContextSwitchTo(SPIcontext); } } if (SPI_finish() != SPI_OK_FINISH) /* internal error */ elog(ERROR, "load_categories_hash: SPI_finish() failed"); return crosstab_hash; }
int32 PersistentEndXactRec_FetchObjectsFromSmgr( PersistentEndXactRecObjects *objects, EndXactRecKind endXactRecKind, int16 *objectCount) { int32 serializeLen; int objKind; void *data; int32 count; int32 len; MemSet(objects, 0, sizeof(PersistentEndXactRecObjects)); *objectCount = 0; serializeLen = 0; for (objKind = 1; objKind < MaxPersistentEndXactObjKind; objKind++) { data = NULL; count = 0; len = 0; switch (objKind) { case PersistentEndXactObjKind_FileSysAction: count = smgrGetPendingFileSysWork( endXactRecKind, (PersistentEndXactFileSysActionInfo**)&data, NULL); len = count * sizeof(PersistentEndXactFileSysActionInfo); PersistentEndXactRec_VerifyFileSysActionInfos( endXactRecKind, (PersistentEndXactFileSysActionInfo*)data, count); break; case PersistentEndXactObjKind_AppendOnlyMirrorResyncEofs: count = smgrGetAppendOnlyMirrorResyncEofs( endXactRecKind, (PersistentEndXactAppendOnlyMirrorResyncEofs**)&data); len = count * sizeof(PersistentEndXactAppendOnlyMirrorResyncEofs); break; default: elog(ERROR, "Unexpected persistent transaction object kind: %d", objKind); } if (len > 0) { (*objectCount)++; serializeLen += offsetof(PersistentEndXactRecObjHdr, p.data) + len; PersistentEndXactRec_SetObjectInfo( objects, objKind, data, count, len); } } return serializeLen; }
/* * For all items in this page, find their respective root line pointers. * If item k is part of a HOT-chain with root at item j, then we set * root_offsets[k - 1] = j. * * The passed-in root_offsets array must have MaxHeapTuplesPerPage entries. * We zero out all unused entries. * * The function must be called with at least share lock on the buffer, to * prevent concurrent prune operations. * * Note: The information collected here is valid only as long as the caller * holds a pin on the buffer. Once pin is released, a tuple might be pruned * and reused by a completely unrelated tuple. */ void heap_get_root_tuples(Page page, OffsetNumber *root_offsets) { OffsetNumber offnum, maxoff; MemSet(root_offsets, 0, MaxHeapTuplesPerPage * sizeof(OffsetNumber)); maxoff = PageGetMaxOffsetNumber(page); for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) { ItemId lp = PageGetItemId(page, offnum); HeapTupleHeader htup; OffsetNumber nextoffnum; TransactionId priorXmax; /* skip unused and dead items */ if (!ItemIdIsUsed(lp) || ItemIdIsDead(lp)) continue; if (ItemIdIsNormal(lp)) { htup = (HeapTupleHeader) PageGetItem(page, lp); /* * Check if this tuple is part of a HOT-chain rooted at some other * tuple. If so, skip it for now; we'll process it when we find * its root. */ if (HeapTupleHeaderIsHeapOnly(htup)) continue; /* * This is either a plain tuple or the root of a HOT-chain. * Remember it in the mapping. */ root_offsets[offnum - 1] = offnum; /* If it's not the start of a HOT-chain, we're done with it */ if (!HeapTupleHeaderIsHotUpdated(htup)) continue; /* Set up to scan the HOT-chain */ nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetUpdateXid(htup); } else { /* Must be a redirect item. We do not set its root_offsets entry */ Assert(ItemIdIsRedirected(lp)); /* Set up to scan the HOT-chain */ nextoffnum = ItemIdGetRedirect(lp); priorXmax = InvalidTransactionId; } /* * Now follow the HOT-chain and collect other tuples in the chain. * * Note: Even though this is a nested loop, the complexity of the * function is O(N) because a tuple in the page should be visited not * more than twice, once in the outer loop and once in HOT-chain * chases. */ for (;;) { lp = PageGetItemId(page, nextoffnum); /* Check for broken chains */ if (!ItemIdIsNormal(lp)) break; htup = (HeapTupleHeader) PageGetItem(page, lp); if (TransactionIdIsValid(priorXmax) && !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(htup))) break; /* Remember the root line pointer for this item */ root_offsets[nextoffnum - 1] = offnum; /* Advance to next chain member, if any */ if (!HeapTupleHeaderIsHotUpdated(htup)) break; nextoffnum = ItemPointerGetOffsetNumber(&htup->t_ctid); priorXmax = HeapTupleHeaderGetUpdateXid(htup); } } }
/* * Call this to update the ps status display to a fixed prefix plus an * indication of what you're currently doing passed in the argument. */ void set_ps_display(const char *activity, bool force) { #ifndef PS_USE_NONE /* update_process_title=off disables updates, unless force = true */ if (!force && !update_process_title) return; /* no ps display for stand-alone backend */ if (!IsUnderPostmaster) return; #ifdef PS_USE_CLOBBER_ARGV /* If ps_buffer is a pointer, it might still be null */ if (!ps_buffer) return; #endif /* Update ps_buffer to contain both fixed part and activity */ strlcpy(ps_buffer + ps_buffer_fixed_size, activity, ps_buffer_size - ps_buffer_fixed_size); ps_buffer_cur_len = strlen(ps_buffer); /* Transmit new setting to kernel, if necessary */ #ifdef PS_USE_SETPROCTITLE setproctitle("%s", ps_buffer); #endif #ifdef PS_USE_PSTAT { union pstun pst; pst.pst_command = ps_buffer; pstat(PSTAT_SETCMD, pst, ps_buffer_cur_len, 0, 0); } #endif /* PS_USE_PSTAT */ #ifdef PS_USE_PS_STRINGS PS_STRINGS->ps_nargvstr = 1; PS_STRINGS->ps_argvstr = ps_buffer; #endif /* PS_USE_PS_STRINGS */ #ifdef PS_USE_CLOBBER_ARGV /* pad unused memory; need only clobber remainder of old status string */ if (last_status_len > ps_buffer_cur_len) MemSet(ps_buffer + ps_buffer_cur_len, PS_PADDING, last_status_len - ps_buffer_cur_len); last_status_len = ps_buffer_cur_len; #endif /* PS_USE_CLOBBER_ARGV */ #ifdef PS_USE_WIN32 { /* * Win32 does not support showing any changed arguments. To make it at * all possible to track which backend is doing what, we create a * named object that can be viewed with for example Process Explorer. */ static HANDLE ident_handle = INVALID_HANDLE_VALUE; char name[PS_BUFFER_SIZE + 32]; if (ident_handle != INVALID_HANDLE_VALUE) CloseHandle(ident_handle); sprintf(name, "pgident(%d): %s", MyProcPid, ps_buffer); ident_handle = CreateEvent(NULL, TRUE, FALSE, name); } #endif /* PS_USE_WIN32 */ #endif /* not PS_USE_NONE */ }
/* * Indicate we intend to create a relation file as part of the current transaction. * * This function adds an entry in 'gp_persistent_relation_node' for either a new table (segment file * # 0) or a new segment file under AO table (segment file # > 0 for row/column-oriented AO) with a state * 'Create Pending'. An XLOG IntentToCreate record is generated that will guard the subsequent file-system * create in case the transaction aborts. * * Paramaters * ----------- * relFileNode = The tablespace, database, and relation OIDs for the create * segmentFileNum = As the name implies ( 0 for heap * >= 0 for RO/CO AO as applicable) * relStorageMgr = Persistent Relation storage Manager * relBufpoolKind = Buffer pool type beneath corrosponding relation * TODO bufferPollBulkLoad = ??? * TODO mirrorExistenceState = ??? * TODO relDataSynchronizationState = ??? * flushToXlog = If true, the XLOG record for this change will be flushed to disk. * TODO isLocalBuf = ??? * * Return * ------ * relationName = Name of the relation used for either debugging or to store in PendingDelete LL. * persistentTid = Resulting TID of the gp_persistent_rel_files tuple for the relation * serialNum = Resulting serial number for the relation. Distinquishes the uses of the tuple */ void PersistentRelation_AddCreatePending( RelFileNode *relFileNode, int32 segmentFileNum, PersistentFileSysRelStorageMgr relStorageMgr, PersistentFileSysRelBufpoolKind relBufpoolKind, bool bufferPoolBulkLoad, MirroredObjectExistenceState mirrorExistenceState, MirroredRelDataSynchronizationState relDataSynchronizationState, char *relationName, ItemPointer persistentTid, int64 *serialNum, bool flushToXLog, bool isLocalBuf) { WRITE_PERSISTENT_STATE_ORDERED_LOCK_DECLARE; PersistentFileSysObjName fsObjName; XLogRecPtr mirrorBufpoolResyncCkptLoc; ItemPointerData previousFreeTid; Datum values[Natts_gp_persistent_relation_node]; if(RelFileNode_IsEmpty(relFileNode)) elog(ERROR, "Invalid RelFileNode (0,0,0)"); MemSet(&previousFreeTid, 0, sizeof(ItemPointerData)); MemSet(&mirrorBufpoolResyncCkptLoc, 0, sizeof(XLogRecPtr)); if (Persistent_BeforePersistenceWork()) { if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Skipping persistent relation '%s' because we are before persistence work", relpath(*relFileNode)); MemSet(persistentTid, 0, sizeof(ItemPointerData)); *serialNum = 0; return; // The initdb process will load the persistent table once we out of bootstrap mode. } /* Verify if the needed shared mem data structures for persistent tables are setup and inited */ PersistentRelation_VerifyInitScan(); /* Setup the file system object name */ PersistentFileSysObjName_SetRelationFile( &fsObjName, relFileNode, segmentFileNum); WRITE_PERSISTENT_STATE_ORDERED_LOCK; /* Create a values array which will be used to create a 'gp_persistent_relation_node' tuple */ GpPersistentRelationNode_SetDatumValues( values, relFileNode->spcNode, relFileNode->dbNode, relFileNode->relNode, segmentFileNum, relStorageMgr, (bufferPoolBulkLoad ? PersistentFileSysState_BulkLoadCreatePending : PersistentFileSysState_CreatePending), /* createMirrorDataLossTrackingSessionNum */ 0, mirrorExistenceState, relDataSynchronizationState, /* mirrorBufpoolMarkedForScanIncrementalResync */ false, /* mirrorBufpoolResyncChangedPageCount */ 0, &mirrorBufpoolResyncCkptLoc, /* mirrorBufpoolResyncCkptBlockNum */ 0, /* mirrorAppendOnlyLossEof */ 0, /* mirrorAppendOnlyNewEof */ 0, relBufpoolKind, GetTopTransactionId(), /* persistentSerialNum */ 0, // This will be set by PersistentFileSysObj_AddTuple. &previousFreeTid); /* Add a new tuple to 'gp_persistent_relation_node' table for the new relation/segment file * we intend to create. This will also create and apply a new persistent serial number. */ PersistentFileSysObj_AddTuple( PersistentFsObjType_RelationFile, values, flushToXLog, persistentTid, serialNum); /* * This XLOG must be generated under the persistent write-lock. */ #ifdef MASTER_MIRROR_SYNC mmxlog_log_create_relfilenode( relFileNode->spcNode, relFileNode->dbNode, relFileNode->relNode, segmentFileNum); #endif #ifdef FAULT_INJECTOR FaultInjector_InjectFaultIfSet( FaultBeforePendingDeleteRelationEntry, DDLNotSpecified, "", // databaseName ""); // tableName #endif /* We'll add an entry to the PendingDelete LinkedList (LL) to remeber what we * created in this transaction (or sub-transaction). If the transaction * aborts then we can search for all such entries in this LL and get rid of (delete) * such relations or segment files on the disk. * * MPP-18228 * To make adding 'Create Pending' entry to persistent table and adding * to the PendingDelete list atomic */ PendingDelete_AddCreatePendingRelationEntry( &fsObjName, persistentTid, serialNum, relStorageMgr, relationName, isLocalBuf, bufferPoolBulkLoad); WRITE_PERSISTENT_STATE_ORDERED_UNLOCK; if (Debug_persistent_print) elog(Persistent_DebugPrintLevel(), "Persistent relation: Add '%s', relation name '%s' in state 'Create Pending', relation storage manager '%s', mirror existence state '%s', relation data resynchronization state '%s', serial number " INT64_FORMAT " at TID %s", PersistentFileSysObjName_ObjectName(&fsObjName), relationName, PersistentFileSysRelStorageMgr_Name(relStorageMgr), MirroredObjectExistenceState_Name(mirrorExistenceState), MirroredRelDataSynchronizationState_Name(relDataSynchronizationState), *serialNum, ItemPointerToString(persistentTid)); }
Datum formatter_import(PG_FUNCTION_ARGS) { HeapTuple tuple; TupleDesc tupdesc; MemoryContext m, oldcontext; format_t *myData; char *data_buf; int ncolumns = 0; int data_cur; int data_len; int i; /* Must be called via the external table format manager */ if (!CALLED_AS_FORMATTER(fcinfo)) elog(ERROR, "formatter_import: not called by format manager"); tupdesc = FORMATTER_GET_TUPDESC(fcinfo); /* Get our internal description of the formatter */ ncolumns = tupdesc->natts; myData = (format_t *) FORMATTER_GET_USER_CTX(fcinfo); if (myData == NULL) { myData = palloc(sizeof(format_t)); myData->ncols = ncolumns; myData->values = palloc(sizeof(Datum) * ncolumns); myData->nulls = palloc(sizeof(bool) * ncolumns); /* misc verification */ for (i = 0; i < ncolumns; i++) { Oid type = tupdesc->attrs[i]->atttypid; //int32 typmod = tupdesc->attrs[i]->atttypmod; /* Don't know how to format dropped columns, error for now */ if (tupdesc->attrs[i]->attisdropped) elog(ERROR, "formatter_import: dropped columns"); switch (type) { case FLOAT8OID: case VARCHAROID: case BPCHAROID: case TEXTOID: break; default: { elog(ERROR, "formatter_import error: unsupported data type"); break; } } } FORMATTER_SET_USER_CTX(fcinfo, myData); } if (myData->ncols != ncolumns) elog(ERROR, "formatter_import: unexpected change of output record type"); /* get our input data buf and number of valid bytes in it */ data_buf = FORMATTER_GET_DATABUF(fcinfo); data_len = FORMATTER_GET_DATALEN(fcinfo); data_cur = FORMATTER_GET_DATACURSOR(fcinfo); /* start clean */ MemSet(myData->values, 0, ncolumns * sizeof(Datum)); MemSet(myData->nulls, true, ncolumns * sizeof(bool)); /* ======================================================================= * MAIN FORMATTING CODE * * Currently this code assumes: * - Homogoneos hardware => No need to convert data to network byte order * - Support for TEXT/VARCHAR/BPCHAR/FLOAT8 only * - Length Prefixed strings * - No end of record tags, checksums, or optimizations for alignment. * - NULL values are cast to some sensible default value (NaN, "") * * ======================================================================= */ m = FORMATTER_GET_PER_ROW_MEM_CTX(fcinfo); oldcontext = MemoryContextSwitchTo(m); for (i = 0; i < ncolumns; i++) { Oid type = tupdesc->attrs[i]->atttypid; //int typmod = tupdesc->attrs[i]->atttypmod; int remaining = 0; int attr_len = 0; remaining = data_len - data_cur; switch (type) { case FLOAT8OID: { float8 value; attr_len = sizeof(value); if (remaining < attr_len) { MemoryContextSwitchTo(oldcontext); FORMATTER_RETURN_NOTIFICATION(fcinfo, FMT_NEED_MORE_DATA); } memcpy(&value, data_buf + data_cur, attr_len); if(value != NULL_FLOAT8_VALUE) { myData->nulls[i] = false; myData->values[i] = Float8GetDatum(value); } /* TODO: check for nan? */ break; } case TEXTOID: case VARCHAROID: case BPCHAROID: { text* value; int32 len; bool nextlen = remaining >= sizeof(len); if (nextlen) memcpy(&len, data_buf + data_cur, sizeof(len)); /* if len or data bytes don't exist in this buffer, return */ if (!nextlen || (nextlen && (remaining - sizeof(len) < len))) { MemoryContextSwitchTo(oldcontext); FORMATTER_RETURN_NOTIFICATION(fcinfo, FMT_NEED_MORE_DATA); } if (len > 0) { value = (text *) palloc(len + VARHDRSZ); SET_VARSIZE(value, len + VARHDRSZ); memcpy(VARDATA(value), data_buf + data_cur + sizeof(len), len); myData->nulls[i] = false; myData->values[i] = PointerGetDatum(value); } attr_len = len + sizeof(len); break; } default: elog(ERROR, "formatter_import: unsupported datatype"); break; } /* add byte length of last attribute to the temporary cursor */ data_cur += attr_len; } /* ======================================================================= */ MemoryContextSwitchTo(oldcontext); FORMATTER_SET_DATACURSOR(fcinfo, data_cur); tuple = heap_form_tuple(tupdesc, myData->values, myData->nulls); /* hack... pass tuple here. don't free prev tuple - the executor does it */ ((FormatterData*) fcinfo->context)->fmt_tuple = tuple; FORMATTER_RETURN_TUPLE(tuple); }