/* ---------- * heap_tuple_untoast_attr - * * Public entry point to get back a toasted value from compression * or external storage. * ---------- */ varattrib * heap_tuple_untoast_attr(varattrib *attr) { varattrib *result; if (VARATT_IS_EXTERNAL(attr)) { if (VARATT_IS_COMPRESSED(attr)) { /* ---------- * This is an external stored compressed value * Fetch it from the toast heap and decompress. * ---------- */ varattrib *tmp; tmp = toast_fetch_datum(attr); result = (varattrib *) palloc(attr->va_content.va_external.va_rawsize + VARHDRSZ); VARATT_SIZEP(result) = attr->va_content.va_external.va_rawsize + VARHDRSZ; pglz_decompress((PGLZ_Header *) tmp, VARATT_DATA(result)); pfree(tmp); } else { /* * This is an external stored plain value */ result = toast_fetch_datum(attr); } } else if (VARATT_IS_COMPRESSED(attr)) { /* * This is a compressed value inside of the main tuple */ result = (varattrib *) palloc(attr->va_content.va_compressed.va_rawsize + VARHDRSZ); VARATT_SIZEP(result) = attr->va_content.va_compressed.va_rawsize + VARHDRSZ; pglz_decompress((PGLZ_Header *) attr, VARATT_DATA(result)); } else /* * This is a plain value inside of the main tuple - why am I * called? */ return attr; return result; }
/* ---------- * heap_tuple_untoast_attr_slice - * * Public entry point to get back part of a toasted value * from compression or external storage. * ---------- */ varattrib * heap_tuple_untoast_attr_slice(varattrib *attr, int32 sliceoffset, int32 slicelength) { varattrib *preslice; varattrib *result; int32 attrsize; if (VARATT_IS_COMPRESSED(attr)) { varattrib *tmp; if (VARATT_IS_EXTERNAL(attr)) tmp = toast_fetch_datum(attr); else { tmp = attr; /* compressed in main tuple */ } preslice = (varattrib *) palloc(attr->va_content.va_external.va_rawsize + VARHDRSZ); VARATT_SIZEP(preslice) = attr->va_content.va_external.va_rawsize + VARHDRSZ; pglz_decompress((PGLZ_Header *) tmp, VARATT_DATA(preslice)); if (tmp != attr) pfree(tmp); } else { /* Plain value */ if (VARATT_IS_EXTERNAL(attr)) { /* fast path */ return (toast_fetch_datum_slice(attr, sliceoffset, slicelength)); } else preslice = attr; } /* slicing of datum for compressed cases and plain value */ attrsize = VARSIZE(preslice) - VARHDRSZ; if (sliceoffset >= attrsize) { sliceoffset = 0; slicelength = 0; } if (((sliceoffset + slicelength) > attrsize) || slicelength < 0) slicelength = attrsize - sliceoffset; result = (varattrib *) palloc(slicelength + VARHDRSZ); VARATT_SIZEP(result) = slicelength + VARHDRSZ; memcpy(VARDATA(result), VARDATA(preslice) + sliceoffset, slicelength); if (preslice != attr) pfree(preslice); return result; }
/* ---------- * toast_compress_datum - * * Create a compressed version of a varlena datum * * If we fail (ie, compressed result is actually bigger than original) * then return NULL. We must not use compressed data if it'd expand * the tuple! * ---------- */ Datum toast_compress_datum(Datum value) { varattrib *tmp; tmp = (varattrib *) palloc(sizeof(PGLZ_Header) + VARATT_SIZE(value)); pglz_compress(VARATT_DATA(value), VARATT_SIZE(value) - VARHDRSZ, (PGLZ_Header *) tmp, PGLZ_strategy_default); if (VARATT_SIZE(tmp) < VARATT_SIZE(value)) { /* successful compression */ VARATT_SIZEP(tmp) |= VARATT_FLAG_COMPRESSED; return PointerGetDatum(tmp); } else { /* incompressible data */ pfree(tmp); return PointerGetDatum(NULL); } }
/* ---------- * toast_save_datum - * * Save one single datum into the secondary relation and return * a varattrib reference for it. * ---------- */ static Datum toast_save_datum(Relation rel, Datum value) { Relation toastrel; Relation toastidx; HeapTuple toasttup; InsertIndexResult idxres; TupleDesc toasttupDesc; Datum t_values[3]; char t_nulls[3]; varattrib *result; struct { struct varlena hdr; char data[TOAST_MAX_CHUNK_SIZE]; } chunk_data; int32 chunk_size; int32 chunk_seq = 0; char *data_p; int32 data_todo; /* * Create the varattrib reference */ result = (varattrib *) palloc(sizeof(varattrib)); result->va_header = sizeof(varattrib) | VARATT_FLAG_EXTERNAL; if (VARATT_IS_COMPRESSED(value)) { result->va_header |= VARATT_FLAG_COMPRESSED; result->va_content.va_external.va_rawsize = ((varattrib *) value)->va_content.va_compressed.va_rawsize; } else result->va_content.va_external.va_rawsize = VARATT_SIZE(value); result->va_content.va_external.va_extsize = VARATT_SIZE(value) - VARHDRSZ; result->va_content.va_external.va_valueid = newoid(); result->va_content.va_external.va_toastrelid = rel->rd_rel->reltoastrelid; /* * Initialize constant parts of the tuple data */ t_values[0] = ObjectIdGetDatum(result->va_content.va_external.va_valueid); t_values[2] = PointerGetDatum(&chunk_data); t_nulls[0] = ' '; t_nulls[1] = ' '; t_nulls[2] = ' '; /* * Get the data to process */ data_p = VARATT_DATA(value); data_todo = VARATT_SIZE(value) - VARHDRSZ; /* * Open the toast relation */ toastrel = heap_open(rel->rd_rel->reltoastrelid, RowExclusiveLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid); /* * Split up the item into chunks */ while (data_todo > 0) { /* * Calculate the size of this chunk */ chunk_size = Min(TOAST_MAX_CHUNK_SIZE, data_todo); /* * Build a tuple and store it */ t_values[1] = Int32GetDatum(chunk_seq++); VARATT_SIZEP(&chunk_data) = chunk_size + VARHDRSZ; memcpy(VARATT_DATA(&chunk_data), data_p, chunk_size); toasttup = heap_formtuple(toasttupDesc, t_values, t_nulls); if (!HeapTupleIsValid(toasttup)) elog(ERROR, "failed to build TOAST tuple"); simple_heap_insert(toastrel, toasttup); /* * Create the index entry. We cheat a little here by not using * FormIndexDatum: this relies on the knowledge that the index * columns are the same as the initial columns of the table. * * Note also that there had better not be any user-created index on * the TOAST table, since we don't bother to update anything else. */ idxres = index_insert(toastidx, t_values, t_nulls, &(toasttup->t_self), toastrel, toastidx->rd_index->indisunique); if (idxres == NULL) elog(ERROR, "failed to insert index entry for TOAST tuple"); /* * Free memory */ pfree(idxres); heap_freetuple(toasttup); /* * Move on to next chunk */ data_todo -= chunk_size; data_p += chunk_size; } /* * Done - close toast relation and return the reference */ index_close(toastidx); heap_close(toastrel, RowExclusiveLock); return PointerGetDatum(result); }
/* ---------- * toast_fetch_datum_slice - * * Reconstruct a segment of a varattrib from the chunks saved * in the toast relation * ---------- */ static varattrib * toast_fetch_datum_slice(varattrib *attr, int32 sliceoffset, int32 length) { Relation toastrel; Relation toastidx; ScanKeyData toastkey[3]; int nscankeys; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 attrsize; int32 residx; int32 nextidx; int numchunks; int startchunk; int endchunk; int32 startoffset; int32 endoffset; int totalchunks; Pointer chunk; bool isnull; int32 chunksize; int32 chcpystrt; int32 chcpyend; attrsize = attr->va_content.va_external.va_extsize; totalchunks = ((attrsize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; if (sliceoffset >= attrsize) { sliceoffset = 0; length = 0; } if (((sliceoffset + length) > attrsize) || length < 0) length = attrsize - sliceoffset; result = (varattrib *) palloc(length + VARHDRSZ); VARATT_SIZEP(result) = length + VARHDRSZ; if (VARATT_IS_COMPRESSED(attr)) VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED; if (length == 0) return (result); /* Can save a lot of work at this point! */ startchunk = sliceoffset / TOAST_MAX_CHUNK_SIZE; endchunk = (sliceoffset + length - 1) / TOAST_MAX_CHUNK_SIZE; numchunks = (endchunk - startchunk) + 1; startoffset = sliceoffset % TOAST_MAX_CHUNK_SIZE; endoffset = (sliceoffset + length - 1) % TOAST_MAX_CHUNK_SIZE; /* * Open the toast relation and it's index */ toastrel = heap_open(attr->va_content.va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid); /* * Setup a scan key to fetch from the index. This is either two keys * or three depending on the number of chunks. */ ScanKeyEntryInitialize(&toastkey[0], (bits16) 0, (AttrNumber) 1, (RegProcedure) F_OIDEQ, ObjectIdGetDatum(attr->va_content.va_external.va_valueid)); /* * Now dependent on number of chunks: */ if (numchunks == 1) { ScanKeyEntryInitialize(&toastkey[1], (bits16) 0, (AttrNumber) 2, (RegProcedure) F_INT4EQ, Int32GetDatum(startchunk)); nscankeys = 2; } else { ScanKeyEntryInitialize(&toastkey[1], (bits16) 0, (AttrNumber) 2, (RegProcedure) F_INT4GE, Int32GetDatum(startchunk)); ScanKeyEntryInitialize(&toastkey[2], (bits16) 0, (AttrNumber) 2, (RegProcedure) F_INT4LE, Int32GetDatum(endchunk)); nscankeys = 3; } /* * Read the chunks by index * * The index is on (valueid, chunkidx) so they will come in order */ nextidx = startchunk; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, nscankeys, toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); chunksize = VARATT_SIZE(chunk) - VARHDRSZ; /* * Some checks on the data we've found */ if ((residx != nextidx) || (residx > endchunk) || (residx < startchunk)) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, attr->va_content.va_external.va_valueid); if (residx < totalchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx, attr->va_content.va_external.va_valueid); } else { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != attrsize) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx, attr->va_content.va_external.va_valueid); } /* * Copy the data into proper place in our result */ chcpystrt = 0; chcpyend = chunksize - 1; if (residx == startchunk) chcpystrt = startoffset; if (residx == endchunk) chcpyend = endoffset; memcpy(((char *) VARATT_DATA(result)) + (residx * TOAST_MAX_CHUNK_SIZE - sliceoffset) + chcpystrt, VARATT_DATA(chunk) + chcpystrt, (chcpyend - chcpystrt) + 1); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != (endchunk + 1)) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, attr->va_content.va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx); heap_close(toastrel, AccessShareLock); return result; }
/* ---------- * toast_fetch_datum - * * Reconstruct an in memory varattrib from the chunks saved * in the toast relation * ---------- */ static varattrib * toast_fetch_datum(varattrib *attr) { Relation toastrel; Relation toastidx; ScanKeyData toastkey; IndexScanDesc toastscan; HeapTuple ttup; TupleDesc toasttupDesc; varattrib *result; int32 ressize; int32 residx, nextidx; int32 numchunks; Pointer chunk; bool isnull; int32 chunksize; ressize = attr->va_content.va_external.va_extsize; numchunks = ((ressize - 1) / TOAST_MAX_CHUNK_SIZE) + 1; result = (varattrib *) palloc(ressize + VARHDRSZ); VARATT_SIZEP(result) = ressize + VARHDRSZ; if (VARATT_IS_COMPRESSED(attr)) VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED; /* * Open the toast relation and its index */ toastrel = heap_open(attr->va_content.va_external.va_toastrelid, AccessShareLock); toasttupDesc = toastrel->rd_att; toastidx = index_open(toastrel->rd_rel->reltoastidxid); /* * Setup a scan key to fetch from the index by va_valueid */ ScanKeyEntryInitialize(&toastkey, (bits16) 0, (AttrNumber) 1, (RegProcedure) F_OIDEQ, ObjectIdGetDatum(attr->va_content.va_external.va_valueid)); /* * Read the chunks by index * * Note that because the index is actually on (valueid, chunkidx) we will * see the chunks in chunkidx order, even though we didn't explicitly * ask for it. */ nextidx = 0; toastscan = index_beginscan(toastrel, toastidx, SnapshotToast, 1, &toastkey); while ((ttup = index_getnext(toastscan, ForwardScanDirection)) != NULL) { /* * Have a chunk, extract the sequence number and the data */ residx = DatumGetInt32(heap_getattr(ttup, 2, toasttupDesc, &isnull)); Assert(!isnull); chunk = DatumGetPointer(heap_getattr(ttup, 3, toasttupDesc, &isnull)); Assert(!isnull); chunksize = VARATT_SIZE(chunk) - VARHDRSZ; /* * Some checks on the data we've found */ if (residx != nextidx) elog(ERROR, "unexpected chunk number %d (expected %d) for toast value %u", residx, nextidx, attr->va_content.va_external.va_valueid); if (residx < numchunks - 1) { if (chunksize != TOAST_MAX_CHUNK_SIZE) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx, attr->va_content.va_external.va_valueid); } else if (residx < numchunks) { if ((residx * TOAST_MAX_CHUNK_SIZE + chunksize) != ressize) elog(ERROR, "unexpected chunk size %d in chunk %d for toast value %u", chunksize, residx, attr->va_content.va_external.va_valueid); } else elog(ERROR, "unexpected chunk number %d for toast value %u", residx, attr->va_content.va_external.va_valueid); /* * Copy the data into proper place in our result */ memcpy(((char *) VARATT_DATA(result)) + residx * TOAST_MAX_CHUNK_SIZE, VARATT_DATA(chunk), chunksize); nextidx++; } /* * Final checks that we successfully fetched the datum */ if (nextidx != numchunks) elog(ERROR, "missing chunk number %d for toast value %u", nextidx, attr->va_content.va_external.va_valueid); /* * End scan and close relations */ index_endscan(toastscan); index_close(toastidx); heap_close(toastrel, AccessShareLock); return result; }
int inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes) { int nwritten = 0; int n; int off; int len; int32 pageno = (int32) (obj_desc->offset / LOBLKSIZE); ScanKeyData skey[2]; IndexScanDesc sd; HeapTuple oldtuple; Form_pg_largeobject olddata; bool neednextpage; bytea *datafield; bool pfreeit; struct { bytea hdr; char data[LOBLKSIZE]; } workbuf; char *workb = VARATT_DATA(&workbuf.hdr); HeapTuple newtup; Datum values[Natts_pg_largeobject]; char nulls[Natts_pg_largeobject]; char replace[Natts_pg_largeobject]; CatalogIndexState indstate; Assert(PointerIsValid(obj_desc)); Assert(buf != NULL); if (nbytes <= 0) return 0; open_lo_relation(); indstate = CatalogOpenIndexes(lo_heap_r); ScanKeyInit(&skey[0], Anum_pg_largeobject_loid, BTEqualStrategyNumber, F_OIDEQ, ObjectIdGetDatum(obj_desc->id)); ScanKeyInit(&skey[1], Anum_pg_largeobject_pageno, BTGreaterEqualStrategyNumber, F_INT4GE, Int32GetDatum(pageno)); sd = index_beginscan(lo_heap_r, lo_index_r, SnapshotNow, 2, skey); oldtuple = NULL; olddata = NULL; neednextpage = true; while (nwritten < nbytes) { /* * If possible, get next pre-existing page of the LO. We assume * the indexscan will deliver these in order --- but there may be * holes. */ if (neednextpage) { if ((oldtuple = index_getnext(sd, ForwardScanDirection)) != NULL) { olddata = (Form_pg_largeobject) GETSTRUCT(oldtuple); Assert(olddata->pageno >= pageno); } neednextpage = false; } /* * If we have a pre-existing page, see if it is the page we want * to write, or a later one. */ if (olddata != NULL && olddata->pageno == pageno) { /* * Update an existing page with fresh data. * * First, load old data into workbuf */ datafield = &(olddata->data); pfreeit = false; if (VARATT_IS_EXTENDED(datafield)) { datafield = (bytea *) heap_tuple_untoast_attr((varattrib *) datafield); pfreeit = true; } len = getbytealen(datafield); Assert(len <= LOBLKSIZE); memcpy(workb, VARDATA(datafield), len); if (pfreeit) pfree(datafield); /* * Fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > len) MemSet(workb + len, 0, off - len); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; off += n; /* compute valid length of new page */ len = (len >= off) ? len : off; VARATT_SIZEP(&workbuf.hdr) = len + VARHDRSZ; /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, ' ', sizeof(nulls)); memset(replace, ' ', sizeof(replace)); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); replace[Anum_pg_largeobject_data - 1] = 'r'; newtup = heap_modifytuple(oldtuple, lo_heap_r, values, nulls, replace); simple_heap_update(lo_heap_r, &newtup->t_self, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); /* * We're done with this old page. */ oldtuple = NULL; olddata = NULL; neednextpage = true; } else { /* * Write a brand new page. * * First, fill any hole */ off = (int) (obj_desc->offset % LOBLKSIZE); if (off > 0) MemSet(workb, 0, off); /* * Insert appropriate portion of new data */ n = LOBLKSIZE - off; n = (n <= (nbytes - nwritten)) ? n : (nbytes - nwritten); memcpy(workb + off, buf + nwritten, n); nwritten += n; obj_desc->offset += n; /* compute valid length of new page */ len = off + n; VARATT_SIZEP(&workbuf.hdr) = len + VARHDRSZ; /* * Form and insert updated tuple */ memset(values, 0, sizeof(values)); memset(nulls, ' ', sizeof(nulls)); values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id); values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno); values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf); newtup = heap_formtuple(lo_heap_r->rd_att, values, nulls); simple_heap_insert(lo_heap_r, newtup); CatalogIndexInsert(indstate, newtup); heap_freetuple(newtup); } pageno++; } index_endscan(sd); CatalogCloseIndexes(indstate); /* * Advance command counter so that my tuple updates will be seen by * later large-object operations in this transaction. */ CommandCounterIncrement(); return nwritten; }