/* ----------
 * toast_compress_datum -
 *
 *	Create a compressed version of a varlena datum.
 *
 *	Returns a palloc'd compressed datum on success.  Returns a NULL-pointer
 *	Datum when the input size is outside the range allowed by the default
 *	strategy, when pglz_compress() reports failure, or when the compressed
 *	result does not save enough space.  We must not use compressed data if
 *	it'd expand the tuple!
 *
 *	We use VAR{SIZE,DATA}_ANY so we can handle short varlenas here without
 *	copying them.  But we can't handle external or compressed datums.
 * ----------
 */
Datum
toast_compress_datum(Datum value)
{
	varattrib  *tmp;
	int32		valsize = VARSIZE_ANY_EXHDR_D(value);

	Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value)));
	Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value)));

	/*
	 * No point in wasting a palloc cycle if value size is out of the allowed
	 * range for compression
	 */
	if (valsize < PGLZ_strategy_default->min_input_size ||
		valsize > PGLZ_strategy_default->max_input_size)
		return PointerGetDatum(NULL);

	tmp = (varattrib *) palloc(PGLZ_MAX_OUTPUT(valsize));

	/*
	 * Recheck the actual size even if pglz_compress() reports success: it may
	 * be satisfied with saving as little as one byte, which can turn into a
	 * net loss once header and alignment padding are considered.  Worst case,
	 * the compressed format needs three padding bytes (its header is already
	 * included in VARSIZE(tmp)), whereas the uncompressed value could be
	 * stored with a one-byte header and no padding.  So insist on a savings
	 * of more than 2 bytes relative to the raw payload size.
	 */
	if (pglz_compress(VARDATA_ANY_D(value), valsize,
					  (PGLZ_Header *) tmp, PGLZ_strategy_default) &&
		VARSIZE(tmp) < valsize - 2)
	{
		/* successful compression; mark the result as compressed */
		VARATT_SET_COMPRESSED(tmp);
		return PointerGetDatum(tmp);
	}
	else
	{
		/* incompressible data, or not enough gain to be worthwhile */
		pfree(tmp);
		return PointerGetDatum(NULL);
	}
}
/* ----------
 * toast_compress_datum -
 *
 *	Produce a compressed copy of a varlena datum.
 *
 *	The result is a NULL-pointer Datum when the payload size lies outside
 *	the range accepted by the default strategy, when pglz_compress()
 *	reports failure, or when the compressed size is not more than 2 bytes
 *	smaller than the raw payload.  Compressed data that would expand the
 *	tuple must never be used.
 *
 *	The VAR{SIZE,DATA}_ANY macros let short varlenas be handled in place,
 *	without copying.  External and already-compressed datums are not
 *	supported.
 * ----------
 */
Datum
toast_compress_datum(Datum value)
{
	struct varlena *compressed;
	int32		rawsize = VARSIZE_ANY_EXHDR(DatumGetPointer(value));

	Assert(!VARATT_IS_EXTERNAL(DatumGetPointer(value)));
	Assert(!VARATT_IS_COMPRESSED(DatumGetPointer(value)));

	/*
	 * Skip the allocation entirely when the payload is too small or too
	 * large for the default strategy.
	 */
	if (rawsize < PGLZ_strategy_default->min_input_size)
		return PointerGetDatum(NULL);
	if (rawsize > PGLZ_strategy_default->max_input_size)
		return PointerGetDatum(NULL);

	compressed = (struct varlena *) palloc(PGLZ_MAX_OUTPUT(rawsize));

	/*
	 * A success report from pglz_compress() is not enough: it may have saved
	 * only a single byte, which becomes a net loss after header and
	 * alignment padding.  In the worst case the compressed form needs three
	 * padding bytes (its header is already counted in VARSIZE), while the
	 * uncompressed form could get by with a one-byte header and no padding.
	 * Hence the result is rejected unless it saves more than 2 bytes.
	 */
	if (!pglz_compress(VARDATA_ANY(DatumGetPointer(value)), rawsize,
					   (PGLZ_Header *) compressed, PGLZ_strategy_default) ||
		VARSIZE(compressed) >= rawsize - 2)
	{
		/* incompressible, or the gain is too small to matter */
		pfree(compressed);
		return PointerGetDatum(NULL);
	}

	/* compression paid off */
	return PointerGetDatum(compressed);
}
/* * compress_data * * Compress the bytea buffer and return the result as bytea. */ Datum compress_data(PG_FUNCTION_ARGS) { bytea *raw_data = PG_GETARG_BYTEA_P(0); bytea *res; int32 compressed_len; char *compressed_data; PGLZ_Strategy strategy; memcpy(&strategy, (PGLZ_Strategy *) PGLZ_strategy_always, sizeof(PGLZ_Strategy)); /* Get custom values if specified by user */ if (PG_NARGS() == 7) { strategy.min_input_size = PG_GETARG_INT32(1); strategy.max_input_size = PG_GETARG_INT32(2); strategy.min_comp_rate = PG_GETARG_INT32(3); strategy.first_success_by = PG_GETARG_INT32(4); strategy.match_size_good = PG_GETARG_INT32(5); strategy.match_size_drop = PG_GETARG_INT32(6); } /* Compress data in build */ compressed_data = palloc(PGLZ_MAX_OUTPUT(VARSIZE(raw_data) - VARHDRSZ)); compressed_len = pglz_compress(VARDATA(raw_data), VARSIZE(raw_data) - VARHDRSZ, compressed_data, &strategy); /* if compression failed return the original data */ if (compressed_len < 0) PG_RETURN_BYTEA_P(raw_data); /* Build result */ res = (bytea *) palloc(VARHDRSZ + compressed_len); SET_VARSIZE(res, compressed_len + VARHDRSZ); memcpy(VARDATA(res), compressed_data, compressed_len); pfree(compressed_data); PG_RETURN_BYTEA_P(res); }
/* ----------
 * toast_compress_datum -
 *
 *	Build a compressed copy of a varlena datum.
 *
 *	A NULL-pointer Datum is returned when pglz_compress() reports failure
 *	or when the compressed datum is not strictly smaller than the original.
 *	Compressed data that would expand the tuple must never be used.
 * ----------
 */
Datum
toast_compress_datum(Datum value)
{
	int32		rawsize = VARATT_SIZE(value) - VARHDRSZ;
	varattrib  *result = (varattrib *) palloc(PGLZ_MAX_OUTPUT(rawsize));

	if (!pglz_compress(VARATT_DATA(value), rawsize,
					   (PGLZ_Header *) result, PGLZ_strategy_default) ||
		VARATT_SIZE(result) >= VARATT_SIZE(value))
	{
		/* incompressible data: discard the scratch buffer */
		pfree(result);
		return PointerGetDatum(NULL);
	}

	/* compression succeeded: flag the size word as compressed */
	VARATT_SIZEP(result) |= VARATT_FLAG_COMPRESSED;
	return PointerGetDatum(result);
}
/*
 * SerializeBlockData builds the stored buffers of the block at the given block
 * index for every column: the "exists" array is serialized, and the value
 * buffer is compressed with the write state's compression type when possible.
 */
static void
SerializeBlockData(TableWriteState *writeState, uint32 blockIndex, uint32 rowCount)
{
	StripeBuffers *stripeBuffers = writeState->stripeBuffers;
	ColumnBlockData **blockDataArray = writeState->blockDataArray;
	StringInfo	compressionBuffer = writeState->compressionBuffer;
	CompressionType requestedCompressionType = writeState->compressionType;
	const uint32 columnCount = stripeBuffers->columnCount;
	uint32		column;

	/* first pass: serialize exist values; data values are already serialized */
	for (column = 0; column < columnCount; column++)
	{
		ColumnBlockBuffers *target =
			stripeBuffers->columnBuffersArray[column]->blockBuffersArray[blockIndex];

		target->existsBuffer =
			SerializeBoolArray(blockDataArray[column]->existsArray, rowCount);
	}

	/*
	 * second pass: try to compress each value buffer.  A buffer that cannot
	 * be compressed is stored as is; the compression type actually used is
	 * recorded next to the buffer.
	 */
	for (column = 0; column < columnCount; column++)
	{
		ColumnBlockBuffers *target =
			stripeBuffers->columnBuffersArray[column]->blockBuffersArray[blockIndex];
		ColumnBlockData *source = blockDataArray[column];
		StringInfo	outputBuffer = source->valueBuffer;
		CompressionType storedCompressionType = COMPRESSION_NONE;

		/* the only other supported compression type is pg_lz for now */
		Assert(requestedCompressionType == COMPRESSION_NONE ||
			   requestedCompressionType == COMPRESSION_PG_LZ);

		if (requestedCompressionType == COMPRESSION_PG_LZ)
		{
			uint64		worstCaseLength = PGLZ_MAX_OUTPUT(outputBuffer->len);

			/* reuse the shared scratch buffer, grown to the worst case */
			resetStringInfo(compressionBuffer);
			enlargeStringInfo(compressionBuffer, worstCaseLength);

			if (cstore_pglz_compress((const char *) outputBuffer->data,
									 outputBuffer->len,
									 (PGLZ_Header *) compressionBuffer->data,
									 PGLZ_strategy_always))
			{
				/* switch to the compressed bytes and record their length */
				outputBuffer = compressionBuffer;
				outputBuffer->len = VARSIZE(compressionBuffer->data);
				storedCompressionType = COMPRESSION_PG_LZ;
			}
		}

		/* store a copy of the (possibly compressed) value buffer */
		target->valueCompressionType = storedCompressionType;
		target->valueBuffer = CopyStringInfo(outputBuffer);

		/* the raw value buffer must be empty for the next block's data */
		resetStringInfo(source->valueBuffer);
	}
}
/* * FlushStripe compresses the data in the current stripe, flushes the compressed * data into the file, and returns the stripe metadata. To do this, the function * first creates the data buffers, and then updates position and length statistics * in stripe's skip list. Then, the function creates the skip list and footer * buffers. Finally, the function flushes the skip list, data, and footer buffers * to the file. */ static StripeMetadata FlushStripe(TableWriteState *writeState) { StripeMetadata stripeMetadata = {0, 0, 0, 0}; uint64 skipListLength = 0; uint64 dataLength = 0; StringInfo **existsBufferArray = NULL; StringInfo **valueBufferArray = NULL; CompressionType **valueCompressionTypeArray = NULL; StringInfo *skipListBufferArray = NULL; StripeFooter *stripeFooter = NULL; StringInfo stripeFooterBuffer = NULL; uint32 columnIndex = 0; uint32 blockIndex = 0; FILE *tableFile = writeState->tableFile; StripeData *stripeData = writeState->stripeData; StripeSkipList *stripeSkipList = writeState->stripeSkipList; CompressionType compressionType = writeState->compressionType; TupleDesc tupleDescriptor = writeState->tupleDescriptor; uint32 columnCount = tupleDescriptor->natts; uint32 blockCount = stripeSkipList->blockCount; /* create "exists" and "value" buffers */ existsBufferArray = CreateExistsBufferArray(stripeData->columnDataArray, stripeSkipList); valueBufferArray = CreateValueBufferArray(stripeData->columnDataArray, stripeSkipList, tupleDescriptor); valueCompressionTypeArray = palloc0(columnCount * sizeof(CompressionType *)); for (columnIndex = 0; columnIndex < columnCount; columnIndex++) { CompressionType *blockCompressionTypeArray = palloc0(blockCount * sizeof(CompressionType)); valueCompressionTypeArray[columnIndex] = blockCompressionTypeArray; for (blockIndex = 0; blockIndex < blockCount; blockIndex++) { StringInfo valueBuffer = NULL; uint64 maximumLength = 0; PGLZ_Header *compressedData = NULL; bool compressable = false; if (compressionType == COMPRESSION_NONE) 
{ blockCompressionTypeArray[blockIndex] = COMPRESSION_NONE; continue; } /* the only other supported compression type is pg_lz for now */ Assert(compressionType == COMPRESSION_PG_LZ); valueBuffer = valueBufferArray[columnIndex][blockIndex]; maximumLength = PGLZ_MAX_OUTPUT(valueBuffer->len); compressedData = palloc0(maximumLength); compressable = pglz_compress((const char *) valueBuffer->data, valueBuffer->len, compressedData, PGLZ_strategy_always); if (compressable) { pfree(valueBuffer->data); valueBuffer->data = (char *) compressedData; valueBuffer->len = VARSIZE(compressedData); valueBuffer->maxlen = maximumLength; blockCompressionTypeArray[blockIndex] = COMPRESSION_PG_LZ; } else { pfree(compressedData); blockCompressionTypeArray[blockIndex] = COMPRESSION_NONE; } } } /* update buffer sizes and positions in stripe skip list */ for (columnIndex = 0; columnIndex < columnCount; columnIndex++) { ColumnBlockSkipNode **columnSkipNodeArray = stripeSkipList->blockSkipNodeArray; ColumnBlockSkipNode *blockSkipNodeArray = columnSkipNodeArray[columnIndex]; uint32 blockCount = stripeSkipList->blockCount; uint32 blockIndex = 0; uint64 currentExistsBlockOffset = 0; uint64 currentValueBlockOffset = 0; for (blockIndex = 0; blockIndex < blockCount; blockIndex++) { uint64 existsBufferSize = existsBufferArray[columnIndex][blockIndex]->len; uint64 valueBufferSize = valueBufferArray[columnIndex][blockIndex]->len; CompressionType valueCompressionType = valueCompressionTypeArray[columnIndex][blockIndex]; ColumnBlockSkipNode *blockSkipNode = &blockSkipNodeArray[blockIndex]; blockSkipNode->existsBlockOffset = currentExistsBlockOffset; blockSkipNode->existsLength = existsBufferSize; blockSkipNode->valueBlockOffset = currentValueBlockOffset; blockSkipNode->valueLength = valueBufferSize; blockSkipNode->valueCompressionType = valueCompressionType; currentExistsBlockOffset += existsBufferSize; currentValueBlockOffset += valueBufferSize; } } /* create skip list and footer buffers */ 
skipListBufferArray = CreateSkipListBufferArray(stripeSkipList, tupleDescriptor); stripeFooter = CreateStripeFooter(stripeSkipList, skipListBufferArray); stripeFooterBuffer = SerializeStripeFooter(stripeFooter); /* * Each stripe has three sections: * (1) Skip list, which contains statistics for each column block, and can * be used to skip reading row blocks that are refuted by WHERE clause list, * (2) Data section, in which we store data for each column continuously. * We store data for each for each column in blocks. For each block, we * store two buffers: "exists" buffer, and "value" buffer. "exists" buffer * tells which values are not NULL. "value" buffer contains values for * present values. For each column, we first store all "exists" buffers, * and then all "value" buffers. * (3) Stripe footer, which contains the skip list buffer size, exists buffer * size, and value buffer size for each of the columns. * * We start by flushing the skip list buffers. */ for (columnIndex = 0; columnIndex < columnCount; columnIndex++) { StringInfo skipListBuffer = skipListBufferArray[columnIndex]; WriteToFile(tableFile, skipListBuffer->data, skipListBuffer->len); } /* then, we flush the data buffers */ for (columnIndex = 0; columnIndex < columnCount; columnIndex++) { uint32 blockIndex = 0; for (blockIndex = 0; blockIndex < stripeSkipList->blockCount; blockIndex++) { StringInfo existsBuffer = existsBufferArray[columnIndex][blockIndex]; WriteToFile(tableFile, existsBuffer->data, existsBuffer->len); } for (blockIndex = 0; blockIndex < stripeSkipList->blockCount; blockIndex++) { StringInfo valueBuffer = valueBufferArray[columnIndex][blockIndex]; WriteToFile(tableFile, valueBuffer->data, valueBuffer->len); } } /* finally, we flush the footer buffer */ WriteToFile(tableFile, stripeFooterBuffer->data, stripeFooterBuffer->len); /* set stripe metadata */ for (columnIndex = 0; columnIndex < columnCount; columnIndex++) { skipListLength += stripeFooter->skipListSizeArray[columnIndex]; 
dataLength += stripeFooter->existsSizeArray[columnIndex]; dataLength += stripeFooter->valueSizeArray[columnIndex]; } stripeMetadata.fileOffset = writeState->currentFileOffset; stripeMetadata.skipListLength = skipListLength; stripeMetadata.dataLength = dataLength; stripeMetadata.footerLength = stripeFooterBuffer->len; /* advance current file offset */ writeState->currentFileOffset += skipListLength; writeState->currentFileOffset += dataLength; writeState->currentFileOffset += stripeFooterBuffer->len; return stripeMetadata; }