Beispiel #1
0
/*!
 * insert a value at position i of the mfv sketch
 *
 * We do not overwrite the previous value at position i.
 * Instead we append the new value at next_offset and point slot i at it.
 *
 * If the blob does not have enough free room for the value, a larger copy
 * is allocated and returned; the caller must continue with the returned
 * pointer rather than the one it passed in.
 *
 * <i>Note: we do not currently garbage collect the old value's storage.
 * This wastes space, with the worst-case scenario being a column with values
 * of increasing size and frequency!</i>
 *
 * \param transblob the transition value packed into a bytea
 * \param dat the value to be inserted
 * \param i the position to insert at; must not exceed next_mfv and must
 *          be a valid slot (less than max_mfvs)
 * \returns the transblob holding the inserted value (possibly reallocated)
 */
bytea *mfv_transval_insert_at(bytea *transblob, Datum dat, uint32 i)
{
    mfvtransval *transval = (mfvtransval *)VARDATA(transblob);
    bytea *      tmpblob;
    size_t       datumLen = ExtractDatumLen(dat, transval->typLen, transval->typByVal, -1);

    /*
     * i == next_mfv is allowed (filling the next free slot), but the slot
     * must still lie inside the mfvs array, which has max_mfvs entries;
     * otherwise the offset store below would land outside the array.
     */
    if (i > transval->next_mfv || i >= transval->max_mfvs)
        elog(
            ERROR,
            "attempt to insert frequent value at illegal index %u in mfv sketch",
            i);

    if (MFV_TRANSVAL_CAPACITY(transblob) < datumLen) {
        /* allocate a copy with room for this, and double the current space for values */
        size_t oldsz = VARSIZE(transblob);
        size_t curspace = oldsz - transval->mfvs[0].offset - VARHDRSZ;
        size_t newsz = oldsz + curspace + datumLen;

        tmpblob = (bytea *)palloc0(newsz);
        memmove(tmpblob, transblob, oldsz);
        SET_VARSIZE(tmpblob, newsz);
        /*
         * PG won't let us pfree the old transblob, so it is simply
         * abandoned here.
         */
        transblob = tmpblob;
        transval = (mfvtransval *)VARDATA(transblob);
    }

    /* point slot i at the free area, then copy the datum there */
    transval->mfvs[i].offset = transval->next_offset;
    mfv_copy_datum(transblob, i, dat);

    transval->next_offset += datumLen;

    return(transblob);
}
Beispiel #2
0
/*
 * Validate that the bytea in <storage> can safely be interpreted as an
 * mfvtransval.  Any inconsistency raises an ERROR; on success the function
 * simply returns.
 *
 * Checked, in order: the blob is large enough for the fixed header, the
 * counters and next_offset are in range, the recorded type information
 * agrees with the catalog, the slot array fits, and every stored value
 * lies inside the blob.
 */
void check_mfvtransval(bytea *storage) {
    size_t       total_len = VARSIZE(storage);
    size_t       remaining;
    size_t       data_len;
    Oid          outFuncOid;
    bool         typIsVarLen;
    mfvtransval *mfv;

    /* must at least hold the varlena header plus the fixed struct */
    if (total_len < VARHDRSZ + sizeof(mfvtransval)) {
        elog(ERROR, "invalid transition state for mfvsketch");
    }
    mfv = (mfvtransval*)VARDATA(storage);
    remaining = total_len - (VARHDRSZ + sizeof(mfvtransval));

    /* counters, free-space offset and type oid must all be plausible */
    if (mfv->next_mfv > mfv->max_mfvs
        || mfv->next_offset + VARHDRSZ > VARSIZE(storage)
        || mfv->typOid == InvalidOid) {
        elog(ERROR, "invalid transition state for mfvsketch");
    }

    /* the cached type properties must match what the catalog reports */
    getTypeOutputInfo(mfv->typOid, &outFuncOid, &typIsVarLen);
    if (mfv->outFuncOid != outFuncOid
        || mfv->typLen != get_typlen(mfv->typOid)
        || mfv->typByVal != get_typbyval(mfv->typOid)) {
        elog(ERROR, "invalid transition state for mfvsketch");
    }

    /* the slot array itself has to fit behind the fixed struct */
    if (remaining < sizeof(offsetcnt)*mfv->max_mfvs) {
        elog(ERROR, "invalid transition state for mfvsketch");
    }

    /* value offsets are relative to the start of the mfvtransval */
    data_len = VARSIZE(storage) - VARHDRSZ;

    /*
     * Walking every stored value may be inefficient, but doing the checks
     * centrally avoids spreading validation code everywhere.
     */
    for (unsigned slot = 0; slot < mfv->next_mfv; slot++) {
        size_t room;
        size_t needed;

        if (mfv->mfvs[slot].offset > data_len) {
            elog(ERROR, "invalid transition state for mfvsketch");
        }

        /* bytes available between this value's start and the end of the blob */
        room = data_len - mfv->mfvs[slot].offset;
        needed = ExtractDatumLen(
            PointerGetDatum(MFV_DATA(mfv) + mfv->mfvs[slot].offset),
            mfv->typLen, mfv->typByVal, room);

        if (needed > room) {
            elog(ERROR, "invalid transition state for mfvsketch");
        }
    }
}
Beispiel #3
0
/*!
 * Copy datum <c>dat</c> to the offset recorded for position
 * <c>index</c> of the mfv sketch stored in <c>transblob</c>.
 *
 * <i>Caller beware: this helper performs no bounds checking. It assumes
 * that <c>dat</c> is small enough to fit in the storage starting at the
 * offset of position <c>index</c>.</i>
 *
 * \param transblob a bytea holding an mfv transval
 * \param index the index of the destination for copying
 * \param dat the datum to be copied into the transval
 */
void mfv_copy_datum(bytea *transblob, int index, Datum dat)
{
    mfvtransval *tv = (mfvtransval *)VARDATA(transblob);
    size_t       nbytes = ExtractDatumLen(dat, tv->typLen, tv->typByVal, -1);
    /* offsets are relative to the start of the transval struct */
    char        *dest = (char *)tv + tv->mfvs[index].offset;
    void        *src = (void *)DatumExtractPointer(dat, tv->typByVal);

    memmove(dest, src, nbytes);
}
Beispiel #4
0
/*!
 * Copy datum <c>dat</c> to the location of position <c>index</c> of
 * the mfv sketch stored in <c>transblob</c>. The destination address is
 * obtained from mfv_transval_getval.
 *
 * <i>Caller beware: this helper assumes that <c>dat</c> is small enough
 * to fit in the storage currently used by the datum at position
 * <c>index</c>.</i>
 *
 * \param transblob a bytea holding an mfv transval
 * \param index the index of the destination for copying
 * \param dat the datum to be copied into the transval
 */
void mfv_copy_datum(bytea *transblob, int index, Datum dat)
{
    mfvtransval *tv = (mfvtransval *)VARDATA(transblob);
    size_t       nbytes = ExtractDatumLen(dat, tv->typLen, tv->typByVal, -1);
    void        *dest = mfv_transval_getval(transblob,index);
    void        *src = (void *)DatumExtractPointer(dat, tv->typByVal);

    memmove(dest, src, nbytes);
}
Beispiel #5
0
/*!
 * Replace the value at position i of the mfvsketch with dat.
 *
 * When the new value is no longer than the one currently stored, it is
 * written over the old value in place. Otherwise the work is handed to
 * mfv_transval_insert_at, which takes care of finding (or allocating)
 * space for the new value.
 *
 * \param transblob the transition value packed into a bytea
 * \param dat the value to be inserted
 * \param i the position to replace
 * \returns the transblob holding the new value; this is the pointer
 *          returned by mfv_transval_insert_at when that path is taken
 */
bytea *mfv_transval_replace(bytea *transblob, Datum dat, int i)
{
    mfvtransval *tv = (mfvtransval *)VARDATA(transblob);
    size_t       newLen = ExtractDatumLen(dat, tv->typLen, tv->typByVal, -1);
    void *       oldp = mfv_transval_getval(transblob,i);
    Datum        oldDat = PointerExtractDatum(oldp, tv->typByVal);
    size_t       oldLen = ExtractDatumLen(oldDat, tv->typLen, tv->typByVal, -1);

    /* too big for the existing slot: store it in fresh space instead */
    if (newLen > oldLen)
        return(mfv_transval_insert_at(transblob, dat, i));

    /* fits: overwrite the old value where it sits */
    mfv_copy_datum(transblob, i, dat);
    return transblob;
}
Beispiel #6
0
/*!
 * Look to see if the mfvsketch currently has <c>val</c>
 * stored as one of its most-frequent values.
 *
 * Two values are considered equal when they have the same length and
 * their bytes compare equal with memcmp.
 *
 * NOTE: a 0 return value means the item <i>was found</i>
 * at index 0!
 *
 * \param blob a bytea holding an mfv transval
 * \param val the datum to search for
 * \returns the index in the <c>mfvs</c> array, or -1 if not found
 */
int mfv_find(bytea *blob, Datum val)
{
    mfvtransval *transval = (mfvtransval *)VARDATA(blob);
    void        *valp = DatumExtractPointer(val, transval->typByVal);
    /* the length of the search key does not change; compute it once */
    size_t       valLen = ExtractDatumLen(val, transval->typLen, transval->typByVal, -1);
    unsigned     i;

    /* look for existing entry for this value */
    for (i = 0; i < transval->next_mfv; i++) {
        void  *datp = mfv_transval_getval(blob,i);
        Datum  iDat = PointerExtractDatum(datp, transval->typByVal);
        size_t len = ExtractDatumLen(iDat, transval->typLen, transval->typByVal, -1);

        /* same length and same bytes: arg is an mfv */
        if (len == valLen && memcmp(datp, valp, len) == 0)
            return (int)i;
    }
    return(-1);
}
Beispiel #7
0
/*!
 * Return the address of the value stored for the i'th mfv, after checking
 * that the index, the recorded offset and the value's length all stay
 * within the blob. Any violation raises an ERROR.
 *
 * \param blob a bytea holding an mfv transval
 * \param i index of the mfv to look up
 * \returns pointer to the datum associated with the i'th mfv
 */
void *mfv_transval_getval(bytea *blob, uint32 i)
{
    mfvtransval *tvp = (mfvtransval *)VARDATA(blob);
    void *       retval;
    Datum        dat;

    /*
     * Validate before touching anything derived from the index or offset.
     * NOTE(review): an index equal to next_mfv is accepted here; confirm
     * whether callers rely on that (e.g. while filling a new slot).
     */
    if (i > tvp->next_mfv)
        elog(ERROR,
             "attempt to get frequent value at illegal index %u in mfv sketch",
             i);
    /* the value must start after the slot array and inside the blob */
    if (tvp->mfvs[i].offset > VARSIZE(blob) - VARHDRSZ
        || tvp->mfvs[i].offset < MFV_TRANSVAL_SZ(tvp->max_mfvs)-VARHDRSZ)
        elog(ERROR, "illegal offset %u in mfv sketch", tvp->mfvs[i].offset);

    /* only now is it safe to form the pointer and look at the datum */
    retval = (void *)(((char*)tvp) + tvp->mfvs[i].offset);
    dat = PointerExtractDatum(retval, tvp->typByVal);

    if (tvp->mfvs[i].offset  + ExtractDatumLen(dat, tvp->typLen, tvp->typByVal, -1)
        > VARSIZE(blob) - VARHDRSZ)
        elog(ERROR, "value overruns size of mfv sketch");

    return (retval);
}
Beispiel #8
0
/*!
 * Return the address of the value stored for the i'th mfv, after checking
 * that the index, the recorded offset and the value's length all stay
 * within the blob. Any violation raises an ERROR.
 *
 * \param blob a bytea holding an mfv transval
 * \param i index of the mfv to look up; must be less than next_mfv
 * \returns pointer to the datum associated with the i'th mfv
 */
void *mfv_transval_getval(bytea *blob, uint32 i)
{
    mfvtransval *tvp = (mfvtransval *)VARDATA(blob);
    void *       retval;
    Datum        dat;

    /* validate before touching anything derived from the index or offset */
    if (i >= tvp->next_mfv)
        elog(ERROR,
             "attempt to get frequent value at illegal index %u in mfv sketch",
             i);
    /* the value must start after the slot array and inside the blob */
    if (tvp->mfvs[i].offset > VARSIZE(blob) - VARHDRSZ
        || tvp->mfvs[i].offset < MFV_TRANSVAL_SZ(tvp->max_mfvs)-VARHDRSZ)
        elog(ERROR, "illegal offset %u in mfv sketch", tvp->mfvs[i].offset);

    /* only now is it safe to form the pointer and look at the datum */
    retval = (void *)(((char*)tvp) + tvp->mfvs[i].offset);
    dat = PointerExtractDatum(retval, tvp->typByVal);

    /*
     * call ExtractDatumLen to make sure there is enough space. This
     * comparison is unnecessary in itself; it is kept to prevent gcc from
     * optimizing out the ExtractDatumLen function call.
     */
    if (tvp->mfvs[i].offset
        + ExtractDatumLen(dat, tvp->typLen, tvp->typByVal, VARSIZE(blob) - VARHDRSZ - tvp->mfvs[i].offset)
        > VARSIZE(blob) - VARHDRSZ)
        elog(ERROR, "value overruns size of mfv sketch");

    return (retval);
}
Beispiel #9
0
/*! UDA transition function for the fmsketch aggregate. */
Datum __fmsketch_trans(PG_FUNCTION_ARGS)
{
    bytea *     transblob = (bytea *)PG_GETARG_BYTEA_P(0);
    fmtransval *transval;
    Oid         element_type = get_fn_expr_argtype(fcinfo->flinfo, 1);
    Oid         funcOid;
    bool        typIsVarlena;
    Datum       retval;
    Datum       inval;

    if (!OidIsValid(element_type))
        elog(ERROR, "could not determine data type of input");

    /*
     * This is Postgres boilerplate for UDFs that modify the data in their own context.
     * Such UDFs can only be correctly called in an agg context since regular scalar
     * UDFs are essentially stateless across invocations.
     */
    if (!(fcinfo->context &&
          (IsA(fcinfo->context, AggState)
    #ifdef NOTGP
           || IsA(fcinfo->context, WindowAggState)
    #endif
          )))
        elog(
            ERROR,
            "UDF call to a function that only works for aggs (destructive pass by reference)");


    /* get the provided element, being careful in case it's NULL */
    if (!PG_ARGISNULL(1)) {
        inval = PG_GETARG_DATUM(1);

        /*
         * if this is the first call, initialize transval to hold a sortasort
         * on the first call, we should have the empty string (if the agg was declared properly!)
         */
        if (VARSIZE(transblob) <= VARHDRSZ) {
            size_t blobsz = VARHDRSZ + sizeof(fmtransval) +
                            SORTASORT_INITIAL_STORAGE;

            transblob = (bytea *)palloc0(blobsz);
            SET_VARSIZE(transblob, blobsz);
            transval = (fmtransval *)VARDATA(transblob);

            transval->typOid = element_type;
            /* figure out the outfunc for this type */
            getTypeOutputInfo(element_type, &funcOid, &typIsVarlena);
            get_typlenbyval(element_type, &(transval->typLen), &(transval->typByVal));
            transval->status = SMALL;
            sortasort_init((sortasort *)transval->storage,
                           MINVALS,
                           SORTASORT_INITIAL_STORAGE,
                           transval->typLen,
                           transval->typByVal);
        }
        else {
            check_fmtransval(transblob);
            /* extract the existing transval from the transblob */
            transval = (fmtransval *)VARDATA(transblob);
            if (transval->typOid != element_type) {
                elog(ERROR, "cannot aggregate on elements with different types");
            }
        }

        /*
         * if we've seen < MINVALS distinct values, place datum into the sortasort
         * XXXX Would be cleaner to try the sortasort insert and if it fails, then continue.
         */
        if (transval->status == SMALL
            && ((sortasort *)(transval->storage))->num_vals <
            MINVALS) {
            int len = ExtractDatumLen(inval, transval->typLen, transval->typByVal, -1);

            retval =
                PointerGetDatum(fmsketch_sortasort_insert(
                                    transblob,
                                    inval, len));
            PG_RETURN_DATUM(retval);
        }

        /*
         * if we've seen exactly MINVALS distinct values, create FM bitmaps
         * and load the contents of the sortasort into the FM sketch
         */
        else if (transval->status == SMALL
                 && ((sortasort *)(transval->storage))->num_vals ==
                 MINVALS) {
            int        i;
            sortasort  *s = (sortasort *)(transval->storage);
            bytea      *newblob = fm_new(transval);

            transval = (fmtransval *)VARDATA(newblob);

            /*
             * "catch up" on the past as if we were doing FM from the beginning:
             * apply the FM sketching algorithm to each value previously stored in the sortasort
             */
            for (i = 0; i < MINVALS; i++)
                __fmsketch_trans_c(newblob,
                                   PointerExtractDatum(sortasort_getval(s,i), s->typByVal));

            /*
             * XXXX would like to pfree the old transblob, but the memory allocator doesn't like it
             * XXXX Meanwhile we know that this memory "leak" is of fixed size and will get
             * XXXX deallocated "soon" when the memory context is destroyed.
             */
            /* drop through to insert the current datum in "BIG" mode */
            transblob = newblob;
        }

        /*
         * if we're here we've seen >=MINVALS distinct values and are in BIG mode.
         * Just for sanity, let's check.
         */
        if (transval->status != BIG)
            elog(
                ERROR,
                "FM sketch failed internal sanity check");

        /* Apply FM algorithm to this datum */
        retval = __fmsketch_trans_c(transblob, inval);
        PG_RETURN_DATUM(retval);
    }
    else PG_RETURN_NULL();
}