Example #1
0
/*!
 * Allocate and initialize an empty mfv sketch transition value.
 *
 * The returned bytea is zero-filled and consists of the mfvtransval header
 * (sized for max_mfvs histogram entries by MFV_TRANSVAL_SZ) followed by
 * space reserved for the values themselves.
 *
 * \param max_mfvs the number of "bins" in the histogram; must not be negative
 * \param typOid the type ID for the column
 * \returns a newly palloc0'd bytea holding the initialized transval
 *
 * Raises an ERROR if max_mfvs is negative or if no output function is
 * reported for the type.
 */
bytea *mfv_init_transval(int max_mfvs, Oid typOid)
{
    int          typLen = get_typlen(typOid);
    int          initial_size;
    bool         typIsVarLen;
    bytea *      transblob;
    mfvtransval *transval;

    /* a negative bin count would corrupt the size arithmetic below */
    if (max_mfvs < 0)
        elog(ERROR, "invalid number of most-frequent values: %d", max_mfvs);

    /*
     * Reserve room for the values.
     * If typlen is positive (fixed), the size is exact: one value per bin.
     * Else we do a conservative estimate of 16 bytes per bin; the comment in
     * the original code states the blob is repalloc'ed as needed.
     */
    if (typLen > 0)
        initial_size = max_mfvs * typLen;
    else /* guess */
        initial_size = max_mfvs * 16;

    /* palloc0 zeroes the whole blob, including the reserved value space */
    transblob = (bytea *)palloc0(MFV_TRANSVAL_SZ(max_mfvs) + initial_size);

    SET_VARSIZE(transblob, MFV_TRANSVAL_SZ(max_mfvs) + initial_size);
    transval = (mfvtransval *)VARDATA(transblob);
    transval->max_mfvs = max_mfvs;
    transval->next_mfv = 0;
    /* first value goes right after the header; VARHDRSZ is subtracted here */
    transval->next_offset = MFV_TRANSVAL_SZ(max_mfvs)-VARHDRSZ;
    transval->typOid = typOid;
    getTypeOutputInfo(transval->typOid,
                      &(transval->outFuncOid),
                      &(typIsVarLen));
    transval->typLen = typLen;
    transval->typByVal = get_typbyval(transval->typOid);
    if (!transval->outFuncOid) {
        /* no outFunc for this type! */
        elog(ERROR, "no outFunc for type %u", transval->typOid);
    }
    return(transblob);
}
Example #2
0
/*!
 * Locate the storage of the i'th most-frequent value inside an mfv sketch.
 *
 * \param blob a bytea holding an mfv transval
 * \param i index of the mfv to look up
 * \returns pointer to the datum associated with the i'th mfv
 *
 * Raises an ERROR if the index is beyond next_mfv, if the stored offset lies
 * outside the value area of the blob, or if the value would extend past the
 * end of the blob.
 */
void *mfv_transval_getval(bytea *blob, uint32 i)
{
    mfvtransval *tvp = (mfvtransval *)VARDATA(blob);
    void *       retval;
    Datum        dat;

    /*
     * Validate the index before touching mfvs[i], and the offset before
     * dereferencing the value it points at.
     *
     * NOTE(review): index == next_mfv is accepted here. That looks like an
     * off-by-one for pure lookups, but an insert path may depend on it --
     * confirm against the callers before tightening to ">=".
     */
    if (i > tvp->next_mfv)
        elog(ERROR,
             "attempt to get frequent value at illegal index %u in mfv sketch",
             i);
    if (tvp->mfvs[i].offset > VARSIZE(blob) - VARHDRSZ
        || tvp->mfvs[i].offset < MFV_TRANSVAL_SZ(tvp->max_mfvs)-VARHDRSZ)
        elog(ERROR, "illegal offset %u in mfv sketch", tvp->mfvs[i].offset);

    /* offsets are relative to the start of the transval (VARDATA of blob) */
    retval = (void *)(((char*)tvp) + tvp->mfvs[i].offset);
    dat = PointerExtractDatum(retval, tvp->typByVal);

    if (tvp->mfvs[i].offset  + ExtractDatumLen(dat, tvp->typLen, tvp->typByVal, -1)
        > VARSIZE(blob) - VARHDRSZ)
        elog(ERROR, "value overruns size of mfv sketch");

    return (retval);
}
Example #3
0
/*!
 * Locate the storage of the i'th most-frequent value inside an mfv sketch.
 *
 * \param blob a bytea holding an mfv transval
 * \param i index of the mfv to look up; must be below the sketch's next_mfv
 * \returns pointer to the datum associated with the i'th mfv
 *
 * Raises an ERROR if the index is not below next_mfv, if the stored offset
 * lies outside the value area of the blob, or if the value would extend past
 * the end of the blob.
 */
void *mfv_transval_getval(bytea *blob, uint32 i)
{
    mfvtransval *tvp = (mfvtransval *)VARDATA(blob);
    void *       retval;
    Datum        dat;

    /*
     * Validate the index before touching mfvs[i], and the offset before
     * dereferencing the value it points at.
     */
    if (i >= tvp->next_mfv)
        elog(ERROR,
             "attempt to get frequent value at illegal index %u in mfv sketch",
             i);
    if (tvp->mfvs[i].offset > VARSIZE(blob) - VARHDRSZ
        || tvp->mfvs[i].offset < MFV_TRANSVAL_SZ(tvp->max_mfvs)-VARHDRSZ)
        elog(ERROR, "illegal offset %u in mfv sketch", tvp->mfvs[i].offset);

    /* offsets are relative to the start of the transval (VARDATA of blob) */
    retval = (void *)(((char*)tvp) + tvp->mfvs[i].offset);
    dat = PointerExtractDatum(retval, tvp->typByVal);

    /*
     * ExtractDatumLen is handed the space remaining after the offset.
     * Per the original author's note, the comparison itself is unnecessary
     * and is kept only to prevent gcc from optimizing out the
     * ExtractDatumLen function call.
     */
    if (tvp->mfvs[i].offset
        + ExtractDatumLen(dat, tvp->typLen, tvp->typByVal, VARSIZE(blob) - VARHDRSZ - tvp->mfvs[i].offset)
        > VARSIZE(blob) - VARHDRSZ)
        elog(ERROR, "value overruns size of mfv sketch");

    return (retval);
}
Example #4
0
/*!
 *  transition function to maintain a CountMin sketch with
 *  Most-Frequent Values
 *
 *  Arguments, read through fcinfo:
 *    0 - the transition state (bytea holding an mfvtransval)
 *    1 - the new value to add
 *    2 - the number of most-frequent values to track (int32)
 *
 *  Returns the transition state as a Datum. The state may be a different
 *  blob than the one passed in, because initialization, append and replace
 *  each hand back the blob to continue with.
 *
 *  Raises an ERROR when called outside an aggregate context or when the
 *  value type differs from the type recorded in the state.
 */
Datum __mfvsketch_trans(PG_FUNCTION_ARGS)
{
    bytea *      transblob = PG_GETARG_BYTEA_P(0);
    Datum        newdatum  = PG_GETARG_DATUM(1);
    Oid          argTypOid;
    mfvtransval *transval;
    uint64       tmpcnt;
    int          i;
    Datum        md5_datum;

    /*
     * This function makes destructive updates to its arguments.
     * Make sure it's being called in an agg context.
     */
    if (!(fcinfo->context &&
          (IsA(fcinfo->context, AggState)
#ifdef NOTGP
           || IsA(fcinfo->context, WindowAggState)
#endif
          )))
        elog(ERROR,
             "destructive pass by reference outside agg");

    argTypOid = get_fn_expr_argtype(fcinfo->flinfo, 1);

    /*
     * initialize if this is first call
     *
     * NOTE(review): sizeof is applied to the MFV_TRANSVAL_SZ(0) expression,
     * so the threshold is the size of that expression's type rather than the
     * size of an empty transval -- confirm this is intended.
     */
    if (VARSIZE(transblob) <= sizeof(MFV_TRANSVAL_SZ(0))) {
        /*
         * The bin count is only meaningful when argument 2 is not NULL;
         * do not size the state from it otherwise. The state is left
         * uninitialized so that a later row can initialize it properly.
         */
        if (PG_ARGISNULL(2))
            PG_RETURN_DATUM(PointerGetDatum(transblob));
        transblob = mfv_init_transval(PG_GETARG_INT32(2), argTypOid);
    }
    else {
        check_mfvtransval(transblob);
    }

    /* ignore NULL inputs */
    if (PG_ARGISNULL(1) || PG_ARGISNULL(2))
        PG_RETURN_DATUM(PointerGetDatum(transblob));

    transval = (mfvtransval *)VARDATA(transblob);
    if (transval->typOid != argTypOid) {
        elog(ERROR, "cannot aggregate on elements with different types");
    }

    /* insert into the countmin sketch */
    md5_datum = countmin_trans_c(transval->sketch,
                                newdatum,
                                transval->outFuncOid,
                                transval->typOid);

    /* count of the new value as looked up in the updated sketch */
    tmpcnt = cmsketch_count_md5_datum(transval->sketch,
                                      (bytea *)DatumGetPointer(md5_datum),
                                      transval->outFuncOid);
    i = mfv_find(transblob, newdatum);

    if (i > -1) {
        /* already tracked: just refresh its count */
        transval->mfvs[i].cnt = tmpcnt;
    }
    else {
        /* try to insert as either a new or replacement entry */
        for (i = 0; i < (int)transval->max_mfvs; i++) {
            if (i == (int)transval->next_mfv) {
                /* room for new */
                transblob = mfv_transval_append(transblob, newdatum);
                /* continue with the blob handed back by append */
                transval = (mfvtransval *)VARDATA(transblob);
                transval->mfvs[i].cnt = tmpcnt;
                break;
            }
            else if (transval->mfvs[i].cnt < tmpcnt) {
                /* arg beats this mfv */
                transblob = mfv_transval_replace(transblob, newdatum, i);
                /* continue with the blob handed back by replace */
                transval = (mfvtransval *)VARDATA(transblob);
                transval->mfvs[i].cnt = tmpcnt;
                break;
            }
            /* else this is not a frequent value */
        }
    }
    PG_RETURN_DATUM(PointerGetDatum(transblob));
}
Example #5
0
/*!
 * Merge two mfv sketches.
 *
 * The embedded countmin sketches are added counter by counter into a newly
 * initialized sketch. The counts of the frequent values tracked by each
 * input are then re-estimated against that merged sketch, and the top
 * entries of both inputs (at most max_mfvs of the first input) are appended
 * to the new sketch in descending order of count.
 *
 * Side effects: the counts stored in both inputs are overwritten with the
 * re-estimated values and their mfvs arrays are sorted in place.
 *
 * \param transblob1 an mfv transval stored inside a bytea
 * \param transblob2 another mfv transval in a bytea
 * \returns transblob1 itself when both inputs are uninitialized, otherwise
 *          the newly built merged sketch
 *
 * Raises an ERROR when the two inputs record different element types.
 *
 * NOTE(review): nothing here de-duplicates a value that is tracked by both
 * inputs -- confirm whether the append helper handles that.
 */
bytea *mfvsketch_merge_c(bytea *transblob1, bytea *transblob2)
{
    mfvtransval *left  = (mfvtransval *)VARDATA(transblob1);
    mfvtransval *right = (mfvtransval *)VARDATA(transblob2);
    /*
     * An input counts as uninitialized when it is no larger than
     * sizeof(MFV_TRANSVAL_SZ(0)), i.e. the size of that expression's type.
     */
    int          left_blank  = (VARSIZE(transblob1) <= sizeof(MFV_TRANSVAL_SZ(0)));
    int          right_blank = (VARSIZE(transblob2) <= sizeof(MFV_TRANSVAL_SZ(0)));
    bytea       *merged_blob;
    mfvtransval *merged;
    uint32       row, col;
    uint32       li, ri, filled;

    /* nothing to merge at all */
    if (left_blank && right_blank)
        return(transblob1);

    /* give a blank side an empty sketch shaped like the other side */
    if (left_blank) {
        transblob1 = mfv_init_transval(right->max_mfvs, right->typOid);
        left = (mfvtransval *)VARDATA(transblob1);
    }
    else if (right_blank) {
        transblob2 = mfv_init_transval(left->max_mfvs, left->typOid);
        right = (mfvtransval *)VARDATA(transblob2);
    }

    check_mfvtransval(transblob1);
    check_mfvtransval(transblob2);

    if (left->typOid != right->typOid) {
        elog(ERROR, "cannot merge two transition state with different element type");
    }

    /* the result starts out as an empty sketch shaped like the first input */
    merged_blob = mfv_init_transval(left->max_mfvs, left->typOid);
    merged      = (mfvtransval *)VARDATA(merged_blob);

    /* add the countmin counters of both inputs */
    for (row = 0; row < DEPTH; row++) {
        for (col = 0; col < NUMCOUNTERS; col++) {
            merged->sketch[row][col] =
                left->sketch[row][col] + right->sketch[row][col];
        }
    }

    /* re-estimate the count of every tracked value from the merged sketch */
    for (li = 0; li < left->next_mfv; li++) {
        void *valp = mfv_transval_getval(transblob1, li);
        Datum val  = PointerExtractDatum(valp, left->typByVal);

        left->mfvs[li].cnt = cmsketch_count_c(merged->sketch,
                                              val,
                                              merged->outFuncOid,
                                              merged->typOid);
    }
    for (ri = 0; ri < right->next_mfv; ri++) {
        void *valp = mfv_transval_getval(transblob2, ri);
        Datum val  = PointerExtractDatum(valp, right->typByVal);

        right->mfvs[ri].cnt = cmsketch_count_c(merged->sketch,
                                               val,
                                               merged->outFuncOid,
                                               merged->typOid);
    }

    /* order each input by count (cnt_cmp_desc) so they can be merged */
    qsort(left->mfvs, left->next_mfv, sizeof(offsetcnt), cnt_cmp_desc);
    qsort(right->mfvs, right->next_mfv, sizeof(offsetcnt), cnt_cmp_desc);

    /* take the top entries from both inputs, sort-merge style */
    li = 0;
    ri = 0;
    filled = 0;
    while (filled < merged->max_mfvs
           && (ri < right->next_mfv || li < left->next_mfv)) {
        bytea       *src_blob;
        mfvtransval *src;
        uint32      *src_pos;
        Datum        picked;

        if (li < left->next_mfv &&
            (ri == right->next_mfv
             || left->mfvs[li].cnt >= right->mfvs[ri].cnt)) {
            /* next item comes from the first input (it also wins ties) */
            src_blob = transblob1;
            src      = left;
            src_pos  = &li;
        }
        else {
            /*
             * next item comes from the second input: either the first one
             * is exhausted or its current count is strictly smaller
             */
            src_blob = transblob2;
            src      = right;
            src_pos  = &ri;
        }

        picked = PointerExtractDatum(mfv_transval_getval(src_blob, *src_pos),
                                     src->typByVal);
        merged_blob = mfv_transval_append(merged_blob, picked);
        /* continue with the blob handed back by append */
        merged = (mfvtransval *)VARDATA(merged_blob);
        merged->mfvs[filled].cnt = src->mfvs[*src_pos].cnt;

        (*src_pos)++;
        filled++;
    }
    return(merged_blob);
}
Example #6
0
/*!
 * scalar function taking an mfv sketch, returning a histogram of
 * its most frequent values
 */
Datum __mfvsketch_final(PG_FUNCTION_ARGS)
{
    bytea *      transblob = PG_GETARG_BYTEA_P(0);
    mfvtransval *transval = NULL;
    ArrayType *  retval;
    uint32       i;
    int          dims[2], lbs[2];
    /* Oid     typInput, typIOParam; */
    Oid          outFuncOid;
    bool         typIsVarlena;
    int16        typlen;
    bool         typbyval;
    char         typalign;
    char         typdelim;
    Oid          typioparam;
    Oid          typiofunc;


    if (PG_ARGISNULL(0)) PG_RETURN_NULL();
    if (VARSIZE(transblob) < MFV_TRANSVAL_SZ(0)) PG_RETURN_NULL();

    check_mfvtransval(transblob);
    transval = (mfvtransval *)VARDATA(transblob);
    /*
     * We only declare the variable-length array histo here after some sanity
     * checking. We risk a stack overflow otherwise. In particular, we need to
     * make sure that transval->max_mfvs is initialized. It might not be if the
     * (strict) transition function is never called. (MADLIB-254)
     */
    Datum        histo[transval->max_mfvs][2];

    qsort(transval->mfvs, transval->next_mfv, sizeof(offsetcnt), cnt_cmp_desc);
    getTypeOutputInfo(INT8OID,
                      &outFuncOid,
                      &typIsVarlena);

    for (i = 0; i < transval->next_mfv; i++) {
        void *tmpp = mfv_transval_getval(transblob,i);
        Datum curval = PointerExtractDatum(tmpp, transval->typByVal);
        char *countbuf =
            OidOutputFunctionCall(outFuncOid,
                                  Int64GetDatum(transval->mfvs[i].cnt));
        char *valbuf = OidOutputFunctionCall(transval->outFuncOid, curval);

        histo[i][0] = PointerGetDatum(cstring_to_text(valbuf));
        histo[i][1] = PointerGetDatum(cstring_to_text(countbuf));
        pfree(countbuf);
        pfree(valbuf);
    }

    /*
     * Get info about element type
     */
    get_type_io_data(TEXTOID, IOFunc_output,
                     &typlen, &typbyval,
                     &typalign, &typdelim,
                     &typioparam, &typiofunc);

    dims[0] = i;
    dims[1] = 2;
    lbs[0] = lbs[1] = 0;
    retval = construct_md_array((Datum *)histo,
                                NULL,
                                2,
                                dims,
                                lbs,
                                TEXTOID,
                                -1,
                                0,
                                'i');
    PG_RETURN_ARRAYTYPE_P(retval);
}