/*!
 * Overwrite the value stored at slot <c>index</c> of the mfv sketch held in
 * <c>transblob</c> with the bytes of <c>dat</c>.
 *
 * <i>Caller beware: no space is allocated here. This helper assumes that
 * <c>dat</c> is small enough to fit in the storage currently used by the
 * value at position <c>index</c>.</i>
 *
 * \param transblob a bytea holding an mfv transval
 * \param index the index of the destination slot
 * \param dat the datum to be copied into the transval
 */
void mfv_copy_datum(bytea *transblob, int index, Datum dat)
{
    mfvtransval *tv = (mfvtransval *)VARDATA(transblob);

    /* number of bytes occupied by the incoming datum */
    size_t nbytes = ExtractDatumLen(dat, tv->typLen, tv->typByVal, -1);

    /* destination: storage currently backing slot `index` */
    void *dest = mfv_transval_getval(transblob, index);

    /* source: the datum's bytes (by-value or by-reference, per typByVal) */
    void *src = (void *)DatumExtractPointer(dat, tv->typByVal);

    /* memmove is used, so an overlapping source is tolerated */
    memmove(dest, src, nbytes);
}
/*!
 * Replace the value at position <c>i</c> of the mfv sketch with <c>dat</c>.
 *
 * If the new value is no longer than the one currently stored, it is
 * written over the old one in place and the same blob is returned.
 * Otherwise the work is delegated to <c>mfv_transval_insert_at</c>, which
 * is expected to take care of space allocation for the new value, and
 * whatever it returns is passed back to the caller.
 *
 * \param transblob the transition value packed into a bytea
 * \param dat the value to be inserted
 * \param i the position to replace
 * \returns the bytea holding the updated transition value
 */
bytea *mfv_transval_replace(bytea *transblob, Datum dat, int i)
{
    mfvtransval *tv = (mfvtransval *)VARDATA(transblob);

    /* length of the incoming value */
    size_t newLen = ExtractDatumLen(dat, tv->typLen, tv->typByVal, -1);

    /* length of the value currently occupying slot i */
    void  *slot = mfv_transval_getval(transblob, i);
    Datum  current = PointerExtractDatum(slot, tv->typByVal);
    size_t curLen = ExtractDatumLen(current, tv->typLen, tv->typByVal, -1);

    /* does not fit in the old slot: let insert_at handle it */
    if (newLen > curLen) {
        return mfv_transval_insert_at(transblob, dat, i);
    }

    /* fits: overwrite at the old offset */
    mfv_copy_datum(transblob, i, dat);
    return transblob;
}
/*!
 * Search the most-frequent-value slots of an mfv sketch for <c>val</c>.
 *
 * Two values are considered equal when they have the same length and
 * identical bytes.
 *
 * NOTE: a return value of 0 means the item <i>was found</i> at offset 0;
 * "not found" is reported as -1.
 *
 * \param blob a bytea holding an mfv transval
 * \param val the datum to search for
 * \returns the offset of the match in the <c>mfvs</c> array, or -1
 */
int mfv_find(bytea *blob, Datum val)
{
    mfvtransval *tv = (mfvtransval *)VARDATA(blob);
    void        *needle = DatumExtractPointer(val, tv->typByVal);
    unsigned     slot;

    /* scan every slot that is currently in use */
    for (slot = 0; slot < tv->next_mfv; slot++) {
        void  *stored = mfv_transval_getval(blob, slot);
        Datum  storedDat = PointerExtractDatum(stored, tv->typByVal);
        uint32 storedLen = ExtractDatumLen(storedDat, tv->typLen,
                                           tv->typByVal, -1);

        /* different lengths can never match */
        if (storedLen != ExtractDatumLen(val, tv->typLen, tv->typByVal, -1)) {
            continue;
        }

        /* same length and same bytes: arg is an mfv */
        if (memcmp(stored, needle, storedLen) == 0) {
            return (slot);
        }
    }

    return (-1);
}
/*!
 * Implementation of the merge of two mfv sketches. We first sum the
 * embedded countmin sketches, then use the summed sketch to re-estimate
 * the count of every value tracked by either input, and finally pick the
 * top values for the resulting histogram.
 *
 * The result is built in a freshly initialized transval, which is what
 * gets returned. As a side effect the <c>mfvs</c> arrays of both inputs
 * are updated with the recomputed counts and sorted in place. If both
 * inputs are uninitialized, <c>transblob1</c> is returned unchanged.
 *
 * NOTE(review): nothing in this function checks whether a value taken from
 * one input was already taken from the other, so a value tracked by both
 * inputs may end up in the result twice unless mfv_transval_append filters
 * duplicates -- confirm.
 *
 * \param transblob1 an mfv transval stored inside a bytea
 * \param transblob2 another mfv transval in a bytea
 * \returns a bytea holding the merged mfv transval
 */
bytea *mfvsketch_merge_c(bytea *transblob1, bytea *transblob2)
{
    mfvtransval *transval1 = (mfvtransval *)VARDATA(transblob1);
    mfvtransval *transval2 = (mfvtransval *)VARDATA(transblob2);
    bytea       *newblob;
    mfvtransval *newval;
    uint32       i, j, cnt;

    /*
     * A blob too small to hold even an empty transval is uninitialized.
     * Compare against the value of MFV_TRANSVAL_SZ(0), the same test
     * __mfvsketch_final uses. Taking sizeof() of that expression would
     * only yield the size of its integer type, letting short blobs through
     * and allowing the header fields read below to lie outside the blob.
     */
    int empty1 = VARSIZE(transblob1) < MFV_TRANSVAL_SZ(0);
    int empty2 = VARSIZE(transblob2) < MFV_TRANSVAL_SZ(0);

    /* handle uninitialized args */
    if (empty1 && empty2) {
        return (transblob1);
    } else if (empty1) {
        /* stand-in for arg 1, shaped like arg 2 */
        transblob1 = mfv_init_transval(transval2->max_mfvs, transval2->typOid);
        transval1 = (mfvtransval *)VARDATA(transblob1);
    } else if (empty2) {
        /* stand-in for arg 2, shaped like arg 1 */
        transblob2 = mfv_init_transval(transval1->max_mfvs, transval1->typOid);
        transval2 = (mfvtransval *)VARDATA(transblob2);
    }

    check_mfvtransval(transblob1);
    check_mfvtransval(transblob2);

    if (transval1->typOid != transval2->typOid) {
        elog(ERROR, "cannot merge two transition state with different element type");
    }

    /* initialize output */
    newblob = mfv_init_transval(transval1->max_mfvs, transval1->typOid);
    newval = (mfvtransval *)VARDATA(newblob);

    /* combine sketches: counters are summed cell by cell */
    for (i = 0; i < DEPTH; i++) {
        for (j = 0; j < NUMCOUNTERS; j++) {
            newval->sketch[i][j] =
                transval1->sketch[i][j] + transval2->sketch[i][j];
        }
    }

    /* recompute the counts of both inputs using the merged sketch */
    for (i = 0; i < transval1->next_mfv; i++) {
        void *tmpp = mfv_transval_getval(transblob1, i);
        Datum dat = PointerExtractDatum(tmpp, transval1->typByVal);

        transval1->mfvs[i].cnt = cmsketch_count_c(newval->sketch, dat,
                                                  newval->outFuncOid,
                                                  newval->typOid);
    }
    for (i = 0; i < transval2->next_mfv; i++) {
        void *tmpp = mfv_transval_getval(transblob2, i);
        Datum dat = PointerExtractDatum(tmpp, transval2->typByVal);

        transval2->mfvs[i].cnt = cmsketch_count_c(newval->sketch, dat,
                                                  newval->outFuncOid,
                                                  newval->typOid);
    }

    /* sort both inputs in place so they can be consumed sort-merge style */
    qsort(transval1->mfvs, transval1->next_mfv, sizeof(offsetcnt), cnt_cmp_desc);
    qsort(transval2->mfvs, transval2->next_mfv, sizeof(offsetcnt), cnt_cmp_desc);

    /* choose top k from transval1 and transval2, appending to the output */
    for (i = j = cnt = 0;
         cnt < newval->max_mfvs
         && (j < transval2->next_mfv || i < transval1->next_mfv);
         cnt++) {
        /*
         * Take from transval1 when it still has entries and either
         * transval2 is exhausted or transval1's head count is at least as
         * large (ties go to transval1).
         */
        if (i < transval1->next_mfv
            && (j == transval2->next_mfv
                || transval1->mfvs[i].cnt >= transval2->mfvs[j].cnt)) {
            /* next item comes from transval1 */
            Datum iDatum =
                PointerExtractDatum(mfv_transval_getval(transblob1, i),
                                    transval1->typByVal);

            newblob = mfv_transval_append(newblob, iDatum);
            /* re-derive newval from the blob returned by append */
            newval = (mfvtransval *)VARDATA(newblob);
            newval->mfvs[cnt].cnt = transval1->mfvs[i].cnt;
            i++;
        } else {
            /*
             * next item comes from transval2: the loop condition ensures
             * at least one input is non-exhausted, so here transval2 has
             * an entry that is either the only one left or strictly larger.
             */
            Datum jDatum =
                PointerExtractDatum(mfv_transval_getval(transblob2, j),
                                    transval2->typByVal);

            newblob = mfv_transval_append(newblob, jDatum);
            newval = (mfvtransval *)VARDATA(newblob);
            newval->mfvs[cnt].cnt = transval2->mfvs[j].cnt;
            j++;
        }
    }

    return (newblob);
}
/*! * scalar function taking an mfv sketch, returning a histogram of * its most frequent values */ Datum __mfvsketch_final(PG_FUNCTION_ARGS) { bytea * transblob = PG_GETARG_BYTEA_P(0); mfvtransval *transval = NULL; ArrayType * retval; uint32 i; int dims[2], lbs[2]; /* Oid typInput, typIOParam; */ Oid outFuncOid; bool typIsVarlena; int16 typlen; bool typbyval; char typalign; char typdelim; Oid typioparam; Oid typiofunc; if (PG_ARGISNULL(0)) PG_RETURN_NULL(); if (VARSIZE(transblob) < MFV_TRANSVAL_SZ(0)) PG_RETURN_NULL(); check_mfvtransval(transblob); transval = (mfvtransval *)VARDATA(transblob); /* * We only declare the variable-length array histo here after some sanity * checking. We risk a stack overflow otherwise. In particular, we need to * make sure that transval->max_mfvs is initialized. It might not be if the * (strict) transition function is never called. (MADLIB-254) */ Datum histo[transval->max_mfvs][2]; qsort(transval->mfvs, transval->next_mfv, sizeof(offsetcnt), cnt_cmp_desc); getTypeOutputInfo(INT8OID, &outFuncOid, &typIsVarlena); for (i = 0; i < transval->next_mfv; i++) { void *tmpp = mfv_transval_getval(transblob,i); Datum curval = PointerExtractDatum(tmpp, transval->typByVal); char *countbuf = OidOutputFunctionCall(outFuncOid, Int64GetDatum(transval->mfvs[i].cnt)); char *valbuf = OidOutputFunctionCall(transval->outFuncOid, curval); histo[i][0] = PointerGetDatum(cstring_to_text(valbuf)); histo[i][1] = PointerGetDatum(cstring_to_text(countbuf)); pfree(countbuf); pfree(valbuf); } /* * Get info about element type */ get_type_io_data(TEXTOID, IOFunc_output, &typlen, &typbyval, &typalign, &typdelim, &typioparam, &typiofunc); dims[0] = i; dims[1] = 2; lbs[0] = lbs[1] = 0; retval = construct_md_array((Datum *)histo, NULL, 2, dims, lbs, TEXTOID, -1, 0, 'i'); PG_RETURN_ARRAYTYPE_P(retval); }