// Placeholder run-time method: the test that uses this UDF does not look
// at the generated results, so every input row is answered with the same
// fixed output values.
void Sessionize::processData(UDRInvocationInfo &info,
                             UDRPlanInfo &plan)
{
  // consume the input one row at a time until it is exhausted
  for (;;)
    {
      if (!getNextRow(info))
        break;

      // fill the three output columns with constant dummy values
      info.out().setString(0, "userid");
      info.out().setLong(1, 999);
      info.out().setLong(2, 9999);

      emitRow(info);
    }
}
// Compile-time method that describes the constraints holding on the
// result table of the sessionize UDF.
void Sessionize::describeConstraints(UDRInvocationInfo &info)
{
  // Sessionize emits at most one result row per input row it reads,
  // so certain constraints on the input table carry over to the result.
  info.propagateConstraintsFor1To1UDFs(false);

  // Locate the id column (named by scalar parameter 0) in the output table.
  const std::string idColumnName = info.par().getString(0);
  const int idOutputColNum = info.out().getColNum(idColumnName);

  // The id column together with the session id and the sequence number
  // forms a unique key of the result; declare that as a uniqueness
  // constraint.
  UniqueConstraintInfo resultKey;

  resultKey.addColumn(idOutputColNum);
  resultKey.addColumn(0); // the session id is always column #0
  resultKey.addColumn(1); // the sequence number is always column #1

  info.out().addUniquenessConstraint(resultKey);
}
// Compile-time method that decides which input columns are needed and
// where each predicate on the UDF result gets evaluated.
void Sessionize::describeDataflowAndPredicates(UDRInvocationInfo &info)
{
  // Begin with the default reducer behavior, which pushes down
  // predicates on the key/id column.
  UDR::describeDataflowAndPredicates(info);

  // Drop passthru columns nobody asks for, so they are not required
  // from the child/input table. NOTE: this can renumber the id and
  // timestamp columns!
  info.setUnusedPassthruColumns();

  // The call above may have flagged the id or timestamp column as
  // unused. Look up their (possibly changed) column numbers again ...
  const int idInputCol = info.in().getColNum(info.par().getString(0));
  const int tsInputCol = info.in().getColNum(info.par().getString(1));
  InternalColumns neededCols(idInputCol, tsInputCol);

  // ... and force both of them to be delivered by the child.
  info.setChildColumnUsage(0, neededCols.getIdColumn(), ColumnInfo::USED);
  info.setChildColumnUsage(0, neededCols.getTsColumn(), ColumnInfo::USED);

  // Does the query reference session_id (#0) or sequence_no (#1)?
  const bool sessionColsReferenced =
    (info.out().getColumn(0).getUsage() == ColumnInfo::USED ||
     info.out().getColumn(1).getUsage() == ColumnInfo::USED);

  // Go through the predicates and look for more of them that can be
  // pushed down or evaluated locally.
  for (int predNum = 0; predNum < info.getNumPredicates(); ++predNum)
    {
      if (!sessionColsReferenced)
        {
          // session_id/sequence_no are not used in the query, so
          // every predicate can go to the children
          info.setPredicateEvaluationCode(predNum,
                                          PredicateInfo::EVALUATE_IN_CHILD);
          continue;
        }

      if (!info.isAComparisonPredicate(predNum))
        continue;

      // For demo purposes, predicates shaped like "session_id < const"
      // are accepted for evaluation inside the UDF.
      const ComparisonPredicateInfo &comparison =
        info.getComparisonPredicate(predNum);
      const bool isSessionIdLessThanConst =
        comparison.getColumnNumber() == 0 /* SESSION_ID */ &&
        comparison.getOperator() == PredicateInfo::LESS &&
        comparison.hasAConstantValue();

      if (isSessionIdLessThanConst)
        info.setPredicateEvaluationCode(predNum,
                                        PredicateInfo::EVALUATE_IN_UDF);
    }
}
// Run-time method of the Fibonacci UDF.
//
// Scalar input parameters: (int startRow, int numResultRows)
// Output columns:          (int ordinal, long fibonacci_number)
//
// Emits one row for each ordinal in the range
// [startRow, startRow + numResultRows), restricted to ordinals >= 0
// (ordinal 0 carries the Fibonacci number 0). Nothing is emitted when
// numResultRows is not positive or when the whole requested range lies
// below ordinal 0.
//
// Throws UDRException 38001 ("Upper limit exceeded") when a further
// number is needed while the current one already exceeds half of the
// largest value a long can hold.
void FibonacciUDF::processData(UDRInvocationInfo &info,
                               UDRPlanInfo &plan)
{
  // input parameters: (int startRow, int numResultRows)
  const int startRow = info.par().getInt(0);
  const int numResultRows = info.par().getInt(1);

  // First ordinal that is NOT part of the result. Computed in 64 bits
  // so that the sum of two large int parameters cannot overflow.
  const long long endRow = static_cast<long long>(startRow) + numResultRows;

  // An empty request produces no rows at all. Without this guard the
  // loop below would emit ordinal 0 before looking at the row budget.
  if (numResultRows <= 0 || endRow <= 0)
    return;

  long fibonacciNumber = 0;
  long previousResult = 1;
  int ordinal = 0;

  // produce fibonacci numbers and emit rows
  // ---------------------------------------
  while (true)
    {
      if (ordinal >= startRow)
        {
          // set result parameters (int ordinal, long fibonacci_number)
          info.out().setInt(0, ordinal);
          info.out().setLong(1, fibonacciNumber);
          emitRow(info);
        }

      // did we produce numResultRows already?
      if (++ordinal >= endRow)
        break;

      // the next number is at most twice the current one, so staying
      // at or below max/2 guarantees the addition cannot overflow
      if (fibonacciNumber > std::numeric_limits<long>::max() / 2)
        throw UDRException(38001, "Upper limit exceeded");

      // pre-compute the next row
      const long temp = fibonacciNumber;
      fibonacciNumber += previousResult;
      previousResult = temp;
    }
}
// Compile-time method that gives the optimizer a row count estimate
// for the result of the sessionize UDF.
void Sessionize::describeStatistics(UDRInvocationInfo &info)
{
  // The function type was set to REDUCER earlier. For a reducer the
  // Trafodion compiler assumes one output row per input partition,
  // unless the UDF says otherwise. Sessionize returns one output row
  // per input row, so supply a better cardinality estimate here.

  // Crude model: every predicate evaluated by the UDF halves the number
  // of output rows. At this point only predicates that are evaluated
  // by the UDF are left in the list.
  const int numUdfPredicates = info.getNumPredicates();
  const double selectivity = pow(2, -numUdfPredicates);

  const long estimatedResultRows =
    static_cast<long>(info.in().getEstimatedNumRows() * selectivity);

  info.out().setEstimatedNumRows(estimatedResultRows);
}
// Compile-time method that defines the output columns and the formal
// parameters of the TIMESERIES UDF.
//
// Output layout, in this order:
//   - one passthru column for every PARTITION BY column of the input
//   - the time column, named by scalar parameter 0
//   - one column per aggregate, named <parameter>_<suffix>
void TimeSeries::describeParamsAndColumns(UDRInvocationInfo &info)
{
  InternalColumns internalCols(info);

  // PARTITION BY output columns: each column that appears in the
  // PARTITION BY clause is passed through as a single-column range
  const PartitionInfo &partitioning = info.in().getQueryPartitioning();
  const int partColCount = partitioning.getNumEntries();

  for (int partIdx = 0; partIdx < partColCount; ++partIdx)
    {
      const int inputColNum = partitioning.getColumnNum(partIdx);

      info.addPassThruColumns(0, inputColNum, inputColNum);
    }

  // all the work happens locally within a partition, therefore this
  // TMUDF is declared to be a REDUCER
  info.setFuncType(UDRInvocationInfo::REDUCER);

  // The time column gets the type of the ORDER BY column that supplies
  // the input time value; its name comes from parameter 0.
  const int timeInputColNum = internalCols.getTimeSliceInColNum();
  const TypeInfo &timeType = info.in().getColumn(timeInputColNum).getType();

  info.out().addColumn(
    ColumnInfo(info.par().getString(0).c_str(), timeType));

  // aggregate columns
  const int aggrColCount = internalCols.getNumAggrCols();

  for (int aggrIdx = 0; aggrIdx < aggrColCount; ++aggrIdx)
    {
      TimeSeriesAggregate aggr = internalCols.getAggrColumn(aggrIdx);
      std::string resultColName(info.par().getString(2 * aggrIdx + 2));
      TypeInfo resultType(
        info.in().getColumn(aggr.getInputColNum()).getType());
      const bool constInterpolation = aggr.isConstInterpol();

      // The output name is the input column name plus a suffix that
      // encodes the instructions. The suffix is all capitals, to avoid
      // delimited identifiers.
      resultColName += "_";
      resultColName += (aggr.isFirstVal() ? "F" : "L");
      resultColName += (constInterpolation ? "C" : "L");
      if (aggr.isIgnoreNulls())
        resultColName += "I";

      if (!constInterpolation)
        {
          // use a "DOUBLE" output column to allow interpolation
          info.out().addColumn(
            ColumnInfo(resultColName.c_str(),
                       TypeInfo(TypeInfo::DOUBLE_PRECISION, 0, true)));
        }
      else
        {
          // keep the data type of the original column, but make it
          // nullable if it isn't already
          resultType.setNullable(true);
          info.out().addColumn(ColumnInfo(resultColName.c_str(), resultType));
        }
    }

  // declare one formal parameter PAR_<n> per actual parameter, with a
  // type that matches the actual one
  int paramIdx = 0;

  while (paramIdx < info.par().getNumColumns())
    {
      char formalName[20];

      snprintf(formalName, sizeof(formalName), "PAR_%d", paramIdx);
      info.addFormalParameter(
        ColumnInfo(formalName, info.par().getColumn(paramIdx).getType()));
      ++paramIdx;
    }
}
// Builds the internal description of a TIMESERIES UDF invocation: the
// ORDER BY (time) column, the number of PARTITION BY columns, the time
// slice width and one TimeSeriesAggregate per (column name,
// instructions) parameter pair.
//
// Expected scalar parameters:
//   0        name of the output time column (character constant)
//   1        width of a time slice (day-second interval constant)
//   2, 3 ... pairs of (input column name, aggregate instructions)
//
// Aggregate instructions consist of 2 or 3 characters:
//   1st char: F or L  - first or last value
//   2nd char: C or L  - constant or linear interpolation
//   3rd char: I       - optional, ignore nulls
//
// The invocation is validated only in the initial compile-time call
// (COMPILER_INITIAL_CALL); violations are reported as UDRException with
// codes 38020 through 38130. Code 38010 (not exactly one table-valued
// input) is checked in every call phase.
InternalColumns::InternalColumns(const UDRInvocationInfo &info) :
     info_(info)
{
  // expect a single table-valued input
  if (info.getNumTableInputs() != 1)
    throw UDRException(
         38010,
         "TIMESERIES UDF: Expecting one table-valued input");

  const OrderInfo &ord = info.in().getQueryOrdering();
  const PartitionInfo &part = info.in().getQueryPartitioning();

  // the basic tests below run only in the first call at compile time
  const bool isInitialCompileCall =
    (info.getCallPhase() == UDRInvocationInfo::COMPILER_INITIAL_CALL);

  if (isInitialCompileCall)
    {
      // expect an order by on a time or timestamp expression
      if (ord.getNumEntries() != 1 ||
          ord.getOrderType(0) == OrderInfo::DESCENDING)
        throw UDRException(
             38020,
             "TIMESERIES UDF: Must use ORDER BY with one column for its input table and the order must be ascending");

      TypeInfo::SQLTypeCode typeCode =
        info.in().getColumn(ord.getColumnNum(0)).getType().getSQLType();

      if (typeCode != TypeInfo::TIME &&
          typeCode != TypeInfo::TIMESTAMP)
        throw UDRException(
             38030,
             "TIMESERIES UDF: Must use ORDER BY a TIME or TIMESTAMP column for the input table");

      // we need at least two parameters, time column name and width
      // of time slice
      if (info.par().getNumColumns() < 2)
        throw UDRException(
             38040,
             "TIMESERIES UDF: UDF needs to be called with at least 2 scalar parameters");

      // input parameter 0 (defined in the DDL) is the
      // name of the column containing the time values
      if (!info.par().isAvailable(0) ||
          info.par().getColumn(0).getType().getSQLTypeClass() !=
          TypeInfo::CHARACTER_TYPE)
        throw UDRException(
             38050,
             "TIMESERIES UDF: Expecting a character constant (timestamp alias) as first parameter");

      // check type and value of the time slice width, specified
      // as parameter 1
      if (!info.par().isAvailable(1))
        throw UDRException(
             38060,
             "TIMESERIES UDF: Expecting a constant for the time slice width as second parameter");

      // time slice width must be a day-second interval
      if (info.par().getColumn(1).getType().getSQLTypeSubClass() !=
          TypeInfo::DAY_SECOND_INTERVAL_TYPE)
        throw UDRException(
             38070,
             "TIMESERIES UDF: Second scalar parameter for time slice width must be an interval constant in the day to second range");

      // make sure parameters come in pairs
      if (info.par().getNumColumns() % 2 != 0)
        throw UDRException(
             38080,
             "TIMESERIES UDF: Parameters need to be specified in pairs of column name and instructions");

      // make sure all parameters are specified at compile time
      for (int p=2; p<info.par().getNumColumns(); p++)
        if (!info.par().isAvailable(p))
          throw UDRException(
               38090,
               "TIMESERIES UDF: All parameters must be specified as literals");
    } // initial compile-time checks

  tsInColNum_     = ord.getColumnNum(0);
  numTSCols_      = 1; // always a single timestamp column for now
  numPartCols_    = part.getNumEntries();
  timeSliceWidth_ = info.par().getTime(1);

  // initialize vectors: one (empty, NULL) key slot per partitioning column
  for (int p=0; p<numPartCols_; p++)
    {
      currPartKey_.push_back("");
      currPartKeyNulls_.push_back(true);
    }

  // Walk the (column name, instructions) pairs that follow the two fixed
  // parameters: ip indexes the column name, ip+1 the instructions.
  for (int ip = 2; ip < info.par().getNumColumns(); ip += 2)
    {
      const std::string colName = info.par().getString(ip);
      const std::string instr   = info.par().getString(ip+1);

      // some checks done only during the first compile time call
      if (isInitialCompileCall)
        {
          if (instr.size() < 2 || instr.size() > 3)
            throw UDRException(
                 38100,
                 "TIMESERIES UDF: Expecting instructions with 2 or 3 characters: %s",
                 instr.c_str());

          // validate first character of instructions
          switch (instr[0])
            {
            case 'f':
            case 'F':
            case 'l':
            case 'L':
              break;

            default:
              // ip+2 is the 1-based number of the instructions parameter
              throw UDRException(
                   38110,
                   "TIMESERIES UDF: Parameter %d should start with F or L for first or last value",
                   ip+2);
            }

          // validate second character of instructions
          switch (instr[1])
            {
            case 'c':
            case 'C':
            case 'l':
            case 'L':
              break;

            default:
              throw UDRException(
                   38120,
                   "TIMESERIES UDF: Parameter %d should have C or L as its second character, for constant or linear interpolation",
                   ip+2);
            }

          if (instr.size() == 3 &&
              instr[2] != 'i' && instr[2] != 'I')
            throw UDRException(
                 38130,
                 "TIMESERIES UDF: Unexpected trailing characters in aggregate instructions: %s",
                 instr.c_str());
        } // compile-time checks

      // decode the instructions; anything that is not F/f or C/c has
      // been validated above to be L/l
      const bool isFirstVal      = (instr[0] == 'F' || instr[0] == 'f');
      const bool isConstInterpol = (instr[1] == 'C' || instr[1] == 'c');
      const bool isIgnoreNulls   = (instr.size() > 2);

      // the new aggregate takes the output column right after those of
      // the aggregates that were added before it
      columns_.push_back(new TimeSeriesAggregate(
                              info.in(),
                              info.out(),
                              info.in().getColNum(colName),
                              getFirstAggrCol() + columns_.size(),
                              isFirstVal,
                              isConstInterpol,
                              isIgnoreNulls));
    }
}
// Compile-time method of the sessionize UDF. It validates the scalar
// parameters, requests PARTITION BY on the id column and ORDER BY on the
// timestamp column for the input table, and defines the output columns.
//
// Scalar parameters checked here:
//   0  name of the id column (must be available at compile time)
//   1  name of an exact numeric timestamp column (must be available at
//      compile time)
//
// Throws UDRException with code 38000, 38001, 38002, 38003 or 38900 when
// the invocation does not meet these requirements.
void Sessionize::describeParamsAndColumns(UDRInvocationInfo &info)
{
  // Part one: validate the parameters and set the PARTITION BY and
  // ORDER BY columns
  int idCol = -1;
  int tsCol = -1;

  // Exactly one table-valued input is required, anything else is a
  // compile error
  if (info.getNumTableInputs() != 1)
    throw UDRException(38000,
                       "%s must be called with one table-valued input",
                       info.getUDRName().data());

  // The first parameter names the id column, which may be of any type
  if (!info.par().isAvailable(0))
    throw UDRException(38001,"First scalar parameter must be a string constant");

  {
    const PartitionInfo &requestedPartitioning =
      info.in().getQueryPartitioning();
    PartitionInfo idPartitioning;

    // Raises an error if the column named in the first parameter
    // doesn't exist
    idCol = info.in().getColNum(info.par().getString(0));

    // A PARTITION BY clause in the query is acceptable only if it
    // consists of just the id column
    if (requestedPartitioning.getType() == PartitionInfo::PARTITION)
      if (requestedPartitioning.getNumEntries() != 1 ||
          requestedPartitioning.getColumnNum(0) != idCol)
        throw UDRException(38001,
                           "Query PARTITION BY not compatible with id column %s",
                           info.par().getString(0).c_str());

    // Require the input to be partitioned by the id column
    idPartitioning.setType(PartitionInfo::PARTITION);
    idPartitioning.addEntry(idCol);
    info.setChildPartitioning(0, idPartitioning);
  }

  // The second parameter names an existing input column of an exact
  // numeric type
  if (!info.par().isAvailable(1))
    throw UDRException(38003,"Second scalar parameter must be a string constant");

  {
    // Raises an error if the column named in the second parameter
    // doesn't exist
    tsCol = info.in().getColNum(info.par().getString(1));

    const TypeInfo &tsType = info.in().getColumn(tsCol).getType();
    const OrderInfo &requestedOrdering = info.in().getQueryOrdering();
    OrderInfo tsOrdering;

    if (tsType.getSQLTypeSubClass() != TypeInfo::EXACT_NUMERIC_TYPE)
      throw UDRException(38002,
                         "Second parameter must be the name of an exact numeric column");

    // An ORDER BY in the query must lead with the timestamp column and
    // must not be descending
    if (requestedOrdering.getNumEntries() > 0)
      if (requestedOrdering.getColumnNum(0) != tsCol ||
          requestedOrdering.getOrderType(0) == OrderInfo::DESCENDING)
        throw UDRException(
             38900,
             "Query ORDER BY conflicts with specified timestamp column %s",
             info.par().getString(1).c_str());

    // Require the input to be ordered by just the timestamp column
    tsOrdering.addEntry(tsCol);
    info.setChildOrdering(0, tsOrdering);
  }

  // To demonstrate state that gets passed between compiler phases and
  // to avoid looking up the id column and timestamp column each time,
  // store those as UDR Writer data in the UDRInvocationInfo object
  /* TBD: uncomment when this is allowed
  info.setUDRWriterCompileTimeData(new InternalColumns(idCol, tsCol));
  */

  // Part two: define the output columns

  // The session id and the sequence number come first
  // (sequence_no is a unique sequence number within the session)
  info.out().addLongColumn("SESSION_ID");  // column number 0
  info.out().addLongColumn("SEQUENCE_NO"); // column number 1

  // All input table columns become output columns as well, so-called
  // "pass-through" columns. Called with its default parameters, this
  // method adds every column of the first input table.
  info.addPassThruColumns();

  // Sessionize behaves like a reducer in MapReduce: session ids are
  // local within rows that share the same id column value.
  info.setFuncType(UDRInvocationInfo::REDUCER);
}
// Run-time method of the sessionize UDF.
//
// Scalar parameters:
//   0  name of the user id column of the input table
//   1  name of the timestamp column of the input table
//   2  session timeout, in the unit of the timestamp column
//
// For every input row the method computes a session id and a sequence
// number within that session. A new session starts when the user id
// changes or when the gap to the previous timestamp of the same user
// exceeds the timeout. Rows are emitted with SESSION_ID in output
// column 0, SEQUENCE_NO in output column 1 and the passthru columns
// after that.
//
// Predicates still attached to the invocation are expected to have the
// form SESSION_ID < const (see describeDataflowAndPredicates()). They
// are evaluated here, using the smallest constant as the bound.
//
// Throws UDRException 38001 when the timestamps of a user go backwards.
void Sessionize::processData(UDRInvocationInfo &info,
                             UDRPlanInfo &plan)
{
  // read the three parameters and convert the first two into column numbers
  const int userIdColNum    = info.in(0).getColNum(info.par().getString(0));
  const int timeStampColNum = info.in(0).getColNum(info.par().getString(1));
  const long timeout        = info.par().getLong(2);

  // variables needed for computing the session id
  long lastTimeStamp = 0;
  std::string lastUserId;
  long currSessionId = 1;
  long currSequenceNo = 1;

  // exclusive upper bound for emitted session ids; this default applies
  // when no usable predicate is present
  long maxSessionId = 999999999;
  bool haveSessionIdBound = false;

  // Based on the describeDataflowAndPredicates() method, every predicate
  // left for the UDF has the form SESSION_ID < const. Honour all of
  // them by keeping the tightest bound, not just the first one.
  for (int p = 0; p < info.getNumPredicates(); p++)
    {
      if (!info.isAComparisonPredicate(p))
        continue;

      const std::string boundText =
        info.getComparisonPredicate(p).getConstValue();
      long bound = 0;

      // parse as long so that large constants are not squeezed into an
      // int; text that does not parse leaves the bound unchanged
      if (sscanf(boundText.c_str(), "%ld", &bound) != 1)
        continue;

      if (!haveSessionIdBound || bound < maxSessionId)
        {
          maxSessionId = bound;
          haveSessionIdBound = true;
        }
    }

  // loop over input rows
  while (getNextRow(info))
    {
      const long timeStamp = info.in(0).getLong(timeStampColNum);
      const std::string userId = info.in(0).getString(userIdColNum);

      if (lastUserId != userId)
        {
          // new user: reset timestamp check and start over with
          // session id 1 and sequence number 1
          lastTimeStamp = 0;
          currSessionId = 1;
          currSequenceNo = 1;
          lastUserId = userId;
        }

      const long tsDiff = timeStamp - lastTimeStamp;

      if (tsDiff > timeout && lastTimeStamp > 0)
        {
          // gap exceeds the timeout: begin the next session
          currSessionId++;
          currSequenceNo = 1;
        }
      else if (tsDiff < 0)
        throw UDRException(
             38001,
             "Got negative or descending timestamps %ld, %ld",
             lastTimeStamp, timeStamp);

      lastTimeStamp = timeStamp;

      // this evaluates the SQL predicate on SESSION_ID
      if (currSessionId < maxSessionId)
        {
          // produce session_id and sequence_no output columns
          info.out().setLong(0, currSessionId);
          info.out().setLong(1, currSequenceNo);

          // produce the remaining columns and emit the row
          info.copyPassThruData();
          emitRow(info);
          currSequenceNo++;
        }
    }
}