示例#1
0
void Sessionize::describeDataflowAndPredicates(UDRInvocationInfo &info)
{
  // Let the base class do its default work first (for a reducer this
  // pushes down predicates on the key/id column).
  UDR::describeDataflowAndPredicates(info);

  // Stop requesting passthru columns of the input table that nobody
  // consumes. NOTE: this may renumber the id and timestamp columns,
  // which is why they are looked up again right below.
  info.setUnusedPassthruColumns();

  // The call above may also have flagged the id or timestamp column
  // as unused. Resolve both by name (parameters 0 and 1) and force
  // them back to USED.
  InternalColumns keyCols(
         info.in().getColNum(info.par().getString(0)),
         info.in().getColNum(info.par().getString(1)));

  info.setChildColumnUsage(0, keyCols.getIdColumn(), ColumnInfo::USED);
  info.setChildColumnUsage(0, keyCols.getTsColumn(), ColumnInfo::USED);

  // Output columns 0 and 1 are the ones generated by this UDF
  // (session_id and sequence_no); true when the query uses neither.
  const bool noGeneratedColUsed =
    (info.out().getColumn(0).getUsage() != ColumnInfo::USED &&
     info.out().getColumn(1).getUsage() != ColumnInfo::USED);

  // Decide, predicate by predicate, where it gets evaluated
  for (int predIdx = 0; predIdx < info.getNumPredicates(); ++predIdx)
    {
      if (noGeneratedColUsed)
        {
          // Neither generated column is referenced by the query, so
          // every predicate can be handed to the child.
          info.setPredicateEvaluationCode(predIdx,
                                          PredicateInfo::EVALUATE_IN_CHILD);
          continue;
        }

      if (!info.isAComparisonPredicate(predIdx))
        continue;

      // For demo purposes only, predicates of the form
      // "session_id < const" are evaluated inside the UDF.
      const ComparisonPredicateInfo &cmp = info.getComparisonPredicate(predIdx);
      const bool isSessionIdUpperBound =
        (cmp.getColumnNumber() == 0 /* SESSION_ID */ &&
         cmp.getOperator() == PredicateInfo::LESS &&
         cmp.hasAConstantValue());

      if (isSessionIdUpperBound)
        info.setPredicateEvaluationCode(predIdx,
                                        PredicateInfo::EVALUATE_IN_UDF);
    }
}
void Sessionize::processData(UDRInvocationInfo &info,
                             UDRPlanInfo &plan)
{
  // Placeholder implementation: the test using it does not look at
  // the generated values, so every input row yields the same
  // constant output row.
  for (bool haveRow = getNextRow(info); haveRow; haveRow = getNextRow(info))
  {
    info.out().setString(0, "userid");
    info.out().setLong(1, 999);
    info.out().setLong(2, 9999);

    emitRow(info);
  }
}
示例#3
0
void Sessionize::describeStatistics(UDRInvocationInfo &info)
{
  // The function type was set to REDUCER earlier. For a reducer the
  // Trafodion compiler assumes one output row per input partition
  // unless the UDF says otherwise. Sessionize returns one output row
  // per input row, so give the optimizer a better cardinality
  // estimate here.

  // Crude model: every predicate evaluated by the UDF halves the
  // number of output rows. At this point only predicates that the
  // UDF evaluates itself remain in the list.
  const int numUdfPredicates = info.getNumPredicates();
  const double selectivity = pow(2, -numUdfPredicates);

  info.out().setEstimatedNumRows(
       static_cast<long>(info.in().getEstimatedNumRows() * selectivity));
}
示例#4
0
void Sessionize::describeConstraints(UDRInvocationInfo &info)
{
  // Sessionize emits at most one result row per input row it reads,
  // so certain constraints on its input table carry over to the
  // result.
  info.propagateConstraintsFor1To1UDFs(false);

  // (id column, session id, sequence number) together form a unique
  // key of the output; declare that as a uniqueness constraint.
  // The id column is located by the name given in parameter 0.
  const int idOutColNum = info.out().getColNum(info.par().getString(0));

  UniqueConstraintInfo sessionKey;

  sessionKey.addColumn(idOutColNum);
  sessionKey.addColumn(0); // session id is always output column #0
  sessionKey.addColumn(1); // sequence number is always output column #1
  info.out().addUniquenessConstraint(sessionKey);
}
示例#5
0
// Emits Fibonacci numbers as rows of (int ordinal, long fibonacci_number).
//
// Input parameters: (int startRow, int numResultRows). Rows are
// produced for the ordinals in [startRow, startRow + numResultRows);
// ordinals start at 0 with the Fibonacci number 0. Nothing is emitted
// when that range contains no ordinal >= 0, e.g. when numResultRows
// is zero or negative.
//
// Throws UDRException(38001) when a further number has to be computed
// while the current one already exceeds half of the maximum long value.
void FibonacciUDF::processData(UDRInvocationInfo &info,
                               UDRPlanInfo &plan)
{
  // input parameters: (int startRow, int numResultRows)
  int startRow = info.par().getInt(0);
  int numResultRows = info.par().getInt(1);

  // One past the last ordinal to emit. Computed in a wider type so
  // that the sum of two ints cannot overflow.
  const long long endOrdinal =
    static_cast<long long>(startRow) + numResultRows;

  // Nothing was requested (or the requested range lies entirely
  // below ordinal 0): return without emitting a row. Without this
  // check the loop below would always run once and could emit
  // ordinal 0 or raise the upper-limit error for an empty request.
  if (numResultRows <= 0 || endOrdinal <= 0)
    return;

  long fibonacciNumber = 0;
  long previousResult = 1;
  int ordinal = 0;

  // produce fibonacci numbers and emit rows
  // ---------------------------------------
  while (1)
    {
      if (ordinal >= startRow)
        {
          // set result parameters (int ordinal, long fibonacci_number)
          info.out().setInt(0, ordinal);
          info.out().setLong(1, fibonacciNumber);
          emitRow(info);
        }

      // did we produce numResultRows already?
      if (++ordinal >= endOrdinal)
        break;

      // conservative guard against signed overflow in the addition below
      if (fibonacciNumber > std::numeric_limits<long>::max()/2)
        throw UDRException(38001, "Upper limit exceeded");

      // pre-compute the next row
      const long temp = fibonacciNumber;
      fibonacciNumber += previousResult;
      previousResult = temp;
    }
}
void TimeSeries::describeParamsAndColumns(UDRInvocationInfo &info)
{
  // Building this object also validates the invocation; it throws
  // a UDRException if the call is not acceptable.
  InternalColumns cols(info);

  // Every PARTITION BY column of the input becomes a passthru
  // output column.
  const PartitionInfo &partitioning = info.in().getQueryPartitioning();
  const int partColCount = partitioning.getNumEntries();

  for (int i = 0; i < partColCount; ++i)
    {
      const int inColNum = partitioning.getColumnNum(i);

      info.addPassThruColumns(0, inColNum, inColNum);
    }

  // All work is local to a partition, so this TMUDF is a REDUCER
  info.setFuncType(UDRInvocationInfo::REDUCER);

  // The time output column is named after parameter 0 and takes the
  // type of the input column that supplies the time value.
  const TypeInfo &timeColType =
    info.in().getColumn(cols.getTimeSliceInColNum()).getType();

  info.out().addColumn(ColumnInfo(info.par().getString(0).c_str(),
                                  timeColType));

  // One output column per aggregate
  for (int aggrIdx = 0; aggrIdx < cols.getNumAggrCols(); ++aggrIdx)
    {
      TimeSeriesAggregate aggr = cols.getAggrColumn(aggrIdx);
      std::string outName(info.par().getString(2*aggrIdx + 2));
      TypeInfo valueType(
           info.in().getColumn(aggr.getInputColNum()).getType());
      const bool constInterpol = aggr.isConstInterpol();

      // Output name = column name parameter + "_" + F|L (first/last
      // value) + C|L (constant/linear interpolation) + optional I
      // (ignore nulls). Upper case avoids delimited identifiers.
      outName += "_";
      outName += (aggr.isFirstVal() ? "F" : "L");
      outName += (constInterpol ? "C" : "L");
      if (aggr.isIgnoreNulls())
        outName += "I";

      if (constInterpol)
        {
          // constant interpolation keeps the input column's type,
          // made nullable if it is not already
          valueType.setNullable(true);
          info.out().addColumn(ColumnInfo(outName.c_str(), valueType));
        }
      else
        {
          // linear interpolation needs a "DOUBLE" output column
          info.out().addColumn(ColumnInfo(
                                    outName.c_str(),
                                    TypeInfo(TypeInfo::DOUBLE_PRECISION,
                                             0,
                                             true)));
        }
    }

  // Declare one formal parameter PAR_<n> per actual parameter, each
  // with the type of the actual one.
  for (int parIdx = 0; parIdx < info.par().getNumColumns(); ++parIdx)
    {
      char formalName[20];

      snprintf(formalName, sizeof(formalName), "PAR_%d", parIdx);
      info.addFormalParameter(ColumnInfo(formalName,
                                         info.par().getColumn(parIdx).getType()));
    }
}
// Builds the internal column description of a TIMESERIES UDF
// invocation from its table-valued input and scalar parameters.
//
// Expected parameters:
//   0        name of the time output column (character constant)
//   1        width of the time slice (day-second interval constant)
//   2, 3, .. pairs of (input column name, aggregate instructions)
//
// Aggregate instructions are 2 or 3 characters: F|L (first or last
// value), C|L (constant or linear interpolation) and an optional
// trailing I (ignore nulls); case does not matter.
//
// Most validation happens only when the call phase is
// COMPILER_INITIAL_CALL. Throws UDRException with codes 38010 through
// 38130 when the invocation is not acceptable.
InternalColumns::InternalColumns(const UDRInvocationInfo &info) :
     info_(info)
{
  // expect a single table-valued input
  // (this check runs in every call phase, not just the initial one)
  if (info.getNumTableInputs() != 1)
    throw UDRException(
         38010,
         "TIMESERIES UDF: Expecting one table-valued input");

  const OrderInfo &ord = info.in().getQueryOrdering();
  const PartitionInfo &part = info.in().getQueryPartitioning();

  // perform some basic tests in the first call at compile time
  if (info.getCallPhase() == UDRInvocationInfo::COMPILER_INITIAL_CALL)
    {
      // expect an order by on a time or timestamp expression
      if (ord.getNumEntries() != 1 ||
          ord.getOrderType(0) == OrderInfo::DESCENDING)
        throw UDRException(
             38020,
             "TIMESERIES UDF: Must use ORDER BY with one column for its input table and the order must be ascending");
      TypeInfo::SQLTypeCode typeCode =
        info.in().getColumn(ord.getColumnNum(0)).getType().getSQLType();
      if (typeCode != TypeInfo::TIME &&
          typeCode != TypeInfo::TIMESTAMP)
        throw UDRException(
             38030,
             "TIMESERIES UDF: Must use ORDER BY a TIME or TIMESTAMP column for the input table");

      // we need at least two parameters, time column name and width
      // of time slice
      if (info.par().getNumColumns() < 2)
        throw UDRException(
             38040,
             "TIMESERIES UDF: UDF needs to be called with at least 2 scalar parameters");

      // input parameter 0 (defined in the DDL) is the
      // name of the column containing the time values
      if (!info.par().isAvailable(0) ||
          info.par().getColumn(0).getType().getSQLTypeClass() !=
          TypeInfo::CHARACTER_TYPE)
        throw UDRException(
             38050,
             "TIMESERIES UDF: Expecting a character constant (timestamp alias) as first parameter");

      // check type and value of the time slice width, specified
      // as parameter 1
      if (!info.par().isAvailable(1))
        throw UDRException(
             38060,
             "TIMESERIES UDF: Expecting a constant for the time slice width as second parameter");

      // time slice width must be a day-second interval
      if (info.par().getColumn(1).getType().getSQLTypeSubClass() !=
          TypeInfo::DAY_SECOND_INTERVAL_TYPE)
        throw UDRException(
             38070,
             "TIMESERIES UDF: Second scalar parameter for time slice width must be an interval constant in the day to second range");

      // make sure parameters come in pairs
      // (parameters 0 and 1 count as the first pair, so the total
      // number of parameters must be even)
      if (info.par().getNumColumns() % 2 != 0)
        throw UDRException(
             38080,
             "TIMESERIES UDF: Parameters need to be specified in pairs of column name and instructions");

      // make sure all parameters are specified at compile time
      for (int p=2; p<info.par().getNumColumns(); p++)
        if (!info.par().isAvailable(p))
          throw UDRException(
               38090,
               "TIMESERIES UDF: All parameters must be specified as literals");
    } // initial compile-time checks

  // the single ORDER BY column of the input supplies the time value
  tsInColNum_     = ord.getColumnNum(0);
  numTSCols_      = 1; // always a single timestamp column for now
  numPartCols_    = part.getNumEntries();

  // width of a time slice, taken from parameter 1
  timeSliceWidth_  = info.par().getTime(1);

  // initialize vectors
  // (one entry per PARTITION BY column: empty key value, flagged as NULL)
  for (int p=0; p<numPartCols_; p++)
    {
      currPartKey_.push_back("");
      currPartKeyNulls_.push_back(true);
    }

  // index of the first (column name, instructions) parameter pair
  int ip = 2;

  // process the remaining parameters two at a time
  while (ip<info.par().getNumColumns())
    {
      // NOTE(review): colName is not used below; the column lookup
      // further down reads parameter ip again instead.
      std::string colName = info.par().getString(ip);
      std::string instr = info.par().getString(ip+1);
      bool isFirstVal;
      bool isConstInterpol;
      bool isIgnoreNulls;

      // some checks done only during the first compile time call
      if (info.getCallPhase() == UDRInvocationInfo::COMPILER_INITIAL_CALL)
        {
          if (instr.size() < 2 || instr.size() > 3)
            throw UDRException(
                 38100,
                 "TIMESERIES UDF: Expecting instructions with 2 or 3 characters: %s",
                 instr.c_str());

          // validate first character of instructions
          // NOTE(review): the instructions are parameter ip+1 when
          // counting from 0; the messages below report ip+2,
          // presumably a 1-based parameter number - confirm.
          switch (instr[0])
            {
            case 'f':
            case 'F':
            case 'l':
            case 'L':
              break;

            default:
              throw UDRException(
                   38110,
                   "TIMESERIES UDF: Parameter %d should start with F or L for first or last value",
                   ip+2);
            }

          // validate second character of instructions
          switch (instr[1])
            {
            case 'c':
            case 'C':
            case 'l':
            case 'L':
              break;

            default:
              throw UDRException(
                   38120,
                   "TIMESERIES UDF: Parameter %d should have C or L as its second character, for constant or linear interpolation",
                   ip+2);
            }

          if (instr.size() == 3 &&
              instr[2] != 'i' &&
              instr[2] != 'I')
            throw UDRException(
                 38130,
                 "TIMESERIES UDF: Unexpected trailing characters in aggregate instructions: %s",
                 instr.c_str());
        } // compile-time checks

      // Decode the instructions. In call phases other than the
      // initial compile-time call, instr is used without the length
      // and character checks above; anything that is not F/f counts
      // as "last value", anything that is not C/c as "linear".
      isFirstVal = (instr[0] == 'F' || instr[0] == 'f');
      isConstInterpol = (instr[1] == 'C' || instr[1] == 'c');
      isIgnoreNulls = (instr.size() > 2);

      // The aggregate reads the input column named by parameter ip;
      // its output column number is the first aggregate column plus
      // the number of aggregates created so far.
      // NOTE(review): columns_ stores raw pointers obtained with new;
      // where they are released is not visible here - confirm.
      columns_.push_back(new TimeSeriesAggregate(
                              info.in(),
                              info.out(),
                              info.in().getColNum(info.par().getString(ip)),
                              getFirstAggrCol() + columns_.size(),
                              isFirstVal,
                              isConstInterpol,
                              isIgnoreNulls));
      ip += 2;
    }
}
示例#8
0
void Sessionize::describeParamsAndColumns(UDRInvocationInfo &info)
{
  // Part one: validate the parameters and set the required
  // PARTITION BY and ORDER BY of the input table.
  int idColNum = -1;
  int tsColNum = -1;

  // Exactly one table-valued input is required, anything else is
  // a compile error.
  if (info.getNumTableInputs() != 1)
    throw UDRException(38000,
                       "%s must be called with one table-valued input",
                       info.getUDRName().data());

  // Parameter 0 must be available at compile time; it names the id
  // column, which may be of any type.
  if (!info.par().isAvailable(0))
    throw UDRException(38001,"First scalar parameter must be a string constant");

  {
    const PartitionInfo &requestedPart = info.in().getQueryPartitioning();
    PartitionInfo requiredPart;

    // getColNum raises an error if no input column has the name
    // given in the first parameter
    idColNum = info.in().getColNum(info.par().getString(0));

    // A PARTITION BY in the query is only acceptable if it consists
    // of the id column alone.
    const bool queryHasPartitionBy =
      (requestedPart.getType() == PartitionInfo::PARTITION);

    if (queryHasPartitionBy &&
        (requestedPart.getNumEntries() != 1 ||
         requestedPart.getColumnNum(0) != idColNum))
      throw UDRException(38001,
                         "Query PARTITION BY not compatible with id column %s",
                         info.par().getString(0).c_str());

    // Require the input to be partitioned by the id column
    requiredPart.setType(PartitionInfo::PARTITION);
    requiredPart.addEntry(idColNum);
    info.setChildPartitioning(0, requiredPart);
  }

  // Parameter 1 must be available at compile time as well; it names
  // an existing input column of an exact numeric type.
  if (!info.par().isAvailable(1))
    throw UDRException(38003,"Second scalar parameter must be a string constant");

  {
    // getColNum raises an error if no input column has the name
    // given in the second parameter
    tsColNum = info.in().getColNum(info.par().getString(1));

    const TypeInfo &tsType = info.in().getColumn(tsColNum).getType();
    const OrderInfo &requestedOrder = info.in().getQueryOrdering();
    OrderInfo requiredOrder;

    if (tsType.getSQLTypeSubClass() != TypeInfo::EXACT_NUMERIC_TYPE)
      throw UDRException(38002, "Second parameter must be the name of an exact numeric column");

    // An ORDER BY in the query must start with the timestamp column
    // and must not be descending.
    if (requestedOrder.getNumEntries() > 0 &&
        (requestedOrder.getColumnNum(0) != tsColNum ||
         requestedOrder.getOrderType(0) == OrderInfo::DESCENDING))
      throw UDRException(
           38900,
           "Query ORDER BY conflicts with specified timestamp column %s",
           info.par().getString(1).c_str());

    // Require the input to be ordered by just the timestamp column
    requiredOrder.addEntry(tsColNum);
    info.setChildOrdering(0, requiredOrder);
  }

  // To demonstrate state that is passed between compiler phases, and
  // to avoid looking up the id and timestamp columns every time,
  // they are meant to be stored as UDR Writer data in the
  // UDRInvocationInfo object.
  /* TBD: uncomment when this is allowed
  info.setUDRWriterCompileTimeData(new InternalColumns(idColNum, tsColNum));
  */

  // Part two: define the output columns.

  // Session id and sequence number come first (sequence_no is a
  // unique sequence number within the session).
  info.out().addLongColumn("SESSION_ID");  // column number 0
  info.out().addLongColumn("SEQUENCE_NO"); // column number 1

  // All input table columns are also output ("pass-through")
  // columns. With its default arguments this method adds all the
  // columns of the first input table.
  info.addPassThruColumns();

  // Sessionize behaves like a reducer in MapReduce: session ids are
  // local to the rows that share the same id column value.
  info.setFuncType(UDRInvocationInfo::REDUCER);
}
示例#9
0
// Assigns a session id and a sequence number to every input row and
// emits it together with the pass-through columns.
//
// Parameters: (0) name of the user id column, (1) name of the
// timestamp column, (2) session timeout. For each user id, session
// ids start at 1; a new session begins when the timestamp gap to the
// previous row of that user exceeds the timeout. Sequence numbers
// start at 1 within each session.
//
// If a predicate was left for the UDF to evaluate, it is taken to be
// "SESSION_ID < const" and rows that do not satisfy it are dropped.
//
// Throws UDRException(38001) when a row's timestamp is smaller than
// the previous one, and UDRException(38002) when the constant of the
// SESSION_ID predicate cannot be read as an integer.
void Sessionize::processData(UDRInvocationInfo &info,
                             UDRPlanInfo &plan)
{
  // read the three parameters and convert the first two into column numbers
  int userIdColNum    = info.in(0).getColNum(info.par().getString(0));
  int timeStampColNum = info.in(0).getColNum(info.par().getString(1));
  long timeout        = info.par().getLong(2);

  // variables needed for computing the session id
  long lastTimeStamp = 0;
  std::string lastUserId;
  long currSessionId = 1;
  long currSequenceNo = 1;
  // exclusive upper bound for emitted session ids; a long, like the
  // session id it is compared with
  long maxSessionId = 999999999;

  if (info.getNumPredicates() > 0)
    {
      // based on the describeDataflowAndPredicates() method, this must be
      // a predicate of the form SESSION_ID < const that we need
      // to evaluate inside this method
      std::string maxValue = info.getComparisonPredicate(0).getConstValue();

      // Fail loudly if the constant cannot be parsed; carrying on
      // with the default bound would silently ignore the predicate.
      if (sscanf(maxValue.c_str(), "%ld", &maxSessionId) != 1)
        throw UDRException(
             38002,
             "Unable to interpret constant %s in SESSION_ID predicate",
             maxValue.c_str());
    }

  // loop over input rows
  while (getNextRow(info))
  {
    long timeStamp = info.in(0).getLong(timeStampColNum);
    std::string userId = info.in(0).getString(userIdColNum);

    if (lastUserId != userId)
      {
        // new user: reset the timestamp check and start over with
        // session id 1 and sequence number 1
        lastTimeStamp = 0;
        currSessionId = 1;
        currSequenceNo = 1;
        lastUserId = userId;
      }

    long tsDiff = timeStamp - lastTimeStamp;

    // lastTimeStamp == 0 marks the first row of a user, which never
    // starts a second session
    if (tsDiff > timeout && lastTimeStamp > 0)
      {
        currSessionId++;
        currSequenceNo = 1;
      }
    else if (tsDiff < 0)
      throw UDRException(
           38001,
           "Got negative or descending timestamps %ld, %ld",
           lastTimeStamp, timeStamp);

    lastTimeStamp = timeStamp;

    // this evaluates the SQL predicate on SESSION_ID
    if (currSessionId < maxSessionId)
      {
        // produce session_id and sequence_no output columns
        info.out().setLong(0, currSessionId);
        info.out().setLong(1, currSequenceNo);

        // produce the remaining columns and emit the row
        info.copyPassThruData();
        emitRow(info);
        currSequenceNo++;
      }
   }
}