double SymmetricUncertaintyCalculator::_calculateEntropy(const DataFrame& df, int factorIndex)
  {
    typedef HashMap<int, int> ClassCounts;
    ClassCounts cc;
    
    for(unsigned int i = 0; i < df.getNumDataVectors(); i++)
    {
      double v = df.getDataElement(i, factorIndex);
      // null values are not supported Use the DataFrameDiscretizer to "fix" nulls
      if (DataFrame::isNull(v) == true)
      {
        throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator");
      }
      cc[(int)(v + .5)]++;
    }

    double sum = 0.0;
    double totalSize = df.getNumDataVectors();
    for (ClassCounts::const_iterator classIt = cc.begin(); classIt != cc.end(); classIt++)
    {
      double count = classIt->second;
      sum += count / totalSize * log2(count / totalSize);
    }

    return -sum;
  }
// Example no. 2
// 0
  /**
   * Accumulates proximity counts for this tree. Every data vector is run through
   * classifyDataVector() and each pair (j, k) with k >= j whose returned node ids match has its
   * proximity entry incremented by one.
   *
   * @param data the data vectors to classify
   * @param proximity row-major dSize x dSize matrix of counts, where dSize is
   *   data.getNumDataVectors(). Only the entries with column >= row (diagonal included) are
   *   touched. Existing counts are kept and added to; if the vector is too small to hold the
   *   matrix it is grown and the new entries start at zero.
   */
  void RandomTree::findProximity(DataFrame & data, std::vector<unsigned int> & proximity)
  {
    typedef std::vector<unsigned int>::size_type Index;

    const unsigned int dSize = data.getNumDataVectors();

    // Widen before multiplying so the matrix size/index cannot wrap in unsigned int arithmetic.
    const Index matrixSize = (Index)dSize * dSize;
    if (proximity.size() < matrixSize)
    {
      proximity.resize(matrixSize, 0);
    }

    // Find out which node each vector is classified as
    std::vector<unsigned int> nodeIds(dSize);
    for (unsigned int i = 0; i < dSize; i++)
    {
      std::string resultClass;
      nodeIds[i] = classifyDataVector(data.getDataVector(i), resultClass);
    }

    for (unsigned int j = 0; j < dSize; j++)
    {
      const unsigned int rowNodeId = nodeIds[j];
      const Index rowStart = (Index)j * dSize;

      // k walks the rest of row j; it must be k (not j) that advances here.
      for (unsigned int k = j; k < dSize; k++)
      {
        if (nodeIds[k] == rowNodeId)
        {
          proximity[rowStart + k] += 1;
        }
      }
    }
  }
  double SymmetricUncertaintyCalculator::_calculateConditionalEntropy(const DataFrame& dfY, 
    int factorIndexY, const DataFrame& dfX, int factorIndexX)
  {
    CondClassCounts ccc;
    ClassCounts cc;

    for(unsigned int i = 0; i < dfX.getNumDataVectors(); i++)
    {
      double vx = dfX.getDataElement(i, factorIndexX);
      if (DataFrame::isNull(vx) == true)
      {
        throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator");
      }
      int ex = (int)(vx + 0.5); // x enumeration

      double vy = dfY.getDataElement(i, factorIndexY);
      if (DataFrame::isNull(vy) == true)
      {
        throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator");
      }      
      int ey = (int)(vy + 0.5); // y enumeration

      ccc[ex][ey]++;
      cc[ex]++;
    }

    double sumX = 0.0;
    double totalSize = dfX.getNumDataVectors();
    for (CondClassCounts::const_iterator condIt = ccc.begin(); condIt != ccc.end(); condIt++)
    {
      const ClassCounts& classCounts = condIt->second;
      double px = (double)cc[condIt->first] / totalSize; // p(x)
      double sumY = 0.0;
      for (ClassCounts::const_iterator classIt = classCounts.begin(); 
        classIt != classCounts.end(); classIt++)
      {
        double count = classIt->second;
        double pyx = count / (double)cc[condIt->first]; // p(y | x)
        sumY += pyx * log2(pyx);
      }
      sumX += px * sumY;
    }

    return -sumX;
  }
// Example no. 4
// 0
  /**
   * Builds the tree from every data vector in data: records numFactors in _factPerNode, creates
   * a new non-pure root node without children and hands the full list of vector indices to
   * _build() with a node size of 1.
   *
   * @param data the data vectors to build the tree from
   * @param numFactors value stored in _factPerNode
   */
  void RandomTree::buildTest(DataFrame & data, unsigned int numFactors)
  {
    _factPerNode = numFactors;

    // Identity index list: entry n refers to data vector n.
    std::vector<unsigned int> vectorIds(data.getNumDataVectors());
    for (unsigned int pos = 0; pos < vectorIds.size(); ++pos)
    {
      vectorIds[pos] = pos;
    }

    // Start from a fresh root that has no children and is not yet pure.
    _root = new TreeNode();
    _root->isPure = false;
    _root->leftChild = NULL;
    _root->rightChild = NULL;

    // Kept as a named local (not a literal) so it can be passed however _build() takes it.
    unsigned int unitNodeSize = 1;
    _build(data, vectorIds, _root, unitNodeSize);
  }
  /**
   * Computes the principal components of the given data frame and stores them in _components.
   *
   * The data are centered on the per-factor mean, the covariance matrix is formed as
   * (1 / N) * deviates * deviates^T (N = number of data vectors), and its eigen decomposition
   * is obtained with Jacobi::jacobi(). After _sortEigens() has run, _components[v] holds
   * column v of the eigenvector matrix.
   *
   * @param df the data to analyze; every factor must be numeric and every value non-null
   * @throws Tgs::Exception if a factor is nominal, if a value is null, or if the matrix
   *   computation raises something that is not derived from std::exception
   */
  void PrincipalComponentsAnalysis::compute(DataFrame& df)
  {
    const unsigned int numFactors = df.getNumFactors();
    const unsigned int numVectors = df.getNumDataVectors();

    if (numFactors > 2)
    {
      // see PrincipalComponentsAnalysisTest
      cout << "You realize this hasn't been tested, right?" << endl;
    }

    // One column per data vector, one row per factor.
    Matrix dataMat(numFactors, numVectors);
    Matrix deviates(numFactors, numVectors);
    SymmetricMatrix covar(numFactors);
    DiagonalMatrix eigenValues(numFactors);
    Matrix eigenVectors;
    ColumnVector means(numFactors);
    means = 0.0;
    // Row of ones; (means * h) repeats the mean column once per data vector.
    RowVector h(numVectors);
    h = 1.0;

    for (unsigned int j = 0; j < numFactors; j++)
    {
      if (df.isNominal(j))
      {
        throw Tgs::Exception("Only numeric values are supported.");
      }
    }

    // NOTE(review): with zero data vectors the 1 / N factors below divide by zero -- confirm
    // callers never pass an empty data frame.
    const double vectorCount = (double)numVectors;

    for (unsigned int i = 0; i < numVectors; i++)
    {
      for (unsigned int j = 0; j < numFactors; j++)
      {
        const double v = df.getDataElement(i, j);
        if (df.isNull(v))
        {
          throw Tgs::Exception("Only non-null values are supported.");
        }
        dataMat.element(j, i) = v;
        // Accumulate the mean incrementally.
        means.element(j) += v / vectorCount;
      }
    }

    try
    {
      deviates = dataMat - (means * h);
      // Scale in double precision; a float cast cannot represent counts above 2^24 exactly.
      covar << (1.0 / vectorCount) * (deviates * deviates.t());
      Jacobi::jacobi(covar, eigenValues, eigenVectors);
    }
    catch (const std::exception&)
    {
      // Standard exceptions propagate unchanged.
      throw;
    }
    catch (...)
    {
      // Anything else is reported as a Tgs::Exception.
      throw Tgs::Exception("Unknown error while calculating PCA");
    }

    _sortEigens(eigenVectors, eigenValues);

    // Copy column v of the eigenvector matrix into _components[v].
    _components.resize(numFactors);
    for (unsigned int v = 0; v < numFactors; v++)
    {
      _components[v].resize(numFactors);
      for (unsigned int d = 0; d < numFactors; d++)
      {
        _components[v][d] = eigenVectors.element(d, v);
      }
    }
  }