/*!
  Python-facing wrapper around InfoEntropyGain().

  \param resArr  a Numeric array object; it is converted to a contiguous
                 array with exactly two dimensions before use. Supported
                 element types are int, long, float and double.

  \return the value returned by InfoEntropyGain() for the array contents,
          called with (data, rows, cols).

  Reports an error through throw_value_error() if the argument is not an
  array, if it cannot be converted to a contiguous 2D array, or if its
  element type is not one of the supported ones.
*/
double infoGain(python::object resArr) {
  PyObject *matObj = resArr.ptr();
  if (!PyArray_Check(matObj)) {
    throw_value_error("Expecting a Numeric array object");
  }
  const int typeNum = ((PyArrayObject *)matObj)->descr->type_num;

  // Ask for a contiguous array with min and max depth of 2. The conversion
  // returns NULL on failure (e.g. the input is not two-dimensional), so it
  // must be checked before anything is read from it.
  PyArrayObject *copy =
      (PyArrayObject *)PyArray_ContiguousFromObject(matObj, typeNum, 2, 2);
  if (!copy) {
    throw_value_error("Could not convert the input to a contiguous 2D array");
  }

  // Take the shape from the converted array: it is the one guaranteed to
  // have two dimensions, whereas the caller's object may have any rank.
  long int rows = (long int)copy->dimensions[0];
  long int cols = (long int)copy->dimensions[1];

  double res = 0.0;
  bool supported = true;
  switch (typeNum) {
    case PyArray_DOUBLE:
      res = InfoEntropyGain((double *)copy->data, rows, cols);
      break;
    case PyArray_FLOAT:
      res = InfoEntropyGain((float *)copy->data, rows, cols);
      break;
    case PyArray_INT:
      res = InfoEntropyGain((int *)copy->data, rows, cols);
      break;
    case PyArray_LONG:
      res = InfoEntropyGain((long int *)copy->data, rows, cols);
      break;
    default:
      supported = false;
      break;
  }

  // Release the temporary array exactly once, *before* reporting an
  // unsupported element type, so the error path does not leak it.
  Py_DECREF(copy);
  if (!supported) {
    throw_value_error(
        "Numeric array object of type int or long or float or double");
  }
  return res;
}
/*!
  Entropy gain for a single bit, gated by the bias check.

  \param resMat  non-null pointer to the count matrix for the bit; it is
                 handed to InfoEntropyGain() as a 2 x d_classes matrix.

  \return 0.0 when BiasCheckBit() rejects the bit, otherwise the value of
          InfoEntropyGain(resMat, 2, d_classes).
*/
double InfoBitRanker::BiasInfoEntropyGain(RDKit::USHORT *resMat) const {
  PRECONDITION(resMat,"bad result pointer");
  // bits that fail the bias check contribute no information
  if (!this->BiasCheckBit(resMat)) {
    return 0.0;
  }
  return InfoEntropyGain(resMat, 2, d_classes);
}
double *InfoBitRanker::getTopN(unsigned int num) { // this is a place holder to pass along to infogain function // the size of this container should nVals*d_classes, where nVals // is the number of values a variable can take. // since we are dealing with a binary bit vector nVals = 2 // in addition the infogain function pretends that this is a 2D matrix // with the number of rows equal to nVals and num of columns equal to // d_classes if(num>d_dims) throw ValueErrorException("attempt to rank more bits than present in the bit vectors"); if(dp_maskBits) CHECK_INVARIANT(num <= dp_maskBits->getNumOnBits(), "Can't rank more bits than the ensemble size"); RDKit::USHORT *resMat = new RDKit::USHORT[2*d_classes]; PR_QUEUE topN; for (unsigned int i = 0; i < d_dims; i++) { // we may want to ignore bits that are not turned on in any item of class // "ignoreNoClass" /* if ((0 <= ignoreNoClass) && (d_classes > ignoreNoClass)) { if (d_counts[ignoreNoClass][i] == 0) { continue; } }*/ if (dp_maskBits && !dp_maskBits->getBit(i)) { continue; } // fill up dmat for (unsigned int j = 0; j < d_classes; j++) { // we know that we have only two rows here resMat[j] = d_counts[j][i]; resMat[d_classes + j] = (d_clsCount[j] - d_counts[j][i]); } double info = 0.0; switch (d_type) { case ENTROPY: info = InfoEntropyGain(resMat, 2, d_classes); break; case BIASENTROPY: info = this->BiasInfoEntropyGain(resMat); break; case CHISQUARE: info = ChiSquare(resMat, 2, d_classes); break; case BIASCHISQUARE: info = BiasChiSquareGain(resMat); break; default: break; } PAIR_D_I entry(info, i); if (info >= 0.0) { if (topN.size() < num) { topN.push(entry); } else if (info > topN.top().first) { topN.pop(); topN.push(entry); } } } delete [] resMat; // now fill up the result matrix for the topN bits // the result from this function is a double * of size // num*4. 
The caller of this function interprets this // array as a two dimensional array of size num*(2+d_classes) with each row // containing the following entries // bitId, infogain, 1 additional column for number of hits for each class //double *res = new double[num*(2+d_classes)]; d_top = num; int ncols = 2+d_classes; delete [] dp_topBits; dp_topBits = new double[num*ncols]; int offset, bid; RDKit::INT_VECT maskBits; if (dp_maskBits && topN.size() < num) { dp_maskBits->getOnBits(maskBits); } for (int i = num - 1; i >= 0; i--) { offset = i*ncols; if (topN.size() == 0 ) { if (dp_maskBits) { bid = maskBits[i]; } else { bid = i; } dp_topBits[offset + 1] = 0.0; } else { bid = topN.top().second; // bit id dp_topBits[offset + 1] = topN.top().first; // value of the infogain topN.pop(); } dp_topBits[offset] = (double)bid; for (unsigned int j = 0; j < d_classes; j++) { dp_topBits[offset + 2 + j] = (double)d_counts[j][bid]; } } return dp_topBits; }