// Command handler: loads the dataset named by the next argument, swaps the two
// columns whose indices follow it, and prints the resulting matrix to stdout.
// Raises an error through ThrowError if either index is not smaller than the
// number of attributes in the dataset's relation.
void SwapAttributes(GArgReader& args)
{
	// The holder owns the loaded matrix for the lifetime of this call
	GMatrix* pMatrix = loadData(args.pop_string());
	Holder<GMatrix> hMatrix(pMatrix);

	// Both indices are consumed from the arguments before either is validated
	size_t first = args.pop_uint();
	size_t second = args.pop_uint();

	size_t columnCount = pMatrix->relation()->size();
	if(first >= columnCount || second >= columnCount)
		ThrowError("Index out of range");

	pMatrix->swapColumns(first, second);
	pMatrix->print(cout);
}
// Command handler: builds a collaborative filter from the remaining
// arguments, computes its precision/recall results on the given dataset, and
// prints the area under the curve followed by the result matrix.
//
// Options (must precede the dataset name):
//   -seed <uint>  seed for the random number generator
//   -ideal        forwarded as the second argument of precisionRecall
void ROC(GArgReader& args)
{
	// Option defaults. Without -seed, the seed is derived from the process id
	// and the current time.
	unsigned int randSeed = getpid() * (unsigned int)time(NULL);
	bool useIdeal = false;
	while(args.next_is_flag())
	{
		if(args.if_pop("-seed"))
		{
			randSeed = args.pop_uint();
			continue;
		}
		if(args.if_pop("-ideal"))
		{
			useIdeal = true;
			continue;
		}
		ThrowError("Invalid option: ", args.peek());
	}

	// Load the data
	if(args.size() < 1)
		ThrowError("No dataset specified.");
	GMatrix* pRatings = loadData(args.pop_string());
	Holder<GMatrix> hRatings(pRatings);

	// Instantiate the recommender; it consumes the remaining arguments
	GRand rng(randSeed);
	GCollaborativeFilter* pFilter = InstantiateAlgorithm(rng, args);
	Holder<GCollaborativeFilter> hFilter(pFilter);
	if(args.size() > 0)
		ThrowError("Superfluous argument: ", args.peek());

	// Generate the ROC data. The area is computed before any columns are
	// removed or reordered.
	GMatrix* pCurve = pFilter->precisionRecall(*pRatings, useIdeal);
	Holder<GMatrix> hCurve(pCurve);
	double area = GCollaborativeFilter::areaUnderCurve(*pCurve);
	pCurve->deleteColumn(1); // we don't need the precision column
	pCurve->swapColumns(0, 1);

	// The area is written on a line beginning with '%', ahead of the matrix
	cout << "% Area Under the Curve = " << area << "\n";
	pCurve->print(cout);
}
// Loads a dataset from the file named by the next argument and applies the
// optional "-labels" and "-ignore" switches that follow it.
//
// The loader is chosen by file extension (case-insensitive): ".arff", ".csv"
// (comma separator) or ".dat" (separator '\0'). Any other extension raises an
// error through ThrowError.
//
// Ignored columns are removed from the matrix, and the label columns are
// moved to the end of the matrix in the order they were listed.
//
// \param args        the command-line arguments; the filename and any
//                    leading -labels / -ignore switches are consumed
// \param pLabelDims  (out) receives max(1, number of label columns)
// \return            the loaded matrix; ownership passes to the caller
GMatrix* loadDataWithSwitches(GArgReader& args, size_t* pLabelDims)
{
	// Load the dataset by extension
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	const char* szExt = szFilename + pd.extStart;
	GMatrix* pData = NULL;
	if(_stricmp(szExt, ".arff") == 0)
		pData = GMatrix::loadArff(szFilename);
	else if(_stricmp(szExt, ".csv") == 0)
		pData = GMatrix::loadCsv(szFilename, ',', false, false);
	else if(_stricmp(szExt, ".dat") == 0)
		pData = GMatrix::loadCsv(szFilename, '\0', false, false);
	else
		ThrowError("Unsupported file format: ", szExt);
	Holder<GMatrix> hData(pData);

	// Parse params. The first flag that is neither -labels nor -ignore ends
	// the loop and is left in args.
	vector<size_t> ignore;
	vector<size_t> labels;
	while(args.next_is_flag())
	{
		if(args.if_pop("-labels"))
			parseAttributeList(labels, args, pData->cols());
		else if(args.if_pop("-ignore"))
			parseAttributeList(ignore, args, pData->cols());
		else
			break;
	}

	// Throw out the ignored attributes. The list is sorted and de-duplicated
	// first: a column index that appeared twice would otherwise be deleted
	// twice, removing a neighbouring column and shifting the labels twice.
	std::sort(ignore.begin(), ignore.end());
	ignore.erase(std::unique(ignore.begin(), ignore.end()), ignore.end());

	// Delete from the highest index down so that the indices still to be
	// processed remain valid.
	for(size_t i = ignore.size(); i-- > 0; )
	{
		const size_t col = ignore[i];
		pData->deleteColumn(col);

		// Every label to the right of the deleted column moves left by one
		for(size_t j = 0; j < labels.size(); j++)
		{
			if(labels[j] >= col)
			{
				if(labels[j] == col)
					ThrowError("Attribute ", to_str(labels[j]), " is both ignored and used as a label");
				labels[j]--;
			}
		}
	}

	// Swap label columns to the end
	*pLabelDims = std::max((size_t)1, labels.size());
	for(size_t i = 0; i < labels.size(); i++)
	{
		size_t src = labels[i];
		size_t dst = pData->cols() - *pLabelDims + i;
		if(src != dst)
		{
			pData->swapColumns(src, dst);

			// If a later label was sitting at dst, it now sits at src
			for(size_t j = i + 1; j < labels.size(); j++)
			{
				if(labels[j] == dst)
				{
					labels[j] = src;
					break;
				}
			}
		}
	}
	return hData.release();
}
///Return a pointer to newly allocated data read from the command line ///represented by args. /// ///The returned matrix is allocated by new and it is the caller's ///responsibility to deallocate it. The suggested manner is to use a ///Holder<GMatrix*> /// ///In the returned matrix, all of the attributes designated as labels ///have been moved to the end and ignored attributes have been ///removed. The original indices of all the attributes are returned in ///originalIndices. /// ///\param args the command-line arguments /// ///\param pLabelDims (out parameter) the index of the first attribute ///which is designated a label. /// ///\param originalIndices the vector in which to place the original ///indices. originalIndices[i] is the index in the original data file ///of the attribute currently at index i. void loadDataWithSwitches(GMatrix& data, GArgReader& args, size_t& pLabelDims, std::vector<size_t>& originalIndices) { // Load the dataset by extension const char* szFilename = args.pop_string(); PathData pd; GFile::parsePath(szFilename, &pd); if(_stricmp(szFilename + pd.extStart, ".arff") == 0) data.loadArff(szFilename); else if(_stricmp(szFilename + pd.extStart, ".csv") == 0) data.loadCsv(szFilename, ',', false, false); else if(_stricmp(szFilename + pd.extStart, ".dat") == 0) data.loadCsv(szFilename, '\0', false, false); else throw Ex("Unsupported file format: ", szFilename + pd.extStart); //Make the initial list of original indices originalIndices.resize(data.cols()); for(std::size_t i = 0; i < originalIndices.size(); ++i){ originalIndices.at(i) = i; } // Parse params vector<size_t> ignore; vector<size_t> labels; while(args.next_is_flag()) { if(args.if_pop("-labels")) parseAttributeList(labels, args, data.cols()); else if(args.if_pop("-ignore")) parseAttributeList(ignore, args, data.cols()); else break; } // Throw out the ignored attributes std::sort(ignore.begin(), ignore.end()); for(size_t i = ignore.size() - 1; i < ignore.size(); i--) { 
data.deleteColumn(ignore[i]); originalIndices.erase(originalIndices.begin()+ignore[i]); for(size_t j = 0; j < labels.size(); j++) { if(labels[j] >= ignore[i]) { if(labels[j] == ignore[i]) throw Ex("Attribute ", to_str(labels[j]), " is both ignored and used as a label"); labels[j]--; } } } // Swap label columns to the end pLabelDims = std::max((size_t)1, labels.size()); for(size_t i = 0; i < labels.size(); i++) { size_t src = labels[i]; size_t dst = data.cols() - pLabelDims + i; if(src != dst) { data.swapColumns(src, dst); std::swap(originalIndices.at(src), originalIndices.at(dst)); for(size_t j = i + 1; j < labels.size(); j++) { if(labels[j] == dst) { labels[j] = src; break; } } } } }