/// Loads a dataset from a file, choosing the loader by filename extension.
///
/// The extension is compared with _stricmp against ".arff" (loaded with
/// GMatrix::loadArff), ".csv" (loaded with GMatrix::loadCsv using ',' as
/// the separator) and ".dat" (loaded with GMatrix::loadCsv, passing '\0'
/// as the separator).
///
/// \param szFilename the path of the file to load
///
/// \return a matrix allocated with new. It is the caller's responsibility
/// to deallocate it.
///
/// Throws Ex if the extension is not one of the supported ones. If the
/// loader itself throws, the partially built matrix is freed before the
/// exception is propagated, so nothing leaks on failure.
GMatrix* loadData(const char* szFilename)
{
	// Locate the extension within the filename
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	const char* szExt = szFilename + pd.extStart;

	GMatrix* pData = new GMatrix();
	try
	{
		if(_stricmp(szExt, ".arff") == 0)
			pData->loadArff(szFilename);
		else if(_stricmp(szExt, ".csv") == 0)
			pData->loadCsv(szFilename, ',', false, false);
		else if(_stricmp(szExt, ".dat") == 0)
			pData->loadCsv(szFilename, '\0', false, false);
		else
			throw Ex("Unsupported file format: ", szExt);
	}
	catch(...)
	{
		// The caller never receives the pointer on failure, so it must be
		// released here before the exception continues to propagate.
		delete pData;
		throw;
	}
	return pData;
}
///Return a pointer to newly allocated data read from the command line ///represented by args. /// ///The returned matrix is allocated by new and it is the caller's ///responsibility to deallocate it. The suggested manner is to use a ///Holder<GMatrix*> /// ///In the returned matrix, all of the attributes designated as labels ///have been moved to the end and ignored attributes have been ///removed. The original indices of all the attributes are returned in ///originalIndices. /// ///\param args the command-line arguments /// ///\param pLabelDims (out parameter) the index of the first attribute ///which is designated a label. /// ///\param originalIndices the vector in which to place the original ///indices. originalIndices[i] is the index in the original data file ///of the attribute currently at index i. void loadDataWithSwitches(GMatrix& data, GArgReader& args, size_t& pLabelDims, std::vector<size_t>& originalIndices) { // Load the dataset by extension const char* szFilename = args.pop_string(); PathData pd; GFile::parsePath(szFilename, &pd); if(_stricmp(szFilename + pd.extStart, ".arff") == 0) data.loadArff(szFilename); else if(_stricmp(szFilename + pd.extStart, ".csv") == 0) data.loadCsv(szFilename, ',', false, false); else if(_stricmp(szFilename + pd.extStart, ".dat") == 0) data.loadCsv(szFilename, '\0', false, false); else throw Ex("Unsupported file format: ", szFilename + pd.extStart); //Make the initial list of original indices originalIndices.resize(data.cols()); for(std::size_t i = 0; i < originalIndices.size(); ++i){ originalIndices.at(i) = i; } // Parse params vector<size_t> ignore; vector<size_t> labels; while(args.next_is_flag()) { if(args.if_pop("-labels")) parseAttributeList(labels, args, data.cols()); else if(args.if_pop("-ignore")) parseAttributeList(ignore, args, data.cols()); else break; } // Throw out the ignored attributes std::sort(ignore.begin(), ignore.end()); for(size_t i = ignore.size() - 1; i < ignore.size(); i--) { 
data.deleteColumn(ignore[i]); originalIndices.erase(originalIndices.begin()+ignore[i]); for(size_t j = 0; j < labels.size(); j++) { if(labels[j] >= ignore[i]) { if(labels[j] == ignore[i]) throw Ex("Attribute ", to_str(labels[j]), " is both ignored and used as a label"); labels[j]--; } } } // Swap label columns to the end pLabelDims = std::max((size_t)1, labels.size()); for(size_t i = 0; i < labels.size(); i++) { size_t src = labels[i]; size_t dst = data.cols() - pLabelDims + i; if(src != dst) { data.swapColumns(src, dst); std::swap(originalIndices.at(src), originalIndices.at(dst)); for(size_t j = i + 1; j < labels.size(); j++) { if(labels[j] == dst) { labels[j] = src; break; } } } } }