/// Loads the dataset stored in szFilename into m. The loader is selected by
/// the file extension, compared case-insensitively:
///   ".arff" - m.loadArff
///   ".csv"  - GCSVParser with its default separator
///   ".dat"  - GCSVParser with the separator set to '\0'
/// For ".csv" and ".dat", a per-column parsing report is written to cerr.
///
/// \param m the matrix that receives the loaded data
///
/// \param szFilename the path of the file to load
///
/// Throws Ex if the extension is not one of those listed above.
void loadData(GMatrix& m, const char* szFilename)
{
	// pd.extStart is used as an offset into szFilename at which the extension begins
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	const char* szExt = szFilename + pd.extStart;

	if(_stricmp(szExt, ".arff") == 0)
	{
		m.loadArff(szFilename);
		return;
	}

	// The two remaining formats share one code path and differ only in the separator
	const bool isCsv = (_stricmp(szExt, ".csv") == 0);
	const bool isDat = !isCsv && (_stricmp(szExt, ".dat") == 0);
	if(!isCsv && !isDat)
		throw Ex("Unsupported file format: ", szExt);

	GCSVParser parser;
	if(isDat)
		parser.setSeparator('\0');
	parser.parse(m, szFilename);

	// Tell the user how each column was interpreted
	cerr << "\nParsing Report:\n";
	for(size_t col = 0; col < m.cols(); col++)
		cerr << to_str(col) << ") " << parser.report(col) << "\n";
}
/// Loads the dataset named by the next command-line argument and stores a
/// newly allocated copy of it in hOutput.
///
/// The file format is taken from an optional "-input_type <type>" switch that
/// immediately follows the filename, where <type> is "arff", "csv" or "dat"
/// (compared case-insensitively). Without the switch, the format is deduced
/// from the file extension; a filename with no extension is treated as ARFF.
/// For "csv" and "dat", a per-column parsing report is written to cerr.
///
/// \param args the command-line arguments; the filename (and the optional
/// "-input_type" switch with its value) are consumed from it
///
/// \param hOutput (out parameter) receives ownership of the loaded matrix.
/// It is only modified after the file has been loaded successfully.
///
/// Throws Ex if no filename is available or the format is not supported.
void LoadData(GArgReader &args, std::unique_ptr<GMatrix> &hOutput)
{
	if(args.size() < 1)
		throw Ex("Expected the filename of a datset. (Found end of arguments.)");
	const char* szFilename = args.pop_string();
	PathData pd;
	GFile::parsePath(szFilename, &pd);
	const char* szExt = szFilename + pd.extStart;

	// Determine the input type: an explicit switch wins over the extension
	const bool explicitType = args.next_is_flag() && args.if_pop("-input_type");
	const char* input_type;
	if(explicitType)
		input_type = args.pop_string();
	else if(*szExt != '.')
		input_type = "arff"; // no extension - assume ARFF
	else
		input_type = szExt + 1; // skip the leading '.'

	// Shared by the "csv" and "dat" branches, which differ only in the separator
	GMatrix data;
	auto parseDelimited = [&](bool nullSeparator)
	{
		GCSVParser parser;
		if(nullSeparator)
			parser.setSeparator('\0');
		parser.parse(data, szFilename);
		cerr << "\nParsing Report:\n";
		for(size_t i = 0; i < data.cols(); i++)
			cerr << to_str(i) << ") " << parser.report(i) << "\n";
	};

	// Now load the data
	if(_stricmp(input_type, "arff") == 0)
		data.loadArff(szFilename);
	else if(_stricmp(input_type, "csv") == 0)
		parseDelimited(false);
	else if(_stricmp(input_type, "dat") == 0)
		parseDelimited(true);
	else
	{
		// Report the value that was actually rejected. When the type came from
		// "-input_type", the file's extension is not what the user got wrong.
		throw Ex("Unsupported file format: ", explicitType ? input_type : szExt);
	}

	// Hand a heap-allocated copy of the whole matrix to the caller
	hOutput.reset(data.cloneSub(0, 0, data.rows(), data.cols()));
}
///Return a pointer to newly allocated data read from the command line ///represented by args. /// ///The returned matrix is allocated by new and it is the caller's ///responsibility to deallocate it. The suggested manner is to use a ///Holder<GMatrix*> /// ///In the returned matrix, all of the attributes designated as labels ///have been moved to the end and ignored attributes have been ///removed. The original indices of all the attributes are returned in ///originalIndices. /// ///\param args the command-line arguments /// ///\param pLabelDims (out parameter) the index of the first attribute ///which is designated a label. /// ///\param originalIndices the vector in which to place the original ///indices. originalIndices[i] is the index in the original data file ///of the attribute currently at index i. void loadDataWithSwitches(GMatrix& data, GArgReader& args, size_t& pLabelDims, std::vector<size_t>& originalIndices) { // Load the dataset by extension const char* szFilename = args.pop_string(); PathData pd; GFile::parsePath(szFilename, &pd); if(_stricmp(szFilename + pd.extStart, ".arff") == 0) data.loadArff(szFilename); else if(_stricmp(szFilename + pd.extStart, ".csv") == 0) { GCSVParser parser; parser.parse(data, szFilename); cerr << "\nParsing Report:\n"; for(size_t i = 0; i < data.cols(); i++) cerr << to_str(i) << ") " << parser.report(i) << "\n"; } else if(_stricmp(szFilename + pd.extStart, ".dat") == 0) { GCSVParser parser; parser.setSeparator('\0'); parser.parse(data, szFilename); cerr << "\nParsing Report:\n"; for(size_t i = 0; i < data.cols(); i++) cerr << to_str(i) << ") " << parser.report(i) << "\n"; } else throw Ex("Unsupported file format: ", szFilename + pd.extStart); //Make the initial list of original indices originalIndices.resize(data.cols()); for(std::size_t i = 0; i < originalIndices.size(); ++i){ originalIndices.at(i) = i; } // Parse params vector<size_t> ignore; vector<size_t> labels; while(args.next_is_flag()) { if(args.if_pop("-labels")) 
parseAttributeList(labels, args, data.cols()); else if(args.if_pop("-ignore")) parseAttributeList(ignore, args, data.cols()); else break; } // Throw out the ignored attributes std::sort(ignore.begin(), ignore.end()); for(size_t i = ignore.size() - 1; i < ignore.size(); i--) { data.deleteColumns(ignore[i], 1); originalIndices.erase(originalIndices.begin()+ignore[i]); for(size_t j = 0; j < labels.size(); j++) { if(labels[j] >= ignore[i]) { if(labels[j] == ignore[i]) throw Ex("Attribute ", to_str(labels[j]), " is both ignored and used as a label"); labels[j]--; } } } // Swap label columns to the end pLabelDims = std::max((size_t)1, labels.size()); for(size_t i = 0; i < labels.size(); i++) { size_t src = labels[i]; size_t dst = data.cols() - pLabelDims + i; if(src != dst) { data.swapColumns(src, dst); std::swap(originalIndices.at(src), originalIndices.at(dst)); for(size_t j = i + 1; j < labels.size(); j++) { if(labels[j] == dst) { labels[j] = src; break; } } } } }