void saveSvmLight(const char *fname, docs_t &docs, intvector_t &ids) { cerr << "# Writing " << fname << "." << endl; ogzstream f; f.open(fname); if (! f.good()) { cerr << "ERROR: cannot open " << fname << " for writing." << endl; ::exit(10); } for(int i=0; i<(int)ids.size(); i++) { int id = ids[i]; bool y = classes[id]; SVector s = docs[id]; int p = s.npairs(); if (p <= 0) { cerr << "ERROR: empty vector " << id << "." << endl; ::exit(10); } f << ((y) ? +1 : -1); f << s; if (! f.good()) { cerr << "ERROR: writing " << fname << " for writing." << endl; ::exit(10); } } cerr << "# Done. Wrote " << ids.size() << " examples." << endl; }
void saveBinary(const char *fname, docs_t &docs, intvector_t &ids) { cerr << "# Writing " << fname << "." << endl; ogzstream f; f.open(fname); if (! f.good()) { cerr << "ERROR: cannot open " << fname << " for writing." << endl; ::exit(10); } int pcount = 0; int ncount = 0; int npairs = 0; for(int i=0; i<(int)ids.size(); i++) { int id = ids[i]; bool y = classes[id]; if (y) pcount += 1; else ncount += 1; SVector s = docs[id]; int p = s.npairs(); npairs += p; if (p <= 0) { cerr << "ERROR: empty vector " << id << "." << endl; ::exit(10); } f.put( y ? 1 : 0); s.save(f); if (! f.good()) { cerr << "ERROR: writing " << fname << " for writing." << endl; ::exit(10); } } cerr << "# Done. Wrote " << ids.size() << " examples." << endl; cerr << "# with " << npairs << " pairs, " << pcount << " positives, and " << ncount << " negatives." << endl; }
void readDocs(const char *fname, docs_t &docs, bool freezedico=false) { cerr << "# Reading " << fname << endl; igzstream f; f.open(fname); if (! f.good()) { cerr << "ERROR: cannot open file " << fname << endl; ::exit(10); } string token; f >> token; if (token != ".I") { cerr << "ERROR: Cannot read initial .I in " << fname << endl; ::exit(10); } int id = 0; int count = 0; while(f.good()) { f >> id >> token; count += 1; if (! f.good() || token != ".W") { cerr << "ERROR (" << id << "): " << "Cannot read \"<id> .W\"." << endl; ::exit(10); } int wid = -1; string otoken; SVector s; for(;;) { f >> token; if (!f.good() || token == ".I") break; if (token != otoken) { dico_t::iterator it = dico.find(token); if (it != dico.end()) wid = it->second; else if (freezedico) continue; else { wid = dico.size() + 1; dico[token] = wid; } otoken = token; } s.set(wid, s.get(wid)+1.0); } if (s.npairs() <= 0) { cerr << "ERROR (" << id << "): " << "Empty vector " << id << "?" << endl; ::exit(10); } docs[id] = s; } if (!f.eof()) { cerr << "ERROR (" << id << "): " << "Failed reading words" << endl; ::exit(10); } cerr << "# Done reading " << count << " documents." << endl; }