Пример #1
0
// Writes one record per entry of `ids` to the file `fname` through an
// ogzstream (presumably gzip-compressed output -- confirm against the
// ogzstream declaration used by this project).
//
// Each record is the label (+1 when classes[id] is true, -1 otherwise)
// immediately followed by docs[id] streamed with operator<<.
//
//   fname - path of the output file.
//   docs  - documents, looked up by id through operator[].
//   ids   - ids of the documents to write, in output order.
//
// Any failure (cannot open, empty vector, write error, close error)
// prints a message on cerr and terminates the process with status 10.
void
saveSvmLight(const char *fname, docs_t &docs, intvector_t &ids)
{
  cerr << "# Writing " << fname << "."  << endl;
  
  ogzstream f;
  f.open(fname);
  if (! f.good())
    {
      cerr << "ERROR: cannot open " << fname << " for writing." << endl;
      ::exit(10);
    }
  
  const int nids = (int)ids.size();
  for(int i=0; i<nids; i++)
    {
      int id = ids[i];
      bool y = classes[id];
      SVector s = docs[id];
      // A vector without any pair is treated as a fatal input error.
      if (s.npairs() <= 0)
        {
          cerr << "ERROR: empty vector " << id << "." << endl;
          ::exit(10);
        }
      f << ((y) ? +1 : -1);
      f << s;
      if (! f.good())
        {
          cerr << "ERROR: failed writing " << fname << "." << endl;
          ::exit(10);
        }
    }
  
  // Close explicitly: the last buffered data is only flushed on close,
  // and an error at that point would otherwise be lost in the destructor
  // after success has already been reported.
  f.close();
  if (! f.good())
    {
      cerr << "ERROR: failed writing " << fname << "." << endl;
      ::exit(10);
    }
  
  cerr << "# Done. Wrote " << ids.size() << " examples." << endl;
}
Пример #2
0
// Writes one binary record per entry of `ids` to the file `fname`
// through an ogzstream (presumably gzip-compressed output -- confirm
// against the ogzstream declaration used by this project).
//
// Each record is a single label byte (1 when classes[id] is true,
// 0 otherwise) followed by docs[id] serialized with SVector::save().
//
//   fname - path of the output file.
//   docs  - documents, looked up by id through operator[].
//   ids   - ids of the documents to write, in output order.
//
// On success a summary (examples, total pairs, positives, negatives) is
// printed on cerr.  Any failure (cannot open, empty vector, write error,
// close error) prints a message on cerr and terminates the process with
// status 10.
void
saveBinary(const char *fname, docs_t &docs, intvector_t &ids)
{
  cerr << "# Writing " << fname << "."  << endl;
  
  ogzstream f;
  f.open(fname);
  if (! f.good())
    {
      cerr << "ERROR: cannot open " << fname << " for writing." << endl;
      ::exit(10);
    }
  
  int pcount = 0;   // examples whose class is true
  int ncount = 0;   // examples whose class is false
  int npairs = 0;   // sum of npairs() over all written vectors
  const int nids = (int)ids.size();
  for(int i=0; i<nids; i++)
    {
      int id = ids[i];
      bool y = classes[id];
      SVector s = docs[id];
      int p = s.npairs();
      // A vector without any pair is treated as a fatal input error;
      // check it before touching the statistics.
      if (p <= 0)
        {
          cerr << "ERROR: empty vector " << id << "." << endl;
          ::exit(10);
        }

      if (y)
        pcount += 1;
      else
        ncount += 1;
      npairs += p;
      
      f.put( y ? 1 : 0);
      s.save(f);
      if (! f.good())
        {
          cerr << "ERROR: failed writing " << fname << "." << endl;
          ::exit(10);
        }
    }

  // Close explicitly: the last buffered data is only flushed on close,
  // and an error at that point would otherwise be lost in the destructor
  // after success has already been reported.
  f.close();
  if (! f.good())
    {
      cerr << "ERROR: failed writing " << fname << "." << endl;
      ::exit(10);
    }

  cerr << "# Done. Wrote " << ids.size() << " examples." << endl;
  cerr << "#   with " << npairs << " pairs, " 
       << pcount << " positives, and "
       << ncount << " negatives." << endl;
}
Пример #3
0
// Reads a tokenized document collection from `fname` through an
// igzstream and stores one SVector per document in `docs`.
//
// Expected layout, as whitespace-separated tokens:
//
//   .I <id> .W <word> <word> ... .I <id> .W <word> ...
//
// Every word is mapped to an integer through the file-level dictionary
// `dico`, and the resulting vector holds, for each word id, the number
// of times that word occurs in the document.
//
//   fname      - path of the input file.
//   docs       - receives docs[id] for every document that is read.
//   freezedico - when false, unknown words are added to `dico` with the
//                id dico.size()+1; when true, unknown words are skipped
//                and `dico` is left unchanged.
//
// Any failure (cannot open, malformed header, document without any
// known word, read error before end of file) prints a message on cerr
// and terminates the process with status 10.
void 
readDocs(const char *fname, docs_t &docs, bool freezedico=false)
{
  cerr << "# Reading " << fname << endl;

  igzstream f;
  f.open(fname);
  if (! f.good()) {
    cerr << "ERROR: cannot open file " << fname << endl;
    ::exit(10);
  }
  
  string token;
  f >> token;
  if (token != ".I")
    {
      cerr << "ERROR: Cannot read initial .I in " << fname << endl;
      ::exit(10);
    }
  int id = 0;
  int count = 0;
  while(f.good())
    {
      // The ".I" marker has already been consumed, either by the initial
      // read above or by the word loop of the previous document.
      f >> id >> token;
      count += 1;
      if (! f.good() || token != ".W")
        {
          cerr << "ERROR (" << id << "): "
               << "Cannot read \"<id> .W\"." << endl;
          ::exit(10);
        }
      int wid = -1;
      string otoken;   // last word resolved to `wid`; avoids repeated lookups
      SVector s;
      for(;;)
        {
          // Test for extraction failure rather than good(): reading the
          // last word of a file that has no trailing whitespace succeeds
          // but sets eofbit, and that word must still be counted.
          if (!(f >> token) || token == ".I")
            break;
          if (token != otoken)
            {
              dico_t::iterator it = dico.find(token);
              if (it != dico.end())
                wid = it->second;
              else if (freezedico)
                continue;   // unknown word with a frozen dictionary: skip
              else
                {
                  wid = dico.size() + 1;
                  dico[token] = wid;
                }
              otoken = token;
            }
          s.set(wid, s.get(wid)+1.0);
        }
      if (s.npairs() <= 0)
        {
          cerr << "ERROR (" << id << "): "
               << "Empty vector " << id << "?" << endl;
          ::exit(10);
        }
      docs[id] = s;
    }
  // Leaving the loop for any reason other than end of file is a read error.
  if (!f.eof())
    {
      cerr << "ERROR (" << id << "): "
           << "Failed reading words" << endl;
      ::exit(10);
    }

  cerr << "# Done reading " << count << " documents." << endl;
}