Example #1
0
void 
computeNormalizedTfIdf()
{
  cerr << "# Computing document frequencies" << endl;

  int terms = dico.size();
  vector<double> nt(terms+1);
  
  double nd = trainid.size();
  for(int i=0; i<terms+1; i++)
    nt[i] = 0;
  for(int i=0; i<(int)trainid.size(); i++)
    {
      int id = trainid[i];
      SVector s = train[id];
      for (const SVector::Pair *p = s; p->i >= 0; p++)
        if (p->v > 0)
          nt[p->i] += 1;
    }
  
  cerr << "# Computing TF/IDF for training set" << endl;
  for(int i=0; i<(int)trainid.size(); i++)
    {
      int id = trainid[i];
      SVector s = train[id];
      SVector v;
      for (const SVector::Pair *p = s; p->i >= 0; p++)
        if (nt[p->i] > 0)
          v.set(p->i, (1.0 + log(p->v)) * log(nd/nt[p->i]));
      double norm = dot(v,v);
      v.scale(1.0 / sqrt(norm));
      train[id] = v;
    }
  cerr << "# Computing TF/IDF for testing set" << endl;
  for(int i=0; i<(int)testid.size(); i++)
    {
      int id = testid[i];
      SVector s = test[id];
      SVector v;
      for (const SVector::Pair *p = s; p->i >= 0; p++)
        if (nt[p->i] > 0)
          v.set(p->i, (1.0 + log(p->v)) * log(nd/nt[p->i]));
      double norm = dot(v,v);
      v.scale(1.0 / sqrt(norm));
      test[id] = v;
    }
  cerr << "# Done." << endl;
}
// Evaluate one sample and fill `result` with per-class confidences.
//
// Once at least one sample has been seen (m_sampleCount != 0), the
// sample is scored by the SVM and the scores are turned into a
// probability distribution with a softmax.  Before any sample has been
// seen, a uniform distribution is returned and the prediction is 0.
void LinearLaRank::eval(Sample& sample, Result& result) {
    if (m_sampleCount) {
        // Convert the Sample to LaRank form
        SVector laX;
        for (int nFeat = 0; nFeat < sample.x.rows(); nFeat++) {
            laX.set(nFeat, sample.x(nFeat));
        }

        m_svm->predict_with_scores(laX, result);

        // Convert the scores to probabilities (softmax).
        const int numClasses = *m_numClasses;
        if (numClasses > 0) {
            // Shift by the largest score before exponentiating: the result
            // is mathematically unchanged, but exp() can no longer overflow
            // to inf (giving NaN after division), nor can every term
            // underflow to 0 (giving an all-zero distribution).
            double maxScore = result.confidence[0];
            for (int nClass = 1; nClass < numClasses; nClass++) {
                if (result.confidence[nClass] > maxScore) {
                    maxScore = result.confidence[nClass];
                }
            }

            double totalProb = 0.0;
            for (int nClass = 0; nClass < numClasses; nClass++) {
                result.confidence[nClass] = exp(result.confidence[nClass] - maxScore);
                totalProb += result.confidence[nClass];
            }
            // The largest score contributes exp(0) == 1, so totalProb >= 1
            // and the division needs no epsilon guard.
            for (int nClass = 0; nClass < numClasses; nClass++) {
                result.confidence[nClass] /= totalProb;
            }
        }
    } else {
        // Nothing learned yet: every class is equally likely.
        for (int nClass = 0; nClass < *m_numClasses; nClass++) {
            result.confidence[nClass] = 1.0 / *m_numClasses;
        }
        result.prediction = 0;
    }
}
// Feed one labelled sample to the underlying SVM.
void LinearLaRank::update(Sample& sample) {
    // Copy every feature of the sample, in index order, into an SVector.
    SVector features;
    int dim = 0;
    while (dim < sample.x.rows()) {
        features.set(dim, sample.x(dim));
        ++dim;
    }

    // The sample counter is bumped first; its new value is handed to the
    // SVM together with the label and the sample weight.
    ++m_sampleCount;
    m_svm->add(features, sample.y, m_sampleCount, sample.w);
}
Example #4
0
// Read the documents stored in file FNAME (opened through igzstream)
// and store one term-count vector per document into DOCS[id].
//
// The input is parsed as whitespace-separated tokens of the form
//   .I <id> .W <word> <word> ... .I <id> .W <word> ...
// Each word is mapped to an integer id through the global dictionary
// `dico`.  An unknown word receives the id dico.size()+1 and is added
// to the dictionary, unless FREEZEDICO is true, in which case the word
// is silently skipped.
//
// An unreadable file, a malformed header, a read failure, or a document
// that ends up with no term prints a message on cerr and terminates the
// program with ::exit(10).
void 
readDocs(const char *fname, docs_t &docs, bool freezedico=false)
{
  cerr << "# Reading " << fname << endl;

  igzstream f;
  f.open(fname);
  if (! f.good()) {
    cerr << "ERROR: cannot open file " << fname << endl;
    ::exit(10);
  }
  
  string token;
  f >> token;
  if (token != ".I")
    {
      cerr << "ERROR: Cannot read initial .I in " << fname << endl;
      ::exit(10);
    }
  int id = 0;
  int count = 0;
  while(f.good())
    {
      // The leading ".I" has already been consumed (either above or by
      // the word loop of the previous document).
      f >> id >> token;
      count += 1;
      // fail() rather than !good(): hitting end-of-file right after a
      // successfully read token is not a header error.
      if (f.fail() || token != ".W")
        {
          cerr << "ERROR (" << id << "): "
               << "Cannot read \"<id> .W\"." << endl;
          ::exit(10);
        }
      int wid = -1;
      string otoken;   // last word that was resolved to `wid`
      SVector s;
      for(;;)
        {
          // Test the extraction itself instead of f.good(): reading the
          // last word of a file that has no trailing whitespace sets
          // eofbit although the word was read successfully, and that
          // word must still be counted.
          if (!(f >> token) || token == ".I")
            break;
          // Only look the word up when it differs from the previous
          // resolved one; otherwise `wid` is still valid.
          if (token != otoken)
            {
              dico_t::iterator it = dico.find(token);
              if (it != dico.end())
                wid = it->second;
              else if (freezedico)
                continue;
              else
                {
                  wid = dico.size() + 1;
                  dico[token] = wid;
                }
              otoken = token;
            }
          s.set(wid, s.get(wid)+1.0);
        }
      if (s.npairs() <= 0)
        {
          cerr << "ERROR (" << id << "): "
               << "Empty vector " << id << "?" << endl;
          ::exit(10);
        }
      docs[id] = s;
    }
  // Leaving the loop for any reason other than end-of-file is an error.
  if (!f.eof())
    {
      cerr << "ERROR (" << id << "): "
           << "Failed reading words" << endl;
      ::exit(10);
    }

  cerr << "# Done reading " << count << " documents." << endl;
}