コード例 #1
0
int main(){
  
  vector<int> percentage;
  vector<int> attempts;
  vector<int> incorrect;
  
  for(int percent_held=40; percent_held<51; percent_held+=10)
  {
    percentage.push_back(percent_held);
    int num_wrong = 0;
    int iterations =0;
    int iter = 0;
    for(int iter=0; iter<10; iter++)
    {
      Classifier<NgramVector> classifier;
      
      vector<string> file_names;
      file_names.push_back("data/bar-tweets.txt");
      file_names.push_back("data/ht-tweets.txt");
      
      ClassifiedDataSet<NgramVector> test;
      
      for(int j=0; j<file_names.size(); j++)
      {
	//test on bar-tweets.txt
	ifstream fin(file_names[j]);
	//ifstream fin("data/ht-tweets.txt");
	
	
	
	ClassifiedDataSet<NgramVector> v;
	map<string, NgramVector> vect_map;

	int i = 1;
	while(!fin.eof()){
	  //NgramVector temp(to_string(i++),"2","multi",3);

	  //get line
	  string line;
	  getline(fin, line);

	  //get tweet_id
	  string tweet_id = line.substr(0,line.find('\t'));
	  line.erase(0, tweet_id.size()+1);

	  //get user_id
	  string user_id = line.substr(0,line.find('\t'));
	  line.erase(0, tweet_id.size()+1);

	  //get tweet
	  string tweet = line.substr(0,line.find('\t'));

	  //if user_id not in map
	  if(vect_map.find(user_id) == vect_map.end())
	  {
	    NgramVector temp(tweet_id,user_id,file_names[j],3);
	    vect_map[user_id] = temp;
	    //cout << user_id << " " << tweet << endl;// tweet << endl;
	  }
	  vect_map[user_id].input_string(tweet);
	}
	
	srand (time(NULL));
	//create c++ vector of random integers from 0 to vect_map.size()
	//want random 20 percent of vectors not included in training set.
	unordered_map<int,int> test_set;
	for(int i=0; i<= ((double)percent_held/(double)100)*vect_map.size(); i++)
	{
	  int iSecret;
	  bool found = false;
	  while(!found)
	  {
	    //cout << "a" << endl;
	    //find random int not yet in test set;
	    /* initialize random seed: */
	    

	    /* generate secret number between 1 and 10: */
	    iSecret = rand() % (vect_map.size()-1);
	    
	    if(test_set.find(iSecret) == test_set.end())
	    {
	      found = true;
	      test_set[iSecret] = 0;
	    }
	  }
	  //cout << iSecret << endl;
	}
	
	int m=0;
	for(auto it=vect_map.begin(); it!=vect_map.end(); it++)
	{
	  
	  if(test_set.find(m) == test_set.end())
	  {
	    v.push_back(it->second);
	    //cout << it->second.get_lang() << endl;
	  }
	  else
	  {
	    test.push_back(it->second);
	    //cout << "-"<< it->second.get_lang() << endl;
	  }
	  m++;
	}
	//v.info_gain_for_each_value();
	//v.print_ordered();
	
	classifier.add_classified_dataset(v, file_names[j]);
      }
      //classifier.print_ordered();
      classifier.calculate_info_gain();
      
      
      for(int i=0; i<test.size(); i++)
      {
      	iterations++;
      	cout << "------------------------------" << endl;
      	string expected = test[i].get_lang();
      	cout << "exptected: " << expected << endl;
      	string return_val = classifier.naive_classify(test[i]);
      	cout << "actual:    " << return_val << endl;
      	if(expected != return_val)
      	{
      	  cout << "incorrectly guessed value" << endl;
      	  num_wrong++;
      	}
      }
      
      cout << "number wrong for " << iterations << " iterations = " << num_wrong << endl;
      //classifier.print_universal_info_gain();
      
      //add error and iterations
      
    }
    attempts.push_back(iterations);
    incorrect.push_back(num_wrong);
    for(int num=0; num<percentage.size(); num++)
      {
	cout << "==============================================" << endl;
	cout << "-----------------------------------" << endl;
	cout << "trial percentage: " << percentage[num] << endl;
	cout << "attempst        : " << attempts[num] << endl;
	cout << "incorrect       : " << incorrect[num] << endl;
	cout << "error           : " << incorrect[num]/attempts[num] << endl;
      }
  }
}