예제 #1
0
/**
 * Two-pass feature generator.
 *
 * Pass 1 feeds every session read from argv[1] to StatsCollector2::processSession.
 * Pass 2 reads the sessions of argv[2] and, for each one, calls
 * StatsCollector2::writeFeatures on the output file argv[3] (after a header
 * written by writeFeaturesHeader).
 *
 * Usage: <prog> <stats_sessions> <train_sessions> <features_out>
 *
 * @return 0 on success, 1 when arguments are missing or a file cannot be opened.
 */
int main(int argc, char* argv[]) {
    // The original dereferenced argv[1..3] unconditionally.
    if (argc < 4) {
        std::cerr << "Usage: " << (argc > 0 ? argv[0] : "prog")
                  << " <stats_sessions> <train_sessions> <features_out>" << std::endl;
        return 1;
    }

    StatsCollector2 stats;

    // ---- Pass 1: accumulate statistics -------------------------------------
    std::ifstream stats_file(argv[1]);
    if (!stats_file) {
        std::cerr << "Cannot open stats file: " << argv[1] << std::endl;
        return 1;
    }
    int count = 0;
    while (true) {
        Session session;
        // Progress is reported before the read, so "Stats: 0" is always printed.
        if (count % 10000 == 0) std::cout << "Stats: " << count << std::endl;
        if (!Session::readSession(stats_file, session, false)) break;
        stats.processSession(session);
        ++count;
    }

    // ---- Pass 2: emit features ---------------------------------------------
    // The output file is opened (and therefore truncated) only after pass 1,
    // exactly as before.
    std::ifstream train_file(argv[2]);
    if (!train_file) {
        std::cerr << "Cannot open train file: " << argv[2] << std::endl;
        return 1;
    }
    std::ofstream feats_file(argv[3]);
    if (!feats_file) {
        std::cerr << "Cannot open features file: " << argv[3] << std::endl;
        return 1;
    }
    stats.writeFeaturesHeader(feats_file);
    count = 0;
    while (true) {
        Session session;
        if (count % 10000 == 0) std::cout << "Generating feats: " << count << std::endl;
        if (!Session::readSession(train_file, session, false)) break;
        stats.writeFeatures(feats_file, session);
        ++count;
    }

    return 0;
}
예제 #2
0
/**
 * Loads the training set: one message per line of the file at `path`.
 *
 * For line i, labels[i] is true when the first character of the line is '1',
 * and the alphanumeric words of the line (scanning from offset 5, i.e. after
 * the `[01],"""` prefix) whose length is at least loaders::min_size are
 * appended to text[i]. Every non-alphanumeric character is a delimiter.
 *
 * @param path   training file to read.
 * @param text   resized to loaders::train_size; words are appended to text[i].
 * @param labels resized to loaders::train_size; one class flag per line.
 *
 * Lines beyond loaders::train_size are ignored (the original wrote past the
 * end of both vectors in that case).
 */
void loaders::load_train(const std::string& path,std::vector<std::vector<std::string> >& text,std::vector<bool>& labels)
{
	text.resize(loaders::train_size);
	labels.resize(loaders::train_size);
	
	std::fstream train_file(path.c_str(),std::fstream::in);
	std::string line;
	// One message per line; the index bound is tested first so that no line
	// is consumed once the output vectors are full.
	for(std::size_t line_cmp=0;line_cmp<text.size() && getline(train_file,line);line_cmp++)
	{
		const int line_size = static_cast<int>(line.size());
		auto& sample = text[line_cmp];
		// The class of the sample is determined by its first character only.
		labels[line_cmp] = (!line.empty() && line[0] == '1');
		// Each line begins with [01],""" so the scan starts at offset 5.
		int word_start = 5,cur_position = 5;
		while(cur_position < line_size)
		{
			// Advance to the end of the current word (first delimiter or end of line).
			// The cast to unsigned char is required: passing a negative char
			// (any non-ASCII byte) to the <cctype> classifiers is undefined behaviour.
			while(cur_position < line_size && isalnum(static_cast<unsigned char>(line[cur_position])))
				++cur_position;
			
			// Keep the word only if it is long enough.
			if(cur_position - word_start >= loaders::min_size)
				sample.push_back(line.substr(word_start,cur_position - word_start));
			
			// Skip the delimiter; the next word starts right after it.
			++cur_position;
			word_start = cur_position;
		}
	}
}
예제 #3
0
/**
 * Loads the test set: one message per line of the file at `path`.
 *
 * For line i, the alphanumeric words of the line (scanning from offset 3,
 * i.e. after the `"""` prefix) whose length is at least loaders::min_size are
 * appended to text[i]. Every non-alphanumeric character is a delimiter.
 *
 * @param path test file to read.
 * @param text resized to loaders::test_size; words are appended to text[i].
 *
 * Lines beyond loaders::test_size are ignored (the original wrote past the
 * end of the vector in that case).
 */
void loaders::load_test(const std::string& path,std::vector<std::vector<std::string> >& text)
{
	text.resize(loaders::test_size);
	
	std::fstream test_file(path.c_str(),std::fstream::in);
	std::string line;
	// One message per line; the index bound is tested first so that no line
	// is consumed once the output vector is full.
	for(std::size_t line_cmp=0;line_cmp<text.size() && getline(test_file,line);line_cmp++)
	{
		const int line_size = static_cast<int>(line.size());
		auto& sample = text[line_cmp];
		// Each line begins with """ so the scan starts at offset 3.
		int word_start = 3,cur_position = 3;
		while(cur_position < line_size)
		{
			// Advance to the end of the current word (first delimiter or end of line).
			// The cast to unsigned char is required: passing a negative char
			// (any non-ASCII byte) to the <cctype> classifiers is undefined behaviour.
			while(cur_position < line_size && isalnum(static_cast<unsigned char>(line[cur_position])))
				++cur_position;
			
			// Keep the word only if it is long enough.
			if(cur_position - word_start >= loaders::min_size)
				sample.push_back(line.substr(word_start,cur_position - word_start));
			
			// Skip the delimiter; the next word starts right after it.
			++cur_position;
			word_start = cur_position;
		}
	}
}
int main(int argc, char *argv[])
{
    //for random number generator
    srand((unsigned)time(NULL));

    //preprocessing
    if(argc != 5){
        std::cout<<"No enough arugments\n";
        exit(0);
    }
    int k, n_threads;
    double lambda, wall_timer;
    std::string data_dir, meta_dir, train_dir, test_dir;
    std::string line;
    int row, col, train_size, test_size;

    std::cout<<"----------------------------------------------"<<std::endl;
    std::cout<<"exec filename: "<<argv[0]<<std::endl;
    wall_timer = omp_get_wtime();
    k = atoi(argv[1]);
    lambda = atof(argv[2]);
    n_threads = atoi(argv[3]);
    data_dir = argv[4];

    //set number of threads
    omp_set_num_threads(n_threads); 
   
    std::vector<std::string> files(3,"");
    files.reserve(3);
    getdir(data_dir, files);

    meta_dir = data_dir+files[0];
    train_dir = data_dir+files[1];
    test_dir = data_dir+files[2];
 

    //std::cout<<"Using [rank, lambda, n_threads, directory]->>: "<<"[ "<<k<<", "<<lambda<<", "<<n_threads<<", "<<data_dir<<"* ]\n";
    //std::cout<<meta_dir<<" "<<train_dir<<" "<<test_dir<<" "<<"\n";

    //read meta file
    std::ifstream meta_file(meta_dir.c_str());
        //get rows and clos
    std::getline(meta_file, line);
    std::stringstream meta_ss(line);
    meta_ss >> row >> col;
        //get size of train
    std::getline(meta_file, line);
    std::stringstream train_ss(line);
    train_ss >> train_size;
        //get size of test
    std::getline(meta_file, line);
    std::stringstream test_ss(line);
    test_ss >> test_size;
    //std::cout<<"row: "<<row<<" col: "<<col<<" train size: "<<train_size<<" test size: "<<test_size<<"\n";


    //read from training file, and construct matrix
    int *Ix = new int[train_size];
    int *Jx = new int[train_size];
    double *xx = new double[train_size];

    int *Iy = new int[train_size];
    int *Jy = new int[train_size];
    double *yy = new double[train_size];

    int *cc = new int[col](); //number of non zeros in each column
    int *rc = new int[row]();


    std::ifstream train_file(train_dir.c_str());
    std::vector<T> train_list;
    train_list.reserve(train_size);

    for(int i=0;i<train_size;i++){
	train_file >> Ix[i];
	train_file >> Jx[i];
	train_file >> xx[i];
        Ix[i] = Ix[i] - offset;
        Jx[i] = Jx[i] - offset;
        train_list.push_back(T(Ix[i], Jx[i], xx[i]));
        cc[Jx[i]] = cc[Jx[i]] + 1;
        rc[Ix[i]]= rc[Ix[i]] + 1;
    }
    Eigen::SparseMatrix<double> R(row, col);
    R.setFromTriplets(train_list.begin(), train_list.end());



    Eigen::SparseMatrix<double> R_tsp = R.transpose();
    int i_count = 0;
    for(int i=0;i<R_tsp.outerSize(); i++){
        for(Eigen::SparseMatrix<double>::InnerIterator it(R_tsp, i); it; ++it){
            Iy[i_count] = it.row();
            Jy[i_count] = it.col();
            yy[i_count] = it.value();
            i_count++;
        }
    }
    
    //std::cout<<"cc(1)=2->>: "<<cc[1]<<" cc(14)=1->>: "<<cc[14]<<" cc(32)=8->>: "<<cc[32]<<"\n";
    //std::cout<<"xx(0)=4->>: "<<xx[0]<<" xx(7)=5->>: "<<xx[7]<<" xx(38)=3->>: "<<xx[7]<<"\n";
    //std::cout<<"rc(4)=23->>: "<<rc[4]<<" rc(6)=1->>: "<<rc[6]<<" rc[12]=148->>: "<<rc[12]<<"\n";
    //std::cout<<"yy(2)=5->>: "<<yy[2]<<" yy(8)=4->>: "<<yy[8]<<" yy(19)=3->>: "<<yy[19]<<"\n";

    //read from testing file, and construct matrix
    
    int *Ixt = new int[test_size];
    int *Jxt = new int[test_size];
    double *xxt = new double[test_size];

    std::ifstream test_file(test_dir.c_str());
    std::vector<T> test_list;
    test_list.reserve(test_size);

    for(int i=0;i<test_size;i++){
    	test_file >> Ixt[i];
        test_file >> Jxt[i];
        test_file >> xxt[i];
        Ixt[i] = Ixt[i] - offset;
        Jxt[i] = Jxt[i] - offset;
        test_list.push_back(T(Ixt[i], Jxt[i], xx[i]));
    }
    Eigen::SparseMatrix<double> Rt(row, col);
    Rt.setFromTriplets(test_list.begin(), test_list.end());

    int maxiter = 10;

    Eigen::MatrixXd U(k, row);
    Eigen::MatrixXd M(k, col);

    //generate random numbers within 0 and 1 for U, and M
    #pragma omp parallel for
    for(int i=0;i<k;i++){
        for(int j=0;j<row;j++){
            U(i,j) = (double) rand() / (double) RAND_MAX;
        }
        for(int j=0;j<col;j++){
            M(i,j) = (double) rand() / (double) RAND_MAX;
        }
    }
    //preprocessing for parallelization
    int *cci = new int[col];
    int *rci = new int[row];
    int pre_count;

    pre_count = 0;
    for(int i=0;i<col;i++){
        cci[i] = pre_count;
        pre_count = cci[i] + cc[i];
    }

    pre_count = 0;
    for(int i=0;i<row;i++){
       rci[i] = pre_count;
       pre_count = rci[i] + rc[i];
    }


    std::cout<<"walltime spent on preprocessing data: "<<omp_get_wtime() - wall_timer<<"\n";

    //std::cout<<"R_tsp: "<<R_tsp.size()<<" R_tsp(4999, 1825) = 3, the output is: "<<R_tsp.coeffRef(4999,1825)<<"\n";

    //for small
    //std::cout<<"R size: "<<R.size()<<" R(1825, 4999) = 3, the output is: "<<R.coeffRef(1825,4999)<<"\n";
    //std::cout<<"Rt size: "<<Rt.size()<<" Rt(1395, 4999) = 3, the output is: "<<Rt.coeffRef(1395,4999)<<"\n";

    //for medium
    //std::cout<<"R size: "<<R.size()<<" R(4750, 3951) is 4, the output is:  "<<R.coeffRef(4750,3951)<<"\n";
    //std::cout<<"Rt size: "<<Rt.size()<<" R(2128, 3951) is 3, the output is: "<<Rt.coeffRef(2128,3951)<<"\n";

//------------------------------begin processing----------------------------------------------//
    double accu_sum=0;
    double rmse_test=0;
    double rmse_train=0;
    Eigen::MatrixXd U_tps = U.transpose();
    #pragma omp parallel for reduction(+:accu_sum) 
    for(int i=0;i<train_size;i++){
        accu_sum += pow(U_tps.row(Ix[i])*M.col(Jx[i]) - xx[i], 2);
    }
    rmse_train = sqrt(accu_sum/train_size);

    accu_sum = 0;
    #pragma omp parallel for reduction(+:accu_sum)
    for(int i=0;i<test_size;i++){
        accu_sum += pow(U_tps.row(Ixt[i])*M.col(Jxt[i]) - xxt[i], 2);
    }
    rmse_test = sqrt(accu_sum/test_size);

    Eigen::MatrixXd iden = Eigen::MatrixXd::Identity(k,k);

    std::cout<<"start with rmse on train: "<< rmse_train <<" rmse on test: "<< rmse_test << " n_threads: "<<n_threads<<std::endl;
    double total_timer, end_timer;
    for(int t=0;t<maxiter;t++){
	printf("iter: %d\n",t+1);

	printf("Minimize M while fixing U ...");
        wall_timer = omp_get_wtime();
        //minimize M while fixing U
	#pragma omp parallel for schedule(dynamic, 1)
        for(int i=0;i<col;i++){
            if( cc[i]>0 ){
                //construct subU, and subR
                Eigen::MatrixXd subU(k, cc[i]);
                Eigen::VectorXd subR(cc[i]);
		int j=cci[i];
                for(int l=0; l<cc[i];l++){
                    subU.col(l) = U.col(Ix[j+l]);
		    subR[l] = xx[j+l];
                }
                M.col(i) = (lambda*iden+subU*subU.transpose()).llt().solve((subU*subR));
            }else{
                M.col(i) = Eigen::VectorXd::Zero(k);
            }
        
        }
        end_timer = omp_get_wtime();
	total_timer += end_timer-wall_timer;	
        printf("%0.2f seconds\n", end_timer - wall_timer);

	printf("Minimize U whilt fixing M ...");
	wall_timer = omp_get_wtime();
        //minimize U while fixing M
        #pragma omp parallel for schedule(dynamic, 1)
        for(int i=0;i<row;i++){
            if( rc[i] > 0){
                //construct subM, and subR
                Eigen::MatrixXd subM(k, rc[i]);
                Eigen::VectorXd subR(rc[i]);
		int j=rci[i];
                for(int l=0;l<rc[i];l++){
                    subM.col(l) = M.col(Iy[j+l]);
		    subR[l] = yy[j+l];
                }
                U.col(i) = (lambda*iden+subM*subM.transpose()).llt().solve((subM*subR));
            }else{
                U.col(i) = Eigen::VectorXd::Zero(k);
            }
        }
	end_timer = omp_get_wtime();
	total_timer += end_timer-wall_timer;
	printf("%0.2f seconds\n", end_timer - wall_timer);

	Eigen::MatrixXd U_tps = U.transpose();
    
        accu_sum = 0;
        #pragma omp parallel for reduction(+:accu_sum)
        for(int i=0;i<train_size;i++){
            accu_sum += pow(U_tps.row(Ix[i])*M.col(Jx[i]) - xx[i], 2);
        }
        rmse_train = sqrt(accu_sum/train_size);
        
        accu_sum = 0;
        #pragma omp parallel for reduction(+:accu_sum)
        for(int i=0;i<test_size;i++){
            accu_sum += pow(U_tps.row(Ixt[i])*M.col(Jxt[i]) - xxt[i], 2);
        }
        rmse_test = sqrt(accu_sum/test_size);

        printf("rmse on train: %0.6f, rmse on test: %0.6f\n",rmse_train, rmse_test);
    }
    printf("total running time: %0.2f\n",total_timer);

    //free variables
    delete[] Ix;
    delete[] Jx;
    delete[] xx;
    delete[] Iy;
    delete[] Jy;
    delete[] yy;
    delete[] Ixt;
    delete[] Jxt;
    delete[] xxt;
    delete[] rci;
    delete[] cci;
}
예제 #5
0
void loaders::load_data(const std::string& pathTrain, 
                        const std::string& pathTest, 
                        std::vector<std::vector<std::string>>& trainText, 
                        std::vector<bool>& labels,
                        std::vector<std::vector<std::string>>& testText,
								const int& stem_length, // Defaults to -1 : don't stem
								const unsigned int& min_size, // Defaults to 2 : words of length 1 are ignored.
								const bool& you_stem){ // Defaults to 0 : don't you-stem
    // Resize vectors just to be sure.
    trainText.resize(loaders::train_size);
    testText.resize(loaders::test_size);
    labels.resize(loaders::train_size);
    
    // Tokens with which smileys are replaced.
    std::string smileyToken = " smiley ";
    std::string saddeyToken = " saddey ";
  
    // Load list of smileys and corrections from corresponding files.
    std::vector<std::pair<std::string, std::string>> corrections;
    std::vector<std::pair<std::string, bool>> smileys;
    load_smileys("../smileys", smileys);
    load_corrections("../corrections", corrections);
    
    // Prepare list of regex replacements to be made.
    std::vector<std::pair<std::string, std::string>> replacements;
    getRegexps(replacements, you_stem);
    
    // Load train file for processing.
    std::fstream train_file(pathTrain.c_str(), std::ios_base::in);
    std::string sample;
    for(int line_cmp=0;getline(train_file,sample);line_cmp++){
        auto& finalSample = trainText[line_cmp];
		  labels[line_cmp] = sample[0] == '1';
        
        // Replace smileys with smiley/saddey token.
        for(std::pair<std::string, bool> p : smileys)
            sample = boost::regex_replace(sample, boost::regex(p.first), p.second ? smileyToken : saddeyToken);
        
        // Apply all regexps 
        for(std::pair<std::string, std::string> p : replacements){
            boost::regex r(p.first);
            std::string fmt = p.second;
            sample = boost::regex_replace(sample, r, fmt);
        }
        
        // Turn all to lowercase
        std::transform(sample.begin(), sample.end(), sample.begin(), ::tolower);
        
        // Correct auto-censored swear words
        for(std::pair<std::string, std::string> p : corrections)
            sample = boost::regex_replace(sample, boost::regex(p.first), p.second);
        
        // Remove excess *
        sample = boost::regex_replace(sample, boost::regex("\\*"), "");
        
        // Save to output vector.
        std::stringstream ss(sample);
        std::string word;
        while(ss >> word)
			  if(word.size() >= min_size)
				{
					if(stem_length == -1 || (int)word.size() < stem_length)
						finalSample.push_back(word);
					else
						finalSample.push_back(word.substr(0,stem_length));
				}
    }
    train_file.close();
    
    // Load test file for processing.
	 std::fstream test_file(pathTest.c_str(), std::ios_base::in);
    for(int line_cmp=0;getline(test_file,sample);line_cmp++){
        auto& finalSample = testText[line_cmp];
        
        // Replace smileys with smiley/saddey token.
        for(std::pair<std::string, bool> p : smileys)
            sample = boost::regex_replace(sample, boost::regex(p.first), p.second ? smileyToken : saddeyToken);
        
        // Apply all regexps 
        for(std::pair<std::string, std::string> p : replacements){
            boost::regex r(p.first);
            std::string fmt = p.second;
            sample = boost::regex_replace(sample, r, fmt);
        }
        
        // Turn all to lowercase
        std::transform(sample.begin(), sample.end(), sample.begin(), ::tolower);
        
        // Correct auto-censored swear words
        for(std::pair<std::string, std::string> p : corrections)
            sample = boost::regex_replace(sample, boost::regex(p.first), p.second);
        
        // Remove excess *
        sample = boost::regex_replace(sample, boost::regex("\\*"), "");
        
        // Save to output vector.
        std::stringstream ss(sample);
        std::string word;
		  
		  while(ss >> word)
			  if(word.size() >= min_size)
			  {
				  if(stem_length == -1 || (int)word.size() < stem_length)
					  finalSample.push_back(word);
				  else
					  finalSample.push_back(word.substr(0,stem_length));
			  }
    }
    test_file.close();
}