Example #1
int main(int argc, char *argv[]) {

	time_t now = time(0);
	int rank, size;

	SRAssembler* srassembler = NULL;
	try {
		mpi_init(argc,argv);
		size=mpi_get_size();
		rank=mpi_get_rank();

		srassembler = SRAssembler::getInstance(rank);
		int ret = srassembler->init(argc, argv, rank, size);
		if (ret == -1) {
			throw -1;
		}
		srassembler->do_preprocessing();
		srassembler->do_walking();
	} catch (int e) {
		mpi_code code;
		code.action = ACTION_EXIT;
		code.value1 = 0;
		code.value2 = 0;
		mpi_bcast(get_mpi_code_value(code));
		finalized();
		return -1;
	}
	finalized();
	if (rank == 0) {
		string str = "Execution time: " + int2str(time(0) - now) + " seconds";
		srassembler->get_logger()->info(str);
	}
	return 0;
}
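
The broadcast in the catch block matters: worker ranks normally sit blocked waiting for commands from rank 0, and without an ACTION_EXIT message they would hang forever. The following self-contained sketch (hypothetical, not SRAssembler's actual code) illustrates that master/worker pattern with plain MPI calls:

// Sketch: rank 0 broadcasts a command word; workers block in MPI_Bcast
// until it arrives, so broadcasting an exit command on error unblocks them.
#include <mpi.h>

static const int ACTION_EXIT = 0;

int main(int argc, char *argv[]) {
	MPI_Init(&argc, &argv);
	int rank;
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	if (rank == 0) {
		int cmd = ACTION_EXIT;                  // e.g. after a caught error
		MPI_Bcast(&cmd, 1, MPI_INT, 0, MPI_COMM_WORLD);
	} else {
		while (true) {
			int cmd;
			MPI_Bcast(&cmd, 1, MPI_INT, 0, MPI_COMM_WORLD);
			if (cmd == ACTION_EXIT) break;      // unblocked by rank 0
			// dispatch other actions here
		}
	}
	MPI_Finalize();
	return 0;
}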
Example #2
static void info(const char *fmt,...)
{
	if(mpi_get_rank()!=0)
		return;
	char buf[BUFSIZ];
	va_list ap;
	va_start(ap,fmt);
	vsnprintf(buf,sizeof(buf),fmt,ap);//bounded write; vsprintf could overflow buf
	va_end(ap);
	(*rksvm_print_string)(buf);
}
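
info() formats into a local buffer and forwards it through the rksvm_print_string function pointer, so only rank 0 ever prints and the output sink stays swappable. This is the libsvm print-redirection convention; a minimal sketch of how such a pointer is conventionally declared and defaulted (the exact rksvm declaration may differ):

#include <stdio.h>

static void print_string_stdout(const char *s)
{
	fputs(s, stdout);
	fflush(stdout);
}

// swap this pointer to redirect library output, e.g. into a GUI or log file
static void (*rksvm_print_string)(const char *) = &print_string_stdout;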
Example #3
int main(int argc, char **argv)
{
	//set the mpi settings
	int threadprovided;
	MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &threadprovided);
	if(threadprovided != MPI_THREAD_MULTIPLE)
	{
		printf("MPI multiple thread isn't provided!\n");
		fflush(stdout);
		mpi_exit(1);
	}
	int current_rank = mpi_get_rank();
	int nr_ranks = mpi_get_size();
	param.nr_ranks = nr_ranks;
	
	char hostname[1024];
	int hostname_len;
	MPI_Get_processor_name(hostname, &hostname_len);
    printf("processor name: %s, number of processed: %d, rank: %d\n", hostname, nr_ranks, current_rank);
	fflush(stdout);
	//
	int global_l;
	char input_file_name[1024];
	char model_file_name[1024];
	const char *error_msg;
	parse_command_line(argc, argv, input_file_name, model_file_name);
	
	//set the number of threads for the shared-memory system
	int nr_threads = param.thread_count;
	int max_thread_count = omp_get_max_threads();

	if(nr_threads > max_thread_count)
	{
		printf("[rank %d], please enter the correct number of threads: 1~%d\n", current_rank, max_thread_count);
		mpi_exit(1);
	}
	omp_set_num_threads(nr_threads);

	//set the cpu affinity
	/*int ithread, err, cpu;
	cpu_set_t cpu_mask;
#pragma omp parallel private(ithread, cpu_mask, err, cpu)
	{
		ithread = omp_get_thread_num();
		CPU_ZERO(&cpu_mask);//set mask to zero
		CPU_SET(ithread, &cpu_mask);//set mask with ithread
		err = sched_setaffinity((pid_t)0, sizeof(cpu_mask), &cpu_mask);
		cpu = sched_getcpu();
		printf("thread_id %d on CPU %d\n", ithread, cpu);
	}*/
	//now, read the problem from the input file
	read_problem(input_file_name);
	error_msg = rksvm_check_parameter(&prob,&param);

	if(error_msg)
	{
		fprintf(stderr,"ERROR: %s\n",error_msg);
		mpi_exit(1);
	}

	//distributed code
	global_l = prob.l;
	mpi_allreduce(&global_l, 1, MPI_INT, MPI_SUM);//MPI_INT :int;MPI_SUM:sum
	prob.global_l = global_l;
	
	printf("#local instances = %d, #global instances = %d\n", prob.l, prob.global_l);
	fflush(stdout);

	if(current_rank==0){
		puts("Start to train!");
	}
	model = rksvm_train(&prob,&param);
	if(rksvm_save_model(model_file_name,model))
	{
		fprintf(stderr,"[rank %d] can't save model to file %s\n",mpi_get_rank(), model_file_name);
		mpi_exit(1);
	}
	rksvm_free_and_destroy_model(&model);
	free(prob.y);
	free(prob.x);
	free(prob.query);
	free(x_space);
	free(prob.length_of_each_rksvm_node);
	free(line);

	MPI_Finalize();
	return 0;
}
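
The distributed step hinges on mpi_allreduce(), which sums each rank's local instance count so that every process learns the global total. A hedged sketch in the style of distributed LIBLINEAR, using an in-place reduction (the actual rksvm wrapper may differ in signature):

#include <mpi.h>

// every rank passes its local value in buf and gets back the reduced result
static void mpi_allreduce(int *buf, int count, MPI_Datatype type, MPI_Op op)
{
	MPI_Allreduce(MPI_IN_PLACE, buf, count, type, op, MPI_COMM_WORLD);
}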
Example #4
void exit_input_error(int line_num)
{
	fprintf(stderr,"[rank %d] Wrong input format at line %d\n", mpi_get_rank(), line_num);
	mpi_exit(1);
}
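
mpi_exit() itself is not shown in these examples. In distributed LIBLINEAR-style code it conventionally finalizes MPI before terminating, so a plausible sketch looks like this (an assumption, not the verified rksvm source):

#include <mpi.h>
#include <stdlib.h>

static void mpi_exit(const int status)
{
	MPI_Finalize();
	exit(status);
}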
Example #5
void read_problem(const char *filename)
{
	long int elements, max_index, inst_max_index, i, j, k;
	FILE *fp = fopen(filename,"r");
	char *endptr;
	char *idx, *val, *label;

	if(fp == NULL)
	{
		fprintf(stderr,"can't open input file %s\n",filename);
		mpi_exit(1);
	}

	prob.l = 0;
	elements = 0;

	max_line_len = 1024;
	line = Malloc(char,max_line_len);
	while(readline(fp)!=NULL)
	{
		char *p = strtok(line," \t"); // label

		// features
		while(1)
		{
			p = strtok(NULL," \t");
			if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
				break;
			++elements;
		}
		++elements;
		++prob.l;
	}
	rewind(fp);

	prob.y = Malloc(double,prob.l);
	prob.x = Malloc(struct rksvm_node *,prob.l);
	prob.query = Malloc(int, prob.l);
	x_space = Malloc(struct rksvm_node,elements);
	prob.length_of_each_rksvm_node = Malloc(int, prob.l);

	max_index = 0;
	j=0;
	k=0;
	for(i=0;i<prob.l;i++)
	{
		prob.query[i] = 0;
		inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
		readline(fp);
		prob.x[i] = &x_space[j];
		label = strtok(line," \t\n");
		if(label == NULL) // empty line
			exit_input_error(i+1);

		prob.y[i] = strtod(label,&endptr);
		if(endptr == label || *endptr != '\0')
			exit_input_error(i+1);

		while(1)
		{
			idx = strtok(NULL,":");
			val = strtok(NULL," \t");

			if(val == NULL)
				break;
			if (!strcmp(idx,"qid"))
			{
				errno = 0;
				prob.query[i] = (int) strtol(val, &endptr,10);
				if(endptr == val || errno !=0 || (*endptr != '\0' && !isspace(*endptr)))
					exit_input_error(i+1);
			}
			else
			{

				errno = 0;
				x_space[j].index = (int) strtol(idx,&endptr,10);
				if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
					exit_input_error(i+1);
				else
					inst_max_index = x_space[j].index;

				errno = 0;
				x_space[j].value = strtod(val,&endptr);
				if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
					exit_input_error(i+1);

				++j;
			}
		}

		if(inst_max_index > max_index)
			max_index = inst_max_index;
		x_space[j++].index = -1;
		prob.length_of_each_rksvm_node[i] = (int)(j-k);
		k = j;
	}

	if(param.gamma == 0 && max_index > 0)
		param.gamma = 1.0/max_index;

	if(param.kernel_type == PRECOMPUTED)
		for(i=0;i<prob.l;i++)
		{
			if (prob.x[i][0].index != 0)
			{
				fprintf(stderr,"[rank %d] Wrong input format: first column must be 0:sample_serial_number\n", mpi_get_rank());
				mpi_exit(1);
			}
			if ((int)prob.x[i][0].value <= 0 || (int)prob.x[i][0].value > max_index)
			{
				fprintf(stderr,"[rank %d] Wrong input format: sample_serial_number out of range\n", mpi_get_rank());
				mpi_exit(1);
			}
		}

	fclose(fp);
}
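
read_problem() relies on two helpers that are not shown: the Malloc macro and a readline() that grows the global line buffer until a whole line fits. In libsvm-derived code they conventionally look like the sketch below, which reuses the same globals line and max_line_len as above (the rksvm versions may differ slightly):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define Malloc(type,n) (type *)malloc((n)*sizeof(type))

static char *line = NULL;
static int max_line_len;

static char* readline(FILE *input)
{
	if(fgets(line,max_line_len,input) == NULL)
		return NULL;
	// keep doubling the buffer until the line contains its terminating newline
	while(strrchr(line,'\n') == NULL)
	{
		max_line_len *= 2;
		line = (char *) realloc(line,max_line_len);
		int len = (int) strlen(line);
		if(fgets(line+len,max_line_len-len,input) == NULL)
			break;
	}
	return line;
}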
Example #6
l2r_rank_fun::l2r_rank_fun(const rksvm_problem *prob, const rksvm_parameter *param, 
		Scheduler *scheduler, struct SolutionInfo *si)
{
	this->si = si;
	this->param = param;
	si->rho = 0;
	si->upper_bound_p = INF;
	si->upper_bound_n = INF;
	int l=prob->l;
	this->prob = prob;
	this->C = param->C;
	this->thread_count = param->thread_count;
	this->current_rank = mpi_get_rank();
	this->global_l = prob->global_l;
	z = new double[l];

	int i,j,k;
	perm = new int[l];
	group_queries(prob, &nr_subset ,&start, &count, perm);
	pi = new id_and_value* [nr_subset];

#pragma omp parallel for default(shared) if(nr_subset > 50)
	for (int i=0;i<nr_subset;i++)
	{
		pi[i] = new id_and_value[count[i]];
	}

	double *y=prob->y;
	int_y = new int[prob->l];
	nr_class = new int[nr_subset];
	l_plus = new int[l];
	l_minus = new int[l];
	gamma_plus = new double[l];
	gamma_minus = new double[l];
	ATAQb = new double[l];
	ATe = new double[l];

	// members taken over from the scheduler
	this->scheduler = scheduler;
	this->local_l = scheduler->local_l;
	this->start_ptr = scheduler->start_ptr;
	//this->nr_recv = scheduler->nr_recv;
	//this->nr_send = scheduler->nr_send;
	gz = new double[global_l];
	//gATAQb = new double[global_];
	//gATe = new double[global_l];
	Q = new double[l*global_l];

	//compute Q through the TBB-based nomad_fun
	nomad_fun(prob, param, scheduler, Q);

	//testing Q
	//char *file = "/home/jing/model/Q.txt";
	//save_Q(file, prob, Q);
	//mpi_exit(1);


#pragma omp parallel for default(shared) private(i,j,k)	
	for (i=0;i<nr_subset;i++)
	{
		k=1;
		for (j=0;j<count[i];j++)
		{
			pi[i][j].id=perm[j+start[i]];
			pi[i][j].value=y[perm[j+start[i]]];
		}
		qsort(pi[i], count[i], sizeof(id_and_value), compare_id_and_value);
		int_y[pi[i][count[i]-1].id]=1;
		for(j=count[i]-2;j>=0;j--)
		{
			if (pi[i][j].value>pi[i][j+1].value)
				k++;
			int_y[pi[i][j].id]=k;
		}
		nr_class[i]=k;
	}
}
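
The final loop assigns integer relevance levels per query: after sorting, the entry at pi[i][count[i]-1] gets level 1 and k grows on each strict increase while walking backwards, which only works if compare_id_and_value orders entries by descending value. A sketch of the pair type and comparator consistent with those calls (names match the code above, but the real rksvm definitions may differ):

struct id_and_value
{
	int id;
	double value;
};

// descending by value, so the smallest label ends up at the tail
static int compare_id_and_value(const void *a, const void *b)
{
	const struct id_and_value *pa = (const struct id_and_value *)a;
	const struct id_and_value *pb = (const struct id_and_value *)b;
	if(pa->value < pb->value) return 1;
	if(pa->value > pb->value) return -1;
	return 0;
}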
Example #7
void nomad_fun(const rksvm_problem *prob, const rksvm_parameter *param, Scheduler *scheduler, double *Q)
{
	int l = prob->l;
	int global_l = prob->global_l;
	int thread_count = param->thread_count;
	int nr_ranks = param->nr_ranks;
	int current_rank = mpi_get_rank();
	int *nr_send = scheduler->nr_send;
	int *nr_recv = scheduler->nr_recv;
	//atomic variables
	atomic<int> count_setup_threads;
	count_setup_threads = 0;
	atomic<int> computed_data_nodes;//record the number of data nodes that have been processed
	computed_data_nodes = 0;
	atomic<int> sended_count;//record the number of data nodes that have been sent
	sended_count = 0;
	atomic<int> recvd_count;//record the number of data nodes that have been received
	recvd_count = 0;
	// two auxiliary atomic flags for both sending and receiving
	atomic<bool> flag_send_ready;
	flag_send_ready = false;
	atomic<bool> flag_receive_ready;
	flag_receive_ready = false;

	//build several job queues and one sending queue
	con_queue *job_queues = callocator<con_queue>().allocate(thread_count);
	for(int i=0;i<thread_count;i++)
			callocator<con_queue>().construct(job_queues + i);
	con_queue send_queue;
	//initialize job queues
	int interval = (int)ceil((double)prob->l/thread_count);
	int thread_id = 0;
	for(int i=0;i<l;i++)
	{
		data_node *copy_x = nullptr;
		copy_x = scheduler->pop();
		if((i!=0)&&(i%interval==0))
			thread_id++;
		job_queues[thread_id].push(copy_x);
	}

	//the first function
	auto QMatrix = [&](struct data_node *copy_x)->void{//{{{
		int i = 0;
		int global_index = copy_x->global_index;
		for(i=0;i<l;i++)
		{
			rksvm_node *s = prob->x[i];
			rksvm_node *t = copy_x->x;
			Q[global_index + i*global_l] = k_function(s,t,*param);
		}
		return;
	};//}}}	

	//the second function
	auto computer_fun = [&](int thread_id)->void{///{{{
		count_setup_threads++;
		while(count_setup_threads < thread_count)
		{
			std::this_thread::yield();
		}
		while(true)
		{
			if(computed_data_nodes == global_l)
				break;
			data_node *copy_x = nullptr;
			bool success = job_queues[thread_id].try_pop(copy_x);
			if(success)
			{
				if(copy_x->first_time)
				{
					QMatrix(copy_x);
					computed_data_nodes++;
					if(nr_ranks==1)
					{
						int lth = copy_x->length;
						callocator<rksvm_node>().deallocate(copy_x->x, lth);
						callocator<data_node>().destroy(copy_x);
						callocator<data_node>().deallocate(copy_x,1);
					}
					else
					{
						copy_x->first_time = false;
						send_queue.push(copy_x);
						flag_send_ready = true;
					}
				}
				else
				{
					QMatrix(copy_x);
					computed_data_nodes++;
					copy_x->current_rank = current_rank;
					int next_rank = cyclic_loading_rank(copy_x->current_rank, nr_ranks);
					if(next_rank==copy_x->initial_rank)
					{
						int lth = copy_x->length; 
						callocator<rksvm_node>().deallocate(copy_x->x, lth);
						callocator<data_node>().destroy(copy_x);
						callocator<data_node>().deallocate(copy_x,1);
					}
					else
					{
						send_queue.push(copy_x);
					}
				}
			}
		}
		return;
	};///}}}

	//the third function
	auto sender_fun = [&]()->void{///{{{
		while(flag_send_ready == false)
		{
			std::this_thread::yield();
		}
		int lth;
		int msg_bytenum;
		while(true)
		{
			if(sended_count == nr_send[current_rank])
				break;
			data_node *copy_x = nullptr;
			bool success = send_queue.try_pop(copy_x);
			
			if(success)
			{
				int next_rank = cyclic_loading_rank(copy_x->current_rank, nr_ranks);
				if(next_rank == copy_x->initial_rank)
				{
					lth = copy_x->length; 
					callocator<rksvm_node>().deallocate(copy_x->x, lth);
					callocator<data_node>().destroy(copy_x);
					callocator<data_node>().deallocate(copy_x,1);
				}
				else
				{
					lth = copy_x->length; 
					msg_bytenum = sizeof(bool)+4*sizeof(int)+lth*sizeof(rksvm_node);
					char *send_message = sallocator<char>().allocate(msg_bytenum);
					*(reinterpret_cast<bool *>(send_message)) = copy_x->first_time;
					*(reinterpret_cast<int *>(send_message + sizeof(bool))) = copy_x->length;
					*(reinterpret_cast<int *>(send_message + sizeof(bool) + sizeof(int))) = copy_x->initial_rank;
					*(reinterpret_cast<int *>(send_message + sizeof(bool) + 2*sizeof(int))) = copy_x->current_rank;
					*(reinterpret_cast<int *>(send_message + sizeof(bool) + 3*sizeof(int))) = copy_x->global_index;
					rksvm_node *dest = reinterpret_cast<rksvm_node *>(send_message + sizeof(bool) + 4*sizeof(int));
					std::copy(copy_x->x, copy_x->x + lth, dest);
					flag_receive_ready = true;//wake this rank's receiver before the blocking synchronous send
					MPI_Ssend(send_message, msg_bytenum, MPI_CHAR, next_rank, 1, MPI_COMM_WORLD);
					//destroying
					callocator<rksvm_node>().deallocate(copy_x->x, lth);
					callocator<data_node>().destroy(copy_x);
					callocator<data_node>().deallocate(copy_x,1);
					//record the sent count
					sended_count++;
					sallocator<char>().deallocate(send_message, msg_bytenum);
				}
			}
		}
		return;
	};///}}}

	//the fourth function
	auto receiver_fun = [&]()->void{///{{{
		
		while(flag_receive_ready == false)
		{
			std::this_thread::yield();
		}
		int flag = 0;
		int src_rank;
		int lth;
		MPI_Status status;
		while(true)
		{
			if(recvd_count == nr_recv[mpi_get_rank()])
				break;
			MPI_Iprobe(MPI_ANY_SOURCE, 1, MPI_COMM_WORLD, &flag, &status);
			if(flag == 0)
			{
				std::this_thread::yield();
			}
			else
			{
				src_rank = status.MPI_SOURCE;
				int msg_size = 0; 
				MPI_Get_count(&status, MPI_CHAR, &msg_size);
				char *recv_message = sallocator<char>().allocate(msg_size);
				MPI_Recv(recv_message, msg_size, MPI_CHAR, src_rank, 1, MPI_COMM_WORLD, &status);
				//recovering
				data_node *copy_x = callocator<data_node>().allocate(1);
				copy_x->first_time = *(reinterpret_cast<bool *>(recv_message));
				copy_x->length = *(reinterpret_cast<int *>(recv_message + sizeof(bool)));
				copy_x->initial_rank = *(reinterpret_cast<int *>(recv_message + sizeof(bool) + sizeof(int)));
				copy_x->current_rank = *(reinterpret_cast<int *>(recv_message + sizeof(bool) + 2*sizeof(int)));
				copy_x->global_index = *(reinterpret_cast<int *>(recv_message + sizeof(bool) + 3*sizeof(int)));
				rksvm_node *dest = reinterpret_cast<rksvm_node *>(recv_message + sizeof(bool) + 4*sizeof(int));
				//note how copy_x->x is recovered from the raw byte buffer
				lth = copy_x->length;
				copy_x->x = callocator<rksvm_node>().allocate(lth);
				memcpy(copy_x->x, dest, (size_t)sizeof(rksvm_node)*lth);
				sallocator<char>().deallocate(recv_message, msg_size); 
				//push the item onto the job queue with the fewest entries;
				//in doing so, dynamic load balancing is achieved.
				int smallest_items_thread_id = 0;	
				auto smallest_items = job_queues[0].unsafe_size();	
				for(int i=1;i<thread_count;i++)	
				{
					auto tmp = job_queues[i].unsafe_size();		
					if(tmp < smallest_items)		
					{			
						smallest_items_thread_id = i;			
						smallest_items = tmp;		
					}	
				}
				job_queues[smallest_items_thread_id].push(copy_x);
				recvd_count++;
			}
		}
		return;
	};///}}}
	//the four functions above form the core of this routine

	//create some functional threads
	std::vector<std::thread> computers;
	std::thread *sender = nullptr;
	std::thread *receiver = nullptr;
	for (int i=0; i < thread_count; i++){
		computers.push_back(std::thread(computer_fun, i));
    }
	if(nr_ranks>1)
	{
		sender = new std::thread(sender_fun);
		receiver = new std::thread(receiver_fun);
	}
	//wait until every compute thread has finished its setup;
	//the main thread then monitors progress
	while(count_setup_threads < thread_count){
		std::this_thread::yield();
	}
	if(current_rank==0)
	{
		printf("Start to compute kernel matrix!\n");
		fflush(stdout);
	}
	//measure the time needed to compute Q
	tbb::tick_count start_time = tbb::tick_count::now();
	while(true)
	{
		if(nr_ranks==1)
		{
			if(computed_data_nodes == global_l)
				break;
		}
		else
		{
			if((computed_data_nodes==global_l)&&
				(sended_count==nr_send[current_rank])&&
				(recvd_count==nr_recv[current_rank]))
				break;
		}
	}
	MPI_Barrier(MPI_COMM_WORLD);//synchronization
	double elapsed_seconds = (tbb::tick_count::now() - start_time).seconds();
	if(current_rank==0)
	{
		printf("Computing Q has done!, the elapsed time is %f secs\n", elapsed_seconds);
		fflush(stdout);
	}

	for(auto &th: computers)
		th.join();
	//free the queues only after every compute thread has stopped polling them
	for(int i=0;i<thread_count;i++)
		callocator<con_queue>().destroy(job_queues + i);
	callocator<con_queue>().deallocate(job_queues, thread_count);
	if(nr_ranks > 1)
	{
		sender->join();
		receiver->join();
		delete sender;
		delete receiver;
	}
	return;
}
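
Every routing decision above goes through cyclic_loading_rank(), and a data node is freed once the next hop would be its initial_rank, i.e. after it has visited every rank exactly once. That termination test is consistent with a simple ring topology, so a plausible sketch is (an assumption, not the verified rksvm helper):

// pass each data node to the next rank in a ring, wrapping at nr_ranks
static int cyclic_loading_rank(int current_rank, int nr_ranks)
{
	return (current_rank + 1) % nr_ranks;
}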