int main(int argc, char *argv[])
{
    time_t now = time(0);
    int rank, size;
    SRAssembler* srassembler = NULL;
    try {
        mpi_init(argc, argv);
        size = mpi_get_size();
        rank = mpi_get_rank();
        srassembler = SRAssembler::getInstance(rank);
        int ret = srassembler->init(argc, argv, rank, size);
        if (ret == -1) {
            throw -1;
        }
        srassembler->do_preprocessing();
        srassembler->do_walking();
    } catch (int e) {
        mpi_code code;
        code.action = ACTION_EXIT;
        code.value1 = 0;
        code.value2 = 0;
        mpi_bcast(get_mpi_code_value(code));
        finalized();
        return -1;
    }
    finalized();
    if (rank == 0) {
        string str = "Execution time: " + int2str(time(0) - now) + " seconds";
        srassembler->get_logger()->info(str);
    }
    return 0;
}
static void info(const char *fmt, ...)
{
    // only rank 0 prints
    if(mpi_get_rank() != 0)
        return;
    char buf[BUFSIZ];
    va_list ap;
    va_start(ap, fmt);
    vsnprintf(buf, sizeof(buf), fmt, ap); // bounded, so long messages cannot overflow buf
    va_end(ap);
    (*rksvm_print_string)(buf);
}
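/* The info() helper above writes through rksvm_print_string, which is not part
 * of this listing. The following is only a minimal sketch, assuming the usual
 * LIBSVM-style output hook: a file-scope function pointer that defaults to
 * printing on stdout and can be reassigned to redirect or silence output. A
 * definition like this would normally precede info(). */
static void print_string_stdout(const char *s)
{
    fputs(s, stdout);
    fflush(stdout);
}
static void (*rksvm_print_string)(const char *) = &print_string_stdout;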
int main(int argc, char **argv)
{
    // set up MPI
    int threadprovided;
    MPI_Init_thread(&argc, &argv, MPI_THREAD_MULTIPLE, &threadprovided);
    if(threadprovided != MPI_THREAD_MULTIPLE)
    {
        printf("MPI multiple thread isn't provided!\n");
        fflush(stdout);
        mpi_exit(1);
    }
    int current_rank = mpi_get_rank();
    int nr_ranks = mpi_get_size();
    param.nr_ranks = nr_ranks;
    char hostname[1024];
    int hostname_len;
    MPI_Get_processor_name(hostname, &hostname_len);
    printf("processor name: %s, number of processes: %d, rank: %d\n", hostname, nr_ranks, current_rank);
    fflush(stdout);

    int global_l;
    char input_file_name[1024];
    char model_file_name[1024];
    const char *error_msg;
    parse_command_line(argc, argv, input_file_name, model_file_name);

    // set the number of threads for the shared-memory system
    int nr_threads = param.thread_count;
    int max_thread_count = omp_get_max_threads();
    if(nr_threads > max_thread_count)
    {
        printf("[rank %d], please enter the correct number of threads: 1~%d\n", current_rank, max_thread_count);
        mpi_exit(1);
    }
    omp_set_num_threads(nr_threads);

    // set the CPU affinity
    /*int ithread, err, cpu;
    cpu_set_t cpu_mask;
    #pragma omp parallel private(ithread, cpu_mask, err, cpu)
    {
        ithread = omp_get_thread_num();
        CPU_ZERO(&cpu_mask);         // set mask to zero
        CPU_SET(ithread, &cpu_mask); // set mask with ithread
        err = sched_setaffinity((pid_t)0, sizeof(cpu_mask), &cpu_mask);
        cpu = sched_getcpu();
        printf("thread_id %d on CPU %d\n", ithread, cpu);
    }*/

    // now, read the problem from the input file
    read_problem(input_file_name);
    error_msg = rksvm_check_parameter(&prob, &param);
    if(error_msg)
    {
        fprintf(stderr, "ERROR: %s\n", error_msg);
        mpi_exit(1);
    }

    // distributed code: sum the local instance counts into the global count
    global_l = prob.l;
    mpi_allreduce(&global_l, 1, MPI_INT, MPI_SUM);
    prob.global_l = global_l;
    printf("#local instances = %d, #global instances = %d\n", prob.l, prob.global_l);
    fflush(stdout);

    if(current_rank == 0)
    {
        puts("Start to train!");
    }
    model = rksvm_train(&prob, &param);
    if(rksvm_save_model(model_file_name, model))
    {
        fprintf(stderr, "[rank %d] can't save model to file %s\n", mpi_get_rank(), model_file_name);
        mpi_exit(1);
    }
    rksvm_free_and_destroy_model(&model);
    free(prob.y);
    free(prob.x);
    free(prob.query);
    free(x_space);
    free(prob.length_of_each_rksvm_node);
    free(line);
    MPI_Finalize();
    return 0;
}
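/* A hypothetical launch of this training driver (the binary name and argument
 * order are assumptions; parse_command_line() defines the actual options).
 * Each MPI rank reads its own local slice of the training data, and the local
 * instance counts are summed with mpi_allreduce:
 *
 *   mpirun -np 4 ./rksvm-train training_file model_file
 */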
void exit_input_error(int line_num)
{
    fprintf(stderr, "[rank %d] Wrong input format at line %d\n", mpi_get_rank(), line_num);
    mpi_exit(1);
}
void read_problem(const char *filename)
{
    long int elements, max_index, inst_max_index, i, j, k;
    FILE *fp = fopen(filename, "r");
    char *endptr;
    char *idx, *val, *label;

    if(fp == NULL)
    {
        fprintf(stderr, "can't open input file %s\n", filename);
        mpi_exit(1);
    }

    prob.l = 0;
    elements = 0;

    max_line_len = 1024;
    line = Malloc(char, max_line_len);
    while(readline(fp) != NULL)
    {
        char *p = strtok(line, " \t"); // label

        // features
        while(1)
        {
            p = strtok(NULL, " \t");
            if(p == NULL || *p == '\n') // check '\n' as ' ' may be after the last feature
                break;
            ++elements;
        }
        ++elements;
        ++prob.l;
    }
    rewind(fp);

    prob.y = Malloc(double, prob.l);
    prob.x = Malloc(struct rksvm_node *, prob.l);
    prob.query = Malloc(int, prob.l);
    x_space = Malloc(struct rksvm_node, elements);
    prob.length_of_each_rksvm_node = Malloc(int, prob.l);

    max_index = 0;
    j = 0;
    k = 0;
    for(i = 0; i < prob.l; i++)
    {
        prob.query[i] = 0;
        inst_max_index = -1; // strtol gives 0 if wrong format, and precomputed kernel has <index> start from 0
        readline(fp);
        prob.x[i] = &x_space[j];
        label = strtok(line, " \t\n");
        if(label == NULL) // empty line
            exit_input_error(i+1);

        prob.y[i] = strtod(label, &endptr);
        if(endptr == label || *endptr != '\0')
            exit_input_error(i+1);

        while(1)
        {
            idx = strtok(NULL, ":");
            val = strtok(NULL, " \t");

            if(val == NULL)
                break;

            if(!strcmp(idx, "qid"))
            {
                errno = 0;
                prob.query[i] = (int) strtol(val, &endptr, 10);
                if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
                    exit_input_error(i+1);
            }
            else
            {
                errno = 0;
                x_space[j].index = (int) strtol(idx, &endptr, 10);
                if(endptr == idx || errno != 0 || *endptr != '\0' || x_space[j].index <= inst_max_index)
                    exit_input_error(i+1);
                else
                    inst_max_index = x_space[j].index;

                errno = 0;
                x_space[j].value = strtod(val, &endptr);
                if(endptr == val || errno != 0 || (*endptr != '\0' && !isspace(*endptr)))
                    exit_input_error(i+1);

                ++j;
            }
        }

        if(inst_max_index > max_index)
            max_index = inst_max_index;
        x_space[j++].index = -1;
        prob.length_of_each_rksvm_node[i] = (int)(j-k);
        k = j;
    }

    if(param.gamma == 0 && max_index > 0)
        param.gamma = 1.0/max_index;

    if(param.kernel_type == PRECOMPUTED)
        for(i = 0; i < prob.l; i++)
        {
            if(prob.x[i][0].index != 0)
            {
                fprintf(stderr, "[rank %d] Wrong input format: first column must be 0:sample_serial_number\n", mpi_get_rank());
                mpi_exit(1);
            }
            if((int)prob.x[i][0].value <= 0 || (int)prob.x[i][0].value > max_index)
            {
                fprintf(stderr, "[rank %d] Wrong input format: sample_serial_number out of range\n", mpi_get_rank());
                mpi_exit(1);
            }
        }

    fclose(fp);
}
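/* For reference, read_problem() parses LIBSVM-style ranking data: one instance
 * per line, consisting of a target value, an optional qid:<query id> pair, and
 * index:value features with strictly increasing indices. A hypothetical line
 * (not taken from any original data set) would look like:
 *
 *   2 qid:7 1:0.5 3:1.25 10:-0.7
 *
 * Each parsed instance is terminated in x_space by a sentinel node whose
 * index is -1.
 */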
l2r_rank_fun::l2r_rank_fun(const rksvm_problem *prob, const rksvm_parameter *param, Scheduler *scheduler, struct SolutionInfo *si)
{
    this->si = si;
    this->param = param;
    si->rho = 0;
    si->upper_bound_p = INF;
    si->upper_bound_n = INF;
    int l = prob->l;
    this->prob = prob;
    this->C = param->C;
    this->thread_count = param->thread_count;
    this->current_rank = mpi_get_rank();
    this->global_l = prob->global_l;
    z = new double[l];
    int i, j, k;
    perm = new int[l];
    group_queries(prob, &nr_subset, &start, &count, perm);
    pi = new id_and_value*[nr_subset];
    #pragma omp parallel for default(shared) if(nr_subset > 50)
    for(int i = 0; i < nr_subset; i++)
    {
        pi[i] = new id_and_value[count[i]];
    }
    double *y = prob->y;
    int_y = new int[prob->l];
    nr_class = new int[nr_subset];
    l_plus = new int[l];
    l_minus = new int[l];
    gamma_plus = new double[l];
    gamma_minus = new double[l];
    ATAQb = new double[l];
    ATe = new double[l];

    // the variables we have changed
    this->scheduler = scheduler;
    this->local_l = scheduler->local_l;
    this->start_ptr = scheduler->start_ptr;
    //this->nr_recv = scheduler->nr_recv;
    //this->nr_send = scheduler->nr_send;
    gz = new double[global_l];
    //gATAQb = new double[global_l];
    //gATe = new double[global_l];
    Q = new double[l*global_l];
    // here, Q is computed through the TBB library
    nomad_fun(prob, param, scheduler, Q);
    // testing Q
    //char *file = "/home/jing/model/Q.txt";
    //save_Q(file, prob, Q);
    //mpi_exit(1);

    #pragma omp parallel for default(shared) private(i,j,k)
    for(i = 0; i < nr_subset; i++)
    {
        k = 1;
        for(j = 0; j < count[i]; j++)
        {
            pi[i][j].id = perm[j+start[i]];
            pi[i][j].value = y[perm[j+start[i]]];
        }
        // sort each query's instances, then assign integer rank labels int_y
        qsort(pi[i], count[i], sizeof(id_and_value), compare_id_and_value);
        int_y[pi[i][count[i]-1].id] = 1;
        for(j = count[i]-2; j >= 0; j--)
        {
            if(pi[i][j].value > pi[i][j+1].value)
                k++;
            int_y[pi[i][j].id] = k;
        }
        nr_class[i] = k;
    }
}
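/* compare_id_and_value() is used by the constructor above but is not part of
 * this listing. A minimal sketch, assuming it orders instances by decreasing
 * target value, which is what the int_y labelling loop relies on: the last
 * element after sorting gets rank 1, and the rank increases whenever the value
 * strictly increases towards the front of the array. */
static int compare_id_and_value(const void *a, const void *b)
{
    const id_and_value *ia = (const id_and_value *)a;
    const id_and_value *ib = (const id_and_value *)b;
    if(ia->value < ib->value)
        return 1;  // smaller values sort towards the end
    if(ia->value > ib->value)
        return -1;
    return 0;
}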
void nomad_fun(const rksvm_problem *prob, const rksvm_parameter *param, Scheduler *scheduler, double *Q)
{
    int l = prob->l;
    int global_l = prob->global_l;
    int thread_count = param->thread_count;
    int nr_ranks = param->nr_ranks;
    int current_rank = mpi_get_rank();
    int *nr_send = scheduler->nr_send;
    int *nr_recv = scheduler->nr_recv;

    // atomic counters
    atomic<int> count_setup_threads;
    count_setup_threads = 0;
    atomic<int> computed_data_nodes; // number of data nodes whose kernel column has been computed
    computed_data_nodes = 0;
    atomic<int> sended_count;        // number of data nodes that have been sent
    sended_count = 0;
    atomic<int> recvd_count;         // number of data nodes that have been received
    recvd_count = 0;
    // two auxiliary atomic flags for both sending and receiving
    atomic<bool> flag_send_ready;
    flag_send_ready = false;
    atomic<bool> flag_receive_ready;
    flag_receive_ready = false;

    // build several job queues and one sending queue
    con_queue *job_queues = callocator<con_queue>().allocate(thread_count);
    for(int i = 0; i < thread_count; i++)
        callocator<con_queue>().construct(job_queues + i);
    con_queue send_queue;

    // initialize the job queues: split the local instances evenly across threads
    int interval = (int)ceil((double)prob->l/thread_count);
    int thread_id = 0;
    for(int i = 0; i < l; i++)
    {
        data_node *copy_x = nullptr;
        copy_x = scheduler->pop();
        if((i != 0) && (i%interval == 0))
            thread_id++;
        job_queues[thread_id].push(copy_x);
    }

    // the first function: fill one column of Q against all local instances
    auto QMatrix = [&](struct data_node *copy_x)->void{//{{{
        int i = 0;
        int global_index = copy_x->global_index;
        for(i = 0; i < l; i++)
        {
            rksvm_node *s = prob->x[i];
            rksvm_node *t = copy_x->x;
            Q[global_index + i*global_l] = k_function(s, t, *param);
        }
        return;
    };//}}}

    // the second function: worker threads compute kernel columns and forward data nodes
    auto computer_fun = [&](int thread_id)->void{///{{{
        count_setup_threads++;
        while(count_setup_threads < thread_count)
        {
            std::this_thread::yield();
        }
        while(true)
        {
            if(computed_data_nodes == global_l)
                break;
            data_node *copy_x = nullptr;
            bool success = job_queues[thread_id].try_pop(copy_x);
            if(success)
            {
                if(copy_x->first_time)
                {
                    QMatrix(copy_x);
                    computed_data_nodes++;
                    if(nr_ranks == 1)
                    {
                        int lth = copy_x->length;
                        callocator<rksvm_node>().deallocate(copy_x->x, lth);
                        callocator<data_node>().destroy(copy_x);
                        callocator<data_node>().deallocate(copy_x, 1);
                    }
                    else
                    {
                        copy_x->first_time = false;
                        send_queue.push(copy_x);
                        flag_send_ready = true;
                    }
                }
                else
                {
                    QMatrix(copy_x);
                    computed_data_nodes++;
                    copy_x->current_rank = current_rank;
                    int next_rank = cyclic_loading_rank(copy_x->current_rank, nr_ranks);
                    if(next_rank == copy_x->initial_rank)
                    {
                        int lth = copy_x->length;
                        callocator<rksvm_node>().deallocate(copy_x->x, lth);
                        callocator<data_node>().destroy(copy_x);
                        callocator<data_node>().deallocate(copy_x, 1);
                    }
                    else
                    {
                        send_queue.push(copy_x);
                    }
                }
            }
        }
        return;
    };///}}}

    // the third function: the sender thread ships finished data nodes to the next rank
    auto sender_fun = [&]()->void{///{{{
        while(flag_send_ready == false)
        {
            std::this_thread::yield();
        }
        int lth;
        int msg_bytenum;
        while(true)
        {
            if(sended_count == nr_send[current_rank])
                break;
            data_node *copy_x = nullptr;
            bool success = send_queue.try_pop(copy_x);
            if(success)
            {
                int next_rank = cyclic_loading_rank(copy_x->current_rank, nr_ranks);
                if(next_rank == copy_x->initial_rank)
                {
                    lth = copy_x->length;
                    callocator<rksvm_node>().deallocate(copy_x->x, lth);
                    callocator<data_node>().destroy(copy_x);
                    callocator<data_node>().deallocate(copy_x, 1);
                }
                else
                {
                    // serialize the data node into one contiguous byte buffer
                    lth = copy_x->length;
                    msg_bytenum = sizeof(bool) + 4*sizeof(int) + lth*sizeof(rksvm_node);
                    char *send_message = sallocator<char>().allocate(msg_bytenum);
                    *(reinterpret_cast<bool *>(send_message)) = copy_x->first_time;
                    *(reinterpret_cast<int *>(send_message + sizeof(bool))) = copy_x->length;
                    *(reinterpret_cast<int *>(send_message + sizeof(bool) + sizeof(int))) = copy_x->initial_rank;
                    *(reinterpret_cast<int *>(send_message + sizeof(bool) + 2*sizeof(int))) = copy_x->current_rank;
                    *(reinterpret_cast<int *>(send_message + sizeof(bool) + 3*sizeof(int))) = copy_x->global_index;
                    rksvm_node *dest = reinterpret_cast<rksvm_node *>(send_message + sizeof(bool) + 4*sizeof(int));
                    std::copy(copy_x->x, copy_x->x + lth, dest);
                    flag_receive_ready = true;
                    MPI_Ssend(send_message, msg_bytenum, MPI_CHAR, next_rank, 1, MPI_COMM_WORLD);
                    // destroy the local copy
                    callocator<rksvm_node>().deallocate(copy_x->x, lth);
                    callocator<data_node>().destroy(copy_x);
                    callocator<data_node>().deallocate(copy_x, 1);
                    // record the sent count
                    sended_count++;
                    sallocator<char>().deallocate(send_message, msg_bytenum);
                }
            }
        }
        return;
    };///}}}

    // the fourth function: the receiver thread unpacks incoming data nodes
    auto receiver_fun = [&]()->void{///{{{
        while(flag_receive_ready == false)
        {
            std::this_thread::yield();
        }
        int flag = 0;
        int src_rank;
        int lth;
        MPI_Status status;
        while(true)
        {
            if(recvd_count == nr_recv[mpi_get_rank()])
                break;
            MPI_Iprobe(MPI_ANY_SOURCE, 1, MPI_COMM_WORLD, &flag, &status);
            if(flag == 0)
            {
                std::this_thread::yield();
            }
            else
            {
                src_rank = status.MPI_SOURCE;
                int msg_size = 0;
                MPI_Get_count(&status, MPI_CHAR, &msg_size);
                char *recv_message = sallocator<char>().allocate(msg_size);
                MPI_Recv(recv_message, msg_size, MPI_CHAR, src_rank, 1, MPI_COMM_WORLD, &status);
                // recover the fields in the same order they were packed
                data_node *copy_x = callocator<data_node>().allocate(1);
                copy_x->first_time = *(reinterpret_cast<bool *>(recv_message));
                copy_x->length = *(reinterpret_cast<int *>(recv_message + sizeof(bool)));
                copy_x->initial_rank = *(reinterpret_cast<int *>(recv_message + sizeof(bool) + sizeof(int)));
                copy_x->current_rank = *(reinterpret_cast<int *>(recv_message + sizeof(bool) + 2*sizeof(int)));
                copy_x->global_index = *(reinterpret_cast<int *>(recv_message + sizeof(bool) + 3*sizeof(int)));
                rksvm_node *dest = reinterpret_cast<rksvm_node *>(recv_message + sizeof(bool) + 4*sizeof(int));
                // note how copy_x->x is recovered: a fresh buffer plus a raw copy
                lth = copy_x->length;
                copy_x->x = callocator<rksvm_node>().allocate(lth);
                memcpy(copy_x->x, dest, (size_t)sizeof(rksvm_node)*lth);
                sallocator<char>().deallocate(recv_message, msg_size);
                // push the item onto the job queue that currently holds the fewest items,
                // so that dynamic load balancing is achieved
                int smallest_items_thread_id = 0;
                auto smallest_items = job_queues[0].unsafe_size();
                for(int i = 1; i < thread_count; i++)
                {
                    auto tmp = job_queues[i].unsafe_size();
                    if(tmp < smallest_items)
                    {
                        smallest_items_thread_id = i;
                        smallest_items = tmp;
                    }
                }
                job_queues[smallest_items_thread_id].push(copy_x);
                recvd_count++;
            }
        }
        return;
    };///}}}

    // note that the functions above are the core of this routine
    // create the worker, sender, and receiver threads
    std::vector<std::thread> computers;
    std::thread *sender = nullptr;
    std::thread *receiver = nullptr;
    for(int i = 0; i < thread_count; i++)
    {
        computers.push_back(std::thread(computer_fun, i));
    }
    if(nr_ranks > 1)
    {
        sender = new std::thread(sender_fun);
        receiver = new std::thread(receiver_fun);
    }

    // wait until data loading and initialization have finished;
    // the main thread only monitors progress
    while(count_setup_threads < thread_count)
    {
        std::this_thread::yield();
    }
    if(current_rank == 0)
    {
        printf("Start to compute kernel matrix!\n");
        fflush(stdout);
    }
    // measure the time used to compute Q
    tbb::tick_count start_time = tbb::tick_count::now();
    while(true)
    {
        if(nr_ranks == 1)
        {
            if(computed_data_nodes == global_l)
                break;
        }
        else
        {
            if((computed_data_nodes == global_l) &&
               (sended_count == nr_send[current_rank]) &&
               (recvd_count == nr_recv[current_rank]))
                break;
        }
    }
    MPI_Barrier(MPI_COMM_WORLD); // synchronization
    double elapsed_seconds = (tbb::tick_count::now() - start_time).seconds();
    if(current_rank == 0)
    {
        printf("Computing Q is done; the elapsed time is %f secs\n", elapsed_seconds);
        fflush(stdout);
    }

    for(auto &th: computers)
        th.join();
    if(nr_ranks > 1)
    {
        sender->join();
        receiver->join();
        delete sender;
        delete receiver;
    }
    // release the job queues only after all threads have stopped using them
    for(int i = 0; i < thread_count; i++)
        callocator<con_queue>().destroy(job_queues + i);
    callocator<con_queue>().deallocate(job_queues, thread_count);
    return;
}
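/* cyclic_loading_rank() is called by nomad_fun() but is not defined in this
 * listing. A minimal sketch, assuming data nodes simply travel around the
 * ranks in a ring, so every rank computes against every global instance
 * exactly once before the node returns to its initial rank and is freed: */
static inline int cyclic_loading_rank(int current_rank, int nr_ranks)
{
    return (current_rank + 1) % nr_ranks;
}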