/*
 * Handle a "missing ack" message: a peer supplies the record for a view
 * stamp this node asked about.
 *
 * comp: the local consensus component.
 * data: points to a missing_ack whose payload (msg->data) is a request_record.
 *
 * If the view stamp is already covered by highest_committed_vs, or a record
 * is already stored under its key, the message is ignored.  Otherwise a local
 * copy of the record is stored and try_to_execute() is invoked.
 */
static void handle_missing_ack(consensus_component* comp,void* data){
    missing_ack* msg = data;
    request_record* origin = (request_record*)msg->data;

    SYS_LOG(comp,"Node %d Handle Missing Ack From Node %d.\n",
            comp->node_id,msg->node_id);

    /* Nothing to do for a view stamp that is not beyond the committed one. */
    if(view_stamp_comp(comp->highest_committed_vs,&msg->missing_vs)>=0){
        return;
    }

    db_key_type record_no = vstol(&msg->missing_vs);
    request_record* record_data = NULL;
    size_t data_size;
    retrieve_record(comp->db_ptr,sizeof(record_no),&record_no,&data_size,(void**)&record_data);
    if(record_data!=NULL){
        /* A record is already stored under this key. */
        return;
    }

    /* calloc instead of malloc: fields this handler does not set explicitly
     * are stored as zero instead of indeterminate bytes. */
    record_data = calloc(1,REQ_RECORD_SIZE(origin));
    if(record_data==NULL){
        return;
    }
    gettimeofday(&record_data->created_time,NULL);
    record_data->data_size = origin->data_size;
    memcpy(record_data->data,origin->data,origin->data_size);
    store_record(comp->db_ptr,sizeof(record_no),&record_no,REQ_RECORD_SIZE(record_data),record_data);

    /* The temporary copy was previously leaked.
     * NOTE(review): assumes store_record copies the buffer rather than
     * retaining the pointer - confirm against its implementation. */
    free(record_data);

    try_to_execute(comp);
}
/*
 * Handle an accept ack sent by another node.
 *
 * comp: the local consensus component.
 * data: points to an accept_ack message.
 *
 * The ack is only processed when this node is the leader and the acked view
 * stamp is beyond highest_committed_vs; in that case the stored record is
 * updated with the sender's id and written back.  try_to_execute() is called
 * on every path.
 */
static void handle_accept_ack(consensus_component* comp,void* data){
    accept_ack* ack = data;

    SYS_LOG(comp,"Node %d Handle Accept Ack From Node %u.\n",
            comp->node_id,ack->node_id);

    /* A non-leader ignores every accept ack: it can only be a message from a
     * previous view.  Acks for already-committed view stamps are ignored as
     * well. */
    if(comp->my_role==LEADER &&
       view_stamp_comp(&ack->msg_vs,comp->highest_committed_vs)>0){
        db_key_type key = vstol(&ack->msg_vs);
        request_record* rec = NULL;
        size_t rec_size;

        retrieve_record(comp->db_ptr,sizeof(key),&key,&rec_size,(void**)&rec);
        if(rec!=NULL){
            update_record(rec,ack->node_id);
            /* Whether the write succeeds is deliberately not checked; a
             * failure is treated like a lost message. */
            store_record(comp->db_ptr,sizeof(key),&key,REQ_RECORD_SIZE(rec),rec);
        }else{
            SYS_LOG(comp,"Received Ack To Non-Exist Record %lu.\n", key);
        }
    }

    try_to_execute(comp);
}
// leader has another responsibility to update the highest request that can be executed, // and if the leader is also synchronous, it can execute the record in this stage static void leader_try_to_execute(consensus_component* comp){ SYS_LOG(comp, "highest_seen_req_id %lu.\n", comp->highest_seen_vs->req_id); SYS_LOG(comp, "highest_seen_view_id %lu.\n", comp->highest_seen_vs->view_id); SYS_LOG(comp, "highest_to_commit_vs_req_id %lu.\n", comp->highest_to_commit_vs->req_id); SYS_LOG(comp, "highest_to_commit_vs_view_id %lu.\n", comp->highest_to_commit_vs->view_id); db_key_type start; db_key_type end = vstol(comp->highest_seen_vs);; size_t data_size; view_stamp temp_boundary; view_boundary* boundary_record = NULL; if(comp->highest_seen_vs->view_id != comp->highest_to_commit_vs->view_id){ // address the boundary assert(comp->highest_to_commit_vs->view_id + 1 == comp->highest_seen_vs->view_id); comp->highest_to_commit_vs->view_id += 1; comp->highest_to_commit_vs->req_id = 0; comp->highest_committed_vs->view_id = comp->highest_to_commit_vs->view_id; comp->highest_committed_vs->req_id = comp->highest_to_commit_vs->req_id; start = vstol(comp->highest_to_commit_vs); } else{ start = vstol(comp->highest_to_commit_vs)+1; } int exec_flag = (!view_stamp_comp(comp->highest_committed_vs,comp->highest_to_commit_vs)); request_record* record_data = NULL; SYS_LOG(comp,"The Leader Tries To Execute.\n"); SYS_LOG(comp,"The Start Value Is %lu.\n",start); SYS_LOG(comp,"The End Value Is %lu.\n",end); for(db_key_type index=start;index<=end;index++){ retrieve_record(comp->db_ptr,sizeof(index),&index,&data_size,(void**)&record_data); assert(record_data!=NULL && "The Record Should Be Inserted By The Node Itself!"); if(reached_quorum(record_data,comp->group_size)){ view_stamp temp = ltovs(index); SYS_LOG(comp,"Node %d : View Stamp %u : %u Has Reached Quorum.\n", comp->node_id,temp.view_id,temp.req_id); SYS_LOG(comp,"Before Node %d Inc Execute %u : %u.\n", comp->node_id, 
comp->highest_to_commit_vs->view_id, comp->highest_to_commit_vs->req_id); view_stamp_inc(comp->highest_to_commit_vs); SYS_LOG(comp,"After Node %d Inc Execute %u : %u.\n", comp->node_id, comp->highest_to_commit_vs->view_id, comp->highest_to_commit_vs->req_id); if(exec_flag){ view_stamp vs = ltovs(index); deliver_msg_data(comp,&vs); view_stamp_inc(comp->highest_committed_vs); } }else{ return; } } }
/*
 * Handle a force-exec message.
 *
 * comp: the local consensus component.
 * data: points to a force_exec message.
 *
 * Messages whose sender is not the current view's leader are ignored.  If
 * the message's highest_committed_op is ahead of highest_to_commit_vs, the
 * local stamp is overwritten with it and try_to_execute() is called.
 */
static void handle_force_exec(consensus_component* comp,void* data){
    force_exec* req = data;

    if(req->node_id!=comp->cur_view->leader_id){
        return;
    }
    if(view_stamp_comp(comp->highest_to_commit_vs,&req->highest_committed_op)>=0){
        return;
    }

    *(comp->highest_to_commit_vs)=req->highest_committed_op;
    try_to_execute(comp);
}
void consensus_make_progress(struct consensus_component_t* comp){ if(LEADER!=comp->my_role){ goto make_progress_exit; } leader_try_to_execute(comp); SYS_LOG(comp,"Let's Make Progress.\n"); if((view_stamp_comp(comp->highest_committed_vs,comp->highest_seen_vs)<0)&& (comp->highest_seen_vs->view_id==comp->cur_view->view_id)){ view_stamp temp; temp.view_id = comp->cur_view->view_id; temp.req_id = 0; if(view_stamp_comp(&temp,comp->highest_committed_vs)<0){ temp = *(comp->highest_committed_vs); } temp.req_id++; record_index_type start = vstol(&temp); record_index_type end = vstol(comp->highest_seen_vs); for(record_index_type index = start;index<=end;index++){ request_record* record_data = NULL; size_t data_size=0; view_stamp temp_vs = ltovs(index); retrieve_record(comp->db_ptr,sizeof(db_key_type),&index,&data_size,(void**)&record_data); if(!reached_quorum(record_data,comp->group_size)){ accept_req* msg = build_accept_req(comp,REQ_RECORD_SIZE(record_data),record_data,&temp_vs); if(NULL==msg){ continue; }else{ comp->uc(comp->my_node,ACCEPT_REQ_SIZE(msg),msg,-1); free(msg); } } } } force_exec* msg = build_force_exec(comp); if(NULL==msg){goto make_progress_exit;} comp->uc(comp->my_node,FORCE_EXEC_SIZE,msg,-1); free(msg); make_progress_exit: return; };
void *handle_accept_req(void* arg) { consensus_component* comp = arg; db_key_type start; db_key_type end; db_key_type index; size_t data_size; request_record* retrieve_data = NULL; int sock; struct timeval start_time; struct timeval end_time; unsigned long e_usec; while (1) { log_entry* new_entry = (log_entry*)((char*)shared_memory.shm[comp->node_id] + shared_memory.log->tail); if (new_entry->req_canbe_exed.view_id != 0)//TODO atmoic opeartion { gettimeofday(&start, 0); if(new_entry->msg_vs.view_id < comp->cur_view.view_id){ // TODO //goto reloop; } // if we this message is not from the current leader if(new_entry->msg_vs.view_id == comp->cur_view.view_id && new_entry->node_id != comp->cur_view.leader_id){ // TODO //goto reloop; } // update highest seen request if(view_stamp_comp(new_entry->msg_vs, comp->highest_seen_vs) > 0){ comp->highest_seen_vs = new_entry->msg_vs; } db_key_type record_no = vstol(new_entry->msg_vs); request_record* record_data = (request_record*)malloc(new_entry->data_size + sizeof(request_record)); gettimeofday(&record_data->created_time, NULL); record_data->data_size = new_entry->data_size; memcpy(record_data->data, new_entry->data, new_entry->data_size); // record the data persistently store_record(comp->db_ptr, sizeof(record_no), &record_no, REQ_RECORD_SIZE(record_data), record_data); uint64_t offset = shared_memory.tail + sizeof(accept_ack) * comp->node_id; shared_memory.tail = shared_memory.tail + log_entry_len(new_entry); accept_ack* reply = (accept_ack*)((char*)new_entry + ACCEPT_ACK_SIZE * comp->node_id); reply->node_id = comp->node_id; reply->msg_vs.view_id = new_entry->msg_vs.view_id; reply->msg_vs.req_id = new_entry->msg_vs.req_id; memcpy((void*)((char*)shared_memory.shm[new_entry->node_id] + offset), reply, ACCEPT_ACK_SIZE); free(record_data); if(view_stamp_comp(new_entry->req_canbe_exed, comp->committed) > 0) { sock = socket(AF_INET, SOCK_STREAM, 0); connect(sock, (struct sockaddr*)&comp->sys_addr.c_addr, 
comp->sys_addr.c_sock_len); //TODO: why? Broken pipe. Maybe the server closes the socket start = vstol(comp->committed)+1; end = vstol(new_entry->req_canbe_exed); for(index = start; index <= end; index++) { retrieve_record(comp->db_ptr, sizeof(index), &index, &data_size, (void**)&retrieve_data); send(sock, retrieve_data->data, retrieve_data->data_size, 0); } comp->committed = new_entry->req_canbe_exed; } gettimeofday(&end, 0); e_usec = ((end.tv_sec * 1000000) + end.tv_usec) - ((start.tv_sec * 1000000) + start.tv_usec); CON_LOG(comp, "%lu\n", e_usec); } } };
/*
 * Handle an accept request.
 *
 * comp: the local consensus component.
 * data: points to an accept_req whose payload (msg->data) is a
 *       request_record.
 *
 * The request is dropped when it belongs to an older view, when it claims
 * the current view but was not sent by the current leader, or when its view
 * stamp is already covered by highest_committed_vs.  Otherwise
 * highest_seen_vs / highest_to_commit_vs are raised as needed, the record is
 * stored, and an accept ack is sent back to the sender.
 * try_to_execute() runs on every path.
 */
static void handle_accept_req(consensus_component* comp,void* data){
    SYS_LOG(comp,"Node %d Handle Accept Req.\n",
            comp->node_id);

    accept_req* msg = data;

    if(msg->msg_vs.view_id< comp->cur_view->view_id){
        goto handle_accept_req_exit;
    }
    // the message claims the current view but is not from the current leader
    if(msg->msg_vs.view_id == comp->cur_view->view_id &&
       msg->node_id!=comp->cur_view->leader_id){
        SYS_LOG(comp, "Msg come from node %ld, which is not the current leader %ld.\n", msg->node_id, comp->cur_view->leader_id);
        goto handle_accept_req_exit;
    }
    // if we have committed the operation, then safely ignore it
    if(view_stamp_comp(&msg->msg_vs,comp->highest_committed_vs)<=0){
        SYS_LOG(comp, "I've already committed the operation. I'll ignore this one.\n");
        goto handle_accept_req_exit;
    }

    // update highest seen request
    if(view_stamp_comp(&msg->msg_vs,comp->highest_seen_vs)>0){
        *(comp->highest_seen_vs) = msg->msg_vs;
    }

    // update highest request that can be executed
    if(view_stamp_comp(&msg->req_canbe_exed, comp->highest_to_commit_vs)>0){
        *(comp->highest_to_commit_vs) = msg->req_canbe_exed;
        SYS_LOG(comp,"Now Node %d Can Execute Request %u : %u .\n",
                comp->node_id,
                comp->highest_to_commit_vs->view_id,
                comp->highest_to_commit_vs->req_id);
    }

    db_key_type record_no = vstol(&msg->msg_vs);
    request_record* origin_data = (request_record*)msg->data;

    /* calloc instead of malloc: fields this handler does not set explicitly
     * are stored as zero instead of indeterminate bytes. */
    request_record* record_data = calloc(1,REQ_RECORD_SIZE(origin_data));
    if(record_data==NULL){
        goto handle_accept_req_exit;
    }

    gettimeofday(&record_data->created_time,NULL);
    record_data->is_closed = origin_data->is_closed;
    record_data->data_size = origin_data->data_size;
    memcpy(record_data->data,origin_data->data,
           origin_data->data_size);

    // record the data persistently
    if(store_record(comp->db_ptr,sizeof(record_no),&record_no,
                    REQ_RECORD_SIZE(record_data),record_data)!=0){
        free(record_data);
        goto handle_accept_req_exit;
    }
    /* The temporary copy was previously leaked on every path.
     * NOTE(review): assumes store_record copies the buffer rather than
     * retaining the pointer - confirm against its implementation. */
    free(record_data);

    // build the reply to the leader
    accept_ack* reply = build_accept_ack(comp,&msg->msg_vs);
    if(NULL==reply){
        goto handle_accept_req_exit;
    }
    comp->uc(comp->my_node,ACCEPT_ACK_SIZE,reply,msg->node_id);
    free(reply);

handle_accept_req_exit:
    try_to_execute(comp);
    return;
}
void *handle_accept_req(void* arg) { consensus_component* comp = arg; db_key_type start; db_key_type end; db_key_type index; dare_log_entry_t* entry; set_affinity(1); for (;;) { if (comp->cur_view->leader_id != *comp->node_id) { comp->uc(comp->up_para); entry = log_get_entry(SRV_DATA->log, &SRV_DATA->log->end); if (entry->data_size != 0) { char* dummy = (char*)((char*)entry + log_entry_len(entry) - 1); if (*dummy == DUMMY_END) // atmoic opeartion { #ifdef MEASURE_LATENCY clock_handler c_k; clock_init(&c_k); clock_add(&c_k); #endif if(entry->msg_vs.view_id < comp->cur_view->view_id){ // TODO //goto reloop; } // if we this message is not from the current leader if(entry->msg_vs.view_id == comp->cur_view->view_id && entry->node_id != comp->cur_view->leader_id){ // TODO //goto reloop; } // update highest seen request if(view_stamp_comp(&entry->msg_vs, comp->highest_seen_vs) > 0){ *(comp->highest_seen_vs) = entry->msg_vs; } db_key_type record_no = vstol(&entry->msg_vs); // record the data persistently request_record* record_data = (request_record*)((char*)entry + offsetof(dare_log_entry_t, data_size)); store_record(comp->db_ptr, sizeof(record_no), &record_no, REQ_RECORD_SIZE(record_data) - 1, record_data); #ifdef MEASURE_LATENCY clock_add(&c_k); #endif SRV_DATA->log->tail = SRV_DATA->log->end; SRV_DATA->log->end += log_entry_len(entry); uint32_t my_id = *comp->node_id; uint32_t offset = (uint32_t)(offsetof(dare_log_t, entries) + SRV_DATA->log->tail + ACCEPT_ACK_SIZE * my_id); accept_ack* reply = (accept_ack*)((char*)entry + ACCEPT_ACK_SIZE * my_id); reply->node_id = my_id; reply->msg_vs.view_id = entry->msg_vs.view_id; reply->msg_vs.req_id = entry->msg_vs.req_id; if (entry->type == P_OUTPUT) { // up = get_mapping_fd() is defined in ev_mgr.c int fd = comp->ug(entry->clt_id, comp->up_para); // consider entry->data as a pointer. 
uint64_t hash = get_output_hash(fd, *(long*)entry->data); reply->hash = hash; } rem_mem_t rm; dare_ib_ep_t *ep = (dare_ib_ep_t*)SRV_DATA->config.servers[entry->node_id].ep; memset(&rm, 0, sizeof(rem_mem_t)); uint32_t *send_count_ptr = &(ep->rc_ep.rc_qp.send_count); int send_flags, poll_completion = 0; if((*send_count_ptr & S_DEPTH_) == 0) send_flags = IBV_SEND_SIGNALED; else send_flags = 0; if ((*send_count_ptr & S_DEPTH_) == S_DEPTH_) poll_completion = 1; (*send_count_ptr)++; rm.raddr = ep->rc_ep.rmt_mr.raddr + offset; rm.rkey = ep->rc_ep.rmt_mr.rkey; post_send(entry->node_id, reply, ACCEPT_ACK_SIZE, IBDEV->lcl_mr, IBV_WR_RDMA_WRITE, &rm, send_flags, poll_completion); if(view_stamp_comp(&entry->req_canbe_exed, comp->highest_committed_vs) > 0) { start = vstol(comp->highest_committed_vs)+1; end = vstol(&entry->req_canbe_exed); for(index = start; index <= end; index++) { comp->ucb(index,comp->up_para); } *(comp->highest_committed_vs) = entry->req_canbe_exed; } #ifdef MEASURE_LATENCY clock_add(&c_k); clock_display(comp->sys_log_file, &c_k); #endif } } } } };