/* * get file info from every replica * return value don't needed, we just need finfos */ int read_file_info(const VUINT64& replicas, const uint64_t block_id, const uint64_t file_id, map<uint64_t, FileInfo>& finfos) { finfos.clear(); int ret = TFS_SUCCESS; VUINT64::const_iterator iter = replicas.begin(); for ( ; iter != replicas.end(); iter++) { int32_t retry = 2; while (retry--) { FileInfo info; ret = ToolUtil::read_file_info(*iter, block_id, file_id, FORCE_STAT, info); if (TFS_SUCCESS == ret) { finfos.insert(make_pair(*iter, info)); break; } else if (EXIT_META_NOT_FOUND_ERROR == ret) // file not exist, just ignore { break; } } } return (finfos.size() > 0) ? TFS_SUCCESS : TFS_ERROR; }
void print_result(const VUINT64& need_sync_block_list, const VUINT64& lost_block_list) { if (lost_block_list.size() > 0) { fprintf(stdout, "BOTH LOST BLOCK COUNT: %zd\n", lost_block_list.size()); for (uint32_t i = 0; i < lost_block_list.size(); i++) { fprintf(stdout, "%" PRI64_PREFIX "u\n", lost_block_list.at(i)); } } int32_t real_need_sync_block_size = need_sync_block_list.size() - lost_block_list.size(); if (real_need_sync_block_size > 0) { fprintf(stdout, "NEED SYNC BLOCK COUNT: %d\n", real_need_sync_block_size); set<uint64_t> lost_block_set; lost_block_set.insert(lost_block_list.begin(), lost_block_list.end()); for (uint32_t i = 0; i < need_sync_block_list.size(); i++) { uint64_t block_id = need_sync_block_list.at(i); if (lost_block_set.find(block_id) == lost_block_set.end()) { fprintf(stdout, "%" PRI64_PREFIX "u\n", block_id);// can sync from slave cluster } } } }
void recover_block_from_slave_cluster(const char* ns_addr, const char* ns_slave_addr, const VUINT64& tmp_fail_block_list, VUINT64& success_block_list, VUINT64& fail_block_list, multimap<uint64_t, uint64_t>& fail_block_file_list) { int ret = TFS_SUCCESS; VUINT64::const_iterator vit = tmp_fail_block_list.begin(); multimap<uint64_t, uint64_t> tmp_fail_block_file_list; vector<FileInfoV2> finfos; for (; vit != tmp_fail_block_list.end(); vit++) { uint64_t block_id = (*vit); finfos.clear(); ret = ToolUtil::read_file_infos_v2(Func::get_host_ip(ns_slave_addr), block_id, finfos); if (ret == TFS_SUCCESS) { bool all_success = true; int32_t copy_file_succ_count = 0; tmp_fail_block_file_list.clear(); vector<FileInfoV2>::const_iterator v_file_iter = finfos.begin(); for (; v_file_iter != finfos.end(); v_file_iter++) { if ((v_file_iter->status_ & FILE_STATUS_DELETE) != 0) continue; uint64_t file_id = v_file_iter->id_; ret = copy_file_from_slave_cluster(ns_slave_addr, ns_addr, block_id, (*v_file_iter)); if (TFS_SUCCESS == ret) { ++copy_file_succ_count; TBSYS_LOG(DEBUG, "recover block_id: %" PRI64_PREFIX "u, file_id: %" PRI64_PREFIX "u successful from slave cluster!", block_id, file_id); } else { TBSYS_LOG(WARN, "recover block_id: %" PRI64_PREFIX "u, file_id: %" PRI64_PREFIX "u failed from slave cluster, ret: %d", block_id, file_id, ret); all_success = false; tmp_fail_block_file_list.insert(pair<uint64_t, uint64_t>(block_id, file_id)); } } if (all_success) { success_block_list.push_back(block_id); } else if (0 == copy_file_succ_count) { fail_block_list.push_back(block_id); } else { fail_block_file_list.insert(tmp_fail_block_file_list.begin(), tmp_fail_block_file_list.end()); } } else { fail_block_list.push_back(block_id); } } }
int32_t elect_ds_exclude_group(const DS_WEIGHT& weights, const int32_t elect_count, int64_t& elect_seq, VUINT64& elect_ds_list) { if (elect_count == 0) { TBSYS_LOG(DEBUG, "current elect count(%d) <= 0, must be return", elect_count); return 0; } std::set < uint32_t > existlan; for (uint32_t i = 0; i < elect_ds_list.size(); ++i) { uint32_t lan = Func::get_lan(elect_ds_list[i], SYSPARAM_NAMESERVER.group_mask_); existlan.insert(lan); } //dump_weigths(weights); DS_WEIGHT::const_iterator iter = weights.begin(); int32_t need_elect_count = elect_count; TBSYS_LOG(DEBUG, "weights.size(%u), need_elect_count(%d)", weights.size(), need_elect_count); DataServerStatInfo* ds_stat_info = NULL; while (iter != weights.end() && need_elect_count > 0) { ds_stat_info = iter->second->get_ds(); uint32_t dlan = Func::get_lan(ds_stat_info->id_, SYSPARAM_NAMESERVER.group_mask_); if (existlan.find(dlan) == existlan.end()) { existlan.insert(dlan); elect_ds_list.push_back(ds_stat_info->id_); if (elect_seq > 0) iter->second->elect(++elect_seq); --need_elect_count; } ++iter; } TBSYS_LOG(DEBUG, "current elect_count(%d)", elect_count - need_elect_count); return elect_count - need_elect_count; }
/**
 * Reset this op's member table for a new set of servers.
 *
 * Records the start time, clears the done counter, and gives every member
 * slot its server id together with an invalid block id/version and the
 * EXIT_TIMEOUT_ERROR status as initial values.
 *
 * NOTE(review): members_ is indexed up to servers.size() without a bound
 * check here - presumably callers guarantee it fits; confirm.
 *
 * @param servers  server ids taking part in the op
 */
void OpMeta::set_members(const VUINT64& servers)
{
  start_time_ = Func::get_monotonic_time_us();
  done_server_size_ = 0;
  server_size_ = servers.size();

  int32_t slot = 0;
  while (slot < server_size_)
  {
    members_[slot].server_ = servers[slot];
    members_[slot].status_ = EXIT_TIMEOUT_ERROR;
    members_[slot].info_.block_id_ = INVALID_BLOCK_ID;
    members_[slot].info_.version_ = INVALID_VERSION;
    ++slot;
  }
}
/**
 * Read a block's index from one randomly picked replica.
 *
 * @param ns_id            nameserver to ask for the replica list
 * @param block_id         block whose replicas are looked up
 * @param attach_block_id  forwarded unchanged to DsRequester::read_block_index
 * @param index_data       [out] filled by DsRequester::read_block_index
 * @return the error of NsRequester::get_block_replicas if that fails,
 *         EXIT_NO_DATASERVER when the replica list is empty, otherwise the
 *         result of DsRequester::read_block_index
 */
int MiscRequester::read_block_index(const uint64_t ns_id, const uint64_t block_id, const uint64_t attach_block_id, IndexDataV2& index_data)
{
  VUINT64 replicas;
  const int ret = NsRequester::get_block_replicas(ns_id, block_id, replicas);
  if (TFS_SUCCESS != ret)
  {
    return ret;
  }
  if (replicas.empty())
  {
    return EXIT_NO_DATASERVER;
  }

  const int32_t index = random() % replicas.size();
  return DsRequester::read_block_index(replicas[index], block_id, attach_block_id, index_data);
}
int32_t elect_ds_normal(const DS_WEIGHT& weights, const int32_t elect_count, int64_t& elect_seq, VUINT64& elect_ds_list) { if (elect_count == 0) return 0; int32_t need_elect_count = elect_count; DS_WEIGHT::const_iterator iter = weights.begin(); while (iter != weights.end() && need_elect_count > 0) { elect_ds_list.push_back(iter->second->get_ds()->id_); if (elect_seq > 0) iter->second->elect(++elect_seq); --need_elect_count; ++iter; } return elect_count - need_elect_count; }
// if parameter: ds_id != INVALID_SERVER_ID, check from current cluster, need exclude repliates in this ds; // if parameter: ds_id == INVALID_SERVER_ID, check from slave cluster int get_and_check_all_blocks_copy(const uint64_t ns_id, const VUINT64& blocks, VUINT64& no_copy_blocks, const uint64_t ds_id = INVALID_SERVER_ID) { no_copy_blocks.clear(); vector<BlockMeta> blocks_meta; int ret = ToolUtil::get_all_blocks_meta(ns_id, blocks, blocks_meta, true); if (TFS_SUCCESS == ret) { VUINT64 lost_in_family; for (uint32_t block_index = 0; block_index < blocks_meta.size(); ++block_index) { // blk_meta.result_ just be used in trunk(stable2.7) BlockMeta blk_meta = blocks_meta.at(block_index); uint64_t block_id = blk_meta.block_id_; FamilyInfoExt family_info = blk_meta.family_info_; if (INVALID_FAMILY_ID == family_info.family_id_) { // if family can't reinstall, family_id here is also invalid if (INVALID_BLOCK_ID == block_id || IS_VERFIFY_BLOCK(block_id)) continue; // else family data block regard as normal block if (0 == blk_meta.size_) { // maybe not exist the block no_copy_blocks.push_back(block_id); } else if (INVALID_SERVER_ID != ds_id && 1 == blk_meta.size_ && ds_id == blk_meta.ds_[0]) { // only exist one copy and be positioned in this ds no_copy_blocks.push_back(block_id); } } else // block is marshalling { int data_num = GET_DATA_MEMBER_NUM(family_info.family_aid_info_); int member_num = data_num + GET_CHECK_MEMBER_NUM(family_info.family_aid_info_); int32_t alive_num = 0; lost_in_family.clear(); for (int i = 0; i < member_num; ++i) { // pair is <blockid, ds_id> pair<uint64_t, uint64_t>& item = family_info.members_[i]; if (INVALID_SERVER_ID != item.second && item.second != ds_id) // exclude current ds { ++alive_num; } else { // keep all lost(and will lost) data blockid if (!IS_VERFIFY_BLOCK(item.first) && INVALID_BLOCK_ID != item.first) { lost_in_family.push_back(item.first); } } } if (alive_num < data_num) // can't reinstate { 
no_copy_blocks.insert(no_copy_blocks.end(), lost_in_family.begin(), lost_in_family.end()); } } } } else { // this error need script-calller care TBSYS_LOG(ERROR, "get blockis ds_list fail from ns: %s, ret:%d", tbsys::CNetUtil::addrToString(ns_id).c_str(), ret); } return ret; }
int check_all_block_in_disk(const char* ns_addr, const char* ns_slave_addr, const uint64_t ds_id, const VUINT64& blocks, VUINT64& need_sync_block_list, VUINT64& lost_block_list) { int ret = TFS_SUCCESS; uint64_t ns_id = Func::get_host_ip(ns_addr); uint64_t slave_ns_id = Func::get_host_ip(ns_slave_addr); need_sync_block_list.clear(); lost_block_list.clear(); // check current cluster's block in blocks array ret = get_and_check_all_blocks_copy(ns_id, blocks, need_sync_block_list, ds_id); if (TFS_SUCCESS == ret && need_sync_block_list.size() > 0) { // check slave cluster's block in need_sync_block_list array VUINT64 raw_lost_blocks; ret = get_and_check_all_blocks_copy(slave_ns_id, need_sync_block_list, raw_lost_blocks); // raw_lost_blocks need unique, then remove all blocks which not in need_sync_block_list set<uint64_t> need_sync_block_set, lost_block_set; need_sync_block_set.insert(need_sync_block_list.begin(), need_sync_block_list.end()); // to sort lost_block_set.insert(raw_lost_blocks.begin(), raw_lost_blocks.end()); // to sort & unique lost_block_list.resize(raw_lost_blocks.size()); VUINT64::iterator it = set_intersection(raw_lost_blocks.begin(), raw_lost_blocks.end(), need_sync_block_list.begin(), need_sync_block_list.end(), lost_block_list.begin()); lost_block_list.resize(it - lost_block_list.begin());// set real size } return ret; }
int recover_block_from_disk_data(const char* ns_addr, const char* ns_slave_addr, BlockManager& block_manager, VUINT64& tmp_fail_block_list, VUINT64& no_need_recover_block_list, VUINT64& success_block_list, multimap<uint64_t, uint64_t>& fail_block_file_list, VUINT64& fail_block_list) { int ret = TFS_SUCCESS; VUINT64 blocks; vector<FileInfoV2> finfos; multimap<uint64_t, uint64_t> tmp_fail_block_file_list; uint64_t ns_id = Func::get_host_ip(ns_addr); //get all blocks from ds disk block_manager.get_all_block_ids(blocks);//肯定返回成功 vector<BlockMeta> blocks_meta; ret = ToolUtil::get_all_blocks_meta(ns_id, blocks, blocks_meta, false);// no need get check block TBSYS_LOG(DEBUG , "all logic blocks count: %zd, data block count: %zd", blocks.size(), blocks_meta.size()); if (TFS_SUCCESS == ret) { int32_t bret = TFS_SUCCESS; for (uint32_t block_index = 0; block_index < blocks_meta.size(); ++block_index) { BlockMeta blk_meta = blocks_meta.at(block_index); uint64_t block_id = blk_meta.block_id_; if (blk_meta.size_ > 0 || INVALID_FAMILY_ID != blk_meta.family_info_.family_id_) { TBSYS_LOG(DEBUG , "blockid: %" PRI64_PREFIX "u no need recover, ds_size:%d, family_id: %" PRI64_PREFIX "u", block_id, blk_meta.size_, blk_meta.family_info_.family_id_); no_need_recover_block_list.push_back(block_id);//只要还有副本或者副本丢失但有编组(不考虑退化读恢复)都不用本工具恢复 } else { bret = rm_no_replicate_block_from_ns(ns_addr, block_id);//now T_NEWBLK will can not remove empty ds_list's block if (TFS_SUCCESS != bret) { fail_block_list.push_back(block_id); TBSYS_LOG(WARN , "remove block %" PRI64_PREFIX "u from ns: %s failed, ret: %d", block_id, ns_addr, bret); } else { IndexHeaderV2 header; finfos.clear(); bret = block_manager.traverse(header, finfos, block_id, block_id); if (TFS_SUCCESS != bret) { TBSYS_LOG(WARN , "block %" PRI64_PREFIX "u get local file infos failed, ret: %d", block_id, bret); tmp_fail_block_list.push_back(block_id);//只有本磁盘读取block的文件index错误,才需要整个block尝试从辅集群恢复 } else { bool all_success = true; int32_t 
copy_file_succ_count = 0; tmp_fail_block_file_list.clear(); for (uint32_t file_index = 0; file_index < finfos.size(); ++file_index) { // skip deleted file if ((finfos.at(file_index).status_ & FILE_STATUS_DELETE) != 0) continue; uint64_t file_id = finfos.at(file_index).id_; bret = copy_file(block_manager, block_id, finfos.at(file_index)); if (TFS_SUCCESS == bret) { TBSYS_LOG(DEBUG, "recover block_id: %" PRI64_PREFIX "u, file_id: %" PRI64_PREFIX "u successful!", block_id, file_id); } else {// 如果磁盘中该文件数据已经损坏(如crc出错),则从对等集群(ns_slave_addr)拷贝数据复制 bret = copy_file_from_slave_cluster(ns_slave_addr, ns_addr, block_id, finfos.at(file_index)); if (TFS_SUCCESS == bret) { TBSYS_LOG(DEBUG, "recover block_id: %" PRI64_PREFIX "u, file_id: %" PRI64_PREFIX "u successful from slave cluster!", block_id, file_id); } else { TBSYS_LOG(WARN, "recover block_id: %" PRI64_PREFIX "u, file_id: %" PRI64_PREFIX "u failed from slave cluster, ret: %d!", block_id, file_id, bret); all_success = false; tmp_fail_block_file_list.insert(pair<uint64_t, uint64_t>(block_id, file_id)); } } if (TFS_SUCCESS == bret) { ++copy_file_succ_count; } } if (all_success) { success_block_list.push_back(block_id); if (0 == copy_file_succ_count) { TBSYS_LOG(DEBUG, "recover block_id: %" PRI64_PREFIX "u need to do nothing," " because the count of files who need to recover is ZERO except DELETE status files!", block_id); } } else if (0 == copy_file_succ_count)// all file(exclude DELETE) fail { TBSYS_LOG(WARN, "recover block_id: %" PRI64_PREFIX "u's files failed, copy_file_succ_count is Zero !", block_id); fail_block_list.push_back(block_id);// for print fail block to out log file at end } else { fail_block_file_list.insert(tmp_fail_block_file_list.begin(), tmp_fail_block_file_list.end()); } } } } }//end for blocks loop } else { TBSYS_LOG(WARN, "get blockis ds_list error, ret:%d", ret); } TBSYS_LOG(INFO, "success_block_list size: %zd, tmp_fail_block_list size: %zd, fail_block_list size: %zd", success_block_list.size(), 
tmp_fail_block_list.size(), fail_block_list.size()); return ret; }
int OpManager::forward_op(tbnet::Packet* message, const uint64_t block_id, const int64_t family_id, const VUINT64& servers) { // post request to slaves int ret = TFS_SUCCESS; DsRuntimeGlobalInformation& ds_info = DsRuntimeGlobalInformation::instance(); // take master's version to slave BlockInfoV2 block_info; ret = get_block_manager().get_block_info(block_info, block_id); for (uint32_t i = 0; TFS_SUCCESS == ret && i < servers.size(); i++) { if (servers[i] == ds_info.information_.id_) { continue; // exclude self } if (WRITE_FILE_MESSAGE_V2 == message->getPCode()) { WriteFileMessageV2* msg = dynamic_cast<WriteFileMessageV2*>(message); msg->set_version(block_info.version_); // version will take to slave if (INVALID_FAMILY_ID != family_id) { FamilyInfoExt& info = msg->get_family_info(); msg->set_block_id(info.get_block(servers[i])); info.family_id_ = INVALID_FAMILY_ID; // family will not take to slave } } else if (CLOSE_FILE_MESSAGE_V2 == message->getPCode()) { CloseFileMessageV2* msg = dynamic_cast<CloseFileMessageV2*>(message); if (INVALID_FAMILY_ID != family_id) { FamilyInfoExt& info = msg->get_family_info(); msg->set_block_id(info.get_block(servers[i])); info.family_id_ = INVALID_FAMILY_ID; } } else if (UNLINK_FILE_MESSAGE_V2 == message->getPCode()) { UnlinkFileMessageV2* msg = dynamic_cast<UnlinkFileMessageV2*>(message); if (INVALID_FAMILY_ID != family_id) { FamilyInfoExt& info = msg->get_family_info(); msg->set_block_id(info.get_block(servers[i])); info.family_id_ = INVALID_FAMILY_ID; } } else { assert(false); } // forward will clone source msg ret = post_msg_to_server(servers[i], message, ds_async_callback, true); if (TFS_SUCCESS != ret) { TBSYS_LOG(WARN, "forward request to slave fail, ret : %d", ret); } } return ret; }
bool elect_move_dest_ds(const vector<ServerCollect*>& ds_list, const ReplicateDestStrategy::counter_type& dest_counter, const VUINT64& elect_ds_list, const uint64_t src_ds, uint64_t & dest_ds) { vector<ServerCollect*>::const_iterator maxit = std::max_element(ds_list.begin(), ds_list.end(), CompareLoad()); int32_t max_load = 1; if (maxit != ds_list.end()) max_load = (*maxit)->get_ds()->current_load_; NsGlobalInfo ginfo; ginfo.max_load_ = max_load; // only max_load & alive_server_count could be useful, calc. ginfo.alive_server_count_ = ds_list.size(); // elect seq not used in this case; ReplicateSourceStrategy strategy(1, ginfo, dest_counter); DS_WEIGHT weights; StoreWeight < ReplicateSourceStrategy > store(strategy, weights); std::for_each(ds_list.begin(), ds_list.end(), store); std::set < uint32_t > existlan; uint32_t elect_ds_list_size = elect_ds_list.size(); for (uint32_t i = 0; i < elect_ds_list_size; ++i) { uint32_t lan = Func::get_lan(elect_ds_list[i], SYSPARAM_NAMESERVER.group_mask_); existlan.insert(lan); } dest_ds = 0; uint64_t first_elect_ds = 0; uint32_t dlan = 0; DataServerStatInfo* ds_stat_info = NULL; DS_WEIGHT::const_iterator iter = weights.begin(); while (iter != weights.end()) { ds_stat_info = iter->second->get_ds(); dlan = Func::get_lan(ds_stat_info->id_, SYSPARAM_NAMESERVER.group_mask_); if ((first_elect_ds == 0) && (existlan.find(dlan) == existlan.end())) { first_elect_ds = ds_stat_info->id_; } if ((dest_ds == 0) && (existlan.find(dlan) == existlan.end()) && (ReplicateStrategy::get_ds_ip(src_ds) == ReplicateStrategy::get_ds_ip(ds_stat_info->id_))) { dest_ds = ds_stat_info->id_; } if ((first_elect_ds != 0) && (dest_ds != 0)) { break; } ++iter; } if (dest_ds == 0) { dest_ds = first_elect_ds; } return (dest_ds != 0); }