//Build huffaman tree from the existing dictionary void HuffmanEncoder::BuildHuffmanTreeFromDict() { std::vector<std::pair<int, int64> > ordered_words; ordered_words.reserve(dict_->Size()); ordered_words.clear(); for (int i = 0; i < dict_->Size(); ++i) ordered_words.push_back(std::pair<int, int64>(i, dict_->GetWordInfo(i)->freq)); std::sort(ordered_words.begin(), ordered_words.end(), compare); unsigned vocab_size = (unsigned)ordered_words.size(); // frequence int64 *count = new (std::nothrow)int64[vocab_size * 2 + 1]; assert(count != nullptr); // Huffman code relative to parent node [1,0] of each node unsigned *binary = new (std::nothrow)unsigned[vocab_size * 2 + 1]; assert(binary != nullptr); memset(binary, 0, sizeof(unsigned)* (vocab_size * 2 + 1)); unsigned *parent_node = new (std::nothrow)unsigned[vocab_size * 2 + 1]; // assert(parent_node != nullptr); memset(parent_node, 0, sizeof(unsigned)* (vocab_size * 2 + 1)); unsigned code[kMaxCodeLength], point[kMaxCodeLength]; for (unsigned i = 0; i < vocab_size; ++i) count[i] = ordered_words[i].second; for (unsigned i = vocab_size; i < vocab_size * 2; i++) count[i] = static_cast<int64>(1e15); int pos1 = vocab_size - 1; int pos2 = vocab_size; int min1i, min2i; for (unsigned i = 0; i < vocab_size - 1; i++) { // First, find two smallest nodes 'min1, min2' assert(pos2 < static_cast<int>(vocab_size)* 2 - 1); //Find the samllest node if (pos1 >= 0) { if (count[pos1] < count[pos2]) { min1i = pos1; pos1--; } else { min1i = pos2; pos2++; } } else { min1i = pos2; pos2++; } //Find the second samllest node if (pos1 >= 0) { if (count[pos1] < count[pos2]) { min2i = pos1; pos1--; } else { min2i = pos2; pos2++; } } else { min2i = pos2; pos2++; } count[vocab_size + i] = count[min1i] + count[min2i]; assert(min1i >= 0); assert(min1i < static_cast<int>(vocab_size)* 2 - 1); assert(min2i >= 0); assert(min2i < static_cast<int>(vocab_size)* 2 - 1); parent_node[min1i] = vocab_size + i; parent_node[min2i] = vocab_size + i; binary[min2i] = 1; } assert(pos1 < 0); //Generate the huffman code for each leaf node hufflabel_info_.clear(); for (unsigned a = 0; a < vocab_size; ++a) hufflabel_info_.push_back(HuffLabelInfo()); for (unsigned a = 0; a < vocab_size; a++) { unsigned b = a, i = 0; while (1) { assert(i < kMaxCodeLength); code[i] = binary[b]; point[i] = b; i++; b = parent_node[b]; if (b == vocab_size * 2 - 2) break; } unsigned cur_word = ordered_words[a].first; hufflabel_info_[cur_word].codelen = i; hufflabel_info_[cur_word].point.push_back(vocab_size - 2); for (b = 0; b < i; b++) { hufflabel_info_[cur_word].code.push_back(code[i - b - 1]); if (b) hufflabel_info_[cur_word].point.push_back(point[i - b] - vocab_size); } } delete[] count; count = nullptr; delete[] binary; binary = nullptr; delete[] parent_node; parent_node = nullptr; }
void HuffmanEncoder::BuildHuffmanTreeFromDict() { std::vector<std::pair<int, long long> > ordered_words; ordered_words.reserve(m_dict->Size()); ordered_words.clear(); for (unsigned i = 0; i < m_dict->Size(); ++i) ordered_words.push_back(std::pair<int, long long>(i, m_dict->GetWordInfo(i)->freq)); std::sort(ordered_words.begin(), ordered_words.end(), compare); unsigned vocab_size = (unsigned)ordered_words.size(); long long *count = new long long[vocab_size * 2 + 1]; //frequence unsigned *binary = new unsigned[vocab_size * 2 + 1]; //huffman code relative to parent node [1,0] of each node memset(binary, 0, sizeof(unsigned)* (vocab_size * 2 + 1)); unsigned *parent_node = new unsigned[vocab_size * 2 + 1]; // memset(parent_node, 0, sizeof(unsigned)* (vocab_size * 2 + 1)); unsigned code[MAX_CODE_LENGTH], point[MAX_CODE_LENGTH]; for (unsigned i = 0; i < vocab_size; ++i) count[i] = ordered_words[i].second; for (unsigned i = vocab_size; i < vocab_size * 2; i++) count[i] = 1e15; int pos1 = vocab_size - 1; int pos2 = vocab_size; int min1i, min2i; for (unsigned i = 0; i < vocab_size - 1; i++) { // First, find two smallest nodes 'min1, min2' assert(pos2 < vocab_size * 2 - 1); //find the samllest node if (pos1 >= 0) { if (count[pos1] < count[pos2]) { min1i = pos1; pos1--; } else { min1i = pos2; pos2++; } } else { min1i = pos2; pos2++; } //find the second samllest node if (pos1 >= 0) { if (count[pos1] < count[pos2]) { min2i = pos1; pos1--; } else { min2i = pos2; pos2++; } } else { min2i = pos2; pos2++; } count[vocab_size + i] = count[min1i] + count[min2i]; assert(min1i >= 0 && min1i < vocab_size * 2 - 1 && min2i >= 0 && min2i < vocab_size * 2 - 1); parent_node[min1i] = vocab_size + i; parent_node[min2i] = vocab_size + i; binary[min2i] = 1; } assert(pos1 < 0); //generate the huffman code for each leaf node m_hufflabel_info.clear(); for (unsigned a = 0; a < vocab_size; ++a) m_hufflabel_info.push_back(HuffLabelInfo()); for (unsigned a = 0; a < vocab_size; a++) { unsigned b = a, i = 0; while (1) { assert(i < MAX_CODE_LENGTH); code[i] = binary[b]; point[i] = b; i++; b = parent_node[b]; if (b == vocab_size * 2 - 2) break; } unsigned cur_word = ordered_words[a].first; m_hufflabel_info[cur_word].codelen = i; m_hufflabel_info[cur_word].point.push_back(vocab_size - 2); for (b = 0; b < i; b++) { m_hufflabel_info[cur_word].code.push_back(code[i - b - 1]); if (b) m_hufflabel_info[cur_word].point.push_back(point[i - b] - vocab_size); } } delete[] count; count = nullptr; delete[] binary; binary = nullptr; delete[] parent_node; parent_node = nullptr; }