Example #1
0
		//Build huffaman tree from the existing dictionary
		void HuffmanEncoder::BuildHuffmanTreeFromDict()
		{
			std::vector<std::pair<int, int64> > ordered_words;
			ordered_words.reserve(dict_->Size());
			ordered_words.clear();
			for (int i = 0; i < dict_->Size(); ++i)
				ordered_words.push_back(std::pair<int, int64>(i, dict_->GetWordInfo(i)->freq));
			std::sort(ordered_words.begin(), ordered_words.end(), compare);

			unsigned vocab_size = (unsigned)ordered_words.size();
			// frequence
			int64 *count = new (std::nothrow)int64[vocab_size * 2 + 1];
			assert(count != nullptr);
			// Huffman code relative to parent node [1,0] of each node
			unsigned *binary = new (std::nothrow)unsigned[vocab_size * 2 + 1];
			assert(binary != nullptr);
			memset(binary, 0, sizeof(unsigned)* (vocab_size * 2 + 1));

			unsigned *parent_node = new (std::nothrow)unsigned[vocab_size * 2 + 1]; //
			assert(parent_node != nullptr);
			memset(parent_node, 0, sizeof(unsigned)* (vocab_size * 2 + 1));
			unsigned code[kMaxCodeLength], point[kMaxCodeLength];

			for (unsigned i = 0; i < vocab_size; ++i)
				count[i] = ordered_words[i].second;
			for (unsigned i = vocab_size; i < vocab_size * 2; i++)
				count[i] = static_cast<int64>(1e15);
			int pos1 = vocab_size - 1;
			int pos2 = vocab_size;
			int min1i, min2i;
			for (unsigned i = 0; i < vocab_size - 1; i++)
			{
				// First, find two smallest nodes 'min1, min2'
				assert(pos2 < static_cast<int>(vocab_size)* 2 - 1);
				//Find the samllest node
				if (pos1 >= 0)
				{
					if (count[pos1] < count[pos2])
					{
						min1i = pos1;
						pos1--;
					}
					else
					{
						min1i = pos2;
						pos2++;
					}
				}
				else
				{
					min1i = pos2;
					pos2++;
				}

				//Find the second samllest node
				if (pos1 >= 0)
				{
					if (count[pos1] < count[pos2])
					{
						min2i = pos1;
						pos1--;
					}
					else
					{
						min2i = pos2;
						pos2++;
					}
				}
				else
				{
					min2i = pos2;
					pos2++;
				}

				count[vocab_size + i] = count[min1i] + count[min2i];

				assert(min1i >= 0);
				assert(min1i < static_cast<int>(vocab_size)* 2 - 1);
				assert(min2i >= 0);
				assert(min2i < static_cast<int>(vocab_size)* 2 - 1);
				parent_node[min1i] = vocab_size + i;
				parent_node[min2i] = vocab_size + i;
				binary[min2i] = 1;
			}
			assert(pos1 < 0);

			//Generate the huffman code for each leaf node
			hufflabel_info_.clear();
			for (unsigned a = 0; a < vocab_size; ++a)
				hufflabel_info_.push_back(HuffLabelInfo());
			for (unsigned a = 0; a < vocab_size; a++)
			{
				unsigned b = a, i = 0;
				while (1)
				{
					assert(i < kMaxCodeLength);
					code[i] = binary[b];
					point[i] = b;
					i++;
					b = parent_node[b];
					if (b == vocab_size * 2 - 2) break;
				}
				unsigned cur_word = ordered_words[a].first;

				hufflabel_info_[cur_word].codelen = i;
				hufflabel_info_[cur_word].point.push_back(vocab_size - 2);

				for (b = 0; b < i; b++)
				{
					hufflabel_info_[cur_word].code.push_back(code[i - b - 1]);
					if (b)
						hufflabel_info_[cur_word].point.push_back(point[i - b] - vocab_size);
				}
			}

			delete[] count;
			count = nullptr;
			delete[] binary;
			binary = nullptr;
			delete[] parent_node;
			parent_node = nullptr;
		}
Example #2
0
// Build a Huffman tree from the word dictionary and fill m_hufflabel_info
// with, for every word, its Huffman code bits and the indices of the inner
// nodes on its root-to-leaf path ("point").
//
// Classic two-cursor linear-time construction: leaves [0, vocab_size) hold
// frequencies sorted in descending order, internal nodes are appended at
// [vocab_size, 2*vocab_size - 1); two cursors replace a priority queue.
void HuffmanEncoder::BuildHuffmanTreeFromDict()
{
	std::vector<std::pair<int, long long> > ordered_words;
	ordered_words.reserve(m_dict->Size());
	for (unsigned i = 0; i < m_dict->Size(); ++i)
		ordered_words.push_back(std::pair<int, long long>(i, m_dict->GetWordInfo(i)->freq));
	std::sort(ordered_words.begin(), ordered_words.end(), compare);

	unsigned vocab_size = (unsigned)ordered_words.size();
	// Guard: with an empty dictionary the unsigned loop bound (vocab_size - 1)
	// would underflow to UINT_MAX and read count[] out of bounds.
	if (vocab_size == 0)
	{
		m_hufflabel_info.clear();
		return;
	}

	// std::vector gives RAII cleanup: the raw new[] version leaked count/binary
	// if a later allocation threw std::bad_alloc.
	std::vector<long long> count(vocab_size * 2 + 1, 0); // node frequencies
	std::vector<unsigned> binary(vocab_size * 2 + 1, 0); // Huffman bit relative to parent [1,0]
	std::vector<unsigned> parent_node(vocab_size * 2 + 1, 0); // parent index of each node
	unsigned code[MAX_CODE_LENGTH], point[MAX_CODE_LENGTH];

	for (unsigned i = 0; i < vocab_size; ++i)
		count[i] = ordered_words[i].second;
	// Sentinel "infinite" frequency for internal nodes not created yet
	// (explicit cast instead of the implicit double -> long long conversion).
	for (unsigned i = vocab_size; i < vocab_size * 2; i++)
		count[i] = static_cast<long long>(1e15);
	int pos1 = vocab_size - 1; // cursor over leaves (walks toward rarer words)
	int pos2 = vocab_size;     // cursor over internal nodes (ascending freq)
	int min1i, min2i;
	for (unsigned i = 0; i < vocab_size - 1; i++)
	{
		// First, find the two smallest nodes 'min1, min2'
		// (signed comparison avoids the implicit int -> unsigned promotion).
		assert(pos2 < static_cast<int>(vocab_size) * 2 - 1);
		// Find the smallest node
		if (pos1 >= 0)
		{
			if (count[pos1] < count[pos2])
			{
				min1i = pos1;
				pos1--;
			}
			else
			{
				min1i = pos2;
				pos2++;
			}
		}
		else
		{
			min1i = pos2;
			pos2++;
		}

		// Find the second smallest node
		if (pos1 >= 0)
		{
			if (count[pos1] < count[pos2])
			{
				min2i = pos1;
				pos1--;
			}
			else
			{
				min2i = pos2;
				pos2++;
			}
		}
		else
		{
			min2i = pos2;
			pos2++;
		}

		// Merge: new internal node vocab_size + i becomes the parent.
		count[vocab_size + i] = count[min1i] + count[min2i];

		assert(min1i >= 0 && min1i < static_cast<int>(vocab_size) * 2 - 1
			&& min2i >= 0 && min2i < static_cast<int>(vocab_size) * 2 - 1);
		parent_node[min1i] = vocab_size + i;
		parent_node[min2i] = vocab_size + i;
		binary[min2i] = 1; // larger child gets bit 1
	}
	// NOTE(review): for vocab_size == 1 the merge loop never runs, so this
	// assert fires and point[0] underflows below — the single-word case looks
	// broken in the original too; confirm callers never build a one-word dict.
	assert(pos1 < 0);

	// Generate the Huffman code for each leaf node by walking from the leaf
	// up to the root (index vocab_size * 2 - 2), then reversing.
	m_hufflabel_info.clear();
	for (unsigned a = 0; a < vocab_size; ++a)
		m_hufflabel_info.push_back(HuffLabelInfo());
	for (unsigned a = 0; a < vocab_size; a++)
	{
		unsigned b = a, i = 0;
		while (1)
		{
			assert(i < MAX_CODE_LENGTH);
			code[i] = binary[b];
			point[i] = b;
			i++;
			b = parent_node[b];
			if (b == vocab_size * 2 - 2) break; // reached the root
		}
		unsigned cur_word = ordered_words[a].first;

		m_hufflabel_info[cur_word].codelen = i;
		// Root's output-layer index is vocab_size - 2 (internal node indices
		// are stored relative to vocab_size).
		m_hufflabel_info[cur_word].point.push_back(vocab_size - 2);

		for (b = 0; b < i; b++)
		{
			// Reverse leaf-to-root order into root-to-leaf order.
			m_hufflabel_info[cur_word].code.push_back(code[i - b - 1]);
			if (b)
				m_hufflabel_info[cur_word].point.push_back(point[i - b] - vocab_size);
		}
	}
}