Code
from dataclasses import dataclass
import gradio as gr
import pandas as pd
import zlib
import base64This notebook implements Huffman Encoding in Python, a classical algorithm for lossless data compression. Huffman encoding assigns variable-length codes to input characters, ensuring that more frequent symbols have shorter codes, and less frequent symbols have longer ones. The efficiency of encoding is achieved by minimizing the total number of bits needed for any given set of symbols based on their frequency:
\[ \text{Total Bits} = \sum_{i} f_i \cdot l_i, \]
where \(f_i\) is the frequency of symbol \(i\), and \(l_i\) is the length of its code. The core of Huffman encoding is building a Huffman tree, a binary tree with unique prefixes, optimizing the data compression process.
In what follows, we will explore:
The implementation begins with defining data structures for the components of a Huffman Tree. We’ll utilize Python’s dataclass to create nodes and leaves efficiently, necessary for constructing the Huffman tree:
The following functions enable us to calculate frequency distribution, build the Huffman tree, and generate optimal binary codes for each symbol:
def make_freq_dict(s: str) -> dict[str, int]:
"""
Generate a frequency dictionary for the characters in the input string.
Args:
s (str): The input string.
Returns:
dict[str, int]: A dictionary mapping each character to its frequency.
"""
freq = {}
for ch in s:
freq[ch] = freq.get(ch, 0) + 1
return freq
def build_huffman_tree(freq_dict: dict[str, int]) -> HuffmanTree:
"""
Construct the Huffman tree from the frequency dictionary.
The algorithm repeatedly takes the two trees with the smallest frequencies,
merges them into a new node, and continues until one tree remains.
Args:
freq_dict (dict[str, int]): A dictionary mapping characters to frequencies.
Returns:
HuffmanTree: The root of the Huffman tree.
"""
trees: list[HuffmanTree] = [HuffmanLeaf(ch, fq) for ch, fq in freq_dict.items()]
while len(trees) > 1:
trees.sort(key=lambda x: x.freq, reverse=True) # smallest frequency at the end
left = trees.pop() # smallest
right = trees.pop() # second smallest
trees.append(HuffmanNode(left, right, left.freq + right.freq))
return trees[0]
def generate_encoding(tree: HuffmanTree, prefix: str = "") -> dict[str, str]:
"""
Recursively traverse the Huffman tree to generate the binary encoding for each character.
By convention, the left branch appends '0' and the right branch appends '1'.
Args:
tree (HuffmanTree): The Huffman tree.
prefix (str, optional): The current prefix (binary code) accumulated during recursion.
Returns:
dict[str, str]: A dictionary mapping each character to its binary code.
"""
if isinstance(tree, HuffmanLeaf):
# Handle the edge case where the tree consists of a single node.
return {tree.char: prefix or "0"}
else:
encoding: dict[str, str] = {}
encoding.update(generate_encoding(tree.left, prefix + "0"))
encoding.update(generate_encoding(tree.right, prefix + "1"))
return encodingTo better visualize the Huffman Tree, we use Mermaid diagrams, which provide a straightforward representation of the branching structure:
def tree_to_mermaid(tree: HuffmanTree) -> str:
"""
Generate a Mermaid diagram representing the Huffman tree with proper escaping.
"""
nodes = []
edges = []
counter = 0
def mermaid_escape(s: str) -> str:
"""Escape special characters for Mermaid labels using HTML entities"""
return (
s.replace('"', "#quot;")
.replace("&", "#38;")
.replace("<", "#60;")
.replace(">", "#62;")
)
def dfs(t: HuffmanTree, node_id: int):
nonlocal counter
if isinstance(t, HuffmanLeaf):
# Escape special characters in leaf node label
escaped_char = mermaid_escape(t.char)
nodes.append(f'node{node_id}["{escaped_char} | {t.freq}"]')
else:
nodes.append(f'node{node_id}["{t.freq}"]')
counter += 1
left_id = counter
dfs(t.left, left_id)
edges.append(f'node{node_id} -- "0" --> node{left_id}')
counter += 1
right_id = counter
dfs(t.right, right_id)
edges.append(f'node{node_id} -- "1" --> node{right_id}')
dfs(tree, 0)
return "flowchart TD\n" + "\n".join(nodes + edges)The interactive Gradio interface below allows you to input text strings to see their Huffman encoding.
def create_mermaid_flowchart(mermaid_spec: str, height="500px") -> str:
mermaid_iframe = f"""
<iframe srcdoc='
<!DOCTYPE html>
<html>
<head>
<script src="https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"></script>
</head>
<body>
<div class="mermaid">
{mermaid_spec}
</div>
<script>mermaid.initialize({{startOnLoad:true}});</script>
</body>
</html>
' style="width:100%; height:{height}; border:none">
</iframe>
"""
return mermaid_iframe
def process_input(input_str: str) -> tuple[pd.DataFrame, pd.DataFrame, gr.HTML]:
"""
Process the input string to compute Huffman encoding details.
"""
# 1. Frequency table
freq_dict = make_freq_dict(input_str)
# Convert frequency dictionary to DataFrame
freq_df = pd.DataFrame(
[(char if char != " " else "Space", freq) for char, freq in freq_dict.items()],
columns=["Character", "Frequency"],
)
# 2. Build Huffman tree
tree = build_huffman_tree(freq_dict)
# 3. Generate binary encoding for each character
encoding = generate_encoding(tree)
# Convert encoding dictionary to DataFrame
encoding_df = pd.DataFrame(
[(char if char != " " else "Space", code) for char, code in encoding.items()],
columns=["Character", "Encoding"],
)
# 4. Generate Mermaid diagram for the Huffman tree
mermaid_diagram = tree_to_mermaid(tree)
mermaid_html = create_mermaid_flowchart(mermaid_diagram)
return freq_df, encoding_df, mermaid_htmlwith gr.Blocks(
css="""gradio-app {background: #222222 !important}""",
title="Huffman Encoding",
) as demo:
input_text = gr.Textbox(
lines=4, placeholder="Enter text here...", label="Input Text", value="hello"
)
gr.Examples(
examples=[
["hello world"],
["the quick brown fox jumps over the lazy dog"],
[
"ffcebcffcafffaedbfebffdedfdecbfffcfeeecfdfcffcbfcffeadfffedddffddbcfbcfefffbdfefbeefffffcffffefdefaa"
],
["ddcdeddabedebcdced"],
["sdasaakaakkka"],
],
inputs=input_text,
label="Try an example",
)
process_button = gr.Button("Encode")
# Mermaid
output_tree = gr.HTML(label="Huffman Tree (Mermaid Diagram)")
# Encoding details
freq_table = gr.Dataframe(
headers=["Character", "Frequency"], label="Frequency Table", wrap=True
)
encoding_table = gr.Dataframe(
headers=["Character", "Encoding"], label="Encoding Table", wrap=True
)
process_button.click(
fn=process_input,
inputs=input_text,
outputs=[freq_table, encoding_table, output_tree],
)
demo.load(
fn=process_input,
inputs=input_text,
outputs=[freq_table, encoding_table, output_tree],
)