Orig: Music to hear, why hear’st thou music sadly?
Comp: 00100010111011111100101101110010100010100101110000111000000101101001001111111100111000101110000111000000100100011111110000101000111010101101101000100111011111100101101110010111111001011010010011100011011101
Deco: Music to hear, why hear’st thou music sadly?
Size uncompressed 44
Size compressed 26
# This is part of my anti-atrophy tasks. As most of my professional software development
# tasks are now is done through agentic development. This is fitness for the brain!
#
# Solely based on what's here: https://en.wikipedia.org/wiki/Huffman_coding
import requests
import pprint
import heapq
import math
# Fetch the file, this is our corpus
response = requests.get("https://www.gutenberg.org/cache/epub/100/pg100.txt")
text = response.text
# Calculate frequencies
freqs = {}
count = 0
for c in text:
if not c in freqs:
freqs[c] = 0
freqs[c] += 1
count += 1
# Normalize
for k in freqs:
freqs[k] = freqs[k] / count
# How priority queues aka head queues work
p1 = []
# heapq.heapify(p1) - No need to run for empty lists
heapq.heappush(p1, (5, "not so important"))
heapq.heappush(p1, (1, "Really important"))
heapq.heappush(p1, (20, "Forget it"))
heapq.heappush(p1, (24, ("t1", "t2", "t3")))
important_task = heapq.heappop(p1)
assert important_task[0] == 1
# For heapq when using the tuples, it will go onto next element if the first element is identical.
# The docs are framing the type construction as a payload to attach a payload.
# This is over loaded semantics (shane on you, Python)
tie_breaker = 0
def next_tie_breaker():
global tie_breaker
tie_breaker += 1
return tie_breaker
leafs = []
for k in freqs:
heapq.heappush(leafs, (freqs[k], next_tie_breaker(), k))
# Alternatively
# leafs1 = [(v, next_tie_breaker(), k) for k, v in freqs.items()]
# heapq.heapify(leafs1)
# Representing the tree:
# Leaf (frq, _, symbol)
# Node: (frq, _, (left, right))
while True:
if len(leafs) == 1:
break
a = heapq.heappop(leafs)
b = heapq.heappop(leafs)
sumProp = a[0] + b[0]
node = (sumProp, next_tie_breaker(), (a, b))
heapq.heappush(leafs, node)
huffman_tree = heapq.heappop(leafs)
huffman_dict = {}
rev_huffman_dict = {}
stack = [("", huffman_tree)]
while len(stack) > 0:
next = stack.pop()
match(next):
case (prefix, (frq, x, (left, right))):
stack.append((prefix + "1", left))
stack.append((prefix + "0", right))
case (prefix, (frq, x, symbol)):
huffman_dict[prefix] = symbol
rev_huffman_dict[symbol] = prefix
case _:
pprint.pprint(next)
# Let's compress something!
string = "Music to hear, why hear’st thou music sadly?"
# compress
compressed = ""
for c in string:
compressed += rev_huffman_dict[c]
decompressed = ""
tree_traverser = huffman_tree
for c in compressed:
# Traverse the tree
if c == "1":
tree_traverser = tree_traverser[2][0]
if c == "0":
tree_traverser = tree_traverser[2][1]
# Are we at a leaf?
if isinstance(tree_traverser[2], str):
decompressed += tree_traverser[2]
tree_traverser = huffman_tree
print(f"Orig: {string}")
print(f"Comp: {compressed}")
print(f"Deco: {decompressed}")
print(f"Size uncompressed {len(string)}")
print(f"Size compressed {math.ceil(len(compressed) / 8)}") # We approximate here