Orig: Music to hear, why hear’st thou music sadly?
Comp: 00100010111011111100101101110010100010100101110000111000000101101001001111111100111000101110000111000000100100011111110000101000111010101101101000100111011111100101101110010111111001011010010011100011011101
Deco: Music to hear, why hear’st thou music sadly?
Size uncompressed 44
Size compressed 26
# This is part of my anti-atrophy tasks. As most of my professional software development
# tasks are now is done through agentic development. This is fitness for the brain!
#
# Solely based on what's here: https://en.wikipedia.org/wiki/Huffman_coding

import requests
import pprint
import heapq
import math

# Fetch the file, this is our corpus
response = requests.get("https://www.gutenberg.org/cache/epub/100/pg100.txt")
text = response.text

# Calculate frequencies

freqs = {}
count = 0

for c in text:
    if not c in freqs:
        freqs[c] = 0
    freqs[c] += 1
    count += 1

# Normalize
for k in freqs:
    freqs[k] = freqs[k] / count

# How priority queues aka head queues work
p1 = []
# heapq.heapify(p1) - No need to run for empty lists

heapq.heappush(p1, (5, "not so important"))
heapq.heappush(p1, (1, "Really important"))
heapq.heappush(p1, (20, "Forget it"))
heapq.heappush(p1, (24, ("t1", "t2", "t3")))

important_task = heapq.heappop(p1)

assert important_task[0] == 1

# For heapq when using the tuples, it will go onto next element if the first element is identical.
# The docs are framing the type construction as a payload to attach a payload.
# This is over loaded semantics (shane on you, Python)
tie_breaker = 0
def next_tie_breaker():
    global tie_breaker
    tie_breaker += 1
    return tie_breaker


leafs = []
for k in freqs:
    heapq.heappush(leafs, (freqs[k], next_tie_breaker(), k))

# Alternatively
# leafs1 = [(v, next_tie_breaker(), k) for k, v in freqs.items()]
# heapq.heapify(leafs1)

# Representing the tree:
# Leaf (frq, _, symbol)
# Node: (frq, _, (left, right))

while True:
    if len(leafs) == 1:
        break

    a = heapq.heappop(leafs)
    b = heapq.heappop(leafs)
    sumProp = a[0] + b[0]
    node = (sumProp, next_tie_breaker(), (a, b))

    heapq.heappush(leafs, node)

huffman_tree = heapq.heappop(leafs)

huffman_dict = {}
rev_huffman_dict = {}

stack = [("", huffman_tree)]
while len(stack) > 0:
    next = stack.pop()

    match(next):
        case (prefix, (frq, x, (left, right))):
            stack.append((prefix + "1", left))
            stack.append((prefix + "0", right))
        case (prefix, (frq, x, symbol)):
            huffman_dict[prefix] = symbol
            rev_huffman_dict[symbol] = prefix
        case _:
            pprint.pprint(next)



# Let's compress something!

string = "Music to hear, why hear’st thou music sadly?"


# compress
compressed = ""
for c in string:
    compressed += rev_huffman_dict[c]

decompressed = ""
tree_traverser = huffman_tree
for c in compressed:
    # Traverse the tree
    if c == "1":
        tree_traverser = tree_traverser[2][0]
    if c == "0":
        tree_traverser = tree_traverser[2][1]

    # Are we at a leaf?
    if isinstance(tree_traverser[2], str):
        decompressed += tree_traverser[2]
        tree_traverser = huffman_tree


print(f"Orig: {string}")
print(f"Comp: {compressed}")
print(f"Deco: {decompressed}")

print(f"Size uncompressed {len(string)}")
print(f"Size compressed {math.ceil(len(compressed) / 8)}") # We approximate here