Phosphophyllite

情報系学部に通う雑魚学生の日常・進捗・疑問ブログ

ハフマン符号化【Python】


課題が出たのでやってた。
色んなサイトを参考にしたのでパクリに近い。
Pythonの基本構文から調べ始めたからとても汚い、今度直したい。
問題があれば消します。

#! /usr/bin/python
# -*- coding: utf-8 -*-

from heapq import *
from itertools import groupby
from collections import Counter
import sys

class Node(object):

    # initializer.
    def __init__(self, value, freq, left = None, right = None):
        # character.
        self.value = value
        # frequency of appearance.
        self.freq = freq
        self.left = left
        self.right = right

    # compare the priotity.
    def __cmp__(self, x):
        return cmp(self.freq, x.freq)

    # set Children Node.
    def setChildren(self, l, r):
        self.left = l
        self.right = r

# dictionary for the encode.
codes = {}

def huffman_encode(text):

    # get the sorted dictionary.
    sortedTable = word_count(text)

    #create the list of Nodes.
    table = [Node(v, f) for (v, f) in sortedTable.items()]

    heapify(table)
    create_tree(table)
    codeIt("", table[0])

# create Huffman Tree.
def create_tree(node):

    while len(node) > 1:

        # pick up 2 Nodes which has smallest frequency
        l = heappop(node)
        r = heappop(node)

        # create New branch Node.
        n = Node(None, r.freq + l.freq)

        # set left and right to branch Node.
        n.setChildren(l, r)

        # push.
        heappush(node, n) 

        # DEBUG
        print l.freq, r.freq, n.freq, l.value, r.value

# add 0 or 1.
def codeIt(s, node):

    # if node has a value.(leaf)
    if node.value:
        # first value.
        if not s:
            codes[node.value] = "0"
        else:
            codes[node.value] = s
    # node is branch and it has children.
    else:
        codeIt(s+"0", node.left)
        codeIt(s+"1", node.right)


#count frequency of appearance as dictionary AND sort it.
def word_count(text):

    # create empty dictionary and list.
    sorted_dict = {}
    keys = []
    values = []

    # count frequency of appearance.
    c = Counter(text)

    # create dictionary.
    dict(c)

    # sort dictionary following value(frequency)
    for k, v in sorted(c.items(), key = lambda x:x[1]):

        # add to each list.
        keys.append(k)
        values.append(v)

    # create the sorted dict
    for i in range(len(keys)):
        sorted_dict[keys[i]] = values[i]

    return sorted_dict

# main
if __name__ == "__main__": 

    wiki = 'this is an example of a huffman tree'

    huffman_encode(wiki)
    charList = list(wiki)
    print '\n'
    for i in range(len(charList)):
        print charList[i] ,
        print codes[charList[i]]


わぁいなんとかエンコードできたーって思ってプリントよく見たら出現率の二分木与えられてた。
やり直しで全部消すのはちょっとつらいので置いておく。

出力結果

1 1 2 u r
1 1 2 l p
1 1 2 o x
2 2 4 t m
2 2 4 h s
2 2 4 None i
2 2 4 n None
2 3 5 None f
4 4 8 a None
4 4 8 None None
4 4 8 None e
5 7 12 None  
8 8 16 None None
8 12 20 None None
16 20 36 None None

t 0010
h 1010
i 1001
s 1011
  111
i 1001
s 1011
  111
a 000
n 0100
  111
e 011
x 10001
a 000
m 0011
p 01011
l 01010
e 011
  111
o 10000
f 1101
  111
a 000
  111
h 1010
u 11000
f 1101
f 1101
m 0011
a 000
n 0100
  111
t 0010
r 11001
e 011
e 011