Problem
I wrote a script with utilities for calculating the entropy of iterables and included a Tk GUI that shows a quick overview of a text’s properties in real time (the code is on GitHub).
I tried to follow PEP 8 as closely as possible, but I’m not sure about other things, specifically:
- I think my docstrings are sometimes overly redundant, see the GUI for example.
- In gui.py, I’m not sure if I should move the calculate method out of the GUI class.
- Is the overall design good? I know it’s a rather small project, but I want to do this correctly.
If you have any other concerns besides these questions, I’m open to criticism!
The code is split into two modules:
calc.py – Includes the calculation functions
"""Utilities for entropy-related calculations."""
from collections import Counter as _Counter
from math import ceil as _ceil, log2 as _log2
def prob_to_info(probability):
    """Convert a probability into information content, measured in bits.

    The information is the negated base-2 logarithm of the probability.

    Args:
        probability: Probability of an event, in the range from 0 to 1.

    Returns:
        The information in bits. A probability of exactly 1 yields the
        integer 0, and a probability of 0 yields None.
    """
    if probability == 0:
        # The logarithm of zero is undefined, so signal it with None.
        return None
    if probability == 1:
        # Return a plain 0 rather than the -0.0 that -log2(1) would give.
        return 0
    return -_log2(probability)
def info_to_prob(information):
    """Convert information measured in bits back into a probability.

    Args:
        information: Amount of information, in bits.

    Returns:
        The probability ``2 ** -information``.
    """
    probability = 2**-information
    return probability
def entropy(iterable):
    """Calculate the Shannon entropy of the given iterable, in bits.

    Args:
        iterable: Sequence of symbols, handed on to char_mapping().

    Returns:
        The sum of ``p * prob_to_info(p)`` over the probability ``p`` of
        every unique symbol.
    """
    total = 0
    for _symbol, probability in char_mapping(iterable):
        total += probability * prob_to_info(probability)
    return total
def optimal_bits(iterable):
    """Calculate the optimal number of bits for coding the iterable.

    Args:
        iterable: Sized sequence of symbols.

    Returns:
        The entropy of the iterable rounded up to whole bits, multiplied
        by the number of symbols.
    """
    bits_per_symbol = _ceil(entropy(iterable))
    symbol_count = len(iterable)
    return bits_per_symbol * symbol_count
def metric_entropy(iterable):
    """Calculate the metric entropy of the iterable.

    The metric entropy is the Shannon entropy divided by the number of
    symbols.

    Args:
        iterable: Sized sequence of symbols.

    Returns:
        The entropy per symbol, or 0.0 for an empty iterable.
    """
    length = len(iterable)
    if length == 0:
        # Nothing to measure; returning 0.0 avoids a ZeroDivisionError.
        return 0.0
    return entropy(iterable) / length
def char_mapping(iterable):
    """Pair every unique symbol of the iterable with its probability.

    Args:
        iterable: Any iterable of hashable symbols (a string, a list,
            a generator, ...).

    Returns:
        A list of ``(symbol, probability)`` tuples sorted by descending
        probability. Symbols with equal probability keep the order of
        their first occurrence. The list is empty for an empty iterable.
    """
    # A single counting pass replaces one count() scan per unique symbol
    # and also works for iterables without len() or count().
    counts = _Counter(iterable)
    total = sum(counts.values())
    pairs = [(symbol, count / total) for symbol, count in counts.items()]
    # The sort is stable, so ties stay in first-occurrence order.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs
gui.py
import tkinter as tk
import calc
class GUI:
    """A simple Tk-based interface for real-time entropy analysis of text.

    The window holds an editable input box on the left and a read-only
    output box on the right. The output is refreshed whenever a key is
    released in the input box or the "Ignore case" option is toggled.
    """

    def __init__(self, root):
        """Build the widgets inside 'root', a tkinter.Tk instance."""
        self.parent = root
        try:
            self.parent.state("zoomed")
        except tk.TclError:
            # The "zoomed" window state is rejected on X11; request a
            # maximized window through the window attribute instead.
            self.parent.attributes("-zoomed", True)

        self.frame = tk.Frame(self.parent)
        self.frame.grid(row=0, column=0, sticky="nwes")

        self.input_head = tk.Label(self.frame, text="Input:")
        self.input_head.grid(row=0, column=0, sticky="nwes")

        self.ignore_case_value = tk.IntVar()
        self.ignore_case_value.trace("w", self.case_switch)
        self.ignore_case = tk.Checkbutton(
            self.frame,
            variable=self.ignore_case_value,
            text="Ignore case"
        )
        self.ignore_case.grid(row=0, column=1, sticky="nwes")

        self.input_main = tk.Text(self.frame)
        self.input_main.grid(row=1, column=0, sticky="nwes", columnspan=2)
        self.input_main.bind("<KeyRelease>", self.update)

        self.output_head = tk.Label(self.frame, text="Output:")
        self.output_head.grid(row=0, column=2, sticky="nwes")

        # The output box is read-only; update() enables it temporarily.
        self.output_main = tk.Text(self.frame, state=tk.DISABLED)
        self.output_main.grid(row=1, column=2, sticky="nwes")

        # Let the frame and its text boxes grow with the window.
        self.parent.rowconfigure(0, weight=1)
        self.parent.columnconfigure(0, weight=1)
        self.frame.rowconfigure(1, weight=1)
        self.frame.columnconfigure(0, weight=1)
        self.frame.columnconfigure(1, weight=1)
        self.frame.columnconfigure(2, weight=1)

    def case_switch(self, *_):
        """Re-run the analysis after the "Ignore case" option changed."""
        # The text itself did not change, so flag it as modified by hand;
        # otherwise update() would return early.
        self.input_main.edit_modified(True)
        self.update()

    def update(self, *_):
        """Update the contents of the analysis text box.

        Does nothing when the input has not been modified since the last
        update.
        """
        if not self.input_main.edit_modified():
            return

        analyze_text = self.calculate()

        self.output_main["state"] = tk.NORMAL
        self.output_main.delete("1.0", tk.END)
        self.output_main.insert("1.0", analyze_text)
        self.output_main["state"] = tk.DISABLED

        self.input_main.edit_modified(False)

    def calculate(self, *_):
        """Create the analysis text for the current input.

        Returns:
            A string with the summary statistics, a blank line, and a
            table with one row per unique character.
        """
        # "end-1c" drops the newline Tk appends to the widget content.
        text = self.input_main.get("1.0", "end-1c")
        if self.ignore_case_value.get():
            text = text.lower()

        char_map = calc.char_mapping(text)
        # Analyse the text itself: passing char_map here would compute the
        # entropy of the (char, probability) pairs instead.
        entropy = calc.entropy(text)
        # The metric entropy divides by the text length, so guard against
        # empty input instead of raising ZeroDivisionError.
        metric_entropy = calc.metric_entropy(text) if text else 0.0
        optimal = calc.optimal_bits(text)

        info = "\n".join(
            [
                "Length: " + str(len(text)),
                "Unique chars: " + str(len(char_map)),
                "Entropy: " + str(entropy),
                "Metric entropy: " + str(metric_entropy),
                "Optimal bit usage: " + str(optimal)
            ]
        )

        table_head = " Char\t| Probability\t\t| Bits\t\t| Occurrences"
        table_body = "\n".join(
            [
                # repr() shows control characters as escapes; the slice
                # strips the surrounding quotes.
                " " + repr(char)[1:-1] +
                "\t" + str(round(prob, 7)) +
                "\t\t" + str(round(calc.prob_to_info(prob), 7)) +
                "\t\t" + str(text.count(char))
                for char, prob in char_map
            ]
        )
        table = "\n".join([table_head, table_body])

        return "\n\n".join([info, table])
def main():
    """Open the analysis window and run the Tk event loop."""
    window = tk.Tk()
    # Hold on to the GUI object for as long as the event loop runs.
    _app = GUI(window)
    window.mainloop()


if __name__ == "__main__":
    main()
Solution
You ask about docstrings, so you should be aware that there is a PEP for those, too. In particular, note that:
Multi-line docstrings consist of a summary line just like a one-line docstring, followed by a blank line, followed by a more elaborate description.
The style guide specifies that docstring lines should be a maximum of 72 characters; a few of yours exceed this. There are various formats that you can adopt to include information in the docstrings in a structured way for use by documentation generators and other tools; I like the Google style.
For example,
"""Converts probability in the range from 0 to 1 into information measured
in bits, therefore using the dual logarithm. Returns None if the probability
is equal to zero."""
could be more like:
"""Converts probability into information, measured in bits.

Notes:
    Uses the dual logarithm.

Args:
    probability (float): In the range from 0 to 1.

Returns:
    float [or None if the probability is equal to zero].

"""
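I assume that you’ve aliased log2 and ceil to _log2 and _ceil respectively to avoid them being imported into gui. Instead, you can define __all__ to specify which names are exported when another module runs `from calc import *` (see the tutorial). Note that after a plain `import calc`, as in gui.py, every module-level name stays reachable as an attribute of calc regardless of __all__: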
# Names exported by ``from calc import *``. char_mapping is listed as
# well because gui.py calls calc.char_mapping().
__all__ = [
    'char_mapping',
    'entropy',
    'info_to_prob',
    'metric_entropy',
    'optimal_bits',
    'prob_to_info',
]
It seems a bit odd to have the class that occupies pretty much the whole of gui.py
be explicitly ignored after instantiation! Rather than having:
root = tk.Tk()
_ = GUI(root)
root.mainloop()
you could make the GUI
class inherit from tk.Tk
:
class GUI(tk.Tk):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.state("zoomed")
self.frame = tk.Frame(self)
...
and run it directly:
root = GUI()
root.mainloop()
This is trivial enough to include under if __name__ == '__main__':
directly, rather than via main
. There’s also no need for the , *_
in GUI.calculate
.
Rather than the string concatenation with +
, I would use str.format
, for example:
# The header columns use the same widths as the row format below.
table_head = " Char | Probability |        Bits | Occurrences"
table_body = "\n".join(
    " {:<4} | {:>11.7f} | {:>11.7f} | {:>11}".format(
        # repr() shows control characters (newline, tab) as escapes so
        # they cannot break the row; the slice strips the quotes.
        repr(char)[1:-1],
        prob,
        calc.prob_to_info(prob),
        text.count(char),
    )
    for char, prob in char_map
)
Given what this method does, I don’t think that calculate
is an appropriate name for it. You could split the calculations and the formatting into two methods, with more appropriate names.
As currently implemented, the code breaks (due to ZeroDivisionError
in metric_entropy
) if you toggle Ignore Case before entering any text, or if you delete all of the input text. You should handle this error, and display something sensible in these cases.
As entropy never uses the first element of the tuples it gets from char_mapping, and the order does not matter, I wrote a simpler function:
def ratios(iterable):
    """Return the relative frequency of every unique item in the iterable.

    The ratios follow the iteration order of a set, so their order is
    unspecified.

    >>> sorted(ratios("hello"))
    [0.2, 0.2, 0.2, 0.4]
    """
    total = len(iterable)
    frequencies = []
    for item in set(iterable):
        frequencies.append(iterable.count(item) / total)
    return frequencies
that you can use as:
def entropy(iterable):
    """Calculate the Shannon entropy of the given iterable, in bits.

    >>> entropy(range(10))
    3.321928094887362
    >>> entropy([1,2,3])
    1.584962500721156
    """
    total = 0
    for prob in ratios(iterable):
        total += prob * prob_to_info(prob)
    return total
obtaining the same results as before.