#!/usr/bin/env python
# Distributed under the "here, have it" license
# Written by Greg Hellings, all rights reserved
# Counts all the characters in a file, assumes UTF-8 encoding, and
# reports the frequency of each character as well as the Unicode
# character name for that code point. Can accept an arbitrary number
# of files on the argument line and will report the aggregate across
# each file. Can also accept input from stdin. If you want to mix
# stdin with files pass the filename '-' on the argument line.
#
# Runs unchanged on Python 2.7 and Python 3.

import fileinput
from collections import Counter
from operator import itemgetter
from unicodedata import name


def sort_dict(adic):
    """Return the values of *adic* ordered by their keys.

    Args:
        adic: A mapping whose keys can be ordered against each other.

    Returns:
        A list of the mapping's values, sorted by ascending key.
    """
    # sorted() works on both Python 2 lists and Python 3 dict views,
    # unlike calling .sort() on the result of .items().
    return [adic[key] for key in sorted(adic)]


def count_chars(lines, encoding='utf-8'):
    """Count how often each character occurs in *lines*.

    Args:
        lines: An iterable of lines. Byte strings are decoded with
            *encoding*; text that is already decoded is counted as is.
        encoding: Codec used to decode byte strings (UTF-8 by default).

    Returns:
        A ``collections.Counter`` mapping each character to its count.

    Raises:
        UnicodeDecodeError: If a byte string is not valid in *encoding*.
    """
    counts = Counter()
    for line in lines:
        # Decode once at the input boundary; on Python 2 ``bytes`` is
        # ``str``, so the plain lines yielded by fileinput are decoded too.
        if isinstance(line, bytes):
            line = line.decode(encoding)
        counts.update(line)
    return counts


def format_report(counts):
    """Build the report rows for a character-frequency mapping.

    Args:
        counts: A mapping of single characters to occurrence counts.

    Returns:
        A list of strings: the header row followed by one row per
        character, most frequent first. Characters with equal counts
        keep the mapping's iteration order (the sort is stable).
    """
    rows = ['Code point\tCharacter\tName\t\tCount']
    ranked = sorted(counts.items(), key=itemgetter(1), reverse=True)
    for char, count in ranked:
        # Code points without a Unicode name (e.g. control characters)
        # fall back to a placeholder instead of raising ValueError.
        label = name(char, 'not found')
        rows.append('%06X\t\t%s\t%24s %s' % (ord(char), char, label, count))
    return rows


def main():
    """Count characters from the files in argv (or stdin) and print a report."""
    # Binary mode makes fileinput yield bytes on every Python version, so
    # the data is always decoded as UTF-8 rather than the locale encoding.
    counts = count_chars(fileinput.input(mode='rb'))
    for row in format_report(counts):
        print(row)


if __name__ == '__main__':
    main()