docs

a slatepencil documentation site

View on GitHub

Word count

import jieba

# Count how often each multi-character token appears in a text file and
# print the most frequent ones, one "token count" pair per line.
fname = '/mnt/ssd/share/books/Alice_Wonderland.txt'

# Read the whole file; the context manager closes the handle even if
# reading or decoding fails.
with open(fname, 'r', encoding='utf-8') as f:
    txt = f.read()

# tokenizer
words = jieba.lcut(txt)

# statistics
counts = {}
for word in words:
    # Ignore single-character tokens.
    if len(word) != 1:
        counts[word] = counts.get(word, 0) + 1

# sort by frequency, highest first
items = list(counts.items())
items.sort(key=lambda x: x[1], reverse=True)

# output top 20; slicing avoids an IndexError when the text contains
# fewer than 20 distinct tokens.
for top_word, top_count in items[:20]:
    print(top_word, top_count)