frequencycount.py


Below is the syntax highlighted version of frequencycount.py from §4.2 Sorting and Searching.


#-----------------------------------------------------------------------
# frequencycount.py
#-----------------------------------------------------------------------

import sys
import stdio
from counter import Counter

# Read words from standard input, and write the frequency counts
# of the words to standard output.

words = stdio.readAllStrings()

# Previous doesn't eliminate punctuation chars, but this does:
#   import re
#   s = stdio.readAll()
#   regExp = re.compile(r'\w+') # One or more alphanumeric chars
#   words = regExp.findall(s)

words.sort()  # or merge.sort(words)

zipf = []
for i in range(len(words)):
    if (i == 0) or (words[i] != words[i-1]):
        entry = Counter(words[i], len(words))
        zipf += [entry]
    zipf[len(zipf)-1].increment()

zipf.sort()  # or merge.sort(zipf)
zipf.reverse()
for entry in zipf:
    stdio.writeln(entry)

#-----------------------------------------------------------------------

# python frequencycount.py < tomsawyer.txt
# the: 3452
# and: 2908
# a: 1758
# to: 1741
# of: 1539
# was: 1124
# in: 926
# he: 849
# that: 786
# his: 769
# with: 670
# it: 633
# I: 618
# you: 595
# had: 508
# for: 491
# Tom: 427
# they: 392
# The: 374
# ...

# python frequencycount.py < leipzig100k.txt | more
# the: 116512
# of: 59372
# to: 56121
# a: 47399
# and: 43547
# in: 42943
# for: 20692
# The: 19192
# that: 18797
# is: 17205
# said: 14932
# on: 14685
# was: 14169
# with: 11910
# by: 11778
# from: 10747
# at: 10700
# as: 10389
# be: 10269
# he: 9364
# has: 9156
# it: 8999
# have: 8833

# python frequencycount.py < leipzig200k.txt | more
# the: 232333
# of: 118742
# to: 112518
# a: 94685
# and: 87259
# in: 85927
# for: 40963
# The: 38414
# that: 37468
# is: 34556
# said: 29688
# on: 29237
# was: 28240
# by: 23701
# with: 23591
# at: 21414
# from: 21371
# as: 20770
# be: 20247
# he: 18781
# has: 18154
# it: 18121
# have: 17617

# python frequencycount.py < leipzig1m.txt
# the: 1160105
# of: 593492
# to: 560945
# a: 472819
# and: 435866
# in: 430484
# for: 205531
# The: 192296
# that: 188971
# is: 172225
# said: 148915
# on: 147024
# was: 141178
# by: 118429
# with: 118193
# at: 107551
# from: 105921
# as: 104028
# be: 101027
# he: 93393
# it: 90975
# has: 90116
# have: 87856
# ...


Copyright © 2000–2015, Robert Sedgewick, Kevin Wayne, and Robert Dondero.
Last updated: Fri Oct 20 20:45:16 EDT 2017.