index.py


Below is the syntax highlighted version of index.py from §4.4 Symbol Tables.


#-----------------------------------------------------------------------
# index.py
#-----------------------------------------------------------------------

import sys
import stdio
from bst import OrderedSymbolTable

# Accept integers minLength and minCount as command-line arguments. Read
# words from standard input until end-of-file. Create an index
# indicating where each word appears within standard input. Consider
# only words that have at least minLength characters. Then write
# the index to standard output. Write only words that occur at
# least minCount times.

minLength = int(sys.argv[1])
minCount = int(sys.argv[2])

words = stdio.readAllStrings()

bst = OrderedSymbolTable()

for i in range(len(words)):
    word = words[i]
    if len(word) >= minLength:
        if not word in bst:
            bst[word] = []
        bst[word] += [i]

for word in bst:
    occurrences = bst[word]
    if len(occurrences) >= minCount:
        stdio.write(word + ': ')
        for occurrence in occurrences:
            stdio.write(str(occurrence) + ' ')
        stdio.writeln()

#-----------------------------------------------------------------------

# python index.py 9 30 < tale.txt 
# confidence: 2794 23064 25031 34249 47907 48268 48577 ...
# courtyard: 11885 12062 17303 17451 32404 32522 38663 ...
# evremonde: 86211 90791 90798 90802 90814 90822 90856 ...
# expression: 3777 5575 6574 7116 7195 8509 8928 15015 ...
# gentleman: 2521 5290 5337 5698 6235 6301 6326 6338 ... 
# influence: 27809 36881 43141 43150 48308 54049 54067 ...
# monseigneur: 85 90 36587 36590 36611 36636 36643 36680 ...
# something: 3406 3765 9283 13234 13239 15245 20257 23642 ...
# sometimes: 4514 4530 4548 6082 20731 33883 34239 34283 ...
# vengeance: 56041 63943 67705 79351 79941 79945 80225 ...

# python index.py 9 30 < mobydick.txt
# carpenter: 109292 155782 162820 171660 171725 171861 171947 ...
# completely: 8108 8372 8598 22809 28234 29930 37421 37854 ...
# concerning: 7430 8432 21577 21854 25019 30606 37510 41442 ...
# considering: 1650 14156 18786 18799 28664 28870 29013 78282 ...
# creatures: 1247 13400 27886 30780 52474 71084 77843 81477 ...
# especially: 101 3823 4358 6110 18798 21561 22148 24163 28175 ...
# forecastle: 1275 1703 1751 36956 44449 46220 53057 55574 ...
# gentlemen: 59297 61808 76405 90255 90323 90621 90932 91090 ...
# greenland: 44300 48647 48673 48686 48782 48816 49899 49918 ...
# harpooneer: 4677 4695 4931 4945 4975 5002 5643 5721 5740 ...
# harpooneers: 4630 7218 7357 7675 11427 12418 34048 39082 ...
# instances: 27550 53904 54261 65156 66402 66927 69646 69795 ...
# intervals: 19909 29961 38130 64917 67511 71903 72680 77582 ...
# leviathan: 2459 4028 27970 40852 48300 49118 49131 49265 ...
# nantucket: 2298 2368 2410 2417 2441 2464 3126 13590 13759 ...
# nevertheless: 3258 10720 13514 28881 38164 38865 42271 45272 ...
# particular: 25 1474 2644 3115 10024 10676 19759 21725 24733 ...
# something: 1052 1086 1900 2376 3869 3997 6359 7451 8332 ...
# steelkilt: 90568 90936 91063 91088 91579 91622 91652 91654 ...
# substance: 13699 21710 49813 77803 97580 97812 102060 107304...
# themselves: 1069 1595 23815 37572 37880 40274 40296 40913 ...
# therefore: 13216 13755 17592 28923 31122 32187 37859 39661 ...


Copyright © 2000–2015, Robert Sedgewick, Kevin Wayne, and Robert Dondero.
Last updated: Fri Oct 20 20:45:16 EDT 2017.