comparedocuments.py


Below is the syntax highlighted version of comparedocuments.py from §3.3 Designing Data Types.


#-----------------------------------------------------------------------
# comparedocuments.py
#-----------------------------------------------------------------------

import sys
import stdarray
import stdio
from instream import InStream
from sketch import Sketch

#-----------------------------------------------------------------------

# Accept integers k and d as command-line arguments. Read a document
# list (one filename per whitespace-separated token) from standard
# input, compute profiles based on k-gram frequencies for all the
# documents, and write a matrix of similarity measures between all
# pairs of documents. d is the dimension of the profiles.

k = int(sys.argv[1])
d = int(sys.argv[2])

filenames = stdio.readAllStrings()

# One sketch per document, in the same order as filenames, each built
# from the full text of the corresponding file.
sketches = [Sketch(InStream(name).readAll(), k, d) for name in filenames]

# Header row: a blank four-character corner cell, then every filename
# truncated to four characters and right-aligned in an eight-character
# column.
stdio.write('    ')
for name in filenames:
    stdio.writef('%8.4s', name)
stdio.writeln()

# One row per document. The row label is truncated to four characters
# AND left-justified/padded to four characters, so that a filename
# shorter than four characters still lines up with the header's
# four-character corner cell.
for name, rowSketch in zip(filenames, sketches):
    stdio.writef('%-4.4s', name)
    for colSketch in sketches:
        stdio.writef('%8.2f', rowSketch.similarTo(colSketch))
    stdio.writeln()
    
#-----------------------------------------------------------------------

# $ more documents.txt
# constitution.txt
# tomsawyer.txt
# huckfinn.txt
# prejudice.txt
# vector.py
# djia.csv
# amazon.html
# actg.txt

# $ python comparedocuments.py 5 10000 < documents.txt 
#         cons    toms    huck    prej    vect    djia    amaz    actg
# cons    1.00    0.69    0.63    0.67    0.09    0.15    0.19    0.12
# toms    0.69    1.00    0.93    0.89    0.07    0.18    0.19    0.14
# huck    0.63    0.93    1.00    0.83    0.05    0.16    0.16    0.13
# prej    0.67    0.89    0.83    1.00    0.06    0.20    0.20    0.14
# vect    0.09    0.07    0.05    0.06    1.00    0.02    0.19    0.02
# djia    0.15    0.18    0.16    0.20    0.02    1.00    0.11    0.07
# amaz    0.19    0.19    0.16    0.20    0.19    0.11    1.00    0.06
# actg    0.12    0.14    0.13    0.14    0.02    0.07    0.06    1.00


Copyright © 2000–2015, Robert Sedgewick, Kevin Wayne, and Robert Dondero.
Last updated: Fri Oct 20 20:45:16 EDT 2017.