sketch.py


Below is the syntax highlighted version of sketch.py from §3.3 Designing Data Types.


#-----------------------------------------------------------------------
# sketch.py
#-----------------------------------------------------------------------

import sys
import stdio
import stdarray
from vector import Vector

#-----------------------------------------------------------------------

class Sketch:

    # Construct a new Sketch object which is a profile of string
    # text. The profile should consist of a unit vector of dimension
    # d. Element i of the vector should indicate how many k-grams
    # in the file (or web page) hash to i.
    def __init__(self, text, k, d):
        freq = stdarray.create1D(d, 0)
        for i in range(len(text) - k + 1):
            kgram = text[i:i+k]
            h = hash(kgram)
            freq[h % d] += 1
        vector = Vector(freq)
        self._sketch = vector.direction()  # Unit vector

    # Return the similarity measure between self and Sketch object
    # other as a number between 0 and 1. 0 indicates that the
    # objects are dissimilar; 1 indicates that they are similar.
    def similarTo(self, other):
        return self._sketch.dot(other._sketch)

    # Return a string representation of self.
    def __str__(self):
        return str(self._sketch)

#-----------------------------------------------------------------------

# For testing.
# Accept integers k and d as command-line arguments. Read text from
# standard input, and construct a Sketch object from that text, k, and
# d. Write the Sketch object to standard output.

def main():
    text = stdio.readAll()
    k = int(sys.argv[1])
    d = int(sys.argv[2])
    sketch = Sketch(text, k, d)
    stdio.writeln(sketch)

if __name__ == '__main__':
    main()

#-----------------------------------------------------------------------

# more genome20.txt 
# ATAGATGCATAGCGCATAGC

# python sketch.py 2 16 < genome20.txt 
# [0.37210420376762543, 0.37210420376762543, 0.49613893835683387, 
# 0.0, 0.12403473458920847, 0.0, 
# 0.0, 0.0, 0.0, 
# 0.0, 0.24806946917841693, 0.0, 
# 0.12403473458920847, 0.6201736729460423, 0.0, 0.0]


Copyright © 2000–2015, Robert Sedgewick, Kevin Wayne, and Robert Dondero.
Last updated: Fri Oct 20 20:45:16 EDT 2017.