markov.py

#

ngrams and Markov chains

from collections import Counter
import random

#

Find ngrams from a given list

def find_ngrams(seq, n):

#

Given a list seq returns a list of ngrams where each element is an n-tuple.

Example:
>>> seq = [1, 2, 1, 3, 1, 2, 7]
>>> find_ngrams(seq, 2)
[(1, 2), (2, 1), (1, 3), (3, 1), (1, 2), (2, 7)]
>>> find_ngrams(seq, 3)
[(1, 2, 1), (2, 1, 3), (1, 3, 1), (3, 1, 2), (1, 2, 7)]

    ngrams = []

#

Iterate over seq in n-sized slices each converted to an n-tuple and appended to ngrams.

    for i in range(len(seq) - n + 1):
        new_tuple = tuple(seq[i:i+n])
        ngrams.append(new_tuple)
    return ngrams

#

Create Markov model from a given list

#

Given a list seq returns a Markov model of the given order as a dictionary. The keys are order-sized tuples. The value for each key is a list of elements in seq that immediately follow occurances of the key in the seq.

Example:
>>> seq = [1, 2, 1, 3, 1, 2, 7]
>>> markov_model(seq, 1)
{(2,): [1, 7], (3,): [1], (1,): [2, 3, 2]}
>>> markov_model(seq, 2)
{(1, 2): [1, 7], (1, 3): [1], (3, 1): [2], (2, 1): [3]}

def markov_model(seq, order):

#

First use find_ngrams to slice seq appropriately.

    ngrams = find_ngrams(seq, order+1)
    markov = {}

#

For each ngram in ngrams (where n = order + 1) the first (n - 1) elements are used as the key and the last element is appended to the list of possible continuations.

    for ngram in ngrams:
        try:
            markov[ngram[:-1]].append(ngram[-1])
        except KeyError:
            markov[ngram[:-1]] = [ngram[-1]]
    return markov

#

Generate Markov chain

#

Given a list seq returns a Markov chain of the given order with a maximum length of the integer max_length. In this case, the function assumes that seq is a list of words, tries to begin the chain with a capitalized word and end the chain with a terminal punctions ('.', '!', '?').

def markov_chain(seq, order, max_length=50):

#

    markov = markov_model(seq, order)

#

Start chain with an ngram (n = order) that begins with an uppercase letter, if possible. If there are no ngrams that begin with an uppercase letter, start with any random ngram.

    startkeys = [key for key in markov if key[0][0].isupper()]
    if len(startkeys) == 0:
        startkeys = list(markov.keys())
    key = random.choice(startkeys)

#

Markov chain is a list beginning with the start key.

    chain = list(key)

#

while loop to generate markov chain.

    while True:

#

Choose a random continuation from the elements of the value for the current key.

        new = random.choice(markov[key])
        chain.append(new)

#

break if the new element is the end of a sentence

        if new[-1] in ['.', '!', '?']:
            break

#

break if the length of chain is max_length or greater.

        if len(chain) >= max_length:
            break

#

Concatenate new element to key and use the appropriate slice of the resulting tuple as the new key.

        key = (key + (new,))
        key = key[1:]
    return " ".join(chain)

#

Assign a multiline string

sonnet = '''From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou contracted to thine own bright eyes,
Feed'st thy light's flame with self-substantial fuel,
Making a famine where abundance lies,
Thy self thy foe, to thy sweet self too cruel:
Thou that art now the world's fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And tender churl mak'st waste in niggarding:
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.'''

#

Open the file 'sonnets.txt' for reading and read the contents. (You will need to download sonnets.txt to the same directory as this code.)

sonnets = open('sonnets.txt', 'r').read()

#

The split() method splits a string into a list of substrings separated by white space.
Example:
>>> my_string = 'I am Sam'
>>> my_string.split()
['I', 'am', 'Sam']
Try substituting sonnets for sonnet.

text = sonnet.split()

#

Check output of find_ngrams

ngrams = find_ngrams(text, 2)
print(ngrams)

#

Check output of markov_model

markov = markov_model(text, 1)
print(markov)

#

Print out the fist 10 keys of the markov model dictionary

for key in list(markov)[:10]:
    print(key, markov[key])

#

Print the n-order markov_chain generated from text. Try using different values for n.

n = 1
print(markov_chain(text, n))