from collections import Counter
import random
def find_ngrams(seq, n):
Given a list seq
returns a list of ngrams
where each element is
an n-tuple.
Example:
>>> seq = [1, 2, 1, 3, 1, 2, 7]
>>> find_ngrams(seq, 2)
[(1, 2), (2, 1), (1, 3), (3, 1), (1, 2), (2, 7)]
>>> find_ngrams(seq, 3)
[(1, 2, 1), (2, 1, 3), (1, 3, 1), (3, 1, 2), (1, 2, 7)]
ngrams = []
Iterate over seq
in n-sized slices each converted to
an n-tuple and appended to ngrams
.
for i in range(len(seq) - n + 1):
new_tuple = tuple(seq[i:i+n])
ngrams.append(new_tuple)
return ngrams
Given a list seq
returns a Markov model of the given order
as a dictionary. The keys are order
-sized tuples. The value for each
key is a list of elements in seq
that immediately follow
occurances of the key in the seq
.
Example:
>>> seq = [1, 2, 1, 3, 1, 2, 7]
>>> markov_model(seq, 1)
{(2,): [1, 7], (3,): [1], (1,): [2, 3, 2]}
>>> markov_model(seq, 2)
{(1, 2): [1, 7], (1, 3): [1], (3, 1): [2], (2, 1): [3]}
def markov_model(seq, order):
First use find_ngrams
to slice seq
appropriately.
ngrams = find_ngrams(seq, order+1)
markov = {}
For each ngram
in ngrams
(where n = order
+ 1) the first
(n - 1) elements are used as the key and the last
element is appended to the list of possible
continuations.
for ngram in ngrams:
try:
markov[ngram[:-1]].append(ngram[-1])
except KeyError:
markov[ngram[:-1]] = [ngram[-1]]
return markov
Given a list seq
returns a Markov chain of
the given order
with a maximum length of the integer max_length
.
In this case, the function assumes that seq
is
a list of words, tries to begin the chain with a capitalized word and
end the chain with a terminal punctions ('.'
, '!'
, '?'
).
def markov_chain(seq, order, max_length=50):
markov = markov_model(seq, order)
Start chain with an ngram (n = order
) that begins with
an uppercase letter, if possible.
If there are no ngrams that begin with an uppercase letter, start with
any random ngram.
startkeys = [key for key in markov if key[0][0].isupper()]
if len(startkeys) == 0:
startkeys = list(markov.keys())
key = random.choice(startkeys)
Markov chain
is a list beginning with the start key
.
chain = list(key)
while
loop to generate markov chain
.
while True:
Choose a random continuation from the elements of
the value for the current key
.
new = random.choice(markov[key])
chain.append(new)
break
if the new element is the end of a sentence
if new[-1] in ['.', '!', '?']:
break
break
if the length of chain
is max_length
or greater.
if len(chain) >= max_length:
break
Concatenate new element to key
and use the appropriate slice
of the resulting tuple as the new key
.
key = (key + (new,))
key = key[1:]
return " ".join(chain)
Assign a multiline string
sonnet = '''From fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou contracted to thine own bright eyes,
Feed'st thy light's flame with self-substantial fuel,
Making a famine where abundance lies,
Thy self thy foe, to thy sweet self too cruel:
Thou that art now the world's fresh ornament,
And only herald to the gaudy spring,
Within thine own bud buriest thy content,
And tender churl mak'st waste in niggarding:
Pity the world, or else this glutton be,
To eat the world's due, by the grave and thee.'''
Open the file 'sonnets.txt' for reading and read the contents.
(You will need to download sonnets.txt
to the same directory as this code.)
sonnets = open('sonnets.txt', 'r').read()
The split()
method splits a string into a list of substrings separated
by white space.
Example:
>>> my_string = 'I am Sam'
>>> my_string.split()
['I', 'am', 'Sam']
Try substituting sonnets for sonnet.
text = sonnet.split()
Check output of find_ngrams
ngrams = find_ngrams(text, 2)
print(ngrams)
Check output of markov_model
markov = markov_model(text, 1)
print(markov)
Print out the fist 10 keys
of the markov
model dictionary
for key in list(markov)[:10]:
print(key, markov[key])
Print the n
-order markov_chain
generated from text. Try using
different values for n
.
n = 1
print(markov_chain(text, n))