Building a Genre Classifier for Shakespeare Speeches

with scikit-learn and nltk

This is supplementary material to my talk "Words, Words, Words: Reading Shakespeare with Python".

Download this notebook here

In [1]:
# Dependencies

import pandas as pd  # for reading csvs

# imports from sklearn to make our classifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# for cleaning our text
import nltk
from nltk.stem import PorterStemmer

# include this so matplotlib plots render inline in the notebook
%matplotlib inline
In [2]:
# First, read the genre csv file
# available for now at https://s3.amazonaws.com/pyshakespeare/genre.csv
df = pd.read_csv("genre.csv", index_col=0)
In [3]:
# take a look at it to get a sense of what the data looks like
df.head()
Out[3]:
   play                     citation     text                                               genre
0  Timon of Athens          3.5.62-65    Why, I say, my lords, h'as done fair service,\... tragedy
1  Cymbeline                1.6.130-137  Should he make me\nLive, like Diana's priest, ... tragedy
2  Midsummer Night’s Dream  5.1.81       Now the hungry lion roars,\nAnd the wolf behow... comedy
3  Troilus and Cressida     3.3.75-95    What, am I poor of late?\n'Tis certain, greatn... tragedy
4  Pericles                 1.4.101-106  The which when any shall not gratify,\nOr pay ... comedy
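Before going further, it's worth a quick look at how balanced the two genres are, since a lopsided dataset can make raw accuracy misleading. A quick check with pandas:

# count the number of speeches per genre
print df['genre'].value_counts()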
In [4]:
# I've included the "play" and "citation" columns just for kicks.
# We only really need the "text" and "genre" columns.
speeches = list(df['text'])
labels = list(df['genre'])
In [5]:
# Now, we want to divide our data into two groups
# The first for training our classifier, and the second for testing its accuracy
training_size = int(len(speeches) * 0.8)

# the training speeches and labels should have the bulk of the data
train_speeches = speeches[:training_size]
train_labels = labels[:training_size]

test_speeches = speeches[training_size:]
test_labels = labels[training_size:]
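One caveat: this is a simple positional split, so it assumes the rows in genre.csv aren't sorted by genre. If you'd rather shuffle first, scikit-learn's train_test_split does the same job in one call; a minimal sketch:

from sklearn.cross_validation import train_test_split  # sklearn.model_selection in newer versions

# shuffle, then hold out 20% of the data for testing
train_speeches, test_speeches, train_labels, test_labels = train_test_split(
    speeches, labels, test_size=0.2, random_state=0)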
In [6]:
# Now, we want to convert the raw text in our speeches to a "bag of words" vector
# To do that, we use the CountVectorizer
vectorizer = CountVectorizer()

# first, we "teach" the vectorizor which tokens to vectorize on
vectorizer.fit(train_speeches)
# then we vectorize those speeches
train_features = vectorizer.transform(train_speeches)
In [7]:
# Now we're ready to create and train our classifier!
classifier = MultinomialNB()
# MultinomialNB accepts the sparse feature matrix directly; no .toarray() needed
classifier.fit(train_features, train_labels)

Out[7]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [8]:
# That was easy (and fast!)
# Now let's test it against the data we've set aside for testing

# first, let's vectorize our test_speeches:
test_features = vectorizer.transform(test_speeches)
classifier.score(test_features, test_labels)
Out[8]:
0.75427682737169521
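To put that score in context, compare it against the majority-class baseline: the accuracy you'd get by always guessing the more common genre in the test set.

from collections import Counter

# accuracy of always predicting the most common genre in the test set
most_common_count = Counter(test_labels).most_common(1)[0][1]
print most_common_count / float(len(test_labels))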
In [9]:
# Now let's try the same thing, except with stemming

stemmer = PorterStemmer()
def tokenize_and_stem(text):
    tokens = nltk.tokenize.word_tokenize(text)
    # drop punctuation-only tokens and lowercase the rest
    tokens = [token.lower()
              for token in tokens if token.isalnum()]

    # now stem the tokens
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens
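To see what stemming buys us: inflected forms of a word collapse to a shared token, so the vectorizer counts them as a single feature. For example:

# 'loves' and 'loving' should both stem to 'love'
print tokenize_and_stem("He loves the loving queen")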
In [10]:
# CountVectorizer allows you to pass a "tokenizer" argument
vectorizer2 = CountVectorizer(tokenizer=tokenize_and_stem)

train_features_tokenized = vectorizer2.fit_transform(train_speeches)
classifier = MultinomialNB()
# train the classifier
classifier.fit(train_features_tokenized, train_labels)

# and test it
test_features_tokenized = vectorizer2.transform(test_speeches)
classifier.score(test_features_tokenized, test_labels)

Out[10]:
0.77138413685847584
In [11]:
# What's nice about scikit-learn is that all its classifiers share the same interface
# So you can reuse all your vectorizing code and just swap in a different classifier:
from sklearn.svm import SVC

def train_classifier(train_features, train_labels, classifier):
    # every sklearn classifier exposes fit(), so this works for any of them
    classifier.fit(train_features, train_labels)
    return classifier

classifier = train_classifier(train_features_tokenized, train_labels, SVC(kernel='linear'))
print classifier.score(test_features_tokenized, test_labels)

classifier = train_classifier(train_features_tokenized, train_labels, MultinomialNB())
print classifier.score(test_features_tokenized, test_labels)
0.697511664075
0.762052877138
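If you find yourself juggling a vectorizer and a classifier as a pair, scikit-learn's Pipeline bundles them into one object whose fit and score take raw text directly. A minimal sketch of the same stemmed bag-of-words model:

from sklearn.pipeline import Pipeline

# chain the vectorizer and classifier into a single estimator
pipeline = Pipeline([
    ('vectorizer', CountVectorizer(tokenizer=tokenize_and_stem)),
    ('classifier', MultinomialNB()),
])
pipeline.fit(train_speeches, train_labels)
print pipeline.score(test_speeches, test_labels)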

Putting it all together:

In [12]:
# instantiate vectorizer:
vectorizer = CountVectorizer()
vectorizer.fit(train_speeches)  # learn the vocabulary
# extract features from training speeches:
train_features = vectorizer.transform(train_speeches)

classifier = MultinomialNB()  # instantiate classifier
classifier.fit(train_features, train_labels)  # train classifier

# extract features from test speeches:
test_features = vectorizer.transform(test_speeches)
# test classifier's accuracy
classifier.score(test_features, test_labels)
Out[12]:
0.75427682737169521
In [13]:
speech = test_speeches[0]
label = test_labels[0]
features = vectorizer.transform([speech])
print "predicted: ", classifier.predict(features)[0]
print "actual: ", label
print "speech:"
print speech
predicted:  tragedy
actual:  tragedy
speech:
Farewell, Andronicus, my noble father,
The woefull'st man that ever liv'd in Rome.
Farewell, proud Rome, till Lucius come again;
He loves his pledges dearer than his life.
Farewell, Lavinia, my noble sister,
O would thou wert as thou tofore hast been!
But now nor Lucius nor Lavinia lives
But in oblivion and hateful griefs.
If Lucius live, he will requite your wrongs,
And make proud Saturnine and his emperess
Beg at the gates, like Tarquin and his queen.
Now will I to the Goths and raise a pow'r,
To be reveng'd on Rome and Saturnine.
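The classifier can also tell us how confident it is: predict_proba returns one probability per genre, in the order given by classifier.classes_.

# probability the classifier assigns to each genre for this speech
print classifier.classes_
print classifier.predict_proba(features)[0]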

Appendix: More Details about Vectorizers

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
texts = ["Hello, Will", "Hello, Globe"]
vectorizer.fit(texts)
print vectorizer.get_feature_names()
print vectorizer.transform(texts).toarray()
[u'globe', u'hello', u'will']
[[0 1 1]
 [1 1 0]]

In [15]:
# as a pandas dataframe
df = pd.DataFrame(
    vectorizer.transform(texts).toarray(),
    columns=vectorizer.get_feature_names(),
    index=texts
)
df
Out[15]:
              globe  hello  will
Hello, Will       0      1     1
Hello, Globe      1      1     0
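One last detail worth knowing: the vocabulary is frozen when you call fit, so tokens the vectorizer has never seen are silently ignored at transform time.

# 'hamlet' isn't in the fitted vocabulary, so only 'hello' is counted
print vectorizer.transform(["Hello, Hamlet"]).toarray()  # -> [[0 1 0]]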