IMDb Movie Genre Classification

In [122]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
In [123]:
# Download the raw training set (one row per movie) and preview the first rows.
TRAINING_SET_URL = 'https://raw.githubusercontent.com/ishmeetkohli/imdbGenreClassification/master/data/trainingSet.csv'
ds = pd.read_csv(TRAINING_SET_URL)
ds.head()
Out[123]:
imdbID Title Plot Genre1 Genre2 Genre3
0 tt0114709 Toy Story A little boy named Andy loves to be in his roo... Animation Adventure Comedy
1 tt0113497 Jumanji After being trapped in a jungle board game for... Action Adventure Family
2 tt0113228 Grumpier Old Men Things don't seem to change much in Wabasha Co... Comedy Romance NaN
3 tt0114885 Waiting to Exhale This story based on the best selling novel by ... Comedy Drama Romance
4 tt0113041 Father of the Bride Part II In this sequel to "Father of the Bride", Georg... Comedy Family Romance
In [124]:
# Keep only the title, the plot text and the primary genre.
# Selecting the wanted columns (rather than dropping the others in place)
# produces the same three-column frame, but the cell can be re-run without a
# KeyError because it no longer mutates the frame it reads from.
ds = ds[['Title', 'Plot', 'Genre1']].copy()
ds.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4456 entries, 0 to 4455
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   4456 non-null   object
 1   Plot    4456 non-null   object
 2   Genre1  4456 non-null   object
dtypes: object(3)
memory usage: 104.6+ KB
In [125]:
# Primary genres removed from the data set. With these gone, only the
# Comedy and Action rows remain (see the value counts shown below).
EXCLUDED_GENRES = [
    "Crime", "Drama", "Animation", "Documentary", "Adventure", "Biography",
    "Horror", "Fantasy", "Mystery", "Romance", "Sci-Fi", "Family",
    "Thriller", "Short", "War", "Western", "Musical",
]

# A single membership test replaces one copy-pasted filter line per genre;
# rows whose Genre1 is not in the list are kept, exactly as before.
ds = ds[~ds.Genre1.isin(EXCLUDED_GENRES)]
ds.Genre1.value_counts()
Out[125]:
Comedy    1339
Action     765
Name: Genre1, dtype: int64
In [126]:
# Give the label column a shorter, lower-case name and preview the result.
ds = ds.rename({"Genre1": "genre"}, axis="columns")
ds.head()
Out[126]:
Title Plot genre
1 Jumanji After being trapped in a jungle board game for... Action
2 Grumpier Old Men Things don't seem to change much in Wabasha Co... Comedy
3 Waiting to Exhale This story based on the best selling novel by ... Comedy
4 Father of the Bride Part II In this sequel to "Father of the Bride", Georg... Comedy
5 Heat Hunters and their prey--Neil and his professio... Action
In [127]:
# Persist the filtered frame. The row index is written too (pandas default),
# so it appears as an extra leading column in the CSV.
ds.to_csv(path_or_buf='../datasets/plot_clean.csv')
In [128]:
# Preview the first five rows of the frame that was just saved.
ds.head(n=5)
Out[128]:
Title Plot genre
1 Jumanji After being trapped in a jungle board game for... Action
2 Grumpier Old Men Things don't seem to change much in Wabasha Co... Comedy
3 Waiting to Exhale This story based on the best selling novel by ... Comedy
4 Father of the Bride Part II In this sequel to "Father of the Bride", Georg... Comedy
5 Heat Hunters and their prey--Neil and his professio... Action
In [129]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
In [131]:
# Hold out 20% of the plots for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    ds['Plot'],
    ds['genre'],
    test_size=0.2,
    random_state=69,
)
In [132]:
# TF-IDF features over the plot text, ignoring English stop words.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the training plots and vectorize them: tfidf_train
train_plots = X_train.values
tfidf_train = tfidf_vectorizer.fit_transform(train_plots)

# Vectorize the test plots with the already-fitted vectorizer: tfidf_test
test_plots = X_test.values
tfidf_test = tfidf_vectorizer.transform(test_plots)
In [133]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial Naive Bayes model with the library-default smoothing
# (alpha=1.0) spelled out: nb_classifier
nb_classifier = MultinomialNB(alpha=1.0)
In [134]:
# Train the Naive Bayes model on the TF-IDF training matrix and its genre labels
nb_classifier.fit(X=tfidf_train, y=y_train)
Out[134]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [135]:
# Predict a genre for every held-out plot: y_pred
y_pred = nb_classifier.predict(X=tfidf_test)
In [136]:
# Share of test plots whose predicted genre matches the true genre: score
score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)
0.7743467933491687
In [137]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support as score

# Random forest of 150 trees with no depth limit, trained on all CPU cores.
# The seed is fixed so that re-running the notebook rebuilds the same forest
# and therefore reports the same metrics; without it every run differs.
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=None,
    n_jobs=-1,
    random_state=69,
)

# fit() returns the estimator itself, so rf_model refers to the same object as rf.
rf_model = rf.fit(tfidf_train, y_train)
pred = rf_model.predict(tfidf_test)
In [140]:
from sklearn.metrics import classification_report
# Per-genre precision, recall and F1 for the random-forest predictions.
print(classification_report(y_true=y_test, y_pred=pred))
              precision    recall  f1-score   support

      Action       0.92      0.65      0.76       156
      Comedy       0.83      0.97      0.89       265

    accuracy                           0.85       421
   macro avg       0.87      0.81      0.83       421
weighted avg       0.86      0.85      0.84       421