IMDB Movies Genre Classification

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Fetch the IMDB training set directly from the upstream GitHub repository.
ds = pd.read_csv(
    'https://raw.githubusercontent.com/ishmeetkohli/imdbGenreClassification/master/data/trainingSet.csv'
)
ds.head()  # notebook preview of the first rows

# Remove the three columns that are not used further down; the frame is
# modified in place, so no re-assignment is needed.
ds.drop(columns=['Genre2', 'imdbID', 'Genre3'], inplace=True)
ds.info()  # prints the remaining columns with non-null counts and dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4456 entries, 0 to 4455
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Title   4456 non-null   object
 1   Plot    4456 non-null   object
 2   Genre1  4456 non-null   object
dtypes: object(3)
memory usage: 104.6+ KB

# Primary genres to discard. In the recorded run only "Comedy" and "Action"
# rows are left once these are removed.
EXCLUDED_GENRES = [
    "Crime",
    "Drama",
    "Animation",
    "Documentary",
    "Adventure",
    "Biography",
    "Horror",
    "Fantasy",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Family",
    "Thriller",
    "Short",
    "War",
    "Western",
    "Musical",
]

# One `isin` mask replaces seventeen chained `!=` filters: the frame is
# scanned and copied once instead of seventeen times, and exactly the same
# rows are kept (rows whose genre is not in the list, missing values included).
ds = ds[~ds.Genre1.isin(EXCLUDED_GENRES)]
ds.Genre1.value_counts()  # class balance of the genres that remain

Comedy    1339
Action     765
Name: Genre1, dtype: int64

# Give the label column the lowercase name "genre" used by the modelling cells.
ds = ds.rename({"Genre1": "genre"}, axis="columns")
ds.head()

# Write the filtered frame to disk with pandas' default CSV options; the path
# is relative to the current working directory.
ds.to_csv('../datasets/plot_clean.csv')

ds.head()

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Hold out 20% of the rows for evaluation. Plot text is the feature, genre is
# the label, and the fixed seed keeps the split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    ds.Plot,
    ds.genre,
    test_size=0.2,
    random_state=69,
)

# TF-IDF features, filtering scikit-learn's built-in English stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit the vectorizer on the training plots only and encode them: tfidf_train
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)

# Encode the held-out plots with the already-fitted vectorizer: tfidf_test
tfidf_test = tfidf_vectorizer.transform(X_test.values)

from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial Naive Bayes with default hyper-parameters: nb_classifier
nb_classifier = MultinomialNB()

# Train on the TF-IDF matrix of the training plots and their genre labels.
nb_classifier.fit(X=tfidf_train, y=y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

# Predict a genre for every held-out plot: y_pred
y_pred = nb_classifier.predict(tfidf_test)

# Accuracy of the predictions against the true held-out labels: score
score = metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
print(score)

0.7743467933491687

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import precision_recall_fscore_support as score

# Random forest on the same TF-IDF features. `random_state` is pinned to the
# seed already used for the train/test split so the report printed below is
# reproducible; without it every run grows a different forest and the scores
# drift from run to run.
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=None,
    n_jobs=-1,  # use all available CPU cores
    random_state=69,
)

rf_model = rf.fit(tfidf_train, y_train)  # fit() returns the fitted estimator
pred = rf_model.predict(tfidf_test)

from sklearn.metrics import classification_report
# Per-class precision, recall and F1 on the held-out plots.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

      Action       0.92      0.65      0.76       156
      Comedy       0.83      0.97      0.89       265

    accuracy                           0.85       421
   macro avg       0.87      0.81      0.83       421
weighted avg       0.86      0.85      0.84       421

	Title	Plot	genre
1	Jumanji	After being trapped in a jungle board game for...	Action
2	Grumpier Old Men	Things don't seem to change much in Wabasha Co...	Comedy
3	Waiting to Exhale	This story based on the best selling novel by ...	Comedy
4	Father of the Bride Part II	In this sequel to "Father of the Bride", Georg...	Comedy
5	Heat	Hunters and their prey--Neil and his professio...	Action

	Title	Plot	genre
1	Jumanji	After being trapped in a jungle board game for...	Action
2	Grumpier Old Men	Things don't seem to change much in Wabasha Co...	Comedy
3	Waiting to Exhale	This story based on the best selling novel by ...	Comedy
4	Father of the Bride Part II	In this sequel to "Father of the Bride", Georg...	Comedy
5	Heat	Hunters and their prey--Neil and his professio...	Action

	imdbID	Title	Plot	Genre1	Genre2	Genre3
0	tt0114709	Toy Story	A little boy named Andy loves to be in his roo...	Animation	Adventure	Comedy
1	tt0113497	Jumanji	After being trapped in a jungle board game for...	Action	Adventure	Family
2	tt0113228	Grumpier Old Men	Things don't seem to change much in Wabasha Co...	Comedy	Romance	NaN
3	tt0114885	Waiting to Exhale	This story based on the best selling novel by ...	Comedy	Drama	Romance
4	tt0113041	Father of the Bride Part II	In this sequel to "Father of the Bride", Georg...	Comedy	Family	Romance