import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# Load the training set directly from the upstream GitHub repository
# (requires network access).
ds = pd.read_csv('https://raw.githubusercontent.com/ishmeetkohli/imdbGenreClassification/master/data/trainingSet.csv')
ds.head()  # bare expression: only displays in an interactive session/notebook

# Drop the columns that are not used below; Genre1 is kept as the label.
ds.drop(['Genre2', 'imdbID', 'Genre3'], axis=1, inplace=True)
ds.info()

# Values of Genre1 whose rows are removed from the data set.
EXCLUDED_GENRES = [
    "Crime",
    "Drama",
    "Animation",
    "Documentary",
    "Adventure",
    "Biography",
    "Horror",
    "Fantasy",
    "Mystery",
    "Romance",
    "Sci-Fi",
    "Family",
    "Thriller",
    "Short",
    "War",
    "Western",
    "Musical",
]

# One vectorised membership test instead of one filtering pass per genre.
# Rows whose Genre1 is missing never match `isin`, so they are kept, exactly
# as they would be by a chain of `!=` comparisons.
ds = ds[~ds.Genre1.isin(EXCLUDED_GENRES)]
ds.Genre1.value_counts()  # bare expression: only displays interactively

# Use a neutral label column name for the modelling code.
ds = ds.rename(columns={"Genre1": "genre"})
ds.head()

# Persist the cleaned frame; the path is relative to the working directory.
ds.to_csv('../datasets/plot_clean.csv')
ds.head()
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    ds['Plot'],
    ds['genre'],
    test_size=0.2,
    random_state=69,
)

# TF-IDF features with English stop words removed. The vectorizer is fitted
# on the training plots only and then applied unchanged to the test plots.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(X_train.values)
tfidf_test = tfidf_vectorizer.transform(X_test.values)

# Multinomial Naive Bayes: fit on the training matrix (fit() returns the
# estimator itself), then predict genres for the held-out plots.
nb_classifier = MultinomialNB().fit(tfidf_train, y_train)
y_pred = nb_classifier.predict(tfidf_test)

# Report accuracy on the held-out set.
score = metrics.accuracy_score(y_test, y_pred)
print(score)
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
# NOTE: this alias rebinds the module-level name `score`, which held the
# Naive Bayes accuracy value up to this point.
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import classification_report

# Random forest on the same TF-IDF features. The seed is fixed (matching the
# one used for the train/test split) so the report below is reproducible
# between runs; n_jobs=-1 trains the trees on all available cores.
rf = RandomForestClassifier(
    n_estimators=150,
    max_depth=None,
    n_jobs=-1,
    random_state=69,
)
rf_model = rf.fit(tfidf_train, y_train)  # fit() returns the estimator itself
pred = rf_model.predict(tfidf_test)

# Per-class precision / recall / F1 on the held-out set.
print(classification_report(y_test, pred))