Oscar Prediction : Leonardo DiCaprio Will Finally Win an Oscar

How to make a prediction on best actor of oscar 2016 based on machine learning?

Posted on March 15, 2016

小李子的故事

import os, time
import urllib2, urllib, re, bs4
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import json
# list = ['BAFTA', 'Globes', 'SAG', 'Spirits', 'CC', 'Oscar']
# BAFTA=DataFrame()
# Globes=DataFrame()
# SAG=DataFrame()
# Spirits=DataFrame()
# CC=DataFrame()
# Oscar=DataFrame()

# dic = {"BAFTA":BAFTA, "Globes":Globes, "SAG":SAG, "Spirits":Spirits, "CC":CC, "Oscar":Oscar}
dic = {"BAFTA":DataFrame(), "Globes":DataFrame(), "SAG":DataFrame(), "Spirits":DataFrame(), "CC":DataFrame(), "Oscar":DataFrame()}

for key in dic.keys():
    dic[key] = pd.read_json(json.loads(open(key + '.json').read()), orient='split')
    print key   
    print dic[key].head()
BAFTA
              name  year  BAFTA0  BAFTA1  BAFTA2
0       Hugh Grant  1995       0       0       1
1    Terence Stamp  1995       0       1       0
2        Tom Hanks  1995       0       1       0
3    John Travolta  1995       0       1       0
4  Nigel Hawthorne  1996       0       0       1
Spirits
                name  year  Spirits0  Spirits1  Spirits2
0  Samuel L. Jackson  1995         0         0         1
1        Sihung Lung  1995         0         1         0
2    William H. Macy  1995         0         1         0
3     Campbell Scott  1995         0         1         0
4           Jon Seda  1995         0         1         0
CC
             name  year  CC0  CC1  CC2
0     Kevin Bacon  1996    0    0    1
1    Nicolas Cage  1996    0    1    0
2   Geoffrey Rush  1997    0    0    1
3      Tom Cruise  1997    0    1    0
4  Jack Nicholson  1998    0    0    1
Oscar
              name  year  Oscar0  Oscar1  Oscar2
0        Tom Hanks  1995       0       0       1
1   Morgan Freeman  1995       0       1       0
2  Nigel Hawthorne  1995       0       1       0
3      Paul Newman  1995       0       1       0
4    John Travolta  1995       0       1       0
SAG
             name  year  SAG0  SAG1  SAG2
0       Tom Hanks  1995     0     0     1
1  Morgan Freeman  1995     0     1     0
2     Paul Newman  1995     0     1     0
3     Tim Robbins  1995     0     1     0
4   John Travolta  1995     0     1     0
Globes
             name  year  Globes0  Globes1  Globes2
0       Tom Hanks  1995        0        0        1
1  Morgan Freeman  1995        0        1        0
2     Paul Newman  1995        0        1        0
3       Brad Pitt  1995        0        1        0
4   John Travolta  1995        0        1        0
award_list = dic['BAFTA']
for x in ['Globes', 'SAG', 'Spirits', 'CC', 'Oscar']:
    award_list = pd.merge(award_list,dic[x], how='outer', on=['name','year'])
award_list.head()
name year BAFTA0 BAFTA1 BAFTA2 Globes0 Globes1 Globes2 SAG0 SAG1 SAG2 Spirits0 Spirits1 Spirits2 CC0 CC1 CC2 Oscar0 Oscar1 Oscar2
0 Hugh Grant 1995 0 0 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Terence Stamp 1995 0 1 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 Tom Hanks 1995 0 1 0 0 0 1 0 0 1 NaN NaN NaN NaN NaN NaN 0 0 1
3 John Travolta 1995 0 1 0 0 1 0 0 1 0 NaN NaN NaN NaN NaN NaN 0 1 0
4 Nigel Hawthorne 1996 0 0 1 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
# Globes = pd.read_json(json.loads(open('Globes.json').read()),orient='split')
# BAFTA = pd.read_json(json.loads(open('BAFTA.json').read()), orient='split')
# SAG = pd.read_json(json.loads(open('SAG.json').read()),orient='split')
# Spirits = pd.read_json(json.loads(open('Spirits.json').read()), orient='split')
# CC = pd.read_json(json.loads(open('CC.json').read()),orient='split')
# Oscar = pd.read_json(json.loads(open('Oscar.json').read()),orient='split')
# award_list = BAFTA
# for x in [Globes, SAG, Spirits, CC, Oscar]:
#     award_list = pd.merge(award_list,x, how='outer', on=['name','year'])
# award_list
Globes=dic['Globes']
Globes[Globes['name']=='Leonardo DiCaprio']
name year Globes0 Globes1 Globes2
18 Leonardo DiCaprio 1998 0 1 0
41 Leonardo DiCaprio 2003 0 1 0
50 Leonardo DiCaprio 2005 0 0 1
61 Leonardo DiCaprio 2007 0 1 0
62 Leonardo DiCaprio 2007 0 1 0
74 Leonardo DiCaprio 2009 0 1 0
86 Leonardo DiCaprio 2012 0 1 0
award_list['BAFTA0']=award_list['BAFTA0'].fillna(1)
award_list['Globes0']=award_list['Globes0'].fillna(1)
award_list['SAG0']=award_list['SAG0'].fillna(1)
award_list['Spirits0']=award_list['Spirits0'].fillna(1)
award_list['CC0']=award_list['CC0'].fillna(1)
award_list=award_list.fillna(0)

# award_list['BAFTA1']=award_list['BAFTA1'].fillna(0)
# award_list['Globes1']=award_list['Globes1'].fillna(0)
# award_list['SAG1']=award_list['SAG1'].fillna(0)
# award_list['Spirits1']=award_list['Spirits1'].fillna(0)
# award_list['CC1']=award_list['CC1'].fillna(0)

# award_list['BAFTA2']=award_list['BAFTA2'].fillna(0)
# award_list['Globes2']=award_list['Globes2'].fillna(0)
# award_list['SAG2']=award_list['SAG2'].fillna(0)
# award_list['Spirits2']=award_list['Spirits2'].fillna(0)
# award_list['CC2']=award_list['CC2'].fillna(0)
# award_list['Oscar2']=award_list['Oscar2'].fillna(0)

award_list = award_list.drop(['Oscar0','Oscar1'],axis=1)

award_list=award_list.drop_duplicates()
award_list.head()
name year BAFTA0 BAFTA1 BAFTA2 Globes0 Globes1 Globes2 SAG0 SAG1 SAG2 Spirits0 Spirits1 Spirits2 CC0 CC1 CC2 Oscar2
0 Hugh Grant 1995 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0
1 Terence Stamp 1995 0 1 0 1 0 0 1 0 0 1 0 0 1 0 0 0
2 Tom Hanks 1995 0 1 0 0 0 1 0 0 1 1 0 0 1 0 0 1
3 John Travolta 1995 0 1 0 0 1 0 0 1 0 1 0 0 1 0 0 0
4 Nigel Hawthorne 1996 0 0 1 1 0 0 1 0 0 1 0 0 1 0 0 0
award_list[award_list['name']=='Leonardo DiCaprio']
name year BAFTA0 BAFTA1 BAFTA2 Globes0 Globes1 Globes2 SAG0 SAG1 SAG2 Spirits0 Spirits1 Spirits2 CC0 CC1 CC2 Oscar2
47 Leonardo DiCaprio 2005 0 1 0 0 0 1 0 1 0 1 0 0 0 1 0 0
58 Leonardo DiCaprio 2007 0 1 0 0 1 0 0 1 0 1 0 0 0 1 0 0
97 Leonardo DiCaprio 2014 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 0
117 Leonardo DiCaprio 1998 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0
130 Leonardo DiCaprio 2003 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0
141 Leonardo DiCaprio 2009 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0
146 Leonardo DiCaprio 2012 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 0
# Machine Learning Imports
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.learning_curve import learning_curve

# For evaluating our ML results
from sklearn import metrics

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

# Dataset Import
import statsmodels.api as sm
%matplotlib inline
X = award_list.drop(['name','year','Oscar2'],axis=1)
y = award_list.Oscar2
y.head()
0    0
1    0
2    1
3    0
4    0
Name: Oscar2, dtype: float64
# Create LogisticRegression model
log_model = LogisticRegression(C=1.0, penalty='l2', tol=1e-6)

# Fit our data
log_model.fit(X,y)

# Check our accuracy
log_model.score(X,y)
0.96590909090909094
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        train_sizes=np.linspace(.1, 1.0, 5)):

    plt.figure()
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=5, n_jobs=1, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",alpha=0.6,linewidth=4,
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="navy",alpha=0.6,linewidth=4,
             label="Cross-validation score")

    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.legend(loc="best")
    plt.grid("on") 
    if ylim:
        plt.ylim(ylim)
    plt.title(title)
    plt.show()


plot_learning_curve(LogisticRegression(C=20.0, penalty='l2'), "LogisticRegression(C=10.0,penalty='l2')",
                    X, y, ylim=(0.8, 1.05),
                    train_sizes=np.linspace(.1, 1, 10))

png

from sklearn.grid_search import GridSearchCV
GS = GridSearchCV(LogisticRegression(), 
                   param_grid={"C": [0.1, 1.0, 5.0, 10.0, 20.0],"penalty":['l1','l2']})

print "Chosen parameter on 100 datapoints: %s" % GS.fit(X, y).best_params_
plot_learning_curve(GS, "C", 
                    X, y, ylim=(0.8, 1.05),
                    train_sizes=np.linspace(.1, 1, 10))
GS.best_estimator_
Chosen parameter on 100 datapoints: {'penalty': 'l1', 'C': 1.0}

png

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
# Globes
colist=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2']
X_test = DataFrame(columns=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
win = Series([0,0,0,0,0,1,0,0,0,0,0,0,0,0,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,0,0,0,1,0,0,0,0,0,0,0,0,0,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,0,0,0,1,0,0,0,0,0,0,0,0,0,0],index=colist)
X_test = X_test.append(win,ignore_index=True)

log_model = LogisticRegression(C=1.0, penalty='l1')
log_model.fit(X,y)
log_model.predict_proba(X_test)[:,1]
array([ 0.32155716,  0.28428716,  0.28428716])
# Globes - CC
X_test = DataFrame(columns=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
win = Series([0,0,0,0,0,1,0,0,0,0,0,0,0,0,1],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,0,0,0,1,0,0,0,0,0,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,0,0,0,1,0,0,0,0,0,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)

log_model = LogisticRegression(C=1.0, penalty='l1')
log_model.fit(X,y)
log_model.predict_proba(X_test)[:,1]
array([ 0.48074555,  0.19276261,  0.19276261])
# Globes - CC - SAGs
X_test = DataFrame(columns=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
win = Series([0,0,0,0,0,1,0,0,1,0,0,0,0,0,1],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,0,0,0,1,0,0,1,0,0,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,0,0,0,1,0,1,0,0,0,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)

log_model = LogisticRegression(C=1.0, penalty='l1')
log_model.fit(X,y)
log_model.predict_proba(X_test)[:,1]
array([ 0.88804238,  0.05891302,  0.00865326])
# Globes - CC - SAGs - BAFTA
X_test = DataFrame(columns=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
win = Series([0,0,1,0,0,1,0,0,1,0,0,0,0,0,1],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,1,0,0,1,0,0,1,0,0,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,1,0,0,1,0,1,0,0,0,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)

log_model = LogisticRegression(C=1.0, penalty='l1')
log_model.fit(X,y)
log_model.predict_proba(X_test)[:,1]
array([ 0.88809967,  0.05896   ,  0.00865978])
# Globes - CC - SAGs - BAFTA - Spirits
X_test = DataFrame(columns=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
win = Series([0,0,1,0,0,1,0,0,1,1,0,0,0,0,1],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,1,0,0,1,0,0,1,0,1,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)
win = Series([0,1,0,0,1,0,1,0,0,1,0,0,0,1,0],index=['B0','B1','B2','G0','G1','G2','S0','S1','S2','SP0','SP1','SP2','C0','C1','C2'])
X_test = X_test.append(win,ignore_index=True)

log_model = LogisticRegression(C=1.0, penalty='l1')
log_model.fit(X,y)
log_model.predict_proba(X_test)[:,1]
array([ 0.87316036,  0.0515114 ,  0.0075178 ])
matrix = DataFrame(columns=['name','Globes','CriticsChoice','SAGs','BAFTA','Spirit'])
nominee = Series(['Leonardo DiCaprio',0.0157,0.279,0.83,0.911,0.911],index=['name','Globes','CriticsChoice','SAGs','BAFTA','Spirit'])
matrix = matrix.append(nominee,ignore_index=True)
nominee1 = Series(['Michael Fassbender',0.008,0.003,0.024,0.0486,0.0486],index=['name','Globes','CriticsChoice','SAGs','BAFTA','Spirit'])
matrix = matrix.append(nominee1,ignore_index=True)
nominee2 = Series(['Matt Damon',0.0086,0.0087,0.0033,0.0033,0.0068],index=['name','Globes','CriticsChoice','SAGs','BAFTA','Spirit'])
matrix = matrix.append(nominee2,ignore_index=True)
matrix
name Globes CriticsChoice SAGs BAFTA Spirit
0 Leonardo DiCaprio 0.0157 0.2790 0.8300 0.9110 0.9110
1 Michael Fassbender 0.0080 0.0030 0.0240 0.0486 0.0486
2 Matt Damon 0.0086 0.0087 0.0033 0.0033 0.0068
plt.figure()
plt.plot(nominee[1:], 's-', color="r",alpha=0.4,linewidth=4,
             label="Leonardo DiCaprio (The Revenant)")
plt.plot(nominee1[1:], 's-', color="b",alpha=0.5,linewidth=4,
             label="Michael Fassbender (Steve Jobs)")
plt.plot(nominee2[1:], 's-', color="g",alpha=0.5,linewidth=4,
             label="Matt Damon (The Martian)")
labels=['Globes','CriticsChoice','SAGs','BAFTA','Spirit']
plt.xticks([0,1,2,3,4], labels)
plt.xlabel("Awards List")
plt.ylabel("Probability to win Oscar")
plt.xlim(-0.5,4.5)
plt.ylim(-0.1,1.1)
plt.legend(loc="best")
plt.grid("on")
plt.show()

png