Data Dojo 18 - More Models¶

Setup¶

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

Hacking Order¶

In [3]:
# Shuffle the list of names into a reproducible "hacking order".
names = [
    "Robin", "Simon", "Mike", "Chris", "Melanie", "Sabine",
    "Jörg", "Markus", "Kerstin", "Andreas", "Felix",
]
np.random.seed(42)  # fixed seed: the shuffle below yields the same order on every run
np.random.shuffle(names)  # reorders the list in place
" => ".join(names)
Out[3]:
'Sabine => Robin => Andreas => Felix => Mike => Simon => Kerstin => Melanie => Markus => Chris => Jörg'

Data Loading¶

In [2]:
# Download the tab-separated train and test tables of the session-14 trees data from GitHub.
data = pd.read_csv(
    "https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/train.tsv",
    sep="\t",
)
test = pd.read_csv(
    "https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/test.tsv",
    sep="\t",
)

All cases¶

In [3]:
# Separate the target column from the feature columns, then hold out a validation split.
y = data["species"]
X = data.drop(columns="species")
# No test_size is passed, so scikit-learn's default split proportion applies; seeded for repeatability.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
In [4]:
# Keep the tree identifiers aside and use the remaining test columns as model input.
tree_id = test["tree_id"]
X_test = test.drop(columns="tree_id")
# Prediction table: one row per test tree, with a placeholder label until a model fills it in.
pred = pd.DataFrame({"tree_id": tree_id, "species": "unknown"})

Only complete cases¶

In [5]:
# Keep only training rows without any missing value, then split into train and validation parts.
data_complete = data.dropna()
y_complete = data_complete["species"]
X_complete = data_complete.drop(columns="species")
(
    X_train_complete,
    X_val_complete,
    y_train_complete,
    y_val_complete,
) = train_test_split(X_complete, y_complete, random_state=42)
In [6]:
# Same preparation as for the full test table, restricted to test rows without missing values.
test_complete = test.dropna()
tree_id_complete = test_complete["tree_id"]
X_test_complete = test_complete.drop(columns="tree_id")
# Prediction table for the complete rows only; labels start out as a placeholder.
pred_complete = pd.DataFrame({"tree_id": tree_id_complete, "species": "unknown"})

Models¶

In [16]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
In [10]:
# Baseline model: a support vector classifier with all scikit-learn default settings.
svmclassifier = svm.SVC()
In [12]:
# Train the baseline SVC on the complete-case training split; the features are passed in unscaled.
svmclassifier.fit(X_train_complete, y_train_complete)
Out[12]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [14]:
# Preview the baseline model's predicted species for the complete-case test rows.
svmclassifier.predict(X_test_complete)
Out[14]:
array(['Fagus sylvatica', 'Quercus ilex', 'Quercus ilex', ...,
       'Quercus ilex', 'Fagus sylvatica', 'Fagus sylvatica'], dtype=object)
In [16]:
# Score the baseline SVC on the held-out complete-case validation split.
svmclassifier.score(X_val_complete,y_val_complete)
Out[16]:
0.8920541645375575
In [0]:
# Polynomial-kernel variant (degree 3); it is not fitted or scored in the visible cells.
svmclassifier_op = svm.SVC(kernel="poly", degree=3)
In [0]:
 
In [17]:
# Default SVC again, but with the features rescaled by MinMaxScaler inside one pipeline.
svmclassifier_op_1 = make_pipeline(MinMaxScaler(), svm.SVC())
In [18]:
# Fit scaler and classifier together on the complete-case training split.
svmclassifier_op_1.fit(X_train_complete, y_train_complete)
Out[18]:
Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('svc', SVC())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('svc', SVC())])
MinMaxScaler()
SVC()
In [19]:
# Score the scaled pipeline on the same validation split used for the baseline SVC.
svmclassifier_op_1.score(X_val_complete,y_val_complete)
Out[19]:
0.9173479816044967
In [23]:
# Class weights inversely proportional to class frequency: total rows / rows in that class.
# The label counts are computed inside this cell so it runs on a fresh kernel from top to
# bottom, and so the list of people stored in `names` is not mistaken for class labels.
# NOTE(review): counts come from all labelled rows in `y`, not only the complete-case
# training split the weighted models are fitted on - confirm this is intended.
class_names, class_counts = np.unique(y, return_counts=True)
total_count = class_counts.sum()
# Iterate over every class found instead of a hard-coded range(3).
weight = {
    name: float(total_count / count)
    for name, count in zip(class_names, class_counts)
}
print(weight)
{'Fagus sylvatica': 4.674257557597549, 'Pinus pinaster': 3.727195497504513, 'Quercus ilex': 1.9313817201342651}
In [21]:
# Unique labels in `y` and the number of rows carrying each of them.
# NOTE: this rebinds `names`, which until now held the shuffled list from the "Hacking Order" cell.
names, counts = np.unique(y, return_counts = True)
In [24]:
# RBF-kernel SVC that applies the per-class weights stored in `weight`.
svmclassifier_weight = svm.SVC(kernel="rbf", class_weight=weight)
In [25]:
# Train the class-weighted SVC on the complete-case training split (features unscaled).
svmclassifier_weight.fit(X_train_complete, y_train_complete)
Out[25]:
SVC(class_weight={'Fagus sylvatica': 4.674257557597549,
                  'Pinus pinaster': 3.727195497504513,
                  'Quercus ilex': 1.9313817201342651})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC(class_weight={'Fagus sylvatica': 4.674257557597549,
                  'Pinus pinaster': 3.727195497504513,
                  'Quercus ilex': 1.9313817201342651})
In [26]:
# Score the class-weighted SVC on the complete-case validation split.
svmclassifier_weight.score(X_val_complete,y_val_complete)
Out[26]:
0.9047010730710271
In [27]:
# Combine both ideas: min-max scaling followed by a class-weighted SVC.
weighted_svc = svm.SVC(class_weight=weight)
svmcw_pipe = make_pipeline(MinMaxScaler(), weighted_svc)
svmcw_pipe.fit(X_train_complete, y_train_complete)
# Last expression of the cell: validation score of the combined pipeline.
svmcw_pipe.score(X_val_complete, y_val_complete)
Out[27]:
0.9110884006131834

Save Test Predictions¶

In [28]:
# Label the complete-case test rows with the min-max-scaled SVC pipeline and write them to disk.
test_species = svmclassifier_op_1.predict(X_test_complete)
pred_complete["species"] = test_species
# Tab-separated output; the index is written too because to_csv is called without index=False.
pred_complete.to_csv("prediction_svm_first.tsv", sep="\t")
In [0]:
# `model` is not defined in any visible cell, and every classifier in this notebook was
# fitted on complete cases only. Reuse the fitted min-max-scaled SVC pipeline for the test
# rows without missing values; all other rows keep the "unknown" placeholder that was set
# when `pred` was created, so the file still contains one row per test tree.
pred.loc[X_test_complete.index, "species"] = svmclassifier_op_1.predict(X_test_complete)
pred.to_csv("my_prediction.tsv", sep="\t")