import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# Seed NumPy's global RNG so the shuffled order below is reproducible.
np.random.seed(42)

names = [
    "Robin",
    "Simon",
    "Mike",
    "Chris",
    "Melanie",
    "Sabine",
    "Jörg",
    "Markus",
    "Kerstin",
    "Andreas",
    "Felix",
]

# Shuffle the list in place, then render the resulting order as one string
# (the bare expression is only displayed when run as a notebook cell).
np.random.shuffle(names)
" => ".join(names)
# Recorded notebook output:
# 'Sabine => Robin => Andreas => Felix => Mike => Simon => Kerstin => Melanie => Markus => Chris => Jörg'
# Load the labelled training table and the unlabelled test table (both TSV).
data = pd.read_csv(
    "https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/train.tsv",
    sep="\t",
)
test = pd.read_csv(
    "https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/test.tsv",
    sep="\t",
)

# --- All rows (may contain missing values) ---------------------------------
# Features are every column except the "species" target.
y = data["species"]
X = data.drop(columns="species")
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# The test table carries a "tree_id" column instead of a label.
tree_id = test["tree_id"]
X_test = test.drop(columns="tree_id")

# Prediction table: one row per test tree, species filled with a placeholder.
pred = pd.DataFrame({"tree_id": tree_id, "species": "unknown"})

# --- Complete cases only (rows with any missing value dropped) -------------
y_complete = data.dropna()["species"]
X_complete = data.dropna().drop(columns="species")
X_train_complete, X_val_complete, y_train_complete, y_val_complete = train_test_split(
    X_complete, y_complete, random_state=42
)

tree_id_complete = test.dropna()["tree_id"]
X_test_complete = test.dropna().drop(columns="tree_id")

# Same placeholder table, restricted to the complete test rows.
pred_complete = pd.DataFrame({"tree_id": tree_id_complete, "species": "unknown"})
from sklearn import svm
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
# Baseline: SVC with default hyper-parameters, fitted on the unscaled
# complete-case training split.
svmclassifier = svm.SVC()
svmclassifier.fit(X_train_complete, y_train_complete)
# Recorded notebook output (estimator repr): SVC()

# Predict the complete test rows; the result is only displayed, not stored.
svmclassifier.predict(X_test_complete)
# Recorded notebook output:
# array(['Fagus sylvatica', 'Quercus ilex', 'Quercus ilex', ..., 'Quercus ilex', 'Fagus sylvatica', 'Fagus sylvatica'], dtype=object)

# Score of the baseline on the complete-case validation split.
svmclassifier.score(X_val_complete, y_val_complete)
# Recorded notebook output: 0.8920541645375575
# Degree-3 polynomial-kernel SVC. It is only constructed here; nothing in the
# visible code fits or evaluates it.
svmclassifier_op = svm.SVC(degree=3, kernel="poly")

# Pipeline: MinMaxScaler followed by a default SVC, fitted on the
# complete-case training split.
svmclassifier_op_1 = make_pipeline(MinMaxScaler(), svm.SVC())
svmclassifier_op_1.fit(X_train_complete, y_train_complete)
# Recorded notebook output (estimator repr):
# Pipeline(steps=[('minmaxscaler', MinMaxScaler()), ('svc', SVC())])

# Score of the scaled pipeline on the complete-case validation split.
svmclassifier_op_1.score(X_val_complete, y_val_complete)
# Recorded notebook output: 0.9173479816044967
# Class labels and their frequencies. This must run BEFORE `weight` is built:
# the comprehension below reads both `names` and `counts`.
# Note: this rebinds `names`, which earlier held the shuffled participant list.
# NOTE(review): counts are taken from the full label column `y`, while the
# models below are fitted on `y_train_complete` only — confirm this is intended.
names, counts = np.unique(y, return_counts=True)

# Inverse-frequency class weights: total sample count divided by the count of
# each class, so less frequent species receive larger weights. Iterating over
# all classes avoids hard-coding the number of species.
weight = {name: counts.sum() / count for name, count in zip(names, counts)}
print(weight)
# Recorded notebook output:
# {'Fagus sylvatica': 4.674257557597549, 'Pinus pinaster': 3.727195497504513, 'Quercus ilex': 1.9313817201342651}

# RBF-kernel SVC using the class weights, fitted on the unscaled complete-case
# training split.
svmclassifier_weight = svm.SVC(kernel="rbf", class_weight=weight)
svmclassifier_weight.fit(X_train_complete, y_train_complete)
# Recorded notebook output (estimator repr):
# SVC(class_weight={'Fagus sylvatica': 4.674257557597549, 'Pinus pinaster': 3.727195497504513, 'Quercus ilex': 1.9313817201342651})

# Score of the weighted SVC on the complete-case validation split.
svmclassifier_weight.score(X_val_complete, y_val_complete)
# Recorded notebook output: 0.9047010730710271
# MinMaxScaler followed by an SVC that receives the `weight` mapping as its
# class weights; fitted and scored on the complete-case splits.
svmcw_pipe = make_pipeline(
    MinMaxScaler(),
    svm.SVC(class_weight=weight),
)
svmcw_pipe.fit(X_train_complete, y_train_complete)
svmcw_pipe.score(X_val_complete, y_val_complete)
# Recorded notebook output: 0.9110884006131834
# Predict the complete test rows with the MinMax-scaled SVC pipeline (the
# highest recorded validation score among the models above) and save them.
pred_complete["species"] = svmclassifier_op_1.predict(X_test_complete)
pred_complete.to_csv("prediction_svm_first.tsv", sep="\t")

# Full-length prediction table. The original line called `model.predict(X_test)`,
# but no `model` is defined, and `X_test` still includes the rows with missing
# values that were dropped before fitting. Instead, copy the complete-case
# predictions into `pred` by index label; rows that had missing values keep the
# "unknown" placeholder.
pred.loc[pred_complete.index, "species"] = pred_complete["species"]
pred.to_csv("my_prediction.tsv", sep="\t")