[`scikit-learn` documentation on ensemble models](https://scikit-learn.org/stable/modules/ensemble.html)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed so the shuffled presentation order is reproducible.
np.random.seed(42)
names = [
    "Robin", "Markus", "Mike", "Kerstin", "Stefan", "Chris",
    "Andi", "Carolin", "Sascha", "Sabine", "Prithivi",
]
np.random.shuffle(names)  # in-place permutation
" => ".join(names)  # display the resulting order
'Chris => Robin => Sabine => Prithivi => Mike => Markus => Sascha => Stefan => Carolin => Kerstin => Andi'
# --- Load data and build train/validation splits ---------------------------
# Training data carries a "species" target; the test set carries "tree_id".
data = pd.read_csv("https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/train.tsv", sep="\t")
test = pd.read_csv("https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/test.tsv", sep="\t")

# Full data (missing values included): features vs. target.
X = data.drop("species", axis=1)
y = data.species
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# Test features plus a prediction frame keyed by tree_id.
X_test = test.drop("tree_id", axis=1)
tree_id = test.tree_id
pred = pd.DataFrame()
pred["tree_id"] = tree_id
pred["species"] = "unknown"

# Complete-case variant: keep only rows without any missing value.
complete = data.dropna()  # computed once instead of twice
X_complete = complete.drop("species", axis=1)
y_complete = complete.species
X_train_complete, X_val_complete, y_train_complete, y_val_complete = train_test_split(
    X_complete, y_complete, random_state=42
)
test_complete = test.dropna()
X_test_complete = test_complete.drop("tree_id", axis=1)
tree_id_complete = test_complete.tree_id
pred_complete = pd.DataFrame()
pred_complete["tree_id"] = tree_id_complete
pred_complete["species"] = "unknown"
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over depth-10 decision trees, fitted on the complete cases only.
weak_learner = DecisionTreeClassifier(max_depth=10)
adaboost = AdaBoostClassifier(weak_learner, n_estimators=10)
adaboost.fit(X_train_complete, y_train_complete)
AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=10), n_estimators=10)
DecisionTreeClassifier(max_depth=10)
DecisionTreeClassifier(max_depth=10)
# Accuracy on the data the booster was fit on (optimistic estimate).
adaboost.score(X_train_complete, y_train_complete)
0.9773878976280713
# Held-out validation accuracy of the AdaBoost model.
adaboost.score(X_val_complete, y_val_complete)
0.9412365866121615
from sklearn.ensemble import RandomForestClassifier

# Random forest on the complete cases; oob_score=True additionally records
# an out-of-bag accuracy estimate while fitting.
RF = RandomForestClassifier(criterion="log_loss", n_estimators=100, oob_score=True)
RF.fit(X_train_complete, y_train_complete)
RandomForestClassifier(criterion='log_loss', oob_score=True)
# Held-out validation accuracy of the random forest.
RF.score(X_val_complete, y_val_complete)
0.9503065917220235
# NOTE: this is the constructor flag (True/False), not the OOB accuracy —
# the estimate itself lives in RF.oob_score_.
RF.oob_score
True
# Out-of-bag accuracy estimate computed during fit (needs oob_score=True).
RF.oob_score_
0.9506025635566154
# Impurity-based importance of each feature (values sum to 1).
RF.feature_importances_
array([0.29352523, 0.27102041, 0.07447549, 0.27080703, 0.09017184])
# Feature (column) names, in the same order as feature_importances_.
RF.feature_names_in_
array(['latitude', 'longitude', 'stem_diameter_cm', 'height_m', 'crown_radius_m'], dtype=object)
# Predict on the full test set and write the submission file.
# FIX: `model` was undefined here — the fitted estimator in scope is `RF`.
# NOTE(review): RF was trained on complete cases only; predicting on the raw
# X_test will fail if it still contains missing values — in that case use the
# complete-case variant (pred_complete) instead.
pred["species"] = RF.predict(X_test)
pred.to_csv("my_prediction.tsv", sep="\t")
Or, using only the complete cases (test rows without missing values):
# Complete-case submission: predictions only for test rows without NaNs.
species_hat = RF.predict(X_test_complete)
pred_complete["species"] = species_hat
pred_complete.to_csv("randomforest_logloss_prediction.tsv", sep="\t")