Data Dojo 21 - Neural Networks¶

[`scikit-learn` documentation on neural networks (supervised)](https://scikit-learn.org/stable/modules/neural_networks_supervised.html)

Setup¶

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

Hacking Order¶

In [3]:
np.random.seed(42)
names = ["Robin", "Max", "Sonia", "Konstantin","Andi", "Kerstin","Sabine","Caro","Jana","Sascha","Dana","Chaitanya" ]
np.random.shuffle(names)
" => ".join(names)
Out[3]:
'Dana => Sascha => Robin => Jana => Kerstin => Sonia => Max => Chaitanya => Andi => Caro => Konstantin => Sabine'
In [4]:
80/12
Out[4]:
6.666666666666667

Data Loading¶

In [2]:
data = pd.read_csv("https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/train.tsv", sep="\t")
test = pd.read_csv("https://github.com/ddojo/ddojo.github.io/raw/main/sessions/14_trees/test.tsv", sep="\t")
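
Both tables contain rows with missing measurements (the notebook later restricts itself to complete cases), so a quick look at per-column missingness helps decide how to handle them. A minimal sketch using pandas on the frames loaded above:

# count missing values per column in the training and test tables
print(data.isna().sum())
print(test.isna().sum())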

All cases¶

In [3]:
X = data.drop("species",axis=1)
y = data.species
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
In [4]:
X_test = test.drop("tree_id",axis=1)
tree_id = test.tree_id
pred = pd.DataFrame()
pred["tree_id"] = tree_id
pred["species"] = "unknown"

Only complete cases¶

In [5]:
complete = data.dropna()
X_complete = complete.drop("species", axis=1)
y_complete = complete.species
X_train_complete, X_val_complete, y_train_complete, y_val_complete = train_test_split(X_complete, y_complete, random_state=42)
In [6]:
test_complete = test.dropna()
X_test_complete = test_complete.drop("tree_id", axis=1)
tree_id_complete = test_complete.tree_id
pred_complete = pd.DataFrame()
pred_complete["tree_id"] = tree_id_complete
pred_complete["species"] = "unknown"
In [13]:
X_test_complete
Out[13]:
latitude longitude stem_diameter_cm height_m crown_radius_m
0 46.01 11.44 5.0 14.2 3.10
1 46.01 11.44 5.0 5.4 1.80
2 46.03 11.43 5.0 4.9 1.75
3 46.05 10.99 5.0 6.5 1.55
4 46.05 10.99 5.0 4.9 1.90
... ... ... ... ... ...
8892 46.24 7.26 21.0 2.6 3.00
8893 47.39 7.36 32.0 21.6 4.00
8894 46.09 8.99 36.0 3.0 3.50
8895 46.74 6.96 37.0 32.3 4.50
8896 47.32 7.26 38.0 28.2 4.00

7963 rows × 5 columns

Models¶

In [7]:
from sklearn.neural_network import MLPClassifier
In [32]:
clf = MLPClassifier(solver='adam', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1, max_iter=15)

clf.fit(X_train_complete, y_train_complete)
/usr/local/lib/python3.8/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (15) reached and the optimization hasn't converged yet.
  warnings.warn(
Out[32]:
MLPClassifier(alpha=1e-05, hidden_layer_sizes=(5, 2), max_iter=15,
              random_state=1)
In [18]:
predict = clf.predict(X_test_complete)
In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Don't cheat - fit only on training data
scaler.fit(X_train_complete)
X_train_scale = scaler.transform(X_train_complete)
# apply the same transformation to the validation data
X_val_scale = scaler.transform(X_val_complete)
In [12]:
X_test_scale = scaler.transform(X_test_complete)
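
Scaling and classification can also be bundled into a single estimator, so the scaler is always fitted on the training split and reused consistently for later predictions. A sketch using make_pipeline (layer sizes and max_iter chosen for illustration only):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier

# the pipeline fits the scaler on the training data and applies it automatically
pipe = make_pipeline(StandardScaler(),
                     MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1500, random_state=1))
pipe.fit(X_train_complete, y_train_complete)
pipe.score(X_val_complete, y_val_complete)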
In [23]:
X_train_scale
Out[23]:
array([[ 0.3111399 ,  0.6479698 ,  0.31078895, -0.36213006,  0.86597811],
       [ 0.17541879, -0.17084478, -0.88682956, -0.68421516, -1.19200251],
       [-0.35326228, -0.4264638 ,  0.67370365, -0.28160878,  0.86597811],
       ...,
       [ 1.92306482,  2.77123044, -1.08038406, -0.168879  , -0.19731188],
       [-1.35507346, -0.02167165, -0.29406888, -0.68421516, -0.33451058],
       [-0.40723509, -0.41693578,  0.52248919, -0.44265133,  1.0374765 ]])
In [19]:
clf = MLPClassifier(solver='adam', alpha=1e-5, learning_rate_init=1e-5, tol=1e-5,
                    hidden_layer_sizes=(10, 10), random_state=1, max_iter=1500)
In [20]:
clf.fit(X_train_scale, y_train_complete)
/usr/local/lib/python3.8/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:686: ConvergenceWarning: Stochastic Optimizer: Maximum iterations (1500) reached and the optimization hasn't converged yet.
  warnings.warn(
Out[20]:
MLPClassifier(alpha=1e-05, hidden_layer_sizes=(10, 10),
              learning_rate_init=1e-05, max_iter=1500, random_state=1,
              tol=1e-05)
In [22]:
clf.score(X_val_scale, y_val_complete)
Out[22]:
0.9021461420541645
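
Accuracy alone can hide weak classes. Per-species precision and recall can be checked on the validation split; a short sketch assuming the fitted clf and the scaled validation data from above:

from sklearn.metrics import classification_report, confusion_matrix

y_val_pred = clf.predict(X_val_scale)
print(classification_report(y_val_complete, y_val_pred))
print(confusion_matrix(y_val_complete, y_val_pred))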
In [61]:
result = []
# grid over the number of neurons in the first and second hidden layer
for n_first in range(6, 7):
    for n_second in range(2, 4):
        clf_1 = MLPClassifier(solver='adam', alpha=1e-5,
                              hidden_layer_sizes=(n_first, n_second),
                              random_state=1, max_iter=1500)
        clf_1.fit(X_train_scale, y_train_complete)
        score = clf_1.score(X_val_scale, y_val_complete)
        result.append([n_first, n_second, score])
/usr/local/lib/python3.8/dist-packages/sklearn/neural_network/_multilayer_perceptron.py:693: UserWarning: Training interrupted by user.
  warnings.warn("Training interrupted by user.")
(warning repeated for each interrupted fit)
In [0]:
result
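
The manual loop above can also be written as a cross-validated grid search, which tries each hidden-layer configuration and keeps the best one. A sketch with an illustrative parameter grid, assuming the scaled training data from above:

from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

param_grid = {"hidden_layer_sizes": [(6, 2), (6, 3), (10, 10)]}
search = GridSearchCV(MLPClassifier(alpha=1e-5, max_iter=1500, random_state=1),
                      param_grid, cv=3, n_jobs=-1)
search.fit(X_train_scale, y_train_complete)
print(search.best_params_, search.best_score_)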

Save Test Predictions¶

In [13]:
pred["species"] = clf.predict(X_test_scale)
pred.to_csv("clf_prediction.tsv", sep="\t")
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
/tmp/ipykernel_1872/2466453748.py in <cell line: 1>()
----> 1 pred["species"] = clf.predict(X_test_scale)
      2 pred.to_csv("clf_prediction.tsv", sep="\t")

/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py in __setitem__(self, key, value)
   3978         else:
   3979             # set column
-> 3980             self._set_item(key, value)
   3981 
   3982     def _setitem_slice(self, key: slice, value):

/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py in _set_item(self, key, value)
   4172         ensure homogeneity.
   4173         """
-> 4174         value = self._sanitize_column(value)
   4175 
   4176         if (

/usr/local/lib/python3.8/dist-packages/pandas/core/frame.py in _sanitize_column(self, value)
   4913 
   4914         if is_list_like(value):
-> 4915             com.require_length_match(value, self.index)
   4916         return sanitize_array(value, self.index, copy=True, allow_2d=True)
   4917 

/usr/local/lib/python3.8/dist-packages/pandas/core/common.py in require_length_match(data, index)
    569     """
    570     if len(data) != len(index):
--> 571         raise ValueError(
    572             "Length of values "
    573             f"({len(data)}) "

ValueError: Length of values (7963) does not match length of index (8897)
In [14]:
pred
Out[14]:
tree_id species
0 T_75102 unknown
1 T_75103 unknown
2 T_75118 unknown
3 T_75152 unknown
4 T_75161 unknown
... ... ...
8892 T_497140 unknown
8893 T_497324 unknown
8894 T_497361 unknown
8895 T_497401 unknown
8896 T_497446 unknown

8897 rows × 2 columns

Or, writing predictions only for the complete test cases:

In [15]:
pred_complete["species"] = clf.predict(X_test_scale)
pred_complete.to_csv("clf_prediction.tsv", sep="\t")
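
The ValueError above occurred because the model only produces predictions for the 7,963 complete test cases, while pred holds all 8,897 tree IDs. One option for a full-length submission is to write the complete-case predictions back into pred by tree_id and leave the remaining rows as "unknown"; a sketch assuming unique tree IDs (the output filename is illustrative):

# place complete-case predictions into the full prediction table by tree_id
pred_full = pred.set_index("tree_id")
pred_full.loc[tree_id_complete, "species"] = clf.predict(X_test_scale)
pred_full.reset_index().to_csv("clf_prediction_full.tsv", sep="\t", index=False)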