In [2]:
import numpy as np
import pandas as pd
import sklearn
In [3]:
# Names to be put into a random order; the list is reordered in place by the next cell.
names = [
    "Mike",
    "Max",
    "Robin",
    "Simon",
    "Markus",
]
In [4]:
# Shuffle the list in place using NumPy's global random state.
# No seed is set in the visible cells, so the order differs between kernel sessions.
np.random.shuffle(names)
In [5]:
# Display the shuffled order.
names
Out[5]:
['Markus', 'Robin', 'Max', 'Mike', 'Simon']
In [6]:
# Read the tab-separated training table and discard every row that contains
# at least one missing value. Row labels are kept as read (no index reset).
train_path = "../14_trees/train.tsv"
data = pd.read_csv(train_path, sep="\t").dropna()
In [7]:
# Show the cleaned training table (the recorded output is truncated to the first and last rows).
data
Out[7]:
species latitude longitude stem_diameter_cm height_m crown_radius_m
0 Fagus sylvatica 46.01 11.44 5.0 6.7 2.05
1 Fagus sylvatica 46.03 11.44 5.0 11.1 2.90
2 Fagus sylvatica 46.03 11.44 5.0 14.2 4.00
3 Fagus sylvatica 46.03 11.44 5.0 5.7 2.10
4 Fagus sylvatica 46.03 11.44 5.0 8.0 4.15
... ... ... ... ... ... ...
35094 Fagus sylvatica 47.38 9.09 36.0 22.6 4.50
35095 Fagus sylvatica 46.74 6.85 37.0 34.3 4.50
35096 Fagus sylvatica 47.45 9.29 38.0 33.8 5.00
35097 Fagus sylvatica 46.52 6.35 40.0 29.3 6.00
35098 Fagus sylvatica 47.31 8.99 59.0 36.9 7.00

31311 rows × 6 columns

In [8]:
# Column-wise minimum; for the text column `species` this is the alphabetically first label.
data.min()
Out[8]:
species             Fagus sylvatica
latitude                     36.263
longitude                    -9.148
stem_diameter_cm                2.0
height_m                        1.5
crown_radius_m                 0.25
dtype: object
In [9]:
# Column-wise maximum; for the text column `species` this is the alphabetically last label.
data.max()
Out[9]:
species             Quercus ilex
latitude                  55.451
longitude                   26.0
stem_diameter_cm           162.0
height_m                    43.9
crown_radius_m              12.8
dtype: object
In [18]:
from sklearn.model_selection import train_test_split

# Feature matrix: the two location columns plus the three size measurements.
X = data[["latitude", "longitude", "stem_diameter_cm", "height_m", "crown_radius_m"]]
# Target: the species label of each tree.
y = data["species"]

# Hold out part of the rows for validation (scikit-learn's default split
# fraction). The fixed random_state makes the split identical on every
# Restart & Run All, so the k-selection curve and the final model trained
# further down are reproducible; without it each run draws a different split.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)
In [22]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max on the training split only, scale the training
# split with it, and reuse the same statistics for the validation split.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
In [24]:
from sklearn.neighbors import KNeighborsClassifier
In [52]:
import matplotlib.pyplot as plt

# Validation accuracy of k-nearest-neighbours for k = 1 .. 19, each model
# fitted on the scaled training split.
scores = []
for k in range(1, 20):
    neigh = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, y_train)
    val_accuracy = neigh.score(X_val_scaled, y_val)
    scores.append(val_accuracy)

# x-axis: k (1-based position in `scores`); y-axis: accuracy on the validation split.
k_values = list(range(1, len(scores) + 1))
plt.plot(k_values, scores)
Out[52]:
[<matplotlib.lines.Line2D at 0x7f8b7e2f5dc0>]
In [48]:
# Raw (unscaled) training features: longitude against stem diameter, one
# colour per species. Equal axis scaling shows how different the two value
# ranges are before normalisation.
species_codes = y_train.astype("category").cat.codes
plt.scatter(X_train["longitude"], X_train["stem_diameter_cm"], c=species_codes)
plt.axis("scaled")
Out[48]:
(-10.9054, 27.7574, -5.4750000000000005, 169.975)
In [49]:
# Min-max-scaled training array: column 1 (longitude) against column 2
# (stem diameter), one colour per species, drawn with equal axis scaling.
species_codes = y_train.astype("category").cat.codes
plt.scatter(X_train_scaled[:, 1], X_train_scaled[:, 2], c=species_codes)
plt.axis("scaled")
Out[49]:
(-0.05, 1.05, -0.05, 1.05)
In [47]:
# Integer code per training label (species converted to a categorical, then to its codes).
# NOTE(review): as written this assignment displays nothing; the recorded output
# presumably comes from an earlier version of the cell. `c` is not used by any later visible cell.
c = y_train.astype("category").cat.codes 
Out[47]:
32270    1
31835    2
29305    2
4758     0
22820    1
        ..
9557     2
23041    2
32308    2
18646    2
33889    2
Length: 23483, dtype: int8
In [62]:
# Load the tab-separated test table and drop rows with missing values.
# NOTE(review): any test row removed here gets no prediction in the output file — confirm this is acceptable.
test = pd.read_csv("../14_trees/test.tsv", sep = "\t").dropna()
In [69]:
# Pick the same five feature columns, in the same order, that the scaler and
# model were fitted on, and keep the tree identifiers for the output file.
feature_columns = ["latitude", "longitude", "stem_diameter_cm", "height_m", "crown_radius_m"]
X_test = test[feature_columns]
treeID = test["tree_id"]
In [64]:
# Scale the test features with the min/max already learned from the training split (no refit).
X_test_scaled = scaler.transform(X_test)
In [65]:
# Final model: 5-nearest-neighbours fitted on the scaled training split,
# then applied to the scaled test features.
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
prediction = neigh.predict(X_test_scaled)
In [70]:
# Two-column output table: tree identifier and predicted species, indexed like `treeID`.
pred = pd.DataFrame({"tree_id": treeID, "species": prediction})
In [73]:
# Write the predictions as a tab-separated file, omitting the index column.
pred.to_csv("knn_5_scaled.tsv", sep="\t", index=False)
In [ ]: