import numpy as np
import pandas as pd
import sklearn
# Five first names, shuffled in place using NumPy's global random state.
# The trailing bare `names` only displays the resulting order when this is
# run as a notebook cell.
names = [
    "Mike",
    "Max",
    "Robin",
    "Simon",
    "Markus",
]
np.random.shuffle(names)
names
['Markus', 'Robin', 'Max', 'Mike', 'Simon']
# Read the tab-separated training table and discard every row that contains
# a missing value. The bare `data` displays the frame as notebook cell output.
data = pd.read_csv(
    "../14_trees/train.tsv",
    sep="\t",
).dropna()
data
| | species | latitude | longitude | stem_diameter_cm | height_m | crown_radius_m |
---|---|---|---|---|---|---|
0 | Fagus sylvatica | 46.01 | 11.44 | 5.0 | 6.7 | 2.05 |
1 | Fagus sylvatica | 46.03 | 11.44 | 5.0 | 11.1 | 2.90 |
2 | Fagus sylvatica | 46.03 | 11.44 | 5.0 | 14.2 | 4.00 |
3 | Fagus sylvatica | 46.03 | 11.44 | 5.0 | 5.7 | 2.10 |
4 | Fagus sylvatica | 46.03 | 11.44 | 5.0 | 8.0 | 4.15 |
... | ... | ... | ... | ... | ... | ... |
35094 | Fagus sylvatica | 47.38 | 9.09 | 36.0 | 22.6 | 4.50 |
35095 | Fagus sylvatica | 46.74 | 6.85 | 37.0 | 34.3 | 4.50 |
35096 | Fagus sylvatica | 47.45 | 9.29 | 38.0 | 33.8 | 5.00 |
35097 | Fagus sylvatica | 46.52 | 6.35 | 40.0 | 29.3 | 6.00 |
35098 | Fagus sylvatica | 47.31 | 8.99 | 59.0 | 36.9 | 7.00 |
31311 rows × 6 columns
# Per-column minimum of the cleaned training frame (shown as cell output);
# used here to eyeball the value range of each feature.
data.min()
species Fagus sylvatica latitude 36.263 longitude -9.148 stem_diameter_cm 2.0 height_m 1.5 crown_radius_m 0.25 dtype: object
# Per-column maximum of the cleaned training frame (shown as cell output);
# together with the minimum this shows how differently the features are scaled.
data.max()
species Quercus ilex latitude 55.451 longitude 26.0 stem_diameter_cm 162.0 height_m 43.9 crown_radius_m 12.8 dtype: object
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Feature matrix (two coordinates plus three tree measurements) and the
# species label to predict.
X = data[["latitude", "longitude", "stem_diameter_cm", "height_m", "crown_radius_m"]]
y = data["species"]

# Hold out a validation split. No random_state is passed, so the split (and
# therefore the scores below) differs from run to run.
X_train, X_val, y_train, y_val = train_test_split(X, y)

# Fit the scaler on the training split only, then apply the same
# transformation to the validation split so no validation statistics leak
# into the scaling.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Sweep the neighbour count k = 1..19 and record the validation score of a
# k-NN classifier for each k. All three statements belong to the loop body:
# one classifier is fitted and scored per k.
k_values = range(1, 20)
scores = []
for k in k_values:
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train_scaled, y_train)
    scores.append(neigh.score(X_val_scaled, y_val))

# Validation score as a function of k; the x-axis uses the same k values the
# loop iterated over, so the two cannot drift apart.
plt.plot(list(k_values), scores)
[<matplotlib.lines.Line2D at 0x7f8b7e2f5dc0>]
# Scatter of the unscaled training data: second feature column against the
# third (longitude vs. stem_diameter_cm in the column order X was built with),
# coloured by the integer category code of the species label.
# "scaled" axes use equal data units on both axes, which makes the very
# different value ranges of the two raw features visible.
plt.scatter(
    X_train.iloc[:, 1],
    X_train.iloc[:, 2],
    c=y_train.astype("category").cat.codes,
)
plt.axis("scaled")
(-10.9054, 27.7574, -5.4750000000000005, 169.975)
# Same scatter as for the raw data, but on the min-max scaled training array:
# columns 1 and 2 of X_train_scaled, coloured by species category code.
# With "scaled" axes both features now span a comparable range.
plt.scatter(
    X_train_scaled[:, 1],
    X_train_scaled[:, 2],
    c=y_train.astype("category").cat.codes,
)
plt.axis("scaled")
(-0.05, 1.05, -0.05, 1.05)
# Integer category codes for the training labels: the species column is cast
# to a pandas categorical and each label is replaced by its code. This is the
# same expression used for the `c=` colour argument of the scatter plots.
c = y_train.astype("category").cat.codes
32270 1 31835 2 29305 2 4758 0 22820 1 .. 9557 2 23041 2 32308 2 18646 2 33889 2 Length: 23483, dtype: int8
# Load the tab-separated test table and drop every row with a missing value,
# so only complete rows receive a prediction.
test = pd.read_csv("../14_trees/test.tsv", sep="\t").dropna()

# Identifier column for the output file, and the same five feature columns
# (in the same order) that the scaler and classifier were fitted on.
treeID = test["tree_id"]
X_test = test[
    ["latitude", "longitude", "stem_diameter_cm", "height_m", "crown_radius_m"]
]

# Reuse the scaler fitted on the training split; it is not refitted here.
X_test_scaled = scaler.transform(X_test)

# Fit a 5-nearest-neighbour classifier on the scaled training split and
# predict a species for every remaining test row.
neigh = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)
prediction = neigh.predict(X_test_scaled)

# Two-column result table (tree_id, species), written as TSV without the
# index column.
pred = pd.DataFrame({"tree_id": treeID, "species": prediction})
pred.to_csv("knn_5_scaled.tsv", sep="\t", index=False)