In [1]:
import numpy as np
import sklearn
In [2]:
names = ["Mike","Max","Jörg","Simon","Jana","Felix","Markus"]
In [3]:
np.random.shuffle(names)
print(names)
['Jana', 'Markus', 'Jörg', 'Mike', 'Simon', 'Felix', 'Max']
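The shuffled order changes on every run. If a reproducible order is wanted, the random generator can be seeded; a minimal sketch (the seed 42 is an arbitrary, illustrative choice, not taken from the notebook):

rng = np.random.default_rng(42)   # seeded generator: same seed, same order
rng.shuffle(names)                # shuffles the list in place
print(names)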
In [4]:
import pandas as pd
In [5]:
data = pd.read_csv("../14_trees/train.tsv", sep="\t").dropna()
In [6]:
data
Out[6]:
                species  latitude  longitude  stem_diameter_cm  height_m  crown_radius_m
0       Fagus sylvatica     46.01      11.44               5.0       6.7            2.05
1       Fagus sylvatica     46.03      11.44               5.0      11.1            2.90
2       Fagus sylvatica     46.03      11.44               5.0      14.2            4.00
3       Fagus sylvatica     46.03      11.44               5.0       5.7            2.10
4       Fagus sylvatica     46.03      11.44               5.0       8.0            4.15
...                 ...       ...        ...               ...       ...             ...
35094   Fagus sylvatica     47.38       9.09              36.0      22.6            4.50
35095   Fagus sylvatica     46.74       6.85              37.0      34.3            4.50
35096   Fagus sylvatica     47.45       9.29              38.0      33.8            5.00
35097   Fagus sylvatica     46.52       6.35              40.0      29.3            6.00
35098   Fagus sylvatica     47.31       8.99              59.0      36.9            7.00

31311 rows × 6 columns
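The index runs to 35098 while only 31311 rows remain, so dropna() removed a few thousand incomplete rows. A quick check of how many (sketch, reading the same file again):

raw = pd.read_csv("../14_trees/train.tsv", sep="\t")
print(raw.shape[0] - data.shape[0], "rows with missing values were dropped")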

In [7]:
y = pd.read_csv("../14_trees/train.tsv", sep="\t")  # re-reads the raw file; overwritten in the next cell
In [8]:
y = data["species"]
In [9]:
print(y)
0        Fagus sylvatica
1        Fagus sylvatica
2        Fagus sylvatica
3        Fagus sylvatica
4        Fagus sylvatica
              ...       
35094    Fagus sylvatica
35095    Fagus sylvatica
35096    Fagus sylvatica
35097    Fagus sylvatica
35098    Fagus sylvatica
Name: species, Length: 31311, dtype: object
In [10]:
X = data[["latitude", "longitude", "stem_diameter_cm", "height_m", "crown_radius_m"]]
X
Out[10]:
       latitude  longitude  stem_diameter_cm  height_m  crown_radius_m
0         46.01      11.44               5.0       6.7            2.05
1         46.03      11.44               5.0      11.1            2.90
2         46.03      11.44               5.0      14.2            4.00
3         46.03      11.44               5.0       5.7            2.10
4         46.03      11.44               5.0       8.0            4.15
...         ...        ...               ...       ...             ...
35094     47.38       9.09              36.0      22.6            4.50
35095     46.74       6.85              37.0      34.3            4.50
35096     47.45       9.29              38.0      33.8            5.00
35097     46.52       6.35              40.0      29.3            6.00
35098     47.31       8.99              59.0      36.9            7.00

31311 rows × 5 columns
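The five feature columns live on very different numeric scales (stem diameters up to 59 cm, crown radii of a few metres, a latitude band around 46–47°), which matters for the distance-based classifier used below. A quick way to inspect the ranges (sketch):

print(X.describe())   # per-column count, mean, std and min/max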

In [11]:
X = X.to_numpy()
In [12]:
y = y.to_numpy()
In [13]:
np.unique(y, return_counts=True)
Out[13]:
(array(['Fagus sylvatica', 'Pinus pinaster', 'Quercus ilex'], dtype=object),
 array([ 5194,  9417, 16700]))
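These counts already fix the accuracy of a majority-class baseline: predicting 'Quercus ilex' for every row is correct for 16700 of the 31311 trees, roughly 53%. As a sketch:

values, counts = np.unique(y, return_counts=True)
print(values[counts.argmax()], counts.max() / counts.sum())   # ≈ 0.53 for these counts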
In [14]:
test = pd.read_csv("../14_trees/test.tsv", sep = "\t")
In [15]:
def majority_prediction(X, y, test):
    # Most frequent class in the training labels (X is unused by this baseline).
    u = np.unique(y, return_counts=True)
    majority = u[0][u[1].argmax()]
    # Predict that class for every tree in the test set.
    prediction = pd.DataFrame()
    prediction["tree_id"] = test.tree_id
    prediction["species"] = majority
    return prediction
In [16]:
majority_prediction(X, y, test).to_csv("tree_pred.tsv", sep='\t', index=False)
In [17]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=8)
neigh.fit(X, y)
Out[17]:
KNeighborsClassifier(n_neighbors=8)
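Because k-NN compares plain Euclidean distances, features with large numeric ranges (stem diameter, height) dominate those with small ranges (crown radius, latitude, longitude). Putting a scaler in front of the classifier is a common remedy; this is a sketch of an alternative setup, not what the notebook does:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise every feature to zero mean / unit variance before the distance computation.
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=8))
scaled_knn.fit(X, y)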
In [18]:
print(neigh.predict(X))
['Quercus ilex' 'Fagus sylvatica' 'Fagus sylvatica' ... 'Fagus sylvatica'
 'Fagus sylvatica' 'Fagus sylvatica']
In [19]:
neigh.score(X, y)
Out[19]:
0.9363482482194756
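This 0.936 is measured on the same rows the model was fitted on, so it overstates how well the classifier generalises (with k-NN every training point has itself as its nearest neighbour). A more honest estimate via cross-validation (sketch):

from sklearn.model_selection import cross_val_score

# 5-fold cross-validation: fit on 4/5 of the data, score on the held-out fifth.
cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=8), X, y, cv=5)
print(cv_scores.mean(), cv_scores.std())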
In [20]:
test
Out[20]:
       tree_id  latitude  longitude  stem_diameter_cm  height_m  crown_radius_m
0      T_75102     46.01      11.44               5.0      14.2            3.10
1      T_75103     46.01      11.44               5.0       5.4            1.80
2      T_75118     46.03      11.43               5.0       4.9            1.75
3      T_75152     46.05      10.99               5.0       6.5            1.55
4      T_75161     46.05      10.99               5.0       4.9            1.90
...        ...       ...        ...               ...       ...             ...
8892  T_497140     46.24       7.26              21.0       2.6            3.00
8893  T_497324     47.39       7.36              32.0      21.6            4.00
8894  T_497361     46.09       8.99              36.0       3.0            3.50
8895  T_497401     46.74       6.96              37.0      32.3            4.50
8896  T_497446     47.32       7.26              38.0      28.2            4.00

8897 rows × 6 columns
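The test frame is passed through dropna() below, so it is worth checking up front how many rows actually contain missing values (sketch):

print(test.isna().sum())                                  # missing values per column
print(test.isna().any(axis=1).sum(), "incomplete rows")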

In [21]:
test_pd = test[["latitude", "longitude", "stem_diameter_cm", "height_m", "crown_radius_m"]].dropna()
In [22]:
prediction = pd.DataFrame()
prediction["tree_id"] = test.dropna().tree_id
prediction["species"] = neigh.predict(test_pd)
/home/markus/miniconda3/envs/jupyter/lib/python3.9/site-packages/sklearn/base.py:443: UserWarning: X has feature names, but KNeighborsClassifier was fitted without feature names
  warnings.warn(
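The warning appears because the classifier was fitted on a plain NumPy array (no feature names) while predict() now receives a DataFrame. Passing an array keeps both calls consistent, and reusing test_pd.index guarantees that tree_id and the predictions refer to exactly the same rows; a sketch:

prediction = pd.DataFrame()
prediction["tree_id"] = test.loc[test_pd.index, "tree_id"]   # same rows as the feature frame
prediction["species"] = neigh.predict(test_pd.to_numpy())    # array in, no feature-name warning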
In [23]:
prediction.to_csv("species_pred_knn8.tsv", sep='\t', index=False)
In [24]:
from sklearn.model_selection import train_test_split
In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y)
In [26]:
X_train.shape, X_val.shape
Out[26]:
((23483, 5), (7828, 5))
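train_test_split defaults to a 75/25 split with a fresh random shuffle on every run. For a reproducible split that also preserves the proportions of the three species, random_state and stratify can be passed (sketch; the seed 0 is an arbitrary choice):

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=0, stratify=y
)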
In [27]:
scores = []
# Fit a k-NN classifier for each k from 1 to 19 and record its validation accuracy.
for k in range(1, 20):
    neigh = KNeighborsClassifier(n_neighbors=k)
    neigh.fit(X_train, y_train)
    scores.append(neigh.score(X_val, y_val))

import matplotlib.pyplot as plt
plt.plot(list(range(1, len(scores) + 1)), scores)
Out[27]:
[<matplotlib.lines.Line2D at 0x7f183111a730>]
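The list index is shifted by one against k (scores[0] belongs to k=1), so the best neighbourhood size can be read off with argmax; a sketch that also labels the axes:

best_k = int(np.argmax(scores)) + 1           # scores[0] corresponds to k = 1
print("best k:", best_k, "validation accuracy:", scores[best_k - 1])

plt.plot(range(1, len(scores) + 1), scores)
plt.xlabel("number of neighbours k")
plt.ylabel("validation accuracy")
plt.show()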