import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
np.random.seed(42)
" → ".join(np.random.permutation("Simon Mike Max Chris Felix Joel Robin".split()))
'Simon → Mike → Joel → Max → Felix → Chris → Robin'
np.random.seed(42)
" → ".join(np.random.permutation("Julia1 Julia2 Python".split()))
'Julia1 → Julia2 → Python'
Realisierte_Erzeugung_201501010000_202305301300_Stunde.csv
as a table into python :heavy_check_mark:Time
containing the combined info from Datum
and Anfang
df = pd.read_csv(
"../data_dojo_22/Realisierte_Erzeugung_201501010000_202305301300_Stunde.csv",
delimiter=';',
decimal=',',
thousands='.',
parse_dates=['Datum', 'Anfang', 'Ende'],
date_format={"Datum": "%d.%m.%Y", "Anfang": "%H:%M", "Ende": "%H:%M"},
na_values="-"
)
df = df.rename(lambda x: x.split(" [")[0].replace(" ","_"), axis="columns")
df
Datum | Anfang | Ende | Biomasse | Wasserkraft | Wind_Offshore | Wind_Onshore | Photovoltaik | Sonstige_Erneuerbare | Kernenergie | Braunkohle | Steinkohle | Erdgas | Pumpspeicher | Sonstige_Konventionelle | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2015-01-01 | 1900-01-01 00:00:00 | 1900-01-01 01:00:00 | 4024.25 | 1158.25 | 516.50 | 8128.00 | 0.00 | 133.00 | 10710.50 | 15687.25 | 3219.75 | 1226.25 | 1525.75 | 4909.25 |
1 | 2015-01-01 | 1900-01-01 01:00:00 | 1900-01-01 02:00:00 | 3982.75 | 1188.00 | 516.25 | 8297.50 | 0.00 | 122.50 | 11086.25 | 15321.75 | 2351.25 | 870.75 | 1079.25 | 4932.75 |
2 | 2015-01-01 | 1900-01-01 02:00:00 | 1900-01-01 03:00:00 | 4019.50 | 1139.25 | 514.00 | 8540.00 | 0.00 | 93.00 | 11026.25 | 14817.50 | 2227.00 | 809.50 | 787.00 | 5041.75 |
3 | 2015-01-01 | 1900-01-01 03:00:00 | 1900-01-01 04:00:00 | 4040.75 | 1122.50 | 517.75 | 8552.00 | 0.00 | 86.50 | 11027.75 | 14075.00 | 2339.75 | 821.00 | 287.75 | 5084.00 |
4 | 2015-01-01 | 1900-01-01 04:00:00 | 1900-01-01 05:00:00 | 4037.75 | 1112.00 | 519.75 | 8643.50 | 0.00 | 86.50 | 10962.25 | 14115.00 | 2461.50 | 831.25 | 346.75 | 5070.75 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
73711 | 2023-05-30 | 1900-01-01 08:00:00 | 1900-01-01 09:00:00 | 4503.50 | 2152.50 | 1643.25 | 3647.75 | 15893.25 | 138.00 | 0.00 | 5181.75 | 1196.25 | 6946.50 | 5079.50 | 1132.00 |
73712 | 2023-05-30 | 1900-01-01 09:00:00 | 1900-01-01 10:00:00 | 4544.50 | 2156.25 | 1285.00 | 2900.50 | 23612.50 | 135.00 | 0.00 | 4707.50 | 1114.75 | 6384.50 | 2122.75 | 1115.00 |
73713 | 2023-05-30 | 1900-01-01 10:00:00 | 1900-01-01 11:00:00 | 4568.25 | 2085.25 | 1241.25 | 2729.50 | 30341.75 | 121.25 | 0.00 | 3978.50 | 1095.50 | 4802.50 | 1279.00 | 1096.25 |
73714 | 2023-05-30 | 1900-01-01 11:00:00 | 1900-01-01 12:00:00 | 4618.75 | 2068.25 | 1053.25 | 3115.00 | 35196.25 | 119.00 | 0.00 | 3759.75 | 1151.50 | 3762.25 | 774.00 | 1096.50 |
73715 | 2023-05-30 | 1900-01-01 12:00:00 | 1900-01-01 13:00:00 | 4612.25 | 1967.25 | 781.25 | 3496.25 | 37030.00 | 119.00 | 0.00 | 3765.00 | 1296.75 | 3240.00 | 196.75 | 1099.75 |
73716 rows × 15 columns
Time
columnTodos 4th Session
Create a manual prediction model (with hard-coded rules) for "Photovoltaik" How accurately does your manually created model predict the values in the validation set?
Bonus:
Employ a more sophisticated prediction model from an existing package for time series analysis How does the prediction of that model compare to your manual model?
df.plot(x="Datum", y="Photovoltaik")
<Axes: xlabel='Datum'>
df_train = df[df.Datum < pd.to_datetime('2021-01-01')]
df_val = df[(pd.to_datetime('2021-12-31') >= df.Datum) & (df.Datum >= pd.to_datetime('2021-01-01'))]
df_test = df[(pd.to_datetime('2021-12-31') < df.Datum)]
pls give code for manual prediction model
# Create manual prediction model
def manual_prediction_model(input_data):
pass
df.Datum.apply(lambda x: str(x))
0 2015-01-01 00:00:00 1 2015-01-01 00:00:00 2 2015-01-01 00:00:00 3 2015-01-01 00:00:00 4 2015-01-01 00:00:00 ... 73711 2023-05-30 00:00:00 73712 2023-05-30 00:00:00 73713 2023-05-30 00:00:00 73714 2023-05-30 00:00:00 73715 2023-05-30 00:00:00 Name: Datum, Length: 73716, dtype: object
p = np.polyfit(x= df_train.index, y = df_train["Photovoltaik"], deg = 5)
print(p)
[-2.90188903e-19 2.93407736e-14 -9.22854477e-10 8.98555223e-06 2.23514325e-02 3.70831543e+03]
func = lambda x: p[5] + p[4] * x + p[3] * x**2 + p[2] * x**3 + p[1] * x**4 + p[0] * x**5
df_val.tail()
Datum | Anfang | Ende | Biomasse | Wasserkraft | Wind_Offshore | Wind_Onshore | Photovoltaik | Sonstige_Erneuerbare | Kernenergie | Braunkohle | Steinkohle | Erdgas | Pumpspeicher | Sonstige_Konventionelle | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
61363 | 2021-12-31 | 1900-01-01 19:00:00 | 1900-01-01 20:00:00 | 4564.50 | 1555.75 | 5697.25 | 33039.25 | 0.0 | 154.00 | 5863.00 | 3765.25 | 2279.50 | 2697.75 | 1180.00 | 1435.75 |
61364 | 2021-12-31 | 1900-01-01 20:00:00 | 1900-01-01 21:00:00 | 4463.00 | 1530.50 | 6169.25 | 30699.75 | 0.0 | 154.00 | 5574.75 | 3758.00 | 2203.75 | 2675.75 | 787.75 | 1442.25 |
61365 | 2021-12-31 | 1900-01-01 21:00:00 | 1900-01-01 22:00:00 | 4403.25 | 1586.75 | 6132.75 | 27932.75 | 0.0 | 155.50 | 4918.75 | 3767.75 | 2185.75 | 2662.00 | 593.50 | 1445.25 |
61366 | 2021-12-31 | 1900-01-01 22:00:00 | 1900-01-01 23:00:00 | 4382.00 | 1505.25 | 6041.50 | 27079.25 | 0.0 | 157.75 | 4070.25 | 3777.75 | 2145.25 | 2658.50 | 1027.50 | 1439.50 |
61367 | 2021-12-31 | 1900-01-01 23:00:00 | 1900-01-01 00:00:00 | 4363.75 | 1462.50 | 6313.25 | 25797.75 | 0.0 | 158.25 | 3224.50 | 3625.00 | 2115.75 | 2660.50 | 809.75 | 1455.75 |
x = list(range(52608, 61368))
pred = [func(i) for i in x]
plt.plot(df_train.index, df_train.Photovoltaik)
plt.plot(df_train.index, [func(i) for i in df_train.index])
[<matplotlib.lines.Line2D at 0x7efdbf197520>]
plt.plot(df_val.index, df_val.Photovoltaik)
plt.plot(df_val.index, [func(i) for i in df_val.index])
[<matplotlib.lines.Line2D at 0x7efdb20b3370>]
Calculate MAE of this prediction in the validation set
pred = np.array([func(i) for i in df_val.index])
np.mean(np.abs(df_val.Photovoltaik.values - pred))
8499.14687240829