In [1]:
### A Pluto.jl notebook ###
# v0.19.32

using Markdown
using InteractiveUtils
In [2]:
begin
	using Dates
	using CSV
	using DataFrames
	using DataFramesMeta
	using Chain
	using CairoMakie
end
In [3]:
using GLM, StatsBase
In [4]:
using Random
In [5]:
ENV["COLUMNS"] = 1000; # print more columns of tables
In [6]:
df = CSV.read("../data_dojo_22/Realisierte_Erzeugung_201501010000_202305301300_Stunde.csv",
    DataFrame,
    delim=";",
    decimal=',',
    groupmark='.',
    missingstring="-",
    dateformat=Dict("Datum"=>"dd.mm.yyyy"),
    types=Dict("Datum"=>Date)
);
In [7]:
begin
	clean_colname(x) = replace(split(x, " [")[1], " " => "_")
	clean_names = clean_colname.(names(df));
end
Out[7]:
15-element Vector{String}:
 "Datum"
 "Anfang"
 "Ende"
 "Biomasse"
 "Wasserkraft"
 "Wind_Offshore"
 "Wind_Onshore"
 "Photovoltaik"
 "Sonstige_Erneuerbare"
 "Kernenergie"
 "Braunkohle"
 "Steinkohle"
 "Erdgas"
 "Pumpspeicher"
 "Sonstige_Konventionelle"
In [8]:
rename!(df, clean_names)
Out[8]:
73716×15 DataFrame
73691 rows omitted
RowDatumAnfangEndeBiomasseWasserkraftWind_OffshoreWind_OnshorePhotovoltaikSonstige_ErneuerbareKernenergieBraunkohleSteinkohleErdgasPumpspeicherSonstige_Konventionelle
DateTimeTimeFloat64?Float64?Float64Float64Float64Float64?Float64Float64?Float64?Float64?Float64?Float64?
12015-01-0100:00:0001:00:004024.251158.25516.58128.00.0133.010710.515687.23219.751226.251525.754909.25
22015-01-0101:00:0002:00:003982.751188.0516.258297.50.0122.511086.215321.82351.25870.751079.254932.75
32015-01-0102:00:0003:00:004019.51139.25514.08540.00.093.011026.214817.52227.0809.5787.05041.75
42015-01-0103:00:0004:00:004040.751122.5517.758552.00.086.511027.814075.02339.75821.0287.755084.0
52015-01-0104:00:0005:00:004037.751112.0519.758643.50.086.510962.214115.02461.5831.25346.755070.75
62015-01-0105:00:0006:00:004028.251107.75520.08711.750.086.7510696.013474.22217.75851.0765.55096.75
72015-01-0106:00:0007:00:004013.251111.75521.59167.250.087.010299.512403.82373.25868.25414.55153.0
82015-01-0107:00:0008:00:004012.751113.75520.259811.00.087.010035.212062.52491.0876.0582.55161.0
92015-01-0108:00:0009:00:003999.751107.5525.259683.053.087.010245.812405.02530.25888.25750.55393.5
102015-01-0109:00:0010:00:004016.251121.0527.09501.75773.2585.7510060.212798.82386.25891.5387.05884.0
112015-01-0110:00:0011:00:004007.751122.0525.010025.02116.7582.010082.212728.52557.0888.25176.256064.0
122015-01-0111:00:0012:00:004011.751146.25527.2510862.53364.2582.09949.012451.02657.75876.5884.55914.75
132015-01-0112:00:0013:00:004014.01139.75527.7511575.24198.2582.010119.012464.52937.75874.25705.05778.25
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
737052023-05-3001:00:0002:00:004581.52033.753891.7511488.50.0119.50.03523.51288.53984.75769.251146.0
737062023-05-3002:00:0003:00:004551.01963.53725.510141.00.0119.00.03484.251263.753592.0604.251141.75
737072023-05-3003:00:0004:00:004503.252090.53635.259170.250.0119.00.03490.01272.54030.5578.751132.5
737082023-05-3004:00:0005:00:004454.252105.253635.758735.752.25120.00.03678.251275.04895.5433.251136.0
737092023-05-3005:00:0006:00:004406.752134.752940.08419.75407.75120.250.04087.51289.255723.75404.251142.25
737102023-05-3006:00:0007:00:004413.252227.52272.57531.02888.75128.250.05010.751362.06525.252803.751144.5
737112023-05-3007:00:0008:00:004468.02276.51916.255507.258343.25137.00.05275.51173.756905.755712.751156.75
737122023-05-3008:00:0009:00:004503.52152.51643.253647.7515893.2138.00.05181.751196.256946.55079.51132.0
737132023-05-3009:00:0010:00:004544.52156.251285.02900.523612.5135.00.04707.51114.756384.52122.751115.0
737142023-05-3010:00:0011:00:004568.252085.251241.252729.530341.8121.250.03978.51095.54802.51279.01096.25
737152023-05-3011:00:0012:00:004618.752068.251053.253115.035196.2119.00.03759.751151.53762.25774.01096.5
737162023-05-3012:00:0013:00:004612.251967.25781.253496.2537030.0119.00.03765.01296.753240.0196.751099.75
In [9]:
begin
	datetime_vec = Dates.DateTime.(df.Datum, df.Anfang);
	datetime_data = insertcols!(df,1, :Datetime => datetime_vec );
	datetime_clean = select(datetime_data, Not([:Datum, :Anfang, :Ende]));
end
Out[9]:
73716×13 DataFrame
73691 rows omitted
RowDatetimeBiomasseWasserkraftWind_OffshoreWind_OnshorePhotovoltaikSonstige_ErneuerbareKernenergieBraunkohleSteinkohleErdgasPumpspeicherSonstige_Konventionelle
DateTimeFloat64?Float64?Float64Float64Float64Float64?Float64Float64?Float64?Float64?Float64?Float64?
12015-01-01T00:00:004024.251158.25516.58128.00.0133.010710.515687.23219.751226.251525.754909.25
22015-01-01T01:00:003982.751188.0516.258297.50.0122.511086.215321.82351.25870.751079.254932.75
32015-01-01T02:00:004019.51139.25514.08540.00.093.011026.214817.52227.0809.5787.05041.75
42015-01-01T03:00:004040.751122.5517.758552.00.086.511027.814075.02339.75821.0287.755084.0
52015-01-01T04:00:004037.751112.0519.758643.50.086.510962.214115.02461.5831.25346.755070.75
62015-01-01T05:00:004028.251107.75520.08711.750.086.7510696.013474.22217.75851.0765.55096.75
72015-01-01T06:00:004013.251111.75521.59167.250.087.010299.512403.82373.25868.25414.55153.0
82015-01-01T07:00:004012.751113.75520.259811.00.087.010035.212062.52491.0876.0582.55161.0
92015-01-01T08:00:003999.751107.5525.259683.053.087.010245.812405.02530.25888.25750.55393.5
102015-01-01T09:00:004016.251121.0527.09501.75773.2585.7510060.212798.82386.25891.5387.05884.0
112015-01-01T10:00:004007.751122.0525.010025.02116.7582.010082.212728.52557.0888.25176.256064.0
122015-01-01T11:00:004011.751146.25527.2510862.53364.2582.09949.012451.02657.75876.5884.55914.75
132015-01-01T12:00:004014.01139.75527.7511575.24198.2582.010119.012464.52937.75874.25705.05778.25
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
737052023-05-30T01:00:004581.52033.753891.7511488.50.0119.50.03523.51288.53984.75769.251146.0
737062023-05-30T02:00:004551.01963.53725.510141.00.0119.00.03484.251263.753592.0604.251141.75
737072023-05-30T03:00:004503.252090.53635.259170.250.0119.00.03490.01272.54030.5578.751132.5
737082023-05-30T04:00:004454.252105.253635.758735.752.25120.00.03678.251275.04895.5433.251136.0
737092023-05-30T05:00:004406.752134.752940.08419.75407.75120.250.04087.51289.255723.75404.251142.25
737102023-05-30T06:00:004413.252227.52272.57531.02888.75128.250.05010.751362.06525.252803.751144.5
737112023-05-30T07:00:004468.02276.51916.255507.258343.25137.00.05275.51173.756905.755712.751156.75
737122023-05-30T08:00:004503.52152.51643.253647.7515893.2138.00.05181.751196.256946.55079.51132.0
737132023-05-30T09:00:004544.52156.251285.02900.523612.5135.00.04707.51114.756384.52122.751115.0
737142023-05-30T10:00:004568.252085.251241.252729.530341.8121.250.03978.51095.54802.51279.01096.25
737152023-05-30T11:00:004618.752068.251053.253115.035196.2119.00.03759.751151.53762.25774.01096.5
737162023-05-30T12:00:004612.251967.25781.253496.2537030.0119.00.03765.01296.753240.0196.751099.75
In [10]:
begin
	train = filter(:Datetime => <(Dates.Date(2021)), datetime_clean);
	val = filter(:Datetime => d -> Dates.year(d)==2021, datetime_clean);
	test = filter(:Datetime => >(Dates.Date(2021)), datetime_clean);
end
Out[10]:
21107×13 DataFrame
21082 rows omitted
RowDatetimeBiomasseWasserkraftWind_OffshoreWind_OnshorePhotovoltaikSonstige_ErneuerbareKernenergieBraunkohleSteinkohleErdgasPumpspeicherSonstige_Konventionelle
DateTimeFloat64?Float64?Float64Float64Float64Float64?Float64Float64?Float64?Float64?Float64?Float64?
12021-01-01T01:00:004584.251196.25394.53541.50.0227.08150.2511602.83044.755409.0518.51636.25
22021-01-01T02:00:004581.251164.0305.253217.00.0227.08156.511758.53067.255312.0219.01630.5
32021-01-01T03:00:004570.751179.25319.252787.50.0227.08153.7512337.52852.55127.02.01622.0
42021-01-01T04:00:004571.01155.0296.252492.50.0231.758150.512395.02713.254993.06.01621.75
52021-01-01T05:00:004589.251145.5432.02140.750.0231.258150.2512699.52736.55057.533.51650.0
62021-01-01T06:00:004628.251160.5510.252010.750.0232.758154.513051.02836.55094.089.751661.25
72021-01-01T07:00:004693.01157.5389.251975.00.0232.08152.7512472.52952.05169.25926.251661.25
82021-01-01T08:00:004766.251152.25359.52087.5194.0232.08157.512435.83087.255569.2580.51648.5
92021-01-01T09:00:004764.751163.5248.252123.51363.75231.08160.012169.53085.06052.5124.751652.25
102021-01-01T10:00:004789.51158.75446.751920.52986.5231.08161.2511607.83057.756515.596.01663.0
112021-01-01T11:00:004795.51159.75557.51627.54237.5231.08160.2511257.23044.257144.25119.251644.0
122021-01-01T12:00:004738.51169.75668.751322.54826.75231.08153.010870.53186.257445.5372.251654.25
132021-01-01T13:00:004693.51171.25836.251158.04234.25230.258153.510875.03150.257411.5266.01660.0
⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮⋮
210962023-05-30T01:00:004581.52033.753891.7511488.50.0119.50.03523.51288.53984.75769.251146.0
210972023-05-30T02:00:004551.01963.53725.510141.00.0119.00.03484.251263.753592.0604.251141.75
210982023-05-30T03:00:004503.252090.53635.259170.250.0119.00.03490.01272.54030.5578.751132.5
210992023-05-30T04:00:004454.252105.253635.758735.752.25120.00.03678.251275.04895.5433.251136.0
211002023-05-30T05:00:004406.752134.752940.08419.75407.75120.250.04087.51289.255723.75404.251142.25
211012023-05-30T06:00:004413.252227.52272.57531.02888.75128.250.05010.751362.06525.252803.751144.5
211022023-05-30T07:00:004468.02276.51916.255507.258343.25137.00.05275.51173.756905.755712.751156.75
211032023-05-30T08:00:004503.52152.51643.253647.7515893.2138.00.05181.751196.256946.55079.51132.0
211042023-05-30T09:00:004544.52156.251285.02900.523612.5135.00.04707.51114.756384.52122.751115.0
211052023-05-30T10:00:004568.252085.251241.252729.530341.8121.250.03978.51095.54802.51279.01096.25
211062023-05-30T11:00:004618.752068.251053.253115.035196.2119.00.03759.751151.53762.25774.01096.5
211072023-05-30T12:00:004612.251967.25781.253496.2537030.0119.00.03765.01296.753240.0196.751099.75
In [11]:
size(train), size(val), size(test)
Out[11]:
((52608, 13), (8760, 13), (21107, 13))
In [12]:
data_y = @chain train begin
    @transform(:Year = Dates.year.(:Datetime))
    groupby(:Year)
    @combine(:Biomasse = sum(:Biomasse), :Wasserkraft = sum(:Wasserkraft), :Wind_Offshore = sum(:Wind_Offshore), :Wind_Onshore = sum(:Wind_Onshore), :Photovoltaik = sum(:Photovoltaik), :Sonstige_Erneuerbare = sum(:Sonstige_Erneuerbare), :Kernenergie = sum(:Kernenergie), :Braunkohle = sum(:Braunkohle) ,:Steinkohle = sum(:Steinkohle), :Erdgas = sum(:Erdgas), :Pumpspeicher = sum(:Pumpspeicher), :Sonstige_Konventionelle = sum(:Sonstige_Konventionelle))
end
Out[12]:
6×13 DataFrame
RowYearBiomasseWasserkraftWind_OffshoreWind_OnshorePhotovoltaikSonstige_ErneuerbareKernenergieBraunkohleSteinkohleErdgasPumpspeicherSonstige_Konventionelle
Int64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64Float64
120153.48479e71.37193e78.13438e66.90421e73.48794e79.4563e58.43635e71.33798e88.11387e71.52259e79.93722e65.29686e7
220163.97455e71.80759e71.20931e76.52747e73.45389e71.40346e68.02425e71.30376e88.089e72.28888e78.58761e64.73436e7
320174.03284e71.55901e71.74142e78.51901e73.58832e71.75439e67.2214e71.29287e86.60055e72.55804e79.29078e64.87431e7
420184.00905e71.52577e71.90684e78.92706e74.12343e71.53113e67.18417e71.28362e87.15453e74.28782e79.13598e61.2838e7
520194.03876e71.58305e72.43828e79.98789e74.19148e71.46438e67.10422e71.02729e84.78153e75.46197e79.02058e61.27306e7
620204.08223e71.55834e72.68832e71.03104e84.58216e71.60774e66.09239e78.3374e73.4872e75.9126e71.15287e71.29299e7
In [13]:
begin
	photovoltaic_train = select(train, :Datetime, :Photovoltaik)
	photovoltaic_val = select(val, :Datetime, :Photovoltaik)
end
Out[13]:
8760×2 DataFrame
8735 rows omitted
RowDatetimePhotovoltaik
DateTimeFloat64
12021-01-01T00:00:000.0
22021-01-01T01:00:000.0
32021-01-01T02:00:000.0
42021-01-01T03:00:000.0
52021-01-01T04:00:000.0
62021-01-01T05:00:000.0
72021-01-01T06:00:000.0
82021-01-01T07:00:000.0
92021-01-01T08:00:00194.0
102021-01-01T09:00:001363.75
112021-01-01T10:00:002986.5
122021-01-01T11:00:004237.5
132021-01-01T12:00:004826.75
⋮⋮⋮
87492021-12-31T12:00:0010006.8
87502021-12-31T13:00:007878.75
87512021-12-31T14:00:004845.0
87522021-12-31T15:00:001990.75
87532021-12-31T16:00:00154.25
87542021-12-31T17:00:000.0
87552021-12-31T18:00:000.0
87562021-12-31T19:00:000.0
87572021-12-31T20:00:000.0
87582021-12-31T21:00:000.0
87592021-12-31T22:00:000.0
87602021-12-31T23:00:000.0
In [14]:
# kills the kernel with out of memory
# ols = lm(@formula(Photovoltaik ~ Datetime), photovoltaic_train)
In [15]:
dumb_estimate = photovoltaic_train[!,:Photovoltaik] |> x -> mean(x)
Out[15]:
4453.166181949513
In [16]:
error_dumb_estimate = mean(abs.(photovoltaic_val[!,:Photovoltaik] .- dumb_estimate))
Out[16]:
6217.043334561944
In [17]:
function estimate_me(df, datetime)
	return df[!,:Photovoltaik] |> x -> mean(x)
end
Out[17]:
estimate_me (generic function with 1 method)
In [18]:
function error(df, range_of_datetimes, val)
	estimates = estimate.(df, range_of_datetimes)
	return mean(abs.(val[!,:Photovoltaik] .- estimates))
end
Out[18]:
error (generic function with 1 method)
In [19]:
estimate_me(photovoltaic_train, 0)
Out[19]:
4453.166181949513
In [21]:
# vectorization does not work as the first argument will be vectorized as well
#estimate_me.(photovoltaic_train, [0,1,2])
In [35]:
dumb_estimates = repeat([dumb_estimate], size(val)[1]);
In [36]:
fig = Figure()
ax = Axis(fig[1,1])
scatter!(ax,val.Photovoltaik, color = "red", label = "val", markersize = 4)
scatter!(ax, dumb_estimates, color = "blue", label = "predict",markersize = 4)
axislegend(ax)
fig
Out[36]:
In [22]:
nm = ["Robin", "Sabine", "Dharu"]
Out[22]:
3-element Vector{String}:
 "Robin"
 "Sabine"
 "Dharu"
In [23]:
rand(nm)
Out[23]:
"Dharu"
In [24]:
lg = ["Jl1", "Jl2", "Py"]
Out[24]:
3-element Vector{String}:
 "Jl1"
 "Jl2"
 "Py"
In [25]:
Random.shuffle(lg)
Out[25]:
3-element Vector{String}:
 "Py"
 "Jl1"
 "Jl2"