Looking at 5-minute pre-dispatch demand forecast errors in 2021#
In this example, we will take a look at 5-minute pre-dispatch (5MPD) demand forecast “error” (the difference between actual and forecasted demand) for 2021. AEMO runs 5MPD to provide system and market information for the next hour.
We’ll look at forecast “error” on a NEM-wide basis; that is, we will sum actual scheduled demand across all NEM regions and then compare that to the sum of forecast scheduled demand across all NEM regions.
The code below could be modified to do this analysis on a region by region basis (we do this with (30-minute) pre-dispatch demand forecasts in this example).
Key imports#
# standard libraries
from datetime import datetime, timedelta
from pathlib import Path
# NEM data libraries
# NEMOSIS for actual demand data
# NEMSEER for forecast demand data
import nemosis
from nemseer import compile_data, download_raw_data, generate_runtimes
# data wrangling libraries
import numpy as np
import pandas as pd
# interactive plotting
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
# progress bar for error computation
from tqdm.autonotebook import tqdm
# supress logging from NEMSEER and NEMOSIS
import logging
logging.getLogger("nemosis").setLevel(logging.WARNING)
logging.getLogger("nemseer").setLevel(logging.ERROR)
Plot styling#
nemseer_template = dict(
layout=go.Layout(
font_family="Source Sans 3",
title_font_size=24,
title_x=0.05,
plot_bgcolor="#f0f0f0",
colorway=px.colors.qualitative.Bold,
)
)
Defining our analysis start and end dates#
analysis_start = "2021/01/01 00:05:00"
analysis_end = "2022/01/01 00:00:00"
Obtaining actual demand data from NEMOSIS
#
We will download DISPATCHREGIONSUM
to access the TOTALDEMAND
field (actual scheduled demand).
We’ll first download the data we need and cache it so that it’s ready for computation.
nemosis_cache = Path("nemosis_cache/")
if not nemosis_cache.exists():
nemosis_cache.mkdir()
nemosis.cache_compiler(
analysis_start, analysis_end, "DISPATCHREGIONSUM", nemosis_cache, fformat="parquet"
)
Obtaining forecast demand data from NEMSEER
#
We will download REGIONSOLUTION
to access the TOTALDEMAND
field in P5MIN
forecasts.
We’ll first download the data we need and cache it so that it’s ready for computation.
download_raw_data(
"P5MIN",
"REGIONSOLUTION",
"nemseer_cache/",
forecasted_start=analysis_start,
forecasted_end=analysis_end,
)
Calculating forecast error#
Below we calculate demand forecast error for P5MIN
forecasts using forecast demand data and actual demand data.
Attention
The actual run time of 5MPD is approximately 5 minutes before the nominal run time. We will adjust for this in this when calculating forecast ahead times. See the note in this section.
As data for the entire period is loaded into memory, adapt the length of the period you select to your machine specifications (e.g. a year’s worth of forecast data consumed ~15GB on the test machine).
Forecast error calculation functions#
The code below uses functionalities offered by NEMOSIS
, NEMSEER
and pandas
to calculate demand forecast error.
def calculate_p5min_demand_forecast_error_vectorised(
analysis_start: str, analysis_end: str
) -> pd.DataFrame:
"""
Calculates P5MIN demand forecast error (Actual - Forecast) for all forecasts
that are run for a given forecasted_time in a vectorised fashion.
Args:
forecasted_time: Datetime string in the form YYYY/mm/dd HH:MM:SS
Returns:
pandas DataFrame with forecast error in `TOTALDEMAND` columns, the ahead time
of the forecast run in `ahead_time`, and the forecasted time in
`forecasted_time`.
"""
def get_forecast_data(analysis_start: str, analysis_end: str) -> pd.DataFrame:
"""
Use NEMSEER to get 5MPD forecast data. Also omits any intervention periods.
"""
# use NEMSEER functions to compile pre-cached data
forecasts_run_start, forecasts_run_end = generate_runtimes(
analysis_start, analysis_end, "P5MIN"
)
forecast_df = compile_data(
forecasts_run_start,
forecasts_run_end,
analysis_start,
analysis_end,
"P5MIN",
"REGIONSOLUTION",
"nemseer_cache/",
)["REGIONSOLUTION"]
# remove intervention periods
forecast_df = forecast_df.query("INTERVENTION == 0")
return forecast_df
def get_actual_data(analysis_start: str, analysis_end: str) -> pd.DataFrame:
"""
Use NEMOSIS to get actual data. Also omits any intervention periods
"""
# NEMOSIS start time must precede end of interval of interest by 5 minutes
nemosis_start = (
datetime.strptime(analysis_start, "%Y/%m/%d %H:%M:%S")
- timedelta(minutes=5)
).strftime("%Y/%m/%d %H:%M:%S")
# use NEMOSIS to compile pre-cached data and filter out interventions
actual_df = nemosis.dynamic_data_compiler(
nemosis_start,
analysis_end,
"DISPATCHREGIONSUM",
nemosis_cache,
filter_cols=["INTERVENTION"],
filter_values=([0],),
fformat="parquet",
)
return actual_df
def calculate_p5min_forecast_demand_error(
actual_demand: pd.DataFrame, forecast_demand: pd.DataFrame
) -> pd.DataFrame:
"""
Calculate P5MIN forecast demand error given actual and forecast demand
Ahead time calculation reflects the fact that P5MIN actual run time is
5 minutes before the nominal run time.
"""
# left merge ensures all forecasted values have the corresponding actual value merged in
merged = pd.merge(
forecast_demand, actual_demand, on="forecasted_time", how="left"
)
if len(merged) > len(forecast_demand):
raise ValueError(
"Merge should return DataFrame with dimensions of forecast data"
)
# subtract 5 minutes from run time to get actual run time
merged["ahead_time"] = merged["forecasted_time"] - (
merged["RUN_DATETIME"] - timedelta(minutes=5)
)
forecast_error = (
merged["TOTALDEMAND"] - merged["FORECAST_TOTALDEMAND"]
).rename("TOTALDEMAND")
# create the forecast error DataFrame
forecast_error = pd.concat(
[forecast_error, merged["ahead_time"]], axis=1
).set_index(merged["forecasted_time"])
return forecast_error
# get forecast data
forecast_df = get_forecast_data(analysis_start, analysis_end)
# rename columns in preparation for merge
forecast_df = forecast_df.rename(
columns={
"TOTALDEMAND": "FORECAST_TOTALDEMAND",
"INTERVAL_DATETIME": "forecasted_time",
}
)
# group by forecasted and run times, then sum demand across regions to get NEM-wide demand
forecast_demand = forecast_df.groupby(["forecasted_time", "RUN_DATETIME"])[
"FORECAST_TOTALDEMAND"
].sum()
forecast_demand = forecast_demand.reset_index()
# get actual data
actual_df = get_actual_data(analysis_start, analysis_end)
# rename columns in preparation for merge
actual_df = actual_df.rename(
columns={
"SETTLEMENTDATE": "forecasted_time",
"TOTALDEMAND": "TOTALDEMAND",
}
)
# group by forecasted time and then sum demand across regions to get NEM-wide demand
actual_demand = (
actual_df.groupby("forecasted_time")["TOTALDEMAND"].sum().reset_index()
)
# calculate forecast error
forecast_error = calculate_p5min_forecast_demand_error(
actual_demand, forecast_demand
)
return forecast_error
forecast_error = calculate_p5min_demand_forecast_error_vectorised(
analysis_start, analysis_end
)
Plotting forecast error percentiles for each ahead time#
How does forecast error change based on how many minutes they are ahead of the time they are forecasting for?
Forecast error percentiles#
We can compute forecast error percentiles across ahead_times
(between 0 and 55 minutes for 5-minute pre-dispatch).
To do this, we will group the error DataFrame by ahead_time
, compute the percentile and then add a column that indicates the computed percentile. We’ll repeat this process across all percentiles of interest and then concatenate the results to form a single DataFrame for plotting.
percentile_data = []
for quantile in (0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99):
quantile_result = forecast_error.groupby(
forecast_error["ahead_time"].dt.seconds / 60
)["TOTALDEMAND"].quantile(quantile)
percentile_result = pd.concat(
[
quantile_result,
pd.Series(
np.repeat(quantile * 100, len(quantile_result)),
index=quantile_result.index,
name="Percentile",
).astype(int),
],
axis=1,
)
percentile_data.append(percentile_result)
percentile_df = pd.concat(percentile_data, axis=0).reset_index()
We can plot these quantiles for each ahead time.
It’s interesting to note that there is only a slight positive bias in the 50th percentile forecast as the forecast ahead time approaches one hour.
ahead_percentile = px.line(
percentile_df,
x="ahead_time",
y="TOTALDEMAND",
color="Percentile",
title="Hour-ahead (5MPD) NEM-wide Demand Forecast Error, 2021<br><sup>Error = Actual - Forecast,"
+ "</sup>",
labels={
"TOTALDEMAND": "Demand Forecast Error (MW)",
"ahead_time": "Forecast Ahead Time (minutes)",
},
template=nemseer_template,
color_discrete_map={
1: "#E24A33",
5: "#348ABD",
10: "#988ED5",
25: "#777777",
50: "#FBC15E",
75: "#777777",
90: "#988ED5",
95: "#348ABD",
99: "#E24A33",
},
)
ahead_percentile["layout"]["xaxis"]["autorange"] = "reversed"
Plotting the distributions of forecast errors by ahead time#
We can look at the full distributions of forecast errors across ahead times.
But first, we’ll remove “forecasts” at ahead_time
= 5, as these correspond to actual dispatch conditions.
We’ll also convert the Timedeltas into an integer, which will be helpful for plotting.
error_excluding_real_time = forecast_error[
forecast_error["ahead_time"].dt.seconds > 300
]
error_excluding_real_time.loc[:, "ahead_time"] = (
error_excluding_real_time.loc[:, "ahead_time"].dt.seconds / 60
).astype(int)
ahead_hist = px.histogram(
error_excluding_real_time,
x="TOTALDEMAND",
color="ahead_time",
template=nemseer_template,
)
ahead_hist.update_layout(
legend_title_text="Ahead Time (mins)",
);