Commit 4d6b401

Fixes the generic parser to correctly handle various scenarios that can occur.
Discussion and inspiration for the fixes came from #347 and #346.

1 parent 647ae2b

File tree

entsoe/entsoe.py
entsoe/parsers.py
entsoe/series_parsers.py

3 files changed: 101 additions, 73 deletions
entsoe/entsoe.py

Lines changed: 3 additions & 3 deletions

@@ -23,7 +23,7 @@
 warnings.filterwarnings('ignore', category=XMLParsedAsHTMLWarning)

 __title__ = "entsoe-py"
-__version__ = "0.6.8"
+__version__ = "0.6.9"
 __author__ = "EnergieID.be, Frank Boerman"
 __license__ = "MIT"

@@ -1152,7 +1152,7 @@ def query_withdrawn_unavailability_of_generation_units(
 class EntsoePandasClient(EntsoeRawClient):
     @year_limited
     def query_net_position(self, country_code: Union[Area, str],
-                           start: pd.Timestamp, end: pd.Timestamp, dayahead: bool = True) -> pd.Series:
+                           start: pd.Timestamp, end: pd.Timestamp, dayahead: bool = True, resolution: Literal['60min', '30min', '15min'] = '60min') -> pd.Series:
         """

         Parameters

@@ -1168,7 +1168,7 @@ def query_net_position(self, country_code: Union[Area, str],
         area = lookup_area(country_code)
         text = super(EntsoePandasClient, self).query_net_position(
             country_code=area, start=start, end=end, dayahead=dayahead)
-        series = parse_netpositions(text)
+        series = parse_netpositions(text, resolution=resolution)
         series = series.tz_convert(area.tz)
         series = series.truncate(before=start, after=end)
         return series
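
The new resolution parameter defaults to '60min', so existing callers are unaffected. A minimal usage sketch of the extended signature (the API key, bidding zone, and dates below are placeholders, not part of this commit):

import pandas as pd
from entsoe import EntsoePandasClient

client = EntsoePandasClient(api_key='YOUR_API_KEY')  # placeholder key
start = pd.Timestamp('2023-06-01', tz='Europe/Brussels')
end = pd.Timestamp('2023-06-02', tz='Europe/Brussels')

# '60min' stays the default; '30min' and '15min' are the newly accepted values
net_pos = client.query_net_position('BE', start=start, end=end,
                                    dayahead=True, resolution='15min')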

entsoe/parsers.py

Lines changed: 12 additions & 7 deletions

@@ -17,7 +17,6 @@
 CONSUMPTION_ELEMENT = "outBiddingZone_Domain.mRID"


-
 def parse_prices(xml_text):
     """
     Parameters

@@ -35,15 +34,18 @@ def parse_prices(xml_text):
     }
     for soup in _extract_timeseries(xml_text):
         soup_series = _parse_timeseries_generic(soup, 'price.amount')
-        series[soup_series.index.freqstr].append(soup_series)
+        for key in series.keys():
+            series[key].append(soup_series[key])

     for freq, freq_series in series.items():
-        if len(freq_series) > 0:
+        try:
             series[freq] = pd.concat(freq_series).sort_index()
+        except ValueError:
+            series[freq] = pd.Series()
     return series


-def parse_netpositions(xml_text):
+def parse_netpositions(xml_text, resolution):
     """

     Parameters

@@ -56,7 +58,7 @@ def parse_netpositions(xml_text):
     """
     series_all = []
     for soup in _extract_timeseries(xml_text):
-        series = _parse_timeseries_generic(soup)
+        series = _parse_timeseries_generic(soup)[resolution]
         if 'REGION' in soup.find('out_domain.mrid').text:
             factor = -1  # flow is import so negative
         else:

@@ -692,7 +694,7 @@ def _parse_load_timeseries(soup):
     -------
     pd.Series
     """
-    return _parse_timeseries_generic(soup)
+    return _parse_timeseries_generic(soup, merge_series=True)

 def _parse_generation_timeseries(soup, per_plant: bool = False, include_eic: bool = False) -> pd.Series:
     """

@@ -707,7 +709,10 @@ def _parse_generation_timeseries(soup, per_plant: bool = False, include_eic: bool = False) -> pd.Series:
     -------
     pd.Series
     """
-    series = _parse_timeseries_generic(soup)
+    # should never have duplicated timestamps when differing time resolution.
+    # so simply concat all possibilities
+    series = _parse_timeseries_generic(soup, merge_series=True)
+

     # Check if there is a psrtype, if so, get it.
     _psrtype = soup.find('psrtype')
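
Two behavioral notes on this file: parse_prices now collects every resolution bucket that _parse_timeseries_generic returns, and the try/except converts an all-empty bucket (pd.concat raises ValueError when given nothing to concatenate) into an empty Series. The function therefore returns a dict keyed by resolution rather than a single Series. An illustrative sketch of consuming the new return shape (the file path is a placeholder):

from entsoe.parsers import parse_prices

with open('prices.xml') as f:         # placeholder path to a raw ENTSO-E price document
    xml_text = f.read()

price_dict = parse_prices(xml_text)
hourly = price_dict['60min']          # pd.Series; empty if the document has no hourly data
quarter_hourly = price_dict['15min']  # pd.Series; empty if it has no 15-minute data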

entsoe/series_parsers.py

Lines changed: 86 additions & 63 deletions

@@ -18,6 +18,7 @@ def _extract_timeseries(xml_text):
     for timeseries in soup.find_all('timeseries'):
         yield timeseries

+
 def _resolution_to_timedelta(res_text: str) -> str:
     """
     Convert an Entsoe resolution to something that pandas can understand

@@ -40,73 +41,95 @@
                                   "issue.".format(res_text))
     return delta

+
 def _parse_datetimeindex(soup, tz=None):
-    """
-    Create a datetimeindex from a parsed beautifulsoup,
-    given that it contains the elements 'start', 'end'
-    and 'resolution'
-
-    Parameters
-    ----------
-    soup : bs4.element.tag
-    tz: str
-
-    Returns
-    -------
-    pd.DatetimeIndex
-    """
-    start = pd.Timestamp(soup.find('start').text)
-    end = pd.Timestamp(soup.find_all('end')[-1].text)
-    if tz is not None:
-        start = start.tz_convert(tz)
-        end = end.tz_convert(tz)
-
-    delta = _resolution_to_timedelta(res_text=soup.find('resolution').text)
-    index = pd.date_range(start=start, end=end, freq=delta, inclusive='left')
-    if tz is not None:
-        dst_jump = len(set(index.map(lambda d: d.dst()))) > 1
-        if dst_jump and delta == "7D":
-            # For a weekly granularity, if we jump over the DST date in October,
-            # date_range erronously returns an additional index element
-            # because that week contains 169 hours instead of 168.
-            index = index[:-1]
-        index = index.tz_convert("UTC")
-    elif index.to_series().diff().min() >= pd.Timedelta('1D') and end.hour == start.hour + 1:
-        # For a daily or larger granularity, if we jump over the DST date in October,
-        # date_range erronously returns an additional index element
-        # because the period contains one extra hour.
-        index = index[:-1]
-
-    return index
-
-def _parse_timeseries_generic(soup, label='quantity', to_float=True):
-    data = {}
-    for point in soup.find_all('point'):
-        value = point.find(label).text
-        if to_float:
-            value = value.replace(',', '')
-        data[int(point.find('position').text)] = value
-
-    series = pd.Series(data)
-    series.sort_index()
-    index = _parse_datetimeindex(soup)
-    if soup.find('curvetype').text == 'A03':
-        # with A03 its possible that positions are missing, this is when values are repeated
-        # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
-        # so lets do reindex on a continious range which creates gaps if positions are missing
-        # then forward fill, so repeat last valid value, to fill the gaps
-        series = series.reindex(list(range(1, len(index)+1))).ffill()
-
-    series.index = index
-    if to_float:
-        series = series.astype(float)
-
-    return series
+    """
+    Create a datetimeindex from a parsed beautifulsoup,
+    given that it contains the elements 'start', 'end'
+    and 'resolution'
+
+    Parameters
+    ----------
+    soup : bs4.element.tag
+    tz: str
+
+    Returns
+    -------
+    pd.DatetimeIndex
+    """
+    start = pd.Timestamp(soup.find('start').text)
+    end = pd.Timestamp(soup.find_all('end')[-1].text)
+    if tz is not None:
+        start = start.tz_convert(tz)
+        end = end.tz_convert(tz)
+
+    delta = _resolution_to_timedelta(res_text=soup.find('resolution').text)
+    index = pd.date_range(start=start, end=end, freq=delta, inclusive='left')
+    if tz is not None:
+        dst_jump = len(set(index.map(lambda d: d.dst()))) > 1
+        if dst_jump and delta == "7D":
+            # For a weekly granularity, if we jump over the DST date in October,
+            # date_range erronously returns an additional index element
+            # because that week contains 169 hours instead of 168.
+            index = index[:-1]
+        index = index.tz_convert("UTC")
+    elif index.to_series().diff().min() >= pd.Timedelta('1D') and end.hour == start.hour + 1:
+        # For a daily or larger granularity, if we jump over the DST date in October,
+        # date_range erronously returns an additional index element
+        # because the period contains one extra hour.
+        index = index[:-1]
+
+    return index
+
+
+def _parse_timeseries_generic(soup, label='quantity', to_float=True, merge_series=False):
+    series = {
+        '15min': [],
+        '30min': [],
+        '60min': []
+    }
+
+    for period in soup.find_all('period'):
+        data = {}
+        start = pd.Timestamp(period.find('start').text)
+        end = pd.Timestamp(period.find('end').text)
+        delta_text = _resolution_to_timedelta(res_text=period.find('resolution').text)
+        delta = pd.Timedelta(delta_text)
+        for point in period.find_all('point'):
+            value = point.find(label).text
+            if to_float:
+                value = value.replace(',', '')
+            position = int(point.find('position').text)
+            data[start + (position-1)*delta] = value
+        S = pd.Series(data).sort_index()
+        if soup.find('curvetype').text == 'A03':
+            # with A03 its possible that positions are missing, this is when values are repeated
+            # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
+            # so lets do reindex on a continious range which creates gaps if positions are missing
+            # then forward fill, so repeat last valid value, to fill the gaps
+            S = S.reindex(pd.date_range(start, end-delta, freq=delta_text)).ffill()
+        if delta_text not in series:
+            series[delta_text] = []
+        series[delta_text].append(S)
+    for freq, S in series.items():
+        if len(S) > 0:
+            series[freq] = pd.concat(S).sort_index()
+            if to_float:
+                series[freq] = series[freq].astype(float)
+        else:
+            series[freq] = None
+
+    # for endpoints which never has duplicated timeseries the flag merge_series signals to just concat everything
+    if merge_series:
+        return pd.concat(series.values())
+    else:
+        return series
+

 def _parse_timeseries_generic_whole(xml_text, label='quantity', to_float=True):
     series_all = []
     for soup in _extract_timeseries(xml_text):
-        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float))
+        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float, merge_series=True))

     series_all = pd.concat(series_all).sort_index()
-    return series_all
+    return series_all
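
The core change: _parse_timeseries_generic now iterates Period elements, converts each Point's position into an absolute timestamp via start + (position - 1) * delta, and buckets the resulting series per resolution ('15min'/'30min'/'60min', plus any other resolution encountered), returning either the dict or, with merge_series=True, one concatenated Series. A self-contained sketch of the position-to-timestamp mapping and the A03 forward-fill (values invented for illustration):

import pandas as pd

start = pd.Timestamp('2023-06-01T00:00Z')  # Period start from the XML
delta = pd.Timedelta('15min')              # parsed resolution (PT15M)
points = {1: 10.0, 2: 11.5, 4: 12.0}       # position 3 omitted, as A03 allows

# same mapping as the parser: position n lands at start + (n - 1) * delta
s = pd.Series({pos and start + (pos - 1) * delta: val
               for pos, val in points.items()}).sort_index()

# A03 curves repeat the last value across missing positions: reindex, then ffill
end = start + 4 * delta                    # Period end (exclusive)
s = s.reindex(pd.date_range(start, end - delta, freq='15min')).ffill()
print(s)  # 00:00 -> 10.0, 00:15 -> 11.5, 00:30 -> 11.5 (filled), 00:45 -> 12.0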
