@@ -18,6 +18,7 @@ def _extract_timeseries(xml_text):
     for timeseries in soup.find_all('timeseries'):
         yield timeseries
 
+
 def _resolution_to_timedelta(res_text: str) -> str:
     """
     Convert an Entsoe resolution to something that pandas can understand
@@ -40,73 +41,95 @@ def _resolution_to_timedelta(res_text: str) -> str:
                                   "issue.".format(res_text))
     return delta
 
+
 def _parse_datetimeindex(soup, tz=None):
- """
45
- Create a datetimeindex from a parsed beautifulsoup,
46
- given that it contains the elements 'start', 'end'
47
- and 'resolution'
48
-
49
- Parameters
50
- ----------
51
- soup : bs4.element.tag
52
- tz: str
53
-
54
- Returns
55
- -------
56
- pd.DatetimeIndex
57
- """
58
- start = pd .Timestamp (soup .find ('start' ).text )
59
- end = pd .Timestamp (soup .find_all ('end' )[- 1 ].text )
60
- if tz is not None :
61
- start = start .tz_convert (tz )
62
- end = end .tz_convert (tz )
63
-
64
- delta = _resolution_to_timedelta (res_text = soup .find ('resolution' ).text )
65
- index = pd .date_range (start = start , end = end , freq = delta , inclusive = 'left' )
66
- if tz is not None :
67
- dst_jump = len (set (index .map (lambda d : d .dst ()))) > 1
68
- if dst_jump and delta == "7D" :
69
- # For a weekly granularity, if we jump over the DST date in October,
70
- # date_range erronously returns an additional index element
71
- # because that week contains 169 hours instead of 168.
72
- index = index [:- 1 ]
73
- index = index .tz_convert ("UTC" )
74
- elif index .to_series ().diff ().min () >= pd .Timedelta ('1D' ) and end .hour == start .hour + 1 :
75
- # For a daily or larger granularity, if we jump over the DST date in October,
76
- # date_range erronously returns an additional index element
77
- # because the period contains one extra hour.
78
- index = index [:- 1 ]
79
-
80
- return index
81
-
82
- def _parse_timeseries_generic (soup , label = 'quantity' , to_float = True ):
83
- data = {}
84
- for point in soup .find_all ('point' ):
85
- value = point .find (label ).text
86
- if to_float :
87
- value = value .replace (',' , '' )
88
- data [int (point .find ('position' ).text )] = value
89
-
90
- series = pd .Series (data )
91
- series .sort_index ()
92
- index = _parse_datetimeindex (soup )
93
- if soup .find ('curvetype' ).text == 'A03' :
94
- # with A03 its possible that positions are missing, this is when values are repeated
95
- # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
96
- # so lets do reindex on a continious range which creates gaps if positions are missing
97
- # then forward fill, so repeat last valid value, to fill the gaps
98
- series = series .reindex (list (range (1 , len (index )+ 1 ))).ffill ()
99
-
100
- series .index = index
101
- if to_float :
102
- series = series .astype (float )
103
-
104
- return series
+    """
+    Create a datetimeindex from a parsed beautifulsoup,
+    given that it contains the elements 'start', 'end'
+    and 'resolution'
+
+    Parameters
+    ----------
+    soup : bs4.element.tag
+    tz: str
+
+    Returns
+    -------
+    pd.DatetimeIndex
+    """
+    start = pd.Timestamp(soup.find('start').text)
+    end = pd.Timestamp(soup.find_all('end')[-1].text)
+    if tz is not None:
+        start = start.tz_convert(tz)
+        end = end.tz_convert(tz)
+
+    delta = _resolution_to_timedelta(res_text=soup.find('resolution').text)
+    index = pd.date_range(start=start, end=end, freq=delta, inclusive='left')
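+    # inclusive='left' drops `end` itself: it marks the close of the last period, not a data point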
+    if tz is not None:
+        dst_jump = len(set(index.map(lambda d: d.dst()))) > 1
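+        # more than one distinct DST offset in the index means the range crosses a transition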
+        if dst_jump and delta == "7D":
+            # For a weekly granularity, if we jump over the DST date in October,
+            # date_range erroneously returns an additional index element
+            # because that week contains 169 hours instead of 168.
+            index = index[:-1]
+        index = index.tz_convert("UTC")
+    elif index.to_series().diff().min() >= pd.Timedelta('1D') and end.hour == start.hour + 1:
+        # For a daily or larger granularity, if we jump over the DST date in October,
+        # date_range erroneously returns an additional index element
+        # because the period contains one extra hour.
+        index = index[:-1]
+
+    return index
+
+
+def _parse_timeseries_generic(soup, label='quantity', to_float=True, merge_series=False):
+    series = {
+        '15min': [],
+        '30min': [],
+        '60min': []
+    }
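+    # one list of partial series per resolution; other resolutions get a bucket on the fly below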
+
+    for period in soup.find_all('period'):
+        data = {}
+        start = pd.Timestamp(period.find('start').text)
+        end = pd.Timestamp(period.find('end').text)
+        delta_text = _resolution_to_timedelta(res_text=period.find('resolution').text)
+        delta = pd.Timedelta(delta_text)
+        for point in period.find_all('point'):
+            value = point.find(label).text
+            if to_float:
+                value = value.replace(',', '')
+            position = int(point.find('position').text)
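+            # positions are 1-based offsets from the period start, so position 1 maps to start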
+            data[start + (position - 1)*delta] = value
+        S = pd.Series(data).sort_index()
+        if soup.find('curvetype').text == 'A03':
+            # with A03 it's possible that positions are missing; this is when values are repeated
+            # see docs: https://eepublicdownloads.entsoe.eu/clean-documents/EDI/Library/cim_based/Introduction_of_different_Timeseries_possibilities__curvetypes__with_ENTSO-E_electronic_document_v1.4.pdf
+            # so reindex on a continuous range, which creates gaps if positions are missing,
+            # then forward fill (repeat the last valid value) to fill the gaps
+            S = S.reindex(pd.date_range(start, end - delta, freq=delta_text)).ffill()
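+            # e.g. if positions 1 and 3 are present, reindexing leaves a NaN at position 2,
+            # which ffill fills with the position-1 value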
+        if delta_text not in series:
+            series[delta_text] = []
+        series[delta_text].append(S)
+    for freq, S in series.items():
+        if len(S) > 0:
+            series[freq] = pd.concat(S).sort_index()
+            if to_float:
+                series[freq] = series[freq].astype(float)
+        else:
+            series[freq] = None
+
+    # for endpoints that never have duplicated timeseries, the merge_series flag signals to just concat everything
+    if merge_series:
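+        # pd.concat silently drops the None entries left for resolutions that never appeared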
+        return pd.concat(series.values())
+    else:
+        return series
+
 
 def _parse_timeseries_generic_whole(xml_text, label='quantity', to_float=True):
     series_all = []
     for soup in _extract_timeseries(xml_text):
-        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float))
+        series_all.append(_parse_timeseries_generic(soup, label=label, to_float=to_float, merge_series=True))
 
     series_all = pd.concat(series_all).sort_index()
-    return series_all
+    return series_all