 from masu.util.aws.common import get_s3_resource
 from reporting.models import SubsIDMap
 from reporting.models import SubsLastProcessed
-from reporting.provider.aws.models import TRINO_LINE_ITEM_TABLE as AWS_TABLE
-from reporting.provider.azure.models import TRINO_LINE_ITEM_TABLE as AZURE_TABLE
 
 
 LOG = logging.getLogger(__name__)
 
-TABLE_MAP = {
-    Provider.PROVIDER_AWS: AWS_TABLE,
-    Provider.PROVIDER_AZURE: AZURE_TABLE,
-}
-
-ID_COLUMN_MAP = {
-    Provider.PROVIDER_AWS: "lineitem_usageaccountid",
-    Provider.PROVIDER_AZURE: "COALESCE(NULLIF(subscriptionid, ''), subscriptionguid)",
-}
-
-RECORD_FILTER_MAP = {
-    Provider.PROVIDER_AWS: (
-        " lineitem_productcode = 'AmazonEC2' AND lineitem_lineitemtype IN ('Usage', 'SavingsPlanCoveredUsage') "
-        "AND product_vcpu != '' AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0"
-    ),
-    Provider.PROVIDER_AZURE: (
-        " metercategory = 'Virtual Machines' AND chargetype = 'Usage' "
-        "AND json_extract_scalar(lower(additionalinfo), '$.vcpus') IS NOT NULL "
-        "AND json_extract_scalar(lower(tags), '$.com_redhat_rhel') IS NOT NULL"
-    ),
-}
-
-RESOURCE_ID_FILTER_MAP = {
-    Provider.PROVIDER_AWS: (
-        " AND lineitem_productcode = 'AmazonEC2' "
-        "AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0 AND lineitem_usageaccountid = {{usage_account}}"
-    ),
-    Provider.PROVIDER_AZURE: (
-        " AND metercategory = 'Virtual Machines' "
-        "AND json_extract_scalar(lower(additionalinfo), '$.vcpus') IS NOT NULL "
-        "AND json_extract_scalar(lower(tags), '$.com_redhat_rhel') IS NOT NULL "
-        "AND (subscriptionid = {{usage_account}} or subscriptionguid = {{usage_account}}) "
-    ),
-}
-
-RESOURCE_SELECT_MAP = {
-    Provider.PROVIDER_AWS: " SELECT lineitem_resourceid, max(lineitem_usagestartdate) ",
-    Provider.PROVIDER_AZURE: " SELECT coalesce(NULLIF(resourceid, ''), instanceid), date_add('day', -1, max(coalesce(date, usagedatetime))) ",  # noqa E501
-}
-
-RESOURCE_ID_GROUP_BY_MAP = {
-    Provider.PROVIDER_AWS: " GROUP BY lineitem_resourceid",
-    Provider.PROVIDER_AZURE: " GROUP BY resourceid, instanceid",
-}
-
-RESOURCE_ID_EXCLUSION_CLAUSE_MAP = {
-    Provider.PROVIDER_AWS: " AND lineitem_resourceid NOT IN {{excluded_ids | inclause}} ",
-    Provider.PROVIDER_AZURE: " and coalesce(NULLIF(resourceid, ''), instanceid) NOT IN {{excluded_ids | inclause}} ",
-}
-
-RESOURCE_ID_SQL_CLAUSE_MAP = {
-    Provider.PROVIDER_AWS: (
-        " ( lineitem_resourceid = {{{{ rid_{0} }}}} "
-        " AND lineitem_usagestartdate >= {{{{ start_date_{0} }}}} "
-        " AND lineitem_usagestartdate <= {{{{ end_date_{0} }}}}) "
-    ),
-    Provider.PROVIDER_AZURE: (
-        " ( coalesce(NULLIF(resourceid, ''), instanceid) = {{{{ rid_{0} }}}} "
-        "AND coalesce(date, usagedatetime) >= {{{{ start_date_{0} }}}} "
-        "AND coalesce(date, usagedatetime) <= {{{{ end_date_{0} }}}}) "
-    ),
-}
-
-POST_OR_CLAUSE_SQL_MAP = {
-    Provider.PROVIDER_AWS: """
-    OFFSET
-        {{ offset }}
-    LIMIT
-        {{ limit }}
-)
-WHERE json_extract_scalar(tags, '$.com_redhat_rhel') IS NOT NULL
-""",
-    Provider.PROVIDER_AZURE: """
-    OFFSET
-        {{ offset }}
-    LIMIT
-        {{ limit }}
-""",
-}
-
 
 class SUBSDataExtractor(ReportDBAccessorBase):
     def __init__(self, tracing_id, context):
@@ -125,16 +43,6 @@ def __init__(self, tracing_id, context):
             settings.S3_SUBS_ACCESS_KEY, settings.S3_SUBS_SECRET, settings.S3_SUBS_REGION
         )
         self.context = context
-        # The following variables all change depending on the provider type to run the correct SQL
-        self.table = TABLE_MAP.get(self.provider_type)
-        self.id_column = ID_COLUMN_MAP.get(self.provider_type)
-        self.provider_where_clause = RECORD_FILTER_MAP.get(self.provider_type)
-        self.resource_select_sql = RESOURCE_SELECT_MAP.get(self.provider_type)
-        self.resource_id_where_clause = RESOURCE_ID_FILTER_MAP.get(self.provider_type)
-        self.resource_id_group_by = RESOURCE_ID_GROUP_BY_MAP.get(self.provider_type)
-        self.resource_id_sql_clause = RESOURCE_ID_SQL_CLAUSE_MAP.get(self.provider_type)
-        self.resource_id_exclusion_clause = RESOURCE_ID_EXCLUSION_CLAUSE_MAP.get(self.provider_type)
-        self.post_or_clause_sql = POST_OR_CLAUSE_SQL_MAP.get(self.provider_type)
 
     @cached_property
     def subs_s3_path(self):
@@ -176,20 +84,15 @@ def determine_ids_for_provider(self, year, month):
             excluded_ids = list(
                 SubsIDMap.objects.exclude(source_uuid=self.provider_uuid).values_list("usage_id", flat=True)
             )
-        sql = (
-            "SELECT DISTINCT {{id_column | sqlsafe}} FROM hive.{{schema | sqlsafe}}.{{table | sqlsafe}} WHERE"
-            " source={{source_uuid}} AND year={{year}} AND month={{month}}"
-        )
-        if excluded_ids:
-            sql += " AND {{id_column | sqlsafe}} NOT IN {{excluded_ids | inclause}}"
+        sql_file = f"trino_sql/{self.provider_type.lower()}/determine_ids_for_provider.sql"
+        sql = pkgutil.get_data("subs", sql_file)
+        sql = sql.decode("utf-8")
         sql_params = {
             "schema": self.schema,
             "source_uuid": self.provider_uuid,
             "year": year,
             "month": month,
             "excluded_ids": excluded_ids,
-            "id_column": self.id_column,
-            "table": self.table,
         }
         ids = self._execute_trino_raw_sql_query(
             sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_ids_for_provider"
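The inline query built here previously now lives in per-provider template files under `trino_sql/`. As a reference point, here is a minimal sketch of what the AWS `determine_ids_for_provider.sql` template might look like, reconstructed from the removed inline SQL and the removed `ID_COLUMN_MAP` entry; the hardcoded table name (standing in for the old AWS `TRINO_LINE_ITEM_TABLE` constant) and the Jinja guard around `excluded_ids` are assumptions, since the actual file is not shown in this diff:

```sql
-- Hypothetical reconstruction; the shipped template may differ.
SELECT DISTINCT lineitem_usageaccountid
FROM hive.{{schema | sqlsafe}}.aws_line_items
WHERE source = {{source_uuid}}
  AND year = {{year}}
  AND month = {{month}}
  -- excluded_ids is now always bound, so the template guards the clause itself
  {% if excluded_ids %}
  AND lineitem_usageaccountid NOT IN {{excluded_ids | inclause}}
  {% endif %}
```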
@@ -202,49 +105,31 @@ def determine_ids_for_provider(self, year, month):
             SubsIDMap.objects.bulk_create(bulk_maps, ignore_conflicts=True)
         return id_list
 
-    def determine_line_item_count(self, where_clause, sql_params):
-        """Determine the number of records in the table that have not been processed and match the criteria"""
-        table_count_sql = f"SELECT count(*) FROM {self.schema}.{self.table} {where_clause}"
-        count = self._execute_trino_raw_sql_query(
-            table_count_sql, sql_params=sql_params, log_ref="determine_subs_processing_count"
-        )
+    def determine_row_count(self, sql_params):
+        """Determine the number of records in the table that have not been processed and match the criteria."""
+        sql_file = f"trino_sql/{self.provider_type.lower()}/subs_row_count.sql"
+        sql = pkgutil.get_data("subs", sql_file)
+        sql = sql.decode("utf-8")
+        count = self._execute_trino_raw_sql_query(sql, sql_params=sql_params, log_ref="determine_subs_row_count")
         return count[0][0]
 
-    def determine_where_clause_and_params(self, year, month):
-        """Determine the where clause to use when processing subs data"""
-        where_clause = "WHERE source={{source_uuid}} AND year={{year}} AND month={{month}} AND"
-        # different provider types have different required filters here
-        where_clause += self.provider_where_clause
-        sql_params = {
-            "source_uuid": self.provider_uuid,
-            "year": year,
-            "month": month,
-        }
-        return where_clause, sql_params
-
     def get_resource_ids_for_usage_account(self, usage_account, year, month):
         """Determine the relevant resource ids and end time to process to for each resource id."""
         with schema_context(self.schema):
             # get a list of IDs to exclude from this source processing
             excluded_ids = list(
                 SubsLastProcessed.objects.exclude(source_uuid=self.provider_uuid).values_list("resource_id", flat=True)
             )
-        sql = self.resource_select_sql + (
-            " FROM hive.{{schema | sqlsafe}}.{{table | sqlsafe}} WHERE"
-            " source={{source_uuid}} AND year={{year}} AND month={{month}}"
-        )
-        sql += self.resource_id_where_clause
-        if excluded_ids:
-            sql += self.resource_id_exclusion_clause
-        sql += self.resource_id_group_by
+        sql_file = f"trino_sql/{self.provider_type.lower()}/determine_resource_ids_for_usage_account.sql"
+        sql = pkgutil.get_data("subs", sql_file)
+        sql = sql.decode("utf-8")
         sql_params = {
             "schema": self.schema,
             "source_uuid": self.provider_uuid,
             "year": year,
             "month": month,
             "excluded_ids": excluded_ids,
             "usage_account": usage_account,
-            "table": self.table,
         }
         ids = self._execute_trino_raw_sql_query(
             sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_rids_for_provider"
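Similarly, the resource-id query previously assembled from `RESOURCE_SELECT_MAP`, `RESOURCE_ID_FILTER_MAP`, `RESOURCE_ID_EXCLUSION_CLAUSE_MAP`, and `RESOURCE_ID_GROUP_BY_MAP` moves into `determine_resource_ids_for_usage_account.sql`. A sketch of the AWS variant, stitched together from those removed map entries (same caveats as above; the real template may differ):

```sql
-- Hypothetical reconstruction from the removed AWS clause fragments.
SELECT lineitem_resourceid, max(lineitem_usagestartdate)
FROM hive.{{schema | sqlsafe}}.aws_line_items
WHERE source = {{source_uuid}}
  AND year = {{year}}
  AND month = {{month}}
  AND lineitem_productcode = 'AmazonEC2'
  AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0
  AND lineitem_usageaccountid = {{usage_account}}
  {% if excluded_ids %}
  AND lineitem_resourceid NOT IN {{excluded_ids | inclause}}
  {% endif %}
GROUP BY lineitem_resourceid
```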
@@ -253,33 +138,25 @@ def get_resource_ids_for_usage_account(self, usage_account, year, month):
 
     def gather_and_upload_for_resource_batch(self, year, month, batch, base_filename):
         """Gather the data and upload it to S3 for a batch of resource ids"""
-        where_clause, sql_params = self.determine_where_clause_and_params(year, month)
-        sql_file = f"trino_sql/{self.provider_type.lower()}_subs_pre_or_clause.sql"
+        sql_params = {
+            "source_uuid": self.provider_uuid,
+            "year": year,
+            "month": month,
+            "schema": self.schema,
+            "resources": batch,
+        }
+        sql_file = f"trino_sql/{self.provider_type.lower()}/subs_summary.sql"
         summary_sql = pkgutil.get_data("subs", sql_file)
         summary_sql = summary_sql.decode("utf-8")
-        rid_sql_clause = " AND ( "
-        for i, e in enumerate(batch):
-            rid, start_time, end_time = e
-            sql_params[f"rid_{i}"] = rid
-            sql_params[f"start_date_{i}"] = start_time
-            sql_params[f"end_date_{i}"] = end_time
-            rid_sql_clause += self.resource_id_sql_clause.format(i)
-            if i < len(batch) - 1:
-                rid_sql_clause += " OR "
-        rid_sql_clause += " )"
-        where_clause += rid_sql_clause
-        summary_sql += rid_sql_clause
-        summary_sql += self.post_or_clause_sql
-        total_count = self.determine_line_item_count(where_clause, sql_params)
+        total_count = self.determine_row_count(sql_params)
         LOG.debug(
             log_json(
                 self.tracing_id,
                 msg=f"identified {total_count} matching records for metered rhel",
-                context=self.context | {"resource_ids": [rid for rid, _, _ in batch]},
+                context=self.context | {"resource_ids": [row["rid"] for row in batch]},
             )
         )
         upload_keys = []
-        sql_params["schema"] = self.schema
         for i, offset in enumerate(range(0, total_count, settings.PARQUET_PROCESSING_BATCH_SIZE)):
             sql_params["offset"] = offset
             sql_params["limit"] = settings.PARQUET_PROCESSING_BATCH_SIZE
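With the batch now passed whole as the `resources` parameter (a list of dicts built below in `extract_data_to_s3`), the per-resource OR clause that was string-formatted in Python can be expressed as a template loop instead. A sketch of how the AWS `subs_summary.sql` might render that filter, assuming JinjaSQL's loop support and the `rid`/`start`/`end` key names carried by the new batch dicts:

```sql
-- Hypothetical fragment; mirrors the removed RESOURCE_ID_SQL_CLAUSE_MAP logic.
AND (
    {% for resource in resources %}
    (lineitem_resourceid = {{resource.rid}}
     AND lineitem_usagestartdate >= {{resource.start}}
     AND lineitem_usagestartdate <= {{resource.end}})
    {% if not loop.last %}OR{% endif %}
    {% endfor %}
)
```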
@@ -359,7 +236,7 @@ def extract_data_to_s3(self, month_start):
         )
         for rid, end_time in resource_ids:
             start_time = max(last_processed_dict.get(rid, month_start), self.creation_processing_time)
-            batch.append((rid, start_time, end_time))
+            batch.append({"rid": rid, "start": start_time, "end": end_time})
             if len(batch) >= 100:
                 upload_keys.extend(
                     self.gather_and_upload_for_resource_batch(year, month, batch, f"{base_filename}_{batch_num}")