Skip to content

Commit a7d38ed

Browse files
authored
[COST-4589] Correctly get end of month data for Azure Metering (#4897)
* move SQL to files per provider, case statement for Azure to handle EoM scenario
1 parent 89bb16a commit a7d38ed

10 files changed

+272
-184
lines changed

koku/subs/subs_data_extractor.py

+23-146
Original file line numberDiff line numberDiff line change
@@ -21,92 +21,10 @@
2121
from masu.util.aws.common import get_s3_resource
2222
from reporting.models import SubsIDMap
2323
from reporting.models import SubsLastProcessed
24-
from reporting.provider.aws.models import TRINO_LINE_ITEM_TABLE as AWS_TABLE
25-
from reporting.provider.azure.models import TRINO_LINE_ITEM_TABLE as AZURE_TABLE
2624

2725

2826
LOG = logging.getLogger(__name__)
2927

30-
TABLE_MAP = {
31-
Provider.PROVIDER_AWS: AWS_TABLE,
32-
Provider.PROVIDER_AZURE: AZURE_TABLE,
33-
}
34-
35-
ID_COLUMN_MAP = {
36-
Provider.PROVIDER_AWS: "lineitem_usageaccountid",
37-
Provider.PROVIDER_AZURE: "COALESCE(NULLIF(subscriptionid, ''), subscriptionguid)",
38-
}
39-
40-
RECORD_FILTER_MAP = {
41-
Provider.PROVIDER_AWS: (
42-
" lineitem_productcode = 'AmazonEC2' AND lineitem_lineitemtype IN ('Usage', 'SavingsPlanCoveredUsage') "
43-
"AND product_vcpu != '' AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0"
44-
),
45-
Provider.PROVIDER_AZURE: (
46-
" metercategory = 'Virtual Machines' AND chargetype = 'Usage' "
47-
"AND json_extract_scalar(lower(additionalinfo), '$.vcpus') IS NOT NULL "
48-
"AND json_extract_scalar(lower(tags), '$.com_redhat_rhel') IS NOT NULL"
49-
),
50-
}
51-
52-
RESOURCE_ID_FILTER_MAP = {
53-
Provider.PROVIDER_AWS: (
54-
" AND lineitem_productcode = 'AmazonEC2' "
55-
"AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0 AND lineitem_usageaccountid = {{usage_account}}"
56-
),
57-
Provider.PROVIDER_AZURE: (
58-
" AND metercategory = 'Virtual Machines' "
59-
"AND json_extract_scalar(lower(additionalinfo), '$.vcpus') IS NOT NULL "
60-
"AND json_extract_scalar(lower(tags), '$.com_redhat_rhel') IS NOT NULL "
61-
"AND (subscriptionid = {{usage_account}} or subscriptionguid = {{usage_account}}) "
62-
),
63-
}
64-
65-
RESOURCE_SELECT_MAP = {
66-
Provider.PROVIDER_AWS: " SELECT lineitem_resourceid, max(lineitem_usagestartdate) ",
67-
Provider.PROVIDER_AZURE: " SELECT coalesce(NULLIF(resourceid, ''), instanceid), date_add('day', -1, max(coalesce(date, usagedatetime))) ", # noqa E501
68-
}
69-
70-
RESOURCE_ID_GROUP_BY_MAP = {
71-
Provider.PROVIDER_AWS: " GROUP BY lineitem_resourceid",
72-
Provider.PROVIDER_AZURE: " GROUP BY resourceid, instanceid",
73-
}
74-
75-
RESOURCE_ID_EXCLUSION_CLAUSE_MAP = {
76-
Provider.PROVIDER_AWS: " AND lineitem_resourceid NOT IN {{excluded_ids | inclause}} ",
77-
Provider.PROVIDER_AZURE: " and coalesce(NULLIF(resourceid, ''), instanceid) NOT IN {{excluded_ids | inclause}} ",
78-
}
79-
80-
RESOURCE_ID_SQL_CLAUSE_MAP = {
81-
Provider.PROVIDER_AWS: (
82-
" ( lineitem_resourceid = {{{{ rid_{0} }}}} "
83-
" AND lineitem_usagestartdate >= {{{{ start_date_{0} }}}} "
84-
" AND lineitem_usagestartdate <= {{{{ end_date_{0} }}}}) "
85-
),
86-
Provider.PROVIDER_AZURE: (
87-
" ( coalesce(NULLIF(resourceid, ''), instanceid) = {{{{ rid_{0} }}}} "
88-
"AND coalesce(date, usagedatetime) >= {{{{ start_date_{0} }}}} "
89-
"AND coalesce(date, usagedatetime) <= {{{{ end_date_{0} }}}}) "
90-
),
91-
}
92-
93-
POST_OR_CLAUSE_SQL_MAP = {
94-
Provider.PROVIDER_AWS: """
95-
OFFSET
96-
{{ offset }}
97-
LIMIT
98-
{{ limit }}
99-
)
100-
WHERE json_extract_scalar(tags, '$.com_redhat_rhel') IS NOT NULL
101-
""",
102-
Provider.PROVIDER_AZURE: """
103-
OFFSET
104-
{{ offset }}
105-
LIMIT
106-
{{ limit }}
107-
""",
108-
}
109-
11028

11129
class SUBSDataExtractor(ReportDBAccessorBase):
11230
def __init__(self, tracing_id, context):
@@ -125,16 +43,6 @@ def __init__(self, tracing_id, context):
12543
settings.S3_SUBS_ACCESS_KEY, settings.S3_SUBS_SECRET, settings.S3_SUBS_REGION
12644
)
12745
self.context = context
128-
# The following variables all change depending on the provider type to run the correct SQL
129-
self.table = TABLE_MAP.get(self.provider_type)
130-
self.id_column = ID_COLUMN_MAP.get(self.provider_type)
131-
self.provider_where_clause = RECORD_FILTER_MAP.get(self.provider_type)
132-
self.resource_select_sql = RESOURCE_SELECT_MAP.get(self.provider_type)
133-
self.resource_id_where_clause = RESOURCE_ID_FILTER_MAP.get(self.provider_type)
134-
self.resource_id_group_by = RESOURCE_ID_GROUP_BY_MAP.get(self.provider_type)
135-
self.resource_id_sql_clause = RESOURCE_ID_SQL_CLAUSE_MAP.get(self.provider_type)
136-
self.resource_id_exclusion_clause = RESOURCE_ID_EXCLUSION_CLAUSE_MAP.get(self.provider_type)
137-
self.post_or_clause_sql = POST_OR_CLAUSE_SQL_MAP.get(self.provider_type)
13846

13947
@cached_property
14048
def subs_s3_path(self):
@@ -176,20 +84,15 @@ def determine_ids_for_provider(self, year, month):
17684
excluded_ids = list(
17785
SubsIDMap.objects.exclude(source_uuid=self.provider_uuid).values_list("usage_id", flat=True)
17886
)
179-
sql = (
180-
"SELECT DISTINCT {{id_column | sqlsafe}} FROM hive.{{schema | sqlsafe}}.{{table | sqlsafe}} WHERE"
181-
" source={{source_uuid}} AND year={{year}} AND month={{month}}"
182-
)
183-
if excluded_ids:
184-
sql += " AND {{id_column | sqlsafe}} NOT IN {{excluded_ids | inclause}}"
87+
sql_file = f"trino_sql/{self.provider_type.lower()}/determine_ids_for_provider.sql"
88+
sql = pkgutil.get_data("subs", sql_file)
89+
sql = sql.decode("utf-8")
18590
sql_params = {
18691
"schema": self.schema,
18792
"source_uuid": self.provider_uuid,
18893
"year": year,
18994
"month": month,
19095
"excluded_ids": excluded_ids,
191-
"id_column": self.id_column,
192-
"table": self.table,
19396
}
19497
ids = self._execute_trino_raw_sql_query(
19598
sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_ids_for_provider"
@@ -202,49 +105,31 @@ def determine_ids_for_provider(self, year, month):
202105
SubsIDMap.objects.bulk_create(bulk_maps, ignore_conflicts=True)
203106
return id_list
204107

205-
def determine_line_item_count(self, where_clause, sql_params):
206-
"""Determine the number of records in the table that have not been processed and match the criteria"""
207-
table_count_sql = f"SELECT count(*) FROM {self.schema}.{self.table} {where_clause}"
208-
count = self._execute_trino_raw_sql_query(
209-
table_count_sql, sql_params=sql_params, log_ref="determine_subs_processing_count"
210-
)
108+
def determine_row_count(self, sql_params):
109+
"""Determine the number of records in the table that have not been processed and match the criteria."""
110+
sql_file = f"trino_sql/{self.provider_type.lower()}/subs_row_count.sql"
111+
sql = pkgutil.get_data("subs", sql_file)
112+
sql = sql.decode("utf-8")
113+
count = self._execute_trino_raw_sql_query(sql, sql_params=sql_params, log_ref="determine_subs_row_count")
211114
return count[0][0]
212115

213-
def determine_where_clause_and_params(self, year, month):
214-
"""Determine the where clause to use when processing subs data"""
215-
where_clause = "WHERE source={{source_uuid}} AND year={{year}} AND month={{month}} AND"
216-
# different provider types have different required filters here
217-
where_clause += self.provider_where_clause
218-
sql_params = {
219-
"source_uuid": self.provider_uuid,
220-
"year": year,
221-
"month": month,
222-
}
223-
return where_clause, sql_params
224-
225116
def get_resource_ids_for_usage_account(self, usage_account, year, month):
226117
"""Determine the relevant resource ids and end time to process to for each resource id."""
227118
with schema_context(self.schema):
228119
# get a list of IDs to exclude from this source processing
229120
excluded_ids = list(
230121
SubsLastProcessed.objects.exclude(source_uuid=self.provider_uuid).values_list("resource_id", flat=True)
231122
)
232-
sql = self.resource_select_sql + (
233-
" FROM hive.{{schema | sqlsafe}}.{{table | sqlsafe}} WHERE"
234-
" source={{source_uuid}} AND year={{year}} AND month={{month}}"
235-
)
236-
sql += self.resource_id_where_clause
237-
if excluded_ids:
238-
sql += self.resource_id_exclusion_clause
239-
sql += self.resource_id_group_by
123+
sql_file = f"trino_sql/{self.provider_type.lower()}/determine_resource_ids_for_usage_account.sql"
124+
sql = pkgutil.get_data("subs", sql_file)
125+
sql = sql.decode("utf-8")
240126
sql_params = {
241127
"schema": self.schema,
242128
"source_uuid": self.provider_uuid,
243129
"year": year,
244130
"month": month,
245131
"excluded_ids": excluded_ids,
246132
"usage_account": usage_account,
247-
"table": self.table,
248133
}
249134
ids = self._execute_trino_raw_sql_query(
250135
sql, sql_params=sql_params, context=self.context, log_ref="subs_determine_rids_for_provider"
@@ -253,33 +138,25 @@ def get_resource_ids_for_usage_account(self, usage_account, year, month):
253138

254139
def gather_and_upload_for_resource_batch(self, year, month, batch, base_filename):
255140
"""Gather the data and upload it to S3 for a batch of resource ids"""
256-
where_clause, sql_params = self.determine_where_clause_and_params(year, month)
257-
sql_file = f"trino_sql/{self.provider_type.lower()}_subs_pre_or_clause.sql"
141+
sql_params = sql_params = {
142+
"source_uuid": self.provider_uuid,
143+
"year": year,
144+
"month": month,
145+
"schema": self.schema,
146+
"resources": batch,
147+
}
148+
sql_file = f"trino_sql/{self.provider_type.lower()}/subs_summary.sql"
258149
summary_sql = pkgutil.get_data("subs", sql_file)
259150
summary_sql = summary_sql.decode("utf-8")
260-
rid_sql_clause = " AND ( "
261-
for i, e in enumerate(batch):
262-
rid, start_time, end_time = e
263-
sql_params[f"rid_{i}"] = rid
264-
sql_params[f"start_date_{i}"] = start_time
265-
sql_params[f"end_date_{i}"] = end_time
266-
rid_sql_clause += self.resource_id_sql_clause.format(i)
267-
if i < len(batch) - 1:
268-
rid_sql_clause += " OR "
269-
rid_sql_clause += " )"
270-
where_clause += rid_sql_clause
271-
summary_sql += rid_sql_clause
272-
summary_sql += self.post_or_clause_sql
273-
total_count = self.determine_line_item_count(where_clause, sql_params)
151+
total_count = self.determine_row_count(sql_params)
274152
LOG.debug(
275153
log_json(
276154
self.tracing_id,
277155
msg=f"identified {total_count} matching records for metered rhel",
278-
context=self.context | {"resource_ids": [rid for rid, _, _ in batch]},
156+
context=self.context | {"resource_ids": [row["rid"] for row in batch]},
279157
)
280158
)
281159
upload_keys = []
282-
sql_params["schema"] = self.schema
283160
for i, offset in enumerate(range(0, total_count, settings.PARQUET_PROCESSING_BATCH_SIZE)):
284161
sql_params["offset"] = offset
285162
sql_params["limit"] = settings.PARQUET_PROCESSING_BATCH_SIZE
@@ -359,7 +236,7 @@ def extract_data_to_s3(self, month_start):
359236
)
360237
for rid, end_time in resource_ids:
361238
start_time = max(last_processed_dict.get(rid, month_start), self.creation_processing_time)
362-
batch.append((rid, start_time, end_time))
239+
batch.append({"rid": rid, "start": start_time, "end": end_time})
363240
if len(batch) >= 100:
364241
upload_keys.extend(
365242
self.gather_and_upload_for_resource_batch(year, month, batch, f"{base_filename}_{batch_num}")

koku/subs/test/test_subs_data_extractor.py

+18-38
Original file line numberDiff line numberDiff line change
@@ -84,29 +84,11 @@ def test_determine_latest_processed_time_for_provider_without_return_value(self)
8484
self.assertIsNone(actual)
8585

8686
@patch("subs.subs_data_extractor.SUBSDataExtractor._execute_trino_raw_sql_query")
87-
def test_determine_line_item_count(self, mock_trino):
87+
def test_determine_row_count(self, mock_trino):
8888
"""Test determining the line item count for the subs query calls trino"""
89-
self.extractor.determine_line_item_count("fake where clause", {"fake": "params"})
89+
self.extractor.determine_row_count({"fake": "params"})
9090
mock_trino.assert_called_once()
9191

92-
def test_determine_where_clause_and_params(self):
93-
"""Test resulting where clause and params matches expected values"""
94-
year = "2023"
95-
month = "07"
96-
expected_sql_params = {
97-
"source_uuid": self.aws_provider.uuid,
98-
"year": year,
99-
"month": month,
100-
}
101-
expected_clause = (
102-
"WHERE source={{source_uuid}} AND year={{year}} AND month={{month}} AND"
103-
" lineitem_productcode = 'AmazonEC2' AND lineitem_lineitemtype IN ('Usage', 'SavingsPlanCoveredUsage') AND"
104-
" product_vcpu != '' AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0"
105-
)
106-
actual_clause, actual_params = self.extractor.determine_where_clause_and_params(year, month)
107-
self.assertEqual(expected_clause, actual_clause)
108-
self.assertEqual(expected_sql_params, actual_params)
109-
11092
@patch("subs.subs_data_extractor.SUBSDataExtractor.bulk_update_latest_processed_time")
11193
@patch("subs.subs_data_extractor.SUBSDataExtractor.gather_and_upload_for_resource_batch")
11294
@patch("subs.subs_data_extractor.SUBSDataExtractor.get_resource_ids_for_usage_account")
@@ -176,9 +158,8 @@ def test_extract_data_to_s3_no_resource_ids_found(
176158

177159
@patch("subs.subs_data_extractor.SUBSDataExtractor.copy_data_to_subs_s3_bucket")
178160
@patch("subs.subs_data_extractor.SUBSDataExtractor._execute_trino_raw_sql_query_with_description")
179-
@patch("subs.subs_data_extractor.SUBSDataExtractor.determine_line_item_count")
180-
@patch("subs.subs_data_extractor.SUBSDataExtractor.determine_where_clause_and_params")
181-
def test_gather_and_upload_for_resource_batch(self, mock_where_clause, mock_li_count, mock_trino, mock_copy):
161+
@patch("subs.subs_data_extractor.SUBSDataExtractor.determine_row_count")
162+
def test_gather_and_upload_for_resource_batch(self, mock_row_count, mock_trino, mock_copy):
182163
"""Test gathering data and uploading it to S3 calls the right functions and returns the right value."""
183164
self.dh.month_start(self.yesterday)
184165
rid = "12345"
@@ -187,28 +168,26 @@ def test_gather_and_upload_for_resource_batch(self, mock_where_clause, mock_li_c
187168
rid_2 = "23456"
188169
start_time = datetime.datetime(2023, 4, 3, tzinfo=datetime.timezone.utc)
189170
end_time = datetime.datetime(2023, 4, 5, tzinfo=datetime.timezone.utc)
190-
batch = [(rid, start_time, end_time), (rid_2, start_time, end_time)]
191-
mock_li_count.return_value = 10
171+
batch = [
172+
{"rid": rid, "start": start_time, "end": end_time},
173+
{"rid": rid_2, "start": start_time, "end": end_time},
174+
]
175+
mock_row_count.return_value = 10
192176
expected_key = "fake_key"
193177
base_filename = "fake_filename"
194178
mock_copy.return_value = expected_key
195179
mock_trino.return_value = (MagicMock(), MagicMock())
196-
mock_where_clause.return_value = (MagicMock(), MagicMock())
197180
upload_keys = self.extractor.gather_and_upload_for_resource_batch(year, month, batch, base_filename)
198-
mock_where_clause.assert_called_once()
199-
mock_li_count.assert_called_once()
181+
mock_row_count.assert_called_once()
200182
mock_trino.assert_called_once()
201183
mock_copy.assert_called_once()
202184
expected_result = [expected_key]
203185
self.assertEqual(expected_result, upload_keys)
204186

205187
@patch("subs.subs_data_extractor.SUBSDataExtractor.copy_data_to_subs_s3_bucket")
206188
@patch("subs.subs_data_extractor.SUBSDataExtractor._execute_trino_raw_sql_query_with_description")
207-
@patch("subs.subs_data_extractor.SUBSDataExtractor.determine_line_item_count")
208-
@patch("subs.subs_data_extractor.SUBSDataExtractor.determine_where_clause_and_params")
209-
def test_gather_and_upload_for_resource_batch_no_result(
210-
self, mock_where_clause, mock_li_count, mock_trino, mock_copy
211-
):
189+
@patch("subs.subs_data_extractor.SUBSDataExtractor.determine_row_count")
190+
def test_gather_and_upload_for_resource_batch_no_result(self, mock_row_count, mock_trino, mock_copy):
212191
"""Test uploading does not attempt with empty values from trino query."""
213192
self.dh.month_start(self.yesterday)
214193
rid = "12345"
@@ -217,16 +196,17 @@ def test_gather_and_upload_for_resource_batch_no_result(
217196
rid_2 = "23456"
218197
start_time = datetime.datetime(2023, 4, 3, tzinfo=datetime.timezone.utc)
219198
end_time = datetime.datetime(2023, 4, 5, tzinfo=datetime.timezone.utc)
220-
batch = [(rid, start_time, end_time), (rid_2, start_time, end_time)]
221-
mock_li_count.return_value = 10
199+
batch = [
200+
{"rid": rid, "start": start_time, "end": end_time},
201+
{"rid": rid_2, "start": start_time, "end": end_time},
202+
]
203+
mock_row_count.return_value = 10
222204
expected_key = "fake_key"
223205
base_filename = "fake_filename"
224206
mock_copy.return_value = expected_key
225207
mock_trino.return_value = ([], [("fake_col1",), ("fake_col2",)])
226-
mock_where_clause.return_value = (MagicMock(), MagicMock())
227208
upload_keys = self.extractor.gather_and_upload_for_resource_batch(year, month, batch, base_filename)
228-
mock_where_clause.assert_called_once()
229-
mock_li_count.assert_called_once()
209+
mock_row_count.assert_called_once()
230210
mock_trino.assert_called_once()
231211
mock_copy.assert_not_called()
232212
self.assertEqual(upload_keys, [])
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
-- Return the distinct usage account IDs present in this source's AWS line
-- items for the given year/month partition, skipping any IDs already
-- claimed by other sources (excluded_ids).
SELECT DISTINCT lineitem_usageaccountid
FROM hive.{{schema | sqlsafe}}.aws_line_items
WHERE source = {{source_uuid}}
    AND year = {{year}}
    AND month = {{month}}
    {% if excluded_ids %}
    AND lineitem_usageaccountid NOT IN {{excluded_ids | inclause}}
    {% endif %}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
-- For one usage account, return each RHEL-tagged EC2 resource ID together
-- with the latest usage timestamp seen for it in the given year/month
-- partition.
SELECT lineitem_resourceid, max(lineitem_usagestartdate)
FROM hive.{{schema | sqlsafe}}.aws_line_items
WHERE source = {{source_uuid}}
    AND year = {{year}}
    AND month = {{month}}
    AND lineitem_productcode = 'AmazonEC2'
    AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0
    AND lineitem_usageaccountid = {{usage_account}}
    {% if excluded_ids %}
    -- excluded_ids are *resource* IDs (SubsLastProcessed.resource_id values
    -- owned by other sources), so the exclusion must filter on
    -- lineitem_resourceid — filtering on lineitem_usageaccountid here would
    -- wrongly drop the whole account (or exclude nothing) instead of the
    -- individual resources.
    AND lineitem_resourceid NOT IN {{excluded_ids | inclause}}
    {% endif %}
GROUP BY lineitem_resourceid
+23
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
-- Count unprocessed RHEL-metered EC2 usage rows for a batch of resources.
-- Each entry in `resources` carries a rid plus the start/end bounds of the
-- usage window still to be processed for that resource; the OR-joined
-- clauses restrict the count to exactly those (resource, window) pairs.
SELECT count(*)
FROM hive.{{schema | sqlsafe}}.aws_line_items
WHERE source = {{ source_uuid }}
    AND year = {{ year }}
    AND month = {{ month }}
    AND lineitem_productcode = 'AmazonEC2'
    AND lineitem_lineitemtype IN ('Usage', 'SavingsPlanCoveredUsage')
    AND product_vcpu != ''
    AND strpos(lower(resourcetags), 'com_redhat_rhel') > 0
    AND (
        {% for item in resources %}
        (
            lineitem_resourceid = {{item.rid}}
            AND lineitem_usagestartdate >= {{item.start}}
            AND lineitem_usagestartdate <= {{item.end}}
        )
        {% if not loop.last %}OR{% endif %}
        {% endfor %}
    )

0 commit comments

Comments
 (0)