Commit 5edb15d

Merge pull request #15 from Scout24/CDP-1507_DeltaUtilities
CDP-1507 : Configs and functions for delta lake support
2 parents 0c2c5f0 + b71216b commit 5edb15d

File tree

4 files changed: +91 -1 lines changed

Makefile

+1 -1

@@ -9,7 +9,7 @@ setup-environment:
	$(PYTHON) -m virtualenv env
	. env/bin/activate; \
	pip3 install -r requirements.txt; \
-	pip3 install pyspark==3.1.2
+	pip3 install pyspark==3.2.0

test: setup-environment
	. env/bin/activate; \

requirements.txt

+1

@@ -4,3 +4,4 @@ testfixtures==6.18.3
PyYAML==6.0
importlib-resources==5.4.0
dotmap==1.3.25
+delta-spark==2.0.0
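
The pyspark bump in the Makefile and this new dependency move together: delta-spark 2.0.0 is built against Spark 3.2.x, which is why both pins change in the same commit. For a quick local check outside this repo's session setup, the configure_spark_with_delta_pip helper shipped with delta-spark can wire the matching Delta jars onto a plain builder; a minimal sketch (app name and local usage are illustrative, not part of this commit):

    # Minimal local Delta-enabled session; mirrors the configs added to apps.py below.
    from pyspark.sql import SparkSession
    from delta import configure_spark_with_delta_pip

    builder = SparkSession.builder \
        .appName("delta-local-check") \
        .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
        .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")

    # configure_spark_with_delta_pip pulls in the Delta package matching the installed delta-spark version.
    spark = configure_spark_with_delta_pip(builder).getOrCreate()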

src/pyspark_core_utils/apps.py

+3

@@ -36,6 +36,9 @@ def _init_spark(self):

        return SparkSession \
            .builder \
+           .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
+           .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
+           .config("spark.sql.warehouse.dir", "s3://is24-data-hive-warehouse/") \
            .config(conf=spark_conf) \
            .enableHiveSupport() \
            .getOrCreate()
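
With these builder options in place, every job that builds its session through apps.py gets a Delta-capable session. A minimal round-trip sketch against such a session (the DataFrame and the S3 path below are made-up examples, not part of this commit):

    # Hypothetical smoke test; `spark` is assumed to be the session returned by _init_spark().
    df = spark.createDataFrame([(1, "2022-01-01"), (2, "2022-01-02")], ["id", "dt"])

    # Write as Delta (example path only) ...
    df.write.format("delta").mode("overwrite").save("s3://example-bucket/tmp/delta_smoke_test")

    # ... and read it back through the Delta data source registered by the new configs.
    spark.read.format("delta").load("s3://example-bucket/tmp/delta_smoke_test").show()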

src/pyspark_core_utils/delta_utils.py

+86

@@ -0,0 +1,86 @@
+from delta.tables import DeltaTable
+import re
+
+
+def write_partitioned_data_delta(self, dataframe, partition_name, partition_dates_to_override, write_mode,
+                                 target_base_path):
+    return dataframe \
+        .write.partitionBy(partition_name) \
+        .format("delta") \
+        .option("mergeSchema", "true") \
+        .option("__partition_columns", partition_name) \
+        .option("replaceWhere", "{} in ({})".format(partition_name, ', '.join(
+            map(lambda x: "'{}'".format(x), partition_dates_to_override)))) \
+        .mode(write_mode) \
+        .save(target_base_path)
+
+
+def write_nonpartitioned_data_delta(self, dataframe, write_mode, target_base_path):
+    return dataframe \
+        .write.format("delta") \
+        .option("mergeSchema", "true") \
+        .mode(write_mode) \
+        .save(target_base_path)
+
+
+def compact_delta_table_partitions(self, sparkSession, base_path, partition_name, dates, num_files):
+    return sparkSession.read \
+        .format("delta") \
+        .load(base_path) \
+        .where("{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), dates)))) \
+        .repartition(num_files) \
+        .write \
+        .option("dataChange", "false") \
+        .format("delta") \
+        .mode("overwrite") \
+        .option("replaceWhere", "{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), dates)))) \
+        .save(base_path)
+
+
+def generate_delta_table(self, sparkSession, schema_name, table_name, s3location):
+    sparkSession.sql("create database if not exists {}".format(schema_name))
+    qualified_table_name = f"""{schema_name}.{table_name}"""
+    DeltaTable.createIfNotExists(sparkSession) \
+        .tableName(qualified_table_name) \
+        .location(s3location) \
+        .execute()
+    print(f"Delta table {qualified_table_name} generated")
+
+
+def extract_delta_info_from_path(self, paths):
+    path = paths[0]
+    path_reg_exp = """(.*)/(.*)=(.*)"""
+    match_pattern_to_path = re.match(path_reg_exp, path)
+    if match_pattern_to_path is None:
+        # re.match returns None (it does not raise) when the path is not of the form <base>/<partition>=<value>
+        raise Exception("Can not read {}: base path can not be extracted".format(", ".join(paths)))
+
+    base_path = match_pattern_to_path.group(1)
+    partition_name = match_pattern_to_path.group(2)
+    dates = list(map(lambda p: re.match(path_reg_exp, p).group(3), paths))
+    print(base_path)
+    print(partition_name)
+    print(dates)
+    return (base_path, partition_name, dates)
+
+
+def read_delta_from_s3(self, sparkSession, paths):
+    (base_path, partition_name, dates) = extract_delta_info_from_path(self, paths)
+    df = sparkSession.read \
+        .format("delta") \
+        .load(base_path) \
+        .where("{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), dates))))
+    print(df.count())
+    return df
+
+
+def delta_read_from_basepath(self, sparkSession, base_path):
+    return sparkSession.read \
+        .format("delta") \
+        .load(base_path)
+
+
+def read_delta_table(self, sparkSession, schema_name, table_name, partition_name, partition_dates):
+    qualified_table_name = f"""{schema_name}.{table_name}"""
+    return sparkSession.read.table(qualified_table_name) \
+        .where("{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), partition_dates))))
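
The helpers above all take self as their first parameter but do not otherwise rely on it (the caller supplies the SparkSession explicitly), so they can be mixed into an application class or called as plain functions. A hedged usage sketch (module path, df, spark, bucket, and dates are assumptions for illustration, not part of this commit):

    # Hypothetical wiring of the new helpers; None is passed for the unused `self`.
    from pyspark_core_utils import delta_utils

    dates = ["2022-01-01", "2022-01-02"]

    # Overwrite only the listed partition values of a partitioned Delta table.
    delta_utils.write_partitioned_data_delta(None, df, "dt", dates, "overwrite",
                                             "s3://example-bucket/events")

    # Read exactly those partitions back from their partition paths ...
    df_back = delta_utils.read_delta_from_s3(None, spark,
                                             ["s3://example-bucket/events/dt=2022-01-01",
                                              "s3://example-bucket/events/dt=2022-01-02"])

    # ... or rewrite them into a fixed number of files without changing the data.
    delta_utils.compact_delta_table_partitions(None, spark, "s3://example-bucket/events",
                                               "dt", dates, num_files=1)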
