from delta.tables import DeltaTable
import re


def write_partitioned_data_delta(self, dataframe, partition_name, partition_dates_to_override, write_mode,
                                 target_base_path):
    return dataframe \
        .write.partitionBy(partition_name) \
        .format("delta") \
        .option("mergeSchema", "true") \
        .option("__partition_columns", partition_name) \
        .option("replaceWhere", "{} in ({})".format(partition_name, ', '.join(
            map(lambda x: "'{}'".format(x), partition_dates_to_override)))) \
        .mode(write_mode) \
        .save(target_base_path)
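
# Usage sketch (illustrative names only; "events_df", "event_date" and the S3 path
# below are hypothetical, not part of this module). With these arguments the
# generated replaceWhere predicate is "event_date in ('2021-01-01', '2021-01-02')",
# so only those partition values are replaced on overwrite:
#
#     write_partitioned_data_delta(self, events_df, "event_date",
#                                  ["2021-01-01", "2021-01-02"], "overwrite",
#                                  "s3://my-bucket/delta/events")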


def write_nonpartitioned_data_delta(self, dataframe, write_mode, target_base_path):
    return dataframe \
        .write.format("delta") \
        .option("mergeSchema", "true") \
        .mode(write_mode) \
        .save(target_base_path)


def compact_delta_table_partitions(self, sparkSession, base_path, partition_name, dates, num_files):
    return sparkSession.read \
        .format("delta") \
        .load(base_path) \
        .where("{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), dates)))) \
        .repartition(num_files) \
        .write \
        .option("dataChange", "false") \
        .format("delta") \
        .mode("overwrite") \
        .option("replaceWhere", "{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), dates)))) \
        .save(base_path)
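
# Note: .option("dataChange", "false") marks the rewritten files as a pure
# rearrangement (compaction), so streaming consumers of this Delta table should
# not see them as new data. Usage sketch (hypothetical values):
#
#     compact_delta_table_partitions(self, spark, "s3://my-bucket/delta/events",
#                                    "event_date", ["2021-01-01"], num_files=10)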


def generate_delta_table(self, sparkSession, schema_name, table_name, s3location):
    sparkSession.sql("create database if not exists {}".format(schema_name))
    qualified_table_name = f"{schema_name}.{table_name}"
    DeltaTable.createIfNotExists(sparkSession) \
        .tableName(qualified_table_name) \
        .location(s3location) \
        .execute()
    print(f"Delta table {qualified_table_name} generated")
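
# Usage sketch (hypothetical names): registers "analytics.events" in the metastore
# as an external Delta table pointing at an existing Delta location:
#
#     generate_delta_table(self, spark, "analytics", "events",
#                          "s3://my-bucket/delta/events")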


def extract_delta_info_from_path(self, paths):
    # Derive (base_path, partition_name, partition_values) from paths shaped like
    # <base_path>/<partition_name>=<value>.
    path = paths[0]
    path_reg_exp = r"(.*)/(.*)=(.*)"
    match_pattern_to_path = re.match(path_reg_exp, path)
    if match_pattern_to_path is None:
        raise Exception("Can not read {}: base path can not be extracted".format(", ".join(paths)))

    base_path = match_pattern_to_path.group(1)
    partition_name = match_pattern_to_path.group(2)
    dates = [re.match(path_reg_exp, p).group(3) for p in paths]
    print(base_path)
    print(partition_name)
    print(dates)
    return (base_path, partition_name, dates)
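
# Example (hypothetical paths): for
#     paths = ["s3://my-bucket/delta/events/event_date=2021-01-01",
#              "s3://my-bucket/delta/events/event_date=2021-01-02"]
# this returns
#     ("s3://my-bucket/delta/events", "event_date", ["2021-01-01", "2021-01-02"])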


def read_delta_from_s3(self, sparkSession, paths):
    (base_path, partition_name, dates) = extract_delta_info_from_path(self, paths)
    df = sparkSession.read \
        .format("delta") \
        .load(base_path) \
        .where("{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), dates))))
    print(df.count())
    return df


def delta_read_from_basepath(self, sparkSession, base_path):
    return sparkSession.read \
        .format("delta") \
        .load(base_path)


def read_delta_table(self, sparkSession, schema_name, table_name, partition_name, partition_dates):
    qualified_table_name = f"{schema_name}.{table_name}"
    return sparkSession.read.table(qualified_table_name) \
        .where("{} in ({})".format(partition_name, ', '.join(map(lambda x: "'{}'".format(x), partition_dates))))
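
# Usage sketch (hypothetical names): read only the listed partition values from a
# registered Delta table:
#
#     df = read_delta_table(self, spark, "analytics", "events", "event_date",
#                           ["2021-01-01", "2021-01-02"])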