
Commit d328773

docstring enforcement for sparklib

Author: Maxwell Dylla (committed)
1 parent d5f6e5d commit d328773

File tree

5 files changed: +26 -10 lines changed


pyproject.toml (+12 -1)
@@ -34,6 +34,7 @@ select = [
     # pyflakes
     "F",
     # pycodestyle
+    "D",
     "E",
     "W",
     # flake8-2020
@@ -86,8 +87,18 @@ ignore = [
     "B006",
     # recommended by Ruff to disable to avoid issues with formatter
     "COM812", "ISC001",
+    # extra rules on documentation strings
+    "D100", "D401", "D203", "D213",
+
 ]
-per-file-ignores = {"__init__.py" = ["F401"]}
+
+[tool.ruff.lint.per-file-ignores]
+"__init__.py" = ["F401"]
+
+# requires docstring rules in sparklib
+"src/pulse_telemetry/sparklib/__init__.py" = ["D"]
+"!src/pulse_telemetry/sparklib/**/*.py" = ["D"]
+

 [tool.ruff.lint.flake8-type-checking]
 quote-annotations = true
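Taken together, these changes select pydocstyle ("D") globally, silence a few convention-level rules (D100, D401, D203, D213), and then use a negated glob in per-file-ignores so that the remaining docstring rules are only enforced inside sparklib. A minimal annotated sketch of the same pattern follows; the comments are explanatory additions, not part of the commit, and the exact table layout is an assumption:

```toml
[tool.ruff.lint]
# pydocstyle ("D") selected alongside the existing pyflakes/pycodestyle rules
select = ["F", "D", "E", "W"]
# convention-level docstring rules disabled project-wide
ignore = ["D100", "D401", "D203", "D213"]

[tool.ruff.lint.per-file-ignores]
"__init__.py" = ["F401"]                              # re-exports are not "unused imports"
"src/pulse_telemetry/sparklib/__init__.py" = ["D"]    # the package __init__ is exempt
"!src/pulse_telemetry/sparklib/**/*.py" = ["D"]       # negated glob: ignore D everywhere except sparklib
```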

src/pulse_telemetry/sparklib/iceberg.py (+11 -8)
@@ -48,6 +48,7 @@ def create_table_if_not_exists(
     Returns
     -------
     None
+
     """
     # Creates the database if it does not exist
     spark.sql(f"CREATE SCHEMA IF NOT EXISTS {catalog_name}.{database_name}")
@@ -91,6 +92,7 @@ def read_table(spark: "SparkSession", catalog_name: str, database_name: str, tab
     -------
     DataFrame
         A PySpark DataFrame representing the table.
+
     """
     return spark.sql(f"SELECT * FROM {catalog_name}.{database_name}.{table_name}")

@@ -125,6 +127,7 @@ def merge_into_table(
     Returns
     -------
     None
+
     """
     source_df.createOrReplaceTempView("source")
     match_condition = " AND ".join([f"target.{col} = source.{col}" for col in match_columns])
@@ -149,8 +152,7 @@ def expire_snapshots(
     retain_last: int,
     max_concurrent_deletes: int = 8,
 ) -> int:
-    """
-    Removes old snapshots from the specified Iceberg table.
+    """Removes old snapshots from the specified Iceberg table.

     Parameters
     ----------
@@ -173,6 +175,7 @@ def expire_snapshots(
     -------
     int
         The number of data files deleted during snapshot expiration.
+
     """
     older_than_str = older_than.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
     return spark.sql(f"""
@@ -193,8 +196,7 @@ def remove_orphan_files(
     older_than: "datetime.datetime",
     max_concurrent_deletes: int = 8,
 ) -> int:
-    """
-    Removes orphaned files from the specified Iceberg table.
+    """Removes orphaned files from the specified Iceberg table.

     Parameters
     ----------
@@ -215,6 +217,7 @@ def remove_orphan_files(
     -------
     int
         The count of orphaned files removed during the operation.
+
     """
     older_than_str = older_than.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
     return spark.sql(f"""
@@ -232,8 +235,7 @@ def rewrite_data_files(
     database_name: str,
     table_name: str,
 ) -> int:
-    """
-    Rewrites data files from the specified Iceberg table.
+    """Rewrites data files from the specified Iceberg table.

     Uses the 'sort' strategy and defaults to the table's sort-order.

@@ -252,6 +254,7 @@ def rewrite_data_files(
     -------
     int
         The sum of the rewritten and new data files.
+
     """
     result = spark.sql(f"""
         CALL {catalog_name}.system.rewrite_data_files(
@@ -268,8 +271,7 @@ def rewrite_manifests(
     database_name: str,
     table_name: str,
 ) -> int:
-    """
-    Rewrites manifest files from the specified Iceberg table.
+    """Rewrites manifest files from the specified Iceberg table.

     Parameters
     ----------
@@ -286,6 +288,7 @@ def rewrite_manifests(
     -------
     int
         The sum of the rewritten and new manifest files.
+
     """
     result = spark.sql(f"""
         CALL {catalog_name}.system.rewrite_manifests(
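The docstring edits in this file, and in the files below, are all instances of the same two fixes required by the newly enabled rules: the summary moves onto the same line as the opening quotes (with D213 ignored, the first-line placement is what the checker accepts), and a blank line is added between the last section and the closing quotes. A minimal sketch of the resulting style on a hypothetical helper, not a function from this module:

```python
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from pyspark.sql import SparkSession


def drop_table(spark: "SparkSession", catalog_name: str, database_name: str, table_name: str) -> None:
    """Drops the specified Iceberg table if it exists.

    Parameters
    ----------
    spark : SparkSession
        The active Spark session.
    catalog_name : str
        Catalog containing the table.
    database_name : str
        Database containing the table.
    table_name : str
        Table to drop.

    Returns
    -------
    None

    """
    # The blank line before the closing quotes mirrors the change applied
    # throughout this commit; the summary sits on the opening-quote line.
    spark.sql(f"DROP TABLE IF EXISTS {catalog_name}.{database_name}.{table_name}")
```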

src/pulse_telemetry/sparklib/processing_incremental.py (+1 -1)
@@ -51,8 +51,8 @@ def processing_incremental(
     -------
     DataFrame
         The DataFrame resulting from applying the aggregation function to the filtered source records.
-    """

+    """
     # Get the adjusted last processed timestamp from the sink DataFrame
     watermark = _adjusted_watermark(
         sink=sink,

src/pulse_telemetry/sparklib/statistics_cycle.py (+1)
@@ -85,6 +85,7 @@ def statistics_cycle(df: "DataFrame") -> "DataFrame":
     ```
     df = df.withWatermark("update_ts", "14 days")
     ```
+
     """
     # Calculating weighted averages using the duration__s column
     time_weighted_avg = lambda col: (F.sum(F.col(col) * F.col("duration__s")) / F.sum("duration__s"))  # noqa: E731

src/pulse_telemetry/sparklib/statistics_step.py (+1)
@@ -88,6 +88,7 @@ def statistics_step(df: "DataFrame") -> "DataFrame":
     ```
     df = df.withWatermark("update_ts", "14 days")
     ```
+
     """
     # Calculating weighted averages using the duration__s column
     time_weighted_avg = lambda col: (F.sum(F.col(col) * F.col("duration__s")) / F.sum("duration__s"))  # noqa: E731
