update error codes

andrewfulton9 · andrewfulton9 · commit f67374f5cb60 · 2025-05-12T23:42:05.000Z
diff --git a/pyproject.toml b/pyproject.toml
@@ -63,29 +63,69 @@ select = [
 ]
 
 ignore = [
-    "D104", # Missing docstring in public package
-    "D100", # Missing docstring in public module
-    "D211", # No blank line before class
-    "PD901", # Avoid using 'df' for pandas dataframes. Perfectly fine in functions with limited scope
+    "D104",   # Missing docstring in public package
+    "D100",   # Missing docstring in public module
+    "D211",   # No blank line before class
+    "PD901",  # Avoid using 'df' for pandas dataframes. Perfectly fine in functions with limited scope
     "ANN201", # Missing return type annotation for public function (makes no sense for NoneType return types...)
     "ANN101", # Missing type annotation for `self`
     "ANN204", # Missing return type annotation for special method
     "ANN002", # Missing type annotation for `*args`
     "ANN003", # Missing type annotation for `**kwargs`
-    "D105", # Missing docstring in magic method
-    "D203", # 1 blank line before after class docstring
-    "D204", # 1 blank line required after class docstring
-    "D413", # 1 blank line after parameters
+    "D105",   # Missing docstring in magic method
+    "D203",   # 1 blank line required before class docstring
+    "D204",   # 1 blank line required after class docstring
+    "D413",   # 1 blank line after parameters
     "SIM108", # Simplify if/else to one line; not always clearer
-    "D206", # Docstrings should be indented with spaces; unnecessary when running ruff-format
-    "E501", # Line length too long; unnecessary when running ruff-format
-    "W191", # Indentation contains tabs; unnecessary when running ruff-format
+    "D206",   # Docstrings should be indented with spaces; unnecessary when running ruff-format
+    "E501",   # Line length too long; unnecessary when running ruff-format
+    "W191",   # Indentation contains tabs; unnecessary when running ruff-format
 
     # REMOVE AFTER FIXING
     "ANN001", # Missing type annotation for function argument `args`
     "ANN202", # Missing Missing return type annotation for private function
-    "D103", # Missing docstring in public function
-    "D101", # Missing docstring in public class
+    "D103",   # Missing docstring in public function
+    "D101",   # Missing docstring in public class
+    "PT009",  # Use a regular `assert` instead of unittest-style `assertEqual`
+    "D102",   # Missing docstring in public method
+    "UP031",  # Use format specifiers instead of percent format
+    "D401",   # First line of docstring should be in imperative mood: "Loads the vocabulary from the specified path."
+    "RET505", # Unnecessary `elif` after `return` statement
+    "D107",   # Missing docstring in `__init__`
+    "PT027",  # Use `pytest.raises` instead of unittest-style `assertRaisesRegex`
+    "SIM101", # Multiple `isinstance` calls for the same object, merge into a single call
+    "FIX002", # Line contains TODO, consider resolving the issue
+    "SIM103", # Return the condition directly
+    "UP008",  # Use `super()` instead of `super(__class__, self)`
+    "N802",   # Function name should be lowercase
+    "B008",   # Do not perform function call in argument defaults; instead, perform the call within the function, or read the default from a module-level singleton variable
+    "E731",   # Do not assign a `lambda` expression, use a `def`
+    "ERA001", # Found commented-out code
+    "B005",   # Using `.strip()` with multi-character strings is misleading
+    "SIM117", # Use a single `with` statement with multiple contexts instead of nested `with` statements
+    "B904",   # Within an `except` clause, raise exceptions with `raise ... from err` or `raise ... from None` to distinguish them from errors in exception handling
+    "ANN401", # Dynamically typed expressions (typing.Any) are disallowed in annotations
+    "D417",   # Missing argument descriptions in the docstring
+    "NPY002", # Replace legacy `np.random.*` call with `np.random.Generator`
+    "ARG001", # Unused function argument
+    "D404",   # First word of the docstring should not be "This"
+    "SIM102", # Use a single `if` statement instead of nested `if` statements
+    "UP028",  # Replace `yield` over `for` loop with `yield from`
+    "RET504", # Unnecessary assignment to variable before `return` statement
+    "PD011",  # Use `.to_numpy()` instead of `.values`
+    "ANN206", # Missing return type annotation for classmethod
+    "ANN102", # Missing type annotation for `cls` in classmethod
+    "PD015",  # Use `.merge` method instead of `pd.merge` function
+    "PD003",  # `.isna` is preferred to `.isnull`; functionality is equivalent
+    "ANN205", # Missing return type annotation for staticmethod
+    "B007",   # Loop control variable not used within loop body
+    "SIM211", # Use `not ...` instead of `False if ... else True`
+    "ARG002", # Unused method argument
+    "PD002",  # `inplace=True` should be avoided; it has inconsistent behavior
+    "F821",   # Undefined name
+    "SIM105", # Use `contextlib.suppress(...)` instead of `try`-`except`-`pass`
+    "PT018",  # Assertion should be broken down into multiple parts
+    "E741",   # Ambiguous variable name
 ]
 
 
diff --git a/tensorflow_data_validation/utils/mutual_information_util_test.py b/tensorflow_data_validation/utils/mutual_information_util_test.py
@@ -389,7 +389,7 @@ def testCategoricalOrdinal(self):
         # using whatever log base we're using, in this case base 2.
         a = np.array([i % 2 for i in range(1000)])
         b = np.array([np.random.random() * (1.0 + i % 2) for i in range(1000)])
-        filt = np.array([True if i % 2 else False for i in range(1000)])
+        filt = np.array([bool(i % 2) for i in range(1000)])
         for method in ["smaller_data", "larger_data"]:
             self.assertAlmostEqual(
                 -0.75 * np.log2(0.75),
diff --git a/tensorflow_data_validation/utils/schema_util.py b/tensorflow_data_validation/utils/schema_util.py
@@ -177,7 +177,7 @@ def set_domain(
 
     for d_type, d_name in feature_domains.items():
         if isinstance(domain, d_type):
-            if d_type == str:
+            if d_type is str:
                 found_domain = False
                 for global_domain in schema.string_domain:
                     if global_domain.name == domain:
diff --git a/tensorflow_data_validation/utils/slicing_util.py b/tensorflow_data_validation/utils/slicing_util.py
@@ -166,7 +166,7 @@ def feature_value_slicer(
                     _PARENT_INDEX_COLUMN: value_parent_indices,
                 }
             )
-            df.drop_duplicates(inplace=True)
+            df = df.drop_duplicates()
             # Filter based on slice values
             if values is not None:
                 df = df.loc[df[feature_name].isin(values)]
@@ -183,8 +183,7 @@ def feature_value_slicer(
         # we expect the merged dataframe to have sorted parent indices per
         # slice key.
         merged_df = functools.reduce(
-            lambda base, update: pd.merge(
-                base,
+            lambda base, update: base.merge(
                 update,
                 how="inner",  # pylint: disable=g-long-lambda
                 on=_PARENT_INDEX_COLUMN,
@@ -224,7 +223,7 @@ def feature_value_slicer(
     return feature_value_slicer
 
 
-def _to_slice_key(feature_value: Any):
+def _to_slice_key(feature_value: Any):  # noqa: ANN401
     """Decode slice key as UTF-8."""
     # For bytes features we try decoding it as utf-8 (and throw an error if
     # fails). This is because in stats proto the slice name (dataset name) is a
@@ -260,8 +259,7 @@ def generate_slices(
     """
     for slice_fn in slice_functions:
         try:
-            for sliced_record_batch in slice_fn(record_batch, **kwargs):
-                yield sliced_record_batch
+            yield from slice_fn(record_batch, **kwargs)
         except Exception as e:
             raise ValueError(
                 "One of the slice_functions %s raised an exception: %s."