Merge pull request #626 from bashtage/improve-typing

bashtage · web-flow · commit c840e31f31f8 · 2024-11-06T10:08:07.000Z
TYP/DOC: Improve typing and docs
diff --git a/examples/asset-pricing_examples.ipynb b/examples/asset-pricing_examples.ipynb
@@ -412,7 +412,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/asset-pricing_formulas.ipynb b/examples/asset-pricing_formulas.ipynb
@@ -139,7 +139,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/iv_absorbing-regression.ipynb b/examples/iv_absorbing-regression.ipynb
@@ -151,7 +151,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/iv_advanced-examples.ipynb b/examples/iv_advanced-examples.ipynb
@@ -570,7 +570,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "nbsphinx": {
    "allow_errors": true
diff --git a/examples/iv_using-formulas.ipynb b/examples/iv_using-formulas.ipynb
@@ -191,7 +191,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/panel_data-formats.ipynb b/examples/panel_data-formats.ipynb
@@ -230,7 +230,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/panel_examples.ipynb b/examples/panel_examples.ipynb
@@ -427,7 +427,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/panel_using-formulas.ipynb b/examples/panel_using-formulas.ipynb
@@ -182,7 +182,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/system_examples.ipynb b/examples/system_examples.ipynb
@@ -39,7 +39,6 @@
    "outputs": [],
    "source": [
     "# Common libraries\n",
-    "%matplotlib inline\n",
     "import numpy as np\n",
     "import pandas as pd\n",
     "import statsmodels.api as sm"
@@ -733,6 +732,7 @@
    "outputs": [],
    "source": [
     "import statsmodels.api as sm\n",
+    "\n",
     "from linearmodels.datasets import french\n",
     "\n",
     "data = french.load()\n",
diff --git a/examples/system_formulas.ipynb b/examples/system_formulas.ipynb
@@ -23,6 +23,7 @@
    "source": [
     "import numpy as np\n",
     "import pandas as pd\n",
+    "\n",
     "from linearmodels.datasets import fringe\n",
     "\n",
     "data = fringe.load()"
@@ -179,7 +180,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/examples/system_three-stage-ls.ipynb b/examples/system_three-stage-ls.ipynb
@@ -353,7 +353,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.7"
   },
   "pycharm": {
    "stem_cell": {
diff --git a/linearmodels/iv/absorbing.py b/linearmodels/iv/absorbing.py
@@ -2,7 +2,8 @@
 
 from collections import defaultdict
 from collections.abc import Hashable, Iterable
-from typing import Any, DefaultDict, TypeVar, Union, cast
+from hashlib import sha256
+from typing import Any, DefaultDict, Union, cast
 import warnings
 
 from numpy import (
@@ -58,23 +59,41 @@
 from linearmodels.shared.utility import DataFrameWrapper, SeriesWrapper
 import linearmodels.typing.data
 
+HAVE_XXHASH = False
 try:
-    from xxhash import xxh64 as hash_func
+    from xxhash import xxh64
+
+    HAVE_XXHASH = True
 except ImportError:
-    from hashlib import sha256 as hash_func
+    pass
 
-Hasher = TypeVar("Hasher", bound=hash_func)
 
+class Hasher:
+    def __init__(self):
+        if HAVE_XXHASH:
+            self._hasher = xxh64()
+            self._use_xx = True
+        else:
+            self._hasher = sha256()
+            self._use_xx = False
 
-_VARIABLE_CACHE: DefaultDict[Hashable, dict[str, ndarray]] = defaultdict(dict)
+    def reset(self) -> None:
+        if self._use_xx:
+            self._hasher = xxh64()
+        else:
+            self._hasher = sha256()  # hashlib hashes lack reset()
 
+    def update(self, data: memoryview) -> None:
+        self._hasher.update(data)
 
-def _reset(hasher: Hasher) -> Hasher:
-    try:
-        hasher.reset()
-        return hasher
-    except AttributeError:
-        return hash_func()
+    def digest(self) -> bytes:
+        return self._hasher.digest()
+
+    def hexdigest(self) -> str:
+        return self._hasher.hexdigest()
+
+
+_VARIABLE_CACHE: DefaultDict[Hashable, dict[str, ndarray]] = defaultdict(dict)
 
 
 def clear_cache() -> None:
@@ -139,8 +158,8 @@ def lsmr_annihilate(
 
         variable_digest = ""
         if use_cache:
-            hasher = hash_func()
-            hasher.update(ascontiguousarray(_y.data))
+            hasher = Hasher()
+            hasher.update(memoryview(ascontiguousarray(_y.data)))
             variable_digest = hasher.hexdigest()
 
         if use_cache and variable_digest in _VARIABLE_CACHE[regressor_hash]:
@@ -153,7 +172,7 @@ def lsmr_annihilate(
     return column_stack(resids)
 
 
-def category_product(cats: linearmodels.typing.data.AnyPandas) -> Series:
+def category_product(cats: linearmodels.typing.AnyPandas) -> Series:
     """
     Construct category from all combination of input categories
 
@@ -171,7 +190,7 @@ def category_product(cats: linearmodels.typing.data.AnyPandas) -> Series:
     """
     if isinstance(cats, Series):
         return cats
-
+    assert isinstance(cats, DataFrame)
     sizes = []
     for c in cats:
         # TODO: Bug in pandas-stubs
@@ -197,7 +216,7 @@ def category_product(cats: linearmodels.typing.data.AnyPandas) -> Series:
     dtype_val = dtype(dtype_str)
     codes = zeros(nobs, dtype=dtype_val)
     cum_size = 0
-    for i, col in enumerate(cats):
+    for i, col_name in enumerate(cats):
         if dtype_str == "int8":
             shift: int8 | int16 | int32 | int64 = int8(cum_size)
         elif dtype_str == "int16":
@@ -206,7 +225,7 @@ def category_product(cats: linearmodels.typing.data.AnyPandas) -> Series:
             shift = int32(cum_size)
         else:  # elif dtype_str == "int64":
             shift = int64(cum_size)
-        cat_codes = asarray(cats[col].cat.codes)
+        cat_codes = asarray(cats[col_name].cat.codes)
         codes += cat_codes.astype(dtype_val) << shift
         cum_size += sizes[i]
 
@@ -236,8 +255,8 @@ def category_interaction(
 
 
 def category_continuous_interaction(
-    cat: linearmodels.typing.data.AnyPandas,
-    cont: linearmodels.typing.data.AnyPandas,
+    cat: linearmodels.typing.AnyPandas,
+    cont: linearmodels.typing.AnyPandas,
     precondition: bool = True,
 ) -> sp.csc_matrix:
     """
@@ -420,21 +439,23 @@ def hash(self) -> list[tuple[str, ...]]:
         Construct a hash that will be invariant for any permutation of
         inputs that produce the same fit when used as regressors"""
         # Sorted hashes of any categoricals
-        hasher = hash_func()
+        hasher = Hasher()
         cat_hashes = []
         cat = self.cat
         for col in cat:
-            hasher.update(ascontiguousarray(self.cat[col].cat.codes.to_numpy().data))
+            hasher.update(
+                memoryview(ascontiguousarray(self.cat[col].cat.codes.to_numpy().data))
+            )
             cat_hashes.append(hasher.hexdigest())
-            hasher = _reset(hasher)
+            hasher.reset()
         sorted_hashes = tuple(sorted(cat_hashes))
 
         hashes = []
         cont = self.cont
         for col in cont:
-            hasher.update(ascontiguousarray(cont[col].to_numpy()).data)
+            hasher.update(memoryview(ascontiguousarray(cont[col].to_numpy()).data))
             hashes.append(sorted_hashes + (hasher.hexdigest(),))
-            hasher = _reset(hasher)
+            hasher.reset()
 
         return sorted(hashes)
 
@@ -531,26 +552,30 @@ def approx_rank(self) -> int:
     @property
     def hash(self) -> tuple[tuple[str, ...], ...]:
         hashes: list[tuple[str, ...]] = []
-        hasher = hash_func()
+        hasher = Hasher()
         if self._cat is not None:
             for col in self._cat:
                 hasher.update(
-                    ascontiguousarray(self._cat[col].cat.codes.to_numpy()).data
+                    memoryview(
+                        ascontiguousarray(self._cat[col].cat.codes.to_numpy()).data
+                    )
                 )
                 hashes.append((hasher.hexdigest(),))
-                hasher = _reset(hasher)
+                hasher.reset()
         if self._cont is not None:
             for col in self._cont:
-                hasher.update(ascontiguousarray(self._cont[col].to_numpy()).data)
+                hasher.update(
+                    memoryview(ascontiguousarray(self._cont[col].to_numpy()).data)
+                )
                 hashes.append((hasher.hexdigest(),))
-                hasher = _reset(hasher)
+                hasher.reset()
         if self._interactions is not None:
             for interact in self._interactions:
                 hashes.extend(interact.hash)
         # Add weight hash if provided
         if self._weights is not None:
-            hasher = hash_func()
-            hasher.update(ascontiguousarray(self._weights.data))
+            hasher = Hasher()
+            hasher.update(memoryview(ascontiguousarray(self._weights.data)))
             hashes.append((hasher.hexdigest(),))
         return tuple(sorted(hashes))
 
@@ -706,7 +731,7 @@ def __init__(
         self._index = self._dependent.rows
         self._method = "Absorbing LS"
 
-        self._const_col = 0
+        self._const_col: int | None = 0
         self._has_constant = False
         self._has_constant_exog = self._check_constant()
         self._constant_absorbed = False
@@ -733,7 +758,7 @@ def _drop_missing(self) -> linearmodels.typing.data.BoolArray:
     def _check_constant(self) -> bool:
         col_delta = ptp(self.exog.ndarray, 0)
         has_constant = npany(col_delta == 0)
-        self._const_col = where(col_delta == 0)[0][0] if has_constant else None
+        self._const_col = int(where(col_delta == 0)[0][0]) if has_constant else None
         return bool(has_constant)
 
     def _check_weights(self) -> None:
diff --git a/linearmodels/iv/data.py b/linearmodels/iv/data.py
@@ -18,9 +18,7 @@
 type_err = "Only ndarrays, DataArrays and Series and DataFrames are supported"
 
 
-def convert_columns(
-    s: pd.Series, drop_first: bool
-) -> linearmodels.typing.data.AnyPandas:
+def convert_columns(s: pd.Series, drop_first: bool) -> linearmodels.typing.AnyPandas:
     if isinstance(s.dtype, pd.CategoricalDtype):
         out = pd.get_dummies(s, drop_first=drop_first)
         # TODO: Remove once pandas typing fixed
@@ -172,7 +170,7 @@ def pandas(self) -> pd.DataFrame:
         return self._pandas
 
     @property
-    def ndarray(self) -> linearmodels.typing.data.NumericArray:
+    def ndarray(self) -> linearmodels.typing.NumericArray:
         """ndarray view of data, always 2d"""
         return self._ndarray
 
diff --git a/linearmodels/panel/_utility.pxi b/linearmodels/panel/_utility.pxi
diff --git a/linearmodels/panel/_utility.pyi b/linearmodels/panel/_utility.pyi
@@ -0,0 +1,6 @@
+from linearmodels.typing.data import IntArray
+
+def _drop_singletons(
+    meta: IntArray,
+    orig_dest: IntArray,
+) -> None: ...
diff --git a/linearmodels/panel/data.py b/linearmodels/panel/data.py
@@ -63,10 +63,10 @@ def __init__(self, df: pandas.DataFrame):
     @classmethod
     def from_array(
         cls,
-        values: linearmodels.typing.data.NumericArray,
-        items: Sequence[linearmodels.typing.data.Label],
-        major_axis: Sequence[linearmodels.typing.data.Label],
-        minor_axis: Sequence[linearmodels.typing.data.Label],
+        values: linearmodels.typing.NumericArray,
+        items: Sequence[linearmodels.typing.Label],
+        major_axis: Sequence[linearmodels.typing.Label],
+        minor_axis: Sequence[linearmodels.typing.Label],
     ) -> _Panel:
         index = list(product(minor_axis, major_axis))
         multi_index = MultiIndex.from_tuples(index)
@@ -103,7 +103,7 @@ def to_frame(self) -> DataFrame:
 
 def convert_columns(
     s: pandas.Series, drop_first: bool
-) -> linearmodels.typing.data.AnyPandas:
+) -> linearmodels.typing.AnyPandas:
     if is_string_dtype(s.dtype) and s.map(lambda v: isinstance(v, str)).all():
         s = s.astype("category")
 
@@ -338,18 +338,18 @@ def nentity(self) -> int:
         return self._n
 
     @property
-    def vars(self) -> list[linearmodels.typing.data.Label]:
+    def vars(self) -> list[linearmodels.typing.Label]:
         """List of variable names"""
         return list(self._frame.columns)
 
     @property
-    def time(self) -> list[linearmodels.typing.data.Label]:
+    def time(self) -> list[linearmodels.typing.Label]:
         """List of time index names"""
         index = self.index
         return list(index.levels[1][index.codes[1]].unique())
 
     @property
-    def entities(self) -> list[linearmodels.typing.data.Label]:
+    def entities(self) -> list[linearmodels.typing.Label]:
         """List of entity index names"""
         index = self.index
         return list(index.levels[0][index.codes[0]].unique())
diff --git a/linearmodels/panel/model.py b/linearmodels/panel/model.py
diff --git a/linearmodels/system/_utility.py b/linearmodels/system/_utility.py
diff --git a/linearmodels/system/model.py b/linearmodels/system/model.py