2
2
3
3
from collections import defaultdict
4
4
from collections .abc import Hashable , Iterable
5
- from typing import Any , DefaultDict , TypeVar , Union , cast
5
+ from hashlib import sha256
6
+ from typing import Any , DefaultDict , Union , cast
6
7
import warnings
7
8
8
9
from numpy import (
58
59
from linearmodels .shared .utility import DataFrameWrapper , SeriesWrapper
59
60
import linearmodels .typing .data
60
61
62
# xxhash is an optional accelerator. When it is not installed, Hasher falls
# back to hashlib.sha256, so the flag records which backend is available.
HAVE_XXHASH = False
try:
    from xxhash import xxh64

    HAVE_XXHASH = True
except ImportError:
    pass


class Hasher:
    """
    Uniform wrapper around the hashing backend used to build cache keys.

    Uses ``xxhash.xxh64`` when the optional ``xxhash`` package is importable
    and ``hashlib.sha256`` otherwise. Only the small subset of the hash-object
    API needed by this module (``update``, ``digest``, ``hexdigest`` and
    ``reset``) is exposed.
    """

    def __init__(self) -> None:
        # Record the backend at construction so reset() rebuilds the same kind.
        self._use_xx = HAVE_XXHASH
        self._hasher = self._new_hasher()

    def _new_hasher(self):
        """Return a fresh, empty hash object for the selected backend."""
        return xxh64() if self._use_xx else sha256()

    def reset(self) -> None:
        """
        Discard all data fed so far and return to the empty state.

        hashlib hash objects do not provide a ``reset`` method, so a new
        backend object is created instead of resetting in place. This works
        identically for both backends.
        """
        self._hasher = self._new_hasher()

    def update(self, data: memoryview) -> None:
        """
        Feed ``data`` into the running hash.

        Parameters
        ----------
        data : memoryview
            Buffer whose bytes are added to the hash state.
        """
        self._hasher.update(data)

    def digest(self) -> bytes:
        """Return the digest of the data fed so far as raw bytes."""
        return self._hasher.digest()

    def hexdigest(self) -> str:
        """Return the digest of the data fed so far as a hexadecimal string."""
        return self._hasher.hexdigest()
94
+
95
+
96
+ _VARIABLE_CACHE : DefaultDict [Hashable , dict [str , ndarray ]] = defaultdict (dict )
78
97
79
98
80
99
def clear_cache () -> None :
@@ -139,8 +158,8 @@ def lsmr_annihilate(
139
158
140
159
variable_digest = ""
141
160
if use_cache :
142
- hasher = hash_func ()
143
- hasher .update (ascontiguousarray (_y .data ))
161
+ hasher = Hasher ()
162
+ hasher .update (memoryview ( ascontiguousarray (_y .data ) ))
144
163
variable_digest = hasher .hexdigest ()
145
164
146
165
if use_cache and variable_digest in _VARIABLE_CACHE [regressor_hash ]:
@@ -153,7 +172,7 @@ def lsmr_annihilate(
153
172
return column_stack (resids )
154
173
155
174
156
- def category_product (cats : linearmodels .typing .data . AnyPandas ) -> Series :
175
+ def category_product (cats : linearmodels .typing .AnyPandas ) -> Series :
157
176
"""
158
177
Construct category from all combination of input categories
159
178
@@ -171,7 +190,7 @@ def category_product(cats: linearmodels.typing.data.AnyPandas) -> Series:
171
190
"""
172
191
if isinstance (cats , Series ):
173
192
return cats
174
-
193
+ assert isinstance ( cats , DataFrame )
175
194
sizes = []
176
195
for c in cats :
177
196
# TODO: Bug in pandas-stubs
@@ -197,7 +216,7 @@ def category_product(cats: linearmodels.typing.data.AnyPandas) -> Series:
197
216
dtype_val = dtype (dtype_str )
198
217
codes = zeros (nobs , dtype = dtype_val )
199
218
cum_size = 0
200
- for i , col in enumerate (cats ):
219
+ for i , col_name in enumerate (cats ):
201
220
if dtype_str == "int8" :
202
221
shift : int8 | int16 | int32 | int64 = int8 (cum_size )
203
222
elif dtype_str == "int16" :
@@ -206,7 +225,7 @@ def category_product(cats: linearmodels.typing.data.AnyPandas) -> Series:
206
225
shift = int32 (cum_size )
207
226
else : # elif dtype_str == "int64":
208
227
shift = int64 (cum_size )
209
- cat_codes = asarray (cats [col ].cat .codes )
228
+ cat_codes = asarray (cats [col_name ].cat .codes )
210
229
codes += cat_codes .astype (dtype_val ) << shift
211
230
cum_size += sizes [i ]
212
231
@@ -236,8 +255,8 @@ def category_interaction(
236
255
237
256
238
257
def category_continuous_interaction (
239
- cat : linearmodels .typing .data . AnyPandas ,
240
- cont : linearmodels .typing .data . AnyPandas ,
258
+ cat : linearmodels .typing .AnyPandas ,
259
+ cont : linearmodels .typing .AnyPandas ,
241
260
precondition : bool = True ,
242
261
) -> sp .csc_matrix :
243
262
"""
@@ -420,21 +439,23 @@ def hash(self) -> list[tuple[str, ...]]:
420
439
Construct a hash that will be invariant for any permutation of
421
440
inputs that produce the same fit when used as regressors"""
422
441
# Sorted hashes of any categoricals
423
- hasher = hash_func ()
442
+ hasher = Hasher ()
424
443
cat_hashes = []
425
444
cat = self .cat
426
445
for col in cat :
427
- hasher .update (ascontiguousarray (self .cat [col ].cat .codes .to_numpy ().data ))
446
+ hasher .update (
447
+ memoryview (ascontiguousarray (self .cat [col ].cat .codes .to_numpy ().data ))
448
+ )
428
449
cat_hashes .append (hasher .hexdigest ())
429
- hasher = _reset ( hasher )
450
+ hasher . reset ( )
430
451
sorted_hashes = tuple (sorted (cat_hashes ))
431
452
432
453
hashes = []
433
454
cont = self .cont
434
455
for col in cont :
435
- hasher .update (ascontiguousarray (cont [col ].to_numpy ()).data )
456
+ hasher .update (memoryview ( ascontiguousarray (cont [col ].to_numpy ()).data ) )
436
457
hashes .append (sorted_hashes + (hasher .hexdigest (),))
437
- hasher = _reset ( hasher )
458
+ hasher . reset ( )
438
459
439
460
return sorted (hashes )
440
461
@@ -531,26 +552,30 @@ def approx_rank(self) -> int:
531
552
@property
532
553
def hash (self ) -> tuple [tuple [str , ...], ...]:
533
554
hashes : list [tuple [str , ...]] = []
534
- hasher = hash_func ()
555
+ hasher = Hasher ()
535
556
if self ._cat is not None :
536
557
for col in self ._cat :
537
558
hasher .update (
538
- ascontiguousarray (self ._cat [col ].cat .codes .to_numpy ()).data
559
+ memoryview (
560
+ ascontiguousarray (self ._cat [col ].cat .codes .to_numpy ()).data
561
+ )
539
562
)
540
563
hashes .append ((hasher .hexdigest (),))
541
- hasher = _reset ( hasher )
564
+ hasher . reset ( )
542
565
if self ._cont is not None :
543
566
for col in self ._cont :
544
- hasher .update (ascontiguousarray (self ._cont [col ].to_numpy ()).data )
567
+ hasher .update (
568
+ memoryview (ascontiguousarray (self ._cont [col ].to_numpy ()).data )
569
+ )
545
570
hashes .append ((hasher .hexdigest (),))
546
- hasher = _reset ( hasher )
571
+ hasher . reset ( )
547
572
if self ._interactions is not None :
548
573
for interact in self ._interactions :
549
574
hashes .extend (interact .hash )
550
575
# Add weight hash if provided
551
576
if self ._weights is not None :
552
- hasher = hash_func ()
553
- hasher .update (ascontiguousarray (self ._weights .data ))
577
+ hasher = Hasher ()
578
+ hasher .update (memoryview ( ascontiguousarray (self ._weights .data ) ))
554
579
hashes .append ((hasher .hexdigest (),))
555
580
return tuple (sorted (hashes ))
556
581
@@ -706,7 +731,7 @@ def __init__(
706
731
self ._index = self ._dependent .rows
707
732
self ._method = "Absorbing LS"
708
733
709
- self ._const_col = 0
734
+ self ._const_col : int | None = 0
710
735
self ._has_constant = False
711
736
self ._has_constant_exog = self ._check_constant ()
712
737
self ._constant_absorbed = False
@@ -733,7 +758,7 @@ def _drop_missing(self) -> linearmodels.typing.data.BoolArray:
733
758
def _check_constant (self ) -> bool :
734
759
col_delta = ptp (self .exog .ndarray , 0 )
735
760
has_constant = npany (col_delta == 0 )
736
- self ._const_col = where (col_delta == 0 )[0 ][0 ] if has_constant else None
761
+ self ._const_col = int ( where (col_delta == 0 )[0 ][0 ]) if has_constant else None
737
762
return bool (has_constant )
738
763
739
764
def _check_weights (self ) -> None :
0 commit comments