-
Notifications
You must be signed in to change notification settings - Fork 13
/
Copy pathkernel.py
105 lines (86 loc) · 3.1 KB
/
kernel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
"""Contains functions for molecular similarity."""
from typing import Union
import numpy as np
import numpy.typing as npt
from scipy import sparse
def tanimoto_similarity_sparse(
    matrix_a: sparse.csr_matrix, matrix_b: sparse.csr_matrix
) -> npt.NDArray[np.float64]:
    """Calculate a matrix of tanimoto similarities between feature matrix a and b.

    For each pair of rows the similarity is ``intersection / union``, where the
    intersection is the dot product of the two rows and the union is the sum of
    both row sums minus the intersection. Pairs whose union is zero (e.g. two
    all-zero rows) get a similarity of 0 instead of triggering a division by zero.

    Parameters
    ----------
    matrix_a: sparse.csr_matrix
        Feature matrix A.
    matrix_b: sparse.csr_matrix
        Feature matrix B. Must have the same number of columns as A.

    Returns
    -------
    npt.NDArray[np.float64]
        Matrix of similarity values between instances of A (rows/first dim),
        and instances of B (columns/second dim).
    """
    # NOTE(review): row sums are used as the per-row norm, which matches the
    # number of set bits only for binary features - confirm inputs are binary.
    intersection = matrix_a.dot(matrix_b.transpose()).toarray()

    # Sparse matrices return an (n, 1) np.matrix from sum(axis=1), while sparse
    # arrays return a 1-D ndarray. Reshaping to an explicit column vector keeps
    # the broadcasting below correct for both container flavours.
    norm_1 = np.asarray(matrix_a.sum(axis=1)).reshape(-1, 1)
    if matrix_a is matrix_b:
        # avoid calculating the same norm twice
        norm_2 = norm_1
    else:
        norm_2 = np.asarray(matrix_b.sum(axis=1)).reshape(-1, 1)

    # (n, 1) + (1, m) broadcasts to the (n, m) shape of the intersection.
    union = norm_1 + norm_2.T - intersection

    # avoid division by zero https://stackoverflow.com/a/37977222
    return np.divide(
        intersection,
        union,
        out=np.zeros(intersection.shape, dtype=float),
        where=union != 0,
    )
def tanimoto_distance_sparse(
    matrix_a: sparse.csr_matrix, matrix_b: sparse.csr_matrix
) -> npt.NDArray[np.float64]:
    """Calculate a matrix of tanimoto distances between feature matrix a and b.

    The distance is the complement of the similarity, i.e. ``1 - similarity``.

    Parameters
    ----------
    matrix_a: sparse.csr_matrix
        Feature matrix A.
    matrix_b: sparse.csr_matrix
        Feature matrix B.

    Returns
    -------
    npt.NDArray[np.float64]
        Matrix of distance values between instances of A (rows/first dim),
        and instances of B (columns/second dim).
    """
    similarity = tanimoto_similarity_sparse(matrix_a, matrix_b)
    return 1 - similarity  # type: ignore
def self_tanimoto_similarity(
    matrix_a: Union[sparse.csr_matrix, npt.NDArray[np.int_]],
) -> npt.NDArray[np.float64]:
    """Calculate the pairwise tanimoto similarity of a feature matrix with itself.

    Dense numpy input is converted to a CSR matrix first; CSR input is used
    as is.

    Parameters
    ----------
    matrix_a: Union[sparse.csr_matrix, npt.NDArray[np.int_]]
        Feature matrix.

    Returns
    -------
    npt.NDArray[np.float64]
        Square matrix of similarity values between all instances in the matrix.

    Raises
    ------
    TypeError
        If the input is neither a numpy array nor a CSR matrix.
    """
    # Reject anything that is neither dense numpy nor CSR up front.
    if not isinstance(matrix_a, (np.ndarray, sparse.csr_matrix)):
        raise TypeError(f"Unsupported type: {type(matrix_a)}")

    features = (
        sparse.csr_matrix(matrix_a) if isinstance(matrix_a, np.ndarray) else matrix_a
    )
    # Passing the same object twice lets the similarity routine reuse the norm.
    return tanimoto_similarity_sparse(features, features)
def self_tanimoto_distance(
    matrix_a: Union[sparse.csr_matrix, npt.NDArray[np.int_]],
) -> npt.NDArray[np.float64]:
    """Calculate the pairwise tanimoto distance of a feature matrix with itself.

    The distance is the complement of the similarity, i.e. ``1 - similarity``.

    Parameters
    ----------
    matrix_a: Union[sparse.csr_matrix, npt.NDArray[np.int_]]
        Feature matrix.

    Returns
    -------
    npt.NDArray[np.float64]
        Square matrix of distance values between all instances in the matrix.
    """
    similarity = self_tanimoto_similarity(matrix_a)
    return 1 - similarity  # type: ignore