initial support for sklearn conversion
MainRo committed Nov 4, 2024
1 parent cd6e968 commit fc10165
Showing 10 changed files with 303 additions and 24 deletions.
1 change: 1 addition & 0 deletions ebm2onnx/__init__.py
@@ -5,3 +5,4 @@
__version__ = '3.2.0'

from .convert import to_onnx, get_dtype_from_pandas
from . import sklearn
54 changes: 51 additions & 3 deletions ebm2onnx/convert.py
@@ -1,3 +1,6 @@
from typing import List
import logging
from enum import Enum
from copy import deepcopy
from .utils import get_latest_opset_version
from ebm2onnx import graph
@@ -18,12 +21,25 @@
'str': onnx.TensorProto.STRING,
}

np_type_for = {
'bool': bool,
'float': np.float32,
'double': np.double,
'int': int,
'str': str,
}

bool_remap = {
'False': '0',
'True': '1',
}


class FeatureType(Enum):
COLUMN = 1
TENSOR = 2


def infer_features_dtype(dtype, feature_name):
feature_dtype = onnx.TensorProto.DOUBLE
if dtype is not None:
@@ -66,6 +82,16 @@ def get_dtype_from_pandas(df):
return dtype


def get_dtype_from_tensor_type(
dtype: str,
features: List[str]
):
return {
f: dtype
for f in features
}


def to_graph(model, dtype, name="ebm",
predict_proba=False,
explain=False,
@@ -99,6 +125,18 @@ def to_graph(model, dtype, name="ebm",
inputs = [None for _ in model.feature_names_in_]
parts = []

if type(dtype) is tuple:
dname, dtype = dtype
logging.debug(f"using tensor-based input {dtype} of len {len(model.feature_names_in_)}")
features_org = FeatureType.TENSOR
tensor_inputs = graph.create_input(root, dname, onnx_type_for[dtype], [None, len(model.feature_names_in_)])
tensor_inputs = ebm.split_input(model.feature_names_in_)(tensor_inputs)
tensor_inputs = graph.clear_transients(tensor_inputs)
dtype = get_dtype_from_tensor_type(dtype, model.feature_names_in_)
else:
logging.debug(f"using column-based inputs {model.feature_names_in_}")
features_org = FeatureType.COLUMN

feature_types = list(model.feature_types_in_)
interaction_count = len(model.term_names_) - len(feature_types)
for _ in range(interaction_count):
@@ -115,9 +153,12 @@ def to_graph(model, dtype, name="ebm",
if feature_type == 'continuous':
bins = [-np.inf, -np.inf] + list(model_bins[feature_group[0]][0])
additive_terms = model.term_scores_[feature_index]

feature_dtype = infer_features_dtype(dtype, feature_name)
part = graph.create_input(root, feature_name, feature_dtype, [None])

if features_org == FeatureType.TENSOR:
part = graph.create_transient_by_name(root, feature_name, feature_dtype, [None])
else:
part = graph.create_input(root, feature_name, feature_dtype, [None])
part = ops.flatten()(part)
inputs[feature_index] = part
part = ebm.get_bin_index_on_continuous_value(bins)(part)
@@ -129,6 +170,9 @@ def to_graph(model, dtype, name="ebm",
additive_terms = model.term_scores_[feature_index]

feature_dtype = infer_features_dtype(dtype, feature_name)
if features_org == FeatureType.TENSOR:
raise ValueError("tensor-based inputs are not supported with nominal/ordinal features")

part = graph.create_input(root, feature_name, feature_dtype, [None])
if feature_dtype == onnx.TensorProto.BOOL:
# ONNX converts booleans to strings 0/1, not False/True
@@ -180,7 +224,11 @@ def to_graph(model, dtype, name="ebm",
raise ValueError(f"The type of the feature {feature_name} is unknown: {feature_type}")

# compute scores, predict and proba
g = graph.merge(*parts)
if features_org == FeatureType.TENSOR:
g = graph.merge(tensor_inputs, *parts)
else:
g = graph.merge(*parts)

if type(model) is ExplainableBoostingClassifier:
class_type = onnx.TensorProto.STRING if model.classes_.dtype.type is np.str_ else onnx.TensorProto.INT64
classes = model.classes_
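With the tuple form of dtype handled above, to_graph can now be fed a single feature tensor instead of one input per column. A minimal sketch of the intended call, mirroring the new test added in tests/test_convert.py further down (the input name "data" and the fitted model_ebm are illustrative):

import ebm2onnx

model_onnx = ebm2onnx.to_onnx(
    model_ebm,                # a fitted ExplainableBoostingClassifier
    dtype=("data", "float"),  # one float tensor named "data" holding all features
)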
28 changes: 28 additions & 0 deletions ebm2onnx/ebm.py
@@ -5,6 +5,34 @@
import ebm2onnx.graph as graph


def split_input(ebm_features):
"""
Transients:
- features as a single tensor
"""
def _split_input(g):
init_reshape = graph.create_initializer(
g, "reshape", onnx.TensorProto.INT64,
[1], [0],
)
init_reshape = graph.clear_transients(init_reshape)

g = graph.merge(g, init_reshape)
g = ops.split(axis=1)(g)
splits = g.transients

for index, t in enumerate(splits):
g = graph.clear_transients(g)
g = graph.add_transient_by_name(g, t.name)
g = graph.add_transient_by_name(g, init_reshape.initializers[0].name)
g = ops.reshape()(g)
g = ops.identity(ebm_features[index], suffix=False)(g)

return g

return _split_input


def get_bin_index_on_continuous_value(bin_edges):
"""
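Conceptually, split_input slices a [batch, n_features] tensor along axis 1, reshapes each slice to a flat [batch] vector, and renames it after the corresponding EBM feature so the rest of the graph keeps addressing features by name. A rough numpy equivalent of that data movement (a sketch for illustration only; the feature names are made up):

import numpy as np

data = np.array([[22.0, 7.25], [38.0, 71.28]], dtype=np.float32)  # shape [batch, 2]
columns = np.split(data, data.shape[1], axis=1)                   # two [batch, 1] slices
features = {name: col.reshape(-1) for name, col in zip(["Age", "Fare"], columns)}
# features["Age"] -> array([22., 38.], dtype=float32)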
8 changes: 3 additions & 5 deletions ebm2onnx/operators.py
@@ -400,22 +400,20 @@ def _softmax(g):

def split(axis=0):
def _split(g):
#print(g.transients[0])
#print(list(g.transients[0].type.tensor_type.shape.dim)[1].dim_value)
#print("foooo")
split_result_name = [
g.context.generate_variable_name('split_result')
for _ in range(list(g.transients[0].type.tensor_type.shape.dim)[axis].dim_value)
]
print(split_result_name)

nodes = [
onnx.helper.make_node(
op_type="Split",
inputs=[g.transients[0].name],
outputs=split_result_name,
name=g.context.generate_operator_name('Split'),
axis=axis,
),
num_outputs=len(split_result_name),
),
]

return g._replace(
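For context on the Split change above: starting with opset 18 the ONNX Split operator no longer infers its output count from the number of output names, so num_outputs must be passed explicitly whenever the optional 'split' input is omitted; this lines up with the opset cap in ebm2onnx/utils.py moving from 13 to 21 below. A minimal standalone node sketch (input/output names and the count are illustrative):

import onnx

node = onnx.helper.make_node(
    op_type="Split",
    inputs=["features"],
    outputs=["col_0", "col_1", "col_2"],
    name="split_features",
    axis=1,
    num_outputs=3,  # mandatory since Split-18 when no 'split' input is given
)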
44 changes: 44 additions & 0 deletions ebm2onnx/sklearn.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from skl2onnx.common.data_types import Int64TensorType, FloatTensorType, StringTensorType
from . import context
from . import convert

import onnx


def ebm_output_shape_calculator(operator):
op = operator.raw_operator

operator.outputs[0].type = Int64TensorType([None]) # label
operator.outputs[1].type = FloatTensorType([None, len(op.classes_)]) # probabilities


def convert_ebm_classifier(scope, operator, container):
"""Converts an EBM model to ONNX with sklearn-onnx
"""
op = operator.raw_operator

input_name = operator.inputs[0].onnx_name
ctx = context.create(
generate_variable_name=scope.get_unique_variable_name,
generate_operator_name=scope.get_unique_operator_name,
)

g = convert.to_graph(
op, dtype=(input_name, 'float'),
name="ebm",
predict_proba=True,
prediction_name="label",
probabilities_name="probabilities",
context=ctx
)

for node in g.nodes:
v = container._get_op_version(node.domain, node.op_type)
container.node_domain_version_pair_sets.add((node.domain, v))

container.nodes.extend(g.nodes)

for i in g.initializers:
content = i.SerializeToString()
container.initializers_strings[content] = i.name
container.initializers.append(i)
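The shape calculator and converter above target skl2onnx's plugin interface. A minimal sketch of how they could be registered and used (the alias, input name, fitted_ebm and n_features are assumptions, and exact registration options such as zipmap handling may differ):

from skl2onnx import convert_sklearn, update_registered_converter
from skl2onnx.common.data_types import FloatTensorType
from interpret.glassbox import ExplainableBoostingClassifier
from ebm2onnx.sklearn import convert_ebm_classifier, ebm_output_shape_calculator

update_registered_converter(
    ExplainableBoostingClassifier,
    "Ebm2OnnxEBMClassifier",  # alias: any unique name works
    ebm_output_shape_calculator,
    convert_ebm_classifier,
)

onx = convert_sklearn(
    fitted_ebm,  # an already trained ExplainableBoostingClassifier
    initial_types=[("data", FloatTensorType([None, n_features]))],
)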
2 changes: 1 addition & 1 deletion ebm2onnx/utils.py
@@ -9,4 +9,4 @@ def get_latest_opset_version():
version specified by *onnx* package if this one is lower
(return by `onnx.defs.onnx_opset_version()`).
"""
return min(13, defs.onnx_opset_version())
return min(21, defs.onnx_opset_version())
54 changes: 53 additions & 1 deletion tests/test_convert.py
@@ -42,8 +42,37 @@ def train_titanic_binary_classification(interactions=0, with_categorical=False,
return model, x_test, y_test


def train_titanic_binary_classification_tensor(
interactions=0,
old_th=65
):
df = pd.read_csv(
os.path.join('examples', 'titanic_train.csv'),
)
df = df.dropna()
df['Old'] = df['Age'] > old_th
feature_types = ['continuous', 'continuous', 'continuous', 'continuous']
feature_columns = ['Age', 'Fare', 'Pclass', 'Old']
label_column = "Survived"

y = df[[label_column]]
le = LabelEncoder()
y_enc = le.fit_transform(y)
x = df[feature_columns]
x_train, x_test, y_train, y_test = train_test_split(x, y_enc)
model = ExplainableBoostingClassifier(
interactions=interactions,
feature_types=feature_types
)
print(x_train.dtypes)
print(x_train.values.astype(np.float32).dtype)
model.fit(x_train.values.astype(np.float32), y_train)

return model, x_test, y_test


def train_titanic_regression(interactions):
df = pd.read_csv(os.path.join('examples','titanic_train.csv'))
df = pd.read_csv(os.path.join('examples', 'titanic_train.csv'))
df = df.dropna()
feature_columns = ['SibSp', 'Fare', 'Pclass']
label_column = "Age"
@@ -111,6 +140,29 @@ def test_predict_binary_classification(interactions, explain):
assert np.allclose(pred_ebm, pred_onnx[0])


@pytest.mark.parametrize("explain", [False, True])
@pytest.mark.parametrize("interactions", [0, 2, [(0, 1, 2)], [(0, 1, 2, 3)]])
def test_predict_binary_classification_tensor(interactions, explain):
model_ebm, x_test, y_test = train_titanic_binary_classification_tensor(
interactions=interactions
)
pred_ebm = model_ebm.predict(x_test.values)

model_onnx = ebm2onnx.to_onnx(
model_ebm,
explain=explain,
dtype=('data', 'float')
)

pred_onnx = infer_model(model_onnx, {
'data': x_test.values.astype(np.float32),
})

if explain is True:
assert len(pred_onnx) == 2
assert np.allclose(pred_ebm, pred_onnx[0])


@pytest.mark.parametrize("explain", [False, True])
@pytest.mark.parametrize("interactions", [0, 2, [(0, 1, 2)], [(0, 1, 2, 3)]])
def test_predict_proba_binary_classification(interactions, explain):
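The infer_model helper used by these tests is not shown in this diff; end to end, the new tensor-input test boils down to something like the onnxruntime snippet below (a sketch assuming the graph's single input is named "data", as in the test above):

import numpy as np
import onnxruntime as rt

sess = rt.InferenceSession(model_onnx.SerializeToString())
outputs = sess.run(None, {"data": x_test.values.astype(np.float32)})
labels = outputs[0]  # predicted classes; extra outputs appear when explain or predict_proba is enabled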
26 changes: 13 additions & 13 deletions tests/test_ebm.py
@@ -13,8 +13,8 @@ def test_get_bin_index_on_continuous_value():

g = ebm.get_bin_index_on_continuous_value([-np.inf, -np.inf, 0.2, 0.7, 1.2, 4.3])(i)
g = graph.add_output(g, g.transients[0].name, onnx.TensorProto.INT64, [None, 1])
assert_model_result(g,

assert_model_result(g,
input={
'i': [
[1.3],
@@ -66,8 +66,8 @@ def test_get_bin_score_1d():

g = ebm.get_bin_score_1d(np.array([0.0, 0.1, 0.2, 0.3]))(i)
g = graph.add_output(g, g.transients[0].name, onnx.TensorProto.FLOAT, [None, 1, 1])
assert_model_result(g,

assert_model_result(g,
input={
'i': [
[3],
@@ -122,6 +122,7 @@ def test_get_bin_score_1d_multiclass():
]],
)


def test_get_bin_score_2d():
g = graph.create_graph()
i1 = graph.create_input(g, "i1", onnx.TensorProto.INT64, [None, 1])
@@ -134,8 +135,8 @@ def test_get_bin_score_2d():
[10.0, 20.1, 30.2, 40.3],
]))(i)
g = graph.add_output(g, g.transients[0].name, onnx.TensorProto.FLOAT, [None, 1, 1])
assert_model_result(g,

assert_model_result(g,
input={
'i1': [[2], [1], [2], [0]],
'i2': [[3], [0], [2], [1]],
@@ -158,8 +159,8 @@ def test_compute_class_score():
i = graph.merge(i1, i2, i3)
g, _ = ebm.compute_class_score(np.array([0.2]), explain_name="scores")(i)
g = graph.add_output(g, g.transients[0].name, onnx.TensorProto.FLOAT, [None, 1])
assert_model_result(g,

assert_model_result(g,
input={
'i1': [[[0.1]], [[0.2]], [[0.3]], [[0.4]]],
'i2': [[[1.1]], [[1.2]], [[1.3]], [[1.4]]],
@@ -220,8 +221,8 @@ def test_predict_class_binary():
binary=True, prediction_name="prediction"
)(i)
g = graph.add_output(g, g.transients[0].name, onnx.TensorProto.INT64, [None])
assert_model_result(g,

assert_model_result(g,
input={
'i': [[3.5], [-3.8], [-0.1], [0.2]]
},
@@ -258,8 +259,8 @@ def test_predict_proba_binary():

g = ebm.predict_proba(binary=True, probabilities_name="probabilities")(i)
g = graph.add_output(g, g.transients[0].name, onnx.TensorProto.FLOAT, [None, 2])
assert_model_result(g,

assert_model_result(g,
input={
'i': [[3.5], [-3.8], [-0.1], [0.2]]
},
@@ -270,4 +271,3 @@ def test_predict_proba_binary():
[0.450166 , 0.54983395],
]]
)
