-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathotherTools.py
155 lines (139 loc) · 4.68 KB
/
otherTools.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
import pandas as pd
import numpy as np
import itertools
from sklearn import preprocessing
import random
import streamlit as st
@st.cache(suppress_st_warning=True)
def read_data(directory):
    """
    Load a CSV file into a pandas dataframe.
    The result is memoised through the st.cache decorator.
    args:
        directory: location of the CSV file, passed straight to pd.read_csv
    return:
        the parsed pd.DataFrame
    """
    return pd.read_csv(directory)
# Deprecated: formerly used to generate the default subgroups.
def random_categories(categories, num=3):
    """
    Draw `num` distinct entries of `categories` at random.
    args:
        categories: an indexable collection of category names
        num: how many entries to draw (sampling is without replacement)
    return:
        a list with the drawn entries, in the order they were sampled
    """
    picked_positions = np.random.choice(len(categories), replace=False, size=num)
    return [categories[position] for position in picked_positions]
@st.cache(suppress_st_warning=True)
def standardizer(df, numerics):
    """
    Standardize the selected numeric columns of a dataframe
    using sklearn's preprocessing.StandardScaler.
    args:
        df: dataset dataframe
        numerics: a list of names of the numeric columns to standardize
    return:
        a new pd.DataFrame holding the scaled values, with `numerics`
        as its column names. It is built from a bare numpy array, so it
        carries a fresh default index rather than the index of df.
    """
    # Select the numeric columns once and work on the raw array.
    numeric_values = df[numerics].to_numpy()
    scaler = preprocessing.StandardScaler().fit(numeric_values)
    return pd.DataFrame(scaler.transform(numeric_values), columns=numerics)
@st.cache(suppress_st_warning=True)
def get_subgroups(df: pd.DataFrame, categories: list):
    """
    Enumerate every combination of levels of the selected categories and
    work out which rows of the dataset belong to each combination.
    args:
        df: dataset dataframe
        categories: a list of names of selected categories
    returns:
        a dictionary mapping each subgroup (a tuple with one level per
        category, in the order of `categories`) to a pd.Index of booleans
        with one entry per row of df, True where the row belongs to the
        subgroup. Combinations that match no row are still present, with
        an all-False mask.
    """
    level_sets = [set(df[c]) for c in categories]
    subgroups = sorted(itertools.product(*level_sets))
    subgroups_dict = {}
    for sg in subgroups:
        # Anchor the mask on df's own index: `&` between two Series aligns
        # by label, so a mask on a fresh 0..n-1 index would be mismatched
        # whenever df has been filtered or re-indexed.
        member = pd.Series(True, index=df.index)
        for column, level in zip(categories, sg):
            member = member & (df[column] == level)
        subgroups_dict[sg] = pd.Index(member)
    return subgroups_dict
@st.cache(suppress_st_warning=True)
def retrieve_levels(data, categories):
    """
    Collect the distinct values (levels) of each categorical variable.
    args:
        data: the dataset
        categories: the full list of categorical variables
    return:
        a dictionary mapping each variable name to the result of
        calling unique() on its column.
    """
    return {name: data[name].unique() for name in categories}
def plot_switcher(figure_dict, mode):
    """
    Help readers switch between different scatter plots.
    The chart is drawn in a wide left column and a two-line text legend
    in a narrow right column. When several figures are available a slider
    (numbered from 1) lets the reader choose which one is shown.
    arg:
        figure_dict: A dictionary for figures; figure_dict[mode] is the
            list of altair charts to choose from
        mode: either "Similar" or "Different"
    return:
        the 0-based index of the displayed figure, for us to display text,
        or None when the list for `mode` is empty (nothing is drawn)
    """
    cur_list = figure_dict[mode]
    figure_list_length = len(cur_list)
    if figure_list_length == 0:
        return None

    # TODO: Would be tricky to add legend in altair,
    # Use other methods instead.
    figure_column, legend_column = st.columns([3.6, 1])
    with figure_column.container():
        if figure_list_length == 1:
            # A single figure needs no selector.
            figure_index = 0
        else:
            # The slider is 1-based for the reader. It must start at 1:
            # a position of 0 would map to index -1 and silently show the
            # last figure a second time.
            selected = st.slider(
                label=f"Select {mode} Groups",
                min_value=1,
                max_value=figure_list_length,
                step=1,
                value=1,
            )
            figure_index = selected - 1
        st.altair_chart(cur_list[figure_index])
    with legend_column.container():
        st.write("Input Group: :red_circle:")
        st.write(mode + " Group: :large_blue_circle:")
    return figure_index
def histogram_switch(histogram_dict, numeric_radio, mode_radio):
    """
    Help readers switch between different histograms.
    histogram has a slightly different front-end logic: the figures are
    looked up by mode first and by numeric variable second, and are drawn
    with st.pyplot without a separate legend column.
    arg:
        histogram_dict: A dictionary keyed by mode whose values are
            dictionaries with numerical variables as keys and
            histogram lists as values
        numeric_radio: the name of numeric variable selected by the user
        mode_radio: "Similar" or "Different"
    return:
        the 0-based index of the displayed histogram, for us to display
        text, or None when the selected list is empty (nothing is drawn)
    """
    cur_list = histogram_dict[mode_radio][numeric_radio]
    figure_list_length = len(cur_list)
    if figure_list_length == 0:
        return None

    with st.container():
        if figure_list_length == 1:
            # A single histogram needs no selector.
            figure_index = 0
        else:
            # The label is built from mode_radio, the parameter this
            # function actually receives. The slider is 1-based and starts
            # at 1 so that no position maps to index -1.
            selected = st.slider(
                label=f"Select {mode_radio} Groups",
                min_value=1,
                max_value=figure_list_length,
                step=1,
                value=1,
                # Explicit key keeps this widget distinct from any other
                # slider on the page that has the same label and range.
                key=f"histogram_switch_{mode_radio}_{numeric_radio}",
            )
            figure_index = selected - 1
        st.pyplot(cur_list[figure_index])
    return figure_index
@st.cache(suppress_st_warning=True)
def generate_tables(categories, levels):
    """
    Build a two-column pandas dataframe that shows readers
    the level currently chosen for each category.
    args:
        categories: Current selected categories
        levels: selected levels
    return:
        pd.DataFrame with columns "Category" and "level"
        that can be "written" by streamlit
    """
    table_columns = {"Category": categories, "level": levels}
    return pd.DataFrame(table_columns)