-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathexploratory_data_analysis.py
95 lines (77 loc) · 3.57 KB
/
exploratory_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# Copyright (C) 2022 Antonio Rodriguez
#
# This file is part of synthetic_data_generation_framework.
#
# synthetic_data_generation_framework is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# synthetic_data_generation_framework is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with synthetic_data_generation_framework. If not, see <http://www.gnu.org/licenses/>.
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sn
def eda(data: pd.DataFrame, X: pd.DataFrame, Y: pd.DataFrame, dataset_name: str, folder: str = r"./EDA") -> None:
    """Performs Exploratory Data Analysis (EDA) when a numerical dataset is given:
    General information, dataset dimensions, amount of missing data, control/cases ratio,
    histograms, boxplots and Pearson's correlation matrix are computed and stored.
    Errors correction with categorical variables has not been implemented yet.

    All figures are written inside ``folder`` and closed once saved. The process
    working directory is temporarily switched to ``folder`` and is always
    restored before returning, even if one of the steps raises.

    Args:
    -----
            data: the whole dataset with features and target variable
            X: features of the dataset
            Y: target variable of the dataset. The control/cases bar chart draws
               exactly two bars, so two distinct classes are expected.
            dataset_name: name of the dataset to properly store the files
            folder: folder to save the generated files. Defaults to r"./EDA"

    Returns:
    --------
            None
    """
    # Save current working directory to come back later
    cwd = os.getcwd()

    # Create folder (and any missing parents) to store EDA analysis files
    os.makedirs(folder, exist_ok=True)

    # Go to the output folder; the finally clause guarantees we come back
    os.chdir(folder)
    try:
        # Print summarized information of the dataset
        print(f"\nDataset dimensions: {data.shape[0]} x {data.shape[1]}\n")
        try:
            data.info(show_counts=True)
        except TypeError:
            # pandas < 1.2 only understands the former keyword name
            data.info(null_counts=True)

        # Calculate and save the Control/Cases ratio
        class_counts = Y.value_counts()
        bar_positions = [0, 0.15]
        plt.figure()
        # NOTE(review): value_counts() orders classes by descending frequency, so
        # "control" is attached to the most frequent class - confirm this matches
        # the encoding of the target variable.
        plt.bar(bar_positions, class_counts, width=0.1, align="center", tick_label=["control", "cases"])
        for x_pos, count in zip(bar_positions, class_counts):
            # Anchor each label right on top of its own bar, whatever the counts are
            plt.text(x_pos, count, str(count), color='k', fontweight='bold', ha='center', va='bottom')
        name = dataset_name + '_ctrl_cases'
        plt.savefig(name, dpi=600)
        plt.close()
        print(class_counts * 100 / len(Y))

        # Generate and save histograms of all features
        X.hist(edgecolor="black", grid=False, bins=50)
        name = dataset_name + "_all_histograms"
        plt.savefig(name, dpi=300)
        plt.close()

        # Generate and save boxplots of all features
        for feature in X:
            plt.figure()
            X.boxplot(feature, grid=False)
            # f-string so that non-string column labels do not break the file name
            name = f"boxplot_{feature}"
            plt.savefig(name, dpi=300)
            # One figure per feature: close it to keep memory usage bounded
            plt.close()

        # Generate and save Pearson's correlation matrix
        plt.figure()
        corr_matrix = X.corr()
        sn.heatmap(corr_matrix, vmin=-1, vmax=1, center=0, cmap="vlag", annot=True)
        name = dataset_name + "_corr_matrix"
        plt.savefig(name, dpi=300)
        plt.close()

        # Print the number of distinct values per column, to spot missing categories
        for col in data:
            vals = data[col].nunique()
            print(f"Values of column {col}: {vals}")
    finally:
        # Back to working directory
        os.chdir(cwd)