-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy patherror-detectors.py
117 lines (93 loc) · 3.47 KB
/
error-detectors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Registers the CSV test datasets as tables (`adult`, `hospital`, `boston`) so that
# the error-detector examples below can refer to them by table name.

# `adult` and `hospital` are read with a header row and no explicit schema
(
    spark.read
    .option("header", True)
    .csv("./testdata/adult.csv")
    .write
    .saveAsTable("adult")
)
(
    spark.read
    .option("header", True)
    .csv("./testdata/hospital.csv")
    .write
    .saveAsTable("hospital")
)

# `boston` is read with an explicit DDL schema so that its columns get the
# int/double/string types listed here
boston_schema = ", ".join([
    "tid int", "CRIM double", "ZN int", "INDUS string", "CHAS string",
    "NOX string", "RM double", "AGE string", "DIS double", "RAD string", "TAX int",
    "PTRATIO string", "B double", "LSTAT double",
])
(
    spark.read
    .option("header", True)
    .schema(boston_schema)
    .csv("./testdata/boston.csv")
    .write
    .saveAsTable("boston")
)
# Imports all the built-in error detectors
from repair.errors import (
    ConstraintErrorDetector,
    DomainValues,
    GaussianOutlierErrorDetector,
    LOFOutlierErrorDetector,
    NullErrorDetector,
    RegExErrorDetector,
    ScikitLearnBackedErrorDetector,
)
# Example: `NullErrorDetector`
# Runs detection only (`detect_errors_only=True`) on the `hospital` table, using
# `tid` as the row id, then displays up to 3 rows of the returned `error_cells_df`.
error_cells_df = (
    delphi.repair
    .setTableName("hospital")
    .setRowId("tid")
    .setErrorDetectors([NullErrorDetector()])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# Example: `DomainValues` with an explicit value list
# The detector is configured for the `Sex` attribute of the `adult` table with
# the values 'Male' and 'Female'.
error_cells_df = (
    delphi.repair
    .setTableName("adult")
    .setRowId("tid")
    .setErrorDetectors([DomainValues(attr='Sex', values=['Male', 'Female'])])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# An 'autofill' mode - we assume domain values tend to appear frequently against illegal values.
# One `DomainValues` detector is built per target column, each with
# `autofill=True` and `min_count_thres=12`, and all of them are run together
# against the `hospital` table.
target_columns = ['MeasureCode', 'ZipCode', 'City']
domain_value_error_detectors = [
    DomainValues(attr=c, autofill=True, min_count_thres=12)
    for c in target_columns
]
error_cells_df = (
    delphi.repair
    .setTableName("hospital")
    .setRowId("tid")
    .setErrorDetectors(domain_value_error_detectors)
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# Example: `RegExErrorDetector`
# The detector is configured for the `ZipCode` attribute of the `hospital` table
# with a pattern of five consecutive digits.
error_cells_df = (
    delphi.repair
    .setTableName("hospital")
    .setRowId("tid")
    .setErrorDetectors([RegExErrorDetector(attr='ZipCode', regex='\\d\\d\\d\\d\\d')])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# Example: `ConstraintErrorDetector` (denial constraint form)
# Constraints are loaded from a file via `constraint_path`, and the run is
# limited to `target_columns` through `setTargets`.
# NOTE: `target_columns` is reused by the "simple form" example that follows.
target_columns = ['City', 'HospitalName', 'Address1', 'CountyName']
error_cells_df = (
    delphi.repair
    .setTableName("hospital")
    .setRowId("tid")
    .setTargets(target_columns)
    .setErrorDetectors([ConstraintErrorDetector(constraint_path="./testdata/hospital_constraints.txt")])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# Example: `ConstraintErrorDetector` (simple form)
# Constraints are given inline as a `;`-separated string of `X->Y` entries
# instead of a file path; `target_columns` is the list assigned earlier in this script.
error_cells_df = (
    delphi.repair
    .setTableName("hospital")
    .setRowId("tid")
    .setTargets(target_columns)
    .setErrorDetectors([ConstraintErrorDetector(constraints="City->CountyName;HospitalName->Address1")])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# Example: `GaussianOutlierErrorDetector`
# Runs against the `boston` table with `approx_enabled=False`.
error_cells_df = (
    delphi.repair
    .setTableName("boston")
    .setRowId("tid")
    .setErrorDetectors([GaussianOutlierErrorDetector(approx_enabled=False)])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# Example: `LOFOutlierErrorDetector`
# Runs against the `boston` table with the detector's default arguments.
error_cells_df = (
    delphi.repair
    .setTableName("boston")
    .setRowId("tid")
    .setErrorDetectors([LOFOutlierErrorDetector()])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)
# Example: `ScikitLearnBackedErrorDetector`
# The detector receives a zero-argument factory (a lambda) that builds the
# scikit-learn estimator, here `LocalOutlierFactor(novelty=False)`.
from sklearn.neighbors import LocalOutlierFactor

error_cells_df = (
    delphi.repair
    .setTableName("boston")
    .setRowId("tid")
    .setErrorDetectors([ScikitLearnBackedErrorDetector(lambda: LocalOutlierFactor(novelty=False))])
    .run(detect_errors_only=True)
)
error_cells_df.show(3)