Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Constant DC verification #510

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion examples/datasets/taxes_2.csv
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@ NewYork,5000,0.3
Wisconsin,5000,0.15
Wisconsin,6000,0.2
Wisconsin,4000,0.1
Wisconsin,3000,0.3
Texas,1000,0.15
Texas,2000,0.25
Texas,3000,0.3
Texas,5000,0.05
Texas,4000,0.1
86 changes: 86 additions & 0 deletions examples/expert/data_cleaning.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
from typing import Dict, List, Tuple
from collections import defaultdict
import matplotlib.pyplot as plt
import desbordante as db
import networkx as nx
import time


class DataCleaner:
    """Greedy selection of records to drop so that no violation pair remains.

    Violations are stored as an undirected graph: every record index is a
    node and every violating pair is an edge. A pair ``(v, v)`` is kept as a
    self-loop on ``v``. ``clean`` repeatedly removes the node with the most
    incident edges until the graph has no edges left.

    Attributes are documented inline in ``__init__``.
    """

    def __init__(self, violations: List[Tuple[int, int]]) -> None:
        # Adjacency lists keyed by record index.
        self.graph: Dict[int, List[int]] = defaultdict(list)
        for v1, v2 in violations:
            if v1 != v2:
                self.graph[v1].append(v2)
                self.graph[v2].append(v1)
            elif v1 not in self.graph[v1]:
                # Record the self-loop once, keeping the neighbours collected
                # so far; overwriting the list would leave dangling edges in
                # the neighbours' adjacency lists.
                self.graph[v1].append(v1)
        # Nodes still present in the graph.
        self.nodes: List[int] = list(self.graph.keys())
        # Nodes removed by clean(), in removal order.
        self.removed_nodes: List[int] = []

    def __remove_highest_degree_node(self) -> None:
        """Remove the node with the longest adjacency list and all its edges."""
        max_key = max(self.graph, key=lambda x: len(self.graph[x]))
        for neighbor in self.graph[max_key]:
            # A self-loop lives in the list being iterated and disappears
            # together with the node, so it must not be removed here.
            if neighbor != max_key:
                self.graph[neighbor].remove(max_key)

        del self.graph[max_key]
        self.nodes.remove(max_key)
        self.removed_nodes.append(max_key)

    # Check if the graph contains any edges
    def __has_edges(self) -> bool:
        return any(self.graph.values())

    # Remove highest degree node while graph has edges
    def clean(self) -> None:
        print("Cleaning algorithm started")
        while self.__has_edges():
            self.__remove_highest_degree_node()
        print("Cleaning algorithm finished")

    def draw(self, is_blocked: bool = True) -> None:
        """Plot the current graph; ``is_blocked`` is passed to ``plt.show(block=...)``."""
        plt.figure()
        G = nx.Graph()
        G.add_nodes_from(self.nodes)
        for node, neighbours in self.graph.items():
            for neighbour in neighbours:
                G.add_edge(node, neighbour)
        nx.draw(G, with_labels=True)
        plt.show(block=is_blocked)


def main():
    """Verify a denial constraint on the taxes dataset and list records to remove.

    Loads the CSV table, runs the DC verification algorithm with violation
    collection enabled, prints whether the constraint holds, then feeds the
    collected violations to ``DataCleaner``. The violation graph is drawn
    before cleaning (non-blocking window) and after cleaning (blocking window).
    """
    TABLE_1 = 'examples/datasets/taxes_2.csv'
    DC = "!(s.State == t.State and s.Salary < t.Salary and s.FedTaxRate > t.FedTaxRate)"
    SEPARATOR = ','
    HAS_HEADER = True

    print("Data loading started")
    verificator = db.dc_verification.algorithms.Default()
    verificator.load_data(table=(TABLE_1, SEPARATOR, HAS_HEADER))
    print("Data loading finished")

    DO_COLLECT_VIOLATIONS = True

    print("Algo execution started")

    verificator.execute(denial_constraint=DC, do_collect_violations=DO_COLLECT_VIOLATIONS)

    print("Algo execution finished")

    dc_holds = verificator.dc_holds()

    print("DC " + DC + " holds: " + str(dc_holds))

    violations = verificator.get_violations()
    cleaner = DataCleaner(violations)

    cleaner.draw(False)
    cleaner.clean()
    cleaner.draw()

    nodes = sorted(cleaner.removed_nodes)
    # Single quotes inside the replacement field: reusing the outer quote
    # character in an f-string is only accepted from Python 3.12 on.
    print(f"Records to be removed: {', '.join(map(str, nodes))}")


if __name__ == "__main__":
    main()
123 changes: 90 additions & 33 deletions src/core/algorithms/dc/model/column_operand.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,79 +3,136 @@
#include <cstdlib>
#include <exception>
#include <memory>
#include <optional>
#include <string>
#include <vector>

#include <boost/functional/hash.hpp>
#include <easylogging++.h>

#include "model/table/column.h"
#include "model/table/relation_data.h"
#include "model/types/type.h"

namespace algos::dc {

// Identifies which tuple of a tuple pair an operand refers to.
enum class Tuple {
    kS,     // tuple "s"
    kT,     // tuple "t"
    kMixed  // NOTE(review): presumably "both tuples involved" -- confirm against users
};

// @brief Represents a column operand for Predicate.
//
// A predicate (e.g., t.A == s.A) comprises three elements:
// the column operand from the first tuple ("t.A"), the comparison operator
// ("=="), and the column operand from the second tuple ("s.A"). The `ColumnOperand` class
// encapsulates the column operand part of a predicate, such as "t.A" or "s.A".
//
// A constant value can also be a column operand. Such an operand belongs to no tuple,
// which is why the tuple is stored as a std::optional.
//
// The class distinguishes between operands derived from the first tuple (t) and those
// from the second tuple (s) using a boolean flag `is_first_tuple_`, where `true` indicates an
// operand from the first tuple (t), and `false` indicates an operand from the second
// tuple (s).
class ColumnOperand {
private:
Column const* column_;
bool is_first_tuple_;
std::optional<dc::Tuple> tuple_;
model::Type const* type_;
std::byte const* val_;

public:
ColumnOperand(Column const* column, bool is_first_tuple) noexcept
: column_(column), is_first_tuple_(is_first_tuple) {}

ColumnOperand() noexcept = default;

// For conversion from "t.ColumnPosition" or "t.ColumnName"
ColumnOperand(std::string operand, RelationalSchema const& schema) {
if (operand.front() != 't' and operand.front() != 's')
throw std::logic_error("Unknown tuple name");

is_first_tuple_ = operand.front() == 't';
std::string name(operand.begin() + 2, operand.end());
std::vector<std::unique_ptr<Column>> const& cols = schema.GetColumns();
if (!cols.front()->GetName().empty()) { // Has header
for (std::unique_ptr<Column> const& col : cols) {
if (name == col->GetName()) {
column_ = col.get();
return;
}
}
// Creates an empty operand: no column, no tuple, no type and no constant value.
// `type_` is nulled too, because the copy constructor and Swap() read it and must
// never see an indeterminate pointer.
ColumnOperand() noexcept : column_(nullptr), type_(nullptr), val_(nullptr) {}

ColumnOperand(Column const* column, dc::Tuple tuple, model::Type const* type)
: column_(column), tuple_(tuple), type_(type), val_(nullptr) {}

// Builds a constant operand by parsing `str_val` as a value of `type`.
// The operand owns the allocated value; it is released in the destructor.
ColumnOperand(std::string const& str_val, model::Type const* type)
    : column_(nullptr), type_(type), val_(nullptr) {
    std::byte* val = type->Allocate();
    try {
        // NOTE(review): ValueFromStr presumably throws on malformed input -- confirm.
        type->ValueFromStr(val, str_val);
    } catch (...) {
        // The destructor does not run for a partially constructed object,
        // so the buffer has to be released here.
        type->Free(val);
        throw;
    }
    val_ = val;
}

// Copy constructor. A variable operand is copied member-wise; a constant operand
// gets its own clone of the value so that both objects can free independently.
// Every member is initialized on both paths (the constant path used to leave
// `column_` indeterminate).
ColumnOperand(ColumnOperand const& rhs)
    : column_(rhs.column_), tuple_(rhs.tuple_), type_(rhs.type_), val_(nullptr) {
    if (rhs.IsConstant()) {
        val_ = rhs.type_->Clone(rhs.val_);
    }
}

std::string str_ind(operand.begin() + 2, operand.end());
model::ColumnIndex ind = std::stoi(str_ind);
column_ = cols[ind].get();
// Move constructor: starts from an empty operand and exchanges state with `rhs`,
// leaving `rhs` empty. Declared noexcept (Swap only exchanges pointers and an
// optional enum) so that containers move operands on reallocation instead of
// falling back to the cloning copy constructor.
ColumnOperand(ColumnOperand&& rhs) noexcept : ColumnOperand() {
    Swap(rhs);
}

bool operator==(ColumnOperand const& rhs) const noexcept {
return column_ == rhs.column_ && is_first_tuple_ == rhs.is_first_tuple_;
// Unified copy/move assignment: `rhs` is taken by value, its state is exchanged
// with this object, and the previous state is released when `rhs` is destroyed.
ColumnOperand& operator=(ColumnOperand rhs) {
    this->Swap(rhs);
    return *this;
}

// Exchanges the complete state (type, value, column, tuple) with `rhs`.
// Only pointers and an optional enum are swapped, so this cannot throw.
void Swap(ColumnOperand& rhs) noexcept {
    std::swap(type_, rhs.type_);
    std::swap(val_, rhs.val_);
    std::swap(column_, rhs.column_);
    std::swap(tuple_, rhs.tuple_);
}

// Two operands are equal when both are constants holding equal values, or both are
// variables referring to the same column of the same tuple. A constant never
// equals a variable. Constants are asserted to share the same type object.
bool operator==(ColumnOperand const& rhs) const {
    bool const lhs_is_constant = IsConstant();
    if (lhs_is_constant != rhs.IsConstant()) return false;

    if (!lhs_is_constant) {
        return column_ == rhs.column_ && tuple_ == rhs.tuple_;
    }

    assert(type_ == rhs.type_);
    return type_->Compare(GetVal(), rhs.GetVal()) == model::CompareResult::kEqual;
}

bool operator!=(ColumnOperand const& rhs) const noexcept {
// Negation of operator==.
bool operator!=(ColumnOperand const& rhs) const {
    return !operator==(rhs);
}

Column const* GetColumn() const noexcept {
// Returns the column this operand refers to; asserts that one is set.
Column const* GetColumn() const {
    Column const* const column = column_;
    assert(column != nullptr);
    return column;
}

bool IsFirstTuple() const noexcept {
return is_first_tuple_;
// Returns the tuple this operand belongs to; asserts that one is set.
Tuple GetTuple() const {
    assert(tuple_ != std::nullopt);
    return tuple_.value();
}

std::string ToString() const noexcept {
return (is_first_tuple_ ? "t." : "s.") + column_->GetName();
// Returns the stored value type pointer as is (no null check).
model::Type const* GetType() const noexcept { return this->type_; }

// Returns the constant value owned by this operand; asserts that one is set.
std::byte const* GetVal() const {
    std::byte const* const value = val_;
    assert(value != nullptr);
    return value;
}

// True when the operand holds a constant value.
bool IsConstant() const { return !(val_ == nullptr); }

// True when the operand holds no constant value.
bool IsVariable() const { return !val_; }

// Text form of the operand: the value rendered by its type for a constant;
// otherwise the column name prefixed with "t." for Tuple::kT and "s." for any
// other tuple value.
std::string ToString() const {
    if (!IsVariable()) return type_->ValueToString(val_);

    char const* const prefix = tuple_.value() == Tuple::kT ? "t." : "s.";
    return prefix + column_->GetName();
}

// Releases the owned constant value, if any, through its type.
~ColumnOperand() {
    if (val_ == nullptr) return;
    type_->Free(val_);
}
};

Expand Down
104 changes: 104 additions & 0 deletions src/core/algorithms/dc/model/dc.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
#include "algorithms/dc/model/dc.h"

#include <algorithm>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "algorithms/dc/model/operator.h"
#include "algorithms/dc/model/predicate.h"

namespace algos::dc {

bool DC::CheckAllEquality() const {
auto check = [](auto const& pred) {
return pred.GetOperator() == dc::OperatorType::kEqual and pred.IsCrossTuple() and
pred.IsOneColumn();
};

return std::all_of(predicates_.begin(), predicates_.end(), check);
}

// True when the DC has no constant predicate and every predicate is either a
// one-column equality or a cross-tuple order comparison (neither == nor !=),
// with exactly one predicate of the latter kind.
bool DC::CheckOneInequality() const {
    size_t equalities = 0;
    size_t inequalities = 0;

    for (Predicate const& pred : predicates_) {
        if (pred.IsConstant()) return false;

        Operator op = pred.GetOperator();
        if (op == OperatorType::kEqual and pred.IsOneColumn()) {
            ++equalities;
            continue;
        }

        bool const is_order_comparison =
                op != OperatorType::kEqual and op != OperatorType::kUnequal;
        if (is_order_comparison and pred.IsCrossTuple()) {
            ++inequalities;
        }
    }

    return inequalities == 1 && equalities + inequalities == predicates_.size();
}

// True when every predicate is constant or one-tuple and its variable operand
// belongs to the same tuple as the variable operand of the first predicate.
// An empty DC references no tuple at all, so it yields false; this also avoids
// calling front() on an empty container.
bool DC::CheckOneTuple() const {
    if (predicates_.empty()) return false;

    // Read the tuple straight from the operand instead of copying the operand first.
    dc::Tuple const tuple = predicates_.front().GetVariableOperand().GetTuple();

    auto check = [tuple](Predicate const& pred) {
        return (pred.IsConstant() or pred.IsOneTuple()) and
               pred.GetVariableOperand().GetTuple() == tuple;
    };

    return std::all_of(predicates_.begin(), predicates_.end(), check);
}

bool DC::CheckTwoTuples() const {
auto check = [](Predicate const& pred) { return pred.IsVariable() and pred.IsCrossTuple(); };

return std::all_of(predicates_.begin(), predicates_.end(), check);
}

// Classifies the DC. The checks run in a fixed order and the first one that
// succeeds decides the result: all-equality, one-inequality, one-tuple,
// two-tuples; anything else is mixed.
DCType DC::GetType() const {
    if (CheckAllEquality()) return DCType::kAllEquality;
    if (CheckOneInequality()) return DCType::kOneInequality;
    if (CheckOneTuple()) return DCType::kOneTuple;
    if (CheckTwoTuples()) return DCType::kTwoTuples;

    return DCType::kMixed;
}

std::string DC::ToString() const {
if (predicates_.empty()) return {};

static constexpr char const* kNot = "!";
static constexpr char const* kAnd = " and ";

std::stringstream ss;
ss << kNot << '(' << predicates_.front().ToString();
for (auto pred = std::next(predicates_.begin()); pred != predicates_.end(); ++pred) {
ss << kAnd << pred->ToString();
}
ss << ')';

return ss.str();
}

void DC::ConvertEqualities() {
std::vector<dc::Predicate> res;
for (auto const& pred : predicates_) {
auto left = pred.GetLeftOperand();
auto right = pred.GetRightOperand();
if (pred.IsVariable() and pred.IsCrossColumn() and pred.IsCrossTuple() and
pred.GetOperator().GetType() == dc::OperatorType::kEqual) {
res.emplace_back(dc::OperatorType::kLessEqual, left, right);
res.emplace_back(dc::OperatorType::kGreaterEqual, left, right);
} else {
res.emplace_back(pred);
}
}

predicates_ = std::move(res);
}

} // namespace algos::dc
Loading
Loading