Skip to content
This repository was archived by the owner on May 17, 2024. It is now read-only.

Commit 77ce613

Browse files
authored
Merge pull request #231 from pik94/vertica-integration
add support of Vertica db
2 parents 3cb517f + 7cfa8d3 commit 77ce613

File tree

12 files changed

+610
-58
lines changed

12 files changed

+610
-58
lines changed

.github/workflows/ci.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ jobs:
3434
python-version: ${{ matrix.python-version }}
3535

3636
- name: Build the stack
37-
run: docker-compose up -d mysql postgres presto trino clickhouse
37+
run: docker-compose up -d mysql postgres presto trino clickhouse vertica
3838

3939
- name: Install Poetry
4040
run: pip install poetry
@@ -48,4 +48,7 @@ jobs:
4848
DATADIFF_PRESTO_URI: '${{ secrets.DATADIFF_PRESTO_URI }}'
4949
DATADIFF_TRINO_URI: '${{ secrets.DATADIFF_TRINO_URI }}'
5050
DATADIFF_CLICKHOUSE_URI: 'clickhouse://clickhouse:Password1@localhost:9000/clickhouse'
51-
run: poetry run unittest-parallel -j 16
51+
DATADIFF_VERTICA_URI: 'vertica://vertica:Password1@localhost:5433/vertica'
52+
run: |
53+
chmod +x tests/waiting_for_stack_up.sh
54+
./tests/waiting_for_stack_up.sh && poetry run unittest-parallel -j 16

README.md

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ $ data-diff \
133133
| Databricks | `databricks://<http_path>:<access_token>@<server_hostname>/<catalog>/<schema>` | 💛 |
134134
| Trino | `trino://<username>:<password>@<hostname>:8080/<database>` | 💛 |
135135
| Clickhouse | `clickhouse://<username>:<password>@<hostname>:9000/<database>` | 💛 |
136+
| Vertica | `vertica://<username>:<password>@<hostname>:5433/<database>` | 💛 |
136137
| ElasticSearch | | 📝 |
137138
| Planetscale | | 📝 |
138139
| Pinot | | 📝 |
@@ -177,6 +178,8 @@ While you may install them manually, we offer an easy way to install them along
177178

178179
- `pip install 'data-diff[clickhouse]'`
179180

181+
- `pip install 'data-diff[vertica]'`
182+
180183
- For BigQuery, see: https://pypi.org/project/google-cloud-bigquery/
181184

182185

data_diff/databases/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,5 +10,6 @@
1010
from .databricks import Databricks
1111
from .trino import Trino
1212
from .clickhouse import Clickhouse
13+
from .vertica import Vertica
1314

1415
from .connect import connect_to_uri

data_diff/databases/connect.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
from .databricks import Databricks
1616
from .trino import Trino
1717
from .clickhouse import Clickhouse
18+
from .vertica import Vertica
1819

1920

2021
@dataclass
@@ -87,6 +88,7 @@ def match_path(self, dsn):
8788
),
8889
"trino": MatchUriPath(Trino, ["catalog", "schema"], help_str="trino://<user>@<host>/<catalog>/<schema>"),
8990
"clickhouse": MatchUriPath(Clickhouse, ["database?"], help_str="clickhouse://<user>:<pass>@<host>/<database>"),
91+
"vertica": MatchUriPath(Vertica, ["database?"], help_str="vertica://<user>:<pass>@<host>/<database>"),
9092
}
9193

9294

@@ -113,6 +115,7 @@ def connect_to_uri(db_uri: str, thread_count: Optional[int] = 1) -> Database:
113115
- databricks
114116
- trino
115117
- clickhouse
118+
- vertica
116119
"""
117120

118121
dsn = dsnparse.parse(db_uri)
@@ -200,6 +203,8 @@ def connect(db_conf: Union[str, dict], thread_count: Optional[int] = 1) -> Datab
200203
- presto
201204
- databricks
202205
- trino
206+
- clickhouse
207+
- vertica
203208
"""
204209
if isinstance(db_conf, str):
205210
return connect_to_uri(db_conf, thread_count)

data_diff/databases/vertica.py

Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
from typing import List
2+
3+
from ..utils import match_regexps
4+
from .base import (
5+
CHECKSUM_HEXDIGITS,
6+
MD5_HEXDIGITS,
7+
TIMESTAMP_PRECISION_POS,
8+
ConnectError,
9+
DbPath,
10+
ColType,
11+
ColType_UUID,
12+
ThreadedDatabase,
13+
import_helper,
14+
)
15+
from .database_types import Decimal, Float, FractionalType, Integer, TemporalType, Text, Timestamp, TimestampTZ
16+
17+
18+
@import_helper("vertica")
def import_vertica():
    """Import the ``vertica_python`` driver on demand and return the module.

    The import happens at call time rather than at module import time, so
    this module can be loaded without the Vertica driver being installed.
    NOTE(review): ``import_helper`` presumably converts a missing driver into
    a friendlier error for the "vertica" extra -- confirm in ``.base``.
    """
    import vertica_python as driver

    return driver
23+
24+
25+
class Vertica(ThreadedDatabase):
    """Vertica database driver built on the ``vertica_python`` client.

    Provides the Vertica-specific pieces: opening connections, parsing the
    type strings found in ``V_CATALOG.COLUMNS``, and generating the SQL
    fragments for quoting, hashing and value normalization.
    """

    default_schema = "public"

    # Bare Vertica type names mapped to column-type classes. Parameterized
    # forms such as ``numeric(10,2)`` or ``varchar(80)`` are recognised by the
    # regular expressions in ``_parse_type``.
    TYPE_CLASSES = {
        # Timestamps
        "timestamp": Timestamp,
        "timestamptz": TimestampTZ,
        # Numbers
        "numeric": Decimal,
        "int": Integer,
        "float": Float,
        # Text
        "char": Text,
        "varchar": Text,
    }

    ROUNDS_ON_PREC_LOSS = True

    def __init__(self, *, thread_count, **kw):
        """Remember the connection arguments and initialise the base class.

        Args:
            thread_count: forwarded to ``ThreadedDatabase.__init__``.
            **kw: keyword arguments passed verbatim to ``vertica_python.connect``
                by ``create_connection``.
        """
        self._args = kw
        # NOTE(review): vertica_python documents this option in lowercase
        # ("autocommit") -- confirm that the uppercase key is honoured.
        self._args["AUTOCOMMIT"] = False

        super().__init__(thread_count=thread_count)

    def create_connection(self):
        """Open and return a new ``vertica_python`` connection.

        Raises:
            ConnectError: if the driver raises ``errors.ConnectionError``; the
                driver exception is kept as the cause.
        """
        vertica = import_vertica()
        try:
            return vertica.connect(**self._args)
        except vertica.errors.ConnectionError as e:
            raise ConnectError(*e.args) from e

    def _parse_type(
        self,
        table_path: DbPath,
        col_name: str,
        type_repr: str,
        datetime_precision: int = None,
        numeric_precision: int = None,
        numeric_scale: int = None,
    ) -> ColType:
        """Translate a Vertica ``data_type`` string into a column type.

        Parameterized timestamp, numeric and (var)char representations are
        handled here; every other representation is delegated to the base
        class together with all of the precision/scale metadata.
        """
        timestamp_regexps = {
            r"timestamp\(?(\d?)\)?": Timestamp,
            r"timestamptz\(?(\d?)\)?": TimestampTZ,
        }
        for m, t_cls in match_regexps(timestamp_regexps, type_repr):
            # No explicit precision in the type string: fall back to 6 digits.
            precision = int(m.group(1)) if m.group(1) else 6
            return t_cls(precision=precision, rounds=self.ROUNDS_ON_PREC_LOSS)

        number_regexps = {
            r"numeric\((\d+),(\d+)\)": Decimal,
        }
        for m, n_cls in match_regexps(number_regexps, type_repr):
            # Only the scale (digits after the decimal point) is kept.
            _prec, scale = map(int, m.groups())
            return n_cls(scale)

        string_regexps = {
            r"varchar\((\d+)\)": Text,
            r"char\((\d+)\)": Text,
        }
        for _m, s_cls in match_regexps(string_regexps, type_repr):
            return s_cls()

        # Forward numeric_scale as well: it is part of this method's signature
        # and is selected by select_table_schema, so dropping it here would
        # lose the scale of any numeric type not matched above.
        return super()._parse_type(
            table_path, col_name, type_repr, datetime_precision, numeric_precision, numeric_scale
        )

    def select_table_schema(self, path: DbPath) -> str:
        """Return the SQL that lists column names and type metadata for ``path``."""
        schema, table = self._normalize_table_path(path)

        return (
            "SELECT column_name, data_type, datetime_precision, numeric_precision, numeric_scale "
            "FROM V_CATALOG.COLUMNS "
            f"WHERE table_name = '{table}' AND table_schema = '{schema}'"
        )

    def quote(self, s: str):
        """Wrap the identifier ``s`` in double quotes."""
        return f'"{s}"'

    def concat(self, l: List[str]) -> str:
        """Join SQL expressions with the ``||`` concatenation operator."""
        return " || ".join(l)

    def md5_to_int(self, s: str) -> str:
        """Return SQL that hashes ``s`` with MD5 and converts a hex suffix to a number.

        The substring starts at position ``1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS``
        of the MD5 hex string and runs to its end.
        """
        return f"CAST(HEX_TO_INTEGER(SUBSTRING(MD5({s}), {1 + MD5_HEXDIGITS - CHECKSUM_HEXDIGITS})) AS NUMERIC(38, 0))"

    def to_string(self, s: str) -> str:
        """Return SQL that casts the expression ``s`` to ``VARCHAR``."""
        return f"CAST({s} AS VARCHAR)"

    def normalize_timestamp(self, value: str, coltype: TemporalType) -> str:
        """Return SQL that renders a timestamp as ``YYYY-MM-DD HH24:MI:SS.US`` text.

        When ``coltype.rounds`` is set, the value is cast to the column's own
        precision before formatting. Otherwise it is formatted at precision 6,
        cut after ``coltype.precision`` fractional digits and right-padded
        with zeros back to the full width.
        """
        if coltype.rounds:
            return f"TO_CHAR({value}::TIMESTAMP({coltype.precision}), 'YYYY-MM-DD HH24:MI:SS.US')"

        timestamp6 = f"TO_CHAR({value}::TIMESTAMP(6), 'YYYY-MM-DD HH24:MI:SS.US')"
        return (
            f"RPAD(LEFT({timestamp6}, {TIMESTAMP_PRECISION_POS+coltype.precision}), {TIMESTAMP_PRECISION_POS+6}, '0')"
        )

    def normalize_number(self, value: str, coltype: FractionalType) -> str:
        """Return SQL that casts a number to ``NUMERIC(38, precision)`` and then to text."""
        return self.to_string(f"CAST({value} AS NUMERIC(38, {coltype.precision}))")

    def normalize_uuid(self, value: str, coltype: ColType_UUID) -> str:
        """Return SQL that renders a UUID column as trimmed ``VARCHAR`` text."""
        # Trim doesn't work on CHAR type
        return f"TRIM(CAST({value} AS VARCHAR))"

dev/dev.env

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,13 @@ CLICKHOUSE_USER=clickhouse
1111
CLICKHOUSE_PASSWORD=Password1
1212
CLICKHOUSE_DB=clickhouse
1313
CLICKHOUSE_DEFAULT_ACCESS_MANAGEMENT=1
14+
15+
# Vertica credentials
16+
APP_DB_USER=vertica
17+
APP_DB_PASSWORD=Password1
18+
VERTICA_DB_NAME=vertica
19+
20+
# To skip generating the sample VMart demo data (see https://www.vertica.com/docs/9.2.x/HTML/Content/Authoring/GettingStartedGuide/IntroducingVMart/IntroducingVMart.htm),
21+
# leave VMART_DIR and VMART_ETL_SCRIPT empty.
22+
VMART_DIR=
23+
VMART_ETL_SCRIPT=

docker-compose.yml

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,31 @@ services:
9797
networks:
9898
- local
9999

100+
vertica:
101+
container_name: vertica
102+
image: vertica/vertica-ce:12.0.0-0
103+
restart: always
104+
volumes:
105+
- vertica-data:/data:delegated
106+
ports:
107+
- '5433:5433'
108+
- '5444:5444'
109+
expose:
110+
- '5433'
111+
- '5444'
112+
env_file:
113+
- dev/dev.env
114+
tty: true
115+
networks:
116+
- local
117+
118+
119+
100120
volumes:
101121
postgresql-data:
102122
mysql-data:
103123
clickhouse-data:
124+
vertica-data:
104125

105126
networks:
106127
local:

0 commit comments

Comments
 (0)