Commit 864e10c

Merge pull request #104 from #80
Feature/80 add connectors and metadata
2 parents 717bfb1 + af21b3f commit 864e10c

22 files changed: +2777 / -1238 lines

.env.example

Lines changed: 92 additions & 24 deletions
@@ -3,18 +3,18 @@
 ###############################################

 # LLM_PROVIDER=openai
-# OPEN_AI_KEY=
-# OPEN_MODEL_PREF=gpt-4o
+# OPEN_AI_KEY=sk-proj-----
+# OPEN_AI_LLM_MODEL=gpt-4.1

 # LLM_PROVIDER=gemini
 # GEMINI_API_KEY=
 # GEMINI_LLM_MODEL=gemini-2.0-flash-lite

-# LLM_PROVIDER=azure
-# AZURE_OPENAI_LLM_ENDPOINT=
-# AZURE_OPENAI_LLM_KEY=
-# AZURE_OPENAI_LLM_MODEL=
-# AZURE_OPENAI_LLM_API_VERSION=
+LLM_PROVIDER=azure
+AZURE_OPENAI_LLM_ENDPOINT=https://-------.openai.azure.com/
+AZURE_OPENAI_LLM_KEY=-
+AZURE_OPENAI_LLM_MODEL=gpt4o
+AZURE_OPENAI_LLM_API_VERSION=2024-07-01-preview

 # LLM_PROVIDER=ollama
 # OLLAMA_LLM_BASE_URL=
@@ -36,31 +36,99 @@
 ########### Embedding API SElECTION ###########
 ###############################################
 # Only used if you are using an LLM that does not natively support embedding (openai or Azure)
-# EMBEDDING_ENGINE='openai'
-# OPEN_AI_KEY=sk-xxxx
-# EMBEDDING_MODEL_PREF='text-embedding-ada-002'
+# EMBEDDING_PROVIDER='openai'
+# OPEN_AI_EMBEDDING_MODEL='text-embedding-ada-002'

-# EMBEDDING_ENGINE='azure'
-# AZURE_OPENAI_ENDPOINT=
-# AZURE_OPENAI_KEY=
-# EMBEDDING_MODEL_PREF='my-embedder-model' # This is the "deployment" on Azure you want to use for embeddings. Not the base model. Valid base model is text-embedding-ada-002
+# EMBEDDING_PROVIDER=azure
+# AZURE_OPENAI_EMBEDDING_ENDPOINT=https://-------.openai.azure.com/openai/deployments
+# AZURE_OPENAI_EMBEDDING_KEY=-
+# AZURE_OPENAI_EMBEDDING_MODEL='textembeddingada002' # This is the "deployment" on Azure you want to use for embeddings. Not the base model. Valid base model is text-embedding-ada-002
+# AZURE_OPENAI_EMBEDDING_API_VERSION=2023-09-15-preview

-# EMBEDDING_ENGINE='ollama'
+# EMBEDDING_PROVIDER='ollama'
 # EMBEDDING_BASE_PATH='http://host.docker.internal:11434'
-# EMBEDDING_MODEL_PREF='nomic-embed-text:latest'
+# EMBEDDING_MODEL='nomic-embed-text:latest'
 # EMBEDDING_MODEL_MAX_CHUNK_LENGTH=8192

-# EMBEDDING_ENGINE='bedrock'
-# AWS_BEDROCK_EMBEDDING_ACCESS_KEY_ID=
-# AWS_BEDROCK_EMBEDDING_ACCESS_KEY=
-# AWS_BEDROCK_EMBEDDING_REGION=us-west-2
-# AWS_BEDROCK_EMBEDDING_MODEL_PREF=amazon.embedding-embedding-ada-002:0
+EMBEDDING_PROVIDER='bedrock'
+AWS_BEDROCK_EMBEDDING_ACCESS_KEY_ID=--
+AWS_BEDROCK_EMBEDDING_SECRET_ACCESS_KEY=-/-+-+-
+AWS_BEDROCK_EMBEDDING_REGION=us-west-2
+AWS_BEDROCK_EMBEDDING_MODEL=amazon.titan-embed-text-v2:0

-# EMBEDDING_ENGINE='gemini'
+# EMBEDDING_PROVIDER='gemini'
 # GEMINI_EMBEDDING_API_KEY=
-# EMBEDDING_MODEL_PREF='text-embedding-004'
+# EMBEDDING_MODEL='text-embedding-004'

-# EMBEDDING_ENGINE='huggingface'
+# EMBEDDING_PROVIDER='huggingface'
 # HUGGING_FACE_EMBEDDING_REPO_ID=
 # HUGGING_FACE_EMBEDDING_MODEL=
 # HUGGING_FACE_EMBEDDING_API_TOKEN=
+
+DATAHUB_SERVER = 'http://-.-.-.-:-'
+
+
+###############################################
+######## Database Connector SELECTION #########
+###############################################
+
+# clickhouse
+# DB_TYPE=clickhouse
+# CLICKHOUSE_HOST=_._._._
+# CLICKHOUSE_PORT=9000
+# CLICKHOUSE_USER=_
+# CLICKHOUSE_PASSWORD=_
+# CLICKHOUSE_DATABASE=_
+
+# databricks
+# DB_TYPE=databricks
+# DATABRICKS_HOST=_
+# DATABRICKS_HTTP_PATH=_
+# DATABRICKS_ACCESS_TOKEN=_
+
+# duckdb
+# DB_TYPE=duckdb
+# DUCKDB_PATH=./data/duckdb.db
+
+# mariadb
+# DB_TYPE=mariadb
+# MARIADB_HOST=_
+# MARIADB_PORT=3306
+# MARIADB_USER=_
+# MARIADB_PASSWORD=_
+# MARIADB_DATABASE=_
+
+# mysql
+# DB_TYPE=mysql
+# MYSQL_HOST=_
+# MYSQL_PORT=3306
+# MYSQL_USER=_
+# MYSQL_PASSWORD=_
+# MYSQL_DATABASE=_
+
+# oracle
+# DB_TYPE=oracle
+# ORACLE_HOST=_
+# ORACLE_PORT=1521
+# ORACLE_USER=_
+# ORACLE_PASSWORD=_
+# ORACLE_DATABASE=_
+# ORACLE_SERVICE_NAME=_
+
+# postgres
+# DB_TYPE=postgres
+# POSTGRES_HOST=_
+# POSTGRES_PORT=5432
+# POSTGRES_USER=_
+# POSTGRES_PASSWORD=_
+# POSTGRES_DATABASE=_
+
+# snowflake
+# DB_TYPE=snowflake
+# SNOWFLAKE_USER=_
+# SNOWFLAKE_PASSWORD=_
+# SNOWFLAKE_ACCOUNT=_
+
+# sqlite
+# DB_TYPE=sqlite
+# SQLITE_PATH=./data/sqlite.db
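
The new connector section follows one convention: a DB_TYPE switch plus per-database prefixed variables (PREFIX_HOST, PREFIX_PORT, and so on). A minimal sketch of consuming them directly, assuming python-dotenv is installed and a .env derived from this example sits in the working directory (all values are placeholders):

    import os
    from dotenv import load_dotenv

    load_dotenv()  # pull .env into the process environment

    db_type = os.getenv("DB_TYPE", "postgres")  # e.g. "postgres"
    prefix = db_type.upper()
    host = os.getenv(f"{prefix}_HOST")
    port = os.getenv(f"{prefix}_PORT")
    print(db_type, host, port)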

db_utils/__init__.py

Lines changed: 131 additions & 0 deletions (new file)
from typing import Optional
import os
from .config import DBConfig
from .logger import logger

from dotenv import load_dotenv

from .base_connector import BaseConnector

from .clickhouse_connector import ClickHouseConnector
from .postgres_connector import PostgresConnector
from .mysql_connector import MySQLConnector
from .mariadb_connector import MariaDBConnector
from .oracle_connector import OracleConnector
from .duckdb_connector import DuckDBConnector
from .databricks_connector import DatabricksConnector
from .snowflake_connector import SnowflakeConnector

env_path = os.path.join(os.getcwd(), ".env")

if os.path.exists(env_path):
    load_dotenv(env_path, override=True)
    print(f"✅ Environment file (.env) loaded from {os.getcwd()}!")
else:
    print(f"⚠️ No environment file (.env) found in {os.getcwd()}!")


def get_db_connector(db_type: Optional[str] = None, config: Optional[DBConfig] = None):
    """
    Return the appropriate DB connector instance.
    - If db_type is not provided, loads from environment variable DB_TYPE
    - If config is not provided, loads from environment using db_type

    Parameters:
        db_type (Optional[str]): Database type (e.g., 'postgresql', 'mysql')
        config (Optional[DBConfig]): Connection config

    Returns:
        BaseConnector: Initialized DB connector instance

    Raises:
        ValueError: If type/config is missing or invalid
    """
    if db_type is None:
        db_type = os.getenv("DB_TYPE")
        if not db_type:
            raise ValueError(
                "DB type must be provided or set in environment as DB_TYPE."
            )

    db_type = db_type.lower()

    if config is None:
        config = load_config_from_env(db_type.upper())

    connector_map = {
        "clickhouse": ClickHouseConnector,
        "postgresql": PostgresConnector,
        "mysql": MySQLConnector,
        "mariadb": MariaDBConnector,
        "oracle": OracleConnector,
        "duckdb": DuckDBConnector,
        "databricks": DatabricksConnector,
        "snowflake": SnowflakeConnector,
    }

    if db_type not in connector_map:
        logger.error(f"Unsupported DB type: {db_type}")
        raise ValueError(f"Unsupported DB type: {db_type}")

    required_fields = {
        "oracle": ["extra.service_name"],
        "databricks": ["extra.http_path", "extra.access_token"],
        "snowflake": ["extra.account"],
    }

    missing = []
    for path in required_fields.get(db_type, []):
        cur = config
        for key in path.split("."):
            cur = cur.get(key) if isinstance(cur, dict) else None
            if cur is None:
                missing.append(path)
                break

    if missing:
        logger.error(f"Missing required fields for {db_type}: {', '.join(missing)}")
        raise ValueError(f"Missing required fields for {db_type}: {', '.join(missing)}")

    return connector_map[db_type](config)


def load_config_from_env(prefix: str) -> DBConfig:
    """
    Load DBConfig from environment variables with a given prefix.
    Standard keys are extracted; all other prefixed keys go to 'extra'.

    Example:
        If prefix = 'SNOWFLAKE', loads:
        - SNOWFLAKE_HOST
        - SNOWFLAKE_USER
        - SNOWFLAKE_PASSWORD
        - SNOWFLAKE_PORT
        - SNOWFLAKE_DATABASE
        Other keys like SNOWFLAKE_ACCOUNT, SNOWFLAKE_WAREHOUSE -> extra
    """
    base_keys = {"HOST", "PORT", "USER", "PASSWORD", "DATABASE"}

    # Extract standard values
    config = {
        "host": os.getenv(f"{prefix}_HOST"),
        "port": (
            int(os.getenv(f"{prefix}_PORT")) if os.getenv(f"{prefix}_PORT") else None
        ),
        "user": os.getenv(f"{prefix}_USER"),
        "password": os.getenv(f"{prefix}_PASSWORD"),
        "database": os.getenv(f"{prefix}_DATABASE"),
    }

    # Auto-detect extra keys
    extra = {}
    for key, value in os.environ.items():
        if key.startswith(f"{prefix}_"):
            suffix = key[len(f"{prefix}_") :]
            if suffix.upper() not in base_keys:
                extra[suffix.lower()] = value

    if extra:
        config["extra"] = extra

    return DBConfig(**config)
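
A quick usage sketch of the factory, assuming a .env populated from the example above (the query and the SNOWFLAKE_* variables are illustrative):

    from db_utils import get_db_connector, load_config_from_env

    # With DB_TYPE=clickhouse and CLICKHOUSE_* set in .env, no arguments are needed.
    connector = get_db_connector()
    df = connector.run_sql("SELECT 1 AS ok")
    print(df)

    # Prefixed keys beyond HOST/PORT/USER/PASSWORD/DATABASE are collected in "extra":
    # e.g. SNOWFLAKE_ACCOUNT becomes snowflake_config["extra"]["account"].
    snowflake_config = load_config_from_env("SNOWFLAKE")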

db_utils/base_connector.py

Lines changed: 28 additions & 0 deletions (new file)
from abc import ABC, abstractmethod
import pandas as pd


class BaseConnector(ABC):
    """
    Abstract base class for database connectors.
    """

    @abstractmethod
    def connect(self):
        """
        Initialize the database connection.
        """
        pass

    @abstractmethod
    def run_sql(self, sql: str) -> pd.DataFrame:
        """
        Returns the result of the SQL query as a pandas DataFrame.

        Parameters:
            sql (str): SQL query string to be executed.

        Returns:
            pd.DataFrame: Result of the SQL query as a pandas DataFrame.
        """
        pass
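
Any new connector only has to implement these two hooks. A minimal sketch of a hypothetical subclass (the SQLiteConnector name is illustrative and not part of this commit; the stdlib sqlite3 module stands in for a real driver):

    import sqlite3
    import pandas as pd

    from db_utils.base_connector import BaseConnector


    class SQLiteConnector(BaseConnector):
        """Hypothetical connector, for illustration only."""

        def __init__(self, path: str):
            self.path = path
            self.conn = None
            self.connect()

        def connect(self):
            # sqlite3 ships with the standard library, so no external driver is needed.
            self.conn = sqlite3.connect(self.path)

        def run_sql(self, sql: str) -> pd.DataFrame:
            if self.conn is None:
                self.connect()
            return pd.read_sql_query(sql, self.conn)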

db_utils/clickhouse_connector.py

Lines changed: 72 additions & 0 deletions (new file)
from .base_connector import BaseConnector
from clickhouse_driver import Client
import pandas as pd
from db_utils import DBConfig, logger


class ClickHouseConnector(BaseConnector):
    """
    Connect to ClickHouse and execute SQL queries.
    """

    client = None

    def __init__(self, config: DBConfig):
        """
        Initialize the ClickHouseConnector with connection parameters.

        Parameters:
            config (DBConfig): Configuration object containing connection parameters.
        """
        self.host = config["host"]
        self.port = config["port"]
        self.user = config["user"]
        self.password = config["password"]
        self.database = config["database"]
        self.connect()

    def connect(self) -> None:
        """
        Establish a connection to the ClickHouse server.
        """
        try:
            self.client = Client(
                host=self.host,
                port=self.port,
                user=self.user,
                password=self.password,
                database=self.database,
            )
            logger.info("Successfully connected to ClickHouse.")
        except Exception as e:
            logger.error(f"Failed to connect to ClickHouse: {e}")
            raise

    def run_sql(self, sql: str) -> pd.DataFrame:
        """
        Execute a SQL query and return the result as a pandas DataFrame.

        Parameters:
            sql (str): SQL query string to be executed.

        Returns:
            pd.DataFrame: Result of the SQL query as a pandas DataFrame.
        """
        if self.client is None:
            self.connect()

        try:
            result = self.client.query_dataframe(sql)
            return result
        except Exception as e:
            logger.error(f"Failed to execute SQL query: {e}")
            raise

    def close(self) -> None:
        """
        Close the connection to the ClickHouse server.
        """
        if self.client:
            self.client.disconnect()
            logger.info("Connection to ClickHouse closed.")  # info level: a clean shutdown is not an error
            self.client = None
db_utils/config.py

Lines changed: 11 additions & 0 deletions (new file)
1+
from typing import Optional, Dict, TypedDict
2+
3+
4+
class DBConfig(TypedDict):
5+
6+
host: str
7+
port: Optional[int]
8+
user: Optional[str]
9+
password: Optional[str]
10+
database: Optional[str]
11+
extra: Optional[Dict[str, str]]
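
Since DBConfig is a TypedDict, a plain dict literal satisfies it. A sketch with placeholder Oracle values, where service_name rides in extra to satisfy the required_fields check in db_utils/__init__.py:

    from db_utils.config import DBConfig

    oracle_config: DBConfig = {
        "host": "db.example.internal",  # placeholder
        "port": 1521,
        "user": "app_user",
        "password": "change-me",
        "database": "ORCL",
        "extra": {"service_name": "orclpdb1"},  # oracle requires extra.service_name
    }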
