Skip to content

Commit e4563e8

Browse files
authored
Merge pull request #39 from ScalefreeCOM/Fixed-conflicts
Fixed conflicts
2 parents 72453d4 + 65f00c3 commit e4563e8

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

77 files changed

+2641
-905
lines changed

Diff for: .gitignore

+4-2
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
/procs/sqlite3/__pycache__/
2+
config.ini
3+
dump.db
24
*.pyc
35
*.sty
46
/tvdb-venv/
5-
install_packages.bat
6-
config.ini
7+
/VenTurboVault/
8+
/models/

Diff for: backend/bigquery.py

+148
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
import os
2+
import sqlite3
3+
from datetime import datetime
4+
from google.oauth2 import service_account
5+
from google.cloud import bigquery
6+
from backend.procs.sqlite3 import sources
7+
from backend.procs.sqlite3 import generate_erd
8+
from backend.procs.sqlite3 import generate_selected_entities
9+
from backend.procs.sqlite3 import properties
10+
11+
class BigQuery:
    """Mirrors TurboVault metadata from Google BigQuery into an in-memory
    SQLite database and generates Datavault4dbt models from it.

    Expected kwargs:
        turboVaultconfigs: mapping with the BigQuery platform settings
            (model_path, project_id, credential_path, metadata_dataset,
            rdv_schema, stage_schema, hashdiff_naming).
        print2FeedbackConsole: callable used to report progress to the UI.
    """

    # Metadata tables copied 1:1 from the BigQuery dataset into SQLite.
    _METADATA_TABLES = (
        'source_data',
        'standard_hub',
        'standard_link',
        'standard_satellite',
        'pit',
        'non_historized_satellite',
        'non_historized_link',
        'multiactive_satellite',
        'ref_table',
        'ref_hub',
        'ref_sat',
    )

    def __init__(self, **kwargs):
        self.todo = []
        self.config = kwargs.get('turboVaultconfigs')
        # NOTE(review): path handling assumes Windows separators ('\\');
        # confirm this tool is only run on Windows.
        root = os.path.join(os.path.dirname(os.path.abspath(__file__)).split('\\procs\\sqlite3')[0])
        root = '\\'.join(root.split('\\')[0:-1])  # one step back from the root folder
        self.model_path = self.config.get('model_path')
        self.model_path = os.path.join(root, self.model_path.replace('../', '').replace('/', '\\'))
        self.project_id = self.config.get('project_id')
        self.credential_path = self.config.get('credential_path')
        self.metadata_dataset = self.config.get('metadata_dataset')
        # Shared state handed to every generator proc.
        self.data_structure = {
            'print2FeedbackConsole': kwargs.get('print2FeedbackConsole'),
            'console_outputs': True,
            'cursor': None,
            'source': None,
            'generated_timestamp': datetime.now().strftime("%Y%m%d%H%M%S"),
            'rdv_default_schema': self.config.get("rdv_schema"),
            'model_path': self.model_path,
            'hashdiff_naming': self.config.get('hashdiff_naming'),
            'stage_default_schema': self.config.get("stage_schema"),
            'source_list': None,
            'generateSources': False,
            'source_name': None,    # "Source" field splits into this field
            'source_object': None,  # "Source" field splits into this field
        }

    def setTODO(self, **kwargs):
        """Stores which artifacts to generate and for which selected sources."""
        self.SourceYML = kwargs.pop('SourceYML')
        self.todo = kwargs.pop('Tasks')
        self.DBDocs = kwargs.pop('DBDocs')
        self.Properties = kwargs.pop('Properties')
        self.selectedSources = kwargs.pop('Sources')

    def __initializeInMemoryDatabase(self):
        """Copies every metadata table from BigQuery into a fresh in-memory
        SQLite database and returns a cursor on it."""
        os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = self.credential_path
        credentials = service_account.Credentials.from_service_account_file(self.credential_path)
        bigquery_client = bigquery.Client(project=self.project_id, credentials=credentials)

        db = sqlite3.connect(':memory:')
        for table in self._METADATA_TABLES:
            # Backticks keep the query valid for dataset/table names with
            # special characters (previously only source_data was quoted).
            sql = f"SELECT * FROM `{self.metadata_dataset}.{table}`"
            df = bigquery_client.query(sql).to_dataframe()
            df.to_sql(table, db)
        return db.cursor()

    def read(self):
        """Fills data_structure with a cursor and the distinct
        '<SOURCE_SYSTEM>_<SOURCE_OBJECT>' names, then dumps the DB to disk."""
        self.data_structure['cursor'] = self.__initializeInMemoryDatabase()
        self.data_structure['cursor'].execute("SELECT DISTINCT SOURCE_SYSTEM || '_' || SOURCE_OBJECT FROM source_data")
        results = self.data_structure['cursor'].fetchall()
        self.data_structure['source_list'] = [row[0] for row in results]
        self.catchDatabase()

    def catchDatabase(self):
        """Persists the in-memory database to 'dump.db' and closes the cursor."""
        if os.path.exists('dump.db'):
            os.remove('dump.db')
        self.data_structure['cursor'].execute("vacuum main into 'dump.db'")
        self.data_structure['cursor'].close()

    def reloadDatabase(self):
        """Restores 'dump.db' into a new in-memory database, deletes the dump
        file, and returns a cursor on the restored database."""
        db = sqlite3.connect('dump.db')
        dest = sqlite3.connect(':memory:')
        db.backup(dest)
        db.close()
        os.remove('dump.db')
        return dest.cursor()

    def run(self):
        """Generates sources, the selected entities, properties and DB docs."""
        self.data_structure['cursor'] = self.reloadDatabase()
        if self.SourceYML:
            sources.gen_sources(self.data_structure)
        try:
            for selected in self.selectedSources:
                # '_.._' disambiguates the separator between system and object
                # from underscores inside the names.
                # NOTE(review): joining the tail with '' drops any further
                # underscores from the object name — confirm this is intended
                # (backend/db.py uses the '_*-*_' separator instead).
                self.data_structure['source'] = selected.replace('_', '_.._')
                seperatedNameAsList = self.data_structure['source'].split('_.._')
                self.data_structure['source_name'] = seperatedNameAsList[0]
                self.data_structure['source_object'] = ''.join(seperatedNameAsList[1:])
                generate_selected_entities.generate_selected_entities(self.todo, self.data_structure)
            if self.Properties:
                properties.gen_properties(self.data_structure)
            self.data_structure['print2FeedbackConsole'](message= 'Process successfully executed and models are ready to be used in Datavault 4dbt.')
        except Exception as e:
            # Bug fix: the previous handler hid the real failure behind a
            # fixed message; surface the underlying error as well.
            self.data_structure['print2FeedbackConsole'](message= 'No sources selected! (' + str(e) + ')')

        if self.DBDocs:
            generate_erd.generate_erd(self.data_structure['cursor'], self.selectedSources, self.data_structure['generated_timestamp'], self.data_structure['model_path'], self.data_structure['hashdiff_naming'])
        self.data_structure['cursor'].close()

Diff for: backend/config/config.py

+86
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
## TODO
2+
import os
3+
from configparser import ConfigParser
4+
5+
class MetadataInputConfig:
    """Parses config.ini and sorts each configured metadata source platform
    into valid (all required fields present) or invalid."""

    def __init__(self) -> None:
        # Platforms the tool knows how to read metadata from.
        self.supportedPlatforms: list = ['Excel', 'Google Sheets', 'Snowflake', 'BigQuery', 'db']
        # Required config.ini fields per platform section.
        self.configExpectedFields: dict = {
            'Snowflake': [
                'stage_schema',
                'rdv_schema',
                'hashdiff_naming',
                'model_path',
                'account_identifier',
                'database',
                'warehouse',
                'role',
                'meta_schema',
                'credential_path',
            ],
            'Google Sheets': [
                'stage_schema',
                'rdv_schema',
                'hashdiff_naming',
                'model_path',
                'sheet_url',
                'gcp_oauth_credentials',
                'source_database',
            ],
            'BigQuery': [
                'stage_schema',
                'rdv_schema',
                'metadata_dataset',
                'project_id',
                'hashdiff_naming',
                'model_path',
                'credential_path',
            ],
            'Excel': [
                'stage_schema',
                'rdv_schema',
                'hashdiff_naming',
                'model_path',
                'excel_path',
            ],
            'db': [
                'stage_schema',
                'rdv_schema',
                'hashdiff_naming',
                'model_path',
                'db_path',
            ]}
        self.data: dict = {
            'validSourcePlatforms': [],
            'invalidSourcePlatforms': [],
            'config': None,
        }
        self.read()
        try:
            self.validate()
        except Exception as err:
            # Bug fix: the previous bare 'except:' swallowed the actual error
            # (including KeyboardInterrupt); report what went wrong.
            print('Failed to validate the source platforms: ' + str(err))

    def read(self):
        """Parses the config.ini located next to this module.

        ConfigParser.read silently ignores a missing file, leaving an
        empty parser in self.data['config'].
        """
        config = ConfigParser()
        config.read(os.path.join(os.path.dirname(__file__), "config.ini"))
        self.data['config'] = config

    def validate(self):
        """Classifies every config section as a valid or invalid source
        platform, printing a message for each missing required field."""
        for key in self.data['config'].sections():
            if key not in self.configExpectedFields:
                self.data['invalidSourcePlatforms'].append(key)
                print('Invalid source platform: ' + key)
                continue
            missing = [field for field in self.configExpectedFields[key]
                       if field not in self.data['config'][key]]
            for field in missing:
                print('Expected field ' + field + ' in config.ini for ' + key + '!')
            if missing:
                self.data['invalidSourcePlatforms'].append(key)
            else:
                self.data['validSourcePlatforms'].append(key)
86+

Diff for: backend/db.py

+76
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
import os
2+
from backend.procs.sqlite3 import generate_selected_entities, sources, generate_erd
3+
from logging import Logger
4+
import sqlite3
5+
import pandas as pd
6+
from datetime import datetime
7+
import time
8+
from backend.procs.sqlite3 import properties
9+
image_path = os.path.join(os.path.dirname(__file__),"images")
10+
log = Logger('log')
11+
12+
class DB:
    """Reads TurboVault metadata from a SQLite database file and generates
    Datavault4dbt models from it.

    Expected kwargs:
        turboVaultconfigs: mapping with the 'db' platform settings
            (db_path, model_path, rdv_schema, stage_schema, hashdiff_naming).
        print2FeedbackConsole: callable used to report progress to the UI.
    """

    def __init__(self, **kwargs):
        self.todo = []
        self.config = kwargs.get('turboVaultconfigs')
        self.db_path = self.config.get('db_path')
        # Resolve a relative db_path against this file's directory.
        self.db_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), self.db_path))
        # NOTE(review): path handling assumes Windows separators ('\\');
        # confirm this tool is only run on Windows.
        root = os.path.join(os.path.dirname(os.path.abspath(__file__)).split('\\procs\\sqlite3')[0])
        root = '\\'.join(root.split('\\')[0:-1])  # one step back from the root folder
        self.model_path = self.config.get('model_path')
        self.model_path = os.path.join(root, self.model_path.replace('../', '').replace('/', '\\'))
        # Shared state handed to every generator proc.
        self.data_structure = {
            'print2FeedbackConsole': kwargs.get('print2FeedbackConsole'),
            'console_outputs': True,
            'cursor': None,
            'source': None,
            'generated_timestamp': datetime.now().strftime("%Y%m%d%H%M%S"),
            'rdv_default_schema': self.config.get("rdv_schema"),
            'model_path': self.model_path,
            'hashdiff_naming': self.config.get('hashdiff_naming'),
            'stage_default_schema': self.config.get("stage_schema"),
            'source_list': None,
            'generateSources': False,
            'source_name': None,    # "Source" field splits into this field
            'source_object': None,  # "Source" field splits into this field
        }

    def setTODO(self, **kwargs):
        """Stores which artifacts to generate and for which selected sources."""
        self.SourceYML = kwargs.pop('SourceYML')
        self.todo = kwargs.pop('Tasks')
        self.DBDocs = kwargs.pop('DBDocs')
        self.Properties = kwargs.pop('Properties')
        self.selectedSources = kwargs.pop('Sources')

    def __initializeInMemoryDatabase(self):
        """Opens the metadata SQLite file and returns a cursor on it."""
        db = sqlite3.connect(self.db_path)
        return db.cursor()

    def read(self):
        """Fills data_structure with a cursor and the distinct
        '<SOURCE_SYSTEM>_*-*_<SOURCE_OBJECT>' names."""
        self.data_structure['cursor'] = self.__initializeInMemoryDatabase()
        self.data_structure['cursor'].execute("SELECT DISTINCT SOURCE_SYSTEM || '_*-*_' || SOURCE_OBJECT FROM source_data")
        results = self.data_structure['cursor'].fetchall()
        self.data_structure['source_list'] = [row[0] for row in results]

    def run(self):
        """Generates sources, the selected entities, properties and DB docs."""
        self.read()
        if self.SourceYML:
            sources.gen_sources(self.data_structure)
        try:
            for selected in self.selectedSources:
                self.data_structure['source'] = selected
                # '_*-*_' unambiguously separates system from object, so
                # underscores inside either name survive the split.
                seperatedNameAsList = selected.split('_*-*_')
                self.data_structure['source_name'] = seperatedNameAsList[0]
                self.data_structure['source_object'] = ''.join(seperatedNameAsList[1:])
                generate_selected_entities.generate_selected_entities(self.todo, self.data_structure)
            if self.Properties:
                properties.gen_properties(self.data_structure)
            self.data_structure['print2FeedbackConsole'](message= 'Process successfully executed and models are ready to be used in Datavault 4dbt.')
        except Exception as e:
            # Bug fix: the previous handler hid the real failure behind a
            # fixed message; surface the underlying error as well.
            self.data_structure['print2FeedbackConsole'](message= 'No sources selected! (' + str(e) + ')')

        if self.DBDocs:
            generate_erd.generate_erd(self.data_structure['cursor'], self.selectedSources, self.data_structure['generated_timestamp'], self.data_structure['model_path'], self.data_structure['hashdiff_naming'])

0 commit comments

Comments
 (0)