-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparse_project_materializations.py
132 lines (104 loc) · 5.88 KB
/
parse_project_materializations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# This script parses all DBT project folders found in root directory (see -r parameter) and generates report on materialization type used by each sql file.
# Reprot is palced in two output files of same content. One in xlsx format another in pkl under same filename (see -o parameter) Both parameters are optional,
# if not specified, default values are used.
#
# Use -h or --help option for help.
#
# Composed by Leonid Tepliakov, [email protected]
import os
import yaml
import glob
import re
import pandas as pd
import argparse
sep=os.sep #set path separator for current environment e.g ""/" for linux, "\"" for windows
# Parse input arguments
parser = argparse.ArgumentParser(
# prog='ProgramName',
description='Parses DBT projects in root folder and makes report on kind of materialization for each sql file. Report is placed in two files of same content in pkl and xlsx format.')
# epilog='Text at the bottom of help')
parser.add_argument('-r','--root', help="root directory where all project folders reside. Default current work directory. ")
parser.add_argument('-o','--out', help='Output file name. Default "DBT_Materializations"')
args=parser.parse_args()
if args.out:
outputFileName=args.out
else:
outputFileName="DBT_Materializations"
start_dir=os.getcwd()
if args.root:
root=os.path.normpath(start_dir+sep+args.root)
else:
root=os.getcwd()
# parse dbt_project.yml files in all projects folders found in root directory and build two dictionaries:
# modelPathDefaultMaterialisations for default materializations on project path level for each project as defined in model parameter of dbt_project.yml
# projectPaths all model paths defined in model-paths parameter of dbt_project.yml
os.chdir(root)
projectDefaultMaterialisations={}
projectPaths={}
dirFiles=os.listdir(".")
#print (type(modelPathDefaultMaterialisations))
for name in dirFiles: # loop by files in root directory
# print(name,os.path.isdir(name))
if os.path.isdir(name) and os. path. exists(name+sep+"dbt_project.yml"): # if file is a folder and contains dbt_project.yml - it is a project folder
# print (name+' is project')
with open(name+sep+'dbt_project.yml', 'r') as file:
projectParameters=yaml.safe_load(file)
if 'models' in projectParameters: # if dbt_project.yml contains "models" parameter, parse it and append results to dictionary
if projectParameters['models']:
modelPathDefaultMaterialisations={}
if name in projectParameters['models']:
# print (projectParameters['models'][name])
if projectParameters['models'][name]:
for k in projectParameters['models'][name].keys():
# print (k,'-',projectParameters['models'][name][k]['+materialized'])
modelPathDefaultMaterialisations[k]=projectParameters['models'][name][k]['+materialized']
projectDefaultMaterialisations[name]=modelPathDefaultMaterialisations
if 'model-paths' in projectParameters: # if dbt_project.yml contains "model-paths" parameter, parse it and append results to dictionary
if projectParameters['model-paths']:
projectPaths[name]=projectParameters['model-paths']
else:
projectPaths[name]='models'
else:
projectPaths[name]='models'
regMaterialzation=re.compile(r"materialized='([a-z]+)'") # compile regex to extract materialization from sql source
df = pd.DataFrame(columns=['Project Name', 'Object Type', 'File Path', 'File Name', 'File Extension','Materialized As']) #create empty dataframe
os.chdir(root)
globalMtr='veiw'
for project in projectPaths.keys(): # loop by projects
for pth in projectPaths[project]: # loop by model paths inside the project
# print(project+sep+pth)
if os.path.isdir(project+sep+pth): # if path defined in yml file does not really exist - ignore it
projectPathMtr=globalMtr
projectPathOrig='global default'
# if default materializatoin is set in yml model use this value as default for this path
if project in projectDefaultMaterialisations.keys() \
and pth in projectDefaultMaterialisations[project].keys() \
and projectDefaultMaterialisations[project][pth]:
projectPathMtr=projectDefaultMaterialisations[project][pth]
projectPathOrig='project default'
os.chdir(project+sep+pth) # dive into model path folder and get the list of *.sql files
l=glob.glob('**'+sep+'*.sql', recursive=True)
for fl in l: # loop by files in model path folder
mtr=projectPathMtr
orig=projectPathOrig
with open(fl, 'r') as sqlFile:
sqlText=sqlFile.read()
materialization=regMaterialzation.findall(sqlText) # look if materialization is defined in the sql source
if len(materialization)>0:
mtr=materialization[0]
orig='SQL source'
# compose new row for result dataframe
new_row={'Project Name':project,\
'Object Type':pth,\
'File Path':os.path.split(fl)[0],\
'File Name':os.path.splitext(os.path.split(fl)[1])[0],\
'File Extension':os.path.splitext(fl)[1].replace('.',''),\
'Materialized As':mtr,\
'Origin':orig,\
}
# print(project,' ',pth,' ',fl)
df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
os.chdir(root)
os.chdir(start_dir)
df.to_pickle(outputFileName+".pkl")
df.to_excel(outputFileName+".xlsx",index=False)