-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathutilities.py
222 lines (187 loc) · 7.67 KB
/
utilities.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
# some functions to use for find-snippets, pr-report, and merge-report
# function to read the CODEOWNERS file from the azureml-examples repo
# and return only the lines that are docs files.
def read_codeowners():
    """Return the docs-related lines of the azureml-examples CODEOWNERS file.

    Downloads ``.github/CODEOWNERS`` from the ``main`` branch of
    Azure/azureml-examples and returns the lines that sit between a line
    starting with ``#### files`` and the first line starting with
    ``# End of docs`` (both marker lines excluded).

    Returns:
        list[str]: the selected lines. The list is empty when the end marker
        is not present in the file.

    Raises:
        requests.exceptions.RequestException: on connection problems or when
        the server does not answer within the timeout.
    """
    # requests is imported here so the module can be loaded without it.
    import requests

    url = "https://raw.githubusercontent.com/Azure/azureml-examples/main/.github/CODEOWNERS"
    # Without a timeout a stalled connection would block the report forever.
    response = requests.get(url, timeout=30)
    contents = response.text.splitlines()

    # Locate the markers that delimit the docs section.
    start_index = end_index = 0
    for i, line in enumerate(contents):
        if line.startswith("#### files"):
            start_index = i
        if line.startswith("# End of docs"):
            end_index = i
            break
    return contents[start_index + 1 : end_index]
# function to get the changes for a specific file in a PR.
# Then searches for notebook cells or code snippets that were added/deleted.
# Returns a tuple with a boolean for whether the file is a notebook,
# a list of added cells, a list of deleted cells, and the blob_url passed in.
def find_changes(thisfile, prfiles, blob_url):
    """Find named notebook cells or snippet markers added/deleted in a PR file.

    Args:
        thisfile: file name to look up in ``prfiles``.
        prfiles: iterable of dicts describing the files of a PR; each has a
            ``"filename"`` key and usually a ``"patch"`` key.
        blob_url: passed straight back so the caller can preview the file in
            the report.

    Returns:
        tuple: ``(nb, adds, deletes, blob_url)`` where ``nb`` is True when
        ``thisfile`` ends with ``.ipynb``, ``adds`` lists the cell names (or
        ``# <tag>`` markers) on added lines and ``deletes`` those on removed
        lines.
    """
    import re

    # An entry may have no patch text (e.g. binary files or very large
    # diffs); treat that as an empty patch instead of raising KeyError.
    patch = [
        file.get("patch") or "" for file in prfiles if file["filename"] == thisfile
    ]

    # The patterns run against str(patch), i.e. the repr of the list, in which
    # every newline shows up as the two characters backslash + "n".
    nb_cell = (
        r'(\\n[\+-])\s*"name":\s*"([^"]*)"'  # finds added or deleted cells with a name
    )
    code_cell = (
        r"(\\n[\+-])\s*(#\s*<[^>]*>)"  # finds lines that start with # <> or # </>
    )
    # only works for files that use # as comment.

    nb = thisfile.endswith(".ipynb")
    matches = re.findall(nb_cell if nb else code_cell, str(patch))

    adds = []
    deletes = []
    for marker, label in matches:
        if marker == "\\n+":
            adds.append(label)
        elif marker == "\\n-":
            deletes.append(label)
        else:
            # Defensive only: both patterns capture nothing but "+" or "-".
            print(
                "ERROR in utilities.py find_changes. The match was not an add or delete."
            )
    return (nb, adds, deletes, blob_url)
# function to clean up the matches
# syntax of a match is different if it is from a notebook vs. code files.
# returns info about the match:
# path to the file in azureml-examples
# file name of the file being referenced
# branch used to find the file(i.e., azureml-examples-main)
# match - the full match
# name - the query string that follows "?" in the reference (e.g. name=<cell name>), or "" if there is none
def cleanup_matches(match):
    """Split a docs snippet reference into its parts.

    Args:
        match: the raw reference text, e.g.
            ``(~/azureml-examples-main/sdk/x.ipynb?name=cell)`` or
            ``source="~/azureml-examples-main/cli/job.yml"``.

    Returns:
        tuple: ``(path, ref_file, branch, match, name)`` where
            * ``path`` is the directory inside the examples repo ("" for the
              repo root),
            * ``ref_file`` is the file path inside the examples repo,
            * ``branch`` is the first path segment after ``~/``,
            * ``match`` is the cleaned-up reference,
            * ``name`` is whatever follows ``?`` in the file name, or "".
    """
    import os

    # If match starts and ends with parentheses, remove them
    if match.startswith("(") and match.endswith(")"):
        match = match[1:-1]
    match = match.replace('"', "").replace(",", "").replace("source=", "")

    # split up the match into parts here.
    path = os.path.dirname(match)
    ref_file = os.path.basename(match)

    # the first part of the path, after ~/, is the "path-to-root" which includes the branch name
    # path-to-root is configured in azure-ai-docs-pr/.openpublishing.publish.config.json
    segments = path.split("/")
    # A reference directly under "~" has no branch segment; don't raise IndexError.
    branch = segments[1] if len(segments) > 1 else ""

    # Remove only the LEADING "~/" and "<branch>/". A blanket str.replace would
    # also delete later occurrences of those strings inside the path.
    if path == "~":
        path = ""
    elif path.startswith("~/"):
        path = path[2:]
    if path == branch:
        path = ""
    elif branch and path.startswith(f"{branch}/"):
        path = path[len(branch) + 1 :]

    if "?" in ref_file:  # split out the id name from the ref_file if it exists
        ref_file, name = ref_file.split("?", 1)
    else:
        name = ""

    if path != "":  # if the path is empty, we don't want a beginning slash.
        ref_file = f"{path}/{ref_file}"  # add the path to the ref_file
    # get rid of triple or double slashes
    ref_file = ref_file.replace("///", "/").replace("//", "/")

    # right now, not using match and name. But might in the future
    return (path, ref_file, branch, match, name)
# function to read local file - try utf-8 first, then latin-1
def read_file(file_path):
    """Read a local text file and return its lines.

    The file is decoded as UTF-8 first. If that raises UnicodeDecodeError the
    read is retried as latin-1; any error during the retry is printed and an
    empty list is returned. Other errors from the first attempt propagate.
    """

    def _slurp(encoding):
        # One read attempt with the given encoding.
        with open(file_path, "r", encoding=encoding) as handle:
            return handle.readlines()

    try:
        return _slurp("utf-8")
    except UnicodeDecodeError:
        try:
            return _slurp("latin-1")
        except Exception as e:
            print(f"Error reading {file_path}: {e}")
            return []
def read_snippets(fn):
    """Load the snippets CSV that sits next to this module.

    ``fn`` is joined onto the directory containing this file. When the
    resulting path does not exist, a hint is printed and the process exits
    via ``sys.exit()``; otherwise the CSV is returned as a pandas DataFrame.
    """
    import os
    import sys
    import pandas as pd

    module_dir = os.path.dirname(os.path.abspath(__file__))
    snippet_fn = os.path.join(module_dir, fn)

    # Bail out early when the snippets file has not been generated yet.
    if not os.path.exists(snippet_fn):
        print(f"{snippet_fn} does not exist.")
        print("Run 'find-snippets.py' to create the file.")
        sys.exit()

    return pd.read_csv(snippet_fn)
# function to compare a file on two branches of a repo
def compare_branches(repo, file, branch1, branch2):
    """Print whether ``file`` is identical on ``branch1`` and ``branch2``.

    Each version is fetched with ``repo.get_contents(file, ref=<branch>)``.
    If either fetch raises, a message is printed and the function returns.
    Otherwise the two ``sha`` attributes are compared and the outcome is
    printed. Nothing is returned.
    """
    try:
        first_version = repo.get_contents(file, ref=branch1)
    except Exception:
        print(f"Can't compare branches; {file} no longer found in {branch1}")
        return None

    try:
        second_version = repo.get_contents(file, ref=branch2)
    except Exception:
        print(f"Can't compare branches; {file} no longer found in {branch2}")
        return None

    if first_version.sha != second_version.sha:
        print(
            f"*azureml-examples {branch2} branch has a DIFFERENT version of this file from {branch1}\n"
        )
        return None

    print(
        f"*azureml-examples {branch2} branch has the same version of this file as {branch1}\n"
    )
    return None
# call for each line in the file, send in current info and get back updated values
def count_code_lines(line, blocks, inside_code_block, count, code_type):
    """Advance the fenced-code-block counter by one line.

    Call once per line, feeding back the values returned by the previous
    call. A line whose first non-blank characters are three backticks toggles
    the block state: an opening fence records the text after the backticks as
    the code type and resets the count, a closing fence appends
    ``(code_type, count)`` to ``blocks``. Every other line adds one to
    ``count``.

    Returns:
        tuple: ``(blocks, inside_code_block, count, code_type)``, updated.
    """
    stripped = line.lstrip()

    # Ordinary line: just count it.
    if not stripped.startswith("```"):
        return blocks, inside_code_block, count + 1, code_type

    # Closing fence: record the finished block.
    if inside_code_block:
        blocks.append((code_type, count))
        return blocks, False, count, code_type

    # Opening fence: remember the language and start counting from zero.
    return blocks, True, 0, stripped[3:].strip()
def find_snippets(line, branches, az_ml_branch, file, dict_list=None):
    """Collect azureml-examples snippet references found in one line of text.

    Args:
        line: the text to scan.
        branches: list that receives the branch of every reference found.
        az_ml_branch: only references on this branch are recorded as rows.
        file: name of the file ``line`` came from; stored as ``from_file``.
        dict_list: optional list that receives one
            ``{"ref_file": ..., "from_file": ...}`` dict per recorded
            reference. When omitted, a module-level ``dict_list`` is used
            (and created on first use), so rows accumulate across calls.

    Returns:
        list: the list the rows were appended to.
    """
    # The module has no top-level ``import re``; import it here.
    import re

    if dict_list is None:
        # Previously this name was an undefined global (NameError). Keep the
        # module-level accumulator semantics but make sure the list exists.
        dict_list = globals().setdefault("dict_list", [])

    pattern = r'\(~\/azureml-examples[^)]*\)|source="~\/azureml-examples[^"]*"'
    for found in re.findall(pattern, line):
        _, ref_file, branch, _, _ = cleanup_matches(found)
        branches.append(branch)
        # PRs are merged into main, so only these files are relevant
        if branch == az_ml_branch:
            dict_list.append({"ref_file": ref_file, "from_file": file})
    return dict_list
# get all contents from the path and all sub-directories
def get_all_contents(repo, path, repo_branch):
    """Return every non-directory entry under ``path``, walking sub-directories.

    Args:
        repo: object exposing ``get_contents(path, ref=...)`` whose results
            have ``type`` and ``path`` attributes (presumably a PyGithub
            repository; confirm against the caller).
        path: starting path inside the repository.
        repo_branch: branch (ref) to read from.

    Returns:
        list: the entries whose ``type`` is not ``"dir"``. Directories whose
        path contains the text ``media`` are neither descended into nor
        returned.
    """
    contents = []
    stack = [path]
    while stack:
        current_path = stack.pop()
        current_contents = repo.get_contents(current_path, ref=repo_branch)
        # A path that names a single file yields one object, not a list.
        if not isinstance(current_contents, list):
            current_contents = [current_contents]
        for content in current_contents:
            if content.type != "dir":
                contents.append(content)
            elif "media" not in content.path:
                stack.append(content.path)
            # else: a media directory - skipped entirely. It used to leak into
            # the results as a directory entry.
    return contents
if __name__ == "__main__":
    # Manual check: run cleanup_matches on a reference whose file name
    # contains spaces, a dot and parentheses, then show the parsed parts.
    problem = "~/azureml-examples-main/sdk/python/featurestore_sample/notebooks/sdk_only/7. Develop a feature set using Domain Specific Language (DSL).ipynb?name=setup-root-dir"
    path, ref_file, *_unused = cleanup_matches(problem)
    print(f"path: {path}, ref_file: {ref_file}")