Skip to content

Commit 0514873

Browse files
author
zhouang.za
committed
[sync] latest cli changes.
1 parent dfcc633 commit 0514873

17 files changed

+909
-202
lines changed

cli/database/create.py

+53-6
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,6 @@ def memory_statistics():
3434

3535
# 获取总内存大小(以字节为单位)
3636
total_memory = memory.total
37-
pod_memory_limit = get_pod_memory_limit()
38-
if pod_memory_limit != 0:
39-
total_memory = pod_memory_limit
4037

4138
# 格式化内存大小
4239
size_units = ["B", "KB", "MB", "GB", "TB"]
@@ -48,17 +45,65 @@ def memory_statistics():
4845
logging.info(f"final -Xmx is : {max(total_memory - 1, 6):.2f} {size_units[unit_index]}")
4946

5047

48+
def is_valid_regex(pattern):
    """Return True if *pattern* can be compiled as a regular expression.

    Args:
        pattern: Candidate regular expression, normally a string read from
            the extraction config.

    Returns:
        True when ``re.compile`` accepts the pattern, False otherwise.
    """
    try:
        re.compile(pattern)
    except (re.error, TypeError):
        # re.error: syntactically invalid pattern.
        # TypeError: the value is not a string at all (e.g. a number or
        # null coming from the JSON config); treat it as invalid instead of
        # letting the exception escape.
        return False
    return True
54+
55+
5156
def conf_option_deal(args):
5257
options = dict()
5358
if args.extraction_config_file:
5459
try:
5560
with open(args.extraction_config_file, "r") as f:
56-
options = json.load(f)
61+
extract_options = json.load(f)
62+
for conf in extract_options:
63+
language = conf["extractor"]
64+
# all 先不处理
65+
if language == "all":
66+
continue
67+
if language not in args.language:
68+
logging.error("%s language will not be extracted and the configuration is invalid", language)
69+
continue
70+
for option in conf["extractor_options"]:
71+
if "name" not in option:
72+
logging.error("option language error: please check name not in this conf : %s",
73+
json.dumps(option))
74+
return -1
75+
key = option["name"]
76+
if "value" not in option:
77+
logging.error("option value error: value not in this conf : %s", json.dumps(option))
78+
return -1
79+
if "config" not in option["value"]:
80+
logging.error("option config error:config not in this conf[\"value\"]: %s",
81+
json.dumps(option))
82+
return -1
83+
value = option["value"]["config"]
84+
if "pattern" in option["value"]:
85+
pattern = option["value"]["pattern"]
86+
if is_valid_regex(pattern):
87+
if re.search(pattern, value):
88+
logging.warning("option pattern error: this conf will be ignore: %s",
89+
json.dumps(option))
90+
continue
91+
else:
92+
logging.warning("option pattern error: this conf will be ignore: %s",
93+
json.dumps(option))
94+
continue
95+
if language not in options:
96+
options[language] = dict()
97+
if key in options[language]:
98+
logging.error("in %s extract, %s redefine", language, key)
99+
return -1
100+
options[language][key] = value
57101
except Exception as e:
58102
logging.error(e)
59103
return -1
60104
for language in args.language:
61-
options[language] = dict()
105+
if language not in options:
106+
options[language] = dict()
62107
if args.extraction_config:
63108
# 要求option必须是a.b=c的形式,a为语言名,若不是报错
64109
pattern = r'^(.+)\.(.+)\=(.+)$'
@@ -72,6 +117,9 @@ def conf_option_deal(args):
72117
if language not in args.language:
73118
logging.error("option language error: %s does not need to be extracted", language)
74119
return -1
120+
if key in options[language]:
121+
logging.error("in %s extract, %s redefine", language, key)
122+
return -1
75123
options[language][key] = value
76124
else:
77125
logging.error("option format error: %s, it need like java.a=b", tmp)
@@ -87,7 +135,6 @@ def database_create(args):
87135
if options == -1:
88136
logging.error("configuration error, Please check conf")
89137
raise ValueError("configuration error")
90-
memory_statistics()
91138
timeout = args.timeout
92139
extractor_fail = list()
93140
for language in args.language:

cli/extractor/extractor.py

+66-46
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22

33
import psutil
4-
4+
import shlex
55
from run.runner import Runner
66
from sparrow_schema.schema import sparrow
77

@@ -17,6 +17,7 @@ class Extractor:
1717
sql_extractor = ""
1818
swift_extractor = ""
1919
xml_extractor = ""
20+
arkts_extractor = ""
2021

2122
def __init__(self):
2223
Extractor.cfamily_extractor = sparrow.home / "language" / "cfamily" / "extractor" / "usr" / "bin" / "coref-cfamily-src-extractor"
@@ -28,6 +29,7 @@ def __init__(self):
2829
Extractor.sql_extractor = sparrow.home / "language" / "sql" / "extractor" / "coref-sql-src-extractor_deploy.jar"
2930
Extractor.swift_extractor = sparrow.home / "language" / "swift" / "extractor" / "usr" / "bin" / "coref-swift-src-extractor"
3031
Extractor.xml_extractor = sparrow.home / "language" / "xml" / "extractor" / "coref-xml-extractor_deploy.jar"
32+
Extractor.arkts_extractor = sparrow.home / "language" / "arkts" / "extractor" / "coref-arkts-src-extractor"
3133

3234

3335
def cfamily_extractor_cmd(source_root, database, options):
@@ -58,15 +60,19 @@ def go_extractor_cmd(source_root, database, options):
5860

5961
def java_extractor_cmd(source_root, database, options):
6062
cmd = list()
61-
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database)
63+
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database, options)
6264
if options:
65+
if "white-list" in options and "whiteList" in options:
66+
logging.error("white-list and whiteList cannot be configured at the same time")
67+
return -1
68+
if "cp" in options and "classpath" in options:
69+
logging.error("cp and classpath cannot be configured at the same time")
70+
return -1
6371
for (key, value) in options.items():
6472
if key == "white-list" or key == "whiteList":
65-
cmd += ["-w=", value]
66-
elif key == "cp":
67-
cmd += ["-cp=", value]
68-
elif key == "classpath":
69-
cmd += ["--classpath=", value]
73+
cmd += ["-w=" + value]
74+
elif key == "cp" or key == "classpath":
75+
cmd += ["-cp=" + value]
7076
elif key == "incremental":
7177
if value == "true":
7278
cmd += ["--incremental"]
@@ -80,8 +86,9 @@ def java_extractor_cmd(source_root, database, options):
8086
logging.warning("java.incremental does not take effect, please use java.incremental=true")
8187
else:
8288
if key != "cache-dir" and key != "commit" and key != "remote-cache-type" and \
83-
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix":
84-
logging.warning("unsupported config name:%s for java extractor.", key)
89+
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix" and \
90+
key != "jvm_opts":
91+
logging.warning("unsupported config name: %s for java extractor.", key)
8592
if "incremental" not in options or options["incremental"] != "true":
8693
cmd += ["--parallel"]
8794
return cmd
@@ -124,7 +131,7 @@ def javascript_extractor_cmd(source_root, database, options):
124131

125132

126133
def properties_extractor_cmd(source_root, database, options):
    """Build the command line for the properties extractor.

    Delegates to ``jar_extractor_cmd`` with the jar stored in
    ``Extractor.properties_extractor`` and returns the resulting list.
    """
    return jar_extractor_cmd(Extractor.properties_extractor, source_root, database, options)
129136

130137

@@ -136,13 +143,13 @@ def python_extractor_cmd(source_root, database, options):
136143

137144
def sql_extractor_cmd(source_root, database, options):
    """Build the command line for the SQL extractor.

    Args:
        source_root: Source directory handed to the extractor.
        database: Output database location handed to the extractor.
        options: Mapping of extractor options, or None/empty. The
            ``sql-dialect-type`` entry, when present, is forwarded as
            ``--sql-dialect-type <value>``.

    Returns:
        The argument list produced by ``jar_extractor_cmd`` plus the
        optional dialect arguments.
    """
    cmd = list(jar_extractor_cmd(Extractor.sql_extractor, source_root, database, options))
    # Guard against options being None/empty, as the other builders do with
    # ``if options:``; a bare ``in`` test on None raises TypeError.
    if options and "sql-dialect-type" in options:
        cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
    return cmd
143150

144151

145-
def swift_extractor(source_root, database, options):
152+
def swift_extractor_cmd(source_root, database, options):
146153
cmd = list()
147154
cmd += [str(Extractor.swift_extractor), str(source_root), str(database)]
148155
if options:
@@ -156,23 +163,56 @@ def swift_extractor(source_root, database, options):
156163

157164

158165
def xml_extractor_cmd(source_root, database, options):
    """Build the command line for the XML extractor.

    Delegates to ``jar_extractor_cmd`` with the jar stored in
    ``Extractor.xml_extractor`` and returns the resulting list.
    """
    return jar_extractor_cmd(Extractor.xml_extractor, source_root, database, options)
161168

162169

163-
def jar_extractor_cmd(extractor_path, source_root, database):
164-
# 获取内存信息
165-
mem = psutil.virtual_memory()
166-
total_memory = mem.total
167-
pod_memory_limit = get_pod_memory_limit()
168-
if pod_memory_limit != 0:
169-
total_memory = pod_memory_limit
170-
total_memory_gb = round(total_memory / (1024 ** 3))
171-
logging.info("current memory is : %s GB", total_memory_gb)
172-
xmx = max(total_memory_gb - 1, 6)
173-
logging.info("final -Xmx is: %s GB", xmx)
170+
def arkts_extractor_cmd(source_root, database, options):
    """Build the command line for the ArkTS extractor.

    Args:
        source_root: Source directory, passed after ``-s``.
        database: Database directory; must support ``/`` with a string
            (e.g. ``pathlib.Path``) because the output file
            ``coref_arkts_src.db`` is joined onto it.
        options: Mapping of extractor options, or None/empty.

    Returns:
        The argument list to execute. Unknown option names are reported
        with a warning and otherwise ignored.
    """
    cmd = [
        str(Extractor.arkts_extractor), "extract",
        "--extract-text", "-s", str(source_root),
        "-d", str(database / "coref_arkts_src.db"),
    ]
    if not options:
        return cmd

    for key, value in options.items():
        if key in ("blacklist", "b"):
            # Comma-separated list; each entry becomes its own argument.
            cmd += ["--blacklist"] + str(value).split(",")
        elif key == "use-gitignore":
            cmd += ["--use-gitignore"]
        elif key == "extract-text":
            # --extract-text is always part of the base command above, so
            # appending it again would only duplicate the flag.
            continue
        elif key == "extract-deps":
            cmd += ["--extract-deps"]
        elif key == "file-size-limit":
            # Values loaded from a JSON config may be numbers; every
            # command-line argument must be a string.
            cmd += ["--file-size-limit", str(value)]
        elif key == "paths":
            cmd += ["--paths", str(value)]
        else:
            logging.warning("unsupported config name:%s for arkts extractor.", key)
    return cmd
192+
193+
194+
def jar_extractor_cmd(extractor_path, source_root, database, options):
    """Build the ``java ... -jar`` command line for a jar-based extractor.

    Args:
        extractor_path: Path of the extractor jar.
        source_root: Source directory handed to the extractor.
        database: Output database location handed to the extractor.
        options: Mapping of extractor options, or None/empty. A non-empty
            ``jvm_opts`` entry is split shell-style and used verbatim as
            the JVM options.

    Returns:
        ``["java", <jvm options...>, "-jar", extractor, source_root, database]``.
        When no ``jvm_opts`` is configured, ``-Xmx`` is derived from the
        machine's total memory: capped at 32 GB, minus 1 GB, but never
        below 6 GB.
    """
    # jvm_opts from user specified extract config
    jvm_opts = options.get("jvm_opts") if options else None

    # if no jvm_opts from extract config, calculate xmx according to current memory.
    if not jvm_opts:
        detected_gb = round(psutil.virtual_memory().total / (1024 ** 3))
        capped_gb = min(detected_gb, 32)  # limit to 32G
        xmx = max(capped_gb - 1, 6)
        # Report the memory actually detected, not the capped figure.
        logging.info("current memory is: %s GB, will use xmx: %s GB.", detected_gb, xmx)
        jvm_opts = f"-Xmx{xmx}g"

    logging.info("extract jvm_opts is: %s .", jvm_opts)

    # str() keeps shlex.split working if the config supplied a non-string.
    cmd = ["java"] + shlex.split(str(jvm_opts)) + ["-jar", str(extractor_path)]
    cmd += [str(source_root), str(database)]
    return cmd
178218

@@ -190,26 +230,6 @@ def extractor_run(language, source_root, database, timeout, options):
190230
tmp = Runner(cmd, timeout)
191231
return tmp.subrun()
192232
else:
193-
logging.error("Not supported language: %s", language)
233+
logging.error("Failed to obtain the %s extractor", language)
194234
return -1
195235

196-
197-
def get_pod_memory_limit():
198-
# cgroup 文件系统路径
199-
memory_limit_path = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
200-
memory_limit = 0
201-
try:
202-
with open(memory_limit_path, 'r') as f:
203-
memory_limit = int(f.read().strip())
204-
except FileNotFoundError:
205-
pass
206-
except PermissionError:
207-
logging.error("Permission denied when accessing cgroup files.")
208-
except IOError as e:
209-
logging.error(f"IO error occurred when accessing cgroup files: {e}")
210-
except Exception as e:
211-
logging.error(f"An unexpected error occurred: {e}")
212-
return memory_limit
213-
214-
215-

cli/godel/godel_compiler.py

+39-23
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,25 @@
11
import logging
2-
import tempfile
3-
import time
4-
from pathlib import Path
2+
import re
3+
import chardet
54

65
from run.runner import Runner
76
from sparrow_schema.schema import sparrow
87

98

9+
def get_encoding(file_path):
    """Return the text encoding chardet detects for the file at *file_path*.

    The file is read in binary mode in full and the ``'encoding'`` entry of
    ``chardet.detect``'s result is returned.
    """
    with open(file_path, 'rb') as handle:
        raw_bytes = handle.read()
    detection = chardet.detect(raw_bytes)
    return detection['encoding']
13+
14+
1015
def godel_version_judge(path) -> str:
    """Decide which godel compiler version a script targets.

    Reads the first line of *path* (decoded with the encoding reported by
    ``get_encoding``). Returns ``"script"`` when that line starts with
    ``//`` followed by optional spaces/tabs and ``script``; otherwise
    returns ``"0.3"``. Any exception is logged and the default ``"0.3"`` is
    returned.
    """
    version = "0.3"
    try:
        with open(path, "r", encoding=get_encoding(path)) as script_file:
            first_line = script_file.readline()
        if re.match(r'//[ \t]*script', first_line):
            version = "script"
    except Exception as err:
        logging.error(f"godel version judge error: {str(err)}")
    return version
@@ -23,8 +28,8 @@ def godel_version_judge(path) -> str:
2328
def get_godel_compile(path):
    """Return the godel compiler matching the script at *path*.

    Uses ``godel_version_judge`` to pick ``sparrow.godel_0_3`` for version
    ``"0.3"`` or ``sparrow.godel_script`` for ``"script"``; returns an empty
    string for any other version.
    """
    version = godel_version_judge(path)
    if version == "0.3":
        return sparrow.godel_0_3
    if version == "script":
        return sparrow.godel_script
    return ""
@@ -35,7 +40,8 @@ def backend_execute(path, database, output, timeout, output_format, verbose):
3540
version = godel_version_judge(path)
3641
cmd = list()
3742
cmd += [str(godel), str(path), "--run-souffle-directly", "--package-path"]
38-
cmd += [str(sparrow.lib_1_0)]
43+
if version == "0.3":
44+
cmd += [str(sparrow.lib_03)]
3945
if database is not None:
4046
cmd += ["--souffle-fact-dir", database]
4147
cmd += ["--souffle-output-format", output_format, "--souffle-output-path", output]
@@ -45,23 +51,33 @@ def backend_execute(path, database, output, timeout, output_format, verbose):
4551
return tmp.subrun()
4652

4753

54+
def precompiled(path, timeout):
    """Run godel-script on *path* with ``--semantic-only``.

    Args:
        path: Script to check.
        timeout: Timeout handed to ``Runner``.

    Returns:
        True when the run's status is 0, False otherwise.
    """
    check_cmd = [
        str(sparrow.godel_script),
        "-p", str(sparrow.lib_script),
        "--semantic-only",
        str(path),
    ]
    return Runner(check_cmd, timeout).subrun() == 0
61+
62+
4863
def execute(path, database, output, timeout, output_format, verbose):
    """Run a godel query script and return the runner's status.

    Scripts judged as ``"script"`` by ``godel_version_judge`` are executed
    directly with the godel-script compiler; everything else is delegated
    to ``backend_execute``.

    Args:
        path: Script to run.
        database: Fact database location, or None to omit ``-f``.
        output: Output path passed after the output-format flag.
        timeout: Timeout handed to ``Runner``.
        output_format: ``"sqlite"`` or ``"csv"``; any other value selects
            JSON output.
        verbose: When truthy, ``--verbose`` is appended.

    Returns:
        Whatever ``Runner.subrun`` / ``backend_execute`` returns.
    """
    # Judge the version once; get_godel_compile would re-read the file only
    # to hand back sparrow.godel_script for a "script" version.
    if godel_version_judge(path) != "script":
        return backend_execute(path, database, output, timeout, output_format, verbose)

    # run godel-script directly
    cmd = [str(sparrow.godel_script), "-p", str(sparrow.lib_script)]
    if database is not None:
        # backend_execute treats the database as optional; a None entry in
        # the argument list cannot be executed.
        # NOTE(review): assumes godel-script can run without -f; confirm.
        cmd += ["-f", database]
    cmd += ["-Of", "-r", str(path)]

    if output_format == "sqlite":
        cmd += ["--output-sqlite"]
    elif output_format == "csv":
        cmd += ["--output-csv"]
    else:
        cmd += ["--output-json"]
    cmd += [output]

    if verbose:
        cmd += ["--verbose"]
    return Runner(cmd, timeout).subrun()

cli/package/__init__.py

Whitespace-only changes.

0 commit comments

Comments
 (0)