
[sync] latest cli changes. #79


Merged: 2 commits, Dec 9, 2024
59 changes: 53 additions & 6 deletions cli/database/create.py
@@ -34,9 +34,6 @@ def memory_statistics():

# Get the total memory size (in bytes)
total_memory = memory.total
pod_memory_limit = get_pod_memory_limit()
Collaborator:

It looks like content from an earlier open-source contribution was overwritten here; remember to sync it.

Collaborator (Author):

The open-source contributor's cgroup control code has been synced.
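For context, a minimal sketch of the idea behind the synced cgroup logic (the actual helper, get_pod_memory_limit, appears as removed code from extractor.py further down in this diff; the wrapper name below is hypothetical):

import psutil

def effective_total_memory():
    # Start from the host total reported by psutil.
    total_memory = psutil.virtual_memory().total
    # Prefer the cgroup v1 pod limit when one is set (0 means no limit was readable).
    pod_memory_limit = get_pod_memory_limit()  # reads /sys/fs/cgroup/memory/memory.limit_in_bytes
    if pod_memory_limit != 0:
        total_memory = pod_memory_limit
    return total_memory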

if pod_memory_limit != 0:
total_memory = pod_memory_limit

# Format the memory size
size_units = ["B", "KB", "MB", "GB", "TB"]
@@ -48,17 +45,65 @@ def memory_statistics():
logging.info(f"final -Xmx is : {max(total_memory - 1, 6):.2f} {size_units[unit_index]}")


def is_valid_regex(pattern):
try:
re.compile(pattern)
return True
except re.error:
return False


def conf_option_deal(args):
options = dict()
if args.extraction_config_file:
try:
with open(args.extraction_config_file, "r") as f:
options = json.load(f)
extract_options = json.load(f)
for conf in extract_options:
language = conf["extractor"]
# Skip "all" for now
if language == "all":
continue
if language not in args.language:
logging.error("%s language will not be extracted and the configuration is invalid", language)
continue
for option in conf["extractor_options"]:
if "name" not in option:
logging.error("option language error: please check name not in this conf : %s",
json.dumps(option))
return -1
key = option["name"]
if "value" not in option:
logging.error("option value error: value not in this conf : %s", json.dumps(option))
return -1
if "config" not in option["value"]:
logging.error("option config error:config not in this conf[\"value\"]: %s",
json.dumps(option))
return -1
value = option["value"]["config"]
if "pattern" in option["value"]:
pattern = option["value"]["pattern"]
if is_valid_regex(pattern):
if re.search(pattern, value):
logging.warning("option pattern error: this conf will be ignore: %s",
json.dumps(option))
continue
else:
logging.warning("option pattern error: this conf will be ignore: %s",
json.dumps(option))
continue
if language not in options:
options[language] = dict()
if key in options[language]:
logging.error("in %s extract, %s redefine", language, key)
return -1
options[language][key] = value
except Exception as e:
logging.error(e)
return -1
for language in args.language:
options[language] = dict()
if language not in options:
options[language] = dict()
if args.extraction_config:
# The option must be of the form a.b=c, where a is the language name; otherwise report an error
pattern = r'^(.+)\.(.+)\=(.+)$'
@@ -72,6 +117,9 @@ def conf_option_deal(args):
if language not in args.language:
logging.error("option language error: %s does not need to be extracted", language)
return -1
if key in options[language]:
logging.error("in %s extract, %s redefine", language, key)
return -1
options[language][key] = value
else:
logging.error("option format error: %s, it need like java.a=b", tmp)
@@ -87,7 +135,6 @@ def database_create(args):
if options == -1:
logging.error("configuration error, Please check conf")
raise ValueError("configuration error")
memory_statistics()
timeout = args.timeout
extractor_fail = list()
for language in args.language:
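To make the expected shape of the file read via args.extraction_config_file concrete, here is an illustrative payload consistent with the fields conf_option_deal reads above (shown as a Python literal; the real file is JSON, and the option names and values are hypothetical examples):

# Each entry targets one extractor; every option carries its value under
# value["config"], with an optional "pattern" regex used to validate it.
example_extraction_config = [
    {
        "extractor": "java",
        "extractor_options": [
            {"name": "jvm_opts", "value": {"config": "-Xmx8g"}},
            {"name": "incremental", "value": {"config": "true"}},
        ],
    }
]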
112 changes: 66 additions & 46 deletions cli/extractor/extractor.py
@@ -1,7 +1,7 @@
import logging

import psutil

import shlex
from run.runner import Runner
from sparrow_schema.schema import sparrow

@@ -17,6 +17,7 @@ class Extractor:
sql_extractor = ""
swift_extractor = ""
xml_extractor = ""
arkts_extractor = ""

def __init__(self):
Extractor.cfamily_extractor = sparrow.home / "language" / "cfamily" / "extractor" / "usr" / "bin" / "coref-cfamily-src-extractor"
@@ -28,6 +29,7 @@ def __init__(self):
Extractor.sql_extractor = sparrow.home / "language" / "sql" / "extractor" / "coref-sql-src-extractor_deploy.jar"
Extractor.swift_extractor = sparrow.home / "language" / "swift" / "extractor" / "usr" / "bin" / "coref-swift-src-extractor"
Extractor.xml_extractor = sparrow.home / "language" / "xml" / "extractor" / "coref-xml-extractor_deploy.jar"
Extractor.arkts_extractor = sparrow.home / "language" / "arkts" / "extractor" / "coref-arkts-src-extractor"


def cfamily_extractor_cmd(source_root, database, options):
@@ -58,15 +60,19 @@ def go_extractor_cmd(source_root, database, options):

def java_extractor_cmd(source_root, database, options):
cmd = list()
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database)
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database, options)
if options:
if "white-list" in options and "whiteList" in options:
logging.error("white-list and whiteList cannot be configured at the same time")
return -1
if "cp" in options and "classpath" in options:
logging.error("cp and classpath cannot be configured at the same time")
return -1
for (key, value) in options.items():
if key == "white-list" or key == "whiteList":
cmd += ["-w=", value]
elif key == "cp":
cmd += ["-cp=", value]
elif key == "classpath":
cmd += ["--classpath=", value]
cmd += ["-w=" + value]
elif key == "cp" or key == "classpath":
cmd += ["-cp=" + value]
elif key == "incremental":
if value == "true":
cmd += ["--incremental"]
@@ -80,8 +86,9 @@ def java_extractor_cmd(source_root, database, options):
logging.warning("java.incremental does not take effect, please use java.incremental=true")
else:
if key != "cache-dir" and key != "commit" and key != "remote-cache-type" and \
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix":
logging.warning("unsupported config name:%s for java extractor.", key)
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix" and \
key != "jvm_opts":
logging.warning("unsupported config name: %s for java extractor.", key)
if "incremental" not in options or options["incremental"] != "true":
cmd += ["--parallel"]
return cmd
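A quick illustration of the new single-token flag form (the call and paths below are hypothetical; the leading java/-Xmx portion comes from jar_extractor_cmd further down in this file):

# Hypothetical call; "-w=wl.txt" / "-cp=deps.jar" are now single argv tokens,
# replacing the old two-element ["-w=", value] style.
cmd = java_extractor_cmd("/src/app", "/out/db", {"whiteList": "wl.txt", "cp": "deps.jar"})
# Roughly: ["java", "-Xmx<N>g", "-jar", "<java extractor jar>", "/src/app", "/out/db",
#           "-w=wl.txt", "-cp=deps.jar", "--parallel"]
# ("<N>" depends on available memory; "--parallel" is appended because no
#  incremental=true option is present.)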
@@ -124,7 +131,7 @@ def javascript_extractor_cmd(source_root, database, options):


def properties_extractor_cmd(source_root, database, options):
cmd = jar_extractor_cmd(Extractor.properties_extractor, source_root, database)
cmd = jar_extractor_cmd(Extractor.properties_extractor, source_root, database, options)
return cmd


@@ -136,13 +143,13 @@ def python_extractor_cmd(source_root, database, options):

def sql_extractor_cmd(source_root, database, options):
cmd = list()
cmd += jar_extractor_cmd(Extractor.sql_extractor, source_root, database)
cmd += jar_extractor_cmd(Extractor.sql_extractor, source_root, database, options)
if "sql-dialect-type" in options:
cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
return cmd


def swift_extractor(source_root, database, options):
def swift_extractor_cmd(source_root, database, options):
cmd = list()
cmd += [str(Extractor.swift_extractor), str(source_root), str(database)]
if options:
@@ -156,23 +163,56 @@ def swift_extractor(source_root, database, options):


def xml_extractor_cmd(source_root, database, options):
cmd = jar_extractor_cmd(Extractor.xml_extractor, source_root, database)
cmd = jar_extractor_cmd(Extractor.xml_extractor, source_root, database, options)
return cmd


def jar_extractor_cmd(extractor_path, source_root, database):
# Get memory information
mem = psutil.virtual_memory()
total_memory = mem.total
pod_memory_limit = get_pod_memory_limit()
if pod_memory_limit != 0:
total_memory = pod_memory_limit
total_memory_gb = round(total_memory / (1024 ** 3))
logging.info("current memory is : %s GB", total_memory_gb)
xmx = max(total_memory_gb - 1, 6)
logging.info("final -Xmx is: %s GB", xmx)
def arkts_extractor_cmd(source_root, database, options):
cmd = list()
cmd += ["java", "-jar", "-Xmx" + str(xmx) + "g", str(extractor_path)]
cmd += [str(Extractor.arkts_extractor), "extract"] + \
["--extract-text", "-s", str(source_root)] + \
["-d", str(database / "coref_arkts_src.db")]
if options:
for (key, value) in options.items():
if key == "blacklist" or key == "b":
cmd += ["--blacklist"] + value.split(",")
elif key == "use-gitignore":
cmd += ["--use-gitignore"]
elif key == "extract-text":
cmd += ["--extract-text"]
elif key == "extract-deps":
cmd += ["--extract-deps"]
elif key == "file-size-limit":
cmd += ["--file-size-limit", value]
elif key == "paths":
cmd += ["--paths", value]
else:
logging.warning("unsupported config name:%s for arkts extractor.", key)
return cmd
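One behavior worth noting in the arkts handler above: a comma-separated blacklist expands into separate argv entries. A hypothetical example:

from pathlib import Path

cmd = arkts_extractor_cmd("/src/app", Path("/out/db"), {"blacklist": "test,mock"})
# Roughly: ["<coref-arkts-src-extractor>", "extract", "--extract-text", "-s", "/src/app",
#           "-d", "/out/db/coref_arkts_src.db", "--blacklist", "test", "mock"]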


def jar_extractor_cmd(extractor_path, source_root, database, options):
jvm_opts = None
if options:
for (key, value) in options.items():
if key == "jvm_opts":
# jvm_opts from user specified extract config
jvm_opts = value

# if no jvm_opts from extract config, calculate xmx according to current memory.
if not jvm_opts:
mem = psutil.virtual_memory()
total_memory = mem.total
total_memory_gb = round(total_memory / (1024 ** 3))
total_memory_gb = min(total_memory_gb, 32) # limit to 32G
xmx = max(total_memory_gb - 1, 6)
logging.info("current memory is: %s GB, will use xmx: %s GB.", total_memory_gb, xmx)
jvm_opts = f"-Xmx{xmx}g"

logging.info("extract jvm_opts is: %s .", jvm_opts)

cmd = list()
cmd += ["java"] + shlex.split(jvm_opts) + ["-jar", str(extractor_path)]
cmd += [str(source_root), str(database)]
return cmd
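The shlex.split call above is what lets a single jvm_opts string carry several JVM flags; a hypothetical example against the xml extractor path defined earlier:

cmd = jar_extractor_cmd(Extractor.xml_extractor, "/src/app", "/out/db",
                        {"jvm_opts": "-Xmx8g -XX:+UseG1GC"})
# Roughly: ["java", "-Xmx8g", "-XX:+UseG1GC", "-jar", "<coref-xml-extractor_deploy.jar>",
#           "/src/app", "/out/db"]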

@@ -190,26 +230,6 @@ def extractor_run(language, source_root, database, timeout, options):
tmp = Runner(cmd, timeout)
return tmp.subrun()
else:
logging.error("Not supported language: %s", language)
logging.error("Failed to obtain the %s extractor", language)
return -1


def get_pod_memory_limit():
# cgroup filesystem path
memory_limit_path = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
memory_limit = 0
try:
with open(memory_limit_path, 'r') as f:
memory_limit = int(f.read().strip())
except FileNotFoundError:
pass
except PermissionError:
logging.error("Permission denied when accessing cgroup files.")
except IOError as e:
logging.error(f"IO error occurred when accessing cgroup files: {e}")
except Exception as e:
logging.error(f"An unexpected error occurred: {e}")
return memory_limit



62 changes: 39 additions & 23 deletions cli/godel/godel_compiler.py
@@ -1,20 +1,25 @@
import logging
import tempfile
import time
from pathlib import Path
import re
import chardet

from run.runner import Runner
from sparrow_schema.schema import sparrow


def get_encoding(file_path):
with open(file_path, 'rb') as f:
result = chardet.detect(f.read())
return result['encoding']


def godel_version_judge(path) -> str:
# Determine which godel compiler version the script targets
result = "script"
result = "0.3"
try:
with open(path, "r") as f:
with open(path, "r", encoding=get_encoding(path)) as f:
tmp = f.readline()
if "1.0" in tmp:
result = "1.0"
if re.match(r'//[ \t]*script', tmp):
result = "script"
except Exception as e:
logging.error(f"godel version judge error: {str(e)}")
return result
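As a small illustration of the heuristic above (the file content is hypothetical; only the first line is inspected):

import tempfile

# A leading "// script" line selects the godel-script front end; a first line
# containing "1.0" selects "1.0"; anything else falls back to "0.3".
with tempfile.NamedTemporaryFile("w", suffix=".gdl", delete=False) as f:
    f.write("// script\n")
print(godel_version_judge(f.name))  # -> "script"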
@@ -23,8 +28,8 @@ def godel_version_judge(path) -> str:
def get_godel_compile(path):
version = godel_version_judge(path)
godel = ""
if version == "1.0":
godel = sparrow.godel_1_0
if version == "0.3":
godel = sparrow.godel_0_3
elif version == "script":
godel = sparrow.godel_script
return godel
@@ -35,7 +40,8 @@ def backend_execute(path, database, output, timeout, output_format, verbose):
version = godel_version_judge(path)
cmd = list()
cmd += [str(godel), str(path), "--run-souffle-directly", "--package-path"]
cmd += [str(sparrow.lib_1_0)]
if version == "0.3":
cmd += [str(sparrow.lib_03)]
if database is not None:
cmd += ["--souffle-fact-dir", database]
cmd += ["--souffle-output-format", output_format, "--souffle-output-path", output]
@@ -45,23 +51,33 @@
return tmp.subrun()


def precompiled(path, timeout):
cmd = [str(sparrow.godel_script), "-p", str(sparrow.lib_script), "--semantic-only", str(path)]
tmp = Runner(cmd, timeout)
status = tmp.subrun()
if status != 0:
return False
return True


def execute(path, database, output, timeout, output_format, verbose):
godel = get_godel_compile(path)
version = godel_version_judge(path)
cmd = list()
if version == "script":
# godel-script compiles in two steps; the actual executing backend is 1.0
with tempfile.NamedTemporaryFile(suffix='.gdl') as temp_file:
cmd += [str(godel), str(path), "-p", str(sparrow.lib_1_0), "-o", temp_file.name]
if verbose:
cmd += ["--verbose"]
tmp = Runner(cmd, timeout)
start_time = time.time()
return_code = tmp.subrun()
if return_code != 0:
logging.error("%s compile error, please check it yourself", str(path))
return -1
logging.info("godel-script compile time: %.2fs", time.time() - start_time)
return backend_execute(Path(temp_file.name), database, output, timeout, output_format, verbose)
# godel-script executes directly
cmd += [str(godel), "-p", str(sparrow.lib_script), "-f", database]
cmd += ["-Of", "-r", str(path)]
if output_format == "sqlite":
cmd += ["--output-sqlite"]
elif output_format == "csv":
cmd += ["--output-csv"]
else:
cmd += ["--output-json"]
cmd += [output]
if verbose:
cmd += ["--verbose"]
tmp = Runner(cmd, timeout)
return tmp.subrun()
else:
return backend_execute(path, database, output, timeout, output_format, verbose)
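To tie the new direct-execution branch together, a hypothetical invocation and the command it roughly assembles (assuming query.gdl starts with a "// script" line so the script branch is taken):

# Hypothetical run of a godel-script query against an extracted database, CSV output.
execute("query.gdl", "/out/db", "results.csv", timeout=600,
        output_format="csv", verbose=False)
# Roughly: ["<sparrow.godel_script>", "-p", "<sparrow.lib_script>", "-f", "/out/db",
#           "-Of", "-r", "query.gdl", "--output-csv", "results.csv"]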
Empty file added cli/package/__init__.py
Empty file.