
[sync] latest cli changes. #79


Merged: 2 commits, Dec 9, 2024
59 changes: 53 additions & 6 deletions cli/database/create.py
@@ -34,9 +34,6 @@ def memory_statistics():

# Get the total memory size (in bytes)
total_memory = memory.total
pod_memory_limit = get_pod_memory_limit()
Collaborator:

It looks like content from an earlier open-source contribution was overwritten here; remember to sync it.

Collaborator (Author):

The open-source contributor's cgroup control code has been synced.
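For context, a minimal sketch of the idea behind the synced cgroup logic (the actual helper, get_pod_memory_limit, appears as removed code from extractor.py further down in this diff; the wrapper name below is hypothetical):

import psutil

def effective_total_memory():
    # Start from the host total reported by psutil.
    total_memory = psutil.virtual_memory().total
    # Prefer the cgroup v1 pod limit when one is set (0 means no limit was readable).
    pod_memory_limit = get_pod_memory_limit()  # reads /sys/fs/cgroup/memory/memory.limit_in_bytes
    if pod_memory_limit != 0:
        total_memory = pod_memory_limit
    return total_memory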

if pod_memory_limit != 0:
total_memory = pod_memory_limit

# Format the memory size
size_units = ["B", "KB", "MB", "GB", "TB"]
@@ -48,17 +45,65 @@ def memory_statistics():
logging.info(f"final -Xmx is : {max(total_memory - 1, 6):.2f} {size_units[unit_index]}")


def is_valid_regex(pattern):
try:
re.compile(pattern)
return True
except re.error:
return False


def conf_option_deal(args):
options = dict()
if args.extraction_config_file:
try:
with open(args.extraction_config_file, "r") as f:
options = json.load(f)
extract_options = json.load(f)
for conf in extract_options:
language = conf["extractor"]
# Skip "all" for now
if language == "all":
continue
if language not in args.language:
logging.error("%s language will not be extracted and the configuration is invalid", language)
continue
for option in conf["extractor_options"]:
if "name" not in option:
logging.error("option language error: please check name not in this conf : %s",
json.dumps(option))
return -1
key = option["name"]
if "value" not in option:
logging.error("option value error: value not in this conf : %s", json.dumps(option))
return -1
if "config" not in option["value"]:
logging.error("option config error:config not in this conf[\"value\"]: %s",
json.dumps(option))
return -1
value = option["value"]["config"]
if "pattern" in option["value"]:
pattern = option["value"]["pattern"]
if is_valid_regex(pattern):
if re.search(pattern, value):
logging.warning("option pattern error: this conf will be ignore: %s",
json.dumps(option))
continue
else:
logging.warning("option pattern error: this conf will be ignore: %s",
json.dumps(option))
continue
if language not in options:
options[language] = dict()
if key in options[language]:
logging.error("in %s extract, %s redefine", language, key)
return -1
options[language][key] = value
except Exception as e:
logging.error(e)
return -1
for language in args.language:
options[language] = dict()
if language not in options:
options[language] = dict()
if args.extraction_config:
# The option must be of the form a.b=c, where a is the language name; otherwise report an error
pattern = r'^(.+)\.(.+)\=(.+)$'
@@ -72,6 +117,9 @@ def conf_option_deal(args):
if language not in args.language:
logging.error("option language error: %s does not need to be extracted", language)
return -1
if key in options[language]:
logging.error("in %s extract, %s redefine", language, key)
return -1
options[language][key] = value
else:
logging.error("option format error: %s, it need like java.a=b", tmp)
@@ -87,7 +135,6 @@ def database_create(args):
if options == -1:
logging.error("configuration error, Please check conf")
raise ValueError("configuration error")
memory_statistics()
timeout = args.timeout
extractor_fail = list()
for language in args.language:
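To make the expected shape of the file read via args.extraction_config_file concrete, here is an illustrative payload consistent with the fields conf_option_deal reads above (shown as a Python literal; the real file is JSON, and the option names and values are hypothetical examples):

# Each entry targets one extractor; every option carries its value under
# value["config"], with an optional "pattern" regex used to validate it.
example_extraction_config = [
    {
        "extractor": "java",
        "extractor_options": [
            {"name": "jvm_opts", "value": {"config": "-Xmx8g"}},
            {"name": "incremental", "value": {"config": "true"}},
        ],
    }
]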
112 changes: 66 additions & 46 deletions cli/extractor/extractor.py
@@ -1,7 +1,7 @@
import logging

import psutil

import shlex
from run.runner import Runner
from sparrow_schema.schema import sparrow

@@ -17,6 +17,7 @@ class Extractor:
sql_extractor = ""
swift_extractor = ""
xml_extractor = ""
arkts_extractor = ""

def __init__(self):
Extractor.cfamily_extractor = sparrow.home / "language" / "cfamily" / "extractor" / "usr" / "bin" / "coref-cfamily-src-extractor"
@@ -28,6 +29,7 @@ def __init__(self):
Extractor.sql_extractor = sparrow.home / "language" / "sql" / "extractor" / "coref-sql-src-extractor_deploy.jar"
Extractor.swift_extractor = sparrow.home / "language" / "swift" / "extractor" / "usr" / "bin" / "coref-swift-src-extractor"
Extractor.xml_extractor = sparrow.home / "language" / "xml" / "extractor" / "coref-xml-extractor_deploy.jar"
Extractor.arkts_extractor = sparrow.home / "language" / "arkts" / "extractor" / "coref-arkts-src-extractor"


def cfamily_extractor_cmd(source_root, database, options):
@@ -58,15 +60,19 @@ def go_extractor_cmd(source_root, database, options):

def java_extractor_cmd(source_root, database, options):
cmd = list()
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database)
cmd += jar_extractor_cmd(Extractor.java_extractor, source_root, database, options)
if options:
if "white-list" in options and "whiteList" in options:
logging.error("white-list and whiteList cannot be configured at the same time")
return -1
if "cp" in options and "classpath" in options:
logging.error("cp and classpath cannot be configured at the same time")
return -1
for (key, value) in options.items():
if key == "white-list" or key == "whiteList":
cmd += ["-w=", value]
elif key == "cp":
cmd += ["-cp=", value]
elif key == "classpath":
cmd += ["--classpath=", value]
cmd += ["-w=" + value]
elif key == "cp" or key == "classpath":
cmd += ["-cp=" + value]
elif key == "incremental":
if value == "true":
cmd += ["--incremental"]
@@ -80,8 +86,9 @@ def java_extractor_cmd(source_root, database, options):
logging.warning("java.incremental does not take effect, please use java.incremental=true")
else:
if key != "cache-dir" and key != "commit" and key != "remote-cache-type" and \
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix":
logging.warning("unsupported config name:%s for java extractor.", key)
key != "oss-bucket" and key != "oss-config-file" and key != "oss-path-prefix" and \
key != "jvm_opts":
logging.warning("unsupported config name: %s for java extractor.", key)
if "incremental" not in options or options["incremental"] != "true":
cmd += ["--parallel"]
return cmd
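A quick illustration of the new single-token flag form (the call and paths below are hypothetical; the leading java/-Xmx portion comes from jar_extractor_cmd further down in this file):

# Hypothetical call; "-w=wl.txt" / "-cp=deps.jar" are now single argv tokens,
# replacing the old two-element ["-w=", value] style.
cmd = java_extractor_cmd("/src/app", "/out/db", {"whiteList": "wl.txt", "cp": "deps.jar"})
# Roughly: ["java", "-Xmx<N>g", "-jar", "<java extractor jar>", "/src/app", "/out/db",
#           "-w=wl.txt", "-cp=deps.jar", "--parallel"]
# ("<N>" depends on available memory; "--parallel" is appended because no
#  incremental=true option is present.)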
@@ -124,7 +131,7 @@ def javascript_extractor_cmd(source_root, database, options):


def properties_extractor_cmd(source_root, database, options):
cmd = jar_extractor_cmd(Extractor.properties_extractor, source_root, database)
cmd = jar_extractor_cmd(Extractor.properties_extractor, source_root, database, options)
return cmd


@@ -136,13 +143,13 @@ def python_extractor_cmd(source_root, database, options):

def sql_extractor_cmd(source_root, database, options):
cmd = list()
cmd += jar_extractor_cmd(Extractor.sql_extractor, source_root, database)
cmd += jar_extractor_cmd(Extractor.sql_extractor, source_root, database, options)
if "sql-dialect-type" in options:
cmd += ["--sql-dialect-type", options["sql-dialect-type"]]
return cmd


def swift_extractor(source_root, database, options):
def swift_extractor_cmd(source_root, database, options):
cmd = list()
cmd += [str(Extractor.swift_extractor), str(source_root), str(database)]
if options:
@@ -156,23 +163,56 @@ def swift_extractor(source_root, database, options):


def xml_extractor_cmd(source_root, database, options):
cmd = jar_extractor_cmd(Extractor.xml_extractor, source_root, database)
cmd = jar_extractor_cmd(Extractor.xml_extractor, source_root, database, options)
return cmd


def jar_extractor_cmd(extractor_path, source_root, database):
# Get memory information
mem = psutil.virtual_memory()
total_memory = mem.total
pod_memory_limit = get_pod_memory_limit()
if pod_memory_limit != 0:
total_memory = pod_memory_limit
total_memory_gb = round(total_memory / (1024 ** 3))
logging.info("current memory is : %s GB", total_memory_gb)
xmx = max(total_memory_gb - 1, 6)
logging.info("final -Xmx is: %s GB", xmx)
def arkts_extractor_cmd(source_root, database, options):
cmd = list()
cmd += ["java", "-jar", "-Xmx" + str(xmx) + "g", str(extractor_path)]
cmd += [str(Extractor.arkts_extractor), "extract"] + \
["--extract-text", "-s", str(source_root)] + \
["-d", str(database / "coref_arkts_src.db")]
if options:
for (key, value) in options.items():
if key == "blacklist" or key == "b":
cmd += ["--blacklist"] + value.split(",")
elif key == "use-gitignore":
cmd += ["--use-gitignore"]
elif key == "extract-text":
cmd += ["--extract-text"]
elif key == "extract-deps":
cmd += ["--extract-deps"]
elif key == "file-size-limit":
cmd += ["--file-size-limit", value]
elif key == "paths":
cmd += ["--paths", value]
else:
logging.warning("unsupported config name:%s for arkts extractor.", key)
return cmd
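One behavior worth noting in the arkts handler above: a comma-separated blacklist expands into separate argv entries. A hypothetical example:

from pathlib import Path

cmd = arkts_extractor_cmd("/src/app", Path("/out/db"), {"blacklist": "test,mock"})
# Roughly: ["<coref-arkts-src-extractor>", "extract", "--extract-text", "-s", "/src/app",
#           "-d", "/out/db/coref_arkts_src.db", "--blacklist", "test", "mock"]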


def jar_extractor_cmd(extractor_path, source_root, database, options):
jvm_opts = None
if options:
for (key, value) in options.items():
if key == "jvm_opts":
# jvm_opts from user specified extract config
jvm_opts = value

# if no jvm_opts from extract config, calculate xmx according to current memory.
if not jvm_opts:
mem = psutil.virtual_memory()
total_memory = mem.total
total_memory_gb = round(total_memory / (1024 ** 3))
total_memory_gb = min(total_memory_gb, 32) # limit to 32G
xmx = max(total_memory_gb - 1, 6)
logging.info("current memory is: %s GB, will use xmx: %s GB.", total_memory_gb, xmx)
jvm_opts = f"-Xmx{xmx}g"

logging.info("extract jvm_opts is: %s .", jvm_opts)

cmd = list()
cmd += ["java"] + shlex.split(jvm_opts) + ["-jar", str(extractor_path)]
cmd += [str(source_root), str(database)]
return cmd
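The shlex.split call above is what lets a single jvm_opts string carry several JVM flags; a hypothetical example against the xml extractor path defined earlier:

cmd = jar_extractor_cmd(Extractor.xml_extractor, "/src/app", "/out/db",
                        {"jvm_opts": "-Xmx8g -XX:+UseG1GC"})
# Roughly: ["java", "-Xmx8g", "-XX:+UseG1GC", "-jar", "<coref-xml-extractor_deploy.jar>",
#           "/src/app", "/out/db"]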

@@ -190,26 +230,6 @@ def extractor_run(language, source_root, database, timeout, options):
tmp = Runner(cmd, timeout)
return tmp.subrun()
else:
logging.error("Not supported language: %s", language)
logging.error("Failed to obtain the %s extractor", language)
return -1


def get_pod_memory_limit():
# cgroup filesystem path
memory_limit_path = "/sys/fs/cgroup/memory/memory.limit_in_bytes"
memory_limit = 0
try:
with open(memory_limit_path, 'r') as f:
memory_limit = int(f.read().strip())
except FileNotFoundError:
pass
except PermissionError:
logging.error("Permission denied when accessing cgroup files.")
except IOError as e:
logging.error(f"IO error occurred when accessing cgroup files: {e}")
except Exception as e:
logging.error(f"An unexpected error occurred: {e}")
return memory_limit



62 changes: 39 additions & 23 deletions cli/godel/godel_compiler.py
@@ -1,20 +1,25 @@
import logging
import tempfile
import time
from pathlib import Path
import re
import chardet

from run.runner import Runner
from sparrow_schema.schema import sparrow


def get_encoding(file_path):
with open(file_path, 'rb') as f:
result = chardet.detect(f.read())
return result['encoding']


def godel_version_judge(path) -> str:
# Determine which godel compiler version the script targets
result = "script"
result = "0.3"
try:
with open(path, "r") as f:
with open(path, "r", encoding=get_encoding(path)) as f:
tmp = f.readline()
if "1.0" in tmp:
result = "1.0"
if re.match(r'//[ \t]*script', tmp):
result = "script"
except Exception as e:
logging.error(f"godel version judge error: {str(e)}")
return result
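As a small illustration of the heuristic above (the file content is hypothetical; only the first line is inspected):

import tempfile

# A leading "// script" line selects the godel-script front end; a first line
# containing "1.0" selects "1.0"; anything else falls back to "0.3".
with tempfile.NamedTemporaryFile("w", suffix=".gdl", delete=False) as f:
    f.write("// script\n")
print(godel_version_judge(f.name))  # -> "script"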
@@ -23,8 +28,8 @@ def godel_version_judge(path) -> str:
def get_godel_compile(path):
version = godel_version_judge(path)
godel = ""
if version == "1.0":
godel = sparrow.godel_1_0
if version == "0.3":
godel = sparrow.godel_0_3
elif version == "script":
godel = sparrow.godel_script
return godel
@@ -35,7 +40,8 @@ def backend_execute(path, database, output, timeout, output_format, verbose):
version = godel_version_judge(path)
cmd = list()
cmd += [str(godel), str(path), "--run-souffle-directly", "--package-path"]
cmd += [str(sparrow.lib_1_0)]
if version == "0.3":
cmd += [str(sparrow.lib_03)]
if database is not None:
cmd += ["--souffle-fact-dir", database]
cmd += ["--souffle-output-format", output_format, "--souffle-output-path", output]
@@ -45,23 +51,33 @@
return tmp.subrun()


def precompiled(path, timeout):
cmd = [str(sparrow.godel_script), "-p", str(sparrow.lib_script), "--semantic-only", str(path)]
tmp = Runner(cmd, timeout)
status = tmp.subrun()
if status != 0:
return False
return True


def execute(path, database, output, timeout, output_format, verbose):
godel = get_godel_compile(path)
version = godel_version_judge(path)
cmd = list()
if version == "script":
# godel-script compiles in two steps; the actual executing backend is 1.0
with tempfile.NamedTemporaryFile(suffix='.gdl') as temp_file:
cmd += [str(godel), str(path), "-p", str(sparrow.lib_1_0), "-o", temp_file.name]
if verbose:
cmd += ["--verbose"]
tmp = Runner(cmd, timeout)
start_time = time.time()
return_code = tmp.subrun()
if return_code != 0:
logging.error("%s compile error, please check it yourself", str(path))
return -1
logging.info("godel-script compile time: %.2fs", time.time() - start_time)
return backend_execute(Path(temp_file.name), database, output, timeout, output_format, verbose)
# godel-script executes directly
cmd += [str(godel), "-p", str(sparrow.lib_script), "-f", database]
cmd += ["-Of", "-r", str(path)]
if output_format == "sqlite":
cmd += ["--output-sqlite"]
elif output_format == "csv":
cmd += ["--output-csv"]
else:
cmd += ["--output-json"]
cmd += [output]
if verbose:
cmd += ["--verbose"]
tmp = Runner(cmd, timeout)
return tmp.subrun()
else:
return backend_execute(path, database, output, timeout, output_format, verbose)
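To tie the new direct-execution branch together, a hypothetical invocation and the command it roughly assembles (assuming query.gdl starts with a "// script" line so the script branch is taken):

# Hypothetical run of a godel-script query against an extracted database, CSV output.
execute("query.gdl", "/out/db", "results.csv", timeout=600,
        output_format="csv", verbose=False)
# Roughly: ["<sparrow.godel_script>", "-p", "<sparrow.lib_script>", "-f", "/out/db",
#           "-Of", "-r", "query.gdl", "--output-csv", "results.csv"]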
Empty file added cli/package/__init__.py
Empty file.