MaD generator: use --threads=0 and 2GB per thread for --ram by default #19744

Status: Open — wants to merge 6 commits into main
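
With the new defaults, when `--codeql-threads` is not given the generator passes `--threads=0` to codeql (use all available cores), and when `--codeql-ram` is not given it budgets 2048 MB of `--ram` per thread. A minimal sketch of how such defaults could be resolved (the helper name and exact wiring below are assumptions for illustration, not the PR's actual code):

    import os

    # Hypothetical helper illustrating the new defaults; the name and wiring
    # are assumptions, not the PR's actual implementation.
    def resolve_codeql_resources(threads, ram_mb):
        # --threads=0 tells codeql to use all available cores.
        effective_threads = threads if threads is not None else 0
        if ram_mb is None:
            # 0 means "all cores", so size the RAM budget on the real core count.
            cores = effective_threads if effective_threads > 0 else (os.cpu_count() or 1)
            ram_mb = 2048 * cores  # 2 GB per thread
        return effective_threads, ram_mb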
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
rev: 25.1.0
hooks:
- id: black
files: ^(misc/codegen/.*|misc/scripts/models-as-data/bulk_generate_mad)\.py$
files: ^(misc/codegen/.*|misc/scripts/models-as-data/.*)\.py$

- repo: local
hooks:
24 changes: 14 additions & 10 deletions misc/scripts/models-as-data/bulk_generate_mad.py
@@ -116,9 +116,7 @@ def clone_project(project: Project) -> str:
return target_dir


def run_in_parallel[
T, U
](
def run_in_parallel[T, U](
func: Callable[[T], U],
items: List[T],
*,
@@ -238,13 +236,12 @@ def generate_models(config, args, project: Project, database_dir: str) -> None:
language = config["language"]

generator = mad.Generator(language)
# Note: The argument parser converts with-sinks to with_sinks, etc.
generator.generateSinks = should_generate_sinks(project)
generator.generateSources = should_generate_sources(project)
generator.generateSummaries = should_generate_summaries(project)
generator.setenvironment(database=database_dir, folder=name)
generator.threads = args.codeql_threads
generator.ram = args.codeql_ram
generator.setenvironment(database=database_dir, folder=name)
generator.run()


@@ -350,7 +347,7 @@ def download_dca_databases(
"""
print("\n=== Finding projects ===")
project_map = {project["name"]: project for project in projects}
analyzed_databases = {}
analyzed_databases = {n: None for n in project_map}
for experiment_name in experiment_names:
response = get_json_from_github(
f"https://raw.githubusercontent.com/github/codeql-dca-main/data/{experiment_name}/reports/downloads.json",
@@ -363,17 +360,24 @@
artifact_name = analyzed_database["artifact_name"]
pretty_name = pretty_name_from_artifact_name(artifact_name)

if not pretty_name in project_map:
if not pretty_name in analyzed_databases:
print(f"Skipping {pretty_name} as it is not in the list of projects")
continue

if pretty_name in analyzed_databases:
if analyzed_databases[pretty_name] is not None:
print(
f"Skipping previous database {analyzed_databases[pretty_name]['artifact_name']} for {pretty_name}"
)

analyzed_databases[pretty_name] = analyzed_database

not_found = [name for name, db in analyzed_databases.items() if db is None]
if not_found:
print(
f"ERROR: The following projects were not found in the DCA experiments: {', '.join(not_found)}"
)
sys.exit(1)

def download_and_decompress(analyzed_database: dict) -> str:
artifact_name = analyzed_database["artifact_name"]
repository = analyzed_database["repository"]
@@ -516,7 +520,7 @@ def main(config, args) -> None:
"--dca",
type=str,
help="Name of a DCA run that built all the projects. Can be repeated, with sources taken from all provided runs, "
"the last provided ones having priority",
"the last provided ones having priority",
action="append",
)
parser.add_argument(
@@ -527,7 +531,7 @@
parser.add_argument(
"--codeql-ram",
type=int,
help="What `--ram` value to pass to `codeql` while generating models (by default the flag is not passed)",
help="What `--ram` value to pass to `codeql` while generating models (by default 2048 MB per thread)",
default=None,
)
parser.add_argument(
59 changes: 41 additions & 18 deletions misc/scripts/models-as-data/convert_extensions.py
@@ -7,65 +7,86 @@
import sys
import tempfile


def quote_if_needed(v):
# string columns
if type(v) is str:
return "\"" + v + "\""
return '"' + v + '"'
# bool column
return str(v)


def parseData(data):
rows = [{ }, { }]
rows = [{}, {}]
for row in data:
d = map(quote_if_needed, row)
provenance = row[-1]
targetRows = rows[1] if provenance.endswith("generated") else rows[0]
helpers.insert_update(targetRows, row[0], " - [" + ', '.join(d) + ']\n')
helpers.insert_update(targetRows, row[0], " - [" + ", ".join(d) + "]\n")

return rows


class Converter:
def __init__(self, language, dbDir):
self.language = language
self.dbDir = dbDir
self.codeQlRoot = subprocess.check_output(["git", "rev-parse", "--show-toplevel"]).decode("utf-8").strip()
self.codeQlRoot = (
subprocess.check_output(["git", "rev-parse", "--show-toplevel"])
.decode("utf-8")
.strip()
)
self.extDir = os.path.join(self.codeQlRoot, f"{self.language}/ql/lib/ext/")
self.dirname = "modelconverter"
self.modelFileExtension = ".model.yml"
self.workDir = tempfile.mkdtemp()


def runQuery(self, query):
print('########## Querying: ', query)
queryFile = os.path.join(self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query)
print("########## Querying: ", query)
queryFile = os.path.join(
self.codeQlRoot, f"{self.language}/ql/src/utils/{self.dirname}", query
)
resultBqrs = os.path.join(self.workDir, "out.bqrs")

helpers.run_cmd(['codeql', 'query', 'run', queryFile, '--database', self.dbDir, '--output', resultBqrs], "Failed to generate " + query)
helpers.run_cmd(
[
"codeql",
"query",
"run",
queryFile,
"--database",
self.dbDir,
"--output",
resultBqrs,
],
"Failed to generate " + query,
)
return helpers.readData(self.workDir, resultBqrs)


def asAddsTo(self, rows, predicate):
extensions = [{ }, { }]
extensions = [{}, {}]
for i in range(2):
for key in rows[i]:
extensions[i][key] = helpers.addsToTemplate.format(f"codeql/{self.language}-all", predicate, rows[i][key])

return extensions
extensions[i][key] = helpers.addsToTemplate.format(
f"codeql/{self.language}-all", predicate, rows[i][key]
)

return extensions

def getAddsTo(self, query, predicate):
data = self.runQuery(query)
rows = parseData(data)
return self.asAddsTo(rows, predicate)


def makeContent(self):
summaries = self.getAddsTo("ExtractSummaries.ql", helpers.summaryModelPredicate)
sources = self.getAddsTo("ExtractSources.ql", helpers.sourceModelPredicate)
sinks = self.getAddsTo("ExtractSinks.ql", helpers.sinkModelPredicate)
neutrals = self.getAddsTo("ExtractNeutrals.ql", helpers.neutralModelPredicate)
return [helpers.merge(sources[0], sinks[0], summaries[0], neutrals[0]), helpers.merge(sources[1], sinks[1], summaries[1], neutrals[1])]

return [
helpers.merge(sources[0], sinks[0], summaries[0], neutrals[0]),
helpers.merge(sources[1], sinks[1], summaries[1], neutrals[1]),
]

def save(self, extensions):
# Create directory if it doesn't exist
@@ -77,9 +98,11 @@ def save(self, extensions):
for entry in extensions[0]:
with open(self.extDir + "/" + entry + self.modelFileExtension, "w") as f:
f.write(extensionTemplate.format(extensions[0][entry]))

for entry in extensions[1]:
with open(self.extDir + "/generated/" + entry + self.modelFileExtension, "w") as f:
with open(
self.extDir + "/generated/" + entry + self.modelFileExtension, "w"
) as f:
f.write(extensionTemplate.format(extensions[1][entry]))

def run(self):