Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
202bce9
Merge pull request #768 from KnowledgeCaptureAndDiscovery/dev
dgarijo Apr 24, 2025
11ea957
now somef can detect codemeta.json and parse the values
Anas-Elhounsri Apr 25, 2025
5f98cb8
Merge remote-tracking branch 'upstream/dev' into dev
Anas-Elhounsri Apr 25, 2025
8ce4307
somef now detects and parses metadata of Cargo.toml
Anas-Elhounsri May 5, 2025
65b9843
SoMEF can now detect composer.json and parse the metadata
Anas-Elhounsri May 5, 2025
c43fba5
Merge branch 'dev' into dev
Anas-Elhounsri May 9, 2025
5dbb850
added tests and fixed issues for codemeta.json
Anas-Elhounsri May 19, 2025
3fb2e3e
Bump setuptools in /src/somef/test/test_data/repositories/inspect4py
dependabot[bot] May 19, 2025
45b7bda
Merge pull request #781 from KnowledgeCaptureAndDiscovery/dependabot/…
dgarijo May 19, 2025
4397c80
added the edited unittesting and deleted package parsers to be commit…
Anas-Elhounsri May 20, 2025
a4f328a
this commit deletes the packages
Anas-Elhounsri May 20, 2025
d20ba6c
Deleted tests for packages
Anas-Elhounsri May 20, 2025
0af850d
Adding tested packages for cargo.toml and composer.json
Anas-Elhounsri May 20, 2025
cdb8d6e
Added bower.json parser and unittesting
Anas-Elhounsri May 20, 2025
975f4b4
Merge branch 'KnowledgeCaptureAndDiscovery:master' into packages
Anas-Elhounsri May 21, 2025
8e4241a
Added .gemspec parser and the unittest
Anas-Elhounsri May 21, 2025
b7a46d9
Updated the formats recognized in documentation
Anas-Elhounsri May 26, 2025
71bc939
Updated the categories labeled by somef
Anas-Elhounsri May 26, 2025
7189fec
Added parser and test for .cabal package
Anas-Elhounsri May 26, 2025
f260b97
inital parser for DESCRIPTION
Anas-Elhounsri Jun 6, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,10 @@ Lib/*
Scripts/*
.idea/*
*.json
!**/codemeta.json
!**/composer.json
!**/package.json
!**/bower.json
*.ttl
env_3.9/*
src/somef/create_corpus_for_NER.py
Expand Down
18 changes: 14 additions & 4 deletions docs/output.md
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,11 @@ SOMEF aims to recognize the following categories (in alphabetical order):
- `type`: Software type: Commandline Application, Notebook Application, Ontology, Scientific Workflow. Non-Software types: Static Website, Uncategorized
- `usage`: Usage examples and considerations of a code repository.
- `workflows`: URL and path to the computational workflow files present in the repository.
- `homepage`: URL to the homepage of the software or organization.
- `reference_publication`: URL to the paper associated with the code repository.
- `package_id`: Identifier extracted from package files (e.g., `package.json`).
- `funding`: Funding code for the related project.
- `has_package_file`: Specifies what package file is present in the code repository.

The following table summarizes the properties used to describe a `category`:

Expand Down Expand Up @@ -210,10 +215,15 @@ The following formats for a result value are currently recognized:
- `docker_compose`: [orchestration file](https://docs.docker.com/compose/compose-file/) used to communicate multiple containers.
- `readthedocs`: documentation format used by many repositories in order to describe their projects.
- `wiki`: documentation format used in GitHub repositories.
- `setup.py`: package file format used in python projects
- `pyproject.toml`: package file format used in python projects
- `pom.xml`: package file used in Java projects
- `package.json`: package file used in Javascript projects
- `setup.py`: package file format used in python projects.
- `pyproject.toml`: package file format used in python projects.
- `pom.xml`: package file used in Java projects.
- `package.json`: package file used in Javascript projects.
- `bower.json`: package descriptor used for configuring packages that can be used as a dependency for Bower-managed front-end projects.
- `composer.json`: manifest file that serves as the package descriptor used in PHP projects.
- `Cargo.toml`: manifest file that serves as the package descriptor used in Rust projects.
- `[name].gemspec`: manifest file that serves as the package descriptor used in Ruby gem projects.


### Technique
The techniques can be of several types:
Expand Down
162 changes: 162 additions & 0 deletions src/somef/parser/bower_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,162 @@
import json
import logging
import os
from pathlib import Path
from ..process_results import Result
from ..utils import constants
from ..regular_expressions import detect_license_spdx

def _add_bower_dependencies(dependencies, dependency_type, metadata_result, source):
    """
    Record every entry of a bower dependency mapping as a requirement.

    Parameters
    ----------
    dependencies: value of the "dependencies"/"devDependencies" key; ignored unless it is a dict
    dependency_type: label stored under "dependency_type" ("runtime" or "dev")
    metadata_result: metadata object where the metadata dictionary is kept
    source: source of the package file (URL)
    """
    if not isinstance(dependencies, dict):
        return

    for name, version in dependencies.items():
        metadata_result.add_result(
            constants.CAT_REQUIREMENTS,
            {
                "value": f"{name}: {version}",
                "name": name,
                "version": version,
                "type": constants.SOFTWARE_APPLICATION,
                "dependency_type": dependency_type
            },
            1,
            constants.TECHNIQUE_CODE_CONFIG_PARSER,
            source
        )


def parse_bower_json_file(file_path, metadata_result: Result, source):
    """
    Parse a bower.json file and add the metadata it declares to metadata_result.

    Files whose name is not "bower.json" (case-insensitive) are ignored.
    The keys read are: name, description, homepage, version, authors,
    license, dependencies, devDependencies and keywords.

    Parameters
    ----------
    file_path: path of the bower file being analysed
    metadata_result: metadata object where the metadata dictionary is kept
    source: source of the package file (URL)

    Returns
    -------
    The metadata_result object that was passed in. Any error raised while
    reading or parsing the file is logged and not propagated.
    """
    try:
        if Path(file_path).name.lower() != "bower.json":
            return metadata_result

        def add(category, result):
            # Every result of this parser shares confidence, technique and source.
            metadata_result.add_result(
                category,
                result,
                1,
                constants.TECHNIQUE_CODE_CONFIG_PARSER,
                source
            )

        # The package file itself is reported before its content is read.
        add(
            constants.CAT_HAS_PACKAGE_FILE,
            {
                "value": "bower.json",
                "type": constants.URL,
            }
        )

        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # Valid JSON is not necessarily an object; the key lookups below need a dict.
        if not isinstance(data, dict):
            logging.error(
                "Error parsing bower.json from %s: %s",
                file_path,
                "top-level JSON value is not an object"
            )
            return metadata_result

        typed_fields = (
            ("name", constants.CAT_NAME, constants.STRING),
            ("description", constants.CAT_DESCRIPTION, constants.STRING),
            ("homepage", constants.CAT_HOMEPAGE, constants.URL),
            ("version", constants.CAT_VERSION, constants.RELEASE),
        )
        for key, category, value_type in typed_fields:
            if key in data:
                add(category, {"value": data[key], "type": value_type})

        if "authors" in data:
            # The authors value is stored as found, without a "type" entry.
            add(constants.CAT_AUTHORS, {"value": data["authors"]})

        if "license" in data:
            add(
                constants.CAT_LICENSE,
                {
                    "value": data["license"],
                    "type": constants.LICENSE
                }
            )

        _add_bower_dependencies(data.get("dependencies"), "runtime", metadata_result, source)
        _add_bower_dependencies(data.get("devDependencies"), "dev", metadata_result, source)

        keywords = data.get("keywords")
        if isinstance(keywords, str):
            # A bare string is one keyword; iterating it would yield single characters.
            keywords = [keywords]
        if isinstance(keywords, (list, tuple)):
            for keyword in keywords:
                add(
                    constants.CAT_KEYWORDS,
                    {
                        "value": keyword,
                        "type": constants.STRING
                    }
                )

    except Exception as e:
        logging.error("Error parsing bower.json from %s: %s", file_path, e)

    return metadata_result
168 changes: 168 additions & 0 deletions src/somef/parser/cabal_parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
import os
import re
import logging
from pathlib import Path
from ..process_results import Result
from ..regular_expressions import detect_license_spdx
from ..utils import constants

def _cabal_field(content, field):
    """
    Return the stripped value of the first `field:` entry in content, or None.

    The pattern is anchored at the start of a line (leading blanks allowed) so
    that a field name is never matched as the tail of a longer one: "version:"
    must not match "cabal-version:". Matching is case-insensitive. Whitespace
    after the colon may include a line break, so when the value starts on the
    line after the field name that first line is returned.
    """
    match = re.search(
        rf'^[ \t]*{re.escape(field)}:\s*(.*)$',
        content,
        re.IGNORECASE | re.MULTILINE
    )
    if match is None:
        return None
    return match.group(1).strip()


def _cabal_library_dependencies(content):
    """
    Return (name, version_constraint) pairs from build-depends of the library section.

    Only the first unnamed `library` section is inspected. The constraint is
    "any" when the entry carries no version information.
    """
    # The section runs until the next line that starts in the first column.
    library_section = re.search(
        r'^library[ \t]*\n(.*?)(?=\n\S|\Z)',
        content,
        re.DOTALL | re.IGNORECASE | re.MULTILINE
    )
    if library_section is None:
        return []

    # The field ends at the next "name:" line; names may contain hyphens
    # (hs-source-dirs, default-language, ...).
    build_depends = re.search(
        r'^[ \t]*build-depends:\s*(.*?)(?=\n[ \t]*[\w-]+:|\Z)',
        library_section.group(1),
        re.DOTALL | re.IGNORECASE | re.MULTILINE
    )
    if build_depends is None:
        return []

    dependencies = []
    for line in build_depends.group(1).split('\n'):
        line = line.strip()
        if line.startswith('--'):
            # Cabal line comment, not a dependency.
            continue
        for entry in line.split(','):
            entry = entry.strip()
            if not entry:
                continue
            dep_match = re.match(r'^([A-Za-z0-9_-]+)\s*(.*)$', entry)
            if dep_match:
                constraint = dep_match.group(2).strip() or "any"
                dependencies.append((dep_match.group(1), constraint))
    return dependencies


def parse_cabal_file(file_path, metadata_result: Result, source):
    """
    Parse a .cabal file and extract relevant metadata.

    Paths that do not end in ".cabal" are ignored. The fields read are name,
    version, description, homepage and license, plus the build-depends of the
    library section. When a license field is present, the text of a LICENSE,
    LICENSE.txt or LICENSE.md file next to the cabal file (first one found) is
    passed to detect_license_spdx to complete the license entry.

    Parameters
    ----------
    file_path: path of the cabal file being analysed
    metadata_result: metadata object where the metadata dictionary is kept
    source: source of the package file (URL)

    Returns
    -------
    The metadata_result object that was passed in. Any error raised while
    reading or parsing the file is logged and not propagated.
    """
    try:
        # str() so that pathlib.Path inputs are accepted as well as strings.
        if not str(file_path).endswith('.cabal'):
            return metadata_result

        def add(category, result):
            # Every result of this parser shares confidence, technique and source.
            metadata_result.add_result(
                category,
                result,
                1,
                constants.TECHNIQUE_CODE_CONFIG_PARSER,
                source
            )

        # The package file itself is reported before its content is read.
        add(
            constants.CAT_HAS_PACKAGE_FILE,
            {
                "value": Path(file_path).name,
                "type": constants.URL,
            }
        )

        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        simple_fields = (
            ("name", constants.CAT_PACKAGE_ID, constants.STRING),
            ("version", constants.CAT_VERSION, constants.STRING),
            ("description", constants.CAT_DESCRIPTION, constants.STRING),
            ("homepage", constants.CAT_HOMEPAGE, constants.URL),
        )
        for field, category, value_type in simple_fields:
            value = _cabal_field(content, field)
            if value:
                add(category, {"value": value, "type": value_type})

        license_value = _cabal_field(content, "license")
        if license_value:
            license_text = ""
            dir_path = os.path.dirname(file_path)

            for candidate in ("LICENSE", "LICENSE.txt", "LICENSE.md"):
                license_path = os.path.join(dir_path, candidate)
                if os.path.exists(license_path):
                    with open(license_path, "r", encoding="utf-8") as lf:
                        license_text = lf.read()
                    break

            # NOTE(review): the 'Ruby' argument is kept unchanged; confirm that
            # it selects the output shape wanted for cabal packages.
            license_info_spdx = detect_license_spdx(license_text, 'Ruby')

            if license_info_spdx:
                license_data = {
                    "value": license_value,
                    "spdx_id": license_info_spdx.get('spdx_id'),
                    "name": license_info_spdx.get('name'),
                    "type": constants.LICENSE
                }
            else:
                license_data = {
                    "value": license_value,
                    "type": constants.LICENSE
                }

            add(constants.CAT_LICENSE, license_data)

        for name, version_constraint in _cabal_library_dependencies(content):
            req = f"{name}: {version_constraint}" if version_constraint != "any" else name
            add(
                constants.CAT_REQUIREMENTS,
                {
                    "value": req,
                    "name": name,
                    "version": version_constraint,
                    "type": constants.SOFTWARE_APPLICATION,
                    "dependency_type": "runtime"
                }
            )

    except Exception as e:
        logging.error("Error parsing cabal file from %s: %s", file_path, e)

    return metadata_result
Loading