Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(SemanticSearch): Added a new semantic search agent that uses fuzzy string mathcing and levenshtein distance. #103

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
208 changes: 208 additions & 0 deletions atarashi/agents/semanticSearch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Copyright 2024 Abdelrahman Jamal ([email protected])

SPDX-License-Identifier: GPL-2.0

This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
import argparse
import os
import nirjas
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz


from atarashi.agents.atarashiAgent import AtarashiAgent, exactMatcher

__author__ = "Abdelrahman Jamal"
__email__ = "[email protected]"

class SemanticSearchAgent(AtarashiAgent):
"""
An agent that performs semantic search to identify potential licenses within files.
"""

def __init__(self, licenseList):
super().__init__(licenseList)
Comment on lines +39 to +40
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def __init__(self, licenseList):
super().__init__(licenseList)
def __init__(self, licenseList, verbose=0):
super().__init__(licenseList)

If verbose type output is planned, The input flag for verbose is defined but not passed. Prone to throw error


def extract_comments(self, filePath: str):
"""
Extracts comments from a file using the 'nirjas' library, falling back to reading the entire file if comment extraction fails.
"""
if not os.path.exists(filePath):
raise Exception(f"File path '{filePath}' does not exist")
try:
# * Manually extracting comments using nirjas and not the commentPreprocessor class
# * Because I prefer to do my own preprocessing and I also need accurate comment reading,
# * Normal nirjas works by appending comments together.
nirjas_comments = nirjas.extract(filePath)
if nirjas_comments.total_lines_of_comments == 0:
# Go to the except case to read all the file, even if it has no comments
# ! This is debatable, and I might remove it.
raise Exception()
all_comments = []
# Go through each comment type, and read the comment itself given it's starting and ending lines
# This is necessary because nirjas by default appends multi-line or continous single-line comments
# together, and for semantic search purposes, I want to read the comments exactly as they were in the file.
with open(filePath, "r") as f:
all_lines = f.readlines()
for single_line_comment in nirjas_comments['single_line_comment']:
all_comments.append(single_line_comment['comment'])
for cont_single_line_comment in nirjas_comments['cont_single_line_comment']:
start = cont_single_line_comment['start_line'] - 1
end = cont_single_line_comment['end_line']
for line_idx in range(start, end):
comment = all_lines[line_idx]
all_comments.append(comment)
for multi_line_comment in nirjas_comments['multi_line_comment']:
start = multi_line_comment['start_line'] - 1
end = multi_line_comment['end_line']
for line_idx in range(start, end):
line = all_lines[line_idx]
all_comments.append(line)
comments = "".join(all_comments)
except:
with open(filePath, "r") as f:
comments = f.read()
return comments

def scan(self, filePath):
'''
Scans a file for potential licenses using semantic search and fuzzy string matching.
'''

# Quick check if an exact match exists, if it does then return that.
temp = exactMatcher(super().loadFile(filePath), self.licenseList)
if temp != -1:
result = []
for shortname in temp:
result.append({
"shortname": str(shortname),
"sim_score": 1,
"sim_type": "SemanticSearch-LVD",
"description": "exact match"
})
return result

# Append The Fulll License Name, Short License Name, and SPDX-License-Identifier to the license text
# Some files only contain 'SPDX-License-Identifier: 0BSD' or 'License: 0BSD' and since this agent attempts
# To match based on license text matching, those will not be identified. Appending those lines
# To the license text helps this agent identify those cases more often
def convert(row):
row['text'] = f"License Name: {row['fullname']} \n License: {row['shortname']} \n SPDX-License-Identifier: {row['shortname']} \n{row['text']}"
return row

self.licenseList = self.licenseList.apply(convert, axis=1)

file_comments = self.extract_comments(filePath)

# Remove characters not found in license texts
chars_to_remove = ['—', '…', '•', '§', '«', '»', '„', '・', '−', '*', '>', '<']
for char_to_remove in chars_to_remove:
file_comments = file_comments.replace(char_to_remove, '')

# Separate the comments into single line comments
file_comments = file_comments.split('\n')

# Separate each license text line by line and append them to one big list (used for matching)
# The license_index_map maps each line to the correct license index in the licenseList dataframe
license_index_map = {}
all_license_texts = []
for license_index, license_text in enumerate(self.licenseList['text']):
for line in license_text.split('\n'):
all_license_texts.append(line)
license_index_map[len(all_license_texts) - 1] = license_index

# Perform first level fuzzy matching for all lines in the comments
results = []
fuzzy_similarity_matrix = np.zeros((len(file_comments), len(all_license_texts)))
for index, comment in enumerate(file_comments):
for i in range(len(all_license_texts)):
fuzzy_similarity_matrix[index][i] = fuzz.ratio(comment, all_license_texts[i])
max_score_index = np.argmax(fuzzy_similarity_matrix[index])
results.append(
(
fuzzy_similarity_matrix[index][max_score_index],
comment
)
)

# Try to append lines that match with a similarity score of more than 40% with one of the lines in
# Any of the licenses. The goal is to get a bigger and bigger text chunk that matches to a bigger
# text chunk in one of the license texts - The bigger the chunk, the more likely that this match is
# correct.
appended_comments = []
appended_comment = []
for result in results:
if result[0] >= 40:
if appended_comment == [] and result[1] == '':
continue
appended_comment.append(result[1])
else:
appended_comments.append(appended_comment)
appended_comment = []

if len(appended_comment) > 0 and appended_comment not in appended_comments:
appended_comments.append(appended_comment)

# Attempt the final - second level license match with all the bigger text chunks
# In some licenses, the license header is used instead of the license text
# In case that a license has a license header, we match all chunks with both and take
# the match with the highest similarity score
results = []
for appended_comment in appended_comments:
appended_comment = "\n".join(appended_comment)
fuzzy_similarity_matrix_2 = np.zeros(len(self.licenseList))
for i in range(len(self.licenseList)):
fuzzy_similarity_matrix_2[i] = fuzz.ratio(appended_comment, self.licenseList.loc[i, 'text'])
if pd.notna(licenseList.loc[i, 'license_header']):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
if pd.notna(licenseList.loc[i, 'license_header']):
if pd.notna(self.licenseList.loc[i, 'license_header']):

licenseList variable is not accessible

license_header_sim_score = fuzz.ratio(appended_comment, self.licenseList.loc[i, 'license_header'].replace('\n\n', '\n'))
fuzzy_similarity_matrix_2[i] = license_header_sim_score if license_header_sim_score > fuzzy_similarity_matrix_2[i] else fuzzy_similarity_matrix_2[i]
if self.verbose > 0:
print('Comment: ' + appended_comment + " - License: " + self.licenseList.iloc[i]['shortname'] + " Similarity Score: " + str(fuzzy_similarity_matrix_2[i]))
top_5_license_text_indices = np.argsort(fuzzy_similarity_matrix_2)[-5:][::-1]

if fuzzy_similarity_matrix_2[top_5_license_text_indices[0]] >= 50:
results.append({
"shortname": self.licenseList.loc[top_5_license_text_indices[0], 'shortname'],
"sim_score": fuzzy_similarity_matrix_2[top_5_license_text_indices[0]],
"sim_type": "SemanticSearch-LVD",
"description": ""
})
return results

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("processedLicenseList", help="Specify the processed license list file")
parser.add_argument("inputFile", help="Specify the input file which needs to be scanned")
parser.add_argument("-v", "--verbose", help="increase output verbosity",
action='count', default=0)
args = parser.parse_args()

inputFile = args.inputFile
licenseList = args.processedLicenseList
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we also make it more reliable, If user doesnt provide processedLicenseList, Agent should pick what we already have :)

Something like:

defaultProcessed = resource_filename("atarashi",
                                       "data/licenses/processedLicenses.csv")

if processedLicense is None:
    processedLicense = defaultProcessed

verbose = args.verbose

scanner = SemanticSearchAgent(licenseList, verbose=verbose)

results = scanner.scan(inputFile)
if len(results) == 0:
print("Result is nothing")
for result in results:
print("License Detected using Semantic Search: " + result[0]['shortname'])


5 changes: 4 additions & 1 deletion atarashi/atarashii.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
from atarashi.agents.semanticSearch import SemanticSearchAgent

__author__ = "Aman Jain"
__email__ = "[email protected]"
Expand Down Expand Up @@ -98,6 +99,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
else:
print("Please choose similarity from {CosineSim,DiceSim,BigramCosineSim}")
return -1
elif agent_name == 'SemanticSearch':
scanner = SemanticSearchAgent(processedLicense)

scanner.setVerbose(verbose)
return scanner
Expand Down Expand Up @@ -128,7 +131,7 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'SemanticSearch'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down
2 changes: 1 addition & 1 deletion atarashi/evaluator/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ def evaluate(scanner):
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'SemanticSearch'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
Expand Down
2 changes: 1 addition & 1 deletion atarashi/license/license_merger.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def license_merger(licenseList, requiredlicenseList, verbose=0):
spdx_compatible_shortname, case=False, regex=False).any():
# SPDX style short name match
continue
licenses_merge = licenses_merge.append(
licenses_merge = licenses_merge._append(
licenses.loc[idx], ignore_index=True, sort=False)

if verbose > 0:
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ textdistance>=3.0.3
setuptools>=39.2.0
nirjas>=0.0.5
urllib3>=1.24.1
fuzzywuzzy[speedup]>=0.18.0
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,8 @@ def read(fname):
'textdistance>=3.0.3',
'pyxDamerauLevenshtein>=1.5',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'fuzzywuzzy[speedup]>=0.18.0'
]

class BuildAtarashiDependencies(distutils.cmd.Command):
Expand Down