Commit
feat(SemanticSearch): Added a new semantic search agent that uses fuzzy string matching and Levenshtein distance.
Hero2323 committed Aug 29, 2024
1 parent 9666a92 commit 43fd865
Showing 5 changed files with 216 additions and 3 deletions.
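Once this lands, the new agent can be selected through the existing atarashi CLI. A minimal invocation sketch (the license-list path below is a placeholder, not something from this commit):

    python atarashi/atarashii.py -a SemanticSearch -l <processed-license-list.csv> <file-to-scan>

The agent module can also be run directly via its own __main__ block, shown at the bottom of semanticSearch.py.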
208 changes: 208 additions & 0 deletions atarashi/agents/semanticSearch.py
@@ -0,0 +1,208 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Copyright 2024 Abdelrahman Jamal ([email protected])
SPDX-License-Identifier: GPL-2.0
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
version 2 as published by the Free Software Foundation.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
"""
import argparse
import os
import nirjas
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz


from atarashi.agents.atarashiAgent import AtarashiAgent, exactMatcher

__author__ = "Abdelrahman Jamal"
__email__ = "[email protected]"

class SemanticSearchAgent(AtarashiAgent):
"""
An agent that performs semantic search to identify potential licenses within files.
"""

def __init__(self, licenseList):
super().__init__(licenseList)

def extract_comments(self, filePath: str):
"""
Extracts comments from a file using the 'nirjas' library, falling back to reading the entire file if comment extraction fails.
"""
if not os.path.exists(filePath):
            raise FileNotFoundError(f"File path '{filePath}' does not exist")
try:
            # * Manually extract comments using nirjas instead of the commentPreprocessor class,
            # * because this agent does its own preprocessing and needs accurate comment reading;
            # * normal nirjas appends comments together.
nirjas_comments = nirjas.extract(filePath)
if nirjas_comments.total_lines_of_comments == 0:
                # Fall through to the except clause to read the whole file, even if it has no comments
                # ! This is debatable, and I might remove it.
raise Exception()
all_comments = []
            # Go through each comment type and read the comment itself given its starting and ending lines.
            # This is necessary because nirjas by default appends multi-line or continuous single-line
            # comments together, and for semantic search purposes, I want to read the comments exactly
            # as they appear in the file.
with open(filePath, "r") as f:
all_lines = f.readlines()
for single_line_comment in nirjas_comments['single_line_comment']:
all_comments.append(single_line_comment['comment'])
for cont_single_line_comment in nirjas_comments['cont_single_line_comment']:
start = cont_single_line_comment['start_line'] - 1
end = cont_single_line_comment['end_line']
for line_idx in range(start, end):
comment = all_lines[line_idx]
all_comments.append(comment)
for multi_line_comment in nirjas_comments['multi_line_comment']:
start = multi_line_comment['start_line'] - 1
end = multi_line_comment['end_line']
for line_idx in range(start, end):
line = all_lines[line_idx]
all_comments.append(line)
comments = "".join(all_comments)
        except Exception:
with open(filePath, "r") as f:
comments = f.read()
return comments

def scan(self, filePath):
'''
Scans a file for potential licenses using semantic search and fuzzy string matching.
'''

        # Quick check: if an exact match exists, return it immediately.
temp = exactMatcher(super().loadFile(filePath), self.licenseList)
if temp != -1:
result = []
for shortname in temp:
result.append({
"shortname": str(shortname),
"sim_score": 1,
"sim_type": "SemanticSearch-LVD",
"description": "exact match"
})
return result

        # Append the full license name, short license name, and SPDX-License-Identifier to the
        # license text. Some files only contain 'SPDX-License-Identifier: 0BSD' or 'License: 0BSD',
        # and since this agent matches on license text, those cases would otherwise be missed.
        # Appending these lines to the license text helps the agent identify them more often.
def convert(row):
row['text'] = f"License Name: {row['fullname']} \n License: {row['shortname']} \n SPDX-License-Identifier: {row['shortname']} \n{row['text']}"
return row

        # Augment only once: scan() may be called repeatedly on the same agent, and
        # re-applying convert would keep prepending the extra header lines.
        if not getattr(self, '_texts_augmented', False):
            self.licenseList = self.licenseList.apply(convert, axis=1)
            self._texts_augmented = True

file_comments = self.extract_comments(filePath)

# Remove characters not found in license texts
chars_to_remove = ['—', '…', '•', '§', '«', '»', '„', '・', '−', '*', '>', '<']
for char_to_remove in chars_to_remove:
file_comments = file_comments.replace(char_to_remove, '')

        # Split the comments into individual lines
file_comments = file_comments.split('\n')

# Separate each license text line by line and append them to one big list (used for matching)
# The license_index_map maps each line to the correct license index in the licenseList dataframe
license_index_map = {}
all_license_texts = []
for license_index, license_text in enumerate(self.licenseList['text']):
for line in license_text.split('\n'):
all_license_texts.append(line)
license_index_map[len(all_license_texts) - 1] = license_index
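        # Illustration with hypothetical data: if license 0 has two text lines and license 1
        # has one, then all_license_texts == ['line a', 'line b', 'line c'] and
        # license_index_map == {0: 0, 1: 0, 2: 1}.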

# Perform first level fuzzy matching for all lines in the comments
results = []
fuzzy_similarity_matrix = np.zeros((len(file_comments), len(all_license_texts)))
for index, comment in enumerate(file_comments):
for i in range(len(all_license_texts)):
fuzzy_similarity_matrix[index][i] = fuzz.ratio(comment, all_license_texts[i])
max_score_index = np.argmax(fuzzy_similarity_matrix[index])
results.append(
(
fuzzy_similarity_matrix[index][max_score_index],
comment
)
)

        # Append consecutive lines that each match some license line with a similarity score of at
        # least 40%. The goal is to grow ever larger text chunks that match larger chunks in one of
        # the license texts: the bigger the chunk, the more likely the match is correct.
appended_comments = []
appended_comment = []
for result in results:
if result[0] >= 40:
if appended_comment == [] and result[1] == '':
continue
appended_comment.append(result[1])
else:
appended_comments.append(appended_comment)
appended_comment = []

if len(appended_comment) > 0 and appended_comment not in appended_comments:
appended_comments.append(appended_comment)
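        # Example with hypothetical scores: lines scoring [85, 72, 20, 90] produce two chunks;
        # the first two lines merge into one chunk, the low-scoring line closes it, and the
        # last line starts a new chunk that is appended after the loop.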

        # Attempt the final, second-level license match with all the bigger text chunks.
        # Some licenses use a license header instead of the full license text; when a license
        # has one, we match each chunk against both and keep the higher similarity score.
results = []
for appended_comment in appended_comments:
appended_comment = "\n".join(appended_comment)
fuzzy_similarity_matrix_2 = np.zeros(len(self.licenseList))
for i in range(len(self.licenseList)):
fuzzy_similarity_matrix_2[i] = fuzz.ratio(appended_comment, self.licenseList.loc[i, 'text'])
                if pd.notna(self.licenseList.loc[i, 'license_header']):
license_header_sim_score = fuzz.ratio(appended_comment, self.licenseList.loc[i, 'license_header'].replace('\n\n', '\n'))
fuzzy_similarity_matrix_2[i] = license_header_sim_score if license_header_sim_score > fuzzy_similarity_matrix_2[i] else fuzzy_similarity_matrix_2[i]
if self.verbose > 0:
print('Comment: ' + appended_comment + " - License: " + self.licenseList.iloc[i]['shortname'] + " Similarity Score: " + str(fuzzy_similarity_matrix_2[i]))
top_5_license_text_indices = np.argsort(fuzzy_similarity_matrix_2)[-5:][::-1]

if fuzzy_similarity_matrix_2[top_5_license_text_indices[0]] >= 50:
results.append({
"shortname": self.licenseList.loc[top_5_license_text_indices[0], 'shortname'],
"sim_score": fuzzy_similarity_matrix_2[top_5_license_text_indices[0]],
"sim_type": "SemanticSearch-LVD",
"description": ""
})
return results

if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("processedLicenseList", help="Specify the processed license list file")
parser.add_argument("inputFile", help="Specify the input file which needs to be scanned")
parser.add_argument("-v", "--verbose", help="increase output verbosity",
action='count', default=0)
args = parser.parse_args()

inputFile = args.inputFile
licenseList = args.processedLicenseList
verbose = args.verbose

    # SemanticSearchAgent.__init__ takes only the license list; verbosity is set via the base class.
    scanner = SemanticSearchAgent(licenseList)
    scanner.setVerbose(verbose)

results = scanner.scan(inputFile)
    if len(results) == 0:
        print("No license detected")
    for result in results:
        # scan() returns a list of dicts, so index each result directly
        print("License Detected using Semantic Search: " + result['shortname'])


5 changes: 4 additions & 1 deletion atarashi/atarashii.py
@@ -28,6 +28,7 @@
from atarashi.agents.dameruLevenDist import DameruLevenDist
from atarashi.agents.tfidf import TFIDF
from atarashi.agents.wordFrequencySimilarity import WordFrequencySimilarity
from atarashi.agents.semanticSearch import SemanticSearchAgent

__author__ = "Aman Jain"
__email__ = "[email protected]"
@@ -98,6 +99,8 @@ def build_scanner_obj(processedLicense, agent_name, similarity="CosineSim",
else:
print("Please choose similarity from {CosineSim,DiceSim,BigramCosineSim}")
return -1
elif agent_name == 'SemanticSearch':
scanner = SemanticSearchAgent(processedLicense)

scanner.setVerbose(verbose)
return scanner
@@ -128,7 +131,7 @@ def main():
parser.add_argument("-l", "--processedLicenseList", required=False,
help="Specify the location of processed license list file")
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'SemanticSearch'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
2 changes: 1 addition & 1 deletion atarashi/evaluator/evaluator.py
@@ -118,7 +118,7 @@ def evaluate(scanner):
defaultJSON = resource_filename("atarashi", "data/Ngram_keywords.json")
parser = argparse.ArgumentParser()
parser.add_argument("-a", "--agent_name", required=True,
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram'],
choices=['wordFrequencySimilarity', 'DLD', 'tfidf', 'Ngram', 'SemanticSearch'],
help="Name of the agent that needs to be run")
parser.add_argument("-s", "--similarity", required=False, default="CosineSim",
choices=["ScoreSim", "CosineSim", "DiceSim", "BigramCosineSim"],
1 change: 1 addition & 0 deletions requirements.txt
@@ -8,3 +8,4 @@ textdistance>=3.0.3
setuptools>=39.2.0
nirjas>=0.0.5
urllib3>=1.24.1
fuzzywuzzy[speedup]>=0.18.0
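A note on the new dependency: the [speedup] extra pulls in python-Levenshtein so that fuzz.ratio uses a C implementation rather than pure-Python difflib. Installing it on its own would look like:

    pip install "fuzzywuzzy[speedup]"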
3 changes: 2 additions & 1 deletion setup.py
@@ -68,7 +68,8 @@ def read(fname):
'textdistance>=3.0.3',
'pyxDamerauLevenshtein>=1.5',
'urllib3>=1.24.1',
'nirjas>=0.0.5'
'nirjas>=0.0.5',
'fuzzywuzzy[speedup]>=0.18.0'
]

class BuildAtarashiDependencies(distutils.cmd.Command):
