
Commit 973396e

Merge pull request #778 from JaniniRami/janinirami
Added a near duplicate files remover
2 parents 45e1847 + 6e2d0ca commit 973396e

File tree

3 files changed: +115, -0 lines changed


duplicate_files_remover/README.md

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Near Duplicate Files Remover

A script that searches for duplicate files and deletes them to save storage.

## Setup instructions

```
pip install -r requirements.txt
python main.py
```

## Detailed explanation of script, if needed

The script crawls a given directory and collects all the files in it, then generates a SHA-256 hash for every file and stores the results in a pandas DataFrame used as a hash table. It then looks for matching hashes in that table and deletes every file that shares a hash with another one, keeping only the original copy.
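
To make "keeping only the original copy" concrete, here is a minimal sketch (illustration only, with made-up file names and hashes, not part of the script) of how pandas flags the copies while leaving the first occurrence untouched:

```
import pandas as pd

# Made-up hash table: 'aaa' appears three times, 'bbb' once.
df = pd.DataFrame({
    'file': ['a.txt', 'b.txt', 'a_copy.txt', 'a_copy2.txt'],
    'hash': ['aaa', 'bbb', 'aaa', 'aaa'],
})

# keep='first' marks every occurrence after the first as a duplicate,
# so only the copies are selected and the original is never touched.
copies = df[df['hash'].duplicated(keep='first')]
print(copies['file'].tolist())  # ['a_copy.txt', 'a_copy2.txt']
```
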
## Author(s)

Rami Janini

duplicate_files_remover/main.py

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
1+
import os
2+
import sys
3+
import hashlib
4+
import pandas as pd
5+
6+
7+
# get list of all files in a dir
8+
def file_list(folder):
9+
path = os.path.abspath(folder)
10+
files = [entry.path for entry in os.scandir(path) if entry.is_file()]
11+
print(f'[+] Found {len(files)} files in {folder}. ')
12+
13+
return files
14+
15+
16+
# Calculate the hash for any given file
17+
def get_hash(filename):
18+
block_size = 65536
19+
20+
with open(filename, 'rb') as f:
21+
m = hashlib.sha256()
22+
block = f.read(block_size)
23+
while len(block) > 0:
24+
m.update(block)
25+
block = f.read(block_size)
26+
digest = m.hexdigest()
27+
28+
return digest
29+
30+
# create hashtable
31+
32+
33+
def hashtable(files):
34+
if not isinstance(files, list):
35+
files = [files]
36+
else:
37+
pass
38+
39+
hash_identifier = []
40+
for f in files:
41+
try:
42+
hash_identifier.extend([get_hash(f)])
43+
except OSError:
44+
hash_identifier.extend(['Hash could not be generated.'])
45+
46+
return hash_identifier
47+
48+
49+
# crawl through a directory and return the hashes of each file as a pd
50+
# dataframe.
51+
def create_hashtable(folder):
52+
files = file_list(folder)
53+
54+
df = pd.DataFrame(columns=['file', 'hash'])
55+
df['file'] = files
56+
df['hash'] = hashtable(files)
57+
print('[+] Generated all hashes.')
58+
59+
return df
60+
61+
62+
# get duplicates
63+
def list_duplicates(folder):
64+
duplicates_files = create_hashtable(folder)
65+
duplicates_files = duplicates_files[duplicates_files['hash'].duplicated(
66+
keep=False)]
67+
duplicates_files.sort_values(by='hash', inplace=True)
68+
duplicates_files = duplicates_files.drop_duplicates(
69+
subset='hash', keep='first')
70+
print(f'[+] Found {len(duplicates_files)} duplicates.\n')
71+
print(duplicates_files)
72+
73+
return duplicates_files
74+
75+
76+
# list_duplicates('C:/Users/ramij/Desktop/secret')
77+
78+
if __name__ == '__main__':
79+
folder = str(input('Folder full path (eg: C:/Users/bob/Desktop): '))
80+
if not os.path.exists(folder):
81+
print('Folder does not exist.')
82+
sys.exit(1)
83+
else:
84+
pass
85+
86+
duplicates = list_duplicates(folder)
87+
delete = input('\n[!] Do you want to delete the duplicates (y/n):')
88+
print('\n')
89+
if delete.lower() == 'y':
90+
duplicates = duplicates['file'].tolist()
91+
for f in duplicates:
92+
os.remove(f)
93+
print(f'Deleted {f}')
94+
else:
95+
print('[X] Exiting...')
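
Since the interactive prompt sits under the `if __name__ == '__main__':` guard, the module can also be imported to list duplicates without deleting anything; a minimal sketch (the folder path is just an example, and `main.py` is assumed to be importable from the working directory):

```
# Hypothetical non-interactive use: only list the duplicates, delete nothing.
from main import list_duplicates

dupes = list_duplicates('/home/bob/Desktop/photos')  # example path
print(dupes['file'].tolist())
```
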
duplicate_files_remover/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pandas==1.2.0

0 commit comments
