
Commit 57f93f6

Rami Janini committed
Added a near duplicate files remover.
1 parent 45e1847 commit 57f93f6

File tree

3 files changed: +111 −0 lines changed


duplicate_files_remover/README.md

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# Near Duplicate Files Remover


# Setup:
```
pip install -r requirements.txt
python main.py
```


# How it works
After running the script, it crawls a given directory and collects every file in it. It then generates a hash for each file and saves the hashes in a pandas dataframe hashtable. Finally, the script looks for matching hashes in the hashtable and deletes every file whose hash already appears, keeping only the original copy.
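
To make that flow concrete, here is a minimal standalone sketch of the same idea (hash every file, then group by digest) using only the standard library; the folder path is a placeholder, and main.py below is the actual implementation:

```
import hashlib
import os
from collections import defaultdict

def sha256_of(path, block_size=65536):
    # Hash the file in fixed-size blocks so large files fit in memory.
    m = hashlib.sha256()
    with open(path, 'rb') as f:
        for block in iter(lambda: f.read(block_size), b''):
            m.update(block)
    return m.hexdigest()

# Group files by digest; any group with more than one path is a duplicate set.
groups = defaultdict(list)
for entry in os.scandir('path/to/folder'):  # placeholder path
    if entry.is_file():
        groups[sha256_of(entry.path)].append(entry.path)

for digest, paths in groups.items():
    if len(paths) > 1:
        print(digest, paths)
```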

duplicate_files_remover/main.py

Lines changed: 97 additions & 0 deletions
@@ -0,0 +1,97 @@
import os
import sys
import hashlib
import pandas as pd


# Get a list of all files in a directory.
def file_list(folder):
    path = os.path.abspath(folder)
    files = [entry.path for entry in os.scandir(path) if entry.is_file()]
    print(f'[+] Found {len(files)} files in {folder}.')

    return files


# Calculate the SHA-256 hash of a file, reading it in 64 KiB blocks.
def get_hash(filename):
    block_size = 65536

    with open(filename, 'rb') as f:
        m = hashlib.sha256()
        block = f.read(block_size)
        while len(block) > 0:
            m.update(block)
            block = f.read(block_size)

    return m.hexdigest()


# Build the list of hashes for the given files.
def hashtable(files):
    if not isinstance(files, list):
        files = [files]

    hash_identifier = []
    for f in files:
        try:
            hash_identifier.append(get_hash(f))
        except OSError:
            # Use None so unreadable files are never mistaken for
            # duplicates of one another.
            hash_identifier.append(None)
            print(f'[!] Hash could not be generated for {f}.')

    return hash_identifier


# Crawl through a directory and return the files and their hashes as a
# pandas dataframe.
def create_hashtable(folder):
    files = file_list(folder)

    df = pd.DataFrame({'file': files, 'hash': hashtable(files)})
    print('[+] Generated all hashes.')

    return df


# List the duplicate files: every file whose hash already appeared
# earlier in the table. The first copy of each hash is kept as the
# original and excluded from the result.
def list_duplicates(folder):
    df = create_hashtable(folder)
    df = df.dropna(subset=['hash']).sort_values(by='hash')
    duplicate_files = df[df['hash'].duplicated(keep='first')]
    print(f'[+] Found {len(duplicate_files)} duplicates.\n')
    print(duplicate_files)

    return duplicate_files


# list_duplicates('C:/Users/ramij/Desktop/secret')

if __name__ == '__main__':
    folder = input('Folder full path (eg: C:/Users/bob/Desktop): ')
    if not os.path.exists(folder):
        print('Folder does not exist.')
        sys.exit(1)

    duplicates = list_duplicates(folder)
    delete = input('\n[!] Do you want to delete the duplicates (y/n): ')
    print('\n')
    if delete.lower() == 'y':
        for f in duplicates['file'].tolist():
            os.remove(f)
            print(f'Deleted {f}')
    else:
        print('[X] Exiting...')
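
The deduplication above hinges on pandas' duplicated() semantics; this toy hashtable, given purely as an illustration with made-up file names and hashes, shows how keep='first' flags every copy after the first one for deletion:

```
import pandas as pd

# Toy hashtable: 'a.txt' and 'b.txt' share a hash, 'c.txt' is unique.
df = pd.DataFrame({
    'file': ['a.txt', 'b.txt', 'c.txt'],
    'hash': ['abc', 'abc', 'def'],
})

# keep='first' marks all but the first occurrence of each hash, so only
# 'b.txt' is flagged for deletion and 'a.txt' (the original) survives.
to_delete = df[df['hash'].duplicated(keep='first')]
print(to_delete['file'].tolist())  # ['b.txt']
```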
duplicate_files_remover/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pandas==1.2.0
