
Commit 973396e

Merge pull request #778 from JaniniRami/janinirami
Added a near duplicate files remover
2 parents 45e1847 + 6e2d0ca commit 973396e

File tree

3 files changed: +115, -0 lines changed


duplicate_files_remover/README.md

Lines changed: 19 additions & 0 deletions
@@ -0,0 +1,19 @@
# Near Duplicate Files Remover

A script that searches for duplicate files and deletes them to save storage.

## Setup instructions

```
pip install -r requirements.txt
python main.py
```

## Detailed explanation of script, if needed

The script crawls a given directory and collects all the files in it, then generates a SHA-256 hash for every file and stores the results in a pandas DataFrame used as a hash table. It then looks for matching hashes in that table and deletes every file that shares a hash with another one, keeping only the original copy.
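
To make "keeping only the original copy" concrete, here is a minimal sketch (illustration only, with made-up file names and hashes, not part of the script) of how pandas flags the copies while leaving the first occurrence untouched:

```
import pandas as pd

# Made-up hash table: 'aaa' appears three times, 'bbb' once.
df = pd.DataFrame({
    'file': ['a.txt', 'b.txt', 'a_copy.txt', 'a_copy2.txt'],
    'hash': ['aaa', 'bbb', 'aaa', 'aaa'],
})

# keep='first' marks every occurrence after the first as a duplicate,
# so only the copies are selected and the original is never touched.
copies = df[df['hash'].duplicated(keep='first')]
print(copies['file'].tolist())  # ['a_copy.txt', 'a_copy2.txt']
```
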
## Author(s)

Rami Janini

duplicate_files_remover/main.py

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
1+
import os
2+
import sys
3+
import hashlib
4+
import pandas as pd
5+
6+
7+
# get list of all files in a dir
8+
def file_list(folder):
9+
path = os.path.abspath(folder)
10+
files = [entry.path for entry in os.scandir(path) if entry.is_file()]
11+
print(f'[+] Found {len(files)} files in {folder}. ')
12+
13+
return files
14+
15+
16+
# Calculate the hash for any given file
17+
def get_hash(filename):
18+
block_size = 65536
19+
20+
with open(filename, 'rb') as f:
21+
m = hashlib.sha256()
22+
block = f.read(block_size)
23+
while len(block) > 0:
24+
m.update(block)
25+
block = f.read(block_size)
26+
digest = m.hexdigest()
27+
28+
return digest
29+
30+
# create hashtable
31+
32+
33+
def hashtable(files):
34+
if not isinstance(files, list):
35+
files = [files]
36+
else:
37+
pass
38+
39+
hash_identifier = []
40+
for f in files:
41+
try:
42+
hash_identifier.extend([get_hash(f)])
43+
except OSError:
44+
hash_identifier.extend(['Hash could not be generated.'])
45+
46+
return hash_identifier
47+
48+
49+
# crawl through a directory and return the hashes of each file as a pd
50+
# dataframe.
51+
def create_hashtable(folder):
52+
files = file_list(folder)
53+
54+
df = pd.DataFrame(columns=['file', 'hash'])
55+
df['file'] = files
56+
df['hash'] = hashtable(files)
57+
print('[+] Generated all hashes.')
58+
59+
return df
60+
61+
62+
# get duplicates
63+
def list_duplicates(folder):
64+
duplicates_files = create_hashtable(folder)
65+
duplicates_files = duplicates_files[duplicates_files['hash'].duplicated(
66+
keep=False)]
67+
duplicates_files.sort_values(by='hash', inplace=True)
68+
duplicates_files = duplicates_files.drop_duplicates(
69+
subset='hash', keep='first')
70+
print(f'[+] Found {len(duplicates_files)} duplicates.\n')
71+
print(duplicates_files)
72+
73+
return duplicates_files
74+
75+
76+
# list_duplicates('C:/Users/ramij/Desktop/secret')
77+
78+
if __name__ == '__main__':
79+
folder = str(input('Folder full path (eg: C:/Users/bob/Desktop): '))
80+
if not os.path.exists(folder):
81+
print('Folder does not exist.')
82+
sys.exit(1)
83+
else:
84+
pass
85+
86+
duplicates = list_duplicates(folder)
87+
delete = input('\n[!] Do you want to delete the duplicates (y/n):')
88+
print('\n')
89+
if delete.lower() == 'y':
90+
duplicates = duplicates['file'].tolist()
91+
for f in duplicates:
92+
os.remove(f)
93+
print(f'Deleted {f}')
94+
else:
95+
print('[X] Exiting...')
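
Since the interactive prompt sits under the `if __name__ == '__main__':` guard, the module can also be imported to list duplicates without deleting anything; a minimal sketch (the folder path is just an example, and `main.py` is assumed to be importable from the working directory):

```
# Hypothetical non-interactive use: only list the duplicates, delete nothing.
from main import list_duplicates

dupes = list_duplicates('/home/bob/Desktop/photos')  # example path
print(dupes['file'].tolist())
```
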
duplicate_files_remover/requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
pandas==1.2.0

0 commit comments
