Skip to content

Commit

Permalink
Add dupechecker.
Browse files (browse the repository at this point in the history)
  • Loading branch information
Tony Young committed Jan 8, 2012
1 parent 94cc6a7 commit 02c75be
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 7 deletions.
82 changes: 79 additions & 3 deletions Haskell/Dupechecker.hs
Original file line number Diff line number Diff line change
@@ -1,6 +1,82 @@
-- Duplicate file checker
-- Finds duplicate files based on MD5
--
-- Author: rfw
--
-- This file has been placed in the public domain.

import Control.Monad (foldM)
import qualified Data.Hash.MD5 as MD5
import Data.List (nub)
import qualified Data.Map as Map
import Data.Maybe (fromMaybe)
import Data.String.Utils (join)
import System (getArgs)
import System.Directory (getDirectoryContents)
import System.IO
import System.FilePath (combine)
import System.IO (withBinaryFile, hGetContents, IOMode(ReadMode))
import System.IO.PlafCompat (getFileStatus, isDirectory)
import Text.Printf (printf)

type MD5HashMap = Map.Map Integer [FilePath]

-- Compute the MD5 sum of a single file's contents.
--
-- The file is opened in binary mode. hGetContents reads lazily, so the
-- digest is forced inside the handler, before withBinaryFile closes the
-- handle.
hashFile :: String -> IO Integer
hashFile fileName =
    withBinaryFile fileName ReadMode digestOf
  where
    -- read everything from the open handle and reduce it to the MD5 integer
    digestOf h = do
        bytes <- hGetContents h
        let digest = MD5.md5i (MD5.Str bytes)
        digest `seq` return digest

-- Hash every non-directory entry of a directory and add it to an MD5 hash map.
--
-- Prints a "Scanning: <dir>" banner underlined with '=' characters. For each
-- file whose hash is already present in the map, prints the hash, the new
-- path and the most recently recorded path with that hash. Entries that are
-- directories are skipped; they are not descended into.
--
-- hashes:  map accumulated so far (MD5 sum -> paths with that sum)
-- dirName: directory to scan
-- Returns the map extended with every file found directly inside dirName.
hashDirectory :: MD5HashMap -> FilePath -> IO MD5HashMap
hashDirectory hashes dirName = do
    let banner = printf "Scanning: %s" dirName :: String
    putStrLn banner
    putStrLn $ replicate (length banner) '='

    entries <- getDirectoryContents dirName
    -- getDirectoryContents yields bare entry names; qualify them with dirName
    -- before touching the file system, otherwise they would be resolved
    -- against the current working directory instead of the scanned directory.
    let paths = map (combine dirName) entries
    statuses <- mapM getFileStatus paths

    let files = [ path | (path, status) <- zip paths statuses
                       , not (isDirectory status) ]

    -- now hash the files and throw them into the map
    foldM hashAndPut hashes files

  where
    -- hash one file, report it if its hash is already known, then record it
    hashAndPut acc path = do
        hash <- hashFile path
        let existing = Map.findWithDefault [] hash acc
        case existing of
            -- %032x zero-pads so leading zero digits of the digest are shown
            previous : _ -> putStrLn $ printf "%032x: %s, %s" hash path previous
            []           -> return ()
        return $ Map.insert hash (path : existing) acc

-- Hash several directories in order, threading one hash map through all of
-- them so that duplicates are detected across directories as well as within.
hashDirectories :: MD5HashMap -> [FilePath] -> IO MD5HashMap
hashDirectories = foldM hashDirectory

-- Print a summary of duplicates.
--
-- Writes a blank line and a "Summary" heading, then one line per hash giving
-- the hash in hexadecimal followed by its paths separated by ", ". Hashes
-- that map to exactly one path are omitted.
generateSummary :: MD5HashMap -> IO ()
generateSummary hashes = do
    putStrLn ""
    putStrLn "Summary"
    putStrLn "======="
    mapM_ summarize $ Map.toList hashes

  where
    -- a single path means the file has no duplicate: nothing to report
    summarize (_, [_]) = return ()
    summarize (hash, fileNames) =
        -- %032x zero-pads so leading zero digits of the digest are shown
        putStrLn $ printf "%032x: %s" hash (join ", " fileNames)

-- List the entries of one fixed, hard-coded directory.
-- NOTE(review): nothing in the visible source refers to this binding and the
-- path is machine-specific; presumably leftover scratch code — confirm.
lisFiles :: IO [FilePath]
lisFiles = getDirectoryContents dupesDir
  where
    dupesDir = "/home/john/Pictures/Dupes"
-- Program entry point.
--
-- Treats every command-line argument as a directory to scan, hashes them all
-- into one map and prints the duplicate summary. With no arguments it prints
-- an error message instead.
main :: IO ()
main = do
    argv <- getArgs
    -- drop repeated arguments so the same directory string is scanned once
    let dirs = nub argv

    if null dirs
        then putStrLn "ERROR: At least one directory should be specified."
        else hashDirectories Map.empty dirs >>= generateSummary
6 changes: 2 additions & 4 deletions Haskell/Makefile
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
# Neither target names a file that the recipe creates.
.PHONY: all dupechecker

# Default target: build the duplicate checker.
all: dupechecker

# Compile Dupechecker.hs into the Dupechecker-Haskell executable, then remove
# the intermediate object and interface files left behind by ghc.
dupechecker:
	ghc --make -o Dupechecker-Haskell Dupechecker.hs
	rm Dupechecker.o Dupechecker.hi

0 comments on commit 02c75be

Please sign in to comment.