Commit: cleaner
WillySuspension committed Jan 9, 2012
1 parent 02c75be commit 4cb3049
Showing 1 changed file, Haskell/Scraper.hs, with 98 additions and 25 deletions.
@@ -13,37 +13,110 @@
-- You should have received a copy of the GNU General Public License
-- along with this program. If not, see <http://www.gnu.org/licenses/>.

--- Depends on, from Hackage:
--- * curl
--- * regex-tdfa
--- The rest should be distributed with the Haskell Platform.
+{-# LANGUAGE DeriveDataTypeable #-}
+
+-- Note: To run this you will obviously need the relevant packages from hackage
+-- or your distro.

-import Data.List ( nub )
-import System.IO
+import Control.Monad (unless, forM_)
+import Data.List (nub)
+import qualified Data.ByteString.Internal as B
+import qualified Data.ByteString as B
+import Data.IORef
+import Foreign
+import Foreign.ForeignPtr
import Network.Curl
import Network.Curl.Easy
import Text.Regex.TDFA
-import System.Environment ( getArgs )
+import System.Environment (getArgs)
+import System.Console.CmdArgs
+import System.IO

-writeToFile handle string = do
-  hPutStr handle string
+-- pasted from Network.Curl.Download, by Don Stewart
+data P = P !(Ptr Word8) !Int
+newtype URL = URL String

-getImage s = do
-  curl <- initialize
-  putStr $ "Getting image: " ++ show s
-  outFile <- openFile (filePath s) WriteMode
-  setopt curl (CurlWriteFunction $ easyWriter (writeToFile outFile))
-  setopt curl (CurlURL s)
-  hSetBinaryMode outFile True
-  perform curl
-  hClose outFile
-  putStr " ... Done!\n"
-  where
-    filePath s = s =~ "[0-9]{13}.[A-Za-z0-9]+"
+-- memcpy chunks of data into our bytestring.
+writer :: ((Ptr Word8, Int) -> IO ()) -> WriteFunction
+writer f src sz nelems _ = do
+  let n' = sz * nelems
+  f (castPtr src, fromIntegral n')
+  return n'
+
+gather :: IORef P -> WriteFunction
+gather r = writer $ \(src, m) -> do
+  P dest n <- readIORef r
+  dest' <- reallocBytes dest (n + m)
+  B.memcpy (dest' `plusPtr` n) src (fromIntegral m)
+  writeIORef r (P dest' (n + m))
+
+parseURL :: String -> Maybe URL
+parseURL s = Just (URL s) -- no parsing
+
+getFile :: URL -> [CurlOption] -> IO (Either String B.ByteString)
+getFile (URL url) flags = do
+  h <- initialize
+  let start = 1024
+  buf <- mallocBytes start
+  ref <- newIORef (P buf 0)
+  setopt h (CurlFailOnError True)
+  setDefaultSSLOpts h url
+  setopt h (CurlURL url)
+  setopt h (CurlWriteFunction (gather ref))
+  mapM_ (setopt h) flags
+  rc <- perform h
+  P buf' sz <- readIORef ref
+  if rc /= CurlOK
+    then do
+      free buf'
+      return $ Left (show rc)
+    else do
+      fp <- newForeignPtr finalizerFree buf'
+      return (Right $! B.fromForeignPtr fp 0 (fromIntegral sz))
+
+openURI :: String -> IO (Either String B.ByteString)
+openURI s = case parseURL s of
+  Nothing  -> return $ Left $ "Malformed url: " ++ s
+  Just url -> do
+    e <- getFile url []
+    return $ case e of
+      Left err  -> Left $ "Failed to connect: " ++ err
+      Right src -> Right src
+-- end snippets from Network.Curl.Download
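
For reference, here is a minimal usage sketch of the snippet above. This is an editorial illustration, not part of the commit; the URL and output file name are placeholders.

-- Hypothetical example: fetch one file with openURI and save it.
fetchExample :: IO ()
fetchExample = withCurlDo $ do
  result <- openURI "http://example.com/pic.jpg" -- placeholder URL
  case result of
    Left err    -> putStrLn ("Download failed: " ++ err)
    Right bytes -> B.writeFile "pic.jpg" bytes -- placeholder file name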

+getImage :: String -> String -> IO ()
+getImage s o = do
+  Right a <- openURI s
+  B.writeFile (o ++ "/" ++ (s =~ "[0-9]{13}.[A-Za-z0-9]+")) a
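
A caveat on the new getImage, noted as an editor rather than as part of the commit: the pattern Right a <- openURI s is partial, so a failed download raises a pattern-match exception, and the unescaped dot in the regex matches any character. A hedged variant under those assumptions:

-- Sketch only: like getImage above, but it handles the Left case and
-- escapes the dot in the file-name regex. Not part of the commit.
getImageSafe :: String -> String -> IO ()
getImageSafe s o = do
  r <- openURI s
  case r of
    Left err -> hPutStrLn stderr ("Failed to fetch " ++ s ++ ": " ++ err)
    Right a  -> B.writeFile (o ++ "/" ++ (s =~ "[0-9]{13}\\.[A-Za-z0-9]+")) a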

+data Args = Args
+  { output :: String
+  , uri    :: String
+  , quiet  :: Bool
+  } deriving (Show, Data, Typeable)
+
+defaultArgs = Args
+  { output = "./"
+      &= help "Where to download the images."
+      &= typ "DIR"
+  , uri = "boards.4chan.org/g/"
+      &= help "The URI to scrape images from." &= typ "URI"
+  , quiet = False
+      &= help "If specified, nothing is printed."
+  } &= summary "4chan image scraper alpha beater."
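
With cmdArgs, each field of the annotated record becomes a command-line flag named after the field, so the rewritten program should accept flags along the lines of --output, --uri, and --quiet. A small sketch of consuming the record; the function name is hypothetical, not the commit's:

-- Hypothetical helper: parse the flags and print what was selected.
showParsedArgs :: IO ()
showParsedArgs = do
  a <- cmdArgs defaultArgs
  print (output a, uri a, quiet a)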

main :: IO ()
main = withCurlDo $ do
-  args <- getArgs
+  args <- cmdArgs defaultArgs
   curl <- initialize
-  bodyText <- curlGetString (last args) []
-  let images = nub $ (show bodyText =~ "http://images.4chan.org/[A-Za-z0-9]+/src/[0-9]{13}.[A-Za-z0-9]+")
-  putStr $ "Page retrieved. " ++ show (length images) ++ " images found.\n"
-  mapM_ getImage (concat images)
+  bodyText <- curlGetString (uri args) []
+  let images = nub $ show bodyText =~ "http://images.4chan.org/[A-Za-z0-9]+/src/[0-9]{13}.[A-Za-z0-9]+"
+  unless (quiet args) $ putStrLn $ "Page retrieved. " ++ show (length images) ++ " images found."
+  forM_ (concat images) $
+    \a -> do
+      getImage a (output args)
+      unless (quiet args) $ putStrLn $ "Downloaded " ++ show a ++
+        if output args == "./"
+          then "."
+          else "to" ++ show (output args) ++ "."
