-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGOOG.pl
84 lines (83 loc) · 2.35 KB
/
GOOG.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#!/usr/local/bin/perl
use strict; use warnings;
use Proc::Daemon;
use LWP;
use WWW::Mechanize;
################## SUMMONS #
# GOOG - imgur viral scraper
# `o_0 ---skrp of MKRX
# SETUP ###############################
# File and directory names used by the scraper.  All of them are relative;
# $work is the directory handed to Proc::Daemon as its work_dir.
my $work     = 'MINION/';
my $limbo    = 'limbo';
my $dump     = 'dump';     # directory the downloaded images are saved into
my $state    = 'STATE';
my $debug    = 'DEBUG';    # receives the daemon's STDERR
my $log      = 'LOG';      # receives the daemon's STDOUT
my $pid      = 'PID';      # pid file written by Proc::Daemon
my $que      = 'QUE';
my $clean    = 'CLEAN';
my $pause    = 'PAUSE';
my $shutdown = 'SHUT';
# $init is appended to by the scrape loop (one "<url>: page" line per gallery
# page) and truncated by shut(); it has to be declared for the script to
# compile under strict.
# NOTE(review): the name 'INIT' is an assumption that follows the naming
# pattern of the other files -- confirm the intended file name.
my $init     = 'INIT';
# DAEMONIZE ##########################
# Detach from the terminal with Proc::Daemon, using $work as the working
# directory, $log for STDOUT, $debug for STDERR and $pid as the pid file.
my $daemon = Proc::Daemon->new(
    work_dir     => $work,
    child_STDOUT => $log,
    # The open mode must be part of the string value; a bare +>>$debug does
    # not parse.  The '+>>' prefix asks for the file to be opened in
    # read/append mode so earlier debug output is kept.
    child_STDERR => "+>>$debug",
    pid_file     => $pid,
);
# NOTE(review): Init is called in void context; per the Proc::Daemon
# documentation the parent process exits here and only the daemon carries on
# -- confirm against the installed version.
$daemon->Init();
# PROC #####################
# Walk the imgur "viral" gallery pages from $num up to (not including)
# $last_num.  For every page: append "<url>: page" to $init, pull the image
# ids out of the thumbnail tags and save each full-size JPEG as $dump/<id>.jpg.
# Progress goes to STDOUT, failures to STDERR.
my $num      = 164;     # scrape top 60 images from today to Jan 2011
my $last_num = 2218;    # farthest back is 2218 days
my $o_url    = 'http://imgur.com/gallery/hot/viral/page/';
my $i_url    = 'http://i.imgur.com/';

# Thumbnail tags look like  <img alt="" src="//i.imgur.com/<id>b.jpg" />.
# The literal parts are quoted with \Q..\E so that '.' only matches a dot,
# and the id is restricted to alphanumerics so nothing taken from the page
# can smuggle a path separator into the file name used below.
my $pre_base = '<img alt="" src="//i.imgur.com/';
my $end      = 'b.jpg" />';
my $thumb_re = qr/\Q$pre_base\E([A-Za-z0-9]+)\Q$end\E/;

# WWW::Mechanize->new expects key/value options rather than a user-agent
# object, so the settings chosen by uagent() are copied over explicitly.
my $ua   = uagent();
my $mech = WWW::Mechanize->new(
    agent       => $ua->agent,
    from        => $ua->from,
    timeout     => $ua->timeout,
    autocheck   => 0,    # a failed request is logged below, not fatal
    stack_depth => 0,    # one object serves every request; keep no history
);

# save_content cannot create the target directory itself.
if (!-d $dump) {
    mkdir $dump or die "cannot create directory $dump: $!";
}

while ($num < $last_num) {
    my $url = "$o_url$num";

    # Record the page being processed; a logging failure is not fatal.
    if (open(my $ifh, '>>', $init)) {
        print {$ifh} "$url: page\n";
        close $ifh or warn "cannot write to $init: $!\n";
    }
    else {
        warn "cannot append to $init: $!\n";
    }

    my $page = $mech->get($url);
    if (!$page->is_success) {
        warn "$url: " . $page->status_line . "\n";
        next;
    }

    # FILTER #########################
    # Work on a private copy of the markup: the image requests below replace
    # the content held by $mech.
    my $html = $mech->content;
    $html = '' if !defined $html;

    while ($html =~ /$thumb_re/g) {
        my $item = "$1.jpg";
        print "$item: item\n";
        print "starting: $item\n";

        my $image = $mech->get("$i_url$item");
        if (!$image->is_success) {
            warn "$item: " . $image->status_line . "\n";
            next;
        }
        $mech->save_content("$dump/$item");
        print "$item: finished\n";
    }
}
continue {
    # Runs after every pass, including the ones cut short by next.
    $num++;
}
# SUB ##########################
# Sleep for the number of seconds given on the first line of the file
# GOOG_PAUSE (looked up relative to the current directory).
#
# Takes no arguments.  Returns the value of sleep(), or 0 without sleeping
# (after a warning) when the file cannot be opened, is empty, or its first
# line is not a non-negative number.
sub pause {
    my $pausefile = 'GOOG_PAUSE';

    my $pfh;
    if (!open($pfh, '<', $pausefile)) {
        warn "cannot read $pausefile: $!\n";
        return 0;
    }
    my $timeout = readline $pfh;
    close $pfh;

    if (!defined $timeout) {
        warn "$pausefile is empty\n";
        return 0;
    }

    # Strip the newline and any stray padding before validating.
    $timeout =~ s/\A\s+//;
    $timeout =~ s/\s+\z//;
    if ($timeout !~ /\A\d+(?:\.\d+)?\z/) {
        warn "$pausefile: '$timeout' is not a number of seconds\n";
        return 0;
    }

    print "sleeping for $timeout\n";
    return sleep $timeout;
}
# Clean shutdown: remove the GOOG_SHUTDOWN flag file, truncate the file named
# by $init, then die with "Shutdown clean\n".
#
# Takes no arguments and never returns.  Problems with either file are only
# warned about so that the shutdown itself always goes through.
sub shut {
    my $shut = 'GOOG_SHUTDOWN';
    if (-e $shut && !unlink $shut) {
        warn "cannot remove $shut: $!\n";
    }

    # Opening with '>' empties $init; nothing is written back because the
    # code that would do so is disabled.
    if (open(my $sinitfh, '>', $init)) {
        # foreach (@content)
        # { print $sinitfh "$_\n"; }
        close $sinitfh or warn "cannot close $init: $!\n";
    }
    else {
        warn "cannot truncate $init: $!\n";
    }

    die "Shutdown clean\n";
}
# Build the LWP::UserAgent used for the scraper's requests: a fixed agent
# string, a contact address and a 45-second timeout.
#
# Takes no arguments.  Returns the newly constructed LWP::UserAgent object.
sub uagent {
    # NOTE(review): the 'from' value looks like an e-mail obfuscation
    # placeholder rather than a real address -- confirm the intended value.
    my %options = (
        agent   => "Mozilla/50.0.2",
        from    => '[email protected]',
        timeout => 45,
    );
    return LWP::UserAgent->new(%options);
}