
Commit 4bafa73

add scraper exercise
1 parent 08e31e2

File tree

2 files changed: +136 -0 lines changed


exercise-scraper/scraper.sh

+71
@@ -0,0 +1,71 @@
#!/bin/bash

# This is the base url for all the images. To use this variable,
# you have to prefix the variable with the '$' character, e.g.
# echo $BASE_URL
BASE_URL="http://ecx.images-amazon.com/images/I/"

# TODO
# 1. Find the length of BASE_URL, and save the value in a variable
#    called "cutpoint"



# When you are done with (1), uncomment the line below.
# It strips any extraneous whitespace and leaves only the
# numeric characters in cutpoint.
#
# UNCOMMENT THIS LINE WHEN DONE WITH (1)
# cutpoint=$(echo $cutpoint | grep '[0-9]\{1,\}' --only-matching)

# Name of the directory containing the images.
IMAGE_DIR="images"

# TODO
# 2. Make the directory for IMAGE_DIR if it doesn't already exist.
#    Hint: man mkdir



# Url to scrape images from
REQUEST_URL="http://www.amazon.com/s/&field-keywords=ocaml"

# TODO
# 3. Get the html for REQUEST_URL, and save it in a variable called "html"



# Since I haven't shown you what a *regular expression* is (take CS121 to find
# out), I'll extract all the links for you with the following command.
#
# UNCOMMENT THIS LINE WHEN DONE WITH (3)
# urls=$(echo $html | grep 'http://ecx.images-amazon.com/images/I/[0-9A-Za-z\.\_\,\%\\-]\{0,\}.jpg' --only-matching)

# We feed into the loop all the urls that we just collected.
# For each url, we will download the file into the truncated file name
# (the url without the BASE_URL part).
#
# Here, we use a for loop because we know exactly the list of things that
# we want to loop over (urls). The variable "url" will be the current url
# in our list of urls.
for url in $urls
do
    # TODO
    # 4. Use the "cut" command to extract the part of the url
    #    after BASE_URL. Hint: you'll want to use your variable $cutpoint.
    #    Save this in a variable called "suffix".



    # TODO
    # 5. Download the image, and save it into the file $IMAGE_DIR/$suffix.
    #    Look at the man page for "wget" to figure out the option you need
    #    to provide to download something into a specific file name.


    # REMOVE THIS LINE WHEN YOU'RE DONE
    echo "noop"
done
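A minimal way to sanity-check your work while filling in the TODOs (assuming you run from the exercise-scraper directory; the -x flag is just bash's standard command tracing, not something the exercise asks for):

# print each command as it runs, so the values of cutpoint, html,
# suffix, etc. are visible while the script executes
bash -x scraper.sh

# once steps (2) and (5) are filled in, the downloaded images
# should show up in the images/ directory
ls images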

exercise-scraper/scraper_complete.sh

+65
@@ -0,0 +1,65 @@
#!/bin/bash

# This is the base url for all the images. To use this variable,
# you have to prefix the variable with the '$' character, e.g.
# echo $BASE_URL
BASE_URL="http://ecx.images-amazon.com/images/I/"

# TODO
# 1. Find the length of BASE_URL, and save the value in a variable
#    called "cutpoint"
cutpoint=$(echo $BASE_URL | wc -m)

# When you are done with (1), uncomment the line below.
# It strips any extraneous whitespace and leaves only the
# numeric characters in cutpoint.
#
# UNCOMMENT THIS LINE WHEN DONE WITH (1)
cutpoint=$(echo $cutpoint | grep '[0-9]\{1,\}' --only-matching)

# Name of the directory containing the images.
IMAGE_DIR="images"

# TODO
# 2. Make the directory for IMAGE_DIR if it doesn't already exist.
#    Hint: man mkdir
mkdir -p $IMAGE_DIR

# Url to scrape images from
REQUEST_URL="http://www.amazon.com/s/&field-keywords=ocaml"

# TODO
# 3. Get the html for REQUEST_URL, and save it in a variable called "html"
html=$(curl --silent $REQUEST_URL)

# Since I haven't shown you what a *regular expression* is (take CS121 to find
# out), I'll extract all the links for you with the following command.
#
# UNCOMMENT THIS LINE WHEN DONE WITH (3)
urls=$(echo $html | grep 'http://ecx.images-amazon.com/images/I/[0-9A-Za-z\.\_\,\%\\-]\{0,\}.jpg' --only-matching)

# We feed into the loop all the urls that we just collected.
# For each url, we will download the file into the truncated file name
# (the url without the BASE_URL part).
#
# Here, we use a for loop because we know exactly the list of things that
# we want to loop over (urls). The variable "url" will be the current url
# in our list of urls.
for url in $urls
do
    # TODO
    # 4. Use the "cut" command to extract the part of the url
    #    after BASE_URL. Hint: you'll want to use your variable $cutpoint.
    #    Save this in a variable called "suffix".
    suffix=$(echo $url | cut -c $cutpoint-)

    # TODO
    # 5. Download the image, and save it into the file $IMAGE_DIR/$suffix.
    #    Look at the man page for "wget" to figure out the option you need
    #    to provide to download something into a specific file name.
    wget $url -O $IMAGE_DIR/$suffix

done
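A note on the completed solution (my reading of it, not something stated in the files): "wc -m" counts the trailing newline that "echo" adds, so cutpoint comes out to the length of BASE_URL plus one, which is exactly the column where "cut -c $cutpoint-" has to start in order to drop the prefix. If you want to skip the grep cleanup in step (1) entirely, a rough alternative sketch using bash's built-in string length and substring expansion would be:

# ${#BASE_URL} is the length of the string; no external command needed,
# so there is no stray whitespace to clean up afterwards
cutpoint=$(( ${#BASE_URL} + 1 ))

# inside the loop, the suffix can also be taken without "cut":
# drop the first ${#BASE_URL} characters of the url
suffix=${url:${#BASE_URL}}

Either way, the downloads still end up in $IMAGE_DIR/$suffix as in the committed script.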
