Skip to content

Commit 2b525da

Browse files
committed
first commit
1 parent 17a03b1 commit 2b525da

10 files changed

+17595
-0
lines changed

.DS_Store

6 KB
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,288 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"metadata": {},
6+
"source": [
7+
"# Downloading Files with Requests"
8+
]
9+
},
10+
{
11+
"cell_type": "code",
12+
"execution_count": 1,
13+
"metadata": {},
14+
"outputs": [],
15+
"source": [
16+
"# The requests package can also be used to download files from the web.\n",
17+
"import requests"
18+
]
19+
},
20+
{
21+
"cell_type": "markdown",
22+
"metadata": {},
23+
"source": [
24+
"## Naive downloading"
25+
]
26+
},
27+
{
28+
"cell_type": "code",
29+
"execution_count": 2,
30+
"metadata": {},
31+
"outputs": [],
32+
"source": [
33+
"# One way to 'download' a file is to send a request to it.\n",
34+
"# Then, export the content of the response to a local file"
35+
]
36+
},
37+
{
38+
"cell_type": "code",
39+
"execution_count": 3,
40+
"metadata": {},
41+
"outputs": [],
42+
"source": [
43+
"# Let's use an image from wikipedia for this purpose\n",
44+
"file_url = \"https://upload.wikimedia.org/wikipedia/commons/thumb/d/d9/Collage_of_Nine_Dogs.jpg/1024px-Collage_of_Nine_Dogs.jpg\""
45+
]
46+
},
47+
{
48+
"cell_type": "code",
49+
"execution_count": 4,
50+
"metadata": {},
51+
"outputs": [
52+
{
53+
"data": {
54+
"text/plain": [
55+
"200"
56+
]
57+
},
58+
"execution_count": 4,
59+
"metadata": {},
60+
"output_type": "execute_result"
61+
}
62+
],
63+
"source": [
64+
"response = requests.get(file_url)\n",
65+
"response.status_code"
66+
]
67+
},
68+
{
69+
"cell_type": "code",
70+
"execution_count": 5,
71+
"metadata": {},
72+
"outputs": [
73+
{
74+
"data": {
75+
"text/plain": [
76+
"b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x01\\x01\\x01\\x00H\\x00H\\x00\\x00\\xff\\xfe\\x00OFile source: https://commons.wikimedia.org/wiki/File:Collage_of_Nine_Dogs.jpg\\xff\\xe2\\x02\\x1cICC_PROFILE\\x00\\x01\\x01\\x00\\x00\\x02\\x0clcms\\x02\\x10\\x00\\x00mntrRGB XYZ \\x07\\xdc\\x00\\x01\\x00\\x19\\x00\\x03\\x00)\\x009acspAPPL\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\xf6\\xd6\\x00\\x01\\x00\\x00\\x00\\x00\\xd3-lcms\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\ndesc\\x00\\x00\\x00\\xfc\\x00\\x00\\x00^cprt\\x00\\x00\\x01\\\\\\x00\\x00\\x00\\x0bwtpt\\x00\\x00\\x01h\\x00\\x00\\x00\\x14bkpt\\x00\\x00\\x01|\\x00\\x00\\x00\\x14rXYZ\\x00\\x00\\x01\\x90\\x00\\x00\\x00\\x14gXYZ\\x00\\x00\\x01\\xa4\\x00\\x00\\x00\\x14bXYZ\\x00\\x00\\x01\\xb8\\x00\\x00\\x00\\x14rTRC\\x00\\x00\\x01\\xcc\\x00\\x00\\x00@gTRC\\x00\\x00\\x01\\xcc\\x00\\x00\\x00@bTRC\\x00\\x00\\x01\\xcc\\x00\\x00\\x00@desc\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x03c2\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x00text\\x00\\x00\\x00\\x00FB\\x00\\x00XYZ \\x00\\x00\\x00\\x00\\x00\\x00\\xf6\\xd6\\x00\\x01\\x00\\x00\\x00\\x00\\xd3-X'"
77+
]
78+
},
79+
"execution_count": 5,
80+
"metadata": {},
81+
"output_type": "execute_result"
82+
}
83+
],
84+
"source": [
85+
"# Printing out the beginning of the content of the response\n",
86+
"# It is raw binary data (bytes), thus it looks like gibberish\n",
87+
"response.content[:500]"
88+
]
89+
},
90+
{
91+
"cell_type": "code",
92+
"execution_count": 6,
93+
"metadata": {},
94+
"outputs": [],
95+
"source": [
96+
"# We need to export this to an image file (jpg, png, gif...)"
97+
]
98+
},
99+
{
100+
"cell_type": "markdown",
101+
"metadata": {},
102+
"source": [
103+
"### Writing to a file"
104+
]
105+
},
106+
{
107+
"cell_type": "code",
108+
"execution_count": 7,
109+
"metadata": {},
110+
"outputs": [],
111+
"source": [
112+
"# We open/create a file with the function 'open()'\n",
113+
"file = open(\"dog_image.jpg\", \"wb\")\n",
114+
"\n",
115+
"# Then, write to it\n",
116+
"file.write(response.content)\n",
117+
"\n",
118+
"# And close the file after finishing\n",
119+
"file.close()"
120+
]
121+
},
122+
{
123+
"cell_type": "code",
124+
"execution_count": 8,
125+
"metadata": {},
126+
"outputs": [],
127+
"source": [
128+
"# The two parameters in the function open() are:\n",
129+
"# - the name of the file (along with a path to it if it is not in the same directory as our program)\n",
130+
"# - the mode in which we want to open the file\n",
131+
"\n",
132+
"# Some popular modes are:\n",
133+
"# - 'r' : Opens the file in read-only mode;\n",
134+
"# - 'rb' : Opens the file as read-only in binary format;\n",
135+
"# - 'w' : Creates a file in write-only mode. If the file already exists, it will overwrite it;\n",
136+
"# - 'wb': Write-only mode in binary format;\n",
137+
"# - 'a' : Opens the file for appending new information to the end;\n",
138+
"# - 'w+' : Opens the file for writing and reading (creates it, or truncates it if it already exists);\n",
139+
"\n",
140+
"# We have used 'wb' in this example, since we want to export the data to a file (thus, write to it)\n",
141+
"# and response.content is in bytes\n",
142+
"\n",
143+
"# Never forget to close the file!"
144+
]
145+
},
146+
{
147+
"cell_type": "code",
148+
"execution_count": 9,
149+
"metadata": {},
150+
"outputs": [],
151+
"source": [
152+
"# To ensure the file will always be closed, use the 'with' statement\n",
153+
"# This automatically calls file.close() at the end"
154+
]
155+
},
156+
{
157+
"cell_type": "code",
158+
"execution_count": 10,
159+
"metadata": {},
160+
"outputs": [],
161+
"source": [
162+
"with open(\"dog_image_2.jpg\", \"wb\") as file:\n",
163+
" file.write(response.content)"
164+
]
165+
},
166+
{
167+
"cell_type": "code",
168+
"execution_count": null,
169+
"metadata": {},
170+
"outputs": [],
171+
"source": []
172+
},
173+
{
174+
"cell_type": "code",
175+
"execution_count": 11,
176+
"metadata": {},
177+
"outputs": [],
178+
"source": [
179+
"# Here, we first receive the whole file and store it in the RAM, then export it to the hard disk\n",
180+
"# This method is really inefficient, especially for bigger files\n",
181+
"# In effect we download the file to the RAM\n",
182+
"\n",
183+
"# We can fix that with a couple of small changes to our code"
184+
]
185+
},
186+
{
187+
"cell_type": "markdown",
188+
"metadata": {},
189+
"source": [
190+
"## Streaming the download to a file"
191+
]
192+
},
193+
{
194+
"cell_type": "code",
195+
"execution_count": 12,
196+
"metadata": {},
197+
"outputs": [],
198+
"source": [
199+
"# Instead of reading the whole response immediately, \n",
200+
"# we can signal the program to only read part of the response when we tell it to.\n",
201+
"\n",
202+
"# This is achieved with the 'stream' parameter"
203+
]
204+
},
205+
{
206+
"cell_type": "code",
207+
"execution_count": 13,
208+
"metadata": {},
209+
"outputs": [],
210+
"source": [
211+
"# I will use test video files provided by file-examples.com\n",
212+
"url = \"https://file-examples.com/wp-content/uploads/2017/04/file_example_MP4_480_1_5MG.mp4\""
213+
]
214+
},
215+
{
216+
"cell_type": "code",
217+
"execution_count": 14,
218+
"metadata": {},
219+
"outputs": [],
220+
"source": [
221+
"r = requests.get(url, stream = True)\n",
222+
"\n",
223+
"with open(\"Sample_video_1,5_MB.mp4\", \"wb\") as f:\n",
224+
" \n",
225+
" # Now we iterate over the response in chunks\n",
226+
" for chunk in r.iter_content(chunk_size = 16*1024):\n",
227+
" f.write(chunk)"
228+
]
229+
},
230+
{
231+
"cell_type": "code",
232+
"execution_count": 15,
233+
"metadata": {},
234+
"outputs": [],
235+
"source": [
236+
"# You can tune the chunk size to get the best download speed for your system"
237+
]
238+
},
239+
{
240+
"cell_type": "code",
241+
"execution_count": 16,
242+
"metadata": {},
243+
"outputs": [],
244+
"source": [
245+
"# However, when using 'stream=True', requests will not release the connection to the server until all data has been read or the response is closed\n",
246+
"# Thus, sometimes the connection needs to be closed manually\n",
247+
"\n",
248+
"# Again, that is best done using the 'with' statement"
249+
]
250+
},
251+
{
252+
"cell_type": "code",
253+
"execution_count": 17,
254+
"metadata": {},
255+
"outputs": [],
256+
"source": [
257+
"# So, the final code for file download is\n",
258+
"url = \"https://file-examples.com/wp-content/uploads/2017/04/file_example_MP4_1920_18MG.mp4\"\n",
259+
"\n",
260+
"with requests.get(url, stream = True) as r:\n",
261+
" with open(\"Sample_video_18_MB.mp4\", \"wb\") as f:\n",
262+
" for chunk in r.iter_content(chunk_size = 16*1024):\n",
263+
" f.write(chunk)\n"
264+
]
265+
}
266+
],
267+
"metadata": {
268+
"kernelspec": {
269+
"display_name": "Python 3",
270+
"language": "python",
271+
"name": "python3"
272+
},
273+
"language_info": {
274+
"codemirror_mode": {
275+
"name": "ipython",
276+
"version": 3
277+
},
278+
"file_extension": ".py",
279+
"mimetype": "text/x-python",
280+
"name": "python",
281+
"nbconvert_exporter": "python",
282+
"pygments_lexer": "ipython3",
283+
"version": "3.7.3"
284+
}
285+
},
286+
"nbformat": 4,
287+
"nbformat_minor": 2
288+
}

0 commit comments

Comments
 (0)