1
+ {
2
+ "nbformat" : 4 ,
3
+ "nbformat_minor" : 0 ,
4
+ "metadata" : {
5
+ "colab" : {
6
+ "name" : " 웹 크롤링" ,
7
+ "provenance" : [],
8
+ "collapsed_sections" : [],
9
+ "include_colab_link" : true
10
+ },
11
+ "kernelspec" : {
12
+ "name" : " python3" ,
13
+ "display_name" : " Python 3"
14
+ }
15
+ },
16
+ "cells" : [
17
+ {
18
+ "cell_type" : " markdown" ,
19
+ "metadata" : {
20
+ "id" : " view-in-github" ,
21
+ "colab_type" : " text"
22
+ },
23
+ "source" : [
24
+ " <a href=\" https://colab.research.google.com/github/ndb796/Python-Data-Analysis-and-Image-Processing-Tutorial/blob/master/28.%20%EC%9B%B9%20%ED%81%AC%EB%A1%A4%EB%A7%81/%EC%9B%B9%20%ED%81%AC%EB%A1%A4%EB%A7%81.ipynb\" target=\" _parent\" ><img src=\" https://colab.research.google.com/assets/colab-badge.svg\" alt=\" Open In Colab\" /></a>"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type" : " markdown" ,
29
+ "metadata" : {
30
+ "id" : " oB7RjFELjJkA" ,
31
+ "colab_type" : " text"
32
+ },
33
+ "source" : [
34
+ " ## 웹 크롤링\n " ,
35
+ " [강의 노트](https://github.com/ndb796/Python-Data-Analysis-and-Image-Processing-Tutorial/blob/master/28.%20%EC%9B%B9%20%ED%81%AC%EB%A1%A4%EB%A7%81/Python%20%EB%8D%B0%EC%9D%B4%ED%84%B0%20%EB%B6%84%EC%84%9D%EA%B3%BC%20%EC%9D%B4%EB%AF%B8%EC%A7%80%20%EC%B2%98%EB%A6%AC%20-%20%EC%9B%B9%20%ED%81%AC%EB%A1%A4%EB%A7%81.pdf)"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type" : " markdown" ,
40
+ "metadata" : {
41
+ "id" : " UgjcJAgXjUBv" ,
42
+ "colab_type" : " text"
43
+ },
44
+ "source" : [
45
+ " **Web Crawler**\n " ,
46
+ " - 웹 크롤러란 자동화된 방법으로 웹(Web)에서 다양한 정보를 수집하는 소프트웨어입니다.\n " ,
47
+ " - 원하는 서비스에서 원하는 정보를 편하게 얻어올 수 있습니다.\n " ,
48
+ " - 언어를 막론하고 구현할 수 있지만, 주로 Python을 이용합니다."
49
+ ]
50
+ },
51
+ {
52
+ "cell_type" : " markdown" ,
53
+ "metadata" : {
54
+ "id" : " geNJPrWcjZrP" ,
55
+ "colab_type" : " text"
56
+ },
57
+ "source" : [
58
+ " **특정 웹 사이트 HTML 코드 추출 ①**"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type" : " code" ,
63
+ "metadata" : {
64
+ "id" : " bi5N9w0_jJ3f" ,
65
+ "colab_type" : " code" ,
66
+ "colab" : {
67
+ "base_uri" : " https://localhost:8080/" ,
68
+ "height" : 1000
69
+ },
70
+ "outputId" : " ae04f470-b509-44c4-f1d5-7c3c77bb6765"
71
+ },
72
+ "source" : [
73
+ " import requests\n " ,
74
+ " \n " ,
75
+ " # 특정 URL에 GET 요청을 보내고 응답(Response) 객체를 받아옵니다.\n " ,
76
+ " request = requests.get('http://www.dowellcomputer.com/main.jsp')\n " ,
77
+ " \n " ,
78
+ " # 접속한 이후의 웹 사이트 소스코드를 추출합니다.\n " ,
79
+ " html = request.text.strip()\n " ,
80
+ " \n " ,
81
+ " print(html)"
82
+ ],
83
+ "execution_count" : 5 ,
84
+ "outputs" : [
85
+ {
86
+ "output_type" : " stream" ,
87
+ "text" : [
88
+ " <!DOCTYPE html>\r\n " ,
89
+ " \r\n " ,
90
+ " <html>\r\n " ,
91
+ " \t <head>\r\n " ,
92
+ " \t\t <link rel=\" stylesheet\" type=\" text/css\" href=\" ./css/mainStyle.css\" >\t\r\n " ,
93
+ " \t\t <meta http-equiv=\" Content-Type\" content=\" text/html; charset=UTF-8\" >\r\n " ,
94
+ " \t\t <title>컴잘알</title>\r\n " ,
95
+ " \t </head>\r\n " ,
96
+ " \t <body>\r\n " ,
97
+ " \t\t <div id=\" mainBox\" >\r\n " ,
98
+ " \t\t\t <div id=\" titleBox\" >\r\n " ,
99
+ " \t\t\t\t <a href=\" ./main.jsp\" >컴잘알</a>\r\n " ,
100
+ " \t\t\t </div>\r\n " ,
101
+ " \t\t\t <div id=\" navigationBox\" >\r\n " ,
102
+ " \t\t\t\t\r\n " ,
103
+ " \t\t\t\t\t\t <a href=\" ./member/memberLoginForm.jsp\" class=\" basicButton\" >로그인</a>\r\n " ,
104
+ " \t\t\t\t\t\t <a href=\" ./member/memberJoinForm.jsp\" class=\" basicButton\" >회원가입</a>\r\n " ,
105
+ " \t\t\t\t\r\n " ,
106
+ " \t\t\t\t\t <a href=\" ./study/study.jsp\" class=\" basicButton\" >공부방</a>\r\n " ,
107
+ " \t\t\t\t\t <a href=\" ./talk/talkListForm.jsp\" class=\" basicButton\" >대화방</a>\r\n " ,
108
+ " \t\t\t\t\t <a href=\" ./notice/noticeListForm.jsp\" class=\" basicButton\" >공지사항</a>\r\n " ,
109
+ " \t\t\t </div>\r\n " ,
110
+ " \t\t\t <hr style=\" border: 2px solid black;\" >\r\n " ,
111
+ " \t\t\t <div class=\" slideshow-container\" ><br>\r\n " ,
112
+ " \t\t\t <div class=\" mySlides fade\" >\r\n " ,
113
+ " \t\t\t <img src=\" ./image/mainOne.jpg\" style=\" width:100%\" >\r\n " ,
114
+ " \t\t\t </div>\r\n " ,
115
+ " \t\t\t <div class=\" mySlides fade\" >\r\n " ,
116
+ " \t\t\t <img src=\" ./image/mainTwo.jpg\" style=\" width:100%\" >\r\n " ,
117
+ " \t\t\t </div>\r\n " ,
118
+ " \t\t\t <div class=\" mySlides fade\" >\r\n " ,
119
+ " \t\t\t <img src=\" ./image/mainThree.jpg\" style=\" width:100%\" >\r\n " ,
120
+ " \t\t\t </div>\r\n " ,
121
+ " \t\t\t <a class=\" prev\" onclick=\" plusSlides(-1)\" >❮</a>\r\n " ,
122
+ " \t\t\t <a class=\" next\" onclick=\" plusSlides(1)\" >❯</a>\r\n " ,
123
+ " \t\t\t </div>\r\n " ,
124
+ " \t\t\t <br>\r\n " ,
125
+ " \t\t\t <div style=\" text-align:center\" >\r\n " ,
126
+ " \t\t\t <span class=\" dot\" onclick=\" currentSlide(1)\" ></span> \r\n " ,
127
+ " \t\t\t <span class=\" dot\" onclick=\" currentSlide(2)\" ></span> \r\n " ,
128
+ " \t\t\t <span class=\" dot\" onclick=\" currentSlide(3)\" ></span> \r\n " ,
129
+ " \t\t\t </div>\r\n " ,
130
+ " \t\t\t <script>\r\n " ,
131
+ " \t\t\t\t var slideIndex = 1;\r\n " ,
132
+ " \t\t\t\t showSlides(slideIndex);\r\n " ,
133
+ " \t\t\t\t\r\n " ,
134
+ " \t\t\t\t function plusSlides(n) {\r\n " ,
135
+ " \t\t\t\t showSlides(slideIndex += n);\r\n " ,
136
+ " \t\t\t\t }\r\n " ,
137
+ " \t\t\t\t\r\n " ,
138
+ " \t\t\t\t function currentSlide(n) {\r\n " ,
139
+ " \t\t\t\t showSlides(slideIndex = n);\r\n " ,
140
+ " \t\t\t\t }\r\n " ,
141
+ " \t\t\t\t\r\n " ,
142
+ " \t\t\t\t function showSlides(n) {\r\n " ,
143
+ " \t\t\t\t var i;\r\n " ,
144
+ " \t\t\t\t var slides = document.getElementsByClassName(\" mySlides\" );\r\n " ,
145
+ " \t\t\t\t var dots = document.getElementsByClassName(\" dot\" );\r\n " ,
146
+ " \t\t\t\t if (n > slides.length) {slideIndex = 1} \r\n " ,
147
+ " \t\t\t\t if (n < 1) {slideIndex = slides.length}\r\n " ,
148
+ " \t\t\t\t for (i = 0; i < slides.length; i++) {\r\n " ,
149
+ " \t\t\t\t slides[i].style.display = \" none\" ; \r\n " ,
150
+ " \t\t\t\t }\r\n " ,
151
+ " \t\t\t\t for (i = 0; i < dots.length; i++) {\r\n " ,
152
+ " \t\t\t\t dots[i].className = dots[i].className.replace(\" active\" , \"\" );\r\n " ,
153
+ " \t\t\t\t }\r\n " ,
154
+ " \t\t\t\t slides[slideIndex-1].style.display = \" block\" ; \r\n " ,
155
+ " \t\t\t\t dots[slideIndex-1].className += \" active\" ;\r\n " ,
156
+ " \t\t\t\t }\r\n " ,
157
+ " \t\t\t </script>\r\n " ,
158
+ " \t\t\t <br>\r\n " ,
159
+ " \t\t\t <div id=\" viewBox\" >\r\n " ,
160
+ " \t\t\t\t <table>\r\n " ,
161
+ " \t\t \t <tr>\r\n " ,
162
+ " \t\t \t\t <td class=\" head\" colspan=\" 4\" >\r\n " ,
163
+ " \t\t \t\t\t 최근 공지사항\r\n " ,
164
+ " \t\t \t\t </td>\r\n " ,
165
+ " \t\t \t </tr>\t\t\t\t\r\n " ,
166
+ " \t\t\t\t <tr>\r\n " ,
167
+ " \t\t\t\t <td class=\" middle\" >\r\n " ,
168
+ " \t\t\t\t \t 아이디\r\n " ,
169
+ " \t\t\t\t </td>\r\n " ,
170
+ " \t\t\t\t <td class=\" middle\" style=\" width: 320px;\" >\r\n " ,
171
+ " \t\t\t\t \t 제목\r\n " ,
172
+ " \t\t\t\t </td>\r\n " ,
173
+ " \t\t\t\t <td class=\" middle\" style=\" width: 180px;\" >\r\n " ,
174
+ " \t\t\t\t \t 게시글 등록일\r\n " ,
175
+ " \t\t\t\t </td>\t\t \t\t \t \t\t \t\t \r\n " ,
176
+ " \t\t\t\t </tr>\r\n " ,
177
+ " \t\t\t\t \r\n " ,
178
+ " \t\t\t\t <tr> \r\n " ,
179
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >나동빈</td>\r\n " ,
180
+ " \t\t\t\t \t <td class=\" tail\" ><a href=\" ./notice/noticeViewForm.jsp?noticeID=4\" ><b>자바 기초 프로그래밍 강좌를 완강했습니다.</b></a></td>\r\n " ,
181
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >2017-05-02\r\n " ,
182
+ " \t\t\t\t </tr>\r\n " ,
183
+ " \t\t\t\t \r\n " ,
184
+ " \t\t\t\t <tr> \r\n " ,
185
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >나동빈</td>\r\n " ,
186
+ " \t\t\t\t \t <td class=\" tail\" ><a href=\" ./notice/noticeViewForm.jsp?noticeID=1\" ><b>컴잘알에 오신 것을 환영합니다.</b></a></td>\r\n " ,
187
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >2016-11-28\r\n " ,
188
+ " \t\t\t\t </tr>\r\n " ,
189
+ " \t\t\t\t \r\n " ,
190
+ " \t\t\t\t </table>\t\t\t\r\n " ,
191
+ " \t\t\t </div>\t\r\n " ,
192
+ " \t\t\t <div class=\" studyViewBox\" >\r\n " ,
193
+ " \t\t\t\t <table>\r\n " ,
194
+ " \t\t \t <tr>\r\n " ,
195
+ " \t\t \t\t <td class=\" head\" colspan=\" 3\" >\r\n " ,
196
+ " \t\t \t\t\t 최근 강의\r\n " ,
197
+ " \t\t \t\t </td>\r\n " ,
198
+ " \t\t \t </tr>\t\t\t\t\r\n " ,
199
+ " \t\t\t\t <tr>\r\n " ,
200
+ " \t\t\t\t <td class=\" middle\" >\r\n " ,
201
+ " \t\t\t\t \t 선생님\r\n " ,
202
+ " \t\t\t\t </td>\r\n " ,
203
+ " \t\t\t\t <td class=\" middle\" style=\" width: 560px;\" >\r\n " ,
204
+ " \t\t\t\t \t 제목\r\n " ,
205
+ " \t\t\t\t </td>\r\n " ,
206
+ " \t\t\t\t <td class=\" middle\" style=\" width: 180px;\" >\r\n " ,
207
+ " \t\t\t\t \t 게시글 등록일\r\n " ,
208
+ " \t\t\t\t </td>\t\t \t\t \t \t\t \t\t \r\n " ,
209
+ " \t\t\t\t </tr>\r\n " ,
210
+ " \t\t\t\t \r\n " ,
211
+ " \t\t\t\t <tr> \r\n " ,
212
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >나동빈</td>\r\n " ,
213
+ " \t\t\t\t \t <td class=\" tail\" ><a href=\" ./study/study.jsp?studyID=85\" ><b>C언어 기초 프로그래밍 강좌 20강 - 동적 메모리의 활용 (C Programming Tutorial For Beginners 2017 #20) </b></a></td>\r\n " ,
214
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >2017-05-15\r\n " ,
215
+ " \t\t\t\t </tr>\r\n " ,
216
+ " \t\t\t\t \r\n " ,
217
+ " \t\t\t\t <tr> \r\n " ,
218
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >나동빈</td>\r\n " ,
219
+ " \t\t\t\t \t <td class=\" tail\" ><a href=\" ./study/study.jsp?studyID=84\" ><b>C언어 기초 프로그래밍 강좌 19강 - 동적 메모리 (C Programming Tutorial For Beginners 2017 #19) </b></a></td>\r\n " ,
220
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >2017-05-15\r\n " ,
221
+ " \t\t\t\t </tr>\r\n " ,
222
+ " \t\t\t\t \r\n " ,
223
+ " \t\t\t\t <tr> \r\n " ,
224
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >나동빈</td>\r\n " ,
225
+ " \t\t\t\t \t <td class=\" tail\" ><a href=\" ./study/study.jsp?studyID=83\" ><b>C언어 기초 프로그래밍 강좌 18강 - 파일 입출력 (C Programming Tutorial For Beginners 2017 #18) </b></a></td>\r\n " ,
226
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >2017-05-15\r\n " ,
227
+ " \t\t\t\t </tr>\r\n " ,
228
+ " \t\t\t\t \r\n " ,
229
+ " \t\t\t\t <tr> \r\n " ,
230
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >나동빈</td>\r\n " ,
231
+ " \t\t\t\t \t <td class=\" tail\" ><a href=\" ./study/study.jsp?studyID=82\" ><b>C언어 기초 프로그래밍 강좌 17강 - 구조체의 활용 ② (C Programming Tutorial For Beginners 2017 #17) </b></a></td>\r\n " ,
232
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >2017-05-15\r\n " ,
233
+ " \t\t\t\t </tr>\r\n " ,
234
+ " \t\t\t\t \r\n " ,
235
+ " \t\t\t\t <tr> \r\n " ,
236
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >나동빈</td>\r\n " ,
237
+ " \t\t\t\t \t <td class=\" tail\" ><a href=\" ./study/study.jsp?studyID=81\" ><b>C언어 기초 프로그래밍 강좌 16강 - 구조체의 활용 ① (C Programming Tutorial For Beginners 2017 #16) </b></a></td>\r\n " ,
238
+ " \t\t\t\t \t <td class=\" tail\" style=\" text-align: center;\" >2017-05-15\r\n " ,
239
+ " \t\t\t\t </tr>\r\n " ,
240
+ " \t\t\t\t \r\n " ,
241
+ " \t\t\t\t </table>\r\n " ,
242
+ " \t\t\t </div>\t\r\n " ,
243
+ " \t\t </div>\t\r\n " ,
244
+ " \t </body>\r\n " ,
245
+ " </html>\n "
246
+ ],
247
+ "name" : " stdout"
248
+ }
249
+ ]
250
+ },
251
+ {
252
+ "cell_type" : " markdown" ,
253
+ "metadata" : {
254
+ "id" : " qdqode2ejhv-" ,
255
+ "colab_type" : " text"
256
+ },
257
+ "source" : [
258
+ " **특정 웹 사이트 HTML 코드 추출 ②**"
259
+ ]
260
+ },
261
+ {
262
+ "cell_type" : " code" ,
263
+ "metadata" : {
264
+ "id" : " bxoTAzPwjjEv" ,
265
+ "colab_type" : " code" ,
266
+ "colab" : {
267
+ "base_uri" : " https://localhost:8080/" ,
268
+ "height" : 52
269
+ },
270
+ "outputId" : " ae14dd3d-bf20-41bd-c955-01dc27aff5d4"
271
+ },
272
+ "source" : [
273
+ " import requests\n " ,
274
+ " from bs4 import BeautifulSoup\n " ,
275
+ " \n " ,
276
+ " # 특정 URL에 GET 요청을 보내고 응답(Response) 객체를 받아옵니다.\n " ,
277
+ " request = requests.get('http://www.dowellcomputer.com/main.jsp')\n " ,
278
+ " # 접속한 이후의 웹 사이트 소스코드를 추출합니다.\n " ,
279
+ " html = request.text\n " ,
280
+ " # HTML 소스코드를 파이썬 BeautifulSoup 객체로 변환합니다.\n " ,
281
+ " soup = BeautifulSoup(html, 'html.parser')\n " ,
282
+ " \n " ,
283
+ " # <td> 태그의 직계 자식인 <a> 태그 요소를 추출합니다.\n " ,
284
+ " links = soup.select('td > a')\n " ,
285
+ " \n " ,
286
+ " # 모든 링크에 하나씩 접근합니다.\n " ,
287
+ " for link in links:\n " ,
288
+ " # 링크가 href 속성을 가지고 있다면\n " ,
289
+ " if link.has_attr('href'):\n " ,
290
+ " # href 속성의 값으로 notice라는 문자가 포함되어 있다면\n " ,
291
+ " if link.get('href').find('notice') != -1:\n " ,
292
+ " print(link.text)"
293
+ ],
294
+ "execution_count" : 2 ,
295
+ "outputs" : [
296
+ {
297
+ "output_type" : " stream" ,
298
+ "text" : [
299
+ " 자바 기초 프로그래밍 강좌를 완강했습니다.\n " ,
300
+ " 컴잘알에 오신 것을 환영합니다.\n "
301
+ ],
302
+ "name" : " stdout"
303
+ }
304
+ ]
305
+ }
306
+ ]
307
+ }
0 commit comments