15
15
16
16
getContent <-
17
17
function (turl ,
18
- col = c(" url " ,
19
- " original_url " ,
20
- " section " ,
21
- " datetime " ,
22
- " edittime " ,
23
- " press " ,
24
- " title " ,
25
- " body " ,
26
- " value " )) {
27
-
18
+ col = c(
19
+ " url " ,
20
+ " original_url " ,
21
+ " section " ,
22
+ " datetime " ,
23
+ " edittime " ,
24
+ " press " ,
25
+ " title " ,
26
+ " body "
27
+ )) {
28
28
httr2 :: request(turl ) %> %
29
29
httr2 :: req_user_agent(
" N2H4 by chanyub.park <[email protected] >" ) %
> %
30
30
httr2 :: req_method(" GET" ) %> %
31
31
httr2 :: req_perform() - > root
32
32
33
33
html_obj <- httr2 :: resp_body_html(root )
34
34
urlcheck <- root $ url
35
- value <- T
36
- if (identical(grep(" ^https?://n.news.naver.com" ,
37
- urlcheck ),
38
- integer(0 ))) {
35
+
36
+ if (
37
+ identical(
38
+ grep(" ^https?://n.news.naver.com" , urlcheck ), integer(0 )
39
+ )
40
+ ) {
39
41
original_url <- " page is not news section."
40
42
title <- " page is not news section."
41
43
datetime <- " page is not news section."
42
44
edittime <- " page is not news section."
43
45
press <- " page is not news section."
44
46
body <- " page is not news section."
45
47
section <- " page is not news section."
46
- value <- F
48
+
47
49
} else {
48
- # TODO: 이거 동작하는지 확인해야 함.
49
- chk <- rvest :: html_nodes(html_obj , " div#main_content div div" )
50
- chk <- rvest :: html_attr(chk , " class" )
51
- chk <- chk [1 ]
52
- if (is.na(chk )) {
53
- chk <- " not error"
54
- }
55
- if (" error_msg 404" == chk & value ) {
56
- original_url <- " page is moved."
57
- title <- " page is moved."
58
- datetime <- " page is moved."
59
- edittime <- " page is moved."
60
- press <- " page is moved."
61
- body <- " page is moved."
62
- section <- " page is moved."
63
- value <- F
64
- }
65
- }
66
- if (value ) {
67
50
original_url <- getOriginalUrl(html_obj )
68
51
title <- getContentTitle(html_obj )
69
52
datetime <- getContentDatetime(html_obj )
@@ -84,8 +67,7 @@ getContent <-
84
67
press = press ,
85
68
title = title ,
86
69
body = body ,
87
- section = section ,
88
- value = value
70
+ section = section
89
71
)
90
72
return (newsInfo [, col ])
91
73
}
@@ -141,7 +123,7 @@ getContentPress <-
141
123
142
124
getContentBody <-
143
125
function (html_obj ,
144
- body_node_info = " div #dic_area" ,
126
+ body_node_info = " article #dic_area" ,
145
127
body_attr = " " ) {
146
128
node <- rvest :: html_nodes(html_obj , body_node_info )
147
129
body <- rvest :: html_text(node )
@@ -171,4 +153,3 @@ getSection <- function(turl) {
171
153
}
172
154
return (httr2 :: url_parse(turl )$ query $ sid )
173
155
}
174
-
0 commit comments