-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetch_opengraph.py
90 lines (73 loc) · 2.98 KB
/
fetch_opengraph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
import requests
from bs4 import BeautifulSoup
import yaml
import re
from datetime import datetime
# YAML file read by main(); expected to hold a top-level 'topics' list whose
# items each carry a 'title' and a list of 'articles' (each with a 'url').
INPUT_FILE = '_data/external_articles.yml'
# YAML file written by main() with the fetched per-article metadata.
OUTPUT_FILE = '_data/articles_data.yml'
def _meta_content(tag):
    """Return the stripped ``content`` attribute of a meta tag, or None.

    None is returned when the tag is missing, has no ``content`` attribute,
    or the attribute is empty / whitespace-only.
    """
    if tag is None:
        return None
    value = tag.get('content')
    if not value:
        return None
    return value.strip() or None


def fetch_opengraph_data(url):
    """Fetch a page and extract its Open Graph title, image and publish date.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``url``, ``title``, ``image`` and ``published``,
        or None when the page could not be downloaded (the error is printed).
        Missing or empty metadata falls back to 'No Title', a placeholder
        image URL and '公開日不明' respectively. A publish date given as a
        10-digit Unix timestamp is converted to ``YYYY-MM-DD`` (UTC); any
        other value is returned as found (whitespace-stripped).
    """
    # Local import: only `datetime` itself is imported at file level.
    from datetime import timezone

    headers = {
        'User-Agent': 'Mozilla/5.0 (compatible; OpenGraphFetcher/1.0)'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except Exception as e:
        # Broad on purpose: any failure for one URL is reported and turned
        # into None so the caller can carry on with the remaining URLs.
        print(f"Error fetching URL {url}: {e}")
        return None

    # Parse the raw bytes rather than response.text: when the server sends
    # text/html without a charset, requests falls back to ISO-8859-1, which
    # garbles non-Latin pages. Only trust the header encoding when a charset
    # was actually declared; otherwise let BeautifulSoup detect it.
    content_type = response.headers.get('Content-Type', '')
    declared_encoding = response.encoding if 'charset' in content_type.lower() else None
    soup = BeautifulSoup(response.content, 'html.parser', from_encoding=declared_encoding)

    # Title and image
    title = _meta_content(soup.find('meta', property='og:title')) or 'No Title'
    image = (
        _meta_content(soup.find('meta', property='og:image'))
        or 'https://via.placeholder.com/512x320?text=No+Image'
    )

    # Publish date: prefer article:published_time, then itemprop=datePublished.
    # The fallback is also used when the first tag exists but has no content.
    published_raw = (
        _meta_content(soup.find('meta', property='article:published_time'))
        or _meta_content(soup.find('meta', {'itemprop': 'datePublished'}))
    )
    if published_raw is None:
        published = '公開日不明'
    elif re.fullmatch(r'\d{10}', published_raw):
        # Convert a Unix timestamp (seconds) to a calendar date in UTC.
        published = datetime.fromtimestamp(
            int(published_raw), tz=timezone.utc
        ).strftime('%Y-%m-%d')
    else:
        published = published_raw

    return {
        'url': url,
        'title': title,
        'image': image,
        'published': published
    }
def main():
    """Build OUTPUT_FILE from the topics and article URLs in INPUT_FILE.

    For every article of every topic the Open Graph data is fetched; values
    set by hand in the input ('published', 'title', 'image') take precedence
    over the fetched ones. Articles whose fetch fails are reported and left
    out of the output.
    """
    with open(INPUT_FILE, 'r', encoding='utf-8') as source:
        config = yaml.safe_load(source)

    collected_topics = []
    for section in config['topics']:
        section_out = {
            'title': section['title'],
            'articles': []
        }
        for entry in section['articles']:
            link = entry['url']
            print(f"Fetching Open Graph data from: {link}")
            result = fetch_opengraph_data(link)
            if not result:
                print(f"Failed to fetch data for {link}")
                continue
            # Manually configured values override whatever was fetched.
            for field in ('published', 'title', 'image'):
                override = entry.get(field)
                if override:
                    result[field] = override
            section_out['articles'].append(result)
        collected_topics.append(section_out)

    with open(OUTPUT_FILE, 'w', encoding='utf-8') as sink:
        yaml.dump({'topics': collected_topics}, sink, allow_unicode=True)
# Run the fetch-and-write job only when executed as a script, not on import.
if __name__ == "__main__":
    main()