-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Initial the Repo
- Loading branch information
0 parents
commit 2a41c15
Showing
3 changed files
with
395 additions
and
0 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
# NewsAI |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,394 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "markdown", | ||
"id": "8fd059cd", | ||
"metadata": {}, | ||
"source": [ | ||
"# 0.Initial " | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 11, | ||
"id": "52c5fb7a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"import re\n", | ||
"import schedule\n", | ||
"\n", | ||
"import time\n", | ||
"from datetime import datetime, timedelta\n", | ||
"\n", | ||
"import json\n", | ||
"\n", | ||
"import requests\n", | ||
"\n", | ||
"from bs4 import BeautifulSoup\n", | ||
"\n", | ||
"import mysql.connector\n", | ||
"\n", | ||
"pattern = re.compile(r'jQuery\\d+_\\d+\\((.*?)\\);')\n", | ||
"\n", | ||
"def get_title(text):\n", | ||
" return text.split(\"】\", 1)[0].split(\"【\", 1)[-1].strip()\n", | ||
"\n", | ||
"def get_text(text):\n", | ||
" return text.split(\"】\", 1)[-1].strip()\n", | ||
"\n", | ||
"def get_source(text):\n", | ||
" return text.rsplit(\"(\", 1)[-1].rsplit(\")\", 1)[0].strip()\n", | ||
"\n", | ||
"def get_text_from_source(text):\n", | ||
" return text.rsplit(\"(\", 1)[0].strip()\n", | ||
"\n", | ||
"def get_section(text):\n", | ||
" return text.split(\"|\", 1)[0].strip()\n", | ||
"\n", | ||
"def get_text_from_section(text):\n", | ||
" return text.split(\"|\", 1)[-1].strip()\n", | ||
"\n", | ||
"def get_section_chinese(text):\n", | ||
" return text.split(\"|\", 1)[0].strip()\n", | ||
"\n", | ||
"def get_text_from_section_chinese(text):\n", | ||
" return text.split(\"|\", 1)[-1].strip()\n", | ||
"\n", | ||
"def get_section_special(text):\n", | ||
" return text.split(\"︱\", 1)[0].strip()\n", | ||
"\n", | ||
"def get_text_from_section_special(text):\n", | ||
" return text.split(\"︱\", 1)[-1].strip()\n", | ||
"\n", | ||
"def print_scrape_progress(time_last, num):\n", | ||
" print(\"Data scraped until %s. %d was inserted into MySQL.\" % (time_last, num), end=\"\\r\")\n", | ||
" \n", | ||
"def exit_loop_condition(time_this, time_now, time_delta):\n", | ||
" return time_this <= time_now - timedelta(hours = time_delta)\n", | ||
"\n", | ||
"def start_mysql():\n", | ||
" mydb = mysql.connector.connect(\n", | ||
" host=\"localhost\",\n", | ||
" user=\"root\",\n", | ||
" password=\"password\",\n", | ||
" database=\"NEWS\",\n", | ||
" auth_plugin='mysql_native_password'\n", | ||
" )\n", | ||
"\n", | ||
" mycursor = mydb.cursor()\n", | ||
" return mycursor, mydb\n", | ||
"\n", | ||
"def insert_mysql(val, mycursor, mydb, table):\n", | ||
" sql = \"INSERT INTO \" + table + \" (originalID, time, title, source, section, text) VALUES (%s, %s, %s, %s, %s, %s)\"\n", | ||
" mycursor.executemany(sql, val)\n", | ||
" mydb.commit()\n", | ||
"\n", | ||
"def close_mysql(mycursor, mydb):\n", | ||
" mycursor.close()\n", | ||
" mydb.close()\n", | ||
"\n", | ||
"headers = \"\"\"\n", | ||
"Accept: */*\n", | ||
"Accept-Encoding: gzip, deflate, br, zstd\n", | ||
"Accept-Language: zh-CN,zh;q=0.9,en;q=0.8\n", | ||
"Connection: keep-alive\n", | ||
"Cookie: v=A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD\n", | ||
"Host: news.10jqka.com.cn\n", | ||
"Referer: https://news.10jqka.com.cn/realtimenews.html\n", | ||
"Sec-Fetch-Dest: empty\n", | ||
"Sec-Fetch-Mode: cors\n", | ||
"Sec-Fetch-Site: same-origin\n", | ||
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36\n", | ||
"X-Requested-With: XMLHttpRequest\n", | ||
"hexin-v: A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD\n", | ||
"sec-ch-ua: \"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"\n", | ||
"sec-ch-ua-mobile: ?0\n", | ||
"sec-ch-ua-platform: \"macOS\"\n", | ||
"\"\"\"\n", | ||
"headers = dict([i.split(\": \") for i in headers.split(\"\\n\") if i !=\"\"])" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "86308181", | ||
"metadata": {}, | ||
"source": [ | ||
"# 1. SINA" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 2, | ||
"id": "866d313a", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_sina_news(time_delta=24):\n", | ||
" datetime_now = datetime.now()\n", | ||
" mycursor, mydb = start_mysql()\n", | ||
"\n", | ||
" url = \"https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%s&page=1&page_size=1&zhibo_id=152&tag_id=0\" % (datetime_now.strftime(\"%s\"))\n", | ||
" response = requests.get(url)\n", | ||
" match = pattern.search(response.text)\n", | ||
" json_content = match.group(1)\n", | ||
" data = json.loads(json_content)\n", | ||
" id_now = data[\"result\"][\"data\"][\"feed\"][\"list\"][0][\"id\"]\n", | ||
"\n", | ||
" page = 1\n", | ||
" while True:\n", | ||
" url = \"https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%s&page=%d&page_size=100&zhibo_id=152&tag_id=0&id=%d&type=1\" % (datetime_now.strftime(\"%s\"), page, id_now)\n", | ||
" response = requests.get(url)\n", | ||
" match = pattern.search(response.text)\n", | ||
" json_content = match.group(1)\n", | ||
" data = json.loads(json_content)\n", | ||
" data_list = data[\"result\"][\"data\"][\"feed\"][\"list\"]\n", | ||
" val = []\n", | ||
" for i in data_list:\n", | ||
" if i[\"rich_text\"][0] == \"【\":\n", | ||
" val.append([i[\"id\"], i[\"create_time\"], get_title(i[\"rich_text\"]), \"\", \"\", get_text(i[\"rich_text\"])])\n", | ||
" else:\n", | ||
" val.append([i[\"id\"], i[\"create_time\"], \"\", \"\", \"\", i[\"rich_text\"]])\n", | ||
"\n", | ||
" insert_mysql(val, mycursor, mydb, \"sina\")\n", | ||
"\n", | ||
" time_last = data_list[-1][\"create_time\"]\n", | ||
" print_scrape_progress(time_last, mycursor.rowcount)\n", | ||
" if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'), datetime_now, time_delta):\n", | ||
" break\n", | ||
" else:\n", | ||
" page += 1\n", | ||
"\n", | ||
" close_mysql(mycursor, mydb)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "45619db1", | ||
"metadata": {}, | ||
"source": [ | ||
"# 2. 10JQKA" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 3, | ||
"id": "b3fa1994", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_10jqka_news(time_delta=24):\n", | ||
" datetime_now = datetime.now()\n", | ||
" mycursor, mydb = start_mysql()\n", | ||
"\n", | ||
" page = 1\n", | ||
" while True:\n", | ||
" url = \"https://news.10jqka.com.cn/tapp/news/push/stock/?page=%d&tag=&track=website&pagesize=100\" % page\n", | ||
" response = requests.get(url, headers=headers)\n", | ||
" data_list = response.json()[\"data\"][\"list\"]\n", | ||
"\n", | ||
" val = [[i[\"id\"], datetime.fromtimestamp(int(i[\"ctime\"])), i[\"title\"], \"\", \"\", i[\"digest\"]] for i in data_list]\n", | ||
" insert_mysql(val, mycursor, mydb, \"10jqka\")\n", | ||
"\n", | ||
" time_last = datetime.fromtimestamp(int(data_list[-1][\"ctime\"]))\n", | ||
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n", | ||
" if exit_loop_condition(time_last, datetime_now, time_delta):\n", | ||
" break\n", | ||
" else:\n", | ||
" page = page + 1\n", | ||
"\n", | ||
" close_mysql(mycursor, mydb)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "305658d7", | ||
"metadata": {}, | ||
"source": [ | ||
"# 3. WSCN" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"id": "48988219", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_wscn_news(time_delta=24):\n", | ||
" mycursor, mydb = start_mysql()\n", | ||
" datetime_now = datetime.now()\n", | ||
"\n", | ||
" url = \"https://api-one-wscn.awtmt.com/apiv1/content/lives?channel=global-channel&client=pc&limit=100\"\n", | ||
" response = requests.get(url)\n", | ||
" data = response.json()[\"data\"]\n", | ||
"\n", | ||
" val = []\n", | ||
" for i in data[\"items\"]:\n", | ||
" if i[\"content_text\"][-1] == \")\":\n", | ||
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n", | ||
" i[\"title\"], get_source(i[\"content_text\"]), \"\",\n", | ||
" get_text_from_source(i[\"content_text\"])])\n", | ||
" else:\n", | ||
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n", | ||
" i[\"title\"], \"\", \"\",\n", | ||
" i[\"content_text\"]])\n", | ||
"\n", | ||
" insert_mysql(val, mycursor, mydb, \"wscn\")\n", | ||
"\n", | ||
" next_cursor = data[\"next_cursor\"]\n", | ||
" time_last = datetime.fromtimestamp(int(next_cursor))\n", | ||
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n", | ||
"\n", | ||
" while True:\n", | ||
" url = \"https://api-one-wscn.awtmt.com/apiv1/content/lives?channel=global-channel&client=pc&limit=100&cursor=%s\" % next_cursor\n", | ||
" response = requests.get(url)\n", | ||
" data = response.json()[\"data\"]\n", | ||
"\n", | ||
" val = []\n", | ||
" for i in data[\"items\"]:\n", | ||
" if \"(\" in i[\"content_text\"]:\n", | ||
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n", | ||
" i[\"title\"], get_source(i[\"content_text\"]), \"\",\n", | ||
" get_text_from_source(i[\"content_text\"])])\n", | ||
" else:\n", | ||
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n", | ||
" i[\"title\"], \"\", \"\", \n", | ||
" i[\"content_text\"]])\n", | ||
"\n", | ||
" insert_mysql(val, mycursor, mydb, \"wscn\")\n", | ||
"\n", | ||
" next_cursor = data[\"next_cursor\"]\n", | ||
" time_last = datetime.fromtimestamp(int(next_cursor))\n", | ||
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n", | ||
" if exit_loop_condition(time_last, datetime_now, time_delta):\n", | ||
" break\n", | ||
"\n", | ||
" close_mysql(mycursor, mydb)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "2c0ddcc7", | ||
"metadata": {}, | ||
"source": [ | ||
"# 4. NDB" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 5, | ||
"id": "08cb9c89", | ||
"metadata": { | ||
"scrolled": true | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"def get_ndb_news(time_delta=24):\n", | ||
" mycursor, mydb = start_mysql()\n", | ||
" datetime_now = datetime.now()\n", | ||
"\n", | ||
" page = 1\n", | ||
" while True:\n", | ||
" url = \"https://www.nbd.com.cn/columns/3/page/%d\" % page\n", | ||
"\n", | ||
" response = requests.get(url)\n", | ||
" soup = BeautifulSoup(response.text)\n", | ||
"\n", | ||
" data = soup.find(\"div\", {\"class\": \"g-list-text\"})\n", | ||
" dates = [i.text.strip() for i in data.find_all(\"p\", {\"class\": \"u-channeltime\"})]\n", | ||
" this_page_ids = [[j.find(\"a\")[\"href\"] for j in i.find_all(\"li\", {\"class\": \"u-news-title\"})] for i in data.find_all(\"ul\", {\"class\": \"u-news-list\"})]\n", | ||
" this_page_news = [[[t.strip() for t in j.text.strip().split(\"\\n\\n\")] for j in i.find_all(\"li\", {\"class\": \"u-news-title\"})] for i in data.find_all(\"ul\", {\"class\": \"u-news-list\"})]\n", | ||
" for index, this_part_news in enumerate(this_page_news):\n", | ||
" val = []\n", | ||
" for i, this_news in enumerate(this_part_news):\n", | ||
" if \"|\" in this_news[0]:\n", | ||
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n", | ||
" get_section(this_news[0]), get_text_from_section(this_news[0])])\n", | ||
" elif \"|\" in this_news[0]:\n", | ||
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n", | ||
" get_section_chinese(this_news[0]), get_text_from_section_chinese(this_news[0])])\n", | ||
" elif \"︱\" in this_news[0]:\n", | ||
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n", | ||
" get_section_special(this_news[0]), get_text_from_section_special(this_news[0])])\n", | ||
" else:\n", | ||
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n", | ||
" \"\", this_news[0]])\n", | ||
" insert_mysql(val, mycursor, mydb, \"ndb\")\n", | ||
"\n", | ||
" time_last = val[-1][1]\n", | ||
"\n", | ||
" print_scrape_progress(time_last, mycursor.rowcount)\n", | ||
" if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'), datetime_now, time_delta):\n", | ||
" break\n", | ||
" else:\n", | ||
" page += 1\n" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"id": "4e64dbd3", | ||
"metadata": {}, | ||
"source": [ | ||
"# Schedule Running" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 13, | ||
"id": "f7650485", | ||
"metadata": {}, | ||
"outputs": [], | ||
"source": [ | ||
"def scrape_news(time_delta=24):\n", | ||
" get_sina_news(time_delta)\n", | ||
" get_10jqka_news(time_delta)\n", | ||
" get_wscn_news(time_delta)\n", | ||
" get_ndb_news(time_delta)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"id": "aeecc5b2", | ||
"metadata": {}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"Data scraped until 2024-06-30 14:35:42. 36 was inserted into MySQLL\r" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"schedule.every().day.at(\"23:59\").do(scrape_news, 24)\n", | ||
"\n", | ||
"while True:\n", | ||
" schedule.run_pending()\n", | ||
" time.sleep(60)" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"kernelspec": { | ||
"display_name": "Python 3 (ipykernel)", | ||
"language": "python", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.12" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 5 | ||
} |