Initialize the Repo
TianzhuQin committed Jul 1, 2024
0 parents commit 2a41c15
Showing 3 changed files with 395 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# NewsAI
394 changes: 394 additions & 0 deletions Scrape News Daily.ipynb
@@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8fd059cd",
"metadata": {},
"source": [
"# 0.Initial "
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "52c5fb7a",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import schedule\n",
"\n",
"import time\n",
"from datetime import datetime, timedelta\n",
"\n",
"import json\n",
"\n",
"import requests\n",
"\n",
"from bs4 import BeautifulSoup\n",
"\n",
"import mysql.connector\n",
"\n",
"pattern = re.compile(r'jQuery\\d+_\\d+\\((.*?)\\);')\n",
"\n",
"def get_title(text):\n",
" return text.split(\"\", 1)[0].split(\"\", 1)[-1].strip()\n",
"\n",
"def get_text(text):\n",
" return text.split(\"\", 1)[-1].strip()\n",
"\n",
"def get_source(text):\n",
" return text.rsplit(\"\", 1)[-1].rsplit(\"\", 1)[0].strip()\n",
"\n",
"def get_text_from_source(text):\n",
" return text.rsplit(\"\", 1)[0].strip()\n",
"\n",
"def get_section(text):\n",
" return text.split(\"|\", 1)[0].strip()\n",
"\n",
"def get_text_from_section(text):\n",
" return text.split(\"|\", 1)[-1].strip()\n",
"\n",
"def get_section_chinese(text):\n",
" return text.split(\"\", 1)[0].strip()\n",
"\n",
"def get_text_from_section_chinese(text):\n",
" return text.split(\"\", 1)[-1].strip()\n",
"\n",
"def get_section_special(text):\n",
" return text.split(\"\", 1)[0].strip()\n",
"\n",
"def get_text_from_section_special(text):\n",
" return text.split(\"\", 1)[-1].strip()\n",
"\n",
"def print_scrape_progress(time_last, num):\n",
" print(\"Data scraped until %s. %d was inserted into MySQL.\" % (time_last, num), end=\"\\r\")\n",
" \n",
"def exit_loop_condition(time_this, time_now, time_delta):\n",
" return time_this <= time_now - timedelta(hours = time_delta)\n",
"\n",
"def start_mysql():\n",
" mydb = mysql.connector.connect(\n",
" host=\"localhost\",\n",
" user=\"root\",\n",
" password=\"password\",\n",
" database=\"NEWS\",\n",
" auth_plugin='mysql_native_password'\n",
" )\n",
"\n",
" mycursor = mydb.cursor()\n",
" return mycursor, mydb\n",
"\n",
"def insert_mysql(val, mycursor, mydb, table):\n",
" sql = \"INSERT INTO \" + table + \" (originalID, time, title, source, section, text) VALUES (%s, %s, %s, %s, %s, %s)\"\n",
" mycursor.executemany(sql, val)\n",
" mydb.commit()\n",
"\n",
"def close_mysql(mycursor, mydb):\n",
" mycursor.close()\n",
" mydb.close()\n",
"\n",
"headers = \"\"\"\n",
"Accept: */*\n",
"Accept-Encoding: gzip, deflate, br, zstd\n",
"Accept-Language: zh-CN,zh;q=0.9,en;q=0.8\n",
"Connection: keep-alive\n",
"Cookie: v=A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD\n",
"Host: news.10jqka.com.cn\n",
"Referer: https://news.10jqka.com.cn/realtimenews.html\n",
"Sec-Fetch-Dest: empty\n",
"Sec-Fetch-Mode: cors\n",
"Sec-Fetch-Site: same-origin\n",
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36\n",
"X-Requested-With: XMLHttpRequest\n",
"hexin-v: A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD\n",
"sec-ch-ua: \"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"\n",
"sec-ch-ua-mobile: ?0\n",
"sec-ch-ua-platform: \"macOS\"\n",
"\"\"\"\n",
"headers = dict([i.split(\": \") for i in headers.split(\"\\n\") if i !=\"\"])"
]
},
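{
"cell_type": "markdown",
"id": "aa10f001",
"metadata": {},
"source": [
"`insert_mysql` expects the `NEWS` database to already contain one table per source with columns `(originalID, time, title, source, section, text)`. The DDL is not part of this repo, so the cell below is only a plausible sketch; the column types are assumptions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f002",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical schema sketch -- the real DDL is not in this repo.\n",
"# Column names follow insert_mysql; the types are assumptions.\n",
"def create_tables():\n",
"    mycursor, mydb = start_mysql()\n",
"    for table in [\"sina\", \"10jqka\", \"wscn\", \"ndb\"]:\n",
"        mycursor.execute(\n",
"            \"CREATE TABLE IF NOT EXISTS \" + table + \" (\"\n",
"            \"originalID VARCHAR(64), \"\n",
"            \"time DATETIME, \"\n",
"            \"title TEXT, \"\n",
"            \"source VARCHAR(255), \"\n",
"            \"section VARCHAR(255), \"\n",
"            \"text TEXT)\"\n",
"        )\n",
"    mydb.commit()\n",
"    close_mysql(mycursor, mydb)"
]
},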
{
"cell_type": "markdown",
"id": "86308181",
"metadata": {},
"source": [
"# 1. SINA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "866d313a",
"metadata": {},
"outputs": [],
"source": [
"def get_sina_news(time_delta=24):\n",
" datetime_now = datetime.now()\n",
" mycursor, mydb = start_mysql()\n",
"\n",
" url = \"https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%s&page=1&page_size=1&zhibo_id=152&tag_id=0\" % (datetime_now.strftime(\"%s\"))\n",
" response = requests.get(url)\n",
" match = pattern.search(response.text)\n",
" json_content = match.group(1)\n",
" data = json.loads(json_content)\n",
" id_now = data[\"result\"][\"data\"][\"feed\"][\"list\"][0][\"id\"]\n",
"\n",
" page = 1\n",
" while True:\n",
" url = \"https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%s&page=%d&page_size=100&zhibo_id=152&tag_id=0&id=%d&type=1\" % (datetime_now.strftime(\"%s\"), page, id_now)\n",
" response = requests.get(url)\n",
" match = pattern.search(response.text)\n",
" json_content = match.group(1)\n",
" data = json.loads(json_content)\n",
" data_list = data[\"result\"][\"data\"][\"feed\"][\"list\"]\n",
" val = []\n",
" for i in data_list:\n",
" if i[\"rich_text\"][0] == \"\":\n",
" val.append([i[\"id\"], i[\"create_time\"], get_title(i[\"rich_text\"]), \"\", \"\", get_text(i[\"rich_text\"])])\n",
" else:\n",
" val.append([i[\"id\"], i[\"create_time\"], \"\", \"\", \"\", i[\"rich_text\"]])\n",
"\n",
" insert_mysql(val, mycursor, mydb, \"sina\")\n",
"\n",
" time_last = data_list[-1][\"create_time\"]\n",
" print_scrape_progress(time_last, mycursor.rowcount)\n",
" if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'), datetime_now, time_delta):\n",
" break\n",
" else:\n",
" page += 1\n",
"\n",
" close_mysql(mycursor, mydb)"
]
},
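{
"cell_type": "markdown",
"id": "aa10f003",
"metadata": {},
"source": [
"The Sina feed returns JSONP, so `pattern` strips the `jQueryXXX_XXX(...)` wrapper before `json.loads`. A quick check on a synthetic payload, plus the 【title】body convention assumed by `get_title`/`get_text`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f004",
"metadata": {},
"outputs": [],
"source": [
"# Synthetic JSONP payload in the shape the regex expects -- illustration only.\n",
"sample = 'jQuery0_1719800000({\"result\": {\"status\": 0}});'\n",
"print(json.loads(pattern.search(sample).group(1)))  # {'result': {'status': 0}}\n",
"\n",
"# Flash items are assumed to lead with a bracketed title.\n",
"item = \"【Sample Title】Sample body text.\"\n",
"print(get_title(item))  # Sample Title\n",
"print(get_text(item))   # Sample body text."
]
},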
{
"cell_type": "markdown",
"id": "45619db1",
"metadata": {},
"source": [
"# 2. 10JQKA"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b3fa1994",
"metadata": {},
"outputs": [],
"source": [
"def get_10jqka_news(time_delta=24):\n",
" datetime_now = datetime.now()\n",
" mycursor, mydb = start_mysql()\n",
"\n",
" page = 1\n",
" while True:\n",
" url = \"https://news.10jqka.com.cn/tapp/news/push/stock/?page=%d&tag=&track=website&pagesize=100\" % page\n",
" response = requests.get(url, headers=headers)\n",
" data_list = response.json()[\"data\"][\"list\"]\n",
"\n",
" val = [[i[\"id\"], datetime.fromtimestamp(int(i[\"ctime\"])), i[\"title\"], \"\", \"\", i[\"digest\"]] for i in data_list]\n",
" insert_mysql(val, mycursor, mydb, \"10jqka\")\n",
"\n",
" time_last = datetime.fromtimestamp(int(data_list[-1][\"ctime\"]))\n",
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n",
" if exit_loop_condition(time_last, datetime_now, time_delta):\n",
" break\n",
" else:\n",
" page = page + 1\n",
"\n",
" close_mysql(mycursor, mydb)"
]
},
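{
"cell_type": "markdown",
"id": "aa10f005",
"metadata": {},
"source": [
"Every scraper walks back until it passes `datetime_now - time_delta`, so consecutive runs overlap and the plain `INSERT` in `insert_mysql` stores duplicate rows. A minimal idempotent variant, assuming a `UNIQUE` index on `originalID` is added to each table (not part of this repo):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f006",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical variant -- requires a UNIQUE index on originalID, e.g.\n",
"# ALTER TABLE sina ADD UNIQUE KEY uq_sina (originalID);\n",
"def insert_mysql_ignore(val, mycursor, mydb, table):\n",
"    # INSERT IGNORE skips rows that would violate the unique key.\n",
"    sql = (\"INSERT IGNORE INTO \" + table +\n",
"           \" (originalID, time, title, source, section, text)\"\n",
"           \" VALUES (%s, %s, %s, %s, %s, %s)\")\n",
"    mycursor.executemany(sql, val)\n",
"    mydb.commit()"
]
},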
{
"cell_type": "markdown",
"id": "305658d7",
"metadata": {},
"source": [
"# 3. WSCN"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "48988219",
"metadata": {},
"outputs": [],
"source": [
"def get_wscn_news(time_delta=24):\n",
" mycursor, mydb = start_mysql()\n",
" datetime_now = datetime.now()\n",
"\n",
" url = \"https://api-one-wscn.awtmt.com/apiv1/content/lives?channel=global-channel&client=pc&limit=100\"\n",
" response = requests.get(url)\n",
" data = response.json()[\"data\"]\n",
"\n",
" val = []\n",
" for i in data[\"items\"]:\n",
" if i[\"content_text\"][-1] == \"\":\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], get_source(i[\"content_text\"]), \"\",\n",
" get_text_from_source(i[\"content_text\"])])\n",
" else:\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], \"\", \"\",\n",
" i[\"content_text\"]])\n",
"\n",
" insert_mysql(val, mycursor, mydb, \"wscn\")\n",
"\n",
" next_cursor = data[\"next_cursor\"]\n",
" time_last = datetime.fromtimestamp(int(next_cursor))\n",
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n",
"\n",
" while True:\n",
" url = \"https://api-one-wscn.awtmt.com/apiv1/content/lives?channel=global-channel&client=pc&limit=100&cursor=%s\" % next_cursor\n",
" response = requests.get(url)\n",
" data = response.json()[\"data\"]\n",
"\n",
" val = []\n",
" for i in data[\"items\"]:\n",
" if \"\" in i[\"content_text\"]:\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], get_source(i[\"content_text\"]), \"\",\n",
" get_text_from_source(i[\"content_text\"])])\n",
" else:\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], \"\", \"\", \n",
" i[\"content_text\"]])\n",
"\n",
" insert_mysql(val, mycursor, mydb, \"wscn\")\n",
"\n",
" next_cursor = data[\"next_cursor\"]\n",
" time_last = datetime.fromtimestamp(int(next_cursor))\n",
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n",
" if exit_loop_condition(time_last, datetime_now, time_delta):\n",
" break\n",
"\n",
" close_mysql(mycursor, mydb)"
]
},
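{
"cell_type": "markdown",
"id": "aa10f007",
"metadata": {},
"source": [
"None of the `requests.get` calls above set a timeout or check the HTTP status, so one transient network error aborts a whole run mid-loop. A small wrapper (a hypothetical helper, not used by the scrapers as written) that the bare calls could be swapped for:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f008",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical helper -- adds a timeout, a status check, and simple retries.\n",
"def get_with_retry(url, headers=None, retries=3, timeout=10):\n",
"    for attempt in range(retries):\n",
"        try:\n",
"            response = requests.get(url, headers=headers, timeout=timeout)\n",
"            response.raise_for_status()\n",
"            return response\n",
"        except requests.RequestException:\n",
"            if attempt == retries - 1:\n",
"                raise\n",
"            time.sleep(2 ** attempt)  # back off 1s, 2s, ... between attempts"
]
},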
{
"cell_type": "markdown",
"id": "2c0ddcc7",
"metadata": {},
"source": [
"# 4. NDB"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "08cb9c89",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def get_ndb_news(time_delta=24):\n",
" mycursor, mydb = start_mysql()\n",
" datetime_now = datetime.now()\n",
"\n",
" page = 1\n",
" while True:\n",
" url = \"https://www.nbd.com.cn/columns/3/page/%d\" % page\n",
"\n",
" response = requests.get(url)\n",
" soup = BeautifulSoup(response.text)\n",
"\n",
" data = soup.find(\"div\", {\"class\": \"g-list-text\"})\n",
" dates = [i.text.strip() for i in data.find_all(\"p\", {\"class\": \"u-channeltime\"})]\n",
" this_page_ids = [[j.find(\"a\")[\"href\"] for j in i.find_all(\"li\", {\"class\": \"u-news-title\"})] for i in data.find_all(\"ul\", {\"class\": \"u-news-list\"})]\n",
" this_page_news = [[[t.strip() for t in j.text.strip().split(\"\\n\\n\")] for j in i.find_all(\"li\", {\"class\": \"u-news-title\"})] for i in data.find_all(\"ul\", {\"class\": \"u-news-list\"})]\n",
" for index, this_part_news in enumerate(this_page_news):\n",
" val = []\n",
" for i, this_news in enumerate(this_part_news):\n",
" if \"|\" in this_news[0]:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" get_section(this_news[0]), get_text_from_section(this_news[0])])\n",
" elif \"\" in this_news[0]:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" get_section_chinese(this_news[0]), get_text_from_section_chinese(this_news[0])])\n",
" elif \"\" in this_news[0]:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" get_section_special(this_news[0]), get_text_from_section_special(this_news[0])])\n",
" else:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" \"\", this_news[0]])\n",
" insert_mysql(val, mycursor, mydb, \"ndb\")\n",
"\n",
" time_last = val[-1][1]\n",
"\n",
" print_scrape_progress(time_last, mycursor.rowcount)\n",
" if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'), datetime_now, time_delta):\n",
" break\n",
" else:\n",
" page += 1\n"
]
},
{
"cell_type": "markdown",
"id": "4e64dbd3",
"metadata": {},
"source": [
"# Schedule Running"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f7650485",
"metadata": {},
"outputs": [],
"source": [
"def scrape_news(time_delta=24):\n",
" get_sina_news(time_delta)\n",
" get_10jqka_news(time_delta)\n",
" get_wscn_news(time_delta)\n",
" get_ndb_news(time_delta)"
]
},
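{
"cell_type": "markdown",
"id": "aa10f009",
"metadata": {},
"source": [
"`scrape_news` can also be run by hand, e.g. to backfill a shorter window before starting the scheduler below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f00a",
"metadata": {},
"outputs": [],
"source": [
"# One-off manual run over the last hour (example invocation).\n",
"scrape_news(time_delta=1)"
]
},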
{
"cell_type": "code",
"execution_count": 9,
"id": "aeecc5b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data scraped until 2024-06-30 14:35:42. 36 was inserted into MySQLL\r"
]
}
],
"source": [
"schedule.every().day.at(\"23:59\").do(scrape_news, 24)\n",
"\n",
"while True:\n",
" schedule.run_pending()\n",
" time.sleep(60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
