import re
import schedule

import time
from datetime import datetime, timedelta

import json

import requests

from bs4 import BeautifulSoup

import mysql.connector

# Strips the JSONP wrapper ("jQueryNNN_NNN(...);") that Sina's live-feed
# endpoint returns, capturing the JSON payload inside the parentheses.
pattern = re.compile(r'jQuery\d+_\d+\((.*?)\);')


def get_title(text):
    """Return the headline from a '【title】body' string (the text inside 【】)."""
    return text.split("】", 1)[0].split("【", 1)[-1].strip()


def get_text(text):
    """Return the body that follows the closing 】 of a '【title】body' string."""
    return text.split("】", 1)[-1].strip()


def get_source(text):
    """Return the source from a trailing '(source)' (fullwidth parentheses)."""
    return text.rsplit("(", 1)[-1].rsplit(")", 1)[0].strip()


def get_text_from_source(text):
    """Return the text with its trailing '(source)' removed."""
    return text.rsplit("(", 1)[0].strip()


def get_section(text):
    """Return the section name before the first ASCII '|' separator."""
    return text.split("|", 1)[0].strip()


def get_text_from_section(text):
    """Return the text after the first ASCII '|' separator."""
    return text.split("|", 1)[-1].strip()


def get_section_chinese(text):
    """Return the section name before the first fullwidth '|' (U+FF5C) separator.

    NOTE(review): this originally split on the ASCII '|', which made it a
    byte-for-byte duplicate of get_section() and left its only call site (the
    'chinese' branch in get_ndb_news) unreachable.  The fullwidth bar was
    almost certainly intended (mojibake in the source, same family as the
    '︱' U+FE31 helper below) -- restored here; confirm against feed data.
    """
    return text.split("|", 1)[0].strip()


def get_text_from_section_chinese(text):
    """Return the text after the first fullwidth '|' (U+FF5C) separator.

    See get_section_chinese() for why the separator differs from the
    original ASCII '|'.
    """
    return text.split("|", 1)[-1].strip()


def get_section_special(text):
    """Return the section name before the first '︱' (U+FE31) separator."""
    return text.split("︱", 1)[0].strip()


def get_text_from_section_special(text):
    """Return the text after the first '︱' (U+FE31) separator."""
    return text.split("︱", 1)[-1].strip()


def print_scrape_progress(time_last, num):
    """Overwrite the current console line with scraping progress."""
    print("Data scraped until %s. %d was inserted into MySQL." % (time_last, num), end="\r")


def exit_loop_condition(time_this, time_now, time_delta):
    """Return True when time_this is at least time_delta hours before time_now."""
    return time_this <= time_now - timedelta(hours=time_delta)


def start_mysql():
    """Open a connection to the local NEWS database and return (cursor, connection).

    NOTE(review): credentials are hard-coded in the notebook; move them to
    environment variables or a config file before sharing.
    """
    mydb = mysql.connector.connect(
        host="localhost",
        user="root",
        password="password",
        database="NEWS",
        auth_plugin='mysql_native_password'
    )

    mycursor = mydb.cursor()
    return mycursor, mydb


def insert_mysql(val, mycursor, mydb, table):
    """Bulk-insert rows of (originalID, time, title, source, section, text).

    The table name is interpolated into the SQL string (identifiers cannot be
    parameterized), so `table` must only ever come from a trusted literal, as
    it does everywhere in this notebook; row values go through parameterized
    placeholders.
    """
    sql = "INSERT INTO " + table + " (originalID, time, title, source, section, text) VALUES (%s, %s, %s, %s, %s, %s)"
    mycursor.executemany(sql, val)
    mydb.commit()


def close_mysql(mycursor, mydb):
    """Close the cursor and connection returned by start_mysql()."""
    mycursor.close()
    mydb.close()


# Raw request headers copied from a browser session on news.10jqka.com.cn,
# parsed into a dict below.  NOTE(review): the Cookie / hexin-v token is
# session-specific and will expire; refresh it when requests start failing.
headers = """
Accept: */*
Accept-Encoding: gzip, deflate, br, zstd
Accept-Language: zh-CN,zh;q=0.9,en;q=0.8
Connection: keep-alive
Cookie: v=A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD
Host: news.10jqka.com.cn
Referer: https://news.10jqka.com.cn/realtimenews.html
Sec-Fetch-Dest: empty
Sec-Fetch-Mode: cors
Sec-Fetch-Site: same-origin
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36
X-Requested-With: XMLHttpRequest
hexin-v: A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD
sec-ch-ua: "Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"
sec-ch-ua-mobile: ?0
sec-ch-ua-platform: "macOS"
"""
headers = dict([i.split(": ") for i in headers.split("\n") if i != ""])
def get_sina_news(time_delta=24):
    """Scrape Sina 7x24 live news into the `sina` MySQL table.

    Fetches the newest item id first, then walks the feed newest-first in
    100-item pages until an item older than `time_delta` hours is reached.

    Parameters:
        time_delta: trailing window, in hours, to scrape (default 24).
    """
    datetime_now = datetime.now()
    mycursor, mydb = start_mysql()

    # The timestamp is only a cache-buster in the JSONP callback name.
    # strftime("%s") is platform-dependent (not part of the C standard and
    # unavailable on Windows); use the explicit epoch timestamp instead.
    epoch_now = int(datetime_now.timestamp())

    # First request (page_size=1) just learns the id of the newest item so
    # subsequent pages can be anchored to it.
    url = ("https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%d"
           "&page=1&page_size=1&zhibo_id=152&tag_id=0" % epoch_now)
    response = requests.get(url)
    match = pattern.search(response.text)
    if match is None:
        # Endpoint changed or request was blocked -- nothing to scrape.
        close_mysql(mycursor, mydb)
        return
    data = json.loads(match.group(1))
    id_now = data["result"]["data"]["feed"]["list"][0]["id"]

    page = 1
    while True:
        url = ("https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%d"
               "&page=%d&page_size=100&zhibo_id=152&tag_id=0&id=%d&type=1"
               % (epoch_now, page, id_now))
        response = requests.get(url)
        match = pattern.search(response.text)
        if match is None:
            break
        data = json.loads(match.group(1))
        data_list = data["result"]["data"]["feed"]["list"]
        if not data_list:
            # Ran out of pages before reaching the cut-off time.
            break

        val = []
        for item in data_list:
            text = item["rich_text"]
            # Items formatted as 【title】body get the title split out;
            # startswith() also survives an empty rich_text (the original
            # text[0] indexing would raise IndexError).
            if text.startswith("【"):
                val.append([item["id"], item["create_time"],
                            get_title(text), "", "", get_text(text)])
            else:
                val.append([item["id"], item["create_time"], "", "", "", text])

        insert_mysql(val, mycursor, mydb, "sina")

        time_last = data_list[-1]["create_time"]
        print_scrape_progress(time_last, mycursor.rowcount)
        if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'),
                               datetime_now, time_delta):
            break
        page += 1

    close_mysql(mycursor, mydb)
def get_10jqka_news(time_delta=24):
    """Scrape 10jqka stock push news into the `10jqka` MySQL table.

    Walks the paginated push-news API newest-first until an item older than
    `time_delta` hours is reached.  Uses the browser-captured `headers`
    (cookie / hexin-v token) defined in the init cell.

    Parameters:
        time_delta: trailing window, in hours, to scrape (default 24).
    """
    datetime_now = datetime.now()
    mycursor, mydb = start_mysql()

    page = 1
    while True:
        url = "https://news.10jqka.com.cn/tapp/news/push/stock/?page=%d&tag=&track=website&pagesize=100" % page
        response = requests.get(url, headers=headers)
        data_list = response.json()["data"]["list"]
        if not data_list:
            # Empty page: no more data -- the original would have raised
            # IndexError on data_list[-1] below.
            break

        val = [[i["id"], datetime.fromtimestamp(int(i["ctime"])), i["title"], "", "", i["digest"]]
               for i in data_list]
        insert_mysql(val, mycursor, mydb, "10jqka")

        time_last = datetime.fromtimestamp(int(data_list[-1]["ctime"]))
        print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)
        if exit_loop_condition(time_last, datetime_now, time_delta):
            break
        page += 1

    close_mysql(mycursor, mydb)
def _wscn_rows(items):
    """Convert wscn live-feed items into MySQL rows.

    A source credit is appended to content_text as '(source)' (fullwidth
    parentheses).  The original code tested text[-1] == ")" on the first
    page but '"(" in text' (ASCII paren) on later pages; unified here on the
    endswith rule, which matches get_source()'s fullwidth split and also
    avoids IndexError on empty content_text.
    """
    rows = []
    for item in items:
        text = item["content_text"]
        when = datetime.fromtimestamp(int(item["display_time"]))
        if text.endswith(")"):
            rows.append([item["id"], when, item["title"],
                         get_source(text), "", get_text_from_source(text)])
        else:
            rows.append([item["id"], when, item["title"], "", "", text])
    return rows


def get_wscn_news(time_delta=24):
    """Scrape wallstreetcn (wscn) global live news into the `wscn` MySQL table.

    Pages backwards via the API's `next_cursor` (an epoch timestamp) until
    the cursor falls more than `time_delta` hours behind now.  The original
    duplicated the whole first-page fetch before the loop and never checked
    the exit condition for it; folded into a single loop here.

    Parameters:
        time_delta: trailing window, in hours, to scrape (default 24).
    """
    mycursor, mydb = start_mysql()
    datetime_now = datetime.now()

    base = "https://api-one-wscn.awtmt.com/apiv1/content/lives?channel=global-channel&client=pc&limit=100"
    next_cursor = None
    while True:
        url = base if next_cursor is None else base + "&cursor=%s" % next_cursor
        response = requests.get(url)
        data = response.json()["data"]

        insert_mysql(_wscn_rows(data["items"]), mycursor, mydb, "wscn")

        next_cursor = data["next_cursor"]
        # next_cursor is the display_time of the last returned item.
        time_last = datetime.fromtimestamp(int(next_cursor))
        print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)
        if exit_loop_condition(time_last, datetime_now, time_delta):
            break

    close_mysql(mycursor, mydb)


def get_ndb_news(time_delta=24):
    """Scrape nbd.com.cn column 3 headlines into the `ndb` MySQL table.

    Parses the HTML listing pages (grouped by date) until a headline older
    than `time_delta` hours is reached.

    Parameters:
        time_delta: trailing window, in hours, to scrape (default 24).
    """
    mycursor, mydb = start_mysql()
    datetime_now = datetime.now()

    page = 1
    while True:
        url = "https://www.nbd.com.cn/columns/3/page/%d" % page

        response = requests.get(url)
        # Explicit parser keeps results deterministic across environments
        # (bs4 otherwise guesses lxml/html.parser, whichever is installed).
        soup = BeautifulSoup(response.text, "html.parser")

        data = soup.find("div", {"class": "g-list-text"})
        # One u-channeltime date per u-news-list group of headlines.
        dates = [p.text.strip() for p in data.find_all("p", {"class": "u-channeltime"})]
        this_page_ids = [[li.find("a")["href"] for li in group.find_all("li", {"class": "u-news-title"})]
                         for group in data.find_all("ul", {"class": "u-news-list"})]
        this_page_news = [[[t.strip() for t in li.text.strip().split("\n\n")]
                           for li in group.find_all("li", {"class": "u-news-title"})]
                          for group in data.find_all("ul", {"class": "u-news-list"})]

        for index, this_part_news in enumerate(this_page_news):
            val = []
            for i, this_news in enumerate(this_part_news):
                head = this_news[0]
                stamp = dates[index] + " " + this_news[1]
                # Section separators seen in the feed: ASCII '|', fullwidth
                # '|' (U+FF5C) and '︱' (U+FE31).  The original second branch
                # repeated the ASCII test verbatim (dead code) -- almost
                # certainly a mojibake of the fullwidth bar; restored inline.
                if "|" in head:
                    section, body = (s.strip() for s in head.split("|", 1))
                elif "|" in head:
                    section, body = (s.strip() for s in head.split("|", 1))
                elif "︱" in head:
                    section, body = (s.strip() for s in head.split("︱", 1))
                else:
                    section, body = "", head
                val.append([this_page_ids[index][i], stamp, "", "", section, body])
            insert_mysql(val, mycursor, mydb, "ndb")

        time_last = val[-1][1]
        print_scrape_progress(time_last, mycursor.rowcount)
        if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'),
                               datetime_now, time_delta):
            break
        page += 1

    # Bug fix: the original returned without ever closing the connection.
    close_mysql(mycursor, mydb)


def scrape_news(time_delta=24):
    """Run all four scrapers over the same trailing window of hours."""
    get_sina_news(time_delta)
    get_10jqka_news(time_delta)
    get_wscn_news(time_delta)
    get_ndb_news(time_delta)


# Kick off the full scrape once a day just before midnight, then poll the
# scheduler once a minute forever.
schedule.every().day.at("23:59").do(scrape_news, 24)

while True:
    schedule.run_pending()
    time.sleep(60)