Initialize the Repo
TianzhuQin committed Jul 1, 2024
0 parents commit 2a41c15
Showing 3 changed files with 395 additions and 0 deletions.
Binary file added .DS_Store
Binary file not shown.
1 change: 1 addition & 0 deletions README.md
@@ -0,0 +1 @@
# NewsAI
394 changes: 394 additions & 0 deletions Scrape News Daily.ipynb
@@ -0,0 +1,394 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "8fd059cd",
"metadata": {},
"source": [
"# 0.Initial "
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "52c5fb7a",
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import schedule\n",
"\n",
"import time\n",
"from datetime import datetime, timedelta\n",
"\n",
"import json\n",
"\n",
"import requests\n",
"\n",
"from bs4 import BeautifulSoup\n",
"\n",
"import mysql.connector\n",
"\n",
"pattern = re.compile(r'jQuery\\d+_\\d+\\((.*?)\\);')\n",
"\n",
"def get_title(text):\n",
" return text.split(\"\", 1)[0].split(\"\", 1)[-1].strip()\n",
"\n",
"def get_text(text):\n",
" return text.split(\"\", 1)[-1].strip()\n",
"\n",
"def get_source(text):\n",
" return text.rsplit(\"\", 1)[-1].rsplit(\"\", 1)[0].strip()\n",
"\n",
"def get_text_from_source(text):\n",
" return text.rsplit(\"\", 1)[0].strip()\n",
"\n",
"def get_section(text):\n",
" return text.split(\"|\", 1)[0].strip()\n",
"\n",
"def get_text_from_section(text):\n",
" return text.split(\"|\", 1)[-1].strip()\n",
"\n",
"def get_section_chinese(text):\n",
" return text.split(\"\", 1)[0].strip()\n",
"\n",
"def get_text_from_section_chinese(text):\n",
" return text.split(\"\", 1)[-1].strip()\n",
"\n",
"def get_section_special(text):\n",
" return text.split(\"\", 1)[0].strip()\n",
"\n",
"def get_text_from_section_special(text):\n",
" return text.split(\"\", 1)[-1].strip()\n",
"\n",
"def print_scrape_progress(time_last, num):\n",
" print(\"Data scraped until %s. %d was inserted into MySQL.\" % (time_last, num), end=\"\\r\")\n",
" \n",
"def exit_loop_condition(time_this, time_now, time_delta):\n",
" return time_this <= time_now - timedelta(hours = time_delta)\n",
"\n",
"def start_mysql():\n",
" mydb = mysql.connector.connect(\n",
" host=\"localhost\",\n",
" user=\"root\",\n",
" password=\"password\",\n",
" database=\"NEWS\",\n",
" auth_plugin='mysql_native_password'\n",
" )\n",
"\n",
" mycursor = mydb.cursor()\n",
" return mycursor, mydb\n",
"\n",
"def insert_mysql(val, mycursor, mydb, table):\n",
" sql = \"INSERT INTO \" + table + \" (originalID, time, title, source, section, text) VALUES (%s, %s, %s, %s, %s, %s)\"\n",
" mycursor.executemany(sql, val)\n",
" mydb.commit()\n",
"\n",
"def close_mysql(mycursor, mydb):\n",
" mycursor.close()\n",
" mydb.close()\n",
"\n",
"headers = \"\"\"\n",
"Accept: */*\n",
"Accept-Encoding: gzip, deflate, br, zstd\n",
"Accept-Language: zh-CN,zh;q=0.9,en;q=0.8\n",
"Connection: keep-alive\n",
"Cookie: v=A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD\n",
"Host: news.10jqka.com.cn\n",
"Referer: https://news.10jqka.com.cn/realtimenews.html\n",
"Sec-Fetch-Dest: empty\n",
"Sec-Fetch-Mode: cors\n",
"Sec-Fetch-Site: same-origin\n",
"User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36\n",
"X-Requested-With: XMLHttpRequest\n",
"hexin-v: A4YJcvo4HO9K8MiLKiJJdN9r0XcN58qhnCv-BXCvcqmEcygp2HcasWy7ThBD\n",
"sec-ch-ua: \"Not/A)Brand\";v=\"8\", \"Chromium\";v=\"126\", \"Google Chrome\";v=\"126\"\n",
"sec-ch-ua-mobile: ?0\n",
"sec-ch-ua-platform: \"macOS\"\n",
"\"\"\"\n",
"headers = dict([i.split(\": \") for i in headers.split(\"\\n\") if i !=\"\"])"
]
},
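{
"cell_type": "markdown",
"id": "aa10f001",
"metadata": {},
"source": [
"`insert_mysql` expects the `NEWS` database to already contain one table per source with columns `(originalID, time, title, source, section, text)`. The DDL is not part of this repo, so the cell below is only a plausible sketch; the column types are assumptions."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f002",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical schema sketch -- the real DDL is not in this repo.\n",
"# Column names follow insert_mysql; the types are assumptions.\n",
"def create_tables():\n",
"    mycursor, mydb = start_mysql()\n",
"    for table in [\"sina\", \"10jqka\", \"wscn\", \"ndb\"]:\n",
"        mycursor.execute(\n",
"            \"CREATE TABLE IF NOT EXISTS \" + table + \" (\"\n",
"            \"originalID VARCHAR(64), \"\n",
"            \"time DATETIME, \"\n",
"            \"title TEXT, \"\n",
"            \"source VARCHAR(255), \"\n",
"            \"section VARCHAR(255), \"\n",
"            \"text TEXT)\"\n",
"        )\n",
"    mydb.commit()\n",
"    close_mysql(mycursor, mydb)"
]
},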
{
"cell_type": "markdown",
"id": "86308181",
"metadata": {},
"source": [
"# 1. SINA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "866d313a",
"metadata": {},
"outputs": [],
"source": [
"def get_sina_news(time_delta=24):\n",
" datetime_now = datetime.now()\n",
" mycursor, mydb = start_mysql()\n",
"\n",
" url = \"https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%s&page=1&page_size=1&zhibo_id=152&tag_id=0\" % (datetime_now.strftime(\"%s\"))\n",
" response = requests.get(url)\n",
" match = pattern.search(response.text)\n",
" json_content = match.group(1)\n",
" data = json.loads(json_content)\n",
" id_now = data[\"result\"][\"data\"][\"feed\"][\"list\"][0][\"id\"]\n",
"\n",
" page = 1\n",
" while True:\n",
" url = \"https://zhibo.sina.com.cn/api/zhibo/feed?callback=jQuery0_%s&page=%d&page_size=100&zhibo_id=152&tag_id=0&id=%d&type=1\" % (datetime_now.strftime(\"%s\"), page, id_now)\n",
" response = requests.get(url)\n",
" match = pattern.search(response.text)\n",
" json_content = match.group(1)\n",
" data = json.loads(json_content)\n",
" data_list = data[\"result\"][\"data\"][\"feed\"][\"list\"]\n",
" val = []\n",
" for i in data_list:\n",
" if i[\"rich_text\"][0] == \"\":\n",
" val.append([i[\"id\"], i[\"create_time\"], get_title(i[\"rich_text\"]), \"\", \"\", get_text(i[\"rich_text\"])])\n",
" else:\n",
" val.append([i[\"id\"], i[\"create_time\"], \"\", \"\", \"\", i[\"rich_text\"]])\n",
"\n",
" insert_mysql(val, mycursor, mydb, \"sina\")\n",
"\n",
" time_last = data_list[-1][\"create_time\"]\n",
" print_scrape_progress(time_last, mycursor.rowcount)\n",
" if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'), datetime_now, time_delta):\n",
" break\n",
" else:\n",
" page += 1\n",
"\n",
" close_mysql(mycursor, mydb)"
]
},
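{
"cell_type": "markdown",
"id": "aa10f003",
"metadata": {},
"source": [
"The Sina feed returns JSONP, so `pattern` strips the `jQueryXXX_XXX(...)` wrapper before `json.loads`. A quick check on a synthetic payload, plus the 【title】body convention assumed by `get_title`/`get_text`:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f004",
"metadata": {},
"outputs": [],
"source": [
"# Synthetic JSONP payload in the shape the regex expects -- illustration only.\n",
"sample = 'jQuery0_1719800000({\"result\": {\"status\": 0}});'\n",
"print(json.loads(pattern.search(sample).group(1)))  # {'result': {'status': 0}}\n",
"\n",
"# Flash items are assumed to lead with a bracketed title.\n",
"item = \"【Sample Title】Sample body text.\"\n",
"print(get_title(item))  # Sample Title\n",
"print(get_text(item))   # Sample body text."
]
},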
{
"cell_type": "markdown",
"id": "45619db1",
"metadata": {},
"source": [
"# 2. 10JQKA"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "b3fa1994",
"metadata": {},
"outputs": [],
"source": [
"def get_10jqka_news(time_delta=24):\n",
" datetime_now = datetime.now()\n",
" mycursor, mydb = start_mysql()\n",
"\n",
" page = 1\n",
" while True:\n",
" url = \"https://news.10jqka.com.cn/tapp/news/push/stock/?page=%d&tag=&track=website&pagesize=100\" % page\n",
" response = requests.get(url, headers=headers)\n",
" data_list = response.json()[\"data\"][\"list\"]\n",
"\n",
" val = [[i[\"id\"], datetime.fromtimestamp(int(i[\"ctime\"])), i[\"title\"], \"\", \"\", i[\"digest\"]] for i in data_list]\n",
" insert_mysql(val, mycursor, mydb, \"10jqka\")\n",
"\n",
" time_last = datetime.fromtimestamp(int(data_list[-1][\"ctime\"]))\n",
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n",
" if exit_loop_condition(time_last, datetime_now, time_delta):\n",
" break\n",
" else:\n",
" page = page + 1\n",
"\n",
" close_mysql(mycursor, mydb)"
]
},
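{
"cell_type": "markdown",
"id": "aa10f005",
"metadata": {},
"source": [
"Every scraper walks back until it passes `datetime_now - time_delta`, so consecutive runs overlap and the plain `INSERT` in `insert_mysql` stores duplicate rows. A minimal idempotent variant, assuming a `UNIQUE` index on `originalID` is added to each table (not part of this repo):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f006",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical variant -- requires a UNIQUE index on originalID, e.g.\n",
"# ALTER TABLE sina ADD UNIQUE KEY uq_sina (originalID);\n",
"def insert_mysql_ignore(val, mycursor, mydb, table):\n",
"    # INSERT IGNORE skips rows that would violate the unique key.\n",
"    sql = (\"INSERT IGNORE INTO \" + table +\n",
"           \" (originalID, time, title, source, section, text)\"\n",
"           \" VALUES (%s, %s, %s, %s, %s, %s)\")\n",
"    mycursor.executemany(sql, val)\n",
"    mydb.commit()"
]
},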
{
"cell_type": "markdown",
"id": "305658d7",
"metadata": {},
"source": [
"# 3. WSCN"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "48988219",
"metadata": {},
"outputs": [],
"source": [
"def get_wscn_news(time_delta=24):\n",
" mycursor, mydb = start_mysql()\n",
" datetime_now = datetime.now()\n",
"\n",
" url = \"https://api-one-wscn.awtmt.com/apiv1/content/lives?channel=global-channel&client=pc&limit=100\"\n",
" response = requests.get(url)\n",
" data = response.json()[\"data\"]\n",
"\n",
" val = []\n",
" for i in data[\"items\"]:\n",
" if i[\"content_text\"][-1] == \"\":\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], get_source(i[\"content_text\"]), \"\",\n",
" get_text_from_source(i[\"content_text\"])])\n",
" else:\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], \"\", \"\",\n",
" i[\"content_text\"]])\n",
"\n",
" insert_mysql(val, mycursor, mydb, \"wscn\")\n",
"\n",
" next_cursor = data[\"next_cursor\"]\n",
" time_last = datetime.fromtimestamp(int(next_cursor))\n",
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n",
"\n",
" while True:\n",
" url = \"https://api-one-wscn.awtmt.com/apiv1/content/lives?channel=global-channel&client=pc&limit=100&cursor=%s\" % next_cursor\n",
" response = requests.get(url)\n",
" data = response.json()[\"data\"]\n",
"\n",
" val = []\n",
" for i in data[\"items\"]:\n",
" if \"\" in i[\"content_text\"]:\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], get_source(i[\"content_text\"]), \"\",\n",
" get_text_from_source(i[\"content_text\"])])\n",
" else:\n",
" val.append([i[\"id\"], datetime.fromtimestamp(int(i[\"display_time\"])), \n",
" i[\"title\"], \"\", \"\", \n",
" i[\"content_text\"]])\n",
"\n",
" insert_mysql(val, mycursor, mydb, \"wscn\")\n",
"\n",
" next_cursor = data[\"next_cursor\"]\n",
" time_last = datetime.fromtimestamp(int(next_cursor))\n",
" print_scrape_progress(time_last.strftime('%Y-%m-%d %H:%M:%S'), mycursor.rowcount)\n",
" if exit_loop_condition(time_last, datetime_now, time_delta):\n",
" break\n",
"\n",
" close_mysql(mycursor, mydb)"
]
},
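{
"cell_type": "markdown",
"id": "aa10f007",
"metadata": {},
"source": [
"None of the `requests.get` calls above set a timeout or check the HTTP status, so one transient network error aborts a whole run mid-loop. A small wrapper (a hypothetical helper, not used by the scrapers as written) that the bare calls could be swapped for:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f008",
"metadata": {},
"outputs": [],
"source": [
"# Hypothetical helper -- adds a timeout, a status check, and simple retries.\n",
"def get_with_retry(url, headers=None, retries=3, timeout=10):\n",
"    for attempt in range(retries):\n",
"        try:\n",
"            response = requests.get(url, headers=headers, timeout=timeout)\n",
"            response.raise_for_status()\n",
"            return response\n",
"        except requests.RequestException:\n",
"            if attempt == retries - 1:\n",
"                raise\n",
"            time.sleep(2 ** attempt)  # back off 1s, 2s, ... between attempts"
]
},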
{
"cell_type": "markdown",
"id": "2c0ddcc7",
"metadata": {},
"source": [
"# 4. NDB"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "08cb9c89",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"def get_ndb_news(time_delta=24):\n",
" mycursor, mydb = start_mysql()\n",
" datetime_now = datetime.now()\n",
"\n",
" page = 1\n",
" while True:\n",
" url = \"https://www.nbd.com.cn/columns/3/page/%d\" % page\n",
"\n",
" response = requests.get(url)\n",
" soup = BeautifulSoup(response.text)\n",
"\n",
" data = soup.find(\"div\", {\"class\": \"g-list-text\"})\n",
" dates = [i.text.strip() for i in data.find_all(\"p\", {\"class\": \"u-channeltime\"})]\n",
" this_page_ids = [[j.find(\"a\")[\"href\"] for j in i.find_all(\"li\", {\"class\": \"u-news-title\"})] for i in data.find_all(\"ul\", {\"class\": \"u-news-list\"})]\n",
" this_page_news = [[[t.strip() for t in j.text.strip().split(\"\\n\\n\")] for j in i.find_all(\"li\", {\"class\": \"u-news-title\"})] for i in data.find_all(\"ul\", {\"class\": \"u-news-list\"})]\n",
" for index, this_part_news in enumerate(this_page_news):\n",
" val = []\n",
" for i, this_news in enumerate(this_part_news):\n",
" if \"|\" in this_news[0]:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" get_section(this_news[0]), get_text_from_section(this_news[0])])\n",
" elif \"\" in this_news[0]:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" get_section_chinese(this_news[0]), get_text_from_section_chinese(this_news[0])])\n",
" elif \"\" in this_news[0]:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" get_section_special(this_news[0]), get_text_from_section_special(this_news[0])])\n",
" else:\n",
" val.append([this_page_ids[index][i], dates[index] + \" \" + this_news[1], \"\", \"\", \n",
" \"\", this_news[0]])\n",
" insert_mysql(val, mycursor, mydb, \"ndb\")\n",
"\n",
" time_last = val[-1][1]\n",
"\n",
" print_scrape_progress(time_last, mycursor.rowcount)\n",
" if exit_loop_condition(datetime.strptime(time_last, '%Y-%m-%d %H:%M:%S'), datetime_now, time_delta):\n",
" break\n",
" else:\n",
" page += 1\n"
]
},
{
"cell_type": "markdown",
"id": "4e64dbd3",
"metadata": {},
"source": [
"# Schedule Running"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "f7650485",
"metadata": {},
"outputs": [],
"source": [
"def scrape_news(time_delta=24):\n",
" get_sina_news(time_delta)\n",
" get_10jqka_news(time_delta)\n",
" get_wscn_news(time_delta)\n",
" get_ndb_news(time_delta)"
]
},
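{
"cell_type": "markdown",
"id": "aa10f009",
"metadata": {},
"source": [
"`scrape_news` can also be run by hand, e.g. to backfill a shorter window before starting the scheduler below:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aa10f00a",
"metadata": {},
"outputs": [],
"source": [
"# One-off manual run over the last hour (example invocation).\n",
"scrape_news(time_delta=1)"
]
},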
{
"cell_type": "code",
"execution_count": 9,
"id": "aeecc5b2",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data scraped until 2024-06-30 14:35:42. 36 was inserted into MySQLL\r"
]
}
],
"source": [
"schedule.every().day.at(\"23:59\").do(scrape_news, 24)\n",
"\n",
"while True:\n",
" schedule.run_pending()\n",
" time.sleep(60)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
