Commit d081e05

Merge pull request #629 from kshittijagrawal/hackernews
Added hackernews_scrapper directory with 3 files.
2 parents: ea7cb28 + 6f2d1df

File tree

3 files changed: 76 additions, 0 deletions


hackernews_scrapper/README.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
# Hacker News Scrapper

This script lets the user scrape a chosen number of tech news stories from **Hacker News** and store them as a **CSV file** in the same directory as the script. The CSV file has two columns: the title of the story and the link to it.

## Requirements

The repository provides a file named requirements.txt. After changing into the local folder holding the files, run the following command:

```
pip install -r requirements.txt
```

This installs the libraries required to run the tool.
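
For a quick sanity check, a run might look like the transcript below (hackernews.py is an assumed name here; the script's actual filename isn't shown in this diff):

```
$ python hackernews.py
Enter the number of articles you want from the hackernews website.
(1-30) : 5

Your news file has been successfully created!
```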
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
```python
import csv
import sys

import requests
from bs4 import BeautifulSoup as bs


def main():
    # Base URL for the latest articles on the Hacker News website
    baseurl = "https://news.ycombinator.com/newest"

    # Number of articles requested by the user
    try:
        number_of_articles = int(input(
            '''Enter the number of articles you want from the hackernews website.
(1-30) : '''))
    except ValueError:
        print("\nYou did not enter a number. Try again.\n")
        sys.exit(1)

    if not 1 <= number_of_articles <= 30:
        print("\nYour input was not in the given range!\n")
        sys.exit(1)

    # Response object for the Hacker News URL
    response = requests.get(baseurl)

    # Soup object for easy scraping
    soup = bs(response.content, 'html.parser')

    # Find all the <a> tags with the class "storylink"
    latest = soup.find_all('a', attrs={'class': 'storylink'})

    # Lists to track the links and the titles of the articles
    links = []
    titles = []

    # Fetch the links and titles from the soup object
    # and store them in their respective lists
    for article in latest:
        links.append(article['href'])
        titles.append(article.text)

    result = []

    for title, link in zip(titles[:number_of_articles],
                           links[:number_of_articles]):
        d = {}
        d["News Title"] = title
        d["Link to the News"] = link
        result.append(d)

    keys = ["News Title", "Link to the News"]

    # newline="" keeps the csv module from writing blank rows on Windows,
    # and an explicit encoding avoids crashes on non-ASCII titles
    with open("hackernews_latest.csv", "w", newline="", encoding="utf-8") as hackernews:
        writer = csv.DictWriter(hackernews, fieldnames=keys)
        writer.writeheader()
        writer.writerows(result)


if __name__ == "__main__":
    main()
    print("\nYour news file has been successfully created!\n")
```

hackernews_scrapper/requirements.txt

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
requests==2.26.0
beautifulsoup4==4.10.0
