Commit d85ed42 (0 parents)
Saman committed Nov 25, 2019
Showing 67 changed files with 302,663 additions and 0 deletions.
@@ -0,0 +1,15 @@
# Newspaper Topic Modelling

* Scraped around 280,000 articles from Spiegel Online, distributed across 11 topics

## Research Questions

Is it possible to recreate and classify the topics assigned to news articles by running a topic modelling algorithm on their content?

### Subquestions

* Which topic has the most topic markers produced by the algorithm?
* Which topic marker is assigned most often?
* Are articles of certain topics classified more accurately than others?
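The topic modelling code itself is not part of this commit. As a rough sketch of how the research question might be tackled, the snippet below fits scikit-learn's LatentDirichletAllocation on a placeholder list of article texts and prints the top words ("topic markers") per topic; the `articles` list and the vectorizer settings are assumptions for illustration, only the 11-topic count comes from the README.

# Minimal, hypothetical sketch of topic modelling with LDA; not the project's actual code.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Placeholder corpus; the real input would be the ~280,000 scraped articles.
articles = [
    "Bundestag debattiert ueber den Haushalt und die Steuerreform",
    "Der Rekordmeister gewinnt das Spitzenspiel in der Bundesliga",
    "Neue Studie zu Klimawandel und Energiewende vorgestellt",
    "Boerse reagiert auf die Zinsentscheidung der Zentralbank",
]

vectorizer = CountVectorizer()
doc_term = vectorizer.fit_transform(articles)

lda = LatentDirichletAllocation(n_components=11, random_state=42)  # one topic per rubric
doc_topics = lda.fit_transform(doc_term)  # per-document topic distribution

# Print the highest-weighted terms of every discovered topic
terms = vectorizer.get_feature_names_out()
for idx, component in enumerate(lda.components_):
    top = [terms[i] for i in component.argsort()[-5:][::-1]]
    print("Topic %d: %s" % (idx, ", ".join(top)))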
@@ -0,0 +1,52 @@
# Simple linear regression fitted with batch gradient descent in plain NumPy
# (the kind of fit a framework like TensorFlow would otherwise handle for you).
import numpy as np

# Generate some noisy data for the regression
data_x = np.linspace(1.0, 10.0, 100)[:, np.newaxis]
data_y = np.sin(data_x) + 0.1 * np.power(data_x, 2) + 0.5 * np.random.randn(100, 1)
data_x /= np.max(data_x)

# Prepend a column of ones so the intercept b is learned as part of w
# and no separate calculation for b is needed.
data_x = np.hstack((np.ones_like(data_x), data_x))

# Random train/test split
order = np.random.permutation(len(data_x))
portion = 20
test_x = data_x[order[:portion]]
test_y = data_y[order[:portion]]
train_x = data_x[order[portion:]]
train_y = data_y[order[portion:]]


# Partial derivative of the mean squared error with respect to w.
# numpy's dot is backed by compiled BLAS/LAPACK (Fortran) routines, so this stays fast.
def get_gradient(w, x, y):
    y_estimate = x.dot(w).flatten()
    error = y.flatten() - y_estimate
    gradient = -(1.0 / len(x)) * error.dot(x)
    return gradient, np.mean(np.power(error, 2))


w = np.random.randn(2)
alpha = 0.5
tolerance = 1e-5

# Perform gradient descent
iterations = 1
while True:
    gradient, error = get_gradient(w, train_x, train_y)
    new_w = w - alpha * gradient

    # Stopping condition: the weights barely changed
    if np.sum(abs(new_w - w)) < tolerance:
        print("Converged.")
        break

    # Print the error every 100 iterations
    if iterations % 100 == 0:
        print("Iteration: %d - Error: %.4f" % (iterations, error))

    iterations += 1
    w = new_w
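Not part of the original script, but as a quick sanity check the gradient-descent solution can be compared with the closed-form least-squares fit; the snippet reuses the arrays defined above.

# Closed-form least-squares solution for comparison with the learned weights.
w_exact, *_ = np.linalg.lstsq(train_x, train_y.flatten(), rcond=None)
print("Gradient descent w:", w)
print("Closed-form w:     ", w_exact)

# Mean squared error on the held-out split
test_mse = np.mean((test_x.dot(w) - test_y.flatten()) ** 2)
print("Test MSE: %.4f" % test_mse)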
@@ -0,0 +1,60 @@
# Adapted from: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

np.random.seed(42)

digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target

# PCA with one component per digit class (fitted here but not used further below)
pca = PCA(n_components=n_digits).fit(data)

# Project onto two components for visualization and run k-means in that space
reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02  # point in the mesh [x_min, x_max] x [y_min, y_max].

# Plot the decision boundary. For that, we assign a color to each point in the mesh.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in the mesh using the trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
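The script imports sklearn.metrics but never uses it; a small follow-up sketch, reusing the variables defined above, shows how the clustering could be scored against the true digit labels.

# Hypothetical evaluation of the clustering against the known digit labels.
pred = kmeans.predict(reduced_data)
print("Homogeneity:         %.3f" % metrics.homogeneity_score(labels, pred))
print("Completeness:        %.3f" % metrics.completeness_score(labels, pred))
print("Adjusted Rand index: %.3f" % metrics.adjusted_rand_score(labels, pred))
print("Silhouette:          %.3f" % metrics.silhouette_score(reduced_data, pred))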
Empty file.
14 changes: 14 additions & 0 deletions
scrapy_project/spiegel_project/build/lib/spiegel_project/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SpiegelProjectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
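The spiders in this commit yield plain dicts rather than Items. Purely as an illustration (the class name is made up, reusing the scrapy import above), the fields could be declared to mirror the keys the spiders yield:

# Hypothetical Item mirroring the dict keys yielded by the spiders in this commit.
class SpiegelArticleItem(scrapy.Item):
    rubric = scrapy.Field()
    timestamp = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    headline = scrapy.Field()
    intro = scrapy.Field()
    text = scrapy.Field()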
56 changes: 56 additions & 0 deletions
scrapy_project/spiegel_project/build/lib/spiegel_project/middlewares.py
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class SpiegelProjectSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
11 changes: 11 additions & 0 deletions
scrapy_project/spiegel_project/build/lib/spiegel_project/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class SpiegelProjectPipeline(object):
    def process_item(self, item, spider):
        return item
90 changes: 90 additions & 0 deletions
scrapy_project/spiegel_project/build/lib/spiegel_project/settings.py
# -*- coding: utf-8 -*-

# Scrapy settings for spiegel_project project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spiegel_project'

SPIDER_MODULES = ['spiegel_project.spiders']
NEWSPIDER_MODULE = 'spiegel_project.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spiegel_project (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'spiegel_project.middlewares.SpiegelProjectSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'spiegel_project.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'spiegel_project.pipelines.SpiegelProjectPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
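Most options above are left at their commented-out defaults; purely as an illustration, a polite crawl of spiegel.de might enable a handful of them. The concrete values below are assumptions, not part of this commit; only the setting names and the pipeline path are taken from the file above.

# Illustrative overrides; the values are assumptions, not project defaults.
ITEM_PIPELINES = {
    'spiegel_project.pipelines.SpiegelProjectPipeline': 300,
}
DOWNLOAD_DELAY = 1            # pause between requests to the same domain
AUTOTHROTTLE_ENABLED = True   # adapt the delay to observed latencies
AUTOTHROTTLE_MAX_DELAY = 60
HTTPCACHE_ENABLED = True      # cache responses while developing the spiders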
4 changes: 4 additions & 0 deletions
scrapy_project/spiegel_project/build/lib/spiegel_project/spiders/__init__.py
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
30 changes: 30 additions & 0 deletions
scrapy_project/spiegel_project/build/lib/spiegel_project/spiders/spiegel-single.py
# -*- coding: utf-8 -*-
import scrapy


class SpiegelSpider(scrapy.Spider):
    """
    Crawls a single spiegel.de article
    """
    name = "spiegel-single"
    start_urls = [
        'http://www.spiegel.de/politik/deutschland/bundeswehr-der-rechte-kosmos-des-franco-a-a-1147221.html',
    ]

    # Extract the article content directly from the page.
    # parse() is the default callback and will be called for every URL in start_urls.
    def parse(self, response):
        article_column = response.xpath('//*[@id="js-article-column"]/div')
        intro_p = response.xpath('//p[@class="article-intro"]')
        author_p = response.xpath('//p[@class="author"]')

        yield {
            'rubric': response.xpath('//*[@id="header"]/div[2]/div[1]/a/text()').extract_first(),
            'timestamp': article_column.xpath('./div[2]/span/time/attribute::datetime').extract_first(),
            'author': author_p.xpath('./a/text()').extract(),
            'source': response.xpath('//*[@id="js-article-column"]/p/i/text()').extract_first(),
            'headline': response.xpath('//*[@id="content-main"]/div[1]/div[3]/h2/span[2]/text()').extract_first(),
            'intro': intro_p.xpath('./strong/text()').extract_first(),
            'text': "".join(article_column.xpath('./p/text()').extract())
        }
52 changes: 52 additions & 0 deletions
scrapy_project/spiegel_project/build/lib/spiegel_project/spiders/spiegel.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


class SpiegelSpider(scrapy.Spider):
    name = "spiegel"
    start_urls = [
        'http://www.spiegel.de/politik/',
    ]

    # Note: `rules` is only honoured by CrawlSpider; for this plain Spider the
    # link extraction in parse() below does the actual work.
    rules = (
        Rule(LxmlLinkExtractor(allow_domains='spiegel.de/')),
    )

    def parse(self, response):
        """
        Default callback for every URL in start_urls.
        Extracts all article URLs from a spiegel.de rubric page and
        only follows links below the start_urls paths.
        :param response: a spiegel.de rubric page, e.g. 'http://www.spiegel.de/politik/'
        :return: parsed articles
        """
        # extract article URLs within the given start_url
        for link in LxmlLinkExtractor(allow=map(lambda x: x + '[a-z]+/.+html', self.start_urls)).extract_links(response):
            yield response.follow(link.url, callback=self.parse_article)
        # extract the archive link on the current page and parse its content recursively
        for archive_link in LxmlLinkExtractor(allow=map(lambda x: x + 'archiv.*.html', self.start_urls)).extract_links(response):
            yield response.follow(archive_link.url)

    def parse_article(self, response):
        """
        Parses a single article.
        :param response: the whole article page as an HTML response
        :return: the article as a data point
        """
        article_column = response.xpath('//*[@id="js-article-column"]/div')
        intro_p = response.xpath('//p[@class="article-intro"]')
        author_p = response.xpath('//p[@class="author"]')
        yield {
            'rubric': response.xpath('//*[@id="header"]/div[2]/div[1]/a/text()').extract_first(),
            'timestamp': article_column.xpath('./div[2]/span/time/attribute::datetime').extract_first(),
            'author': author_p.xpath('./a/text()').extract(),
            'source': response.xpath('//*[@id="js-article-column"]/p/i/text()').extract_first(),
            'headline': response.xpath('//*[@id="content-main"]/div[1]/div[3]/h2/span[2]/text()').extract_first(),
            'intro': intro_p.xpath('./strong/text()').extract_first(),
            'text': "".join(article_column.xpath('./p/text()').extract())
        }
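Assuming the standard Scrapy workflow, the crawl that produced the corpus described in the README could be started from the project directory with `scrapy crawl spiegel -o articles.jl` (JSON Lines output), and the single-article spider can be used to test the XPath selectors with `scrapy crawl spiegel-single -o single.json`; the output filenames here are examples, not taken from this commit.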
1 change: 1 addition & 0 deletions
scrapy_project/spiegel_project/crawls/spiegelCar1/requests.queue/active.json
[0]