uploads files
Saman committed Nov 25, 2019
0 parents commit d85ed42
Showing 67 changed files with 302,663 additions and 0 deletions.
15 changes: 15 additions & 0 deletions README.md
@@ -0,0 +1,15 @@
# Newspaper Topic Modelling

* Scraped around 280,000 articles from Spiegel Online, distributed across 11 topics

## Research Questions

Is it possible to recreate and classify the topics assigned to news articles by applying a topic modelling algorithm to the article texts?

### Subquestions

* For which topic does the algorithm produce the most topic markers?

* Which topic marker is used most often in the classification?

* Are articles from certain topics classified more accurately than articles from others?
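
A minimal sketch of how the research question above could be tackled (illustrative only; the input file name and parameters are assumptions, not part of this repository), using scikit-learn's LDA implementation:

```python
# Hypothetical sketch: LDA topic modelling on the scraped article texts
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# assumed input: one JSON object per line with a 'text' field
with open('articles.jl', encoding='utf-8') as f:
    texts = [json.loads(line)['text'] for line in f]

vectorizer = CountVectorizer(max_df=0.95, min_df=5)
X = vectorizer.fit_transform(texts)

# 11 topics, matching the number of Spiegel Online rubrics
lda = LatentDirichletAllocation(n_components=11, random_state=42)
doc_topics = lda.fit_transform(X)  # per-document topic distributions
```

The dominant topic per document (argmax over `doc_topics`) could then be compared against the original rubric labels.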
52 changes: 52 additions & 0 deletions gradient.py
@@ -0,0 +1,52 @@
# Plain-NumPy gradient descent for a simple linear regression
# (TensorFlow could do this as well)
import numpy as np

# Generate some noisy data for the regression
data_x = np.linspace(1.0, 10.0, 100)[:, np.newaxis]
data_y = np.sin(data_x) + 0.1*np.power(data_x, 2) + 0.5*np.random.randn(100, 1)
data_x /= np.max(data_x)

# Prepend a column of ones so the intercept b is absorbed into w
# and no separate calculation for b is needed
data_x = np.hstack((np.ones_like(data_x), data_x))

# Random ordering, then a train/test split (20 test points)
order = np.random.permutation(len(data_x))
portion = 20
test_x = data_x[order[:portion]]
test_y = data_y[order[:portion]]
train_x = data_x[order[portion:]]
train_y = data_y[order[portion:]]

# Gradient of the mean squared error with respect to w (partial derivatives)
def get_gradient(w, x, y):
    y_estimate = x.dot(w).flatten()
    error = y.flatten() - y_estimate
    mse = np.mean(error ** 2)
    gradient = -(1.0/len(x)) * error.dot(x)
    return gradient, mse


w = np.random.randn(2)
alpha = 0.5
tolerance = 1e-5

# Perform gradient descent
iterations = 1
while True:
    gradient, error = get_gradient(w, train_x, train_y)
    new_w = w - alpha * gradient

    # Stopping condition
    if np.sum(abs(new_w - w)) < tolerance:
        print("Converged.")
        break

    # Print the error every 100 iterations
    if iterations % 100 == 0:
        print("Iteration: %d - Error: %.4f" % (iterations, error))

    iterations += 1
    w = new_w



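The held-out test split above is created but never used; as a hedged follow-up (not part of the original script), the converged w could be evaluated on it:

# Hypothetical follow-up: mean squared error of the converged w on the held-out split
test_error = np.mean((test_y.flatten() - test_x.dot(w).flatten()) ** 2)
print("Test MSE: %.4f" % test_error)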
60 changes: 60 additions & 0 deletions k_means_clustering_digits.py
@@ -0,0 +1,60 @@
# Adapted from Source: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

np.random.seed(42)

digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target

# PCA with one component per digit class
# (fitted on the full-dimensional data but not used to seed k-means below)
pca = PCA(n_components=n_digits).fit(data)

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02 # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point in the mesh.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
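
The `metrics` module is imported above but never used; a hedged follow-up (not part of the original script) could score the clustering against the true digit labels:

# Hypothetical evaluation: compare k-means cluster assignments with the true labels
print("Homogeneity:  %.3f" % metrics.homogeneity_score(labels, kmeans.labels_))
print("Completeness: %.3f" % metrics.completeness_score(labels, kmeans.labels_))
print("Adjusted Rand index: %.3f" % metrics.adjusted_rand_score(labels, kmeans.labels_))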
Empty file.
14 changes: 14 additions & 0 deletions scrapy_project/spiegel_project/build/lib/spiegel_project/items.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SpiegelProjectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
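
items.py is left as the generated stub; a hedged sketch (hypothetical, not part of this commit) of how the fields yielded by the spiders later in this commit could be declared here:

# Hypothetical item class mirroring the dict fields yielded by the spiders
# (relies on the `import scrapy` already present above)
class SpiegelArticleItem(scrapy.Item):
    rubric = scrapy.Field()
    timestamp = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    headline = scrapy.Field()
    intro = scrapy.Field()
    text = scrapy.Field()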
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class SpiegelProjectSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class SpiegelProjectPipeline(object):
    def process_item(self, item, spider):
        return item
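
The pipeline above is the default pass-through; a hedged variant (hypothetical, not in this commit) could for example drop articles whose text could not be extracted:

# Hypothetical variant: discard items with an empty 'text' field
from scrapy.exceptions import DropItem

class SpiegelProjectTextPipeline(object):
    def process_item(self, item, spider):
        if not item.get('text'):
            raise DropItem("Missing article text: %s" % item)
        return item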
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for spiegel_project project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spiegel_project'

SPIDER_MODULES = ['spiegel_project.spiders']
NEWSPIDER_MODULE = 'spiegel_project.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spiegel_project (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'spiegel_project.middlewares.SpiegelProjectSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'spiegel_project.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'spiegel_project.pipelines.SpiegelProjectPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
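
As a hedged illustration only (these lines are not part of the commit), a few of the settings documented above could be enabled for a politer crawl and to activate the project's pipeline:

# Hypothetical overrides using the settings documented above
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
ITEM_PIPELINES = {
    'spiegel_project.pipelines.SpiegelProjectPipeline': 300,
}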
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
import scrapy


class SpiegelSpider(scrapy.Spider):
    """
    Crawls a single spiegel.de article
    """
    name = "spiegel-single"
    start_urls = [
        'http://www.spiegel.de/politik/deutschland/bundeswehr-der-rechte-kosmos-des-franco-a-a-1147221.html',
    ]

    # parse is the default callback and will be called for each URL in start_urls;
    # it extracts the article content directly from the response
    def parse(self, response):
        article_column = response.xpath('//*[@id="js-article-column"]/div')
        intro_p = response.xpath('//p[@class="article-intro"]')
        author_p = response.xpath('//p[@class="author"]')

        yield {
            'rubric': response.xpath('//*[@id="header"]/div[2]/div[1]/a/text()').extract_first(),
            'timestamp': article_column.xpath('./div[2]/span/time/attribute::datetime').extract_first(),
            'author': author_p.xpath('./a/text()').extract(),
            'source': response.xpath('//*[@id="js-article-column"]/p/i/text()').extract_first(),
            'headline': response.xpath('//*[@id="content-main"]/div[1]/div[3]/h2/span[2]/text()').extract_first(),
            'intro': intro_p.xpath('./strong/text()').extract_first(),
            'text': "".join(article_column.xpath('./p/text()').extract())
        }

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


class SpiegelSpider(scrapy.Spider):
    name = "spiegel"
    start_urls = [
        'http://www.spiegel.de/politik/',
    ]

    # Note: Rule-based crawling is only honoured by CrawlSpider subclasses;
    # on a plain scrapy.Spider this attribute has no effect.
    rules = (
        Rule(LxmlLinkExtractor(allow_domains='spiegel.de/')),
    )

    def parse(self, response):
        """
        Standard callback for every URL in start_urls.
        Extracts all article URLs from a spiegel.de rubric page and
        only follows links below the start_urls paths.
        :param response: a spiegel.de rubric page, e.g. 'http://www.spiegel.de/politik/'
        :return: parsed articles
        """
        # extract article URLs within the given start_url
        for link in LxmlLinkExtractor(allow=map(lambda x: x+'[a-z]+/.+html', self.start_urls)).extract_links(response):
            yield response.follow(link.url, callback=self.parse_article)
        # extract the archive link on the current page and parse its content recursively
        for archive_link in LxmlLinkExtractor(allow=map(lambda x: x+'archiv.*.html', self.start_urls)).extract_links(response):
            yield response.follow(archive_link.url)

    def parse_article(self, response):
        """
        Parses a single article.
        :param response: the whole article page as an HTML response
        :return: the article as a data point
        """
        article_column = response.xpath('//*[@id="js-article-column"]/div')
        intro_p = response.xpath('//p[@class="article-intro"]')
        author_p = response.xpath('//p[@class="author"]')
        yield {
            'rubric': response.xpath('//*[@id="header"]/div[2]/div[1]/a/text()').extract_first(),
            'timestamp': article_column.xpath('./div[2]/span/time/attribute::datetime').extract_first(),
            'author': author_p.xpath('./a/text()').extract(),
            'source': response.xpath('//*[@id="js-article-column"]/p/i/text()').extract_first(),
            'headline': response.xpath('//*[@id="content-main"]/div[1]/div[3]/h2/span[2]/text()').extract_first(),
            'intro': intro_p.xpath('./strong/text()').extract_first(),
            'text': "".join(article_column.xpath('./p/text()').extract())
        }
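
One hedged way to run this spider outside the scrapy CLI (the spider module path and output file name are assumptions, not shown in this commit) is Scrapy's CrawlerProcess:

# Hypothetical runner: crawl spiegel.de and write one JSON object per article
from scrapy.crawler import CrawlerProcess
from spiegel_project.spiders.spiegel import SpiegelSpider  # assumed module path

process = CrawlerProcess(settings={
    'FEED_FORMAT': 'jsonlines',  # one article dict per line
    'FEED_URI': 'articles.jl',   # hypothetical output file
    'ROBOTSTXT_OBEY': True,
})
process.crawl(SpiegelSpider)
process.start()  # blocks until the crawl has finished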
@@ -0,0 +1 @@
[0]