uploads files
Saman committed Nov 25, 2019
0 parents commit d85ed42
Showing 67 changed files with 302,663 additions and 0 deletions.
15 changes: 15 additions & 0 deletions README.md
@@ -0,0 +1,15 @@
# Newspaper Topic Modelling

* Scraped around 280,000 articles from Spiegel Online, distributed across 11 topics

## Research Questions

Is it possible to recreate and classify the topics assigned to news articles by applying a topic modelling algorithm to the article texts?

### Subquestions

* For which topic does the algorithm produce the most topic markers?

* Which topic marker is used most often in the classification?

* Are articles from certain topics classified more accurately than articles from others?
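
A minimal sketch of how the research question above could be tackled (illustrative only; the input file name and parameters are assumptions, not part of this repository), using scikit-learn's LDA implementation:

```python
# Hypothetical sketch: LDA topic modelling on the scraped article texts
import json

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# assumed input: one JSON object per line with a 'text' field
with open('articles.jl', encoding='utf-8') as f:
    texts = [json.loads(line)['text'] for line in f]

vectorizer = CountVectorizer(max_df=0.95, min_df=5)
X = vectorizer.fit_transform(texts)

# 11 topics, matching the number of Spiegel Online rubrics
lda = LatentDirichletAllocation(n_components=11, random_state=42)
doc_topics = lda.fit_transform(X)  # per-document topic distributions
```

The dominant topic per document (argmax over `doc_topics`) could then be compared against the original rubric labels.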
52 changes: 52 additions & 0 deletions gradient.py
@@ -0,0 +1,52 @@
# Plain-NumPy gradient descent for a simple linear regression
# (TensorFlow could do this as well)
import numpy as np

# Generate some noisy data for the regression
data_x = np.linspace(1.0, 10.0, 100)[:, np.newaxis]
data_y = np.sin(data_x) + 0.1*np.power(data_x, 2) + 0.5*np.random.randn(100, 1)
data_x /= np.max(data_x)

# Prepend a column of ones so the intercept b is absorbed into w
# and no separate calculation for b is needed
data_x = np.hstack((np.ones_like(data_x), data_x))

# Random ordering, then a train/test split (20 test points)
order = np.random.permutation(len(data_x))
portion = 20
test_x = data_x[order[:portion]]
test_y = data_y[order[:portion]]
train_x = data_x[order[portion:]]
train_y = data_y[order[portion:]]

# Gradient of the mean squared error with respect to w (partial derivatives)
def get_gradient(w, x, y):
    y_estimate = x.dot(w).flatten()
    error = y.flatten() - y_estimate
    mse = np.mean(error ** 2)
    gradient = -(1.0/len(x)) * error.dot(x)
    return gradient, mse


w = np.random.randn(2)
alpha = 0.5
tolerance = 1e-5

# Perform gradient descent
iterations = 1
while True:
    gradient, error = get_gradient(w, train_x, train_y)
    new_w = w - alpha * gradient

    # Stopping condition
    if np.sum(abs(new_w - w)) < tolerance:
        print("Converged.")
        break

    # Print the error every 100 iterations
    if iterations % 100 == 0:
        print("Iteration: %d - Error: %.4f" % (iterations, error))

    iterations += 1
    w = new_w



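The held-out test split above is created but never used; as a hedged follow-up (not part of the original script), the converged w could be evaluated on it:

# Hypothetical follow-up: mean squared error of the converged w on the held-out split
test_error = np.mean((test_y.flatten() - test_x.dot(w).flatten()) ** 2)
print("Test MSE: %.4f" % test_error)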
60 changes: 60 additions & 0 deletions k_means_clustering_digits.py
@@ -0,0 +1,60 @@
# Adapted from Source: http://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_digits.html
import numpy as np
import matplotlib.pyplot as plt

from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import load_digits
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

np.random.seed(42)

digits = load_digits()
data = scale(digits.data)

n_samples, n_features = data.shape
n_digits = len(np.unique(digits.target))
labels = digits.target

# PCA with one component per digit class
# (fitted on the full-dimensional data but not used to seed k-means below)
pca = PCA(n_components=n_digits).fit(data)

reduced_data = PCA(n_components=2).fit_transform(data)
kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
kmeans.fit(reduced_data)

# Step size of the mesh. Decrease to increase the quality of the VQ.
h = .02 # point in the mesh [x_min, x_max]x[y_min, y_max].

# Plot the decision boundary. For that, we will assign a color to each point in the mesh.
x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Obtain labels for each point in mesh. Use last trained model.
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure(1)
plt.clf()
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
# Plot the centroids as a white X
centroids = kmeans.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=3,
            color='w', zorder=10)
plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
          'Centroids are marked with white cross')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.xticks(())
plt.yticks(())
plt.show()
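
The `metrics` module is imported above but never used; a hedged follow-up (not part of the original script) could score the clustering against the true digit labels:

# Hypothetical evaluation: compare k-means cluster assignments with the true labels
print("Homogeneity:  %.3f" % metrics.homogeneity_score(labels, kmeans.labels_))
print("Completeness: %.3f" % metrics.completeness_score(labels, kmeans.labels_))
print("Adjusted Rand index: %.3f" % metrics.adjusted_rand_score(labels, kmeans.labels_))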
Empty file.
14 changes: 14 additions & 0 deletions scrapy_project/spiegel_project/build/lib/spiegel_project/items.py
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class SpiegelProjectItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
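
items.py is left as the generated stub; a hedged sketch (hypothetical, not part of this commit) of how the fields yielded by the spiders later in this commit could be declared here:

# Hypothetical item class mirroring the dict fields yielded by the spiders
# (relies on the `import scrapy` already present above)
class SpiegelArticleItem(scrapy.Item):
    rubric = scrapy.Field()
    timestamp = scrapy.Field()
    author = scrapy.Field()
    source = scrapy.Field()
    headline = scrapy.Field()
    intro = scrapy.Field()
    text = scrapy.Field()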
@@ -0,0 +1,56 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class SpiegelProjectSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class SpiegelProjectPipeline(object):
    def process_item(self, item, spider):
        return item
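
The pipeline above is the default pass-through; a hedged variant (hypothetical, not in this commit) could for example drop articles whose text could not be extracted:

# Hypothetical variant: discard items with an empty 'text' field
from scrapy.exceptions import DropItem

class SpiegelProjectTextPipeline(object):
    def process_item(self, item, spider):
        if not item.get('text'):
            raise DropItem("Missing article text: %s" % item)
        return item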
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for spiegel_project project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'spiegel_project'

SPIDER_MODULES = ['spiegel_project.spiders']
NEWSPIDER_MODULE = 'spiegel_project.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'spiegel_project (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'spiegel_project.middlewares.SpiegelProjectSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'spiegel_project.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
# 'spiegel_project.pipelines.SpiegelProjectPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
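
As a hedged illustration only (these lines are not part of the commit), a few of the settings documented above could be enabled for a politer crawl and to activate the project's pipeline:

# Hypothetical overrides using the settings documented above
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
ITEM_PIPELINES = {
    'spiegel_project.pipelines.SpiegelProjectPipeline': 300,
}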
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
@@ -0,0 +1,30 @@
# -*- coding: utf-8 -*-
import scrapy


class SpiegelSpider(scrapy.Spider):
    """
    Crawls a single spiegel.de article
    """
    name = "spiegel-single"
    start_urls = [
        'http://www.spiegel.de/politik/deutschland/bundeswehr-der-rechte-kosmos-des-franco-a-a-1147221.html',
    ]

    # parse is the default callback and will be called for each URL in start_urls;
    # it extracts the article content directly from the response
    def parse(self, response):
        article_column = response.xpath('//*[@id="js-article-column"]/div')
        intro_p = response.xpath('//p[@class="article-intro"]')
        author_p = response.xpath('//p[@class="author"]')

        yield {
            'rubric': response.xpath('//*[@id="header"]/div[2]/div[1]/a/text()').extract_first(),
            'timestamp': article_column.xpath('./div[2]/span/time/attribute::datetime').extract_first(),
            'author': author_p.xpath('./a/text()').extract(),
            'source': response.xpath('//*[@id="js-article-column"]/p/i/text()').extract_first(),
            'headline': response.xpath('//*[@id="content-main"]/div[1]/div[3]/h2/span[2]/text()').extract_first(),
            'intro': intro_p.xpath('./strong/text()').extract_first(),
            'text': "".join(article_column.xpath('./p/text()').extract())
        }

@@ -0,0 +1,52 @@
# -*- coding: utf-8 -*-
import scrapy
from scrapy.spiders import Rule
from scrapy.linkextractors.lxmlhtml import LxmlLinkExtractor


class SpiegelSpider(scrapy.Spider):
    name = "spiegel"
    start_urls = [
        'http://www.spiegel.de/politik/',
    ]

    # Note: Rule-based crawling is only honoured by CrawlSpider subclasses;
    # on a plain scrapy.Spider this attribute has no effect.
    rules = (
        Rule(LxmlLinkExtractor(allow_domains='spiegel.de/')),
    )

    def parse(self, response):
        """
        Standard callback for every URL in start_urls.
        Extracts all article URLs from a spiegel.de rubric page and
        only follows links below the start_urls paths.
        :param response: a spiegel.de rubric page, e.g. 'http://www.spiegel.de/politik/'
        :return: parsed articles
        """
        # extract article URLs within the given start_url
        for link in LxmlLinkExtractor(allow=map(lambda x: x+'[a-z]+/.+html', self.start_urls)).extract_links(response):
            yield response.follow(link.url, callback=self.parse_article)
        # extract the archive link on the current page and parse its content recursively
        for archive_link in LxmlLinkExtractor(allow=map(lambda x: x+'archiv.*.html', self.start_urls)).extract_links(response):
            yield response.follow(archive_link.url)

    def parse_article(self, response):
        """
        Parses a single article.
        :param response: the whole article page as an HTML response
        :return: the article as a data point
        """
        article_column = response.xpath('//*[@id="js-article-column"]/div')
        intro_p = response.xpath('//p[@class="article-intro"]')
        author_p = response.xpath('//p[@class="author"]')
        yield {
            'rubric': response.xpath('//*[@id="header"]/div[2]/div[1]/a/text()').extract_first(),
            'timestamp': article_column.xpath('./div[2]/span/time/attribute::datetime').extract_first(),
            'author': author_p.xpath('./a/text()').extract(),
            'source': response.xpath('//*[@id="js-article-column"]/p/i/text()').extract_first(),
            'headline': response.xpath('//*[@id="content-main"]/div[1]/div[3]/h2/span[2]/text()').extract_first(),
            'intro': intro_p.xpath('./strong/text()').extract_first(),
            'text': "".join(article_column.xpath('./p/text()').extract())
        }
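
One hedged way to run this spider outside the scrapy CLI (the spider module path and output file name are assumptions, not shown in this commit) is Scrapy's CrawlerProcess:

# Hypothetical runner: crawl spiegel.de and write one JSON object per article
from scrapy.crawler import CrawlerProcess
from spiegel_project.spiders.spiegel import SpiegelSpider  # assumed module path

process = CrawlerProcess(settings={
    'FEED_FORMAT': 'jsonlines',  # one article dict per line
    'FEED_URI': 'articles.jl',   # hypothetical output file
    'ROBOTSTXT_OBEY': True,
})
process.crawl(SpiegelSpider)
process.start()  # blocks until the crawl has finished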
@@ -0,0 +1 @@
[0]