From 070035ebf0cf4065f32f00e78044bb24a22172bd Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Fri, 20 Sep 2024 15:01:31 +0200
Subject: [PATCH 01/26] Link to Bluesky in readme

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7e4ec7769..6022145f1 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,10 @@
 [![Requires Python 3.8](https://img.shields.io/badge/py-v3.8-blue)](https://www.python.org/)
 [![Docker image status](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml/badge.svg)](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml)
-

4CAT has a website at 4cat.nl.

A screenshot of 4CAT, displaying its 'Create Dataset' interface; a screenshot of 4CAT, displaying a network visualisation of a dataset

+

4CAT has a website at 4cat.nl.

+

You can also [follow 4CAT on Bluesky](https://bsky.app/profile/4cat.nl) for updates

 4CAT is a research tool that can be used to analyse and process data from online social platforms. Its goal is to make the capture and analysis of data from these platforms accessible to people through a web interface, without

From 02f90bd1559d710360324e1dca116e8c5519f9fe Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Fri, 20 Sep 2024 15:03:09 +0200
Subject: [PATCH 02/26] Link to Bluesky in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6022145f1..9fc84f890 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@

A screenshot of 4CAT, displaying its 'Create Dataset' interface; a screenshot of 4CAT, displaying a network visualisation of a dataset

4CAT has a website at 4cat.nl.

-

You can also [follow 4CAT on Bluesky](https://bsky.app/profile/4cat.nl) for updates

+

Follow 4CAT on Bluesky for updates.

4CAT is a research tool that can be used to analyse and process data from online social platforms. Its goal is to make the capture and analysis of data from these platforms accessible to people through a web interface, without From dd85961696de3d01fa48cfbbac8a31a4374edc83 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 23 Sep 2024 14:37:50 +0200 Subject: [PATCH 03/26] Only import bsky embed JS on front page, make divs wider --- webtool/static/js/bsky-embed.es.js | 4 ++-- webtool/templates/frontpage.html | 2 +- webtool/templates/layout.html | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/webtool/static/js/bsky-embed.es.js b/webtool/static/js/bsky-embed.es.js index 3169355d9..6a4ae63a2 100644 --- a/webtool/static/js/bsky-embed.es.js +++ b/webtool/static/js/bsky-embed.es.js @@ -680,7 +680,7 @@ function m4(a) { function h4(a, p, l) { return arguments.length === 2 && (l = p, p = {}), Qb(a, p)(m4(l)); } -const y4 = '*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]{display:none}*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 
1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }.fixed{position:fixed}.right-5{right:1.25rem}.top-5{top:1.25rem}.col-span-2{grid-column:span 2 / span 2}.mx-1{margin-left:.25rem;margin-right:.25rem}.mx-auto{margin-left:auto;margin-right:auto}.mb-1{margin-bottom:.25rem}.mb-16{margin-bottom:4rem}.ml-10{margin-left:2.5rem}.mr-1{margin-right:.25rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.block{display:block}.inline{display:inline}.flex{display:flex}.grid{display:grid}.h-10{height:2.5rem}.h-14{height:3.5rem}.h-2{height:.5rem}.h-4{height:1rem}.max-h-\\[90vh\\]{max-height:90vh}.w-10{width:2.5rem}.w-14{width:3.5rem}.w-4{width:1rem}.w-full{width:100%}.max-w-\\[calc\\(100vw-96px\\)\\]{max-width:calc(100vw - 96px)}.max-w-screen-sm{max-width:640px}.flex-1{flex:1 1 0%}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.items-center{align-items:left}.justify-center{justify-content:center}.gap-1{gap:.25rem}.gap-2{gap:.5rem}.gap-4{gap:1rem}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * 
var(--tw-space-y-reverse))}.overflow-hidden{overflow:hidden}.text-ellipsis{text-overflow:ellipsis}.whitespace-nowrap{white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-md{border-radius:.375rem}.rounded-t-md{border-top-left-radius:.375rem;border-top-right-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-slate-300{--tw-border-opacity: 1;border-color:rgb(203 213 225 / var(--tw-border-opacity))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity))}.bg-gray-900{--tw-bg-opacity: 1;background-color:rgb(17 24 39 / var(--tw-bg-opacity))}.bg-slate-100{--tw-bg-opacity: 1;background-color:rgb(241 245 249 / var(--tw-bg-opacity))}.bg-slate-200{--tw-bg-opacity: 1;background-color:rgb(226 232 240 / var(--tw-bg-opacity))}.bg-slate-900{--tw-bg-opacity: 1;background-color:rgb(15 23 42 / var(--tw-bg-opacity))}.p-3{padding:.75rem}.p-4{padding:1rem}.px-4{padding-left:1rem;padding-right:1rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.text-sm{font-size:.875rem;line-height:1.25rem}.font-bold{font-weight:700}.font-semibold{font-weight:600}.text-blue-500{--tw-text-opacity: 1;color:rgb(59 130 246 / var(--tw-text-opacity))}.text-slate-500{--tw-text-opacity: 1;color:rgb(100 116 139 / var(--tw-text-opacity))}.text-slate-600{--tw-text-opacity: 1;color:rgb(71 85 105 / var(--tw-text-opacity))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}.underline{text-decoration-line:underline}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop\\:bg-gray-800::backdrop{--tw-bg-opacity: 1;background-color:rgb(31 41 55 / var(--tw-bg-opacity))}.backdrop\\:opacity-90::backdrop{opacity:.9}.hover\\:bg-blue-700:hover{--tw-bg-opacity: 1;background-color:rgb(29 78 216 / var(--tw-bg-opacity))}.hover\\:underline:hover{text-decoration-line:underline}.dark\\:border-slate-800:where(.dark,.dark *){--tw-border-opacity: 1;border-color:rgb(30 41 59 / var(--tw-border-opacity))}.dark\\:bg-slate-800:where(.dark,.dark *){--tw-bg-opacity: 1;background-color:rgb(30 41 59 / var(--tw-bg-opacity))}.dark\\:text-slate-400:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(148 163 184 / var(--tw-text-opacity))}.dark\\:text-white:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}'; +const y4 = '*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]{display:none}*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: 
;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }.fixed{position:fixed}.right-5{right:1.25rem}.top-5{top:1.25rem}.col-span-2{grid-column:span 2 / span 2}.mx-1{margin-left:.25rem;margin-right:.25rem}.mx-auto{margin-left:auto;margin-right:auto}.mb-1{margin-bottom:.25rem}.mb-16{margin-bottom:4rem}.ml-10{margin-left:2.5rem}.mr-1{margin-right:.25rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.block{display:block}.inline{display:inline}.flex{display:flex}.grid{display:grid}.h-10{height:2.5rem}.h-14{height:3.5rem}.h-2{height:.5rem}.h-4{height:1rem}.max-h-\\[90vh\\]{max-height:90vh}.w-10{width:2.5rem}.w-14{width:3.5rem}.w-4{width:1rem}.w-full{width:100%}.max-w-\\[calc\\(100vw-96px\\)\\]{max-width:calc(100vw - 96px)}.max-w-screen-sm{max-width:640px}.flex-1{flex:1 1 0%}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.items-center{align-items:normal}.justify-center{justify-content:center}.gap-1{gap:.25rem}.gap-2{gap:.5rem}.gap-4{gap:1rem}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.overflow-hidden{overflow:hidden}.text-ellipsis{text-overflow:ellipsis}.whitespace-nowrap{white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-md{border-radius:.375rem}.rounded-t-md{border-top-left-radius:.375rem;border-top-right-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-slate-300{--tw-border-opacity: 1;border-color:rgb(203 213 225 / var(--tw-border-opacity))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity))}.bg-gray-900{--tw-bg-opacity: 1;background-color:rgb(17 24 39 / var(--tw-bg-opacity))}.bg-slate-100{--tw-bg-opacity: 1;background-color:rgb(241 245 249 / var(--tw-bg-opacity))}.bg-slate-200{--tw-bg-opacity: 1;background-color:rgb(226 232 240 / var(--tw-bg-opacity))}.bg-slate-900{--tw-bg-opacity: 1;background-color:rgb(15 23 42 / var(--tw-bg-opacity))}.p-3{padding:.75rem}.p-4{padding:1rem}.px-4{padding-left:1rem;padding-right:1rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.text-sm{font-size:.875rem;line-height:1.25rem}.font-bold{font-weight:700}.font-semibold{font-weight:600}.text-blue-500{--tw-text-opacity: 1;color:rgb(59 130 246 / var(--tw-text-opacity))}.text-slate-500{--tw-text-opacity: 1;color:rgb(100 116 139 / var(--tw-text-opacity))}.text-slate-600{--tw-text-opacity: 1;color:rgb(71 85 105 / var(--tw-text-opacity))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}.underline{text-decoration-line:underline}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop\\:bg-gray-800::backdrop{--tw-bg-opacity: 1;background-color:rgb(31 41 55 / var(--tw-bg-opacity))}.backdrop\\:opacity-90::backdrop{opacity:.9}.hover\\:bg-blue-700:hover{--tw-bg-opacity: 1;background-color:rgb(29 78 216 / var(--tw-bg-opacity))}.hover\\:underline:hover{text-decoration-line:underline}.dark\\:border-slate-800:where(.dark,.dark *){--tw-border-opacity: 
1;border-color:rgb(30 41 59 / var(--tw-border-opacity))}.dark\\:bg-slate-800:where(.dark,.dark *){--tw-bg-opacity: 1;background-color:rgb(30 41 59 / var(--tw-bg-opacity))}.dark\\:text-slate-400:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(148 163 184 / var(--tw-text-opacity))}.dark\\:text-white:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}'; var Bu = { exports: {} }; (function(a) { var p = Object.create, l = Object.defineProperty, m = Object.getOwnPropertyDescriptor, g = Object.getOwnPropertyNames, A = Object.getPrototypeOf, b = Object.prototype.hasOwnProperty, R = (t, i) => function() { @@ -25594,7 +25594,7 @@ const k4 = ({ return J(F, y4, null), J(F, b, null), F; })(), (() => { var F = _4(), G = F.firstChild, ee = G.firstChild, ie = ee.nextSibling; - return Su(F, `${m} max-w-screen-sm mx-auto flex flex-col items-center`), J(F, (() => { + return Su(F, `${m} mx-auto flex flex-col items-center`), J(F, (() => { var ge = tt(() => Ce().length > 0); return () => ge() && Ce().map((Xe, hr) => (() => { var Ae = S4(); diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index 6851b9df8..d8b41b1ee 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -21,7 +21,7 @@

4CAT updates

+ + {% if navigation.current == "about" %} + {% endif %} From 07094f8ef071a526ac06d43a31a454f3bec42640 Mon Sep 17 00:00:00 2001 From: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon, 23 Sep 2024 15:50:08 +0200 Subject: [PATCH 04/26] telegram crawl improvements (#444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * telegram crawl improvements * Squashed commit of the following: commit dd85961696de3d01fa48cfbbac8a31a4374edc83 Author: sal-phd-desktop Date: Mon Sep 23 14:37:50 2024 +0200 Only import bsky embed JS on front page, make divs wider commit 02f90bd1559d710360324e1dca116e8c5519f9fe Author: sal-phd-desktop Date: Fri Sep 20 15:03:09 2024 +0200 Link to Bluesky in readme commit e675dd04a9ffb45cc72704763b7553fee6cf59a2 Merge: 070035eb 38418b2e Author: sal-phd-desktop Date: Fri Sep 20 15:01:45 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 070035ebf0cf4065f32f00e78044bb24a22172bd Author: sal-phd-desktop Date: Fri Sep 20 15:01:31 2024 +0200 Link to Bluesky in readme commit 38418b2ec1533f5e13c8d3f001903db0bfdab4af Author: Sal Hagen Date: Thu Sep 19 17:27:00 2024 +0200 Host BlueSky widget ourselves commit e281eb8bdfad3ec4c800bec2a64e6ff3263a2f74 Author: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> Date: Thu Sep 19 15:32:08 2024 +0200 Refactor module loading (#396) * Refactor module loading * Optionally inject modules when instantiating dataset object * pass modules in a few more places where possible I think that is everywhere in the frontend. Backend is a bit odd as we are passing dataset.modules when it is None and thus creating children that would require individual inits of ModuleCollector. Could be more to look at there. * Do not lazy-load modules * modules/all_modules * Squashed commit of the following: commit 3f2a62a124926cfeb840796f104a702878ac10e5 Author: Carsten Schnober Date: Wed Sep 18 18:18:29 2024 +0200 Update Gensim to >=4.3.3, <4.4.0 (#450) * Update Gensim to >=4.3.3, <4.4.0 * update nltk as well --------- Co-authored-by: Dale Wahl Co-authored-by: Sal Hagen commit fee2c8c08617094f28496963da282d2e2dddeab7 Merge: 3d94b666 f8e93eda Author: sal-phd-desktop Date: Wed Sep 18 18:11:19 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 3d94b666cedd0de4e0bee953cbf1d787fdc38854 Author: sal-phd-desktop Date: Wed Sep 18 18:11:04 2024 +0200 FINALLY remove 'News' from the front page, replace with 4CAT BlueSky updates and potential information about the specific server (to be set on config page) commit f8e93edabe9013a2c1229caa4c454fab09620125 Author: Stijn Peeters Date: Wed Sep 18 15:11:21 2024 +0200 Simple extensions page in Control Panel commit b5be128c7b8682fb233d962326d9118a61053165 Author: Stijn Peeters Date: Wed Sep 18 14:08:13 2024 +0200 Remove 'docs' directory commit 1e2010af44817016c274c9ec9f7f9971deb57f66 Author: Stijn Peeters Date: Wed Sep 18 14:07:38 2024 +0200 Forgot TikTok and Douyin commit c757dd51884e7ec9cf62ca1726feacab4b2283b7 Author: Stijn Peeters Date: Wed Sep 18 14:01:31 2024 +0200 Say 'zeeschuimer' instead of 'extension' to avoid confusion with 4CAT extensions commit ee7f4345478f923541536c86a5b06246deae03f6 Author: Stijn Peeters Date: Wed Sep 18 14:00:40 2024 +0200 RIP Parler data source commit 11300f2430b51887823b280405de4ded4f15ede1 Author: Stijn Peeters Date: Wed Sep 18 11:21:37 2024 +0200 Tuplestring commit 547265240eba81ca0ad270cd3c536a2b1dcf512d Author: Stijn Peeters Date: Wed Sep 18 11:15:29 2024 +0200 
Pass user obj instead of str to ConfigWrapper in Processor commit b21866d7900b5d20ed6ce61ee9aff50f3c0df910 Author: Stijn Peeters Date: Tue Sep 17 17:45:01 2024 +0200 Ensure request-aware config reader in user object when using config wrapper commit bbe79e4b0fe870ccc36cab7bfe7963b28d1948e3 Author: Sal Hagen Date: Tue Sep 17 15:12:46 2024 +0200 Fix extension path walk for Windows commit d6064beaf31a6a85b0e34ed4f8126eb4c4fc07e3 Author: Stijn Peeters Date: Mon Sep 16 14:50:45 2024 +0200 Allow tags that have no users Use case: tag-based frontend differentiation using X-4CAT-Config-Via-Proxy commit b542ded6f976809ec88445e7b04f2c81b900188e Author: Stijn Peeters Date: Mon Sep 16 14:13:14 2024 +0200 Trailing slash in query results list commit a4bddae575b22a009925206a1337bdd89349e567 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon Sep 16 13:57:23 2024 +0200 4CAT Extension - easy(ier) adding of new datasources/processors that can be mainted seperately from 4CAT base code (#451) * domain only * fix reference * try and collect links with selenium * update column_filter to find multiple matches * fix up the normal url_scraper datasource * ensure all selenium links are strings for join * change output of url_scraper to ndjson with map_items * missed key/index change * update web archive to use json and map to 4CAT * fix no text found * and none on scraped_links * check key first * fix up web_archive error reporting * handle None type for error * record web archive "bad request" * add wait after redirect movement * increase waittime for redirects * add processor for trackers * dict to list for addition * allow both newline and comma seperated links * attempt to scrape iframes as seperate pages * Fixes for selenium scraper to work with config database * installation of packages, geckodriver, and firefox if selenium enabled * update install instructions * fix merge error * fix dropped function * have to be kidding me * add note; setup requires docker... need to think about IF this will ever be installed without Docker * seperate selenium class into wrapper and Search class so wrapper can be used in processors! * add screenshots; add firefox extension support * update selenium definitions * regex for extracting urls from strings * screenshots processor; extract urls from text and takes screenshots * Allow producing zip files from data sources * import time * pick better default * test screenshot datasource * validate all params * fix enable extension * haha break out of while loop * count my items * whoops, len() is important here * must be getting tired... * remove redundant logging * Eager loading for screenshots, viewport options, etc * Woops, wrong folder * Fix label shortening * Just 'queue' instead of 'search queue' * Yeah, make it headless * README -> DESCRIPTION * h1 -> h2 * Actually just have no header * Use proper filename for downloaded files * Configure whether to offer pseudonymisation etc * Tweak descriptions * fix log missing data * add columns to post_topic_matrix * fix breadcrumb bug * Add top topics column * Fix selenium config install parameter (Docker uses this/manual would need to run install_selenium, well, manually) * this processor is slow; i thought it was broken long before it updated! 
* refactor detect_trackers as conversion processor not filter * add geckodriver executable to docker install * Auto-configure webdrivers if available in PATH * update screenshots to act as image-downloader and benefit from processors * fix is_compatible_with * Delete helper-scripts/migrate/migrate-1.30-1.31.py * fix embeddings is_compatible_with * fix up UI options for hashing and private * abstract was moved to lib * various fixes to selenium based datasources * processors not compatible with image datasets * update firefox extension handling * screenshots datasource fix get_options * rename screenshots processor to be detected as image dataset * add monthly and weekly frequencies to wayback machine datasource * wayback ds: fix fail if all attempts do not realize results; addion frequency options to options; add daily * add scroll down page to allow lazy loading for entire page screenshots * screenshots: adjust pause time so it can be used to force a wait for images to load I have not successfully come up with or found a way to wait for all images to load; document.readyState == 'complete' does not function in this way on certain sites including the wayback machine * hash URLs to create filenames * remove log * add setting to toggle display advanced options * add progress bars * web archive fix query validation * count subpages in progress * remove overwritten function * move http response to own column * special filenames * add timestamps to all screenshots * restart selenium on failure * new build have selenium * process urls after start (keep original query parameters) * undo default firefox * quick max * rename SeleniumScraper to SeleniumSearch todo: build SeleniumProcessor! * max number screenshots configurable * method to get url with error handling * use get_with_error_handling * d'oh, screenshot processor needs to quit selenium * update log to contain URL * Update scrolling to use Page down key if necessary * improve logs * update image_category_wall as screenshot datasource does not have category column; this is not ideal and ought to be solved in another way. Also, could I get categories from the metadata? That's... ugh. * no category, no processor * str errors * screenshots: dismiss alerts when checking ready state is complete * set screenshot timeout to 30 seconds * update gensim package * screenshots: move processor interrupt into attempts loop * if alert disappears before we can dismiss it... * selenium specific logger * do not switch window when no alert found on dismiss * extract wait for page to load to selenium class * improve descriptions of screenshot options * remove unused line * treat timeouts differently from other errors these are more likely due to an issue with the website in question * debug if requested * increase pause time * restart browser w/ PID * increase max_workers for selenium this is by individual worker class not for all selenium classes... 
so you can really crank them out if desired * quick fix restart by pid * avoid bad urls * missing bracket & attempt to fix-missing dependencies in Docker install * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * basic processor for apple store; setup checks for additional requirements * fix is_4cat_class * show preview when no map_item * add google store datasource * Docker setup.py use extensions * Wider support for file upload in processors * Log file uploads in DMI service manager * add map_item methods and record more data per item need additional item data as map_item is staticmethod * update from master; merge conflicts * fix docker build context (ignore data files) * fix option requirements * apple store fix: list still tries to get query * apple & google stores fix up item mapping * missed merge error * minor fix * remove unused import * fix datasources w/ files frontend error * fix error w/ datasources having file option * better way to name docker volumes * update two other docker compose files * fix docker-compose ymls * minor bug: fix and add warning; fix no results fail * update apple field names to better match interface * update google store fieldnames and order * sneak in jinja logger if needed * fix fourcat.js handling checkboxes for dynamic settings * add new endpoint for app details to apple store * apple_store map new beta app data * add default lang/country * not all apps have advisories * revert so button works * add chart positions to beta map items * basic scheduler To-do - fix up and add options to scheduler view (e.g. delete/change) - add scheduler view to navigator - tie jobs to datasets? (either in scheduler view or, perhaps, filter dataset view) - more testing... * update scheduler view, add functions to update job interval * revert .env * working scheduler! * basic scheduler view w/ datasets * fix postgres tag * update job status in scheduled_jobs table * fix timestamp; end_date needed for last run check; add dataset label * improve scheduler view * remove dataset from scheduled_jobs table on delete * scheduler view order by last creation * scheduler views: separate scheduler list from scheduled dataset list * additional update from master fixes * apple_store map_items fix missing locales * add back depth for pagination * correct route * modify pagination to accept args * pagination fun * pagination: i hate testing on live servers... * ok ok need the pagination route * pagination: add route_args * fix up scheduler header * improve app store descriptions * add azure store * fix azure links * azure_store: add category search * azure fix type of config update timestamp OPTION_DATE does not appear correctly in settings and causes it to be written incorrectly * basic aws store * check if selenium available; get correct app_id * aws: implement pagination * add logging; wait for elements to load after next page; attempts to rework filter option collection * apple_store: handle invalid param error * fix filter_options * aws: fix filter option collection! 
* more merge * move new datasources and processors to extensions and modify setup.py and module loader to use the new locations * migrate.py to run extension "fourcat_install.py" files * formatting * remove extensions; add gitignore * excise scheduler merge * some additional cleanup from app_studies branch * allow nested datasources folders; ignore files in extensions main folder * allow extension install scripts to run pip if migrate.py has not * Remove unused URL functions we could use ural for * Take care of git commit hash tracking for extension processors * Get rid of unused path.versionfile config setting * Add extensions README * Squashed commit of the following: commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset * Fix Github links for extensions * Fix commit detection in extensions * Fix extension detection in module loader * Follow symlinks when loading extensions Probably not uncommon to have a checked out repo somewhere to then symlink into the extensions dir * Make queue message on create page more generic * Markdown in datasource option tooltips * Remove Spacy model from requirements * Add software_source to database SQL --------- Co-authored-by: Stijn Peeters Co-authored-by: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 
2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset commit 4ba872bef2968f7f8bf5831fd3a4f413420b36ed Author: Dale Wahl Date: Tue Aug 27 13:04:46 2024 +0200 fix hatebase: default column option for OPTION_MULTI_SELECT must be list commit e276033542f2d22e7f614f318a01d65114a21482 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed Aug 21 12:53:10 2024 +0200 Bump nltk from 3.6.7 to 3.9 (#447) Bumps [nltk](https://github.com/nltk/nltk) from 3.6.7 to 3.9. - [Changelog](https://github.com/nltk/nltk/blob/develop/ChangeLog) - [Commits](https://github.com/nltk/nltk/compare/3.6.7...3.9) --- updated-dependencies: - dependency-name: nltk dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit 1d749c3cf83b130ba70bdb09174f382d6711a14b Author: sal-phd-desktop Date: Wed Aug 21 12:52:54 2024 +0200 Set UTF-8 encoding when opening stop words (fixes Windows bug) commit a03e5fd4252e7242563c291558606440256eb3d1 Author: Dale Wahl Date: Mon Aug 19 14:19:21 2024 +0200 remove duplicate line commit aa07e8c13c2d59c6b699f78133036514659ee420 Author: Dale Wahl Date: Mon Jul 29 09:35:22 2024 +0200 tweet import fix: author banner key missing when author has no banner commit 32dac5d2ffb936210f12f5c725514fd25a0286f1 Author: Dale Wahl Date: Mon Jul 29 08:52:08 2024 +0200 tell user when dataset is not found we could have a proper 404 page, but at least leave a message commit 2c8c860fc5378113d1352016ac26ca761adecb32 Author: Dale Wahl Date: Mon Jul 22 17:41:00 2024 +0200 telegram fix: reactions datastructure commit 1c0bf5e580eb16d8a6f9afa415f9febce449a537 Author: Dale Wahl Date: Mon Jul 22 11:19:52 2024 +0200 fix telegram: crawl_max_depth can be None if it is not enabled for a user commit 3dfe7af292b33574a31630e3a0da10954ed87d0a Author: Dale Wahl Date: Fri Jul 19 11:52:31 2024 +0200 fix more config.get() magic commit 2453182bcee6e54b396b762ab77b60b8a0893638 Author: Dale Wahl Date: Fri Jul 19 10:54:23 2024 +0200 config_manager - fix `get_all` w/ one results (super rare edge); fix overwriting self.db in `with_db` commit 6b9cb0b5479e6e64e09a49fa2ca9effe1c5a7415 Author: Dale Wahl Date: Wed Jul 17 15:20:49 2024 +0200 add surf nginx init file commit 5e984e13a08d9fba7d5806a7ef4e012ce7d57319 Author: Dale Wahl Date: Wed Jul 17 14:30:34 2024 +0200 change port for surf commit 2ce8c354e90f939a16dad3f0155fd7d79405c79e Author: Dale Wahl Date: Wed Jul 17 12:54:11 2024 +0200 use latest image on surf commit 13ec0fd3f2bed86c3b2dff73014093a6a92fbfb5 Author: Dale Wahl Date: Wed Jul 17 12:46:59 2024 +0200 update surf docker-compose.yml this may require a new release commit 78698f6ac1b22b1154d31f69543ba7b266d33191 Author: Dale Wahl Date: Wed Jul 17 10:34:56 2024 +0200 clip: handle new and old format commit eb7693780cb191403f107817ca30d90373929bf0 Author: Dale Wahl Date: Tue Jul 16 14:27:08 2024 +0200 DMI SM updates to use status endpoint w/ database records; run on CPU if no GPU enabled commit d2a787e2c1559417bb5401f3208c82954052504f Author: Stijn Peeters Date: Mon Jul 15 15:58:06 2024 +0200 Require most recent Telethon 
version commit 346150bd9cc96ac099abd4d15fa3de39bd65e9d1 Author: Stijn Peeters Date: Mon Jul 15 15:57:55 2024 +0200 Catch UPDATE_APP_TO_LOGIN in Telegram commit 04acc06e95098d7e2f9b4af404447c9cfaee5b99 Author: Stijn Peeters Date: Mon Jul 15 11:27:30 2024 +0200 Unbreak Twitter error handling commit e9b5232a963be02c2e86dabacb607b2315a4e0e6 Author: Stijn Peeters Date: Fri Jul 12 13:27:15 2024 +0200 Ensure str type when trying to extract video URLs from a field commit d69dd6f337cac05ed31c05334890679976a1e6de Author: Stijn Peeters Date: Fri Jul 12 12:31:14 2024 +0200 Make CSV column mapping params look nicer on result page commit 9bd9da568f593085a8d54744836e3290a75b51a7 Author: Stijn Peeters Date: Fri Jul 12 12:22:03 2024 +0200 Add "empty" and "current timestamp" as options to CSV mapping commit 0b574571952a206904440faf8601ddf95ab42b24 Author: Dale Wahl Date: Thu Jul 11 16:59:56 2024 +0200 image_wall: backup fit method commit eeb1ddeb7ca85b6802dfed3c74d1352062383d50 Merge: 2504c37b 43239467 Author: Stijn Peeters Date: Thu Jul 11 16:47:45 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 43239467db046eea5eb5268f91d1b63a1042238d Author: Dale Wahl Date: Thu Jul 11 12:08:08 2024 +0200 fix processor more button would only show top level analysis if not logged in commit d6ab2b0783f8e40ecd8fadbc2abccffa6f093e39 Author: Dale Wahl Date: Tue Jul 9 15:35:25 2024 +0200 search_gab - use MappedItem commit 2504c37b67ff6f19720b44d8bb6054b1c3d5a155 Author: Stijn Peeters Date: Sat Jul 6 17:51:22 2024 +0200 Fix multiline spacing in multi select list commit fea66ce38be0717da6c1f847e7124f7069c096e2 Author: Dale Wahl Date: Fri Jul 5 13:15:45 2024 +0200 use processor media_type if dataset does not have media_type; set default media_type for downloaders commit d41fa34514e8177efdac7e64a31f2ee75c7d1652 Author: Dale Wahl Date: Fri Jul 5 12:57:18 2024 +0200 video_hasher: handle no metadata file commit 2820dcecc36ed4705a2776064d387ff7ed14e84f Author: Dale Wahl Date: Fri Jul 5 12:50:09 2024 +0200 num_rows not num_items() commit fb09162db902fa22fdf2d7a3ed171ce1489bd92f Author: Dale Wahl Date: Fri Jul 5 12:44:03 2024 +0200 Google vision API returning 400s; properly log and record processed entries; google networks should not run on empty datasets commit ebf39d8262d199895aedc4f7fa275c5685e58563 Author: Dale Wahl Date: Fri Jul 5 12:28:13 2024 +0200 fix image_category_wall whoops, cleared categories and post_values after filling them! 
commit 1ad9ec2c2e76604793ec37584c051f116af2fdab Author: Stijn Peeters Date: Fri Jul 5 12:03:54 2024 +0200 fsdfdsgd sorry commit c7254c08a477c6cdc8497507e8452c3eff7101c9 Author: Stijn Peeters Date: Fri Jul 5 12:01:21 2024 +0200 Fix razdel versioning commit b9a327abe99f2d9ede4f2747f34f20d1dc6803cb Author: Stijn Peeters Date: Fri Jul 5 11:57:47 2024 +0200 Reorganise tokeniser, stopwords commit fb13bc483af9ba0d677ee35fd045bf36ab1cddf7 Merge: 0b745692 e3046496 Author: Stijn Peeters Date: Fri Jul 5 11:56:08 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit e30464964262870c54c73f65a3bce630d6576f45 Author: Dale Wahl Date: Fri Jul 5 10:51:53 2024 +0200 media_upload allow setting for max_form_part and warn users of failure above certain number of files commit e4f982b4550b352a5d1a131abd78d52e6c196e48 Author: Dale Wahl Date: Fri Jul 5 09:50:49 2024 +0200 Update media_import help text; looks like failure happens somewhere between 600-1000 files due to Flask request size limits commit 0b74569280f8f87376a964a6b160ea1993cb3354 Author: Stijn Peeters Date: Thu Jul 4 17:55:36 2024 +0200 Add razdel as option for Russian tokenisation commit 9f15a2b8d666c3b6fddeb151b7c424cb44df18a6 Author: Dale Wahl Date: Thu Jul 4 17:13:15 2024 +0200 remove the log commit ffcb6a4239075ba190fb534b25b89507e09e5f56 Author: Dale Wahl Date: Thu Jul 4 17:12:43 2024 +0200 Inform user if too many files are uploaded I do not understand why this is appearing. app.config['MAX_CONTENT_LENGTH'] is set to None. Problem persists in Flask alone (i.e., does not appear to be Gunicorn/Nginx/Apache). commit 9cad12dd6f64a63c48d3b5b304b5c7d9d1a6ddb7 Author: Stijn Peeters Date: Thu Jul 4 15:09:42 2024 +0200 Bump version commit aad94f393de77cc9d4f578e1f5be66a3601a4c90 Author: Dale Wahl Date: Thu Jul 4 10:51:01 2024 +0200 Update setup.py to ensure videohash updates commit d9154a6f9c46a5c793909b88da751bc71d6f759f Author: Dale Wahl Date: Tue Jul 2 17:45:26 2024 +0200 clip: categorizing requires categories... seriously, guys? commit 0af9a5ec49bd2bcfbb87bda33976c65683f68777 Author: Dale Wahl Date: Tue Jul 2 17:31:49 2024 +0200 blip2: fix no metadata file found (uploads...) commit d695053f440bd938a57f06adea7b9c732ecf30d7 Author: Dale Wahl Date: Tue Jul 2 17:25:26 2024 +0200 cat_vis_wall - use str as category type if mixed i.e., use floats as string categories commit bcb914076760ea1fb0e277cdcd1782ffa101b535 Author: Sal Hagen Date: Tue Jul 2 16:06:43 2024 +0200 Add Twitter author profile pic and banner URLs commit 1b3b02f826578e8f702ea84a27c8ced7b1fab345 Author: Dale Wahl Date: Tue Jul 2 11:42:50 2024 +0200 add migrate.py log file in Docker commit 2aaa972e6888743fc329d721c37fa626cf2eeae3 Author: Dale Wahl Date: Tue Jul 2 11:42:22 2024 +0200 add necessary pip packages for upgrade in Docker environment; add error logging and save to file for trouble shooting commit 18b8a53c01b334e0f70610b1305d380b25dbe9c6 Author: Dale Wahl Date: Tue Jul 2 11:41:36 2024 +0200 update Dockerfile to keep build environment useful for interactive upgrade commit 7b224b9b798c9aaf956b5b618b98d742c4a2e7cd Author: Dale Wahl Date: Tue Jul 2 11:41:12 2024 +0200 remove docker-compose.yml versions commit acf5de0ed02e144b920a80abfdfa35986dd0ed4c Author: Stijn Peeters Date: Mon Jul 1 17:38:32 2024 +0200 Better issues.md, footer link commit 1953ca3895656ca9a12d2657e58019795ae64b3a Author: Dale Wahl Date: Mon Jul 1 12:00:07 2024 +0200 FIX: get_key() is more of a creating of a key then general getting of a key... 
commit 12289bb5c766d1af23799ff11278b46b48fc2841 Author: Dale Wahl Date: Mon Jul 1 11:37:06 2024 +0200 .metadata.json may not have top_parent via Media Uploader This may exist in other processors if a proper check is not in place; will need to review commit 25f4ed65ec2c32298a90490cf51037a7ea2d0bf9 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Tue Jun 25 14:43:40 2024 +0200 Media upload datasource! (#419) * basic changes to allow files box * basic imports, yay! * video_scene_timelines to work on video imports! * add is_compatible_with checks to processors that cannot run on new media top_datasets * more is_compatible fixes * necessary function for checking media_types * enable more processors on media datasets * consolidate user_input file type * detect mimetype from filename best I can do without downloading all the files first. * handle zip archives; allow log and metadata files * do not count metadata or log files in num_files * move machine learning processors so they can be imported elsewhere * audio_to_text datasource * When validating zip file uploads, send list of file attributes instead of the first 128K of the zip file * Check type of files in zip when uploading media * Skip useless files when uploading media as zip * check multiple zip types in JS * js !=== python * fix media_type for loose file imports; fix extension for audio_to_text preset; fix merge for some processors w/ media_type --------- Co-authored-by: Stijn Peeters commit 4ce689bdc3e441a7adf85883ddcda6bae0525ed9 Author: Stijn Peeters Date: Mon Jun 24 11:58:50 2024 +0200 Avoid KeyError commit 155522d0817d19ac7b6b0b0164242156d6f7443a Author: Dale Wahl Date: Thu Jun 20 15:58:21 2024 +0200 add generated images to image wall w/ text visual commit eecde519eab1208eeb6ee53c2d8febff7fb8febf Author: Dale Wahl Date: Thu Jun 20 15:57:56 2024 +0200 allow users to NOT generate all images from prompts commit d0b9574093a109997e63b1062b2bdd8e71300a29 Author: Stijn Peeters Date: Wed Jun 19 16:28:26 2024 +0200 ...don't mangle URLs in preview links commit c105e368a521ec54ae717bb9eb2fe9fae66cf6e8 Merge: 0028a999 8d4f99b2 Author: Dale Wahl Date: Wed Jun 19 16:25:36 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 0028a9994d698611dd8b546b9b3bccbeec30b74f Author: Dale Wahl Date: Wed Jun 19 16:25:12 2024 +0200 add followups to processors commit 8d4f99b22e0308606c7f713ef704dfa939e85247 Author: Stijn Peeters Date: Wed Jun 19 16:17:22 2024 +0200 More flexible URL linking in CSV preview commit f4f8e6621bd6f2504dc3afc2078280bf5edb6444 Author: Dale Wahl Date: Wed Jun 19 13:54:00 2024 +0200 tokeniser fix: use default lang for word_tokenize if language is 'other' commit 127472e91d8e510f3de2a9cc4a87be6cf2d0deaa Author: Stijn Peeters Date: Tue Jun 18 16:45:01 2024 +0200 Better log messages for Telegram data source commit e8714b6fba72e00c690a8d643d8dc54d2250c94a Author: Stijn Peeters Date: Mon Jun 17 17:42:21 2024 +0200 Add 'crawl' feature to Telegram data source Fixes #321 (though might need a bit more testing) commit 25fded7b596097f7916e1793f1841bae2b63d453 Merge: d67cf440 b10e3bb8 Author: sal-phd-desktop Date: Fri Jun 14 16:23:02 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit d67cf440730ea1d4e124c76a4c21d65b56f39c68 Author: sal-phd-desktop Date: Fri Jun 14 16:22:59 2024 +0200 Fix export 4chan script and remove some unecessary code commit b10e3bb8f0c8a67aa5fdbba1962301d8acdf625c Author: Dale Wahl Date: Thu Jun 13 15:14:06 2024 +0200 
video_hasher prefix: fix extension type commit ba565cdaa2ebeecf23fd60889d546c76b9ea5eb1 Author: Dale Wahl Date: Thu Jun 13 14:53:13 2024 +0200 video_hasher: fix to work with Pillow updates; add max amount videos commit 90da5d231eff6a4249bef5468fcdbf1ebcf9247a Author: Dale Wahl Date: Thu Jun 13 10:25:24 2024 +0200 image_cat_wall fix the fix commit a8b943d8e2c5471f82ea0442e2659d84fe8d9760 Author: Dale Wahl Date: Wed Jun 12 13:29:41 2024 +0200 add OCR processor to image w/ text visualization commit e7e636b6b89b6163fa6976e67edba68e7d75b7ac Author: Dale Wahl Date: Tue Jun 11 15:23:12 2024 +0200 add image_wall_w_text to follow on BLIP captions commit f74b97827f0465baf8483040471a77e4654e70b1 Author: Dale Wahl Date: Thu Jun 6 11:05:25 2024 +0200 image_category_wall: allow multiple images per item/post commit e3c9ea57d46b32ba47b00a6047a278ddd530adc1 Author: Dale Wahl Date: Thu May 30 16:27:50 2024 +0200 image_category_wall convert None to str for category commit 00874576c354235f4655f1d433ec4382010e18e3 Author: Dale Wahl Date: Thu May 30 14:54:51 2024 +0200 image_category_wall fix float categories commit e0c55a8ae132bedef5da27ecbbb9489a094d454c Author: Dale Wahl Date: Thu May 30 12:51:42 2024 +0200 download_images fix divide by zero when user can download all commit 3580fc9450501262badb8e61ef4b4df4b4c54322 Author: Dale Wahl Date: Thu May 30 12:51:24 2024 +0200 image_category_wall remove 'max' when user can use all images commit f2145bdeff1d68e46cdd3521ecbb61573f01a2f2 Author: Dale Wahl Date: Wed May 29 17:59:23 2024 +0200 rank_attributes: option to count missing data or blanks commit 01e7ab9677a75181bbedc62fa00e636ce2b17c18 Author: Dale Wahl Date: Wed May 29 16:53:57 2024 +0200 fix missing field strategy so default_stategy not overwritten on second loop default_stategy would be set to correctly to the callable, but overwritten on second loop (and map_missing is a dictionary at that point). commit 097f838af1f5f2748578dd9072eb9e3a8b3a7057 Author: Dale Wahl Date: Tue May 28 12:16:08 2024 +0200 add log_level arg to 4cat-daemon.py I've been using this forever and don't know why I haven't commited it commit fd3ac238e60f052889d99c71588170570a384900 Author: Dale Wahl Date: Tue May 28 10:10:56 2024 +0200 google & clarifai to csv had identical "type" possibly caused issue w/ preset commit 1b9965d40aa33035a73f685c13a1ab50cc877f78 Author: Stijn Peeters Date: Mon May 27 15:54:20 2024 +0200 Ensure file cleanup worker always exists commit 0e0917f2232e240df3412fd4df51cf0be19248b5 Author: Stijn Peeters Date: Thu May 23 17:36:22 2024 +0200 Also update Spacy model versions... commit f40128213529d154cfb77afa7aa67a72d5bb640f Author: Stijn Peeters Date: Thu May 23 17:32:35 2024 +0200 *Actually* remove typing_extensions dependency ??? 
commit ba3d83b824c5fb6fcb0aec5e1c36b35070d6e5d9 Author: Stijn Peeters Date: Thu May 23 17:30:08 2024 +0200 Update minimum Pillow dependency version commit 1c3485648bf2a911052eeeae4f293f303a944aec Author: Stijn Peeters Date: Thu May 23 17:27:27 2024 +0200 Do not require typing_extensions explicitly This was required to ensure Spacy could load - looks like Spacy has since been updated to work with newer versions of typing_extensions as well commit 3828de83ba123254463a904392f24daec626c136 Author: Stijn Peeters Date: Thu May 23 17:02:04 2024 +0200 Bump version commit 8f0d098107a4bbc9d55cc6048f7a38f1d1891a32 Author: Stijn Peeters Date: Thu May 23 17:01:28 2024 +0200 Require non-broken version of emoji library commit 4b2ad805fcc99a83e46732fc991d98d78ef06c6c Author: Stijn Peeters Date: Thu May 23 13:11:03 2024 +0200 Show worker progress in control panel if available commit 9144d4503f46108437616d6bc0cf4fde74df3aca Author: Stijn Peeters Date: Thu May 23 11:07:41 2024 +0200 Bump version commit 807ab77101d197ec897640480a2140439d570c05 Author: Stijn Peeters Date: Wed May 22 21:57:11 2024 +0200 Fix Instagram upload with missing media URL commit d0b4840fd465b6d21657c3d50f9291ac911b6082 Author: Stijn Peeters Date: Wed May 22 17:35:04 2024 +0200 Comma comma comma commit 7fd2e14c9505d0ed1ac77dc09c24f766ea61ee6c Author: Stijn Peeters Date: Wed May 22 17:25:26 2024 +0200 Fix progress indicator for scene extractor commit 661c42c2d083da7004335b0e14910935c3d392f6 Author: Stijn Peeters Date: Wed May 22 17:12:21 2024 +0200 Don't crash video hasher non non-str item IDs commit 1f280321cdde27a9909885fa2f64dbeffa549fb1 Author: Stijn Peeters Date: Wed May 22 17:09:53 2024 +0200 Do not crash timelines processor when metadata has unexpected format commit 572d03f1f368f0ad5f47e705a119b37646148d1d Author: Stijn Peeters Date: Wed May 22 17:09:30 2024 +0200 More efficient video frame extractor commit 1b51d224ca544d7e2913238adbff2049412bc41e Author: Stijn Peeters Date: Wed May 22 17:04:27 2024 +0200 Fix crash in video stack processor with ffmpeg < 5.1 commit ddc73cb2e2f0985e64f84ca86bc167fa9e9dc81a Author: Stijn Peeters Date: Wed May 22 17:03:48 2024 +0200 Helper function for determining ffmpeg version commit ef9dd482b2258c428584997dc661156f63f68b91 Author: Stijn Peeters Date: Wed May 22 12:14:58 2024 +0200 Allow absence of articleComponent in LinkedIn posts commit 060f2cd7f922e7fae337b0697f7c477442d21ef1 Author: Stijn Peeters Date: Wed May 22 12:12:54 2024 +0200 Cast post IDs to string when mapping video scenes commit ab34c415c9ada23763b45676639ce3e80a34f594 Author: Stijn Peeters Date: Wed May 22 11:46:39 2024 +0200 Twitter -> X/Twitter commit de6d97554ccb68375979e5ff09c7e65d8d70a6cd Author: Stijn Peeters Date: Wed May 22 11:45:19 2024 +0200 Colleges -> Collages commit 30365580dc59b4d95e8a62d1b3c666bef60ce7e8 Author: Stijn Peeters Date: Tue May 21 15:41:55 2024 +0200 Explicit disconnect after Telegram image download commit 5727ff7230db42463a824f45d63f0b8343caac14 Author: Stijn Peeters Date: Tue May 21 14:05:50 2024 +0200 Catch TimedOutError while downloading Telegram images commit e0e06686e78976f971aac620267d7e009eaaadff Author: Sal Hagen Date: Mon May 13 13:01:42 2024 +0200 Typo in LinkedIn search commit 51e58dde6ca21278a80f252a8c22dc83d87ace1f Author: Dale Wahl Date: Tue May 7 13:10:43 2024 +0200 text_from_image: fix metadata missing (indent issue) commit c1f8ecc1674375bba2b2e38cb29c9d4d44098f0a Author: Dale Wahl Date: Tue May 7 09:45:25 2024 +0200 text_from_image fix: ensure metadata success before attempting to update 
original commit 72dbf80db71499c59133e1128205b756d240b300 Merge: d7561625 baacc86b Author: Stijn Peeters Date: Fri May 3 13:14:08 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit d7561625b127573fbb0332fbb713be6a3cb3d953 Author: Stijn Peeters Date: Fri May 3 13:14:03 2024 +0200 Comments without replies don't always have reply_comment_total commit baacc86b269612b4b0956345f8b9fa902df1b61f Author: Dale Wahl Date: Fri May 3 12:01:22 2024 +0200 DSM fix and simplify GPU mem check commit 9b662e9f9b4f4ce194608c8e20a8fc50bc6d9ae3 Author: Parker-Kasiewicz <110084850+Parker-Kasiewicz@users.noreply.github.com> Date: Thu May 2 00:53:45 2024 -0700 Adding Gab as a Data Source! (#401) * Can successfully import gab data, although can't tell if formatting is right becuase waiting on queued requests. * Version w/ different item types * Ingest Gab posts from Zeeschuimer * Small fix for merge conflicts (whoops) * Gab processing logic transferred from Zeeschuimer * fixing small errors for Gab data source * basic processing for truth social from Zeeschuimer --------- Co-authored-by: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> commit 3ecb8fd9c27aee4c457f03516794c6c4eac19c09 Author: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> Date: Wed May 1 17:51:36 2024 +0200 Fix duplicate line in views_admin.py commit 8b66ae7e467913f8e7571cf4b45493f63804266f Author: Stijn Peeters Date: Wed May 1 17:49:54 2024 +0200 Allow processors to define which fields should be pseudonymised commit c973750c8cabb8698704c5997903e92d1de866d2 Author: Stijn Peeters Date: Wed May 1 17:15:32 2024 +0200 Allow auto-queue of pseudonymisation after import commit 49ad9f0ff785fd44ae494755b785c7fdf7c9cf15 Author: Stijn Peeters Date: Wed May 1 17:08:35 2024 +0200 Get rid of redundant and buggy next/copy_to implementation in Search class commit 106d3659e2fda89867d3a4f587c1c1addfaff2f7 Author: Dale Wahl Date: Wed May 1 16:14:03 2024 +0200 use current branch in settings commit 60bef4157d807f7c01ef3b425295244e91919f31 Author: Stijn Peeters Date: Wed May 1 11:04:07 2024 +0200 Nicer code commit 4182c436e4fb5109c5e041dc729f77a58d877889 Author: Stijn Peeters Date: Tue Apr 30 16:19:36 2024 +0200 Always shut down API worker only after everything else has been shut down commit e685108b3cbe5f005ce2df21906267071ad8118e Author: Stijn Peeters Date: Tue Apr 30 16:12:42 2024 +0200 Properly interrupt expiration worker when asked commit 27a568eca7f2f3742223fef6285eaf80583e0fc4 Author: Stijn Peeters Date: Tue Apr 30 13:40:50 2024 +0200 Allow floats-as-strings as timestamps when importing CSV commit 2d2bbb9fdb9b426b8f4a80782f04257721a97f2e Author: Dale Wahl Date: Tue Apr 30 13:05:07 2024 +0200 douyin: add consistency to map_item stats commit 289aa342c9912aceeca35887c079c72aa6ffbf52 Author: Dale Wahl Date: Mon Apr 29 15:26:38 2024 +0200 fix collection data in Douyin to handle $undefined commit 5b9b23fb1696bc1b69e1d902c0a2ad4b7d168984 Author: Dale Wahl Date: Mon Apr 29 13:00:03 2024 +0200 add scipy requirement to make compatible with gensim https://stackoverflow.com/questions/78279136/importerror-cannot-import-name-triu-from-scipy-linalg-gensim commit 7eab746e944f1ababe3dcd6a5d25387a64c2237d Author: Stijn Peeters Date: Mon Apr 29 12:00:09 2024 +0200 stupid, stupid, stupid commit 90577982ac05019a7ac76818a62f91e84dd65902 Author: Stijn Peeters Date: Mon Apr 29 11:56:22 2024 +0200 Fix leftover iterate_mapped_items commit 57dbdf74c49c34c05784debb9f7e258da7ae7d54 Author: Stijn Peeters Date: Fri Apr 26 15:26:39 2024 
+0200 Woops commit f11760d2c13e817e23cfa5e26b24f74cf817f65e Author: Stijn Peeters Date: Fri Apr 26 15:26:04 2024 +0200 Update list of supported platforms in readme commit 760ff1cdeb006f70acaa00ded82fb3cbc7617c9d Author: Stijn Peeters Date: Fri Apr 26 12:13:28 2024 +0200 Bump version commit 1fd78b2362840299e80f5540c9fedc1be3b06da1 Author: Stijn Peeters Date: Thu Apr 25 12:58:24 2024 +0200 Use MissingMappedField for Douyin fields undefined in the source data commit 6918baeabc7a08b6a63495c5d38c86b2c88bca44 Author: Stijn Peeters Date: Thu Apr 25 12:31:11 2024 +0200 Fix Douyin mapping failure if cellRoom is $undefined commit aad6208167c07686348234daff4dcf9cd036f5a5 Author: Stijn Peeters Date: Thu Apr 25 12:30:53 2024 +0200 Better error when trying to import data for unknown datasource commit 43c6ed646994111188bde66d5bcfe4ab602e8512 Author: Stijn Peeters Date: Thu Apr 25 12:30:31 2024 +0200 Fix Twitter mapping on URLs that cannot be expanded commit 91c3da176fad90ba16871fa8892fac5a0df13785 Author: Stijn Peeters Date: Thu Apr 25 12:12:54 2024 +0200 Safe cast to int in CrowdTangle import commit 765f29e9232afdf284ab1667b0f371951e0bf2f4 Author: Stijn Peeters Date: Wed Apr 24 12:37:02 2024 +0200 Fix erroneous shell command in front-end restart trigger commit c99fdd9eca8f5925d93375cac846e8b7633194fb Merge: 342a4037 bc1deddf Author: Stijn Peeters Date: Tue Apr 23 12:29:35 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 342a4037411e7ccaa50b25a4686434bec39e2568 Author: Stijn Peeters Date: Tue Apr 23 12:29:32 2024 +0200 Enable TikTok comment and Gab import by default commit bc1deddf57aa5049fb79622c4309fb7051d77bdb Merge: 537d7645 3c644f01 Author: Dale Wahl Date: Tue Apr 23 12:16:37 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 537d76456e2826e8c4dd7026ec5b2d436370fad8 Author: Dale Wahl Date: Tue Apr 23 12:14:46 2024 +0200 do the todo: fix column_filter to match exact/contains with int commit 3c644f01baeca34e712d36efdf5c77ccd3ef7a06 Author: Stijn Peeters Date: Tue Apr 23 11:16:07 2024 +0200 Don't crash on empty URLs in dataset merge commit f1574c26e2e3bdc40cc04bb8193cf6d3fa14792b Author: Dale Wahl Date: Thu Apr 18 12:08:55 2024 +0200 fix: do not fail when no processor exists weird! failed on a dataset `type="custom-search"` which was created by an import script w/ no processor. Also likely would make deprecated processors fail. 
500 server error: ``` File "/opt/4cat/common/lib/dataset.py", line 800, in get_columns return self.get_item_keys(processor=self.get_own_processor()) File "/opt/4cat/common/lib/dataset.py", line 405, in get_item_keys keys = list(items.__next__().keys()) File "/opt/4cat/common/lib/dataset.py", line 337, in iterate_items if own_processor.map_item_method_available(dataset=self): AttributeError: 'NoneType' object has no attribute 'map_item_method_available' ``` commit 50a4434a37d71af6a9470c7fc4a236b043cbfb4d Author: Stijn Peeters Date: Wed Apr 17 14:30:58 2024 +0200 Add "TikTok comments" data source commit c43e76daae3c2e6ecdb218ee749315b985eccca4 Author: Stijn Peeters Date: Tue Apr 16 17:59:25 2024 +0200 Allow notifications per tag commit 36984104e674e8577756bfc3fdd5c72f6569d9e1 Author: Dale Wahl Date: Tue Apr 16 17:25:38 2024 +0200 fix: pass dataset to get_options when queuing processors commit 59cb19a3c88f7f4a4ac02d0b7a891afde50ea069 Author: Dale Wahl Date: Tue Apr 16 10:55:29 2024 +0200 fix: dicts are shared in classes & you cannot delete a key more than once randomly found this; probably as no one else has reddit enabled! commit 3ec9c6ea471bcdbe9fb1caad1e5fe1502a705444 Author: Dale Wahl Date: Mon Apr 15 13:22:19 2024 +0200 fix results page error when dataset was being created; do not check for resultspage updates when user not focused on page commit db05ae5e565248e865e67b8ea60e6653357bb1f4 Author: Dale Wahl Date: Mon Apr 15 11:27:33 2024 +0200 on import file, differentiate between missing field(s) and unable to map item commit 940bac72c7e53bec9e136867c13e2a0a355961a4 Author: Stijn Peeters Date: Fri Apr 12 12:57:48 2024 +0200 Case-insensitive username/note matching in user list commit d0f34245bd07b5ad2fd3e90754ef0264ffc350a9 Author: Stijn Peeters Date: Fri Apr 12 12:29:12 2024 +0200 Only determine settings tab name in one place commit 9f69d7bc0bbb657be1e725d5fb3fe350b7205bff Author: Stijn Peeters Date: Fri Apr 12 12:20:34 2024 +0200 git != github commit 9b4981d8c7358f31ed65d9f161d556e578389801 Author: Stijn Peeters Date: Fri Apr 12 11:56:04 2024 +0200 Fix issues with user tags Fix number of users in tag overview; allow filtering by user tags on user list; don't delete all user tags when deleting one commit 9e8ccd3a78765acdfd2005eaa215dc0dc07266e0 Author: Stijn Peeters Date: Fri Apr 12 11:32:45 2024 +0200 Do not hide all non-hidden child processors lol commit 3f15410af3a278f5644f41f49e25498a1fac3c76 Author: Stijn Peeters Date: Fri Apr 12 11:23:52 2024 +0200 Disable standard video downloader for Telegram commit 94c814b9cab2ae2be10d5c5d3f6cfe20898e349c Author: Stijn Peeters Date: Fri Apr 12 11:14:16 2024 +0200 Telegram video downloader processor commit d36254a188947fff507e8df59f793e98b3be1570 Author: Stijn Peeters Date: Fri Apr 12 11:14:04 2024 +0200 Better styling for 4CAT settings, alphabetic order, submenus commit 808300fa109f306a921f2048b2cf4b6dafc4ba5f Author: Stijn Peeters Date: Thu Apr 11 14:44:32 2024 +0200 Fix multiselect in UI commit 131a0eca0ad514b1ee57803e5c560ab0e56de42d Author: Stijn Peeters Date: Mon Apr 8 18:28:04 2024 +0200 Do not attempt to load crashed file as module in Slack webhook. 
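The traceback quoted above comes from `DataSet.get_own_processor()` returning `None` for dataset types that no longer have a registered processor (datasets created by import scripts, or deprecated processors). A minimal sketch of the defensive check that avoids the AttributeError; `iterate_raw_items` is a hypothetical stand-in, not an actual 4CAT method:

```
# Hypothetical sketch: only consult the processor's mapping logic when a
# processor class actually exists for this dataset type.
def iterate_items(dataset):
    own_processor = dataset.get_own_processor()

    if own_processor is not None and own_processor.map_item_method_available(dataset=dataset):
        # the processor knows how to map raw items to flat dictionaries
        yield from dataset.iterate_mapped_items()
    else:
        # no processor available (import script, deprecated type):
        # yield items as they appear in the result file
        yield from dataset.iterate_raw_items()
```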
Fixes #422 (hopefully) commit 6d8cb067bc12f8be68749f74a7291e0849494225 Author: Stijn Peeters Date: Fri Apr 5 19:43:58 2024 +0200 Allow comma-separated list when adding new dataset owners commit 2612aea49f63c37ac691cc89c553c764ead2344f Author: Stijn Peeters Date: Fri Apr 5 19:40:04 2024 +0200 Include number of users with tag on tag page commit 39f2ec40faa3b8493bd5525279aeaeb2e4f586e0 Author: Stijn Peeters Date: Fri Apr 5 19:26:02 2024 +0200 Fix confirmation before deleting user tag commit b00a410a3441e7f2a9d73a9f2dfb0f4ef70ea8a5 Author: Stijn Peeters Date: Fri Apr 5 19:25:01 2024 +0200 Add link to users with tag on tag admin page commit 3ef3e5ec9adbd8ddd128ce2b3f8fa3b1de1297e3 Author: Stijn Peeters Date: Fri Apr 5 18:49:25 2024 +0200 Give filtered datasets a more sensible label, based on source dataset commit 0d5870b78fb73cb58231736cc8a2efbb0b3cd88a Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Fri Apr 5 17:40:57 2024 +0200 update iterate methods (#418) * working to make iterate_mapped_item primary method used by processors and elsewhere in 4CAT; iterate_item method only internally (and provide item directly as is from file) with iterate_mapped_object as intermediate method to use map_missing method and handle missing values as well as warn if needed * switch from iterate_items to iterate_mapped_items; careful attention to item_to_yield allowing a choice of the original item, the mapped item, or both * revert some unecessary renaming * fix annotations bug... this fixes the bug, but i noticed that the notations saved in the database do not have the correct post IDs. * Introduce DatasetItem class and simplify iterate_items * Don't crash when no item mapper * ...actually commit the DatasetItem class * Fix typos in comment --------- Co-authored-by: Stijn Peeters Co-authored-by: Sal Hagen commit 17b77351c51ace21b7057276bbae9da2643a3fc4 Author: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> Date: Fri Apr 5 16:20:19 2024 +0200 Allow dynamic form options in processors (#397) * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * Wider support for file upload in processors * Log file uploads in DMI service manager * fix error w/ datasources having file option * fix fourcat.js use of checkboxes for dynamic settings * Fix faulty toggleButton targeting --------- Co-authored-by: Dale Wahl commit 693fcedc93ee4476a60d0e0876e688f82a8526fa Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Fri Apr 5 15:59:10 2024 +0200 Add method to processors to toggle display in UI (#411) * add ui_only parameter to DataSet.get_available_processors() and BasicProcessor.display_in_ui() Allow using `display_in_ui` to hide processors from UI but allow them to be queued either via API or presets. This avoids issue of is_compatible_with() having to be used to hide processors with sometimes ill effects. * keep same data structure.... * don't delete twice; it's redundant... 
and raises an error * Rename arguments/properties * Exclude hidden processors in top level view * fix logic * Exclude in child template as well --------- Co-authored-by: Stijn Peeters commit 3cd146c2908da6b3a06a0c1511bf042c4223af0f Author: Dale Wahl Date: Thu Apr 4 16:41:39 2024 +0200 fix: whoops remove debug commit daa7291e813e62fed4600a4acb8430004836cb86 Author: Dale Wahl Date: Thu Apr 4 15:16:30 2024 +0200 CSV preview add hyperlinks if "url" or "link" in column header commit 5f2d6e65bad4f71b2c3cc75d2cdab76f15671d4c Author: Dale Wahl Date: Thu Apr 4 15:16:01 2024 +0200 blip2 processor to work w/ DMI Service Manager commit fe881dec18778d99ac4a0f60ca40a1f43fdb1689 Author: Dale Wahl Date: Thu Apr 4 09:53:30 2024 +0200 catch AttributeError on slackhook if unable to read file ever vigilant against a lack of flavour... commit 2808256b1fabf2e6e8a5a94aad98af60c50fb7b0 Merge: 14123847 eb474640 Author: Dale Wahl Date: Wed Apr 3 17:28:40 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 14123847b5852bf0e7c84fced6c2380165ec93f6 Author: Dale Wahl Date: Wed Apr 3 17:28:38 2024 +0200 staging_areas should not be made for completed datasets (else they may be deleted prematurely) commit eb474640559ee3e914d9c95adb60be09b906f1d6 Merge: bbdf2ab9 3f8b285c Author: sal-phd-desktop Date: Wed Apr 3 16:50:54 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit bbdf2ab9b4292c14911ac01b481c829defa85e5c Author: sal-phd-desktop Date: Wed Apr 3 16:50:36 2024 +0200 Helper script to export the 'classic' 4CAT 4chan data commit 3f8b285c44c33a3ce08e885889b311bc454a70ea Merge: 8f40f3f5 f7cc5b8d Author: Sal Hagen Date: Wed Apr 3 12:12:17 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 8f40f3f5222a63e93f46eb3b57791d10060a0cc8 Author: Sal Hagen Date: Wed Apr 3 12:12:13 2024 +0200 Tumblr search typo commit f7cc5b8d012dec3d8e0c8847ae16c662e82040b5 Author: Stijn Peeters Date: Tue Apr 2 12:32:51 2024 +0200 More/less flavour in restart worker commit 073587efc581adca0608988573ac83ea8b0c93d0 Author: Dale Wahl Date: Wed Mar 27 14:15:27 2024 +0100 create favicon.ico (remove from repo) be sure to keep webtool/static/img/favicon/favicon-bw.ico as basis commit 28d733d56204231f4089660ff61282174aac7aed Author: Dale Wahl Date: Wed Mar 27 09:44:45 2024 +0100 add allow_access_request check to request-password page clicking it would only return the user to the login page anyway, but better not even show it commit 1f2cb77e3cb0fc9b5403da52aaa925b33089d18f Author: Dale Wahl Date: Wed Mar 27 09:37:51 2024 +0100 fix can_request_access to use 4cat.allow_access_request option commit 0d66f11d3619af798d5acc41dbf4fe118b7ddad8 Merge: 25825383 05b3fc07 Author: Stijn Peeters Date: Tue Mar 26 17:54:48 2024 +0100 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 2582538303e31470ed6bf8a01645f7b45af15e5d Author: Stijn Peeters Date: Tue Mar 26 17:54:45 2024 +0100 More permissive timeout for pixplot commit 05b3fc0771ded10dc55db799e8f47e42add08d43 Author: Dale Wahl Date: Tue Mar 26 14:01:59 2024 +0100 remove redundant call of Path commit e4a93442efb84d73d6a4c9af9bc46a8f3e3fdda2 Author: Stijn Peeters Date: Tue Mar 26 11:52:09 2024 +0100 Include column with link description in Telegram mapping commit 876f4a4b6df51ec4b30a048c32191438b6778f90 Author: Dale Wahl Date: Mon Mar 25 14:48:47 2024 +0100 douyin handle image posts commit 81ad61baabaf965b1c848f55a80c23bd3e1a9000 Author: Stijn Peeters Date: Mon Mar 25 08:01:44 
2024 +0100 Accept non-numeric IDs in Telegram image downloader commit a8b36dc5682df7c16e25474ea8fdbfc4f12f9d46 Author: Stijn Peeters Date: Sun Mar 24 23:15:51 2024 +0100 Ensure unique IDs for Telegram datasets commit 4a3e9ffee072c4d3efb7bfd8744369b46f19eef2 Merge: 0c119130 d749237e Author: Stijn Peeters Date: Sun Mar 24 22:56:59 2024 +0100 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 0c11913049aabb5a83ffe26d58bdf17affdbc0b9 Author: Stijn Peeters Date: Sun Mar 24 20:09:10 2024 +0100 Better string formatting in Telegram image downloader commit 8a7da5317defdafb5bdbf74dcbeb68e464fa21f4 Author: Stijn Peeters Date: Sun Mar 24 20:06:06 2024 +0100 Add 'link thumbnails' op… --------- Co-authored-by: Stijn Peeters --- datasources/telegram/search_telegram.py | 85 ++++++++++++++++--------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 55c3a61b7..2028772b9 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -234,6 +234,7 @@ def get_items(self, query): self.details_cache = {} self.failures_cache = set() + #TODO: This ought to yield as we're holding everything in memory; async generator? execute_queries() also needs to be modified for this results = asyncio.run(self.execute_queries()) if not query.get("save-session"): @@ -326,9 +327,10 @@ async def execute_queries(self): except Exception as e: # catch-all so we can disconnect properly # ...should we? - self.dataset.update_status("Error scraping posts from Telegram") - self.log.error(f"Telegram scraping error: {traceback.format_exc()}") - return [] + self.dataset.update_status("Error scraping posts from Telegram; halting collection.") + self.log.error(f"Telegram scraping error (dataset {self.dataset.key}): {traceback.format_exc()}") + # May as well return what was captured, yes? + return posts finally: await client.disconnect() @@ -364,12 +366,13 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # has been mentioned. When crawling is enabled and this exceeds the # given threshold, the entity is added to the query crawl_references = {} - queried_entities = list(queries) - full_query = list(queries) + full_query = set(queries) + num_queries = len(queries) # we may not always know the 'entity username' for an entity ID, so # keep a reference map as we go entity_id_map = {} + query_id_map= {} # Collect queries # Use while instead of for so we can change queries during iteration @@ -383,17 +386,18 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): delay = 10 retries = 0 processed += 1 - self.dataset.update_progress(processed / len(full_query)) + self.dataset.update_progress(processed / num_queries) if no_additional_queries: - # Note that we are note completing this query + # Note that we are not completing this query self.dataset.update_status(f"Rate-limited by Telegram; not executing query {entity_id_map.get(query, query)}") continue while True: self.dataset.update_status(f"Retrieving messages for entity '{entity_id_map.get(query, query)}'") + entity_posts = 0 + discovered = 0 try: - entity_posts = 0 async for message in client.iter_messages(entity=query, offset_date=max_date): entity_posts += 1 total_messages += 1 @@ -413,11 +417,14 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # the channel a message was forwarded from (but that # needs extra API requests...) 
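The TODO added near the top of the hunk above notes that `execute_queries()` holds every collected post in memory before returning. A hedged sketch of the async-generator alternative it hints at, with simplified, hypothetical signatures rather than the shipped code:

```
# Hypothetical streaming variant: yield messages as they arrive instead of
# accumulating them all in one list before returning.
async def execute_queries_streaming(client, queries, max_items):
    collected = 0
    for query in queries:
        async for message in client.iter_messages(entity=query):
            yield message  # caller serializes/writes each item immediately
            collected += 1
            if collected >= max_items:
                return
```

Adopting this shape would also require `get_items()` to consume an async generator rather than the list currently returned via `asyncio.run()`, as the TODO itself points out.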
serialized_message = SearchTelegram.serialize_obj(message) - if "_chat" in serialized_message and query not in entity_id_map and serialized_message["_chat"]["id"] == query: - # once we know what a channel ID resolves to, use the username instead so it is easier to - # understand for the user - entity_id_map[query] = serialized_message["_chat"]["username"] - self.dataset.update_status(f"Fetching messages for entity '{entity_id_map[query]}' (channel ID {query})") + if "_chat" in serialized_message: + # Add query ID to check if queries have been crawled previously + full_query.add(serialized_message["_chat"]["id"]) + if query not in entity_id_map and serialized_message["_chat"]["id"] == query: + # once we know what a channel ID resolves to, use the username instead so it is easier to + # understand for the user + entity_id_map[query] = serialized_message["_chat"]["username"] + self.dataset.update_status(f"Fetching messages for entity '{entity_id_map[query]}' (channel ID {query})") if resolve_refs: serialized_message = await self.resolve_groups(client, serialized_message) @@ -427,29 +434,46 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): break # if crawling is enabled, see if we found something to add to the query - if crawl_max_depth and (not crawl_msg_threshold or depth_map.get(query) < crawl_msg_threshold): + if crawl_max_depth and (depth_map.get(query) < crawl_max_depth): message_fwd = serialized_message.get("fwd_from") fwd_from = None - if message_fwd and message_fwd["from_id"] and message_fwd["from_id"].get("_type") == "PeerChannel": - # even if we haven't resolved the ID, we can feed the numeric ID - # to Telethon! this is nice because it means we don't have to - # resolve entities to crawl iteratively - fwd_from = int(message_fwd["from_id"]["channel_id"]) + if message_fwd and message_fwd.get("from_id"): + if message_fwd["from_id"].get("_type") == "PeerChannel": + # Legacy(?) data structure (pre 2024/7/22) + # even if we haven't resolved the ID, we can feed the numeric ID + # to Telethon! this is nice because it means we don't have to + # resolve entities to crawl iteratively + fwd_from = int(message_fwd["from_id"]["channel_id"]) + elif message_fwd and message_fwd.get("from_id", {}).get('full_chat',{}): + # TODO: do we need a check here to only follow certain types of messages? this is similar to resolving, but the types do not appear the same to me + # Note: message_fwd["from_id"]["channel_id"] == message_fwd["from_id"]["full_chat"]["id"] in test cases so far + fwd_from = int(message_fwd["from_id"]["full_chat"]["id"]) + else: + self.log.warning(f"Telegram (dataset {self.dataset.key}): Unknown fwd_from data structure; unable to crawl") + + # Check if fwd_from or the resolved entity ID is already queued or has been queried + if fwd_from and fwd_from not in full_query and fwd_from not in queries: - if fwd_from and fwd_from not in queried_entities and fwd_from not in queries: # new entity discovered! 
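The hunk above replaces the single forward check with per-entity bookkeeping: a forwarded-from entity only becomes a new query once it has been referenced at least `crawl_msg_threshold` times, and never beyond `crawl_max_depth`. A simplified sketch of that bookkeeping (names mirror the diff, but the function itself is illustrative):

```
# depth_map starts with the user's original queries at depth 0
def register_reference(fwd_from, query, queries, full_query, depth_map,
                       crawl_references, crawl_msg_threshold, crawl_max_depth):
    if fwd_from in full_query or fwd_from in queries:
        return False  # already queued or already collected

    # an entity may be discovered several times; keep the shallowest depth
    depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1)
    crawl_references[fwd_from] = crawl_references.get(fwd_from, 0) + 1

    # promote to a query only once it has been referenced often enough
    if crawl_references[fwd_from] >= crawl_msg_threshold:
        queries.append(fwd_from)
        full_query.add(fwd_from)
        return True
    return False
```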
# might be discovered (before collection) multiple times, so retain lowest depth depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1) - if depth_map[query] < crawl_max_depth: - if fwd_from not in crawl_references: - crawl_references[fwd_from] = 0 - - crawl_references[fwd_from] += 1 - if crawl_references[fwd_from] >= crawl_msg_threshold and fwd_from not in queries: - queries.append(fwd_from) - full_query.append(fwd_from) - self.dataset.update_status(f"Discovered new entity {entity_id_map.get(fwd_from, fwd_from)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") - + if fwd_from not in crawl_references: + crawl_references[fwd_from] = 0 + crawl_references[fwd_from] += 1 + + # Add to queries if it has been referenced enough times + if crawl_references[fwd_from] >= crawl_msg_threshold: + queries.append(fwd_from) + full_query.add(fwd_from) + num_queries += 1 + discovered += 1 + self.dataset.update_status(f"Discovered new entity {entity_id_map.get(fwd_from, fwd_from)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") + + serialized_message["4CAT_metadata"] = { + "collected_at": datetime.now().isoformat(), # this is relevant for rather long crawls + "query": query, # possibly redundant, but we are adding non-user defined queries by crawling and may be useful to know exactly what query was used to collect an entity + "query_depth": depth_map.get(query, 0) + } yield serialized_message if entity_posts >= max_items: @@ -502,6 +526,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): delay *= 2 continue + self.dataset.log(f"Completed {entity_id_map.get(query, query)} with {entity_posts} messages (discovered {discovered} new entities)") break async def resolve_groups(self, client, message): From c67a046137d916df3bb707f2243542d289045a06 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 16:21:58 +0200 Subject: [PATCH 05/26] Telegram mapping fixes --- datasources/telegram/search_telegram.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 2028772b9..525430910 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -437,6 +437,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): if crawl_max_depth and (depth_map.get(query) < crawl_max_depth): message_fwd = serialized_message.get("fwd_from") fwd_from = None + fwd_source_type = None if message_fwd and message_fwd.get("from_id"): if message_fwd["from_id"].get("_type") == "PeerChannel": # Legacy(?) data structure (pre 2024/7/22) @@ -444,18 +445,26 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # to Telethon! this is nice because it means we don't have to # resolve entities to crawl iteratively fwd_from = int(message_fwd["from_id"]["channel_id"]) + fwd_source_type = "channel" elif message_fwd and message_fwd.get("from_id", {}).get('full_chat',{}): # TODO: do we need a check here to only follow certain types of messages? 
this is similar to resolving, but the types do not appear the same to me # Note: message_fwd["from_id"]["channel_id"] == message_fwd["from_id"]["full_chat"]["id"] in test cases so far fwd_from = int(message_fwd["from_id"]["full_chat"]["id"]) + fwd_source_type = "channel" + elif message_fwd and message_fwd.get("from_id", {}).get('full_user',{}): + # forwards can also come from users + # these can never be followed, so don't add these to the crawl, but do document them + fwd_source_type = "user" else: + print(json.dumps(message_fwd)) self.log.warning(f"Telegram (dataset {self.dataset.key}): Unknown fwd_from data structure; unable to crawl") + fwd_source_type = "unknown" # Check if fwd_from or the resolved entity ID is already queued or has been queried - if fwd_from and fwd_from not in full_query and fwd_from not in queries: - + if fwd_from and fwd_from not in full_query and fwd_from not in queries and fwd_source_type not in ("user",): # new entity discovered! # might be discovered (before collection) multiple times, so retain lowest depth + print(f"Potentially crawling {fwd_from}") depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1) if fwd_from not in crawl_references: crawl_references[fwd_from] = 0 @@ -728,6 +737,9 @@ def map_item(message): if from_data and from_data.get("from_name"): forwarded_name = message["fwd_from"]["from_name"] + if from_data and from_data.get("users") and len(from_data["users"]) > 0 and "user" not in from_data: + from_data["user"] = from_data["users"][0] + if from_data and ("user" in from_data or "chats" in from_data): # 'resolve entities' was enabled for this dataset if "user" in from_data: @@ -779,7 +791,7 @@ def map_item(message): "body": message["message"], "reply_to": message.get("reply_to_msg_id", ""), "views": message["views"] if message["views"] else "", - "forwards": message.get("forwards", MissingMappedField(0)), + # "forwards": message.get("forwards", MissingMappedField(0)), "reactions": reactions, "timestamp": datetime.fromtimestamp(message["date"]).strftime("%Y-%m-%d %H:%M:%S"), "unix_timestamp": int(message["date"]), From 36913490481910d8c68c66c8680da2e8cf1218bb Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 17:46:36 +0200 Subject: [PATCH 06/26] Cast to string when lowercasing in rank_attributes --- processors/metrics/rank_attribute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index 0e38757c6..9b90b3c7b 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -203,7 +203,7 @@ def missing_value_placeholder(data, field_name): # keep track of occurrences of found items per relevant time period for value in values: if to_lowercase: - value = value.lower() + value = str(value).lower() if rank_style == "overall" and value not in overall_top: continue From 959710ab613bd201c5cf56bb01b9e1e7d6ee84e5 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 18:36:07 +0200 Subject: [PATCH 07/26] Find Telegram crawl refs in message body --- datasources/telegram/search_telegram.py | 78 ++++++++++++++++++------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 525430910..c963e47f6 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -6,6 +6,7 @@ import hashlib import asyncio import json +import ural import time 
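The one-line rank_attribute change above guards against non-string values: mapped fields can contain integers or None, on which calling `.lower()` directly raises AttributeError. A small illustration:

```
values = ["Telegram", 12345, None, "telegram"]

# cast first, then lowercase; value.lower() alone would fail on 12345 and None
normalised = [str(value).lower() for value in values]
# -> ['telegram', '12345', 'none', 'telegram']
```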
import re @@ -24,7 +25,7 @@ FloodWaitError, ApiIdInvalidError, PhoneNumberInvalidError, RPCError from telethon.tl.functions.channels import GetFullChannelRequest from telethon.tl.functions.users import GetFullUserRequest -from telethon.tl.types import User +from telethon.tl.types import User, MessageEntityMention @@ -214,6 +215,14 @@ def get_options(cls, parent_dataset=None, user=None): "tooltip": "Entities need to be references at least this many times to be added to the query. Only " "references discovered below the max crawl depth are taken into account." } + options["crawl-via-links"] = { + "type": UserInput.OPTION_TOGGLE, + "default": False, + "help": "Extract new groups from links", + "tooltip": "Look for references to other groups in message content via t.me links and @references. " + "This is more error-prone than crawling only via forwards, but can be a way to discover " + "links that would otherwise remain undetected." + } return options @@ -358,6 +367,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): crawl_max_depth = self.parameters.get("crawl-depth", 0) crawl_msg_threshold = self.parameters.get("crawl-threshold", 10) + crawl_via_links = self.parameters.get("crawl-via-links", False) self.dataset.log(f"Max crawl depth: {crawl_max_depth}") self.dataset.log(f"Crawl threshold: {crawl_msg_threshold}") @@ -434,6 +444,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): break # if crawling is enabled, see if we found something to add to the query + linked_entities = set() if crawl_max_depth and (depth_map.get(query) < crawl_max_depth): message_fwd = serialized_message.get("fwd_from") fwd_from = None @@ -451,7 +462,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # Note: message_fwd["from_id"]["channel_id"] == message_fwd["from_id"]["full_chat"]["id"] in test cases so far fwd_from = int(message_fwd["from_id"]["full_chat"]["id"]) fwd_source_type = "channel" - elif message_fwd and message_fwd.get("from_id", {}).get('full_user',{}): + elif message_fwd and (message_fwd.get("from_id", {}).get('full_user',{}) or message_fwd.get("from_id", {}).get("_type") == "PeerUser"): # forwards can also come from users # these can never be followed, so don't add these to the crawl, but do document them fwd_source_type = "user" @@ -460,23 +471,50 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): self.log.warning(f"Telegram (dataset {self.dataset.key}): Unknown fwd_from data structure; unable to crawl") fwd_source_type = "unknown" + if fwd_from: + linked_entities.add(fwd_from) + + + if crawl_via_links: + # t.me links + all_links = ural.urls_from_text(serialized_message["message"]) + all_links = [link.split("t.me/")[1] for link in all_links if ural.get_hostname(link) == "t.me"] + for link in all_links: + if link.startswith("+"): + # invite links + continue + + entity_name = link.split("?")[0].split("#")[0] + linked_entities.add(entity_name) + + # @references + references = [r for t, r in message.get_entities_text() if type(t) is MessageEntityMention] + for reference in references: + if reference.startswith("@"): + reference = reference[1:] + + linked_entities.add(reference) + # Check if fwd_from or the resolved entity ID is already queued or has been queried - if fwd_from and fwd_from not in full_query and fwd_from not in queries and fwd_source_type not in ("user",): - # new entity discovered! 
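The link-based crawling added above derives candidate entities from two sources: t.me URLs found in the message text (via ural) and @mentions. A condensed sketch of that extraction; the `/`-splitting and length guard mirror the later "Clean link references" and "t.me without path" fixes, and the regex fallback for mentions reflects the approach used in `map_item`:

```
import re
import ural

def entities_from_text(text):
    entities = set()

    for link in ural.urls_from_text(text):
        if ural.get_hostname(link) == "t.me" and len(link.split("t.me/")) > 1:
            handle = link.split("t.me/")[1]
            if handle.startswith("+"):
                continue  # invite links cannot be crawled
            entities.add(handle.split("/")[0].split("?")[0].split("#")[0])

    # fallback for @mentions when Telethon entity objects are not available
    entities.update(re.findall(r"@([^\s\W]+)", text))
    return entities
```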
- # might be discovered (before collection) multiple times, so retain lowest depth - print(f"Potentially crawling {fwd_from}") - depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1) - if fwd_from not in crawl_references: - crawl_references[fwd_from] = 0 - crawl_references[fwd_from] += 1 - - # Add to queries if it has been referenced enough times - if crawl_references[fwd_from] >= crawl_msg_threshold: - queries.append(fwd_from) - full_query.add(fwd_from) - num_queries += 1 - discovered += 1 - self.dataset.update_status(f"Discovered new entity {entity_id_map.get(fwd_from, fwd_from)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") + for link in linked_entities: + if link not in full_query and link not in queries and fwd_source_type not in ("user",): + # new entity discovered! + # might be discovered (before collection) multiple times, so retain lowest depth + # print(f"Potentially crawling {link}") + depth_map[link] = min(depth_map.get(link, crawl_max_depth), depth_map[query] + 1) + if link not in crawl_references: + crawl_references[link] = 0 + crawl_references[link] += 1 + + # Add to queries if it has been referenced enough times + if crawl_references[link] >= crawl_msg_threshold: + queries.append(link) + full_query.add(link) + num_queries += 1 + discovered += 1 + self.dataset.update_status(f"Discovered new entity {entity_id_map.get(link, link)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") + + serialized_message["4CAT_metadata"] = { "collected_at": datetime.now().isoformat(), # this is relevant for rather long crawls @@ -1012,7 +1050,6 @@ def validate_query(query, request, user): return { "items": num_items, "query": ",".join(sanitized_items), - "board": "", # needed for web interface "api_id": query.get("api_id"), "api_hash": query.get("api_hash"), "api_phone": query.get("api_phone"), @@ -1021,7 +1058,8 @@ def validate_query(query, request, user): "min_date": min_date, "max_date": max_date, "crawl-depth": query.get("crawl-depth"), - "crawl-threshold": query.get("crawl-threshold") + "crawl-threshold": query.get("crawl-threshold"), + "crawl-via-links": query.get("crawl-via-links") } @staticmethod From 7115b6f16199c6e886212d0730b21acf5b479245 Mon Sep 17 00:00:00 2001 From: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:42:28 +0200 Subject: [PATCH 08/26] Improve github action workflow (#456) * test new github action * test backend fail * test frontend fail * update checkout action to v4 and text * test container starts but 4cat has issue * fix forced fail --- .github/workflows/docker_pr_test.yml | 36 ++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docker_pr_test.yml b/.github/workflows/docker_pr_test.yml index 3109e3f8f..7d8b8db3b 100644 --- a/.github/workflows/docker_pr_test.yml +++ b/.github/workflows/docker_pr_test.yml @@ -12,16 +12,34 @@ jobs: name: Test docker-compose up with build runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run docker compose up run: docker compose -f docker-compose_build.yml up -d - - name: Wait and check log + - name: Check backend container is running + run: | + sleep 30 + if [ "$(docker ps | grep 4cat_backend)" ]; then + echo "Docker 4cat_backend container is running..." 
+ else + echo -e "Docker 4cat_backend container is not running...\nPrinting 4cat_backend logs:\n\n$(docker container logs 4cat_backend)" + exit 1 + fi + - name: Check frontend container is running + run: | + sleep 10 + if [ "$(docker ps | grep 4cat_frontend)" ]; then + echo "Docker 4cat_frontend container is running..." + else + echo -e "Docker 4cat_frontend container is not running...\nPrinting 4cat_frontend logs:\n\n$(docker container logs 4cat_frontend)" + exit 1 + fi + - name: Check 4CAT backend log for expected INFO message run: | test_case=" INFO at api.py:65: Local API listening for requests at backend:4444" sleep 30 && var=$(docker exec 4cat_backend tail -n 1 logs/backend_4cat.log) echo "::group::Backend test" if [ "$(echo "$var" | tr "|" "\n" | sed -n '2p')" = "$test_case" ]; then - echo "Backend running as expected" + echo "4CAT backend running as expected" else echo "::error::Backend failed to start" echo "Test:$test_case" @@ -32,7 +50,11 @@ jobs: - name: Print log on failure if: failure() run: | - docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log - echo "::group::Backend logs" - cat backend_4cat.log - echo "::endgroup::" + if [ "$(docker ps | grep 4cat)" ]; then + docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log + echo "::group::Backend logs" + cat backend_4cat.log + echo "::endgroup::" + else + echo "Docker containers not running; check logs in previous steps" + fi From 040b5f427f528229471fbd38059c549f09fd8b9f Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 25 Sep 2024 14:03:13 +0200 Subject: [PATCH 09/26] Allow info boxes for processors to be as high as they want. --- webtool/static/css/dataset-page.css | 1 - 1 file changed, 1 deletion(-) diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index e257bf293..8e99832f3 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -513,7 +513,6 @@ article.result > section:first-child { .processor-option-wrap > label { display: flex; align-items: center; - max-height: 1.5em; } .processor-option-wrap > label.option-type-toggle { From 579ff64e18fbdcda39ef3c2457ab7a4f01ce3d9d Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 25 Sep 2024 15:21:12 +0200 Subject: [PATCH 10/26] Add LocationParseError exception to download images processor --- processors/visualisation/download_images.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/processors/visualisation/download_images.py b/processors/visualisation/download_images.py index c13fd0fca..8b0792e22 100644 --- a/processors/visualisation/download_images.py +++ b/processors/visualisation/download_images.py @@ -574,6 +574,10 @@ def request_get_w_error_handling(self, url, retries=0, **kwargs): else: self.dataset.log("Error: ConnectionError while trying to download image %s: %s" % (url, e)) raise FileNotFoundError() + except requests.exceptions.LocationParseError as e: + # not an valid url, just skip + self.dataset.log("Error: LocationParseError while trying to download image %s: %s" % (url, e)) + raise FileNotFoundError() except requests.exceptions.InvalidSchema: # not an http url, just skip raise FileNotFoundError() From bfaf23b1065f068276e0c6c49d610a8c57083ae3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 26 Sep 2024 15:01:34 +0200 Subject: [PATCH 11/26] cleanup_tempfiles waits 7 days to remove unclaimed data files --- backend/workers/cleanup_tempfiles.py | 52 ++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 15 deletions(-) diff 
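The download_images change above adds one more case to a wrapper that turns unusable URLs into a skip rather than a crash. A hedged sketch of that pattern; the function name and exception set here are illustrative, and the shipped wrapper additionally handles retries and, per the patch, a location-parse failure:

```
import requests

def fetch_image(url, timeout=20):
    try:
        return requests.get(url, timeout=timeout, stream=True)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.InvalidSchema,
            requests.exceptions.InvalidURL) as e:
        # unreachable or malformed URL; treat as a missing file and move on
        raise FileNotFoundError(f"could not download {url}: {e}")
```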
--git a/backend/workers/cleanup_tempfiles.py b/backend/workers/cleanup_tempfiles.py index 51e96fd57..b0b3a6d21 100644 --- a/backend/workers/cleanup_tempfiles.py +++ b/backend/workers/cleanup_tempfiles.py @@ -3,7 +3,8 @@ """ import shutil import re - +import json +from datetime import datetime from pathlib import Path from common.config_manager import config @@ -27,12 +28,21 @@ class TempFileCleaner(BasicWorker): ensure_job = {"remote_id": "localhost", "interval": 10800} + # Use tracking file to delay deletion of files that may still be in use + tracking_file = config.get('PATH_DATA').joinpath(".temp_file_cleaner") + days_to_keep = 7 + def work(self): """ Go through result files, and for each one check if it should still exist :return: """ + # Load tracking file + if not self.tracking_file.exists(): + tracked_files = {} + else: + tracked_files = json.loads(self.tracking_file.read_text()) result_files = Path(config.get('PATH_DATA')).glob("*") for file in result_files: @@ -41,6 +51,7 @@ def work(self): continue if self.interrupted: + self.tracking_file.write_text(json.dumps(tracked_files)) raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files") # the key of the dataset files belong to can be extracted from the @@ -59,20 +70,28 @@ def work(self): except DataSetException: # the dataset has been deleted since, but the result file still # exists - should be safe to clean up - self.log.info("No matching dataset with key %s for file %s, deleting file" % (key, str(file))) - if file.is_dir(): - try: - shutil.rmtree(file) - except PermissionError: - self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no " - f"permissions), skipping") - - else: - try: - file.unlink() - except FileNotFoundError: - # the file has been deleted since - pass + if file.name not in tracked_files: + self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion") + tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400) + elif tracked_files[file.name] < datetime.now().timestamp(): + self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file") + if file.is_dir(): + try: + shutil.rmtree(file) + except PermissionError: + self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no " + f"permissions), skipping") + + else: + try: + file.unlink() + except FileNotFoundError: + # the file has been deleted since + pass + + # Remove from tracking + del tracked_files[file.name] + continue if file.is_dir() and "-staging" in file.stem and dataset.is_finished(): @@ -84,4 +103,7 @@ def work(self): dataset.key, str(file))) shutil.rmtree(file) + # Update tracked files + self.tracking_file.write_text(json.dumps(tracked_files)) + self.job.finish() \ No newline at end of file From cb4b7706762259aa700c2bcaf2df88ef0cbd2ae2 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 26 Sep 2024 15:26:57 +0200 Subject: [PATCH 12/26] Clean link references in Telegram crawler --- datasources/telegram/search_telegram.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index c963e47f6..0fb7b282a 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -484,7 +484,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # invite links continue - entity_name = 
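The cleanup_tempfiles change above defers deletion: an orphaned file is first recorded in a JSON tracking file with a deadline, and only removed on a later run once the seven-day grace period has passed. A simplified sketch of that pattern; `is_orphaned` is a hypothetical stand-in for the dataset lookup the worker actually performs:

```
import json
import time
from pathlib import Path

GRACE_SECONDS = 7 * 86400

def sweep(data_path: Path, tracking_file: Path):
    tracked = json.loads(tracking_file.read_text()) if tracking_file.exists() else {}
    now = time.time()

    for file in data_path.glob("*"):
        if not is_orphaned(file):  # hypothetical check against known datasets
            continue
        if file.name not in tracked:
            tracked[file.name] = now + GRACE_SECONDS  # mark, don't delete yet
        elif tracked[file.name] < now:
            file.unlink(missing_ok=True)  # grace period expired
            del tracked[file.name]

    tracking_file.write_text(json.dumps(tracked))
```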
link.split("?")[0].split("#")[0] + entity_name = link.split("/")[0].split("?")[0].split("#")[0] linked_entities.add(entity_name) # @references @@ -493,6 +493,8 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): if reference.startswith("@"): reference = reference[1:] + reference = reference.split("/")[0] + linked_entities.add(reference) # Check if fwd_from or the resolved entity ID is already queued or has been queried From b66418350bcacc83ede7eaf4ae515ef4fc4e5bfa Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 26 Sep 2024 16:19:47 +0200 Subject: [PATCH 13/26] Don't crash on "t.me" without path --- datasources/telegram/search_telegram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 0fb7b282a..365fef2c9 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -478,7 +478,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): if crawl_via_links: # t.me links all_links = ural.urls_from_text(serialized_message["message"]) - all_links = [link.split("t.me/")[1] for link in all_links if ural.get_hostname(link) == "t.me"] + all_links = [link.split("t.me/")[1] for link in all_links if ural.get_hostname(link) == "t.me" and len(link.split("t.me/")) > 1] for link in all_links: if link.startswith("+"): # invite links From 8f2193cdcf0179ba34947861be87ec587e22e638 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 27 Sep 2024 11:24:15 +0200 Subject: [PATCH 14/26] Add linked and mentioned entities to mapped Telegram item --- datasources/telegram/search_telegram.py | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 365fef2c9..9e523a247 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -820,6 +820,38 @@ def map_item(message): # Failsafe; can be updated to support formatting of new datastructures in the future reactions += f"{reaction}, " + # t.me links + linked_entities = set() + all_links = ural.urls_from_text(message["message"]) + all_links = [link.split("t.me/")[1] for link in all_links if + ural.get_hostname(link) == "t.me" and len(link.split("t.me/")) > 1] + + for link in all_links: + if link.startswith("+"): + # invite links + continue + + entity_name = link.split("/")[0].split("?")[0].split("#")[0] + linked_entities.add(entity_name) + + # @references + # in execute_queries we use MessageEntityMention to get these + # however, after serializing these objects we only have the offsets of + # the mentioned username, and telegram does weird unicode things to its + # offsets meaning we can't just substring the message. 
So use a regex + # as a 'good enough' solution + all_mentions = set(re.findall(r"@([^\s\W]+)", message["message"])) + + # make this case-insensitive since people may use different casing in + # messages than the 'official' username for example + all_connections = set([v for v in [forwarded_username, *linked_entities, *all_mentions] if v]) + all_ci_connections = set() + seen = set() + for connection in all_connections: + if connection.lower() not in seen: + all_ci_connections.add(connection) + seen.add(connection.lower()) + return MappedItem({ "id": f"{message['_chat']['username']}-{message['id']}", "thread_id": thread, @@ -841,6 +873,9 @@ def map_item(message): "author_forwarded_from_name": forwarded_name, "author_forwarded_from_username": forwarded_username, "author_forwarded_from_id": forwarded_id, + "entities_linked": ",".join(linked_entities), + "entities_mentioned": ",".join(all_mentions), + "all_connections": ",".join(all_ci_connections), "timestamp_forwarded_from": datetime.fromtimestamp(forwarded_timestamp).strftime( "%Y-%m-%d %H:%M:%S") if forwarded_timestamp else "", "unix_timestamp_forwarded_from": forwarded_timestamp, From a224dd96d20a5d9bed62d25bd39c3bc9a929d307 Mon Sep 17 00:00:00 2001 From: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:01:49 +0200 Subject: [PATCH 15/26] Export 4CAT datasets and analyses as ZIP file... and import them elsewhere! (#452) * export processor * start of importer * finish off importing ZIP 4CAT datasets * ensure cleanup on failure had some weird lost datasets when debugging this * auto-expire export zips * nltk again * Squashed commit of the following: commit 3f2a62a124926cfeb840796f104a702878ac10e5 Author: Carsten Schnober Date: Wed Sep 18 18:18:29 2024 +0200 Update Gensim to >=4.3.3, <4.4.0 (#450) * Update Gensim to >=4.3.3, <4.4.0 * update nltk as well --------- Co-authored-by: Dale Wahl Co-authored-by: Sal Hagen commit fee2c8c08617094f28496963da282d2e2dddeab7 Merge: 3d94b666 f8e93eda Author: sal-phd-desktop Date: Wed Sep 18 18:11:19 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 3d94b666cedd0de4e0bee953cbf1d787fdc38854 Author: sal-phd-desktop Date: Wed Sep 18 18:11:04 2024 +0200 FINALLY remove 'News' from the front page, replace with 4CAT BlueSky updates and potential information about the specific server (to be set on config page) commit f8e93edabe9013a2c1229caa4c454fab09620125 Author: Stijn Peeters Date: Wed Sep 18 15:11:21 2024 +0200 Simple extensions page in Control Panel commit b5be128c7b8682fb233d962326d9118a61053165 Author: Stijn Peeters Date: Wed Sep 18 14:08:13 2024 +0200 Remove 'docs' directory commit 1e2010af44817016c274c9ec9f7f9971deb57f66 Author: Stijn Peeters Date: Wed Sep 18 14:07:38 2024 +0200 Forgot TikTok and Douyin commit c757dd51884e7ec9cf62ca1726feacab4b2283b7 Author: Stijn Peeters Date: Wed Sep 18 14:01:31 2024 +0200 Say 'zeeschuimer' instead of 'extension' to avoid confusion with 4CAT extensions commit ee7f4345478f923541536c86a5b06246deae03f6 Author: Stijn Peeters Date: Wed Sep 18 14:00:40 2024 +0200 RIP Parler data source commit 11300f2430b51887823b280405de4ded4f15ede1 Author: Stijn Peeters Date: Wed Sep 18 11:21:37 2024 +0200 Tuplestring commit 547265240eba81ca0ad270cd3c536a2b1dcf512d Author: Stijn Peeters Date: Wed Sep 18 11:15:29 2024 +0200 Pass user obj instead of str to ConfigWrapper in Processor commit b21866d7900b5d20ed6ce61ee9aff50f3c0df910 Author: Stijn Peeters Date: Tue Sep 17 17:45:01 2024 +0200 Ensure request-aware config reader 
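The new `entities_linked`, `entities_mentioned` and `all_connections` columns above merge forwarded-from usernames, t.me links and @mentions; the merge is case-insensitive so different spellings of the same handle count only once. A small sketch of that de-duplication:

```
def unique_connections(*values):
    # keep the first spelling encountered, compare case-insensitively
    connections, seen = set(), set()
    for value in values:
        if value and value.lower() not in seen:
            connections.add(value)
            seen.add(value.lower())
    return connections

unique_connections("SomeChannel", "somechannel", None, "other")
# -> {'SomeChannel', 'other'}
```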
in user object when using config wrapper commit bbe79e4b0fe870ccc36cab7bfe7963b28d1948e3 Author: Sal Hagen Date: Tue Sep 17 15:12:46 2024 +0200 Fix extension path walk for Windows commit d6064beaf31a6a85b0e34ed4f8126eb4c4fc07e3 Author: Stijn Peeters Date: Mon Sep 16 14:50:45 2024 +0200 Allow tags that have no users Use case: tag-based frontend differentiation using X-4CAT-Config-Via-Proxy commit b542ded6f976809ec88445e7b04f2c81b900188e Author: Stijn Peeters Date: Mon Sep 16 14:13:14 2024 +0200 Trailing slash in query results list commit a4bddae575b22a009925206a1337bdd89349e567 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon Sep 16 13:57:23 2024 +0200 4CAT Extension - easy(ier) adding of new datasources/processors that can be mainted seperately from 4CAT base code (#451) * domain only * fix reference * try and collect links with selenium * update column_filter to find multiple matches * fix up the normal url_scraper datasource * ensure all selenium links are strings for join * change output of url_scraper to ndjson with map_items * missed key/index change * update web archive to use json and map to 4CAT * fix no text found * and none on scraped_links * check key first * fix up web_archive error reporting * handle None type for error * record web archive "bad request" * add wait after redirect movement * increase waittime for redirects * add processor for trackers * dict to list for addition * allow both newline and comma seperated links * attempt to scrape iframes as seperate pages * Fixes for selenium scraper to work with config database * installation of packages, geckodriver, and firefox if selenium enabled * update install instructions * fix merge error * fix dropped function * have to be kidding me * add note; setup requires docker... need to think about IF this will ever be installed without Docker * seperate selenium class into wrapper and Search class so wrapper can be used in processors! * add screenshots; add firefox extension support * update selenium definitions * regex for extracting urls from strings * screenshots processor; extract urls from text and takes screenshots * Allow producing zip files from data sources * import time * pick better default * test screenshot datasource * validate all params * fix enable extension * haha break out of while loop * count my items * whoops, len() is important here * must be getting tired... * remove redundant logging * Eager loading for screenshots, viewport options, etc * Woops, wrong folder * Fix label shortening * Just 'queue' instead of 'search queue' * Yeah, make it headless * README -> DESCRIPTION * h1 -> h2 * Actually just have no header * Use proper filename for downloaded files * Configure whether to offer pseudonymisation etc * Tweak descriptions * fix log missing data * add columns to post_topic_matrix * fix breadcrumb bug * Add top topics column * Fix selenium config install parameter (Docker uses this/manual would need to run install_selenium, well, manually) * this processor is slow; i thought it was broken long before it updated! 
* refactor detect_trackers as conversion processor not filter * add geckodriver executable to docker install * Auto-configure webdrivers if available in PATH * update screenshots to act as image-downloader and benefit from processors * fix is_compatible_with * Delete helper-scripts/migrate/migrate-1.30-1.31.py * fix embeddings is_compatible_with * fix up UI options for hashing and private * abstract was moved to lib * various fixes to selenium based datasources * processors not compatible with image datasets * update firefox extension handling * screenshots datasource fix get_options * rename screenshots processor to be detected as image dataset * add monthly and weekly frequencies to wayback machine datasource * wayback ds: fix fail if all attempts do not realize results; addion frequency options to options; add daily * add scroll down page to allow lazy loading for entire page screenshots * screenshots: adjust pause time so it can be used to force a wait for images to load I have not successfully come up with or found a way to wait for all images to load; document.readyState == 'complete' does not function in this way on certain sites including the wayback machine * hash URLs to create filenames * remove log * add setting to toggle display advanced options * add progress bars * web archive fix query validation * count subpages in progress * remove overwritten function * move http response to own column * special filenames * add timestamps to all screenshots * restart selenium on failure * new build have selenium * process urls after start (keep original query parameters) * undo default firefox * quick max * rename SeleniumScraper to SeleniumSearch todo: build SeleniumProcessor! * max number screenshots configurable * method to get url with error handling * use get_with_error_handling * d'oh, screenshot processor needs to quit selenium * update log to contain URL * Update scrolling to use Page down key if necessary * improve logs * update image_category_wall as screenshot datasource does not have category column; this is not ideal and ought to be solved in another way. Also, could I get categories from the metadata? That's... ugh. * no category, no processor * str errors * screenshots: dismiss alerts when checking ready state is complete * set screenshot timeout to 30 seconds * update gensim package * screenshots: move processor interrupt into attempts loop * if alert disappears before we can dismiss it... * selenium specific logger * do not switch window when no alert found on dismiss * extract wait for page to load to selenium class * improve descriptions of screenshot options * remove unused line * treat timeouts differently from other errors these are more likely due to an issue with the website in question * debug if requested * increase pause time * restart browser w/ PID * increase max_workers for selenium this is by individual worker class not for all selenium classes... 
so you can really crank them out if desired * quick fix restart by pid * avoid bad urls * missing bracket & attempt to fix-missing dependencies in Docker install * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * basic processor for apple store; setup checks for additional requirements * fix is_4cat_class * show preview when no map_item * add google store datasource * Docker setup.py use extensions * Wider support for file upload in processors * Log file uploads in DMI service manager * add map_item methods and record more data per item need additional item data as map_item is staticmethod * update from master; merge conflicts * fix docker build context (ignore data files) * fix option requirements * apple store fix: list still tries to get query * apple & google stores fix up item mapping * missed merge error * minor fix * remove unused import * fix datasources w/ files frontend error * fix error w/ datasources having file option * better way to name docker volumes * update two other docker compose files * fix docker-compose ymls * minor bug: fix and add warning; fix no results fail * update apple field names to better match interface * update google store fieldnames and order * sneak in jinja logger if needed * fix fourcat.js handling checkboxes for dynamic settings * add new endpoint for app details to apple store * apple_store map new beta app data * add default lang/country * not all apps have advisories * revert so button works * add chart positions to beta map items * basic scheduler To-do - fix up and add options to scheduler view (e.g. delete/change) - add scheduler view to navigator - tie jobs to datasets? (either in scheduler view or, perhaps, filter dataset view) - more testing... * update scheduler view, add functions to update job interval * revert .env * working scheduler! * basic scheduler view w/ datasets * fix postgres tag * update job status in scheduled_jobs table * fix timestamp; end_date needed for last run check; add dataset label * improve scheduler view * remove dataset from scheduled_jobs table on delete * scheduler view order by last creation * scheduler views: separate scheduler list from scheduled dataset list * additional update from master fixes * apple_store map_items fix missing locales * add back depth for pagination * correct route * modify pagination to accept args * pagination fun * pagination: i hate testing on live servers... * ok ok need the pagination route * pagination: add route_args * fix up scheduler header * improve app store descriptions * add azure store * fix azure links * azure_store: add category search * azure fix type of config update timestamp OPTION_DATE does not appear correctly in settings and causes it to be written incorrectly * basic aws store * check if selenium available; get correct app_id * aws: implement pagination * add logging; wait for elements to load after next page; attempts to rework filter option collection * apple_store: handle invalid param error * fix filter_options * aws: fix filter option collection! 
* more merge * move new datasources and processors to extensions and modify setup.py and module loader to use the new locations * migrate.py to run extension "fourcat_install.py" files * formatting * remove extensions; add gitignore * excise scheduler merge * some additional cleanup from app_studies branch * allow nested datasources folders; ignore files in extensions main folder * allow extension install scripts to run pip if migrate.py has not * Remove unused URL functions we could use ural for * Take care of git commit hash tracking for extension processors * Get rid of unused path.versionfile config setting * Add extensions README * Squashed commit of the following: commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset * Fix Github links for extensions * Fix commit detection in extensions * Fix extension detection in module loader * Follow symlinks when loading extensions Probably not uncommon to have a checked out repo somewhere to then symlink into the extensions dir * Make queue message on create page more generic * Markdown in datasource option tooltips * Remove Spacy model from requirements * Add software_source to database SQL --------- Co-authored-by: Stijn Peeters Co-authored-by: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters * merge 
docker files * fix merge issues * more modules passing fixes * disappearing import not sure pycharm's merge is super awesome... * fix import 4cat datasource with modules changes --------- Co-authored-by: Stijn Peeters --- common/lib/dataset.py | 16 +- datasources/fourcat_import/import_4cat.py | 502 ++++++++++++++++------ docker-compose_build.yml | 6 + processors/conversion/export_datasets.py | 106 +++++ webtool/views/api_tool.py | 6 +- 5 files changed, 500 insertions(+), 136 deletions(-) create mode 100644 processors/conversion/export_datasets.py diff --git a/common/lib/dataset.py b/common/lib/dataset.py index b092d2a4e..b494acbd3 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -15,7 +15,7 @@ from common.config_manager import config from common.lib.job import Job, JobNotFoundException from common.lib.module_loader import ModuleCollector -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, @@ -1586,6 +1586,20 @@ def get_media_type(self): # Default to text return self.parameters.get("media_type", "text") + def get_metadata(self): + """ + Get dataset metadata + + This consists of all the data stored in the database for this dataset, plus the current 4CAT version (appended + as 'current_4CAT_version'). This is useful for exporting datasets, as it can be used by another 4CAT instance to + update its database (and ensure compatibility with the exporting version of 4CAT). + """ + metadata = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,)) + + # get 4CAT version (presumably to ensure export is compatible with import) + metadata["current_4CAT_version"] = get_software_version() + return metadata + def get_result_url(self): """ Gets the 4CAT frontend URL of a dataset file. diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index cd231b445..dc5d079fc 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -4,6 +4,7 @@ import requests import json import time +import zipfile from backend.lib.processor import BasicProcessor from common.lib.exceptions import (QueryParametersException, FourcatException, ProcessorInterruptedException, @@ -19,8 +20,8 @@ class FourcatImportException(FourcatException): class SearchImportFromFourcat(BasicProcessor): type = "import_4cat-search" # job ID category = "Search" # category - title = "Import from 4CAT" # title displayed in UI - description = "Import a dataset from another 4CAT server" # description displayed in UI + title = "Import 4CAT dataset and analyses" # title displayed in UI + description = "Import a dataset from another 4CAT server or from a zip file (exported from a 4CAT server)" # description displayed in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -33,29 +34,328 @@ class SearchImportFromFourcat(BasicProcessor): "\n\nTo import a dataset across servers, both servers need to be running the same version of 4CAT. " "You can find the current version in the footer at the bottom of the interface." 
}, + "method": { + "type": UserInput.OPTION_CHOICE, + "help": "Import Type", + "options": { + "zip": "Zip File", + "url": "4CAT URL", + }, + "default": "url" + }, "url": { "type": UserInput.OPTION_TEXT, "help": "Dataset URL", - "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/." + "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/.", + "requires": "method^=url" }, "intro2": { "type": UserInput.OPTION_INFO, "help": "You can create an API key via the 'API Access' item in 4CAT's navigation menu. Note that you need " "an API key from **the server you are importing from**, not the one you are looking at right now. " - "Additionally, you need to have owner access to the dataset you want to import." + "Additionally, you need to have owner access to the dataset you want to import.", + "requires": "method^=url" }, "api-key": { "type": UserInput.OPTION_TEXT, "help": "4CAT API Key", "sensitive": True, "cache": True, - } + "requires": "method^=url" + }, + "data_upload": { + "type": UserInput.OPTION_FILE, + "help": "File", + "tooltip": "Upload a ZIP file containing a dataset exported from a 4CAT server.", + "requires": "method^=zip" + }, + } created_datasets = None base = None + remapped_keys = None + dataset_owner = None def process(self): + """ + Import 4CAT dataset either from another 4CAT server or from the uploaded zip file + """ + self.created_datasets = set() # keys of created datasets - may not be successful! + self.remapped_keys = {} # changed dataset keys + self.dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner + try: + if self.parameters.get("method") == "zip": + self.process_zip() + else: + self.process_urls() + except Exception as e: + # Catch all exceptions and finish the job with an error + # Resuming is impossible because this dataset was overwritten with the importing dataset + # halt_and_catch_fire() will clean up and delete the datasets that were created + self.interrupted = True + try: + self.halt_and_catch_fire() + except ProcessorInterruptedException: + pass + # Reraise the original exception for logging + raise e + + def after_create(query, dataset, request): + """ + Hook to execute after the dataset for this source has been created + + In this case, put the file in a temporary location so it can be + processed properly by the related Job later. 
+ + :param dict query: Sanitised query parameters + :param DataSet dataset: Dataset created for this query + :param request: Flask request submitted for its creation + """ + if query.get("method") == "zip": + file = request.files["option-data_upload"] + file.seek(0) + with dataset.get_results_path().with_suffix(".importing").open("wb") as outfile: + while True: + chunk = file.read(1024) + if len(chunk) == 0: + break + outfile.write(chunk) + else: + # nothing to do for URLs + pass + + + def process_zip(self): + """ + Import 4CAT dataset from a ZIP file + """ + self.dataset.update_status(f"Importing datasets and analyses from ZIP file.") + temp_file = self.dataset.get_results_path().with_suffix(".importing") + + imported = [] + processed_files = 1 # take into account the export.log file + failed_imports = [] + with zipfile.ZipFile(temp_file, "r") as zip_ref: + zip_contents = zip_ref.namelist() + + # Get all metadata files and determine primary dataset + metadata_files = [file for file in zip_contents if file.endswith("_metadata.json")] + if not metadata_files: + self.dataset.finish_with_error("No metadata files found in ZIP file; is this a 4CAT export?") + return + + # Get the primary dataset + primary_dataset_keys = set() + datasets = [] + parent_child_mapping = {} + for file in metadata_files: + with zip_ref.open(file) as f: + metadata = json.load(f) + if not metadata.get("key_parent"): + primary_dataset_keys.add(metadata.get("key")) + datasets.append(metadata) + else: + # Store the mapping of parent to child datasets + parent_key = metadata.get("key_parent") + if parent_key not in parent_child_mapping: + parent_child_mapping[parent_key] = [] + parent_child_mapping[parent_key].append(metadata) + + # Primary dataset will overwrite this dataset; we could address this to support multiple primary datasets + if len(primary_dataset_keys) != 1: + self.dataset.finish_with_error("ZIP file contains multiple primary datasets; only one is allowed.") + return + + # Import datasets + while datasets: + self.halt_and_catch_fire() + + # Create the datasets + metadata = datasets.pop(0) + dataset_key = metadata.get("key") + processed_metadata = self.process_metadata(metadata) + new_dataset = self.create_dataset(processed_metadata, dataset_key, dataset_key in primary_dataset_keys) + processed_files += 1 + + # TODO: I am now noticing that we do not update the results_file; it is even more unlikely to collide as it is both a random key and label combined... but... + # Copy the log file + self.halt_and_catch_fire() + log_filename = new_dataset.get_log_path().name + if log_filename in zip_contents: + self.dataset.update_status(f"Transferring log file for dataset {new_dataset.key}") + with zip_ref.open(log_filename) as f: + with new_dataset.get_log_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + else: + self.dataset.log(f"Log file not found for dataset {new_dataset.key} (original key {dataset_key}).") + + # Copy the results + self.halt_and_catch_fire() + results_filename = new_dataset.get_results_path().name + if results_filename in zip_contents: + self.dataset.update_status(f"Transferring data file for dataset {new_dataset.key}") + with zip_ref.open(results_filename) as f: + with new_dataset.get_results_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + + if not imported: + # first dataset - use num rows as 'overall' + num_rows = metadata["num_rows"] + else: + # TODO: should I just delete the new_dataset here? 
+ self.dataset.log(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + new_dataset.finish_with_error(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + failed_imports.append(dataset_key) + continue + + # finally, the kids + self.halt_and_catch_fire() + if dataset_key in parent_child_mapping: + datasets.extend(parent_child_mapping[dataset_key]) + self.dataset.log(f"Adding ({len(parent_child_mapping[dataset_key])}) child datasets to import queue") + + # done - remember that we've imported this one + imported.append(new_dataset) + new_dataset.update_status(metadata["status"]) + + if new_dataset.key != self.dataset.key: + # only finish if this is not the 'main' dataset, or the user + # will think the whole import is done + new_dataset.finish(metadata["num_rows"]) + + # Check that all files were processed + missed_files = [] + if len(zip_contents) != processed_files: + for file in zip_contents: + if file not in processed_files: + missed_files.append(file) + + # todo: this part needs updating if/when we support importing multiple datasets! + if failed_imports: + self.dataset.update_status(f"Dataset import finished, but not all data was imported properly. " + f"{len(failed_imports)} dataset(s) were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + elif missed_files: + self.dataset.log(f"ZIP file contained {len(missed_files)} files that were not processed: {missed_files}") + self.dataset.update_status(f"Dataset import finished, but not all files were processed. " + f"{len(missed_files)} files were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + else: + self.dataset.update_status(f"{len(imported)} dataset(s) succesfully imported.", + is_final=True) + + if not self.dataset.is_finished(): + # now all related datasets are imported, we can finish the 'main' + # dataset, and the user will be alerted that the full import is + # complete + self.dataset.finish(num_rows) + + + @staticmethod + def process_metadata(metadata): + """ + Process metadata for import + """ + # get rid of some keys that are server-specific and don't need to + # be stored (or don't correspond to database columns) + metadata.pop("current_4CAT_version") + metadata.pop("id") + metadata.pop("job") + metadata.pop("is_private") + metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! + + # extra params are stored as JSON... + metadata["parameters"] = json.loads(metadata["parameters"]) + if "copied_from" in metadata["parameters"]: + metadata["parameters"].pop("copied_from") + metadata["parameters"] = json.dumps(metadata["parameters"]) + + return metadata + + def create_dataset(self, metadata, original_key, primary=False): + """ + Create a new dataset + """ + if primary: + self.dataset.update_status(f"Importing primary dataset {original_key}.") + # if this is the first dataset we're importing, make it the + # processor's "own" dataset. 
the key has already been set to + # the imported dataset's key via ensure_key() (or a new unqiue + # key if it already existed on this server) + # by making it the "own" dataset, the user initiating the + # import will see the imported dataset as the "result" of their + # import query in the interface, similar to the workflow for + # other data sources + new_dataset = self.dataset + metadata.pop("key") # key already OK (see above) + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + else: + self.dataset.update_status(f"Importing child dataset {original_key}.") + # supernumerary datasets - handle on their own + # these include any children of imported datasets + try: + key_exists = DataSet(key=metadata["key"], db=self.db, modules=self.modules) + + # if we *haven't* thrown a DatasetException now, then the + # key is already in use, so create a "dummy" dataset and + # overwrite it with the metadata we have (except for the + # key). this ensures that a new unique key will be + # generated. + new_dataset = DataSet(parameters={}, type=self.type, db=self.db, modules=self.modules) + metadata.pop("key") + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + except DataSetException: + # this is *good* since it means the key doesn't exist, so + # we can re-use the key of the imported dataset + self.db.insert("datasets", data=metadata) + new_dataset = DataSet(key=metadata["key"], db=self.db, modules=self.modules) + + # make sure the dataset path uses the new key and local dataset + # path settings. this also makes sure the log file is created in + # the right place (since it is derived from the results file path) + extension = metadata["result_file"].split(".")[-1] + new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) + + new_dataset.update_status("Imported dataset created") + if new_dataset.key != original_key: + # could not use original key because it was already in use + # so update any references to use the new key + self.remapped_keys[original_key] = new_dataset.key + new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " + f"{new_dataset.key} instead of key {original_key}!") + + # refresh object, make sure it's in sync with the database + self.created_datasets.add(new_dataset.key) + new_dataset = DataSet(key=new_dataset.key, db=self.db, modules=self.modules) + if new_dataset.key == self.dataset.key: + # this ensures that the first imported dataset becomes the + # processor's "own" dataset, and that the import logs go to + # that dataset's log file. For later imports, this evaluates to + # False. 
+ self.dataset = new_dataset + + # if the key of the parent dataset was changed, change the + # reference to it that the child dataset has + if new_dataset.key_parent and new_dataset.key_parent in self.remapped_keys: + new_dataset.key_parent = self.remapped_keys[new_dataset.key_parent] + + # update some attributes that should come from the new server, not + # the old + new_dataset.creator = self.dataset_owner + new_dataset.original_timestamp = new_dataset.timestamp + new_dataset.imported = True + new_dataset.timestamp = int(time.time()) + new_dataset.db.commit() + + return new_dataset + + + def process_urls(self): """ Import 4CAT dataset from another 4CAT server @@ -67,12 +367,9 @@ def process(self): keys = SearchImportFromFourcat.get_keys_from_urls(urls) api_key = self.parameters.get("api-key") - self.created_datasets = set() # keys of created datasets - may not be successful! imported = [] # successfully imported datasets failed_imports = [] # keys that failed to import - remapped_keys = {} # changed dataset keys num_rows = 0 # will be used later - dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner # we can add support for multiple datasets later by removing # this part! @@ -101,90 +398,10 @@ def process(self): failed_imports.append(dataset_key) continue - # get rid of some keys that are server-specific and don't need to - # be stored (or don't correspond to database columns) - metadata.pop("current_4CAT_version") - metadata.pop("id") - metadata.pop("job") - metadata.pop("is_private") - metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! - - # extra params are stored as JSON... - metadata["parameters"] = json.loads(metadata["parameters"]) - if "copied_from" in metadata["parameters"]: - metadata["parameters"].pop("copied_from") - metadata["parameters"] = json.dumps(metadata["parameters"]) - - if not imported: - # if this is the first dataset we're importing, make it the - # processor's "own" dataset. the key has already been set to - # the imported dataset's key via ensure_key() (or a new unqiue - # key if it already existed on this server) - # by making it the "own" dataset, the user initiating the - # import will see the imported dataset as the "result" of their - # import query in the interface, similar to the workflow for - # other data sources - new_dataset = self.dataset - metadata.pop("key") # key already OK (see above) - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + metadata = self.process_metadata(metadata) - else: - # supernumerary datasets - handle on their own - # these include any children of imported datasets - try: - key_exists = DataSet(key=metadata["key"], db=self.db) - - # if we *haven't* thrown a DatasetException now, then the - # key is already in use, so create a "dummy" dataset and - # overwrite it with the metadata we have (except for the - # key). this ensures that a new unique key will be - # generated. - new_dataset = DataSet(parameters={}, type=self.type, db=self.db) - metadata.pop("key") - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) - - except DataSetException: - # this is *good* since it means the key doesn't exist, so - # we can re-use the key of the imported dataset - self.db.insert("datasets", data=metadata) - new_dataset = DataSet(key=metadata["key"], db=self.db) - - # make sure the dataset path uses the new key and local dataset - # path settings. 
this also makes sure the log file is created in - # the right place (since it is derived from the results file path) - extension = metadata["result_file"].split(".")[-1] - new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) - - new_dataset.update_status("Imported dataset created") - if new_dataset.key != dataset_key: - # could not use original key because it was already in use - # so update any references to use the new key - remapped_keys[dataset_key] = new_dataset.key - new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " - f"{new_dataset.key} instead of key {dataset_key}!") - - # refresh object, make sure it's in sync with the database - self.created_datasets.add(new_dataset.key) - new_dataset = DataSet(key=new_dataset.key, db=self.db) - if new_dataset.key == self.dataset.key: - # this ensures that the first imported dataset becomes the - # processor's "own" dataset, and that the import logs go to - # that dataset's log file. For later imports, this evaluates to - # False. - self.dataset = new_dataset - - # if the key of the parent dataset was changed, change the - # reference to it that the child dataset has - if new_dataset.key_parent and new_dataset.key_parent in remapped_keys: - new_dataset.key_parent = remapped_keys[new_dataset.key_parent] - - # update some attributes that should come from the new server, not - # the old - new_dataset.creator = dataset_owner - new_dataset.original_timestamp = new_dataset.timestamp - new_dataset.imported = True - new_dataset.timestamp = int(time.time()) - new_dataset.db.commit() + # create the new dataset + new_dataset = self.create_dataset(metadata, dataset_key, primary=True if not imported else False) # then, the log self.halt_and_catch_fire() @@ -283,9 +500,9 @@ def halt_and_catch_fire(self): # overwritten by this point deletables = [k for k in self.created_datasets if k != self.dataset.key] for deletable in deletables: - DataSet(key=deletable, db=self.db).delete() + DataSet(key=deletable, db=self.db, modules=self.modules).delete() - self.dataset.finish_with_error(f"Interrupted while importing datasets from {self.base}. Cannot resume - you " + self.dataset.finish_with_error(f"Interrupted while importing datasets{' from '+self.base if self.base else ''}. 
Cannot resume - you " f"will need to initiate the import again.") raise ProcessorInterruptedException() @@ -353,47 +570,72 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ - urls = query.get("url") - if not urls: - return QueryParametersException("Provide at least one dataset URL.") - - urls = urls.split(",") - bases = set([url.split("/results/")[0].lower() for url in urls]) - keys = SearchImportFromFourcat.get_keys_from_urls(urls) + if query.get("method") == "zip": + filename = "" + if "option-data_upload-entries" in request.form: + # First pass sends list of files in the zip + pass + elif "option-data_upload" in request.files: + # Second pass sends the actual file + file = request.files["option-data_upload"] + if not file: + raise QueryParametersException("No file uploaded.") + + if not file.filename.endswith(".zip"): + raise QueryParametersException("Uploaded file must be a ZIP file.") + + filename = file.filename + else: + raise QueryParametersException("No file was offered for upload.") + + return { + "method": "zip", + "filename": filename + } + elif query.get("method") == "url": + urls = query.get("url") + if not urls: + raise QueryParametersException("Provide at least one dataset URL.") + + urls = urls.split(",") + bases = set([url.split("/results/")[0].lower() for url in urls]) + keys = SearchImportFromFourcat.get_keys_from_urls(urls) + + if len(keys) != 1: + # todo: change this to < 1 if we allow multiple datasets + raise QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + + if len(bases) != 1: + raise QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " + "one 4CAT server at a time.") + + base = urls[0].split("/results/")[0] + try: + # test if API key is valid and server is reachable + test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") + except FourcatImportException as e: + raise QueryParametersException(str(e)) - if len(keys) != 1: - # todo: change this to < 1 if we allow multiple datasets - return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + try: + # test if we get a response we can parse + metadata = test.json() + except ValueError: + raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - if len(bases) != 1: - return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " - "one 4CAT server at a time.") + version = get_software_version() - base = urls[0].split("/results/")[0] - try: - # test if API key is valid and server is reachable - test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") - except FourcatImportException as e: - raise QueryParametersException(str(e)) + if metadata.get("current_4CAT_version") != version: + raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " + f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). 
Make " + "sure both are running the same version of 4CAT and try again.") - try: - # test if we get a response we can parse - metadata = test.json() - except ValueError: - raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - - version = get_software_version() - - if metadata.get("current_4CAT_version") != version: - raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " - f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). Make " - "sure both are running the same version of 4CAT and try again.") - - # OK, we can import at least one dataset - return { - "url": ",".join(urls), - "api-key": query.get("api-key") - } + # OK, we can import at least one dataset + return { + "url": ",".join(urls), + "api-key": query.get("api-key") + } + else: + raise QueryParametersException("Import method not yet implemented.") @staticmethod def get_keys_from_urls(urls): diff --git a/docker-compose_build.yml b/docker-compose_build.yml index 7466e8ba8..b81a9fb94 100644 --- a/docker-compose_build.yml +++ b/docker-compose_build.yml @@ -32,6 +32,9 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ +# - 4cat_data:/usr/src/app/data/ +# - 4cat_config:/usr/src/app/config/ +# - 4cat_logs:/usr/src/app/logs/ entrypoint: docker/docker-entrypoint.sh frontend: @@ -49,6 +52,9 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ +# - 4cat_data:/usr/src/app/data/ +# - 4cat_config:/usr/src/app/config/ +# - 4cat_logs:/usr/src/app/logs/ command: ["docker/wait-for-backend.sh"] volumes: diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py new file mode 100644 index 000000000..bd7b81289 --- /dev/null +++ b/processors/conversion/export_datasets.py @@ -0,0 +1,106 @@ +""" +Export a dataset and all its children to a ZIP file +""" +import shutil +import json +import datetime + +from backend.lib.processor import BasicProcessor +from common.lib.dataset import DataSet +from common.lib.exceptions import DataSetException + +__author__ = "Dale Wahl" +__credits__ = ["Dale Wahl"] +__maintainer__ = "Dale Wahl" +__email__ = "4cat@oilab.eu" + + + +class ExportDatasets(BasicProcessor): + """ + Export a dataset and all its children to a ZIP file + """ + type = "export-datasets" # job type ID + category = "Conversion" # category + title = "Export Dataset and All Analyses" # title displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." 
# description displayed in UI + extension = "zip" # extension of result file, used internally and in UI + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Determine if processor is compatible with dataset + + :param module: Module to determine compatibility with + """ + return module.is_top_dataset() and user.can_access_dataset(dataset=module, role="owner") + + def process(self): + """ + This takes a CSV file as input and writes the same data as a JSON file + """ + self.dataset.update_status("Collecting dataset and all analyses") + + results_path = self.dataset.get_staging_area() + + exported_datasets = [] + failed_exports = [] # keys that failed to import + keys = [self.dataset.top_parent().key] # get the key of the top parent + while keys: + dataset_key = keys.pop(0) + self.dataset.log(f"Exporting dataset {dataset_key}.") + + try: + dataset = DataSet(key=dataset_key, db=self.db) + # TODO: these two should fail for the primary dataset, but should they fail for the children too? + except DataSetException: + self.dataset.finish_with_error("Dataset not found.") + return + if not dataset.is_finished(): + self.dataset.finish_with_error("You cannot export unfinished datasets.") + return + + # get metadata + metadata = dataset.get_metadata() + if metadata["num_rows"] == 0: + self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + failed_exports.append(dataset_key) + continue + + # get data + data_file = dataset.get_results_path() + if not data_file.exists(): + self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + failed_exports.append(dataset_key) + continue + + # get log + log_file = dataset.get_results_path().with_suffix(".log") + + # All good, add to ZIP + with results_path.joinpath(f"{dataset_key}_metadata.json").open("w", encoding="utf-8") as outfile: + outfile.write(json.dumps(metadata)) + shutil.copy(data_file, results_path.joinpath(data_file.name)) + if log_file.exists(): + shutil.copy(log_file, results_path.joinpath(log_file.name)) + + # add children to queue + # Not using get_all_children() because we want to skip unfinished datasets and only need the keys + children = [d["key"] for d in self.db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset_key,))] + keys.extend(children) + + self.dataset.update_status(f"Exported dataset {dataset_key}.") + exported_datasets.append(dataset_key) + + # Add export log to ZIP + self.dataset.log(f"Exported datasets: {exported_datasets}") + self.dataset.log(f"Failed to export datasets: {failed_exports}") + shutil.copy(self.dataset.get_log_path(), results_path.joinpath("export.log")) + + # set expiration date + # these datasets can be very large and are just copies of the existing datasets, so we don't need to keep them around for long + # TODO: convince people to stop using hyphens in python variables and file names... + self.dataset.__setattr__("expires-after", (datetime.datetime.now() + datetime.timedelta(days=1)).timestamp()) + + # done! 
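For reference, a minimal sketch of how an archive produced by this processor could be sanity-checked after the fact. It is not part of the 4CAT codebase: the inspect_4cat_export name and the export.zip path are illustrative. The expected file names follow from the code above (one <key>_metadata.json per exported dataset, the copied results and log files, and export.log), and current_4CAT_version is the field the importing server compares against its own get_software_version().

    import json
    import zipfile

    def inspect_4cat_export(path="export.zip"):
        # look for the per-dataset metadata files and the copied processor log
        with zipfile.ZipFile(path) as archive:
            names = archive.namelist()
            metadata_files = [name for name in names if name.endswith("_metadata.json")]
            if not metadata_files or "export.log" not in names:
                raise ValueError("does not look like a 4CAT export")
            # each metadata file records the exporting server's 4CAT version,
            # which the importing server checks for compatibility
            with archive.open(metadata_files[0]) as infile:
                return json.load(infile).get("current_4CAT_version")
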
+ self.write_archive_and_finish(results_path, len(exported_datasets)) \ No newline at end of file diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index 5b47c030d..f7f66ad6e 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -1246,11 +1246,7 @@ def export_packed_dataset(key=None, component=None): return error(403, error="You cannot export unfinished datasets.") if component == "metadata": - metadata = db.fetchone("SELECT * FROM datasets WHERE key = %s", (dataset.key,)) - - # get 4CAT version (presumably to ensure export is compatible with import) - metadata["current_4CAT_version"] = get_software_version() - return jsonify(metadata) + return jsonify(dataset.get_metadata()) elif component == "children": children = [d["key"] for d in db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset.key,))] From dbcc7bddc5b3b440254c3645407629d304057931 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 1 Oct 2024 17:32:59 +0200 Subject: [PATCH 16/26] remove auto settings deletion --- .env | 2 +- common/config_manager.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/.env b/.env index d03f9c703..7d89486c7 100644 --- a/.env +++ b/.env @@ -2,7 +2,7 @@ # https://hub.docker.com/repository/docker/digitalmethodsinitiative/4cat/tags?page=1&ordering=last_updated DOCKER_TAG=stable # You can select Postrgres Docker image tags here to suit your needs: https://hub.docker.com/_/postgres -POSTGRES_TAG=latest +POSTGRES_TAG=15 # Database setup POSTGRES_USER=fourcat diff --git a/common/config_manager.py b/common/config_manager.py index eb6c846d0..1b8d4052f 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -146,16 +146,6 @@ def ensure_database(self): """ self.with_db() - # delete unknown keys - known_keys = tuple([names for names, settings in config.config_definition.items() if settings.get("type") not in UserInput.OPTIONS_COSMETIC]) - unknown_keys = self.db.fetchall("SELECT DISTINCT name FROM settings WHERE name NOT IN %s", (known_keys,)) - - if unknown_keys: - self.db.log.info(f"Deleting unknown settings from database: {', '.join([key['name'] for key in unknown_keys])}") - self.db.delete("settings", where={"name": tuple([key["name"] for key in unknown_keys])}, commit=False) - - self.db.commit() - # create global values for known keys with the default known_settings = self.get_all() for setting, parameters in self.config_definition.items(): From 8559a113313cad860d4a5a2b5bb55943278bfc6c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 1 Oct 2024 17:33:50 +0200 Subject: [PATCH 17/26] undo .env change --- .env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env b/.env index 7d89486c7..d03f9c703 100644 --- a/.env +++ b/.env @@ -2,7 +2,7 @@ # https://hub.docker.com/repository/docker/digitalmethodsinitiative/4cat/tags?page=1&ordering=last_updated DOCKER_TAG=stable # You can select Postrgres Docker image tags here to suit your needs: https://hub.docker.com/_/postgres -POSTGRES_TAG=15 +POSTGRES_TAG=latest # Database setup POSTGRES_USER=fourcat From d769be44adb920503c33f88777d2879dcca4b98c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 3 Oct 2024 16:48:46 +0200 Subject: [PATCH 18/26] douyin fix link for streams --- datasources/douyin/search_douyin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index 4b5d5b814..12768196c 100644 --- a/datasources/douyin/search_douyin.py +++ 
b/datasources/douyin/search_douyin.py @@ -218,7 +218,7 @@ def map_item(item): "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"), "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")), # Adding this as different Douyin pages contain different data - "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}", + "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}", "region": item.get("region", ""), "hashtags": ",".join( [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if From e4c0099d75cdc27f0e1f3f3609a8af93c52b425c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 8 Oct 2024 12:07:57 +0200 Subject: [PATCH 19/26] word-trees allow selection of column --- processors/visualisation/word-trees.py | 148 ++++++++++++++----------- 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index f7783bcc1..0dfe2d408 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -38,71 +38,94 @@ class MakeWordtree(BasicProcessor): "Wattenberg, M., & Viégas, F. B. (2008). The Word Tree, an Interactive Visual Concordance. IEEE Transactions on Visualization and Computer Graphics, 14(6), 1221–1228. " ] - options = { - "query": { - "type": UserInput.OPTION_TEXT, - "default": "", - "help": "Word tree root query", - "tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace." - }, - "limit": { - "type": UserInput.OPTION_TEXT, - "default": 3, - "min": 1, - "max": 25, - "help": "Max branches/level", - "tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25." - }, - "window": { - "type": UserInput.OPTION_TEXT, - "min": 1, - "max": 10, - "default": 5, - "help": "Window size", - "tooltip": "Up to this many words before and/or after the queried phrase will be visualised" - }, - "sides": { - "type": UserInput.OPTION_CHOICE, - "default": "right", - "options": { - "left": "Before query", - "right": "After query", - "both": "Before and after query" + @classmethod + def get_options(cls, parent_dataset=None, user=None): + """ + Get processor options + """ + options = { + "column": { + "type": UserInput.OPTION_TEXT, + "help": "Text column", + "default": "url", + "inline": True, + "tooltip": "Select the column containing the text from which to generate the word tree.", + }, + "query": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Word tree root query", + "tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace." + }, + "limit": { + "type": UserInput.OPTION_TEXT, + "default": 3, + "min": 1, + "max": 25, + "help": "Max branches/level", + "tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25." 
}, - "help": "Query context to visualise" - }, - "align": { - "type": UserInput.OPTION_CHOICE, - "default": "middle", - "options": { - "middle": "Vertically centered", - "top": "Top", + "window": { + "type": UserInput.OPTION_TEXT, + "min": 1, + "max": 10, + "default": 5, + "help": "Window size", + "tooltip": "Up to this many words before and/or after the queried phrase will be visualised" }, - "help": "Visual alignment" - }, - "tokeniser_type": { - "type": UserInput.OPTION_CHOICE, - "default": "regular", - "options": { - "regular": "nltk word_tokenize", - "jieba-cut": "jieba (for Chinese text; accurate mode, recommended)", - "jieba-cut-all": "jieba (for Chinese text; full mode)", - "jieba-search": "jieba (for Chinese text; search engine suggestion style)", + "sides": { + "type": UserInput.OPTION_CHOICE, + "default": "right", + "options": { + "left": "Before query", + "right": "After query", + "both": "Before and after query" + }, + "help": "Query context to visualise" }, - "help": "Tokeniser", - "tooltip": "What heuristic to use to split up the text into separate words." - }, - "strip-urls": { - "type": UserInput.OPTION_TOGGLE, - "default": True, - "help": "Remove URLs" - }, - "strip-symbols": { - "type": UserInput.OPTION_TOGGLE, - "default": True, - "help": "Remove punctuation" + "align": { + "type": UserInput.OPTION_CHOICE, + "default": "middle", + "options": { + "middle": "Vertically centered", + "top": "Top", + }, + "help": "Visual alignment" + }, + "tokeniser_type": { + "type": UserInput.OPTION_CHOICE, + "default": "regular", + "options": { + "regular": "nltk word_tokenize", + "jieba-cut": "jieba (for Chinese text; accurate mode, recommended)", + "jieba-cut-all": "jieba (for Chinese text; full mode)", + "jieba-search": "jieba (for Chinese text; search engine suggestion style)", + }, + "help": "Tokeniser", + "tooltip": "What heuristic to use to split up the text into separate words." 
+ }, + "strip-urls": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Remove URLs" + }, + "strip-symbols": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Remove punctuation" + } } - } + + # Get the columns for the select columns option + if parent_dataset and parent_dataset.get_columns(): + columns = parent_dataset.get_columns() + options["column"]["type"] = UserInput.OPTION_CHOICE + options["column"]["options"] = {v: v for v in columns} + options["column"]["default"] = "body" if "body" in columns else sorted( + columns, + key=lambda k: any([name in k for name in ["text", "subject", "description"]]), reverse=True).pop(0) + + return options @classmethod def is_compatible_with(cls, module=None, user=None): @@ -146,6 +169,7 @@ def process(self): delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]") # settings + column = self.parameters.get("column") strip_urls = self.parameters.get("strip-urls") strip_symbols = self.parameters.get("strip-symbols") sides = self.parameters.get("sides") @@ -187,7 +211,7 @@ def process(self): processed += 1 if processed % 500 == 0: self.dataset.update_status("Processing and tokenising post %i" % processed) - body = post["body"] + body = post.get(column) if not body: continue From a269f96ed0cf296400fc1d5b4252d0a6765dda52 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 8 Oct 2024 12:31:22 +0200 Subject: [PATCH 20/26] use punkt_tab instead of punkt due to pickle issue: https://github.com/nltk/nltk/issues/3293 --- helper-scripts/first-run.py | 2 +- helper-scripts/migrate.py | 4 ++-- processors/text-analysis/tokenise.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helper-scripts/first-run.py b/helper-scripts/first-run.py index dea0fd487..a565a591e 100644 --- a/helper-scripts/first-run.py +++ b/helper-scripts/first-run.py @@ -40,7 +40,7 @@ # Now check for presence of required NLTK packages import nltk -nltk_downloads = ("wordnet", "punkt", "omw-1.4") +nltk_downloads = ("wordnet", "punkt_tab", "omw-1.4") for package in nltk_downloads: # if it already exists, .download() will just NOP try: diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py index 25071afe4..55c26c044 100644 --- a/helper-scripts/migrate.py +++ b/helper-scripts/migrate.py @@ -69,9 +69,9 @@ def check_for_nltk(): # NLTK import nltk try: - nltk.data.find('tokenizers/punkt') + nltk.data.find('tokenizers/punkt_tab') except LookupError: - nltk.download('punkt', quiet=True) + nltk.download('punkt_tab', quiet=True) try: nltk.data.find('corpora/wordnet') except LookupError: diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index a104306f1..17c350c86 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -357,7 +357,7 @@ def dummy_function(x, *args, **kwargs): # for russian we use a special purpose splitter with better # performance sentence_method = razdel.sentenize - elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt')) if + elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if 'pickle' in lang]: self.dataset.update_status( f"Language {language} not available for sentence tokenizer; grouping by item/post instead.") From db5b6498acd9310a0849be4abf21ab7b04a979bc Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 8 Oct 2024 13:10:50 +0200 Subject: [PATCH 21/26] histwords: return on fail --- processors/visualisation/histwords.py | 1 + 1 file changed, 1 insertion(+) 
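Stepping back to the punkt_tab switch in PATCH 20/26 above: with an NLTK release that ships punkt_tab (the pickle-free replacement for punkt discussed in the nltk issue linked in that commit message), the check-then-download pattern used in migrate.py boils down to the sketch below. The sample sentence is illustrative only.

    import nltk

    # fetch the punkt_tab tokenizer data only if it is not already installed
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab", quiet=True)

    # recent NLTK versions then use punkt_tab for sentence splitting
    print(nltk.sent_tokenize("4CAT tokenises text. It can also split it into sentences."))
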
diff --git a/processors/visualisation/histwords.py b/processors/visualisation/histwords.py index f6ae05261..7463e9662 100644 --- a/processors/visualisation/histwords.py +++ b/processors/visualisation/histwords.py @@ -243,6 +243,7 @@ def process(self): vectors = tsne.fit_transform(vectors) except ValueError: self.dataset.finish_with_error("Insufficient data to reduce to 2D. The word embeddings model may be too small to visualise properly.") + return elif reduction_method == "TruncatedSVD": # standard sklearn parameters made explicit svd = TruncatedSVD(n_components=2, algorithm="randomized", n_iter=5, random_state=0) From d399921b7538bc06264b122c8a0d523826bb91a5 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 11 Oct 2024 17:12:21 +0200 Subject: [PATCH 22/26] Configurable model list for stormtrooper processor --- processors/machine_learning/annotate_text.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 954963de4..59016e077 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -40,6 +40,11 @@ class TextClassifier(BasicProcessor): "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Enable LLM-powered text classification", + }, + "dmi-service-manager.stormtrooper_models": { + "type": UserInput.OPTION_TEXT, + "default": "google/flan-t5-large,tiiaue/falcon-7b-instruct", + "help": "Comma-separated list of models that can be selected" } } @@ -53,8 +58,6 @@ class TextClassifier(BasicProcessor): "type": UserInput.OPTION_CHOICE, "default": "google/flan-t5-large", "options": { - "google/flan-t5-large": "google/flan-t5-large", - "tiiaue/falcon-7b-instruct": "tiiaue/falcon-7b-instruct" }, "help": "Large Language Model to use" }, @@ -97,6 +100,10 @@ def get_options(cls, parent_dataset=None, user=None): :return dict: Processor options """ options = cls.options + + models = config.get("dmi-service-manager.stormtrooper_models", user=user).split(",") + options["model"]["options"] = {m: m for m in models} + if parent_dataset is None: return options From 0bc88a3fabcb8818720ab0a6ea40e166cbcb45e3 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 11 Oct 2024 17:17:18 +0200 Subject: [PATCH 23/26] Add references to stormtrooper processor --- processors/machine_learning/annotate_text.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 59016e077..022e96de5 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -31,6 +31,13 @@ class TextClassifier(BasicProcessor): "provided categories.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + references = [ + "Annotations are made using the [Stormtrooper](https://centre-for-humanities-computing.github.io/stormtrooper/) library", + "Model card: [google/flan-t5-large](https://huggingface.co/google/flan-t5-large)", + "Model card: [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)", + "Model card: [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)" + ] + config = { "dmi-service-manager.stormtrooper_intro-1": { "type": UserInput.OPTION_INFO, From c27fbbe44175740bffa959fc21d3d98cb42758ce Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 14 Oct 2024 09:35:27 +0200 Subject: [PATCH 
24/26] Yet more ways LinkedIn stores image URLs --- datasources/linkedin/search_linkedin.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index f357341ed..a8380b4d8 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -79,7 +79,10 @@ def map_item(item): # or alternatively they are stored here: if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"): image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"] - images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"]) + if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]: + images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"]) + elif image and image.get("artifacts"): + images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"]) author = SearchLinkedIn.get_author(item) From a68f5d64de14c02368ba7b9b7dd0a1118cd15417 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 14 Oct 2024 12:21:17 +0200 Subject: [PATCH 25/26] Threads data source --- datasources/threads/DESCRIPTION.md | 9 ++++ datasources/threads/__init__.py | 12 +++++ datasources/threads/search_threads.py | 78 +++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 datasources/threads/DESCRIPTION.md create mode 100644 datasources/threads/__init__.py create mode 100644 datasources/threads/search_threads.py diff --git a/datasources/threads/DESCRIPTION.md b/datasources/threads/DESCRIPTION.md new file mode 100644 index 000000000..22f95bba8 --- /dev/null +++ b/datasources/threads/DESCRIPTION.md @@ -0,0 +1,9 @@ +The Threads data source can be used to manipulate data collected from [Threads](https://threads.net) - Meta's +microblogging platform - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected +with the browser extension; 4CAT cannot collect data on its own. After collecting data with Zeeschuimer it can be +uploaded to 4CAT for further processing and analysis. See the Zeeschuimer documentation for more information on how to +collect data with it. + +Data is collected as it is formatted internally by Threads' website. Posts are stored as (large) JSON objects; it +will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure +is relatively straightforward and contains some data not included in the CSV exports. \ No newline at end of file diff --git a/datasources/threads/__init__.py b/datasources/threads/__init__.py new file mode 100644 index 000000000..a4f019429 --- /dev/null +++ b/datasources/threads/__init__.py @@ -0,0 +1,12 @@ +""" +Initialize Threads data source +""" + +# An init_datasource function is expected to be available to initialize this +# data source. A default function that does this is available from the +# backend helpers library. 
+from common.lib.helpers import init_datasource + +# Internal identifier for this data source +DATASOURCE = "threads" +NAME = "Threads" \ No newline at end of file diff --git a/datasources/threads/search_threads.py b/datasources/threads/search_threads.py new file mode 100644 index 000000000..02c8c2de4 --- /dev/null +++ b/datasources/threads/search_threads.py @@ -0,0 +1,78 @@ +""" +Import scraped Threads data + +It's prohibitively difficult to scrape data from Threads within 4CAT itself due +to its aggressive rate limiting. Instead, import data collected elsewhere. +""" +from datetime import datetime +from urllib.parse import urlparse, parse_qs, unquote +import re + +from backend.lib.search import Search +from common.lib.item_mapping import MappedItem + + +class SearchThreads(Search): + """ + Import scraped Threads data + """ + type = "threads-search" # job ID + category = "Search" # category + title = "Import scraped Threads data" # title displayed in UI + description = "Import Threads data collected with an external tool such as Zeeschuimer." # description displayed in UI + extension = "ndjson" # extension of result file, used internally and in UI + is_from_zeeschuimer = True + + # not available as a processor for existing datasets + accepts = [None] + references = [ + "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", + "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" + ] + + def get_items(self, query): + """ + Run custom search + + Not available for 9gag + """ + raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere") + + @staticmethod + def map_item(post): + post_timestamp = datetime.fromtimestamp(post["taken_at"]) + + if post["carousel_media"]: + image_urls = [c["image_versions2"]["candidates"].pop(0)["url"] for c in post["carousel_media"] if c["image_versions2"]] + video_urls = [c["video_versions"].pop(0)["url"] for c in post["carousel_media"] if c["video_versions"]] + else: + image_urls = [post["image_versions2"]["candidates"].pop(0)["url"]] if post["image_versions2"].get("candidates") else [] + video_urls = [post["video_versions"].pop(0)["url"]] if post["video_versions"] else [] + + linked_url = "" + link_thumbnail = "" + if post["text_post_app_info"].get("link_preview_attachment"): + linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"] + linked_url = parse_qs(urlparse(linked_url).query).get("u", "").pop() + link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url") + + return MappedItem({ + "id": post["code"], + "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}", + "body": post["caption"]["text"] if post["caption"] else "", + "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"), + "author": post["user"]["username"], + "author_is_verified": "yes" if post["user"].get("is_verified") else "no", + "author_avatar": post["user"].get("profile_pic_url"), + "image_url": ",".join(image_urls), + "video_url": ",".join(video_urls), + "link_url": linked_url, + "link_thumbnail_url": link_thumbnail if link_thumbnail else "", + "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no", + "likes": post["like_count"], + "reposts": post["text_post_app_info"]["repost_count"], + "replies": post["text_post_app_info"]["direct_reply_count"], + "quotes": post["text_post_app_info"]["quote_count"], + "hashtags": 
",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "", + "unix_timestamp": int(post_timestamp.timestamp()), + }) From cbbf89ec35c782823240668d653a5e921e15794a Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 14 Oct 2024 15:09:12 +0200 Subject: [PATCH 26/26] Don't crash URL Titles when trying to extract URLs from numbers --- processors/metrics/url_titles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/metrics/url_titles.py b/processors/metrics/url_titles.py index e32e3538d..75ebd12d0 100644 --- a/processors/metrics/url_titles.py +++ b/processors/metrics/url_titles.py @@ -145,7 +145,7 @@ def process(self): self.dataset.update_status("Finding URLs in dataset") for item in self.source_dataset.iterate_items(self): # combine column contents that we need to extract URLs from - source_text = " ".join([item[column] for column in columns]) + source_text = " ".join([str(item[column]) for column in columns]) urls = ural.urls_from_text(source_text) for url in urls: