From 070035ebf0cf4065f32f00e78044bb24a22172bd Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Fri, 20 Sep 2024 15:01:31 +0200
Subject: [PATCH 01/26] Link to Bluesky in readme

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7e4ec7769..6022145f1 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,10 @@
 [![Requires Python 3.8](https://img.shields.io/badge/py-v3.8-blue)](https://www.python.org/)
 [![Docker image status](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml/badge.svg)](https://github.com/digitalmethodsinitiative/4cat/actions/workflows/docker_latest.yml)
-

4CAT has a website at 4cat.nl.

A screenshot of 4CAT, displaying its 'Create Dataset' interface; a screenshot of 4CAT, displaying a network visualisation of a dataset

+

4CAT has a website at 4cat.nl.

+

You can also [follow 4CAT on Bluesky](https://bsky.app/profile/4cat.nl) for updates

 4CAT is a research tool that can be used to analyse and process data from online social platforms. Its goal is to make the capture and analysis of data from these platforms accessible to people through a web interface, without

From 02f90bd1559d710360324e1dca116e8c5519f9fe Mon Sep 17 00:00:00 2001
From: sal-phd-desktop
Date: Fri, 20 Sep 2024 15:03:09 +0200
Subject: [PATCH 02/26] Link to Bluesky in readme

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 6022145f1..9fc84f890 100644
--- a/README.md
+++ b/README.md
@@ -9,7 +9,7 @@

A screenshot of 4CAT, displaying its 'Create Dataset' interface; a screenshot of 4CAT, displaying a network visualisation of a dataset

4CAT has a website at 4cat.nl.

-

You can also [follow 4CAT on Bluesky](https://bsky.app/profile/4cat.nl) for updates

+

Follow 4CAT on Bluesky for updates.

4CAT is a research tool that can be used to analyse and process data from online social platforms. Its goal is to make the capture and analysis of data from these platforms accessible to people through a web interface, without From dd85961696de3d01fa48cfbbac8a31a4374edc83 Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Mon, 23 Sep 2024 14:37:50 +0200 Subject: [PATCH 03/26] Only import bsky embed JS on front page, make divs wider --- webtool/static/js/bsky-embed.es.js | 4 ++-- webtool/templates/frontpage.html | 2 +- webtool/templates/layout.html | 3 +++ 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/webtool/static/js/bsky-embed.es.js b/webtool/static/js/bsky-embed.es.js index 3169355d9..6a4ae63a2 100644 --- a/webtool/static/js/bsky-embed.es.js +++ b/webtool/static/js/bsky-embed.es.js @@ -680,7 +680,7 @@ function m4(a) { function h4(a, p, l) { return arguments.length === 2 && (l = p, p = {}), Qb(a, p)(m4(l)); } -const y4 = '*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]{display:none}*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 
1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }.fixed{position:fixed}.right-5{right:1.25rem}.top-5{top:1.25rem}.col-span-2{grid-column:span 2 / span 2}.mx-1{margin-left:.25rem;margin-right:.25rem}.mx-auto{margin-left:auto;margin-right:auto}.mb-1{margin-bottom:.25rem}.mb-16{margin-bottom:4rem}.ml-10{margin-left:2.5rem}.mr-1{margin-right:.25rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.block{display:block}.inline{display:inline}.flex{display:flex}.grid{display:grid}.h-10{height:2.5rem}.h-14{height:3.5rem}.h-2{height:.5rem}.h-4{height:1rem}.max-h-\\[90vh\\]{max-height:90vh}.w-10{width:2.5rem}.w-14{width:3.5rem}.w-4{width:1rem}.w-full{width:100%}.max-w-\\[calc\\(100vw-96px\\)\\]{max-width:calc(100vw - 96px)}.max-w-screen-sm{max-width:640px}.flex-1{flex:1 1 0%}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.items-center{align-items:left}.justify-center{justify-content:center}.gap-1{gap:.25rem}.gap-2{gap:.5rem}.gap-4{gap:1rem}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * 
var(--tw-space-y-reverse))}.overflow-hidden{overflow:hidden}.text-ellipsis{text-overflow:ellipsis}.whitespace-nowrap{white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-md{border-radius:.375rem}.rounded-t-md{border-top-left-radius:.375rem;border-top-right-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-slate-300{--tw-border-opacity: 1;border-color:rgb(203 213 225 / var(--tw-border-opacity))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity))}.bg-gray-900{--tw-bg-opacity: 1;background-color:rgb(17 24 39 / var(--tw-bg-opacity))}.bg-slate-100{--tw-bg-opacity: 1;background-color:rgb(241 245 249 / var(--tw-bg-opacity))}.bg-slate-200{--tw-bg-opacity: 1;background-color:rgb(226 232 240 / var(--tw-bg-opacity))}.bg-slate-900{--tw-bg-opacity: 1;background-color:rgb(15 23 42 / var(--tw-bg-opacity))}.p-3{padding:.75rem}.p-4{padding:1rem}.px-4{padding-left:1rem;padding-right:1rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.text-sm{font-size:.875rem;line-height:1.25rem}.font-bold{font-weight:700}.font-semibold{font-weight:600}.text-blue-500{--tw-text-opacity: 1;color:rgb(59 130 246 / var(--tw-text-opacity))}.text-slate-500{--tw-text-opacity: 1;color:rgb(100 116 139 / var(--tw-text-opacity))}.text-slate-600{--tw-text-opacity: 1;color:rgb(71 85 105 / var(--tw-text-opacity))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}.underline{text-decoration-line:underline}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop\\:bg-gray-800::backdrop{--tw-bg-opacity: 1;background-color:rgb(31 41 55 / var(--tw-bg-opacity))}.backdrop\\:opacity-90::backdrop{opacity:.9}.hover\\:bg-blue-700:hover{--tw-bg-opacity: 1;background-color:rgb(29 78 216 / var(--tw-bg-opacity))}.hover\\:underline:hover{text-decoration-line:underline}.dark\\:border-slate-800:where(.dark,.dark *){--tw-border-opacity: 1;border-color:rgb(30 41 59 / var(--tw-border-opacity))}.dark\\:bg-slate-800:where(.dark,.dark *){--tw-bg-opacity: 1;background-color:rgb(30 41 59 / var(--tw-bg-opacity))}.dark\\:text-slate-400:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(148 163 184 / var(--tw-text-opacity))}.dark\\:text-white:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}'; +const y4 = '*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html,:host{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal;-webkit-tap-highlight-color:transparent}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace;font-feature-settings:normal;font-variation-settings:normal;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]{display:none}*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: 
;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }.fixed{position:fixed}.right-5{right:1.25rem}.top-5{top:1.25rem}.col-span-2{grid-column:span 2 / span 2}.mx-1{margin-left:.25rem;margin-right:.25rem}.mx-auto{margin-left:auto;margin-right:auto}.mb-1{margin-bottom:.25rem}.mb-16{margin-bottom:4rem}.ml-10{margin-left:2.5rem}.mr-1{margin-right:.25rem}.mt-4{margin-top:1rem}.mt-8{margin-top:2rem}.block{display:block}.inline{display:inline}.flex{display:flex}.grid{display:grid}.h-10{height:2.5rem}.h-14{height:3.5rem}.h-2{height:.5rem}.h-4{height:1rem}.max-h-\\[90vh\\]{max-height:90vh}.w-10{width:2.5rem}.w-14{width:3.5rem}.w-4{width:1rem}.w-full{width:100%}.max-w-\\[calc\\(100vw-96px\\)\\]{max-width:calc(100vw - 96px)}.max-w-screen-sm{max-width:640px}.flex-1{flex:1 1 0%}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}.grid-cols-2{grid-template-columns:repeat(2,minmax(0,1fr))}.grid-cols-4{grid-template-columns:repeat(4,minmax(0,1fr))}.flex-col{flex-direction:column}.items-center{align-items:normal}.justify-center{justify-content:center}.gap-1{gap:.25rem}.gap-2{gap:.5rem}.gap-4{gap:1rem}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.overflow-hidden{overflow:hidden}.text-ellipsis{text-overflow:ellipsis}.whitespace-nowrap{white-space:nowrap}.whitespace-pre-wrap{white-space:pre-wrap}.rounded{border-radius:.25rem}.rounded-full{border-radius:9999px}.rounded-md{border-radius:.375rem}.rounded-t-md{border-top-left-radius:.375rem;border-top-right-radius:.375rem}.border{border-width:1px}.border-b{border-bottom-width:1px}.border-slate-300{--tw-border-opacity: 1;border-color:rgb(203 213 225 / var(--tw-border-opacity))}.bg-blue-500{--tw-bg-opacity: 1;background-color:rgb(59 130 246 / var(--tw-bg-opacity))}.bg-gray-900{--tw-bg-opacity: 1;background-color:rgb(17 24 39 / var(--tw-bg-opacity))}.bg-slate-100{--tw-bg-opacity: 1;background-color:rgb(241 245 249 / var(--tw-bg-opacity))}.bg-slate-200{--tw-bg-opacity: 1;background-color:rgb(226 232 240 / var(--tw-bg-opacity))}.bg-slate-900{--tw-bg-opacity: 1;background-color:rgb(15 23 42 / var(--tw-bg-opacity))}.p-3{padding:.75rem}.p-4{padding:1rem}.px-4{padding-left:1rem;padding-right:1rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.text-sm{font-size:.875rem;line-height:1.25rem}.font-bold{font-weight:700}.font-semibold{font-weight:600}.text-blue-500{--tw-text-opacity: 1;color:rgb(59 130 246 / var(--tw-text-opacity))}.text-slate-500{--tw-text-opacity: 1;color:rgb(100 116 139 / var(--tw-text-opacity))}.text-slate-600{--tw-text-opacity: 1;color:rgb(71 85 105 / var(--tw-text-opacity))}.text-white{--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}.underline{text-decoration-line:underline}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop\\:bg-gray-800::backdrop{--tw-bg-opacity: 1;background-color:rgb(31 41 55 / var(--tw-bg-opacity))}.backdrop\\:opacity-90::backdrop{opacity:.9}.hover\\:bg-blue-700:hover{--tw-bg-opacity: 1;background-color:rgb(29 78 216 / var(--tw-bg-opacity))}.hover\\:underline:hover{text-decoration-line:underline}.dark\\:border-slate-800:where(.dark,.dark *){--tw-border-opacity: 
1;border-color:rgb(30 41 59 / var(--tw-border-opacity))}.dark\\:bg-slate-800:where(.dark,.dark *){--tw-bg-opacity: 1;background-color:rgb(30 41 59 / var(--tw-bg-opacity))}.dark\\:text-slate-400:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(148 163 184 / var(--tw-text-opacity))}.dark\\:text-white:where(.dark,.dark *){--tw-text-opacity: 1;color:rgb(255 255 255 / var(--tw-text-opacity))}'; var Bu = { exports: {} }; (function(a) { var p = Object.create, l = Object.defineProperty, m = Object.getOwnPropertyDescriptor, g = Object.getOwnPropertyNames, A = Object.getPrototypeOf, b = Object.prototype.hasOwnProperty, R = (t, i) => function() { @@ -25594,7 +25594,7 @@ const k4 = ({ return J(F, y4, null), J(F, b, null), F; })(), (() => { var F = _4(), G = F.firstChild, ee = G.firstChild, ie = ee.nextSibling; - return Su(F, `${m} max-w-screen-sm mx-auto flex flex-col items-center`), J(F, (() => { + return Su(F, `${m} mx-auto flex flex-col items-center`), J(F, (() => { var ge = tt(() => Ce().length > 0); return () => ge() && Ce().map((Xe, hr) => (() => { var Ae = S4(); diff --git a/webtool/templates/frontpage.html b/webtool/templates/frontpage.html index 6851b9df8..d8b41b1ee 100644 --- a/webtool/templates/frontpage.html +++ b/webtool/templates/frontpage.html @@ -21,7 +21,7 @@

4CAT updates

+ + {% if navigation.current == "about" %} + {% endif %} From 07094f8ef071a526ac06d43a31a454f3bec42640 Mon Sep 17 00:00:00 2001 From: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon, 23 Sep 2024 15:50:08 +0200 Subject: [PATCH 04/26] telegram crawl improvements (#444) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * telegram crawl improvements * Squashed commit of the following: commit dd85961696de3d01fa48cfbbac8a31a4374edc83 Author: sal-phd-desktop Date: Mon Sep 23 14:37:50 2024 +0200 Only import bsky embed JS on front page, make divs wider commit 02f90bd1559d710360324e1dca116e8c5519f9fe Author: sal-phd-desktop Date: Fri Sep 20 15:03:09 2024 +0200 Link to Bluesky in readme commit e675dd04a9ffb45cc72704763b7553fee6cf59a2 Merge: 070035eb 38418b2e Author: sal-phd-desktop Date: Fri Sep 20 15:01:45 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 070035ebf0cf4065f32f00e78044bb24a22172bd Author: sal-phd-desktop Date: Fri Sep 20 15:01:31 2024 +0200 Link to Bluesky in readme commit 38418b2ec1533f5e13c8d3f001903db0bfdab4af Author: Sal Hagen Date: Thu Sep 19 17:27:00 2024 +0200 Host BlueSky widget ourselves commit e281eb8bdfad3ec4c800bec2a64e6ff3263a2f74 Author: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> Date: Thu Sep 19 15:32:08 2024 +0200 Refactor module loading (#396) * Refactor module loading * Optionally inject modules when instantiating dataset object * pass modules in a few more places where possible I think that is everywhere in the frontend. Backend is a bit odd as we are passing dataset.modules when it is None and thus creating children that would require individual inits of ModuleCollector. Could be more to look at there. * Do not lazy-load modules * modules/all_modules * Squashed commit of the following: commit 3f2a62a124926cfeb840796f104a702878ac10e5 Author: Carsten Schnober Date: Wed Sep 18 18:18:29 2024 +0200 Update Gensim to >=4.3.3, <4.4.0 (#450) * Update Gensim to >=4.3.3, <4.4.0 * update nltk as well --------- Co-authored-by: Dale Wahl Co-authored-by: Sal Hagen commit fee2c8c08617094f28496963da282d2e2dddeab7 Merge: 3d94b666 f8e93eda Author: sal-phd-desktop Date: Wed Sep 18 18:11:19 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 3d94b666cedd0de4e0bee953cbf1d787fdc38854 Author: sal-phd-desktop Date: Wed Sep 18 18:11:04 2024 +0200 FINALLY remove 'News' from the front page, replace with 4CAT BlueSky updates and potential information about the specific server (to be set on config page) commit f8e93edabe9013a2c1229caa4c454fab09620125 Author: Stijn Peeters Date: Wed Sep 18 15:11:21 2024 +0200 Simple extensions page in Control Panel commit b5be128c7b8682fb233d962326d9118a61053165 Author: Stijn Peeters Date: Wed Sep 18 14:08:13 2024 +0200 Remove 'docs' directory commit 1e2010af44817016c274c9ec9f7f9971deb57f66 Author: Stijn Peeters Date: Wed Sep 18 14:07:38 2024 +0200 Forgot TikTok and Douyin commit c757dd51884e7ec9cf62ca1726feacab4b2283b7 Author: Stijn Peeters Date: Wed Sep 18 14:01:31 2024 +0200 Say 'zeeschuimer' instead of 'extension' to avoid confusion with 4CAT extensions commit ee7f4345478f923541536c86a5b06246deae03f6 Author: Stijn Peeters Date: Wed Sep 18 14:00:40 2024 +0200 RIP Parler data source commit 11300f2430b51887823b280405de4ded4f15ede1 Author: Stijn Peeters Date: Wed Sep 18 11:21:37 2024 +0200 Tuplestring commit 547265240eba81ca0ad270cd3c536a2b1dcf512d Author: Stijn Peeters Date: Wed Sep 18 11:15:29 2024 +0200 
Pass user obj instead of str to ConfigWrapper in Processor commit b21866d7900b5d20ed6ce61ee9aff50f3c0df910 Author: Stijn Peeters Date: Tue Sep 17 17:45:01 2024 +0200 Ensure request-aware config reader in user object when using config wrapper commit bbe79e4b0fe870ccc36cab7bfe7963b28d1948e3 Author: Sal Hagen Date: Tue Sep 17 15:12:46 2024 +0200 Fix extension path walk for Windows commit d6064beaf31a6a85b0e34ed4f8126eb4c4fc07e3 Author: Stijn Peeters Date: Mon Sep 16 14:50:45 2024 +0200 Allow tags that have no users Use case: tag-based frontend differentiation using X-4CAT-Config-Via-Proxy commit b542ded6f976809ec88445e7b04f2c81b900188e Author: Stijn Peeters Date: Mon Sep 16 14:13:14 2024 +0200 Trailing slash in query results list commit a4bddae575b22a009925206a1337bdd89349e567 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon Sep 16 13:57:23 2024 +0200 4CAT Extension - easy(ier) adding of new datasources/processors that can be mainted seperately from 4CAT base code (#451) * domain only * fix reference * try and collect links with selenium * update column_filter to find multiple matches * fix up the normal url_scraper datasource * ensure all selenium links are strings for join * change output of url_scraper to ndjson with map_items * missed key/index change * update web archive to use json and map to 4CAT * fix no text found * and none on scraped_links * check key first * fix up web_archive error reporting * handle None type for error * record web archive "bad request" * add wait after redirect movement * increase waittime for redirects * add processor for trackers * dict to list for addition * allow both newline and comma seperated links * attempt to scrape iframes as seperate pages * Fixes for selenium scraper to work with config database * installation of packages, geckodriver, and firefox if selenium enabled * update install instructions * fix merge error * fix dropped function * have to be kidding me * add note; setup requires docker... need to think about IF this will ever be installed without Docker * seperate selenium class into wrapper and Search class so wrapper can be used in processors! * add screenshots; add firefox extension support * update selenium definitions * regex for extracting urls from strings * screenshots processor; extract urls from text and takes screenshots * Allow producing zip files from data sources * import time * pick better default * test screenshot datasource * validate all params * fix enable extension * haha break out of while loop * count my items * whoops, len() is important here * must be getting tired... * remove redundant logging * Eager loading for screenshots, viewport options, etc * Woops, wrong folder * Fix label shortening * Just 'queue' instead of 'search queue' * Yeah, make it headless * README -> DESCRIPTION * h1 -> h2 * Actually just have no header * Use proper filename for downloaded files * Configure whether to offer pseudonymisation etc * Tweak descriptions * fix log missing data * add columns to post_topic_matrix * fix breadcrumb bug * Add top topics column * Fix selenium config install parameter (Docker uses this/manual would need to run install_selenium, well, manually) * this processor is slow; i thought it was broken long before it updated! 
* refactor detect_trackers as conversion processor not filter * add geckodriver executable to docker install * Auto-configure webdrivers if available in PATH * update screenshots to act as image-downloader and benefit from processors * fix is_compatible_with * Delete helper-scripts/migrate/migrate-1.30-1.31.py * fix embeddings is_compatible_with * fix up UI options for hashing and private * abstract was moved to lib * various fixes to selenium based datasources * processors not compatible with image datasets * update firefox extension handling * screenshots datasource fix get_options * rename screenshots processor to be detected as image dataset * add monthly and weekly frequencies to wayback machine datasource * wayback ds: fix fail if all attempts do not realize results; addion frequency options to options; add daily * add scroll down page to allow lazy loading for entire page screenshots * screenshots: adjust pause time so it can be used to force a wait for images to load I have not successfully come up with or found a way to wait for all images to load; document.readyState == 'complete' does not function in this way on certain sites including the wayback machine * hash URLs to create filenames * remove log * add setting to toggle display advanced options * add progress bars * web archive fix query validation * count subpages in progress * remove overwritten function * move http response to own column * special filenames * add timestamps to all screenshots * restart selenium on failure * new build have selenium * process urls after start (keep original query parameters) * undo default firefox * quick max * rename SeleniumScraper to SeleniumSearch todo: build SeleniumProcessor! * max number screenshots configurable * method to get url with error handling * use get_with_error_handling * d'oh, screenshot processor needs to quit selenium * update log to contain URL * Update scrolling to use Page down key if necessary * improve logs * update image_category_wall as screenshot datasource does not have category column; this is not ideal and ought to be solved in another way. Also, could I get categories from the metadata? That's... ugh. * no category, no processor * str errors * screenshots: dismiss alerts when checking ready state is complete * set screenshot timeout to 30 seconds * update gensim package * screenshots: move processor interrupt into attempts loop * if alert disappears before we can dismiss it... * selenium specific logger * do not switch window when no alert found on dismiss * extract wait for page to load to selenium class * improve descriptions of screenshot options * remove unused line * treat timeouts differently from other errors these are more likely due to an issue with the website in question * debug if requested * increase pause time * restart browser w/ PID * increase max_workers for selenium this is by individual worker class not for all selenium classes... 
so you can really crank them out if desired * quick fix restart by pid * avoid bad urls * missing bracket & attempt to fix-missing dependencies in Docker install * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * basic processor for apple store; setup checks for additional requirements * fix is_4cat_class * show preview when no map_item * add google store datasource * Docker setup.py use extensions * Wider support for file upload in processors * Log file uploads in DMI service manager * add map_item methods and record more data per item need additional item data as map_item is staticmethod * update from master; merge conflicts * fix docker build context (ignore data files) * fix option requirements * apple store fix: list still tries to get query * apple & google stores fix up item mapping * missed merge error * minor fix * remove unused import * fix datasources w/ files frontend error * fix error w/ datasources having file option * better way to name docker volumes * update two other docker compose files * fix docker-compose ymls * minor bug: fix and add warning; fix no results fail * update apple field names to better match interface * update google store fieldnames and order * sneak in jinja logger if needed * fix fourcat.js handling checkboxes for dynamic settings * add new endpoint for app details to apple store * apple_store map new beta app data * add default lang/country * not all apps have advisories * revert so button works * add chart positions to beta map items * basic scheduler To-do - fix up and add options to scheduler view (e.g. delete/change) - add scheduler view to navigator - tie jobs to datasets? (either in scheduler view or, perhaps, filter dataset view) - more testing... * update scheduler view, add functions to update job interval * revert .env * working scheduler! * basic scheduler view w/ datasets * fix postgres tag * update job status in scheduled_jobs table * fix timestamp; end_date needed for last run check; add dataset label * improve scheduler view * remove dataset from scheduled_jobs table on delete * scheduler view order by last creation * scheduler views: separate scheduler list from scheduled dataset list * additional update from master fixes * apple_store map_items fix missing locales * add back depth for pagination * correct route * modify pagination to accept args * pagination fun * pagination: i hate testing on live servers... * ok ok need the pagination route * pagination: add route_args * fix up scheduler header * improve app store descriptions * add azure store * fix azure links * azure_store: add category search * azure fix type of config update timestamp OPTION_DATE does not appear correctly in settings and causes it to be written incorrectly * basic aws store * check if selenium available; get correct app_id * aws: implement pagination * add logging; wait for elements to load after next page; attempts to rework filter option collection * apple_store: handle invalid param error * fix filter_options * aws: fix filter option collection! 
* more merge * move new datasources and processors to extensions and modify setup.py and module loader to use the new locations * migrate.py to run extension "fourcat_install.py" files * formatting * remove extensions; add gitignore * excise scheduler merge * some additional cleanup from app_studies branch * allow nested datasources folders; ignore files in extensions main folder * allow extension install scripts to run pip if migrate.py has not * Remove unused URL functions we could use ural for * Take care of git commit hash tracking for extension processors * Get rid of unused path.versionfile config setting * Add extensions README * Squashed commit of the following: commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset * Fix Github links for extensions * Fix commit detection in extensions * Fix extension detection in module loader * Follow symlinks when loading extensions Probably not uncommon to have a checked out repo somewhere to then symlink into the extensions dir * Make queue message on create page more generic * Markdown in datasource option tooltips * Remove Spacy model from requirements * Add software_source to database SQL --------- Co-authored-by: Stijn Peeters Co-authored-by: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 
2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset commit 4ba872bef2968f7f8bf5831fd3a4f413420b36ed Author: Dale Wahl Date: Tue Aug 27 13:04:46 2024 +0200 fix hatebase: default column option for OPTION_MULTI_SELECT must be list commit e276033542f2d22e7f614f318a01d65114a21482 Author: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed Aug 21 12:53:10 2024 +0200 Bump nltk from 3.6.7 to 3.9 (#447) Bumps [nltk](https://github.com/nltk/nltk) from 3.6.7 to 3.9. - [Changelog](https://github.com/nltk/nltk/blob/develop/ChangeLog) - [Commits](https://github.com/nltk/nltk/compare/3.6.7...3.9) --- updated-dependencies: - dependency-name: nltk dependency-type: direct:production ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> commit 1d749c3cf83b130ba70bdb09174f382d6711a14b Author: sal-phd-desktop Date: Wed Aug 21 12:52:54 2024 +0200 Set UTF-8 encoding when opening stop words (fixes Windows bug) commit a03e5fd4252e7242563c291558606440256eb3d1 Author: Dale Wahl Date: Mon Aug 19 14:19:21 2024 +0200 remove duplicate line commit aa07e8c13c2d59c6b699f78133036514659ee420 Author: Dale Wahl Date: Mon Jul 29 09:35:22 2024 +0200 tweet import fix: author banner key missing when author has no banner commit 32dac5d2ffb936210f12f5c725514fd25a0286f1 Author: Dale Wahl Date: Mon Jul 29 08:52:08 2024 +0200 tell user when dataset is not found we could have a proper 404 page, but at least leave a message commit 2c8c860fc5378113d1352016ac26ca761adecb32 Author: Dale Wahl Date: Mon Jul 22 17:41:00 2024 +0200 telegram fix: reactions datastructure commit 1c0bf5e580eb16d8a6f9afa415f9febce449a537 Author: Dale Wahl Date: Mon Jul 22 11:19:52 2024 +0200 fix telegram: crawl_max_depth can be None if it is not enabled for a user commit 3dfe7af292b33574a31630e3a0da10954ed87d0a Author: Dale Wahl Date: Fri Jul 19 11:52:31 2024 +0200 fix more config.get() magic commit 2453182bcee6e54b396b762ab77b60b8a0893638 Author: Dale Wahl Date: Fri Jul 19 10:54:23 2024 +0200 config_manager - fix `get_all` w/ one results (super rare edge); fix overwriting self.db in `with_db` commit 6b9cb0b5479e6e64e09a49fa2ca9effe1c5a7415 Author: Dale Wahl Date: Wed Jul 17 15:20:49 2024 +0200 add surf nginx init file commit 5e984e13a08d9fba7d5806a7ef4e012ce7d57319 Author: Dale Wahl Date: Wed Jul 17 14:30:34 2024 +0200 change port for surf commit 2ce8c354e90f939a16dad3f0155fd7d79405c79e Author: Dale Wahl Date: Wed Jul 17 12:54:11 2024 +0200 use latest image on surf commit 13ec0fd3f2bed86c3b2dff73014093a6a92fbfb5 Author: Dale Wahl Date: Wed Jul 17 12:46:59 2024 +0200 update surf docker-compose.yml this may require a new release commit 78698f6ac1b22b1154d31f69543ba7b266d33191 Author: Dale Wahl Date: Wed Jul 17 10:34:56 2024 +0200 clip: handle new and old format commit eb7693780cb191403f107817ca30d90373929bf0 Author: Dale Wahl Date: Tue Jul 16 14:27:08 2024 +0200 DMI SM updates to use status endpoint w/ database records; run on CPU if no GPU enabled commit d2a787e2c1559417bb5401f3208c82954052504f Author: Stijn Peeters Date: Mon Jul 15 15:58:06 2024 +0200 Require most recent Telethon 
version commit 346150bd9cc96ac099abd4d15fa3de39bd65e9d1 Author: Stijn Peeters Date: Mon Jul 15 15:57:55 2024 +0200 Catch UPDATE_APP_TO_LOGIN in Telegram commit 04acc06e95098d7e2f9b4af404447c9cfaee5b99 Author: Stijn Peeters Date: Mon Jul 15 11:27:30 2024 +0200 Unbreak Twitter error handling commit e9b5232a963be02c2e86dabacb607b2315a4e0e6 Author: Stijn Peeters Date: Fri Jul 12 13:27:15 2024 +0200 Ensure str type when trying to extract video URLs from a field commit d69dd6f337cac05ed31c05334890679976a1e6de Author: Stijn Peeters Date: Fri Jul 12 12:31:14 2024 +0200 Make CSV column mapping params look nicer on result page commit 9bd9da568f593085a8d54744836e3290a75b51a7 Author: Stijn Peeters Date: Fri Jul 12 12:22:03 2024 +0200 Add "empty" and "current timestamp" as options to CSV mapping commit 0b574571952a206904440faf8601ddf95ab42b24 Author: Dale Wahl Date: Thu Jul 11 16:59:56 2024 +0200 image_wall: backup fit method commit eeb1ddeb7ca85b6802dfed3c74d1352062383d50 Merge: 2504c37b 43239467 Author: Stijn Peeters Date: Thu Jul 11 16:47:45 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 43239467db046eea5eb5268f91d1b63a1042238d Author: Dale Wahl Date: Thu Jul 11 12:08:08 2024 +0200 fix processor more button would only show top level analysis if not logged in commit d6ab2b0783f8e40ecd8fadbc2abccffa6f093e39 Author: Dale Wahl Date: Tue Jul 9 15:35:25 2024 +0200 search_gab - use MappedItem commit 2504c37b67ff6f19720b44d8bb6054b1c3d5a155 Author: Stijn Peeters Date: Sat Jul 6 17:51:22 2024 +0200 Fix multiline spacing in multi select list commit fea66ce38be0717da6c1f847e7124f7069c096e2 Author: Dale Wahl Date: Fri Jul 5 13:15:45 2024 +0200 use processor media_type if dataset does not have media_type; set default media_type for downloaders commit d41fa34514e8177efdac7e64a31f2ee75c7d1652 Author: Dale Wahl Date: Fri Jul 5 12:57:18 2024 +0200 video_hasher: handle no metadata file commit 2820dcecc36ed4705a2776064d387ff7ed14e84f Author: Dale Wahl Date: Fri Jul 5 12:50:09 2024 +0200 num_rows not num_items() commit fb09162db902fa22fdf2d7a3ed171ce1489bd92f Author: Dale Wahl Date: Fri Jul 5 12:44:03 2024 +0200 Google vision API returning 400s; properly log and record processed entries; google networks should not run on empty datasets commit ebf39d8262d199895aedc4f7fa275c5685e58563 Author: Dale Wahl Date: Fri Jul 5 12:28:13 2024 +0200 fix image_category_wall whoops, cleared categories and post_values after filling them! 
commit 1ad9ec2c2e76604793ec37584c051f116af2fdab Author: Stijn Peeters Date: Fri Jul 5 12:03:54 2024 +0200 fsdfdsgd sorry commit c7254c08a477c6cdc8497507e8452c3eff7101c9 Author: Stijn Peeters Date: Fri Jul 5 12:01:21 2024 +0200 Fix razdel versioning commit b9a327abe99f2d9ede4f2747f34f20d1dc6803cb Author: Stijn Peeters Date: Fri Jul 5 11:57:47 2024 +0200 Reorganise tokeniser, stopwords commit fb13bc483af9ba0d677ee35fd045bf36ab1cddf7 Merge: 0b745692 e3046496 Author: Stijn Peeters Date: Fri Jul 5 11:56:08 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit e30464964262870c54c73f65a3bce630d6576f45 Author: Dale Wahl Date: Fri Jul 5 10:51:53 2024 +0200 media_upload allow setting for max_form_part and warn users of failure above certain number of files commit e4f982b4550b352a5d1a131abd78d52e6c196e48 Author: Dale Wahl Date: Fri Jul 5 09:50:49 2024 +0200 Update media_import help text; looks like failure happens somewhere between 600-1000 files due to Flask request size limits commit 0b74569280f8f87376a964a6b160ea1993cb3354 Author: Stijn Peeters Date: Thu Jul 4 17:55:36 2024 +0200 Add razdel as option for Russian tokenisation commit 9f15a2b8d666c3b6fddeb151b7c424cb44df18a6 Author: Dale Wahl Date: Thu Jul 4 17:13:15 2024 +0200 remove the log commit ffcb6a4239075ba190fb534b25b89507e09e5f56 Author: Dale Wahl Date: Thu Jul 4 17:12:43 2024 +0200 Inform user if too many files are uploaded I do not understand why this is appearing. app.config['MAX_CONTENT_LENGTH'] is set to None. Problem persists in Flask alone (i.e., does not appear to be Gunicorn/Nginx/Apache). commit 9cad12dd6f64a63c48d3b5b304b5c7d9d1a6ddb7 Author: Stijn Peeters Date: Thu Jul 4 15:09:42 2024 +0200 Bump version commit aad94f393de77cc9d4f578e1f5be66a3601a4c90 Author: Dale Wahl Date: Thu Jul 4 10:51:01 2024 +0200 Update setup.py to ensure videohash updates commit d9154a6f9c46a5c793909b88da751bc71d6f759f Author: Dale Wahl Date: Tue Jul 2 17:45:26 2024 +0200 clip: categorizing requires categories... seriously, guys? commit 0af9a5ec49bd2bcfbb87bda33976c65683f68777 Author: Dale Wahl Date: Tue Jul 2 17:31:49 2024 +0200 blip2: fix no metadata file found (uploads...) commit d695053f440bd938a57f06adea7b9c732ecf30d7 Author: Dale Wahl Date: Tue Jul 2 17:25:26 2024 +0200 cat_vis_wall - use str as category type if mixed i.e., use floats as string categories commit bcb914076760ea1fb0e277cdcd1782ffa101b535 Author: Sal Hagen Date: Tue Jul 2 16:06:43 2024 +0200 Add Twitter author profile pic and banner URLs commit 1b3b02f826578e8f702ea84a27c8ced7b1fab345 Author: Dale Wahl Date: Tue Jul 2 11:42:50 2024 +0200 add migrate.py log file in Docker commit 2aaa972e6888743fc329d721c37fa626cf2eeae3 Author: Dale Wahl Date: Tue Jul 2 11:42:22 2024 +0200 add necessary pip packages for upgrade in Docker environment; add error logging and save to file for trouble shooting commit 18b8a53c01b334e0f70610b1305d380b25dbe9c6 Author: Dale Wahl Date: Tue Jul 2 11:41:36 2024 +0200 update Dockerfile to keep build environment useful for interactive upgrade commit 7b224b9b798c9aaf956b5b618b98d742c4a2e7cd Author: Dale Wahl Date: Tue Jul 2 11:41:12 2024 +0200 remove docker-compose.yml versions commit acf5de0ed02e144b920a80abfdfa35986dd0ed4c Author: Stijn Peeters Date: Mon Jul 1 17:38:32 2024 +0200 Better issues.md, footer link commit 1953ca3895656ca9a12d2657e58019795ae64b3a Author: Dale Wahl Date: Mon Jul 1 12:00:07 2024 +0200 FIX: get_key() is more of a creating of a key then general getting of a key... 
commit 12289bb5c766d1af23799ff11278b46b48fc2841 Author: Dale Wahl Date: Mon Jul 1 11:37:06 2024 +0200 .metadata.json may not have top_parent via Media Uploader This may exist in other processors if a proper check is not in place; will need to review commit 25f4ed65ec2c32298a90490cf51037a7ea2d0bf9 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Tue Jun 25 14:43:40 2024 +0200 Media upload datasource! (#419) * basic changes to allow files box * basic imports, yay! * video_scene_timelines to work on video imports! * add is_compatible_with checks to processors that cannot run on new media top_datasets * more is_compatible fixes * necessary function for checking media_types * enable more processors on media datasets * consolidate user_input file type * detect mimetype from filename best I can do without downloading all the files first. * handle zip archives; allow log and metadata files * do not count metadata or log files in num_files * move machine learning processors so they can be imported elsewhere * audio_to_text datasource * When validating zip file uploads, send list of file attributes instead of the first 128K of the zip file * Check type of files in zip when uploading media * Skip useless files when uploading media as zip * check multiple zip types in JS * js !=== python * fix media_type for loose file imports; fix extension for audio_to_text preset; fix merge for some processors w/ media_type --------- Co-authored-by: Stijn Peeters commit 4ce689bdc3e441a7adf85883ddcda6bae0525ed9 Author: Stijn Peeters Date: Mon Jun 24 11:58:50 2024 +0200 Avoid KeyError commit 155522d0817d19ac7b6b0b0164242156d6f7443a Author: Dale Wahl Date: Thu Jun 20 15:58:21 2024 +0200 add generated images to image wall w/ text visual commit eecde519eab1208eeb6ee53c2d8febff7fb8febf Author: Dale Wahl Date: Thu Jun 20 15:57:56 2024 +0200 allow users to NOT generate all images from prompts commit d0b9574093a109997e63b1062b2bdd8e71300a29 Author: Stijn Peeters Date: Wed Jun 19 16:28:26 2024 +0200 ...don't mangle URLs in preview links commit c105e368a521ec54ae717bb9eb2fe9fae66cf6e8 Merge: 0028a999 8d4f99b2 Author: Dale Wahl Date: Wed Jun 19 16:25:36 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 0028a9994d698611dd8b546b9b3bccbeec30b74f Author: Dale Wahl Date: Wed Jun 19 16:25:12 2024 +0200 add followups to processors commit 8d4f99b22e0308606c7f713ef704dfa939e85247 Author: Stijn Peeters Date: Wed Jun 19 16:17:22 2024 +0200 More flexible URL linking in CSV preview commit f4f8e6621bd6f2504dc3afc2078280bf5edb6444 Author: Dale Wahl Date: Wed Jun 19 13:54:00 2024 +0200 tokeniser fix: use default lang for word_tokenize if language is 'other' commit 127472e91d8e510f3de2a9cc4a87be6cf2d0deaa Author: Stijn Peeters Date: Tue Jun 18 16:45:01 2024 +0200 Better log messages for Telegram data source commit e8714b6fba72e00c690a8d643d8dc54d2250c94a Author: Stijn Peeters Date: Mon Jun 17 17:42:21 2024 +0200 Add 'crawl' feature to Telegram data source Fixes #321 (though might need a bit more testing) commit 25fded7b596097f7916e1793f1841bae2b63d453 Merge: d67cf440 b10e3bb8 Author: sal-phd-desktop Date: Fri Jun 14 16:23:02 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit d67cf440730ea1d4e124c76a4c21d65b56f39c68 Author: sal-phd-desktop Date: Fri Jun 14 16:22:59 2024 +0200 Fix export 4chan script and remove some unecessary code commit b10e3bb8f0c8a67aa5fdbba1962301d8acdf625c Author: Dale Wahl Date: Thu Jun 13 15:14:06 2024 +0200 
video_hasher prefix: fix extension type commit ba565cdaa2ebeecf23fd60889d546c76b9ea5eb1 Author: Dale Wahl Date: Thu Jun 13 14:53:13 2024 +0200 video_hasher: fix to work with Pillow updates; add max amount videos commit 90da5d231eff6a4249bef5468fcdbf1ebcf9247a Author: Dale Wahl Date: Thu Jun 13 10:25:24 2024 +0200 image_cat_wall fix the fix commit a8b943d8e2c5471f82ea0442e2659d84fe8d9760 Author: Dale Wahl Date: Wed Jun 12 13:29:41 2024 +0200 add OCR processor to image w/ text visualization commit e7e636b6b89b6163fa6976e67edba68e7d75b7ac Author: Dale Wahl Date: Tue Jun 11 15:23:12 2024 +0200 add image_wall_w_text to follow on BLIP captions commit f74b97827f0465baf8483040471a77e4654e70b1 Author: Dale Wahl Date: Thu Jun 6 11:05:25 2024 +0200 image_category_wall: allow multiple images per item/post commit e3c9ea57d46b32ba47b00a6047a278ddd530adc1 Author: Dale Wahl Date: Thu May 30 16:27:50 2024 +0200 image_category_wall convert None to str for category commit 00874576c354235f4655f1d433ec4382010e18e3 Author: Dale Wahl Date: Thu May 30 14:54:51 2024 +0200 image_category_wall fix float categories commit e0c55a8ae132bedef5da27ecbbb9489a094d454c Author: Dale Wahl Date: Thu May 30 12:51:42 2024 +0200 download_images fix divide by zero when user can download all commit 3580fc9450501262badb8e61ef4b4df4b4c54322 Author: Dale Wahl Date: Thu May 30 12:51:24 2024 +0200 image_category_wall remove 'max' when user can use all images commit f2145bdeff1d68e46cdd3521ecbb61573f01a2f2 Author: Dale Wahl Date: Wed May 29 17:59:23 2024 +0200 rank_attributes: option to count missing data or blanks commit 01e7ab9677a75181bbedc62fa00e636ce2b17c18 Author: Dale Wahl Date: Wed May 29 16:53:57 2024 +0200 fix missing field strategy so default_stategy not overwritten on second loop default_stategy would be set to correctly to the callable, but overwritten on second loop (and map_missing is a dictionary at that point). commit 097f838af1f5f2748578dd9072eb9e3a8b3a7057 Author: Dale Wahl Date: Tue May 28 12:16:08 2024 +0200 add log_level arg to 4cat-daemon.py I've been using this forever and don't know why I haven't commited it commit fd3ac238e60f052889d99c71588170570a384900 Author: Dale Wahl Date: Tue May 28 10:10:56 2024 +0200 google & clarifai to csv had identical "type" possibly caused issue w/ preset commit 1b9965d40aa33035a73f685c13a1ab50cc877f78 Author: Stijn Peeters Date: Mon May 27 15:54:20 2024 +0200 Ensure file cleanup worker always exists commit 0e0917f2232e240df3412fd4df51cf0be19248b5 Author: Stijn Peeters Date: Thu May 23 17:36:22 2024 +0200 Also update Spacy model versions... commit f40128213529d154cfb77afa7aa67a72d5bb640f Author: Stijn Peeters Date: Thu May 23 17:32:35 2024 +0200 *Actually* remove typing_extensions dependency ??? 
commit ba3d83b824c5fb6fcb0aec5e1c36b35070d6e5d9 Author: Stijn Peeters Date: Thu May 23 17:30:08 2024 +0200 Update minimum Pillow dependency version commit 1c3485648bf2a911052eeeae4f293f303a944aec Author: Stijn Peeters Date: Thu May 23 17:27:27 2024 +0200 Do not require typing_extensions explicitly This was required to ensure Spacy could load - looks like Spacy has since been updated to work with newer versions of typing_extensions as well commit 3828de83ba123254463a904392f24daec626c136 Author: Stijn Peeters Date: Thu May 23 17:02:04 2024 +0200 Bump version commit 8f0d098107a4bbc9d55cc6048f7a38f1d1891a32 Author: Stijn Peeters Date: Thu May 23 17:01:28 2024 +0200 Require non-broken version of emoji library commit 4b2ad805fcc99a83e46732fc991d98d78ef06c6c Author: Stijn Peeters Date: Thu May 23 13:11:03 2024 +0200 Show worker progress in control panel if available commit 9144d4503f46108437616d6bc0cf4fde74df3aca Author: Stijn Peeters Date: Thu May 23 11:07:41 2024 +0200 Bump version commit 807ab77101d197ec897640480a2140439d570c05 Author: Stijn Peeters Date: Wed May 22 21:57:11 2024 +0200 Fix Instagram upload with missing media URL commit d0b4840fd465b6d21657c3d50f9291ac911b6082 Author: Stijn Peeters Date: Wed May 22 17:35:04 2024 +0200 Comma comma comma commit 7fd2e14c9505d0ed1ac77dc09c24f766ea61ee6c Author: Stijn Peeters Date: Wed May 22 17:25:26 2024 +0200 Fix progress indicator for scene extractor commit 661c42c2d083da7004335b0e14910935c3d392f6 Author: Stijn Peeters Date: Wed May 22 17:12:21 2024 +0200 Don't crash video hasher non non-str item IDs commit 1f280321cdde27a9909885fa2f64dbeffa549fb1 Author: Stijn Peeters Date: Wed May 22 17:09:53 2024 +0200 Do not crash timelines processor when metadata has unexpected format commit 572d03f1f368f0ad5f47e705a119b37646148d1d Author: Stijn Peeters Date: Wed May 22 17:09:30 2024 +0200 More efficient video frame extractor commit 1b51d224ca544d7e2913238adbff2049412bc41e Author: Stijn Peeters Date: Wed May 22 17:04:27 2024 +0200 Fix crash in video stack processor with ffmpeg < 5.1 commit ddc73cb2e2f0985e64f84ca86bc167fa9e9dc81a Author: Stijn Peeters Date: Wed May 22 17:03:48 2024 +0200 Helper function for determining ffmpeg version commit ef9dd482b2258c428584997dc661156f63f68b91 Author: Stijn Peeters Date: Wed May 22 12:14:58 2024 +0200 Allow absence of articleComponent in LinkedIn posts commit 060f2cd7f922e7fae337b0697f7c477442d21ef1 Author: Stijn Peeters Date: Wed May 22 12:12:54 2024 +0200 Cast post IDs to string when mapping video scenes commit ab34c415c9ada23763b45676639ce3e80a34f594 Author: Stijn Peeters Date: Wed May 22 11:46:39 2024 +0200 Twitter -> X/Twitter commit de6d97554ccb68375979e5ff09c7e65d8d70a6cd Author: Stijn Peeters Date: Wed May 22 11:45:19 2024 +0200 Colleges -> Collages commit 30365580dc59b4d95e8a62d1b3c666bef60ce7e8 Author: Stijn Peeters Date: Tue May 21 15:41:55 2024 +0200 Explicit disconnect after Telegram image download commit 5727ff7230db42463a824f45d63f0b8343caac14 Author: Stijn Peeters Date: Tue May 21 14:05:50 2024 +0200 Catch TimedOutError while downloading Telegram images commit e0e06686e78976f971aac620267d7e009eaaadff Author: Sal Hagen Date: Mon May 13 13:01:42 2024 +0200 Typo in LinkedIn search commit 51e58dde6ca21278a80f252a8c22dc83d87ace1f Author: Dale Wahl Date: Tue May 7 13:10:43 2024 +0200 text_from_image: fix metadata missing (indent issue) commit c1f8ecc1674375bba2b2e38cb29c9d4d44098f0a Author: Dale Wahl Date: Tue May 7 09:45:25 2024 +0200 text_from_image fix: ensure metadata success before attempting to update 
original commit 72dbf80db71499c59133e1128205b756d240b300 Merge: d7561625 baacc86b Author: Stijn Peeters Date: Fri May 3 13:14:08 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit d7561625b127573fbb0332fbb713be6a3cb3d953 Author: Stijn Peeters Date: Fri May 3 13:14:03 2024 +0200 Comments without replies don't always have reply_comment_total commit baacc86b269612b4b0956345f8b9fa902df1b61f Author: Dale Wahl Date: Fri May 3 12:01:22 2024 +0200 DSM fix and simplify GPU mem check commit 9b662e9f9b4f4ce194608c8e20a8fc50bc6d9ae3 Author: Parker-Kasiewicz <110084850+Parker-Kasiewicz@users.noreply.github.com> Date: Thu May 2 00:53:45 2024 -0700 Adding Gab as a Data Source! (#401) * Can successfully import gab data, although can't tell if formatting is right becuase waiting on queued requests. * Version w/ different item types * Ingest Gab posts from Zeeschuimer * Small fix for merge conflicts (whoops) * Gab processing logic transferred from Zeeschuimer * fixing small errors for Gab data source * basic processing for truth social from Zeeschuimer --------- Co-authored-by: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> commit 3ecb8fd9c27aee4c457f03516794c6c4eac19c09 Author: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> Date: Wed May 1 17:51:36 2024 +0200 Fix duplicate line in views_admin.py commit 8b66ae7e467913f8e7571cf4b45493f63804266f Author: Stijn Peeters Date: Wed May 1 17:49:54 2024 +0200 Allow processors to define which fields should be pseudonymised commit c973750c8cabb8698704c5997903e92d1de866d2 Author: Stijn Peeters Date: Wed May 1 17:15:32 2024 +0200 Allow auto-queue of pseudonymisation after import commit 49ad9f0ff785fd44ae494755b785c7fdf7c9cf15 Author: Stijn Peeters Date: Wed May 1 17:08:35 2024 +0200 Get rid of redundant and buggy next/copy_to implementation in Search class commit 106d3659e2fda89867d3a4f587c1c1addfaff2f7 Author: Dale Wahl Date: Wed May 1 16:14:03 2024 +0200 use current branch in settings commit 60bef4157d807f7c01ef3b425295244e91919f31 Author: Stijn Peeters Date: Wed May 1 11:04:07 2024 +0200 Nicer code commit 4182c436e4fb5109c5e041dc729f77a58d877889 Author: Stijn Peeters Date: Tue Apr 30 16:19:36 2024 +0200 Always shut down API worker only after everything else has been shut down commit e685108b3cbe5f005ce2df21906267071ad8118e Author: Stijn Peeters Date: Tue Apr 30 16:12:42 2024 +0200 Properly interrupt expiration worker when asked commit 27a568eca7f2f3742223fef6285eaf80583e0fc4 Author: Stijn Peeters Date: Tue Apr 30 13:40:50 2024 +0200 Allow floats-as-strings as timestamps when importing CSV commit 2d2bbb9fdb9b426b8f4a80782f04257721a97f2e Author: Dale Wahl Date: Tue Apr 30 13:05:07 2024 +0200 douyin: add consistency to map_item stats commit 289aa342c9912aceeca35887c079c72aa6ffbf52 Author: Dale Wahl Date: Mon Apr 29 15:26:38 2024 +0200 fix collection data in Douyin to handle $undefined commit 5b9b23fb1696bc1b69e1d902c0a2ad4b7d168984 Author: Dale Wahl Date: Mon Apr 29 13:00:03 2024 +0200 add scipy requirement to make compatible with gensim https://stackoverflow.com/questions/78279136/importerror-cannot-import-name-triu-from-scipy-linalg-gensim commit 7eab746e944f1ababe3dcd6a5d25387a64c2237d Author: Stijn Peeters Date: Mon Apr 29 12:00:09 2024 +0200 stupid, stupid, stupid commit 90577982ac05019a7ac76818a62f91e84dd65902 Author: Stijn Peeters Date: Mon Apr 29 11:56:22 2024 +0200 Fix leftover iterate_mapped_items commit 57dbdf74c49c34c05784debb9f7e258da7ae7d54 Author: Stijn Peeters Date: Fri Apr 26 15:26:39 2024 
+0200 Woops commit f11760d2c13e817e23cfa5e26b24f74cf817f65e Author: Stijn Peeters Date: Fri Apr 26 15:26:04 2024 +0200 Update list of supported platforms in readme commit 760ff1cdeb006f70acaa00ded82fb3cbc7617c9d Author: Stijn Peeters Date: Fri Apr 26 12:13:28 2024 +0200 Bump version commit 1fd78b2362840299e80f5540c9fedc1be3b06da1 Author: Stijn Peeters Date: Thu Apr 25 12:58:24 2024 +0200 Use MissingMappedField for Douyin fields undefined in the source data commit 6918baeabc7a08b6a63495c5d38c86b2c88bca44 Author: Stijn Peeters Date: Thu Apr 25 12:31:11 2024 +0200 Fix Douyin mapping failure if cellRoom is $undefined commit aad6208167c07686348234daff4dcf9cd036f5a5 Author: Stijn Peeters Date: Thu Apr 25 12:30:53 2024 +0200 Better error when trying to import data for unknown datasource commit 43c6ed646994111188bde66d5bcfe4ab602e8512 Author: Stijn Peeters Date: Thu Apr 25 12:30:31 2024 +0200 Fix Twitter mapping on URLs that cannot be expanded commit 91c3da176fad90ba16871fa8892fac5a0df13785 Author: Stijn Peeters Date: Thu Apr 25 12:12:54 2024 +0200 Safe cast to int in CrowdTangle import commit 765f29e9232afdf284ab1667b0f371951e0bf2f4 Author: Stijn Peeters Date: Wed Apr 24 12:37:02 2024 +0200 Fix erroneous shell command in front-end restart trigger commit c99fdd9eca8f5925d93375cac846e8b7633194fb Merge: 342a4037 bc1deddf Author: Stijn Peeters Date: Tue Apr 23 12:29:35 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 342a4037411e7ccaa50b25a4686434bec39e2568 Author: Stijn Peeters Date: Tue Apr 23 12:29:32 2024 +0200 Enable TikTok comment and Gab import by default commit bc1deddf57aa5049fb79622c4309fb7051d77bdb Merge: 537d7645 3c644f01 Author: Dale Wahl Date: Tue Apr 23 12:16:37 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 537d76456e2826e8c4dd7026ec5b2d436370fad8 Author: Dale Wahl Date: Tue Apr 23 12:14:46 2024 +0200 do the todo: fix column_filter to match exact/contains with int commit 3c644f01baeca34e712d36efdf5c77ccd3ef7a06 Author: Stijn Peeters Date: Tue Apr 23 11:16:07 2024 +0200 Don't crash on empty URLs in dataset merge commit f1574c26e2e3bdc40cc04bb8193cf6d3fa14792b Author: Dale Wahl Date: Thu Apr 18 12:08:55 2024 +0200 fix: do not fail when no processor exists weird! failed on a dataset `type="custom-search"` which was created by an import script w/ no processor. Also likely would make deprecated processors fail. 
500 server error: ``` File "/opt/4cat/common/lib/dataset.py", line 800, in get_columns return self.get_item_keys(processor=self.get_own_processor()) File "/opt/4cat/common/lib/dataset.py", line 405, in get_item_keys keys = list(items.__next__().keys()) File "/opt/4cat/common/lib/dataset.py", line 337, in iterate_items if own_processor.map_item_method_available(dataset=self): AttributeError: 'NoneType' object has no attribute 'map_item_method_available' ``` commit 50a4434a37d71af6a9470c7fc4a236b043cbfb4d Author: Stijn Peeters Date: Wed Apr 17 14:30:58 2024 +0200 Add "TikTok comments" data source commit c43e76daae3c2e6ecdb218ee749315b985eccca4 Author: Stijn Peeters Date: Tue Apr 16 17:59:25 2024 +0200 Allow notifications per tag commit 36984104e674e8577756bfc3fdd5c72f6569d9e1 Author: Dale Wahl Date: Tue Apr 16 17:25:38 2024 +0200 fix: pass dataset to get_options when queuing processors commit 59cb19a3c88f7f4a4ac02d0b7a891afde50ea069 Author: Dale Wahl Date: Tue Apr 16 10:55:29 2024 +0200 fix: dicts are shared in classes & you cannot delete a key more than once randomly found this; probably as no one else has reddit enabled! commit 3ec9c6ea471bcdbe9fb1caad1e5fe1502a705444 Author: Dale Wahl Date: Mon Apr 15 13:22:19 2024 +0200 fix results page error when dataset was being created; do not check for resultspage updates when user not focused on page commit db05ae5e565248e865e67b8ea60e6653357bb1f4 Author: Dale Wahl Date: Mon Apr 15 11:27:33 2024 +0200 on import file, differentiate between missing field(s) and unable to map item commit 940bac72c7e53bec9e136867c13e2a0a355961a4 Author: Stijn Peeters Date: Fri Apr 12 12:57:48 2024 +0200 Case-insensitive username/note matching in user list commit d0f34245bd07b5ad2fd3e90754ef0264ffc350a9 Author: Stijn Peeters Date: Fri Apr 12 12:29:12 2024 +0200 Only determine settings tab name in one place commit 9f69d7bc0bbb657be1e725d5fb3fe350b7205bff Author: Stijn Peeters Date: Fri Apr 12 12:20:34 2024 +0200 git != github commit 9b4981d8c7358f31ed65d9f161d556e578389801 Author: Stijn Peeters Date: Fri Apr 12 11:56:04 2024 +0200 Fix issues with user tags Fix number of users in tag overview; allow filtering by user tags on user list; don't delete all user tags when deleting one commit 9e8ccd3a78765acdfd2005eaa215dc0dc07266e0 Author: Stijn Peeters Date: Fri Apr 12 11:32:45 2024 +0200 Do not hide all non-hidden child processors lol commit 3f15410af3a278f5644f41f49e25498a1fac3c76 Author: Stijn Peeters Date: Fri Apr 12 11:23:52 2024 +0200 Disable standard video downloader for Telegram commit 94c814b9cab2ae2be10d5c5d3f6cfe20898e349c Author: Stijn Peeters Date: Fri Apr 12 11:14:16 2024 +0200 Telegram video downloader processor commit d36254a188947fff507e8df59f793e98b3be1570 Author: Stijn Peeters Date: Fri Apr 12 11:14:04 2024 +0200 Better styling for 4CAT settings, alphabetic order, submenus commit 808300fa109f306a921f2048b2cf4b6dafc4ba5f Author: Stijn Peeters Date: Thu Apr 11 14:44:32 2024 +0200 Fix multiselect in UI commit 131a0eca0ad514b1ee57803e5c560ab0e56de42d Author: Stijn Peeters Date: Mon Apr 8 18:28:04 2024 +0200 Do not attempt to load crashed file as module in Slack webhook. 
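The traceback quoted above comes from `DataSet.get_own_processor()` returning `None` for dataset types that no longer have a registered processor (datasets created by import scripts, or deprecated processors). A minimal sketch of the defensive check that avoids the AttributeError; `iterate_raw_items` is a hypothetical stand-in, not an actual 4CAT method:

```
# Hypothetical sketch: only consult the processor's mapping logic when a
# processor class actually exists for this dataset type.
def iterate_items(dataset):
    own_processor = dataset.get_own_processor()

    if own_processor is not None and own_processor.map_item_method_available(dataset=dataset):
        # the processor knows how to map raw items to flat dictionaries
        yield from dataset.iterate_mapped_items()
    else:
        # no processor available (import script, deprecated type):
        # yield items as they appear in the result file
        yield from dataset.iterate_raw_items()
```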
Fixes #422 (hopefully) commit 6d8cb067bc12f8be68749f74a7291e0849494225 Author: Stijn Peeters Date: Fri Apr 5 19:43:58 2024 +0200 Allow comma-separated list when adding new dataset owners commit 2612aea49f63c37ac691cc89c553c764ead2344f Author: Stijn Peeters Date: Fri Apr 5 19:40:04 2024 +0200 Include number of users with tag on tag page commit 39f2ec40faa3b8493bd5525279aeaeb2e4f586e0 Author: Stijn Peeters Date: Fri Apr 5 19:26:02 2024 +0200 Fix confirmation before deleting user tag commit b00a410a3441e7f2a9d73a9f2dfb0f4ef70ea8a5 Author: Stijn Peeters Date: Fri Apr 5 19:25:01 2024 +0200 Add link to users with tag on tag admin page commit 3ef3e5ec9adbd8ddd128ce2b3f8fa3b1de1297e3 Author: Stijn Peeters Date: Fri Apr 5 18:49:25 2024 +0200 Give filtered datasets a more sensible label, based on source dataset commit 0d5870b78fb73cb58231736cc8a2efbb0b3cd88a Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Fri Apr 5 17:40:57 2024 +0200 update iterate methods (#418) * working to make iterate_mapped_item primary method used by processors and elsewhere in 4CAT; iterate_item method only internally (and provide item directly as is from file) with iterate_mapped_object as intermediate method to use map_missing method and handle missing values as well as warn if needed * switch from iterate_items to iterate_mapped_items; careful attention to item_to_yield allowing a choice of the original item, the mapped item, or both * revert some unecessary renaming * fix annotations bug... this fixes the bug, but i noticed that the notations saved in the database do not have the correct post IDs. * Introduce DatasetItem class and simplify iterate_items * Don't crash when no item mapper * ...actually commit the DatasetItem class * Fix typos in comment --------- Co-authored-by: Stijn Peeters Co-authored-by: Sal Hagen commit 17b77351c51ace21b7057276bbae9da2643a3fc4 Author: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> Date: Fri Apr 5 16:20:19 2024 +0200 Allow dynamic form options in processors (#397) * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * Wider support for file upload in processors * Log file uploads in DMI service manager * fix error w/ datasources having file option * fix fourcat.js use of checkboxes for dynamic settings * Fix faulty toggleButton targeting --------- Co-authored-by: Dale Wahl commit 693fcedc93ee4476a60d0e0876e688f82a8526fa Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Fri Apr 5 15:59:10 2024 +0200 Add method to processors to toggle display in UI (#411) * add ui_only parameter to DataSet.get_available_processors() and BasicProcessor.display_in_ui() Allow using `display_in_ui` to hide processors from UI but allow them to be queued either via API or presets. This avoids issue of is_compatible_with() having to be used to hide processors with sometimes ill effects. * keep same data structure.... * don't delete twice; it's redundant... 
and raises an error * Rename arguments/properties * Exclude hidden processors in top level view * fix logic * Exclude in child template as well --------- Co-authored-by: Stijn Peeters commit 3cd146c2908da6b3a06a0c1511bf042c4223af0f Author: Dale Wahl Date: Thu Apr 4 16:41:39 2024 +0200 fix: whoops remove debug commit daa7291e813e62fed4600a4acb8430004836cb86 Author: Dale Wahl Date: Thu Apr 4 15:16:30 2024 +0200 CSV preview add hyperlinks if "url" or "link" in column header commit 5f2d6e65bad4f71b2c3cc75d2cdab76f15671d4c Author: Dale Wahl Date: Thu Apr 4 15:16:01 2024 +0200 blip2 processor to work w/ DMI Service Manager commit fe881dec18778d99ac4a0f60ca40a1f43fdb1689 Author: Dale Wahl Date: Thu Apr 4 09:53:30 2024 +0200 catch AttributeError on slackhook if unable to read file ever vigilant against a lack of flavour... commit 2808256b1fabf2e6e8a5a94aad98af60c50fb7b0 Merge: 14123847 eb474640 Author: Dale Wahl Date: Wed Apr 3 17:28:40 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 14123847b5852bf0e7c84fced6c2380165ec93f6 Author: Dale Wahl Date: Wed Apr 3 17:28:38 2024 +0200 staging_areas should not be made for completed datasets (else they may be deleted prematurely) commit eb474640559ee3e914d9c95adb60be09b906f1d6 Merge: bbdf2ab9 3f8b285c Author: sal-phd-desktop Date: Wed Apr 3 16:50:54 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit bbdf2ab9b4292c14911ac01b481c829defa85e5c Author: sal-phd-desktop Date: Wed Apr 3 16:50:36 2024 +0200 Helper script to export the 'classic' 4CAT 4chan data commit 3f8b285c44c33a3ce08e885889b311bc454a70ea Merge: 8f40f3f5 f7cc5b8d Author: Sal Hagen Date: Wed Apr 3 12:12:17 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 8f40f3f5222a63e93f46eb3b57791d10060a0cc8 Author: Sal Hagen Date: Wed Apr 3 12:12:13 2024 +0200 Tumblr search typo commit f7cc5b8d012dec3d8e0c8847ae16c662e82040b5 Author: Stijn Peeters Date: Tue Apr 2 12:32:51 2024 +0200 More/less flavour in restart worker commit 073587efc581adca0608988573ac83ea8b0c93d0 Author: Dale Wahl Date: Wed Mar 27 14:15:27 2024 +0100 create favicon.ico (remove from repo) be sure to keep webtool/static/img/favicon/favicon-bw.ico as basis commit 28d733d56204231f4089660ff61282174aac7aed Author: Dale Wahl Date: Wed Mar 27 09:44:45 2024 +0100 add allow_access_request check to request-password page clicking it would only return the user to the login page anyway, but better not even show it commit 1f2cb77e3cb0fc9b5403da52aaa925b33089d18f Author: Dale Wahl Date: Wed Mar 27 09:37:51 2024 +0100 fix can_request_access to use 4cat.allow_access_request option commit 0d66f11d3619af798d5acc41dbf4fe118b7ddad8 Merge: 25825383 05b3fc07 Author: Stijn Peeters Date: Tue Mar 26 17:54:48 2024 +0100 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 2582538303e31470ed6bf8a01645f7b45af15e5d Author: Stijn Peeters Date: Tue Mar 26 17:54:45 2024 +0100 More permissive timeout for pixplot commit 05b3fc0771ded10dc55db799e8f47e42add08d43 Author: Dale Wahl Date: Tue Mar 26 14:01:59 2024 +0100 remove redundant call of Path commit e4a93442efb84d73d6a4c9af9bc46a8f3e3fdda2 Author: Stijn Peeters Date: Tue Mar 26 11:52:09 2024 +0100 Include column with link description in Telegram mapping commit 876f4a4b6df51ec4b30a048c32191438b6778f90 Author: Dale Wahl Date: Mon Mar 25 14:48:47 2024 +0100 douyin handle image posts commit 81ad61baabaf965b1c848f55a80c23bd3e1a9000 Author: Stijn Peeters Date: Mon Mar 25 08:01:44 
2024 +0100 Accept non-numeric IDs in Telegram image downloader commit a8b36dc5682df7c16e25474ea8fdbfc4f12f9d46 Author: Stijn Peeters Date: Sun Mar 24 23:15:51 2024 +0100 Ensure unique IDs for Telegram datasets commit 4a3e9ffee072c4d3efb7bfd8744369b46f19eef2 Merge: 0c119130 d749237e Author: Stijn Peeters Date: Sun Mar 24 22:56:59 2024 +0100 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 0c11913049aabb5a83ffe26d58bdf17affdbc0b9 Author: Stijn Peeters Date: Sun Mar 24 20:09:10 2024 +0100 Better string formatting in Telegram image downloader commit 8a7da5317defdafb5bdbf74dcbeb68e464fa21f4 Author: Stijn Peeters Date: Sun Mar 24 20:06:06 2024 +0100 Add 'link thumbnails' op… --------- Co-authored-by: Stijn Peeters --- datasources/telegram/search_telegram.py | 85 ++++++++++++++++--------- 1 file changed, 55 insertions(+), 30 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 55c3a61b7..2028772b9 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -234,6 +234,7 @@ def get_items(self, query): self.details_cache = {} self.failures_cache = set() + #TODO: This ought to yield as we're holding everything in memory; async generator? execute_queries() also needs to be modified for this results = asyncio.run(self.execute_queries()) if not query.get("save-session"): @@ -326,9 +327,10 @@ async def execute_queries(self): except Exception as e: # catch-all so we can disconnect properly # ...should we? - self.dataset.update_status("Error scraping posts from Telegram") - self.log.error(f"Telegram scraping error: {traceback.format_exc()}") - return [] + self.dataset.update_status("Error scraping posts from Telegram; halting collection.") + self.log.error(f"Telegram scraping error (dataset {self.dataset.key}): {traceback.format_exc()}") + # May as well return what was captured, yes? + return posts finally: await client.disconnect() @@ -364,12 +366,13 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # has been mentioned. When crawling is enabled and this exceeds the # given threshold, the entity is added to the query crawl_references = {} - queried_entities = list(queries) - full_query = list(queries) + full_query = set(queries) + num_queries = len(queries) # we may not always know the 'entity username' for an entity ID, so # keep a reference map as we go entity_id_map = {} + query_id_map= {} # Collect queries # Use while instead of for so we can change queries during iteration @@ -383,17 +386,18 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): delay = 10 retries = 0 processed += 1 - self.dataset.update_progress(processed / len(full_query)) + self.dataset.update_progress(processed / num_queries) if no_additional_queries: - # Note that we are note completing this query + # Note that we are not completing this query self.dataset.update_status(f"Rate-limited by Telegram; not executing query {entity_id_map.get(query, query)}") continue while True: self.dataset.update_status(f"Retrieving messages for entity '{entity_id_map.get(query, query)}'") + entity_posts = 0 + discovered = 0 try: - entity_posts = 0 async for message in client.iter_messages(entity=query, offset_date=max_date): entity_posts += 1 total_messages += 1 @@ -413,11 +417,14 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # the channel a message was forwarded from (but that # needs extra API requests...) 
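The TODO added near the top of the hunk above notes that `execute_queries()` holds every collected post in memory before returning. A hedged sketch of the async-generator alternative it hints at, with simplified, hypothetical signatures rather than the shipped code:

```
# Hypothetical streaming variant: yield messages as they arrive instead of
# accumulating them all in one list before returning.
async def execute_queries_streaming(client, queries, max_items):
    collected = 0
    for query in queries:
        async for message in client.iter_messages(entity=query):
            yield message  # caller serializes/writes each item immediately
            collected += 1
            if collected >= max_items:
                return
```

Adopting this shape would also require `get_items()` to consume an async generator rather than the list currently returned via `asyncio.run()`, as the TODO itself points out.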
serialized_message = SearchTelegram.serialize_obj(message) - if "_chat" in serialized_message and query not in entity_id_map and serialized_message["_chat"]["id"] == query: - # once we know what a channel ID resolves to, use the username instead so it is easier to - # understand for the user - entity_id_map[query] = serialized_message["_chat"]["username"] - self.dataset.update_status(f"Fetching messages for entity '{entity_id_map[query]}' (channel ID {query})") + if "_chat" in serialized_message: + # Add query ID to check if queries have been crawled previously + full_query.add(serialized_message["_chat"]["id"]) + if query not in entity_id_map and serialized_message["_chat"]["id"] == query: + # once we know what a channel ID resolves to, use the username instead so it is easier to + # understand for the user + entity_id_map[query] = serialized_message["_chat"]["username"] + self.dataset.update_status(f"Fetching messages for entity '{entity_id_map[query]}' (channel ID {query})") if resolve_refs: serialized_message = await self.resolve_groups(client, serialized_message) @@ -427,29 +434,46 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): break # if crawling is enabled, see if we found something to add to the query - if crawl_max_depth and (not crawl_msg_threshold or depth_map.get(query) < crawl_msg_threshold): + if crawl_max_depth and (depth_map.get(query) < crawl_max_depth): message_fwd = serialized_message.get("fwd_from") fwd_from = None - if message_fwd and message_fwd["from_id"] and message_fwd["from_id"].get("_type") == "PeerChannel": - # even if we haven't resolved the ID, we can feed the numeric ID - # to Telethon! this is nice because it means we don't have to - # resolve entities to crawl iteratively - fwd_from = int(message_fwd["from_id"]["channel_id"]) + if message_fwd and message_fwd.get("from_id"): + if message_fwd["from_id"].get("_type") == "PeerChannel": + # Legacy(?) data structure (pre 2024/7/22) + # even if we haven't resolved the ID, we can feed the numeric ID + # to Telethon! this is nice because it means we don't have to + # resolve entities to crawl iteratively + fwd_from = int(message_fwd["from_id"]["channel_id"]) + elif message_fwd and message_fwd.get("from_id", {}).get('full_chat',{}): + # TODO: do we need a check here to only follow certain types of messages? this is similar to resolving, but the types do not appear the same to me + # Note: message_fwd["from_id"]["channel_id"] == message_fwd["from_id"]["full_chat"]["id"] in test cases so far + fwd_from = int(message_fwd["from_id"]["full_chat"]["id"]) + else: + self.log.warning(f"Telegram (dataset {self.dataset.key}): Unknown fwd_from data structure; unable to crawl") + + # Check if fwd_from or the resolved entity ID is already queued or has been queried + if fwd_from and fwd_from not in full_query and fwd_from not in queries: - if fwd_from and fwd_from not in queried_entities and fwd_from not in queries: # new entity discovered! 
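The hunk above replaces the single forward check with per-entity bookkeeping: a forwarded-from entity only becomes a new query once it has been referenced at least `crawl_msg_threshold` times, and never beyond `crawl_max_depth`. A simplified sketch of that bookkeeping (names mirror the diff, but the function itself is illustrative):

```
# depth_map starts with the user's original queries at depth 0
def register_reference(fwd_from, query, queries, full_query, depth_map,
                       crawl_references, crawl_msg_threshold, crawl_max_depth):
    if fwd_from in full_query or fwd_from in queries:
        return False  # already queued or already collected

    # an entity may be discovered several times; keep the shallowest depth
    depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1)
    crawl_references[fwd_from] = crawl_references.get(fwd_from, 0) + 1

    # promote to a query only once it has been referenced often enough
    if crawl_references[fwd_from] >= crawl_msg_threshold:
        queries.append(fwd_from)
        full_query.add(fwd_from)
        return True
    return False
```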
# might be discovered (before collection) multiple times, so retain lowest depth depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1) - if depth_map[query] < crawl_max_depth: - if fwd_from not in crawl_references: - crawl_references[fwd_from] = 0 - - crawl_references[fwd_from] += 1 - if crawl_references[fwd_from] >= crawl_msg_threshold and fwd_from not in queries: - queries.append(fwd_from) - full_query.append(fwd_from) - self.dataset.update_status(f"Discovered new entity {entity_id_map.get(fwd_from, fwd_from)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") - + if fwd_from not in crawl_references: + crawl_references[fwd_from] = 0 + crawl_references[fwd_from] += 1 + + # Add to queries if it has been referenced enough times + if crawl_references[fwd_from] >= crawl_msg_threshold: + queries.append(fwd_from) + full_query.add(fwd_from) + num_queries += 1 + discovered += 1 + self.dataset.update_status(f"Discovered new entity {entity_id_map.get(fwd_from, fwd_from)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") + + serialized_message["4CAT_metadata"] = { + "collected_at": datetime.now().isoformat(), # this is relevant for rather long crawls + "query": query, # possibly redundant, but we are adding non-user defined queries by crawling and may be useful to know exactly what query was used to collect an entity + "query_depth": depth_map.get(query, 0) + } yield serialized_message if entity_posts >= max_items: @@ -502,6 +526,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): delay *= 2 continue + self.dataset.log(f"Completed {entity_id_map.get(query, query)} with {entity_posts} messages (discovered {discovered} new entities)") break async def resolve_groups(self, client, message): From c67a046137d916df3bb707f2243542d289045a06 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 16:21:58 +0200 Subject: [PATCH 05/26] Telegram mapping fixes --- datasources/telegram/search_telegram.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 2028772b9..525430910 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -437,6 +437,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): if crawl_max_depth and (depth_map.get(query) < crawl_max_depth): message_fwd = serialized_message.get("fwd_from") fwd_from = None + fwd_source_type = None if message_fwd and message_fwd.get("from_id"): if message_fwd["from_id"].get("_type") == "PeerChannel": # Legacy(?) data structure (pre 2024/7/22) @@ -444,18 +445,26 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # to Telethon! this is nice because it means we don't have to # resolve entities to crawl iteratively fwd_from = int(message_fwd["from_id"]["channel_id"]) + fwd_source_type = "channel" elif message_fwd and message_fwd.get("from_id", {}).get('full_chat',{}): # TODO: do we need a check here to only follow certain types of messages? 
this is similar to resolving, but the types do not appear the same to me # Note: message_fwd["from_id"]["channel_id"] == message_fwd["from_id"]["full_chat"]["id"] in test cases so far fwd_from = int(message_fwd["from_id"]["full_chat"]["id"]) + fwd_source_type = "channel" + elif message_fwd and message_fwd.get("from_id", {}).get('full_user',{}): + # forwards can also come from users + # these can never be followed, so don't add these to the crawl, but do document them + fwd_source_type = "user" else: + print(json.dumps(message_fwd)) self.log.warning(f"Telegram (dataset {self.dataset.key}): Unknown fwd_from data structure; unable to crawl") + fwd_source_type = "unknown" # Check if fwd_from or the resolved entity ID is already queued or has been queried - if fwd_from and fwd_from not in full_query and fwd_from not in queries: - + if fwd_from and fwd_from not in full_query and fwd_from not in queries and fwd_source_type not in ("user",): # new entity discovered! # might be discovered (before collection) multiple times, so retain lowest depth + print(f"Potentially crawling {fwd_from}") depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1) if fwd_from not in crawl_references: crawl_references[fwd_from] = 0 @@ -728,6 +737,9 @@ def map_item(message): if from_data and from_data.get("from_name"): forwarded_name = message["fwd_from"]["from_name"] + if from_data and from_data.get("users") and len(from_data["users"]) > 0 and "user" not in from_data: + from_data["user"] = from_data["users"][0] + if from_data and ("user" in from_data or "chats" in from_data): # 'resolve entities' was enabled for this dataset if "user" in from_data: @@ -779,7 +791,7 @@ def map_item(message): "body": message["message"], "reply_to": message.get("reply_to_msg_id", ""), "views": message["views"] if message["views"] else "", - "forwards": message.get("forwards", MissingMappedField(0)), + # "forwards": message.get("forwards", MissingMappedField(0)), "reactions": reactions, "timestamp": datetime.fromtimestamp(message["date"]).strftime("%Y-%m-%d %H:%M:%S"), "unix_timestamp": int(message["date"]), From 36913490481910d8c68c66c8680da2e8cf1218bb Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 17:46:36 +0200 Subject: [PATCH 06/26] Cast to string when lowercasing in rank_attributes --- processors/metrics/rank_attribute.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/metrics/rank_attribute.py b/processors/metrics/rank_attribute.py index 0e38757c6..9b90b3c7b 100644 --- a/processors/metrics/rank_attribute.py +++ b/processors/metrics/rank_attribute.py @@ -203,7 +203,7 @@ def missing_value_placeholder(data, field_name): # keep track of occurrences of found items per relevant time period for value in values: if to_lowercase: - value = value.lower() + value = str(value).lower() if rank_style == "overall" and value not in overall_top: continue From 959710ab613bd201c5cf56bb01b9e1e7d6ee84e5 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 23 Sep 2024 18:36:07 +0200 Subject: [PATCH 07/26] Find Telegram crawl refs in message body --- datasources/telegram/search_telegram.py | 78 ++++++++++++++++++------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 525430910..c963e47f6 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -6,6 +6,7 @@ import hashlib import asyncio import json +import ural import time 
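The one-line rank_attribute change above guards against non-string values: mapped fields can contain integers or None, on which calling `.lower()` directly raises AttributeError. A small illustration:

```
values = ["Telegram", 12345, None, "telegram"]

# cast first, then lowercase; value.lower() alone would fail on 12345 and None
normalised = [str(value).lower() for value in values]
# -> ['telegram', '12345', 'none', 'telegram']
```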
import re @@ -24,7 +25,7 @@ FloodWaitError, ApiIdInvalidError, PhoneNumberInvalidError, RPCError from telethon.tl.functions.channels import GetFullChannelRequest from telethon.tl.functions.users import GetFullUserRequest -from telethon.tl.types import User +from telethon.tl.types import User, MessageEntityMention @@ -214,6 +215,14 @@ def get_options(cls, parent_dataset=None, user=None): "tooltip": "Entities need to be references at least this many times to be added to the query. Only " "references discovered below the max crawl depth are taken into account." } + options["crawl-via-links"] = { + "type": UserInput.OPTION_TOGGLE, + "default": False, + "help": "Extract new groups from links", + "tooltip": "Look for references to other groups in message content via t.me links and @references. " + "This is more error-prone than crawling only via forwards, but can be a way to discover " + "links that would otherwise remain undetected." + } return options @@ -358,6 +367,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): crawl_max_depth = self.parameters.get("crawl-depth", 0) crawl_msg_threshold = self.parameters.get("crawl-threshold", 10) + crawl_via_links = self.parameters.get("crawl-via-links", False) self.dataset.log(f"Max crawl depth: {crawl_max_depth}") self.dataset.log(f"Crawl threshold: {crawl_msg_threshold}") @@ -434,6 +444,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): break # if crawling is enabled, see if we found something to add to the query + linked_entities = set() if crawl_max_depth and (depth_map.get(query) < crawl_max_depth): message_fwd = serialized_message.get("fwd_from") fwd_from = None @@ -451,7 +462,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # Note: message_fwd["from_id"]["channel_id"] == message_fwd["from_id"]["full_chat"]["id"] in test cases so far fwd_from = int(message_fwd["from_id"]["full_chat"]["id"]) fwd_source_type = "channel" - elif message_fwd and message_fwd.get("from_id", {}).get('full_user',{}): + elif message_fwd and (message_fwd.get("from_id", {}).get('full_user',{}) or message_fwd.get("from_id", {}).get("_type") == "PeerUser"): # forwards can also come from users # these can never be followed, so don't add these to the crawl, but do document them fwd_source_type = "user" @@ -460,23 +471,50 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): self.log.warning(f"Telegram (dataset {self.dataset.key}): Unknown fwd_from data structure; unable to crawl") fwd_source_type = "unknown" + if fwd_from: + linked_entities.add(fwd_from) + + + if crawl_via_links: + # t.me links + all_links = ural.urls_from_text(serialized_message["message"]) + all_links = [link.split("t.me/")[1] for link in all_links if ural.get_hostname(link) == "t.me"] + for link in all_links: + if link.startswith("+"): + # invite links + continue + + entity_name = link.split("?")[0].split("#")[0] + linked_entities.add(entity_name) + + # @references + references = [r for t, r in message.get_entities_text() if type(t) is MessageEntityMention] + for reference in references: + if reference.startswith("@"): + reference = reference[1:] + + linked_entities.add(reference) + # Check if fwd_from or the resolved entity ID is already queued or has been queried - if fwd_from and fwd_from not in full_query and fwd_from not in queries and fwd_source_type not in ("user",): - # new entity discovered! 
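The link-based crawling added above derives candidate entities from two sources: t.me URLs found in the message text (via ural) and @mentions. A condensed sketch of that extraction; the `/`-splitting and length guard mirror the later "Clean link references" and "t.me without path" fixes, and the regex fallback for mentions reflects the approach used in `map_item`:

```
import re
import ural

def entities_from_text(text):
    entities = set()

    for link in ural.urls_from_text(text):
        if ural.get_hostname(link) == "t.me" and len(link.split("t.me/")) > 1:
            handle = link.split("t.me/")[1]
            if handle.startswith("+"):
                continue  # invite links cannot be crawled
            entities.add(handle.split("/")[0].split("?")[0].split("#")[0])

    # fallback for @mentions when Telethon entity objects are not available
    entities.update(re.findall(r"@([^\s\W]+)", text))
    return entities
```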
- # might be discovered (before collection) multiple times, so retain lowest depth - print(f"Potentially crawling {fwd_from}") - depth_map[fwd_from] = min(depth_map.get(fwd_from, crawl_max_depth), depth_map[query] + 1) - if fwd_from not in crawl_references: - crawl_references[fwd_from] = 0 - crawl_references[fwd_from] += 1 - - # Add to queries if it has been referenced enough times - if crawl_references[fwd_from] >= crawl_msg_threshold: - queries.append(fwd_from) - full_query.add(fwd_from) - num_queries += 1 - discovered += 1 - self.dataset.update_status(f"Discovered new entity {entity_id_map.get(fwd_from, fwd_from)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") + for link in linked_entities: + if link not in full_query and link not in queries and fwd_source_type not in ("user",): + # new entity discovered! + # might be discovered (before collection) multiple times, so retain lowest depth + # print(f"Potentially crawling {link}") + depth_map[link] = min(depth_map.get(link, crawl_max_depth), depth_map[query] + 1) + if link not in crawl_references: + crawl_references[link] = 0 + crawl_references[link] += 1 + + # Add to queries if it has been referenced enough times + if crawl_references[link] >= crawl_msg_threshold: + queries.append(link) + full_query.add(link) + num_queries += 1 + discovered += 1 + self.dataset.update_status(f"Discovered new entity {entity_id_map.get(link, link)} in {entity_id_map.get(query, query)} at crawl depth {depth_map[query]}, adding to query") + + serialized_message["4CAT_metadata"] = { "collected_at": datetime.now().isoformat(), # this is relevant for rather long crawls @@ -1012,7 +1050,6 @@ def validate_query(query, request, user): return { "items": num_items, "query": ",".join(sanitized_items), - "board": "", # needed for web interface "api_id": query.get("api_id"), "api_hash": query.get("api_hash"), "api_phone": query.get("api_phone"), @@ -1021,7 +1058,8 @@ def validate_query(query, request, user): "min_date": min_date, "max_date": max_date, "crawl-depth": query.get("crawl-depth"), - "crawl-threshold": query.get("crawl-threshold") + "crawl-threshold": query.get("crawl-threshold"), + "crawl-via-links": query.get("crawl-via-links") } @staticmethod From 7115b6f16199c6e886212d0730b21acf5b479245 Mon Sep 17 00:00:00 2001 From: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:42:28 +0200 Subject: [PATCH 08/26] Improve github action workflow (#456) * test new github action * test backend fail * test frontend fail * update checkout action to v4 and text * test container starts but 4cat has issue * fix forced fail --- .github/workflows/docker_pr_test.yml | 36 ++++++++++++++++++++++------ 1 file changed, 29 insertions(+), 7 deletions(-) diff --git a/.github/workflows/docker_pr_test.yml b/.github/workflows/docker_pr_test.yml index 3109e3f8f..7d8b8db3b 100644 --- a/.github/workflows/docker_pr_test.yml +++ b/.github/workflows/docker_pr_test.yml @@ -12,16 +12,34 @@ jobs: name: Test docker-compose up with build runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v4 - name: Run docker compose up run: docker compose -f docker-compose_build.yml up -d - - name: Wait and check log + - name: Check backend container is running + run: | + sleep 30 + if [ "$(docker ps | grep 4cat_backend)" ]; then + echo "Docker 4cat_backend container is running..." 
+ else + echo -e "Docker 4cat_backend container is not running...\nPrinting 4cat_backend logs:\n\n$(docker container logs 4cat_backend)" + exit 1 + fi + - name: Check frontend container is running + run: | + sleep 10 + if [ "$(docker ps | grep 4cat_frontend)" ]; then + echo "Docker 4cat_frontend container is running..." + else + echo -e "Docker 4cat_frontend container is not running...\nPrinting 4cat_frontend logs:\n\n$(docker container logs 4cat_frontend)" + exit 1 + fi + - name: Check 4CAT backend log for expected INFO message run: | test_case=" INFO at api.py:65: Local API listening for requests at backend:4444" sleep 30 && var=$(docker exec 4cat_backend tail -n 1 logs/backend_4cat.log) echo "::group::Backend test" if [ "$(echo "$var" | tr "|" "\n" | sed -n '2p')" = "$test_case" ]; then - echo "Backend running as expected" + echo "4CAT backend running as expected" else echo "::error::Backend failed to start" echo "Test:$test_case" @@ -32,7 +50,11 @@ jobs: - name: Print log on failure if: failure() run: | - docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log - echo "::group::Backend logs" - cat backend_4cat.log - echo "::endgroup::" + if [ "$(docker ps | grep 4cat)" ]; then + docker cp 4cat_backend:/usr/src/app/logs/backend_4cat.log ./backend_4cat.log + echo "::group::Backend logs" + cat backend_4cat.log + echo "::endgroup::" + else + echo "Docker containers not running; check logs in previous steps" + fi From 040b5f427f528229471fbd38059c549f09fd8b9f Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 25 Sep 2024 14:03:13 +0200 Subject: [PATCH 09/26] Allow info boxes for processors to be as high as they want. --- webtool/static/css/dataset-page.css | 1 - 1 file changed, 1 deletion(-) diff --git a/webtool/static/css/dataset-page.css b/webtool/static/css/dataset-page.css index e257bf293..8e99832f3 100644 --- a/webtool/static/css/dataset-page.css +++ b/webtool/static/css/dataset-page.css @@ -513,7 +513,6 @@ article.result > section:first-child { .processor-option-wrap > label { display: flex; align-items: center; - max-height: 1.5em; } .processor-option-wrap > label.option-type-toggle { From 579ff64e18fbdcda39ef3c2457ab7a4f01ce3d9d Mon Sep 17 00:00:00 2001 From: sal-phd-desktop Date: Wed, 25 Sep 2024 15:21:12 +0200 Subject: [PATCH 10/26] Add LocationParseError exception to download images processor --- processors/visualisation/download_images.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/processors/visualisation/download_images.py b/processors/visualisation/download_images.py index c13fd0fca..8b0792e22 100644 --- a/processors/visualisation/download_images.py +++ b/processors/visualisation/download_images.py @@ -574,6 +574,10 @@ def request_get_w_error_handling(self, url, retries=0, **kwargs): else: self.dataset.log("Error: ConnectionError while trying to download image %s: %s" % (url, e)) raise FileNotFoundError() + except requests.exceptions.LocationParseError as e: + # not an valid url, just skip + self.dataset.log("Error: LocationParseError while trying to download image %s: %s" % (url, e)) + raise FileNotFoundError() except requests.exceptions.InvalidSchema: # not an http url, just skip raise FileNotFoundError() From bfaf23b1065f068276e0c6c49d610a8c57083ae3 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 26 Sep 2024 15:01:34 +0200 Subject: [PATCH 11/26] cleanup_tempfiles waits 7 days to remove unclaimed data files --- backend/workers/cleanup_tempfiles.py | 52 ++++++++++++++++++++-------- 1 file changed, 37 insertions(+), 15 deletions(-) diff 
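The download_images change above adds one more case to a wrapper that turns unusable URLs into a skip rather than a crash. A hedged sketch of that pattern; the function name and exception set here are illustrative, and the shipped wrapper additionally handles retries and, per the patch, a location-parse failure:

```
import requests

def fetch_image(url, timeout=20):
    try:
        return requests.get(url, timeout=timeout, stream=True)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.Timeout,
            requests.exceptions.InvalidSchema,
            requests.exceptions.InvalidURL) as e:
        # unreachable or malformed URL; treat as a missing file and move on
        raise FileNotFoundError(f"could not download {url}: {e}")
```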
--git a/backend/workers/cleanup_tempfiles.py b/backend/workers/cleanup_tempfiles.py index 51e96fd57..b0b3a6d21 100644 --- a/backend/workers/cleanup_tempfiles.py +++ b/backend/workers/cleanup_tempfiles.py @@ -3,7 +3,8 @@ """ import shutil import re - +import json +from datetime import datetime from pathlib import Path from common.config_manager import config @@ -27,12 +28,21 @@ class TempFileCleaner(BasicWorker): ensure_job = {"remote_id": "localhost", "interval": 10800} + # Use tracking file to delay deletion of files that may still be in use + tracking_file = config.get('PATH_DATA').joinpath(".temp_file_cleaner") + days_to_keep = 7 + def work(self): """ Go through result files, and for each one check if it should still exist :return: """ + # Load tracking file + if not self.tracking_file.exists(): + tracked_files = {} + else: + tracked_files = json.loads(self.tracking_file.read_text()) result_files = Path(config.get('PATH_DATA')).glob("*") for file in result_files: @@ -41,6 +51,7 @@ def work(self): continue if self.interrupted: + self.tracking_file.write_text(json.dumps(tracked_files)) raise WorkerInterruptedException("Interrupted while cleaning up orphaned result files") # the key of the dataset files belong to can be extracted from the @@ -59,20 +70,28 @@ def work(self): except DataSetException: # the dataset has been deleted since, but the result file still # exists - should be safe to clean up - self.log.info("No matching dataset with key %s for file %s, deleting file" % (key, str(file))) - if file.is_dir(): - try: - shutil.rmtree(file) - except PermissionError: - self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no " - f"permissions), skipping") - - else: - try: - file.unlink() - except FileNotFoundError: - # the file has been deleted since - pass + if file.name not in tracked_files: + self.log.info(f"No matching dataset with key {key} for file {file}; marking for deletion") + tracked_files[file.name] = datetime.now().timestamp() + (self.days_to_keep * 86400) + elif tracked_files[file.name] < datetime.now().timestamp(): + self.log.info(f"File {file} marked for deletion since {datetime.fromtimestamp(tracked_files[file.name]).strftime('%Y-%m-%d %H:%M:%S')}, deleting file") + if file.is_dir(): + try: + shutil.rmtree(file) + except PermissionError: + self.log.info(f"Folder {file} does not belong to a dataset but cannot be deleted (no " + f"permissions), skipping") + + else: + try: + file.unlink() + except FileNotFoundError: + # the file has been deleted since + pass + + # Remove from tracking + del tracked_files[file.name] + continue if file.is_dir() and "-staging" in file.stem and dataset.is_finished(): @@ -84,4 +103,7 @@ def work(self): dataset.key, str(file))) shutil.rmtree(file) + # Update tracked files + self.tracking_file.write_text(json.dumps(tracked_files)) + self.job.finish() \ No newline at end of file From cb4b7706762259aa700c2bcaf2df88ef0cbd2ae2 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 26 Sep 2024 15:26:57 +0200 Subject: [PATCH 12/26] Clean link references in Telegram crawler --- datasources/telegram/search_telegram.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index c963e47f6..0fb7b282a 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -484,7 +484,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): # invite links continue - entity_name = 
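The cleanup_tempfiles change above defers deletion: an orphaned file is first recorded in a JSON tracking file with a deadline, and only removed on a later run once the seven-day grace period has passed. A simplified sketch of that pattern; `is_orphaned` is a hypothetical stand-in for the dataset lookup the worker actually performs:

```
import json
import time
from pathlib import Path

GRACE_SECONDS = 7 * 86400

def sweep(data_path: Path, tracking_file: Path):
    tracked = json.loads(tracking_file.read_text()) if tracking_file.exists() else {}
    now = time.time()

    for file in data_path.glob("*"):
        if not is_orphaned(file):  # hypothetical check against known datasets
            continue
        if file.name not in tracked:
            tracked[file.name] = now + GRACE_SECONDS  # mark, don't delete yet
        elif tracked[file.name] < now:
            file.unlink(missing_ok=True)  # grace period expired
            del tracked[file.name]

    tracking_file.write_text(json.dumps(tracked))
```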
link.split("?")[0].split("#")[0] + entity_name = link.split("/")[0].split("?")[0].split("#")[0] linked_entities.add(entity_name) # @references @@ -493,6 +493,8 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): if reference.startswith("@"): reference = reference[1:] + reference = reference.split("/")[0] + linked_entities.add(reference) # Check if fwd_from or the resolved entity ID is already queued or has been queried From b66418350bcacc83ede7eaf4ae515ef4fc4e5bfa Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Thu, 26 Sep 2024 16:19:47 +0200 Subject: [PATCH 13/26] Don't crash on "t.me" without path --- datasources/telegram/search_telegram.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 0fb7b282a..365fef2c9 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -478,7 +478,7 @@ async def gather_posts(self, client, queries, max_items, min_date, max_date): if crawl_via_links: # t.me links all_links = ural.urls_from_text(serialized_message["message"]) - all_links = [link.split("t.me/")[1] for link in all_links if ural.get_hostname(link) == "t.me"] + all_links = [link.split("t.me/")[1] for link in all_links if ural.get_hostname(link) == "t.me" and len(link.split("t.me/")) > 1] for link in all_links: if link.startswith("+"): # invite links From 8f2193cdcf0179ba34947861be87ec587e22e638 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 27 Sep 2024 11:24:15 +0200 Subject: [PATCH 14/26] Add linked and mentioned entities to mapped Telegram item --- datasources/telegram/search_telegram.py | 35 +++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/datasources/telegram/search_telegram.py b/datasources/telegram/search_telegram.py index 365fef2c9..9e523a247 100644 --- a/datasources/telegram/search_telegram.py +++ b/datasources/telegram/search_telegram.py @@ -820,6 +820,38 @@ def map_item(message): # Failsafe; can be updated to support formatting of new datastructures in the future reactions += f"{reaction}, " + # t.me links + linked_entities = set() + all_links = ural.urls_from_text(message["message"]) + all_links = [link.split("t.me/")[1] for link in all_links if + ural.get_hostname(link) == "t.me" and len(link.split("t.me/")) > 1] + + for link in all_links: + if link.startswith("+"): + # invite links + continue + + entity_name = link.split("/")[0].split("?")[0].split("#")[0] + linked_entities.add(entity_name) + + # @references + # in execute_queries we use MessageEntityMention to get these + # however, after serializing these objects we only have the offsets of + # the mentioned username, and telegram does weird unicode things to its + # offsets meaning we can't just substring the message. 
So use a regex + # as a 'good enough' solution + all_mentions = set(re.findall(r"@([^\s\W]+)", message["message"])) + + # make this case-insensitive since people may use different casing in + # messages than the 'official' username for example + all_connections = set([v for v in [forwarded_username, *linked_entities, *all_mentions] if v]) + all_ci_connections = set() + seen = set() + for connection in all_connections: + if connection.lower() not in seen: + all_ci_connections.add(connection) + seen.add(connection.lower()) + return MappedItem({ "id": f"{message['_chat']['username']}-{message['id']}", "thread_id": thread, @@ -841,6 +873,9 @@ def map_item(message): "author_forwarded_from_name": forwarded_name, "author_forwarded_from_username": forwarded_username, "author_forwarded_from_id": forwarded_id, + "entities_linked": ",".join(linked_entities), + "entities_mentioned": ",".join(all_mentions), + "all_connections": ",".join(all_ci_connections), "timestamp_forwarded_from": datetime.fromtimestamp(forwarded_timestamp).strftime( "%Y-%m-%d %H:%M:%S") if forwarded_timestamp else "", "unix_timestamp_forwarded_from": forwarded_timestamp, From a224dd96d20a5d9bed62d25bd39c3bc9a929d307 Mon Sep 17 00:00:00 2001 From: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Tue, 1 Oct 2024 15:01:49 +0200 Subject: [PATCH 15/26] Export 4CAT datasets and analyses as ZIP file... and import them elsewhere! (#452) * export processor * start of importer * finish off importing ZIP 4CAT datasets * ensure cleanup on failure had some weird lost datasets when debugging this * auto-expire export zips * nltk again * Squashed commit of the following: commit 3f2a62a124926cfeb840796f104a702878ac10e5 Author: Carsten Schnober Date: Wed Sep 18 18:18:29 2024 +0200 Update Gensim to >=4.3.3, <4.4.0 (#450) * Update Gensim to >=4.3.3, <4.4.0 * update nltk as well --------- Co-authored-by: Dale Wahl Co-authored-by: Sal Hagen commit fee2c8c08617094f28496963da282d2e2dddeab7 Merge: 3d94b666 f8e93eda Author: sal-phd-desktop Date: Wed Sep 18 18:11:19 2024 +0200 Merge branch 'master' of https://github.com/digitalmethodsinitiative/4cat commit 3d94b666cedd0de4e0bee953cbf1d787fdc38854 Author: sal-phd-desktop Date: Wed Sep 18 18:11:04 2024 +0200 FINALLY remove 'News' from the front page, replace with 4CAT BlueSky updates and potential information about the specific server (to be set on config page) commit f8e93edabe9013a2c1229caa4c454fab09620125 Author: Stijn Peeters Date: Wed Sep 18 15:11:21 2024 +0200 Simple extensions page in Control Panel commit b5be128c7b8682fb233d962326d9118a61053165 Author: Stijn Peeters Date: Wed Sep 18 14:08:13 2024 +0200 Remove 'docs' directory commit 1e2010af44817016c274c9ec9f7f9971deb57f66 Author: Stijn Peeters Date: Wed Sep 18 14:07:38 2024 +0200 Forgot TikTok and Douyin commit c757dd51884e7ec9cf62ca1726feacab4b2283b7 Author: Stijn Peeters Date: Wed Sep 18 14:01:31 2024 +0200 Say 'zeeschuimer' instead of 'extension' to avoid confusion with 4CAT extensions commit ee7f4345478f923541536c86a5b06246deae03f6 Author: Stijn Peeters Date: Wed Sep 18 14:00:40 2024 +0200 RIP Parler data source commit 11300f2430b51887823b280405de4ded4f15ede1 Author: Stijn Peeters Date: Wed Sep 18 11:21:37 2024 +0200 Tuplestring commit 547265240eba81ca0ad270cd3c536a2b1dcf512d Author: Stijn Peeters Date: Wed Sep 18 11:15:29 2024 +0200 Pass user obj instead of str to ConfigWrapper in Processor commit b21866d7900b5d20ed6ce61ee9aff50f3c0df910 Author: Stijn Peeters Date: Tue Sep 17 17:45:01 2024 +0200 Ensure request-aware config reader 
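The new `entities_linked`, `entities_mentioned` and `all_connections` columns above merge forwarded-from usernames, t.me links and @mentions; the merge is case-insensitive so different spellings of the same handle count only once. A small sketch of that de-duplication:

```
def unique_connections(*values):
    # keep the first spelling encountered, compare case-insensitively
    connections, seen = set(), set()
    for value in values:
        if value and value.lower() not in seen:
            connections.add(value)
            seen.add(value.lower())
    return connections

unique_connections("SomeChannel", "somechannel", None, "other")
# -> {'SomeChannel', 'other'}
```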
in user object when using config wrapper commit bbe79e4b0fe870ccc36cab7bfe7963b28d1948e3 Author: Sal Hagen Date: Tue Sep 17 15:12:46 2024 +0200 Fix extension path walk for Windows commit d6064beaf31a6a85b0e34ed4f8126eb4c4fc07e3 Author: Stijn Peeters Date: Mon Sep 16 14:50:45 2024 +0200 Allow tags that have no users Use case: tag-based frontend differentiation using X-4CAT-Config-Via-Proxy commit b542ded6f976809ec88445e7b04f2c81b900188e Author: Stijn Peeters Date: Mon Sep 16 14:13:14 2024 +0200 Trailing slash in query results list commit a4bddae575b22a009925206a1337bdd89349e567 Author: Dale Wahl <32108944+dale-wahl@users.noreply.github.com> Date: Mon Sep 16 13:57:23 2024 +0200 4CAT Extension - easy(ier) adding of new datasources/processors that can be mainted seperately from 4CAT base code (#451) * domain only * fix reference * try and collect links with selenium * update column_filter to find multiple matches * fix up the normal url_scraper datasource * ensure all selenium links are strings for join * change output of url_scraper to ndjson with map_items * missed key/index change * update web archive to use json and map to 4CAT * fix no text found * and none on scraped_links * check key first * fix up web_archive error reporting * handle None type for error * record web archive "bad request" * add wait after redirect movement * increase waittime for redirects * add processor for trackers * dict to list for addition * allow both newline and comma seperated links * attempt to scrape iframes as seperate pages * Fixes for selenium scraper to work with config database * installation of packages, geckodriver, and firefox if selenium enabled * update install instructions * fix merge error * fix dropped function * have to be kidding me * add note; setup requires docker... need to think about IF this will ever be installed without Docker * seperate selenium class into wrapper and Search class so wrapper can be used in processors! * add screenshots; add firefox extension support * update selenium definitions * regex for extracting urls from strings * screenshots processor; extract urls from text and takes screenshots * Allow producing zip files from data sources * import time * pick better default * test screenshot datasource * validate all params * fix enable extension * haha break out of while loop * count my items * whoops, len() is important here * must be getting tired... * remove redundant logging * Eager loading for screenshots, viewport options, etc * Woops, wrong folder * Fix label shortening * Just 'queue' instead of 'search queue' * Yeah, make it headless * README -> DESCRIPTION * h1 -> h2 * Actually just have no header * Use proper filename for downloaded files * Configure whether to offer pseudonymisation etc * Tweak descriptions * fix log missing data * add columns to post_topic_matrix * fix breadcrumb bug * Add top topics column * Fix selenium config install parameter (Docker uses this/manual would need to run install_selenium, well, manually) * this processor is slow; i thought it was broken long before it updated! 
* refactor detect_trackers as conversion processor not filter * add geckodriver executable to docker install * Auto-configure webdrivers if available in PATH * update screenshots to act as image-downloader and benefit from processors * fix is_compatible_with * Delete helper-scripts/migrate/migrate-1.30-1.31.py * fix embeddings is_compatible_with * fix up UI options for hashing and private * abstract was moved to lib * various fixes to selenium based datasources * processors not compatible with image datasets * update firefox extension handling * screenshots datasource fix get_options * rename screenshots processor to be detected as image dataset * add monthly and weekly frequencies to wayback machine datasource * wayback ds: fix fail if all attempts do not realize results; addion frequency options to options; add daily * add scroll down page to allow lazy loading for entire page screenshots * screenshots: adjust pause time so it can be used to force a wait for images to load I have not successfully come up with or found a way to wait for all images to load; document.readyState == 'complete' does not function in this way on certain sites including the wayback machine * hash URLs to create filenames * remove log * add setting to toggle display advanced options * add progress bars * web archive fix query validation * count subpages in progress * remove overwritten function * move http response to own column * special filenames * add timestamps to all screenshots * restart selenium on failure * new build have selenium * process urls after start (keep original query parameters) * undo default firefox * quick max * rename SeleniumScraper to SeleniumSearch todo: build SeleniumProcessor! * max number screenshots configurable * method to get url with error handling * use get_with_error_handling * d'oh, screenshot processor needs to quit selenium * update log to contain URL * Update scrolling to use Page down key if necessary * improve logs * update image_category_wall as screenshot datasource does not have category column; this is not ideal and ought to be solved in another way. Also, could I get categories from the metadata? That's... ugh. * no category, no processor * str errors * screenshots: dismiss alerts when checking ready state is complete * set screenshot timeout to 30 seconds * update gensim package * screenshots: move processor interrupt into attempts loop * if alert disappears before we can dismiss it... * selenium specific logger * do not switch window when no alert found on dismiss * extract wait for page to load to selenium class * improve descriptions of screenshot options * remove unused line * treat timeouts differently from other errors these are more likely due to an issue with the website in question * debug if requested * increase pause time * restart browser w/ PID * increase max_workers for selenium this is by individual worker class not for all selenium classes... 
so you can really crank them out if desired * quick fix restart by pid * avoid bad urls * missing bracket & attempt to fix-missing dependencies in Docker install * Allow dynamic form options in processors * Allow 'requires' on data source options as well * Handle list values with requires * basic processor for apple store; setup checks for additional requirements * fix is_4cat_class * show preview when no map_item * add google store datasource * Docker setup.py use extensions * Wider support for file upload in processors * Log file uploads in DMI service manager * add map_item methods and record more data per item need additional item data as map_item is staticmethod * update from master; merge conflicts * fix docker build context (ignore data files) * fix option requirements * apple store fix: list still tries to get query * apple & google stores fix up item mapping * missed merge error * minor fix * remove unused import * fix datasources w/ files frontend error * fix error w/ datasources having file option * better way to name docker volumes * update two other docker compose files * fix docker-compose ymls * minor bug: fix and add warning; fix no results fail * update apple field names to better match interface * update google store fieldnames and order * sneak in jinja logger if needed * fix fourcat.js handling checkboxes for dynamic settings * add new endpoint for app details to apple store * apple_store map new beta app data * add default lang/country * not all apps have advisories * revert so button works * add chart positions to beta map items * basic scheduler To-do - fix up and add options to scheduler view (e.g. delete/change) - add scheduler view to navigator - tie jobs to datasets? (either in scheduler view or, perhaps, filter dataset view) - more testing... * update scheduler view, add functions to update job interval * revert .env * working scheduler! * basic scheduler view w/ datasets * fix postgres tag * update job status in scheduled_jobs table * fix timestamp; end_date needed for last run check; add dataset label * improve scheduler view * remove dataset from scheduled_jobs table on delete * scheduler view order by last creation * scheduler views: separate scheduler list from scheduled dataset list * additional update from master fixes * apple_store map_items fix missing locales * add back depth for pagination * correct route * modify pagination to accept args * pagination fun * pagination: i hate testing on live servers... * ok ok need the pagination route * pagination: add route_args * fix up scheduler header * improve app store descriptions * add azure store * fix azure links * azure_store: add category search * azure fix type of config update timestamp OPTION_DATE does not appear correctly in settings and causes it to be written incorrectly * basic aws store * check if selenium available; get correct app_id * aws: implement pagination * add logging; wait for elements to load after next page; attempts to rework filter option collection * apple_store: handle invalid param error * fix filter_options * aws: fix filter option collection! 
* more merge * move new datasources and processors to extensions and modify setup.py and module loader to use the new locations * migrate.py to run extension "fourcat_install.py" files * formatting * remove extensions; add gitignore * excise scheduler merge * some additional cleanup from app_studies branch * allow nested datasources folders; ignore files in extensions main folder * allow extension install scripts to run pip if migrate.py has not * Remove unused URL functions we could use ural for * Take care of git commit hash tracking for extension processors * Get rid of unused path.versionfile config setting * Add extensions README * Squashed commit of the following: commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters commit 2ef5c80f2d1a5b5f893c8977d8394740de6d796d Author: Stijn Peeters Date: Tue Sep 3 12:05:14 2024 +0200 Actually check progress in text annotator commit 693960f41b73e39eda0c2f23eb361c18bde632cd Author: Stijn Peeters Date: Mon Sep 2 18:03:18 2024 +0200 Add processor for stormtrooper DMI service commit 6ae964aad492527bc5d016a00f870145aab6e1af Author: Stijn Peeters Date: Fri Aug 30 17:31:37 2024 +0200 Fix reference to old stopwords list in neologisms preset * Fix Github links for extensions * Fix commit detection in extensions * Fix extension detection in module loader * Follow symlinks when loading extensions Probably not uncommon to have a checked out repo somewhere to then symlink into the extensions dir * Make queue message on create page more generic * Markdown in datasource option tooltips * Remove Spacy model from requirements * Add software_source to database SQL --------- Co-authored-by: Stijn Peeters Co-authored-by: Stijn Peeters <42036349+stijn-uva@users.noreply.github.com> commit cd356f7a69d15e8ecc8efffc6d63a16368e62962 Author: Stijn Peeters Date: Sat Sep 14 17:36:18 2024 +0200 UI setting for 4CAT install ad in login commit 0945d8c0a11803a6bb411f15099d50fea25f10ab Author: Stijn Peeters Date: Sat Sep 14 17:32:55 2024 +0200 UI setting for anonymisation controls Todo: make per-datasource commit 1a2562c2f9a368dbe0fc03264fb387e44313213b Author: Stijn Peeters Date: Sat Sep 14 15:53:27 2024 +0200 Debug panel for HTTP headers in control panel commit 203314ec83fb631d985926a0b5c5c440cfaba9aa Author: Stijn Peeters Date: Sat Sep 14 15:53:17 2024 +0200 Preview for HTML datasets commit 48c20c2ebac382bd41b92da4481ff7d832dc1538 Author: Desktop Sal Date: Wed Sep 11 13:54:23 2024 +0200 Remove spacy processors (linguistic extractor, get nouns, get entities) and remove dependencies commit 657ffd75a7f48ba4537449127e5fa39debf4fdf3 Author: Dale Wahl Date: Fri Sep 6 16:29:19 2024 +0200 fix nltk where it matters * merge 
docker files * fix merge issues * more modules passing fixes * disappearing import not sure pycharm's merge is super awesome... * fix import 4cat datasource with modules changes --------- Co-authored-by: Stijn Peeters --- common/lib/dataset.py | 16 +- datasources/fourcat_import/import_4cat.py | 502 ++++++++++++++++------ docker-compose_build.yml | 6 + processors/conversion/export_datasets.py | 106 +++++ webtool/views/api_tool.py | 6 +- 5 files changed, 500 insertions(+), 136 deletions(-) create mode 100644 processors/conversion/export_datasets.py diff --git a/common/lib/dataset.py b/common/lib/dataset.py index b092d2a4e..b494acbd3 100644 --- a/common/lib/dataset.py +++ b/common/lib/dataset.py @@ -15,7 +15,7 @@ from common.config_manager import config from common.lib.job import Job, JobNotFoundException from common.lib.module_loader import ModuleCollector -from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int +from common.lib.helpers import get_software_commit, NullAwareTextIOWrapper, convert_to_int, get_software_version from common.lib.item_mapping import MappedItem, MissingMappedField, DatasetItem from common.lib.fourcat_module import FourcatModule from common.lib.exceptions import (ProcessorInterruptedException, DataSetException, DataSetNotFoundException, @@ -1586,6 +1586,20 @@ def get_media_type(self): # Default to text return self.parameters.get("media_type", "text") + def get_metadata(self): + """ + Get dataset metadata + + This consists of all the data stored in the database for this dataset, plus the current 4CAT version (appended + as 'current_4CAT_version'). This is useful for exporting datasets, as it can be used by another 4CAT instance to + update its database (and ensure compatibility with the exporting version of 4CAT). + """ + metadata = self.db.fetchone("SELECT * FROM datasets WHERE key = %s", (self.key,)) + + # get 4CAT version (presumably to ensure export is compatible with import) + metadata["current_4CAT_version"] = get_software_version() + return metadata + def get_result_url(self): """ Gets the 4CAT frontend URL of a dataset file. diff --git a/datasources/fourcat_import/import_4cat.py b/datasources/fourcat_import/import_4cat.py index cd231b445..dc5d079fc 100644 --- a/datasources/fourcat_import/import_4cat.py +++ b/datasources/fourcat_import/import_4cat.py @@ -4,6 +4,7 @@ import requests import json import time +import zipfile from backend.lib.processor import BasicProcessor from common.lib.exceptions import (QueryParametersException, FourcatException, ProcessorInterruptedException, @@ -19,8 +20,8 @@ class FourcatImportException(FourcatException): class SearchImportFromFourcat(BasicProcessor): type = "import_4cat-search" # job ID category = "Search" # category - title = "Import from 4CAT" # title displayed in UI - description = "Import a dataset from another 4CAT server" # description displayed in UI + title = "Import 4CAT dataset and analyses" # title displayed in UI + description = "Import a dataset from another 4CAT server or from a zip file (exported from a 4CAT server)" # description displayed in UI is_local = False # Whether this datasource is locally scraped is_static = False # Whether this datasource is still updated @@ -33,29 +34,328 @@ class SearchImportFromFourcat(BasicProcessor): "\n\nTo import a dataset across servers, both servers need to be running the same version of 4CAT. " "You can find the current version in the footer at the bottom of the interface." 
}, + "method": { + "type": UserInput.OPTION_CHOICE, + "help": "Import Type", + "options": { + "zip": "Zip File", + "url": "4CAT URL", + }, + "default": "url" + }, "url": { "type": UserInput.OPTION_TEXT, "help": "Dataset URL", - "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/." + "tooltip": "URL to the dataset's page, for example https://4cat.example/results/28da332f8918e6dc5aacd1c3b0170f01b80bd95f8ff9964ac646cecd33bfee49/.", + "requires": "method^=url" }, "intro2": { "type": UserInput.OPTION_INFO, "help": "You can create an API key via the 'API Access' item in 4CAT's navigation menu. Note that you need " "an API key from **the server you are importing from**, not the one you are looking at right now. " - "Additionally, you need to have owner access to the dataset you want to import." + "Additionally, you need to have owner access to the dataset you want to import.", + "requires": "method^=url" }, "api-key": { "type": UserInput.OPTION_TEXT, "help": "4CAT API Key", "sensitive": True, "cache": True, - } + "requires": "method^=url" + }, + "data_upload": { + "type": UserInput.OPTION_FILE, + "help": "File", + "tooltip": "Upload a ZIP file containing a dataset exported from a 4CAT server.", + "requires": "method^=zip" + }, + } created_datasets = None base = None + remapped_keys = None + dataset_owner = None def process(self): + """ + Import 4CAT dataset either from another 4CAT server or from the uploaded zip file + """ + self.created_datasets = set() # keys of created datasets - may not be successful! + self.remapped_keys = {} # changed dataset keys + self.dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner + try: + if self.parameters.get("method") == "zip": + self.process_zip() + else: + self.process_urls() + except Exception as e: + # Catch all exceptions and finish the job with an error + # Resuming is impossible because this dataset was overwritten with the importing dataset + # halt_and_catch_fire() will clean up and delete the datasets that were created + self.interrupted = True + try: + self.halt_and_catch_fire() + except ProcessorInterruptedException: + pass + # Reraise the original exception for logging + raise e + + def after_create(query, dataset, request): + """ + Hook to execute after the dataset for this source has been created + + In this case, put the file in a temporary location so it can be + processed properly by the related Job later. 
+ + :param dict query: Sanitised query parameters + :param DataSet dataset: Dataset created for this query + :param request: Flask request submitted for its creation + """ + if query.get("method") == "zip": + file = request.files["option-data_upload"] + file.seek(0) + with dataset.get_results_path().with_suffix(".importing").open("wb") as outfile: + while True: + chunk = file.read(1024) + if len(chunk) == 0: + break + outfile.write(chunk) + else: + # nothing to do for URLs + pass + + + def process_zip(self): + """ + Import 4CAT dataset from a ZIP file + """ + self.dataset.update_status(f"Importing datasets and analyses from ZIP file.") + temp_file = self.dataset.get_results_path().with_suffix(".importing") + + imported = [] + processed_files = 1 # take into account the export.log file + failed_imports = [] + with zipfile.ZipFile(temp_file, "r") as zip_ref: + zip_contents = zip_ref.namelist() + + # Get all metadata files and determine primary dataset + metadata_files = [file for file in zip_contents if file.endswith("_metadata.json")] + if not metadata_files: + self.dataset.finish_with_error("No metadata files found in ZIP file; is this a 4CAT export?") + return + + # Get the primary dataset + primary_dataset_keys = set() + datasets = [] + parent_child_mapping = {} + for file in metadata_files: + with zip_ref.open(file) as f: + metadata = json.load(f) + if not metadata.get("key_parent"): + primary_dataset_keys.add(metadata.get("key")) + datasets.append(metadata) + else: + # Store the mapping of parent to child datasets + parent_key = metadata.get("key_parent") + if parent_key not in parent_child_mapping: + parent_child_mapping[parent_key] = [] + parent_child_mapping[parent_key].append(metadata) + + # Primary dataset will overwrite this dataset; we could address this to support multiple primary datasets + if len(primary_dataset_keys) != 1: + self.dataset.finish_with_error("ZIP file contains multiple primary datasets; only one is allowed.") + return + + # Import datasets + while datasets: + self.halt_and_catch_fire() + + # Create the datasets + metadata = datasets.pop(0) + dataset_key = metadata.get("key") + processed_metadata = self.process_metadata(metadata) + new_dataset = self.create_dataset(processed_metadata, dataset_key, dataset_key in primary_dataset_keys) + processed_files += 1 + + # TODO: I am now noticing that we do not update the results_file; it is even more unlikely to collide as it is both a random key and label combined... but... + # Copy the log file + self.halt_and_catch_fire() + log_filename = new_dataset.get_log_path().name + if log_filename in zip_contents: + self.dataset.update_status(f"Transferring log file for dataset {new_dataset.key}") + with zip_ref.open(log_filename) as f: + with new_dataset.get_log_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + else: + self.dataset.log(f"Log file not found for dataset {new_dataset.key} (original key {dataset_key}).") + + # Copy the results + self.halt_and_catch_fire() + results_filename = new_dataset.get_results_path().name + if results_filename in zip_contents: + self.dataset.update_status(f"Transferring data file for dataset {new_dataset.key}") + with zip_ref.open(results_filename) as f: + with new_dataset.get_results_path().open("wb") as outfile: + outfile.write(f.read()) + processed_files += 1 + + if not imported: + # first dataset - use num rows as 'overall' + num_rows = metadata["num_rows"] + else: + # TODO: should I just delete the new_dataset here? 
+ self.dataset.log(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + new_dataset.finish_with_error(f"Results file not found for dataset {new_dataset.key} (original key {dataset_key}).") + failed_imports.append(dataset_key) + continue + + # finally, the kids + self.halt_and_catch_fire() + if dataset_key in parent_child_mapping: + datasets.extend(parent_child_mapping[dataset_key]) + self.dataset.log(f"Adding ({len(parent_child_mapping[dataset_key])}) child datasets to import queue") + + # done - remember that we've imported this one + imported.append(new_dataset) + new_dataset.update_status(metadata["status"]) + + if new_dataset.key != self.dataset.key: + # only finish if this is not the 'main' dataset, or the user + # will think the whole import is done + new_dataset.finish(metadata["num_rows"]) + + # Check that all files were processed + missed_files = [] + if len(zip_contents) != processed_files: + for file in zip_contents: + if file not in processed_files: + missed_files.append(file) + + # todo: this part needs updating if/when we support importing multiple datasets! + if failed_imports: + self.dataset.update_status(f"Dataset import finished, but not all data was imported properly. " + f"{len(failed_imports)} dataset(s) were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + elif missed_files: + self.dataset.log(f"ZIP file contained {len(missed_files)} files that were not processed: {missed_files}") + self.dataset.update_status(f"Dataset import finished, but not all files were processed. " + f"{len(missed_files)} files were not successfully imported. Check the " + f"dataset log file for details.", is_final=True) + else: + self.dataset.update_status(f"{len(imported)} dataset(s) succesfully imported.", + is_final=True) + + if not self.dataset.is_finished(): + # now all related datasets are imported, we can finish the 'main' + # dataset, and the user will be alerted that the full import is + # complete + self.dataset.finish(num_rows) + + + @staticmethod + def process_metadata(metadata): + """ + Process metadata for import + """ + # get rid of some keys that are server-specific and don't need to + # be stored (or don't correspond to database columns) + metadata.pop("current_4CAT_version") + metadata.pop("id") + metadata.pop("job") + metadata.pop("is_private") + metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! + + # extra params are stored as JSON... + metadata["parameters"] = json.loads(metadata["parameters"]) + if "copied_from" in metadata["parameters"]: + metadata["parameters"].pop("copied_from") + metadata["parameters"] = json.dumps(metadata["parameters"]) + + return metadata + + def create_dataset(self, metadata, original_key, primary=False): + """ + Create a new dataset + """ + if primary: + self.dataset.update_status(f"Importing primary dataset {original_key}.") + # if this is the first dataset we're importing, make it the + # processor's "own" dataset. 
the key has already been set to + # the imported dataset's key via ensure_key() (or a new unqiue + # key if it already existed on this server) + # by making it the "own" dataset, the user initiating the + # import will see the imported dataset as the "result" of their + # import query in the interface, similar to the workflow for + # other data sources + new_dataset = self.dataset + metadata.pop("key") # key already OK (see above) + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + else: + self.dataset.update_status(f"Importing child dataset {original_key}.") + # supernumerary datasets - handle on their own + # these include any children of imported datasets + try: + key_exists = DataSet(key=metadata["key"], db=self.db, modules=self.modules) + + # if we *haven't* thrown a DatasetException now, then the + # key is already in use, so create a "dummy" dataset and + # overwrite it with the metadata we have (except for the + # key). this ensures that a new unique key will be + # generated. + new_dataset = DataSet(parameters={}, type=self.type, db=self.db, modules=self.modules) + metadata.pop("key") + self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + + except DataSetException: + # this is *good* since it means the key doesn't exist, so + # we can re-use the key of the imported dataset + self.db.insert("datasets", data=metadata) + new_dataset = DataSet(key=metadata["key"], db=self.db, modules=self.modules) + + # make sure the dataset path uses the new key and local dataset + # path settings. this also makes sure the log file is created in + # the right place (since it is derived from the results file path) + extension = metadata["result_file"].split(".")[-1] + new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) + + new_dataset.update_status("Imported dataset created") + if new_dataset.key != original_key: + # could not use original key because it was already in use + # so update any references to use the new key + self.remapped_keys[original_key] = new_dataset.key + new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " + f"{new_dataset.key} instead of key {original_key}!") + + # refresh object, make sure it's in sync with the database + self.created_datasets.add(new_dataset.key) + new_dataset = DataSet(key=new_dataset.key, db=self.db, modules=self.modules) + if new_dataset.key == self.dataset.key: + # this ensures that the first imported dataset becomes the + # processor's "own" dataset, and that the import logs go to + # that dataset's log file. For later imports, this evaluates to + # False. 
+ self.dataset = new_dataset + + # if the key of the parent dataset was changed, change the + # reference to it that the child dataset has + if new_dataset.key_parent and new_dataset.key_parent in self.remapped_keys: + new_dataset.key_parent = self.remapped_keys[new_dataset.key_parent] + + # update some attributes that should come from the new server, not + # the old + new_dataset.creator = self.dataset_owner + new_dataset.original_timestamp = new_dataset.timestamp + new_dataset.imported = True + new_dataset.timestamp = int(time.time()) + new_dataset.db.commit() + + return new_dataset + + + def process_urls(self): """ Import 4CAT dataset from another 4CAT server @@ -67,12 +367,9 @@ def process(self): keys = SearchImportFromFourcat.get_keys_from_urls(urls) api_key = self.parameters.get("api-key") - self.created_datasets = set() # keys of created datasets - may not be successful! imported = [] # successfully imported datasets failed_imports = [] # keys that failed to import - remapped_keys = {} # changed dataset keys num_rows = 0 # will be used later - dataset_owner = self.dataset.get_owners()[0] # at this point it has 1 owner # we can add support for multiple datasets later by removing # this part! @@ -101,90 +398,10 @@ def process(self): failed_imports.append(dataset_key) continue - # get rid of some keys that are server-specific and don't need to - # be stored (or don't correspond to database columns) - metadata.pop("current_4CAT_version") - metadata.pop("id") - metadata.pop("job") - metadata.pop("is_private") - metadata.pop("is_finished") # we'll finish it ourselves, thank you!!! - - # extra params are stored as JSON... - metadata["parameters"] = json.loads(metadata["parameters"]) - if "copied_from" in metadata["parameters"]: - metadata["parameters"].pop("copied_from") - metadata["parameters"] = json.dumps(metadata["parameters"]) - - if not imported: - # if this is the first dataset we're importing, make it the - # processor's "own" dataset. the key has already been set to - # the imported dataset's key via ensure_key() (or a new unqiue - # key if it already existed on this server) - # by making it the "own" dataset, the user initiating the - # import will see the imported dataset as the "result" of their - # import query in the interface, similar to the workflow for - # other data sources - new_dataset = self.dataset - metadata.pop("key") # key already OK (see above) - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) + metadata = self.process_metadata(metadata) - else: - # supernumerary datasets - handle on their own - # these include any children of imported datasets - try: - key_exists = DataSet(key=metadata["key"], db=self.db) - - # if we *haven't* thrown a DatasetException now, then the - # key is already in use, so create a "dummy" dataset and - # overwrite it with the metadata we have (except for the - # key). this ensures that a new unique key will be - # generated. - new_dataset = DataSet(parameters={}, type=self.type, db=self.db) - metadata.pop("key") - self.db.update("datasets", where={"key": new_dataset.key}, data=metadata) - - except DataSetException: - # this is *good* since it means the key doesn't exist, so - # we can re-use the key of the imported dataset - self.db.insert("datasets", data=metadata) - new_dataset = DataSet(key=metadata["key"], db=self.db) - - # make sure the dataset path uses the new key and local dataset - # path settings. 
this also makes sure the log file is created in - # the right place (since it is derived from the results file path) - extension = metadata["result_file"].split(".")[-1] - new_dataset.reserve_result_file(parameters=new_dataset.parameters, extension=extension) - - new_dataset.update_status("Imported dataset created") - if new_dataset.key != dataset_key: - # could not use original key because it was already in use - # so update any references to use the new key - remapped_keys[dataset_key] = new_dataset.key - new_dataset.update_status(f"Cannot import with same key - already in use on this server. Using key " - f"{new_dataset.key} instead of key {dataset_key}!") - - # refresh object, make sure it's in sync with the database - self.created_datasets.add(new_dataset.key) - new_dataset = DataSet(key=new_dataset.key, db=self.db) - if new_dataset.key == self.dataset.key: - # this ensures that the first imported dataset becomes the - # processor's "own" dataset, and that the import logs go to - # that dataset's log file. For later imports, this evaluates to - # False. - self.dataset = new_dataset - - # if the key of the parent dataset was changed, change the - # reference to it that the child dataset has - if new_dataset.key_parent and new_dataset.key_parent in remapped_keys: - new_dataset.key_parent = remapped_keys[new_dataset.key_parent] - - # update some attributes that should come from the new server, not - # the old - new_dataset.creator = dataset_owner - new_dataset.original_timestamp = new_dataset.timestamp - new_dataset.imported = True - new_dataset.timestamp = int(time.time()) - new_dataset.db.commit() + # create the new dataset + new_dataset = self.create_dataset(metadata, dataset_key, primary=True if not imported else False) # then, the log self.halt_and_catch_fire() @@ -283,9 +500,9 @@ def halt_and_catch_fire(self): # overwritten by this point deletables = [k for k in self.created_datasets if k != self.dataset.key] for deletable in deletables: - DataSet(key=deletable, db=self.db).delete() + DataSet(key=deletable, db=self.db, modules=self.modules).delete() - self.dataset.finish_with_error(f"Interrupted while importing datasets from {self.base}. Cannot resume - you " + self.dataset.finish_with_error(f"Interrupted while importing datasets{' from '+self.base if self.base else ''}. 
Cannot resume - you " f"will need to initiate the import again.") raise ProcessorInterruptedException() @@ -353,47 +570,72 @@ def validate_query(query, request, user): :param User user: User object of user who has submitted the query :return dict: Safe query parameters """ - urls = query.get("url") - if not urls: - return QueryParametersException("Provide at least one dataset URL.") - - urls = urls.split(",") - bases = set([url.split("/results/")[0].lower() for url in urls]) - keys = SearchImportFromFourcat.get_keys_from_urls(urls) + if query.get("method") == "zip": + filename = "" + if "option-data_upload-entries" in request.form: + # First pass sends list of files in the zip + pass + elif "option-data_upload" in request.files: + # Second pass sends the actual file + file = request.files["option-data_upload"] + if not file: + raise QueryParametersException("No file uploaded.") + + if not file.filename.endswith(".zip"): + raise QueryParametersException("Uploaded file must be a ZIP file.") + + filename = file.filename + else: + raise QueryParametersException("No file was offered for upload.") + + return { + "method": "zip", + "filename": filename + } + elif query.get("method") == "url": + urls = query.get("url") + if not urls: + raise QueryParametersException("Provide at least one dataset URL.") + + urls = urls.split(",") + bases = set([url.split("/results/")[0].lower() for url in urls]) + keys = SearchImportFromFourcat.get_keys_from_urls(urls) + + if len(keys) != 1: + # todo: change this to < 1 if we allow multiple datasets + raise QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + + if len(bases) != 1: + raise QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " + "one 4CAT server at a time.") + + base = urls[0].split("/results/")[0] + try: + # test if API key is valid and server is reachable + test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") + except FourcatImportException as e: + raise QueryParametersException(str(e)) - if len(keys) != 1: - # todo: change this to < 1 if we allow multiple datasets - return QueryParametersException("You need to provide a single URL to a 4CAT dataset to import.") + try: + # test if we get a response we can parse + metadata = test.json() + except ValueError: + raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - if len(bases) != 1: - return QueryParametersException("All URLs need to point to the same 4CAT server. You can only import from " - "one 4CAT server at a time.") + version = get_software_version() - base = urls[0].split("/results/")[0] - try: - # test if API key is valid and server is reachable - test = SearchImportFromFourcat.fetch_from_4cat(base, keys[0], query.get("api-key"), "metadata") - except FourcatImportException as e: - raise QueryParametersException(str(e)) + if metadata.get("current_4CAT_version") != version: + raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " + f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). 
Make " + "sure both are running the same version of 4CAT and try again.") - try: - # test if we get a response we can parse - metadata = test.json() - except ValueError: - raise QueryParametersException(f"Unexpected response when trying to fetch metadata for dataset {keys[0]}.") - - version = get_software_version() - - if metadata.get("current_4CAT_version") != version: - raise QueryParametersException(f"This 4CAT server is running a different version of 4CAT ({version}) than " - f"the one you are trying to import from ({metadata.get('current_4CAT_version')}). Make " - "sure both are running the same version of 4CAT and try again.") - - # OK, we can import at least one dataset - return { - "url": ",".join(urls), - "api-key": query.get("api-key") - } + # OK, we can import at least one dataset + return { + "url": ",".join(urls), + "api-key": query.get("api-key") + } + else: + raise QueryParametersException("Import method not yet implemented.") @staticmethod def get_keys_from_urls(urls): diff --git a/docker-compose_build.yml b/docker-compose_build.yml index 7466e8ba8..b81a9fb94 100644 --- a/docker-compose_build.yml +++ b/docker-compose_build.yml @@ -32,6 +32,9 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ +# - 4cat_data:/usr/src/app/data/ +# - 4cat_config:/usr/src/app/config/ +# - 4cat_logs:/usr/src/app/logs/ entrypoint: docker/docker-entrypoint.sh frontend: @@ -49,6 +52,9 @@ services: - ./data/datasets/:/usr/src/app/data/ - ./data/config/:/usr/src/app/config/ - ./data/logs/:/usr/src/app/logs/ +# - 4cat_data:/usr/src/app/data/ +# - 4cat_config:/usr/src/app/config/ +# - 4cat_logs:/usr/src/app/logs/ command: ["docker/wait-for-backend.sh"] volumes: diff --git a/processors/conversion/export_datasets.py b/processors/conversion/export_datasets.py new file mode 100644 index 000000000..bd7b81289 --- /dev/null +++ b/processors/conversion/export_datasets.py @@ -0,0 +1,106 @@ +""" +Export a dataset and all its children to a ZIP file +""" +import shutil +import json +import datetime + +from backend.lib.processor import BasicProcessor +from common.lib.dataset import DataSet +from common.lib.exceptions import DataSetException + +__author__ = "Dale Wahl" +__credits__ = ["Dale Wahl"] +__maintainer__ = "Dale Wahl" +__email__ = "4cat@oilab.eu" + + + +class ExportDatasets(BasicProcessor): + """ + Export a dataset and all its children to a ZIP file + """ + type = "export-datasets" # job type ID + category = "Conversion" # category + title = "Export Dataset and All Analyses" # title displayed in UI + description = "Creates a ZIP file containing the dataset and all analyses to be archived and uploaded to a 4CAT instance in the future. Automatically expires after 1 day, after which you must run again." 
# description displayed in UI + extension = "zip" # extension of result file, used internally and in UI + + @classmethod + def is_compatible_with(cls, module=None, user=None): + """ + Determine if processor is compatible with dataset + + :param module: Module to determine compatibility with + """ + return module.is_top_dataset() and user.can_access_dataset(dataset=module, role="owner") + + def process(self): + """ + This takes a CSV file as input and writes the same data as a JSON file + """ + self.dataset.update_status("Collecting dataset and all analyses") + + results_path = self.dataset.get_staging_area() + + exported_datasets = [] + failed_exports = [] # keys that failed to import + keys = [self.dataset.top_parent().key] # get the key of the top parent + while keys: + dataset_key = keys.pop(0) + self.dataset.log(f"Exporting dataset {dataset_key}.") + + try: + dataset = DataSet(key=dataset_key, db=self.db) + # TODO: these two should fail for the primary dataset, but should they fail for the children too? + except DataSetException: + self.dataset.finish_with_error("Dataset not found.") + return + if not dataset.is_finished(): + self.dataset.finish_with_error("You cannot export unfinished datasets.") + return + + # get metadata + metadata = dataset.get_metadata() + if metadata["num_rows"] == 0: + self.dataset.update_status(f"Skipping empty dataset {dataset_key}") + failed_exports.append(dataset_key) + continue + + # get data + data_file = dataset.get_results_path() + if not data_file.exists(): + self.dataset.finish_with_error(f"Dataset {dataset_key} has no data; skipping.") + failed_exports.append(dataset_key) + continue + + # get log + log_file = dataset.get_results_path().with_suffix(".log") + + # All good, add to ZIP + with results_path.joinpath(f"{dataset_key}_metadata.json").open("w", encoding="utf-8") as outfile: + outfile.write(json.dumps(metadata)) + shutil.copy(data_file, results_path.joinpath(data_file.name)) + if log_file.exists(): + shutil.copy(log_file, results_path.joinpath(log_file.name)) + + # add children to queue + # Not using get_all_children() because we want to skip unfinished datasets and only need the keys + children = [d["key"] for d in self.db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset_key,))] + keys.extend(children) + + self.dataset.update_status(f"Exported dataset {dataset_key}.") + exported_datasets.append(dataset_key) + + # Add export log to ZIP + self.dataset.log(f"Exported datasets: {exported_datasets}") + self.dataset.log(f"Failed to export datasets: {failed_exports}") + shutil.copy(self.dataset.get_log_path(), results_path.joinpath("export.log")) + + # set expiration date + # these datasets can be very large and are just copies of the existing datasets, so we don't need to keep them around for long + # TODO: convince people to stop using hyphens in python variables and file names... + self.dataset.__setattr__("expires-after", (datetime.datetime.now() + datetime.timedelta(days=1)).timestamp()) + + # done! 
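For reference, a minimal sketch of how an archive produced by this processor could be sanity-checked after the fact. It is not part of the 4CAT codebase: the inspect_4cat_export name and the export.zip path are illustrative. The expected file names follow from the code above (one <key>_metadata.json per exported dataset, the copied results and log files, and export.log), and current_4CAT_version is the field the importing server compares against its own get_software_version().

    import json
    import zipfile

    def inspect_4cat_export(path="export.zip"):
        # look for the per-dataset metadata files and the copied processor log
        with zipfile.ZipFile(path) as archive:
            names = archive.namelist()
            metadata_files = [name for name in names if name.endswith("_metadata.json")]
            if not metadata_files or "export.log" not in names:
                raise ValueError("does not look like a 4CAT export")
            # each metadata file records the exporting server's 4CAT version,
            # which the importing server checks for compatibility
            with archive.open(metadata_files[0]) as infile:
                return json.load(infile).get("current_4CAT_version")
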
+ self.write_archive_and_finish(results_path, len(exported_datasets)) \ No newline at end of file diff --git a/webtool/views/api_tool.py b/webtool/views/api_tool.py index 5b47c030d..f7f66ad6e 100644 --- a/webtool/views/api_tool.py +++ b/webtool/views/api_tool.py @@ -1246,11 +1246,7 @@ def export_packed_dataset(key=None, component=None): return error(403, error="You cannot export unfinished datasets.") if component == "metadata": - metadata = db.fetchone("SELECT * FROM datasets WHERE key = %s", (dataset.key,)) - - # get 4CAT version (presumably to ensure export is compatible with import) - metadata["current_4CAT_version"] = get_software_version() - return jsonify(metadata) + return jsonify(dataset.get_metadata()) elif component == "children": children = [d["key"] for d in db.fetchall("SELECT key FROM datasets WHERE key_parent = %s AND is_finished = TRUE", (dataset.key,))] From dbcc7bddc5b3b440254c3645407629d304057931 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 1 Oct 2024 17:32:59 +0200 Subject: [PATCH 16/26] remove auto settings deletion --- .env | 2 +- common/config_manager.py | 10 ---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/.env b/.env index d03f9c703..7d89486c7 100644 --- a/.env +++ b/.env @@ -2,7 +2,7 @@ # https://hub.docker.com/repository/docker/digitalmethodsinitiative/4cat/tags?page=1&ordering=last_updated DOCKER_TAG=stable # You can select Postrgres Docker image tags here to suit your needs: https://hub.docker.com/_/postgres -POSTGRES_TAG=latest +POSTGRES_TAG=15 # Database setup POSTGRES_USER=fourcat diff --git a/common/config_manager.py b/common/config_manager.py index eb6c846d0..1b8d4052f 100644 --- a/common/config_manager.py +++ b/common/config_manager.py @@ -146,16 +146,6 @@ def ensure_database(self): """ self.with_db() - # delete unknown keys - known_keys = tuple([names for names, settings in config.config_definition.items() if settings.get("type") not in UserInput.OPTIONS_COSMETIC]) - unknown_keys = self.db.fetchall("SELECT DISTINCT name FROM settings WHERE name NOT IN %s", (known_keys,)) - - if unknown_keys: - self.db.log.info(f"Deleting unknown settings from database: {', '.join([key['name'] for key in unknown_keys])}") - self.db.delete("settings", where={"name": tuple([key["name"] for key in unknown_keys])}, commit=False) - - self.db.commit() - # create global values for known keys with the default known_settings = self.get_all() for setting, parameters in self.config_definition.items(): From 8559a113313cad860d4a5a2b5bb55943278bfc6c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 1 Oct 2024 17:33:50 +0200 Subject: [PATCH 17/26] undo .env change --- .env | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.env b/.env index 7d89486c7..d03f9c703 100644 --- a/.env +++ b/.env @@ -2,7 +2,7 @@ # https://hub.docker.com/repository/docker/digitalmethodsinitiative/4cat/tags?page=1&ordering=last_updated DOCKER_TAG=stable # You can select Postrgres Docker image tags here to suit your needs: https://hub.docker.com/_/postgres -POSTGRES_TAG=15 +POSTGRES_TAG=latest # Database setup POSTGRES_USER=fourcat From d769be44adb920503c33f88777d2879dcca4b98c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 3 Oct 2024 16:48:46 +0200 Subject: [PATCH 18/26] douyin fix link for streams --- datasources/douyin/search_douyin.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datasources/douyin/search_douyin.py b/datasources/douyin/search_douyin.py index 4b5d5b814..12768196c 100644 --- a/datasources/douyin/search_douyin.py +++ 
b/datasources/douyin/search_douyin.py @@ -218,7 +218,7 @@ def map_item(item): "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"), "post_source_domain": urllib.parse.unquote(metadata.get("source_platform_url")), # Adding this as different Douyin pages contain different data - "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}", + "post_url": f"https://www.douyin.com/video/{item[aweme_id_key]}" if subject == "Post" else f"https://live.douyin.com/{author.get('web_rid')}", "region": item.get("region", ""), "hashtags": ",".join( [tag[hashtag_key] for tag in (item[text_extra_key] if item[text_extra_key] is not None else []) if From e4c0099d75cdc27f0e1f3f3609a8af93c52b425c Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 8 Oct 2024 12:07:57 +0200 Subject: [PATCH 19/26] word-trees allow selection of column --- processors/visualisation/word-trees.py | 148 ++++++++++++++----------- 1 file changed, 86 insertions(+), 62 deletions(-) diff --git a/processors/visualisation/word-trees.py b/processors/visualisation/word-trees.py index f7783bcc1..0dfe2d408 100644 --- a/processors/visualisation/word-trees.py +++ b/processors/visualisation/word-trees.py @@ -38,71 +38,94 @@ class MakeWordtree(BasicProcessor): "Wattenberg, M., & Viégas, F. B. (2008). The Word Tree, an Interactive Visual Concordance. IEEE Transactions on Visualization and Computer Graphics, 14(6), 1221–1228. " ] - options = { - "query": { - "type": UserInput.OPTION_TEXT, - "default": "", - "help": "Word tree root query", - "tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace." - }, - "limit": { - "type": UserInput.OPTION_TEXT, - "default": 3, - "min": 1, - "max": 25, - "help": "Max branches/level", - "tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25." - }, - "window": { - "type": UserInput.OPTION_TEXT, - "min": 1, - "max": 10, - "default": 5, - "help": "Window size", - "tooltip": "Up to this many words before and/or after the queried phrase will be visualised" - }, - "sides": { - "type": UserInput.OPTION_CHOICE, - "default": "right", - "options": { - "left": "Before query", - "right": "After query", - "both": "Before and after query" + @classmethod + def get_options(cls, parent_dataset=None, user=None): + """ + Get processor options + """ + options = { + "column": { + "type": UserInput.OPTION_TEXT, + "help": "Text column", + "default": "url", + "inline": True, + "tooltip": "Select the column containing the text from which to generate the word tree.", + }, + "query": { + "type": UserInput.OPTION_TEXT, + "default": "", + "help": "Word tree root query", + "tooltip": "Enter a word here to serve as the root of the word tree. The context of this query will be mapped in the tree visualisation. Cannot be empty or contain whitespace." + }, + "limit": { + "type": UserInput.OPTION_TEXT, + "default": 3, + "min": 1, + "max": 25, + "help": "Max branches/level", + "tooltip": "Limit the amount of branches per level, sorted by most-occuring phrases. Range 1-25." 
}, - "help": "Query context to visualise" - }, - "align": { - "type": UserInput.OPTION_CHOICE, - "default": "middle", - "options": { - "middle": "Vertically centered", - "top": "Top", + "window": { + "type": UserInput.OPTION_TEXT, + "min": 1, + "max": 10, + "default": 5, + "help": "Window size", + "tooltip": "Up to this many words before and/or after the queried phrase will be visualised" }, - "help": "Visual alignment" - }, - "tokeniser_type": { - "type": UserInput.OPTION_CHOICE, - "default": "regular", - "options": { - "regular": "nltk word_tokenize", - "jieba-cut": "jieba (for Chinese text; accurate mode, recommended)", - "jieba-cut-all": "jieba (for Chinese text; full mode)", - "jieba-search": "jieba (for Chinese text; search engine suggestion style)", + "sides": { + "type": UserInput.OPTION_CHOICE, + "default": "right", + "options": { + "left": "Before query", + "right": "After query", + "both": "Before and after query" + }, + "help": "Query context to visualise" }, - "help": "Tokeniser", - "tooltip": "What heuristic to use to split up the text into separate words." - }, - "strip-urls": { - "type": UserInput.OPTION_TOGGLE, - "default": True, - "help": "Remove URLs" - }, - "strip-symbols": { - "type": UserInput.OPTION_TOGGLE, - "default": True, - "help": "Remove punctuation" + "align": { + "type": UserInput.OPTION_CHOICE, + "default": "middle", + "options": { + "middle": "Vertically centered", + "top": "Top", + }, + "help": "Visual alignment" + }, + "tokeniser_type": { + "type": UserInput.OPTION_CHOICE, + "default": "regular", + "options": { + "regular": "nltk word_tokenize", + "jieba-cut": "jieba (for Chinese text; accurate mode, recommended)", + "jieba-cut-all": "jieba (for Chinese text; full mode)", + "jieba-search": "jieba (for Chinese text; search engine suggestion style)", + }, + "help": "Tokeniser", + "tooltip": "What heuristic to use to split up the text into separate words." 
+ }, + "strip-urls": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Remove URLs" + }, + "strip-symbols": { + "type": UserInput.OPTION_TOGGLE, + "default": True, + "help": "Remove punctuation" + } } - } + + # Get the columns for the select columns option + if parent_dataset and parent_dataset.get_columns(): + columns = parent_dataset.get_columns() + options["column"]["type"] = UserInput.OPTION_CHOICE + options["column"]["options"] = {v: v for v in columns} + options["column"]["default"] = "body" if "body" in columns else sorted( + columns, + key=lambda k: any([name in k for name in ["text", "subject", "description"]]), reverse=True).pop(0) + + return options @classmethod def is_compatible_with(cls, module=None, user=None): @@ -146,6 +169,7 @@ def process(self): delete_regex = re.compile(r"[^a-zA-Z)(.,\n -]") # settings + column = self.parameters.get("column") strip_urls = self.parameters.get("strip-urls") strip_symbols = self.parameters.get("strip-symbols") sides = self.parameters.get("sides") @@ -187,7 +211,7 @@ def process(self): processed += 1 if processed % 500 == 0: self.dataset.update_status("Processing and tokenising post %i" % processed) - body = post["body"] + body = post.get(column) if not body: continue From a269f96ed0cf296400fc1d5b4252d0a6765dda52 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 8 Oct 2024 12:31:22 +0200 Subject: [PATCH 20/26] use punkt_tab instead of punkt due to pickle issue: https://github.com/nltk/nltk/issues/3293 --- helper-scripts/first-run.py | 2 +- helper-scripts/migrate.py | 4 ++-- processors/text-analysis/tokenise.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/helper-scripts/first-run.py b/helper-scripts/first-run.py index dea0fd487..a565a591e 100644 --- a/helper-scripts/first-run.py +++ b/helper-scripts/first-run.py @@ -40,7 +40,7 @@ # Now check for presence of required NLTK packages import nltk -nltk_downloads = ("wordnet", "punkt", "omw-1.4") +nltk_downloads = ("wordnet", "punkt_tab", "omw-1.4") for package in nltk_downloads: # if it already exists, .download() will just NOP try: diff --git a/helper-scripts/migrate.py b/helper-scripts/migrate.py index 25071afe4..55c26c044 100644 --- a/helper-scripts/migrate.py +++ b/helper-scripts/migrate.py @@ -69,9 +69,9 @@ def check_for_nltk(): # NLTK import nltk try: - nltk.data.find('tokenizers/punkt') + nltk.data.find('tokenizers/punkt_tab') except LookupError: - nltk.download('punkt', quiet=True) + nltk.download('punkt_tab', quiet=True) try: nltk.data.find('corpora/wordnet') except LookupError: diff --git a/processors/text-analysis/tokenise.py b/processors/text-analysis/tokenise.py index a104306f1..17c350c86 100644 --- a/processors/text-analysis/tokenise.py +++ b/processors/text-analysis/tokenise.py @@ -357,7 +357,7 @@ def dummy_function(x, *args, **kwargs): # for russian we use a special purpose splitter with better # performance sentence_method = razdel.sentenize - elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt')) if + elif language not in [lang.split('.')[0] for lang in os.listdir(nltk.data.find('tokenizers/punkt_tab')) if 'pickle' in lang]: self.dataset.update_status( f"Language {language} not available for sentence tokenizer; grouping by item/post instead.") From db5b6498acd9310a0849be4abf21ab7b04a979bc Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 8 Oct 2024 13:10:50 +0200 Subject: [PATCH 21/26] histwords: return on fail --- processors/visualisation/histwords.py | 1 + 1 file changed, 1 insertion(+) 
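Stepping back to the punkt_tab switch in PATCH 20/26 above: with an NLTK release that ships punkt_tab (the pickle-free replacement for punkt discussed in the nltk issue linked in that commit message), the check-then-download pattern used in migrate.py boils down to the sketch below. The sample sentence is illustrative only.

    import nltk

    # fetch the punkt_tab tokenizer data only if it is not already installed
    try:
        nltk.data.find("tokenizers/punkt_tab")
    except LookupError:
        nltk.download("punkt_tab", quiet=True)

    # recent NLTK versions then use punkt_tab for sentence splitting
    print(nltk.sent_tokenize("4CAT tokenises text. It can also split it into sentences."))
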
diff --git a/processors/visualisation/histwords.py b/processors/visualisation/histwords.py index f6ae05261..7463e9662 100644 --- a/processors/visualisation/histwords.py +++ b/processors/visualisation/histwords.py @@ -243,6 +243,7 @@ def process(self): vectors = tsne.fit_transform(vectors) except ValueError: self.dataset.finish_with_error("Insufficient data to reduce to 2D. The word embeddings model may be too small to visualise properly.") + return elif reduction_method == "TruncatedSVD": # standard sklearn parameters made explicit svd = TruncatedSVD(n_components=2, algorithm="randomized", n_iter=5, random_state=0) From d399921b7538bc06264b122c8a0d523826bb91a5 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 11 Oct 2024 17:12:21 +0200 Subject: [PATCH 22/26] Configurable model list for stormtrooper processor --- processors/machine_learning/annotate_text.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 954963de4..59016e077 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -40,6 +40,11 @@ class TextClassifier(BasicProcessor): "type": UserInput.OPTION_TOGGLE, "default": False, "help": "Enable LLM-powered text classification", + }, + "dmi-service-manager.stormtrooper_models": { + "type": UserInput.OPTION_TEXT, + "default": "google/flan-t5-large,tiiaue/falcon-7b-instruct", + "help": "Comma-separated list of models that can be selected" } } @@ -53,8 +58,6 @@ class TextClassifier(BasicProcessor): "type": UserInput.OPTION_CHOICE, "default": "google/flan-t5-large", "options": { - "google/flan-t5-large": "google/flan-t5-large", - "tiiaue/falcon-7b-instruct": "tiiaue/falcon-7b-instruct" }, "help": "Large Language Model to use" }, @@ -97,6 +100,10 @@ def get_options(cls, parent_dataset=None, user=None): :return dict: Processor options """ options = cls.options + + models = config.get("dmi-service-manager.stormtrooper_models", user=user).split(",") + options["model"]["options"] = {m: m for m in models} + if parent_dataset is None: return options From 0bc88a3fabcb8818720ab0a6ea40e166cbcb45e3 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Fri, 11 Oct 2024 17:17:18 +0200 Subject: [PATCH 23/26] Add references to stormtrooper processor --- processors/machine_learning/annotate_text.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/processors/machine_learning/annotate_text.py b/processors/machine_learning/annotate_text.py index 59016e077..022e96de5 100644 --- a/processors/machine_learning/annotate_text.py +++ b/processors/machine_learning/annotate_text.py @@ -31,6 +31,13 @@ class TextClassifier(BasicProcessor): "provided categories.") # description displayed in UI extension = "csv" # extension of result file, used internally and in UI + references = [ + "Annotations are made using the [Stormtrooper](https://centre-for-humanities-computing.github.io/stormtrooper/) library", + "Model card: [google/flan-t5-large](https://huggingface.co/google/flan-t5-large)", + "Model card: [tiiuae/falcon-7b-instruct](https://huggingface.co/tiiuae/falcon-7b-instruct)", + "Model card: [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct)" + ] + config = { "dmi-service-manager.stormtrooper_intro-1": { "type": UserInput.OPTION_INFO, From c27fbbe44175740bffa959fc21d3d98cb42758ce Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 14 Oct 2024 09:35:27 +0200 Subject: [PATCH 
24/26] Yet more ways LinkedIn stores image URLs --- datasources/linkedin/search_linkedin.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datasources/linkedin/search_linkedin.py b/datasources/linkedin/search_linkedin.py index f357341ed..a8380b4d8 100644 --- a/datasources/linkedin/search_linkedin.py +++ b/datasources/linkedin/search_linkedin.py @@ -79,7 +79,10 @@ def map_item(item): # or alternatively they are stored here: if not images and item["content"] and item["content"].get("articleComponent") and item["content"]["articleComponent"].get("largeImage"): image = item["content"]["articleComponent"]["largeImage"]["attributes"][0]["detailData"]["vectorImage"] - images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"]) + if not image and item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]: + images.append(item["content"]["articleComponent"]["largeImage"]["attributes"][0]["imageUrl"]["url"]) + elif image and image.get("artifacts"): + images.append(image["rootUrl"] + image["artifacts"][0]["fileIdentifyingUrlPathSegment"]) author = SearchLinkedIn.get_author(item) From a68f5d64de14c02368ba7b9b7dd0a1118cd15417 Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 14 Oct 2024 12:21:17 +0200 Subject: [PATCH 25/26] Threads data source --- datasources/threads/DESCRIPTION.md | 9 ++++ datasources/threads/__init__.py | 12 +++++ datasources/threads/search_threads.py | 78 +++++++++++++++++++++++++++ 3 files changed, 99 insertions(+) create mode 100644 datasources/threads/DESCRIPTION.md create mode 100644 datasources/threads/__init__.py create mode 100644 datasources/threads/search_threads.py diff --git a/datasources/threads/DESCRIPTION.md b/datasources/threads/DESCRIPTION.md new file mode 100644 index 000000000..22f95bba8 --- /dev/null +++ b/datasources/threads/DESCRIPTION.md @@ -0,0 +1,9 @@ +The Threads data source can be used to manipulate data collected from [Threads](https://threads.net) - Meta's +microblogging platform - with [Zeeschuimer](https://github.com/digitalmethodsinitiative/zeeschuimer). Data is collected +with the browser extension; 4CAT cannot collect data on its own. After collecting data with Zeeschuimer it can be +uploaded to 4CAT for further processing and analysis. See the Zeeschuimer documentation for more information on how to +collect data with it. + +Data is collected as it is formatted internally by Threads' website. Posts are stored as (large) JSON objects; it +will usually be easier to make sense of the data by downloading it as a CSV file from 4CAT instead. The JSON structure +is relatively straightforward and contains some data not included in the CSV exports. \ No newline at end of file diff --git a/datasources/threads/__init__.py b/datasources/threads/__init__.py new file mode 100644 index 000000000..a4f019429 --- /dev/null +++ b/datasources/threads/__init__.py @@ -0,0 +1,12 @@ +""" +Initialize Threads data source +""" + +# An init_datasource function is expected to be available to initialize this +# data source. A default function that does this is available from the +# backend helpers library. 
+from common.lib.helpers import init_datasource + +# Internal identifier for this data source +DATASOURCE = "threads" +NAME = "Threads" \ No newline at end of file diff --git a/datasources/threads/search_threads.py b/datasources/threads/search_threads.py new file mode 100644 index 000000000..02c8c2de4 --- /dev/null +++ b/datasources/threads/search_threads.py @@ -0,0 +1,78 @@ +""" +Import scraped Threads data + +It's prohibitively difficult to scrape data from Threads within 4CAT itself due +to its aggressive rate limiting. Instead, import data collected elsewhere. +""" +from datetime import datetime +from urllib.parse import urlparse, parse_qs, unquote +import re + +from backend.lib.search import Search +from common.lib.item_mapping import MappedItem + + +class SearchThreads(Search): + """ + Import scraped Threads data + """ + type = "threads-search" # job ID + category = "Search" # category + title = "Import scraped Threads data" # title displayed in UI + description = "Import Threads data collected with an external tool such as Zeeschuimer." # description displayed in UI + extension = "ndjson" # extension of result file, used internally and in UI + is_from_zeeschuimer = True + + # not available as a processor for existing datasets + accepts = [None] + references = [ + "[Zeeschuimer browser extension](https://github.com/digitalmethodsinitiative/zeeschuimer)", + "[Worksheet: Capturing TikTok data with Zeeschuimer and 4CAT](https://tinyurl.com/nmrw-zeeschuimer-tiktok)" + ] + + def get_items(self, query): + """ + Run custom search + + Not available for 9gag + """ + raise NotImplementedError("Threads datasets can only be created by importing data from elsewhere") + + @staticmethod + def map_item(post): + post_timestamp = datetime.fromtimestamp(post["taken_at"]) + + if post["carousel_media"]: + image_urls = [c["image_versions2"]["candidates"].pop(0)["url"] for c in post["carousel_media"] if c["image_versions2"]] + video_urls = [c["video_versions"].pop(0)["url"] for c in post["carousel_media"] if c["video_versions"]] + else: + image_urls = [post["image_versions2"]["candidates"].pop(0)["url"]] if post["image_versions2"].get("candidates") else [] + video_urls = [post["video_versions"].pop(0)["url"]] if post["video_versions"] else [] + + linked_url = "" + link_thumbnail = "" + if post["text_post_app_info"].get("link_preview_attachment"): + linked_url = post["text_post_app_info"]["link_preview_attachment"]["url"] + linked_url = parse_qs(urlparse(linked_url).query).get("u", "").pop() + link_thumbnail = post["text_post_app_info"]["link_preview_attachment"].get("image_url") + + return MappedItem({ + "id": post["code"], + "url": f"https://www.threads.net/@{post['user']['username']}/post/{post['code']}", + "body": post["caption"]["text"] if post["caption"] else "", + "timestamp": post_timestamp.strftime("%Y-%m-%d %H:%M:%S"), + "author": post["user"]["username"], + "author_is_verified": "yes" if post["user"].get("is_verified") else "no", + "author_avatar": post["user"].get("profile_pic_url"), + "image_url": ",".join(image_urls), + "video_url": ",".join(video_urls), + "link_url": linked_url, + "link_thumbnail_url": link_thumbnail if link_thumbnail else "", + "is_paid_partnership": "yes" if post["is_paid_partnership"] else "no", + "likes": post["like_count"], + "reposts": post["text_post_app_info"]["repost_count"], + "replies": post["text_post_app_info"]["direct_reply_count"], + "quotes": post["text_post_app_info"]["quote_count"], + "hashtags": 
",".join(re.findall(r"#([^\s!@#$%ˆ&*()_+{}:\"|<>?\[\];'\,./`~']+)", post["caption"]["text"])) if post["caption"] else "", + "unix_timestamp": int(post_timestamp.timestamp()), + }) From cbbf89ec35c782823240668d653a5e921e15794a Mon Sep 17 00:00:00 2001 From: Stijn Peeters Date: Mon, 14 Oct 2024 15:09:12 +0200 Subject: [PATCH 26/26] Don't crash URL Titles when trying to extract URLs from numbers --- processors/metrics/url_titles.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/processors/metrics/url_titles.py b/processors/metrics/url_titles.py index e32e3538d..75ebd12d0 100644 --- a/processors/metrics/url_titles.py +++ b/processors/metrics/url_titles.py @@ -145,7 +145,7 @@ def process(self): self.dataset.update_status("Finding URLs in dataset") for item in self.source_dataset.iterate_items(self): # combine column contents that we need to extract URLs from - source_text = " ".join([item[column] for column in columns]) + source_text = " ".join([str(item[column]) for column in columns]) urls = ural.urls_from_text(source_text) for url in urls: