Skip to content

Add export_docs functionality, which saves separate files from the Lutherscripts word-tokenized corpus JSON #8

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions lutherscripts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


def add_arguments(parser):
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler or word_document_probability")
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability", "export_docs", "export_prepared_text"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler, export_docs, export_prepared_text or word_document_probability")
parser.add_argument("-1", "--first-detail", type=float, help="First detail flag for operation, depends on the operation")
parser.add_argument("-2", "--second-detail", type=float, help="Second detail flag for operation, depends on the operation")
parser.add_argument("-3", "--third-detail", type=int, help="Third detail flag for operation, depends on the operation")
Expand Down Expand Up @@ -52,6 +52,14 @@ def word_document_probability(source_path, corpus_path, dictionary_path, destina
from src.text_processing.gensim_word_document_probability import main as gensim_word_document_probability
output = gensim_word_document_probability(source_path, corpus_path, dictionary_path, destination_path)

def export_docs(source_path, destination_path):
    """Export a Lutherscripts word-tokenized corpus JSON into separate text files.

    source_path      -- path to the word-tokenized corpus JSON
    destination_path -- location the separate document files are written to

    Returns the underlying exporter's result so callers can inspect it
    (previously the result was bound to an unused local and discarded).
    """
    # Imported lazily, matching the other operation wrappers in this module,
    # so the CLI starts without loading every text-preparation dependency.
    from src.text_preparation.python_createseparatefiles_from_metadata import main as _export_main
    return _export_main(source_path, destination_path)

def export_prepared_text(source_path, destination_path):
    """Export a Lutherscripts-prepared text file into separate text documents.

    source_path      -- path to the prepared (markup-segmented) text file
    destination_path -- location the separate document files are written to

    Returns the underlying exporter's result so callers can inspect it
    (previously the result was bound to an unused local and discarded).
    """
    # Imported lazily, matching the other operation wrappers in this module.
    from src.text_preparation.python_createseparatefiles_from_prepared_text import main as _export_main
    return _export_main(source_path, destination_path)

def cli_main():
parser = argparse.ArgumentParser(description="Lutherscript operations launcher")
add_arguments(parser)
Expand Down Expand Up @@ -91,9 +99,11 @@ def cli_main():
freq_analysis(source_path, destination_path)
elif args.operation == 'build_corpus':
build_corpus(source_path, destination_path, args.first_detail, args.second_detail)
elif args.operation == 'export_docs':
export_docs(source_path, destination_path)
elif args.operation == 'export_prepared_text':
export_prepared_text(source_path, destination_path)


if __name__ == '__main__':
cli_main()


cli_main()
172 changes: 95 additions & 77 deletions lutherscripts/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,41 +12,33 @@
from concurrent.futures import ThreadPoolExecutor
import asyncio
import time
import queue



__author__ = "benjamsf"
__license__ = "MIT"

# This flag and queue are used for communication between threads and updating the GUI
stop_flag = [False]
message_queue = queue.Queue()

class CustomTextRedirector:
    """File-like object that redirects stdout/stderr writes into a Tk Text widget."""

    def __init__(self, widget):
        # Text widget that receives all redirected output.
        self.widget = widget
        # NOTE(review): the two configure() calls below look like interleaved
        # removed/added lines from a diff -- the second overrides the first.
        # Confirm against the repository which one should survive.
        self.widget.configure(background='black', foreground='green') # Set background and text color
        self.encoding = 'utf-8' # Set the encoding for the widget
        self.widget.configure(background='black', foreground='green', font=('Arial', 12))

    def write(self, message):
        # NOTE(review): the first five statements duplicate the guarded block
        # below and insert *bytes* (message.encode) into the widget; they
        # appear to be leftover pre-diff lines -- verify against the repo.
        self.widget.configure(state='normal')
        self.widget.insert(tk.END, message.encode(self.encoding)) # Encode the message with the specified encoding
        self.widget.see(tk.END)
        self.widget.configure(state='disabled')
        self.widget.update()
        # Only touch the widget if it still exists (the window may be closed
        # while a background operation is still printing).
        if self.widget.winfo_exists():
            self.widget.configure(state='normal')
            self.widget.insert(tk.END, message)
            self.widget.see(tk.END)
            self.widget.configure(state='disabled')

    def flush(self):
        # No buffering to flush; present to satisfy the file-object protocol.
        pass

    def readline(self):
        # Reads are not supported on this redirect target; always report EOF.
        return ''

def create_image_label(parent, root, frames):
    """Create and grid the Label that displays the Luther animation frames.

    parent -- container widget the label is placed into
    root   -- accepted but unused in the visible body
    frames -- sequence of Tk images; frames[0] is shown initially
    Returns the created Label.
    """
    lbl_luther_image = tk.Label(parent, image=frames[0])
    lbl_luther_image.grid(row=0, rowspan=8, column=0, padx=10, pady=10)

    return lbl_luther_image

    # NOTE(review): unreachable -- looks like a diff artifact where the
    # replacement body (`pass`) was interleaved with the removed one.
    pass # Nothing to do here for now

def gui_main():

root = tk.Tk()
root.geometry("1500x640")
root.title("Lutherscripts (Dev version) - A NLP toolset for Latin language")
Expand Down Expand Up @@ -121,17 +113,21 @@ def gui_main():
("kwic_analysis", "Perform KWIC analysis from your JSON word tokenized text"),
("freq_analysis", "Perform word frequency analysis from your JSON word tokenized text"),
("build_corpus", "Build a dictionary and corpus from your JSON word tokenized text"),
("topic_modeling", "Perform Topic Modeling from your dictionary and corpus")
("topic_modeling", "Perform Topic Modeling from your dictionary and corpus"),
("export_docs", "Export the tokenized JSON to multiple txt documents"),
("export_prepared_text", "Export Lutherscripts prepared txt to multiple txt documents")
]

def update_explanation(*args):
explanations = {
"Tokenize Latin text by words": "This operation will tokenize your Latin text by words, which is required for further word-based natural language processing, using CLTK. You can manually segmentate the text via inserting a headline in a format #Detail,Otherdetail,Thirddetail# and end marker of the segment as #end#. That will be interpreted by the tokenizer as a single document, with metadata provided in the header",
"Tokenize Latin text by sentences": "This operation will tokenize your Latin text by sentences, which is useful for sentence-based natural language processing, using CLTK. As of dev version, not in the par of the other operations.",
"Perform KWIC analysis from your JSON word tokenized text": "This operation will perform a Key Word in Context (KWIC) analysis, allowing you to see the occurrences of a word within the context of the text, using NLTK. Source must be a Word Tokenized text in JSON format.",
"Perform word frequency analysis": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.",
"Perform word frequency analysis from your JSON word tokenized text": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.",
"Build a dictionary and corpus from your JSON word tokenized text": "This operation will build a dictionary and a corpus from your Word Tokenized text in JSON format using GenSim, for to source further operations. As Arg 1 pass minimum appearance of a word in a document corpus to be accepted to the corpus, as Arg 2 pass the maximum in a fraction of a document to do the same.",
"Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy."
"Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy.",
"Export the tokenized JSON to multiple txt documents": "Lutherscripts Latin tokenizer will output the source to a JSON array. Export that to separate txt documents for work with tools like Voyant.",
"Export Lutherscripts prepared txt to multiple txt documents": "Export Lutherscripts prepared text to multiple txt documents. Text prepared by Lutherscripts remove clutter from Luther's Werke im WWW / other raw text sources, and enables you to separate it to documents in a way that Lutherscripts understands. Now export the text to multiple txt files based on that markup, in order to use them in 3rd party tools like VoyantTools."
}

selected_operation = var_operation.get()
Expand Down Expand Up @@ -176,13 +172,75 @@ def choose_output_file():
root.grid_rowconfigure(5, weight=1)
sys.stdout = CustomTextRedirector(txt_terminal)
sys.stderr = CustomTextRedirector(txt_terminal)

def update_txt_terminal():
    """Drain every pending message from message_queue into the terminal
    widget, then reschedule itself on the Tk event loop.
    """
    try:
        # Pull messages until the queue raises Empty.
        while True:
            pending = message_queue.get_nowait()
            txt_terminal.configure(state='normal')
            txt_terminal.insert(tk.END, pending)
            txt_terminal.see(tk.END)
            txt_terminal.configure(state='disabled')
    except queue.Empty:
        pass  # queue drained -- nothing left to display
    finally:
        # Poll again in 100 ms so subprocess output keeps flowing in.
        root.after(100, update_txt_terminal)



def update_image_label(lbl, frames):
    """Advance the animation one step: rotate `frames` left in place and
    display the frame that was at the head of the list on `lbl`.
    """
    current = frames[0]
    # Rotate in place so repeated calls cycle through every frame.
    del frames[0]
    frames.append(current)
    lbl.config(image=current)

def finalize_operation():
    """Run after the background operation ends: stop the animation loop and
    make the Start button clickable again.
    """
    stop_flag[0] = True  # animate_luther polls this flag and exits its loop
    btn_play.configure(state='normal')  # re-enable the Start Operation! button

def animate_luther(stop_flag):
    """Animation loop run on a worker thread: cycle the Luther GIF frames
    until stop_flag[0] becomes True.

    stop_flag -- one-element list used as a mutable cancellation token shared
                 with the GUI thread.
    """
    while not stop_flag[0]:
        # Rotate to the next frame and show it on the image label.
        update_image_label(lbl_luther_image, frames)
        # NOTE(review): root.update() is invoked from a non-main thread here;
        # Tkinter is generally not thread-safe, so this relies on
        # implementation behaviour -- confirm, or marshal via root.after.
        root.update()
        time.sleep(interval)  # per-frame delay (closure variable)

def start_operation():
    """Kick off the selected operation: reset the terminal, launch the worker
    thread, start queue polling, and spin up the Luther animation.
    """
    # Block re-entry while an operation is in flight.
    btn_play.configure(state='disabled')

    # Wipe any output left over from a previous run.
    txt_terminal.configure(state='normal')
    txt_terminal.delete(1.0, tk.END)  # Clear existing text
    txt_terminal.configure(state='disabled')

    print("Starting the requested operation...")

    # Hand the real work to a background thread so the GUI stays responsive.
    threading.Thread(target=start_async_operation, daemon=True).start()

    # Begin polling the message queue so subprocess output reaches the widget.
    update_txt_terminal()

    # Launch the animation loop on its own daemon thread.
    stop_flag[0] = False
    threading.Thread(target=animate_luther, args=(stop_flag,), daemon=True).start()

def start_async_operation():
    """Run run_script_async() on its own asyncio event loop in a daemon thread.

    Whether the coroutine succeeds or raises, finalize_operation is scheduled
    back onto the Tk main thread via root.after, which re-enables the Start
    button and stops the animation.
    """
    def run_in_background():
        try:
            # asyncio.run creates a fresh event loop, runs the coroutine and
            # always closes the loop.  The previous explicit new_event_loop /
            # run_until_complete / close sequence leaked the loop whenever the
            # coroutine raised, because close() sat inside the try body.
            asyncio.run(run_script_async())
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            # Marshal GUI updates back onto the Tk main thread.
            root.after(0, finalize_operation)

    # stop_flag is mutated in place (no rebinding), so no `global` is needed.
    stop_flag[0] = False  # allow the animation loop to run
    threading.Thread(target=run_in_background, daemon=True).start()


async def run_script_async():
operation_name = [option[0] for option in options if option[1] == var_operation.get()][0]
Expand Down Expand Up @@ -211,24 +269,23 @@ async def run_script_async():
)

lbl_luther_image.config(image=gif1)


output = ''
output_buffer = '' # Buffer to collect output
while True:
char = await process.stdout.read(1)
if not char:
break
char = char.decode(errors='replace')
output_buffer += char.decode(errors='replace')

if '\n' in output_buffer or '\r' in output_buffer:
# Put the buffer into the queue and reset it
message_queue.put(output_buffer)
output_buffer = ''
# Ensure any remaining output is sent to the queue
if output_buffer:
message_queue.put(output_buffer)

if char == '\r':
txt_terminal.configure(state='normal')
txt_terminal.delete(f'{tk.END} -2c linestart', tk.END)
txt_terminal.insert(tk.END, output)
txt_terminal.see(tk.END)
txt_terminal.configure(state='disabled')
txt_terminal.update()
output = ''
else:
output += char

stderr_data = await process.stderr.read()
if stderr_data:
Expand All @@ -237,49 +294,8 @@ async def run_script_async():
txt_terminal.see(tk.END)
txt_terminal.configure(state='disabled')
txt_terminal.update()

def animate_luther(stop_flag):
while not stop_flag[0]:
update_image_label(lbl_luther_image, frames)
root.update()
time.sleep(interval)


def start_operation():
# Check the selected operation and validate arguments for KWIC analysis
operation_name = [option[0] for option in options if option[1] == var_operation.get()][0]
if operation_name == "kwic_analysis":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
if not argument1 or not argument2:
print("Please enter both Argument1 (Keyword) and argument2 (Context length, a number of words you want to see left and right of a keyword hit) for the KWIC analysis")
return
if operation_name == "topic_modeling":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
if not argument1 or not argument2:
print("Please enter both Argument1 (Number of Topics) and argument2 (Number of Corpus Passes during LDA Training) ")
return

# Start the animation thread
stop_flag[0] = False
animation_thread = threading.Thread(target=animate_luther, args=(stop_flag,))
animation_thread.daemon = True
animation_thread.start()

print("Starting operation...")
print("Please wait, this might take couple of seconds...")

btn_play.configure(state='disabled')

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

# Call the main function with the callback function
loop.run_until_complete(run_script_async())
loop.close()
btn_play.configure(state='normal')
stop_flag[0] = True

print("Operation finished.")

# Start Operation! button
btn_play = tk.Button(root, text="Start Operation!", command=start_operation)
Expand All @@ -295,6 +311,8 @@ def start_operation():
sys.stdout = CustomTextRedirector(txt_terminal)
sys.stderr = CustomTextRedirector(txt_terminal)

update_txt_terminal() # Start checking the queue

# Start the GUI
root.mainloop()

Expand Down
4 changes: 4 additions & 0 deletions lutherscripts/src/data/extrastopwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@
'este',
'esto',
'et',
'etc',
'etenim',
'etiam',
'etiamsi',
Expand Down Expand Up @@ -323,6 +324,7 @@
'non',
'nondum',
'nonne',
'nolo',
'nos',
'noster',
'nostra',
Expand Down Expand Up @@ -360,6 +362,7 @@
'p',
'paene',
'paro1',
'pars',
'pauca',
'paulus',
'paulus1',
Expand Down Expand Up @@ -438,6 +441,7 @@
'quis',
'quisque',
'quo',
'-que',
'quoad',
'quod',
'quodsi',
Expand Down
Loading