Skip to content

Add export_docs functionality, which saves separate files from the Lutherscripts word-tokenized corpus JSON #8

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions lutherscripts/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@


def add_arguments(parser):
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler or word_document_probability")
parser.add_argument("-o", "--operation", type=str, choices=["word_tokenize_latin", "sent_tokenize_latin", "kwic_analysis", "freq_analysis", "build_corpus", "topic_modeling", "word_document_probability", "export_docs", "export_prepared_text"], required=True, help="Choose operation: word_tokenize_latin, sent_tokenize_latin, kwic_analysis, corpus_builder, topic_modeler, export_docs, export_prepared_text or word_document_probability")
parser.add_argument("-1", "--first-detail", type=float, help="First detail flag for operation, depends on the operation")
parser.add_argument("-2", "--second-detail", type=float, help="Second detail flag for operation, depends on the operation")
parser.add_argument("-3", "--third-detail", type=int, help="Third detail flag for operation, depends on the operation")
Expand Down Expand Up @@ -52,6 +52,14 @@ def word_document_probability(source_path, corpus_path, dictionary_path, destina
from src.text_processing.gensim_word_document_probability import main as gensim_word_document_probability
output = gensim_word_document_probability(source_path, corpus_path, dictionary_path, destination_path)

def export_docs(source_path, destination_path):
    """Export a Lutherscripts word-tokenized corpus JSON into separate text files.

    source_path      -- path to the word-tokenized corpus JSON
    destination_path -- location the separate document files are written to

    Returns the underlying exporter's result so callers can inspect it
    (previously the result was bound to an unused local and discarded).
    """
    # Imported lazily, matching the other operation wrappers in this module,
    # so the CLI starts without loading every text-preparation dependency.
    from src.text_preparation.python_createseparatefiles_from_metadata import main as _export_main
    return _export_main(source_path, destination_path)

def export_prepared_text(source_path, destination_path):
    """Export a Lutherscripts-prepared text file into separate text documents.

    source_path      -- path to the prepared (markup-segmented) text file
    destination_path -- location the separate document files are written to

    Returns the underlying exporter's result so callers can inspect it
    (previously the result was bound to an unused local and discarded).
    """
    # Imported lazily, matching the other operation wrappers in this module.
    from src.text_preparation.python_createseparatefiles_from_prepared_text import main as _export_main
    return _export_main(source_path, destination_path)

def cli_main():
parser = argparse.ArgumentParser(description="Lutherscript operations launcher")
add_arguments(parser)
Expand Down Expand Up @@ -91,9 +99,11 @@ def cli_main():
freq_analysis(source_path, destination_path)
elif args.operation == 'build_corpus':
build_corpus(source_path, destination_path, args.first_detail, args.second_detail)
elif args.operation == 'export_docs':
export_docs(source_path, destination_path)
elif args.operation == 'export_prepared_text':
export_prepared_text(source_path, destination_path)


if __name__ == '__main__':
cli_main()


cli_main()
172 changes: 95 additions & 77 deletions lutherscripts/gui.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,41 +12,33 @@
from concurrent.futures import ThreadPoolExecutor
import asyncio
import time
import queue



__author__ = "benjamsf"
__license__ = "MIT"

# This flag and queue are used for communication between threads and updating the GUI
stop_flag = [False]
message_queue = queue.Queue()

class CustomTextRedirector:
    """File-like object that redirects stdout/stderr writes into a Tk Text widget."""

    def __init__(self, widget):
        # Text widget that receives all redirected output.
        self.widget = widget
        # NOTE(review): the two configure() calls below look like interleaved
        # removed/added lines from a diff -- the second overrides the first.
        # Confirm against the repository which one should survive.
        self.widget.configure(background='black', foreground='green') # Set background and text color
        self.encoding = 'utf-8' # Set the encoding for the widget
        self.widget.configure(background='black', foreground='green', font=('Arial', 12))

    def write(self, message):
        # NOTE(review): the first five statements duplicate the guarded block
        # below and insert *bytes* (message.encode) into the widget; they
        # appear to be leftover pre-diff lines -- verify against the repo.
        self.widget.configure(state='normal')
        self.widget.insert(tk.END, message.encode(self.encoding)) # Encode the message with the specified encoding
        self.widget.see(tk.END)
        self.widget.configure(state='disabled')
        self.widget.update()
        # Only touch the widget if it still exists (the window may be closed
        # while a background operation is still printing).
        if self.widget.winfo_exists():
            self.widget.configure(state='normal')
            self.widget.insert(tk.END, message)
            self.widget.see(tk.END)
            self.widget.configure(state='disabled')

    def flush(self):
        # No buffering to flush; present to satisfy the file-object protocol.
        pass

    def readline(self):
        # Reads are not supported on this redirect target; always report EOF.
        return ''

def create_image_label(parent, root, frames):
    """Create and grid the Label that displays the Luther animation frames.

    parent -- container widget the label is placed into
    root   -- accepted but unused in the visible body
    frames -- sequence of Tk images; frames[0] is shown initially
    Returns the created Label.
    """
    lbl_luther_image = tk.Label(parent, image=frames[0])
    lbl_luther_image.grid(row=0, rowspan=8, column=0, padx=10, pady=10)

    return lbl_luther_image

    # NOTE(review): unreachable -- looks like a diff artifact where the
    # replacement body (`pass`) was interleaved with the removed one.
    pass # Nothing to do here for now

def gui_main():

root = tk.Tk()
root.geometry("1500x640")
root.title("Lutherscripts (Dev version) - A NLP toolset for Latin language")
Expand Down Expand Up @@ -121,17 +113,21 @@ def gui_main():
("kwic_analysis", "Perform KWIC analysis from your JSON word tokenized text"),
("freq_analysis", "Perform word frequency analysis from your JSON word tokenized text"),
("build_corpus", "Build a dictionary and corpus from your JSON word tokenized text"),
("topic_modeling", "Perform Topic Modeling from your dictionary and corpus")
("topic_modeling", "Perform Topic Modeling from your dictionary and corpus"),
("export_docs", "Export the tokenized JSON to multiple txt documents"),
("export_prepared_text", "Export Lutherscripts prepared txt to multiple txt documents")
]

def update_explanation(*args):
explanations = {
"Tokenize Latin text by words": "This operation will tokenize your Latin text by words, which is required for further word-based natural language processing, using CLTK. You can manually segmentate the text via inserting a headline in a format #Detail,Otherdetail,Thirddetail# and end marker of the segment as #end#. That will be interpreted by the tokenizer as a single document, with metadata provided in the header",
"Tokenize Latin text by sentences": "This operation will tokenize your Latin text by sentences, which is useful for sentence-based natural language processing, using CLTK. As of dev version, not in the par of the other operations.",
"Perform KWIC analysis from your JSON word tokenized text": "This operation will perform a Key Word in Context (KWIC) analysis, allowing you to see the occurrences of a word within the context of the text, using NLTK. Source must be a Word Tokenized text in JSON format.",
"Perform word frequency analysis": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.",
"Perform word frequency analysis from your JSON word tokenized text": "This operation will perform a Word Frequency Analysis, allowing you to see the number of times each word has been used in your target text, using NLTK. Source must be a Word Tokenized text in JSON format.",
"Build a dictionary and corpus from your JSON word tokenized text": "This operation will build a dictionary and a corpus from your Word Tokenized text in JSON format using GenSim, for to source further operations. As Arg 1 pass minimum appearance of a word in a document corpus to be accepted to the corpus, as Arg 2 pass the maximum in a fraction of a document to do the same.",
"Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy."
"Perform Topic Modeling from your dictionary and corpus": "This operation will perform Topic Modeling using GenSim from your dictionary and corpus files. As Argument 1, pass the number of topics you want to try dig out from the text. As Argument 2, pass the number of passes to perform on the corpus. Test different values both here and during the corpus building for to achieve accuracy.",
"Export the tokenized JSON to multiple txt documents": "Lutherscripts Latin tokenizer will output the source to a JSON array. Export that to separate txt documents for work with tools like Voyant.",
"Export Lutherscripts prepared txt to multiple txt documents": "Export Lutherscripts prepared text to multiple txt documents. Text prepared by Lutherscripts remove clutter from Luther's Werke im WWW / other raw text sources, and enables you to separate it to documents in a way that Lutherscripts understands. Now export the text to multiple txt files based on that markup, in order to use them in 3rd party tools like VoyantTools."
}

selected_operation = var_operation.get()
Expand Down Expand Up @@ -176,13 +172,75 @@ def choose_output_file():
root.grid_rowconfigure(5, weight=1)
sys.stdout = CustomTextRedirector(txt_terminal)
sys.stderr = CustomTextRedirector(txt_terminal)

def update_txt_terminal():
    """Drain every pending message from message_queue into the terminal
    widget, then reschedule itself on the Tk event loop.
    """
    try:
        # Pull messages until the queue raises Empty.
        while True:
            pending = message_queue.get_nowait()
            txt_terminal.configure(state='normal')
            txt_terminal.insert(tk.END, pending)
            txt_terminal.see(tk.END)
            txt_terminal.configure(state='disabled')
    except queue.Empty:
        pass  # queue drained -- nothing left to display
    finally:
        # Poll again in 100 ms so subprocess output keeps flowing in.
        root.after(100, update_txt_terminal)



def update_image_label(lbl, frames):
    """Advance the animation one step: rotate `frames` left in place and
    display the frame that was at the head of the list on `lbl`.
    """
    current = frames[0]
    # Rotate in place so repeated calls cycle through every frame.
    del frames[0]
    frames.append(current)
    lbl.config(image=current)

def finalize_operation():
    """Run after the background operation ends: stop the animation loop and
    make the Start button clickable again.
    """
    stop_flag[0] = True  # animate_luther polls this flag and exits its loop
    btn_play.configure(state='normal')  # re-enable the Start Operation! button

def animate_luther(stop_flag):
    """Animation loop run on a worker thread: cycle the Luther GIF frames
    until stop_flag[0] becomes True.

    stop_flag -- one-element list used as a mutable cancellation token shared
                 with the GUI thread.
    """
    while not stop_flag[0]:
        # Rotate to the next frame and show it on the image label.
        update_image_label(lbl_luther_image, frames)
        # NOTE(review): root.update() is invoked from a non-main thread here;
        # Tkinter is generally not thread-safe, so this relies on
        # implementation behaviour -- confirm, or marshal via root.after.
        root.update()
        time.sleep(interval)  # per-frame delay (closure variable)

def start_operation():
    """Kick off the selected operation: reset the terminal, launch the worker
    thread, start queue polling, and spin up the Luther animation.
    """
    # Block re-entry while an operation is in flight.
    btn_play.configure(state='disabled')

    # Wipe any output left over from a previous run.
    txt_terminal.configure(state='normal')
    txt_terminal.delete(1.0, tk.END)  # Clear existing text
    txt_terminal.configure(state='disabled')

    print("Starting the requested operation...")

    # Hand the real work to a background thread so the GUI stays responsive.
    threading.Thread(target=start_async_operation, daemon=True).start()

    # Begin polling the message queue so subprocess output reaches the widget.
    update_txt_terminal()

    # Launch the animation loop on its own daemon thread.
    stop_flag[0] = False
    threading.Thread(target=animate_luther, args=(stop_flag,), daemon=True).start()

def start_async_operation():
    """Run run_script_async() on its own asyncio event loop in a daemon thread.

    Whether the coroutine succeeds or raises, finalize_operation is scheduled
    back onto the Tk main thread via root.after, which re-enables the Start
    button and stops the animation.
    """
    def run_in_background():
        try:
            # asyncio.run creates a fresh event loop, runs the coroutine and
            # always closes the loop.  The previous explicit new_event_loop /
            # run_until_complete / close sequence leaked the loop whenever the
            # coroutine raised, because close() sat inside the try body.
            asyncio.run(run_script_async())
        except Exception as e:
            print(f"An error occurred: {e}")
        finally:
            # Marshal GUI updates back onto the Tk main thread.
            root.after(0, finalize_operation)

    # stop_flag is mutated in place (no rebinding), so no `global` is needed.
    stop_flag[0] = False  # allow the animation loop to run
    threading.Thread(target=run_in_background, daemon=True).start()


async def run_script_async():
operation_name = [option[0] for option in options if option[1] == var_operation.get()][0]
Expand Down Expand Up @@ -211,24 +269,23 @@ async def run_script_async():
)

lbl_luther_image.config(image=gif1)


output = ''
output_buffer = '' # Buffer to collect output
while True:
char = await process.stdout.read(1)
if not char:
break
char = char.decode(errors='replace')
output_buffer += char.decode(errors='replace')

if '\n' in output_buffer or '\r' in output_buffer:
# Put the buffer into the queue and reset it
message_queue.put(output_buffer)
output_buffer = ''
# Ensure any remaining output is sent to the queue
if output_buffer:
message_queue.put(output_buffer)

if char == '\r':
txt_terminal.configure(state='normal')
txt_terminal.delete(f'{tk.END} -2c linestart', tk.END)
txt_terminal.insert(tk.END, output)
txt_terminal.see(tk.END)
txt_terminal.configure(state='disabled')
txt_terminal.update()
output = ''
else:
output += char

stderr_data = await process.stderr.read()
if stderr_data:
Expand All @@ -237,49 +294,8 @@ async def run_script_async():
txt_terminal.see(tk.END)
txt_terminal.configure(state='disabled')
txt_terminal.update()

def animate_luther(stop_flag):
while not stop_flag[0]:
update_image_label(lbl_luther_image, frames)
root.update()
time.sleep(interval)


def start_operation():
# Check the selected operation and validate arguments for KWIC analysis
operation_name = [option[0] for option in options if option[1] == var_operation.get()][0]
if operation_name == "kwic_analysis":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
if not argument1 or not argument2:
print("Please enter both Argument1 (Keyword) and argument2 (Context length, a number of words you want to see left and right of a keyword hit) for the KWIC analysis")
return
if operation_name == "topic_modeling":
argument1 = ent_argument1.get()
argument2 = ent_argument2.get()
if not argument1 or not argument2:
print("Please enter both Argument1 (Number of Topics) and argument2 (Number of Corpus Passes during LDA Training) ")
return

# Start the animation thread
stop_flag[0] = False
animation_thread = threading.Thread(target=animate_luther, args=(stop_flag,))
animation_thread.daemon = True
animation_thread.start()

print("Starting operation...")
print("Please wait, this might take couple of seconds...")

btn_play.configure(state='disabled')

loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)

# Call the main function with the callback function
loop.run_until_complete(run_script_async())
loop.close()
btn_play.configure(state='normal')
stop_flag[0] = True

print("Operation finished.")

# Start Operation! button
btn_play = tk.Button(root, text="Start Operation!", command=start_operation)
Expand All @@ -295,6 +311,8 @@ def start_operation():
sys.stdout = CustomTextRedirector(txt_terminal)
sys.stderr = CustomTextRedirector(txt_terminal)

update_txt_terminal() # Start checking the queue

# Start the GUI
root.mainloop()

Expand Down
4 changes: 4 additions & 0 deletions lutherscripts/src/data/extrastopwords.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@
'este',
'esto',
'et',
'etc',
'etenim',
'etiam',
'etiamsi',
Expand Down Expand Up @@ -323,6 +324,7 @@
'non',
'nondum',
'nonne',
'nolo',
'nos',
'noster',
'nostra',
Expand Down Expand Up @@ -360,6 +362,7 @@
'p',
'paene',
'paro1',
'pars',
'pauca',
'paulus',
'paulus1',
Expand Down Expand Up @@ -438,6 +441,7 @@
'quis',
'quisque',
'quo',
'-que',
'quoad',
'quod',
'quodsi',
Expand Down
Loading