From 4a7cfcb552e356c9b2039d0fe26f912920dacce9 Mon Sep 17 00:00:00 2001 From: sysradium Date: Sun, 9 Feb 2025 13:46:58 +0100 Subject: [PATCH] use python-magic for better type detection in GradIO --- pyproject.toml | 5 ++-- src/smolagents/gradio_ui.py | 58 ++++++++++++++++++++++--------------- 2 files changed, 38 insertions(+), 25 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a395752f6..721476dd0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,8 @@ dependencies = [ "pillow>=11.0.0", "markdownify>=0.14.1", "duckduckgo-search>=6.3.7", - "python-dotenv" + "python-dotenv", + "python-magic" ] [project.optional-dependencies] @@ -104,4 +105,4 @@ lines-after-imports = 2 [project.scripts] smolagent = "smolagents.cli:main" -webagent = "smolagents.vision_web_browser:main" \ No newline at end of file +webagent = "smolagents.vision_web_browser:main" diff --git a/src/smolagents/gradio_ui.py b/src/smolagents/gradio_ui.py index c33b60f4a..8cfa5099f 100644 --- a/src/smolagents/gradio_ui.py +++ b/src/smolagents/gradio_ui.py @@ -13,12 +13,13 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import mimetypes import os import re import shutil from typing import Optional +import magic + from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types from smolagents.agents import ActionStep, MultiStepAgent from smolagents.memory import MemoryStep @@ -210,39 +211,50 @@ def upload_file( ], ): """ - Handle file uploads, default allowed types are .pdf, .docx, and .txt + Secure file upload handling with real MIME-type validation (no reliance on extensions). + Ensures that files are truly PDF, DOCX, or TXT before saving them safely. """ import gradio as gr if file is None: return gr.Textbox("No file uploaded", visible=True), file_uploads_log + # Use python-magic to detect real MIME type + mime = magic.Magic(mime=True) + try: - mime_type, _ = mimetypes.guess_type(file.name) + detected_mime = mime.from_file(file.name) # Get MIME type based on file content except Exception as e: - return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log + return gr.Textbox(f"Error reading file: {e}", visible=True), file_uploads_log - if mime_type not in allowed_file_types: - return gr.Textbox("File type disallowed", visible=True), file_uploads_log + # Validate detected MIME type + if detected_mime not in allowed_file_types: + return gr.Textbox(f"File type disallowed: {detected_mime}", visible=True), file_uploads_log - # Sanitize file name + # Sanitize filename (keep only safe characters) original_name = os.path.basename(file.name) - sanitized_name = re.sub( - r"[^\w\-.]", "_", original_name - ) # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores - - type_to_ext = {} - for ext, t in mimetypes.types_map.items(): - if t not in type_to_ext: - type_to_ext[t] = ext - - # Ensure the extension correlates to the mime type - sanitized_name = sanitized_name.split(".")[:-1] - sanitized_name.append("" + type_to_ext[mime_type]) - sanitized_name = "".join(sanitized_name) - - # Save the uploaded file to the specified folder - file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name)) + sanitized_name = re.sub(r"[^\w\-.]", "_", original_name) # Replace unsafe chars + + # Correct extensions based on detected MIME type + mime_to_ext = { + "application/pdf": ".pdf", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx", + "text/plain": ".txt", + } + + correct_ext = mime_to_ext.get(detected_mime, "") + base_name, _ = os.path.splitext(sanitized_name) # Remove incorrect extension + sanitized_name = f"{base_name}{correct_ext}" # Assign correct extension + + file_path = os.path.join(self.file_upload_folder, sanitized_name) + + # Prevent overwriting files by appending a counter if needed + counter = 1 + while os.path.exists(file_path): + file_path = os.path.join(self.file_upload_folder, f"{base_name}_{counter}{correct_ext}") + counter += 1 + + # Save the uploaded file securely shutil.copy(file.name, file_path) return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]