Skip to content

Commit

Permalink
use python-magic for better type detection in Gradio
Browse files Browse the repository at this point in the history
  • Loading branch information
sysradium committed Feb 9, 2025
1 parent 63adfcd commit 4a7cfcb
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 25 deletions.
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@ dependencies = [
"pillow>=11.0.0",
"markdownify>=0.14.1",
"duckduckgo-search>=6.3.7",
"python-dotenv"
"python-dotenv",
"python-magic"
]

[project.optional-dependencies]
Expand Down Expand Up @@ -104,4 +105,4 @@ lines-after-imports = 2

[project.scripts]
smolagent = "smolagents.cli:main"
webagent = "smolagents.vision_web_browser:main"
webagent = "smolagents.vision_web_browser:main"
58 changes: 35 additions & 23 deletions src/smolagents/gradio_ui.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import mimetypes
import os
import re
import shutil
from typing import Optional

import magic

from smolagents.agent_types import AgentAudio, AgentImage, AgentText, handle_agent_output_types
from smolagents.agents import ActionStep, MultiStepAgent
from smolagents.memory import MemoryStep
Expand Down Expand Up @@ -210,39 +211,50 @@ def upload_file(
],
):
"""
Handle file uploads, default allowed types are .pdf, .docx, and .txt
Secure file upload handling with real MIME-type validation (no reliance on extensions).
Ensures that files are truly PDF, DOCX, or TXT before saving them safely.
"""
import gradio as gr

if file is None:
return gr.Textbox("No file uploaded", visible=True), file_uploads_log

# Use python-magic to detect real MIME type
mime = magic.Magic(mime=True)

try:
mime_type, _ = mimetypes.guess_type(file.name)
detected_mime = mime.from_file(file.name) # Get MIME type based on file content
except Exception as e:
return gr.Textbox(f"Error: {e}", visible=True), file_uploads_log
return gr.Textbox(f"Error reading file: {e}", visible=True), file_uploads_log

if mime_type not in allowed_file_types:
return gr.Textbox("File type disallowed", visible=True), file_uploads_log
# Validate detected MIME type
if detected_mime not in allowed_file_types:
return gr.Textbox(f"File type disallowed: {detected_mime}", visible=True), file_uploads_log

# Sanitize file name
# Sanitize filename (keep only safe characters)
original_name = os.path.basename(file.name)
sanitized_name = re.sub(
r"[^\w\-.]", "_", original_name
) # Replace any non-alphanumeric, non-dash, or non-dot characters with underscores

type_to_ext = {}
for ext, t in mimetypes.types_map.items():
if t not in type_to_ext:
type_to_ext[t] = ext

# Ensure the extension correlates to the mime type
sanitized_name = sanitized_name.split(".")[:-1]
sanitized_name.append("" + type_to_ext[mime_type])
sanitized_name = "".join(sanitized_name)

# Save the uploaded file to the specified folder
file_path = os.path.join(self.file_upload_folder, os.path.basename(sanitized_name))
sanitized_name = re.sub(r"[^\w\-.]", "_", original_name) # Replace unsafe chars

# Correct extensions based on detected MIME type
mime_to_ext = {
"application/pdf": ".pdf",
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": ".docx",
"text/plain": ".txt",
}

correct_ext = mime_to_ext.get(detected_mime, "")
base_name, _ = os.path.splitext(sanitized_name) # Remove incorrect extension
sanitized_name = f"{base_name}{correct_ext}" # Assign correct extension

file_path = os.path.join(self.file_upload_folder, sanitized_name)

# Prevent overwriting files by appending a counter if needed
counter = 1
while os.path.exists(file_path):
file_path = os.path.join(self.file_upload_folder, f"{base_name}_{counter}{correct_ext}")
counter += 1

# Save the uploaded file securely
shutil.copy(file.name, file_path)

return gr.Textbox(f"File uploaded: {file_path}", visible=True), file_uploads_log + [file_path]
Expand Down

0 comments on commit 4a7cfcb

Please sign in to comment.