From f9781069d7bb4334fc376a61e4c6f1e3f020081d Mon Sep 17 00:00:00 2001
From: Aaron Taylor
Date: Mon, 6 Jan 2025 08:38:31 -0800
Subject: [PATCH] Guess file content type when not specified (#655)

Co-authored-by: Martin Durant
---
 gcsfs/core.py            | 14 +++++++++++---
 gcsfs/tests/test_core.py | 31 ++++++++++++++++++++++++++++++-
 2 files changed, 41 insertions(+), 4 deletions(-)

diff --git a/gcsfs/core.py b/gcsfs/core.py
index e6b52f0f..a78ea5f5 100644
--- a/gcsfs/core.py
+++ b/gcsfs/core.py
@@ -1,10 +1,12 @@
 """
 Google Cloud Storage pythonic interface
 """
+
 import asyncio
 import io
 import json
 import logging
+import mimetypes
 import os
 import posixpath
 import re
@@ -1391,7 +1393,7 @@ async def _put_file(
         rpath,
         metadata=None,
         consistency=None,
-        content_type="application/octet-stream",
+        content_type=None,
         chunksize=50 * 2**20,
         callback=None,
         fixed_key_metadata=None,
@@ -1401,6 +1403,10 @@ async def _put_file(
         # enforce blocksize should be a multiple of 2**18
         if os.path.isdir(lpath):
             return
+        if content_type is None:
+            content_type, _ = mimetypes.guess_type(lpath)
+            if content_type is None:
+                content_type = "application/octet-stream"
         callback = callback or NoOpCallback()
         consistency = consistency or self.consistency
         checker = get_consistency_checker(consistency)
@@ -1755,7 +1761,8 @@ def __init__(
             the number we wrote; 'md5' does a full checksum. Any value other than
             'size' or 'md5' or 'crc32c' is assumed to mean no checking.
         content_type: str
-            default is `application/octet-stream`. See the list of available
+            default when unspecified is provided by mimetypes.guess_type or
+            otherwise `application/octet-stream`. See the list of available
             content types at https://www.iana.org/assignments/media-types/media-types.txt
         metadata: dict
             Custom metadata, in key/value pairs, added at file creation
@@ -1798,7 +1805,8 @@ def __init__(
         else:
             det = {}
         self.content_type = content_type or det.get(
-            "contentType", "application/octet-stream"
+            "contentType",
+            mimetypes.guess_type(self.path)[0] or "application/octet-stream",
         )
         self.metadata = metadata or det.get("metadata", {})
         self.fixed_key_metadata = _convert_fixed_key_metadata(det, from_google=True)
diff --git a/gcsfs/tests/test_core.py b/gcsfs/tests/test_core.py
index d282aa7a..4a907143 100644
--- a/gcsfs/tests/test_core.py
+++ b/gcsfs/tests/test_core.py
@@ -890,6 +890,36 @@ def test_array(gcs):
     assert out == b"A" * 1000
 
 
+def test_content_type_set(gcs):
+    fn = TEST_BUCKET + "/content_type"
+    with gcs.open(fn, "wb", content_type="text/html") as f:
+        f.write(b"")
+    assert gcs.info(fn)["contentType"] == "text/html"
+
+
+def test_content_type_guess(gcs):
+    fn = TEST_BUCKET + "/content_type.txt"
+    with gcs.open(fn, "wb") as f:
+        f.write(b"zz")
+    assert gcs.info(fn)["contentType"] == "text/plain"
+
+
+def test_content_type_default(gcs):
+    fn = TEST_BUCKET + "/content_type.abcdef"
+    with gcs.open(fn, "wb") as f:
+        f.write(b"zz")
+    assert gcs.info(fn)["contentType"] == "application/octet-stream"
+
+
+def test_content_type_put_guess(gcs):
+    dst = TEST_BUCKET + "/content_type_put_guess"
+    with tmpfile(extension="txt") as fn:
+        with open(fn, "w") as f:
+            f.write("zz")
+        gcs.put(fn, f"gs://{dst}", b"")
+    assert gcs.info(dst)["contentType"] == "text/plain"
+
+
 def test_attrs(gcs):
     if not gcs.on_google:
         # https://github.com/fsspec/gcsfs/pull/479
@@ -1194,7 +1224,6 @@ def test_dir_marker(gcs):
 
 
 def test_mkdir_with_path(gcs):
-
     with pytest.raises(FileNotFoundError):
         gcs.mkdir(f"{TEST_BUCKET + 'new'}/path", create_parents=False)
     assert not gcs.exists(f"{TEST_BUCKET + 'new'}")