PYTHON-2998 Remove md5 checksums from gridfs and remove disable_md5 (#776)

Speed up gridfs tests (shaves off about 2 minutes on macOS).
This commit is contained in:
Shane Harvey 2021-11-04 17:25:11 -07:00 committed by GitHub
parent 89f41cfbd2
commit e27131546c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 94 additions and 129 deletions

View File

@ -170,12 +170,15 @@ Breaking Changes in 4.0
are passed to the server as-is rather than the previous behavior which
substituted in a projection of ``{"_id": 1}``. This means that an empty
projection will now return the entire document, not just the ``"_id"`` field.
- :class:`~pymongo.mongo_client.MongoClient` now raises a :exc:`~pymongo.errors.ConfigurationError`
when more than one URI is passed into the ``hosts`` argument.
- :class:`~pymongo.mongo_client.MongoClient` now raises a
:exc:`~pymongo.errors.ConfigurationError` when more than one URI is passed
into the ``hosts`` argument.
- :class:`~pymongo.mongo_client.MongoClient`` now raises an
:exc:`~pymongo.errors.InvalidURI` exception
when it encounters unescaped percent signs in username and password when
parsing MongoDB URIs.
- Removed the `disable_md5` parameter for :class:`~gridfs.GridFSBucket` and
:class:`~gridfs.GridFS`. See :ref:`removed-gridfs-checksum` for details.
Notable improvements
....................

View File

@ -819,6 +819,32 @@ Changed the default JSON encoding representation from legacy to relaxed.
The json_mode parameter for :const:`bson.json_util.dumps` now defaults to
:const:`~bson.json_util.RELAXED_JSON_OPTIONS`.
GridFS changes
--------------
.. _removed-gridfs-checksum:
disable_md5 parameter is removed
................................
Removed the `disable_md5` option for :class:`~gridfs.GridFSBucket` and
:class:`~gridfs.GridFS`. GridFS no longer generates checksums.
Applications that desire a file digest should implement it outside GridFS
and store it with other file metadata. For example::
import hashlib
my_db = MongoClient().test
fs = GridFSBucket(my_db)
grid_in = fs.open_upload_stream("test_file")
file_data = b'...'
sha356 = hashlib.sha256(file_data).hexdigest()
grid_in.write(file_data)
grid_in.sha356 = sha356 # Set the custom 'sha356' field
grid_in.close()
Note that for large files, the checksum may need to be computed in chunks
to avoid the excessive memory needed to load the entire file at once.
Removed features with no migration path
---------------------------------------

View File

@ -39,7 +39,7 @@ from gridfs.grid_file import (GridIn,
class GridFS(object):
"""An instance of GridFS on top of a single Database.
"""
def __init__(self, database, collection="fs", disable_md5=False):
def __init__(self, database, collection="fs"):
"""Create a new instance of :class:`GridFS`.
Raises :class:`TypeError` if `database` is not an instance of
@ -48,14 +48,18 @@ class GridFS(object):
:Parameters:
- `database`: database to use
- `collection` (optional): root collection to use
- `disable_md5` (optional): When True, MD5 checksums will not be
computed for uploaded files. Useful in environments where MD5
cannot be used for regulatory or other reasons. Defaults to False.
.. versionchanged:: 4.0
Removed the `disable_md5` parameter. See
:ref:`removed-gridfs-checksum` for details.
.. versionchanged:: 3.11
Running a GridFS operation in a transaction now always raises an
error. GridFS does not support multi-document transactions.
.. versionchanged:: 3.7
Added the `disable_md5` parameter.
.. versionchanged:: 3.1
Indexes are only ensured on the first write to the DB.
@ -77,7 +81,6 @@ class GridFS(object):
self.__collection = database[collection]
self.__files = self.__collection.files
self.__chunks = self.__collection.chunks
self.__disable_md5 = disable_md5
def new_file(self, **kwargs):
"""Create a new file in GridFS.
@ -93,8 +96,7 @@ class GridFS(object):
:Parameters:
- `**kwargs` (optional): keyword arguments for file creation
"""
return GridIn(
self.__collection, disable_md5=self.__disable_md5, **kwargs)
return GridIn(self.__collection, **kwargs)
def put(self, data, **kwargs):
"""Put data in GridFS as a new file.
@ -126,8 +128,7 @@ class GridFS(object):
.. versionchanged:: 3.0
w=0 writes to GridFS are now prohibited.
"""
grid_file = GridIn(
self.__collection, disable_md5=self.__disable_md5, **kwargs)
grid_file = GridIn(self.__collection, **kwargs)
try:
grid_file.write(data)
finally:
@ -423,7 +424,7 @@ class GridFSBucket(object):
def __init__(self, db, bucket_name="fs",
chunk_size_bytes=DEFAULT_CHUNK_SIZE, write_concern=None,
read_preference=None, disable_md5=False):
read_preference=None):
"""Create a new instance of :class:`GridFSBucket`.
Raises :exc:`TypeError` if `database` is not an instance of
@ -442,13 +443,17 @@ class GridFSBucket(object):
(the default) db.write_concern is used.
- `read_preference` (optional): The read preference to use. If
``None`` (the default) db.read_preference is used.
- `disable_md5` (optional): When True, MD5 checksums will not be
computed for uploaded files. Useful in environments where MD5
cannot be used for regulatory or other reasons. Defaults to False.
.. versionchanged:: 4.0
Removed the `disable_md5` parameter. See
:ref:`removed-gridfs-checksum` for details.
.. versionchanged:: 3.11
Running a GridFS operation in a transaction now always raises an
error. GridFSBucket does not support multi-document transactions.
Running a GridFSBucket operation in a transaction now always raises
an error. GridFSBucket does not support multi-document transactions.
.. versionchanged:: 3.7
Added the `disable_md5` parameter.
.. versionadded:: 3.1
@ -465,8 +470,6 @@ class GridFSBucket(object):
self._bucket_name = bucket_name
self._collection = db[bucket_name]
self._disable_md5 = disable_md5
self._chunks = self._collection.chunks.with_options(
write_concern=write_concern,
read_preference=read_preference)
@ -522,11 +525,7 @@ class GridFSBucket(object):
if metadata is not None:
opts["metadata"] = metadata
return GridIn(
self._collection,
session=session,
disable_md5=self._disable_md5,
**opts)
return GridIn(self._collection, session=session, **opts)
def open_upload_stream_with_id(
self, file_id, filename, chunk_size_bytes=None, metadata=None,
@ -579,11 +578,7 @@ class GridFSBucket(object):
if metadata is not None:
opts["metadata"] = metadata
return GridIn(
self._collection,
session=session,
disable_md5=self._disable_md5,
**opts)
return GridIn(self._collection, session=session, **opts)
def upload_from_stream(self, filename, source, chunk_size_bytes=None,
metadata=None, session=None):

View File

@ -14,7 +14,6 @@
"""Tools for representing files stored in GridFS."""
import datetime
import hashlib
import io
import math
import os
@ -115,8 +114,7 @@ def _disallow_transactions(session):
class GridIn(object):
"""Class to write data to GridFS.
"""
def __init__(
self, root_collection, session=None, disable_md5=False, **kwargs):
def __init__(self, root_collection, session=None, **kwargs):
"""Write a file to GridFS
Application developers should generally not need to
@ -152,12 +150,15 @@ class GridIn(object):
- `session` (optional): a
:class:`~pymongo.client_session.ClientSession` to use for all
commands
- `disable_md5` (optional): When True, an MD5 checksum will not be
computed for the uploaded file. Useful in environments where
MD5 cannot be used for regulatory or other reasons. Defaults to
False.
- `**kwargs` (optional): file level options (see above)
.. versionchanged:: 4.0
Removed the `disable_md5` parameter. See
:ref:`removed-gridfs-checksum` for details.
.. versionchanged:: 3.7
Added the `disable_md5` parameter.
.. versionchanged:: 3.6
Added ``session`` parameter.
@ -183,8 +184,6 @@ class GridIn(object):
coll = _clear_entity_type_registry(
root_collection, read_preference=ReadPreference.PRIMARY)
if not disable_md5:
kwargs["md5"] = hashlib.md5()
# Defaults
kwargs["_id"] = kwargs.get("_id", ObjectId())
kwargs["chunkSize"] = kwargs.get("chunkSize", DEFAULT_CHUNK_SIZE)
@ -271,9 +270,6 @@ class GridIn(object):
"""Flush `data` to a chunk.
"""
self.__ensure_indexes()
if 'md5' in self._file:
self._file['md5'].update(data)
if not data:
return
assert(len(data) <= self.chunk_size)
@ -301,9 +297,6 @@ class GridIn(object):
"""
try:
self.__flush_buffer()
if "md5" in self._file:
self._file["md5"] = self._file["md5"].hexdigest()
# The GridFS spec says length SHOULD be an Int64.
self._file["length"] = Int64(self._position)
self._file["uploadDate"] = datetime.datetime.utcnow()

View File

@ -949,6 +949,13 @@ class IntegrationTest(PyMongoTestCase):
else:
cls.credentials = {}
def cleanup_colls(self, *collections):
"""Cleanup collections faster than drop_collection."""
for c in collections:
c = self.client[c.database.name][c.name]
c.delete_many({})
c.drop_indexes()
def patch_system_certs(self, ca_certs):
patcher = SystemCertsPatcher(ca_certs)
self.addCleanup(patcher.disable)

View File

@ -29,7 +29,6 @@
"length": 0,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "d41d8cd98f00b204e9800998ecf8427e",
"filename": "filename"
}
]
@ -62,7 +61,6 @@
"length": 1,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "47ed733b8d10be225eceba344d533586",
"filename": "filename"
}
]
@ -108,7 +106,6 @@
"length": 3,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "bafae3a174ab91fc70db7a6aa50f4f52",
"filename": "filename"
}
]
@ -154,7 +151,6 @@
"length": 4,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "7e7c77cff5705d1f7574a25ef6662117",
"filename": "filename"
}
]
@ -200,7 +196,6 @@
"length": 5,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "283d4fea5dded59cf837d3047328f5af",
"filename": "filename"
}
]
@ -254,7 +249,6 @@
"length": 8,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "dd254cdc958e53abaa67da9f797125f5",
"filename": "filename"
}
]
@ -309,7 +303,6 @@
"length": 1,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "47ed733b8d10be225eceba344d533586",
"filename": "filename",
"contentType": "image/jpeg"
}
@ -359,7 +352,6 @@
"length": 1,
"chunkSize": 4,
"uploadDate": "*actual",
"md5": "47ed733b8d10be225eceba344d533586",
"filename": "filename",
"metadata": {
"x": 1

View File

@ -672,7 +672,7 @@ class TestGridFileCustomType(IntegrationTest):
self.assertEqual(["foo"], two.aliases)
self.assertEqual({"foo": 'red', "bar": 'blue'}, two.metadata)
self.assertEqual(3, two.bar)
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5)
self.assertEqual(None, two.md5)
for attr in ["_id", "name", "content_type", "length", "chunk_size",
"upload_date", "aliases", "metadata", "md5"]:

View File

@ -80,8 +80,7 @@ class TestGridFileNoConnect(unittest.TestCase):
class TestGridFile(IntegrationTest):
def setUp(self):
self.db.drop_collection('fs.files')
self.db.drop_collection('fs.chunks')
self.cleanup_colls(self.db.fs.files, self.db.fs.chunks)
def test_basic(self):
f = GridIn(self.db.fs, filename="test")
@ -112,7 +111,7 @@ class TestGridFile(IntegrationTest):
f = GridIn(self.db.fs)
f.write(b"hello world\n")
f.close()
self.assertEqual("6f5902ac237024bdd0c176cb93063dc4", f.md5)
self.assertEqual(None, f.md5)
def test_alternate_collection(self):
self.db.alt.files.delete_many({})
@ -128,9 +127,6 @@ class TestGridFile(IntegrationTest):
g = GridOut(self.db.alt, f._id)
self.assertEqual(b"hello world", g.read())
# test that md5 still works...
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", g.md5)
def test_grid_in_default_opts(self):
self.assertRaises(TypeError, GridIn, "foo")
@ -194,7 +190,7 @@ class TestGridFile(IntegrationTest):
self.assertEqual({"foo": 1}, a.metadata)
self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", a.md5)
self.assertEqual(None, a.md5)
self.assertRaises(AttributeError, setattr, a, "md5", 5)
# Make sure custom attributes that were set both before and after
@ -225,7 +221,7 @@ class TestGridFile(IntegrationTest):
self.assertTrue(isinstance(b.upload_date, datetime.datetime))
self.assertEqual(None, b.aliases)
self.assertEqual(None, b.metadata)
self.assertEqual("d41d8cd98f00b204e9800998ecf8427e", b.md5)
self.assertEqual(None, b.md5)
for attr in ["_id", "name", "content_type", "length", "chunk_size",
"upload_date", "aliases", "metadata", "md5"]:
@ -266,7 +262,7 @@ class TestGridFile(IntegrationTest):
self.assertEqual(["foo"], two.aliases)
self.assertEqual({"foo": 1, "bar": 2}, two.metadata)
self.assertEqual(3, two.bar)
self.assertEqual("5eb63bbbe01eeed093cb22bb8f5acdc3", two.md5)
self.assertEqual(None, two.md5)
for attr in ["_id", "name", "content_type", "length", "chunk_size",
"upload_date", "aliases", "metadata", "md5"]:

View File

@ -97,10 +97,8 @@ class TestGridfs(IntegrationTest):
cls.alt = gridfs.GridFS(cls.db, "alt")
def setUp(self):
self.db.drop_collection("fs.files")
self.db.drop_collection("fs.chunks")
self.db.drop_collection("alt.files")
self.db.drop_collection("alt.chunks")
self.cleanup_colls(self.db.fs.files, self.db.fs.chunks,
self.db.alt.files, self.db.alt.chunks)
def test_basic(self):
oid = self.fs.put(b"hello world")
@ -158,7 +156,7 @@ class TestGridfs(IntegrationTest):
self.assertEqual(oid, raw["_id"])
self.assertTrue(isinstance(raw["uploadDate"], datetime.datetime))
self.assertEqual(255 * 1024, raw["chunkSize"])
self.assertTrue(isinstance(raw["md5"], str))
self.assertNotIn("md5", raw)
def test_corrupt_chunk(self):
files_id = self.fs.put(b'foobar')
@ -174,12 +172,11 @@ class TestGridfs(IntegrationTest):
self.fs.delete(files_id)
def test_put_ensures_index(self):
# setUp has dropped collections.
names = self.db.list_collection_names()
self.assertFalse([name for name in names if name.startswith('fs')])
chunks = self.db.fs.chunks
files = self.db.fs.files
# Ensure the collections are removed.
chunks.drop()
files.drop()
self.fs.put(b"junk")
self.assertTrue(any(
@ -484,21 +481,6 @@ class TestGridfs(IntegrationTest):
def test_md5(self):
gin = self.fs.new_file()
gin.write(b"includes md5 sum")
gin.close()
self.assertIsNotNone(gin.md5)
md5sum = gin.md5
gout = self.fs.get(gin._id)
self.assertIsNotNone(gout.md5)
self.assertEqual(md5sum, gout.md5)
_id = self.fs.put(b"also includes md5 sum")
gout = self.fs.get(_id)
self.assertIsNotNone(gout.md5)
fs = gridfs.GridFS(self.db, disable_md5=True)
gin = fs.new_file()
gin.write(b"no md5 sum")
gin.close()
self.assertIsNone(gin.md5)
@ -506,7 +488,7 @@ class TestGridfs(IntegrationTest):
gout = self.fs.get(gin._id)
self.assertIsNone(gout.md5)
_id = fs.put(b"still no md5 sum")
_id = self.fs.put(b"still no md5 sum")
gout = self.fs.get(_id)
self.assertIsNone(gout.md5)

View File

@ -86,10 +86,8 @@ class TestGridfs(IntegrationTest):
cls.db, bucket_name="alt")
def setUp(self):
self.db.drop_collection("fs.files")
self.db.drop_collection("fs.chunks")
self.db.drop_collection("alt.files")
self.db.drop_collection("alt.chunks")
self.cleanup_colls(self.db.fs.files, self.db.fs.chunks,
self.db.alt.files, self.db.alt.chunks)
def test_basic(self):
oid = self.fs.upload_from_stream("test_filename",
@ -105,7 +103,6 @@ class TestGridfs(IntegrationTest):
self.assertEqual(0, self.db.fs.chunks.count_documents({}))
def test_multi_chunk_delete(self):
self.db.fs.drop()
self.assertEqual(0, self.db.fs.files.count_documents({}))
self.assertEqual(0, self.db.fs.chunks.count_documents({}))
gfs = gridfs.GridFSBucket(self.db)
@ -130,7 +127,7 @@ class TestGridfs(IntegrationTest):
self.assertEqual(oid, raw["_id"])
self.assertTrue(isinstance(raw["uploadDate"], datetime.datetime))
self.assertEqual(255 * 1024, raw["chunkSize"])
self.assertTrue(isinstance(raw["md5"], str))
self.assertNotIn("md5", raw)
def test_corrupt_chunk(self):
files_id = self.fs.upload_from_stream("test_filename",
@ -147,12 +144,11 @@ class TestGridfs(IntegrationTest):
self.fs.delete(files_id)
def test_upload_ensures_index(self):
# setUp has dropped collections.
names = self.db.list_collection_names()
self.assertFalse([name for name in names if name.startswith('fs')])
chunks = self.db.fs.chunks
files = self.db.fs.files
# Ensure the collections are removed.
chunks.drop()
files.drop()
self.fs.upload_from_stream("filename", b"junk")
self.assertTrue(any(
@ -464,41 +460,20 @@ class TestGridfs(IntegrationTest):
self.assertEqual(file1.read(), file2.read())
def test_md5(self):
gin = self.fs.open_upload_stream("has md5")
gin.write(b"includes md5 sum")
gin.close()
self.assertIsNotNone(gin.md5)
md5sum = gin.md5
gout = self.fs.open_download_stream(gin._id)
self.assertIsNotNone(gout.md5)
self.assertEqual(md5sum, gout.md5)
gin = self.fs.open_upload_stream_with_id(ObjectId(), "also has md5")
gin.write(b"also includes md5 sum")
gin.close()
self.assertIsNotNone(gin.md5)
md5sum = gin.md5
gout = self.fs.open_download_stream(gin._id)
self.assertIsNotNone(gout.md5)
self.assertEqual(md5sum, gout.md5)
fs = gridfs.GridFSBucket(self.db, disable_md5=True)
gin = fs.open_upload_stream("no md5")
gin = self.fs.open_upload_stream("no md5")
gin.write(b"no md5 sum")
gin.close()
self.assertIsNone(gin.md5)
gout = fs.open_download_stream(gin._id)
gout = self.fs.open_download_stream(gin._id)
self.assertIsNone(gout.md5)
gin = fs.open_upload_stream_with_id(ObjectId(), "also no md5")
gin = self.fs.open_upload_stream_with_id(ObjectId(), "also no md5")
gin.write(b"also no md5 sum")
gin.close()
self.assertIsNone(gin.md5)
gout = fs.open_download_stream(gin._id)
gout = self.fs.open_download_stream(gin._id)
self.assertIsNone(gout.md5)

View File

@ -66,10 +66,8 @@ class TestAllScenarios(IntegrationTest):
"download_by_name": cls.fs.open_download_stream_by_name}
def init_db(self, data, test):
self.db.drop_collection("fs.files")
self.db.drop_collection("fs.chunks")
self.db.drop_collection("expected.files")
self.db.drop_collection("expected.chunks")
self.cleanup_colls(self.db.fs.files, self.db.fs.chunks,
self.db.expected.files, self.db.expected.chunks)
# Read in data.
if data['files']:

View File

@ -270,9 +270,7 @@ class SpecRunner(IntegrationTest):
if object_name == 'gridfsbucket':
# Only create the GridFSBucket when we need it (for the gridfs
# retryable reads tests).
obj = GridFSBucket(
database, bucket_name=collection.name,
disable_md5=True)
obj = GridFSBucket(database, bucket_name=collection.name)
else:
objects = {
'client': database.client,