From da4df7955529ce6edcebc65e10dbaab977a132fc Mon Sep 17 00:00:00 2001 From: Shane Harvey Date: Mon, 7 Nov 2022 10:37:33 -0800 Subject: [PATCH] PYTHON-3508 Improve the performance of GridOut.readline and GridOut.read (#1109) --- doc/changelog.rst | 19 +++++-- gridfs/grid_file.py | 119 ++++++++++++++++++++++---------------------- 2 files changed, 76 insertions(+), 62 deletions(-) diff --git a/doc/changelog.rst b/doc/changelog.rst index 4f4e5ace7..b3587e04c 100644 --- a/doc/changelog.rst +++ b/doc/changelog.rst @@ -4,10 +4,23 @@ Changelog Changes in Version 4.3.3 ------------------------ -- Fixed a performance regression in :meth:`~gridfs.GridOut.download_to_stream` - and :meth:`~gridfs.GridOut.download_to_stream_by_name` by reading in chunks - instead of line by line. +Version 4.3.3 fixes a number of bugs: +- Fixed a performance regression in :meth:`~gridfs.GridFSBucket.download_to_stream` + and :meth:`~gridfs.GridFSBucket.download_to_stream_by_name` by reading in chunks + instead of line by line (`PYTHON-3502`_). +- Improved performance of :meth:`gridfs.grid_file.GridOut.read` and + :meth:`gridfs.grid_file.GridOut.readline` (`PYTHON-3508`_). + +Issues Resolved +............... + +See the `PyMongo 4.3.3 release notes in JIRA`_ for the list of resolved issues +in this release. + +.. _PYTHON-3502: https://jira.mongodb.org/browse/PYTHON-3502 +.. _PYTHON-3508: https://jira.mongodb.org/browse/PYTHON-3508 +.. _PyMongo 4.3.3 release notes in JIRA: https://jira.mongodb.org/secure/ReleaseNote.jspa?projectId=10004&version=34709 Changes in Version 4.3 (4.3.2) ------------------------------ diff --git a/gridfs/grid_file.py b/gridfs/grid_file.py index cec7d57a2..50efc0cd2 100644 --- a/gridfs/grid_file.py +++ b/gridfs/grid_file.py @@ -463,7 +463,10 @@ class GridOut(io.IOBase): self.__files = root_collection.files self.__file_id = file_id self.__buffer = EMPTY + # Start position within the current buffered chunk. + self.__buffer_pos = 0 self.__chunk_iter = None + # Position within the total file. self.__position = 0 self._file = file_document self._session = session @@ -510,12 +513,12 @@ class GridOut(io.IOBase): """Reads a chunk at a time. If the current position is within a chunk the remainder of the chunk is returned. """ - received = len(self.__buffer) + received = len(self.__buffer) - self.__buffer_pos chunk_data = EMPTY chunk_size = int(self.chunk_size) if received > 0: - chunk_data = self.__buffer + chunk_data = self.__buffer[self.__buffer_pos :] elif self.__position < int(self.length): chunk_number = int((received + self.__position) / chunk_size) if self.__chunk_iter is None: @@ -531,8 +534,60 @@ class GridOut(io.IOBase): self.__position += len(chunk_data) self.__buffer = EMPTY + self.__buffer_pos = 0 return chunk_data + def _read_size_or_line(self, size: int = -1, line: bool = False) -> bytes: + """Internal read() and readline() helper.""" + self._ensure_file() + remainder = int(self.length) - self.__position + if size < 0 or size > remainder: + size = remainder + + if size == 0: + return EMPTY + + received = 0 + data = [] + while received < size: + needed = size - received + if self.__buffer: + # Optimization: Read the buffer with zero byte copies. + buf = self.__buffer + chunk_start = self.__buffer_pos + chunk_data = memoryview(buf)[self.__buffer_pos :] + self.__buffer = EMPTY + self.__buffer_pos = 0 + self.__position += len(chunk_data) + else: + buf = self.readchunk() + chunk_start = 0 + chunk_data = memoryview(buf) + if line: + pos = buf.find(NEWLN, chunk_start, chunk_start + needed) - chunk_start + if pos >= 0: + # Decrease size to exit the loop. + size = received + pos + 1 + needed = pos + 1 + if len(chunk_data) > needed: + data.append(chunk_data[:needed]) + # Optimization: Save the buffer with zero byte copies. + self.__buffer = buf + self.__buffer_pos = chunk_start + needed + self.__position -= len(self.__buffer) - self.__buffer_pos + else: + data.append(chunk_data) + received += len(chunk_data) + + # Detect extra chunks after reading the entire file. + if size == remainder and self.__chunk_iter: + try: + self.__chunk_iter.next() + except StopIteration: + pass + + return b"".join(data) + def read(self, size: int = -1) -> bytes: """Read at most `size` bytes from the file (less if there isn't enough data). @@ -548,36 +603,7 @@ class GridOut(io.IOBase): entire file. Previously, this method would check for extra chunks on every call. """ - self._ensure_file() - - remainder = int(self.length) - self.__position - if size < 0 or size > remainder: - size = remainder - - if size == 0: - return EMPTY - - received = 0 - data = io.BytesIO() - while received < size: - chunk_data = self.readchunk() - received += len(chunk_data) - data.write(chunk_data) - - # Detect extra chunks after reading the entire file. - if size == remainder and self.__chunk_iter: - try: - self.__chunk_iter.next() - except StopIteration: - pass - - self.__position -= received - size - - # Return 'size' bytes and store the rest. - data.seek(size) - self.__buffer = data.read() - data.seek(0) - return data.read(size) + return self._read_size_or_line(size=size) def readline(self, size: int = -1) -> bytes: # type: ignore[override] """Read one line or up to `size` bytes from the file. @@ -585,33 +611,7 @@ class GridOut(io.IOBase): :Parameters: - `size` (optional): the maximum number of bytes to read """ - remainder = int(self.length) - self.__position - if size < 0 or size > remainder: - size = remainder - - if size == 0: - return EMPTY - - received = 0 - data = io.BytesIO() - while received < size: - chunk_data = self.readchunk() - pos = chunk_data.find(NEWLN, 0, size) - if pos != -1: - size = received + pos + 1 - - received += len(chunk_data) - data.write(chunk_data) - if pos != -1: - break - - self.__position -= received - size - - # Return 'size' bytes and store the rest. - data.seek(size) - self.__buffer = data.read() - data.seek(0) - return data.read(size) + return self._read_size_or_line(size=size, line=True) def tell(self) -> int: """Return the current position of this file.""" @@ -651,6 +651,7 @@ class GridOut(io.IOBase): self.__position = new_pos self.__buffer = EMPTY + self.__buffer_pos = 0 if self.__chunk_iter: self.__chunk_iter.close() self.__chunk_iter = None