From f34f2a004fe9d4e192a73ee189eee4ab729dfacd Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Sun, 21 Aug 2022 21:48:30 +0900 Subject: [PATCH 1/4] bump version to 6.1.0.dev0 --- smart_open/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/version.py b/smart_open/version.py index 31858fad..9208262c 100644 --- a/smart_open/version.py +++ b/smart_open/version.py @@ -1,4 +1,4 @@ -__version__ = '6.1.0' +__version__ = '6.1.0.dev0' if __name__ == '__main__': From 41467a912daec30e905deb9712a2f53253bc1a41 Mon Sep 17 00:00:00 2001 From: Joshua-Landau-Anthropic <108403948+Joshua-Landau-Anthropic@users.noreply.github.com> Date: Sun, 21 Aug 2022 14:04:21 +0100 Subject: [PATCH 2/4] Fix quadratic time ByteBuffer operations (#711) * Fix quadratic time ByteBuffer operations * Minimum viable benchmark for ByteBuffer fills * Integration test: ByteBuffer quadratic slowdown --- benchmark/bytebuffer_bench.py | 34 ++++++++++++++++++++++++++++++++++ smart_open/bytebuffer.py | 6 +++--- 2 files changed, 37 insertions(+), 3 deletions(-) create mode 100644 benchmark/bytebuffer_bench.py diff --git a/benchmark/bytebuffer_bench.py b/benchmark/bytebuffer_bench.py new file mode 100644 index 00000000..257e0e24 --- /dev/null +++ b/benchmark/bytebuffer_bench.py @@ -0,0 +1,34 @@ +import time +import sys + +import smart_open +from smart_open.bytebuffer import ByteBuffer + + +def raw_bytebuffer_benchmark(): + buffer = ByteBuffer() + + start = time.time() + for _ in range(10_000): + assert buffer.fill([b"X" * 1000]) == 1000 + return time.time() - start + + +def file_read_benchmark(filename): + file = smart_open.open(filename, mode="rb") + + start = time.time() + read = file.read(100_000_000) + end = time.time() + + if len(read) < 100_000_000: + print("File smaller than 100MB") + + return end - start + + +print("Raw ByteBuffer benchmark:", raw_bytebuffer_benchmark()) + +if len(sys.argv) > 1: + bench_result = file_read_benchmark(sys.argv[1]) + print("File read benchmark", bench_result) diff --git a/smart_open/bytebuffer.py b/smart_open/bytebuffer.py index 65e9f276..6aaa2515 100644 --- a/smart_open/bytebuffer.py +++ b/smart_open/bytebuffer.py @@ -105,12 +105,12 @@ def peek(self, size=-1): if size < 0 or size > len(self): size = len(self) - part = self._bytes[self._pos:self._pos+size] + part = bytes(self._bytes[self._pos:self._pos+size]) return part def empty(self): """Remove all bytes from the buffer""" - self._bytes = b'' + self._bytes = bytearray() self._pos = 0 def fill(self, source, size=-1): @@ -151,7 +151,7 @@ def fill(self, source, size=-1): if hasattr(source, 'read'): new_bytes = source.read(size) else: - new_bytes = b'' + new_bytes = bytearray() for more_bytes in source: new_bytes += more_bytes if len(new_bytes) >= size: From b21d538f2c24f4e50a5bbf491235fd277393039c Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 14 Sep 2022 11:08:33 +0900 Subject: [PATCH 3/4] bump version to 6.2.0 --- smart_open/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/smart_open/version.py b/smart_open/version.py index 9208262c..c1484c63 100644 --- a/smart_open/version.py +++ b/smart_open/version.py @@ -1,4 +1,4 @@ -__version__ = '6.1.0.dev0' +__version__ = '6.2.0' if __name__ == '__main__': From 07b6d999b6fa673f3d3910aa23562f1c5f707247 Mon Sep 17 00:00:00 2001 From: Michael Penkov Date: Wed, 14 Sep 2022 11:09:51 +0900 Subject: [PATCH 4/4] updated CHANGELOG.md for version 6.2.0 --- CHANGELOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 83bf5f15..0b268215 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Unreleased +# 6.2.0, 14 September 2022 + +- Fix quadratic time ByteBuffer operations (PR [#711](https://github.com/RaRe-Technologies/smart_open/pull/711), [@Joshua-Landau-Anthropic](https://github.com/Joshua-Landau-Anthropic)) + # 6.1.0, 21 August 2022 - Add cert parameter to http transport params (PR [#703](https://github.com/RaRe-Technologies/smart_open/pull/703), [@stev-0](https://github.com/stev-0))