From 3c610c3a3f92c83af1a78940d09e471fd013b0bd Mon Sep 17 00:00:00 2001 From: Matthew Chang Date: Mon, 16 Sep 2024 18:00:59 -0700 Subject: [PATCH] Updates readline logic for azure to match s3 (#826) * Updates readline logic for azure to match s3 Loosely copies the readline buffer management from s3 to azure, improving performance. * Adds unittest for readlines with azure * Revert "Adds unittest for readlines with azure" Did not need to add test, already exists further down in the file. This reverts commit 35a8c5e6ba96fc5f7be93e46a85fcbaec0e6c032. --- smart_open/azure.py | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/smart_open/azure.py b/smart_open/azure.py index 5cac221b..1c991f05 100644 --- a/smart_open/azure.py +++ b/smart_open/azure.py @@ -325,24 +325,22 @@ def readline(self, limit=-1): """Read up to and including the next newline. Returns the bytes read.""" if limit != -1: raise NotImplementedError('limits other than -1 not implemented yet') - the_line = io.BytesIO() + + # + # A single line may span multiple buffers. + # + line = io.BytesIO() while not (self._position == self._size and len(self._current_part) == 0): - # - # In the worst case, we're reading the unread part of self._current_part - # twice here, once in the if condition and once when calling index. - # - # This is sub-optimal, but better than the alternative: wrapping - # .index in a try..except, because that is slower. - # - remaining_buffer = self._current_part.peek() - if self._line_terminator in remaining_buffer: - next_newline = remaining_buffer.index(self._line_terminator) - the_line.write(self._read_from_buffer(next_newline + 1)) + line_part = self._current_part.readline(self._line_terminator) + line.write(line_part) + self._position += len(line_part) + + if line_part.endswith(self._line_terminator): break else: - the_line.write(self._read_from_buffer()) self._fill_buffer() - return the_line.getvalue() + + return line.getvalue() # # Internal methods.