Skip to content

Commit

Permalink
Improve S3 URI Parsing for URIs with "@", "/", and ":" (#776)
Browse files Browse the repository at this point in the history
* Make uri parse more robust

* Fix linter issue

* improve code clarity

* handle edge case

---------

Co-authored-by: Michael Penkov <[email protected]>
  • Loading branch information
rileypeterson and mpenkov committed Sep 6, 2023
1 parent b70a7ce commit ebc30d7
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 3 deletions.
17 changes: 14 additions & 3 deletions smart_open/s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,9 +105,20 @@ def parse_uri(uri_as_string):
#
uri = split_uri.netloc + split_uri.path

if '@' in uri and ':' in uri.split('@')[0]:
auth, uri = uri.split('@', 1)
access_id, access_secret = auth.split(':')
#
# Attempt to extract edge-case authentication details from the URL.
#
# See:
# 1. https://summitroute.com/blog/2018/06/20/aws_security_credential_formats/
# 2. test_s3_uri_with_credentials* in test_smart_open.py for example edge cases
#
if '@' in uri:
maybe_auth, rest = uri.split('@', 1)
if ':' in maybe_auth:
maybe_id, maybe_secret = maybe_auth.split(':', 1)
if '/' not in maybe_id:
access_id, access_secret = maybe_id, maybe_secret
uri = rest

head, key_id = uri.split('/', 1)
if '@' in head and ':' in head:
Expand Down
29 changes: 29 additions & 0 deletions smart_open/tests/test_smart_open.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,17 @@ def test_s3_uri_has_atmark_in_key_name(self):
self.assertEqual(parsed_uri.access_id, "accessid")
self.assertEqual(parsed_uri.access_secret, "access/secret")

#
# Nb. should never happen in theory, but if it does, we should avoid crashing
#
def test_s3_uri_has_colon_in_secret(self):
parsed_uri = smart_open_lib._parse_uri("s3://accessid:access/secret:totally@mybucket/my@ke@y")
self.assertEqual(parsed_uri.scheme, "s3")
self.assertEqual(parsed_uri.bucket_id, "mybucket")
self.assertEqual(parsed_uri.key_id, "my@ke@y")
self.assertEqual(parsed_uri.access_id, "accessid")
self.assertEqual(parsed_uri.access_secret, "access/secret:totally")

def test_s3_uri_has_atmark_in_key_name2(self):
parsed_uri = smart_open_lib._parse_uri(
"s3://accessid:access/secret@hostname:1234@mybucket/dir/my@ke@y"
Expand Down Expand Up @@ -218,6 +229,24 @@ def test_s3_uri_with_colon_in_key_name(self):
self.assertEqual(parsed_uri.access_id, None)
self.assertEqual(parsed_uri.access_secret, None)

def test_s3_uri_with_at_symbol_in_key_name0(self):
""" Correctly parse the s3 url if there is an @ symbol (and colon) in the key or dir """
parsed_uri = smart_open_lib._parse_uri("s3://mybucket/mydir:my@key")
self.assertEqual(parsed_uri.scheme, "s3")
self.assertEqual(parsed_uri.bucket_id, "mybucket")
self.assertEqual(parsed_uri.key_id, "mydir:my@key")
self.assertEqual(parsed_uri.access_id, None)
self.assertEqual(parsed_uri.access_secret, None)

def test_s3_uri_with_at_symbol_in_key_name1(self):
""" Correctly parse the s3 url if there is an @ symbol (and colon) in the key or dir """
parsed_uri = smart_open_lib._parse_uri("s3://mybucket/my:dir@my/key")
self.assertEqual(parsed_uri.scheme, "s3")
self.assertEqual(parsed_uri.bucket_id, "mybucket")
self.assertEqual(parsed_uri.key_id, "my:dir@my/key")
self.assertEqual(parsed_uri.access_id, None)
self.assertEqual(parsed_uri.access_secret, None)

def test_s3_uri_contains_question_mark(self):
parsed_uri = smart_open_lib._parse_uri("s3://mybucket/mydir/mykey?param")
self.assertEqual(parsed_uri.scheme, "s3")
Expand Down

0 comments on commit ebc30d7

Please sign in to comment.