Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add WACZ support #770

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
17 changes: 17 additions & 0 deletions ipwb/indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from six import PY3

from ipwb.util import iso8601_to_digits14, ipfs_client
from ipwb.util import is_wacz, extract_warcs_from_wacz
from ipwb.util import cleanup_warc_files_extracted_from_wacz

import requests
import datetime
Expand Down Expand Up @@ -123,6 +125,19 @@ def index_file_at(warc_paths, encryption_key=None,
for warc_path in warc_paths:
verify_file_exists(warc_path)

# Extract WARCs from any WACZ files
warc_paths_to_append = []
wacz_paths = []
for warc_path in warc_paths:
if is_wacz(warc_path):
warc_paths_to_append += extract_warcs_from_wacz(warc_path)
wacz_paths.append(warc_path)

# Manipulate list of WARCs extracted from WACZ
for ptr in wacz_paths:
warc_paths.remove(ptr)
warc_paths = warc_paths + warc_paths_to_append
machawk1 marked this conversation as resolved.
Show resolved Hide resolved

cdxj_lines = []

if outfile:
Expand Down Expand Up @@ -171,6 +186,8 @@ def index_file_at(warc_paths, encryption_key=None,
cdxj_metadata_lines = generate_cdxj_metadata(cdxj_lines)
cdxj_lines = cdxj_metadata_lines + cdxj_lines

cleanup_warc_files_extracted_from_wacz(warc_paths_to_append)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a big issue, but I think the temporary folders created by the mkdtemp() call will continue to exist (until cleaned up by the OS) because only the files inside them are deleted, not the folders themselves.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The docs say that the creator is responsible for the deletion, so I think we should handle this. Given each WARC gets a new temp directory, it might be better to just retain the copy of this directory path and delete it along with its contents instead of deleting the WARC then the directory, which would require tracking the directory path, too. Which approach would you rather be implemented, @ibnesayeed?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

which would require tracking the directory path, too

Not really! It is possible to get the path of a directory if the path of a file it contains is known.

That said, I would perhaps have preferred not holding onto the list of WARC files and instead operating on each WARC as we discover it, whether it is a regular WARC file or one extracted from a WACZ file. I would deal with one file at a time and then loop to the next one.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ibnesayeed This seems like it requires a revamp outside of the scope of this GH issue/PR. I agree that dealing with one WARC at a time would likely be more computationally optimal.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand that it would require a change in the workflow. When done, it would be more space efficient, as not all the WARCs would need to be extracted from the WACZ files upfront (duplicating them on disk) before processing them.

It is okay to leave it as things are right now and get back to this when we have a WARC record iterator for the WACZ files, when most of these changes will be rendered useless.


if quiet:
return cdxj_lines

Expand Down
44 changes: 44 additions & 0 deletions ipwb/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,11 @@
import datetime
import logging
import platform
import tempfile

# For extracting WARCs from WACZ
import glob
from zipfile import ZipFile, is_zipfile

from urllib.request import urlopen
from urllib.error import URLError
Expand Down Expand Up @@ -336,3 +341,42 @@ def check_for_update(_):
print("The installed version of ipwb is outdated.")
print(f"* Installed: {current}\n* Latest: {latest}")
print("Please run `pip install --upgrade ipwb` to upgrade.")


def is_wacz(path):
    """Report whether ``path`` looks like a WACZ file.

    Only the ZIP container is checked (via ``zipfile.is_zipfile``); nothing
    WACZ-specific is validated, so any ZIP archive is accepted.
    """
    # TODO: add logic to check if wacz
    # the py-wacz validator inherits many dependencies,
    # so ad hoc here for now
    looks_like_zip = is_zipfile(path)
    return looks_like_zip


def get_warc_paths_in_wacz(wacz_path):
    """List the archive members stored under ``archive/`` in a WACZ file.

    Args:
        wacz_path: Filesystem path of the WACZ (ZIP) file to inspect.

    Returns:
        Member names, in archive order, that start with ``archive/`` and
        are regular entries. Directory entries (for example ``archive/``
        itself) are excluded because they are not extractable WARC files.
    """
    with ZipFile(wacz_path) as z:
        return [
            member.filename
            for member in z.infolist()
            # A ZIP may carry an explicit entry for the folder itself;
            # skip it so callers only ever receive file members.
            if member.filename.startswith('archive/') and not member.is_dir()
        ]


def extract_warcs_to_disk(wacz_path, warc_paths) -> list:
    """Extract the given members of a WACZ file into temporary directories.

    Args:
        wacz_path: Filesystem path of the WACZ (ZIP) file.
        warc_paths: Names of the archive members to extract.

    Returns:
        The on-disk paths of the extracted members, in the same order as
        ``warc_paths``. Each member is extracted into its own directory
        created with ``tempfile.mkdtemp()``; removing those directories is
        left to the caller.
    """
    if not warc_paths:
        # Nothing requested: do not touch the archive at all.
        return []

    # Open the archive once rather than re-reading its central directory
    # for every member.
    with ZipFile(wacz_path) as z:
        return [z.extract(warc, tempfile.mkdtemp()) for warc in warc_paths]


def extract_warcs_from_wacz(wacz_path):
    """Extract the WARC members of a WACZ file and return their disk paths.

    The member names come from ``get_warc_paths_in_wacz`` and the
    extraction itself is delegated to ``extract_warcs_to_disk``, whose
    result is returned unchanged.
    """
    return extract_warcs_to_disk(
        wacz_path, get_warc_paths_in_wacz(wacz_path))


def cleanup_warc_files_extracted_from_wacz(warc_paths):
    """Delete temporary WARC files and the temp directories holding them.

    Args:
        warc_paths: Paths of the temporary WARC files to delete. Paths that
            are not existing regular files are skipped.

    Each file is removed, then its now-empty parent directories are removed
    as well, but only those located strictly inside the system temporary
    directory, so nothing outside it is ever deleted. Errors while removing
    a file are printed and do not stop processing of the remaining paths.
    """
    temp_root = os.path.abspath(tempfile.gettempdir())
    for temporary_warc in warc_paths:
        try:
            if os.path.isfile(temporary_warc):
                os.remove(temporary_warc)
                _remove_empty_temp_parents(temporary_warc, temp_root)
        except OSError as e:
            print(f'Error: {e.filename}, {e.strerror}')


def _remove_empty_temp_parents(file_path, temp_root):
    """Remove empty ancestor directories of file_path below temp_root."""
    directory = os.path.dirname(os.path.abspath(file_path))
    while directory.startswith(temp_root + os.sep):
        try:
            os.rmdir(directory)
        except OSError:
            # Directory still has content (or cannot be removed); anything
            # above it is therefore non-empty too, so stop climbing.
            break
        directory = os.path.dirname(directory)
Binary file added samples/wacz/my-collection.wacz
Binary file not shown.
7 changes: 5 additions & 2 deletions tests/testUtil.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,14 @@ def count_cdxj_entries(cdxj_data):
return urim_count


def start_replay(warc_filename):
def start_replay(filename, samples_dir='warcs'):
global p
if filename.endswith('.wacz'):
samples_dir = 'wacz'

path_of_warc = os.path.join(
Path(os.path.dirname(__file__)).parent,
'samples', 'warcs', warc_filename)
'samples', samples_dir, filename)

fh, tempfile_path = tempfile.mkstemp(suffix='.cdxj')
os.close(fh)
Expand Down
1 change: 1 addition & 0 deletions tests/test_replay.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ def test_replay_404(warc, lookup, has_md_header):
('2mementos_queryString.warc',
'/memento/20130202100000/memento.us/' +
'index.php?anotherval=ipsum&someval=lorem', 200, None),
('my-collection.wacz', 'memento/*/memento.us', 200, None),
])
def test_replay_search(warc, lookup, status, location):
ipwb_test.start_replay(warc)
Expand Down