diff --git a/ipwb/indexer.py b/ipwb/indexer.py index 30baba1..edc625d 100755 --- a/ipwb/indexer.py +++ b/ipwb/indexer.py @@ -17,6 +17,7 @@ import zlib import surt import ntpath +import shutil import traceback import tempfile @@ -29,6 +30,8 @@ # from requests.exceptions import ConnectionError from ipwb.util import iso8601_to_digits14, ipfs_client +from ipwb.util import is_wacz, extract_warcs_from_wacz +from ipwb.util import cleanup_warc_files_extracted_from_wacz import requests import datetime @@ -119,6 +122,21 @@ def index_file_at(warc_paths, encryption_key=None, for warc_path in warc_paths: verify_file_exists(warc_path) + # Extract WARCs from any WACZ files + warc_paths_to_append = [] + wacz_paths = [] + for warc_path in warc_paths: + if is_wacz(warc_path): + (w_paths, dirs_to_cleanup) = extract_warcs_from_wacz(warc_path) + warc_paths_to_append += w_paths + wacz_paths.append(warc_path) + + # Manipulate list of WARCs extracted from WACZ + for ptr in wacz_paths: + warc_paths.remove(ptr) + + warc_paths = warc_paths + warc_paths_to_append + cdxj_lines = [] if outfile: @@ -167,6 +185,8 @@ def index_file_at(warc_paths, encryption_key=None, cdxj_metadata_lines = generate_cdxj_metadata(cdxj_lines) cdxj_lines = cdxj_metadata_lines + cdxj_lines + cleanup_warc_files_extracted_from_wacz(warc_paths_to_append) + if quiet: return cdxj_lines @@ -180,6 +200,10 @@ def index_file_at(warc_paths, encryption_key=None, else: print('\n'.join(cdxj_lines)) + # Cleanup, e.g., dirs for WARCs from WACZ + for dir_to_cleanup in dirs_to_cleanup: + shutil.rmtree(dir_to_cleanup) + def sanitize_cdxj_line(cdxj_line): return cdxj_line diff --git a/ipwb/util.py b/ipwb/util.py index 02e9cea..4df1723 100644 --- a/ipwb/util.py +++ b/ipwb/util.py @@ -12,6 +12,11 @@ import datetime import logging import platform +import tempfile + +# For extracting WARCs from WACZ +import glob +from zipfile import ZipFile, is_zipfile from enum import Enum, auto @@ -350,3 +355,49 @@ def check_for_update(_): print("The 
def is_wacz(path):
    '''
    Report whether the file at path is treated as a WACZ.

    The check only confirms that the file is a ZIP archive; the
    layout inside the archive is not validated.
    '''
    # TODO: add logic to check if wacz
    # the py-wacz validator inherits many dependencies,
    # so ad hoc here for now
    return is_zipfile(path)


def get_warc_paths_in_wacz(wacz_path):
    '''
    List the names of the members stored under archive/ in a WACZ.

    Directory entries (names ending in '/') are left out so that
    every returned name refers to a file that can be extracted.
    '''
    with ZipFile(wacz_path) as z:
        return [name for name in z.namelist()
                if name.startswith('archive/') and not name.endswith('/')]


def extract_warcs_to_disk(wacz_path, warc_paths) -> tuple:
    '''
    Extract WARCs and retain reference to temp path
    for later deletion

    Each member named in warc_paths is extracted from the archive at
    wacz_path into its own newly created temporary directory.

    Returns a tuple (extracted_warc_paths, tmp_dirs) holding the
    on-disk path of every extracted member and the temporary
    directories that were created, both in the order of warc_paths.

    If an extraction fails, the temporary directories created so far
    are removed and the original exception is re-raised.
    '''
    import shutil  # Local import: only needed to undo a failed extraction

    extracted_warc_paths = []
    tmp_dirs = []

    # Nothing to extract, so the archive does not need to be opened
    if not warc_paths:
        return (extracted_warc_paths, tmp_dirs)

    try:
        # Open the archive once rather than once per member
        with ZipFile(wacz_path) as z:
            for warc in warc_paths:
                tmp_dir = tempfile.mkdtemp()
                tmp_dirs.append(tmp_dir)  # For later dir deletion
                extracted_warc_paths.append(z.extract(warc, tmp_dir))
    except Exception:
        # Do not leave partially populated temp dirs behind
        for tmp_dir in tmp_dirs:
            shutil.rmtree(tmp_dir, ignore_errors=True)
        raise

    return (extracted_warc_paths, tmp_dirs)


def extract_warcs_from_wacz(wacz_path):
    '''
    Extract every member under archive/ of a WACZ to temporary dirs.

    Returns a tuple (warc_paths_on_disk, dirs_to_cleanup); removing
    the extracted files and directories is left to the caller.
    '''
    warc_paths_in_wacz = get_warc_paths_in_wacz(wacz_path)
    (warc_paths_on_disk, dirs_to_cleanup) = extract_warcs_to_disk(
        wacz_path, warc_paths_in_wacz)

    return (warc_paths_on_disk, dirs_to_cleanup)


def cleanup_warc_files_extracted_from_wacz(warc_paths):
    '''
    Delete the temporary WARC files extracted from a WACZ.

    Paths that are not existing regular files are skipped. A failure
    to delete one file is printed and does not stop the remaining
    deletions.
    '''
    for temporary_warc in warc_paths:
        try:
            if os.path.isfile(temporary_warc):
                os.remove(temporary_warc)
        except OSError as e:
            print(f'Error: {e.filename}, {e.strerror}')
Path(os.path.dirname(__file__)).parent, - 'samples', 'warcs', warc_filename) + 'samples', samples_dir, filename) fh, tempfile_path = tempfile.mkstemp(suffix='.cdxj') os.close(fh) diff --git a/tests/test_replay.py b/tests/test_replay.py index bb894bd..7caf34c 100644 --- a/tests/test_replay.py +++ b/tests/test_replay.py @@ -49,6 +49,7 @@ def test_replay_404(warc, lookup, has_md_header): ('2mementos_queryString.warc', '/memento/20130202100000/memento.us/' + 'index.php?anotherval=ipsum&someval=lorem', 200, None), + ('my-collection.wacz', 'memento/*/memento.us', 200, None), ]) def test_replay_search(warc, lookup, status, location): ipwb_test.start_replay(warc)