diff --git a/split_zip_into_under_5GB_chunks.py b/split_zip_into_under_5GB_chunks.py
index 3faea0e..8300142 100755
--- a/split_zip_into_under_5GB_chunks.py
+++ b/split_zip_into_under_5GB_chunks.py
@@ -1,7 +1,7 @@
-#!/Users/shelbybark/code/cmg_split_zip_into_chunks/venv/bin/python3
+#!/usr/bin/python3
+import zipfile
 import os
-from zipfile import ZipFile, ZIP_DEFLATED
-import io
+from io import BytesIO
 
 def get_zip_path():
     while True:
@@ -11,62 +11,39 @@ def get_zip_path():
         else:
             print(f"The file at {zip_path} does not exist. Please try again.")
 
-def split_and_zip(file_path):
+def split_zip(input_zip_path, output_folder, max_group_size=4.5 * 1024 * 1024 * 1024):
+    # Ensure the output folder exists
+    base = os.path.basename(input_zip_path)
+    file_name = os.path.splitext(base)[0]
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
 
-    with ZipFile(file_path, 'r') as zip_ref:
-        total_size = 0
-        for file in zip_ref.namelist():
-            total_size += zip_ref.getinfo(file).file_size
+    with zipfile.ZipFile(input_zip_path, 'r') as source_zip:
+        members = source_zip.namelist()
+        group_size = 0
+        group_number = 1
 
-        # Define the maximum group size in bytes
-        max_group_size = 5 * 1024 ** 3
+        for member in members:
+            # Read the file content from the zip archive
+            with BytesIO(source_zip.read(member)) as file_content:
+                # Check if adding this file would exceed the maximum group size
+                if (group_size + len(file_content.getvalue())) > max_group_size:
+                    group_number += 1
+                    group_size = 0
 
-        # Calculate the number of groups
-        num_groups = -(-total_size // max_group_size) # Ceiling division
+                output_zip_path = os.path.join(output_folder, f'{file_name}-group_{group_number}.zip')
 
-        # print(f"Total size: { round(total_size / 1024.0 / 1024.0 / 1024.0, 6)} GB")
-        print(f"Max group size: { max_group_size } GB")
-        print(f"Total size: { total_size } GB")
-        print(f"Number of groups: {num_groups}")
+                with zipfile.ZipFile(output_zip_path, 'a') as target_zip:
+                    target_zip.writestr(member, file_content.getvalue())
 
-        # Create a temporary directory for storing intermediate files
-        tmp_dir = 'temp'
-        os.makedirs(tmp_dir, exist_ok=True)
+                # Update the size of the current group
+                group_size += len(file_content.getvalue())
 
-        # Iterate over each group
-        for i in range(num_groups):
-            start_index = i * max_group_size
-            end_index = min((i + 1) * max_group_size, total_size)
-
-            # Extract the files for this group from the zipped file
-            with ZipFile(file_path, 'r') as zip_file:
-                group_files = [(zip_file.infolist()[j].filename,
-                                io.BytesIO(),
-                                zip_file.open(zip_file.infolist()[j].filename, 'r'))
-                               for j in range(start_index, end_index)]
+    print(f'Successfully split {input_zip_path} into {group_number} groups.')
 
-            # Write each file to a new zip file
-            with open(os.path.join(tmp_dir, f'group_{i}.zip'), 'wb') as group_zip:
-                group_zip.write(b'PK' + b'\x01\x0a' * 20)
-                for filename, buffer, file in group_files:
-                    group_zip.write(f'{filename}\x00')
-                    group_zip.writestr(filename, buffer.read())
-                    del buffer
+# Example usage
+# input_zip_path = 'path/to/your/large.zip'
+input_zip_path = get_zip_path()
+output_folder = 'output'
 
-            # Clean up
-            os.remove(file_path)
-
-    # Zip the intermediate files into final zip files
-    for i in range(num_groups):
-        with open(os.path.join(tmp_dir, f'group_{i}.zip'), 'rb') as group_zip:
-            with ZipFile(f'output_group_{i}.zip', 'w', compression=ZIP_DEFLATED) as output_zip:
-                for file_info in group_zip.infolist():
-                    if file_info.filename.startswith('group_'):
-                        output_zip.writestr(file_info.filename[len('group_'):], group_zip.open(file_info.filename, 'r').read())
-
-    # Clean up
-    os.rmdir(tmp_dir)
-
-zip_file = get_zip_path()
-# split_and_zip('input.zip')
-split_and_zip(zip_file)
+split_zip(input_zip_path, output_folder)
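
A minimal smoke test for the new split_zip, placed after the diff so it does not interfere with applying the patch. The scratch paths and the deliberately tiny max_group_size below are illustrative assumptions, not part of the patch, and it assumes split_zip has already been imported or defined in the current session.

import os
import zipfile

# Build a throwaway archive with five 1 KiB members (names and sizes are arbitrary).
os.makedirs('scratch', exist_ok=True)
with zipfile.ZipFile('scratch/sample.zip', 'w') as zf:
    for i in range(5):
        zf.writestr(f'file_{i}.bin', b'x' * 1024)

# Split with a 2 KiB limit so multiple groups are produced: given the
# check in split_zip, the expected grouping is (file_0, file_1),
# (file_2, file_3), (file_4).
split_zip('scratch/sample.zip', 'scratch/out', max_group_size=2048)

# Each chunk should be a readable zip whose uncompressed payload fits the limit.
for name in sorted(os.listdir('scratch/out')):
    with zipfile.ZipFile(os.path.join('scratch/out', name)) as zf:
        print(name, zf.namelist())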