#!/usr/bin/env python3
"""Split a zip archive into multiple zip files, each holding at most ~5 GiB
of uncompressed content, so every part stays under per-file upload limits."""
import os
from zipfile import ZipFile, ZIP_DEFLATED

# 5 GiB ceiling on the uncompressed payload of each output archive.
MAX_GROUP_SIZE = 5 * 1024 ** 3


def get_zip_path():
    """Prompt until the user supplies a path to an existing file, then return it."""
    while True:
        # NOTE(review): the prompt text is reconstructed — the middle of this
        # function was not visible in the patch context. Confirm against the
        # original file.
        zip_path = input("Enter the path to the zip file: ")
        if os.path.isfile(zip_path):
            return zip_path
        print(f"The file at {zip_path} does not exist. Please try again.")


def split_and_zip(file_path, max_group_size=MAX_GROUP_SIZE):
    """Split the archive at *file_path* into ``output_group_<i>.zip`` files.

    Members are packed greedily, in archive order, into consecutive output
    zips: a new output is started whenever adding the next member would push
    the current group's uncompressed size past *max_group_size*.  A single
    member larger than the limit still gets its own (oversized) group rather
    than being dropped.  The input archive is left untouched.

    :param file_path: path to an existing zip archive.
    :param max_group_size: per-group uncompressed byte budget (default 5 GiB).
    """
    with ZipFile(file_path, 'r') as src:
        # Directory entries carry no payload; skip them.
        entries = [info for info in src.infolist() if not info.is_dir()]
        total_size = sum(info.file_size for info in entries)
        # Ceiling division; greedy packing may produce slightly more groups.
        est_groups = -(-total_size // max_group_size) if total_size else 0

        print(f"Max group size: {max_group_size / 1024 ** 3:g} GiB")
        print(f"Total uncompressed size: {total_size / 1024 ** 3:.6f} GiB")
        print(f"Estimated number of groups: {est_groups}")

        group_index = 0
        group_size = 0
        out = None
        try:
            for info in entries:
                # Open a new output zip for the first member, or when this
                # member would overflow the current group.  The `group_size
                # and` guard keeps an over-limit first member from creating
                # an empty group before its own.
                if out is None or (group_size and
                                   group_size + info.file_size > max_group_size):
                    if out is not None:
                        out.close()
                    out = ZipFile(f'output_group_{group_index}.zip', 'w',
                                  compression=ZIP_DEFLATED)
                    group_index += 1
                    group_size = 0
                # Copy the member straight from the source archive; no
                # temporary extraction directory is needed.
                with src.open(info) as member:
                    out.writestr(info.filename, member.read())
                group_size += info.file_size
        finally:
            if out is not None:
                out.close()


if __name__ == "__main__":
    split_and_zip(get_zip_path())