updates
This commit is contained in:
1
.gitignore
vendored
Normal file
1
.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
venv
|
||||||
129
split_zip_into_under_5GB_chunks.py
Normal file → Executable file
129
split_zip_into_under_5GB_chunks.py
Normal file → Executable file
@@ -1,67 +1,7 @@
|
|||||||
|
#!/Users/shelbybark/code/cmg_split_zip_into_chunks/venv/bin/python3
|
||||||
import os
|
import os
|
||||||
import zipfile
|
from zipfile import ZipFile, ZIP_DEFLATED
|
||||||
from shutil import copy2
|
import io
|
||||||
|
|
||||||
def _write_zip_group(source_zip, members, output_zip_path):
    """Stream ``members`` of ``source_zip`` into a new deflated zip archive.

    Args:
        source_zip: An open, readable ``zipfile.ZipFile``.
        members: ``ZipInfo`` entries of ``source_zip`` to copy.
        output_zip_path: Path of the archive to create (overwritten if present).
    """
    copy_chunk_size = 1024 * 1024
    # Same threshold zipfile itself applies: entries that may not fit in the
    # classic 32-bit size fields must be opened with force_zip64.
    zip64_limit = (1 << 31) - 1

    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as group_zip:
        for member in members:
            # Fresh ZipInfo so the copy keeps the member's name, timestamp and
            # permission bits but is always recompressed with deflate.
            out_info = zipfile.ZipInfo(member.filename, date_time=member.date_time)
            out_info.compress_type = zipfile.ZIP_DEFLATED
            out_info.external_attr = member.external_attr
            needs_zip64 = member.file_size * 1.05 > zip64_limit

            with source_zip.open(member) as src, \
                    group_zip.open(out_info, 'w', force_zip64=needs_zip64) as dst:
                # Copy in bounded chunks so multi-gigabyte members never have
                # to fit in memory.
                for chunk in iter(lambda: src.read(copy_chunk_size), b''):
                    dst.write(chunk)


def split_zip_into_groups(zip_path, max_group_size_gb):
    """Split a zip archive into several smaller zip archives.

    The file members of ``zip_path`` are packed greedily, in archive order,
    into ``group_1.zip``, ``group_2.zip``, ... inside a sibling directory named
    after the archive with a ``_split`` suffix. A new group is started whenever
    adding the next member would push the group's total *uncompressed* size
    over the limit. Members are streamed straight from the source archive, so
    nothing is extracted to disk. Directory entries are not copied.

    A single member larger than the limit cannot be split; it is written to a
    group of its own, which will therefore exceed the limit.

    Args:
        zip_path: Path to the zip archive to split.
        max_group_size_gb: Maximum uncompressed size of one group, in GiB
            (multiples of 1024**3 bytes). May be fractional.

    Returns:
        The list of paths of the group archives that were written, in order.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    max_group_size_bytes = max_group_size_gb * 1024**3

    output_dir = os.path.splitext(zip_path)[0] + '_split'
    written_paths = []

    with zipfile.ZipFile(zip_path, 'r') as source_zip:
        os.makedirs(output_dir, exist_ok=True)

        def flush(group_members):
            # Group numbering starts at 1 and only advances for real groups.
            group_path = os.path.join(
                output_dir, f'group_{len(written_paths) + 1}.zip')
            _write_zip_group(source_zip, group_members, group_path)
            written_paths.append(group_path)

        current_group = []
        current_group_size = 0

        for member in source_zip.infolist():
            if member.is_dir():
                continue

            # Only roll over when the current group already holds something;
            # otherwise an oversized first member would emit an empty archive.
            if current_group and current_group_size + member.file_size > max_group_size_bytes:
                flush(current_group)
                current_group = []
                current_group_size = 0

            current_group.append(member)
            current_group_size += member.file_size

        # Whatever is left over forms the final group.
        if current_group:
            flush(current_group)

    return written_paths
|
|
||||||
|
|
||||||
def get_zip_path():
|
def get_zip_path():
|
||||||
while True:
|
while True:
|
||||||
@@ -71,7 +11,62 @@ def get_zip_path():
|
|||||||
else:
|
else:
|
||||||
print(f"The file at {zip_path} does not exist. Please try again.")
|
print(f"The file at {zip_path} does not exist. Please try again.")
|
||||||
|
|
||||||
# Example usage
|
def _copy_zip_member(source_zip, output_zip, member):
    """Copy one entry from ``source_zip`` into ``output_zip`` without extracting it.

    Args:
        source_zip: An open, readable ``ZipFile``.
        output_zip: An open, writable ``ZipFile``.
        member: The ``ZipInfo`` of the entry in ``source_zip`` to copy.
    """
    # Local import: the module only imports ZipFile and ZIP_DEFLATED.
    from zipfile import ZipInfo

    copy_chunk_size = 1024 * 1024
    # Same threshold zipfile itself applies before switching to ZIP64 fields.
    zip64_limit = (1 << 31) - 1

    # Fresh ZipInfo: keeps name, timestamp and permission bits of the source.
    out_info = ZipInfo(member.filename, date_time=member.date_time)
    out_info.external_attr = member.external_attr

    if member.is_dir():
        # Directory entries carry no data; keep them so empty folders survive.
        output_zip.writestr(out_info, b'')
        return

    out_info.compress_type = ZIP_DEFLATED
    needs_zip64 = member.file_size * 1.05 > zip64_limit
    with source_zip.open(member) as src, \
            output_zip.open(out_info, 'w', force_zip64=needs_zip64) as dst:
        # Bounded chunks keep memory use flat for multi-gigabyte members.
        for chunk in iter(lambda: src.read(copy_chunk_size), b''):
            dst.write(chunk)


def split_and_zip(file_path, max_group_size=5 * 1024 ** 3, output_dir='.'):
    """Split a zip archive into ``output_group_<i>.zip`` files of bounded size.

    Entries of ``file_path`` are packed greedily, in archive order, into
    consecutive groups; a new group starts whenever adding the next entry
    would push the group's total *uncompressed* size over ``max_group_size``.
    Each group is written as a deflate-compressed archive named
    ``output_group_0.zip``, ``output_group_1.zip``, ... in ``output_dir``.
    Entries are streamed directly between archives, so no temporary directory
    is needed. The input archive is left in place.

    A single entry larger than ``max_group_size`` cannot be split; it is
    written to a group of its own, which will therefore exceed the limit.

    Args:
        file_path: Path to the zip archive to split.
        max_group_size: Maximum uncompressed size of one group, in bytes.
            Defaults to 5 GiB.
        output_dir: Directory that receives the group archives. Created if
            missing. Defaults to the current working directory.

    Returns:
        The list of paths of the group archives that were written, in order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        zipfile.BadZipFile: If ``file_path`` is not a valid zip archive.
    """
    bytes_per_gb = 1024 ** 3
    output_paths = []

    with ZipFile(file_path, 'r') as source_zip:
        members = source_zip.infolist()
        total_size = sum(member.file_size for member in members)

        # Plan the groups first so the reported count is the real one: files
        # cannot be split, so ceil(total / max) would only be a lower bound.
        groups = []
        current_group = []
        current_size = 0
        for member in members:
            if current_group and current_size + member.file_size > max_group_size:
                groups.append(current_group)
                current_group = []
                current_size = 0
            current_group.append(member)
            current_size += member.file_size
        if current_group:
            groups.append(current_group)

        # Sizes are tracked in bytes; convert for display so the unit matches.
        print(f"Max group size: {round(max_group_size / bytes_per_gb, 6)} GB")
        print(f"Total size: {round(total_size / bytes_per_gb, 6)} GB")
        print(f"Number of groups: {len(groups)}")

        os.makedirs(output_dir, exist_ok=True)

        for i, group in enumerate(groups):
            output_path = os.path.join(output_dir, f'output_group_{i}.zip')
            with ZipFile(output_path, 'w', compression=ZIP_DEFLATED) as output_zip:
                for member in group:
                    _copy_zip_member(source_zip, output_zip, member)
            output_paths.append(output_path)

    return output_paths
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Prompt for the archive and split it only when executed as a script, so
    # importing this module neither blocks on input nor writes any files.
    zip_file = get_zip_path()
    split_and_zip(zip_file)
|
||||||
|
|||||||
Reference in New Issue
Block a user