Code: Select all
import zstandard as zstd
from pathlib import Path

# Each input file is compressed into its own independent zstd frame;
# the frames are simply concatenated into one .zst archive.
file_to_compress = [r"E:\Personal Projects\tmp\chunk_0.ndjson",
                    r"E:\Personal Projects\tmp\chunk_0.ndjson"]
file_to_compress = [Path(p) for p in file_to_compress]

output_file = Path(r"E:\Personal Projects\tmp\dataset.zst")

cctx = zstd.ZstdCompressor(write_content_size=True, threads=5)
with open(output_file, "wb") as f_out:
    for src in file_to_compress:
        with open(src, "rb") as fin:
            # BUG FIX: in streaming mode the compressor cannot know the input
            # length up front, so despite write_content_size=True the frame
            # header was written with content_size = "unknown" (2**64 - 1).
            # Passing size= tells the compressor the length in advance and
            # embeds the real decompressed size in the frame header.
            cctx.copy_stream(fin, f_out, size=src.stat().st_size)
# Inspect the frame headers of the archive written above.
# NOTE: zstd reports an unknown content size as 2**64 - 1 (unsigned -1);
# decode that sentinel to None instead of printing a misleading huge number.
_CONTENT_SIZE_UNKNOWN = 2**64 - 1

frames = []
# Reuse the Path defined above rather than hard-coding the same path twice.
with open(output_file, "rb") as f:
    offset = 0
    while True:
        f.seek(offset)
        header = f.read(512)  # enough for any zstd frame header
        if not header:
            break
        params = zstd.get_frame_parameters(header)
        frames.append({
            "offset": offset,
            "content_size": (params.content_size
                             if params.content_size != _CONTENT_SIZE_UNKNOWN
                             else None),
            "window_size": params.window_size,
            "dict_id": params.dict_id,
        })
        # The frame header alone does not record the compressed frame length,
        # so advancing to the next frame would require decompressing the frame
        # (or an external seek table). Stop after the first frame for now.
        break

print(f'The file size of "{file_to_compress[0]}" is', file_to_compress[0].stat().st_size)
print("Information of the first frame is", frames[0])
Code: Select all
The file size of "E:\Personal Projects\tmp\chunk_0.ndjson" is 2147473321
Information of the first frame is {'offset': 0, 'content_size': 18446744073709551615, 'window_size': 2097152, 'dict_id': 0}
Vielen Dank für Ihre Ausarbeitung.
Mobile version