|
import argparse
import io
import os

import fitz  # PyMuPDF
from PIL import Image

# Pillow identifies writers by canonical format name, not by file extension,
# so map the common extension spellings users type onto those names.
_FORMAT_ALIASES = {"JPG": "JPEG", "TIF": "TIFF"}

# Pixel modes the JPEG writer is expected to accept as-is; any other mode
# (e.g. RGBA or palette images) is converted to RGB before saving.
_JPEG_MODES = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")


def build_parser():
    """Return the command-line parser for the image extraction script."""
    parser = argparse.ArgumentParser(description="Extract images from a PDF file.")
    parser.add_argument("file", help="PDF file to extract images from.")
    parser.add_argument(
        "-o", "--output",
        help="Output directory for the extracted images.",
        default="extracted_images",
    )
    parser.add_argument(
        "-f", "--format",
        help="Desired output image format, default is PNG.",
        default="png",
    )
    parser.add_argument(
        "-w", "--width",
        help="Minimum width for extracted images, default is 100.",
        default=100,
        type=int,
    )
    parser.add_argument(
        "-he", "--height",
        help="Minimum height for extracted images, default is 100.",
        default=100,
        type=int,
    )
    return parser


def extract_images(file, output_dir="extracted_images", output_format="png",
                   min_width=100, min_height=100):
    """Save every sufficiently large image embedded in a PDF to *output_dir*.

    Args:
        file: Path of the PDF file to read.
        output_dir: Directory that receives the images; created if missing.
        output_format: Target image format, also used as the file extension.
        min_width: Images narrower than this many pixels are skipped.
        min_height: Images shorter than this many pixels are skipped.

    Files are named ``image<page>_<index>.<output_format>`` where both the
    page number and the per-page image index start at 1.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    # Resolve the Pillow format name once; "jpg" is not a name Pillow accepts.
    pil_format = output_format.upper()
    pil_format = _FORMAT_ALIASES.get(pil_format, pil_format)

    # The context manager closes the document even if extraction fails.
    with fitz.open(file) as pdf_file:
        # Pages are numbered from 1 so messages agree with the file names.
        for page_number, page in enumerate(pdf_file, start=1):
            image_list = page.get_images(full=True)
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images in page {page_number}")
            else:
                print(f"[!] No images found on page {page_number}")

            for image_index, img in enumerate(image_list, start=1):
                # The first entry of each image record is its XREF.
                xref = img[0]
                image_bytes = pdf_file.extract_image(xref)["image"]
                image = Image.open(io.BytesIO(image_bytes))

                if image.width < min_width or image.height < min_height:
                    print(f"[-] Skipping image {image_index} on page {page_number} due to its small size.")
                    continue

                if pil_format == "JPEG" and image.mode not in _JPEG_MODES:
                    image = image.convert("RGB")

                target = os.path.join(
                    output_dir, f"image{page_number}_{image_index}.{output_format}"
                )
                # Passing the path lets Pillow open and close the file itself,
                # instead of leaking a handle opened by the caller.
                image.save(target, format=pil_format)


def main():
    """Parse the command line and run the extraction."""
    args = build_parser().parse_args()
    extract_images(args.file, args.output, args.format, args.width, args.height)


if __name__ == "__main__":
    main()
0 commit comments