update pdf image extractor

x4nth055 · x4nth055 · commit f3988226ae4f · 2023-04-17T14:18:15.000+01:00
diff --git a/web-scraping/pdf-image-extractor/README.md b/web-scraping/pdf-image-extractor/README.md
@@ -12,4 +12,20 @@ To run this:
     [+] Found a total of 3 images in page 2
     [!] No images found on page 3
     [!] No images found on page 4
+    ```
+- To extract all images of at least 800x800 pixels from the `1710.05006.pdf` PDF file and save them in the `extracted-images` directory in PNG format, run:
+    ```
+    python pdf_image_extractor_cli.py 1710.05006.pdf -o extracted-images -f png -w 800 -he 800
+    ```
+    This will save every image that meets the minimum size in the `extracted-images` directory and output:
+    ```
+    [!] No images found on page 0
+    [+] Found a total of 3 images in page 1
+    [-] Skipping image 1 on page 1 due to its small size.
+    [-] Skipping image 2 on page 1 due to its small size.
+    [-] Skipping image 3 on page 1 due to its small size.
+    [+] Found a total of 3 images in page 2
+    [-] Skipping image 2 on page 2 due to its small size.
+    [!] No images found on page 3
+    [!] No images found on page 4
     ```
diff --git a/web-scraping/pdf-image-extractor/pdf_image_extractor.py b/web-scraping/pdf-image-extractor/pdf_image_extractor.py
@@ -1,31 +1,48 @@
-import fitz # PyMuPDF
+import os
+import fitz  # PyMuPDF
 import io
 from PIL import Image
 
-# file path you want to extract images from
+# Output directory for the extracted images
+output_dir = "extracted_images"
+# Desired output image format
+output_format = "png"
+# Minimum width and height for extracted images
+min_width = 100
+min_height = 100
+# Create the output directory if it does not exist
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+# File path you want to extract images from
 file = "1710.05006.pdf"
-# open the file
+# Open the file
 pdf_file = fitz.open(file)
-# iterate over PDF pages
+# Iterate over PDF pages
 for page_index in range(len(pdf_file)):
-    # get the page itself
+    # Get the page itself
     page = pdf_file[page_index]
-    # get image list
-    image_list = page.get_images()
-    # printing number of images found in this page
+    # Get image list
+    image_list = page.get_images(full=True)
+    # Print the number of images found on this page
     if image_list:
         print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
     else:
-        print("[!] No images found on page", page_index)
+        print(f"[!] No images found on page {page_index}")
+    # Iterate over the images on the page
     for image_index, img in enumerate(image_list, start=1):
-        # get the XREF of the image
+        # Get the XREF of the image
         xref = img[0]
-        # extract the image bytes
+        # Extract the image bytes
         base_image = pdf_file.extract_image(xref)
         image_bytes = base_image["image"]
-        # get the image extension
+        # Get the image extension
         image_ext = base_image["ext"]
-        # load it to PIL
+        # Load it to PIL
         image = Image.open(io.BytesIO(image_bytes))
-        # save it to local disk
-        image.save(open(f"image{page_index+1}_{image_index}.{image_ext}", "wb"))
+        # Check if the image meets the minimum dimensions and save it
+        if image.width >= min_width and image.height >= min_height:
+            image.save(
+                open(os.path.join(output_dir, f"image{page_index + 1}_{image_index}.{output_format}"), "wb"),
+                format=output_format.upper())
+        else:
+            print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")
diff --git a/web-scraping/pdf-image-extractor/pdf_image_extractor_cli.py b/web-scraping/pdf-image-extractor/pdf_image_extractor_cli.py
@@ -0,0 +1,58 @@
+import os
+import fitz  # PyMuPDF
+import io
+from PIL import Image
+import argparse
+
+parser = argparse.ArgumentParser(description="Extract images from a PDF file.")
+parser.add_argument("file", help="PDF file to extract images from.")
+parser.add_argument("-o", "--output", help="Output directory for the extracted images.", default="extracted_images")
+parser.add_argument("-f", "--format", help="Desired output image format, default is PNG.", default="png")
+parser.add_argument("-w", "--width", help="Minimum width for extracted images, default is 100.", default=100, type=int)
+parser.add_argument("-he", "--height", help="Minimum height for extracted images, default is 100.", default=100, type=int)
+# Parse the arguments
+args = parser.parse_args()
+
+# Output directory for the extracted images
+output_dir = args.output
+# Desired output image format
+output_format = args.format
+# Minimum width and height for extracted images
+min_width = args.width
+min_height = args.height
+# Create the output directory if it does not exist
+if not os.path.exists(output_dir):
+    os.makedirs(output_dir)
+# File path you want to extract images from
+file = args.file
+# Open the file
+pdf_file = fitz.open(file)
+# Iterate over PDF pages
+for page_index in range(len(pdf_file)):
+    # Get the page itself
+    page = pdf_file[page_index]
+    # Get image list
+    image_list = page.get_images(full=True)
+    # Print the number of images found on this page
+    if image_list:
+        print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
+    else:
+        print(f"[!] No images found on page {page_index}")
+    # Iterate over the images on the page
+    for image_index, img in enumerate(image_list, start=1):
+        # Get the XREF of the image
+        xref = img[0]
+        # Extract the image bytes
+        base_image = pdf_file.extract_image(xref)
+        image_bytes = base_image["image"]
+        # Get the image extension
+        image_ext = base_image["ext"]
+        # Load it to PIL
+        image = Image.open(io.BytesIO(image_bytes))
+        # Check if the image meets the minimum dimensions and save it
+        if image.width >= min_width and image.height >= min_height:
+            image.save(
+                open(os.path.join(output_dir, f"image{page_index + 1}_{image_index}.{output_format}"), "wb"),
+                format=output_format.upper())
+        else:
+            print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")