Skip to content

Commit f398822

Browse files
committed
update pdf image extractor
1 parent a2e0042 commit f398822

File tree

3 files changed

+106
-15
lines changed

3 files changed

+106
-15
lines changed

web-scraping/pdf-image-extractor/README.md

+16
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,20 @@ To run this:
1212
[+] Found a total of 3 images in page 2
1313
[!] No images found on page 3
1414
[!] No images found on page 4
15+
```
16+
- To extract all images of at least 800x800 pixels from the `1710.05006.pdf` PDF file and save them in the `extracted-images` directory in PNG format, run:
17+
```
18+
python pdf_image_extractor_cli.py 1710.05006.pdf -o extracted-images -f png -w 800 -he 800
19+
```
20+
This will save every image that meets the minimum size in the `extracted-images` directory and output:
21+
```
22+
[!] No images found on page 0
23+
[+] Found a total of 3 images in page 1
24+
[-] Skipping image 1 on page 1 due to its small size.
25+
[-] Skipping image 2 on page 1 due to its small size.
26+
[-] Skipping image 3 on page 1 due to its small size.
27+
[+] Found a total of 3 images in page 2
28+
[-] Skipping image 2 on page 2 due to its small size.
29+
[!] No images found on page 3
30+
[!] No images found on page 4
1531
```
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,48 @@
1-
import fitz # PyMuPDF
1+
import os
2+
import fitz # PyMuPDF
23
import io
34
from PIL import Image
45

# Extract the images embedded in a PDF, keep only those that meet a minimum
# size, and re-save them in a single output format inside one directory.

# Output directory for the extracted images
output_dir = "extracted_images"
# Desired output image format (also used as the saved file's extension)
output_format = "png"
# Minimum width and height (in pixels) for extracted images
min_width = 100
min_height = 100
# File path you want to extract images from
file = "1710.05006.pdf"

# Pillow registers "JPEG" and "TIFF", not the common short spellings, so map
# those aliases before passing the name to Image.save().
pil_format = {"JPG": "JPEG", "TIF": "TIFF"}.get(output_format.upper(), output_format.upper())

# Create the output directory if it does not exist. exist_ok=True avoids the
# race between a separate existence check and the creation.
os.makedirs(output_dir, exist_ok=True)

# Open the PDF as a context manager so the document is closed even if an
# image fails to decode or save.
with fitz.open(file) as pdf_file:
    # Iterate over PDF pages (page_index is 0-based)
    for page_index, page in enumerate(pdf_file):
        # Get image list
        image_list = page.get_images(full=True)
        # Print the number of images found on this page
        if image_list:
            print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
        else:
            print(f"[!] No images found on page {page_index}")
        # Iterate over the images on the page (image_index is 1-based)
        for image_index, img in enumerate(image_list, start=1):
            # The first item of each entry is the XREF of the image
            xref = img[0]
            # Extract the raw image bytes
            image_bytes = pdf_file.extract_image(xref)["image"]
            # Load it to PIL; the context manager releases the decoder when done
            with Image.open(io.BytesIO(image_bytes)) as image:
                # Skip images below the minimum dimensions
                if image.width < min_width or image.height < min_height:
                    print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")
                    continue
                # JPEG cannot store alpha or palette data, so convert such
                # images to RGB first (any transparency is discarded).
                if pil_format == "JPEG" and image.mode not in ("L", "RGB", "CMYK"):
                    image = image.convert("RGB")
                # File names use a 1-based page number. Passing a path (not an
                # open file object) lets Pillow open and close the file itself.
                image_path = os.path.join(output_dir, f"image{page_index + 1}_{image_index}.{output_format}")
                image.save(image_path, format=pil_format)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import os
2+
import fitz # PyMuPDF
3+
import io
4+
from PIL import Image
5+
import argparse
6+
7+
# Pillow registers "JPEG" and "TIFF", not the common short spellings.
_FORMAT_ALIASES = {"JPG": "JPEG", "TIF": "TIFF"}


def build_parser():
    """Return the command-line argument parser for the image extractor."""
    parser = argparse.ArgumentParser(description="Extract images from a PDF file.")
    parser.add_argument("file", help="PDF file to extract images from.")
    parser.add_argument("-o", "--output", help="Output directory for the extracted images.", default="extracted_images")
    parser.add_argument("-f", "--format", help="Desired output image format, default is PNG.", default="png")
    parser.add_argument("-w", "--width", help="Minimum width for extracted images, default is 100.", default=100, type=int)
    parser.add_argument("-he", "--height", help="Minimum height for extracted images, default is 100.", default=100, type=int)
    return parser


def pil_format_name(output_format):
    """Return the format name to pass to ``Image.save`` for *output_format*.

    The value is upper-cased and the short spellings ``jpg`` and ``tif`` are
    mapped to ``JPEG`` and ``TIFF``; any other value is returned upper-cased.
    """
    name = output_format.upper()
    return _FORMAT_ALIASES.get(name, name)


def extract_images(file, output_dir, output_format, min_width, min_height):
    """Save the images of a PDF that meet the minimum size into *output_dir*.

    Args:
        file: Path of the PDF file to read.
        output_dir: Directory the images are written to; created if missing.
        output_format: Output image format; also used as the file extension.
        min_width: Minimum width, in pixels, for an image to be saved.
        min_height: Minimum height, in pixels, for an image to be saved.
    """
    pil_format = pil_format_name(output_format)
    # exist_ok=True avoids the race between an existence check and creation.
    os.makedirs(output_dir, exist_ok=True)
    # Open the PDF as a context manager so the document is closed even if an
    # image fails to decode or save.
    with fitz.open(file) as pdf_file:
        # Iterate over PDF pages (page_index is 0-based)
        for page_index, page in enumerate(pdf_file):
            # Get image list
            image_list = page.get_images(full=True)
            # Print the number of images found on this page
            if image_list:
                print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
            else:
                print(f"[!] No images found on page {page_index}")
            # Iterate over the images on the page (image_index is 1-based)
            for image_index, img in enumerate(image_list, start=1):
                # The first item of each entry is the XREF of the image
                xref = img[0]
                # Extract the raw image bytes
                image_bytes = pdf_file.extract_image(xref)["image"]
                # Load it to PIL; the context manager releases the decoder
                with Image.open(io.BytesIO(image_bytes)) as image:
                    # Skip images below the minimum dimensions
                    if image.width < min_width or image.height < min_height:
                        print(f"[-] Skipping image {image_index} on page {page_index} due to its small size.")
                        continue
                    # JPEG cannot store alpha or palette data, so convert such
                    # images to RGB first (any transparency is discarded).
                    if pil_format == "JPEG" and image.mode not in ("L", "RGB", "CMYK"):
                        image = image.convert("RGB")
                    # File names use a 1-based page number. Passing a path
                    # (not an open file object) lets Pillow open and close
                    # the file itself, so no handle is leaked.
                    image_path = os.path.join(output_dir, f"image{page_index + 1}_{image_index}.{output_format}")
                    image.save(image_path, format=pil_format)


def main():
    """Parse the command-line arguments and run the extraction."""
    args = build_parser().parse_args()
    extract_images(args.file, args.output, args.format, args.width, args.height)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)