Skip to content

Commit 7b61601

Browse files
committed
Consider image's type from its data rather than its extension
DEVSIX-5172
1 parent d6d759e commit 7b61601

File tree

4 files changed

+170
-25
lines changed

4 files changed

+170
-25
lines changed

pdfocr-api/src/main/java/com/itextpdf/pdfocr/PdfCreatorUtil.java

Lines changed: 17 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -24,10 +24,13 @@ This file is part of the iText (R) project.
2424

2525
import com.itextpdf.io.image.ImageData;
2626
import com.itextpdf.io.image.ImageDataFactory;
27+
import com.itextpdf.io.image.ImageType;
28+
import com.itextpdf.io.image.ImageTypeDetector;
2729
import com.itextpdf.io.image.TiffImageData;
2830
import com.itextpdf.io.source.RandomAccessFileOrArray;
2931
import com.itextpdf.io.source.RandomAccessSourceFactory;
3032
import com.itextpdf.io.util.MessageFormatUtil;
33+
import com.itextpdf.io.util.UrlUtil;
3134
import com.itextpdf.kernel.geom.Rectangle;
3235
import com.itextpdf.layout.Document;
3336
import com.itextpdf.layout.element.Paragraph;
@@ -172,15 +175,9 @@ static List<ImageData> getImageData(final File inputImage, IImageRotationHandler
172175
throws OcrException, IOException {
173176
List<ImageData> images = new ArrayList<ImageData>();
174177

175-
String ext = "";
176-
int index = inputImage.getAbsolutePath().lastIndexOf('.');
177-
if (index > 0) {
178-
ext = new String(inputImage.getAbsolutePath().toCharArray(),
179-
index + 1,
180-
inputImage.getAbsolutePath().length() - index - 1);
181-
182-
if ("tiff".equals(ext.toLowerCase())
183-
|| "tif".equals(ext.toLowerCase())) {
178+
try {
179+
ImageType imageType = ImageTypeDetector.detectImageType(UrlUtil.toURL(inputImage.getAbsolutePath()));
180+
if (ImageType.TIFF == imageType) {
184181
int tiffPages = getNumberOfPageTiff(inputImage);
185182

186183
for (int page = 0; page < tiffPages; page++) {
@@ -194,21 +191,19 @@ static List<ImageData> getImageData(final File inputImage, IImageRotationHandler
194191
images.add(imageData);
195192
}
196193
} else {
197-
try {
198-
ImageData imageData = ImageDataFactory
199-
.create(inputImage.getAbsolutePath());
200-
if (imageRotationHandler != null) {
201-
imageData = imageRotationHandler.applyRotation(imageData);
202-
}
203-
images.add(imageData);
204-
} catch (com.itextpdf.io.IOException e) {
205-
LOGGER.error(MessageFormatUtil.format(
206-
PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE,
207-
e.getMessage()));
208-
throw new OcrException(
209-
OcrException.CANNOT_READ_INPUT_IMAGE, e);
194+
ImageData imageData = ImageDataFactory
195+
.create(inputImage.getAbsolutePath());
196+
if (imageRotationHandler != null) {
197+
imageData = imageRotationHandler.applyRotation(imageData);
210198
}
199+
images.add(imageData);
211200
}
201+
} catch (com.itextpdf.io.IOException e) {
202+
LOGGER.error(MessageFormatUtil.format(
203+
PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE,
204+
e.getMessage()));
205+
throw new OcrException(
206+
OcrException.CANNOT_READ_INPUT_IMAGE, e);
212207
}
213208
return images;
214209
}

pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfCreatorUtilTest.java

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
/*
2+
This file is part of the iText (R) project.
3+
Copyright (c) 1998-2021 iText Group NV
4+
Authors: iText Software.
5+
6+
This program is offered under a commercial and under the AGPL license.
7+
For commercial licensing, contact us at https://itextpdf.com/sales. For AGPL licensing, see below.
8+
9+
AGPL licensing:
10+
This program is free software: you can redistribute it and/or modify
11+
it under the terms of the GNU Affero General Public License as published by
12+
the Free Software Foundation, either version 3 of the License, or
13+
(at your option) any later version.
14+
15+
This program is distributed in the hope that it will be useful,
16+
but WITHOUT ANY WARRANTY; without even the implied warranty of
17+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18+
GNU Affero General Public License for more details.
19+
20+
You should have received a copy of the GNU Affero General Public License
21+
along with this program. If not, see <https://www.gnu.org/licenses/>.
22+
*/
23+
package com.itextpdf.pdfocr;
24+
25+
import com.itextpdf.io.image.ImageData;
26+
import com.itextpdf.io.image.ImageType;
27+
import com.itextpdf.io.image.JpegImageData;
28+
import com.itextpdf.io.image.TiffImageData;
29+
import com.itextpdf.io.util.MessageFormatUtil;
30+
import com.itextpdf.pdfocr.helpers.PdfHelper;
31+
import com.itextpdf.test.ExtendedITextTest;
32+
import com.itextpdf.test.annotations.LogMessage;
33+
import com.itextpdf.test.annotations.LogMessages;
34+
import com.itextpdf.test.annotations.type.UnitTest;
35+
36+
import java.io.File;
37+
import java.io.IOException;
38+
import java.util.List;
39+
import org.junit.Assert;
40+
import org.junit.Rule;
41+
import org.junit.Test;
42+
import org.junit.experimental.categories.Category;
43+
import org.junit.rules.ExpectedException;
44+
45+
@Category(UnitTest.class)
46+
public class PdfCreatorUtilTest extends ExtendedITextTest {
47+
48+
@Rule
49+
public ExpectedException junitExpectedException = ExpectedException.none();
50+
51+
@Test
52+
public void getImageDataFromValidSinglePagedTiffTest() throws IOException {
53+
File image = new File(PdfHelper.getImagesTestDirectory() + "single7x5cm.tif");
54+
List<ImageData> images = PdfCreatorUtil.getImageData(image, null);
55+
56+
Assert.assertEquals(1, images.size());
57+
58+
ImageData imageDate = images.get(0);
59+
Assert.assertNotNull(imageDate);
60+
Assert.assertTrue(imageDate instanceof TiffImageData);
61+
Assert.assertEquals(ImageType.TIFF, imageDate.getOriginalType());
62+
}
63+
64+
@Test
65+
public void getImageDataFromValidMultiPagedTiffTest() throws IOException {
66+
File image = new File(PdfHelper.getImagesTestDirectory() + "multipage.tiff");
67+
List<ImageData> images = PdfCreatorUtil.getImageData(image, null);
68+
69+
Assert.assertEquals(9, images.size());
70+
for (ImageData imageDate : images) {
71+
Assert.assertNotNull(imageDate);
72+
Assert.assertTrue(imageDate instanceof TiffImageData);
73+
Assert.assertEquals(ImageType.TIFF, imageDate.getOriginalType());
74+
}
75+
}
76+
77+
@Test
78+
public void getImageDataFromValidNotTiffTest() throws IOException {
79+
File image = new File(PdfHelper.getImagesTestDirectory() + "numbers_01.jpg");
80+
List<ImageData> images = PdfCreatorUtil.getImageData(image, null);
81+
82+
Assert.assertEquals(1, images.size());
83+
84+
ImageData imageDate = images.get(0);
85+
Assert.assertNotNull(imageDate);
86+
Assert.assertTrue(imageDate instanceof JpegImageData);
87+
Assert.assertEquals(ImageType.JPEG, imageDate.getOriginalType());
88+
}
89+
90+
@Test
91+
@LogMessages(messages = {
92+
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
93+
})
94+
public void getImageDataFromNotExistingImageTest() throws IOException {
95+
junitExpectedException.expect(OcrException.class);
96+
97+
PdfCreatorUtil.getImageData(new File("no such path"), null);
98+
}
99+
100+
@Test
101+
@LogMessages(messages = {
102+
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
103+
})
104+
public void getImageDataFromInvalidImageTest() throws IOException {
105+
junitExpectedException.expect(OcrException.class);
106+
junitExpectedException.expectMessage(MessageFormatUtil.format(
107+
OcrException.CANNOT_READ_INPUT_IMAGE));
108+
109+
PdfCreatorUtil.getImageData(new File(PdfHelper.getImagesTestDirectory() + "corrupted.jpg"),
110+
null);
111+
}
112+
}

pdfocr-api/src/test/java/com/itextpdf/pdfocr/PdfInputImageTest.java

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,7 @@ public class PdfInputImageTest extends ExtendedITextTest {
4141
public ExpectedException junitExpectedException = ExpectedException.none();
4242

4343
@LogMessages(messages = {
44-
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE,
45-
count = 1)
44+
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
4645
})
4746
@Test
4847
public void testCorruptedImage() {
@@ -55,7 +54,7 @@ public void testCorruptedImage() {
5554
}
5655

5756
@LogMessages(messages = {
58-
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE, count = 1)
57+
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
5958
})
6059
@Test
6160
public void testCorruptedImageWithoutExtension() {
@@ -67,4 +66,43 @@ public void testCorruptedImageWithoutExtension() {
6766
Assert.assertNotNull(realOutput);
6867
Assert.assertEquals("", realOutput);
6968
}
69+
70+
@LogMessages(messages = {
71+
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
72+
})
73+
@Test
74+
public void testInvalidImagePathWithoutDot() {
75+
junitExpectedException.expect(OcrException.class);
76+
77+
File file = new File("testName");
78+
String realOutput = PdfHelper.getTextFromPdf(file, "testInvalidImagePathWithoutDot");
79+
Assert.assertNotNull(realOutput);
80+
Assert.assertEquals("", realOutput);
81+
}
82+
83+
@LogMessages(messages = {
84+
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
85+
})
86+
@Test
87+
public void testInvalidImagePathWithDot() {
88+
junitExpectedException.expect(OcrException.class);
89+
90+
File file = new File("test.Name");
91+
String realOutput = PdfHelper.getTextFromPdf(file, "testInvalidImagePathWithDot");
92+
Assert.assertNotNull(realOutput);
93+
Assert.assertEquals("", realOutput);
94+
}
95+
96+
@LogMessages(messages = {
97+
@LogMessage(messageTemplate = PdfOcrLogMessageConstant.CANNOT_READ_INPUT_IMAGE)
98+
})
99+
@Test
100+
public void testValidImageWithoutExtension() {
101+
junitExpectedException.expect(OcrException.class);
102+
103+
File file = new File(PdfHelper.getImagesTestDirectory() + "numbers_01");
104+
String realOutput = PdfHelper.getTextFromPdf(file, "testValidImageWithoutExtension");
105+
Assert.assertNotNull(realOutput);
106+
Assert.assertEquals("", realOutput);
107+
}
70108
}
Binary file not shown.

0 commit comments

Comments
 (0)