Skip to content

Commit 1e6ef93

Browse files
committed
Handle whether JPEG compression exists in the TIFF image
1 parent d701931 commit 1e6ef93

File tree

3 files changed, with 62 additions and 11 deletions.

ingestors/media/tiff.py

Lines changed: 19 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,12 +1,10 @@
1-
import logging
21
from followthemoney import model
32

43
from ingestors.ingestor import Ingestor
54
from ingestors.support.pdf import PDFSupport
65
from ingestors.support.shell import ShellSupport
76
from ingestors.support.temp import TempFileSupport
8-
9-
log = logging.getLogger(__name__)
7+
from ingestors.exc import ProcessingException
108

119

1210
class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
@@ -22,9 +20,24 @@ class TIFFIngestor(Ingestor, PDFSupport, TempFileSupport, ShellSupport):
2220
def ingest(self, file_path, entity):
2321
entity.schema = model.get("Pages")
2422
pdf_path = self.make_work_file("tiff.pdf")
25-
self.exec_command(
26-
"tiff2pdf", file_path, "-n", "-j", "-x", "300", "-y", "300", "-o", pdf_path
27-
)
23+
try:
24+
self.exec_command(
25+
"tiff2pdf",
26+
file_path,
27+
"-n",
28+
"-j",
29+
"-x",
30+
"300",
31+
"-y",
32+
"300",
33+
"-o",
34+
pdf_path,
35+
)
36+
except ProcessingException:
37+
self.exec_command(
38+
"tiff2pdf", file_path, "-x", "300", "-y", "300", "-o", pdf_path
39+
)
40+
2841
self.assert_outfile(pdf_path)
2942

3043
self.pdf_alternative_extract(entity, pdf_path, self.manager)

tests/test_image.py

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -63,6 +63,13 @@ def test_tesseract_ocr_regression(self):
6363
)
6464
image_entity = emitted_image_entities.pop()
6565

66+
# Is the mimeType correct?
67+
self.assertEqual(
68+
image_entity.first("mimeType"),
69+
test_data[test_image_type]["mime_type"],
70+
f"Test failed for {test_data[test_image_type]['file']}",
71+
)
72+
6673
# Is the processing status of the entity == SUCCESS?
6774
self.assertEqual(
6875
image_entity.first("processingStatus"),

tests/test_tiff.py

Lines changed: 36 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -5,14 +5,45 @@ class TIFFIngestorTest(TestCase):
55
def test_match(self):
66
fixture_path, entity = self.fixture("multipage_tiff_example.tif")
77
self.manager.ingest(fixture_path, entity)
8-
self.assertEqual(entity.first("mimeType"), "image/tiff")
9-
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
8+
9+
emitted_image_entities = [
10+
x
11+
for x in self.get_emitted()
12+
if "mimeType" in x.properties and "image" in x.first("mimeType")
13+
]
14+
15+
# Have entities been emitted with a mime type that contains "image"?
16+
self.assertTrue(
17+
len(emitted_image_entities) != 0,
18+
f"Test failed for multipage_tiff_example.tif",
19+
)
20+
image_entity = emitted_image_entities.pop()
21+
22+
self.assertEqual(image_entity.first("mimeType"), "image/tiff")
23+
self.assertEqual(
24+
image_entity.first("processingStatus"), self.manager.STATUS_SUCCESS
25+
)
1026
entities = self.get_emitted()
1127
self.assertEqual(len(entities), 11)
1228

1329
def test_ingest_tiff_format(self):
1430
fixture_path, entity = self.fixture("hello_world_tiff.tif")
1531
self.manager.ingest(fixture_path, entity)
16-
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
17-
entity = self.get_emitted_by_id(entity.id)
18-
self.assertEqual(entity.first("indexText"), "HELLO WORLD")
32+
33+
emitted_image_entities = [
34+
x
35+
for x in self.get_emitted()
36+
if "mimeType" in x.properties and "image" in x.first("mimeType")
37+
]
38+
39+
# Have entities been emitted with a mime type that contains "image"?
40+
self.assertTrue(
41+
len(emitted_image_entities) != 0,
42+
f"Test failed for multipage_tiff_example.tif",
43+
)
44+
image_entity = emitted_image_entities.pop()
45+
46+
self.assertEqual(
47+
image_entity.first("processingStatus"), self.manager.STATUS_SUCCESS
48+
)
49+
self.assertEqual(image_entity.first("indexText"), "HELLO WORLD")

0 commit comments

Comments
 (0)