Skip to content

Commit d701931

Browse files
committed
Fix the TIFF to PDF conversion command. Add TIFF test.
1 parent 7842d61 commit d701931

File tree

2 files changed

+36
-15
lines changed

2 files changed

+36
-15
lines changed

ingestors/media/tiff.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,8 @@ def ingest(self, file_path, entity):
2323
entity.schema = model.get("Pages")
2424
pdf_path = self.make_work_file("tiff.pdf")
2525
self.exec_command(
26-
"tiff2pdf", file_path, "-x", "300", "-y", "300", "-o", pdf_path
26+
"tiff2pdf", file_path, "-n", "-j", "-x", "300", "-y", "300", "-o", pdf_path
2727
)
2828
self.assert_outfile(pdf_path)
29+
2930
self.pdf_alternative_extract(entity, pdf_path, self.manager)

tests/test_image.py

Lines changed: 34 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -29,11 +29,11 @@ def test_tesseract_ocr_regression(self):
2929
"content": "This is text inside a GIF image",
3030
"mime_type": "image/gif",
3131
},
32-
# "tiff": {
33-
# "file": "regression_tiff.tiff",
34-
# "content": "Debian -- Packages",
35-
# "mime_type": "image/tiff",
36-
# },
32+
"tiff": {
33+
"file": "regression_tiff.tiff",
34+
"content": "Debian -- Packages",
35+
"mime_type": "image/tiff",
36+
},
3737
"webp": {
3838
"file": "regression_webp.webp",
3939
"content": "Debian -- Packages",
@@ -49,18 +49,38 @@ def test_tesseract_ocr_regression(self):
4949
for test_image_type in test_data:
5050
fixture_path, entity = self.fixture(test_data[test_image_type]["file"])
5151
self.manager.ingest(fixture_path, entity)
52-
self.assertIn(
53-
test_data[test_image_type]["content"],
54-
entity.first("bodyText"),
55-
f"Test failed for {test_data[test_image_type]['file']}",
56-
)
57-
self.assertEqual(
58-
entity.first("mimeType"),
59-
test_data[test_image_type]["mime_type"],
52+
53+
emitted_image_entities = [
54+
x
55+
for x in self.get_emitted()
56+
if "mimeType" in x.properties and "image" in x.first("mimeType")
57+
]
58+
59+
# Have entities been emitted with a mime type that contains "image"?
60+
self.assertTrue(
61+
len(emitted_image_entities) != 0,
6062
f"Test failed for {test_data[test_image_type]['file']}",
6163
)
64+
image_entity = emitted_image_entities.pop()
65+
66+
# Is the processing status of the entity == SUCCESS?
6267
self.assertEqual(
63-
entity.first("processingStatus"),
68+
image_entity.first("processingStatus"),
6469
self.manager.STATUS_SUCCESS,
6570
f"Test failed for {test_data[test_image_type]['file']}",
6671
)
72+
73+
# Does either the bodyText prop or the indexText prop contain
74+
# the text resulting from OCR?
75+
try:
76+
self.assertIn(
77+
test_data[test_image_type]["content"],
78+
image_entity.first("bodyText"),
79+
f"Test failed for {test_data[test_image_type]['file']}",
80+
)
81+
except TypeError:
82+
self.assertIn(
83+
test_data[test_image_type]["content"],
84+
image_entity.first("indexText"),
85+
f"Test failed for {test_data[test_image_type]['file']}",
86+
)

0 commit comments

Comments
 (0)