Skip to content

Commit 0f3ca52

Browse files
committed
Add tests to prevent regression in OCR for gif, jpg, jp2, tiff, webp
1 parent b766f4e commit 0f3ca52

File tree

6 files changed

+49
-6
lines changed

6 files changed

+49
-6
lines changed

tests/fixtures/regression_gif.gif

2.41 KB
Loading
81.7 KB
Binary file not shown.
103 KB
Binary file not shown.
23.1 KB
Loading

tests/test_image.py

Lines changed: 49 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,53 @@ def test_ingest_on_svg(self):
1414
self.assertIn("TEST", entity.first("bodyText"))
1515
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
1616

17-
def test_ingest_on_jpeg(self):
18-
fixture_path, entity = self.fixture("jpegtest.jpg")
19-
self.manager.ingest(fixture_path, entity)
20-
self.assertIn("Debian", entity.first("bodyText"))
21-
self.assertEqual(entity.first("mimeType"), "image/jpeg")
17+
def test_tesseract_ocr_regression(self):
18+
"""This test is meant to catch a regression in the OCR behaviour
19+
described in this PR: https://github.com/alephdata/ingest-file/pull/585"""
2220

23-
self.assertEqual(entity.first("processingStatus"), self.manager.STATUS_SUCCESS)
21+
test_data = {
22+
"jpeg": {
23+
"file": "regression_jpg.jpg",
24+
"content": "Debian -- Packages",
25+
"mime_type": "image/jpeg",
26+
},
27+
"gif": {
28+
"file": "regression_gif.gif",
29+
"content": "This is text inside a GIF image",
30+
"mime_type": "image/gif",
31+
},
32+
# "tiff": {
33+
# "file": "regression_tiff.tiff",
34+
# "content": "Debian -- Packages",
35+
# "mime_type": "image/tiff",
36+
# },
37+
"webp": {
38+
"file": "regression_webp.webp",
39+
"content": "Debian -- Packages",
40+
"mime_type": "image/webp",
41+
},
42+
"openjpeg": {
43+
"file": "regression_openjpeg.jp2",
44+
"content": "Debian -- Packages",
45+
"mime_type": "image/jp2",
46+
},
47+
}
48+
49+
for test_image_type in test_data:
50+
fixture_path, entity = self.fixture(test_data[test_image_type]["file"])
51+
self.manager.ingest(fixture_path, entity)
52+
self.assertIn(
53+
test_data[test_image_type]["content"],
54+
entity.first("bodyText"),
55+
f"Test failed for {test_data[test_image_type]['file']}",
56+
)
57+
self.assertEqual(
58+
entity.first("mimeType"),
59+
test_data[test_image_type]["mime_type"],
60+
f"Test failed for {test_data[test_image_type]['file']}",
61+
)
62+
self.assertEqual(
63+
entity.first("processingStatus"),
64+
self.manager.STATUS_SUCCESS,
65+
f"Test failed for {test_data[test_image_type]['file']}",
66+
)

0 commit comments

Comments
 (0)