@@ -29,11 +29,11 @@ def test_tesseract_ocr_regression(self):
2929 "content" : "This is text inside a GIF image" ,
3030 "mime_type" : "image/gif" ,
3131 },
32- # "tiff": {
33- # "file": "regression_tiff.tiff",
34- # "content": "Debian -- Packages",
35- # "mime_type": "image/tiff",
36- # },
32+ "tiff" : {
33+ "file" : "regression_tiff.tiff" ,
34+ "content" : "Debian -- Packages" ,
35+ "mime_type" : "image/tiff" ,
36+ },
3737 "webp" : {
3838 "file" : "regression_webp.webp" ,
3939 "content" : "Debian -- Packages" ,
@@ -49,18 +49,38 @@ def test_tesseract_ocr_regression(self):
4949 for test_image_type in test_data :
5050 fixture_path , entity = self .fixture (test_data [test_image_type ]["file" ])
5151 self .manager .ingest (fixture_path , entity )
52- self .assertIn (
53- test_data [test_image_type ]["content" ],
54- entity .first ("bodyText" ),
55- f"Test failed for { test_data [test_image_type ]['file' ]} " ,
56- )
57- self .assertEqual (
58- entity .first ("mimeType" ),
59- test_data [test_image_type ]["mime_type" ],
52+
53+ emitted_image_entities = [
54+ x
55+ for x in self .get_emitted ()
56+ if "mimeType" in x .properties and "image" in x .first ("mimeType" )
57+ ]
58+
59+ # Have entities been emitted with a mime type that contains "image"?
60+ self .assertTrue (
61+ len (emitted_image_entities ) != 0 ,
6062 f"Test failed for { test_data [test_image_type ]['file' ]} " ,
6163 )
64+ image_entity = emitted_image_entities .pop ()
65+
66+ # Is the processing status of the entity == SUCCESS?
6267 self .assertEqual (
63- entity .first ("processingStatus" ),
68+ image_entity .first ("processingStatus" ),
6469 self .manager .STATUS_SUCCESS ,
6570 f"Test failed for { test_data [test_image_type ]['file' ]} " ,
6671 )
72+
73+ # Does either the bodyText prop or the indexText prop contain
74+ # the text resulting from OCR?
75+ try :
76+ self .assertIn (
77+ test_data [test_image_type ]["content" ],
78+ image_entity .first ("bodyText" ),
79+ f"Test failed for { test_data [test_image_type ]['file' ]} " ,
80+ )
81+ except TypeError :
82+ self .assertIn (
83+ test_data [test_image_type ]["content" ],
84+ image_entity .first ("indexText" ),
85+ f"Test failed for { test_data [test_image_type ]['file' ]} " ,
86+ )
0 commit comments