1
1
from os .path import join
2
+ from typing import Union
2
3
import regex
3
4
import itertools
4
5
import numpy as np
24
25
)
25
26
from ocrd_modelfactory import page_from_file
26
27
from ocrd_models .ocrd_page import (
28
+ OcrdPage ,
27
29
RegionRefType ,
28
30
RegionRefIndexedType ,
29
31
OrderedGroupType ,
46
48
47
49
class KrakenRecognize(Processor):
    """OCR-D processor that recognizes text on PAGE text lines with Kraken."""

    @property
    def executable(self):
        """Name of the executable this processor is registered under."""
        return 'ocrd-kraken-recognize'

    def setup(self):
        """
        Assert filegrp cardinality, load model, set predict function

        Reads the ``model``, ``device``, ``pad`` and ``bidi_reordering``
        parameters.  Afterwards ``self.logger``, ``self.model`` and
        ``self.predict`` are available to :meth:`process_page_pcgts`.
        """
        assert_file_grp_cardinality(self.input_file_grp, 1)
        assert_file_grp_cardinality(self.output_file_grp, 1)

        self.logger = getLogger('processor.KrakenRecognize')
        # imported lazily, so merely importing this module stays cheap
        import torch
        from kraken.rpred import rpred
        from kraken.lib.models import load_any
        model_fname = self.resolve_resource(self.parameter['model'])
        self.logger.info("loading model '%s'", model_fname)
        device = self.parameter['device']
        if device != 'cpu' and not torch.cuda.is_available():
            # requested an accelerator, but none is usable: fall back
            self.logger.warning("no CUDA device available. Running without GPU will be slow")
            device = 'cpu'
        elif device == 'cpu':
            # CPU was chosen explicitly, so do not claim that CUDA is missing
            self.logger.warning("Running without GPU will be slow")
        self.model = load_any(model_fname, device=device)

        def predict(page_image, segmentation):
            return rpred(self.model, page_image, segmentation,
                         self.parameter['pad'],
                         self.parameter['bidi_reordering'])
        self.predict = predict

    def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage:
        """Recognize text on lines with Kraken.

        Take the parsed PAGE-XML of the (single) input fileGrp and retrieve
        the page image (a binarized derivate if the model expects bilevel
        input).  Convert all text lines of the page into a Kraken
        segmentation (baselines with boundary polygons if any line has a
        baseline, bounding boxes otherwise) and run the model on it.

        For every line with a result, add a TextEquiv on the line (replacing
        existing ones if ``overwrite_text`` is set), and add Word and Glyph
        elements with their own coordinates, text and confidence.  Finally,
        make the higher levels consistent with the line level.

        Pages without any text line are not passed to the recognizer.

        Return the resulting hierarchy.
        """
        assert self.workspace

        pcgts = input_pcgts[0]
        page = pcgts.get_Page()
        page_image, page_coords, _ = self.workspace.image_from_page(
            page, page_id,
            feature_selector="binarized"
            if self.model.nn.input[1] == 1 and self.model.one_channel_mode == '1'
            else '')
        # TODO: find out whether kraken.lib.xml.XMLPage(...).to_container() is adequate

        all_lines = page.get_AllTextLines()
        if all_lines:
            segmentation = self._segment_lines(all_lines, page_image, page_coords, page_id)
            # results are matched to the input lines by position
            for idx_line, ocr_record in enumerate(self.predict(page_image, segmentation)):
                line = all_lines[idx_line]
                if not ocr_record.prediction and not ocr_record.cuts:
                    self.logger.warning('No results for line "%s"', line.id)
                    continue
                self._annotate_line(line, ocr_record, page_coords)
                self.logger.info('Recognized line "%s"', line.id)
        else:
            # nothing to recognize: avoids a NaN scale estimate (median of
            # an empty list) and running the model on an empty segmentation
            self.logger.warning("No text lines on page '%s'", page_id)

        page_update_higher_textequiv_levels('line', pcgts)

        self.logger.info("Finished recognition, serializing")
        return pcgts

    def _segment_lines(self, all_lines, page_image, page_coords, page_id):
        """Convert PAGE text lines into a Kraken ``Segmentation``.

        ``all_lines`` must not be empty.  In baseline mode, the (possibly
        repaired) baseline and boundary polygon are also written back to
        each PAGE line.
        """
        from kraken.containers import Segmentation, BaselineLine, BBoxLine

        page_rect = Rectangle(0, 0, page_image.width - 1, page_image.height - 1)
        # assumes that missing baselines are rare, if any
        if any(line.Baseline for line in all_lines):
            self.logger.info("Converting PAGE to Kraken Segmentation (baselines)")
            segtype = 'baselines'
        else:
            self.logger.info("Converting PAGE to Kraken Segmentation (boxes only)")
            segtype = 'bbox'
        # half the median line height, used when polygons must be synthesized
        scale = 0.5 * np.median([xywh_from_points(line.Coords.points)['h'] for line in all_lines])
        self.logger.info("Estimated scale: %.1f", scale)
        seglines = []
        for line in all_lines:
            # FIXME: see whether model prefers baselines or bbox crops (seg_type)
            # FIXME: even if we do not have baselines, emulating baseline+boundary might be useful to prevent automatic center normalization
            poly = coordinates_of_segment(line, None, page_coords)
            poly = make_valid(Polygon(poly))
            poly = poly.intersection(page_rect)
            if segtype != 'baselines':
                seglines.append(BBoxLine(bbox=poly.envelope.bounds,
                                         id=line.id))
                continue
            if line.Baseline is None:
                base = dummy_baseline_of_segment(line, page_coords)
            else:
                base = baseline_of_segment(line, page_coords)
                # replace baselines that are too short, (nearly) collapsed,
                # or entirely outside of the line polygon; the order of the
                # checks matters (LineString needs at least two points)
                if (len(base) < 2
                        or np.abs(np.mean(base[0] - base[-1])) <= 1
                        or not LineString(base).intersects(poly)):
                    base = dummy_baseline_of_segment(line, page_coords)
            # kraken expects baseline to be fully contained in boundary
            base = LineString(base)
            if poly.is_empty:
                poly = polygon_from_baseline(base, scale=scale)
            elif not base.within(poly):
                poly = join_polygons([poly, polygon_from_baseline(base, scale=scale)],
                                     loc=line.id, scale=scale)
            seglines.append(BaselineLine(baseline=list(map(tuple, base.coords)),
                                         boundary=list(map(tuple, poly.exterior.coords)),
                                         id=line.id,
                                         tags={'type': 'default'}))
            # write back
            base = coordinates_for_segment(base.coords, None, page_coords)
            line.set_Baseline(BaselineType(points=points_from_polygon(base)))
            poly = coordinates_for_segment(poly.exterior.coords[:-1], None, page_coords)
            line.set_Coords(CoordsType(points=points_from_polygon(poly)))

        return Segmentation(lines=seglines,
                            script_detection=False,
                            text_direction='horizontal-lr',
                            type=segtype,
                            imagename=page_id)

    def _annotate_line(self, line, ocr_record, page_coords):
        """Store one recognition result on ``line`` (text, words, glyphs)."""
        id_line = line.id
        text_line = ocr_record.prediction
        conf_line = self._mean_confidence(ocr_record.confidences)
        if self.parameter['overwrite_text']:
            line.TextEquiv = []
        line.add_TextEquiv(TextEquivType(Unicode=text_line, conf=conf_line))
        idx_word = 0
        line_offset = 0
        # the capture group keeps the whitespace runs as tokens, so that
        # offsets into the per-character cuts/confidences stay in sync
        for text_word in regex.splititer(r'(\s+)', text_line):
            next_offset = line_offset + len(text_word)
            cuts_word = list(map(list, ocr_record.cuts[line_offset:next_offset]))
            # fixme: kraken#98 says the Pytorch CTC output is too impoverished to yield good glyph stops
            # as a workaround, here we just steal from the next glyph start, respectively:
            if len(ocr_record.cuts) > next_offset + 1:
                cuts_word.extend(list(map(list, ocr_record.cuts[next_offset:next_offset + 1])))
            else:
                cuts_word.append(list(ocr_record.cuts[-1]))
            confidences_word = ocr_record.confidences[line_offset:next_offset]
            line_offset = next_offset
            if len(text_word.strip()) == 0:
                continue
            id_word = '%s_word_%s' % (id_line, idx_word + 1)
            idx_word += 1
            poly_word = [point for cut in cuts_word for point in cut]
            bbox_word = self._bbox_for_polygon(poly_word, page_coords)
            conf_word = self._mean_confidence(confidences_word)
            word = WordType(id=id_word,
                            Coords=CoordsType(points=points_from_bbox(*bbox_word)))
            word.add_TextEquiv(TextEquivType(Unicode=text_word, conf=conf_word))
            for idx_glyph, text_glyph in enumerate(text_word):
                id_glyph = '%s_glyph_%s' % (id_word, idx_glyph + 1)
                # a glyph spans from its own cut to the following one
                poly_glyph = cuts_word[idx_glyph] + cuts_word[idx_glyph + 1]
                bbox_glyph = self._bbox_for_polygon(poly_glyph, page_coords)
                conf_glyph = confidences_word[idx_glyph]
                glyph = GlyphType(id=id_glyph,
                                  Coords=CoordsType(points=points_from_bbox(*bbox_glyph)))
                glyph.add_TextEquiv(TextEquivType(Unicode=text_glyph, conf=conf_glyph))
                word.add_Glyph(glyph)
            line.add_Word(word)

    @staticmethod
    def _bbox_for_polygon(polygon, page_coords):
        """Return the integer bounding box of ``polygon`` in page coordinates."""
        bbox = bbox_from_polygon(coordinates_for_segment(polygon, None, page_coords))
        # avoid zero-size coords on ties
        bbox = np.array(bbox, dtype=int)
        if np.prod(bbox[2:4] - bbox[0:2]) == 0:
            bbox[2:4] += 1
        return bbox

    @staticmethod
    def _mean_confidence(confidences):
        """Return the arithmetic mean of ``confidences``, or None if empty."""
        if len(confidences) > 0:
            return sum(confidences) / len(confidences)
        return None
240
227
241
228
# TODO: this helper should go into core ocrd_utils
242
229
def baseline_of_segment (segment , coords ):
@@ -251,7 +238,7 @@ def dummy_baseline_of_segment(segment, coords, yrel=0.2):
251
238
return [[xmin , ymid ], [xmax , ymid ]]
252
239
253
240
# TODO: this helper should go into core ocrd_utils
254
- def polygon_from_baseline (baseline , scale = 20 ):
241
+ def polygon_from_baseline (baseline , scale : Union [ float , np . floating ] = 20 ):
255
242
if not isinstance (baseline , LineString ):
256
243
baseline = LineString (baseline )
257
244
ltr = baseline .coords [0 ][0 ] < baseline .coords [- 1 ][0 ]
@@ -261,7 +248,7 @@ def polygon_from_baseline(baseline, scale=20):
261
248
scale = scale ))
262
249
return polygon
263
250
264
- def join_polygons (polygons , loc = '' , scale = 20 ):
251
+ def join_polygons (polygons , loc = '' , scale : Union [ float , np . floating ] = 20 ):
265
252
"""construct concave hull (alpha shape) from input polygons"""
266
253
# compoundp = unary_union(polygons)
267
254
# jointp = compoundp.convex_hull
0 commit comments