
Commit 9ea80c7

port recognize to v3

committed · 1 parent 30db9a4 · commit 9ea80c7

File tree

1 file changed: +135 −148 lines changed

ocrd_kraken/recognize.py

@@ -1,4 +1,5 @@
 from os.path import join
+from typing import Union
 import regex
 import itertools
 import numpy as np
@@ -24,6 +25,7 @@
 )
 from ocrd_modelfactory import page_from_file
 from ocrd_models.ocrd_page import (
+    OcrdPage,
     RegionRefType,
     RegionRefIndexedType,
     OrderedGroupType,
@@ -46,41 +48,40 @@

 class KrakenRecognize(Processor):

-    def __init__(self, *args, **kwargs):
-        kwargs['ocrd_tool'] = OCRD_TOOL['tools']['ocrd-kraken-recognize']
-        kwargs['version'] = OCRD_TOOL['version']
-        super().__init__(*args, **kwargs)
-        if hasattr(self, 'output_file_grp'):
-            # processing context
-            self.setup()
+    @property
+    def executable(self):
+        return 'ocrd-kraken-recognize'

     def setup(self):
         """
-        Load models
+        Assert filegrp cardinality, load model, set predict function
         """
-        log = getLogger('processor.KrakenRecognize')
+        assert_file_grp_cardinality(self.input_file_grp, 1)
+        assert_file_grp_cardinality(self.output_file_grp, 1)
+
+        self.logger = getLogger('processor.KrakenRecognize')
         import torch
         from kraken.rpred import rpred
         from kraken.lib.models import load_any
         model_fname = self.resolve_resource(self.parameter['model'])
-        log.info("loading model '%s'", model_fname)
+        self.logger.info("loading model '%s'", model_fname)
         device = self.parameter['device']
         if device != 'cpu' and not torch.cuda.is_available():
             device = 'cpu'
         if device == 'cpu':
-            log.warning("no CUDA device available. Running without GPU will be slow")
+            self.logger.warning("no CUDA device available. Running without GPU will be slow")
         self.model = load_any(model_fname, device=device)
         def predict(page_image, segmentation):
             return rpred(self.model, page_image, segmentation,
                          self.parameter['pad'],
                          self.parameter['bidi_reordering'])
         self.predict = predict

-    def process(self):
+    def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage:
         """Recognize text on lines with Kraken.

-        Open and deserialise each PAGE input file and its respective image,
-        then iterate over the element hierarchy down to the line level.
+        Open the parsed PAGE-XML file, then iterate over the element hierarchy
+        down to the line level.

         Set up Kraken to recognise each text line (via coordinates into
         the higher-level image, or from the alternative image. If the model
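
For orientation, the methods above follow the OCR-D core v3 Processor interface this commit ports to: the tool is identified by an executable property, one-time initialisation (cardinality checks, model loading) moves into setup(), and per-page work happens in process_page_pcgts(), which returns the modified PAGE-XML object instead of serialising and adding files itself. Below is a minimal sketch of that shape, not part of the commit; the class name and tool name are made up for illustration.

from ocrd import Processor
from ocrd_models.ocrd_page import OcrdPage

class MyRecognizer(Processor):
    # hypothetical processor, only to illustrate the v3 method layout

    @property
    def executable(self):
        # name of the CLI tool as declared in ocrd-tool.json
        return 'ocrd-my-recognizer'

    def setup(self):
        # one-time initialisation, e.g. loading an expensive model resource
        self.model = None  # placeholder

    def process_page_pcgts(self, *input_pcgts, output_file_id: str = None, page_id: str = None) -> OcrdPage:
        pcgts = input_pcgts[0]
        # ... modify the PAGE-XML hierarchy of this page in place ...
        return pcgts  # serialisation and METS bookkeeping are left to the framework
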
@@ -94,149 +95,135 @@ def process(self):
         into additional TextEquiv at each level, and make the higher levels
         consistent with that (by concatenation joined by whitespace).

-        Produce a new output file by serialising the resulting hierarchy.
+        Return the resulting hierarchy.
         """
+        assert self.workspace
         from kraken.containers import Segmentation, BaselineLine, BBoxLine
-        log = getLogger('processor.KrakenRecognize')
-        assert_file_grp_cardinality(self.input_file_grp, 1)
-        assert_file_grp_cardinality(self.output_file_grp, 1)

-        for n, input_file in enumerate(self.input_files):
-            page_id = input_file.pageId or input_file.ID
-            log.info("INPUT FILE %i / %s of %s", n, page_id, len(self.input_files))
-            pcgts = page_from_file(self.workspace.download_file(input_file))
-            self.add_metadata(pcgts)
-            page = pcgts.get_Page()
-            page_image, page_coords, _ = self.workspace.image_from_page(
-                page, page_id,
-                feature_selector="binarized"
-                if self.model.nn.input[1] == 1 and self.model.one_channel_mode == '1'
-                else '')
-            page_rect = Rectangle(0, 0, page_image.width - 1, page_image.height - 1)
-            # todo: find out whether kraken.lib.xml.XMLPage(...).to_container() is adequate
+        pcgts = input_pcgts[0]
+        page = pcgts.get_Page()
+        page_image, page_coords, _ = self.workspace.image_from_page(
+            page, page_id,
+            feature_selector="binarized"
+            if self.model.nn.input[1] == 1 and self.model.one_channel_mode == '1'
+            else '')
+        page_rect = Rectangle(0, 0, page_image.width - 1, page_image.height - 1)
+        # TODO: find out whether kraken.lib.xml.XMLPage(...).to_container() is adequate

-            all_lines = page.get_AllTextLines()
-            # assumes that missing baselines are rare, if any
-            if any(line.Baseline for line in all_lines):
-                log.info("Converting PAGE to Kraken Segmentation (baselines)")
-                segtype = 'baselines'
-            else:
-                log.info("Converting PAGE to Kraken Segmentation (boxes only)")
-                segtype = 'bbox'
-            scale = 0.5 * np.median([xywh_from_points(line.Coords.points)['h'] for line in all_lines])
-            log.info("Estimated scale: %.1f", scale)
-            seglines = []
-            for line in all_lines:
-                # FIXME: see whether model prefers baselines or bbox crops (seg_type)
-                # FIXME: even if we do not have baselines, emulating baseline+boundary might be useful to prevent automatic center normalization
-                poly = coordinates_of_segment(line, None, page_coords)
-                poly = make_valid(Polygon(poly))
-                poly = poly.intersection(page_rect)
-                if segtype == 'baselines':
-                    if line.Baseline is None:
-                        base = dummy_baseline_of_segment(line, page_coords)
-                    else:
-                        base = baseline_of_segment(line, page_coords)
-                        if len(base) < 2 or np.abs(np.mean(base[0] - base[-1])) <= 1:
-                            base = dummy_baseline_of_segment(line, page_coords)
-                        elif not LineString(base).intersects(poly):
-                            base = dummy_baseline_of_segment(line, page_coords)
-                    # kraken expects baseline to be fully contained in boundary
-                    base = LineString(base)
-                    if poly.is_empty:
-                        poly = polygon_from_baseline(base, scale=scale)
-                    elif not base.within(poly):
-                        poly = join_polygons([poly, polygon_from_baseline(base, scale=scale)],
-                                             loc=line.id, scale=scale)
-                    seglines.append(BaselineLine(baseline=list(map(tuple, base.coords)),
-                                                 boundary=list(map(tuple, poly.exterior.coords)),
-                                                 id=line.id,
-                                                 tags={'type': 'default'}))
-                    # write back
-                    base = coordinates_for_segment(base.coords, None, page_coords)
-                    line.set_Baseline(BaselineType(points=points_from_polygon(base)))
-                    poly = coordinates_for_segment(poly.exterior.coords[:-1], None, page_coords)
-                    line.set_Coords(CoordsType(points=points_from_polygon(poly)))
+        all_lines = page.get_AllTextLines()
+        # assumes that missing baselines are rare, if any
+        if any(line.Baseline for line in all_lines):
+            self.logger.info("Converting PAGE to Kraken Segmentation (baselines)")
+            segtype = 'baselines'
+        else:
+            self.logger.info("Converting PAGE to Kraken Segmentation (boxes only)")
+            segtype = 'bbox'
+        scale = 0.5 * np.median([xywh_from_points(line.Coords.points)['h'] for line in all_lines])
+        self.logger.info("Estimated scale: %.1f", scale)
+        seglines = []
+        for line in all_lines:
+            # FIXME: see whether model prefers baselines or bbox crops (seg_type)
+            # FIXME: even if we do not have baselines, emulating baseline+boundary might be useful to prevent automatic center normalization
+            poly = coordinates_of_segment(line, None, page_coords)
+            poly = make_valid(Polygon(poly))
+            poly = poly.intersection(page_rect)
+            if segtype == 'baselines':
+                if line.Baseline is None:
+                    base = dummy_baseline_of_segment(line, page_coords)
                 else:
-                        seglines.append(BBoxLine(bbox=poly.envelope.bounds,
-                                                 id=line.id))
+                    base = baseline_of_segment(line, page_coords)
+                    if len(base) < 2 or np.abs(np.mean(base[0] - base[-1])) <= 1:
+                        base = dummy_baseline_of_segment(line, page_coords)
+                    elif not LineString(base).intersects(poly):
+                        base = dummy_baseline_of_segment(line, page_coords)
+                # kraken expects baseline to be fully contained in boundary
+                base = LineString(base)
+                if poly.is_empty:
+                    poly = polygon_from_baseline(base, scale=scale)
+                elif not base.within(poly):
+                    poly = join_polygons([poly, polygon_from_baseline(base, scale=scale)],
+                                         loc=line.id, scale=scale)
+                seglines.append(BaselineLine(baseline=list(map(tuple, base.coords)),
+                                             boundary=list(map(tuple, poly.exterior.coords)),
+                                             id=line.id,
+                                             tags={'type': 'default'}))
+                # write back
+                base = coordinates_for_segment(base.coords, None, page_coords)
+                line.set_Baseline(BaselineType(points=points_from_polygon(base)))
+                poly = coordinates_for_segment(poly.exterior.coords[:-1], None, page_coords)
+                line.set_Coords(CoordsType(points=points_from_polygon(poly)))
+            else:
+                seglines.append(BBoxLine(bbox=poly.envelope.bounds,
+                                         id=line.id))

-            segmentation = Segmentation(lines=seglines,
-                                        script_detection=False,
-                                        text_direction='horizontal-lr',
-                                        type=segtype,
-                                        imagename=page_id)
-            for idx_line, ocr_record in enumerate(self.predict(page_image, segmentation)):
-                line = all_lines[idx_line]
-                id_line = line.id
-                if not ocr_record.prediction and not ocr_record.cuts:
-                    log.warning('No results for line "%s"', line.id)
+        segmentation = Segmentation(lines=seglines,
+                                    script_detection=False,
+                                    text_direction='horizontal-lr',
+                                    type=segtype,
+                                    imagename=page_id)
+        for idx_line, ocr_record in enumerate(self.predict(page_image, segmentation)):
+            line = all_lines[idx_line]
+            id_line = line.id
+            if not ocr_record.prediction and not ocr_record.cuts:
+                self.logger.warning('No results for line "%s"', line.id)
+                continue
+            text_line = ocr_record.prediction
+            if len(ocr_record.confidences) > 0:
+                conf_line = sum(ocr_record.confidences) / len(ocr_record.confidences)
+            else:
+                conf_line = None
+            if self.parameter['overwrite_text']:
+                line.TextEquiv = []
+            line.add_TextEquiv(TextEquivType(Unicode=text_line, conf=conf_line))
+            idx_word = 0
+            line_offset = 0
+            for text_word in regex.splititer(r'(\s+)', text_line):
+                next_offset = line_offset + len(text_word)
+                cuts_word = list(map(list, ocr_record.cuts[line_offset:next_offset]))
+                # fixme: kraken#98 says the Pytorch CTC output is too impoverished to yield good glyph stops
+                # as a workaround, here we just steal from the next glyph start, respectively:
+                if len(ocr_record.cuts) > next_offset + 1:
+                    cuts_word.extend(list(map(list, ocr_record.cuts[next_offset:next_offset+1])))
+                else:
+                    cuts_word.append(list(ocr_record.cuts[-1]))
+                confidences_word = ocr_record.confidences[line_offset:next_offset]
+                line_offset = next_offset
+                if len(text_word.strip()) == 0:
                     continue
-                text_line = ocr_record.prediction
-                if len(ocr_record.confidences) > 0:
-                    conf_line = sum(ocr_record.confidences) / len(ocr_record.confidences)
+                id_word = '%s_word_%s' % (id_line, idx_word + 1)
+                idx_word += 1
+                poly_word = [point for cut in cuts_word for point in cut]
+                bbox_word = bbox_from_polygon(coordinates_for_segment(poly_word, None, page_coords))
+                # avoid zero-size coords on ties
+                bbox_word = np.array(bbox_word, dtype=int)
+                if np.prod(bbox_word[2:4] - bbox_word[0:2]) == 0:
+                    bbox_word[2:4] += 1
+                if len(confidences_word) > 0:
+                    conf_word = sum(confidences_word) / len(confidences_word)
                 else:
-                    conf_line = None
-                if self.parameter['overwrite_text']:
-                    line.TextEquiv = []
-                line.add_TextEquiv(TextEquivType(Unicode=text_line, conf=conf_line))
-                idx_word = 0
-                line_offset = 0
-                for text_word in regex.splititer(r'(\s+)', text_line):
-                    next_offset = line_offset + len(text_word)
-                    cuts_word = list(map(list, ocr_record.cuts[line_offset:next_offset]))
-                    # fixme: kraken#98 says the Pytorch CTC output is too impoverished to yield good glyph stops
-                    # as a workaround, here we just steal from the next glyph start, respectively:
-                    if len(ocr_record.cuts) > next_offset + 1:
-                        cuts_word.extend(list(map(list, ocr_record.cuts[next_offset:next_offset+1])))
-                    else:
-                        cuts_word.append(list(ocr_record.cuts[-1]))
-                    confidences_word = ocr_record.confidences[line_offset:next_offset]
-                    line_offset = next_offset
-                    if len(text_word.strip()) == 0:
-                        continue
-                    id_word = '%s_word_%s' % (id_line, idx_word + 1)
-                    idx_word += 1
-                    poly_word = [point for cut in cuts_word for point in cut]
-                    bbox_word = bbox_from_polygon(coordinates_for_segment(poly_word, None, page_coords))
+                    conf_word = None
+                word = WordType(id=id_word,
+                                Coords=CoordsType(points=points_from_bbox(*bbox_word)))
+                word.add_TextEquiv(TextEquivType(Unicode=text_word, conf=conf_word))
+                for idx_glyph, text_glyph in enumerate(text_word):
+                    id_glyph = '%s_glyph_%s' % (id_word, idx_glyph + 1)
+                    poly_glyph = cuts_word[idx_glyph] + cuts_word[idx_glyph + 1]
+                    bbox_glyph = bbox_from_polygon(coordinates_for_segment(poly_glyph, None, page_coords))
                     # avoid zero-size coords on ties
-                    bbox_word = np.array(bbox_word, dtype=int)
-                    if np.prod(bbox_word[2:4] - bbox_word[0:2]) == 0:
-                        bbox_word[2:4] += 1
-                    if len(confidences_word) > 0:
-                        conf_word = sum(confidences_word) / len(confidences_word)
-                    else:
-                        conf_word = None
-                    word = WordType(id=id_word,
-                                    Coords=CoordsType(points=points_from_bbox(*bbox_word)))
-                    word.add_TextEquiv(TextEquivType(Unicode=text_word, conf=conf_word))
-                    for idx_glyph, text_glyph in enumerate(text_word):
-                        id_glyph = '%s_glyph_%s' % (id_word, idx_glyph + 1)
-                        poly_glyph = cuts_word[idx_glyph] + cuts_word[idx_glyph + 1]
-                        bbox_glyph = bbox_from_polygon(coordinates_for_segment(poly_glyph, None, page_coords))
-                        # avoid zero-size coords on ties
-                        bbox_glyph = np.array(bbox_glyph, dtype=int)
-                        if np.prod(bbox_glyph[2:4] - bbox_glyph[0:2]) == 0:
-                            bbox_glyph[2:4] += 1
-                        conf_glyph = confidences_word[idx_glyph]
-                        glyph = GlyphType(id=id_glyph,
-                                          Coords=CoordsType(points=points_from_bbox(*bbox_glyph)))
-                        glyph.add_TextEquiv(TextEquivType(Unicode=text_glyph, conf=conf_glyph))
-                        word.add_Glyph(glyph)
-                    line.add_Word(word)
-                log.info('Recognized line "%s"', line.id)
+                    bbox_glyph = np.array(bbox_glyph, dtype=int)
+                    if np.prod(bbox_glyph[2:4] - bbox_glyph[0:2]) == 0:
+                        bbox_glyph[2:4] += 1
+                    conf_glyph = confidences_word[idx_glyph]
+                    glyph = GlyphType(id=id_glyph,
+                                      Coords=CoordsType(points=points_from_bbox(*bbox_glyph)))
+                    glyph.add_TextEquiv(TextEquivType(Unicode=text_glyph, conf=conf_glyph))
+                    word.add_Glyph(glyph)
+                line.add_Word(word)
+            self.logger.info('Recognized line "%s"', line.id)
         page_update_higher_textequiv_levels('line', pcgts)

-            log.info("Finished recognition, serializing")
-            file_id = make_file_id(input_file, self.output_file_grp)
-            pcgts.set_pcGtsId(file_id)
-            self.workspace.add_file(
-                ID=file_id,
-                file_grp=self.output_file_grp,
-                pageId=input_file.pageId,
-                mimetype=MIMETYPE_PAGE,
-                local_filename=join(self.output_file_grp, f'{file_id}.xml'),
-                content=to_xml(pcgts))
+        self.logger.info("Finished recognition, serializing")
+        return pcgts

 # zzz should go into core ocrd_utils
 def baseline_of_segment(segment, coords):
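
One detail in the word loop above is easy to miss: because the split pattern r'(\s+)' contains a capturing group, regex.splititer also yields the whitespace separators, so adding len(text_word) of every token to a running offset keeps line_offset aligned with the per-character cuts and confidences of the line. A standalone sketch of that bookkeeping with made-up values (not from the commit):

import regex

text_line = "Hello  kraken world"              # hypothetical recognised line text
confidences = [0.9] * len(text_line)           # hypothetical per-character confidences

line_offset = 0
for text_word in regex.splititer(r'(\s+)', text_line):
    next_offset = line_offset + len(text_word)
    confidences_word = confidences[line_offset:next_offset]
    line_offset = next_offset
    if not text_word.strip():
        continue                               # separator token, nothing to annotate
    conf_word = sum(confidences_word) / len(confidences_word)
    print(repr(text_word), round(conf_word, 2))
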
@@ -251,7 +238,7 @@ def dummy_baseline_of_segment(segment, coords, yrel=0.2):
     return [[xmin, ymid], [xmax, ymid]]

 # zzz should go into core ocrd_utils
-def polygon_from_baseline(baseline, scale=20):
+def polygon_from_baseline(baseline, scale : Union[float, np.floating] = 20):
     if not isinstance(baseline, LineString):
         baseline = LineString(baseline)
     ltr = baseline.coords[0][0] < baseline.coords[-1][0]
@@ -261,7 +248,7 @@ def polygon_from_baseline(baseline, scale=20):
                                        scale=scale))
     return polygon

-def join_polygons(polygons, loc='', scale=20):
+def join_polygons(polygons, loc='', scale : Union[float, np.floating] = 20):
     """construct concave hull (alpha shape) from input polygons"""
     # compoundp = unary_union(polygons)
     # jointp = compoundp.convex_hull
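
The widened scale annotations in the last two hunks reflect how these helpers are now called from process_page_pcgts(): scale is computed as 0.5 * np.median(...), i.e. a NumPy floating scalar rather than a plain float literal. A small illustration of why both kinds are accepted (values made up, not from the commit):

from typing import Union

import numpy as np

def polygon_scale(scale: Union[float, np.floating] = 20) -> float:
    # normalise to a plain float, whichever kind was passed in
    return float(scale)

line_heights = [28, 31, 30, 33]                # hypothetical line heights in pixels
scale = 0.5 * np.median(line_heights)          # np.float64, an np.floating instance
print(type(scale).__name__, polygon_scale(scale), polygon_scale(20))
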
