Skip to content

Commit e62d7e4

Browse files
authored
Merge pull request #43 from bertsky/segment-regions
Segment regions, incremental segmentation via masking
2 parents bdbe6fc + 636ca99 commit e62d7e4

11 files changed

+428
-155
lines changed

.github/workflows/ci.yml

+2-6
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ jobs:
1919
strategy:
2020
fail-fast: false
2121
matrix:
22-
python-version: [3.9, '3.10', '3.11']
22+
python-version: [3.8, 3.9, '3.10', '3.11']
2323

2424
steps:
2525
- uses: actions/checkout@v4
@@ -35,14 +35,10 @@ jobs:
3535
- name: Build
3636
run: |
3737
python3 --version
38-
python3 -m venv venv
39-
source venv/bin/activate
4038
make deps deps-test
4139
make install
4240
pip check
4341
ocrd resmgr download ocrd-kraken-segment blla.mlmodel
4442
ocrd resmgr download ocrd-kraken-recognize en_best.mlmodel
4543
- name: Test
46-
run: |
47-
source venv/bin/activate
48-
make test
44+
run: make test

CHANGELOG.md

+29-1
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,39 @@ Versioned according to [Semantic Versioning](http://semver.org/).
55

66
## Unreleased
77

8+
Fixed:
9+
10+
* recognize: improve baseline/polygon robustness
11+
12+
Changed:
13+
14+
* segment/recognize: adapt and bump to Kraken v5
15+
* adapt to Python importlib instead of pkg_resources
16+
* update tests/CI
17+
18+
## [0.4.0] - 2024-02-11
19+
20+
Fixed:
21+
22+
* binarize: OCR-D conformity (PAGE output, AlternativeImage input/output)
23+
* docstrings
24+
25+
Added:
26+
27+
* recognize: param `overwrite_text`
28+
* segment: param `overwrite_segments`
29+
* segment: param `level-of-operation` (now supports `table` and `region`, too)
30+
31+
Changed:
32+
33+
* binarize: :fire: renamed `level-of-operation=block` to `region`
34+
* segment: existing segmentation will be masked away (unless `overwrite_segments`)
35+
836
## [0.3.1] - 2023-08-17
937

1038
Fixed:
1139

12-
* recognize: only apply `one_channel_mode` (whether to use `binarized` input)
40+
* recognize: only apply `one_channel_mode` (whether to use `binarized` input)
1341
if the model has only one input channel
1442
* recognize: project text results to region level in order
1543
* recognize: iterate line results via proper word splitting

README.md

+5-4
Original file line numberDiff line numberDiff line change
@@ -83,13 +83,14 @@ Available [OCR-D processors](https://ocr-d.de/en/spec/cli) are:
8383

8484
- [ocrd-kraken-binarize](ocrd_kraken/binarize.py) (nlbin – not recommended)
8585
- adds `AlternativeImage` files (per page, region or line) to the output fileGrp
86-
- [ocrd-kraken-segment](ocrd_kraken/segment.py) (all-in-one segmentation – recommended for handwriting and simply layouted prints)
87-
- adds `TextRegion`s, `TableRegion`s, `ImageRegion`s, `MathsRegion`s, `NoiseRegion`s, `ReadingOrder` and `AlternativeImage` to `Page` (depending on model training)
88-
- adds `TextLine`s to `TextRegion`s, including their `Baseline`
86+
- [ocrd-kraken-segment](ocrd_kraken/segment.py) (all-in-one segmentation – recommended for handwriting and prints with simple layouts, or as pure line segmentation)
87+
- adds `TextRegion`s to `Page` (if `level-of-operation=page`) or `TableRegion`s (if `table`)
88+
- adds `TextLine`s (with `Baseline`) to `TextRegion`s (for all `level-of-operation`)
89+
- masks existing segments during detection (unless `overwrite_segments`)
8990
- [ocrd-kraken-recognize](ocrd_kraken/recognize.py) (benefits from annotated `Baseline`s, falls back to center-normalized bboxes)
9091
- adds `Word`s to `TextLine`s
9192
- adds `Glyph`s to `Word`s
92-
- adds `TextEquiv`
93+
- adds `TextEquiv` (removing existing `TextEquiv` if `overwrite_text`)
9394

9495
## Testing
9596

ocrd_kraken/binarize.py

+71-38
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
from __future__ import absolute_import
2-
import io
2+
import os
33
import kraken.binarization
44
from ocrd import Processor
5-
from ocrd_utils import getLogger, polygon_from_points, concat_padded
5+
from ocrd_utils import getLogger, make_file_id, MIMETYPE_PAGE
6+
from ocrd_models.ocrd_page import AlternativeImageType, to_xml
67
from ocrd_modelfactory import page_from_file
78

89
from ocrd_kraken.config import OCRD_TOOL
@@ -16,51 +17,83 @@ def __init__(self, *args, **kwargs):
1617
super(KrakenBinarize, self).__init__(*args, **kwargs)
1718

1819
def process(self):
19-
"""
20-
Performs the binarization.
20+
"""Binarize the pages/regions/lines with Kraken.
21+
22+
Open and deserialise PAGE input files and their respective images,
23+
then iterate over the element hierarchy down to the requested
24+
``level-of-operation``.
25+
26+
Next, for each file, crop each segment image according to the layout
27+
annotation (via coordinates into the higher-level image, or from the
28+
alternative image), and determine the threshold for binarization
29+
(via Ocropy nlbin). Apply results to the image and export it.
30+
31+
Add the new image file to the workspace along with the output fileGrp,
32+
and using a file ID with suffix ``.IMG-BIN`` along with further
33+
identification of the input element.
34+
35+
Reference each new image in the AlternativeImage of the element.
36+
37+
Produce a new output file by serialising the resulting hierarchy.
2138
"""
2239
log = getLogger('processor.KrakenBinarize')
2340
log.debug('Level of operation: "%s"', self.parameter['level-of-operation'])
2441
log.debug('Input file group %s', self.input_file_grp)
2542
log.debug('Input files %s', [str(f) for f in self.input_files])
2643
for (n, input_file) in enumerate(self.input_files):
27-
log.info("INPUT FILE %i / %s", n, input_file)
44+
log.info("INPUT FILE %i / %s", n, input_file.pageId or input_file.ID)
45+
file_id = make_file_id(input_file, self.output_file_grp)
2846
pcgts = page_from_file(self.workspace.download_file(input_file))
29-
image_url = pcgts.get_Page().imageFilename
30-
log.info("pcgts %s", pcgts)
47+
page = pcgts.get_Page()
48+
page_id = pcgts.pcGtsId or input_file.pageId or input_file.ID # (PageType has no id)
49+
self.add_metadata(pcgts)
50+
51+
page_image, page_coords, page_image_info = self.workspace.image_from_page(
52+
page, page_id, feature_filter='binarized')
3153
if self.parameter['level-of-operation'] == 'page':
32-
log.info("About to binarize page '%s'", pcgts.pcGtsId)
33-
image = self.workspace.resolve_image_as_pil(image_url)
34-
bin_image = kraken.binarization.nlbin(image)
35-
bin_image_bytes = io.BytesIO()
36-
bin_image.save(bin_image_bytes, format='PNG')
37-
ID = concat_padded(self.output_file_grp, n)
38-
self.workspace.add_file(
54+
log.info("Binarizing page '%s'", page_id)
55+
bin_image = kraken.binarization.nlbin(page_image)
56+
file_path = self.workspace.save_image_file(
57+
bin_image, file_id + '.IMG-BIN',
3958
self.output_file_grp,
40-
pageId=input_file.pageId,
41-
ID=ID,
42-
mimetype='image/png',
43-
local_filename="%s/%s" % (self.output_file_grp, ID),
44-
content=bin_image_bytes.getvalue())
59+
page_id=input_file.pageId)
60+
page.add_AlternativeImage(AlternativeImageType(
61+
filename=file_path,
62+
comments=page_coords['features'] + ',binarized'))
4563
else:
46-
for region in pcgts.get_Page().get_TextRegion():
47-
if self.parameter['level-of-operation'] == 'block':
48-
log.info("About to binarize region '%s'", region.id)
49-
image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(region.get_Coords().points))
64+
for region in page.get_AllRegions(classes=['Text']):
65+
region_image, region_coords = self.workspace.image_from_segment(
66+
region, page_image, page_coords, feature_filter='binarized')
67+
if self.parameter['level-of-operation'] == 'region':
68+
log.info("Binarizing region '%s'", region.id)
69+
bin_image = kraken.binarization.nlbin(region_image)
70+
file_path = self.workspace.save_image_file(
71+
bin_image, file_id + '_' + region.id + '.IMG-BIN',
72+
self.output_file_grp,
73+
page_id=input_file.pageId)
74+
region.add_AlternativeImage(AlternativeImageType(
75+
filename=file_path,
76+
comments=region_coords['features'] + ',binarized'))
5077
else:
51-
textlines = region.get_TextLine()
52-
log.info("About to binarize %i lines of region '%s'", len(textlines), region.id)
53-
for (line_no, line) in enumerate(textlines):
54-
log.debug("Binarizing line '%s' in region '%s'", line_no, region.id)
55-
image = self.workspace.resolve_image_as_pil(image_url, polygon_from_points(line.get_Coords().points))
56-
bin_image = kraken.binarization.nlbin(image)
57-
bin_image_bytes = io.BytesIO()
58-
bin_image.save(bin_image_bytes, format='PNG')
59-
ID = concat_padded(self.output_file_grp, n, region.id, line_no)
60-
self.workspace.add_file(
78+
for line in region.get_TextLine():
79+
line_image, line_coords = self.workspace.image_from_segment(
80+
line, region_image, region_coords, feature_filter='binarized')
81+
log.info("Binarizing line '%s'", line.id)
82+
bin_image = kraken.binarization.nlbin(line_image)
83+
file_path = self.workspace.save_image_file(
84+
bin_image, file_id + '_' + region.id + '_' + line.id + '.IMG-BIN',
6185
self.output_file_grp,
62-
pageId=input_file.pageId,
63-
ID=ID,
64-
local_filename="%s/%s" % (self.output_file_grp, ID),
65-
mimetype='image/png',
66-
content=bin_image_bytes.getvalue())
86+
page_id=input_file.pageId)
87+
line.add_AlternativeImage(AlternativeImageType(
88+
filename=file_path,
89+
comments=line_coords['features'] + ',binarized'))
90+
# update METS (add the PAGE file):
91+
file_path = os.path.join(self.output_file_grp, file_id + '.xml')
92+
pcgts.set_pcGtsId(file_id)
93+
out = self.workspace.add_file(
94+
ID=file_id,
95+
file_grp=self.output_file_grp,
96+
pageId=input_file.pageId,
97+
local_filename=file_path,
98+
mimetype=MIMETYPE_PAGE,
99+
content=to_xml(pcgts))

ocrd_kraken/config.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
import json
2-
from pkg_resources import resource_filename
2+
from ocrd_utils import resource_filename
33

4-
with open(resource_filename(__name__, 'ocrd-tool.json'), 'r', encoding='utf-8') as f:
4+
with open(resource_filename('ocrd_kraken', 'ocrd-tool.json'), 'r', encoding='utf-8') as f:
55
OCRD_TOOL = json.load(f)

ocrd_kraken/ocrd-tool.json

+31-7
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"git_url": "https://github.com/OCR-D/ocrd_kraken",
3-
"version": "0.3.1",
3+
"version": "0.4.0",
44
"tools": {
55
"ocrd-kraken-binarize": {
66
"executable": "ocrd-kraken-binarize",
@@ -15,10 +15,10 @@
1515
"description": "Binarize images with kraken",
1616
"parameters": {
1717
"level-of-operation": {
18-
"description": "level-of-operation",
18+
"description": "segment hierarchy level to operate on",
1919
"type": "string",
2020
"default": "page",
21-
"enum": ["page", "block", "line"]
21+
"enum": ["page", "region", "line"]
2222
}
2323
}
2424
},
@@ -30,10 +30,22 @@
3030
"Layout analysis"
3131
],
3232
"steps": [
33-
"layout/segmentation/region"
33+
"layout/segmentation/region",
34+
"layout/segmentation/line"
3435
],
35-
"description": "Block segmentation with kraken",
36+
"description": "Layout segmentation with Kraken",
3637
"parameters": {
38+
"level-of-operation": {
39+
"description": "segment hierarchy level to operate on (page into regions+lines, or regions into lines)",
40+
"type": "string",
41+
"default": "page",
42+
"enum": ["page", "table", "region"]
43+
},
44+
"overwrite_segments": {
45+
"description": "remove any existing regions/lines",
46+
"type": "boolean",
47+
"default": false
48+
},
3749
"text_direction": {
3850
"type": "string",
3951
"description": "Sets principal text direction",
@@ -103,7 +115,14 @@
103115
"size": 5047020,
104116
"name": "blla.mlmodel",
105117
"parameter_usage": "without-extension",
106-
"description": "Pretrained baseline segmentation model"
118+
"description": "Pretrained region+baseline segmentation model (trained on handwriting)"
119+
},
120+
{
121+
"url": "https://ub-backup.bib.uni-mannheim.de/~stweil/tesstrain/kraken/ubma_segmentation/ubma_segmentation.mlmodel",
122+
"size": 5047020,
123+
"name": "ubma_segmentation.mlmodel",
124+
"parameter_usage": "without-extension",
125+
"description": "region+baseline segmentation model trained by UBMA (on print)"
107126
}
108127
]
109128
},
@@ -113,8 +132,13 @@
113132
"output_file_grp": ["OCR-D-OCR-KRAK"],
114133
"categories": ["Text recognition and optimization"],
115134
"steps": ["recognition/text-recognition"],
116-
"description": "OCR with kraken",
135+
"description": "Text recognition with Kraken",
117136
"parameters": {
137+
"overwrite_text": {
138+
"description": "remove any existing TextEquiv",
139+
"type": "boolean",
140+
"default": false
141+
},
118142
"model": {
119143
"description": "OCR model to recognize with",
120144
"type": "string",

0 commit comments

Comments
 (0)