Skip to content

Commit d0d20dc

Browse files
authored
Merge pull request #67 from OCR-D/project-parent
improve repair and project processors
2 parents 3993139 + 3d9e0d6 commit d0d20dc

File tree

4 files changed

+71
-61
lines changed

4 files changed

+71
-61
lines changed

CHANGELOG.md

+11
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,17 @@ Versioned according to [Semantic Versioning](http://semver.org/).
44

55
## [Unreleased]
66

7+
### Fixed
8+
9+
* repair/project: adapt to Shapely deprecations
10+
* repair/project: more robust `join_polygons`, `make_intersection`, `make_valid`
11+
12+
### Changed
13+
14+
* :fire: require Shapely 2
15+
* project: clip coords to parent's parent instead of parent
16+
* repair (`sanitize`): shrink before attempting repair (hierarchical consistency)
17+
718
## [0.1.22] - 2023-06-29
819

920
### Added

ocrd_segment/project.py

+44-30
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55
import numpy as np
66
from scipy.sparse.csgraph import minimum_spanning_tree
77
from shapely.geometry import Polygon, LineString
8+
from shapely.geometry.polygon import orient
9+
from shapely import set_precision
810
from shapely.ops import unary_union, nearest_points
911

1012
from ocrd import Processor
@@ -120,22 +122,22 @@ def process(self):
120122
content=to_xml(pcgts))
121123

122124
def _process_segment(self, segment, constituents, page_id):
123-
"""Shrink segment outline to become the minimal convex hull of its constituent segments."""
125+
"""Overwrite segment outline to become the minimal convex hull of its constituent segments."""
124126
LOG = getLogger('processor.ProjectHull')
125127
polygons = [make_valid(Polygon(polygon_from_points(constituent.get_Coords().points)))
126128
for constituent in constituents]
127129
polygon = join_polygons(polygons).buffer(self.parameter['padding']).exterior.coords[:-1]
130+
# make sure the segment still fits into its parent's parent
128131
if isinstance(segment, PageType):
129-
oldborder = segment.Border
130-
segment.Border = None # ensure interim parent is the page frame itself
131-
# make sure the segment still fits into its own parent
132-
polygon2 = polygon_for_parent(polygon, segment)
133-
if polygon2 is None:
132+
# ensure interim parent is the page frame itself
133+
parent = PageType(**segment.__dict__)
134+
parent.Border = None
135+
else:
136+
parent = segment.parent_object_
137+
polygon = polygon_for_parent(polygon, parent)
138+
if polygon is None:
134139
LOG.info('Ignoring extant segment: %s', segment.id)
135-
if isinstance(segment, PageType):
136-
segment.Border = oldborder
137140
else:
138-
polygon = polygon2
139141
points = points_from_polygon(polygon)
140142
coords = CoordsType(points=points)
141143
LOG.debug('Using new coordinates from %d constituents for segment "%s"',
@@ -152,11 +154,13 @@ def pairwise(iterable):
152154

153155
def join_polygons(polygons, scale=20):
154156
"""construct concave hull (alpha shape) from input polygons by connecting their pairwise nearest points"""
155-
# ensure input polygons are simply typed
156-
polygons = list(itertools.chain.from_iterable([
157-
poly.geoms if poly.type in ['MultiPolygon', 'GeometryCollection']
158-
else [poly]
159-
for poly in polygons]))
157+
# ensure input polygons are simply typed and all oriented equally
158+
polygons = [orient(poly)
159+
for poly in itertools.chain.from_iterable(
160+
[poly.geoms
161+
if poly.geom_type in ['MultiPolygon', 'GeometryCollection']
162+
else [poly]
163+
for poly in polygons])]
160164
npoly = len(polygons)
161165
if npoly == 1:
162166
return polygons[0]
@@ -175,16 +179,18 @@ def join_polygons(polygons, scale=20):
175179
prevp = polygons[prevp]
176180
nextp = polygons[nextp]
177181
nearest = nearest_points(prevp, nextp)
178-
bridgep = LineString(nearest).buffer(max(1, scale/5), resolution=1)
182+
bridgep = orient(LineString(nearest).buffer(max(1, scale/5), resolution=1), -1)
179183
polygons.append(bridgep)
180184
jointp = unary_union(polygons)
181-
assert jointp.type == 'Polygon', jointp.wkt
182-
if jointp.minimum_clearance < 1.0:
183-
# follow-up calculations will necessarily be integer;
184-
# so anticipate rounding here and then ensure validity
185-
jointp = Polygon(np.round(jointp.exterior.coords))
186-
jointp = make_valid(jointp)
187-
return jointp
185+
assert jointp.geom_type == 'Polygon', jointp.wkt
186+
# follow-up calculations will necessarily be integer;
187+
# so anticipate rounding here and then ensure validity
188+
jointp2 = set_precision(jointp, 1.0)
189+
if jointp2.geom_type != 'Polygon' or not jointp2.is_valid:
190+
jointp2 = Polygon(np.round(jointp.exterior.coords))
191+
jointp2 = make_valid(jointp2)
192+
assert jointp2.geom_type == 'Polygon', jointp2.wkt
193+
return jointp2
188194

189195
def polygon_for_parent(polygon, parent):
190196
"""Clip polygon to parent polygon range.
@@ -227,30 +233,38 @@ def make_intersection(poly1, poly2):
227233
# post-process
228234
if interp.is_empty or interp.area == 0.0:
229235
return None
230-
if interp.type == 'GeometryCollection':
236+
if interp.geom_type == 'GeometryCollection':
231237
# heterogeneous result: filter zero-area shapes (LineString, Point)
232238
interp = unary_union([geom for geom in interp.geoms if geom.area > 0])
233-
if interp.type == 'MultiPolygon':
239+
if interp.geom_type == 'MultiPolygon':
234240
# homogeneous result: construct convex hull to connect
235241
interp = join_polygons(interp.geoms)
236-
if interp.minimum_clearance < 1.0:
237-
# follow-up calculations will necessarily be integer;
238-
# so anticipate rounding here and then ensure validity
239-
interp = Polygon(np.round(interp.exterior.coords))
240-
interp = make_valid(interp)
242+
# follow-up calculations will necessarily be integer;
243+
# so anticipate rounding here and then ensure validity
244+
interp = set_precision(interp, 1.0)
241245
return interp
242246

243247
def make_valid(polygon):
    """Return a valid version of a shapely Polygon.

    Three repair strategies are tried in order, each one stopping as soon
    as the polygon reports ``is_valid``:

    1. re-arrange the exterior ring so that it starts at a different vertex,
    2. simplify with an increasing tolerance,
    3. enlarge (buffer) with an increasing distance.

    Args:
        polygon: the ``shapely.geometry.Polygon`` to repair; only its
            exterior ring is read.

    Returns:
        The (possibly replaced) polygon object.

    Raises:
        AssertionError: if the result is still invalid after all three
            strategies (the message carries the WKT of the geometry).
    """
    ring = list(polygon.exterior.coords)

    # Strategy 1: rotate the starting vertex of the ring.
    for shift in range(1, len(ring)):
        if polygon.is_valid:
            break
        if polygon.simplify(polygon.area).is_valid:
            # simplification can succeed from this ordering, so stop rotating
            break
        # simplification may not be possible (at all) due to ordering;
        # in that case, try another starting point
        polygon = Polygon(ring[-shift:] + ring[:-shift])

    # Strategy 2: simplify; the upper bound is fixed once, before looping.
    simplify_steps = int(polygon.area + 1.5)
    for step in range(1, simplify_steps + 1):
        if polygon.is_valid:
            break
        # simplification may require a larger tolerance
        polygon = polygon.simplify(step)

    # Strategy 3: enlarge; again the bound is computed once up front.
    buffer_limit = int(polygon.area + 2.5)
    for distance in range(1, buffer_limit):
        if polygon.is_valid:
            break
        # enlargement may require a larger distance
        polygon = polygon.buffer(distance)

    assert polygon.is_valid, polygon.wkt
    return polygon

ocrd_segment/repair.py

+15-30
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
PageValidator
4242
)
4343
from .config import OCRD_TOOL
44-
from .project import join_polygons
44+
from .project import join_polygons, make_valid
4545

4646
TOOL = 'ocrd-segment-repair'
4747

@@ -115,6 +115,14 @@ def process(self):
115115
pcgts.set_pcGtsId(file_id)
116116
page = pcgts.get_Page()
117117

118+
# shrink/expand text regions to the hull of their text lines
119+
if sanitize:
120+
page_image, page_coords, _ = self.workspace.image_from_page(
121+
page, page_id,
122+
feature_selector='binarized',
123+
feature_filter='clipped')
124+
shrink_regions(page_image, page_coords, page, page_id,
125+
padding=self.parameter['sanitize_padding'])
118126
#
119127
# validate segmentation (warn of children extending beyond their parents)
120128
#
@@ -180,14 +188,6 @@ def process(self):
180188
# delete/merge/split redundant text regions (or its text lines)
181189
if plausibilize:
182190
self.plausibilize_page(page, page_id)
183-
# shrink/expand text regions to the hull of their text lines
184-
if sanitize:
185-
page_image, page_coords, _ = self.workspace.image_from_page(
186-
page, page_id,
187-
feature_selector='binarized',
188-
feature_filter='clipped')
189-
shrink_regions(page_image, page_coords, page, page_id,
190-
padding=self.parameter['sanitize_padding'])
191191

192192
self.workspace.add_file(
193193
ID=file_id,
@@ -482,7 +482,7 @@ def _plausibilize_segments(segpolys, rogroup, marked_for_deletion, marked_for_me
482482
_tag_name(otherseg), otherseg.id)
483483
otherpoly = make_valid(Polygon(polygon_from_points(otherseg.get_Coords().points)))
484484
poly = poly.difference(otherpoly)
485-
if poly.type == 'MultiPolygon':
485+
if poly.geom_type == 'MultiPolygon':
486486
poly = join_polygons(poly.geoms)
487487
if poly.minimum_clearance < 1.0:
488488
poly = Polygon(np.round(poly.exterior.coords))
@@ -556,8 +556,8 @@ def shrink_regions(page_image, page_coords, page, page_id, padding=0):
556556
continue
557557
# pick contour and convert to absolute:
558558
region_polygon = join_polygons([make_valid(Polygon(contour[:, 0, ::]))
559-
for contour in contours
560-
if len(contour) >= 3], scale=scale)
559+
for area, contour in zip(areas, contours)
560+
if len(contour) >= 3 and area > 0], scale=scale)
561561
if padding:
562562
region_polygon = region_polygon.buffer(padding)
563563
region_polygon = coordinates_for_segment(region_polygon.exterior.coords[:-1], page_image, page_coords)
@@ -599,7 +599,7 @@ def simplify(segment, tolerance=0):
599599

600600
def merge_poly(poly1, poly2):
601601
poly = poly1.union(poly2)
602-
if poly.type == 'MultiPolygon':
602+
if poly.geom_type == 'MultiPolygon':
603603
#poly = poly.convex_hull
604604
poly = join_polygons(poly.geoms)
605605
if poly.minimum_clearance < 1.0:
@@ -611,10 +611,10 @@ def clip_poly(poly1, poly2):
611611
poly = poly1.intersection(poly2)
612612
if poly.is_empty or poly.area == 0.0:
613613
return None
614-
if poly.type == 'GeometryCollection':
614+
if poly.geom_type == 'GeometryCollection':
615615
# heterogeneous result: filter zero-area shapes (LineString, Point)
616616
poly = unary_union([geom for geom in poly.geoms if geom.area > 0])
617-
if poly.type == 'MultiPolygon':
617+
if poly.geom_type == 'MultiPolygon':
618618
# homogeneous result: construct convex hull to connect
619619
#poly = poly.convex_hull
620620
poly = join_polygons(poly.geoms)
@@ -719,20 +719,5 @@ def ensure_valid(element):
719719
points = points_from_polygon(polygon)
720720
coords.set_points(points)
721721

722-
def make_valid(polygon):
723-
"""Ensures shapely.geometry.Polygon object is valid by repeated simplification"""
724-
for split in range(1, len(polygon.exterior.coords)-1):
725-
if polygon.is_valid or polygon.simplify(polygon.area).is_valid:
726-
break
727-
# simplification may not be possible (at all) due to ordering
728-
# in that case, try another starting point
729-
polygon = Polygon(polygon.exterior.coords[-split:]+polygon.exterior.coords[:-split])
730-
for tolerance in range(1, int(polygon.area)):
731-
if polygon.is_valid:
732-
break
733-
# simplification may require a larger tolerance
734-
polygon = polygon.simplify(tolerance)
735-
return polygon
736-
737722
def _tag_name(element):
738723
return element.__class__.__name__[0:-4]

requirements.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
ocrd >= 2.20.0
2-
shapely >= 1.7.1
2+
shapely >= 2.0
33
scikit-image
44
numpy
55
xlsxwriter

0 commit comments

Comments
 (0)