refactor: Major refactoring of functions to improve readability, efficiency and follow standard practices. (#139)

* refactor: Reorganize hocr functions - Use more jinja templating instead of hardcoding strings - Simplified bounding box function - Changed parameter name for `_get_hocr_bounding_box` to `page_dimension` for more clarity. * samples: Added sample for convert to hocr * refactor: Reordering of classes in page.py * refactor: Re-added refactoring to remove extra `get_*()` methods in page.py - Added in #110 Lost in Merge * fix: Moved `templates` directory into package. - Required for template to work in installed library * chore: Ran isort and black * chore: Ran no-implicit-optional * refactor: Refactored document.py - improve readability, follow python conventions, and improve efficiency - Also, fixed a previously unknown bug where `Document.search_pages()` returned inaccurate results because it only searched paragraph.text, not page.text * refactor: Refactor gcs_utilities for readability/pythonic style * refactor: Refactor page.py to improve efficiency, readability and follow python conventions * refactor: Rename `Entity.documentai_entity` to `Entity.documentai_object` to match the page.py file * refactor: Move bounding box extraction to `docai_utilities.py` * refactor: Major Refactoring of converter_helpers.py to simplify/organize functions, reduce complexity, and increase readability * fix: Fixed refactor of export_images in document.py * refactor: Cleanup of blocks.py using `getattr()` * refactor: Refactoring of bbox_conversion.py to improve readability and efficiency * fix: Change _get_files() to send full gcs uri to _get_bytes() - Also reduce wait_time in tests * refactor: Move `converter_helpers.py` functions into `converter.py` - `converter.py` only had one external facing function that called an internal function with the same parameters. - Not sure if there was a specific reason for this setup, can be undone if needed. 
* chore(deps): update dependency google-cloud-documentai to v2.16.1 (#138) * fix: Change _get_files() to send full gcs uri to _get_bytes() - Also reduce wait_time in tests * refactor: Move `converter_helpers.py` functions into `converter.py` - `converter.py` only had one external facing function that called an internal function with the same parameters. - Not sure if there was a specific reason for this setup, can be undone if needed. * chore: Reran black formatting after merge conflict * refactor: Minor refactoring of test_bbox_conversion.py to improve readability * refactor: Changed blocks.py to block.py for consistency. - Changed how `Block` is initialized. - Changed `load_blocks_from_schema` into a `@classmethod` to simplify imports. * fix: Added Missing type annotations to `document.py` * fix: Add new filename for block.py into test_bbox_conversion.py * 🦉 Updates from OwlBot post-processor See https://github.com/googleapis/repo-automation-bots/blob/main/packages/owl-bot/README.md * fix: Fix failing tests for Block class. Changed all fields to have types * fix: Changed `converter._get_bytes` to return a Tuple * chore: Addressed Code Review Comments - Removed FILES_TO_IGNORE - Simplification of logic in `_get_multiplier` `convert_bbox_to_docproto_bbox` - Addressed other lint errors - Adjusted function names to indicate not protected members. * fix: Remove extra reference to metadata_blob * fix: Change expected test output and remove references to `geometry` --------- Co-authored-by: Mend Renovate <bot@renovateapp.com> Co-authored-by: Owl Bot <gcf-owl-bot[bot]@users.noreply.github.com>
googleapis · Jul 20, 2023 · 82ac823 · 82ac823
1 parent fcf5dbd
commit 82ac823
Show file tree

Hide file tree

Showing 22 changed files with 1,596 additions and 1,770 deletions.
diff --git a/google/cloud/documentai_toolbox/__init__.py b/google/cloud/documentai_toolbox/__init__.py
@@ -18,18 +18,8 @@
 
 __version__ = package_version.__version__
 
-from .wrappers import (
-    document,
-    page,
-    entity,
-)
+from .converters import converter
+from .utilities import docai_utilities, gcs_utilities
+from .wrappers import document, entity, page
 
-from .converters import (
-    converter,
-)
-
-from .utilities import (
-    gcs_utilities,
-)
-
-__all__ = (document, page, entity, converter, gcs_utilities)
+__all__ = (document, page, entity, converter, docai_utilities, gcs_utilities)
diff --git a/google/cloud/documentai_toolbox/constants.py b/google/cloud/documentai_toolbox/constants.py
@@ -15,13 +15,17 @@
 #
 
 from typing import Union
+
 from google.cloud.documentai import Document
 
 USER_AGENT_PRODUCT = "documentai-toolbox"
 
 JSON_EXTENSION = ".json"
 JSON_MIMETYPE = "application/json"
 
+PDF_EXTENSION = ".pdf"
+PDF_MIMETYPE = "application/pdf"
+
 FILE_CHECK_REGEX = r"(.*[.].*$)"
 
 # https://cloud.google.com/document-ai/quotas#content_limits
@@ -32,7 +36,7 @@
 
 # https://cloud.google.com/document-ai/docs/file-types
 VALID_MIME_TYPES = {
-    "application/pdf",
+    PDF_MIMETYPE,
     "image/bmp",
     "image/gif",
     "image/jpeg",

diff --git a/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py b/google/cloud/documentai_toolbox/converters/config/bbox_conversion.py
@@ -14,15 +14,23 @@
 # limitations under the License.
 #
 
-from typing import Callable
-from intervaltree import intervaltree
+from typing import Callable, List, Optional
 
 from google.cloud import documentai
-from google.cloud.documentai_v1.types import geometry
+from google.cloud.documentai_toolbox.converters.config.block import Block
+
+from intervaltree import intervaltree
+
+
+PIXEL_CONVERSION_RATES = {
+    "pxl": 1,
+    "inch": 96,
+    "cm": 37.795,
+}
 
 
 def _midpoint_in_bpoly(
-    box_a: geometry.BoundingPoly, box_b: geometry.BoundingPoly
+    box_a: documentai.BoundingPoly, box_b: documentai.BoundingPoly
 ) -> bool:
     """Returns whether the midpoint in box_a is inside box_b."""
 
@@ -43,7 +51,6 @@ def _merge_text_anchors(
     text_anchor_2: documentai.Document.TextAnchor,
 ) -> documentai.Document.TextAnchor:
     """Merges two TextAnchor objects into one ascending sorted TextAnchor."""
-    merged_text_anchor = documentai.Document.TextAnchor()
     intervals = []
     for text_segment in text_anchor_1.text_segments:
         intervals.append(
@@ -54,21 +61,20 @@ def _merge_text_anchors(
             intervaltree.Interval(text_segment.start_index, text_segment.end_index)
         )
 
-    interval_tree = intervaltree.IntervalTree(intervals)
-    interval_tree.merge_overlaps(strict=False)
-    ts = []
-    for iv in sorted(interval_tree):
-        ts.append(
-            documentai.Document.TextAnchor.TextSegment(
-                start_index=iv.begin, end_index=iv.end
-            )
+    merged_tree = intervaltree.IntervalTree(intervals)
+    merged_tree.merge_overlaps(strict=False)
+
+    merged_text_segments = [
+        documentai.Document.TextAnchor.TextSegment(
+            start_index=iv.begin, end_index=iv.end
         )
+        for iv in sorted(merged_tree)
+    ]
 
-    merged_text_anchor.text_segments = ts
-    return merged_text_anchor
+    return documentai.Document.TextAnchor(text_segments=merged_text_segments)
 
 
-def _get_text_anchor_in_bbox(
+def get_text_anchor_in_bbox(
     bbox: documentai.BoundingPoly,
     page: documentai.Document.Page,
     token_in_bounding_box_function: Callable[
@@ -84,19 +90,19 @@ def _get_text_anchor_in_bbox(
     return text_anchor
 
 
-def _get_norm_x_max(bbox: geometry.BoundingPoly) -> float:
+def _get_norm_x_max(bbox: documentai.BoundingPoly) -> float:
     return max([vertex.x for vertex in bbox.normalized_vertices])
 
 
-def _get_norm_x_min(bbox: geometry.BoundingPoly) -> float:
+def _get_norm_x_min(bbox: documentai.BoundingPoly) -> float:
     return min([vertex.x for vertex in bbox.normalized_vertices])
 
 
-def _get_norm_y_max(bbox: geometry.BoundingPoly) -> float:
+def _get_norm_y_max(bbox: documentai.BoundingPoly) -> float:
     return max([vertex.y for vertex in bbox.normalized_vertices])
 
 
-def _get_norm_y_min(bbox: geometry.BoundingPoly) -> float:
+def _get_norm_y_min(bbox: documentai.BoundingPoly) -> float:
     return min([vertex.y for vertex in bbox.normalized_vertices])
 
 
@@ -109,7 +115,11 @@ def _convert_to_pixels(x: float, conversion_rate: float) -> float:
 
 
 def _convert_bbox_units(
-    coordinate, input_bbox_units, width=None, height=None, multiplier=1
+    coordinate: float,
+    input_bbox_units: str,
+    width: Optional[float] = None,
+    height: Optional[float] = None,
+    multiplier: float = 1.0,
 ) -> float:
     r"""Returns a converted coordinate.
 
@@ -130,27 +140,14 @@ def _convert_bbox_units(
             A converted coordinate.
 
     """
-    final_coordinate = coordinate
-    if input_bbox_units != "normalized":
-        if input_bbox_units == "pxl":
-            if width is None:
-                final_coordinate = _normalize_coordinates(coordinate, height)
-            else:
-                final_coordinate = _normalize_coordinates(coordinate, width)
-        if input_bbox_units == "inch":
-            x = _convert_to_pixels(coordinate, 96)
-            if width is None:
-                final_coordinate = _normalize_coordinates(x, height)
-            else:
-                final_coordinate = _normalize_coordinates(x, width)
-        if input_bbox_units == "cm":
-            x = _convert_to_pixels(coordinate, 37.795)
-            if width is None:
-                final_coordinate = _normalize_coordinates(x, height)
-            else:
-                final_coordinate = _normalize_coordinates(x, width)
-
-    return final_coordinate * multiplier
+
+    if input_bbox_units == "normalized":
+        return coordinate * multiplier
+
+    x = _convert_to_pixels(coordinate, PIXEL_CONVERSION_RATES.get(input_bbox_units, 1))
+    y = width or height
+
+    return _normalize_coordinates(x, y) * multiplier
 
 
 def _get_multiplier(
@@ -170,37 +167,29 @@ def _get_multiplier(
             multiplier to use when converting bounding boxes.
 
     """
-    if input_bbox_units == "inch":
-        converted = _convert_to_pixels(external_coordinate, 96)
-        return docproto_coordinate / converted
-    elif input_bbox_units == "cm":
-        converted = _convert_to_pixels(external_coordinate, 37.795)
-        return docproto_coordinate / converted
-    else:
-        return docproto_coordinate / external_coordinate
+    converted_coordinate = _convert_to_pixels(
+        external_coordinate, PIXEL_CONVERSION_RATES.get(input_bbox_units, 1)
+    )
+    return docproto_coordinate / converted_coordinate
 
 
-def _convert_bbox_to_docproto_bbox(block) -> geometry.BoundingPoly:
+def convert_bbox_to_docproto_bbox(block: Block) -> documentai.BoundingPoly:
     r"""Returns a converted bounding box from Block.
 
     Args:
         block (Block):
             Required.
     Returns:
-        geometry.BoundingPoly:
-            A geometry.BoundingPoly from bounding box.
+        documentai.BoundingPoly:
+            A documentai.BoundingPoly from bounding box.
 
     """
-    merged_bbox = geometry.BoundingPoly()
-    x_multiplier = 1
-    y_multiplier = 1
-    coordinates = []
-    nv = []
+    if block.bounding_box == []:
+        return documentai.BoundingPoly()
 
-    # _convert_bbox_units should check if external_bbox is list or not
-    coordinates_object = block.bounding_box
-    if coordinates_object == []:
-        return coordinates_object
+    x_multiplier = 1.0
+    y_multiplier = 1.0
+    normalized_vertices: List[documentai.NormalizedVertex] = []
 
     if block.page_width and block.page_height:
         x_multiplier = _get_multiplier(
@@ -216,9 +205,8 @@ def _convert_bbox_to_docproto_bbox(block) -> geometry.BoundingPoly:
 
     if block.bounding_type == "1":
         # Type 1 : bounding box has 4 (x,y) coordinates
-
-        if type(block.bounding_box) == list:
-            for coordinate in coordinates_object:
+        if isinstance(block.bounding_box, list):
+            for coordinate in block.bounding_box:
                 x = _convert_bbox_units(
                     coordinate[f"{block.bounding_x}"],
                     input_bbox_units=block.bounding_unit,
@@ -232,44 +220,36 @@ def _convert_bbox_to_docproto_bbox(block) -> geometry.BoundingPoly:
                     multiplier=y_multiplier,
                 )
 
-                coordinates.append({"x": x, "y": y})
-
-            coordinates_object = coordinates
+                normalized_vertices.append(documentai.NormalizedVertex(x=x, y=y))
 
     elif block.bounding_type == "2":
         # Type 2 : bounding box has 1 (x,y) coordinates for the top left corner
         #          and (width, height)
-        original_x = coordinates_object[f"{block.bounding_x}"]
-        original_y = coordinates_object[f"{block.bounding_y}"]
-
-        x = _convert_bbox_units(
-            original_x,
+        x_min = _convert_bbox_units(
+            block.bounding_box[f"{block.bounding_x}"],
             input_bbox_units=block.bounding_unit,
             width=block.page_width,
             multiplier=x_multiplier,
         )
-        y = _convert_bbox_units(
-            original_y,
+        y_min = _convert_bbox_units(
+            block.bounding_box[f"{block.bounding_y}"],
             input_bbox_units=block.bounding_unit,
             width=block.page_height,
             multiplier=y_multiplier,
         )
-
-        # x_min_y_min
-        coordinates.append({"x": x, "y": y})
-        # x_max_y_min
-        coordinates.append({"x": (x + block.bounding_width), "y": y})
-        # x_max_y_max
-        coordinates.append(
-            {"x": (x + block.bounding_width), "y": (y + block.bounding_height)}
+        x_max = x_min + block.bounding_width
+        y_max = y_min + block.bounding_height
+        normalized_vertices.extend(
+            [
+                documentai.NormalizedVertex(x=x_min, y=y_min),
+                documentai.NormalizedVertex(x=x_max, y=y_min),
+                documentai.NormalizedVertex(x=x_max, y=y_max),
+                documentai.NormalizedVertex(x=x_min, y=y_max),
+            ]
         )
-        # x_min_y_max
-        coordinates.append({"x": x, "y": (y + block.bounding_height)})
 
-        coordinates_object = coordinates
     elif block.bounding_type == "3":
-        # Type 2 : bounding box has 1 (x,y) coordinates for the top left corner
-        #          and (width, height)
+        #   Type 3 : bounding_box: [x1, y1, x2, y2, x3, y3, x4, y4]
         for idx in range(0, len(block.bounding_box), 2):
             x = _convert_bbox_units(
                 block.bounding_box[idx],
@@ -283,14 +263,6 @@ def _convert_bbox_to_docproto_bbox(block) -> geometry.BoundingPoly:
                 width=block.docproto_height,
                 multiplier=y_multiplier,
             )
+            normalized_vertices.append(documentai.NormalizedVertex(x=x, y=y))
 
-            coordinates.append({"x": x, "y": y})
-
-        coordinates_object = coordinates
-
-    for coordinates in coordinates_object:
-        nv.append(documentai.NormalizedVertex(x=coordinates["x"], y=coordinates["y"]))
-
-    merged_bbox.normalized_vertices = nv
-
-    return merged_bbox
+    return documentai.BoundingPoly(normalized_vertices=normalized_vertices)