fix: Add support for Classifier entities (#333)
- Added support for Custom Classifier entities
- Added Unit Tests for Classifier output
- Added input validation for `Document.split_pdf()`

Fixes #332 🦕
holtskinner authored Jul 16, 2024
1 parent dbf26f2 commit 2352cae
Showing 5 changed files with 84 additions and 18 deletions.
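
For context, a minimal sketch of what this commit enables (the local JSON path is hypothetical; the wrapper calls mirror the unit tests below):

from google.cloud.documentai_toolbox import document

# Hypothetical path to a saved Classifier processor output shard.
doc = document.Document.from_document_path(document_path="classifier_output.json")

# Classifier entities carry only a type, confidence, and id (no page
# anchors), so mention_text/start_page/end_page now stay None.
for wrapped_entity in doc.entities:
    print(wrapped_entity.type_, round(wrapped_entity.documentai_object.confidence, 4))

# Previously this raised AttributeError (#332); now it simply works.
print(repr(doc.entities))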
4 changes: 3 additions & 1 deletion google/cloud/documentai_toolbox/wrappers/document.py
@@ -765,7 +765,7 @@ def entities_to_bigquery(
         )
 
     def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
-        r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
+        r"""Splits local PDF file into multiple PDF files based on output from a Splitter processor.
 
         Args:
             pdf_path (str):
@@ -776,6 +776,8 @@ def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
             List[str]:
                 A list of output pdf files.
         """
+        if self.entities[0].start_page is None or self.entities[0].end_page is None:
+            raise ValueError("Entities do not contain start or end pages.")
         output_files: List[str] = []
         input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
         with Pdf.open(pdf_path) as pdf:
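
A short usage sketch of the new guard (the file paths here are hypothetical; the error string matches the unit test added below). Calling split_pdf() on a Document whose entities lack page ranges, such as Classifier output, now fails fast:

from google.cloud.documentai_toolbox import document

# Hypothetical local shard from a Classifier processor.
doc = document.Document.from_document_path(document_path="classifier_output.json")

try:
    doc.split_pdf(pdf_path="input.pdf", output_path="out/")
except ValueError as err:
    print(err)  # Entities do not contain start or end pages.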
35 changes: 18 additions & 17 deletions google/cloud/documentai_toolbox/wrappers/entity.py
@@ -40,45 +40,46 @@ class Entity:
             Required. Entity type from a schema e.g. "Address".
         mention_text (str):
             Optional. Text value in the document e.g. "1600 Amphitheatre Pkwy".
-            If the entity is not present in
-            the document, this field will be empty.
+            Only populated for Extraction processors.
         normalized_text (str):
             Optional. Normalized text value in the document e.g. "1970-01-01".
-            If the entity is not present in
-            the document, this field will be empty.
+            Only populated for Extraction processors.
         start_page (int):
-            Required. `Page` containing the `Entity` or the first page of the
-            classification (for Splitter/Classifier processors).
+            Optional. `Page` containing the `Entity` for Extraction processors or the first page of the
+            subdocument for Splitter processors.
         end_page (int):
-            Required. Last page of the classification
+            Optional. Last page of the subdocument for Splitter processors.
     """
 
     documentai_object: documentai.Document.Entity = dataclasses.field(repr=False)
     page_offset: dataclasses.InitVar[Optional[int]] = 0
 
     type_: str = dataclasses.field(init=False)
-    mention_text: str = dataclasses.field(init=False, default="")
-    normalized_text: str = dataclasses.field(init=False, default="")
+    mention_text: Optional[str] = dataclasses.field(init=False, default=None)
+    normalized_text: Optional[str] = dataclasses.field(init=False, default=None)
 
-    start_page: int = dataclasses.field(init=False)
-    # Only Populated for Splitter/Classifier Output
-    end_page: int = dataclasses.field(init=False)
+    start_page: Optional[int] = dataclasses.field(init=False, default=None)
+    end_page: Optional[int] = dataclasses.field(init=False, default=None)
 
     _image: Optional[Image.Image] = dataclasses.field(init=False, default=None)
 
     def __post_init__(self, page_offset: int) -> None:
         self.type_ = self.documentai_object.type_
-        self.mention_text = self.documentai_object.mention_text
+
+        if self.documentai_object.mention_text:
+            self.mention_text = self.documentai_object.mention_text
 
         if (
             self.documentai_object.normalized_value
             and self.documentai_object.normalized_value.text
         ):
             self.normalized_text = self.documentai_object.normalized_value.text
 
-        page_refs = self.documentai_object.page_anchor.page_refs
-        if page_refs:
-            self.start_page = int(page_refs[0].page) + page_offset
-            self.end_page = int(page_refs[-1].page) + page_offset
+        if self.documentai_object.page_anchor:
+            page_refs = self.documentai_object.page_anchor.page_refs
+            if page_refs:
+                self.start_page = int(page_refs[0].page) + page_offset
+                self.end_page = int(page_refs[-1].page) + page_offset
 
     def crop_image(
         self, documentai_page: documentai.Document.Page
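
The underlying bug is a plain dataclasses pitfall, shown here as a standalone sketch with no Document AI types: a field declared init=False without a default never exists on the instance unless __post_init__ assigns it, so the generated repr() raises AttributeError. That is what classifier entities, which have no page_refs, triggered in #332.

import dataclasses
from typing import Optional


@dataclasses.dataclass
class Before:
    # init=False and no default: the attribute exists only if assigned later.
    start_page: int = dataclasses.field(init=False)


@dataclasses.dataclass
class After:
    # The fix: a None default keeps attribute access and repr() safe.
    start_page: Optional[int] = dataclasses.field(init=False, default=None)


try:
    repr(Before())
except AttributeError as err:
    print(err)  # 'Before' object has no attribute 'start_page'

print(repr(After()))  # After(start_page=None)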
1 change: 1 addition & 0 deletions tests/unit/resources/classifier/… (new test resource; exact filename not shown)
@@ -0,0 +1 @@
+{"text": "US010182182B2\n(12) United States Patent\nLewkow et al.\n(10) Patent No.: US 10,182,182 B2\n(45) Date of Patent: Jan. 15, 2019\n(54)\nIMAGE SENSOR HAVING MULTIPLE\nOUTPUT PORTS\n(71)\nApplicant: Google LLC, Mountain View, CA (US)\nH04N 7/0127 (2013.01); H04N_7/0806\n(2013.01); H04N 13/239 (2018.05); H04N\n13/254 (2018.05); H04N 13/271 (2018.05)\n(58) Field of Classification Search\nCPC G01S 17/08; H04N 5/2258; H04N 5/23229;\nH04N 13/0271; H04N 5/376; H04N\n5/3765; H04N 5/378; H04N 5/345\nSee application file for complete search history.\n(72)\nInventors: Roman Lewkow, San Jose, CA (US);\nChung Chun Wan, San Jose, CA (US)\n(73)\nAssignee: Google LLC, Mountain View, CA (US)\n(*)\nNotice:\n(56)\nReferences Cited\nSubject to any disclaimer, the term of this\npatent is extended or adjusted under 35\nU.S.C. 154(b) by 0 days.\nU.S. PATENT DOCUMENTS\n(21)\nAppl. No.: 15/831,925\n6,831,688 B2 * 12/2004 Lareau\nGO1J 3/02\n348/272\n(22)\nFiled:\nDec. 5, 2017\n7,247,393 B2\n7,936,038 B2\n7,990,636 B2\n8,027,107 B2\n7/2007 Hazel et al.\n5/2011 Jeong et al.\n8/2011 Park et al.\n9/2011 Hwang et al.\n(Continued)\n(65)\nPrior Publication Data\nFOREIGN PATENT DOCUMENTS\n(63)\nUS 2018/0097979 A1 Apr. 5, 2018\nRelated U.S. Application Data\nContinuation of application No. 15/476,165, filed on\nMar. 31, 2017, now Pat. No. 9,866,740, which is a\ncontinuation of application No. 14/580,025, filed on\nDec. 22, 2014, now Pat. No. 9,615,013.\nEP\n1478176\n11/2004\nOTHER PUBLICATIONS\n(51)\nPCT/US2015/062157-International Search Report & Written Opin-\nion, dated Mar. 8, 2016, 12 pages.\n(Continued)\nPrimary Examiner Nicholas G Giles\n(74) Attorney, Agent, or Firm - Fish & Richardson P.C.\nInt. Cl.\nH04N 5/225\n(2006.01)\nH04N 5/374\n(2011.01)\nH04N 5/378\n(2011.01)\nH04N 5/369\n(2011.01)\nH04N 5/232\n(2006.01)\nH04N 13/254 (2018.01)\nH04N 13/271\n(2018.01)\nH04N 7/01\n(2006.01)\nH04N 7/08\n(2006.01)\nH04N 13/239 (2018.01)\nU.S. Cl.\nCPC H04N 5/2258 (2013.01); H04N 5/23245\n(2013.01); H04N 5/3696 (2013.01); H04N\n5/378 (2013.01); H04N 5/3742 (2013.01);\n(57)\nABSTRACT\nAn apparatus is described that includes an image sensor\nhaving a first output port and a second output port. The first\noutput port is to transmit a first image stream concurrently\nwith a second image stream transmitted from the second\noutput port.\n(52)\n18 Claims, 10 Drawing Sheets\nImage Sensor\n410b\nFirst Image\nStream 401b\nImage Signal\nProcessing\nPipeline 407_1b\n1\n1\n2\n413_1b\n1\n2\n3\n5\n6\nSecond Image\nStream 402b\n413_2b\nImage Signal Processing\nPipeline 407_2b\ntime\n", "pages": [{"pageNumber": 1}], "entities": [{"type": "computer_vision", "confidence": 0.47925246, "id": "0"}, {"type": "crypto", "confidence": 0.0433604, "id": "1"}, {"type": "med_tech", "confidence": 0.26732057, "id": "2"}, {"type": "other", "confidence": 0.2100666, "id": "3"}]}
47 changes: 47 additions & 0 deletions tests/unit/test_document.py
@@ -79,6 +79,13 @@ def get_bytes_splitter_mock():
         yield byte_factory
 
 
+@pytest.fixture
+def get_bytes_classifier_mock():
+    with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory:
+        byte_factory.return_value = get_bytes("tests/unit/resources/classifier")
+        yield byte_factory
+
+
 @pytest.fixture
 def get_bytes_images_mock():
     with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory:
@@ -206,6 +213,30 @@ def test_entities_from_shards_with_hex_ids():
     assert actual[1].type_ == "class_international"
 
 
+def test_entities_from_shards_classifier(get_bytes_classifier_mock):
+    shards = document._get_shards(
+        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/"
+    )
+    get_bytes_classifier_mock.assert_called_once()
+
+    actual = document._entities_from_shards(shards=shards)
+
+    # Check for error reported in https://github.com/googleapis/python-documentai-toolbox/issues/332
+    assert repr(actual)
+    assert actual[0].type_ == "computer_vision"
+    assert round(actual[0].documentai_object.confidence, 8) == 0.47925246
+    assert actual[0].documentai_object.id == "0"
+    assert actual[1].type_ == "crypto"
+    assert round(actual[1].documentai_object.confidence, 8) == 0.0433604
+    assert actual[1].documentai_object.id == "1"
+    assert actual[2].type_ == "med_tech"
+    assert round(actual[2].documentai_object.confidence, 8) == 0.26732057
+    assert actual[2].documentai_object.id == "2"
+    assert actual[3].type_ == "other"
+    assert round(actual[3].documentai_object.confidence, 8) == 0.2100666
+    assert actual[3].documentai_object.id == "3"
+
+
 @mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai")
 def test_get_batch_process_metadata_with_valid_operation(
     mock_docai,
@@ -703,6 +734,22 @@ def test_split_pdf(mock_Pdf, get_bytes_splitter_mock):
     ]
 
 
+def test_split_pdf_with_non_splitter(get_bytes_classifier_mock):
+    doc = document.Document.from_gcs(
+        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
+    )
+
+    with pytest.raises(
+        ValueError,
+        match="Entities do not contain start or end pages.",
+    ):
+        doc.split_pdf(
+            pdf_path="procurement_multi_document.pdf", output_path="splitter/output/"
+        )
+
+    get_bytes_classifier_mock.assert_called_once()
+
+
 def test_convert_document_to_annotate_file_response():
     doc = document.Document.from_document_path(
         document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"
15 changes: 15 additions & 0 deletions tests/unit/test_entity.py
@@ -68,6 +68,21 @@ def test_Entity_splitter():
     assert wrapper_entity.end_page == 2
 
 
+def test_Entity_classifier():
+    documentai_entity = documentai.Document.Entity(
+        type_="clinical_notes",
+        id="0",
+        confidence=0.99878639,
+    )
+    wrapper_entity = entity.Entity(documentai_entity)
+    assert wrapper_entity.type_ == "clinical_notes"
+    assert wrapper_entity.documentai_object.id == "0"
+    assert round(wrapper_entity.documentai_object.confidence, 8) == 0.99878639
+    assert not wrapper_entity.mention_text
+    assert not wrapper_entity.start_page
+    assert not wrapper_entity.end_page
+
+
 def test_Entity_with_page_offset():
     documentai_entity = documentai.Document.Entity(
         type_="invoice_statement",
