fix: Add support for Classifier entities (#333)
- Added support for Custom Classifier entities
- Added Unit Tests for Classifier output
- Added input validation for `Document.split_pdf()`

Fixes #332 🦕
holtskinner authored Jul 16, 2024
1 parent dbf26f2 commit 2352cae
Showing 5 changed files with 84 additions and 18 deletions.
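
For context, a minimal sketch of what this commit enables (the local JSON path is hypothetical; the wrapper calls mirror the unit tests below):

from google.cloud.documentai_toolbox import document

# Hypothetical path to a saved Classifier processor output shard.
doc = document.Document.from_document_path(document_path="classifier_output.json")

# Classifier entities carry only a type, confidence, and id (no page
# anchors), so mention_text/start_page/end_page now stay None.
for wrapped_entity in doc.entities:
    print(wrapped_entity.type_, round(wrapped_entity.documentai_object.confidence, 4))

# Previously this raised AttributeError (#332); now it simply works.
print(repr(doc.entities))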
4 changes: 3 additions & 1 deletion google/cloud/documentai_toolbox/wrappers/document.py
@@ -765,7 +765,7 @@ def entities_to_bigquery(
         )
 
     def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
-        r"""Splits local PDF file into multiple PDF files based on output from a Splitter/Classifier processor.
+        r"""Splits local PDF file into multiple PDF files based on output from a Splitter processor.
 
         Args:
             pdf_path (str):
@@ -776,6 +776,8 @@ def split_pdf(self, pdf_path: str, output_path: str) -> List[str]:
             List[str]:
                 A list of output pdf files.
         """
+        if self.entities[0].start_page is None or self.entities[0].end_page is None:
+            raise ValueError("Entities do not contain start or end pages.")
         output_files: List[str] = []
         input_filename, input_extension = os.path.splitext(os.path.basename(pdf_path))
         with Pdf.open(pdf_path) as pdf:
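
A short usage sketch of the new guard (the file paths here are hypothetical; the error string matches the unit test added below). Calling split_pdf() on a Document whose entities lack page ranges, such as Classifier output, now fails fast:

from google.cloud.documentai_toolbox import document

# Hypothetical local shard from a Classifier processor.
doc = document.Document.from_document_path(document_path="classifier_output.json")

try:
    doc.split_pdf(pdf_path="input.pdf", output_path="out/")
except ValueError as err:
    print(err)  # Entities do not contain start or end pages.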
35 changes: 18 additions & 17 deletions google/cloud/documentai_toolbox/wrappers/entity.py
@@ -40,45 +40,46 @@ class Entity:
             Required. Entity type from a schema e.g. "Address".
         mention_text (str):
             Optional. Text value in the document e.g. "1600 Amphitheatre Pkwy".
-            If the entity is not present in
-            the document, this field will be empty.
+            Only populated for Extraction processors.
         normalized_text (str):
             Optional. Normalized text value in the document e.g. "1970-01-01".
-            If the entity is not present in
-            the document, this field will be empty.
+            Only populated for Extraction processors.
         start_page (int):
-            Required. `Page` containing the `Entity` or the first page of the
-            classification (for Splitter/Classifier processors).
+            Optional. `Page` containing the `Entity` for Extraction processors or the first page of the
+            subdocument for Splitter processors.
         end_page (int):
-            Required. Last page of the classification
+            Optional. Last page of the subdocument for Splitter processors.
     """
 
     documentai_object: documentai.Document.Entity = dataclasses.field(repr=False)
     page_offset: dataclasses.InitVar[Optional[int]] = 0
 
     type_: str = dataclasses.field(init=False)
-    mention_text: str = dataclasses.field(init=False, default="")
-    normalized_text: str = dataclasses.field(init=False, default="")
+    mention_text: Optional[str] = dataclasses.field(init=False, default=None)
+    normalized_text: Optional[str] = dataclasses.field(init=False, default=None)
 
-    start_page: int = dataclasses.field(init=False)
-    # Only Populated for Splitter/Classifier Output
-    end_page: int = dataclasses.field(init=False)
+    start_page: Optional[int] = dataclasses.field(init=False, default=None)
+    end_page: Optional[int] = dataclasses.field(init=False, default=None)
 
     _image: Optional[Image.Image] = dataclasses.field(init=False, default=None)
 
     def __post_init__(self, page_offset: int) -> None:
         self.type_ = self.documentai_object.type_
-        self.mention_text = self.documentai_object.mention_text
+
+        if self.documentai_object.mention_text:
+            self.mention_text = self.documentai_object.mention_text
 
         if (
             self.documentai_object.normalized_value
             and self.documentai_object.normalized_value.text
         ):
             self.normalized_text = self.documentai_object.normalized_value.text
 
-        page_refs = self.documentai_object.page_anchor.page_refs
-        if page_refs:
-            self.start_page = int(page_refs[0].page) + page_offset
-            self.end_page = int(page_refs[-1].page) + page_offset
+        if self.documentai_object.page_anchor:
+            page_refs = self.documentai_object.page_anchor.page_refs
+            if page_refs:
+                self.start_page = int(page_refs[0].page) + page_offset
+                self.end_page = int(page_refs[-1].page) + page_offset
 
     def crop_image(
         self, documentai_page: documentai.Document.Page
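
The underlying bug is a plain dataclasses pitfall, shown here as a standalone sketch with no Document AI types: a field declared init=False without a default never exists on the instance unless __post_init__ assigns it, so the generated repr() raises AttributeError. That is what classifier entities, which have no page_refs, triggered in #332.

import dataclasses
from typing import Optional


@dataclasses.dataclass
class Before:
    # init=False and no default: the attribute exists only if assigned later.
    start_page: int = dataclasses.field(init=False)


@dataclasses.dataclass
class After:
    # The fix: a None default keeps attribute access and repr() safe.
    start_page: Optional[int] = dataclasses.field(init=False, default=None)


try:
    repr(Before())
except AttributeError as err:
    print(err)  # 'Before' object has no attribute 'start_page'

print(repr(After()))  # After(start_page=None)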
1 change: 1 addition & 0 deletions tests/unit/resources/classifier/… (new test resource; exact filename not shown)
@@ -0,0 +1 @@
+{"text": "US010182182B2\n(12) United States Patent\nLewkow et al.\n(10) Patent No.: US 10,182,182 B2\n(45) Date of Patent: Jan. 15, 2019\n(54)\nIMAGE SENSOR HAVING MULTIPLE\nOUTPUT PORTS\n(71)\nApplicant: Google LLC, Mountain View, CA (US)\nH04N 7/0127 (2013.01); H04N_7/0806\n(2013.01); H04N 13/239 (2018.05); H04N\n13/254 (2018.05); H04N 13/271 (2018.05)\n(58) Field of Classification Search\nCPC G01S 17/08; H04N 5/2258; H04N 5/23229;\nH04N 13/0271; H04N 5/376; H04N\n5/3765; H04N 5/378; H04N 5/345\nSee application file for complete search history.\n(72)\nInventors: Roman Lewkow, San Jose, CA (US);\nChung Chun Wan, San Jose, CA (US)\n(73)\nAssignee: Google LLC, Mountain View, CA (US)\n(*)\nNotice:\n(56)\nReferences Cited\nSubject to any disclaimer, the term of this\npatent is extended or adjusted under 35\nU.S.C. 154(b) by 0 days.\nU.S. PATENT DOCUMENTS\n(21)\nAppl. No.: 15/831,925\n6,831,688 B2 * 12/2004 Lareau\nGO1J 3/02\n348/272\n(22)\nFiled:\nDec. 5, 2017\n7,247,393 B2\n7,936,038 B2\n7,990,636 B2\n8,027,107 B2\n7/2007 Hazel et al.\n5/2011 Jeong et al.\n8/2011 Park et al.\n9/2011 Hwang et al.\n(Continued)\n(65)\nPrior Publication Data\nFOREIGN PATENT DOCUMENTS\n(63)\nUS 2018/0097979 A1 Apr. 5, 2018\nRelated U.S. Application Data\nContinuation of application No. 15/476,165, filed on\nMar. 31, 2017, now Pat. No. 9,866,740, which is a\ncontinuation of application No. 14/580,025, filed on\nDec. 22, 2014, now Pat. No. 9,615,013.\nEP\n1478176\n11/2004\nOTHER PUBLICATIONS\n(51)\nPCT/US2015/062157-International Search Report & Written Opin-\nion, dated Mar. 8, 2016, 12 pages.\n(Continued)\nPrimary Examiner Nicholas G Giles\n(74) Attorney, Agent, or Firm - Fish & Richardson P.C.\nInt. Cl.\nH04N 5/225\n(2006.01)\nH04N 5/374\n(2011.01)\nH04N 5/378\n(2011.01)\nH04N 5/369\n(2011.01)\nH04N 5/232\n(2006.01)\nH04N 13/254 (2018.01)\nH04N 13/271\n(2018.01)\nH04N 7/01\n(2006.01)\nH04N 7/08\n(2006.01)\nH04N 13/239 (2018.01)\nU.S. Cl.\nCPC H04N 5/2258 (2013.01); H04N 5/23245\n(2013.01); H04N 5/3696 (2013.01); H04N\n5/378 (2013.01); H04N 5/3742 (2013.01);\n(57)\nABSTRACT\nAn apparatus is described that includes an image sensor\nhaving a first output port and a second output port. The first\noutput port is to transmit a first image stream concurrently\nwith a second image stream transmitted from the second\noutput port.\n(52)\n18 Claims, 10 Drawing Sheets\nImage Sensor\n410b\nFirst Image\nStream 401b\nImage Signal\nProcessing\nPipeline 407_1b\n1\n1\n2\n413_1b\n1\n2\n3\n5\n6\nSecond Image\nStream 402b\n413_2b\nImage Signal Processing\nPipeline 407_2b\ntime\n", "pages": [{"pageNumber": 1}], "entities": [{"type": "computer_vision", "confidence": 0.47925246, "id": "0"}, {"type": "crypto", "confidence": 0.0433604, "id": "1"}, {"type": "med_tech", "confidence": 0.26732057, "id": "2"}, {"type": "other", "confidence": 0.2100666, "id": "3"}]}
47 changes: 47 additions & 0 deletions tests/unit/test_document.py
@@ -79,6 +79,13 @@ def get_bytes_splitter_mock():
         yield byte_factory
 
 
+@pytest.fixture
+def get_bytes_classifier_mock():
+    with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory:
+        byte_factory.return_value = get_bytes("tests/unit/resources/classifier")
+        yield byte_factory
+
+
 @pytest.fixture
 def get_bytes_images_mock():
     with mock.patch.object(gcs_utilities, "get_bytes") as byte_factory:
@@ -206,6 +213,30 @@ def test_entities_from_shards_with_hex_ids():
     assert actual[1].type_ == "class_international"
 
 
+def test_entities_from_shards_classifier(get_bytes_classifier_mock):
+    shards = document._get_shards(
+        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0/"
+    )
+    get_bytes_classifier_mock.assert_called_once()
+
+    actual = document._entities_from_shards(shards=shards)
+
+    # Check for error reported in https://github.com/googleapis/python-documentai-toolbox/issues/332
+    assert repr(actual)
+    assert actual[0].type_ == "computer_vision"
+    assert round(actual[0].documentai_object.confidence, 8) == 0.47925246
+    assert actual[0].documentai_object.id == "0"
+    assert actual[1].type_ == "crypto"
+    assert round(actual[1].documentai_object.confidence, 8) == 0.0433604
+    assert actual[1].documentai_object.id == "1"
+    assert actual[2].type_ == "med_tech"
+    assert round(actual[2].documentai_object.confidence, 8) == 0.26732057
+    assert actual[2].documentai_object.id == "2"
+    assert actual[3].type_ == "other"
+    assert round(actual[3].documentai_object.confidence, 8) == 0.2100666
+    assert actual[3].documentai_object.id == "3"
+
+
 @mock.patch("google.cloud.documentai_toolbox.wrappers.document.documentai")
 def test_get_batch_process_metadata_with_valid_operation(
     mock_docai,
@@ -703,6 +734,22 @@ def test_split_pdf(mock_Pdf, get_bytes_splitter_mock):
     ]
 
 
+def test_split_pdf_with_non_splitter(get_bytes_classifier_mock):
+    doc = document.Document.from_gcs(
+        gcs_bucket_name="test-directory", gcs_prefix="documentai/output/123456789/0"
+    )
+
+    with pytest.raises(
+        ValueError,
+        match="Entities do not contain start or end pages.",
+    ):
+        doc.split_pdf(
+            pdf_path="procurement_multi_document.pdf", output_path="splitter/output/"
+        )
+
+    get_bytes_classifier_mock.assert_called_once()
+
+
 def test_convert_document_to_annotate_file_response():
     doc = document.Document.from_document_path(
         document_path="tests/unit/resources/0/toolbox_invoice_test-0.json"
15 changes: 15 additions & 0 deletions tests/unit/test_entity.py
@@ -68,6 +68,21 @@ def test_Entity_splitter():
     assert wrapper_entity.end_page == 2
 
 
+def test_Entity_classifier():
+    documentai_entity = documentai.Document.Entity(
+        type_="clinical_notes",
+        id="0",
+        confidence=0.99878639,
+    )
+    wrapper_entity = entity.Entity(documentai_entity)
+    assert wrapper_entity.type_ == "clinical_notes"
+    assert wrapper_entity.documentai_object.id == "0"
+    assert round(wrapper_entity.documentai_object.confidence, 8) == 0.99878639
+    assert not wrapper_entity.mention_text
+    assert not wrapper_entity.start_page
+    assert not wrapper_entity.end_page
+
+
 def test_Entity_with_page_offset():
     documentai_entity = documentai.Document.Entity(
         type_="invoice_statement",
