Skip to content

Commit

Permalink
fix: Changed client_info import and added new quickstart samples (#268)
Browse files Browse the repository at this point in the history

* fix: Changed `client_info` import

`google.api_core.client_info` -> `google.api_core.gapic_v1.client_info`

* Add examples for all document initialization options (w/tests)

Fixes #266
  • Loading branch information
holtskinner authored Mar 4, 2024
1 parent ecb656c commit c4b1d58
Show file tree
Hide file tree
Showing 5 changed files with 191 additions and 25 deletions.
2 changes: 1 addition & 1 deletion google/cloud/documentai_toolbox/utilities/gcs_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
import re
from typing import Dict, List, Optional, Tuple

from google.api_core import client_info
from google.api_core.gapic_v1 import client_info

from google.cloud import documentai, documentai_toolbox, storage
from google.cloud.documentai_toolbox import constants
Expand Down
24 changes: 23 additions & 1 deletion google/cloud/documentai_toolbox/wrappers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import re
from typing import Dict, List, Optional, Type, Union

from google.api_core.client_options import ClientOptions
from google.api_core.operation import from_gapic as operation_from_gapic
from google.cloud.vision import AnnotateFileResponse
from google.longrunning.operations_pb2 import GetOperationRequest
Expand Down Expand Up @@ -138,6 +139,7 @@ def _get_shards(gcs_bucket_name: str, gcs_prefix: str) -> List[documentai.Docume

def _get_batch_process_metadata(
operation_name: str,
location: Optional[str] = None,
timeout: Optional[float] = None,
) -> documentai.BatchProcessMetadata:
r"""Get `BatchProcessMetadata` from a `batch_process_documents()` long-running operation.
Expand All @@ -146,22 +148,41 @@ def _get_batch_process_metadata(
operation_name (str):
Required. The fully qualified operation name for a `batch_process_documents()` operation.
location (str):
Optional. The location of the processor used for `batch_process_documents()`.
Deprecated. Maintained for backwards compatibility.
timeout (float):
Optional. Default None. Time in seconds to wait for operation to complete.
If None, will wait indefinitely.
Returns:
documentai.BatchProcessMetadata:
Metadata from batch process.
"""
# Validate Operation Name
match = re.search(
r"projects\/\w+\/locations\/(\w+)\/operations\/\w+", operation_name
)

if not match:
raise ValueError(
f"Invalid Operation Name: {operation_name}\n"
"Expected operation name in the format `projects/<project>/locations/<location>/operations/<operation>`"
)

location = location or match.group(1)

client = documentai.DocumentProcessorServiceClient(
client_info=gcs_utilities._get_client_info(module="get_batch_process_metadata"),
client_options=ClientOptions(
api_endpoint=f"{location}-documentai.googleapis.com"
),
)

# Poll Operation until complete.
operation = operation_from_gapic(
operation=client.get_operation(
request=GetOperationRequest(name=operation_name),
metadata=documentai.BatchProcessMetadata(),
),
operations_client=client,
result_type=documentai.BatchProcessResponse,
Expand Down Expand Up @@ -599,6 +620,7 @@ def from_batch_process_operation(
return cls.from_batch_process_metadata(
metadata=_get_batch_process_metadata(
operation_name=operation_name,
location=location,
timeout=timeout,
)
)
Expand Down
85 changes: 66 additions & 19 deletions samples/snippets/quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,41 +15,88 @@


# [START documentai_toolbox_quickstart]
from typing import Optional

from google.cloud import documentai
from google.cloud.documentai_toolbox import document
from google.cloud.documentai_toolbox import gcs_utilities

# TODO(developer): Uncomment these variables before running the sample.
# Given a document.proto or sharded document.proto in path gs://bucket/path/to/folder
# Given a Document JSON or sharded Document JSON in path gs://bucket/path/to/folder
# gcs_bucket_name = "bucket"
# gcs_prefix = "path/to/folder"

# Or, given a Document JSON in path gs://bucket/path/to/folder/document.json
# gcs_uri = "gs://bucket/path/to/folder/document.json"

# Or, given a Document JSON in path local/path/to/folder/document.json
# document_path = "local/path/to/folder/document.json"

# Or, given a Document object from Document AI
# documentai_document = documentai.Document()

# Or, given a BatchProcessMetadata object from Document AI
# operation = client.batch_process_documents(request)
# operation.result(timeout=timeout)
# batch_process_metadata = documentai.BatchProcessMetadata(operation.metadata)

# Or, given a BatchProcessOperation name from Document AI
# batch_process_operation = "projects/project_id/locations/location/operations/operation_id"


def quickstart_sample(
gcs_bucket_name: Optional[str] = None,
gcs_prefix: Optional[str] = None,
gcs_uri: Optional[str] = None,
document_path: Optional[str] = None,
documentai_document: Optional[documentai.Document] = None,
batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
batch_process_operation: Optional[str] = None,
) -> None:
if gcs_bucket_name and gcs_prefix:
# Load from Google Cloud Storage Directory
print("Document structure in Cloud Storage")
gcs_utilities.print_gcs_document_tree(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)

wrapped_document = document.Document.from_gcs(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)
elif gcs_uri:
# Load a single Document from a Google Cloud Storage URI
wrapped_document = document.Document.from_gcs_uri(gcs_uri=gcs_uri)
elif document_path:
# Load from local `Document` JSON file
wrapped_document = document.Document.from_document_path(document_path)
elif documentai_document:
# Load from `documentai.Document` object
wrapped_document = document.Document.from_documentai_document(
documentai_document
)
elif batch_process_metadata:
# Load Documents from `BatchProcessMetadata` object
wrapped_documents = document.Document.from_batch_process_metadata(
metadata=batch_process_metadata
)
wrapped_document = wrapped_documents[0]
elif batch_process_operation:
wrapped_documents = document.Document.from_batch_process_operation(
location="us", operation_name=batch_process_operation
)
wrapped_document = wrapped_documents[0]
else:
raise ValueError("No document source provided.")

def quickstart_sample(gcs_bucket_name: str, gcs_prefix: str) -> None:
print("Document structure in Cloud Storage")
gcs_utilities.print_gcs_document_tree(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)

wrapped_document = document.Document.from_gcs(
gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
)
# For all properties and methods, refer to:
# https://cloud.google.com/python/docs/reference/documentai-toolbox/latest/google.cloud.documentai_toolbox.wrappers.document.Document

# Alternatively, create wrapped document from:
#
# - Local `Document` JSON file: `document.Document.from_document_path()`
# - `Document` object: `document.Document.from_documentai_document()`
# - `BatchProcessMetadata`: `document.Document.from_batch_process_metadata()`
# - Batch Processing Operation: `document.Document.from_batch_process_operation()`

print("Document Successfully Loaded!")
print(f"\t Number of Pages: {len(wrapped_document.pages)}")
print(f"\t Number of Entities: {len(wrapped_document.entities)}")

for idx, page in enumerate(wrapped_document.pages):
print(f"Page {idx}")
for page in wrapped_document.pages:
print(f"Page {page.page_number}")
for block in page.blocks:
print(block.text)
for paragraph in page.paragraphs:
Expand Down
95 changes: 91 additions & 4 deletions samples/snippets/test_quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,18 +18,105 @@
import pytest
from samples.snippets import quickstart_sample

from google.cloud import documentai
from google.longrunning.operations_pb2 import ListOperationsRequest # type: ignore

# Region of the Document AI resources exercised by these tests.
location = "us"
# Project ID is read from the environment; a missing variable raises KeyError
# immediately at import time, failing the whole test module fast.
project_id = os.environ["GOOGLE_CLOUD_PROJECT"]
# Cloud Storage bucket and prefix holding pre-generated sample processor output.
gcs_bucket_name = "documentai_toolbox_samples"
gcs_input_uri = "output/123456789/0"


def test_quickstart_sample(capsys: pytest.CaptureFixture) -> None:
def test_quickstart_sample_gcs_bucket_prefix(capsys: pytest.CaptureFixture) -> None:
    """Load a (possibly sharded) Document from a GCS bucket/prefix and check output.

    Fix: the diff residue left both the old keyword argument line
    (`gcs_prefix=gcs_input_uri`) and its replacement inside one call,
    which is not valid Python; only the corrected call is kept.
    """
    gcs_bucket_name = "documentai_toolbox_samples"
    gcs_prefix = "output/123456789/0"
    quickstart_sample.quickstart_sample(
        gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix
    )
    out, _ = capsys.readouterr()

    assert "Document structure in Cloud Storage" in out
    assert "Number of Pages: 1" in out
    assert "Number of Entities: 35" in out


def test_quickstart_sample_gcs_uri(capsys: pytest.CaptureFixture) -> None:
    """Load a single Document JSON addressed by a full Cloud Storage URI."""
    quickstart_sample.quickstart_sample(
        gcs_uri=(
            "gs://documentai_toolbox_samples/output/123456789/0/toolbox_invoice_test-0.json"
        )
    )
    captured = capsys.readouterr()

    assert "Number of Pages: 1" in captured.out
    assert "Number of Entities: 35" in captured.out


def test_quickstart_sample_document_path(capsys: pytest.CaptureFixture) -> None:
    """Load a Document from a local JSON file path."""
    quickstart_sample.quickstart_sample(
        document_path="resources/form_with_tables.json"
    )
    captured = capsys.readouterr()

    assert "Number of Pages: 1" in captured.out
    assert "Number of Entities: 0" in captured.out
    assert "Form Date" in captured.out


def test_quickstart_sample_documentai_document(capsys: pytest.CaptureFixture) -> None:
    """Load from an in-memory documentai.Document parsed out of a local JSON file."""
    with open("resources/form_with_tables.json", encoding="utf-8") as json_file:
        parsed_document = documentai.Document.from_json(
            json_file.read(), ignore_unknown_fields=True
        )

    quickstart_sample.quickstart_sample(documentai_document=parsed_document)
    captured = capsys.readouterr()

    assert "Number of Pages: 1" in captured.out
    assert "Number of Entities: 0" in captured.out
    assert "Form Date" in captured.out


def test_quickstart_sample_batch_process_metadata(
    capsys: pytest.CaptureFixture,
) -> None:
    """Load documents from the metadata of a completed batch-process operation.

    Queries the live Document AI API for the most recent finished
    batch-process operation in the test project and feeds its metadata
    to the quickstart sample.
    """
    client = documentai.DocumentProcessorServiceClient()
    operations_path = (
        f"{client.common_location_path(project=project_id, location=location)}"
        "/operations"
    )
    list_response = client.list_operations(
        request=ListOperationsRequest(
            name=operations_path,
            filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE",
            page_size=1,
        )
    )
    completed_operation = list_response.operations[0]
    batch_process_metadata = documentai.BatchProcessMetadata.deserialize(
        completed_operation.metadata.value
    )

    quickstart_sample.quickstart_sample(batch_process_metadata=batch_process_metadata)

    captured = capsys.readouterr()

    assert "Document Successfully Loaded!" in captured.out


def test_quickstart_sample_batch_process_operation(
    capsys: pytest.CaptureFixture,
) -> None:
    """Load documents directly from a batch-process operation name.

    Queries the live Document AI API for the most recent finished
    batch-process operation and passes its fully-qualified name to the
    quickstart sample.
    """
    client = documentai.DocumentProcessorServiceClient()
    operations_path = (
        f"{client.common_location_path(project=project_id, location=location)}"
        "/operations"
    )
    list_response = client.list_operations(
        request=ListOperationsRequest(
            name=operations_path,
            filter="TYPE=BATCH_PROCESS_DOCUMENTS AND STATE=DONE",
            page_size=1,
        )
    )
    operation_name = list_response.operations[0].name

    quickstart_sample.quickstart_sample(batch_process_operation=operation_name)

    captured = capsys.readouterr()

    assert "Document Successfully Loaded!" in captured.out


def test_quickstart_sample_no_input() -> None:
    """Calling the sample with no document source at all must raise ValueError."""
    expected_message = "No document source provided."
    with pytest.raises(ValueError, match=expected_message):
        quickstart_sample.quickstart_sample()
10 changes: 10 additions & 0 deletions tests/unit/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -317,6 +317,16 @@ def test_get_batch_process_metadata_with_invalid_metadata_type(mock_docai):
document._get_batch_process_metadata(operation_name)


def test_get_batch_process_metadata_with_invalid_operation_name():
    """An operation name with an empty project segment must be rejected."""
    malformed_name = "projects//locations/us/operations/7890123"
    with pytest.raises(ValueError, match="Invalid Operation Name"):
        document._get_batch_process_metadata(malformed_name)


def test_bigquery_column_name():
string_map = {
"Phone #:": "phone_num",
Expand Down

0 comments on commit c4b1d58

Please sign in to comment.