Skip to content

Commit

Permalink
fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case. (#274)
Browse files Browse the repository at this point in the history

* fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case.
* Added Tests for GCS Matching Prefixes
  • Loading branch information
holtskinner authored Mar 8, 2024
1 parent 7248fe1 commit b4762e8
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 6 deletions.
2 changes: 2 additions & 0 deletions google/cloud/documentai_toolbox/wrappers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,6 +504,8 @@ def from_gcs(
Document:
A document from gcs.
"""
# Add trailing slash if not present.
gcs_prefix = gcs_prefix.rstrip("/") + "/"
shards = _get_shards(gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix)
return cls(
shards=shards,
Expand Down
5 changes: 3 additions & 2 deletions samples/snippets/quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ def quickstart_sample(
documentai_document: Optional[documentai.Document] = None,
batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None,
batch_process_operation: Optional[str] = None,
) -> None:
) -> document.Document:
if gcs_bucket_name and gcs_prefix:
# Load from Google Cloud Storage Directory
print("Document structure in Cloud Storage")
Expand Down Expand Up @@ -128,5 +128,6 @@ def quickstart_sample(
if entity.normalized_text:
print(f"\tNormalized Text: {entity.normalized_text}")

# [END documentai_toolbox_quickstart]

# [END documentai_toolbox_quickstart]
return wrapped_document
26 changes: 26 additions & 0 deletions samples/snippets/test_quickstart_sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,32 @@ def test_quickstart_sample_batch_process_metadata(
assert "Document Successfully Loaded!" in out


def test_quickstart_sample_batch_process_metadata_matching_prefixes(
    capsys: pytest.CaptureFixture,
) -> None:
    """Run the quickstart sample on outputs whose prefixes share a leading string.

    The two output destinations end in ``/1`` and ``/11``, so the first is a
    plain string prefix of the second. The test checks that the returned
    document reports the slash-terminated prefix ``output/matching-prefixes/1/``
    and that the success message was printed.
    """
    status_cls = documentai.BatchProcessMetadata.IndividualProcessStatus

    # Both statuses share one input file; only the output directory differs.
    statuses = [
        status_cls(
            input_gcs_source="gs://test-directory/documentai/input.pdf",
            output_gcs_destination=destination,
        )
        for destination in (
            "gs://documentai_toolbox_samples/output/matching-prefixes/1",
            "gs://documentai_toolbox_samples/output/matching-prefixes/11",
        )
    ]
    metadata = documentai.BatchProcessMetadata(
        state=documentai.BatchProcessMetadata.State.SUCCEEDED,
        individual_process_statuses=statuses,
    )

    result = quickstart_sample.quickstart_sample(batch_process_metadata=metadata)

    assert result.gcs_prefix == "output/matching-prefixes/1/"

    captured = capsys.readouterr()
    assert "Document Successfully Loaded!" in captured.out


def test_quickstart_sample_batch_process_operation(
capsys: pytest.CaptureFixture,
) -> None:
Expand Down
39 changes: 35 additions & 4 deletions tests/unit/test_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def test_get_batch_process_metadata_with_valid_operation(
individual_process_statuses=[
documentai.BatchProcessMetadata.IndividualProcessStatus(
input_gcs_source="gs://test-directory/documentai/input.pdf",
output_gcs_destination="gs://test-directory/documentai/output/123456789/1/",
output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
)
],
)
Expand Down Expand Up @@ -256,7 +256,7 @@ def test_get_batch_process_metadata_with_running_operation(
individual_process_statuses=[
documentai.BatchProcessMetadata.IndividualProcessStatus(
input_gcs_source="gs://test-directory/documentai/input.pdf",
output_gcs_destination="gs://test-directory/documentai/output/123456789/1/",
output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
)
],
)
Expand Down Expand Up @@ -442,11 +442,11 @@ def test_document_from_batch_process_metadata_with_multiple_input_files(
individual_process_statuses=[
mock.Mock(
input_gcs_source="gs://test-directory/documentai/input.pdf",
output_gcs_destination="gs://test-directory/documentai/output/123456789/1/",
output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
),
mock.Mock(
input_gcs_source="gs://test-directory/documentai/input2.pdf",
output_gcs_destination="gs://test-directory/documentai/output/123456789/2/",
output_gcs_destination="gs://test-directory/documentai/output/123456789/2",
),
],
)
Expand All @@ -465,6 +465,37 @@ def test_document_from_batch_process_metadata_with_multiple_input_files(
assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf"


def test_document_from_batch_process_metadata_with_multiple_input_files_matching_prefix(
    get_bytes_multiple_directories_mock,
):
    """Load two documents whose output prefixes differ only by a trailing digit.

    The output destinations end in ``/1`` and ``/11``. The test checks that the
    byte-fetching mock is called twice, that two documents come back, and that
    each one carries its own bucket, slash-terminated prefix and input URI.
    """
    first_status = mock.Mock(
        input_gcs_source="gs://test-directory/documentai/input.pdf",
        output_gcs_destination="gs://test-directory/documentai/output/123456789/1",
    )
    second_status = mock.Mock(
        input_gcs_source="gs://test-directory/documentai/input2.pdf",
        output_gcs_destination="gs://test-directory/documentai/output/123456789/11",
    )
    metadata = mock.Mock(
        state=documentai.BatchProcessMetadata.State.SUCCEEDED,
        individual_process_statuses=[first_status, second_status],
    )

    loaded = document.Document.from_batch_process_metadata(metadata)

    get_bytes_multiple_directories_mock.assert_called()
    assert get_bytes_multiple_directories_mock.call_count == 2
    assert len(loaded) == 2

    # (expected gcs_prefix, expected gcs_input_uri), in result order.
    expectations = (
        (
            "documentai/output/123456789/1/",
            "gs://test-directory/documentai/input.pdf",
        ),
        (
            "documentai/output/123456789/11/",
            "gs://test-directory/documentai/input2.pdf",
        ),
    )
    for position, (prefix, input_uri) in enumerate(expectations):
        wrapped = loaded[position]
        assert wrapped.gcs_bucket_name == "test-directory"
        assert wrapped.gcs_prefix == prefix
        assert wrapped.gcs_input_uri == input_uri


def test_document_from_batch_process_metadata_with_failed_operation():
with pytest.raises(
ValueError,
Expand Down

0 comments on commit b4762e8

Please sign in to comment.