From b4762e8212e9e435eaa430bcd345291c69e518ac Mon Sep 17 00:00:00 2001 From: Holt Skinner <13262395+holtskinner@users.noreply.github.com> Date: Fri, 8 Mar 2024 12:32:45 -0600 Subject: [PATCH] fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case. (#274) * fix: Add trailing slash if not present for `gcs_prefix` in `Document.from_gcs()` to cover matching prefixes edge case. * Added Tests for GCS Matching Prefixes --- .../documentai_toolbox/wrappers/document.py | 2 + samples/snippets/quickstart_sample.py | 5 ++- samples/snippets/test_quickstart_sample.py | 26 +++++++++++++ tests/unit/test_document.py | 39 +++++++++++++++++-- 4 files changed, 66 insertions(+), 6 deletions(-) diff --git a/google/cloud/documentai_toolbox/wrappers/document.py b/google/cloud/documentai_toolbox/wrappers/document.py index 7818a2fa..bface9af 100644 --- a/google/cloud/documentai_toolbox/wrappers/document.py +++ b/google/cloud/documentai_toolbox/wrappers/document.py @@ -504,6 +504,8 @@ def from_gcs( Document: A document from gcs. """ + # Add trailing slash if not present. + gcs_prefix = gcs_prefix.rstrip("/") + "/" shards = _get_shards(gcs_bucket_name=gcs_bucket_name, gcs_prefix=gcs_prefix) return cls( shards=shards, diff --git a/samples/snippets/quickstart_sample.py b/samples/snippets/quickstart_sample.py index 0cea14db..a387c438 100644 --- a/samples/snippets/quickstart_sample.py +++ b/samples/snippets/quickstart_sample.py @@ -52,7 +52,7 @@ def quickstart_sample( documentai_document: Optional[documentai.Document] = None, batch_process_metadata: Optional[documentai.BatchProcessMetadata] = None, batch_process_operation: Optional[str] = None, -) -> None: +) -> document.Document: if gcs_bucket_name and gcs_prefix: # Load from Google Cloud Storage Directory print("Document structure in Cloud Storage") @@ -128,5 +128,6 @@ def quickstart_sample( if entity.normalized_text: print(f"\tNormalized Text: {entity.normalized_text}") + # [END documentai_toolbox_quickstart] -# [END documentai_toolbox_quickstart] + return wrapped_document diff --git a/samples/snippets/test_quickstart_sample.py b/samples/snippets/test_quickstart_sample.py index cb7a9c4a..e1dd1370 100644 --- a/samples/snippets/test_quickstart_sample.py +++ b/samples/snippets/test_quickstart_sample.py @@ -96,6 +96,32 @@ def test_quickstart_sample_batch_process_metadata( assert "Document Successfully Loaded!" in out +def test_quickstart_sample_batch_process_metadata_matching_prefixes( + capsys: pytest.CaptureFixture, +) -> None: + batch_process_metadata = documentai.BatchProcessMetadata( + state=documentai.BatchProcessMetadata.State.SUCCEEDED, + individual_process_statuses=[ + documentai.BatchProcessMetadata.IndividualProcessStatus( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/1", + ), + documentai.BatchProcessMetadata.IndividualProcessStatus( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://documentai_toolbox_samples/output/matching-prefixes/11", + ), + ], + ) + wrapped_document = quickstart_sample.quickstart_sample( + batch_process_metadata=batch_process_metadata + ) + + assert wrapped_document.gcs_prefix == "output/matching-prefixes/1/" + out, _ = capsys.readouterr() + + assert "Document Successfully Loaded!" in out + + def test_quickstart_sample_batch_process_operation( capsys: pytest.CaptureFixture, ) -> None: diff --git a/tests/unit/test_document.py b/tests/unit/test_document.py index e5ef5f1f..bcf71ae5 100644 --- a/tests/unit/test_document.py +++ b/tests/unit/test_document.py @@ -222,7 +222,7 @@ def test_get_batch_process_metadata_with_valid_operation( individual_process_statuses=[ documentai.BatchProcessMetadata.IndividualProcessStatus( input_gcs_source="gs://test-directory/documentai/input.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", ) ], ) @@ -256,7 +256,7 @@ def test_get_batch_process_metadata_with_running_operation( individual_process_statuses=[ documentai.BatchProcessMetadata.IndividualProcessStatus( input_gcs_source="gs://test-directory/documentai/input.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", ) ], ) @@ -442,11 +442,11 @@ def test_document_from_batch_process_metadata_with_multiple_input_files( individual_process_statuses=[ mock.Mock( input_gcs_source="gs://test-directory/documentai/input.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/1/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", ), mock.Mock( input_gcs_source="gs://test-directory/documentai/input2.pdf", - output_gcs_destination="gs://test-directory/documentai/output/123456789/2/", + output_gcs_destination="gs://test-directory/documentai/output/123456789/2", ), ], ) @@ -465,6 +465,37 @@ def test_document_from_batch_process_metadata_with_multiple_input_files( assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf" +def test_document_from_batch_process_metadata_with_multiple_input_files_matching_prefix( + get_bytes_multiple_directories_mock, +): + mock_metadata = mock.Mock( + state=documentai.BatchProcessMetadata.State.SUCCEEDED, + individual_process_statuses=[ + mock.Mock( + input_gcs_source="gs://test-directory/documentai/input.pdf", + output_gcs_destination="gs://test-directory/documentai/output/123456789/1", + ), + mock.Mock( + input_gcs_source="gs://test-directory/documentai/input2.pdf", + output_gcs_destination="gs://test-directory/documentai/output/123456789/11", + ), + ], + ) + documents = document.Document.from_batch_process_metadata(mock_metadata) + + get_bytes_multiple_directories_mock.assert_called() + assert get_bytes_multiple_directories_mock.call_count == 2 + assert len(documents) == 2 + + assert documents[0].gcs_bucket_name == "test-directory" + assert documents[0].gcs_prefix == "documentai/output/123456789/1/" + assert documents[0].gcs_input_uri == "gs://test-directory/documentai/input.pdf" + + assert documents[1].gcs_bucket_name == "test-directory" + assert documents[1].gcs_prefix == "documentai/output/123456789/11/" + assert documents[1].gcs_input_uri == "gs://test-directory/documentai/input2.pdf" + + def test_document_from_batch_process_metadata_with_failed_operation(): with pytest.raises( ValueError,