remote_exec.proto: add blob split and splice API

roloffs · roloffs · commit 186240b6edf4 · 2023-11-29T18:13:24.000+01:00
Depending on the software project, possibly large artifacts need to be
downloaded from or uploaded to the remote CAS such as executables with debug
information, comprehensive libraries, or even whole file system images. Such
artifacts generate a lot of traffic when downloaded or uploaded. The blob-split
API allows a client to have such artifacts split into chunks on the remote
side, to fetch only those parts that are locally missing, and finally to
locally assemble the requested blob from its chunks. The blob-splice API allows
a client to split such artifacts into chunks locally, to upload only those
parts that are remotely missing, and finally to have the requested blob spliced
remotely from its chunks.
Since only the binary differences from the last download/upload are
fetched/uploaded, the blob split and splice API can save a lot of network
traffic between server and client.
diff --git a/build/bazel/remote/execution/v2/remote_execution.proto b/build/bazel/remote/execution/v2/remote_execution.proto
@@ -430,6 +430,78 @@ service ContentAddressableStorage {
   rpc GetTree(GetTreeRequest) returns (stream GetTreeResponse) {
     option (google.api.http) = { get: "/v2/{instance_name=**}/blobs/{root_digest.hash}/{root_digest.size_bytes}:getTree" };
   }
+
+  // Split a blob into chunks.
+  //
+  // This splitting API aims to reduce download traffic between a client and a
+  // server, e.g., if a client needs to fetch a large blob that has just been
+  // modified slightly since the last build. In this case, there is no need to
+  // fetch the entire blob data, but just the binary differences between the two
+  // blob versions, which are typically determined by content-defined chunking.
+  //
+  // Clients can use this API before downloading a blob to determine which parts
+  // of the blob are already present locally and do not need to be downloaded
+  // again. The server splits the blob into chunks according to a
+  // content-defined chunking algorithm and returns a list of the chunk digests
+  // in the order in which the chunks have to be concatenated to assemble the
+  // requested blob.
+  //
+  // The client can expect certain guarantees from the server if a split request
+  // is answered successfully:
+  //  1. The blob chunks are stored in CAS.
+  //  2. Concatenating the blob chunks in the order of the digest list returned
+  //     by the server results in the original blob.
+  //
+  // The usage of this API is optional but it allows clients to download only
+  // the missing parts of a large blob instead of the entire blob data, which in
+  // turn can considerably reduce download network traffic.
+  //
+  // Servers are free to implement this functionality, but they need to declare
+  // whether they support it or not by setting the
+  // [CacheCapabilities.blob_split_support][build.bazel.remote.execution.v2.CacheCapabilities.blob_split_support]
+  // field accordingly.
+  //
+  // Errors:
+  //
+  // * `NOT_FOUND`: The requested blob is not present in the CAS.
+  // * `RESOURCE_EXHAUSTED`: There is insufficient disk quota to store the blob
+  //   chunks.
+  rpc SplitBlob(SplitBlobRequest) returns (SplitBlobResponse) {
+    option (google.api.http) = { get: "/v2/{instance_name=**}/blobs/{blob_digest.hash}/{blob_digest.size_bytes}:splitBlob" };
+  }
+
+  // Splice a blob from chunks.
+  //
+  // This is the complementary operation to the
+  // [ContentAddressableStorage.SplitBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SplitBlob]
+  // function to handle the split upload of large blobs to save upload
+  // traffic.
+  //
+  // If a client needs to upload a large blob and is able to split a blob into
+  // chunks locally according to some content-defined chunking algorithm, it can
+  // first determine which parts of the blob are already available in the remote
+  // CAS and upload the missing chunks, and then use this API to instruct the
+  // server to splice the original blob from the remotely available blob chunks.
+  //
+  // The usage of this API is optional but it allows clients to upload only the
+  // missing parts of a large blob instead of the entire blob data, which in
+  // turn can considerably reduce upload network traffic.
+  //
+  // Servers are free to implement this functionality, but they need to declare
+  // whether they support it or not by setting the
+  // [CacheCapabilities.blob_splice_support][build.bazel.remote.execution.v2.CacheCapabilities.blob_splice_support]
+  // field accordingly.
+  //
+  // Errors:
+  //
+  // * `NOT_FOUND`: At least one of the blob chunks is not present in the CAS.
+  // * `RESOURCE_EXHAUSTED`: There is insufficient disk quota to store the
+  //   spliced blob.
+  // * `INVALID_ARGUMENT`: The digest of the spliced blob is different from the
+  //   provided expected digest.
+  rpc SpliceBlob(SpliceBlobRequest) returns (SpliceBlobResponse) {
+    option (google.api.http) = { post: "/v2/{instance_name=**}/blobs:spliceBlob" body: "*" };
+  }
 }
 
 // The Capabilities service may be used by remote execution clients to query
@@ -1778,6 +1850,53 @@ message GetTreeResponse {
   string next_page_token = 2;
 }
 
+// A request message for
+// [ContentAddressableStorage.SplitBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SplitBlob].
+message SplitBlobRequest {
+  // The instance of the execution system to operate against. A server may
+  // support multiple instances of the execution system (with their own workers,
+  // storage, caches, etc.). The server MAY require use of this field to select
+  // between them in an implementation-defined fashion, otherwise it can be
+  // omitted.
+  string instance_name = 1;
+
+  // The digest of the blob to be split.
+  Digest blob_digest = 2;
+}
+
+// A response message for
+// [ContentAddressableStorage.SplitBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SplitBlob].
+message SplitBlobResponse {
+  // The ordered list of digests of the chunks into which the blob was split.
+  // The original blob is assembled by concatenating the chunk data according to
+  // the order of the digests given by this list.
+  repeated Digest chunk_digests = 1;
+}
+
+// A request message for
+// [ContentAddressableStorage.SpliceBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SpliceBlob].
+message SpliceBlobRequest {
+  // The instance of the execution system to operate against. A server may
+  // support multiple instances of the execution system (with their own workers,
+  // storage, caches, etc.). The server MAY require use of this field to select
+  // between them in an implementation-defined fashion, otherwise it can be
+  // omitted.
+  string instance_name = 1;
+
+  // Expected digest of the spliced blob; a mismatch yields `INVALID_ARGUMENT`.
+  Digest blob_digest = 2;
+
+  // The ordered list of digests of the chunks that must be concatenated, in
+  // this order, to assemble the original blob.
+  repeated Digest chunk_digests = 3;
+}
+
+// A response message for
+// [ContentAddressableStorage.SpliceBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SpliceBlob].
+message SpliceBlobResponse {
+  // Intentionally empty for now, but might need to be extended in the future.
+}
+
 // A request message for
 // [Capabilities.GetCapabilities][build.bazel.remote.execution.v2.Capabilities.GetCapabilities].
 message GetCapabilitiesRequest {
@@ -1997,6 +2116,20 @@ message CacheCapabilities {
   // [BatchUpdateBlobs][build.bazel.remote.execution.v2.ContentAddressableStorage.BatchUpdateBlobs]
   // requests.
   repeated Compressor.Value supported_batch_update_compressors = 7;
+
+  // Whether blob splitting is supported for the particular server/instance. If
+  // true, the server/instance implements the specified behavior for blob
+  // splitting and a meaningful result can be expected from the
+  // [ContentAddressableStorage.SplitBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SplitBlob]
+  // operation.
+  bool blob_split_support = 8;
+
+  // Whether blob splicing is supported for the particular server/instance. If
+  // true, the server/instance implements the specified behavior for blob
+  // splicing and a meaningful result can be expected from the
+  // [ContentAddressableStorage.SpliceBlob][build.bazel.remote.execution.v2.ContentAddressableStorage.SpliceBlob]
+  // operation.
+  bool blob_splice_support = 9;
 }
 
 // Capabilities of the remote execution system.