Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Metal: Improve startup times by using concurrent shader compilation APIs #96052

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 77 additions & 7 deletions drivers/metal/metal_objects.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,12 @@

#import "servers/rendering/rendering_device_driver.h"

#import <CommonCrypto/CommonDigest.h>
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
#import <QuartzCore/CAMetalLayer.h>
#import <simd/simd.h>
#import <zlib.h>
#import <initializer_list>
#import <optional>
#import <spirv.hpp>
Expand Down Expand Up @@ -497,6 +499,76 @@ struct API_AVAILABLE(macos(11.0), ios(14.0)) UniformSet {
HashMap<RDC::ShaderStage, id<MTLArgumentEncoder>> encoders;
};

struct ShaderCacheEntry;

enum class ShaderLoadStrategy {
DEFAULT,
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DEFAULT uses the background compilation APIs, managed by the OS, to compile all shaders, which improves overall performance, since Godot schedules these requests serially. Only when a pipeline uses the MTLLibrary, will Godot block to wait for the compilation to complete. Previously, Godot blocked when compiling the library, so it had to wait for over 1600 shaders to compile. With this change, only about 250 shaders block. 936 shaders are compiled, and the OS only allows a max of 3 concurrent requests, so the 250 shaders still have to wait for the others to compile, but the result is a vast improvement overall.

Copy link
Contributor

@DarioSamo DarioSamo Aug 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Probably worth mentioning that this exact behavior was introduced by #90400, but at a general level for all APIs. The same strategy is adopted there: it spawns as much work as possible on our global task scheduler and only waits on it once the result is actually required for drawing.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's great!

LAZY,
};

/**
* A Metal shader library.
*/
@interface MDLibrary : NSObject
// Returns the compiled Metal library. The concrete subclasses in metal_objects.mm
// may block the caller here: one waits for an in-flight asynchronous compile,
// the other compiles synchronously on first access.
- (id<MTLLibrary>)library;
// Returns the NSError produced by the compile, if any. Like -library, this may
// block until the compile has finished.
- (NSError *)error;
// Label hook. The base-class implementation is empty.
- (void)setLabel:(NSString *)label;

// Factory: creates the concrete library object for `strategy` and links it to
// `entry`. ShaderLoadStrategy::LAZY yields a library that compiles on first use;
// any other value yields one that starts compiling immediately in the background.
+ (instancetype)newLibraryWithCacheEntry:(ShaderCacheEntry *)entry
device:(id<MTLDevice>)device
source:(NSString *)source
options:(MTLCompileOptions *)options
strategy:(ShaderLoadStrategy)strategy;
@end

struct SHA256Digest {
unsigned char data[CC_SHA256_DIGEST_LENGTH];

uint32_t hash() const {
uint32_t c = crc32(0, data, CC_SHA256_DIGEST_LENGTH);
return c;
}

SHA256Digest() {
bzero(data, CC_SHA256_DIGEST_LENGTH);
}

SHA256Digest(const char *p_data, size_t p_length) {
CC_SHA256(p_data, (CC_LONG)p_length, data);
}
};

/** Byte-wise equality for SHA256Digest, provided as a HashMapComparatorDefault specialization. */
template <>
struct HashMapComparatorDefault<SHA256Digest> {
	static bool compare(const SHA256Digest &p_lhs, const SHA256Digest &p_rhs) {
		// Digests are equal only when every byte of the two buffers matches.
		const int difference = memcmp(p_lhs.data, p_rhs.data, sizeof(p_lhs.data));
		return difference == 0;
	}
};

/**
* A cache entry for a Metal shader library.
*/
struct ShaderCacheEntry {
// Driver that owns the cache; notify_free() reports back to it.
RenderingDeviceDriverMetal &owner;
// Digest identifying this entry; handed to the owner by notify_free().
SHA256Digest key;
// Shader name; emitted in the "shader_compile" signpost and in compile error messages.
CharString name;
// Emitted as the hash= field of the "shader_compile" signpost.
// NOTE(review): presumably an abbreviated form of `key` — confirm where it is assigned.
CharString short_sha;
// Shader stage; used to look up the stage name for the signpost.
RD::ShaderStage stage = RD::SHADER_STAGE_VERTEX;
/**
* This reference must be weak, to ensure that when the last strong reference to the library
* is released, the cache entry is freed.
*/
MDLibrary *__weak library = nil;

/** Notify the cache that this entry is no longer needed. */
void notify_free() const;

// Binds the entry to its owning driver and its digest key; the remaining
// fields keep their defaults until the creator fills them in.
ShaderCacheEntry(RenderingDeviceDriverMetal &p_owner, SHA256Digest p_key) :
owner(p_owner), key(p_key) {
}
~ShaderCacheEntry() = default;
};

class API_AVAILABLE(macos(11.0), ios(14.0)) MDShader {
public:
CharString name;
Expand All @@ -517,15 +589,14 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) MDComputeShader final : public MDSha
} push_constants;
MTLSize local = {};

id<MTLLibrary> kernel;
MDLibrary *kernel;
#if DEV_ENABLED
CharString kernel_source;
#endif

void encode_push_constant_data(VectorView<uint32_t> p_data, MDCommandBuffer *p_cb) final;

MDComputeShader(CharString p_name, Vector<UniformSet> p_sets, id<MTLLibrary> p_kernel);
~MDComputeShader() override = default;
MDComputeShader(CharString p_name, Vector<UniformSet> p_sets, MDLibrary *p_kernel);
};

class API_AVAILABLE(macos(11.0), ios(14.0)) MDRenderShader final : public MDShader {
Expand All @@ -541,17 +612,16 @@ class API_AVAILABLE(macos(11.0), ios(14.0)) MDRenderShader final : public MDShad
} frag;
} push_constants;

id<MTLLibrary> vert;
id<MTLLibrary> frag;
MDLibrary *vert;
MDLibrary *frag;
#if DEV_ENABLED
CharString vert_source;
CharString frag_source;
#endif

void encode_push_constant_data(VectorView<uint32_t> p_data, MDCommandBuffer *p_cb) final;

MDRenderShader(CharString p_name, Vector<UniformSet> p_sets, id<MTLLibrary> p_vert, id<MTLLibrary> p_frag);
~MDRenderShader() override = default;
MDRenderShader(CharString p_name, Vector<UniformSet> p_sets, MDLibrary *p_vert, MDLibrary *p_frag);
};

enum StageResourceUsage : uint32_t {
Expand Down
208 changes: 206 additions & 2 deletions drivers/metal/metal_objects.mm
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,12 @@

#import "metal_objects.h"

#import "metal_utils.h"
#import "pixel_formats.h"
#import "rendering_device_driver_metal.h"

#import <os/signpost.h>

void MDCommandBuffer::begin() {
DEV_ASSERT(commandBuffer == nil);
commandBuffer = queue.commandBuffer;
Expand Down Expand Up @@ -850,7 +853,7 @@
type = MDCommandBufferStateType::None;
}

MDComputeShader::MDComputeShader(CharString p_name, Vector<UniformSet> p_sets, id<MTLLibrary> p_kernel) :
MDComputeShader::MDComputeShader(CharString p_name, Vector<UniformSet> p_sets, MDLibrary *p_kernel) :
MDShader(p_name, p_sets), kernel(p_kernel) {
}

Expand All @@ -868,7 +871,7 @@
[enc setBytes:ptr length:length atIndex:push_constants.binding];
}

MDRenderShader::MDRenderShader(CharString p_name, Vector<UniformSet> p_sets, id<MTLLibrary> _Nonnull p_vert, id<MTLLibrary> _Nonnull p_frag) :
MDRenderShader::MDRenderShader(CharString p_name, Vector<UniformSet> p_sets, MDLibrary *_Nonnull p_vert, MDLibrary *_Nonnull p_frag) :
MDShader(p_name, p_sets), vert(p_vert), frag(p_frag) {
}

Expand Down Expand Up @@ -1378,3 +1381,204 @@ fragment ClearColorsOut fragClear(VaryingsPos varyings [[stage_in]], constant Cl
}
return *val;
}

// Short display names indexed by RD::ShaderStage. Used below as the stage= field
// of the "shader_compile" signpost. The lookup is not bounds-checked, so every
// stage value passed in must have an entry here.
static const char *SHADER_STAGE_NAMES[] = {
[RD::SHADER_STAGE_VERTEX] = "vert",
[RD::SHADER_STAGE_FRAGMENT] = "frag",
[RD::SHADER_STAGE_TESSELATION_CONTROL] = "tess_ctrl",
[RD::SHADER_STAGE_TESSELATION_EVALUATION] = "tess_eval",
[RD::SHADER_STAGE_COMPUTE] = "comp",
};

// Asks the owning driver to release the cache entry stored under this entry's key.
// NOTE(review): presumably this destroys `*this`; callers should not touch the
// entry afterwards — confirm against shader_cache_free_entry().
void ShaderCacheEntry::notify_free() const {
owner.shader_cache_free_entry(key);
}

// Private interface shared by MDLibrary and the concrete subclasses in this file.
@interface MDLibrary ()
// Stores `entry` and points the entry's `library` back-reference at the receiver.
- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry;
// Returns the cache entry supplied to the initializer.
- (ShaderCacheEntry *)entry;
@end

// Library that defers compilation until -library or -error is first called.
@interface MDLazyLibrary : MDLibrary {
// Result of the compile; nil until the library has been loaded.
id<MTLLibrary> _library;
// Error slot returned by -error.
NSError *_error;
// Guards `_loaded` and the one-time compile.
std::shared_mutex _mu;
// Set once the compile has been attempted.
bool _loaded;
// Compile inputs; held only until the compile has run, then cleared.
id<MTLDevice> _device;
NSString *_source;
MTLCompileOptions *_options;
}
// Records the compile inputs without compiling anything.
- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry
device:(id<MTLDevice>)device
source:(NSString *)source
options:(MTLCompileOptions *)options;
@end

// Library that starts an asynchronous compile as soon as it is initialized.
@interface MDImmediateLibrary : MDLibrary {
// Results published by the compile completion handler.
id<MTLLibrary> _library;
NSError *_error;
// `_cv_mutex` and `_cv` let callers sleep until `_ready` becomes true.
std::mutex _cv_mutex;
std::condition_variable _cv;
// Lock-free fast-path flag; set after `_ready` and after waiters have been notified.
std::atomic<bool> _complete;
// Wait predicate; written while holding `_cv_mutex`.
bool _ready;
}
// Kicks off the asynchronous compile of `source` on `device`.
- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry
device:(id<MTLDevice>)device
source:(NSString *)source
options:(MTLCompileOptions *)options;
@end

@implementation MDLibrary {
ShaderCacheEntry *_entry;
}

/// Creates the concrete MDLibrary subclass that implements `strategy`.
/// LAZY yields an MDLazyLibrary; DEFAULT and any unrecognised value yield an
/// MDImmediateLibrary.
+ (instancetype)newLibraryWithCacheEntry:(ShaderCacheEntry *)entry
		device:(id<MTLDevice>)device
		source:(NSString *)source
		options:(MTLCompileOptions *)options
		strategy:(ShaderLoadStrategy)strategy {
	// Only the lazy strategy needs special handling; everything else compiles eagerly.
	if (strategy == ShaderLoadStrategy::LAZY) {
		return [[MDLazyLibrary alloc] initWithCacheEntry:entry device:device source:source options:options];
	}
	return [[MDImmediateLibrary alloc] initWithCacheEntry:entry device:device source:source options:options];
}

// Returns the cache entry this library was initialized with.
- (ShaderCacheEntry *)entry {
return _entry;
}

// Abstract in the base class: MDImmediateLibrary and MDLazyLibrary both override
// this, so reaching this implementation is treated as a fatal error.
- (id<MTLLibrary>)library {
CRASH_NOW_MSG("Not implemented");
return nil;
}

// Abstract in the base class: MDImmediateLibrary and MDLazyLibrary both override
// this, so reaching this implementation is treated as a fatal error.
- (NSError *)error {
CRASH_NOW_MSG("Not implemented");
return nil;
}
Comment on lines +1454 to +1462
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are overridden in the subclasses, so these methods are unreachable.


// Intentionally empty: the base class ignores the label.
- (void)setLabel:(NSString *)label {
}

/// Initializes the library and links it with its cache entry.
/// @param entry The cache entry this library belongs to. Must not be NULL; it is
///              dereferenced here and again in -dealloc.
/// @return The initialized library, or nil if the superclass initializer failed.
- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry {
	self = [super init];
	// Ivars must not be written through a nil `self`.
	if (self) {
		_entry = entry;
		// Point the entry's back-reference (declared __weak in ShaderCacheEntry) at this library.
		_entry->library = self;
	}
	return self;
}

// Runs once the last strong reference to the library is gone. It tells the cache
// entry that it is no longer needed, and the entry forwards that to the owning
// driver via shader_cache_free_entry().
- (void)dealloc {
_entry->notify_free();
}

@end

@implementation MDImmediateLibrary

/// Starts compiling `source` in the background and returns immediately.
/// The result is published by the completion handler; -library and -error block
/// until that has happened.
/// @param entry   Cache entry describing the shader (name, stage, hash). Only used for tracing and error output here.
/// @param device  Device that compiles the source.
/// @param source  Shader source code.
/// @param options Compile options forwarded to Metal.
/// @return The initialized library, or nil if the superclass initializer failed.
- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry
		device:(id<MTLDevice>)device
		source:(NSString *)source
		options:(MTLCompileOptions *)options {
	self = [super initWithCacheEntry:entry];
	if (!self) {
		return nil;
	}
	_complete = false;
	_ready = false;

	// The object address doubles as the signpost id so begin/end can be paired.
	__block os_signpost_id_t compile_id = (os_signpost_id_t)(uintptr_t)self;
	os_signpost_interval_begin(LOG_INTERVALS, compile_id, "shader_compile",
			"shader_name=%{public}s stage=%{public}s hash=%{public}s",
			entry->name.get_data(), SHADER_STAGE_NAMES[entry->stage], entry->short_sha.get_data());

	// The block captures `self` strongly, which keeps the library alive until the
	// handler has run.
	[device newLibraryWithSource:source
			options:options
			completionHandler:^(id<MTLLibrary> library, NSError *error) {
				os_signpost_interval_end(LOG_INTERVALS, compile_id, "shader_compile");
				self->_library = library;
				self->_error = error;
				if (error) {
					// vformat() performs the printf-style substitution; String::format()
					// expects an Array/Dictionary of values and would leave "%s" untouched.
					ERR_PRINT(vformat(U"Error compiling shader %s: %s", entry->name.get_data(), error.localizedDescription.UTF8String));
				}

				{
					// Flip the wait predicate under the mutex so no waiter misses the wake-up.
					std::lock_guard<std::mutex> lock(self->_cv_mutex);
					self->_ready = true;
				}
				self->_cv.notify_all();
				// Fast-path flag for callers that arrive after compilation has finished.
				self->_complete = true;
			}];
	return self;
}

/// Blocks the calling thread until the completion handler has published its result.
- (void)waitUntilCompiled {
	if (_complete) {
		return;
	}
	std::unique_lock<std::mutex> lock(_cv_mutex);
	_cv.wait(lock, [&] { return _ready; });
}

/// Returns the compiled library, waiting for the compile to finish if necessary.
/// nil when Metal did not produce a library.
- (id<MTLLibrary>)library {
	[self waitUntilCompiled];
	return _library;
}

/// Returns the error reported by the compile, waiting for it to finish if necessary.
- (NSError *)error {
	[self waitUntilCompiled];
	return _error;
}

@end

@implementation MDLazyLibrary

/// Records the compile inputs; nothing is compiled until -library or -error is called.
/// @param entry   Cache entry describing the shader (name, stage, hash).
/// @param device  Device that will compile the source.
/// @param source  Shader source code.
/// @param options Compile options forwarded to Metal.
/// @return The initialized library, or nil if the superclass initializer failed.
- (instancetype)initWithCacheEntry:(ShaderCacheEntry *)entry
		device:(id<MTLDevice>)device
		source:(NSString *)source
		options:(MTLCompileOptions *)options {
	self = [super initWithCacheEntry:entry];
	if (!self) {
		return nil;
	}
	_device = device;
	_source = source;
	_options = options;

	return self;
}

/// Compiles the shader synchronously the first time it is called; later calls return at once.
- (void)load {
	{
		// Cheap check under a shared lock for the common already-loaded case.
		std::shared_lock<std::shared_mutex> lock(_mu);
		if (_loaded) {
			return;
		}
	}

	// Take the exclusive lock and re-check: another thread may have finished the
	// compile between the two lock acquisitions.
	std::unique_lock<std::shared_mutex> lock(_mu);
	if (_loaded) {
		return;
	}

	ShaderCacheEntry *entry = [self entry];

	// The object address doubles as the signpost id so begin/end can be paired.
	__block os_signpost_id_t compile_id = (os_signpost_id_t)(uintptr_t)self;
	os_signpost_interval_begin(LOG_INTERVALS, compile_id, "shader_compile",
			"shader_name=%{public}s stage=%{public}s hash=%{public}s",
			entry->name.get_data(), SHADER_STAGE_NAMES[entry->stage], entry->short_sha.get_data());
	// The out-parameter needs a local; the result is copied into the ivar below.
	NSError *error = nil;
	_library = [_device newLibraryWithSource:_source options:_options error:&error];
	os_signpost_interval_end(LOG_INTERVALS, compile_id, "shader_compile");
	// Keep the compile error so that -error can report it to the caller.
	_error = error;
	// The inputs are no longer needed once the compile has been attempted.
	_device = nil;
	_source = nil;
	_options = nil;
	_loaded = true;
}

/// Returns the compiled library, compiling it first if that has not happened yet.
/// nil when the compile failed.
- (id<MTLLibrary>)library {
	[self load];
	return _library;
}

/// Returns the error reported by the compile, compiling first if that has not happened yet.
- (NSError *)error {
	[self load];
	return _error;
}

@end
20 changes: 20 additions & 0 deletions drivers/metal/metal_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@
#ifndef METAL_UTILS_H
#define METAL_UTILS_H

#import <os/log.h>

#pragma mark - Boolean flags

namespace flags {
Expand Down Expand Up @@ -78,4 +80,22 @@ static constexpr uint64_t round_up_to_alignment(uint64_t p_value, uint64_t p_ali
return aligned_value;
}

/**
 * Scope guard: invokes the stored callable when the object is destroyed.
 *
 * Copying a Defer copies the callable, so every copy runs it once more on
 * destruction. Prefer the DEFER macro, which never copies.
 */
class Defer {
public:
	/** Stores `func` to be run on destruction. An empty function is accepted and ignored. */
	Defer(std::function<void()> func) :
			func_(func) {}
	~Defer() {
		// Calling an empty std::function throws std::bad_function_call, and an
		// exception escaping a destructor terminates the process.
		if (func_) {
			func_();
		}
	}

private:
	std::function<void()> func_;
};

// Two-step token pasting so that __LINE__ is expanded before it is concatenated.
#define CONCAT_INTERNAL(x, y) x##y
#define CONCAT(x, y) CONCAT_INTERNAL(x, y)
// Usage: DEFER([&] { ... });
// Expands to `const Defer &defer__<line> = Defer(...)`. Binding the temporary to a
// const reference extends its lifetime to the end of the enclosing scope, which is
// when the callable runs. The variable name is derived from the line number, so
// use at most one DEFER per source line.
#define DEFER const Defer &CONCAT(defer__, __LINE__) = Defer

extern os_log_t LOG_DRIVER;
// Used for dynamic tracing.
extern os_log_t LOG_INTERVALS;

#endif // METAL_UTILS_H
Loading
Loading