Skip to content

Commit 0bed7ea

Browse files
authored
Merge pull request #61316 from bruvzg/tts_3x
[3.x] Backport text-to-speech support.
2 parents 0dccbcd + 6489fe8 commit 0bed7ea

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+3905
-2
lines changed

.github/workflows/linux_builds.yml

+2-1
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,8 @@ jobs:
5959
# The actual dependencies
6060
sudo apt-get install build-essential pkg-config libx11-dev libxcursor-dev \
6161
libxinerama-dev libgl1-mesa-dev libglu-dev libasound2-dev libpulse-dev \
62-
libdbus-1-dev libudev-dev libxi-dev libxrandr-dev yasm xvfb wget unzip
62+
libdbus-1-dev libudev-dev libxi-dev libxrandr-dev yasm xvfb wget unzip \
63+
libspeechd-dev speech-dispatcher
6364
6465
- name: Setup Godot build cache
6566
uses: ./.github/actions/godot-cache

core/bind/core_bind.cpp

+53
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,42 @@ void _OS::global_menu_clear(const String &p_menu) {
190190
OS::get_singleton()->global_menu_clear(p_menu);
191191
}
192192

193+
// Script-facing wrapper: returns whatever OS::tts_is_speaking() reports on the OS singleton.
bool _OS::tts_is_speaking() const {
194+
return OS::get_singleton()->tts_is_speaking();
195+
}
196+
197+
// Script-facing wrapper: returns whatever OS::tts_is_paused() reports on the OS singleton.
bool _OS::tts_is_paused() const {
198+
return OS::get_singleton()->tts_is_paused();
199+
}
200+
201+
// Script-facing wrapper: returns the voice list produced by OS::tts_get_voices() unchanged.
Array _OS::tts_get_voices() const {
202+
return OS::get_singleton()->tts_get_voices();
203+
}
204+
205+
// Script-facing wrapper: forwards p_language to OS::tts_get_voices_for_language()
// and returns the resulting PoolStringArray unchanged.
PoolStringArray _OS::tts_get_voices_for_language(const String &p_language) const {
206+
return OS::get_singleton()->tts_get_voices_for_language(p_language);
207+
}
208+
209+
// Script-facing wrapper: passes all seven arguments, in the same order, to OS::tts_speak().
// No validation or clamping of volume/pitch/rate happens at this layer.
void _OS::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
210+
OS::get_singleton()->tts_speak(p_text, p_voice, p_volume, p_pitch, p_rate, p_utterance_id, p_interrupt);
211+
}
212+
213+
// Script-facing wrapper: calls OS::tts_pause() on the OS singleton.
void _OS::tts_pause() {
214+
OS::get_singleton()->tts_pause();
215+
}
216+
217+
// Script-facing wrapper: calls OS::tts_resume() on the OS singleton.
void _OS::tts_resume() {
218+
OS::get_singleton()->tts_resume();
219+
}
220+
221+
// Script-facing wrapper: calls OS::tts_stop() on the OS singleton.
void _OS::tts_stop() {
222+
OS::get_singleton()->tts_stop();
223+
}
224+
225+
// Script-facing wrapper: registers p_callback on p_object for the given utterance event.
// The _OS enum value is cast straight to OS::TTSUtteranceEvent, so the two enums
// must keep identical enumerator order. Range checking is left to the OS-side call.
void _OS::tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, String p_callback) {
226+
OS::get_singleton()->tts_set_utterance_callback((OS::TTSUtteranceEvent)p_event, p_object, p_callback);
227+
}
228+
193229
Point2 _OS::get_mouse_position() const {
194230
return OS::get_singleton()->get_mouse_position();
195231
}
@@ -1260,6 +1296,18 @@ void _OS::_bind_methods() {
12601296
//ClassDB::bind_method(D_METHOD("is_video_mode_resizable","screen"),&_OS::is_video_mode_resizable,DEFVAL(0));
12611297
//ClassDB::bind_method(D_METHOD("get_fullscreen_mode_list","screen"),&_OS::get_fullscreen_mode_list,DEFVAL(0));
12621298

1299+
ClassDB::bind_method(D_METHOD("tts_is_speaking"), &_OS::tts_is_speaking);
1300+
ClassDB::bind_method(D_METHOD("tts_is_paused"), &_OS::tts_is_paused);
1301+
ClassDB::bind_method(D_METHOD("tts_get_voices"), &_OS::tts_get_voices);
1302+
ClassDB::bind_method(D_METHOD("tts_get_voices_for_language", "language"), &_OS::tts_get_voices_for_language);
1303+
1304+
ClassDB::bind_method(D_METHOD("tts_speak", "text", "voice", "volume", "pitch", "rate", "utterance_id", "interrupt"), &_OS::tts_speak, DEFVAL(50), DEFVAL(1.f), DEFVAL(1.f), DEFVAL(0), DEFVAL(false));
1305+
ClassDB::bind_method(D_METHOD("tts_pause"), &_OS::tts_pause);
1306+
ClassDB::bind_method(D_METHOD("tts_resume"), &_OS::tts_resume);
1307+
ClassDB::bind_method(D_METHOD("tts_stop"), &_OS::tts_stop);
1308+
1309+
ClassDB::bind_method(D_METHOD("tts_set_utterance_callback", "event", "object", "callback"), &_OS::tts_set_utterance_callback);
1310+
12631311
ClassDB::bind_method(D_METHOD("global_menu_add_item", "menu", "label", "id", "meta"), &_OS::global_menu_add_item);
12641312
ClassDB::bind_method(D_METHOD("global_menu_add_separator", "menu"), &_OS::global_menu_add_separator);
12651313
ClassDB::bind_method(D_METHOD("global_menu_remove_item", "menu", "idx"), &_OS::global_menu_remove_item);
@@ -1578,6 +1626,11 @@ void _OS::_bind_methods() {
15781626
BIND_ENUM_CONSTANT(POWERSTATE_NO_BATTERY);
15791627
BIND_ENUM_CONSTANT(POWERSTATE_CHARGING);
15801628
BIND_ENUM_CONSTANT(POWERSTATE_CHARGED);
1629+
1630+
BIND_ENUM_CONSTANT(TTS_UTTERANCE_STARTED);
1631+
BIND_ENUM_CONSTANT(TTS_UTTERANCE_ENDED);
1632+
BIND_ENUM_CONSTANT(TTS_UTTERANCE_CANCELED);
1633+
BIND_ENUM_CONSTANT(TTS_UTTERANCE_BOUNDARY);
15811634
}
15821635

15831636
_OS::_OS() {

core/bind/core_bind.h

+21
Original file line numberDiff line numberDiff line change
@@ -152,11 +152,31 @@ class _OS : public Object {
152152
OPENGL_CONTEXT, // HGLRC, X11::GLXContext, NSOpenGLContext*, EGLContext* ...
153153
};
154154

155+
// Script-visible copy of OS::TTSUtteranceEvent. Values are cast directly to the OS enum
// in _OS::tts_set_utterance_callback(), so enumerator order must stay in sync with it.
enum TTSUtteranceEvent {
156+
TTS_UTTERANCE_STARTED,
157+
TTS_UTTERANCE_ENDED,
158+
TTS_UTTERANCE_CANCELED,
159+
TTS_UTTERANCE_BOUNDARY,
160+
TTS_UTTERANCE_MAX, // Number of event kinds; not bound as a script constant in _bind_methods().
161+
};
162+
155163
void global_menu_add_item(const String &p_menu, const String &p_label, const Variant &p_signal, const Variant &p_meta);
156164
void global_menu_add_separator(const String &p_menu);
157165
void global_menu_remove_item(const String &p_menu, int p_idx);
158166
void global_menu_clear(const String &p_menu);
159167

168+
bool tts_is_speaking() const;
169+
bool tts_is_paused() const;
170+
Array tts_get_voices() const;
171+
PoolStringArray tts_get_voices_for_language(const String &p_language) const;
172+
173+
void tts_speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int p_utterance_id = 0, bool p_interrupt = false);
174+
void tts_pause();
175+
void tts_resume();
176+
void tts_stop();
177+
178+
void tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, String p_callback);
179+
160180
Point2 get_mouse_position() const;
161181
void set_window_title(const String &p_title);
162182
void set_window_mouse_passthrough(const PoolVector2Array &p_region);
@@ -429,6 +449,7 @@ VARIANT_ENUM_CAST(_OS::VirtualKeyboardType);
429449
VARIANT_ENUM_CAST(_OS::SystemDir);
430450
VARIANT_ENUM_CAST(_OS::ScreenOrientation);
431451
VARIANT_ENUM_CAST(_OS::HandleType);
452+
VARIANT_ENUM_CAST(_OS::TTSUtteranceEvent);
432453

433454
class _Geometry : public Object {
434455
GDCLASS(_Geometry, Object);

core/os/os.cpp

+69
Original file line numberDiff line numberDiff line change
@@ -557,6 +557,75 @@ bool OS::can_use_threads() const {
557557
#endif
558558
}
559559

560+
// Base-class fallback (the method is declared virtual in os.h): prints a
// "not supported" warning and reports that nothing is being spoken.
bool OS::tts_is_speaking() const {
561+
WARN_PRINT("TTS is not supported by this platform.");
562+
return false;
563+
}
564+
565+
// Base-class fallback (the method is declared virtual in os.h): prints a
// "not supported" warning and reports that the synthesizer is not paused.
bool OS::tts_is_paused() const {
565+
WARN_PRINT("TTS is not supported by this platform.");
566+
return false;
567+
}
569+
570+
// Base-class fallback (the method is declared virtual in os.h): only prints a
// "not supported" warning. The message text matches the other base TTS stubs.
void OS::tts_pause() {
	WARN_PRINT("TTS is not supported by this platform.");
}
573+
574+
// Base-class fallback (the method is declared virtual in os.h): only prints a
// "not supported" warning.
void OS::tts_resume() {
575+
WARN_PRINT("TTS is not supported by this platform.");
576+
}
577+
578+
// Base-class fallback (the method is declared virtual in os.h): prints a
// "not supported" warning and returns an empty Array.
Array OS::tts_get_voices() const {
579+
WARN_PRINT("TTS is not supported by this platform.");
580+
return Array();
581+
}
582+
583+
// Collects the "id" of every voice returned by tts_get_voices() whose "language"
// entry starts with p_language. Entries lacking either key are ignored.
// Prefix matching means a bare language code also selects its regional variants.
PoolStringArray OS::tts_get_voices_for_language(const String &p_language) const {
	PoolStringArray matching_ids;

	const Array all_voices = tts_get_voices();
	const int voice_count = all_voices.size();
	for (int idx = 0; idx < voice_count; idx++) {
		const Dictionary info = all_voices[idx];
		if (!info.has("id") || !info.has("language")) {
			continue;
		}

		const String voice_language = info["language"];
		if (!voice_language.begins_with(p_language)) {
			continue;
		}

		matching_ids.push_back(info["id"]);
	}

	return matching_ids;
}
594+
595+
// Base-class fallback (the method is declared virtual in os.h): ignores every
// argument and only prints a "not supported" warning.
void OS::tts_speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
596+
WARN_PRINT("TTS is not supported by this platform.");
597+
}
598+
599+
// Base-class fallback (the method is declared virtual in os.h): only prints a
// "not supported" warning.
void OS::tts_stop() {
600+
WARN_PRINT("TTS is not supported by this platform.");
601+
}
602+
603+
// Stores the (object, method name) pair that tts_post_utterance_event() will invoke
// for p_event, replacing any pair previously stored for that event.
// An out-of-range p_event is rejected via ERR_FAIL_INDEX and nothing is stored.
// NOTE(review): p_object is kept as a raw pointer; nothing visible here clears the
// slot when the object is freed — confirm callers unregister (pass nullptr) first.
void OS::tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, const StringName &p_callback) {
604+
ERR_FAIL_INDEX(p_event, OS::TTS_UTTERANCE_MAX);
605+
utterance_callback[p_event].object = p_object;
606+
utterance_callback[p_event].cb_name = p_callback;
607+
}
608+
609+
// Dispatches a TTS event to the callback registered for it, if any.
//
// p_event  Event kind; out-of-range values are rejected via ERR_FAIL_INDEX.
// p_id     Utterance id, passed through to the callback.
// p_pos    Character position; only forwarded for TTS_UTTERANCE_BOUNDARY.
//
// The callback is queued with call_deferred() rather than invoked directly.
// STARTED / ENDED / CANCELED callbacks receive (p_id); BOUNDARY callbacks
// receive (p_pos, p_id), in that order.
void OS::tts_post_utterance_event(TTSUtteranceEvent p_event, int p_id, int p_pos) {
	ERR_FAIL_INDEX(p_event, OS::TTS_UTTERANCE_MAX);

	const Callback &cb = utterance_callback[p_event];
	if (cb.object == nullptr) {
		return; // Nothing registered for this event.
	}
	// The target is stored as a raw pointer and may have been freed since it was
	// registered; skip it instead of dereferencing a dangling pointer.
	if (!ObjectDB::instance_validate(cb.object)) {
		return;
	}

	if (p_event == OS::TTS_UTTERANCE_BOUNDARY) {
		cb.object->call_deferred(cb.cb_name, p_pos, p_id);
	} else {
		// TTS_UTTERANCE_STARTED, TTS_UTTERANCE_ENDED, TTS_UTTERANCE_CANCELED.
		cb.object->call_deferred(cb.cb_name, p_id);
	}
}
628+
560629
OS::MouseMode OS::get_mouse_mode() const {
561630
return MOUSE_MODE_VISIBLE;
562631
}

core/os/os.h

+39
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,31 @@ class OS {
123123
}
124124
};
125125

126+
// Bundles the text, voice and playback parameters of one utterance.
// Field defaults match the default arguments of OS::tts_speak().
struct TTSUtterance {
127+
String text;
128+
String voice;
129+
int volume = 50;
130+
float pitch = 1.f;
131+
float rate = 1.f;
132+
int id = 0; // Presumably the utterance_id handed back to callbacks — TODO confirm in platform code.
133+
};
134+
135+
// Kinds of utterance events a callback can be registered for.
// Also used as the index into utterance_callback[].
enum TTSUtteranceEvent {
136+
TTS_UTTERANCE_STARTED,
137+
TTS_UTTERANCE_ENDED,
138+
TTS_UTTERANCE_CANCELED,
139+
TTS_UTTERANCE_BOUNDARY,
140+
TTS_UTTERANCE_MAX, // Number of event kinds; sizes utterance_callback[] and bounds the ERR_FAIL_INDEX checks.
141+
};
142+
143+
private:
144+
// Target of one utterance-event callback: the object to call and the method name.
// object == nullptr means "no callback registered" (see tts_post_utterance_event()).
// NOTE(review): object is a raw, non-owning pointer — confirm its lifetime is managed by callers.
struct Callback {
145+
Object *object = nullptr;
146+
StringName cb_name;
147+
};
148+
149+
Callback utterance_callback[TTS_UTTERANCE_MAX];
150+
126151
protected:
127152
friend class Main;
128153

@@ -172,6 +197,20 @@ class OS {
172197
virtual void set_mouse_mode(MouseMode p_mode);
173198
virtual MouseMode get_mouse_mode() const;
174199

200+
virtual bool tts_is_speaking() const;
201+
virtual bool tts_is_paused() const;
202+
virtual Array tts_get_voices() const;
203+
204+
virtual PoolStringArray tts_get_voices_for_language(const String &p_language) const;
205+
206+
virtual void tts_speak(const String &p_text, const String &p_voice, int p_volume = 50, float p_pitch = 1.f, float p_rate = 1.f, int p_utterance_id = 0, bool p_interrupt = false);
207+
virtual void tts_pause();
208+
virtual void tts_resume();
209+
virtual void tts_stop();
210+
211+
virtual void tts_set_utterance_callback(TTSUtteranceEvent p_event, Object *p_object, const StringName &p_callback);
212+
virtual void tts_post_utterance_event(TTSUtteranceEvent p_event, int p_id, int p_pos = 0);
213+
175214
virtual void warp_mouse_position(const Point2 &p_to) {}
176215
virtual Point2 get_mouse_position() const = 0;
177216
virtual int get_mouse_button_state() const = 0;

doc/classes/OS.xml

+100
Original file line numberDiff line numberDiff line change
@@ -1032,6 +1032,94 @@
10321032
[b]Note:[/b] This method is implemented on Android, iOS, UWP, and HTML5.
10331033
</description>
10341034
</method>
1035+
<method name="tts_get_voices" qualifiers="const">
1036+
<return type="Array" />
1037+
<description>
1038+
Returns an [Array] of voice information dictionaries.
1039+
Each [Dictionary] contains three [String] entries:
1040+
- [code]name[/code] is voice name.
1041+
- [code]id[/code] is voice identifier.
1042+
- [code]language[/code] is language code in [code]lang_Variant[/code] format. The [code]lang[/code] part is a 2 or 3-letter code based on the ISO-639 standard, in lowercase. The [code]Variant[/code] part is an engine-dependent string describing country, region, and/or dialect.
1043+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1044+
</description>
1045+
</method>
1046+
<method name="tts_get_voices_for_language" qualifiers="const">
1047+
<return type="PoolStringArray" />
1048+
<argument index="0" name="language" type="String" />
1049+
<description>
1050+
Returns a [PoolStringArray] of voice identifiers for the [code]language[/code].
1051+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1052+
</description>
1053+
</method>
1054+
<method name="tts_is_paused" qualifiers="const">
1055+
<return type="bool" />
1056+
<description>
1057+
Returns [code]true[/code] if the synthesizer is in a paused state.
1058+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1059+
</description>
1060+
</method>
1061+
<method name="tts_is_speaking" qualifiers="const">
1062+
<return type="bool" />
1063+
<description>
1064+
Returns [code]true[/code] if the synthesizer is generating speech, or has utterances waiting in the queue.
1065+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1066+
</description>
1067+
</method>
1068+
<method name="tts_pause">
1069+
<return type="void" />
1070+
<description>
1071+
Puts the synthesizer into a paused state.
1072+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1073+
</description>
1074+
</method>
1075+
<method name="tts_resume">
1076+
<return type="void" />
1077+
<description>
1078+
Resumes the synthesizer if it was paused.
1079+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1080+
</description>
1081+
</method>
1082+
<method name="tts_set_utterance_callback">
1083+
<return type="void" />
1084+
<argument index="0" name="event" type="int" enum="OS.TTSUtteranceEvent" />
1085+
<argument index="1" name="object" type="Object" />
1086+
<argument index="2" name="callback" type="String" />
1087+
<description>
1088+
Adds a callback, which is called when the utterance has started, finished, been canceled, or reached a text boundary.
1089+
- For [code]TTS_UTTERANCE_STARTED[/code], [code]TTS_UTTERANCE_ENDED[/code], and [code]TTS_UTTERANCE_CANCELED[/code], the callback method should take one [int] parameter, the utterance id.
1090+
- For [code]TTS_UTTERANCE_BOUNDARY[/code], the callback method should take two [int] parameters, the index of the character and the utterance id.
1091+
[b]Note:[/b] The granularity of the boundary callbacks is engine dependent.
1092+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1093+
</description>
1094+
</method>
1095+
<method name="tts_speak">
1096+
<return type="void" />
1097+
<argument index="0" name="text" type="String" />
1098+
<argument index="1" name="voice" type="String" />
1099+
<argument index="2" name="volume" type="int" default="50" />
1100+
<argument index="3" name="pitch" type="float" default="1.0" />
1101+
<argument index="4" name="rate" type="float" default="1.0" />
1102+
<argument index="5" name="utterance_id" type="int" default="0" />
1103+
<argument index="6" name="interrupt" type="bool" default="false" />
1104+
<description>
1105+
Adds an utterance to the queue. If [code]interrupt[/code] is [code]true[/code], the queue is cleared first.
1106+
- [code]voice[/code] identifier is one of the [code]"id"[/code] values returned by [method tts_get_voices] or one of the values returned by [method tts_get_voices_for_language].
1107+
- [code]volume[/code] ranges from [code]0[/code] (lowest) to [code]100[/code] (highest).
1108+
- [code]pitch[/code] ranges from [code]0.0[/code] (lowest) to [code]2.0[/code] (highest), [code]1.0[/code] is default pitch for the current voice.
1109+
- [code]rate[/code] ranges from [code]0.1[/code] (lowest) to [code]10.0[/code] (highest), [code]1.0[/code] is a normal speaking rate. Other values act as a percentage relative to the normal rate.
1110+
- [code]utterance_id[/code] is passed as a parameter to the callback functions.
1111+
[b]Note:[/b] On Windows and Linux, utterance [code]text[/code] can use SSML markup. SSML support is engine and voice dependent. If the engine does not support SSML, you should strip out all XML markup before calling [method tts_speak].
1112+
[b]Note:[/b] The granularity of pitch, rate, and volume is engine and voice dependent. Values may be truncated.
1113+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1114+
</description>
1115+
</method>
1116+
<method name="tts_stop">
1117+
<return type="void" />
1118+
<description>
1119+
Stops synthesis in progress and removes all utterances from the queue.
1120+
[b]Note:[/b] This method is implemented on Android, iOS, HTML5, Linux, macOS, and Windows.
1121+
</description>
1122+
</method>
10351123
</methods>
10361124
<members>
10371125
<member name="clipboard" type="String" setter="set_clipboard" getter="get_clipboard" default="&quot;&quot;">
@@ -1283,5 +1371,17 @@
12831371
<constant name="POWERSTATE_CHARGED" value="4" enum="PowerState">
12841372
Plugged in, battery fully charged.
12851373
</constant>
1374+
<constant name="TTS_UTTERANCE_STARTED" value="0" enum="TTSUtteranceEvent">
1375+
Utterance has begun to be spoken.
1376+
</constant>
1377+
<constant name="TTS_UTTERANCE_ENDED" value="1" enum="TTSUtteranceEvent">
1378+
Utterance was successfully finished.
1379+
</constant>
1380+
<constant name="TTS_UTTERANCE_CANCELED" value="2" enum="TTSUtteranceEvent">
1381+
Utterance was canceled, or TTS service was unable to process it.
1382+
</constant>
1383+
<constant name="TTS_UTTERANCE_BOUNDARY" value="3" enum="TTSUtteranceEvent">
1384+
Utterance reached a word or sentence boundary.
1385+
</constant>
12861386
</constants>
12871387
</class>

platform/android/SCsub

+1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ android_files = [
99
"file_access_filesystem_jandroid.cpp",
1010
"audio_driver_opensl.cpp",
1111
"dir_access_jandroid.cpp",
12+
"tts_android.cpp",
1213
"thread_jandroid.cpp",
1314
"net_socket_android.cpp",
1415
"java_godot_lib_jni.cpp",

platform/android/java/lib/src/org/godotengine/godot/Godot.java

+3
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.godotengine.godot.io.file.FileAccessHandler;
3939
import org.godotengine.godot.plugin.GodotPlugin;
4040
import org.godotengine.godot.plugin.GodotPluginRegistry;
41+
import org.godotengine.godot.tts.GodotTTS;
4142
import org.godotengine.godot.utils.GodotNetUtils;
4243
import org.godotengine.godot.utils.PermissionsUtil;
4344
import org.godotengine.godot.xr.XRMode;
@@ -254,6 +255,7 @@ protected void instanceSingleton(SingletonBase s) {
254255

255256
public GodotIO io;
256257
public GodotNetUtils netUtils;
258+
public GodotTTS tts;
257259

258260
static SingletonBase[] singletons = new SingletonBase[MAX_SINGLETONS];
259261
static int singleton_count = 0;
@@ -575,6 +577,7 @@ private void initializeGodot() {
575577
final Activity activity = getActivity();
576578
io = new GodotIO(activity);
577579
netUtils = new GodotNetUtils(activity);
580+
tts = new GodotTTS(activity);
578581
Context context = getContext();
579582
DirectoryAccessHandler directoryAccessHandler = new DirectoryAccessHandler(context);
580583
FileAccessHandler fileAccessHandler = new FileAccessHandler(context);

0 commit comments

Comments
 (0)