1/**************************************************************************/
2/* tts_linux.cpp */
3/**************************************************************************/
4/* This file is part of: */
5/* GODOT ENGINE */
6/* https://godotengine.org */
7/**************************************************************************/
8/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
9/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */
10/* */
11/* Permission is hereby granted, free of charge, to any person obtaining */
12/* a copy of this software and associated documentation files (the */
13/* "Software"), to deal in the Software without restriction, including */
14/* without limitation the rights to use, copy, modify, merge, publish, */
15/* distribute, sublicense, and/or sell copies of the Software, and to */
16/* permit persons to whom the Software is furnished to do so, subject to */
17/* the following conditions: */
18/* */
19/* The above copyright notice and this permission notice shall be */
20/* included in all copies or substantial portions of the Software. */
21/* */
22/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */
23/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */
24/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
25/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */
26/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */
27/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */
28/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
29/**************************************************************************/
30
31#include "tts_linux.h"
32
33#include "core/config/project_settings.h"
34#include "servers/text_server.h"
35
36TTS_Linux *TTS_Linux::singleton = nullptr;
37
38void TTS_Linux::speech_init_thread_func(void *p_userdata) {
39 TTS_Linux *tts = (TTS_Linux *)p_userdata;
40 if (tts) {
41 MutexLock thread_safe_method(tts->_thread_safe_);
42#ifdef SOWRAP_ENABLED
43#ifdef DEBUG_ENABLED
44 int dylibloader_verbose = 1;
45#else
46 int dylibloader_verbose = 0;
47#endif
48 if (initialize_speechd(dylibloader_verbose) != 0) {
49 print_verbose("Text-to-Speech: Cannot load Speech Dispatcher library!");
50 } else {
51 if (!spd_open || !spd_set_notification_on || !spd_list_synthesis_voices || !free_spd_voices || !spd_set_synthesis_voice || !spd_set_volume || !spd_set_voice_pitch || !spd_set_voice_rate || !spd_set_data_mode || !spd_say || !spd_pause || !spd_resume || !spd_cancel) {
52 // There's no API to check version, check if functions are available instead.
53 print_verbose("Text-to-Speech: Unsupported Speech Dispatcher library version!");
54 return;
55 }
56#else
57 {
58#endif
59 CharString class_str;
60 String config_name = GLOBAL_GET("application/config/name");
61 if (config_name.length() == 0) {
62 class_str = "Godot_Engine";
63 } else {
64 class_str = config_name.utf8();
65 }
66 tts->synth = spd_open(class_str, "Godot_Engine_Speech_API", "Godot_Engine", SPD_MODE_THREADED);
67 if (tts->synth) {
68 tts->synth->callback_end = &speech_event_callback;
69 tts->synth->callback_cancel = &speech_event_callback;
70 tts->synth->callback_im = &speech_event_index_mark;
71 spd_set_notification_on(tts->synth, SPD_END);
72 spd_set_notification_on(tts->synth, SPD_CANCEL);
73
74 print_verbose("Text-to-Speech: Speech Dispatcher initialized.");
75 } else {
76 print_verbose("Text-to-Speech: Cannot initialize Speech Dispatcher synthesizer!");
77 }
78 }
79 }
80}
81
82void TTS_Linux::speech_event_index_mark(size_t p_msg_id, size_t p_client_id, SPDNotificationType p_type, char *p_index_mark) {
83 TTS_Linux *tts = TTS_Linux::get_singleton();
84 if (tts) {
85 callable_mp(tts, &TTS_Linux::_speech_index_mark).call_deferred(p_msg_id, p_client_id, (int)p_type, String::utf8(p_index_mark));
86 }
87}
88
89void TTS_Linux::_speech_index_mark(size_t p_msg_id, size_t p_client_id, int p_type, const String &p_index_mark) {
90 _THREAD_SAFE_METHOD_
91
92 if (ids.has(p_msg_id)) {
93 DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_BOUNDARY, ids[p_msg_id], p_index_mark.to_int());
94 }
95}
96
97void TTS_Linux::speech_event_callback(size_t p_msg_id, size_t p_client_id, SPDNotificationType p_type) {
98 TTS_Linux *tts = TTS_Linux::get_singleton();
99 if (tts) {
100 callable_mp(tts, &TTS_Linux::_speech_event).call_deferred(p_msg_id, p_client_id, (int)p_type);
101 }
102}
103
104void TTS_Linux::_load_voices() {
105 if (!voices_loaded) {
106 SPDVoice **spd_voices = spd_list_synthesis_voices(synth);
107 if (spd_voices != nullptr) {
108 SPDVoice **voices_ptr = spd_voices;
109 while (*voices_ptr != nullptr) {
110 VoiceInfo vi;
111 vi.language = String::utf8((*voices_ptr)->language);
112 vi.variant = String::utf8((*voices_ptr)->variant);
113 voices[String::utf8((*voices_ptr)->name)] = vi;
114 voices_ptr++;
115 }
116 free_spd_voices(spd_voices);
117 }
118 voices_loaded = true;
119 }
120}
121
122void TTS_Linux::_speech_event(size_t p_msg_id, size_t p_client_id, int p_type) {
123 _THREAD_SAFE_METHOD_
124
125 if (!paused && ids.has(p_msg_id)) {
126 if ((SPDNotificationType)p_type == SPD_EVENT_END) {
127 DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_ENDED, ids[p_msg_id]);
128 ids.erase(p_msg_id);
129 last_msg_id = -1;
130 speaking = false;
131 } else if ((SPDNotificationType)p_type == SPD_EVENT_CANCEL) {
132 DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, ids[p_msg_id]);
133 ids.erase(p_msg_id);
134 last_msg_id = -1;
135 speaking = false;
136 }
137 }
138 if (!speaking && queue.size() > 0) {
139 DisplayServer::TTSUtterance &message = queue.front()->get();
140
141 // Inject index mark after each word.
142 String text;
143 String language;
144
145 _load_voices();
146 const VoiceInfo *voice = voices.getptr(message.voice);
147 if (voice) {
148 language = voice->language;
149 }
150
151 PackedInt32Array breaks = TS->string_get_word_breaks(message.text, language);
152 for (int i = 0; i < breaks.size(); i += 2) {
153 const int start = breaks[i];
154 const int end = breaks[i + 1];
155 text += message.text.substr(start, end - start + 1);
156 text += "<mark name=\"" + String::num_int64(end, 10) + "\"/>";
157 }
158 spd_set_synthesis_voice(synth, message.voice.utf8().get_data());
159 spd_set_volume(synth, message.volume * 2 - 100);
160 spd_set_voice_pitch(synth, (message.pitch - 1) * 100);
161 float rate = 0;
162 if (message.rate > 1.f) {
163 rate = log10(MIN(message.rate, 2.5f)) / log10(2.5f) * 100;
164 } else if (message.rate < 1.f) {
165 rate = log10(MAX(message.rate, 0.5f)) / log10(0.5f) * -100;
166 }
167 spd_set_voice_rate(synth, rate);
168 spd_set_data_mode(synth, SPD_DATA_SSML);
169 last_msg_id = spd_say(synth, SPD_TEXT, text.utf8().get_data());
170 ids[last_msg_id] = message.id;
171 DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_STARTED, message.id);
172
173 queue.pop_front();
174 speaking = true;
175 }
176}
177
178bool TTS_Linux::is_speaking() const {
179 return speaking;
180}
181
182bool TTS_Linux::is_paused() const {
183 return paused;
184}
185
186Array TTS_Linux::get_voices() const {
187 _THREAD_SAFE_METHOD_
188
189 ERR_FAIL_NULL_V(synth, Array());
190 const_cast<TTS_Linux *>(this)->_load_voices();
191
192 Array list;
193 for (const KeyValue<String, VoiceInfo> &E : voices) {
194 Dictionary voice_d;
195 voice_d["name"] = E.key;
196 voice_d["id"] = E.key;
197 voice_d["language"] = E.value.language + "_" + E.value.variant;
198 list.push_back(voice_d);
199 }
200
201 return list;
202}
203
204void TTS_Linux::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) {
205 _THREAD_SAFE_METHOD_
206
207 ERR_FAIL_NULL(synth);
208 if (p_interrupt) {
209 stop();
210 }
211
212 if (p_text.is_empty()) {
213 DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, p_utterance_id);
214 return;
215 }
216
217 DisplayServer::TTSUtterance message;
218 message.text = p_text;
219 message.voice = p_voice;
220 message.volume = CLAMP(p_volume, 0, 100);
221 message.pitch = CLAMP(p_pitch, 0.f, 2.f);
222 message.rate = CLAMP(p_rate, 0.1f, 10.f);
223 message.id = p_utterance_id;
224 queue.push_back(message);
225
226 if (is_paused()) {
227 resume();
228 } else {
229 _speech_event(0, 0, (int)SPD_EVENT_BEGIN);
230 }
231}
232
233void TTS_Linux::pause() {
234 _THREAD_SAFE_METHOD_
235
236 ERR_FAIL_NULL(synth);
237 if (spd_pause(synth) == 0) {
238 paused = true;
239 }
240}
241
242void TTS_Linux::resume() {
243 _THREAD_SAFE_METHOD_
244
245 ERR_FAIL_NULL(synth);
246 spd_resume(synth);
247 paused = false;
248}
249
250void TTS_Linux::stop() {
251 _THREAD_SAFE_METHOD_
252
253 ERR_FAIL_NULL(synth);
254 for (DisplayServer::TTSUtterance &message : queue) {
255 DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, message.id);
256 }
257 if ((last_msg_id != -1) && ids.has(last_msg_id)) {
258 DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, ids[last_msg_id]);
259 }
260 queue.clear();
261 ids.clear();
262 last_msg_id = -1;
263 spd_cancel(synth);
264 spd_resume(synth);
265 speaking = false;
266 paused = false;
267}
268
269TTS_Linux *TTS_Linux::get_singleton() {
270 return singleton;
271}
272
273TTS_Linux::TTS_Linux() {
274 singleton = this;
275 // Speech Dispatcher init can be slow, it might wait for helper process to start on background, so run it in the thread.
276 init_thread.start(speech_init_thread_func, this);
277}
278
279TTS_Linux::~TTS_Linux() {
280 init_thread.wait_to_finish();
281 if (synth) {
282 spd_close(synth);
283 }
284
285 singleton = nullptr;
286}
287