| 1 | /**************************************************************************/ |
| 2 | /* tts_linux.cpp */ |
| 3 | /**************************************************************************/ |
| 4 | /* This file is part of: */ |
| 5 | /* GODOT ENGINE */ |
| 6 | /* https://godotengine.org */ |
| 7 | /**************************************************************************/ |
| 8 | /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ |
| 9 | /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ |
| 10 | /* */ |
| 11 | /* Permission is hereby granted, free of charge, to any person obtaining */ |
| 12 | /* a copy of this software and associated documentation files (the */ |
| 13 | /* "Software"), to deal in the Software without restriction, including */ |
| 14 | /* without limitation the rights to use, copy, modify, merge, publish, */ |
| 15 | /* distribute, sublicense, and/or sell copies of the Software, and to */ |
| 16 | /* permit persons to whom the Software is furnished to do so, subject to */ |
| 17 | /* the following conditions: */ |
| 18 | /* */ |
| 19 | /* The above copyright notice and this permission notice shall be */ |
| 20 | /* included in all copies or substantial portions of the Software. */ |
| 21 | /* */ |
| 22 | /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ |
| 23 | /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ |
| 24 | /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ |
| 25 | /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ |
| 26 | /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ |
| 27 | /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ |
| 28 | /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ |
| 29 | /**************************************************************************/ |
| 30 | |
| 31 | #include "tts_linux.h" |
| 32 | |
| 33 | #include "core/config/project_settings.h" |
| 34 | #include "servers/text_server.h" |
| 35 | |
// Process-wide instance pointer; set by the constructor, cleared by the destructor,
// and read by the static Speech Dispatcher callbacks through get_singleton().
TTS_Linux *TTS_Linux::singleton = nullptr;
| 37 | |
| 38 | void TTS_Linux::speech_init_thread_func(void *p_userdata) { |
| 39 | TTS_Linux *tts = (TTS_Linux *)p_userdata; |
| 40 | if (tts) { |
| 41 | MutexLock thread_safe_method(tts->_thread_safe_); |
| 42 | #ifdef SOWRAP_ENABLED |
| 43 | #ifdef DEBUG_ENABLED |
| 44 | int dylibloader_verbose = 1; |
| 45 | #else |
| 46 | int dylibloader_verbose = 0; |
| 47 | #endif |
| 48 | if (initialize_speechd(dylibloader_verbose) != 0) { |
| 49 | print_verbose("Text-to-Speech: Cannot load Speech Dispatcher library!" ); |
| 50 | } else { |
| 51 | if (!spd_open || !spd_set_notification_on || !spd_list_synthesis_voices || !free_spd_voices || !spd_set_synthesis_voice || !spd_set_volume || !spd_set_voice_pitch || !spd_set_voice_rate || !spd_set_data_mode || !spd_say || !spd_pause || !spd_resume || !spd_cancel) { |
| 52 | // There's no API to check version, check if functions are available instead. |
| 53 | print_verbose("Text-to-Speech: Unsupported Speech Dispatcher library version!" ); |
| 54 | return; |
| 55 | } |
| 56 | #else |
| 57 | { |
| 58 | #endif |
| 59 | CharString class_str; |
| 60 | String config_name = GLOBAL_GET("application/config/name" ); |
| 61 | if (config_name.length() == 0) { |
| 62 | class_str = "Godot_Engine" ; |
| 63 | } else { |
| 64 | class_str = config_name.utf8(); |
| 65 | } |
| 66 | tts->synth = spd_open(class_str, "Godot_Engine_Speech_API" , "Godot_Engine" , SPD_MODE_THREADED); |
| 67 | if (tts->synth) { |
| 68 | tts->synth->callback_end = &speech_event_callback; |
| 69 | tts->synth->callback_cancel = &speech_event_callback; |
| 70 | tts->synth->callback_im = &speech_event_index_mark; |
| 71 | spd_set_notification_on(tts->synth, SPD_END); |
| 72 | spd_set_notification_on(tts->synth, SPD_CANCEL); |
| 73 | |
| 74 | print_verbose("Text-to-Speech: Speech Dispatcher initialized." ); |
| 75 | } else { |
| 76 | print_verbose("Text-to-Speech: Cannot initialize Speech Dispatcher synthesizer!" ); |
| 77 | } |
| 78 | } |
| 79 | } |
| 80 | } |
| 81 | |
| 82 | void TTS_Linux::speech_event_index_mark(size_t p_msg_id, size_t p_client_id, SPDNotificationType p_type, char *p_index_mark) { |
| 83 | TTS_Linux *tts = TTS_Linux::get_singleton(); |
| 84 | if (tts) { |
| 85 | callable_mp(tts, &TTS_Linux::_speech_index_mark).call_deferred(p_msg_id, p_client_id, (int)p_type, String::utf8(p_index_mark)); |
| 86 | } |
| 87 | } |
| 88 | |
| 89 | void TTS_Linux::_speech_index_mark(size_t p_msg_id, size_t p_client_id, int p_type, const String &p_index_mark) { |
| 90 | _THREAD_SAFE_METHOD_ |
| 91 | |
| 92 | if (ids.has(p_msg_id)) { |
| 93 | DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_BOUNDARY, ids[p_msg_id], p_index_mark.to_int()); |
| 94 | } |
| 95 | } |
| 96 | |
| 97 | void TTS_Linux::speech_event_callback(size_t p_msg_id, size_t p_client_id, SPDNotificationType p_type) { |
| 98 | TTS_Linux *tts = TTS_Linux::get_singleton(); |
| 99 | if (tts) { |
| 100 | callable_mp(tts, &TTS_Linux::_speech_event).call_deferred(p_msg_id, p_client_id, (int)p_type); |
| 101 | } |
| 102 | } |
| 103 | |
| 104 | void TTS_Linux::_load_voices() { |
| 105 | if (!voices_loaded) { |
| 106 | SPDVoice **spd_voices = spd_list_synthesis_voices(synth); |
| 107 | if (spd_voices != nullptr) { |
| 108 | SPDVoice **voices_ptr = spd_voices; |
| 109 | while (*voices_ptr != nullptr) { |
| 110 | VoiceInfo vi; |
| 111 | vi.language = String::utf8((*voices_ptr)->language); |
| 112 | vi.variant = String::utf8((*voices_ptr)->variant); |
| 113 | voices[String::utf8((*voices_ptr)->name)] = vi; |
| 114 | voices_ptr++; |
| 115 | } |
| 116 | free_spd_voices(spd_voices); |
| 117 | } |
| 118 | voices_loaded = true; |
| 119 | } |
| 120 | } |
| 121 | |
| 122 | void TTS_Linux::_speech_event(size_t p_msg_id, size_t p_client_id, int p_type) { |
| 123 | _THREAD_SAFE_METHOD_ |
| 124 | |
| 125 | if (!paused && ids.has(p_msg_id)) { |
| 126 | if ((SPDNotificationType)p_type == SPD_EVENT_END) { |
| 127 | DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_ENDED, ids[p_msg_id]); |
| 128 | ids.erase(p_msg_id); |
| 129 | last_msg_id = -1; |
| 130 | speaking = false; |
| 131 | } else if ((SPDNotificationType)p_type == SPD_EVENT_CANCEL) { |
| 132 | DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, ids[p_msg_id]); |
| 133 | ids.erase(p_msg_id); |
| 134 | last_msg_id = -1; |
| 135 | speaking = false; |
| 136 | } |
| 137 | } |
| 138 | if (!speaking && queue.size() > 0) { |
| 139 | DisplayServer::TTSUtterance &message = queue.front()->get(); |
| 140 | |
| 141 | // Inject index mark after each word. |
| 142 | String text; |
| 143 | String language; |
| 144 | |
| 145 | _load_voices(); |
| 146 | const VoiceInfo *voice = voices.getptr(message.voice); |
| 147 | if (voice) { |
| 148 | language = voice->language; |
| 149 | } |
| 150 | |
| 151 | PackedInt32Array breaks = TS->string_get_word_breaks(message.text, language); |
| 152 | for (int i = 0; i < breaks.size(); i += 2) { |
| 153 | const int start = breaks[i]; |
| 154 | const int end = breaks[i + 1]; |
| 155 | text += message.text.substr(start, end - start + 1); |
| 156 | text += "<mark name=\"" + String::num_int64(end, 10) + "\"/>" ; |
| 157 | } |
| 158 | spd_set_synthesis_voice(synth, message.voice.utf8().get_data()); |
| 159 | spd_set_volume(synth, message.volume * 2 - 100); |
| 160 | spd_set_voice_pitch(synth, (message.pitch - 1) * 100); |
| 161 | float rate = 0; |
| 162 | if (message.rate > 1.f) { |
| 163 | rate = log10(MIN(message.rate, 2.5f)) / log10(2.5f) * 100; |
| 164 | } else if (message.rate < 1.f) { |
| 165 | rate = log10(MAX(message.rate, 0.5f)) / log10(0.5f) * -100; |
| 166 | } |
| 167 | spd_set_voice_rate(synth, rate); |
| 168 | spd_set_data_mode(synth, SPD_DATA_SSML); |
| 169 | last_msg_id = spd_say(synth, SPD_TEXT, text.utf8().get_data()); |
| 170 | ids[last_msg_id] = message.id; |
| 171 | DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_STARTED, message.id); |
| 172 | |
| 173 | queue.pop_front(); |
| 174 | speaking = true; |
| 175 | } |
| 176 | } |
| 177 | |
// Returns the `speaking` flag: true between a successful hand-off of an utterance to
// Speech Dispatcher and the handling of its END/CANCEL notification (or stop()).
bool TTS_Linux::is_speaking() const {
	return speaking;
}
| 181 | |
// Returns the `paused` flag: set by a successful pause(), cleared by resume() and stop().
bool TTS_Linux::is_paused() const {
	return paused;
}
| 185 | |
| 186 | Array TTS_Linux::get_voices() const { |
| 187 | _THREAD_SAFE_METHOD_ |
| 188 | |
| 189 | ERR_FAIL_NULL_V(synth, Array()); |
| 190 | const_cast<TTS_Linux *>(this)->_load_voices(); |
| 191 | |
| 192 | Array list; |
| 193 | for (const KeyValue<String, VoiceInfo> &E : voices) { |
| 194 | Dictionary voice_d; |
| 195 | voice_d["name" ] = E.key; |
| 196 | voice_d["id" ] = E.key; |
| 197 | voice_d["language" ] = E.value.language + "_" + E.value.variant; |
| 198 | list.push_back(voice_d); |
| 199 | } |
| 200 | |
| 201 | return list; |
| 202 | } |
| 203 | |
| 204 | void TTS_Linux::speak(const String &p_text, const String &p_voice, int p_volume, float p_pitch, float p_rate, int p_utterance_id, bool p_interrupt) { |
| 205 | _THREAD_SAFE_METHOD_ |
| 206 | |
| 207 | ERR_FAIL_NULL(synth); |
| 208 | if (p_interrupt) { |
| 209 | stop(); |
| 210 | } |
| 211 | |
| 212 | if (p_text.is_empty()) { |
| 213 | DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, p_utterance_id); |
| 214 | return; |
| 215 | } |
| 216 | |
| 217 | DisplayServer::TTSUtterance message; |
| 218 | message.text = p_text; |
| 219 | message.voice = p_voice; |
| 220 | message.volume = CLAMP(p_volume, 0, 100); |
| 221 | message.pitch = CLAMP(p_pitch, 0.f, 2.f); |
| 222 | message.rate = CLAMP(p_rate, 0.1f, 10.f); |
| 223 | message.id = p_utterance_id; |
| 224 | queue.push_back(message); |
| 225 | |
| 226 | if (is_paused()) { |
| 227 | resume(); |
| 228 | } else { |
| 229 | _speech_event(0, 0, (int)SPD_EVENT_BEGIN); |
| 230 | } |
| 231 | } |
| 232 | |
| 233 | void TTS_Linux::pause() { |
| 234 | _THREAD_SAFE_METHOD_ |
| 235 | |
| 236 | ERR_FAIL_NULL(synth); |
| 237 | if (spd_pause(synth) == 0) { |
| 238 | paused = true; |
| 239 | } |
| 240 | } |
| 241 | |
// Asks Speech Dispatcher to resume and clears the `paused` flag.
// Unlike pause(), the flag is cleared regardless of what spd_resume() returns.
// Fails if the synthesizer is not available.
void TTS_Linux::resume() {
	_THREAD_SAFE_METHOD_

	ERR_FAIL_NULL(synth);
	spd_resume(synth);
	paused = false;
}
| 249 | |
| 250 | void TTS_Linux::stop() { |
| 251 | _THREAD_SAFE_METHOD_ |
| 252 | |
| 253 | ERR_FAIL_NULL(synth); |
| 254 | for (DisplayServer::TTSUtterance &message : queue) { |
| 255 | DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, message.id); |
| 256 | } |
| 257 | if ((last_msg_id != -1) && ids.has(last_msg_id)) { |
| 258 | DisplayServer::get_singleton()->tts_post_utterance_event(DisplayServer::TTS_UTTERANCE_CANCELED, ids[last_msg_id]); |
| 259 | } |
| 260 | queue.clear(); |
| 261 | ids.clear(); |
| 262 | last_msg_id = -1; |
| 263 | spd_cancel(synth); |
| 264 | spd_resume(synth); |
| 265 | speaking = false; |
| 266 | paused = false; |
| 267 | } |
| 268 | |
// Returns the current instance, or nullptr when none exists
// (before construction or after destruction).
TTS_Linux *TTS_Linux::get_singleton() {
	return singleton;
}
| 272 | |
// Registers this instance as the singleton and starts the initialization thread.
TTS_Linux::TTS_Linux() {
	singleton = this;
	// Speech Dispatcher initialization can be slow (it may wait for a helper process to
	// start in the background), so it runs on a separate thread.
	init_thread.start(speech_init_thread_func, this);
}
| 278 | |
| 279 | TTS_Linux::~TTS_Linux() { |
| 280 | init_thread.wait_to_finish(); |
| 281 | if (synth) { |
| 282 | spd_close(synth); |
| 283 | } |
| 284 | |
| 285 | singleton = nullptr; |
| 286 | } |
| 287 | |