6 #if !UNITY_EDITOR && UNITY_WSA 7 using Windows.Foundation;
8 using Windows.Media.SpeechSynthesis;
9 using Windows.Storage.Streams;
11 using System.Threading.Tasks;
53 [RequireComponent(typeof(AudioSource))]
56 [Tooltip(
"The audio source where speech will be played.")]
58 private AudioSource audioSource;
// Public accessor for the private audioSource field: the getter returns the
// field as-is and the setter replaces it with the supplied value.
63 public AudioSource AudioSource {
get {
return audioSource; }
set { audioSource = value; } }
70 [Tooltip(
"The voice that will be used to generate speech.")]
74 #if !UNITY_EDITOR && UNITY_WSA 75 private SpeechSynthesizer synthesizer;
76 private VoiceInformation voiceInfo;
77 private bool speechTextInQueue =
false;
// Combines two bytes into a single signed 16-bit value, with secondByte as the
// high byte and firstByte as the low byte (little-endian order).
// NOTE(review): the statement that converts the value to the returned float is
// not visible in this excerpt.
86 private static float BytesToFloat(byte firstByte, byte secondByte)
// The (short) cast reinterprets the assembled 16-bit pattern as a signed value.
89 short s = (short)((secondByte << 8) | firstByte);
// Reads four consecutive bytes from 'bytes', starting at 'offset' (default 0),
// and ORs them into 'value' least-significant byte first.
// NOTE(review): the declaration of 'value' and the return statement are not
// visible in this excerpt.
101 private static int BytesToInt(byte[] bytes,
int offset = 0)
104 for (
int i = 0; i < 4; i++)
// Byte i is shifted left by 8*i bits before being merged in.
106 value |= ((int)bytes[offset + i]) << (i * 8);
// Creates an AudioClip from already-decoded sample data.
//   name        - passed to AudioClip.Create as the clip name.
//   audioData   - samples written into the clip via SetData.
//   sampleCount - passed to AudioClip.Create as the clip length.
//   frequency   - passed to AudioClip.Create as the sample frequency.
// NOTE(review): the return statement is not visible in this excerpt.
119 private static AudioClip ToClip(
string name,
float[] audioData,
int sampleCount,
int frequency)
// The clip is always created with 1 channel; the last argument (false) is the
// 'stream' flag of AudioClip.Create.
121 var clip = AudioClip.Create(name, sampleCount, 1, frequency,
false);
// Write the samples starting at offset 0 of the clip.
122 clip.SetData(audioData, 0);
// Decodes a WAV byte array into float samples.
//   wavAudio    - the raw WAV bytes, header included.
//   sampleCount - out: number of samples in the resulting array.
//   frequency   - out: value read from the 4 bytes at offset 24 of the header.
// NOTE(review): the initialisation of 'pos' and 'i', the loop increments and the
// return statement are not visible in this excerpt.
133 private static float[] ToUnityAudio(byte[] wavAudio, out
int sampleCount, out
int frequency)
// Channel count is taken from the single byte at offset 22 of the header.
136 int channelCount = wavAudio[22];
// Sample frequency is the little-endian int at offset 24.
139 frequency = BytesToInt(wavAudio, 24);
// 100, 97, 116, 97 are the ASCII codes of 'd', 'a', 't', 'a': scan forward until
// the "data" chunk identifier is found at 'pos'.
// NOTE(review): the condition itself has no bounds check on 'pos' - confirm the
// input always contains a "data" chunk.
145 while (!(wavAudio[pos] == 100 && wavAudio[pos + 1] == 97 && wavAudio[pos + 2] == 116 && wavAudio[pos + 3] == 97))
// Assemble the 4-byte little-endian size of the chunk being skipped.
148 int chunkSize = wavAudio[pos] + wavAudio[pos + 1] * 256 + wavAudio[pos + 2] * 65536 + wavAudio[pos + 3] * 16777216;
// Skip the 4-byte size field plus the chunk payload.
149 pos += 4 + chunkSize;
// Two bytes per sample (matches the two-byte reads passed to BytesToFloat below).
154 sampleCount = (wavAudio.Length - pos) / 2;
// With two channels the per-channel sample count is half of that.
155 if (channelCount == 2) { sampleCount /= 2; }
158 var unityData =
new float[sampleCount];
// Convert the remaining bytes, one two-byte sample at a time.
162 while (pos < wavAudio.Length)
164 unityData[i] = BytesToFloat(wavAudio[pos], wavAudio[pos + 1]);
// NOTE(review): the body of the two-channel branch is not visible in this excerpt.
166 if (channelCount == 2)
// Runs a speech synthesis request and hands the resulting audio to audioSource.
//   text      - the text being spoken; in the visible code it is only used in the
//               "Speech not initialized" error message.
//   speakFunc - delegate that starts the synthesis and yields the speech stream.
// Throws ArgumentNullException when speakFunc is null.
// Only compiled when !UNITY_EDITOR && UNITY_WSA.
// NOTE(review): braces, try/catch and the async wrapper around the awaited calls
// are not visible in this excerpt.
176 #if !UNITY_EDITOR && UNITY_WSA 177 private void PlaySpeech(
string text, Func<IAsyncOperation<SpeechSynthesisStream>> speakFunc)
189 if (speakFunc == null)
throw new ArgumentNullException(nameof(speakFunc));
// Synthesis is only attempted when a synthesizer instance exists.
191 if (synthesizer != null)
// Flag that a request is now being processed; cleared again further down.
195 speechTextInQueue =
true;
// The enum member name of the selected voice is used to look up the voice.
204 var voiceName = Enum.GetName(typeof(TextToSpeechVoice), voice);
// Re-resolve the voice when none is cached or the cached one no longer matches.
207 if ((voiceInfo == null) || (!voiceInfo.DisplayName.Contains(voiceName)))
// First voice whose DisplayName contains the enum name, or null if none does.
210 voiceInfo = SpeechSynthesizer.AllVoices.Where(v => v.DisplayName.Contains(voiceName)).FirstOrDefault();
213 if (voiceInfo != null)
215 synthesizer.Voice = voiceInfo;
219 Debug.LogErrorFormat(
"TTS voice {0} could not be found.", voiceName);
// Start the synthesis and wait for the resulting stream.
225 var speechStream = await speakFunc();
228 var size = speechStream.Size;
// Buffer sized to hold the entire stream.
231 byte[] buffer =
new byte[(int)size];
234 using (var inputStream = speechStream.GetInputStreamAt(0))
// The source stream is disposed here, after inputStream has been obtained from it
// and before the bytes are read through dataReader.
237 speechStream.Dispose();
240 using (var dataReader =
new DataReader(inputStream))
// Load 'size' bytes into the reader, then copy them into buffer.
243 await dataReader.LoadAsync((uint)size);
246 dataReader.ReadBytes(buffer);
// Decode the WAV bytes into float samples plus sample count and frequency.
253 var unityData = ToUnityAudio(buffer, out sampleCount, out frequency);
// Clip creation and assignment are dispatched through InvokeOnAppThread.
256 UnityEngine.WSA.Application.InvokeOnAppThread(() =>
259 var clip = ToClip(
"Speech", unityData, sampleCount, frequency);
262 audioSource.clip = clip;
// Request finished: clear the in-progress flag.
266 speechTextInQueue =
false;
// Failure path (references 'ex'; presumably a catch block - not visible here):
// clear the flag and log the exception message.
272 speechTextInQueue =
false;
273 Debug.LogErrorFormat(
"Speech generation problem: \"{0}\"", ex.Message);
// NOTE(review): presumably the else-branch of the synthesizer null check above.
278 Debug.LogErrorFormat(
"Speech not initialized. \"{0}\"", text);
// Initialization fragment.
// NOTE(review): the enclosing method's header is not visible in this excerpt;
// presumably a Unity lifecycle method such as Awake - confirm.
// Fall back to the AudioSource component on the same GameObject when no
// audio source has been assigned.
287 if (audioSource == null)
289 audioSource = GetComponent<AudioSource>();
// The synthesizer is only created when !UNITY_EDITOR && UNITY_WSA.
291 #if !UNITY_EDITOR && UNITY_WSA 292 synthesizer =
new SpeechSynthesizer();
// Failure path (references 'ex'; presumably a catch block - not visible here).
297 Debug.LogError(
"Could not start Speech Synthesis: " + ex.Message);
// Body fragment that speaks SSML markup; the method header is not visible in
// this excerpt (the member list names it SpeakSsml(string ssml)).
// Null or empty input is ignored.
310 if (
string.IsNullOrEmpty(ssml)) {
return; }
// When !UNITY_EDITOR && UNITY_WSA, hand the markup to PlaySpeech with a delegate
// that calls SynthesizeSsmlToStreamAsync on the synthesizer.
313 #if !UNITY_EDITOR && UNITY_WSA 314 PlaySpeech(ssml, () => synthesizer.SynthesizeSsmlToStreamAsync(ssml));
// NOTE(review): presumably the #else branch - only a warning is logged.
316 Debug.LogWarningFormat(
"Text to Speech not supported in editor.\n\"{0}\"", ssml);
// Body fragment that speaks plain text; the method header is not visible in
// this excerpt (the member list names it StartSpeaking(string text)).
// Null or empty input is ignored.
327 if (
string.IsNullOrEmpty(text)) {
return; }
// When !UNITY_EDITOR && UNITY_WSA, hand the text to PlaySpeech with a delegate
// that calls SynthesizeTextToStreamAsync on the synthesizer.
330 #if !UNITY_EDITOR && UNITY_WSA 331 PlaySpeech(text, ()=> synthesizer.SynthesizeTextToStreamAsync(text));
// NOTE(review): presumably the #else branch - only a warning is logged.
333 Debug.LogWarningFormat(
"Text to Speech not supported in editor.\n\"{0}\"", text);
// Body fragment (header not visible; the member list names it SpeechTextInQueue()).
// When !UNITY_EDITOR && UNITY_WSA, returns the speechTextInQueue flag that
// PlaySpeech sets while a request is being processed.
// NOTE(review): the return value for other build configurations is not visible here.
345 #if !UNITY_EDITOR && UNITY_WSA 346 return speechTextInQueue;
// Body fragment (header not visible; the member list names it IsSpeaking()).
// When an audio source is assigned, report whether it is currently playing.
// NOTE(review): the return value for a null audioSource is not visible here.
360 if (audioSource != null)
362 return audioSource.isPlaying;
bool IsSpeaking()
Returns whether or not the AudioSource is actively playing.
void StartSpeaking(string text)
Speaks the specified text using text-to-speech.
Enables text to speech using the Windows 10 SpeechSynthesizer class.
bool SpeechTextInQueue()
Returns whether text has been submitted and is still being processed by the PlaySpeech method. Handy for avoiding ...
void SpeakSsml(string ssml)
Speaks the specified SSML markup using text-to-speech.
void StopSpeaking()
Stops text-to-speech playback.