AR Design
A UBC EML collaboration with UBC SALA: visualizing IoT data in AR
TextToSpeech.cs
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in the project root for license information.
using System;
using UnityEngine;

#if !UNITY_EDITOR && UNITY_WSA
using Windows.Foundation;
using Windows.Media.SpeechSynthesis;
using Windows.Storage.Streams;
using System.Linq;
using System.Threading.Tasks;
#endif

namespace HoloToolkit.Unity
{
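    /// <summary>
    /// The well-known voices that can be used by TextToSpeech.
    /// </summary>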
    public enum TextToSpeechVoice
    {
        Default,

        David,

        Mark,

        Zira,
    }

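    /// <summary>
    /// Enables text to speech using the Windows 10 SpeechSynthesizer class.
    /// </summary>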
    [RequireComponent(typeof(AudioSource))]
    public class TextToSpeech : MonoBehaviour
    {
        [Tooltip("The audio source where speech will be played.")]
        [SerializeField]
        private AudioSource audioSource;

        public AudioSource AudioSource { get { return audioSource; } set { audioSource = value; } }

        public TextToSpeechVoice Voice { get { return voice; } set { voice = value; } }

        [Tooltip("The voice that will be used to generate speech.")]
        [SerializeField]
        private TextToSpeechVoice voice;

#if !UNITY_EDITOR && UNITY_WSA
        private SpeechSynthesizer synthesizer;
        private VoiceInformation voiceInfo;
        private bool speechTextInQueue = false;
#endif

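        /// <summary>
        /// Converts two bytes of little-endian 16-bit PCM data to a float in the range [-1, 1).
        /// </summary>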
        private static float BytesToFloat(byte firstByte, byte secondByte)
        {
            // Convert two bytes to one short (little endian)
            short s = (short)((secondByte << 8) | firstByte);

            // Convert to range from -1 to (just below) 1
            return s / 32768.0F;
        }

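        /// <summary>
        /// Reads a little-endian 32-bit integer from the byte array at the given offset.
        /// </summary>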
        private static int BytesToInt(byte[] bytes, int offset = 0)
        {
            int value = 0;
            for (int i = 0; i < 4; i++)
            {
                value |= ((int)bytes[offset + i]) << (i * 8);
            }
            return value;
        }

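        /// <summary>
        /// Wraps the converted samples in a single-channel Unity AudioClip.
        /// </summary>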
        private static AudioClip ToClip(string name, float[] audioData, int sampleCount, int frequency)
        {
            var clip = AudioClip.Create(name, sampleCount, 1, frequency, false);
            clip.SetData(audioData, 0);
            return clip;
        }

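        /// <summary>
        /// Converts 16-bit PCM WAV data into Unity audio samples, returning the sample count and
        /// frequency. Only the left channel is kept if the input happens to be stereo.
        /// </summary>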
        private static float[] ToUnityAudio(byte[] wavAudio, out int sampleCount, out int frequency)
        {
            // Determine if mono or stereo
            int channelCount = wavAudio[22]; // Speech audio data is always mono but read actual header value for processing

            // Get the frequency
            frequency = BytesToInt(wavAudio, 24);

            // Get past all the other sub chunks to get to the data subchunk:
            int pos = 12; // First subchunk ID from 12 to 16

            // Keep iterating until we find the "data" chunk (bytes 0x64 0x61 0x74 0x61, i.e. 100 97 116 97 in decimal)
            while (!(wavAudio[pos] == 100 && wavAudio[pos + 1] == 97 && wavAudio[pos + 2] == 116 && wavAudio[pos + 3] == 97))
            {
                pos += 4;
                int chunkSize = wavAudio[pos] + wavAudio[pos + 1] * 256 + wavAudio[pos + 2] * 65536 + wavAudio[pos + 3] * 16777216;
                pos += 4 + chunkSize;
            }
            pos += 8;

            // Pos is now positioned at the start of the actual sound data.
            sampleCount = (wavAudio.Length - pos) / 2;   // 2 bytes per sample (16-bit mono)
            if (channelCount == 2) { sampleCount /= 2; } // 4 bytes per sample (16-bit stereo)

            // Allocate memory (supporting left channel only)
            var unityData = new float[sampleCount];

            // Write to float array:
            int i = 0;
            while (pos < wavAudio.Length)
            {
                unityData[i] = BytesToFloat(wavAudio[pos], wavAudio[pos + 1]);
                pos += 2;
                if (channelCount == 2)
                {
                    pos += 2;
                }
                i++;
            }

            return unityData;
        }

#if !UNITY_EDITOR && UNITY_WSA
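        /// <summary>
        /// Shared helper: runs the supplied synthesis call on a background task, converts the
        /// resulting WAV stream into an AudioClip, and plays it on Unity's application thread.
        /// </summary>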
        private void PlaySpeech(string text, Func<IAsyncOperation<SpeechSynthesisStream>> speakFunc)
        {
            // Make sure there's something to speak
            if (speakFunc == null) throw new ArgumentNullException(nameof(speakFunc));

            if (synthesizer != null)
            {
                try
                {
                    speechTextInQueue = true;
                    // Need await, so most of this will be run as a new Task in its own thread.
                    // This is good since it frees up Unity to keep running anyway.
                    Task.Run(async () =>
                    {
                        // Change voice?
                        if (voice != TextToSpeechVoice.Default)
                        {
                            // Get name
                            var voiceName = Enum.GetName(typeof(TextToSpeechVoice), voice);

                            // See if it's never been found or is changing
                            if ((voiceInfo == null) || (!voiceInfo.DisplayName.Contains(voiceName)))
                            {
                                // Search for voice info
                                voiceInfo = SpeechSynthesizer.AllVoices.Where(v => v.DisplayName.Contains(voiceName)).FirstOrDefault();

                                // If found, select
                                if (voiceInfo != null)
                                {
                                    synthesizer.Voice = voiceInfo;
                                }
                                else
                                {
                                    Debug.LogErrorFormat("TTS voice {0} could not be found.", voiceName);
                                }
                            }
                        }

                        // Speak and get stream
                        var speechStream = await speakFunc();

                        // Get the size of the original stream
                        var size = speechStream.Size;

                        // Create buffer
                        byte[] buffer = new byte[(int)size];

                        // Get the input stream from the speech stream
                        using (var inputStream = speechStream.GetInputStreamAt(0))
                        {
                            // Close the original speech stream to free up memory
                            speechStream.Dispose();

                            // Create a new data reader off the input stream
                            using (var dataReader = new DataReader(inputStream))
                            {
                                // Load all bytes into the reader
                                await dataReader.LoadAsync((uint)size);

                                // Copy from reader into buffer
                                dataReader.ReadBytes(buffer);
                            }
                        }

                        // Convert raw WAV data into Unity audio data
                        int sampleCount = 0;
                        int frequency = 0;
                        var unityData = ToUnityAudio(buffer, out sampleCount, out frequency);

                        // The remainder must be done back on Unity's main thread
                        UnityEngine.WSA.Application.InvokeOnAppThread(() =>
                        {
                            // Convert to an audio clip
                            var clip = ToClip("Speech", unityData, sampleCount, frequency);

                            // Set the clip on the audio source
                            audioSource.clip = clip;

                            // Play audio
                            audioSource.Play();
                            speechTextInQueue = false;
                        }, false);
                    });
                }
                catch (Exception ex)
                {
                    speechTextInQueue = false;
                    Debug.LogErrorFormat("Speech generation problem: \"{0}\"", ex.Message);
                }
            }
            else
            {
                Debug.LogErrorFormat("Speech not initialized. \"{0}\"", text);
            }
        }
#endif

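        /// <summary>
        /// Caches the AudioSource and, on UWP builds, creates the SpeechSynthesizer.
        /// </summary>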
        private void Awake()
        {
            try
            {
                if (audioSource == null)
                {
                    audioSource = GetComponent<AudioSource>();
                }
#if !UNITY_EDITOR && UNITY_WSA
                synthesizer = new SpeechSynthesizer();
#endif
            }
            catch (Exception ex)
            {
                Debug.LogError("Could not start Speech Synthesis: " + ex.Message);
            }
        }

        // Public Methods

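        /// <summary>
        /// Speaks the specified SSML markup using text-to-speech.
        /// </summary>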
        public void SpeakSsml(string ssml)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(ssml)) { return; }

            // Pass to helper method
#if !UNITY_EDITOR && UNITY_WSA
            PlaySpeech(ssml, () => synthesizer.SynthesizeSsmlToStreamAsync(ssml));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", ssml);
#endif
        }

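        /// <summary>
        /// Speaks the specified text using text-to-speech.
        /// </summary>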
        public void StartSpeaking(string text)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(text)) { return; }

            // Pass to helper method
#if !UNITY_EDITOR && UNITY_WSA
            PlaySpeech(text, () => synthesizer.SynthesizeTextToStreamAsync(text));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", text);
#endif
        }

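        /// <summary>
        /// Returns whether a text is currently submitted and being processed by the PlaySpeech method.
        /// </summary>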
        public bool SpeechTextInQueue()
        {
#if !UNITY_EDITOR && UNITY_WSA
            return speechTextInQueue;
#else
            return false;
#endif
        }

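        /// <summary>
        /// Returns whether or not the AudioSource is actively playing.
        /// </summary>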
        public bool IsSpeaking()
        {
            if (audioSource != null)
            {
                return audioSource.isPlaying;
            }

            return false;
        }

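        /// <summary>
        /// Stops text-to-speech playback.
        /// </summary>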
        public void StopSpeaking()
        {
            if (IsSpeaking())
            {
                audioSource.Stop();
            }
        }
    }
}
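Usage sketch (not part of the original file): a minimal companion component that drives TextToSpeech from another script, for example to announce an IoT sensor reading. The SpeechDemo class name and the spoken strings are illustrative assumptions; the calls themselves (StartSpeaking, SpeechTextInQueue, IsSpeaking) are the public API defined above. On a UWP/HoloLens build the text is synthesized and played through the attached AudioSource; in the Unity editor the component only logs a warning.

using HoloToolkit.Unity;
using UnityEngine;

// Hypothetical example component: announces a reading when the user presses Space.
public class SpeechDemo : MonoBehaviour
{
    [Tooltip("The TextToSpeech component to drive (e.g. on the same GameObject).")]
    [SerializeField]
    private TextToSpeech textToSpeech;

    private void Awake()
    {
        if (textToSpeech == null)
        {
            textToSpeech = GetComponent<TextToSpeech>();
        }
    }

    private void Update()
    {
        // Guard against overlapping speech: skip if speech is queued or already playing.
        if (Input.GetKeyDown(KeyCode.Space) &&
            !textToSpeech.SpeechTextInQueue() &&
            !textToSpeech.IsSpeaking())
        {
            // Placeholder sentence; in the AR app this would be built from live sensor data.
            textToSpeech.StartSpeaking("Temperature is twenty one degrees.");
        }
    }
}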