AR Design
A UBC EML collaboration with UBC SALA: visualizing IoT data in AR
TextToSpeech.cs
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in the project root for license information.
using System;
using UnityEngine;

#if !UNITY_EDITOR && UNITY_WSA
using Windows.Foundation;
using Windows.Media.SpeechSynthesis;
using Windows.Storage.Streams;
using System.Linq;
using System.Threading.Tasks;
#endif

namespace HoloToolkit.Unity
{
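    /// <summary>
    /// The well-known voices that can be used by TextToSpeech.
    /// </summary>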
    public enum TextToSpeechVoice
    {
        Default,

        David,

        Mark,

        Zira,
    }

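    /// <summary>
    /// Enables text to speech using the Windows 10 SpeechSynthesizer class.
    /// </summary>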
    [RequireComponent(typeof(AudioSource))]
    public class TextToSpeech : MonoBehaviour
    {
        [Tooltip("The audio source where speech will be played.")]
        [SerializeField]
        private AudioSource audioSource;

        public AudioSource AudioSource { get { return audioSource; } set { audioSource = value; } }

        public TextToSpeechVoice Voice { get { return voice; } set { voice = value; } }

        [Tooltip("The voice that will be used to generate speech.")]
        [SerializeField]
        private TextToSpeechVoice voice;

#if !UNITY_EDITOR && UNITY_WSA
        private SpeechSynthesizer synthesizer;
        private VoiceInformation voiceInfo;
        private bool speechTextInQueue = false;
#endif

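        /// <summary>
        /// Converts two bytes of little-endian 16-bit PCM data to a float in the range [-1, 1).
        /// </summary>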
        private static float BytesToFloat(byte firstByte, byte secondByte)
        {
            // Convert two bytes to one short (little endian)
            short s = (short)((secondByte << 8) | firstByte);

            // Convert to range from -1 to (just below) 1
            return s / 32768.0F;
        }

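        /// <summary>
        /// Reads a little-endian 32-bit integer from the byte array at the given offset.
        /// </summary>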
        private static int BytesToInt(byte[] bytes, int offset = 0)
        {
            int value = 0;
            for (int i = 0; i < 4; i++)
            {
                value |= ((int)bytes[offset + i]) << (i * 8);
            }
            return value;
        }

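        /// <summary>
        /// Wraps the converted samples in a single-channel Unity AudioClip.
        /// </summary>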
        private static AudioClip ToClip(string name, float[] audioData, int sampleCount, int frequency)
        {
            var clip = AudioClip.Create(name, sampleCount, 1, frequency, false);
            clip.SetData(audioData, 0);
            return clip;
        }

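        /// <summary>
        /// Converts 16-bit PCM WAV data into Unity audio samples, returning the sample count and
        /// frequency. Only the left channel is kept if the input happens to be stereo.
        /// </summary>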
        private static float[] ToUnityAudio(byte[] wavAudio, out int sampleCount, out int frequency)
        {
            // Determine if mono or stereo
            int channelCount = wavAudio[22]; // Speech audio data is always mono but read actual header value for processing

            // Get the frequency
            frequency = BytesToInt(wavAudio, 24);

            // Get past all the other sub chunks to get to the data subchunk:
            int pos = 12; // First subchunk ID from 12 to 16

            // Keep iterating until we find the "data" chunk (bytes 0x64 0x61 0x74 0x61, i.e. 100 97 116 97 in decimal)
            while (!(wavAudio[pos] == 100 && wavAudio[pos + 1] == 97 && wavAudio[pos + 2] == 116 && wavAudio[pos + 3] == 97))
            {
                pos += 4;
                int chunkSize = wavAudio[pos] + wavAudio[pos + 1] * 256 + wavAudio[pos + 2] * 65536 + wavAudio[pos + 3] * 16777216;
                pos += 4 + chunkSize;
            }
            pos += 8;

            // Pos is now positioned at the start of the actual sound data.
            sampleCount = (wavAudio.Length - pos) / 2;   // 2 bytes per sample (16-bit mono)
            if (channelCount == 2) { sampleCount /= 2; } // 4 bytes per sample (16-bit stereo)

            // Allocate memory (supporting left channel only)
            var unityData = new float[sampleCount];

            // Write to float array:
            int i = 0;
            while (pos < wavAudio.Length)
            {
                unityData[i] = BytesToFloat(wavAudio[pos], wavAudio[pos + 1]);
                pos += 2;
                if (channelCount == 2)
                {
                    pos += 2;
                }
                i++;
            }

            return unityData;
        }

#if !UNITY_EDITOR && UNITY_WSA
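        /// <summary>
        /// Shared helper: runs the supplied synthesis call on a background task, converts the
        /// resulting WAV stream into an AudioClip, and plays it on Unity's application thread.
        /// </summary>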
        private void PlaySpeech(string text, Func<IAsyncOperation<SpeechSynthesisStream>> speakFunc)
        {
            // Make sure there's something to speak
            if (speakFunc == null) throw new ArgumentNullException(nameof(speakFunc));

            if (synthesizer != null)
            {
                try
                {
                    speechTextInQueue = true;
                    // Need await, so most of this will be run as a new Task in its own thread.
                    // This is good since it frees up Unity to keep running anyway.
                    Task.Run(async () =>
                    {
                        // Change voice?
                        if (voice != TextToSpeechVoice.Default)
                        {
                            // Get name
                            var voiceName = Enum.GetName(typeof(TextToSpeechVoice), voice);

                            // See if it's never been found or is changing
                            if ((voiceInfo == null) || (!voiceInfo.DisplayName.Contains(voiceName)))
                            {
                                // Search for voice info
                                voiceInfo = SpeechSynthesizer.AllVoices.Where(v => v.DisplayName.Contains(voiceName)).FirstOrDefault();

                                // If found, select
                                if (voiceInfo != null)
                                {
                                    synthesizer.Voice = voiceInfo;
                                }
                                else
                                {
                                    Debug.LogErrorFormat("TTS voice {0} could not be found.", voiceName);
                                }
                            }
                        }

                        // Speak and get stream
                        var speechStream = await speakFunc();

                        // Get the size of the original stream
                        var size = speechStream.Size;

                        // Create buffer
                        byte[] buffer = new byte[(int)size];

                        // Get the input stream from the speech stream
                        using (var inputStream = speechStream.GetInputStreamAt(0))
                        {
                            // Close the original speech stream to free up memory
                            speechStream.Dispose();

                            // Create a new data reader off the input stream
                            using (var dataReader = new DataReader(inputStream))
                            {
                                // Load all bytes into the reader
                                await dataReader.LoadAsync((uint)size);

                                // Copy from reader into buffer
                                dataReader.ReadBytes(buffer);
                            }
                        }

                        // Convert raw WAV data into Unity audio data
                        int sampleCount = 0;
                        int frequency = 0;
                        var unityData = ToUnityAudio(buffer, out sampleCount, out frequency);

                        // The remainder must be done back on Unity's main thread
                        UnityEngine.WSA.Application.InvokeOnAppThread(() =>
                        {
                            // Convert to an audio clip
                            var clip = ToClip("Speech", unityData, sampleCount, frequency);

                            // Set the clip on the audio source
                            audioSource.clip = clip;

                            // Play audio
                            audioSource.Play();
                            speechTextInQueue = false;
                        }, false);
                    });
                }
                catch (Exception ex)
                {
                    speechTextInQueue = false;
                    Debug.LogErrorFormat("Speech generation problem: \"{0}\"", ex.Message);
                }
            }
            else
            {
                Debug.LogErrorFormat("Speech not initialized. \"{0}\"", text);
            }
        }
#endif

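        /// <summary>
        /// Caches the AudioSource and, on UWP builds, creates the SpeechSynthesizer.
        /// </summary>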
        private void Awake()
        {
            try
            {
                if (audioSource == null)
                {
                    audioSource = GetComponent<AudioSource>();
                }
#if !UNITY_EDITOR && UNITY_WSA
                synthesizer = new SpeechSynthesizer();
#endif
            }
            catch (Exception ex)
            {
                Debug.LogError("Could not start Speech Synthesis: " + ex.Message);
            }
        }

        // Public Methods

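        /// <summary>
        /// Speaks the specified SSML markup using text-to-speech.
        /// </summary>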
        public void SpeakSsml(string ssml)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(ssml)) { return; }

            // Pass to helper method
#if !UNITY_EDITOR && UNITY_WSA
            PlaySpeech(ssml, () => synthesizer.SynthesizeSsmlToStreamAsync(ssml));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", ssml);
#endif
        }

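        /// <summary>
        /// Speaks the specified text using text-to-speech.
        /// </summary>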
        public void StartSpeaking(string text)
        {
            // Make sure there's something to speak
            if (string.IsNullOrEmpty(text)) { return; }

            // Pass to helper method
#if !UNITY_EDITOR && UNITY_WSA
            PlaySpeech(text, () => synthesizer.SynthesizeTextToStreamAsync(text));
#else
            Debug.LogWarningFormat("Text to Speech not supported in editor.\n\"{0}\"", text);
#endif
        }

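        /// <summary>
        /// Returns whether a text is currently submitted and being processed by the PlaySpeech method.
        /// </summary>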
        public bool SpeechTextInQueue()
        {
#if !UNITY_EDITOR && UNITY_WSA
            return speechTextInQueue;
#else
            return false;
#endif
        }

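        /// <summary>
        /// Returns whether or not the AudioSource is actively playing.
        /// </summary>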
        public bool IsSpeaking()
        {
            if (audioSource != null)
            {
                return audioSource.isPlaying;
            }

            return false;
        }

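        /// <summary>
        /// Stops text-to-speech playback.
        /// </summary>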
        public void StopSpeaking()
        {
            if (IsSpeaking())
            {
                audioSource.Stop();
            }
        }
    }
}
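Usage sketch (not part of the original file): a minimal companion component that drives TextToSpeech from another script, for example to announce an IoT sensor reading. The SpeechDemo class name and the spoken strings are illustrative assumptions; the calls themselves (StartSpeaking, SpeechTextInQueue, IsSpeaking) are the public API defined above. On a UWP/HoloLens build the text is synthesized and played through the attached AudioSource; in the Unity editor the component only logs a warning.

using HoloToolkit.Unity;
using UnityEngine;

// Hypothetical example component: announces a reading when the user presses Space.
public class SpeechDemo : MonoBehaviour
{
    [Tooltip("The TextToSpeech component to drive (e.g. on the same GameObject).")]
    [SerializeField]
    private TextToSpeech textToSpeech;

    private void Awake()
    {
        if (textToSpeech == null)
        {
            textToSpeech = GetComponent<TextToSpeech>();
        }
    }

    private void Update()
    {
        // Guard against overlapping speech: skip if speech is queued or already playing.
        if (Input.GetKeyDown(KeyCode.Space) &&
            !textToSpeech.SpeechTextInQueue() &&
            !textToSpeech.IsSpeaking())
        {
            // Placeholder sentence; in the AR app this would be built from live sensor data.
            textToSpeech.StartSpeaking("Temperature is twenty one degrees.");
        }
    }
}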