using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Net.WebSockets;
using System.Security.Cryptography;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using UnityEngine;
using Microphone = FrostweepGames.MicrophonePro.Microphone;

/// <summary>
/// Streaming speech-to-text client for the iFlytek (XunFei) real-time ASR WebSocket service.
/// Records from the default microphone at 16 kHz, streams 16-bit PCM chunks over a
/// <see cref="ClientWebSocket"/>, and reports every final sentence through <see cref="asrCallback"/>.
/// </summary>
public class XunFeiSTTRealtime : STT
{
    // appid / appkey of the application created in the iFlytek console.
    // NOTE(review): these credentials are committed to source as serialized defaults.
    // They are kept so existing scenes keep working, but they should be rotated and
    // supplied from configuration that is not checked in.
    [SerializeField] private string appid = "f2433640";
    [SerializeField] private string appkey = "6fde2501ca9c018c392bdffb19757d8d";

    // Intermediate values of the last signature computation (written by GetUri).
    private string timeStamp;
    private string baseString;
    private string toMd5;
    private string signa;

    private AudioClip RecordedClip;
    private ClientWebSocket ws;
    private CancellationToken ct;

    // Handle of the running SendAudioClip coroutine; needed to actually stop it.
    private Coroutine sendAudioCoroutine;

    // Last audio SendAsync that may still be in flight (ClientWebSocket allows one send at a time).
    private Task pendingSend;

    // True while this component has the microphone recording.
    private bool isCapturing;

    // Maximum recording length, in seconds.
    private int MAX_RECORD_LENGTH = 3599;

    // Size of the scratch buffer used for each ReceiveAsync call.
    private const int ReceiveBufferSize = 4096;

    /// <summary>
    /// Speech recognition callback; invoked with the text of each final sentence.
    /// </summary>
    public Action<string> asrCallback;

    /// <summary>
    /// Registers the result callback and starts a recognition session.
    /// </summary>
    /// <param name="_callback">Receives the text of every final sentence.</param>
    public override void StartSpeechToText(Action<string> _callback)
    {
        base.StartSpeechToText(_callback);
        asrCallback = _callback;
        StartASR();
    }

    /// <summary>
    /// Clears the result callback and stops the current recognition session.
    /// </summary>
    /// <param name="_callback">Forwarded to the base implementation.</param>
    public override void StopSpeechToText(Action<string> _callback)
    {
        base.StopSpeechToText(_callback);
        asrCallback = null;
        StopASR();
    }

    /// <summary>
    /// Opens the WebSocket connection and starts recording from the default microphone.
    /// Does nothing (with a warning) while a previous session is still open or connecting.
    /// </summary>
    public void StartASR()
    {
        if (ws != null && (ws.State == WebSocketState.Open || ws.State == WebSocketState.Connecting))
        {
            Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
            return;
        }

        if (Microphone.devices.Length == 0)
        {
            Debug.LogError("未检测到可用的麦克风");
            return;
        }

        // A session that was closed by the server may have left a sender coroutine
        // or an active recording behind; clear them before starting over.
        StopCapture();

        ConnectASR_Aysnc();
        RecordedClip = Microphone.Start(null, false, MAX_RECORD_LENGTH, 16000);
        isCapturing = true;
    }

    /// <summary>
    /// Stops streaming and recording, sends the end-of-audio marker, closes the
    /// WebSocket and releases it. Safe to call repeatedly.
    /// </summary>
    public async void StopASR()
    {
        // Always stop the sender and the microphone, even if the server already closed the socket.
        StopCapture();

        // Work on a local reference and detach the field right away: repeated calls become
        // no-ops, and a session started while we await below cannot be disposed by our cleanup.
        ClientWebSocket socket = ws;
        if (socket == null)
        {
            return;
        }
        ws = null;

        Task lastAudioSend = pendingSend;
        pendingSend = null;

        try
        {
            // Let the last audio chunk finish; overlapping sends are not supported.
            if (lastAudioSend != null)
            {
                await lastAudioSend;
            }

            // Only send the end marker while the connection is open.
            if (socket.State == WebSocketState.Open)
            {
                await socket.SendAsync(
                    new ArraySegment<byte>(Encoding.UTF8.GetBytes("{\"end\": true}")),
                    WebSocketMessageType.Text,
                    true,
                    CancellationToken.None);
            }

            // Wait for the close handshake to complete.
            if (socket.State == WebSocketState.Open || socket.State == WebSocketState.CloseSent)
            {
                await socket.CloseAsync(
                    WebSocketCloseStatus.NormalClosure,
                    "Closing normally",
                    CancellationToken.None);
            }
        }
        catch (ObjectDisposedException)
        {
            // The socket is already being torn down; nothing left to do.
            Debug.Log("WebSocket already disposed");
        }
        catch (Exception ex)
        {
            Debug.LogError($"StopASR error: {ex.Message}");
        }
        finally
        {
            // Force termination if the handshake did not finish, then release the socket.
            if (socket.State != WebSocketState.Closed && socket.State != WebSocketState.Aborted)
            {
                socket.Abort();
            }
            socket.Dispose();
        }
    }

    /// <summary>
    /// Stops the audio sender coroutine and ends the microphone recording started by this component.
    /// </summary>
    private void StopCapture()
    {
        if (sendAudioCoroutine != null)
        {
            // StopCoroutine needs the handle returned by StartCoroutine; passing a fresh
            // SendAudioClip() enumerator would not stop the running instance.
            StopCoroutine(sendAudioCoroutine);
            sendAudioCoroutine = null;
        }

        if (isCapturing)
        {
            isCapturing = false;
            Microphone.End(null);
        }
    }

    /// <summary>
    /// Connects to the service, starts the audio sender and dispatches incoming
    /// messages until the connection is no longer open.
    /// </summary>
    async void ConnectASR_Aysnc()
    {
        // Release a socket left over from a session that ended without StopASR.
        ClientWebSocket stale = ws;
        if (stale != null)
        {
            stale.Dispose();
        }

        ClientWebSocket socket = new ClientWebSocket();
        ws = socket;
        ct = CancellationToken.None;

        try
        {
            // Inside the try block: an exception escaping an async void method cannot be caught by the caller.
            await socket.ConnectAsync(GetUri(), ct);

            // StopASR may have run while we were connecting.
            if (!ReferenceEquals(ws, socket) || socket.State != WebSocketState.Open)
            {
                return;
            }

            sendAudioCoroutine = StartCoroutine(SendAudioClip());

            byte[] buffer = new byte[ReceiveBufferSize];
            using (MemoryStream message = new MemoryStream())
            {
                while (socket.State == WebSocketState.Open)
                {
                    message.SetLength(0);

                    // A message may arrive in several frames; collect them until EndOfMessage.
                    WebSocketReceiveResult received;
                    do
                    {
                        received = await socket.ReceiveAsync(new ArraySegment<byte>(buffer), ct);
                        if (received.MessageType == WebSocketMessageType.Close)
                        {
                            return;
                        }
                        message.Write(buffer, 0, received.Count);
                    }
                    while (!received.EndOfMessage);

                    string str = Encoding.UTF8.GetString(message.GetBuffer(), 0, (int)message.Length);
                    if (string.IsNullOrEmpty(str))
                    {
                        continue;
                    }

                    ReceiveJsonData receiveJsonData = JsonUtility.FromJson<ReceiveJsonData>(str);

                    // switch on a string tolerates a missing (null) action.
                    switch (receiveJsonData.action)
                    {
                        case "started":
                            Debug.Log("握手成功!");
                            break;
                        case "result":
                            AnalysisResult(receiveJsonData.data);
                            break;
                        case "error":
                            Debug.Log("Error: " + receiveJsonData.desc);
                            break;
                    }
                }
            }
        }
        catch (Exception ex)
        {
            Debug.Log(ex.Message);
        }
    }

    /// <summary>
    /// Coroutine that streams newly recorded samples to the service every 40 ms.
    /// </summary>
    /// <returns>The coroutine enumerator.</returns>
    IEnumerator SendAudioClip()
    {
        // Bind to the socket of this session so a later session cannot be affected.
        ClientWebSocket socket = ws;
        if (socket == null)
        {
            yield break;
        }
        pendingSend = null;

        // Wait until the microphone has produced its first samples.
        yield return new WaitWhile(() => Microphone.GetPosition(null) <= 0);

        AudioClip clip = RecordedClip;
        if (clip == null)
        {
            yield break;
        }

        const float waitTime = 0.04f; // send audio every 40 ms
        const int Maxlength = 1280;   // at most 1280 samples (2560 bytes of 16-bit PCM) per chunk

        float t = 0;
        int position = Microphone.GetPosition(null);
        int lastPosition = 0;

        while (position < clip.samples && socket.State == WebSocketState.Open)
        {
            t += waitTime;
            if (t >= MAX_RECORD_LENGTH)
            {
                Debug.Log("录音时长已用尽,结束语音识别!");
                break;
            }

            yield return new WaitForSecondsRealtime(waitTime);

            // The connection may have gone away while we were waiting.
            if (socket.State != WebSocketState.Open)
            {
                break;
            }

            if (Microphone.IsRecording(null))
            {
                position = Microphone.GetPosition(null);
            }

            // No new samples: a zero-length AudioClip.GetData call would raise an error.
            if (position <= lastPosition)
            {
                continue;
            }

            if (pendingSend != null)
            {
                // Only one send may be outstanding; the backlog is caught up on later ticks.
                if (!pendingSend.IsCompleted)
                {
                    continue;
                }

                if (pendingSend.IsFaulted)
                {
                    Debug.Log(pendingSend.Exception.GetBaseException().Message);
                    pendingSend = null;
                    break;
                }

                pendingSend = null;
            }

            int length = Math.Min(position - lastPosition, Maxlength);
            byte[] data = GetAudioClip(lastPosition, length, clip);
            try
            {
                pendingSend = socket.SendAsync(
                    new ArraySegment<byte>(data),
                    WebSocketMessageType.Binary,
                    true,
                    CancellationToken.None);
            }
            catch (Exception ex)
            {
                Debug.Log(ex.Message);
            }

            lastPosition += length;
        }
    }

    private void OnApplicationQuit()
    {
        StopASR();
    }

    /// <summary>
    /// Parses one result payload and, when it is a final result (type "0"),
    /// passes the concatenated sentence to <see cref="asrCallback"/>.
    /// </summary>
    /// <param name="data">JSON string from the <c>data</c> field of a "result" message.</param>
    void AnalysisResult(string data)
    {
        if (string.IsNullOrEmpty(data))
        {
            return;
        }

        Data result = JsonUtility.FromJson<Data>(data);
        var sentence = result.cn.st;
        if (sentence.rt == null || sentence.rt.Length == 0 || sentence.rt[0].ws == null)
        {
            return;
        }

        // Only the "w" field of the first candidate of each word is used.
        StringBuilder stringBuilder = new StringBuilder();
        foreach (var word in sentence.rt[0].ws)
        {
            if (word != null && word.cw != null && word.cw.Length > 0 && word.cw[0] != null)
            {
                stringBuilder.Append(word.cw[0].w);
            }
        }

        // type: "0" = final result (sentence finished); "1" = intermediate result that a
        // later message may still revise. Intermediate results are ignored.
        if (sentence.type == "0")
        {
            asrCallback?.Invoke(stringBuilder.ToString());
        }
    }

    /// <summary>
    /// Extracts a slice of an audio clip as 16-bit little-endian PCM.
    /// </summary>
    /// <param name="start">Offset of the first sample to read.</param>
    /// <param name="length">Number of samples to read.</param>
    /// <param name="recordedClip">Clip to read from.</param>
    /// <returns>Two bytes per sample; an empty array when <paramref name="length"/> is not positive.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="recordedClip"/> is null.</exception>
    public static byte[] GetAudioClip(int start, int length, AudioClip recordedClip)
    {
        if (recordedClip == null)
        {
            throw new ArgumentNullException(nameof(recordedClip));
        }

        if (length <= 0)
        {
            return new byte[0];
        }

        float[] samples = new float[length];
        recordedClip.GetData(samples, start);

        const int rescaleFactor = 32767;
        byte[] pcm = new byte[samples.Length * 2];
        for (int i = 0; i < samples.Length; i++)
        {
            // Clamp first: a sample outside [-1, 1] would overflow the cast to short.
            short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * rescaleFactor);

            // Low byte first, independent of host endianness.
            pcm[i * 2] = (byte)(value & 0xFF);
            pcm[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }

        return pcm;
    }

    /// <summary>
    /// Builds the signed request URI.
    /// </summary>
    /// <returns>The URI to connect to.</returns>
    private Uri GetUri()
    {
        // Second precision.
        timeStamp = GetTimeStamp();

        // baseString is appid followed by the timestamp.
        baseString = appid + timeStamp;
        toMd5 = ToMD5(baseString);

        // HMAC-SHA1 over the MD5 hex string with appkey as the key, then Base64.
        signa = ToHmacSHA1(toMd5, appkey);

        // Plain concatenation: the base URL must not be interpreted as a format string.
        // NOTE(review): m_SpeechRecognizeURL is declared outside this file; it is presumably
        // expected to end with '?' — confirm.
        string requestUrl = m_SpeechRecognizeURL
            + "appid=" + appid
            + "&ts=" + timeStamp
            + "&signa=" + UrlEncode(signa)
            + "&pd=tech";

        return new Uri(requestUrl);
    }

    #region Hashing and encoding helpers

    /// <summary>
    /// Percent-encodes every UTF-8 byte of a string.
    /// </summary>
    /// <param name="str">String to encode.</param>
    /// <returns>The string with each byte written as <c>%xx</c> (two lowercase hex digits).</returns>
    public static string UrlEncode(string str)
    {
        byte[] bytes = Encoding.UTF8.GetBytes(str);
        StringBuilder sb = new StringBuilder(bytes.Length * 3);
        for (int i = 0; i < bytes.Length; i++)
        {
            // "x2" keeps the leading zero for bytes below 0x10.
            sb.Append('%').Append(bytes[i].ToString("x2"));
        }

        return sb.ToString();
    }

    /// <summary>
    /// Gets the current Unix timestamp.
    /// </summary>
    /// <returns>Seconds since 1970-01-01 00:00:00 UTC, as a decimal string.</returns>
    public static string GetTimeStamp()
    {
        TimeSpan ts = DateTime.UtcNow - new DateTime(1970, 1, 1, 0, 0, 0, 0, DateTimeKind.Utc);
        return Convert.ToInt64(ts.TotalSeconds).ToString();
    }

    /// <summary>
    /// Computes the MD5 hash of a string.
    /// </summary>
    /// <param name="txt">String to hash (encoded as UTF-8).</param>
    /// <returns>The hash as 32 lowercase hex characters.</returns>
    public static string ToMD5(string txt)
    {
        using (MD5 md5 = MD5.Create())
        {
            // UTF-8 rather than Encoding.Default, which differs between platforms.
            byte[] hash = md5.ComputeHash(Encoding.UTF8.GetBytes(txt));
            StringBuilder sb = new StringBuilder(hash.Length * 2);
            for (int i = 0; i < hash.Length; i++)
            {
                sb.Append(hash[i].ToString("x2"));
            }

            return sb.ToString();
        }
    }

    /// <summary>
    /// Computes an HMAC-SHA1 signature and returns it Base64-encoded.
    /// </summary>
    /// <param name="text">Text to sign.</param>
    /// <param name="key">Secret key.</param>
    /// <returns>The Base64-encoded signature.</returns>
    public static string ToHmacSHA1(string text, string key)
    {
        using (HMACSHA1 hmacsha1 = new HMACSHA1(Encoding.UTF8.GetBytes(key)))
        {
            byte[] hashBytes = hmacsha1.ComputeHash(Encoding.UTF8.GetBytes(text));
            return Convert.ToBase64String(hashBytes);
        }
    }

    #endregion
}

/// <summary>
/// Envelope of every message received from the service.
/// </summary>
[Serializable]
public struct ReceiveJsonData
{
    /// <summary>
    /// Message kind: "started" (handshake), "result" (recognition result), "error" (failure).
    /// </summary>
    public string action;

    /// <summary>
    /// Result code (see the service's error code list).
    /// </summary>
    public string code;

    /// <summary>
    /// Transcription result data.
    /// </summary>
    public string data;

    /// <summary>
    /// Description.
    /// </summary>
    public string desc;

    /// <summary>
    /// Session ID. Mainly for debugging: provide the sid when reporting a problem.
    /// </summary>
    public string sid;
}

/// <summary>
/// Recognition result payload. Example (abbreviated):
/// <code>
/// {
///   "seg_id": 7,
///   "cn": {
///     "st": {
///       "rt": [
///         { "ws": [
///             { "cw": [ { "w": "我们", "wp": "n" } ], "wb": 23, "we": 70 },
///             { "cw": [ { "w": "生活", "wp": "n" } ], "wb": 71, "we": 118 }
///         ] }
///       ],
///       "bg": "5120",
///       "type": "0",
///       "ed": "8520"
///     }
///   },
///   "ls": false
/// }
/// </code>
/// </summary>
[Serializable]
public struct Data
{
    /// <summary>
    /// Sequence number of the transcription result, starting at 0.
    /// </summary>
    public string seg_id;

    [Serializable]
    public struct CN
    {
        [Serializable]
        public struct ST
        {
            [Serializable]
            public struct RT
            {
                [Serializable]
                public class WS
                {
                    [Serializable]
                    public class CW
                    {
                        /// <summary>
                        /// Recognized word.
                        /// </summary>
                        public string w;

                        /// <summary>
                        /// Word type: n = normal word; s = smoothing (filler) word; p = punctuation.
                        /// </summary>
                        public string wp;
                    }

                    [SerializeField] public CW[] cw;

                    /// <summary>
                    /// Start of the word within the sentence, in frames (1 frame = 10 ms);
                    /// start within the whole audio is (bg + wb * 10) ms. 0 for intermediate results.
                    /// </summary>
                    public string wb;

                    /// <summary>
                    /// End of the word within the sentence, in frames (1 frame = 10 ms);
                    /// end within the whole audio is (bg + we * 10) ms. 0 for intermediate results.
                    /// </summary>
                    public string we;
                }

                [SerializeField] public WS[] ws;
            }

            [SerializeField] public RT[] rt;

            /// <summary>
            /// Start of the sentence within the whole audio, in milliseconds.
            /// Accurate even for intermediate results.
            /// </summary>
            public string bg;

            /// <summary>
            /// Result type: 0 = final result; 1 = intermediate result.
            /// </summary>
            public string type;

            /// <summary>
            /// End of the sentence within the whole audio, in milliseconds.
            /// 0 for intermediate results.
            /// </summary>
            public string ed;
        }

        [SerializeField] public ST st;
    }

    [SerializeField] public CN cn;

    /// <summary>
    /// Not described in the original source; appears as a boolean in the sample payload.
    /// NOTE(review): presumably a "last segment" flag — confirm against the service documentation.
    /// </summary>
    public string ls;
}