2026-03-24 11:39:01 +08:00

518 lines
15 KiB
C#
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using System;
using System.Text;
using System.Security.Cryptography;
using Microphone = FrostweepGames.MicrophonePro.Microphone;
using System.Net.WebSockets;
using System.Threading;
public class XunFeiSTTRealtime : STT
{
// appid / appkey of the application created in the iFlytek console.
// NOTE(review): these credentials are committed in source as serialized defaults;
// consider supplying them through the Inspector or external configuration and rotating the key.
[SerializeField] private string appid = "f2433640";
[SerializeField] private string appkey = "6fde2501ca9c018c392bdffb19757d8d";
// Intermediate values of the request signature; all four are (re)computed by GetUri().
private string timeStamp;
private string baseString;
private string toMd5;
private string signa;
// Clip returned by Microphone.Start; the audio sender reads samples from it.
private AudioClip RecordedClip;
// WebSocket of the current recognition session (null when no session exists).
private ClientWebSocket ws;
// Token passed to the connect/receive calls; it is never cancelled in this class.
private CancellationToken ct;
// Maximum recording length, passed as the length argument of Microphone.Start
// (presumably seconds - confirm against the microphone plugin).
private int MAX_RECORD_LENGTH = 3599;
/// <summary>
/// Callback invoked with recognized text.
/// </summary>
public Action<string> asrCallback;
/// <summary>
/// Starts a speech-to-text session: runs the base-class logic, stores the
/// result callback and then starts recognition.
/// </summary>
/// <param name="_callback">Stored in <see cref="asrCallback"/> and forwarded to the base class.</param>
public override void StartSpeechToText(Action<string> _callback)
{
    base.StartSpeechToText(_callback);

    this.asrCallback = _callback;
    this.StartASR();
}
/// <summary>
/// Stops the speech-to-text session: runs the base-class logic, clears the
/// stored result callback and then shuts recognition down.
/// </summary>
/// <param name="_callback">Forwarded to the base class only.</param>
public override void StopSpeechToText(Action<string> _callback)
{
    base.StopSpeechToText(_callback);

    // The callback is dropped before shutdown begins.
    this.asrCallback = null;
    this.StopASR();
}
/// <summary>
/// Starts a recognition session: opens the WebSocket connection and starts
/// recording from the default microphone at 16 kHz.
/// Does nothing (apart from logging) when a session is already running or
/// when no microphone is available.
/// </summary>
public void StartASR()
{
    // A socket that is still connecting belongs to a session in progress as well;
    // replacing it here would leak that socket and start a second receive loop.
    if (ws != null &&
        (ws.State == WebSocketState.Open || ws.State == WebSocketState.Connecting))
    {
        Debug.LogWarning("开始语音识别失败!,等待上次识别连接结束");
        return;
    }

    string[] devices = Microphone.devices;
    if (devices == null || devices.Length == 0)
    {
        Debug.LogError("未检测到可用的麦克风");
        return;
    }

    // The connect call returns at its first await, so recording starts while
    // the handshake is still in flight.
    ConnectASR_Aysnc();
    RecordedClip = Microphone.Start(null, false, MAX_RECORD_LENGTH, 16000);
}
/// <summary>
/// Ends the current recognition session: stops the microphone, sends the
/// end-of-audio flag, closes the WebSocket and releases it.
/// Safe to call when no session exists.
/// </summary>
public async void StopASR()
{
    // Stop capturing first, even when the socket is already gone (failed
    // connection, server-side close); otherwise the microphone keeps recording.
    if (Microphone.IsRecording(null))
    {
        Microphone.End(null);
    }

    // Work on a local reference: the field can change while this method is
    // suspended at an await (for example when a new session is started).
    ClientWebSocket socket = ws;

    // Nothing to close, or already closed.
    if (socket == null ||
        socket.State == WebSocketState.Closed ||
        socket.State == WebSocketState.Aborted)
    {
        return;
    }

    try
    {
        // The former StopCoroutine(SendAudioClip()) call was removed: it created a
        // brand-new enumerator, which Unity cannot match to the running coroutine,
        // so it never stopped anything. The send loop checks the socket state on
        // every tick and ends once the socket is no longer Open.

        // The end flag is only sent while the connection is open.
        // NOTE(review): the flag is sent as a Text frame, as the original author chose;
        // verify against the service documentation which frame type it expects.
        if (socket.State == WebSocketState.Open)
        {
            await socket.SendAsync(
                new ArraySegment<byte>(Encoding.UTF8.GetBytes("{\"end\": true}")),
                WebSocketMessageType.Text,
                true,
                CancellationToken.None
            );
        }

        // Wait for the close handshake to finish.
        if (socket.State == WebSocketState.Open || socket.State == WebSocketState.CloseSent)
        {
            await socket.CloseAsync(
                WebSocketCloseStatus.NormalClosure,
                "Closing normally",
                CancellationToken.None
            );
        }
    }
    catch (ObjectDisposedException)
    {
        // The socket is already being cleaned up elsewhere; nothing to do.
        Debug.Log("WebSocket already disposed");
    }
    catch (Exception ex)
    {
        Debug.LogError($"StopASR error: {ex.Message}");
    }
    finally
    {
        // Force termination if the graceful close did not complete.
        if (socket.State != WebSocketState.Closed && socket.State != WebSocketState.Aborted)
        {
            socket.Abort();
        }
        socket.Dispose();

        // Only clear the field when it still refers to the socket closed here,
        // so a session started in the meantime is left untouched.
        if (ReferenceEquals(ws, socket))
        {
            ws = null;
        }
    }
}
/// <summary>
/// Opens the WebSocket connection, starts the audio sender coroutine and then
/// receives server messages until the socket leaves the Open state.
/// Every failure is logged instead of escaping from this async void method.
/// </summary>
async void ConnectASR_Aysnc()
{
    // Keep a local reference: the field may be cleared or replaced while this
    // method is suspended at an await.
    ClientWebSocket socket = new ClientWebSocket();
    ws = socket;
    ct = CancellationToken.None;

    try
    {
        // Building the URI and connecting are inside the try so that a failed
        // handshake is logged like any other error.
        await socket.ConnectAsync(GetUri(), ct);
        StartCoroutine(SendAudioClip());

        byte[] buffer = new byte[4096];
        using (var assembled = new System.IO.MemoryStream())
        {
            while (socket.State == WebSocketState.Open)
            {
                // One logical message can span several frames; collect the bytes
                // until EndOfMessage so that large results are not truncated and
                // multi-byte characters are not split before decoding.
                assembled.SetLength(0);
                WebSocketReceiveResult chunk;
                do
                {
                    chunk = await socket.ReceiveAsync(new ArraySegment<byte>(buffer), ct);
                    if (chunk.MessageType == WebSocketMessageType.Close)
                    {
                        // The server is closing the connection: stop receiving.
                        return;
                    }
                    assembled.Write(buffer, 0, chunk.Count);
                }
                while (!chunk.EndOfMessage);

                string str = Encoding.UTF8.GetString(assembled.GetBuffer(), 0, (int)assembled.Length);
                if (string.IsNullOrEmpty(str))
                {
                    continue;
                }

                try
                {
                    ReceiveJsonData receiveJsonData = JsonUtility.FromJson<ReceiveJsonData>(str);
                    // A switch tolerates a missing (null) action, unlike action.Equals(...).
                    switch (receiveJsonData.action)
                    {
                        case "started":
                            Debug.Log("握手成功!");
                            break;
                        case "result":
                            AnalysisResult(receiveJsonData.data);
                            break;
                        case "error":
                            Debug.Log("Error: " + receiveJsonData.desc);
                            break;
                    }
                }
                catch (Exception ex)
                {
                    // A single malformed message must not end the whole session.
                    Debug.LogWarning("Failed to handle ASR message: " + ex.Message);
                }
            }
        }
    }
    catch (Exception ex)
    {
        Debug.Log(ex.Message);
    }
}
/// <summary>
/// Coroutine that streams the recorded microphone audio to the server:
/// roughly every 40 ms it converts the newly recorded samples to 16-bit PCM
/// and sends them as one binary WebSocket message.
/// </summary>
/// <returns>Enumerator driven by StartCoroutine.</returns>
IEnumerator SendAudioClip()
{
    // Keep a local reference: StopASR sets the ws field to null, which would
    // otherwise throw a NullReferenceException in the loop condition below.
    ClientWebSocket socket = ws;
    if (socket == null)
    {
        yield break;
    }

    // Wait for the first recorded samples, but give up when the session is
    // closed first (the position stays at 0 once recording has been stopped).
    yield return new WaitWhile(() =>
        Microphone.GetPosition(null) <= 0 && socket.State == WebSocketState.Open);

    AudioClip clip = RecordedClip;
    if (clip == null)
    {
        yield break;
    }

    float t = 0;
    int position = Microphone.GetPosition(null);
    const float waitTime = 0.04f;          // send interval: 40 ms
    const int maxSamplesPerSend = 1280;    // cap per message, in samples (2 bytes each after conversion)
    int lastPosition = 0;
    System.Threading.Tasks.Task pendingSend = null;

    while (position < clip.samples && socket.State == WebSocketState.Open)
    {
        t += waitTime;
        if (t >= MAX_RECORD_LENGTH)
        {
            Debug.Log("录音时长已用尽,结束语音识别!");
            break;
        }
        yield return new WaitForSecondsRealtime(waitTime);

        // Only one send may be outstanding on a ClientWebSocket at a time, so
        // let the previous one finish and surface its failure, if any.
        while (pendingSend != null && !pendingSend.IsCompleted)
        {
            yield return null;
        }
        if (pendingSend != null && pendingSend.IsFaulted)
        {
            Debug.Log(pendingSend.Exception.GetBaseException().Message);
        }
        pendingSend = null;

        if (socket.State != WebSocketState.Open)
        {
            break;
        }

        if (Microphone.IsRecording(null))
        {
            position = Microphone.GetPosition(null);
        }
        if (position <= lastPosition)
        {
            // No new samples since the last tick; a zero-length read from the
            // clip would fail, so skip this round.
            continue;
        }

        int length = Mathf.Min(position - lastPosition, maxSamplesPerSend);
        byte[] data = GetAudioClip(lastPosition, length, clip);
        try
        {
            pendingSend = socket.SendAsync(new ArraySegment<byte>(data), WebSocketMessageType.Binary, true,
                CancellationToken.None);
        }
        catch (Exception ex)
        {
            Debug.Log(ex.Message);
        }
        lastPosition += length;
    }
}
//string endText = "";
/// <summary>
/// Shuts the recognition session down when the application quits.
/// </summary>
private void OnApplicationQuit() => StopASR();
/// <summary>
/// Parses one recognition result and, when it is a final result (type "0"),
/// passes the assembled sentence to <see cref="asrCallback"/>.
/// Payloads that are empty or lack the expected structure are ignored.
/// </summary>
/// <param name="data">JSON string of the recognition result.</param>
void AnalysisResult(string data)
{
    if (string.IsNullOrEmpty(data))
    {
        return;
    }

    Data result = JsonUtility.FromJson<Data>(data);
    var sentence = result.cn.st;
    var entries = sentence.rt;

    // Arrays that are absent from the JSON may be left null; indexing them
    // blindly would throw and end the caller's receive loop.
    if (entries == null || entries.Length == 0 || entries[0].ws == null)
    {
        return;
    }

    // Only the "w" field of the first cw entry of each ws element is used.
    StringBuilder text = new StringBuilder();
    foreach (var word in entries[0].ws)
    {
        if (word == null || word.cw == null || word.cw.Length == 0 || word.cw[0] == null)
        {
            continue;
        }
        text.Append(word.cw[0].w);
    }

    // type "0" marks a final result, "1" an intermediate one that later results
    // may overwrite; only final results are reported.
    if (sentence.type == "0")
    {
        asrCallback?.Invoke(text.ToString());
    }
}
/// <summary>
/// Reads a range of samples from an audio clip and converts it to 16-bit
/// signed little-endian PCM.
/// </summary>
/// <param name="start">Index of the first sample to read.</param>
/// <param name="length">Number of samples to read.</param>
/// <param name="recordedClip">Clip to read from.</param>
/// <returns>
/// Two bytes per sample, low byte first; an empty array when
/// <paramref name="recordedClip"/> is null or <paramref name="length"/> is not positive.
/// </returns>
public static byte[] GetAudioClip(int start, int length, AudioClip recordedClip)
{
    // Reading zero samples from a clip is an error, so answer that case directly.
    if (recordedClip == null || length <= 0)
    {
        return new byte[0];
    }

    float[] samples = new float[length];
    recordedClip.GetData(samples, start);

    const float rescaleFactor = 32767f;
    byte[] pcm = new byte[samples.Length * 2];
    for (int i = 0; i < samples.Length; i++)
    {
        // Clamp first: a sample outside [-1, 1] would overflow the short cast
        // and come out as loud noise.
        short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * rescaleFactor);

        // Write the bytes explicitly so the output is little-endian on every
        // platform and no temporary array is allocated per sample.
        pcm[i * 2] = (byte)(value & 0xFF);
        pcm[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
    }
    return pcm;
}
/// <summary>
/// Builds the signed request URI for the recognition service.
/// The signature is Base64(HMAC-SHA1(MD5(appid + timestamp), appkey)); the
/// intermediate values are kept in the corresponding fields.
/// </summary>
/// <returns>The request URI including appid, timestamp and signature.</returns>
private Uri GetUri()
{
    // Timestamp with one-second resolution.
    timeStamp = GetTimeStamp();

    // appid followed by the timestamp, hashed with MD5 ...
    baseString = appid + timeStamp;
    toMd5 = ToMD5(baseString);

    // ... then signed with HMAC-SHA1 keyed by appkey and Base64-encoded.
    signa = ToHmacSHA1(toMd5, appkey);

    string format = m_SpeechRecognizeURL + "appid={0}&ts={1}&signa={2}&pd=tech";
    string encodedSignature = UrlEncode(signa);
    return new Uri(string.Format(format, appid, timeStamp, encodedSignature));
}
#region
/// <summary>
/// Percent-encodes a string: every byte of its UTF-8 representation is
/// written as "%" followed by two lowercase hexadecimal digits.
/// </summary>
/// <param name="str">String to encode.</param>
/// <returns>The encoded string; empty when <paramref name="str"/> is null or empty.</returns>
public static string UrlEncode(string str)
{
    if (string.IsNullOrEmpty(str))
    {
        return string.Empty;
    }

    byte[] bytes = Encoding.UTF8.GetBytes(str);
    StringBuilder sb = new StringBuilder(bytes.Length * 3);
    foreach (byte b in bytes)
    {
        // "x2" always yields two digits; an escape such as "%a" for byte 0x0A
        // would not be a valid percent-encoding.
        sb.Append('%').Append(b.ToString("x2"));
    }
    return sb.ToString();
}
/// <summary>
/// Returns the current time as seconds elapsed since 1970-01-01 00:00:00 UTC.
/// </summary>
/// <returns>The timestamp as a decimal string, rounded to whole seconds.</returns>
public static string GetTimeStamp()
{
    DateTime unixEpoch = new DateTime(1970, 1, 1, 0, 0, 0, 0);
    double elapsedSeconds = DateTime.UtcNow.Subtract(unixEpoch).TotalSeconds;
    long wholeSeconds = Convert.ToInt64(elapsedSeconds);
    return wholeSeconds.ToString();
}
/// <summary>
/// Computes the MD5 hash of a string.
/// </summary>
/// <param name="txt">Text to hash; it is encoded as UTF-8 before hashing.</param>
/// <returns>The hash as 32 lowercase hexadecimal characters.</returns>
/// <exception cref="ArgumentNullException"><paramref name="txt"/> is null.</exception>
public static string ToMD5(string txt)
{
    if (txt == null)
    {
        throw new ArgumentNullException(nameof(txt));
    }

    using (MD5 md5 = MD5.Create())
    {
        // UTF-8 instead of Encoding.Default: the default encoding differs between
        // platforms, which would change the hash of non-ASCII text, and the HMAC
        // step of the signature uses UTF-8 as well.
        byte[] hash = md5.ComputeHash(Encoding.UTF8.GetBytes(txt));

        StringBuilder sb = new StringBuilder(hash.Length * 2);
        foreach (byte b in hash)
        {
            sb.Append(b.ToString("x2"));
        }
        return sb.ToString();
    }
}
/// <summary>
/// Computes the HMAC-SHA1 of a text and returns it Base64-encoded.
/// </summary>
/// <param name="text">Message to sign (encoded as UTF-8).</param>
/// <param name="key">Secret key (encoded as UTF-8).</param>
/// <returns>The Base64 string of the 20-byte signature.</returns>
public static string ToHmacSHA1(string text, string key)
{
    byte[] keyBytes = Encoding.UTF8.GetBytes(key);
    byte[] dataBuffer = Encoding.UTF8.GetBytes(text);

    // HMACSHA1 is disposable; release it as soon as the hash has been computed.
    using (HMACSHA1 hmacsha1 = new HMACSHA1(keyBytes))
    {
        byte[] hashBytes = hmacsha1.ComputeHash(dataBuffer);
        return Convert.ToBase64String(hashBytes);
    }
}
#endregion
}
/// <summary>
/// Envelope of one message received from the recognition service; it is
/// deserialized with JsonUtility from the text of each received message.
/// </summary>
[Serializable]
public struct ReceiveJsonData
{
/// <summary>
/// Message kind: "started" (handshake), "result" (recognition result) or "error" (failure).
/// </summary>
public string action;
/// <summary>
/// Result code (see the service's error-code list).
/// </summary>
public string code;
/// <summary>
/// Recognition result data; for "result" messages this is a JSON string that is parsed into <see cref="Data"/>.
/// </summary>
public string data;
/// <summary>
/// Description.
/// </summary>
public string desc;
/// <summary>
/// Session id.
/// Mainly useful for debugging: it can be handed to the service provider to help track down a problem.
/// </summary>
public string sid;
}
/// <summary>
/// Recognition result. Sample payload:
/// {
///   "seg_id": 7,
///   "cn": {
///     "st": {
///       "rt": [
///         {
///           "ws": [
///             {"cw": [{"w": "我们", "wp": "n"}], "wb": 23, "we": 70},
///             {"cw": [{"w": "生活", "wp": "n"}], "wb": 71, "we": 118},
///             {"cw": [{"w": "的", "wp": "n"}], "wb": 119, "we": 130},
///             {"cw": [{"w": "世界", "wp": "n"}], "wb": 131, "we": 172},
///             {"cw": [{"w": "里", "wp": "n"}], "wb": 173, "we": 201},
///             {"cw": [{"w": "有", "wp": "n"}], "wb": 202, "we": 226},
///             {"cw": [{"w": "两", "wp": "n"}], "wb": 227, "we": 249},
///             {"cw": [{"w": "个", "wp": "n"}], "wb": 250, "we": 263},
///             {"cw": [{"w": "世界", "wp": "n"}], "wb": 264, "we": 320}
///           ]
///         }
///       ],
///       "bg": "5120",
///       "type": "0",
///       "ed": "8520"
///     }
///   },
///   "ls": false
/// }
/// NOTE(review): the sample shows seg_id, wb and we as numbers and ls as a boolean,
/// while the fields below are declared as string - confirm that JsonUtility fills them as intended.
/// </summary>
[Serializable]
public struct Data
{
/// <summary>
/// Sequence number of the result, starting at 0.
/// </summary>
public string seg_id;
[Serializable]
public struct CN
{
[Serializable]
public struct ST
{
[Serializable]
public struct RT
{
[Serializable]
public class WS
{
[Serializable]
public class CW
{
/// <summary>
/// Recognized word.
/// </summary>
public string w;
/// <summary>
/// Word tag: n = normal word, s = smoothed word (filler / modal particle), p = punctuation.
/// </summary>
public string wp;
}
[SerializeField] public CW[] cw;
/// <summary>
/// Start time of the word within the sentence, in frames (1 frame = 10 ms);
/// the start time within the whole audio is (bg + wb*10) ms.
/// For intermediate results wb is 0.
/// </summary>
public string wb;
/// <summary>
/// End time of the word within the sentence, in frames (1 frame = 10 ms);
/// the end time within the whole audio is (bg + we*10) ms.
/// For intermediate results we is 0.
/// </summary>
public string we;
}
[SerializeField] public WS[] ws;
}
[SerializeField] public RT[] rt;
/// <summary>
/// Start time of the sentence within the whole audio, in milliseconds.
/// The value is accurate for intermediate results as well.
/// </summary>
public string bg;
/// <summary>
/// Result type: 0 = final result, 1 = intermediate result.
/// </summary>
public string type;
/// <summary>
/// End time of the sentence within the whole audio, in milliseconds.
/// For intermediate results ed is 0.
/// </summary>
public string ed;
}
[SerializeField] public ST st;
}
[SerializeField] public CN cn;
/// <summary>
/// The "ls" value of the payload (a boolean in the sample above); its meaning is not documented in this file.
/// </summary>
public string ls;
}