Let's skip the preamble and get straight to the point!
1. Environment
Microsoft Speech SDK 11 (speech recognition SDK): click here
2. Speech Recognition
MKSpeech.h
#pragma once
#include "stdafx.h"
#include <Windows.h>
#include <sapi.h>
#include <sphelper.h>
#include <thread>   // std::thread member below
#include <string>   // string used in the callback signatures
class MKSpeech
{
private:
/** Whether recognition is running */
bool isOpen;
/** Whether initialization has completed */
bool isInit;
private:
// Audio beam
IAudioBeam* m_pAudioBeam;
// Speech recognition input stream
ISpStream* m_pSpeechStream;
// Speech recognizer
ISpRecognizer* m_pSpeechRecognizer;
// Speech recognition context
ISpRecoContext* m_pSpeechContext;
// Speech recognition grammar
ISpRecoGrammar* m_pSpeechGrammar;
// Handle signaled on recognition events
HANDLE m_hSpeechEvent;
// Audio processing thread
std::thread m_threadAudio;
// Grammar file name
const WCHAR* s_GrammarFileName;
private:
// Initialize speech recognition
HRESULT init_speech_recognizer();
// Audio thread entry point
static void AudioThread(MKSpeech* pointer);
// Act on a recognized semantic tag
void speech_behavior(const SPPHRASEPROPERTY* tag);
// Process pending recognition events
void speech_process();
public:
MKSpeech(void);
~MKSpeech(void);
void open(int type);
void close(void);
private:
/** Callback function pointers */
ErrorCallBack _errorCallBack;
AudioCallBack _audioCallBack;
public:
/** Set the callbacks */
void m_setErrorActionCallBack(ErrorCallBack call);
void m_setAudioCallBack(AudioCallBack call);
void m_errorCallBack(string codeStr);
void m_audioCallBack(string str, AudioCallStatus status);
};
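The header leans on several definitions that live in stdafx.h and are not shown in the post: the callback typedefs, the AudioCallStatus enum, and the SafeRelease/lengthof helpers. The sketch below reconstructs them from how they are used; the original project's definitions may differ.
// Hypothetical stdafx.h excerpt, reconstructed from usage
#pragma once
#include <string>
using std::string;

// Status values passed to the audio callback (names taken from the calls below)
enum AudioCallStatus {
    AudioCallStatus_Success = 0,
    AudioCallStatus_Faild = 1   // spelling kept to match the code
};

// Callback signatures used by MKSpeech
typedef void(__stdcall *ErrorCallBack)(string codeStr);
typedef void(__stdcall *AudioCallBack)(string str, AudioCallStatus status);

// Release a COM interface and null the pointer
template <class T> inline void SafeRelease(T*& p){
    if (p != nullptr){ p->Release(); p = nullptr; }
}

// Element count of a fixed-size array
#define lengthof(a) (sizeof(a) / sizeof((a)[0]))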
MKSpeech.cpp
#include "MKSpeech.h"
MKSpeech::MKSpeech(void)
{
isOpen = false;
isInit = false;
s_GrammarFileName = L"Grammar.xml";
m_pAudioBeam = nullptr;
m_pSpeechStream = nullptr;
m_pSpeechRecognizer = nullptr;
m_pSpeechContext = nullptr;
m_pSpeechGrammar = nullptr;
m_hSpeechEvent = nullptr;
// Callbacks start unset; m_errorCallBack/m_audioCallBack check before calling
_errorCallBack = nullptr;
_audioCallBack = nullptr;
}
MKSpeech::~MKSpeech(void)
{
}
HRESULT MKSpeech::init_speech_recognizer()
{
HRESULT hr = S_OK;
// Create the speech input stream
hr = CoCreateInstance(CLSID_SpStream, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpStream), (void**)&m_pSpeechStream);
if (!SUCCEEDED(hr))
{
m_errorCallBack("CoCreateInstance CLSID_SpStream failed");
return hr;
}
// Create the speech recognizer
hr = CoCreateInstance(CLSID_SpInprocRecognizer, nullptr, CLSCTX_INPROC_SERVER, __uuidof(ISpRecognizer), (void**)&m_pSpeechRecognizer);
if (!SUCCEEDED(hr))
{
m_errorCallBack("CoCreateInstance CLSID_SpInprocRecognizer failed");
return hr;
}
// Create the default audio input token
CComPtr<ISpObjectToken> ISPToken = nullptr;
hr = SpGetDefaultTokenFromCategoryId(SPCAT_AUDIOIN,&ISPToken);
if (!SUCCEEDED(hr))
{
m_errorCallBack("SpGetDefaultTokenFromCategoryId failed");
return hr;
}
// Set the recognition engine's input source
hr = m_pSpeechRecognizer->SetInput(ISPToken,TRUE);
if (!SUCCEEDED(hr))
{
m_errorCallBack("SetInput failed");
return hr;
}
// Find the best recognition engine token (language 804 = 0x804, Simplified Chinese)
ISpObjectToken *pEngineToken = nullptr;
hr = SpFindBestToken(SPCAT_RECOGNIZERS, L"language=804", nullptr, &pEngineToken);
if (!SUCCEEDED(hr))
{
m_errorCallBack("SpFindBestToken failed");
return hr;
}
// Select the recognition engine
m_pSpeechRecognizer->SetRecognizer(pEngineToken);
// Create the recognition context
hr = m_pSpeechRecognizer->CreateRecoContext(&m_pSpeechContext);
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechRecognizer CreateRecoContext failed");
return hr;
}
SafeRelease(pEngineToken);
// Turn adaptation off (0) so recognition does not degrade over long sessions
hr = m_pSpeechRecognizer->SetPropertyNum(L"AdaptationOn", 0);
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechRecognizer SetPropertyNum failed");
return hr;
}
// Create the grammar
hr = m_pSpeechContext->CreateGrammar(1, &m_pSpeechGrammar);
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechContext CreateGrammar failed");
return hr;
}
// Load the grammar rules (fall back to the packaged path)
hr = m_pSpeechGrammar->LoadCmdFromFile(s_GrammarFileName, SPLO_DYNAMIC);
if (!SUCCEEDED(hr)){
hr = m_pSpeechGrammar->LoadCmdFromFile(L".\\resources\\app\\kinectLib\\Grammar.xml", SPLO_DYNAMIC);
}
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechGrammar LoadCmdFromFile failed");
return hr;
}
// Activate the grammar rules
hr = m_pSpeechGrammar->SetRuleState(nullptr, nullptr, SPRS_ACTIVE);
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechGrammar SetRuleState failed");
return hr;
}
// Keep the recognizer reading data continuously
hr = m_pSpeechRecognizer->SetRecoState(SPRST_ACTIVE);
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechRecognizer SetRecoState failed");
return hr;
}
// Subscribe only to recognition events
hr = m_pSpeechContext->SetInterest(SPFEI(SPEI_RECOGNITION), SPFEI(SPEI_RECOGNITION));
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechContext SetInterest failed");
return hr;
}
// Make sure recognition stays active
m_pSpeechContext->Resume(0);
// Get the recognition event handle
m_hSpeechEvent = m_pSpeechContext->GetNotifyEventHandle();
return hr;
}
// Audio thread
void MKSpeech::AudioThread(MKSpeech* pointer){
// Wait on the recognition event handle
HANDLE events[] = { pointer->m_hSpeechEvent };
bool exit = false;
while (!exit) {
switch (::WaitForMultipleObjects(lengthof(events), events, FALSE, 8000)){
case WAIT_OBJECT_0:
// Recognition event fired: process it, then stop
pointer->speech_process();
exit = true;
pointer->close();
break;
case WAIT_TIMEOUT:
// Nothing recognized within 8 seconds: report failure and stop
pointer->m_audioCallBack("failed", AudioCallStatus_Faild);
exit = true;
pointer->close();
break;
}
}
}
// Process pending recognition events
void MKSpeech::speech_process() {
// Confidence threshold
const float ConfidenceThreshold = 0.3f;
SPEVENT curEvent = { SPEI_UNDEFINED, SPET_LPARAM_IS_UNDEFINED, 0, 0, 0, 0 };
ULONG fetched = 0;
HRESULT hr = S_OK;
// Fetch the pending events
m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
while (fetched > 0)
{
// Only handle recognition events
switch (curEvent.eEventId)
{
case SPEI_RECOGNITION:
// Make sure the LPARAM carries an object
if (SPET_LPARAM_IS_OBJECT == curEvent.elParamType) {
ISpRecoResult* result = reinterpret_cast<ISpRecoResult*>(curEvent.lParam);
SPPHRASE* pPhrase = nullptr;
// Get the recognized phrase
hr = result->GetPhrase(&pPhrase);
if (SUCCEEDED(hr)) {
// Print the recognized text (converted to ANSI for the console)
WCHAR* pwszFirstWord;
result->GetText(SP_GETWHOLEPHRASE, SP_GETWHOLEPHRASE, TRUE, &pwszFirstWord, nullptr);
int len = WideCharToMultiByte(CP_ACP, 0, pwszFirstWord, (int)wcslen(pwszFirstWord), NULL, 0, NULL, NULL);
char* m_char = new char[len + 1];
WideCharToMultiByte(CP_ACP, 0, pwszFirstWord, (int)wcslen(pwszFirstWord), m_char, len, NULL, NULL);
m_char[len] = '\0';
printf("%s", m_char);
delete[] m_char;
::CoTaskMemFree(pwszFirstWord);
if ((pPhrase->pProperties != nullptr) && (pPhrase->pProperties->pFirstChild != nullptr)) {
const SPPHRASEPROPERTY* pSemanticTag = pPhrase->pProperties->pFirstChild;
_cwprintf(L"%s", pSemanticTag->pszValue);
// Convert the semantic tag value and hand it to the callback
int len2 = WideCharToMultiByte(CP_ACP, 0, pSemanticTag->pszValue, (int)wcslen(pSemanticTag->pszValue), NULL, 0, NULL, NULL);
char* m_char2 = new char[len2 + 1];
WideCharToMultiByte(CP_ACP, 0, pSemanticTag->pszValue, (int)wcslen(pSemanticTag->pszValue), m_char2, len2, NULL, NULL);
m_char2[len2] = '\0';
printf("%s", m_char2);
m_audioCallBack(m_char2, AudioCallStatus_Success);
delete[] m_char2;
_cwprintf(L" confidence: %d%%\n", (int)(pSemanticTag->SREngineConfidence * 100.f));
// Only act on the tag when confidence is high enough
if (pSemanticTag->SREngineConfidence > ConfidenceThreshold) {
speech_behavior(pSemanticTag);
}
}
::CoTaskMemFree(pPhrase);
}
}
break;
}
m_pSpeechContext->GetEvents(1, &curEvent, &fetched);
}
return;
}
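speech_process runs the same WideCharToMultiByte dance twice and manages the temporary buffers by hand. A small helper, not part of the original listing, would remove the manual memory management:
// Suggested helper (hypothetical): wide string -> ANSI std::string
static std::string WideToAnsi(const WCHAR* wide)
{
    if (!wide) return std::string();
    // First call computes the required buffer size
    int len = WideCharToMultiByte(CP_ACP, 0, wide, (int)wcslen(wide), nullptr, 0, nullptr, nullptr);
    if (len <= 0) return std::string();
    std::string out(len, '\0');
    // Second call performs the actual conversion
    WideCharToMultiByte(CP_ACP, 0, wide, (int)wcslen(wide), &out[0], len, nullptr, nullptr);
    return out;
}
// Usage inside speech_process:
//   m_audioCallBack(WideToAnsi(pSemanticTag->pszValue), AudioCallStatus_Success);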
// Act on a recognized semantic tag
void MKSpeech::speech_behavior(const SPPHRASEPROPERTY* tag){
if (!tag) return;
if (!wcscmp(tag->pszName, L"戰(zhàn)況")){
enum class Subject{
US = 0,
Enemy
};
enum class Predicate{
Destroy = 0,
Defeat,
Breakdown
};
// Break the battle report down into its parts
union Situation{
struct{
// Subject
Subject subject;
// Predicate
Predicate predicate;
// Indirect object
int object2;
// Direct object
int object;
};
UINT32 data[4];
};
Situation situation;
auto obj = tag->pFirstChild;
auto pointer = situation.data;
// Copy the semantic children into the union, at most four values
while (obj && pointer < situation.data + lengthof(situation.data)) {
*pointer = obj->vValue.lVal;
++pointer;
obj = obj->pNextSibling;
}
// XXX
}
else if (!wcscmp(tag->pszName, L"發(fā)現(xiàn)東西")){
// "Found something"
}
}
void MKSpeech::m_setErrorActionCallBack(ErrorCallBack call)
{
_errorCallBack = call;
}
void MKSpeech::m_setAudioCallBack(AudioCallBack call)
{
_audioCallBack = call;
}
void MKSpeech::m_errorCallBack(string codeStr)
{
if (_errorCallBack)
{
_errorCallBack(codeStr);
}
}
void MKSpeech::m_audioCallBack(string str,AudioCallStatus status)
{
if (isOpen)
{
if (_audioCallBack)
{
_audioCallBack(str,status);
}
}
}
void MKSpeech::open(int type)
{
if (!isInit)
{
HRESULT hr = init_speech_recognizer();
if (SUCCEEDED(hr))
{
isInit = true;
}
}
if (isInit)
{
// Pick the grammar for the requested language (0 = Chinese, otherwise English)
const WCHAR* fileName = (type == 0) ? L"Grammar.xml" : L"Grammar_en.xml";
const WCHAR* fallback = (type == 0)
? L".\\resources\\app\\kinectLib\\Grammar.xml"
: L".\\resources\\app\\kinectLib\\Grammar_en.xml";
// Reload only when the grammar actually changes
if (wcscmp(s_GrammarFileName, fileName) != 0)
{
s_GrammarFileName = fileName;
// Load the grammar rules (fall back to the packaged path)
HRESULT hr = m_pSpeechGrammar->LoadCmdFromFile(s_GrammarFileName, SPLO_DYNAMIC);
if (!SUCCEEDED(hr)){
hr = m_pSpeechGrammar->LoadCmdFromFile(fallback, SPLO_DYNAMIC);
}
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechGrammar LoadCmdFromFile failed");
return;
}
// Activate the grammar rules
hr = m_pSpeechGrammar->SetRuleState(nullptr, nullptr, SPRS_ACTIVE);
if (!SUCCEEDED(hr))
{
m_errorCallBack("m_pSpeechGrammar SetRuleState failed");
return;
}
}
isOpen = true;
m_pSpeechContext->SetContextState(SPCS_ENABLED);
m_pSpeechContext->Resume(0);
m_pSpeechRecognizer->SetRecoState(SPRST_ACTIVE);
// Join any previous run so the std::thread object can be reused,
// then launch the audio thread
if (m_threadAudio.joinable()) m_threadAudio.join();
m_threadAudio = std::thread(AudioThread, this);
}
}
void MKSpeech::close(void)
{
// Stop the audio input stream and pause recognition
ResetEvent(m_hSpeechEvent);
m_pSpeechContext->SetContextState(SPCS_DISABLED);
m_pSpeechRecognizer->SetRecoState(SPRST_INACTIVE_WITH_PURGE);
m_pSpeechContext->Pause(0);
isOpen = false;
}
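close() stops recognition, but nothing ever joins the audio thread or releases the COM objects created in init_speech_recognizer(). A sketch of the cleanup the empty destructor defined earlier could perform, assuming the SafeRelease helper from stdafx.h:
// Possible destructor body (a sketch, not in the original listing)
MKSpeech::~MKSpeech(void)
{
    if (isInit)
    {
        close();
        // Wait for the audio thread before tearing the COM objects down
        if (m_threadAudio.joinable()) m_threadAudio.join();
        SafeRelease(m_pSpeechGrammar);
        SafeRelease(m_pSpeechContext);
        SafeRelease(m_pSpeechRecognizer);
        SafeRelease(m_pSpeechStream);
    }
}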
KinectAudioStreamWrapper.h
#pragma once
#include "stdafx.h"
// A thin wrapper around the Kinect audio stream
class KinectAudioStreamWrapper : public IStream{
public:
// Constructor
KinectAudioStreamWrapper(IStream *p32BitAudioStream);
// Destructor
~KinectAudioStreamWrapper();
// No default construction
KinectAudioStreamWrapper() = delete;
// Set the speech state
void SetSpeechState(BOOL state){ m_SpeechActive = state; }
// IUnknown implementation
STDMETHODIMP_(ULONG) AddRef() { return InterlockedIncrement(&m_cRef); }
STDMETHODIMP_(ULONG) Release() {
LONG ref = InterlockedDecrement(&m_cRef);
if (ref == 0){
delete this;
}
return ref;
}
STDMETHODIMP QueryInterface(REFIID riid, void **ppv) {
if (riid == IID_IUnknown) {
AddRef();
*ppv = (IUnknown*)this;
return S_OK;
}
else if (riid == IID_IStream) {
AddRef();
*ppv = (IStream*)this;
return S_OK;
}
else {
*ppv = nullptr;
return E_NOINTERFACE;
}
}
// IStream methods
STDMETHODIMP Read(void *, ULONG, ULONG *);
STDMETHODIMP Write(const void *, ULONG, ULONG *);
STDMETHODIMP Seek(LARGE_INTEGER, DWORD, ULARGE_INTEGER *);
STDMETHODIMP SetSize(ULARGE_INTEGER);
STDMETHODIMP CopyTo(IStream *, ULARGE_INTEGER, ULARGE_INTEGER *, ULARGE_INTEGER *);
STDMETHODIMP Commit(DWORD);
STDMETHODIMP Revert();
STDMETHODIMP LockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD);
STDMETHODIMP UnlockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD);
STDMETHODIMP Stat(STATSTG *, DWORD);
STDMETHODIMP Clone(IStream **);
private:
// Reference count (LONG so the Interlocked* functions can take its address)
LONG m_cRef;
// Float sample buffer
float* m_pFloatBuffer;
// Buffer size, in floats
UINT m_uFloatBuferSize;
// The wrapped 32-bit float stream
IStream* m_p32BitAudio;
// Speech state; BOOL keeps the field aligned
BOOL m_SpeechActive;
};
#include "KinectAudioStreamWrapper.h"
// KinectAudioStreamWrapper constructor
KinectAudioStreamWrapper::KinectAudioStreamWrapper(IStream *p32BitAudio) :m_p32BitAudio(p32BitAudio){
m_cRef = 1;
m_pFloatBuffer = nullptr;
m_uFloatBuferSize = 0;
m_SpeechActive = false;
// Hold a reference to the wrapped stream
if (m_p32BitAudio){
m_p32BitAudio->AddRef();
}
}
// Destructor
KinectAudioStreamWrapper::~KinectAudioStreamWrapper(){
SafeRelease(m_p32BitAudio);
if (m_pFloatBuffer){
delete[] m_pFloatBuffer;
m_pFloatBuffer = nullptr;
}
}
// IStream Read implementation
STDMETHODIMP KinectAudioStreamWrapper::Read(void *pBuffer, ULONG cbBuffer, ULONG *pcbRead){
// Parameter check
if (!pBuffer || !pcbRead) return E_INVALIDARG;
// If speech is not active, report the buffer as read and return S_OK
if (!m_SpeechActive){
*pcbRead = cbBuffer;
return S_OK;
}
HRESULT hr = S_OK;
// Goal: convert the float samples into 16-bit PCM
INT16* const p16Buffer = reinterpret_cast<INT16*>(pBuffer);
// Size ratio between a float sample and an INT16 sample
const int multiple = sizeof(float) / sizeof(INT16);
// Make sure the float buffer is large enough
auto float_buffer_size = cbBuffer / multiple;
if (float_buffer_size > m_uFloatBuferSize){
// Too small: reallocate
m_uFloatBuferSize = float_buffer_size;
if (m_pFloatBuffer) delete[] m_pFloatBuffer;
m_pFloatBuffer = new float[m_uFloatBuferSize];
}
// Write progress through the buffer, in bytes
BYTE* pWriteProgress = reinterpret_cast<BYTE*>(m_pFloatBuffer);
// Bytes read so far
ULONG bytesRead = 0;
// Bytes still needed
ULONG bytesNeed = cbBuffer * multiple;
// Read in a loop
while (true){
// Bail out if speech is no longer needed
if (!m_SpeechActive){
*pcbRead = cbBuffer;
hr = S_OK;
break;
}
// Pull data from the wrapped stream
hr = m_p32BitAudio->Read(pWriteProgress, bytesNeed, &bytesRead);
bytesNeed -= bytesRead;
pWriteProgress += bytesRead;
// Got everything?
if (!bytesNeed){
*pcbRead = cbBuffer;
break;
}
// Otherwise sleep for a time slice
Sleep(20);
}
// Convert the data: float -> 16-bit PCM
if (!bytesNeed){
for (UINT i = 0; i < cbBuffer / multiple; i++) {
float sample = m_pFloatBuffer[i];
// Clamp to [-1, 1]
if (sample > 1.f) sample = 1.f;
if (sample < -1.f) sample = -1.f;
// Scale and round away from zero
float sampleScaled = sample * (float)SHRT_MAX;
p16Buffer[i] = (sampleScaled > 0.f) ? (INT16)(sampleScaled + 0.5f) : (INT16)(sampleScaled - 0.5f);
}
}
return hr;
}
// The remaining IStream methods are not needed and simply return E_NOTIMPL
STDMETHODIMP KinectAudioStreamWrapper::Write(const void *, ULONG, ULONG *)
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::Seek(LARGE_INTEGER /* dlibMove */, DWORD /* dwOrigin */, ULARGE_INTEGER * /* plibNewPosition */)
{
// Seek matters to the speech engine; the Kinect stream cannot seek, but return S_OK so recognition does not fail
return S_OK;
}
STDMETHODIMP KinectAudioStreamWrapper::SetSize(ULARGE_INTEGER)
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::CopyTo(IStream *, ULARGE_INTEGER, ULARGE_INTEGER *, ULARGE_INTEGER *)
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::Commit(DWORD)
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::Revert()
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::LockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD)
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::UnlockRegion(ULARGE_INTEGER, ULARGE_INTEGER, DWORD)
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::Stat(STATSTG *, DWORD)
{
return E_NOTIMPL;
}
STDMETHODIMP KinectAudioStreamWrapper::Clone(IStream **)
{
return E_NOTIMPL;
}
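One gap worth pointing out: the wrapper above is never actually wired into the recognizer. init_speech_recognizer() points SAPI at the default microphone (SPCAT_AUDIOIN), and m_pAudioBeam/m_pSpeechStream go unused. To make SAPI read from the Kinect beam, the usual pattern is to wrap the beam's 32-bit float IStream and hand it to the ISpStream. Below is a sketch of that wiring; it assumes pKinectFloatStream came from IAudioBeam::OpenInputStream and is not part of the original listing:
// Sketch: feed the Kinect audio beam into SAPI instead of the default microphone.
// pKinectFloatStream is assumed to come from IAudioBeam::OpenInputStream.
HRESULT SetKinectInput(ISpStream* pSpeechStream,
                       ISpRecognizer* pSpeechRecognizer,
                       IStream* pKinectFloatStream,
                       KinectAudioStreamWrapper** ppWrapper)
{
    // Wrap the 32-bit float stream so Read() hands SAPI 16-bit PCM
    *ppWrapper = new KinectAudioStreamWrapper(pKinectFloatStream);
    // Kinect audio format: 16 kHz, mono, 16-bit PCM after conversion
    WAVEFORMATEX wfx = { WAVE_FORMAT_PCM, 1, 16000, 32000, 2, 16, 0 };
    HRESULT hr = pSpeechStream->SetBaseStream(*ppWrapper, SPDFID_WaveFormatEx, &wfx);
    if (FAILED(hr)) return hr;
    // Point the recognizer at the wrapped stream instead of the default mic
    return pSpeechRecognizer->SetInput(pSpeechStream, TRUE);
}
Remember to call SetSpeechState(TRUE) on the wrapper before activating recognition and SetSpeechState(FALSE) when stopping; otherwise Read() returns without pulling any Kinect data.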
3. Test Code
main.h
#include "KinectApp.h"
#include <string>
#include "MKSpeech.h"
void __stdcall onActionCallBack(BodyRect bRect)
{
printf("Person %d : X:%d Y:%d Z:%d State:%d \n", 0, bRect.X,bRect.Y,bRect.Z,bRect.type);
}
void __stdcall onErrorCallBack(string codeStr)
{
std::cout << codeStr << std::endl;
}
void __stdcall onPanActionCallBack(PanActionType type)
{
printf("%c",type);
}
void __stdcall onAudioCallBack(string str,AudioCallStatus status)
{
printf("%d ",status);
std::cout << str << std::endl;
}
KinectApp *app;
MKSpeech *speech;
int main()
{
if (SUCCEEDED(CoInitialize(NULL)))
{
/*app = new KinectApp();
app->m_setActionCallBack(onActionCallBack);
app->m_setErrorActionCallBack(onErrorCallBack);
app->m_setPanActionCallBack(onPanActionCallBack);
app->open();*/
speech = new MKSpeech();
speech->m_setErrorActionCallBack(onErrorCallBack);
speech->m_setAudioCallBack(onAudioCallBack);
while (true)
{
int a = rand()%2;
printf("%d",a);
speech->open(a);
system("pause");
}
}
}
4. Grammar File Format
The grammar is a standard W3C SRGS XML file. One detail worth noting: the <tag> value "5qyn5rSy" below appears to be Base64-encoded UTF-8 (it decodes to "歐洲"), and this is the string that speech_process receives as pSemanticTag->pszValue.
<grammar root="rootRule" tag-format="semantics/1.0-literals" version="1.0" xml:lang="zh-CN" xmlns="http://www.w3.org/2001/06/grammar">
<rule id="rootRule">
<one-of>
<item>
<tag>5qyn5rSy</tag>
<one-of>
<item>青海交通職業(yè)技術(shù)學(xué)院</item>
<item>歐洲</item>
<item>亞洲</item>
<item>青海</item>
<item>黑龍江</item>
</one-of>
</item>
</one-of>
</rule>
</grammar>
5. Notes
1. Recognition can also consume the Kinect v2 SDK audio input stream. It is sensitive to the environment: the quieter the room and the closer the speaker (within about 2 meters), the faster and more accurate the recognition.
2. This is command-style recognition: the more similar the commands in the grammar are to one another, the lower the recognition accuracy.
3. The code was built with Visual Studio and belongs to the same project as the gesture-controlled mouse from the previous post.
4. Questions and suggestions are welcome at 249086205@qq.com.
References:
[https://blog.csdn.net/dustpg/column/info/k4w2dn](https://blog.csdn.net/dustpg/column/info/k4w2dn)
Code download: https://pan.baidu.com/s/1aY8S2VWOBIsW-JbAqcdEow (extraction code: kujw)