- 微信
- 微博
  
  分享文章到微博
- 复制链接
  
  复制链接到剪贴板

基于CNN和MFCC的语音情感识别

可爱又积极发表于 2021/09/07 14:49:28 2021/09/07

【摘要】近年来，随着信息技术的飞速发展，智能设备正在逐渐地融入到人们的日常生活当中，语音作为人机交互的最为便捷的方式之一，得到了广泛的应用。让机器听懂人类语言的同时，如何实现与人类有感情的自然交流，是无数科研工作者的目标。语音情感识别的主要内容就是建立一种能够从语音中分析和识别人类情感的计算系统，实现人与机器的人性化交流。语音情感识别的主要任务是将蕴含在语音中的情感信息提取出...

近年来，随着信息技术的飞速发展，智能设备正在逐渐地融入到人们的日常生活当中，语音作为人机交互的最为便捷的方式之一，得到了广泛的应用。让机器听懂人类语言的同时，如何实现与人类有感情的自然交流，是无数科研工作者的目标。语音情感识别的主要内容就是建立一种能够从语音中分析和识别人类情感的计算系统，实现人与机器的人性化交流。

语音情感识别的主要任务是将蕴含在语音中的情感信息提取出来并识别出其类别。目前对于情感的描述主要有两种方法。第一种是基于离散的情感划分，将人类日常生活中广泛使用的基本情感分为愤怒、开心、兴奋、悲伤、厌恶等；另一种是基于连续维度的情感划分，主要通过不同的效价度和激活程度来区分不同的情感。

那么作为一个分类任务，特征选择是最关键的一步。本文中使用的语音特征是梅尔倒谱系数，有关梅尔倒谱系数是什么和怎样提取的知识，可参阅文章《Python语音信号处理》。

本文在一定程度上参考了MITESHPUTHRANNEU/Speech-Emotion-Analyzer这个项目，下面开始介绍如何通过卷积神经网络进行语音情感分析。

神经网络结构

使用到的架构其实还是很简单的，如下

数据集

我使用的是CASIA的语音情感数据库。CASIA汉语情感语料库由中国科学院自动化所（Institute of Automation, Chinese Academy of Sciences）录制，共包括四个专业发音人，六种情绪：生气（angry）、高兴（happy）、害怕（fear）、悲伤（sad）、惊讶（surprise）和中性（neutral），共9600句不同发音。其中300句是相同文本的，也即是说对相同的文本赋以不同的情感来阅读，这些语料可以用来对比分析不同情感状态下的声学及韵律表现；另外100句是不同文本的，这些文本从字面意思就可以看出其情感归属，便于录音人更准确地表现出情感。

但是完整的CASIA数据集是收费的，因此我只找到了1200句残缺数据集。我把我找到的数据集放在我的网盘上：https://pan.baidu.com/s/1EsRoKaF17Q_3s2t7OMNibQ。

特征提取

我使用librosa模块进行MFCC的提取，提取代码如下。

%matplotlib inline
import librosa
import matplotlib.pyplot as plt
import numpy as np

# Sample recording used to demonstrate the MFCC extraction below.
path=r'D:\NLP\dataset\语音情感\test.wav'

# sr=None asks librosa to keep the file's own sampling rate instead of
# resampling; `sr` is reused below to choose the FFT window size.
y,sr = librosa.load(path,sr=None)

def normalizeVoiceLen(y,normalizedLen):
    """Return *y* as a 1-D signal of exactly ``normalizedLen`` samples.

    Signals shorter than ``normalizedLen`` are right-padded with float32
    zeros; longer signals are truncated to their first ``normalizedLen``
    samples. The duration this corresponds to depends on the sampling rate
    of the audio, which this function does not know about.

    Args:
        y: Audio samples; anything ``np.reshape`` can flatten to
            ``len(y)`` elements (typically a 1-D array).
        normalizedLen: Target number of samples.

    Returns:
        A 1-D ``numpy.ndarray`` of length ``normalizedLen``.

    Raises:
        ValueError: If *y* holds more elements than ``len(y)`` (for example
            a multi-channel array), because it cannot be flattened safely.
    """
    nframes = len(y)
    # Flatten once; the size check inside reshape rejects multi-channel input.
    y = np.reshape(y, [nframes])
    if nframes < normalizedLen:
        padding = np.zeros(normalizedLen - nframes, dtype=np.float32)
        return np.concatenate([y, padding])
    return y[:normalizedLen]

def getNearestLen(framelength,sr):
    """Pick the FFT size closest to a frame of ``framelength`` seconds.

    The frame size in samples is ``framelength * sr``; the result is the
    power of two between 32 and 1024 with the smallest absolute distance to
    it. When two candidates are equally close, the smaller one is returned.

    Args:
        framelength: Desired frame duration in seconds.
        sr: Sampling rate in samples per second.

    Returns:
        One of 32, 64, 128, 256, 512, 1024 as an ``int``.
    """
    framesize = framelength * sr
    candidates = (32, 64, 128, 256, 512, 1024)
    # min() returns the first minimal element, so ties go to the smaller size.
    return int(min(candidates, key=lambda size: abs(framesize - size)))

# Every clip is cut or zero-padded to this many samples.
VOICE_LEN = 32000
# FFT window size: the supported power of two nearest to 0.25 s of samples.
N_FFT = getNearestLen(0.25, sr)

# Normalise the signal length.
# NOTE(review): the original comment calls this "the first two seconds",
# which holds only for 16 kHz audio — confirm against the dataset.
y = normalizeVoiceLen(y, VOICE_LEN)
print(y.shape)

# Extract 13 MFCCs per frame, hopping a quarter of the FFT window each step.
mfcc_data = librosa.feature.mfcc(
    y=y,
    sr=sr,
    n_mfcc=13,
    n_fft=N_FFT,
    hop_length=N_FFT // 4,
)

# Show the MFCC matrix as an image (no transpose is applied here).
plt.matshow(mfcc_data)
plt.title('MFCC')
上面代码的作用是加载声音，取声音的前两秒进行情感分析。getNearestLen()函数根据声音的采样率确定一个合适的语音帧长用于傅立叶变换。然后通过librosa.feature.mfcc()函数提取mfcc特征，并将其可视化。

下面的代码将数据集中的mfcc特征提取出来，并对每帧的mfcc取平均，将结果保存为文件。

# Extract one feature vector per recording in the dataset and pickle them.
import os
import pickle

counter = 0
fileDirCASIA = r'D:\NLP\dataset\语音情感\CASIA database'

# Emotion label -> list of per-clip feature vectors.
mfccs = {
    emotion: []
    for emotion in (
        'angry', 'fear', 'happy', 'neutral', 'sad', 'surprise', 'disgust',
    )
}

# Expected layout: <root>/<speaker>/<emotion>/<clip>.wav
# Entries whose name contains a dot are treated as files and skipped.
for persondir in os.listdir(fileDirCASIA):
    if '.' in persondir:
        continue
    emotionDirName = os.path.join(fileDirCASIA, persondir)
    for ed in os.listdir(emotionDirName):
        if '.' in ed:
            continue
        filesDirName = os.path.join(emotionDirName, ed)
        for fileName in os.listdir(filesDirName):
            if not fileName.endswith('wav'):
                continue
            counter += 1
            fn = os.path.join(filesDirName, fileName)
            print(str(counter) + fn)
            y, sr = librosa.load(fn, sr=None)
            y = normalizeVoiceLen(y, VOICE_LEN)  # fixed-length signal
            mfcc_data = librosa.feature.mfcc(
                y=y, sr=sr, n_mfcc=13, n_fft=N_FFT,
                hop_length=int(N_FFT / 4),
            )
            # Average over axis 0 of the MFCC matrix, leaving one value
            # per column (frame).
            feature = np.mean(mfcc_data, axis=0)
            # setdefault keeps clips from an emotion folder that was not
            # pre-registered above instead of failing with KeyError.
            mfccs.setdefault(ed, []).append(feature.tolist())

with open('mfcc_feature_dict.pkl', 'wb') as f:
    pickle.dump(mfccs, f)
数据预处理

代码如下：

%matplotlib inline
import pickle
import os
import librosa
import matplotlib.pyplot as plt
import numpy as np
from keras import layers
from keras import models
from keras import optimizers
from keras.utils import to_categorical

# Load the features written by the extraction step.
with open('mfcc_feature_dict.pkl', 'rb') as f:
    mfccs = pickle.load(f)

# Emotion name -> class index used as the training label.
emotionDict = {
    'angry': 0,
    'fear': 1,
    'happy': 2,
    'neutral': 3,
    'sad': 4,
    'surprise': 5,
}

# Concatenate the samples emotion by emotion, labelling each with the index
# from emotionDict so the labels cannot drift from the saved mapping.
data = []
labels = []
for emotion, label in emotionDict.items():
    samples = mfccs[emotion]
    print(len(samples))
    data.extend(samples)
    labels.extend([label] * len(samples))

print(len(data))
print(len(labels))

# Shape the data as (samples, steps, 1), as expected by Conv1D.
data = np.array(data)
data = data.reshape((data.shape[0], data.shape[1], 1))

# Fix the number of classes explicitly so the one-hot width always matches
# emotionDict, even if the highest-indexed class has no samples.
labels = np.array(labels)
labels = to_categorical(labels, num_classes=len(emotionDict))

# Standardise each position using statistics over the whole dataset.
# NOTE(review): a position with zero variance would divide by zero here.
DATA_MEAN = np.mean(data, axis=0)
DATA_STD = np.std(data, axis=0)

data -= DATA_MEAN
data /= DATA_STD
接下来保存好参数，模型预测的时候需要用到。

# Persist the normalisation statistics and the label mapping; prediction
# must apply exactly the same values.
paraDict = {
    'mean': DATA_MEAN,
    'std': DATA_STD,
    'emotion': emotionDict,
}
with open('mfcc_model_para_dict.pkl', 'wb') as f:
    pickle.dump(paraDict, f)
最后是打乱数据集并划分训练数据和测试数据。

# Fraction of the samples used for training; the rest is for validation.
ratioTrain = 0.8
numTrain = int(data.shape[0] * ratioTrain)

# One shared random ordering keeps every sample aligned with its label.
permutation = np.random.permutation(data.shape[0])
data, labels = data[permutation, :], labels[permutation, :]

x_train, x_val = data[:numTrain], data[numTrain:]
y_train, y_val = labels[:numTrain], labels[numTrain:]

for split in (x_train, y_train, x_val, y_val):
    print(split.shape)
定义模型

使用keras定义模型，代码如下：

from keras.utils import plot_model
from keras import regularizers


def _regularized_conv(filters):
    """Build a width-5, same-padded ReLU Conv1D with L2(0.001) on its kernel."""
    return layers.Conv1D(
        filters,
        5,
        padding='same',
        activation='relu',
        kernel_regularizer=regularizers.l2(0.001),
    )


# Layers in the order they are stacked. Only the first convolution is
# unpadded and unregularised; it also fixes the input shape.
# NOTE(review): 126 steps presumably matches 32000 samples at a hop of 256
# (N_FFT = 1024) — confirm against the extracted feature length.
layer_stack = [
    layers.Conv1D(256, 5, activation='relu', input_shape=(126, 1)),
    _regularized_conv(128),
    layers.Dropout(0.2),
    layers.MaxPooling1D(pool_size=8),
    _regularized_conv(128),
    layers.Dropout(0.2),
    _regularized_conv(128),
    layers.Dropout(0.2),
    _regularized_conv(128),
    layers.Dropout(0.2),
    layers.MaxPooling1D(pool_size=3),
    _regularized_conv(256),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(6, activation='softmax'),  # one output per emotion class
]

model = models.Sequential()
for layer in layer_stack:
    model.add(layer)

plot_model(model, to_file='mfcc_model.png', show_shapes=True)
model.summary()
训练模型

编译并训练模型

import keras

# Use the RMSprop class itself: the lowercase `rmsprop` alias is missing
# from later Keras releases.
# NOTE(review): the `lr`/`decay` keywords follow the Keras 2.x optimizer API.
opt = optimizers.RMSprop(lr=0.0001, decay=1e-6)

# The metric is requested as 'acc' so that it is logged under the same key
# ('acc' / 'val_acc') that EarlyStopping and the history plots look up.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['acc'])

callbacks_list = [
    # Stop once training accuracy has not improved for 50 epochs.
    keras.callbacks.EarlyStopping(
        monitor='acc',
        patience=50,
    ),
    # Keep the weights of the epoch with the lowest validation loss.
    keras.callbacks.ModelCheckpoint(
        filepath='speechmfcc_model_checkpoint.h5',
        monitor='val_loss',
        save_best_only=True,
    ),
    keras.callbacks.TensorBoard(
        log_dir='speechmfcc_train_log',
    ),
]

history = model.fit(
    x_train,
    y_train,
    batch_size=16,
    epochs=200,
    validation_data=(x_val, y_val),
    callbacks=callbacks_list,
)

# These files hold the model as of the last epoch run; the best-validation
# weights are in the checkpoint file written by ModelCheckpoint.
model.save('speech_mfcc_model.h5')
model.save_weights('speech_mfcc_model_weight.h5')
可视化训练结果：

# Draw one figure per tracked quantity: training curve first, then the
# validation curve (labelled 'test' in the legend).
for metric in ('loss', 'acc'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

从上图中可以发现，模型在训练了60轮后开始过拟合，此时训练精度达到70%，验证精度达到50%。最终训练到200轮后，训练精度达到95%。

测试

最后对训练好的模型进行测试。

# Single-file check: reload the trained model and classify one recording.

from keras.models import load_model
import pickle
import librosa

model = load_model('speech_mfcc_model.h5')

# Restore the normalisation statistics and label mapping saved at training.
with open('mfcc_model_para_dict.pkl', 'rb') as f:
    paradict = pickle.load(f)
DATA_MEAN = paradict['mean']
DATA_STD = paradict['std']
emotionDict = paradict['emotion']
# Reverse mapping: class index -> emotion name.
edr = {index: name for name, index in emotionDict.items()}

filePath = r'record1.wav'
y, sr = librosa.load(filePath, sr=None)
y = normalizeVoiceLen(y, VOICE_LEN)  # same fixed length as in training
mfcc_data = librosa.feature.mfcc(
    y=y,
    sr=sr,
    n_mfcc=13,
    n_fft=N_FFT,
    hop_length=N_FFT // 4,
)

# Same feature construction and standardisation as the training data.
feature = np.mean(mfcc_data, axis=0).reshape((126, 1))
feature -= DATA_MEAN
feature /= DATA_STD

# Add the batch dimension expected by the model.
feature = feature.reshape((1, 126, 1))
result = model.predict(feature)
index = np.argmax(result, axis=1)[0]
print(edr[index])
由于数据集太小的原因，效果也就那样。

点赞
收藏
关注作者

0/1000

抱歉，系统识别当前为高风险访问，暂不支持该操作

全部回复

上滑加载中

设置昵称

在此一键设置昵称，即可参与社区互动！

*长度不超过10个汉字或20个英文字符，设置后3个月内不可修改。

确认取消

加入云驻计划，成为创作者

华为云周边好礼
免费体验产品
特殊身份标识
线下官方门票
内部专家零距离
与10000+优质创作者共同成长

立即加入

基于CNN和MFCC的语音情感识别

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

基于CNN和MFCC的语音情感识别

全部回复

设置昵称

关于作者

目录

加入云驻计划，成为创作者

推荐阅读

相关产品