資源簡介
目的:使用CNN卷積神經網絡實現語音識別
步驟:(1)預處理。切除首尾端的靜音,降低對後續步驟造成的干擾,然後進行聲音分幀,把聲音切開成幀,各幀之間一般是有交疊的。
(2)特徵提取。運用的算法為梅爾頻率倒譜系數(MFCC),把每一幀波形變成一個包含聲音信息的多維向量;
(3)CNN模型訓練。有了特徵,就可以使用TensorFlow完成模型的建立和訓練了。
(4)驗證模型。
目標:對相應的聲音數據進行分類,例如當輸入的數據是數數的語音時,能夠輸出對應的數字。

代碼片段和文件信息
# -*- coding: utf-8 -*-
import os

import numpy as np
import scipy.io.wavfile as wav
import sklearn.preprocessing
import tensorflow as tf
from python_speech_features import delta, mfcc
# Absolute path of the current working directory; every data directory
# below is resolved relative to it.
path_film = os.path.abspath('.')
# Default data directory ("xunlian" is presumably pinyin for "training" -
# TODO confirm against the data layout).
path = path_film + "/data/xunlian/"
# Directory named for test data.
test_path = path_film + "/data/test_data/"
# Third data directory; its role is not visible in this part of the file.
isnot_test_path = path_film + "/data/isnot_test_path/"
# One-hot encoding maps each discrete label value onto its own axis of a
# vector space.
# Module-wide binarizer slot: the empty string means "no shared binarizer
# configured", in which case def_one_hot builds a fresh one per call.
label_binarizer = ""


def def_one_hot(x):
    """Return the one-hot (label-binarized) encoding of integer labels.

    The binarizer is fitted on ``range(max(x) + 1)``, so the encoding has a
    slot for every integer from 0 up to the largest label, including values
    that do not occur in *x*.

    Args:
        x: Non-empty sequence of non-negative integer class labels.

    Returns:
        The array produced by ``LabelBinarizer.transform(x)``.
        NOTE(review): scikit-learn documents that with exactly two classes
        the result is a single column rather than two - confirm the caller
        copes with that.

    Raises:
        ValueError: If *x* is empty (``max`` of an empty sequence).
    """
    if label_binarizer == "":
        binarizer = sklearn.preprocessing.LabelBinarizer()
    else:
        binarizer = label_binarizer
    binarizer.fit(range(max(x) + 1))
    y = binarizer.transform(x)
    return y
def read_wav_path(path):
    """List the regular files that sit directly inside *path*.

    Args:
        path: Directory to scan. A trailing separator is accepted but not
            required.

    Returns:
        A ``(map_path, map_relative)`` tuple of two equally long lists in
        ``os.listdir`` order: the path of every regular file, and the bare
        name of that same file. Entry ``i`` of both lists always refers to
        the same file.
    """
    map_path = []
    map_relative = []
    # Walk one listing and filter both lists together; filtering only the
    # path list would shift the indices as soon as the directory contains
    # a sub-directory.
    for entry in os.listdir(path):
        full_path = os.path.join(str(path), str(entry))
        if os.path.isfile(full_path):
            map_path.append(full_path)
            map_relative.append(entry)
    return map_path, map_relative
def def_wav_read_mfcc(file_name):
    """Read a WAV file and return its MFCC feature matrix.

    Args:
        file_name: Path of the WAV file to load.

    Returns:
        The value returned by ``mfcc`` for the file's samples, computed at
        the file's own sample rate with a 512-point FFT.
    """
    fs, audio = wav.read(file_name)
    # NOTE(review): nfft is fixed at 512 regardless of the sample rate -
    # confirm it is large enough for the analysis window of the recordings.
    processed_audio = mfcc(audio, samplerate=fs, nfft=512)
    return processed_audio
def find_matrix_max_shape(audio, rows=700):
    """Return the ``(rows, columns)`` shape used to pad a batch of matrices.

    Args:
        audio: Iterable of 2-D array-likes.
        rows: Fixed row count to report. The default of 700 keeps the
            historical behaviour, where the row count is a constant rather
            than the measured maximum. Pass ``None`` to report the largest
            row count actually found in *audio*.

    Returns:
        A ``(rows, max_columns)`` tuple of ints. ``max_columns`` is the
        widest matrix in *audio*, or 0 when *audio* is empty.
    """
    max_rows, max_cols = 0, 0
    for matrix in audio:
        n_rows, n_cols = np.shape(matrix)
        max_rows = max(max_rows, n_rows)
        max_cols = max(max_cols, n_cols)
    if rows is None:
        return max_rows, max_cols
    return rows, max_cols
def matrix_make_up(audio):
    """Zero-pad every matrix in *audio* to one common shape.

    The target shape comes from ``find_matrix_max_shape(audio)``. Each
    matrix is copied into the top-left corner of a zero matrix of that
    shape.

    Args:
        audio: Sequence of 2-D arrays.

    Returns:
        A ``(new_audio, h, l)`` tuple: the list of padded ``int8`` matrices
        followed by the common row and column counts.

    Raises:
        IndexError: If a matrix has more rows than the target shape.
    """
    h, l = find_matrix_max_shape(audio)
    new_audio = []
    for matrix in audio:
        source = np.asarray(matrix)
        n_rows, n_cols = source.shape
        if n_rows > h or n_cols > l:
            raise IndexError(
                "matrix of shape %r does not fit into the padded shape %r"
                % ((n_rows, n_cols), (h, l))
            )
        # NOTE(review): the int8 target dtype discards the fractional part
        # of the copied values; kept as-is because changing it would change
        # the data handed to the rest of the pipeline.
        padded = np.zeros([h, l], np.int8)
        # One slice assignment instead of a Python loop per element; the
        # cast to int8 is the same.
        padded[:n_rows, :n_cols] = source
        new_audio.append(padded)
    return new_audio, h, l
def read_wav_matrix(path):
    """Load every file in *path* as padded MFCC features plus one-hot labels.

    The class label of a file is the integer before the first ``_`` (or
    ``.``) of its file name, e.g. ``3_sample.wav`` is labelled 3.

    Args:
        path: Directory holding the audio files, as accepted by
            ``read_wav_path``.

    Returns:
        A ``(x_data, x_label, h, l)`` tuple: the stacked padded feature
        matrices, the one-hot encoded labels, and the padded row and column
        counts reported by ``matrix_make_up``.

    Raises:
        ValueError: If a file name does not start with an integer label.
    """
    map_path, _ = read_wav_path(path)
    audio = []
    labels = []
    for wav_file in map_path:
        audio.append(def_wav_read_mfcc(wav_file))
        # Take the label from this file's own name, not from a parallel
        # list by position, so features and labels cannot drift apart.
        file_name = os.path.basename(wav_file)
        labels.append(int(file_name.split(".")[0].split("_")[0]))
    x_data, h, l = matrix_make_up(audio)
    x_data = np.array(x_data)
    # One-hot encode the labels of all files in the directory.
    x_label = np.array(def_one_hot(labels))
    return x_data, x_label, h, l
def weight_variable(shape, name):
    """Create a weight variable initialised from a truncated normal.

    Args:
        shape: Shape of the variable.
        name: Name given to the created ``tf.Variable``.

    Returns:
        A ``tf.Variable`` whose initial value is drawn from a truncated
        normal distribution with standard deviation 0.01.
    """
    initial = tf.truncated_normal(shape, stddev=0.01)
    return tf.Variable(initial, name=name)
def bias_variable(shape, name):
    """Create a bias variable with every element initialised to 0.01.

    Args:
        shape: Shape of the variable.
        name: Name given to the created ``tf.Variable``.

    Returns:
        A ``tf.Variable`` initialised to the constant 0.01.
    """
    initial = tf.constant(0.01, shape=shape)
    return tf.Variable(initial, name=name)
def conv2d(x, W):
    """Apply a 2-D convolution with stride 1 and SAME padding.

    Args:
        x: Input tensor of shape
            ``[batch, in_height, in_width, in_channels]``.
        W: Filter tensor of shape
            ``[filter_height, filter_width, in_channels, out_channels]``.

    Returns:
        The tensor returned by ``tf.nn.conv2d``.
    """
    # strides[0] and strides[3] must be 1; strides[1] and strides[2] are
    # the steps along the two spatial axes.
    # padding is one of "SAME" or "VALID".
    return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
“““池化層“““
def?max_pool_2x2(x):
????#[池化的輸入
?屬性????????????大小?????日期????時間???名稱
-----------?---------??----------?-----??----
?????文件????????9694??2019-07-02?15:26??speechRecogined.py
?????文件????????2993??2019-07-02?09:26??test.py
評論
共有 條評論