LSTM 情感分析 (Keras 版本)

所需環境:Python3.6 + Tensorflow

如果使用cpu版本,可以參考:http://www.lxweimin.com/p/da141c730180
如果使用gpu版本,可以參考:http://www.lxweimin.com/p/62d414aa843e

還需要安裝 Keras:
pip install keras -i https://pypi.tuna.tsinghua.edu.cn/simple/
所需數據集 :

負面情緒:http://ai-download.xmgc360.com/datasets/sentiment_nalysis/neg.xls
正面情緒:http://ai-download.xmgc360.com/datasets/sentiment_nalysis/pos.xls

基于LSTM網絡,結構圖如下:

image.png

代碼解釋

導入相關模塊

# Third-party dependencies.
# NOTE: the original imported Dense/Dropout/Activation, Embedding and LSTM
# from keras.layers.core / keras.layers.embeddings / keras.layers.recurrent.
# Those are Keras-1-era internal module paths that were removed in later
# Keras releases; the stable public path is keras.layers.
import pandas as pd
import numpy as np
import jieba
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.utils import plot_model
from sklearn.utils import shuffle

讀取數據集,分別存儲在 DataFrame 里

# Load the two labelled datasets; header=None because the .xls files have no
# header row, so each review's raw text lands in column 0.
# NOTE: fixed from `index=None` — that is not a valid read_excel argument
# (modern pandas rejects unknown keywords); the intended parameter is index_col.
neg = pd.read_excel('neg.xls', header=None, index_col=None)  # negative-sentiment reviews
pos = pd.read_excel('pos.xls', header=None, index_col=None)  # positive-sentiment reviews

數據集打標簽

# Attach the sentiment label column: 0 = negative review, 1 = positive review.
neg['mark'] = 0
pos['mark'] = 1

合并數據集

# Merge positive and negative reviews into one DataFrame, renumbering rows 0..n-1.
df = pd.concat([pos,neg],ignore_index=True)

對數據集進行分詞

# Tokenize each review (column 0 holds the raw text) with jieba.
df['words'] = df[0].apply(lambda x: list(jieba.cut(x))) # word segmentation

如圖:

image.png

統計分詞后每個詞出現的次數(主要是去重)

# Build the vocabulary: flatten every review's token list into one Series and
# count occurrences per distinct word — the result is a DataFrame indexed by
# word, so the index is effectively the deduplicated vocabulary.
df_words = pd.DataFrame(pd.Series([j for i in df['words'] for j in i]).value_counts())

對每個詞進行編號:

# Assign each word a sequential 1-based integer id; 0 is left free because
# pad_sequences later uses 0 as the padding value.
df_words['id'] = list(range(1,len(df_words)+1))  # ids increase sequentially

image.png

把每個句子中文轉成句子向量(使用簡單編號向量)

# Encode each review as a list of word ids: df_words['id'][x] is label-based
# Series indexing, looking up every token of the review in the vocabulary.
df['words_vecoter'] = df['words'].apply(lambda x: list(df_words['id'][x]))
image.png

把句子向量的長度統一到50,長度不夠補0

# Normalize every id sequence to length 50: shorter sequences are zero-padded
# and longer ones truncated (keras pad_sequences defaults).
df['words_vecoter'] = list(sequence.pad_sequences(df['words_vecoter'], maxlen=50))
image.png

數據整理完成,獲取訓練集和測試集

# Deterministic 50/50 split: even-numbered rows become the training set,
# odd-numbered rows the test set.
features = np.array(list(df['words_vecoter']))
labels = np.array(list(df['mark']))
x_train, y_train = features[::2], labels[::2]    # training set
x_test, y_test = features[1::2], labels[1::2]    # test set

隨機打亂數據集

# Randomly shuffle each split; sklearn's shuffle permutes features and labels
# with the same permutation so the rows stay aligned.
x_train,y_train = shuffle(x_train,y_train)
x_test,y_test = shuffle(x_test,y_test)

構造循環網絡

# Vocabulary size + 1 so that padding id 0 gets its own embedding row.
dlen = len(df_words) + 1
model = Sequential()
model.add(Embedding(dlen, 256)) # The Embedding layer is effectively a fully connected layer over one-hot word ids whose hidden size is the word-vector dimension (256); its weight matrix is the learned word-vector lookup table.
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid')) # single sigmoid unit -> binary sentiment probability
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
plot_model(model, to_file='sentiment_nalysis.png', show_shapes=True) # save a diagram of the network (requires pydot + graphviz)
model.summary()  # print the layer-by-layer structure

訓練模型

# Train for 5 epochs with mini-batches of 16.
# NOTE: fixed `nb_epoch` -> `epochs`; nb_epoch is the Keras 1 spelling and
# was removed in Keras 2, where it raises an unknown-argument error.
model.fit(x_train, y_train, batch_size=16, epochs=5)

評估預測

# Predict sigmoid scores for the held-out reviews.
y_predict = model.predict(x_test)
print(y_predict)
# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics=['accuracy']; the original printed that whole pair under the label
# "Test accuracy" — unpack so only the accuracy is reported.
loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy:', acc)

保存模型

model.save('sentiment_nalysis.h5')  # persist architecture + weights to HDF5

完整代碼

"""LSTM sentiment analysis on Chinese product reviews (Keras).

Pipeline: load labelled Excel datasets -> jieba tokenization -> integer
word ids -> fixed-length (50) zero-padded sequences -> Embedding + LSTM
binary classifier trained with binary cross-entropy.
"""
import pandas as pd
import numpy as np
import jieba
from keras.preprocessing import sequence
from keras.models import Sequential
# NOTE: keras.layers.core / .embeddings / .recurrent are Keras-1 internal
# paths removed in later releases; the public path is keras.layers.
from keras.layers import Dense, Dropout, Activation, Embedding, LSTM
from keras.utils import plot_model
from sklearn.utils import shuffle

# Load the datasets (header=None: no header row, review text is column 0).
# NOTE: fixed from the invalid `index=None` keyword to `index_col=None`.
neg = pd.read_excel('neg.xls', header=None, index_col=None)  # negative reviews
pos = pd.read_excel('pos.xls', header=None, index_col=None)  # positive reviews
pos['mark'] = 1  # label: positive sentiment
neg['mark'] = 0  # label: negative sentiment
df = pd.concat([pos, neg], ignore_index=True)

# Tokenize each review with jieba.
df['words'] = df[0].apply(lambda x: list(jieba.cut(x)))

# Build the vocabulary: count occurrences of every distinct token...
df_words = pd.DataFrame(pd.Series([j for i in df['words'] for j in i]).value_counts())
# ...and give each word a sequential 1-based id (0 is reserved for padding).
df_words['id'] = list(range(1, len(df_words) + 1))

# Encode each review as word ids, then pad/truncate to a fixed length of 50.
df['words_vecoter'] = df['words'].apply(lambda x: list(df_words['id'][x]))
df['words_vecoter'] = list(sequence.pad_sequences(df['words_vecoter'], maxlen=50))

# Deterministic 50/50 split: even-numbered rows train, odd-numbered rows test.
x_train = np.array(list(df['words_vecoter']))[::2]
y_train = np.array(list(df['mark']))[::2]
x_test = np.array(list(df['words_vecoter']))[1::2]
y_test = np.array(list(df['mark']))[1::2]

# Shuffle features and labels together (rows stay aligned).
x_train, y_train = shuffle(x_train, y_train)
x_test, y_test = shuffle(x_test, y_test)

# Vocabulary size + 1 so padding id 0 gets its own embedding row.
dlen = len(df_words) + 1
model = Sequential()
model.add(Embedding(dlen, 256))   # word id -> learned 256-dim word vector
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))  # binary sentiment probability
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
plot_model(model, to_file='sentiment_nalysis.png', show_shapes=True)  # diagram (needs pydot + graphviz)
model.summary()

# NOTE: fixed `nb_epoch` -> `epochs` (nb_epoch was removed in Keras 2).
model.fit(x_train, y_train, batch_size=16, epochs=5)

y_predict = model.predict(x_test)
print(y_predict)

# evaluate() returns [loss, accuracy] (compiled with metrics=['accuracy']);
# unpack so only the accuracy is printed under the "Test accuracy" label.
loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy:', acc)

model.save('sentiment_nalysis.h5')


最后編輯于
?著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。