所需環境:Python3.6 + Tensorflow
如果使用cpu版本,可以參考:http://www.lxweimin.com/p/da141c730180
如果使用gpu版本,可以參考:http://www.lxweimin.com/p/62d414aa843e
還需要安裝 Keras:
pip install keras -i https://pypi.tuna.tsinghua.edu.cn/simple/
所需數據集 :
負面情緒:http://ai-download.xmgc360.com/datasets/sentiment_nalysis/neg.xls
正面情緒:http://ai-download.xmgc360.com/datasets/sentiment_nalysis/pos.xls
基于LSTM網絡,結構圖如下:
image.png
代碼解釋
導入相關模塊
import pandas as pd
import numpy as np
import jieba
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.utils import shuffle
from keras.utils import plot_model
讀取數據集,分別存儲在 DataFrame 里
# Load the raw datasets; the .xls files have no header row, so each review
# lands in column 0.  The original passed index=None, which modern pandas
# rejects (unexpected keyword argument) — index_col is the supported parameter.
neg = pd.read_excel('neg.xls', header=None, index_col=None)  # negative-sentiment reviews
pos = pd.read_excel('pos.xls', header=None, index_col=None)  # positive-sentiment reviews
數據集打標簽
# Attach the binary sentiment label: 1 = positive review, 0 = negative review.
pos['mark'], neg['mark'] = 1, 0
合并數據集
df = pd.concat([pos,neg],ignore_index=True)
對數據集進行分詞
df['words'] = df[0].apply(lambda x: list(jieba.cut(x))) #分詞
如圖:
image.png
統計分詞后每個詞出現的次數(主要是去重)
df_words = pd.DataFrame(pd.Series([j for i in df['words'] for j in i]).value_counts())
對每個詞進行編號:
df_words['id'] = list(range(1,len(df_words)+1)) #id編號是順序遞增的
image.png
把每個中文句子轉成句子向量(使用簡單編號向量)
df['words_vecoter'] = df['words'].apply(lambda x: list(df_words['id'][x]))
image.png
把句子向量的長度統一到50,長度不夠補0
df['words_vecoter'] = list(sequence.pad_sequences(df['words_vecoter'], maxlen=50))
image.png
數據整理完成,獲取訓練集和測試集
# Deterministic 50/50 interleaved split: even-indexed rows become the
# training set, odd-indexed rows the test set.
features = np.array(list(df['words_vecoter']))
labels = np.array(list(df['mark']))
x_train, y_train = features[::2], labels[::2]   # training split
x_test, y_test = features[1::2], labels[1::2]   # held-out split
隨機打亂數據集
# Randomly permute each split; sklearn's shuffle keeps features and labels
# aligned (same permutation applied to both arrays).
x_train,y_train = shuffle(x_train,y_train)
x_test,y_test = shuffle(x_test,y_test)
構造循環網絡
# LSTM sentiment network: integer ids -> 256-dim embeddings -> LSTM(128)
# -> dropout -> single sigmoid unit giving P(positive).
dlen = len(df_words) + 1  # vocabulary size + 1 so padding id 0 has its own row
model = Sequential([
    # The Embedding layer is, in effect, a fully-connected layer over one-hot
    # token ids whose weight matrix is a learned "word-vector table".
    Embedding(dlen, 256),
    LSTM(128),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
plot_model(model, to_file='sentiment_nalysis.png', show_shapes=True)  # save the architecture diagram
model.summary()  # print the layer-by-layer summary
訓練模型
model.fit(x_train, y_train, batch_size=16, nb_epoch=5)
評估預測
y_predict = model.predict(x_test)  # per-sample positive-class probabilities
print(y_predict)
# Because the model was compiled with metrics=['accuracy'], evaluate()
# returns [loss, accuracy]; unpack it so we report the accuracy value
# instead of printing the whole pair as "accuracy".
loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy:', acc)
保存模型
model.save('sentiment_nalysis.h5')
完整代碼
# Binary Chinese sentiment classifier: jieba tokenization -> integer ids ->
# padded 50-token sequences -> Embedding + LSTM -> sigmoid probability.
import pandas as pd
import numpy as np
import jieba
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from sklearn.utils import shuffle
from keras.utils import plot_model

# The .xls files have no header row; each review is in column 0.
# index_col=None replaces the old `index=None`, which modern pandas rejects.
neg = pd.read_excel('neg.xls', header=None, index_col=None)  # negative reviews
pos = pd.read_excel('pos.xls', header=None, index_col=None)  # positive reviews
pos['mark'] = 1  # positive label
neg['mark'] = 0  # negative label
df = pd.concat([pos, neg], ignore_index=True)
df['words'] = df[0].apply(lambda x: list(jieba.cut(x)))  # word segmentation
# Vocabulary: one row per distinct token, ordered by frequency.
df_words = pd.DataFrame(pd.Series([j for i in df['words'] for j in i]).value_counts())
df_words['id'] = list(range(1, len(df_words) + 1))  # ids 1..V (0 reserved for padding)
# Map each token list to its id list, then left-pad/truncate to length 50.
df['words_vecoter'] = df['words'].apply(lambda x: list(df_words['id'][x]))
df['words_vecoter'] = list(sequence.pad_sequences(df['words_vecoter'], maxlen=50))
# Even rows -> training set, odd rows -> test set (50/50 interleaved split).
x_train = np.array(list(df['words_vecoter']))[::2]
y_train = np.array(list(df['mark']))[::2]
x_test = np.array(list(df['words_vecoter']))[1::2]
y_test = np.array(list(df['mark']))[1::2]
x_train, y_train = shuffle(x_train, y_train)  # keeps features/labels aligned
x_test, y_test = shuffle(x_test, y_test)
dlen = len(df_words) + 1  # vocabulary size + 1 so padding id 0 has its own row
model = Sequential()
model.add(Embedding(dlen, 256))  # learned "word-vector table" over token ids
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))  # outputs P(positive)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
plot_model(model, to_file='sentiment_nalysis.png', show_shapes=True)  # save architecture diagram
model.summary()  # print network structure
# `epochs` replaces the Keras 1 argument `nb_epoch`, removed in Keras 2.
model.fit(x_train, y_train, batch_size=16, epochs=5)
y_predict = model.predict(x_test)  # per-sample probabilities
print(y_predict)
# evaluate() returns [loss, accuracy] (model compiled with metrics=['accuracy']);
# unpack so we report the accuracy value, not the pair.
loss, acc = model.evaluate(x_test, y_test)
print('Test accuracy:', acc)
model.save('sentiment_nalysis.h5')  # persist architecture + weights (HDF5)