機器學習實戰

數據集地址:https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

1.下載數據集

2.執行代碼

代碼如下:


# coding=utf-8

import?pyprind??

import?pandas?as?pd??

import?os??

import?numpy?as?np??

import?re??

from?nltk.stem.porter?import?PorterStemmer??

import?nltk??

from?nltk.corpus?import?stopwords??

from?sklearn.grid_search?import?GridSearchCV??

from?sklearn.pipeline?import?Pipeline??

from?sklearn.linear_model?import?LogisticRegression??

from?sklearn.feature_extraction.text?import?TfidfVectorizer??

import?time??


start?=?time.clock()??


homedir?=?os.getcwd()#獲取當前文件的路徑??

#第一步:導入數據并輸出到moive_data.csv??

'''''

pbar=pyprind.ProgBar(50000)

labels={'pos':1,'neg':0}#正面和負面評論標簽

df?=?pd.DataFrame()

for?s?in?('test','train'):

????for?l?in?('pos','neg'):

????????path=homedir+'/aclImdb/%s/%s'?%(s,l)

????????for?file?in?os.listdir(path):

????????????with?open(os.path.join(path,file),'r')?as?infile:

????????????????txt?=infile.read()

????????????df?=df.append([[txt,labels[l]]],ignore_index=True)

????????????pbar.update()

df.columns=['review','sentiment']

np.random.seed(0)

df=df.reindex(np.random.permutation(df.index))#重排數據集,打散正負樣本數據

df.to_csv(homedir+'/movie_data.csv',index=False)

'''??

#第二步:文本數據清洗和特征向量化??

df=pd.read_csv(homedir+'/movie_data.csv')??

def?preprocessor(text):??

text=re.sub('<[^>]*>','',text)#移除HTML標記,#把<>里面的東西刪掉包括內容??

emotions=re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)',text)??

text=re.sub('[\W]+','?',text.lower())+''.join(emotions).replace('-','')??

return?text??

#print?(preprocessor(df.loc[0,'review'][-50:]))#數據集第一行review字段的最后50個字符??

#print?(preprocessor("This?:)?is?:(?a?test?:-)!"))??

df['review']=df['review'].apply(preprocessor)??

def?tokenizer(text):#提取詞匯??

return?text.split()??

porter=PorterStemmer()??

def?tokenizer_porter(text):#文本分詞并提取詞干??

return?[porter.stem(word)?for?word?in?text.split()]??

nltk.download('stopwords')#停用詞移除(stop-word?removal),停用詞是文本中常見單不能有效判別信息的詞匯??

stop?=?stopwords.words('english')#獲得英文停用詞集??

#print?([w?for?w?in?tokenizer_porter('a?runner?likes?running?and?runs?a?lot')?if?w?not?in?stop])??

#第三步:模型訓練??

X_train=df.loc[:25000,'review'].values??

y_train=df.loc[:25000,'sentiment'].values??

X_test=df.loc[25000:,'review'].values??

y_test=df.loc[25000:,'sentiment'].values??

tfidf=TfidfVectorizer(strip_accents=None,lowercase=False,preprocessor=None)??

param_grid?=?[{'vect__ngram_range':[(1,1)],'vect__stop_words':[stop,None],'vect__tokenizer':[tokenizer,tokenizer_porter],'clf__penalty':['l1','l2'],'clf__C':[1.0,10.1,100.0]},\??

{'vect__ngram_range':[(1,1)],'vect__stop_words':[stop,None],'vect__tokenizer':[tokenizer,tokenizer_porter],'vect__use_idf':[False],'vect__norm':[None],'clf__penalty':['l1','l2'],'clf__C':[1.0,10.1,100.0]}?]??

lr_tfidf?=Pipeline([('vect',tfidf),('clf',LogisticRegression(random_state=0))])??

gs_lr_tfidf=GridSearchCV(lr_tfidf,param_grid,scoring='accuracy',cv=5,verbose=1,n_jobs=-1)??

gs_lr_tfidf.fit(X_train,y_train)??

print?('Best?parameter?set?:%s'?%?gs_lr_tfidf.best_params_)??

print?('CV?Accuracy:%.3f'%gs_lr_tfidf.best_score_)??

clf=gs_lr_tfidf.best_estimator_??

print?('Test?Accuracy:%.3f'%clf.score(X_test,y_test))??

end?=?time.clock()??????

print('finish?all?in?%s'?%?str(end?-?start))


執行完成的結果如下:


代碼運行結果

3.執行代碼時遇到的問題:

(1)No module named pyprind,在服務器的python3 下執行.py文件時出現沒有對應的包,需要安裝,下面的指令表示在root權限下,在python3 下安裝pyprind,之后遇到的相同的問題,關于別的包,解決方法類似,替換不同的包即可

sudo python -3 -m pip install pyprind

(2)'encoding' is an invalid keyword argument for this function

解決方法,將open 改成io.open :

import io

data_file = io.open("F:\\MyPro\\data.yaml", "r", encoding=‘utf-8‘)

(3)'ascii' codec can't encode character u'\x96' in position 1448: ordinal not in range(128)

解決方案,加上如下代碼:

import sys

reload(sys)

sys.setdefaultencoding("utf-8")

?著作權歸作者所有,轉載或內容合作請聯系作者
平臺聲明:文章內容(如有圖片或視頻亦包括在內)由作者上傳并發布,文章內容僅代表作者本人觀點,簡書系信息發布平臺,僅提供信息存儲服務。

推薦閱讀更多精彩內容