原創文章:如需下載或轉載,請注明來源鏈接,否則視為侵權。
(第一)------------------數(shù)據(jù)讀取相關(guān)方法--------------
import random

import numpy as np
import numpy as py  # legacy alias from the original snippet; prefer `np`
import pandas as pd
import seaborn as sns
# RandomForestClassifier lives in sklearn.ensemble (and is spelled with a
# capital C); `from sklearn import RandomForestclassifier` raises ImportError.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Placeholder path: point this at the loan CSV before running.
loandData = pd.read_csv("")
# The later snippets spell the frame `loanData`; expose both names so they
# refer to the same object.
loanData = loandData
Pandas將多個Sheet寫入到本地同一Excel文件中
import pandas as pd

# Read the two source workbooks.  Raw strings keep the Windows backslashes
# literal; a plain 'C:\Users\x...' literal is a SyntaxError (\U and \x escapes).
data2 = pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系統(tǒng)入賬金額異常結(jié)果數(shù)據(jù).xlsx')
data1 = pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系統(tǒng)入賬金額.xlsx')

# Write both frames into one workbook, one sheet each.  Leaving the with-block
# saves and closes the file; ExcelWriter.save() no longer exists in pandas 2.x.
# NOTE(review): 'D:新表.xlsx' is a drive-relative path -- a backslash after
# 'D:' was probably lost; confirm the intended location.
with pd.ExcelWriter('D:新表.xlsx') as writer:
    data1.to_excel(writer, sheet_name='sheet1')
    data2.to_excel(writer, sheet_name='sheet2')
------Pandas的read_csv讀入數(shù)據(jù)并且自己給列名命名--------
3. 使用Pandas的read_csv、read_fwf、read_table讀取數(shù)據(jù)
import pandas as pd

# Raw string so the backslashes of the Windows path are never read as escape
# sequences (\P, \p and \c are invalid escapes and already trigger warnings).
# names= assigns our own column labels to the parsed data.
csv_data = pd.read_csv(
    r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\csv_data.csv',
    names=['aa', 'bb', 'cc', 'dd', 'hh'],
)
print(csv_data)
--------Pandas的pd.read_fwf讀入數(shù)據(jù)并且自己給列名命名----------
import pandas as pd

# Read a fixed-width file: four fields of five characters each, labelled
# col1..col4.  The path must be a raw string -- in a plain literal the '\f'
# of '\fwf_data' is a form-feed character and the file is never found.
fwf_data = pd.read_fwf(
    r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\fwf_data',
    widths=[5, 5, 5, 5],
    names=['col1', 'col2', 'col3', 'col4'],
)
print(fwf_data)
----------Pandas的pd.read_table(txt)讀入數(shù)據(jù)并且自己給列名命名---------
import pandas as pd

# Read a ';'-separated text file and label its columns ourselves.  Raw string:
# in a plain literal the '\t' of '\table_data.txt' would become a TAB.
table_data = pd.read_table(
    r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\table_data.txt',
    sep=';',
    names=['aa', 'ab', 'ac', 'ad', 'ah'],
)
print(table_data)
------------numpy的讀入(txt)---------------------
import numpy as np

# Load a space-delimited text file into a float32 array.  Raw string: in a
# plain literal the '\n' of '\numpy_data.txt' would become a newline.
file_name = r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\numpy_data.txt'
data = np.loadtxt(file_name, dtype='float32', delimiter=' ')
print(data)
------------numpy的讀入(npy)----------------
import numpy as np

# Round-trip an array through NumPy's binary .npy format.  np.save appends
# the '.npy' suffix itself, which is why np.load names the file with it.
# Raw strings keep the path backslashes from being parsed as escapes.
write_data = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
np.save(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\load_data1', write_data)
read_data = np.load(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\load_data1.npy')
1. 使用read、readline、readlines讀取數(shù)據(jù)
通過read方式讀入text.txt數(shù)據(jù)
# Raw string: in a plain literal the '\t' of '\text.txt' would become a TAB.
file_name = r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\text.txt'
# read() returns the whole file as one string; the with-block closes the
# handle afterwards (the original never closed it).
with open(file_name) as file_object:
    read_data = file_object.read()
print(read_data)
通過readline方式讀入text.txt數(shù)據(jù)
# readline() returns only the next line; the handle is closed by the with-block.
with open(file_name) as file_object:
    readline_data = file_object.readline()
print(readline_data)

# Show how the file position advances as lines are consumed.  Raw string so
# the '\t' of '\text.txt' is not turned into a TAB.
with open(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\text.txt') as fn:
    print(fn.tell())       # position before reading anything
    line1 = fn.readline()  # first line
    print(line1)
    print(fn.tell())       # position after the first line
    line2 = fn.readline()  # second line
    print(line2)
    print(fn.tell())       # position after the second line
(第二)------------數(shù)據(jù)預(yù)覽相關(guān)方法---------------
loandData.head()      # first 5 rows
loandData.info()      # per-column dtype and non-null summary
loandData.shape       # (rows, columns) -- an attribute; calling it raises TypeError
loandData.count()     # non-null observations per column
loandData.describe()  # count, mean, std, min, quartiles and max of numeric columns
df.head(5)                                      # first n rows
df.tail(5)                                      # last n rows
df.shape                                        # (rows, columns)
df.info()                                       # index, dtypes and memory usage
df.describe(percentiles=[.05, .25, .75, .95])   # summary statistics incl. the 5%/95% quantiles
# unique() exists on Series only, so pick the column to inspect.
# NOTE(review): 'incall' is taken from the selections below -- confirm the intended column.
df['incall'].unique()                           # distinct values of one column
s.value_counts(dropna=False)                    # distinct values of a Series with counts, NaN included
df.apply(pd.Series.value_counts)                # value counts for every column
df.sum()                                        # column sums
df.mean()                                       # column means
df.corr()                                       # pairwise column correlations
df.count()                                      # non-null count per column
df.max()                                        # column maxima
df.min()                                        # column minima (the call parentheses were missing)
df.median()                                     # column medians
df.std()                                        # column standard deviations
df.loc[:3, ['incall']]                          # rows up to label 3 of 'incall', as a DataFrame
df[['incall']]                                  # 'incall' as a one-column DataFrame
print(df['incall'])                             # 'incall' as a Series
import pandas as pd
data=pd.read_csv(r'/Users/huangjunwen/Desktop/fandango_score_comparison.csv')
這是我的路徑大家使用的時候記得改,記得改,記得改!
print(data.head(10))#查看前10行數(shù)據(jù),數(shù)據(jù)大的時候可以通過該方式快速看到df的數(shù)據(jù)格式
loanData=loanData[loanData['Status']!='Cancelled']#間接刪除某個字段中某一類數(shù)據(jù)
--增加一列(CreditScore)變量=((最高信用+最低信用)/2).round(0)
loanData['CreditScore']=((loanData.CreditScoreRangeUpper+loanData.CreditScoreRangeLower)/2).round(0)#.round(0)表示取四舍五入
1:----------------------批量數(shù)據(jù)轉(zhuǎn)換-------------------------------------
def loan_status(s):
    """Collapse a raw loan status into one of four coarse outcomes.

    'Chargedoff' and 'Defaulted' both become 'Defaulted'; 'Cancelled' and
    'Current' are returned unchanged; every other value (including missing
    values) is reported as 'Completed'.
    """
    if s in ('Chargedoff', 'Defaulted'):
        return 'Defaulted'
    if s in ('Cancelled', 'Current'):
        return s
    return 'Completed'
loanData['Status']=loanData['LoanStatus'].apply(loan_status)#將數(shù)據(jù)進行轉(zhuǎn)換,增加一列為Status變量:LoanStatus數(shù)據(jù)字段轉(zhuǎn)換為Status(來自loan_status函數(shù)轉(zhuǎn)換了貸款狀態(tài))
------------------------------過濾篩選-----------------------------
DefaultedRatio=loanData[loanData['Status']=='Defaulted']['DebtToIncomeRatio']
CompletedRatio=loanData[loanData['Status']=='Completed']['DebtToIncomeRatio']
2:---------------------批量數(shù)據(jù)轉(zhuǎn)換--先求四分位數(shù)及中位數(shù),再轉(zhuǎn)換-----------------
loanData['BankcardUtilization']的四位數(shù)是0.31
oneFourth=loanData['BankcardUtilization'].quantile(0.25)
loanData['BankcardUtilization']的中位數(shù)是0.6
twoForth=loanData['BankcardUtilization'].quantile(0.5)
def bank_card_use(s, oneForth=0.31, twoForth=0.6):
    """Bucket a bank-card utilisation ratio into a usage label.

    Args:
        s: utilisation ratio.
        oneForth: upper bound of 'Mild Use' (default 0.31).
        twoForth: upper bound of 'Medium Use' (default 0.6).

    Returns:
        'Mild Use' for s <= oneForth, 'Medium Use' up to twoForth,
        'Heavy Use' up to 1, 'Super Use' above 1.  'No Use' is returned only
        when every comparison is false, i.e. for NaN.
    """
    # Each test is reached only after the previous bound failed, so the lower
    # bound of every bucket is implied and plain comparisons are enough.
    if s <= oneForth:
        return 'Mild Use'
    if s <= twoForth:
        return 'Medium Use'
    if s <= 1:
        return 'Heavy Use'
    if s > 1:
        return 'Super Use'
    return 'No Use'
loanData['BankCardUse']=loanData['BankcardUtilization'].apply(bank_card_use)
3:---------------------------缺失值處理相關(guān)---------------------------
填充缺失值
x['age'].fillna(x['age'].mean(), inplace=True)
2.通常情況下刪除含缺失值的行,使用參數axis = 0;刪除列則使用參數axis = 1,但通常不會這么做,因為那樣會刪除整個變量。
print('\ndrop row')
print(df.dropna(axis = 0))
刪除后結(jié)果:
缺失值處理
import pandas as pd # 導(dǎo)入pandas庫
import numpy as np # 導(dǎo)入numpy庫
from sklearn.preprocessing import Imputer # 導(dǎo)入sklearn.preprocessing中的Imputer庫
生成缺失數(shù)據(jù)
df = pd.DataFrame(np.random.randn(6, 4), columns=['col1', 'col2', 'col3', 'col4']) # 生成一份數(shù)據(jù)
df.iloc[1:2, 1] = np.nan # 增加缺失值
df.iloc[4, 3] = np.nan # 增加缺失值
print (df)
查看哪些值缺失
nan_all = df.isnull() # 獲得所有數(shù)據(jù)框中的N值
print (nan_all) # 打印輸出
查看哪些列缺失
nan_col1 = df.isnull().any() # 獲得含有NA的列
nan_col2 = df.isnull().all() # 獲得全部為NA的列
print (nan_col1) # 打印輸出
print (nan_col2) # 打印輸出
丟棄缺失值
df2 = df.dropna() # 直接丟棄含有NA的行記錄
print (df2) # 打印輸出
-------每個字段是否缺失并且展示對應(yīng)的行數(shù)----
missing=pd.concat([loanData.isnull().any(),loanData.count()],axis=1)
--------用中位數(shù)------------補全缺失值
loanData['CreditScore']=loanData['CreditScore'].fillna(loanData['CreditScore'].median())
使用sklearn將缺失值替換為特定值
# sklearn.preprocessing.Imputer was removed from scikit-learn; SimpleImputer
# (sklearn.impute) replaces it.  It always works column by column, so the old
# axis=0 argument is gone, and the missing marker is np.nan, not the string 'NaN'.
from sklearn.impute import SimpleImputer

nan_model = SimpleImputer(missing_values=np.nan, strategy='mean')  # replace NaN with the column mean
nan_result = nan_model.fit_transform(df)  # fit on df and return the imputed array
print(nan_result)
使用pandas將缺失值替換為特定值
# fillna(method=...) is deprecated; bfill()/ffill() are the direct replacements.
nan_result_pd1 = df.bfill()                                # fill each NaN with the next valid value below it
nan_result_pd2 = df.bfill(limit=1)                         # same, but fill at most 1 consecutive NaN
nan_result_pd3 = df.ffill()                                # fill each NaN with the last valid value above it
nan_result_pd4 = df.fillna(0)                              # fill with a constant
nan_result_pd5 = df.fillna({'col2': 1.1, 'col4': 1.2})     # a different constant per column
nan_result_pd6 = df.fillna(df.mean()['col2':'col4'])       # each column's own mean, for col2..col4
打印輸出
print (nan_result_pd1) # 打印輸出
print (nan_result_pd2) # 打印輸出
print (nan_result_pd3) # 打印輸出
print (nan_result_pd4) # 打印輸出
print (nan_result_pd5) # 打印輸出
print (nan_result_pd6) # 打印輸出
針對主要變量的缺失值:只顯示存在缺失的變量,并獲取其缺失數量及缺失率,代碼實現如下:
# Per column: does it contain any NaN, and how many non-null observations it has.
missing = pd.concat([loanData.isnull().any(), loanData.count()], axis=1)
column = ['是否缺失', '數(shù)量']  # labels for the two columns above
missing1 = pd.DataFrame(list(missing.values), index=list(missing.index), columns=column)
# Use the real row count as the reference.  The original took the largest
# non-null count (and stored it in a variable named `max`, shadowing the
# builtin), which under-reports gaps when every column has missing values.
total_rows = len(loanData)
missing1['缺失數(shù)量'] = total_rows - missing1['數(shù)量']   # missing count per column
missing1['缺失率'] = missing1['缺失數(shù)量'] / total_rows  # missing ratio per column
miss = missing1[missing1['數(shù)量'] < total_rows]          # keep only columns that have gaps
4.1:######用中位數(shù)替換CreditScore的缺失值############
loanData['CreditScore']=loanData['CreditScore'].fillna(loanData['CreditScore'].median())
用“NOTA” 替換BorrowerState的缺失值
loanData['BorrowerState']=loanData['BorrowerState'].fillna('NOTA')
4.2:############DebtToIncomeRatio 添加隨機數(shù)缺失值處理#########
DebtToIncomeRatio缺失值添加隨機數(shù)
def rand_missing(s):
    """Return s unchanged when it is >= 0, otherwise a random stand-in.

    NaN fails the `>= 0` comparison, so missing values are replaced by a
    draw from random.uniform(0.1, 0.5).  Negative inputs fail it as well and
    are replaced in the same way.
    """
    if s >= 0:
        return s
    return random.uniform(0.1, 0.5)
DebtToIncomeRatio的缺失值添加0.1~0.5的隨機變量
loanData['DebtToIncomeRatio']=loanData['DebtToIncomeRatio'].apply(rand_missing)
4.3:##############將DelinquenciesLast7Years的缺失值賦值為1################
loanData['DelinquenciesLast7Years'] = loanData['DelinquenciesLast7Years'].fillna(1)
4.4###########2009之后,選出ProsperRating (Alpha)為空的行,然后對行進行刪除#####
missIndex=loanData[(loanData['ProsperRating (Alpha)'].isnull()) & (loanData['DatePhase']=='After Jul.2009')]
loanData=loanData.drop(missIndex.index,axis=0)
4.5#字符串變量轉(zhuǎn)換成數(shù)字變量
數(shù)據(jù)中存在字符串變量,將其用數(shù)字變量進行替換。實現(xiàn)的函數(shù)如下:
定性變量的賦值
# Integer code for every category label, per column handled by harmonize_data().
_CATEGORY_CODES = {
    'Status': {'Completed': 1, 'Defaulted': 0, 'Current': 2},
    'IsBorrowerHomeowner': {False: 0, True: 1},
    'CreditGrade': {'NC': 0, 'HR': 1, 'E': 2, 'D': 3, 'C': 4, 'B': 5, 'A': 6, 'AA': 7},
    'ProsperRating (Alpha)': {'HR': 1, 'E': 2, 'D': 3, 'C': 4, 'B': 5, 'A': 6, 'AA': 7},
    # Codes 2, 4 and 6 (and the '$' prefixes) were missing from the original
    # snippet, which left a gap in the 0..7 scale.
    # NOTE(review): the three restored labels are inferred from that gap --
    # confirm them against the actual IncomeRange values.
    'IncomeRange': {
        'Not displayed': 0,
        'Not employed': 1,
        '0': 2,
        '1-24,999': 3,
        '25,000-49,999': 4,
        '50,000-74,999': 5,
        '75,000-99,999': 6,
        '100,000+': 7,
    },
    'BankCardUse': {'No Use': 0, 'Mild Use': 1, 'Medium Use': 2, 'Heavy Use': 3, 'Super Use': 4},
    'CustomerClarify': {'New Borrower': 0, 'Previous Borrower': 1},
}


def _income_key(value):
    """Return an IncomeRange label without a leading '$'; non-strings pass through."""
    return value.lstrip('$') if isinstance(value, str) else value


def _same(value):
    """Identity lookup key for columns that need no normalisation."""
    return value


def harmonize_data(df):
    """Replace the categorical labels of the loan frame with integer codes.

    The columns listed in _CATEGORY_CODES are rewritten in place; values with
    no code (other labels, NaN) are left exactly as they were.  IncomeRange
    labels are matched with or without their leading '$'.

    Args:
        df: DataFrame containing every column named in _CATEGORY_CODES.

    Returns:
        The same DataFrame object, after modification.

    Raises:
        KeyError: if one of the expected columns is absent.
    """
    for column, codes in _CATEGORY_CODES.items():
        key_of = _income_key if column == 'IncomeRange' else _same
        # Defaults bind this iteration's mapping into the lambda.
        df[column] = df[column].map(
            lambda value, codes=codes, key_of=key_of: codes.get(key_of(value), value)
        )
    return df
#字符串替換成整數(shù)
loanData=harmonize_data(loanData)
loc和iloc的區(qū)別
pandas以類似字典的方式來獲取某一列的值,比如df['A'],這會得到df的A列。如果我們對某一行感興趣呢?這個時候有兩種方法,一種是iloc方法,另一種是loc方法。loc是指location的意思,iloc中的i是指integer。這兩者的區別如下:
loc:works on labels in the index.
iloc:works on the positions in the index (so it only takes integers).
也就是說,loc是根據index(行標簽)來索引:如果df定義了一個index,那么loc就根據這個index來索引對應的行。iloc并不是根據index來索引,而是根據行號(位置)來索引,行號從0開始,逐次加1。
4.6#######################通過循環(huán)計算準確率
result=rfr.predict(X_test)
Y_test
對預(yù)測的準確率進行計算:
def accuracy_statistics(rd, prd):
    """Return the share of predictions that equal the true labels.

    Args:
        rd: true labels, indexable, at least as long as prd.
        prd: predicted labels.

    Returns:
        Matches divided by len(prd), rounded to 4 decimals; 0.0 when prd is
        empty.

    Raises:
        IndexError: if rd is shorter than prd.
    """
    count = len(prd)
    if count == 0:
        return 0.0  # nothing predicted: avoid dividing by zero
    # Start at index 0 -- the original loop began at 1 and never compared the
    # first pair, under-counting the matches.
    hits = sum(1 for i in range(count) if rd[i] == prd[i])
    return round(hits / count, 4)
pecent=accuracy_statistics(list(Y_test.values),list(result))
該模型預測結果的準確率為(1498+4520)/8385=71.77%。
5##########如何安裝fbprophet庫
http://www.lxweimin.com/p/0c06ad7bccaa
1:查看安裝了那些包 pip list
2: 查看那些包需要更新版本pip list --outdated
3:創(chuàng)建一個新的python環(huán)境:conda create -n fbprophet_python3.7 python=3.7
4:激活已創(chuàng)建新的python環(huán)境:conda activate fbprophet_python3.7
如何安裝fbprophet庫
1、創(chuàng)建一個新的環(huán)境按照官網(wǎng)的做法,創(chuàng)建一個新的python環(huán)境
conda create -n fbprophet_python3.7 python=3.7
2.激活已創(chuàng)建新的python環(huán)境:conda activate fbprophet_python3.7
3.安裝C++的編譯工具
4.conda install libpython m2w64-toolchain -c msys2
5、安裝依賴庫
conda install numpy cython -c conda-forge
conda install matplotlib scipy pandas -c conda-forge
6.安裝pystan
conda install pystan -c conda-forge
7.安裝fbprophet
conda install fbprophet -c conda-forge
8.還是有點問題的。
這樣吧在GitHub上,下載一個壓縮包,用壓縮包安裝
https://github.com/facebook/prophet
conda activate fbprophet_python3.7
我又做了一次掙扎,fbprophet也用conda安裝
conda install -c conda-forge fbprophet
(第六)##############
python——修改Dataframe列名的兩種方法
首先新建一個Dataframe
import pandas as pd
df = pd.DataFrame({'a':[1,2,3],'b':[1,2,3]})
如下:
a b
0 1 1
1 2 2
2 3 3
1、修改列名a,b為A、B。
df.columns = ['A','B']
2、只修改列名a為A
# rename() returns a new DataFrame and leaves df untouched, so the result has
# to be assigned back for the change to take effect.
df = df.rename(columns={'a': 'A'})
Python中dataframe\ array\ list相互轉(zhuǎn)化
1、list 轉(zhuǎn)化成array矩陣
np.array(result).T
2、array轉(zhuǎn)化成dataframe
pd.DataFrame(result)
3、把Pandas中的dataframe轉(zhuǎn)成numpy中的array
使用 df=df.values,
print(len(X))--查詢數(shù)組個數(shù)
type(X)--查詢數(shù)據(jù)類型
python 切片無循序取其中某列
rr111=rr11[['kelian_x','qingjiatianshu_x','meancall_x','meanlong_x','meanxiapi_x','聚類類別']]
--------python 切片有循序取其中某列
rr111=rr11.iloc[:,0:6]
---python left join
rr11=pd.merge(left=rr1, right=r0, how='left', left_on=rr1.index, right_on=r0.index)# 關(guān)聯(lián)取出類別數(shù)目
------------------------------安裝分詞庫jieba-------------------------------------------------------------------------------------------------
1:打開Anaconda Prompt
2: pip install jieba
----------------------------------------本地讀入list格式數(shù)據(jù)-------------------------------------------------------------------------------------
import jieba
def stopwordslist(filepath):
    """Read a UTF-8 stop-word file and return one stripped entry per line.

    Args:
        filepath: path of the text file, one stop word per line.

    Returns:
        List of the file's lines with surrounding whitespace removed.
    """
    # The with-block closes the file; the original left the handle open.
    with open(filepath, 'r', encoding='utf-8') as handle:
        return [line.strip() for line in handle]
對句子進行分詞
# Load the stop-word list.  Raw string: in a plain literal '\201' is parsed
# as an octal escape, so the path would point at a different file.
stopwords = stopwordslist(r'D:\2019年python代碼\python_book\chapter4\stopwords.txt')
---------------------------------------在python中如何將兩個list合并成一個list----------------------------------------------------------------
1:用list的extend方法,L1.extend(L2),該方法將參數(shù)L2的全部元素添加到L1的尾部,例如:
2:用切片(slice)操作,L1[len(L1):len(L1)] = L2和上面的方法等價,例如:
但切片方法用起來更靈活,可以插入到頭部,或其他任意部位,例如:
加到開頭:
3:加到中間:
-----------------------------------------python數(shù)組讀入數(shù)據(jù)-----------------------------------------------------------------------------------
data=np.loadtxt('.txt')
--------------------------------pd.set_option('display.max_rows', 4)#限定只顯示四行--------------------------------------------------------------------------
pd.set_option('display.max_rows', 4)#限定只顯示四行
-------------------------------------------刪除NA缺失數(shù)據(jù)---------------------------------------------------------------------------------------------------
data=data.dropna()#刪除帶有NA的所有行與列
print(data.shape)#查看總樣本數(shù)及總字段數(shù)
print(list(data.columns))#查看所有列
查看數(shù)據(jù)內(nèi)容:
----------------------------------------------查前面五行數(shù)據(jù)或者最后五行數(shù)據(jù)數(shù)據(jù)------------------------------------------------------------------------------
data1.head(5)#前面五行數(shù)據(jù)
data1.tail(5)#最后五行數(shù)據(jù)
--------------------------------------------------------去重unique---------------------------------------------------------------------------------------------
data['education'].unique()#去重
----------------------------------------把data數(shù)據(jù)中的y列的'yes'值改為1,'no'值改為0----------------------------------------------------------------------------
把y變?yōu)閿?shù)值型,并進行簡單的統(tǒng)計。
data.loc[data['y']=='yes','y']=1#把data數(shù)據(jù)中的y列的'yes'值改為1
data.loc[data['y']=='no','y']=0#把data數(shù)據(jù)中的y列的'no'值改為0
data['y'].value_counts()#分類對應(yīng)的個數(shù)
----------------------------------------------------畫直方圖------------------------------------------------------------------------------------------------------
sns.countplot(x='y',data=data,palette='hls')#畫直方圖
--------------------------------------------------------groupby----------------------------------------------------------------------------------------------------
data.groupby('y').mean()
-----------------------------------------------------取兩表差集合(相當(dāng)去重)-------------------------------------------------------------------
--方法1取兩表差集合(相當(dāng)去重)
# Method 1: rows of df1 whose id has no counterpart in df2.
df1 = pd.DataFrame({'id': [1, 2, 3], 'col1': ['a', 'b', 'c']})
df2 = pd.DataFrame({'id': [4, 3], 'col2': ['d', 'e']})
# A left join keeps every df1 row; rows without a match get NaN in col2.
ddd = pd.merge(df1, df2, how='left', left_on='id', right_on='id')
# Rows containing any NaN are the unmatched ones; keep only df1's columns.
unmatched = ddd.isnull().any(axis=1)
ddd1 = ddd.loc[unmatched, ['id', 'col1']]
--方法2取兩表差集合(相當(dāng)去重)
# Method 2: collect the ids that occur in df1 but not in df2, then filter df1.
d1 = df1['id'].values.tolist()
d2 = df2['id'].values.tolist()
d2_ids = set(d2)  # set membership instead of rescanning the list for every id
data = [i for i in d1 if i not in d2_ids]
df3 = df1[df1['id'].isin(data)]
----------------------------------------------dataframe求均值-------------------------------------------------------------
dataframe按列求均值
df_mean=df[['col1','col2']].mean(axis=0)#數(shù)據(jù)框dataframe求均值
print(df_mean)
dataframe按行求均值
df_mean=df[['col1','col2']].mean(axis=1)#數(shù)據(jù)框dataframe求均值
print(df_mean)
-------------------------------------------------dataframe求std(標準差)------------------------------------------------------------
dataframe按列求std
df_std=df[['col1','col2']].std(axis=0)  # standard deviation of col1 and col2, one value per column
print(df_std)
dataframe按行求std
df_std=df[['col1','col2']].std(axis=1)  # standard deviation across col1 and col2, one value per row
print(df_std)
---------------------------------------------#dataframe求var(方差)-------------------------------------------------------
dataframe按列求var(方差)
df_var=df[['col1','col2']].var(axis=0)  # variance of col1 and col2, one value per column
print(df_var)
dataframe按行求var(方差)
df_var=df[['col1','col2']].var(axis=1)  # variance across col1 and col2, one value per row
print(df_var)
--------------------------------------------#dataframe按列求max-----------------------------------------------------------
dataframe按列求max
df_max=df[['col1','col2']].max(axis=0)  # maximum of col1 and col2, one value per column
print(df_max)
dataframe按行求max
df_max=df[['col1','col2']].max(axis=1)  # larger of col1 and col2, one value per row
print(df_max)
----------------------------------------------#dataframe求min-------------------------------------------------------------
dataframe按列求min
df_min=df[['col1','col2']].min(axis=0)  # minimum of col1 and col2, one value per column
print(df_min)
dataframe按行求min
df_min=df[['col1','col2']].min(axis=1)  # smaller of col1 and col2, one value per row
print(df_min)
-------------------------------------------------#dataframe求sum---------------------------------------------------------
dataframe按列求sum
df_sum=df[['col1','col2']].sum(axis=0)  # sum of col1 and col2, one value per column
print(df_sum)
dataframe按行求sum
df_sum=df[['col1','col2']].sum(axis=1)  # col1 + col2, one value per row
print(df_sum)
---------------------------------------------------#dataframe求count------------------------------------------------------------
dataframe按列求count
df_count=df[['col1','col2']].count(axis=0)  # number of non-null values in col1 and col2, per column
print(df_count)
dataframe按行求count
df_count=df[['col1','col2']].count(axis=1)  # number of non-null values among col1 and col2, per row
print(df_count)
------------------------------------------------#dataframe求中位數(shù)--------------------------------------------------------------
df_median=df[['col1','col2']].median(axis=0)  # median of col1 and col2, one value per column
print(df_median)
dataframe按行求median
df_median=df[['col1','col2']].median(axis=1)  # median across col1 and col2, one value per row
print(df_median)
-------------------------------------------------dataframe按describe求常規(guī)統(tǒng)計量,不能用axis=1或者axis=2------------------------------
df_describe=df[['col1','col2']].describe()  # summary statistics table for col1 and col2
print(df_describe)
------------------------------------------------#dataframe按corr()求相關(guān)系數(shù),不能用axis=1或者axis=2-----------------------------------
df_corr=df.corr()  # pairwise correlation coefficients between the columns of df
print(df_corr)
----------------------------------------------------dataframe判斷重復(fù)數(shù)據(jù)------------------------------------------------------------
重復(fù)值處理
import pandas as pd # 導(dǎo)入pandas庫
生成重復(fù)數(shù)據(jù)
data1 = ['a', 3]
data2 = ['b', 2]
data3 = ['a', 3]
data4 = ['c', 2]
df = pd.DataFrame([data1, data2, data3, data4], columns=['col1', 'col2'])
print (df)
判斷重復(fù)數(shù)據(jù)
isDuplicated = df.duplicated() # 判斷重復(fù)數(shù)據(jù)記錄
print (isDuplicated) # 打印輸出
-------------------------------------------------------dataframe刪除重復(fù)值------------------------------------------------------------
刪除重復(fù)值
new_df1 = df.drop_duplicates() # 刪除數(shù)據(jù)記錄中所有列值相同的記錄
new_df2 = df.drop_duplicates(['col1']) # 刪除數(shù)據(jù)記錄中col1值相同的記錄
new_df3 = df.drop_duplicates(['col2']) # 刪除數(shù)據(jù)記錄中col2值相同的記錄
new_df4 = df.drop_duplicates(['col1', 'col2']) # 刪除數(shù)據(jù)記錄中指定列(col1/col2)值相同的記錄
print (new_df1) # 打印輸出
print (new_df2) # 打印輸出
print (new_df3) # 打印輸出
print (new_df4) # 打印輸出
--------------------------------------------------------轉(zhuǎn)義字符--------------------------------------------------------------------
- 轉義字符'\'
轉義字符\可以轉義很多字符,比如\n表示換行,\t表示制表符;字符\本身也要轉義,所以\\表示的字符就是\
# The backslashes of this demo were lost, leaving unterminated string literals.
print('It\'s a dog!')                    # \' puts a quote inside a single-quoted string
print("hello world!\nhello Python!")     # \n starts a new line
print('\\\t\\')                          # backslash, TAB, backslash
原樣輸出引號內(nèi)字符串可以使用在引號前加r
# In a raw string nothing is unescaped, so this prints the characters exactly
# as typed.  (A raw string cannot end in a single backslash, which is why the
# damaged original did not parse.)
print(r'\\\t\\')
-------------------------------------------------------日期天數(shù)相減------------------------------------------------------------------
import datetime

# Subtracting one date from another yields a timedelta; its .days attribute
# is the number of whole days between them.
someDay = datetime.date(year=1999, month=2, day=10)
anotherDay = datetime.date(year=1999, month=2, day=15)
deltaDay = anotherDay - someDay
deltaDay.days
----------------------------------------------------------列表(list)簡單操作---------------------------------------------------------
列表(list)用來存儲一連串元素的容器,列表用[]來表示,其中元素的類型可不相同。
students = ["ming", "hua", "li", "juan", "yun", 3]
print (students)
type(students)
列表索引和切片
索引從0開始,含左不含右
print ('[4]=', students[4])
print ('[-4]=', students[-4])
print ('[0:4]=', students[0:4])
print( '[4:]=', students[4:])
print ('[0:4:2]=', students[0:4:2])
print ('[-5:-1:]=', students[-5:-1:])
print ('[-2::-1]=', students[-2::-1])
修改列表
students[3] = "小月"
print (students[3])
students[5]="小楠"
print (students[5])
students[5]=19978
print (students[5])
插入元素
students.append('han') # 只能添加一個添加到尾部
students.extend(['long', 'wan'])#可以添加多個到尾部
print (students)
scores = [90, 80, 75, 66]
students.insert(6, scores) # 添加到指定左邊第一位置,不會去重
students
刪除元素
print (students.pop(1)) # 該函數(shù)返回被彈出的元素,不傳入?yún)?shù)則刪除最后一個元素
print (students)
判斷元素是否在列表中等
print( 'wan' in students)
print ('han' not in students)
students.count('wan')#計算個數(shù)
students.index('wan')#向量在哪個位置
range函數(shù)生成整數(shù)列表
# In Python 3 range() is lazy and prints as 'range(0, 10)'; wrap it in list()
# to show the integers it produces.
print(list(range(10)))           # 0..9
print(list(range(-5, 5)))        # -5..4
print(list(range(-10, 10, 2)))   # every second integer from -10 up to 8
print(list(range(16, 10, -1)))   # counting down from 16 to 11
---------------------------------------------元組(tuple)簡單操作--------------------------------------------------------------------
### 3.2.2 元組(tuple)
元組類似列表,元組里面的元素也是進行索引計算。列表里面的元素的值可以修改,而元組里面的元素的值不能修改,只能讀取。元組的符號是()。
studentsTuple = ("ming", "jun", "qiang", "wu", scores)
studentsTuple
# The original rebound studentsTuple to this list, so the statements that
# follow no longer operated on a tuple: the item assignment succeeded instead
# of raising TypeError, and .index('jun') failed.  The list gets its own name.
studentsList = ['aaa', 'dddd']       # a list, for comparison
studentsTuple1 = ('aaa', 'dddd')     # a tuple with the same items
print(studentsList)
type(studentsTuple1)
判斷異常
# Item assignment on a tuple raises TypeError.
try:
    studentsTuple[1] = 'fu'
except TypeError:
    print('TypeError')
scores[1]= 100
studentsTuple
'ming' in studentsTuple
studentsTuple[0:4]
studentsTuple.count('ming')
studentsTuple.index('jun')
len(studentsTuple)
-------------------------------------------集合(set)簡單操作-------------------------------------------------------------------------
### 3.2.3 集合(set),去重功能
Python中集合主要有兩個功能,一個功能是進行集合操作,另一個功能是消除重復元素。集合的格式是:set(),其中()內可以是列表、字典或字符串,因為字符串本身也是可迭代的字符序列。
students = ["ming", "hua", "li", "juan", "yun", 3]
studentsSet = set(students)
print (studentsSet)
studentsSet.add('xu')#集合增加一個字符
print (studentsSet)
studentsSet.remove('xu')#集合減掉一個字符
print (studentsSet)
a = set("abcnmaaaaggsng")
print ('a=', a)
b = set("cdfm")
print ('b=', b)
交集
x = a & b
print( 'x=', x)
并集
y = a | b
print ('y=', y)
差集
z = a - b
print( 'z=', z)
去除重復(fù)元素
new = set(a)
print(new)  # show the de-duplicated set just built (the original printed z by mistake)
------------------------------------------------------------字典(dict)簡單操作--------------------------------------
### 3.2.4字典(dict)
Python中的字典dict也叫做關聯數組,用大括號{}括起來,在其他語言中也稱為map,使用鍵-值(key-value)存儲,具有極快的查找速度,其中key不能重復。
k = {"name":"weiwei", "home":"guilin"}
print (k["home"])
print( k.keys())#鍵
print( k.values())#值
a={"success":True,"reason_code":"200","reason_desc":"獲取成功",
"rules":[{"rule_id":"1062274","score":7,"conditions":[{"address_a_value":
"南通市","address_a":"mobile_address","address_b":"true_ip_address","address_b_value":"南京市","type":"match_address"}]}]}
print(a["success"])
添加、修改字典里面的項目
k["like"] = "music"
k['name'] = 'guangzhou'
print (k)
判斷key是否存在
print ('name' in k)
has_key方法在python2中是可以使用的,在python3中刪除了。
print (k.has_key('name'))
改為:
# Python 3 replacement for dict.has_key(): the `in` operator.
if 'name' in k:
    print("Yes")
k.get('edu', -1) # 通過dict提供的get方法,如果key不存在,可以返回None,或者自己指定的value
刪除key-value元素
k.pop('like')
print (k)
--------------------------------------------------------if語句-----------------------------------------------------------------------
### 3.3.2 分支結(jié)構(gòu):Python中if語句是用來判斷選擇執(zhí)行哪個語句塊的
=============================================================================
if <True or False表達式>:
    執行語句塊
elif <True or False表達式>:
    執行語句塊
else:  # 都不滿足
    執行語句塊
=============================================================================
elif子句可以有多條,elif和else部分可省略
salary = 30000
# Conditions are tested top to bottom; only the first true branch runs.
if salary > 10000:
    print("Wow!!!!!!!")
elif salary > 5000:
    print("That's OK.")
elif salary > 3000:
    print("5555555555")
else:
    print("..........")
-----------------------------------------------for 循環(huán)------------------------------------------------------------------------------
- for 循環(huán)
=============================================================================
for (條件變量) in (集合):
執(zhí)行語句塊
=============================================================================
“集合”并不單指set,而是“形似”集合的列表、元組、字典、數(shù)組都可以進行循環(huán)
條件變量可以有多個
heights = {'Yao':226, 'Sharq':216, 'AI':183}
# Iterating a dict yields its keys.
for i in heights:
    print(i, heights[i])
for key, value in heights.items():-Python3 不能使用dict.iteritems(),改為dict.items()
# items() yields (key, value) pairs, unpacked into two loop variables.
for key, value in heights.items():
    print(key, value)
# Add up the integers 1..100 with an explicit loop.
total = 0
for i in range(1, 101):
    total += i
# NOTE(review): indentation was lost in the source; printing once after the
# loop is assumed -- confirm it was not meant to print every running total.
print(total)
-------------------------------------------------------while 循環(huán)簡單操作-------------------------------------------------------------
### 3.3.3 循環(huán)結(jié)構(gòu)
while 循環(huán)
=============================================================================
while <True or False表達式>:
    循環執行語句塊
else:  # 不滿足條件
    執行語句塊
=============================================================================
else部分可以省略
a = 1
while a < 10:
    if a <= 5:
        print(a)
    else:
        print("Hello")
    a = a + 1
else:
    # Runs once when the loop condition becomes false (no break occurred).
    print("Done")
---------------------------------------------- break、continue和pass-----------------------------------------------------------------
*** 練習(xí):使用循環(huán)和分支結(jié)構(gòu)輸出20以內(nèi)的奇數(shù)
### 3.3.4 break、continue和pass
break:跳出循環(huán)
continue:跳過本次循環中剩余的語句,直接進入下一次迭代
pass:占位符,什么也不做
# break: leave the loop entirely -- prints 1, 2.
for i in range(1, 5):
    if i == 3:
        break
    print(i)

# continue: skip the rest of this iteration -- prints 1, 2, 4.
for i in range(1, 5):
    if i == 3:
        continue
    print(i)

# pass: do nothing -- prints 1, 2, 3, 4.
for i in range(1, 5):
    if i == 3:
        pass
    print(i)
---------------------------------------------------------排序相關(guān)--------------------------------------------------------------------
列表排序
myList = [-1, 2, -3, 4, -5, 6, 7]
print(sorted(myList))#升序排序
print(sorted(myList,reverse=True))#降序排序
### 5.1.5 數(shù)據(jù)框排序
#### 1. 排序
sample=pd.DataFrame({'name':['Bob','Lindy','Mark','Miki','Sully','Rose'],
'score':[98,78,87,77,77,np.nan],
'group':[1,1,1,2,1,2],})
sample
按'score'這個字段降序排序
sample.sort_values('score',ascending=False,na_position='last')
按'score'這個字段升序排序
sample.sort_values('score',ascending=True,na_position='last')
sample.sort_values(['group','score'])
sample.sort_values(['group','score'])
-------------------------------------------------------數(shù)據(jù)框分組匯總--------------------------------------
### 5.1.6 數(shù)據(jù)框分組匯總
sample = pd.read_csv(r'D:\《Python數(shù)據(jù)科學(xué):技術(shù)詳解與商業(yè)實踐》源代碼文件\Python_book\5Preprocessing\sample.csv', encoding='gbk')
sample.head()
# One aggregate of 'math' per class.
sample.groupby('class')[['math']].min()
sample.groupby('class')[['math']].max()
sample.groupby('class')[['math']].count()
sample.groupby('class')[['math']].mean()
sample.groupby('class')[['math']].sum()
sample.groupby(['grade', 'class'])[['math']].mean()  # group by two keys
# Several columns must be selected with a list: groupby(...)['math', 'chinese']
# (a bare tuple) is rejected by pandas 1.0 and later.
sample.groupby(['grade'])[['math', 'chinese']].mean()
sample.groupby('class')['math'].agg(['mean', 'min', 'max'])  # several aggregates at once
df = sample.groupby(['grade', 'class'])[['math', 'chinese']].agg(['min', 'max'])
df
-------------------------------------------------------拆分、堆疊列---------------------------------------------
### 5.1.7 拆分、堆疊列
table = pd.DataFrame({'cust_id': [10001, 10001, 10002, 10002, 10003],
                      'type': ['Normal', 'Special_offer',
                               'Normal', 'Special_offer', 'Special_offer'],
                      'Monetary': [3608, 420, 1894, 3503, 4567]})
# Long -> wide: one row per customer, one column per type (mean by default).
pd.pivot_table(table, index='cust_id', columns='type', values='Monetary')
# Same layout, summing instead and writing 0 where a combination is absent.
pd.pivot_table(table, index='cust_id', columns='type', values='Monetary',
               fill_value=0, aggfunc='sum')
# aggfunc='sum' rather than np.sum: passing the NumPy callable is deprecated
# in recent pandas, and the string matches the call above.
table1 = pd.pivot_table(table, index='cust_id',
                        columns='type',
                        values='Monetary',
                        fill_value=0,
                        aggfunc='sum').reset_index()
table1
# Wide -> long again: one row per (cust_id, TYPE) pair.
pd.melt(table1,
        id_vars='cust_id',
        value_vars=['Normal', 'Special_offer'],
        value_name='Monetary',
        var_name='TYPE')
--------------------------------------------------數(shù)據(jù)框橫向連接/縱向連接-------------------------------------------
### 5.1.3 橫向連接
df1 = pd.DataFrame({'id':[1,2,3],
'col1':['a','b','c']})
df2 = pd.DataFrame({'id':[4,3],
'col2':['d','e']})
#### 1. 數(shù)據(jù)框內(nèi)連接,類似inner join
df1.merge(df2,how='inner',left_on='id',right_on='id')
#### 2. 左連接,類似left join
df1.merge(df2,how='left',left_on='id',right_on='id')
df1.merge(df2,how='left',on='id')
#### 3. 右連接,類似right join
df1.merge(df2,how='right',on='id')
#### 3. 行索引連接
df1 = pd.DataFrame({'id1':[1,2,3],
'col1':['a','b','c']},
index = [1,2,3])
df2 = pd.DataFrame({'id2':[1,2,3],
'col2':['aa','bb','cc']},
index = [1,3,2])
pd.concat([df1,df2],axis=1)
df1.join(df2)
### 5.1.4 縱向合并
df1 = pd.DataFrame({'id':[1,1,1,2,3,4,6],
'col':['a','a','b','c','v','e','q']})
df2 = pd.DataFrame({'id':[1,2,3,3,5],
'col':['x','y','z','v','w']})
pd.concat([df1,df2],ignore_index=True,axis=0)
pd.concat([df1,df2],ignore_index=True).drop_duplicates()#縱向合并并且去重
df3 = df1.rename(columns = {'col':'new_col'})#數(shù)據(jù)框某列改名
pd.concat([df1,df3],ignore_index=True).drop_duplicates()
----------------------------------------------------------------數(shù)據(jù)整合--------------------------------------
5.1 數(shù)據(jù)整合
### 5.1.1 行列操作
#### 1. 單列
import pandas as pd
import numpy as np
sample = pd.DataFrame(np.random.randn(4, 5),
                      columns=['a', 'b', 'c', 'd', 'e'])
print(sample)
sample['a']               # one column, as a Series
sample[['a', 'b']]        # several columns, as a DataFrame
sample[0:2]               # first two rows
sample.iloc[1:2, 0:2]     # rows and columns by position
# .ix was removed in pandas 1.0; .loc selects by label, .iloc by position.
sample.loc[:, 'a']
sample[['a']]
#### 2. Select several rows and columns
# Same cells the old sample.ix[0:2, 0:2] returned: with an integer index the
# row slice was label-based (end-inclusive, rows 0-2); columns were positional.
sample.iloc[0:3, 0:2]
#### 3. Create and drop columns
sample['new_col1'] = sample['a'] - sample['b']
sample
數(shù)據(jù)框增加兩列
sample_new=sample.assign(new_col2 = sample['a'] - sample['b'],
new_col3 = sample['a'] + sample['b'])
sample_new
sample.drop('a',axis=1)#數(shù)據(jù)框刪除某一列
### 5.1.2 條件查詢
sample = pd.DataFrame({'name': ['Bob', 'Lindy', 'Mark',
                                'Miki', 'Sully', 'Rose'],
                       'score': [98, 78, 87, 77, 65, 67],
                       'group': [1, 1, 1, 2, 1, 2]})
sample
#### 1. Single condition
sample.score > 70            # boolean mask
sample[sample.score > 70]    # rows where the mask is True
#### 2. Several conditions (parenthesise each one, combine with & / |)
sample[(sample.score > 70) & (sample.group == 1)]
#### 3. query() with an expression string
sample.query('score > 90')
sample.query('(group ==2) |(group == 1)')
#### 4. Other Series helpers: between, isin, str.contains
# inclusive must be 'both'/'neither'/'left'/'right'; the boolean form
# (inclusive=True) is rejected by pandas 2.x.
sample[sample['score'].between(70, 80, inclusive='both')]
sample[sample['name'].isin(['Bob', 'Lindy'])]
sample[sample['name'].str.contains('[M]+')]  # regular-expression match on the name
### 5.1.8 賦值與條件賦值
#### 1. 賦值
sample = pd.DataFrame({'name':['Bob','Lindy','Mark',
'Miki','Sully','Rose'],
'score':[99,78,999,77,77,np.nan],
'group':[1,1,1,2,1,2],})
sample.score.replace(999,np.nan)
sample.replace({'score':{999:np.nan},
'name':{'Bob':np.nan}})
#### 2. 條件賦值
def transform(row):
    """Map a row's 'group' value to its class label.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with a 'group' entry.

    Returns:
        'class1' for group 1, 'class2' for group 2, and None for any other
        value (the original fell through without an explicit return).
    """
    labels = {1: 'class1', 2: 'class2'}
    return labels.get(row['group'])
sample.apply(transform,axis=1)
sample.assign(class_n = sample.apply(transform,axis=1))
sample = sample.copy()
sample.loc[sample.group==1,'class_n']='class1'
sample.loc[sample.group==2,'class_n']='class2'