A Summary of Common Basic Python Commands

Original article: if you download or repost it, please credit the source link; otherwise it will be treated as infringement.

(Part 1) ------------------ Data loading methods --------------

import numpy as np
import pandas as pd
import seaborn as sns
import random
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
loanData = pd.read_csv("")  # fill in the path to your CSV file

Writing several sheets into one local Excel file with pandas

import pandas as pd
# Read the two source workbooks
data1 = pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系統(tǒng)入賬金額.xlsx')
data2 = pd.read_excel(r'C:\Users\xn084037\Desktop\副本三代核心系統(tǒng)入賬金額異常結(jié)果數(shù)據(jù).xlsx')
# Write both tables into the same Excel file, one sheet each
writer = pd.ExcelWriter('D:新表.xlsx')
data1.to_excel(writer, sheet_name='sheet1')
data2.to_excel(writer, sheet_name='sheet2')
writer.save()  # must be called, otherwise nothing is written to disk

------ pandas read_csv: read data and supply your own column names --------

3. Reading data with pandas read_csv, read_fwf and read_table

import pandas as pd  # DataFrame
csv_data = pd.read_csv(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\csv_data.csv', names=['aa', 'bb', 'cc', 'dd', 'hh'])  # read the data and name the columns yourself
print(csv_data)

-------- pandas read_fwf: read fixed-width data and supply your own column names ----------
import pandas as pd  # DataFrame
fwf_data = pd.read_fwf(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\fwf_data', widths=[5, 5, 5, 5], names=['col1', 'col2', 'col3', 'col4'])  # read fixed-width data
print(fwf_data)  # print the data
---------- pandas read_table: read a txt file and supply your own column names ---------
import pandas as pd  # DataFrame
table_data = pd.read_table(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\table_data.txt', sep=';',
                           names=['aa', 'ab', 'ac', 'ad', 'ah'])
print(table_data)
------------ Reading a txt file with numpy ---------------------
import numpy as np  # array format
file_name = r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\numpy_data.txt'
data = np.loadtxt(file_name, dtype='float32', delimiter=' ')
print(data)
------------ Reading an npy file with numpy ----------------
import numpy as np  # npy format
write_data = np.array([[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12]])
np.save(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\load_data1', write_data)
read_data = np.load(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\load_data1.npy')

1. Reading data with read, readline and readlines

Read text.txt with read

file_name = r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\text.txt'
file_object = open(file_name)
read_data = file_object.read()
print(read_data)

Read text.txt with readline

file_object = open(file_name)
readline_data = file_object.readline()
print(readline_data)

fn = open(r'D:\Python數(shù)據(jù)分析與數(shù)據(jù)化運營\python_book\chapter2\text.txt')  # get the file object
print(fn.tell())       # current position of the file pointer
line1 = fn.readline()  # read the first line
print(line1)           # print the first line
print(fn.tell())       # pointer position after the first read
line2 = fn.readline()  # read the second line
print(line2)           # print the second line
print(fn.tell())       # pointer position after the second read
fn.close()             # close the file object
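
readlines, mentioned in the heading above but not demonstrated in the original, returns a list with one string per line; a minimal sketch reusing the same file:

file_object = open(file_name)
readlines_data = file_object.readlines()  # list of lines, newline characters included
print(readlines_data)
file_object.close()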

(Part 2) ------------ Data preview methods ---------------

loanData.head()      # first 5 rows
loanData.info()      # structure of every column (dtype, non-null count, memory)
loanData.shape       # number of rows and columns (an attribute, not a method)
loanData.count()     # number of non-null observations per column
loanData.describe()  # basic statistics per numeric column (count, mean, std, min, quartiles, median, max)

df.head(5)   # first n rows of the DataFrame
df.tail(5)   # last n rows of the DataFrame
df.shape     # number of rows and columns
df.info()    # index, dtypes and memory usage
df.describe(percentiles=[.05, .25, .75, .95])  # summary statistics of numeric columns: count, mean, std, min/max and the 25%/50%/75% quantiles, plus the 0.05 and 0.95 quantiles requested via percentiles
s.unique()   # unique values of a Series (a single column); a DataFrame itself has no unique() method
s.value_counts(dropna=False)        # unique values of a Series and their counts
df.apply(pd.Series.value_counts)    # unique values and counts for every column of the DataFrame
df.sum()     # column sums
df.mean()    # column means
df.corr()    # correlation matrix between columns
df.count()   # number of non-null values per column
df.max()     # column maxima
df.min()     # column minima
df.median()  # column medians
df.std()     # column standard deviations
df.loc[:3, ['incall']]  # select specific rows of the incall column by label; returns a DataFrame
df[['incall']]          # select the incall column; returns a DataFrame
print(df['incall'])     # select the incall column; returns a Series
import pandas as pd
data=pd.read_csv(r'/Users/huangjunwen/Desktop/fandango_score_comparison.csv')

This is my local path; remember to change it before you run the code!

print(data.head(10))  # first 10 rows; a quick way to see the layout of a large DataFrame

loanData = loanData[loanData['Status'] != 'Cancelled']  # drop one category of a column indirectly, by filtering it out
-- Add a column: CreditScore = ((upper credit score + lower credit score) / 2).round(0)
loanData['CreditScore'] = ((loanData.CreditScoreRangeUpper + loanData.CreditScoreRangeLower) / 2).round(0)  # .round(0) rounds to the nearest integer

1: ---------------------- Batch value conversion -------------------------------------
def loan_status(s):
    # Collapse the loan status into 4 buckets: defaulted, cancelled, current, completed
    if s == 'Chargedoff':      # Chargedoff (written off, investors lose money)
        a = 'Defaulted'        # treat as Defaulted (bad debt, investors lose money)
    elif s == 'Defaulted':     # Defaulted (bad debt, investors lose money)
        a = 'Defaulted'
    elif s == 'Cancelled':     # Cancelled
        a = 'Cancelled'
    elif s == 'Current':       # Current (still being repaid)
        a = 'Current'
    else:
        a = 'Completed'        # Completed (repaid normally, in final payment or repaid late; no loss for investors)
    return a

loanData['Status'] = loanData['LoanStatus'].apply(loan_status)  # add a Status column derived from LoanStatus via loan_status

------------------------------ Filtering -----------------------------
DefaultedRatio=loanData[loanData['Status']=='Defaulted']['DebtToIncomeRatio']
CompletedRatio=loanData[loanData['Status']=='Completed']['DebtToIncomeRatio']

2: --------------------- Batch value conversion: compute the quartile and median first, then convert -----------------

The first quartile of loanData['BankcardUtilization'] is 0.31

oneFourth = loanData['BankcardUtilization'].quantile(0.25)

The median of loanData['BankcardUtilization'] is 0.6

twoForth = loanData['BankcardUtilization'].quantile(0.5)

def bank_card_use(s, oneForth=0.31, twoForth=0.6):  # the two thresholds come from business experience
    if s <= oneForth:
        b = 'Mild Use'
    elif (s > oneForth) & (s <= twoForth):
        b = 'Medium Use'
    elif (s > twoForth) & (s <= 1):
        b = 'Heavy Use'
    elif s > 1:
        b = 'Super Use'
    else:
        b = 'No Use'   # comparisons with NaN are all False, so missing values end up here
    return b

loanData['BankCardUse']=loanData['BankcardUtilization'].apply(bank_card_use)

3: --------------------------- Handling missing values ---------------------------

Fill a missing value

x['age'].fillna(x['age'].mean(), inplace=True)
2. Rows are usually dropped with axis=0; dropping columns uses axis=1, but that is rarely done because it removes an entire variable (a sketch contrasting the two axes follows below).
print('\ndrop row')
print(df.dropna(axis = 0))
Result after dropping: (output not shown here)

Handling missing values

import pandas as pd  # pandas
import numpy as np   # numpy
from sklearn.preprocessing import Imputer  # Imputer from sklearn.preprocessing (replaced by SimpleImputer in newer scikit-learn versions)

Generate data containing missing values

df = pd.DataFrame(np.random.randn(6, 4), columns=['col1', 'col2', 'col3', 'col4'])  # generate a data set
df.iloc[1:2, 1] = np.nan  # introduce a missing value
df.iloc[4, 3] = np.nan    # introduce a missing value
print(df)

Check which values are missing

nan_all = df.isnull()  # boolean mask of NaN values over the whole DataFrame
print(nan_all)

Check which columns contain missing values

nan_col1 = df.isnull().any()  # columns that contain at least one NA
nan_col2 = df.isnull().all()  # columns that are entirely NA
print(nan_col1)
print(nan_col2)

Drop missing values

df2 = df.dropna()  # drop every row that contains an NA
print(df2)
------- Whether each column has missing values, together with its row count ----
missing = pd.concat([loanData.isnull().any(), loanData.count()], axis=1)
-------- Fill missing values with the median ------------
loanData['CreditScore'] = loanData['CreditScore'].fillna(loanData['CreditScore'].median())

Replace missing values with a specific value using sklearn

nan_model = Imputer(missing_values='NaN', strategy='mean', axis=0)  # replacement rule: fill NaN values with the column mean
nan_result = nan_model.fit_transform(df)  # apply the rule
print(nan_result)

Replace missing values with a specific value using pandas

nan_result_pd1 = df.fillna(method='backfill')        # fill with the next valid value
nan_result_pd2 = df.fillna(method='bfill', limit=1)  # fill with the next valid value, at most one fill per column
nan_result_pd3 = df.fillna(method='pad')             # fill with the previous valid value
nan_result_pd4 = df.fillna(0)                        # fill with 0
nan_result_pd5 = df.fillna({'col2': 1.1, 'col4': 1.2})  # fill different columns with different values
nan_result_pd6 = df.fillna(df.mean()['col2':'col4'])    # fill each column with its own mean

Print the results

print(nan_result_pd1)
print(nan_result_pd2)
print(nan_result_pd3)
print(nan_result_pd4)
print(nan_result_pd5)
print(nan_result_pd6)

For the main variables, show only those that actually have missing values, with the missing count and missing rate:
missing = pd.concat([loanData.isnull().any(), loanData.count()], axis=1)  # whether each variable has missing values, and its number of observations
column = ['是否缺失', '數(shù)量']  # the two column names: "has missing" and "count"
missing1 = pd.DataFrame(list(missing.values), index=list(missing.index), columns=column)  # rebuild with the new column names
max_count = missing1['數(shù)量'].max()                     # largest observation count across all variables
missing1['缺失數(shù)量'] = max_count - missing1['數(shù)量']    # missing count = max observations - actual observations
missing1['缺失率'] = missing1['缺失數(shù)量'] / max_count  # missing rate
miss = missing1[missing1['數(shù)量'] < max_count]  # keep only the variables with missing values (name, has-missing flag, missing count, missing rate)

4.1: ###### Replace missing CreditScore values with the median ############
loanData['CreditScore'] = loanData['CreditScore'].fillna(loanData['CreditScore'].median())

Replace missing BorrowerState values with "NOTA"
loanData['BorrowerState'] = loanData['BorrowerState'].fillna('NOTA')

4.2: ############ Fill missing DebtToIncomeRatio values with random numbers #########

Fill missing DebtToIncomeRatio values with a random number

def rand_missing(s):
    if s >= 0:
        a = s
    else:
        a = random.uniform(0.1, 0.5)  # comparisons with NaN are False, so missing values fall through to this branch
    return a

Fill the missing DebtToIncomeRatio values with a random number between 0.1 and 0.5

loanData['DebtToIncomeRatio'] = loanData['DebtToIncomeRatio'].apply(rand_missing)

4.3: ############## Set missing DelinquenciesLast7Years values to 1 ################
loanData['DelinquenciesLast7Years'] = loanData['DelinquenciesLast7Years'].fillna(1)

4.4 ########### For loans after Jul. 2009, select the rows where ProsperRating (Alpha) is null and drop them #####
missIndex = loanData[(loanData['ProsperRating (Alpha)'].isnull()) & (loanData['DatePhase'] == 'After Jul.2009')]
loanData = loanData.drop(missIndex.index, axis=0)

4.5 # Convert string variables to numeric variables

The data contains string variables; replace them with numeric codes. The conversion function is as follows:

Encoding the categorical variables

def harmonize_data(df):
    # Fill missing data and map string values to integer codes
    # Status
    df.loc[df['Status'] == 'Completed', 'Status'] = 1
    df.loc[df['Status'] == 'Defaulted', 'Status'] = 0
    df.loc[df['Status'] == 'Current', 'Status'] = 2
    # IsBorrowerHomeowner
    df.loc[df['IsBorrowerHomeowner'] == False, 'IsBorrowerHomeowner'] = 0
    df.loc[df['IsBorrowerHomeowner'] == True, 'IsBorrowerHomeowner'] = 1
    # CreditGrade
    df.loc[df['CreditGrade'] == 'NC', 'CreditGrade'] = 0
    df.loc[df['CreditGrade'] == 'HR', 'CreditGrade'] = 1
    df.loc[df['CreditGrade'] == 'E', 'CreditGrade'] = 2
    df.loc[df['CreditGrade'] == 'D', 'CreditGrade'] = 3
    df.loc[df['CreditGrade'] == 'C', 'CreditGrade'] = 4
    df.loc[df['CreditGrade'] == 'B', 'CreditGrade'] = 5
    df.loc[df['CreditGrade'] == 'A', 'CreditGrade'] = 6
    df.loc[df['CreditGrade'] == 'AA', 'CreditGrade'] = 7
    # ProsperRating (Alpha)
    df.loc[df['ProsperRating (Alpha)'] == 'HR', 'ProsperRating (Alpha)'] = 1
    df.loc[df['ProsperRating (Alpha)'] == 'E', 'ProsperRating (Alpha)'] = 2
    df.loc[df['ProsperRating (Alpha)'] == 'D', 'ProsperRating (Alpha)'] = 3
    df.loc[df['ProsperRating (Alpha)'] == 'C', 'ProsperRating (Alpha)'] = 4
    df.loc[df['ProsperRating (Alpha)'] == 'B', 'ProsperRating (Alpha)'] = 5
    df.loc[df['ProsperRating (Alpha)'] == 'A', 'ProsperRating (Alpha)'] = 6
    df.loc[df['ProsperRating (Alpha)'] == 'AA', 'ProsperRating (Alpha)'] = 7
    # IncomeRange
    df.loc[df['IncomeRange'] == 'Not displayed', 'IncomeRange'] = 0
    df.loc[df['IncomeRange'] == 'Not employed', 'IncomeRange'] = 1
    df.loc[df['IncomeRange'] == '0', 'IncomeRange'] = 2
    df.loc[df['IncomeRange'] == '1-24,999', 'IncomeRange'] = 3
    df.loc[df['IncomeRange'] == '25,000-49,999', 'IncomeRange'] = 4
    df.loc[df['IncomeRange'] == '50,000-74,999', 'IncomeRange'] = 5
    df.loc[df['IncomeRange'] == '75,000-99,999', 'IncomeRange'] = 6
    df.loc[df['IncomeRange'] == '100,000+', 'IncomeRange'] = 7
    # BankCardUse
    df.loc[df['BankCardUse'] == 'No Use', 'BankCardUse'] = 0
    df.loc[df['BankCardUse'] == 'Mild Use', 'BankCardUse'] = 1
    df.loc[df['BankCardUse'] == 'Medium Use', 'BankCardUse'] = 2
    df.loc[df['BankCardUse'] == 'Heavy Use', 'BankCardUse'] = 3
    df.loc[df['BankCardUse'] == 'Super Use', 'BankCardUse'] = 4
    # CustomerClarify
    df.loc[df['CustomerClarify'] == 'New Borrower', 'CustomerClarify'] = 0
    df.loc[df['CustomerClarify'] == 'Previous Borrower', 'CustomerClarify'] = 1
    return df

# Replace the strings with integers
loanData = harmonize_data(loanData)

The difference between loc and iloc

pandas retrieves a column in a dict-like way: df['A'] returns column A of df. What if we are interested in a particular row instead? There are two ways: iloc and loc. loc is short for location, and the i in iloc stands for integer. The difference is:
loc: works on labels in the index.
iloc: works on the positions in the index (so it only takes integers).
In other words, loc indexes rows by their index labels (if the df defines an index, loc looks rows up by those labels), while iloc indexes by position: row numbers start at 0 and increase by 1.
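
A small sketch with a made-up DataFrame to illustrate the difference:

import pandas as pd

df = pd.DataFrame({'A': [10, 20, 30]}, index=['x', 'y', 'z'])
print(df.loc['y'])   # selects by the index label 'y'  -> A = 20
print(df.iloc[1])    # selects by position 1           -> A = 20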

4.6 ####################### Compute the accuracy with a loop

result = rfr.predict(X_test)
Y_test
Compute the accuracy of the predictions:
def accuracy_statistics(rd, prd):
    count = len(prd)
    right = 0
    for i in range(count):     # compare every prediction with the true label
        if rd[i] == prd[i]:
            right += 1
    percent = round(right / count, 4)
    return percent

percent = accuracy_statistics(list(Y_test.values), list(result))

The model's prediction accuracy is (1498 + 4520) / 8385 = 71.77%.

5 ########## How to install the fbprophet library
http://www.lxweimin.com/p/0c06ad7bccaa
1: list the installed packages: pip list
2: list packages with newer versions available: pip list --outdated
3: create a new python environment: conda create -n fbprophet_python3.7 python=3.7
4: activate the new environment: conda activate fbprophet_python3.7

How to install the fbprophet library
1. Create a new python environment, as the official docs suggest:
conda create -n fbprophet_python3.7 python=3.7
2. Activate the new environment: conda activate fbprophet_python3.7
3. Install the C++ build toolchain:
4. conda install libpython m2w64-toolchain -c msys2
5. Install the dependencies:
conda install numpy cython -c conda-forge
conda install matplotlib scipy pandas -c conda-forge
6. Install pystan:
conda install pystan -c conda-forge
7. Install fbprophet:
conda install fbprophet -c conda-forge
8. If this still fails, download the source archive from GitHub and install from it:
https://github.com/facebook/prophet
conda activate fbprophet_python3.7
As a final attempt, fbprophet can also be installed directly with conda:
conda install -c conda-forge fbprophet
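
Once installed, a minimal usage sketch: the time series below is made up, and Prophet expects a dataframe with a ds date column and a y value column.

import pandas as pd
from fbprophet import Prophet

history = pd.DataFrame({
    'ds': pd.date_range('2019-01-01', periods=90, freq='D'),  # dates (made-up example)
    'y': range(90),                                           # values to forecast
})

m = Prophet()                                  # model with default settings
m.fit(history)                                 # fit on the historical data
future = m.make_future_dataframe(periods=30)   # extend 30 days beyond the history
forecast = m.predict(future)
print(forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())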

(Part 6) ##############


Python: two ways to rename DataFrame columns
First create a DataFrame
import pandas as pd
df = pd.DataFrame({'a':[1,2,3],'b':[1,2,3]})
which looks like this:
a b
0 1 1
1 2 2
2 3 3
1. Rename columns a and b to A and B:

df.columns = ['A','B']

2. Rename only column a to A:

df.rename(columns={'a':'A'})  # returns a new DataFrame; reassign or pass inplace=True to keep the change

Converting between DataFrame, array and list in Python
1. Convert a list to a numpy array (matrix)

 np.array(result).T

2. Convert an array to a DataFrame

 pd.DataFrame(result)

3. Convert a pandas DataFrame to a numpy array

Use df = df.values


print(len(X))  # number of elements
type(X)        # check the data type
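
The snippets above assume a variable named result; a self-contained sketch of the three conversions, with made-up data:

import numpy as np
import pandas as pd

result = [[1, 2, 3], [4, 5, 6]]      # a plain Python list of rows

arr = np.array(result).T             # 1. list -> numpy array (transposed, as above)
df = pd.DataFrame(result)            # 2. list/array -> DataFrame
back_to_arr = df.values              # 3. DataFrame -> numpy array

print(type(arr), type(df), type(back_to_arr))
print(len(result))                   # number of rows in the original list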


Python slicing: select arbitrary columns by name
rr111 = rr11[['kelian_x','qingjiatianshu_x','meancall_x','meanlong_x','meanxiapi_x','聚類類別']]
-------- Python slicing: select a contiguous range of columns by position
rr111 = rr11.iloc[:, 0:6]


--- Python left join
rr11 = pd.merge(left=rr1, right=r0, how='left', left_on=rr1.index, right_on=r0.index)  # left join to bring in the cluster counts

------------------------------ Installing the jieba word-segmentation library -------------------------------------------------------------------------------------------------
1: Open the Anaconda Prompt
2: pip install jieba
---------------------------------------- Load list-format data from a local file -------------------------------------------------------------------------------------
import jieba
def stopwordslist(filepath):
    stopwords = [line.strip() for line in open(filepath, 'r', encoding='utf-8').readlines()]
    return stopwords

Segmenting sentences

stopwords = stopwordslist(r'D:\2019年python代碼\python_book\chapter4\stopwords.txt')  # path to the stop-word list
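
The jieba import above is not actually used in that snippet; a minimal sketch of segmenting a made-up sentence and dropping stop words from a tiny made-up list:

import jieba

stopwords = ['的', '在', '了']                 # a tiny example stop-word list
sentence = '我在南京學(xué)習(xí)了python的數(shù)據(jù)分析'     # an example sentence
words = jieba.lcut(sentence)                  # cut the sentence into a list of words
filtered = [w for w in words if w not in stopwords]  # drop the stop words
print(filtered)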

--------------------------------------- Merging two lists into one in Python ----------------------------------------------------------------
1: Use list.extend: L1.extend(L2) appends every element of L2 to the end of L1.
2: Use slice assignment: L1[len(L1):len(L1)] = L2, which is equivalent to the method above.
Slice assignment is more flexible, though: the insertion point can be the head or any other position.
3: Prepending to the head and inserting in the middle are shown in the sketch below.
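
A minimal sketch of the cases above, using two example lists L1 and L2:

L1 = [1, 2, 3]; L2 = [4, 5]
L1.extend(L2)              # method 1: append to the tail        -> [1, 2, 3, 4, 5]

L1 = [1, 2, 3]
L1[len(L1):len(L1)] = L2   # method 2: slice-assign at the tail  -> [1, 2, 3, 4, 5]

L1 = [1, 2, 3]
L1[0:0] = L2               # prepend to the head                 -> [4, 5, 1, 2, 3]

L1 = [1, 2, 3]
L1[1:1] = L2               # insert in the middle                -> [1, 4, 5, 2, 3]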
----------------------------------------- Reading data into a numpy array -----------------------------------------------------------------------------------
data = np.loadtxt('.txt')  # fill in the path to your text file

-------------------------------- pd.set_option('display.max_rows', 4)  # show at most four rows --------------------------------------------------------------------------
pd.set_option('display.max_rows', 4)  # show at most four rows

------------------------------------------- Dropping rows with NA missing values ---------------------------------------------------------------------------------------------------
data = data.dropna()       # drop every row that contains an NA
print(data.shape)          # number of rows and columns
print(list(data.columns))  # list all column names

Inspect the data:
---------------------------------------------- View the first five or the last five rows ------------------------------------------------------------------------------
data1.head(5)  # first five rows
data1.tail(5)  # last five rows
-------------------------------------------------------- Distinct values with unique --------------------------------------------------------------------------------------------
data['education'].unique()  # distinct values of the education column

---------------------------------------- Recode column y of data: 'yes' -> 1, 'no' -> 0 ----------------------------------------------------------------------------

Convert y to a numeric column and do some simple counting.

data.loc[data['y'] == 'yes', 'y'] = 1  # recode 'yes' as 1
data.loc[data['y'] == 'no', 'y'] = 0   # recode 'no' as 0
data['y'].value_counts()               # count per category
---------------------------------------------------- Plot the category counts ------------------------------------------------------------------------------------------------------
sns.countplot(x='y', data=data, palette='hls')  # bar chart of the counts per category
-------------------------------------------------------- groupby ----------------------------------------------------------------------------------------------------
data.groupby('y').mean()

----------------------------------------------------- Difference of two tables (deduplicate one against the other) -------------------------------------------------------------------

-- Method 1: take the difference of the two tables
df1 = pd.DataFrame({'id':[1,2,3],
'col1':['a','b','c']})
df2 = pd.DataFrame({'id':[4,3],
'col2':['d','e']})

ddd = df1.merge(df2, how='left', left_on='id', right_on='id')  # left-merge the two tables
ddd1 = ddd[ddd.isnull().T.any()][['id','col1']]                # rows of df1 with no match in df2 (the difference)

-- Method 2: take the difference of the two tables
d1 = df1['id'].values.tolist()
d2 = df2['id'].values.tolist()
data = []
for i in d1:
    if i in d2:
        pass                 # the id exists in both tables, skip it
    else:
        data.append(i)       # the id only exists in df1, keep it
df3 = df1[df1['id'].isin(data)]
---------------------------------------------- DataFrame means -------------------------------------------------------------

Column-wise mean

df_mean = df[['col1','col2']].mean(axis=0)  # mean of each column
print(df_mean)

Row-wise mean

df_mean = df[['col1','col2']].mean(axis=1)  # mean of each row
print(df_mean)
------------------------------------------------- DataFrame standard deviation (std) ------------------------------------------------------------

Column-wise std

df_std = df[['col1','col2']].std(axis=0)  # standard deviation of each column
print(df_std)

Row-wise std

df_std = df[['col1','col2']].std(axis=1)  # standard deviation of each row
print(df_std)
--------------------------------------------- # DataFrame variance (var) -------------------------------------------------------

Column-wise variance

df_var = df[['col1','col2']].var(axis=0)  # variance of each column
print(df_var)

Row-wise variance

df_var = df[['col1','col2']].var(axis=1)  # variance of each row
print(df_var)
-------------------------------------------- # DataFrame max -----------------------------------------------------------

Column-wise max

df_max = df[['col1','col2']].max(axis=0)  # maximum of each column
print(df_max)

Row-wise max

df_max = df[['col1','col2']].max(axis=1)  # maximum of each row
print(df_max)
---------------------------------------------- # DataFrame min -------------------------------------------------------------

Column-wise min

df_min = df[['col1','col2']].min(axis=0)  # minimum of each column
print(df_min)

Row-wise min

df_min = df[['col1','col2']].min(axis=1)  # minimum of each row
print(df_min)
------------------------------------------------- # DataFrame sum ---------------------------------------------------------

Column-wise sum

df_sum = df[['col1','col2']].sum(axis=0)  # sum of each column
print(df_sum)

Row-wise sum

df_sum = df[['col1','col2']].sum(axis=1)  # sum of each row
print(df_sum)
--------------------------------------------------- # DataFrame count ------------------------------------------------------------

Column-wise count

df_count = df[['col1','col2']].count(axis=0)  # non-null count of each column
print(df_count)

Row-wise count

df_count = df[['col1','col2']].count(axis=1)  # non-null count of each row
print(df_count)
------------------------------------------------ # DataFrame median --------------------------------------------------------------
df_median = df[['col1','col2']].median(axis=0)  # median of each column
print(df_median)

Row-wise median

df_median = df[['col1','col2']].median(axis=1)  # median of each row
print(df_median)
------------------------------------------------- describe for the usual summary statistics (no axis=1 option) ------------------------------
df_describe = df[['col1','col2']].describe()  # summary statistics of the selected columns
print(df_describe)
------------------------------------------------ # corr() for the correlation matrix (no axis=1 option) -----------------------------------
df_corr = df.corr()  # pairwise correlation coefficients between columns
print(df_corr)
---------------------------------------------------- Detecting duplicate rows in a DataFrame ------------------------------------------------------------

Handling duplicates

import pandas as pd  # pandas

Generate data containing duplicates

data1 = ['a', 3]
data2 = ['b', 2]
data3 = ['a', 3]
data4 = ['c', 2]
df = pd.DataFrame([data1, data2, data3, data4], columns=['col1', 'col2'])
print(df)

Flag the duplicated rows

isDuplicated = df.duplicated()  # True for every row that repeats an earlier row
print(isDuplicated)
------------------------------------------------------- Dropping duplicate rows from a DataFrame ------------------------------------------------------------

Drop duplicates

new_df1 = df.drop_duplicates()                  # drop rows whose values are identical in every column
new_df2 = df.drop_duplicates(['col1'])          # drop rows with the same col1 value
new_df3 = df.drop_duplicates(['col2'])          # drop rows with the same col2 value
new_df4 = df.drop_duplicates(['col1', 'col2'])  # drop rows identical in the given columns (col1 and col2)
print(new_df1)
print(new_df2)
print(new_df3)
print(new_df4)
-------------------------------------------------------- Escape characters --------------------------------------------------------------------

- The escape character '\'

The backslash \ escapes many characters: \n is a newline, \t is a tab, and the backslash itself must also be escaped, so \\ stands for a single \.

print('It \'s a dog!')
print("hello world!\nhello Python!")
print('\\t\\')

To print a string exactly as written, prefix it with r (a raw string); note that even a raw string cannot end in a single backslash.

print(r'\\t\\')
------------------------------------------------------- Subtracting dates (difference in days) ------------------------------------------------------------------

import datetime
someDay = datetime.date(1999,2,10)
anotherDay = datetime.date(1999,2,15)
deltaDay = anotherDay - someDay
deltaDay.days
---------------------------------------------------------- Basic list operations ---------------------------------------------------------

A list is a container for a sequence of elements; it is written with [] and its elements may have different types.

students = ["ming", "hua", "li", "juan", "yun", 3]
print(students)
type(students)

List indexing and slicing

Indexing starts at 0; a slice includes the left endpoint and excludes the right one.

print('[4]=', students[4])
print('[-4]=', students[-4])
print('[0:4]=', students[0:4])
print('[4:]=', students[4:])
print('[0:4:2]=', students[0:4:2])
print('[-5:-1:]=', students[-5:-1:])
print('[-2::-1]=', students[-2::-1])

Modify list elements

students[3] = "小月"
print(students[3])

students[5] = "小楠"
print(students[5])

students[5] = 19978
print(students[5])

Insert elements

students.append('han')            # append adds a single element to the tail
students.extend(['long', 'wan'])  # extend adds several elements to the tail
print(students)

scores = [90, 80, 75, 66]
students.insert(6, scores)        # insert before the given position; duplicates are kept
students

Delete elements

print(students.pop(1))  # pop returns the removed element; with no argument it removes the last one
print(students)

Membership tests and related helpers

print('wan' in students)
print('han' not in students)
students.count('wan')  # number of occurrences
students.index('wan')  # position of the first occurrence

Generating integer sequences with range

print(list(range(10)))          # wrap range in list() to materialise the sequence in Python 3
print(list(range(-5, 5)))
print(list(range(-10, 10, 2)))
print(list(range(16, 10, -1)))
--------------------------------------------- Basic tuple operations --------------------------------------------------------------------

### 3.2.2 Tuples

A tuple is similar to a list and its elements are indexed the same way, but unlike a list its elements cannot be modified, only read. Tuples are written with ().

studentsTuple = ("ming", "jun", "qiang", "wu", scores)
studentsTuple
aList = ['aaa', 'dddd']           # a list, kept under a separate name so that studentsTuple stays a tuple
studentsTuple1 = ('aaa', 'dddd')
print(aList)
type(studentsTuple1)

Catching the exception

try:
    studentsTuple[1] = 'fu'       # tuples do not support item assignment
except TypeError:
    print('TypeError')
scores[1] = 100                   # the list stored inside the tuple can still be modified
studentsTuple
'ming' in studentsTuple
studentsTuple[0:4]
studentsTuple.count('ming')
studentsTuple.index('jun')
len(studentsTuple)
------------------------------------------- Basic set operations -------------------------------------------------------------------------

### 3.2.3 Sets: deduplication

Sets in Python serve two main purposes: set operations (intersection, union, difference) and removing duplicate elements. A set is written set(), where the argument can be a list, a dict or a string (a string behaves like a sequence of characters).

students = ["ming", "hua", "li", "juan", "yun", 3]
studentsSet = set(students)
print(studentsSet)
studentsSet.add('xu')     # add one element to the set
print(studentsSet)
studentsSet.remove('xu')  # remove one element from the set
print(studentsSet)
a = set("abcnmaaaaggsng")
print('a=', a)
b = set("cdfm")
print('b=', b)

Intersection

x = a & b
print('x=', x)

Union

y = a | b
print('y=', y)

Difference

z = a - b
print('z=', z)

Removing duplicate elements

new = set("abcnmaaaaggsng")  # duplicates disappear as soon as the set is built
print(new)
------------------------------------------------------------ Basic dict operations --------------------------------------

### 3.2.4 Dictionaries (dict)

A Python dict, also called an associative array (a map in other languages), is written with braces {}. It stores key-value pairs, looks keys up very quickly, and its keys cannot repeat.

k = {"name": "weiwei", "home": "guilin"}
print(k["home"])
print(k.keys())    # keys
print(k.values())  # values
a={"success":True,"reason_code":"200","reason_desc":"獲取成功",
"rules":[{"rule_id":"1062274","score":7,"conditions":[{"address_a_value":
"南通市","address_a":"mobile_address","address_b":"true_ip_address","address_b_value":"南京市","type":"match_address"}]}]}
print(a["success"])

Add or modify entries

k["like"] = "music"
k['name'] = 'guangzhou'
print(k)

Check whether a key exists

print('name' in k)

The has_key method existed in Python 2 but was removed in Python 3, so the following raises an AttributeError there:

print(k.has_key('name'))

Use instead:

if 'name' in k:
    print("Yes")

k.get('edu', -1)  # dict.get returns None (or the supplied default) when the key does not exist

Delete a key-value pair

k.pop('like')
print(k)
-------------------------------------------------------- The if statement -----------------------------------------------------------------------

### 3.3.2 Branching: the if statement chooses which block of code to execute

=============================================================================

if <True or False expression>:

    block

elif <True or False expression>:

    block

else:  # none of the conditions matched

    block

=============================================================================

There may be several elif clauses; both the elif and the else parts are optional.

salary = 30000
if salary > 10000:
    print("Wow!!!!!!!")
elif salary > 5000:
    print("That's OK.")
elif salary > 3000:
    print("5555555555")
else:
    print("..........")
----------------------------------------------- for loops ------------------------------------------------------------------------------

- for loops

=============================================================================

for (loop variable) in (collection):

    block

=============================================================================

"Collection" here does not only mean set: anything collection-like (a list, tuple, dict or array) can be iterated.

There can be more than one loop variable.

heights = {'Yao': 226, 'Sharq': 216, 'AI': 183}
for i in heights:
    print(i, heights[i])

for key, value in heights.items():  (Python 3 removed dict.iteritems(); use dict.items() instead)

for key, value in heights.items():
    print(key, value)

total = 0
for i in range(1, 101):
    total += i
print(total)  # 5050

------------------------------------------------------- Basic while loops -------------------------------------------------------------

### 3.3.3 Loops

while loops

=============================================================================

while <True or False expression>:

    loop body

else:  # runs once the condition is no longer satisfied

    block

=============================================================================

The else part is optional.

a = 1
while a < 10:
    if a <= 5:
        print(a)
    else:
        print("Hello")
    a = a + 1
else:
    print("Done")

---------------------------------------------- break, continue and pass -----------------------------------------------------------------

*** Exercise: use a loop and branching to print the odd numbers below 20 (one possible solution is sketched below).
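
A minimal sketch of one possible solution:

for i in range(20):
    if i % 2 == 1:   # odd numbers have remainder 1 when divided by 2
        print(i)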

### 3.3.4 break, continue and pass

break: exit the loop entirely

continue: skip the rest of the current iteration and go to the next one

pass: a placeholder that does nothing

for i in range(1, 5):
    if i == 3:
        break
    print(i)

for i in range(1, 5):
    if i == 3:
        continue
    print(i)
for i in range(1, 5):
    if i == 3:
        pass
    print(i)
--------------------------------------------------------- Sorting --------------------------------------------------------------------

Sorting a list

myList = [-1, 2, -3, 4, -5, 6, 7]
print(sorted(myList))                # ascending
print(sorted(myList, reverse=True))  # descending

### 5.1.5 Sorting a DataFrame

#### 1. Sorting

sample=pd.DataFrame({'name':['Bob','Lindy','Mark','Miki','Sully','Rose'],
'score':[98,78,87,77,77,np.nan],
'group':[1,1,1,2,1,2],})

sample

Sort by 'score' in descending order

sample.sort_values('score',ascending=False,na_position='last')

Sort by 'score' in ascending order

sample.sort_values('score',ascending=True,na_position='last')

sample.sort_values(['group','score'])  # sort by group first, then by score

------------------------------------------------------- Grouped aggregation on a DataFrame --------------------------------------

### 5.1.6 Grouped aggregation on a DataFrame

sample = pd.read_csv(r'D:\《Python數(shù)據(jù)科學(xué):技術(shù)詳解與商業(yè)實踐》源代碼文件\Python_book\5Preprocessing\sample.csv', encoding='gbk')
sample.head()

sample.groupby('class')[['math']].min()
sample.groupby('class') [['math']].max()
sample.groupby('class')[['math']].count()
sample.groupby('class')[['math']].mean()
sample.groupby('class')[['math']].sum()

sample.groupby(['grade','class'])[['math']].mean()  # group by two columns

sample.groupby(['grade'])['math','chinese'].mean()

sample.groupby('class')['math'].agg(['mean','min','max'])

df = sample.groupby(['grade','class'])['math','chinese'].agg(['min','max'])
df
------------------------------------------------------- Splitting and stacking columns ---------------------------------------------

### 5.1.7 Splitting and stacking columns

table = pd.DataFrame({'cust_id':[10001,10001,10002,10002,10003],
'type':['Normal','Special_offer',
'Normal','Special_offer','Special_offer'],
'Monetary':[3608,420,1894,3503,4567]})

pd.pivot_table(table,index='cust_id',columns='type',values='Monetary')

pd.pivot_table(table,index='cust_id',columns='type',values='Monetary',
fill_value=0,aggfunc='sum')

table1 = pd.pivot_table(table,index='cust_id',
columns='type',
values='Monetary',
fill_value=0,
aggfunc=np.sum).reset_index()
table1

pd.melt(table1,
id_vars='cust_id',
value_vars=['Normal','Special_offer'],
value_name='Monetary',
var_name='TYPE')
-------------------------------------------------- Horizontal / vertical concatenation of DataFrames -------------------------------------------

### 5.1.3 Horizontal joins

df1 = pd.DataFrame({'id':[1,2,3],
'col1':['a','b','c']})
df2 = pd.DataFrame({'id':[4,3],
'col2':['d','e']})

#### 1. Inner join of two DataFrames (like SQL inner join)

df1.merge(df2,how='inner',left_on='id',right_on='id')

#### 2. Left join (like SQL left join)

df1.merge(df2,how='left',left_on='id',right_on='id')
df1.merge(df2,how='left',on='id')

#### 3. Right join (like SQL right join)

df1.merge(df2,how='right',on='id')

#### 4. Joining on the row index

df1 = pd.DataFrame({'id1':[1,2,3],
'col1':['a','b','c']},
index = [1,2,3])
df2 = pd.DataFrame({'id2':[1,2,3],
'col2':['aa','bb','cc']},
index = [1,3,2])

pd.concat([df1,df2],axis=1)

df1.join(df2)

### 5.1.4 Vertical concatenation

df1 = pd.DataFrame({'id':[1,1,1,2,3,4,6],
'col':['a','a','b','c','v','e','q']})
df2 = pd.DataFrame({'id':[1,2,3,3,5],
'col':['x','y','z','v','w']})

pd.concat([df1,df2],ignore_index=True,axis=0)

pd.concat([df1,df2],ignore_index=True).drop_duplicates()  # concatenate vertically and drop duplicate rows
df3 = df1.rename(columns = {'col':'new_col'})  # rename a column of the DataFrame

pd.concat([df1,df3],ignore_index=True).drop_duplicates()

---------------------------------------------------------------- Data integration --------------------------------------

5.1 Data integration

### 5.1.1 Row and column operations

#### 1. Single columns

import pandas as pd
import numpy as np
sample = pd.DataFrame(np.random.randn(4, 5),
columns=['a','b','c','d','e'])
print(sample)
sample['a']           # select a single column (returns a Series)
sample[['a','b']]     # select any two columns
sample[0:2]           # select the first two rows
sample.iloc[1:2,0:2]  # select rows and columns with iloc
sample.loc[:,'a']     # the original used .ix, which was removed in newer pandas; .loc does the same here
sample[['a']]         # select column a as a DataFrame

#### 2. Selecting multiple rows and columns

sample.iloc[0:2, 0:2]  # the original used .ix; use .iloc for purely positional selection

#### 3. Creating and dropping columns

sample['new_col1'] = sample['a'] - sample['b']
sample

Add two new columns to the DataFrame

sample_new=sample.assign(new_col2 = sample['a'] - sample['b'],
new_col3 = sample['a'] + sample['b'])
sample_new

sample.drop('a',axis=1)  # drop a column (returns a new DataFrame)

### 5.1.2 Conditional queries

sample =pd.DataFrame({'name':['Bob','Lindy','Mark',
'Miki','Sully','Rose'],
'score':[98,78,87,77,65,67],
'group':[1,1,1,2,1,2],})
sample

#### 1. A single condition

sample.score > 70

sample[sample.score > 70]

#### 2. Multiple conditions

sample[(sample.score > 70) & (sample.group ==1)]

#### 3. Using query

sample.query('score > 90')

sample.query('(group ==2) |(group == 1)')

#### 4. Other helpers: sample['score']. followed by methods such as sum, mean, count, between

sample[sample['score'].between(70,80,inclusive=True)]

sample[sample['name'].isin(['Bob','Lindy'])]
sample[sample['name'].str.contains('[M]+')]  # filter rows by a regular expression on a string column, similar to SQL LIKE

### 5.1.8 Assignment and conditional assignment

#### 1. Assignment

sample = pd.DataFrame({'name':['Bob','Lindy','Mark',
'Miki','Sully','Rose'],
'score':[99,78,999,77,77,np.nan],
'group':[1,1,1,2,1,2],})

sample.score.replace(999,np.nan)

sample.replace({'score':{999:np.nan},
'name':{'Bob':np.nan}})

#### 2. Conditional assignment

def transform(row):
    if row['group'] == 1:
        return 'class1'
    elif row['group'] == 2:
        return 'class2'

sample.apply(transform,axis=1)

sample.assign(class_n = sample.apply(transform,axis=1))

sample = sample.copy()
sample.loc[sample.group==1,'class_n']='class1'
sample.loc[sample.group==2,'class_n']='class2'
