1. Introduction
Python Data Analysis Library, or pandas, is a NumPy-based tool created for data analysis tasks. pandas provides standard data models and the tools needed to operate on large datasets efficiently, along with a large number of functions and methods that let us process data quickly and conveniently. It is one of the key reasons Python has become a powerful and efficient data analysis environment.
- a Python data analysis library
- built on NumPy (operates on ndarrays)
- feels like doing Excel/SQL/R work in Python
2. Data structure: Series
2.1 Constructing and initializing a Series
import pandas as pd
import numpy as np
# A Series is a one-dimensional data structure; below are a few ways to initialize one.
s = pd.Series([7, 'Beijing', 2.17, -12344, 'Happy Birthday!'])
# By default pandas uses 0 through n-1 as the index of a Series, but we can also specify our own index. You can think of the index as the keys of a dict.
s = pd.Series([7, 'Beijing', 2.17, -12344, 'Happy Birthday!'],
              index=['A', 'B', 'C', 'D', 'E'])
# A Series can also be constructed from a dictionary, since a Series is essentially a collection of key-value pairs.
cities = {'Beijing': 55000, 'Shanghai': 60000, 'Shenzhen': 50000, 'Hangzhou': 20000, 'Guangzhou': 25000, 'Suzhou': None}
# apts = pd.Series(cities)
apts = pd.Series(cities, name="price")
# Build a Series from a NumPy ndarray
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
2.2 Selecting data
# We can treat a Series much like a list; for positional access, .iloc is the idiomatic route
apts.iloc[[4, 3, 1]]
apts.iloc[1:]
# Why does the following produce two NaNs? Because the two operands are aligned on their
# index labels before adding: 'Beijing' is missing from the left side and 'Suzhou' from
# the right, so both come out as NaN.
apts.iloc[1:] + apts.iloc[:-1]
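# To make the alignment explicit, here is a quick check reusing the apts defined above:
left = apts.iloc[1:]     # everything except 'Beijing'
right = apts.iloc[:-1]   # everything except 'Suzhou'
print(left.index.union(right.index))  # the sum is computed over this union of labels
print((left + right).isnull())        # labels present on only one side become NaN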
# A Series also behaves like a dict: the index defined earlier is used to select data
apts["Hangzhou"]
apts[["Hangzhou", "Beijing", "Shenzhen"]]
"Hangzhou" in apts
apts.get("Hangzhou")
# Boolean indexing, similar to NumPy.
apts[apts < 50000]
apts.median()
apts[apts > apts.median()]
# Below is a more detailed look at how boolean indexing works
less_than_50000 = apts < 50000
print(less_than_50000)
print(apts[less_than_50000])
2.3 Assigning to Series elements
# Series elements can be assigned to
apts['Shenzhen'] = 55000
# The boolean indexing covered above can also be used when assigning
apts[apts <= 50000] = 40000
2.4 Arithmetic
# Now let's go over some basic arithmetic operations.
apts / 2
apts * 2
# NumPy operations can be applied to pandas objects
np.square(apts)
# Let's define another Series and do some addition
cars = pd.Series({'Beijing': 300000, 'Shanghai': 400000, 'Shenzhen': 300000,
                  'Tianjin': 200000, 'Guangzhou': 200000, 'Chongqing': 150000})
cars + apts * 100
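# Label alignment means cities appearing in only one of the two Series come out as NaN.
# If you would rather treat a missing entry as 0, the .add method with fill_value is one
# option (a sketch; whether 0 is a sensible default depends on your data):
(apts * 100).add(cars, fill_value=0)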
2.5 Missing data
apts.notnull()
apts.isnull()
apts[apts.isnull()]
apts[apts.notnull()]  # equivalent to the line above, and more idiomatic than comparing with == False
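# Once missing values are located, we usually either drop or fill them; a brief sketch:
apts.dropna()             # drop the NaN entries
apts.fillna(apts.mean())  # or fill them, e.g. with the mean of the non-missing values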
3. Data structure: DataFrame
A DataFrame is a table. Where a Series represents a one-dimensional array, a DataFrame is a two-dimensional one, comparable to an Excel spreadsheet. You can also think of a DataFrame as a collection of Series sharing a common index.
3.1 Creating a DataFrame
# A DataFrame can be constructed from a dictionary.
data = {'city': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chongqing'],
        'year': [2016, 2017, 2016, 2017, 2016, 2016],
        'population': [2100, 2300, 1000, 700, 500, 500]}
pd.DataFrame(data)
# The names and order of the columns can be specified
pd.DataFrame(data, columns = ['year', 'city', 'population'])
pd.DataFrame(data, columns = ['year', 'city', 'population', 'debt'])
frame2 = pd.DataFrame(data, columns = ['year', 'city', 'population', 'debt'],
index=['one', 'two', 'three', 'four', 'five', 'six'])
print(frame2)
3.2 Selecting data from a DataFrame
frame2['city']
type(frame2['city'])
frame2.loc['three']  # .ix is deprecated; use .loc for label-based row selection
# For selection by position use .iloc; note that plain frame2[...] indexing selects columns, not rows
frame2.iloc[2]
3.3 Assigning to DataFrame elements
# Use .loc for single-cell assignment; chained indexing like frame2["population"]["one"] may silently modify a copy
frame2.loc["one", "population"] = 2100
# You can assign to an entire column
frame2['debt'] = 100000000
frame2.loc['six'] = 0
frame2 = pd.DataFrame(data,
                      columns=['year', 'city', 'population', 'debt'],
                      index=['one', 'two', 'three', 'four', 'five', 'six'])
print(frame2)
frame2.debt = np.arange(6)
# A Series can also be used to set values for specific index entries; rows not mentioned default to NaN.
val = pd.Series([100, 200, 300], index=['two', 'three', 'five'])
frame2['debt'] = val
print(frame2)
frame2['western'] = (frame2.city == 'Chongqing')
print(frame2)
frame2.columns
frame2.index
# A DataFrame can be transposed, just like a NumPy 2-D array
frame2.T
# frame3 below is built from a dict of dicts: the outer keys become the columns and the
# inner keys the index (it is revisited in section 4.1). Slices of its columns can then
# be used to build new data.
pop = {'Beijing': {2016: 2100, 2017: 2200},
       'Shanghai': {2015: 2400, 2016: 2500, 2017: 2600}}
frame3 = pd.DataFrame(pop)
frame3['Beijing'][1:3]
frame3['Shanghai'][:-1]
pdata = {'Beijing': frame3['Beijing'][:-1], 'Shanghai':frame3['Shanghai'][:-1]}
print(pd.DataFrame(pdata))
# We can also name the index and the columns
frame3.index.name = 'year'
frame3.columns.name = 'city'
frame3
type(frame3.values)
4. Data structure: Index
4.1 Index objects
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(index)
print(index[1:])
# Index values cannot be assigned to; an Index object is immutable
index[1] = 'd'  # wrong: this raises a TypeError
index = pd.Index(np.arange(3))
index
obj2 = pd.Series([2,5,7], index=index)
print(obj2)
print(obj2.index is index)
# The columns and the index of a DataFrame are Index objects as well
pop = {'Beijing': {2016: 2100, 2017: 2200},
       'Shanghai': {2015: 2400, 2016: 2500, 2017: 2600}}
frame3 = pd.DataFrame(pop)
print('Shanghai' in frame3.columns)
print(2015 in frame3.columns)
4.2 Indexing and slicing on the index
obj = pd.Series(np.arange(4), index=['a','b','c','d'])
obj[['b', 'a']]
obj.iloc[[0, 2]]  # selecting by position now goes through .iloc
# The default positional indexing can still be used, via .iloc
obj.iloc[1:3]
obj['b':'d'] = 5  # note that label slices include the endpoint 'd', unlike positional slices
# Indexing a DataFrame works essentially the same way as indexing a Series
frame = pd.DataFrame(np.arange(9).reshape(3,3),
                     index=['a', 'c', 'd'],
                     columns=['Hangzhou', 'Shenzhen', 'Nanjing'])
frame['Hangzhou']
frame[:2]
frame.loc['a':'d']  # .ix is deprecated; use .loc for label-based slicing
# Plain frame['Hangzhou':'Nanjing'] would try to slice *rows* by those labels and fail here;
# to slice columns by label, go through .loc:
frame.loc[:, 'Hangzhou':'Nanjing']
frame.loc[:, 'Shenzhen':'Nanjing']
frame.loc[:'c', 'Hangzhou']
# DataFrames support conditional selection too
frame[frame.Hangzhou > 1]
frame[frame < 5] = 0
print(frame)
4.3 reindex
Reference: http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.reindex.html
# reindex rearranges a Series or DataFrame to follow a new index
obj = pd.Series([4.5, 7.2, -5.3, 3.2], index=['d', 'b', 'a', 'c'])
obj.reindex(['a', 'b', 'c', 'd', 'e'])
obj.reindex(['a', 'b', 'c', 'd', 'e'], fill_value = 0)
obj3 = pd.Series(['blue', 'purple', 'yellow'], index = [0,2,4])
obj3.reindex(range(6), method='ffill')
obj3.reindex(range(6), method='bfill')
# Since a Series can be reindexed, a DataFrame can naturally be reindexed the same way.
frame = pd.DataFrame(np.arange(9).reshape(3,3),
                     index=['a', 'c', 'd'],
                     columns=['Hangzhou', 'Shenzhen', 'Nanjing'])
print(frame)
frame.reindex(['a' , 'b', 'c', 'd'])
# While reindexing, we can also specify a new set of columns
frame.reindex(columns = ['Shenzhen', 'Hangzhou', 'Chongqing'])
frame.reindex(index=['a', 'b', 'c', 'd'],
              method='ffill',
              columns=['Shenzhen', 'Hangzhou', 'Chongqing'])
# Next, how to use drop to remove entries from a Series or DataFrame by index
obj4 = obj3.drop(2)
obj3.drop([2, 4])
frame.drop(['a', 'c'])
frame.drop('Shenzhen', axis=1)
frame.drop(['Shenzhen', 'Hangzhou'], axis=1)
5. Data structure: hierarchical index
# Hierarchical indexing on a Series
data = pd.Series(np.random.randn(10),
                 index=[['a','a','a','b','b','c','c','c','d','d'],
                        [1,2,3,1,2,1,2,3,1,2]])
data
data.index
data["b"]
data['b':'d']
data.iloc[1:4]  # positional slicing on a hierarchically indexed Series goes through .iloc
# unstack and stack let us switch between a hierarchically indexed Series and a DataFrame.
data.unstack()
type(data.unstack())
data.unstack().stack()
# Hierarchical indexing on a DataFrame
frame = pd.DataFrame(np.arange(12).reshape((4,3)),
                     index=[['a','a','b','b'], [1,2,1,2]],
                     columns=[['Beijing', 'Beijing', 'Shanghai'], ['apts', 'cars', 'apts']])
print(frame)
frame.index.names = ['alpha', 'number']
frame.columns.names = ['city', 'type']
print(frame)
frame.loc[('a', 1)]
frame.loc[('a', 2), ('Beijing', 'apts')]
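# As a supplement to .loc: the .xs method selects a cross-section at any level of a
# MultiIndex, which helps when the level you want to fix is not the outermost one
# (a sketch using the level names set above):
frame.xs(1, level='number')             # all rows whose 'number' level equals 1
frame.xs('apts', level='type', axis=1)  # the 'apts' column for every city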
6. Concatenate, Merge, and Join
df1 = pd.DataFrame({'apts': [55000, 60000],
                    'cars': [200000, 300000]},
                   index=['Shanghai', 'Beijing'])
df1
df2 = pd.DataFrame({'apts': [25000, 20000],
                    'cars': [150000, 120000]},
                   index=['Hangzhou', 'Nanjing'])
print(df2)
df3 = pd.DataFrame({'apts': [30000, 10000],
                    'cars': [180000, 100000]},
                   index=['Guangzhou', 'Chongqing'])
print(df3)
6.1 concatenate
frames = [df1, df2, df3]
result = pd.concat(frames)
result
# When concatenating, you can pass keys to label each constituent piece
# The following example thereby constructs a hierarchical index
result2 = pd.concat(frames, keys=['x', 'y', 'z'])
print(result2)
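# The keys become the outermost level of the row index, so each piece can be pulled back out:
result2.loc['x']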
df4 = pd.DataFrame({'salaries': [10000, 30000, 30000, 20000, 15000]},
index = ['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin'])
print(df4)
result3 = pd.concat([result, df4], axis=1)
result3
# join='inner' keeps only the rows common to all pieces, which removes the NaNs
result3 = pd.concat([result, df4], axis=1, join='inner')
result3
# append was another way to do concatenation; it is deprecated (and removed in pandas 2.0), so prefer pd.concat
pd.concat([df1, df2])
pd.concat([df1, df4])  # probably not the result you want: the two frames share no columns
# A Series can also be concatenated with a DataFrame; the Series is first converted to a DataFrame and then joined, since a Series is essentially a single-column DataFrame.
s1 = pd.Series([60, 50], index=['Shanghai', 'Beijing'], name='meal')
print(s1)
df1
pd.concat([df1, s1], axis=1)
# How to append a row to a DataFrame
s2 = pd.Series([18000, 120000], index=['apts', 'cars'], name='Xiamen')
s2
pd.concat([df1, s2.to_frame().T])  # DataFrame.append is deprecated; turn the Series into a one-row frame and concat
6.2 Merge
df1 = pd.DataFrame({'apts': [55000, 60000, 58000],
'cars': [200000, 300000,250000],
'cities': ['Shanghai', 'Beijing','Shenzhen']})
print(df1)
df4 = pd.DataFrame({'salaries': [10000, 30000, 30000, 20000, 15000],
'cities': ['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin']})
print(df4)
result = pd.merge(df1, df4, on='cities')
result
result2 = pd.merge(df1, df4, on='cities', how='outer')
result2
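# merge also accepts how='left' and how='right', which keep all rows from one side only; for example:
pd.merge(df1, df4, on='cities', how='left')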
6.3 Join
df1 = pd.DataFrame({'apts': [55000, 60000, 58000],
'cars': [200000, 300000,250000]},
index=['Shanghai', 'Beijing','Shenzhen'])
print(df1)
df4 = pd.DataFrame({'salaries': [10000, 30000, 30000, 20000, 15000]},
index=['Suzhou', 'Beijing', 'Shanghai', 'Guangzhou', 'Tianjin'])
print(df4)
df1.join(df4)
df4.join(df1)
df1.join(df4, how='outer')
# The same result can also be written with merge
pd.merge(df1, df4, left_index=True, right_index=True, how='outer')
7. Group By
import pandas as pd
salaries = pd.DataFrame({
'Name': ['July', 'Chu', 'Chu', 'Lin', 'July', 'July', 'Chu', 'July'],
'Year': [2016,2016,2016,2016,2017,2017,2017,2017],
'Salary': [10000,2000,4000,5000,18000,25000,3000,4000],
'Bonus': [3000,1000,1000,1200,4000,2300,500,1000]
})
print(salaries)
group_by_name = salaries.groupby('Name')
group_by_name
# groupby is often used together with aggregate
group_by_name.aggregate('sum')  # newer pandas prefers the string 'sum' over the builtin sum
group_by_name.sum()
group_by_name_year = salaries.groupby(['Name', 'Year'])
group_by_name_year.sum()
group_by_name_year.size()
group_by_name_year.max()
# The describe function shows a variety of useful statistics
group_by_name_year.describe()
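# aggregate (alias: agg) can also apply different functions per column, or several at once.
# A short sketch with the salary data above:
group_by_name_year.agg({'Salary': 'sum', 'Bonus': ['max', 'mean']})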
8. Read From CSV
# First let's read some data from a CSV file.
# bikes.csv records data about bike paths in Montreal: 7 paths in total, with the number of riders on each path per day.
bikes = pd.read_csv('bikes.csv', encoding='latin1', sep=';',
parse_dates=['Date'], dayfirst=True, index_col='Date')
bikes.head()
bikes.dropna()  # dropna drops every row that contains any NA
bikes.dropna(how='all').head()
bikes.dropna(axis=1, how='all').head()
# Next, how to fill in missing data
row = bikes.iloc[0].copy()
row.fillna(row.mean())
m = bikes.mean(axis=1)
for i, col in enumerate(bikes):
    bikes.iloc[:, i] = bikes.iloc[:, i].fillna(m)
bikes.head()
berri_bikes = bikes[['Berri 1']].copy()
berri_bikes.head()
berri_bikes.index
berri_bikes.index.weekday
berri_bikes.loc[:, 'weekday'] = berri_bikes.index.weekday
berri_bikes[:5]
# With the weekday information in place, we can use the .groupby we learned earlier to group rider counts by weekday, then aggregate to sum up the riders for each day of the week.
weekday_counts = berri_bikes.groupby('weekday').aggregate('sum')
weekday_counts
weekday_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts
# Next, let's try adding all the paths together to get the total number of people cycling on each day.
bikes_sum = bikes.sum(axis=1).to_frame()
bikes_sum.head()
# bikes_sum.index
bikes_sum.loc[:, 'weekday'] = bikes_sum.index.weekday  # .ix is deprecated; use .loc
bikes_sum.head()
#type(berri_bikes)
weekday_counts = bikes_sum.groupby('weekday').aggregate('sum')
weekday_counts.index = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_counts
9. Stock project
import pandas as pd
import numpy as np
%matplotlib inline
goog = pd.read_csv("data/GOOG.csv", index_col=0)
goog.index = pd.to_datetime(goog.index)
# goog
goog["Adj Close"].plot(grid = True)
goog["Close"].plot(grid = True)
# The shift function moves values along the time axis
goog.shift(1).head()
#datetime indexing
goog["log-return"] = np.log(goog["Adj Close"] / goog["Adj Close"].shift(1))
print(goog["log-return"].head())
goog["log-return"].plot(grid=True)
aapl = pd.read_csv("data/AAPL.csv", index_col=0)
aapl.index = pd.to_datetime(aapl.index)
#aapl["Adj Close"].plot(grid=True)
goog.join(aapl, lsuffix=" goog", rsuffix=" aapl").plot()
aapl["Adj Close"][aapl["Adj Close"] == "null"] = np.NaN
aapl["Adj Close"] = aapl["Adj Close"].bfill()
aapl["Adj Close"] = aapl["Adj Close"].apply(lambda x: float(x))
aapl["Adj Close"].plot(grid=True)
msft = pd.read_csv("data/MSFT.csv", index_col=0)
msft.index = pd.to_datetime(msft.index)
stocks = pd.DataFrame({"AAPL": aapl["Adj Close"].bfill(),
"MSFT": msft["Adj Close"].bfill(),
"GOOG": goog["Adj Close"].bfill()})
pd.concat([aapl["Adj Close"], msft["Adj Close"], goog["Adj Close"]], axis=1, keys=["aapl", "msft", "goog"]).plot()
stocks = pd.concat([aapl["Adj Close"], msft["Adj Close"], goog["Adj Close"]], axis=1, keys=["aapl", "msft", "goog"])
valid_stocks = stocks[stocks.index >= stocks["goog"].first_valid_index()]
# stocks = pd.to_numeric(stocks)
stocks.plot(grid=True)
valid_stocks.plot(grid=True)
valid_stocks_lr = np.log(valid_stocks / valid_stocks.shift(1))
valid_stocks_lr.loc["2017-01-01":"2017-02-01"].plot(grid=True)
# Now let's see whether we can turn the daily chart into a monthly one
monthly_stocks = valid_stocks.groupby([valid_stocks.index.year, valid_stocks.index.month]).last()
monthly_stocks
index = [ str(i[0]) + "-" + str(i[1]) for i in monthly_stocks.index.values]
print(index)
index = pd.PeriodIndex(index, freq="M")  # use the public pd.PeriodIndex rather than the private pd.core.indexes path
monthly_stocks.index = index
monthly_stocks.plot(grid=True)
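# The year/month groupby above can also be expressed with resample, since valid_stocks
# carries a DatetimeIndex (a sketch; 'M' denotes month-end frequency):
valid_stocks.resample('M').last().plot(grid=True)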
monthly_stocks.loc["2011-8"]
monthly_stocks.loc["2011/8"]
monthly_stocks.loc["8/2011"]
10. Feature processing
10.1 Credit project
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
%matplotlib inline
df = pd.read_csv("data/credit-data.csv")
df.head()
for i, val in enumerate(df):
    print(df[val].value_counts())
df['income_bins'] = pd.cut(df.monthly_income, bins=15)
df['income_bins'].value_counts()  # pd.value_counts is deprecated in newer pandas
df['income_bins'] = pd.cut(df.monthly_income, bins=15, labels=False)
df.income_bins.value_counts()
df["monthly_income"] = df["monthly_income"].fillna(df["monthly_income"].mean())
df["income_bins"] = np.log(df.monthly_income)
df["income_bins"] = df["income_bins"].replace([np.inf, -np.inf], 0)
df["income_bins"] = df["income_bins"].astype("int")
df[["income_bins", "serious_dlqin2yrs"]].groupby("income_bins").mean()
cols = ['income_bins', 'serious_dlqin2yrs']
income_means = df[cols].groupby("income_bins").mean()
income_means.plot()
cols = ['age', 'serious_dlqin2yrs']
age_means = df[cols].groupby("age").mean()
age_means.plot()
mybins = [0] + list(range(20, 80, 5)) + [120]
df['age_bucket'] = pd.cut(df.age, bins=mybins)
df['age_bucket'].value_counts()
df[["age_bucket", "serious_dlqin2yrs"]].groupby("age_bucket").mean().fillna(0)
df[["age_bucket", "serious_dlqin2yrs"]].groupby("age_bucket").mean().plot()
# Convert the categorical bins into a numeric type
labels, levels = pd.factorize(df.age_bucket)
df.age_bucket = labels
df.age_bucket.head()
# Quantile-based binning
bins = []
for q in [0.2, 0.4, 0.6, 0.8, 1.0]:
    bins.append(df.debt_ratio.quantile(q))
debt_ratio_binned = pd.cut(df.debt_ratio, bins=bins)
debt_ratio_binned
print(debt_ratio_binned.value_counts())
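# The loop above builds the quantile edges by hand; pd.qcut produces roughly the same
# binning in one call (a sketch; duplicates='drop' guards against repeated edges in skewed data):
debt_ratio_binned = pd.qcut(df.debt_ratio, q=5, duplicates='drop')
print(debt_ratio_binned.value_counts())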
10.2 Train project
Step 1. Load the required libraries
import pandas as pd
import numpy as np
%matplotlib inline
# Load the data:
train = pd.read_csv('Train.csv')
test = pd.read_csv('Test.csv')
train.shape, test.shape
Step 2. Take a first look at the data
train.dtypes
train.head(5)
# Combine everything into a single dataset
train['source'] = 'train'
test['source'] = 'test'
data = pd.concat([train, test], ignore_index=True)
data.shape
Step 3. An important part of working with data and building models is inspecting anomalies, such as missing values
data.apply(lambda x: sum(x.isnull()))
Step 4. Get to know the data better; for example, let's see how many distinct values each of these fields takes (you could even look at the distributions)
var = ['Gender','Salary_Account','Mobile_Verified','Var1','Filled_Form','Device_Type','Var2','Source']
for v in var:
    print('\nDistinct values and their counts for column %s\n' % v)
    print(data[v].value_counts())
Step 5. Now you can start processing the fields (features) themselves
# Handling the City field
len(data['City'].unique())
data.drop('City', axis=1, inplace=True)  # City seems to have a great many categories; let's be blunt and just drop the field
# Handling the DOB field
# DOB is the exact date of birth; the precise date is of limited use, while the age bracket may matter more, so let's compute an age
data['DOB'].head()
# Create an Age field
data['Age'] = data['DOB'].apply(lambda x: 115 - int(x[-2:]))  # DOB ends in a two-digit year; 115 = 2015 - 1900, giving the age as of 2015
data['Age'].head()
# Drop the original DOB field:
data.drop('DOB',axis=1,inplace=True)
# Handling the EMI_Loan_Submitted field
data.boxplot(column=['EMI_Loan_Submitted'],return_type='axes')
# There seem to be quite a lot of missing values, so instead create a new field marking whether the value is missing
data['EMI_Loan_Submitted_Missing'] = data['EMI_Loan_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data[['EMI_Loan_Submitted','EMI_Loan_Submitted_Missing']].head(10)
# The original column is no longer needed
data.drop('EMI_Loan_Submitted',axis=1,inplace=True)
# Handling the Employer_Name field
len(data['Employer_Name'].value_counts())
data.drop('Employer_Name',axis=1,inplace=True)
# The Existing_EMI field
data.boxplot(column='Existing_EMI',return_type='axes')
data['Existing_EMI'].describe()
# Few values are missing; fill them in (the code uses 0 here; the mean would be another reasonable choice)
data['Existing_EMI'].fillna(0, inplace=True)
# The Interest_Rate field:
data.boxplot(column=['Interest_Rate'],return_type='axes')
# Too many missing values; again build a field recording whether the value is present
data['Interest_Rate_Missing'] = data['Interest_Rate'].apply(lambda x: 1 if pd.isnull(x) else 0)
print(data[['Interest_Rate','Interest_Rate_Missing']].head(10))
data.drop('Interest_Rate',axis=1,inplace=True)
# The Lead_Creation_Date field
# We just don't want this one. Yes, really: dropped!
data.drop('Lead_Creation_Date',axis=1,inplace=True)
data.head()
# The Loan_Amount_Applied and Loan_Tenure_Applied fields
# Fill the missing values with the median (since not many are missing)
data['Loan_Amount_Applied'].fillna(data['Loan_Amount_Applied'].median(),inplace=True)
data['Loan_Tenure_Applied'].fillna(data['Loan_Tenure_Applied'].median(),inplace=True)
# The Loan_Amount_Submitted and Loan_Tenure_Submitted fields
# Too many missing values again... keep only whether the value is missing
data['Loan_Amount_Submitted_Missing'] = data['Loan_Amount_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
data['Loan_Tenure_Submitted_Missing'] = data['Loan_Tenure_Submitted'].apply(lambda x: 1 if pd.isnull(x) else 0)
# The original fields are no longer useful
data.drop(['Loan_Amount_Submitted','Loan_Tenure_Submitted'],axis=1,inplace=True)
# The LoggedIn field
# No clear idea how to use it... dropped
data.drop('LoggedIn',axis=1,inplace=True)
# The Salary_Account field
# A customer may bank with several institutions, so drop this one too
data.drop('Salary_Account',axis=1,inplace=True)
# The Processing_Fee field
# Same treatment as before: missing or not
data['Processing_Fee_Missing'] = data['Processing_Fee'].apply(lambda x: 1 if pd.isnull(x) else 0)
# The old field is no longer needed
data.drop('Processing_Fee',axis=1,inplace=True)
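# The missing-indicator pattern has now been applied to several columns by hand.
# A small helper could factor it out; add_missing_flag is a hypothetical name of our own,
# not something from the original notebook:
def add_missing_flag(df, col):
    # add a 0/1 column marking missing values, then drop the original column
    df[col + '_Missing'] = df[col].isnull().astype(int)
    df.drop(col, axis=1, inplace=True)
# e.g. add_missing_flag(data, 'Processing_Fee') would replace the steps above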
#Source
data['Source'] = data['Source'].apply(lambda x: 'others' if x not in ['S122','S133'] else x)
data['Source'].value_counts()
# The final form of the data
data.head()
data.describe()
data.apply(lambda x: sum(x.isnull()))
data.dtypes
Step 6. Numeric encoding
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
var_to_encode = ['Device_Type','Filled_Form','Gender','Var1','Var2','Mobile_Verified','Source']
for col in var_to_encode:
    data[col] = le.fit_transform(data[col])
data.head()
data.dtypes
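# As an aside, the same kind of integer encoding can be had without scikit-learn by using
# pandas categoricals (a sketch; the codes may differ from LabelEncoder's but are equally arbitrary):
for col in var_to_encode:
    data[col] = data[col].astype('category').cat.codes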
Step 7. One-hot encoding of the categorical fields
data = pd.get_dummies(data, columns=var_to_encode)
data.columns
Step 8. Split back into training and test sets
train = data.loc[data['source'] == 'train'].copy()  # .copy() avoids SettingWithCopyWarning on the drops below
test = data.loc[data['source'] == 'test'].copy()
train.drop('source', axis=1, inplace=True)
test.drop(['source', 'Disbursed'], axis=1, inplace=True)
train.to_csv('train_modified.csv',index=False)
test.to_csv('test_modified.csv',index=False)