ML · RF: Kaggle competition — building a Random Forest model on the Titanic dataset to predict whether each passenger survived
2022-04-22 15:23:00 【一个处女座的程序猿】
Table of Contents
Output
Implementation Code

Output
To be updated…

Implementation Code
```python
# Random forest (RF) chosen as the prediction model
import numpy as np
import pandas as pd
from patsy import dmatrices
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import (GridSearchCV, StratifiedShuffleSplit,
                                     cross_val_score, train_test_split)
from sklearn.pipeline import Pipeline
import joblib  # sklearn.externals.joblib, cross_validation and grid_search no longer exist in current scikit-learn

## Read configuration parameters
train_file = "train.csv"
MODEL_PATH = "./"
test_file = "test.csv"
SUBMISSION_PATH = "./"
seed = 0

print(train_file, seed)

# Print the top-n parameter settings found by the grid search
def report(cv_results, n_top=3):
    ranks = np.argsort(cv_results['rank_test_score'])[:n_top]
    for i, idx in enumerate(ranks):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            cv_results['mean_test_score'][idx], cv_results['std_test_score'][idx]))
        print("Parameters: {0}".format(cv_results['params'][idx]))
        print("")

# Data cleaning and preprocessing
def substrings_in_string(big_string, substrings):
    """Return the first substring contained in big_string, or NaN if none matches."""
    for substring in substrings:
        if big_string.find(substring) != -1:  # str.find replaces the Python 2-only string.find
            return substring
    print(big_string)
    return np.nan

le = preprocessing.LabelEncoder()

def clean_and_munge_data(df):
    # Treat zero fares as missing values
    df.Fare = df.Fare.map(lambda x: np.nan if x == 0 else x)

    # Parse the Name column and generate a Title field
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms', 'Mlle',
                  'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Collapse the rare titles into Mr, Mrs, Miss and Master
    def replace_titles(x):
        title = x['Title']
        if title in ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Master']:
            return 'Master'
        elif title in ['Countess', 'Mme', 'Mrs']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms', 'Miss']:
            return 'Miss'
        elif title == 'Dr':
            return 'Mr' if x['Sex'] == 'male' else 'Mrs'
        elif title == '':
            return 'Master' if x['Sex'] == 'male' else 'Miss'
        else:
            return title

    df['Title'] = df.apply(replace_titles, axis=1)

    # Family-size features
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Family'] = df['SibSp'] * df['Parch']

    # Fill missing fares with the median fare of the passenger's class
    for pclass in (1, 2, 3):
        median_fare = np.median(df[df['Pclass'] == pclass]['Fare'].dropna())
        df.loc[(df.Fare.isnull()) & (df.Pclass == pclass), 'Fare'] = median_fare

    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill missing ages with the mean age of the passenger's title group
    df['AgeFill'] = df['Age']
    for title in ('Miss', 'Mrs', 'Mr', 'Master'):
        mean_age = np.average(df[df['Title'] == title]['Age'].dropna())
        df.loc[(df.Age.isnull()) & (df.Title == title), 'AgeFill'] = mean_age

    # Bucket the filled ages into categories
    df['AgeCat'] = df['AgeFill'].astype(object)
    df.loc[(df.AgeFill <= 10), 'AgeCat'] = 'child'
    df.loc[(df.AgeFill > 60), 'AgeCat'] = 'aged'
    df.loc[(df.AgeFill > 10) & (df.AgeFill <= 30), 'AgeCat'] = 'adult'
    df.loc[(df.AgeFill > 30) & (df.AgeFill <= 60), 'AgeCat'] = 'senior'

    df.Embarked = df.Embarked.fillna('S')

    # Encode cabin information: 0.5 if missing, 1.5 if present
    has_cabin = df.Cabin.notnull()
    df.loc[~has_cabin, 'Cabin'] = 0.5
    df.loc[has_cabin, 'Cabin'] = 1.5

    df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)

    # Interaction features: age times class, class times fare per person
    df['AgeClass'] = df['AgeFill'] * df['Pclass']
    df['ClassFare'] = df['Pclass'] * df['Fare_Per_Person']

    # High/low fare indicator
    df['HighLow'] = df['Pclass'].astype(object)
    df.loc[(df.Fare_Per_Person < 8), 'HighLow'] = 'Low'
    df.loc[(df.Fare_Per_Person >= 8), 'HighLow'] = 'High'

    # Label-encode the remaining categorical columns
    for col in ('Sex', 'Ticket', 'Title', 'HighLow', 'AgeCat', 'Embarked'):
        df[col] = le.fit_transform(df[col]).astype(float)

    # Remove Name, Age, Cabin and PassengerId
    df = df.drop(['PassengerId', 'Name', 'Age', 'Cabin'], axis=1)
    return df

# Read the data
traindf = pd.read_csv(train_file)
## Clean the data
df = clean_and_munge_data(traindf)

######################################## formula ################################
formula_ml = 'Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size'

y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe')
y_train = np.asarray(y_train).ravel()
print(y_train.shape, x_train.shape)

## Split into training and held-out sets
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train,
                                                    test_size=0.2, random_state=seed)

# Initialise the classifier (min_samples_split must be >= 2 and
# max_features='auto' is now spelled 'sqrt' in current scikit-learn)
clf = RandomForestClassifier(n_estimators=500, criterion='entropy', max_depth=5,
                             min_samples_split=2, min_samples_leaf=1,
                             max_features='sqrt', bootstrap=False, oob_score=False,
                             n_jobs=1, random_state=seed, verbose=0)

### Grid search for the best parameters (the grid is left empty here,
### so only the default configuration above is evaluated)
param_grid = dict()

## Build the classification pipeline
pipeline = Pipeline([('clf', clf)])
cv = StratifiedShuffleSplit(n_splits=10, test_size=0.2, random_state=seed)
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3,
                           scoring='accuracy', cv=cv).fit(X_train, Y_train)

# Score the grid-search results
print("Best score: %0.3f" % grid_search.best_score_)
print(grid_search.best_estimator_)
report(grid_search.cv_results_)  # grid_scores_ was replaced by cv_results_

print('-----grid search end------------')
print('on all train set')
scores = cross_val_score(grid_search.best_estimator_, x_train, y_train, cv=3, scoring='accuracy')
print(scores.mean(), scores)
print('on test set')
scores = cross_val_score(grid_search.best_estimator_, X_test, Y_test, cv=3, scoring='accuracy')
print(scores.mean(), scores)

# Classification reports on the training and held-out splits
print(classification_report(Y_train, grid_search.best_estimator_.predict(X_train)))
print('test data')
print(classification_report(Y_test, grid_search.best_estimator_.predict(X_test)))

# Persist the best model
model_file = MODEL_PATH + 'model-rf.pkl'
joblib.dump(grid_search.best_estimator_, model_file)
```
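Note that `param_grid` is left empty in the original script, so the grid search only evaluates the single default configuration. As a purely illustrative sketch (the values below are hypothetical, not tuned settings from the post), a non-empty grid for the pipeline addresses the classifier's parameters with the `clf__` prefix:

```python
# Hypothetical parameter grid for the 'clf' pipeline step; the values are
# illustrative only and this snippet continues the script above
# (pipeline, cv, X_train and Y_train are already defined).
param_grid = {
    'clf__n_estimators': [200, 500],
    'clf__max_depth': [3, 5, 8],
    'clf__min_samples_leaf': [1, 3, 5],
}
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3,
                           scoring='accuracy', cv=cv).fit(X_train, Y_train)
print(grid_search.best_params_)
```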
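The script defines `test_file` and `SUBMISSION_PATH` but never uses them, and the output section is still marked as pending. Below is a minimal sketch (not part of the original post) of how the saved model could be applied to `test.csv` to produce a Kaggle submission file. It continues the script above, assumes the same cleaning function and formula terms apply to the test data, and reindexes the test design matrix against the training columns in case the categorical levels differ:

```python
from patsy import dmatrix  # builds a design matrix without a response term
import joblib
import pandas as pd

testdf = pd.read_csv(test_file)
passenger_ids = testdf['PassengerId']          # keep before clean_and_munge_data() drops it
df_test = clean_and_munge_data(testdf)

# Same right-hand side as formula_ml, minus the 'Survived' response
x_test = dmatrix('Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size',
                 data=df_test, return_type='dataframe')
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)  # align with training design matrix

model = joblib.load(MODEL_PATH + 'model-rf.pkl')
predictions = model.predict(x_test).astype(int)

submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
submission.to_csv(SUBMISSION_PATH + 'submission-rf.csv', index=False)
```

Because the column names produced by patsy's `C()` terms depend on how each split was label-encoded, it is worth checking `x_test.columns` against `x_train.columns` before predicting; the `reindex` call above is only a guard against minor mismatches.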
Copyright notice
This article was written by [一个处女座的程序猿]. Please include a link to the original when reposting. Thank you.
https://blog.51cto.com/yunyaniu/5244055