RF in ML: building a Random Forest model on the Kaggle Titanic dataset to predict whether each passenger survived
2022-04-22 15:29:00 【A Virgo procedural ape】
Contents
Output results
To be updated later ……
Implementation code
# Choice of prediction model: Random Forest (RF)
import numpy as np
import pandas as pd
from patsy import dmatrices
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation and sklearn.grid_search were merged into
# sklearn.model_selection (scikit-learn 0.18) and later removed
from sklearn.model_selection import (cross_val_score, train_test_split,
                                     StratifiedShuffleSplit, GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import classification_report
## Read configuration parameters
train_file = "train.csv"
MODEL_PATH = "./"
test_file = "test.csv"
SUBMISSION_PATH = "./"
seed = 0
print(train_file, seed)
# Print the n_top best parameter settings found by the grid search
# (grid_scores_ was removed from GridSearchCV; cv_results_ is its replacement)
def report(cv_results, n_top=3):
    ranks = np.argsort(cv_results['rank_test_score'])[:n_top]
    for i, idx in enumerate(ranks):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            cv_results['mean_test_score'][idx],
            cv_results['std_test_score'][idx]))
        print("Parameters: {0}".format(cv_results['params'][idx]))
        print("")
# Clean up and process the data
def substrings_in_string(big_string, substrings):
    # Return the first substring that occurs in big_string
    # (Python 3: the `in` operator replaces Python 2's string.find)
    for substring in substrings:
        if substring in big_string:
            return substring
    print(big_string)
    return np.nan

le = preprocessing.LabelEncoder()
enc = preprocessing.OneHotEncoder()  # defined but never used below
def clean_and_munge_data(df):
    # Treat a fare of 0 as missing
    df.Fare = df.Fare.map(lambda x: np.nan if x == 0 else x)

    # Parse the name and generate a Title field
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms',
                  'Mlle', 'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Collapse the special titles down to Mr, Mrs, Miss, Master
    def replace_titles(x):
        title = x['Title']
        if title in ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Master']:
            return 'Master'
        elif title in ['Countess', 'Mme', 'Mrs']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms', 'Miss']:
            return 'Miss'
        elif title == 'Dr':
            # the Sex column is lowercase in the Titanic CSVs
            # (the original compared against 'Male', which never matches)
            if x['Sex'] == 'male':
                return 'Mr'
            else:
                return 'Mrs'
        elif title == '':
            if x['Sex'] == 'male':
                return 'Master'
            else:
                return 'Miss'
        else:
            return title

    df['Title'] = df.apply(replace_titles, axis=1)
    # Family-size features
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Family'] = df['SibSp'] * df['Parch']

    # Fill missing fares with the median fare of the passenger's class
    df.loc[(df.Fare.isnull()) & (df.Pclass == 1), 'Fare'] = np.median(df[df['Pclass'] == 1]['Fare'].dropna())
    df.loc[(df.Fare.isnull()) & (df.Pclass == 2), 'Fare'] = np.median(df[df['Pclass'] == 2]['Fare'].dropna())
    df.loc[(df.Fare.isnull()) & (df.Pclass == 3), 'Fare'] = np.median(df[df['Pclass'] == 3]['Fare'].dropna())
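    # Note: the three statements above are equivalent to a single
    # groupby/transform, a common pandas idiom:
    #   df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('median'))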
    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill missing ages with the mean age of the passenger's title group
    df['AgeFill'] = df['Age']
    mean_ages = np.zeros(4)
    mean_ages[0] = np.average(df[df['Title'] == 'Miss']['Age'].dropna())
    mean_ages[1] = np.average(df[df['Title'] == 'Mrs']['Age'].dropna())
    mean_ages[2] = np.average(df[df['Title'] == 'Mr']['Age'].dropna())
    mean_ages[3] = np.average(df[df['Title'] == 'Master']['Age'].dropna())
    df.loc[(df.Age.isnull()) & (df.Title == 'Miss'), 'AgeFill'] = mean_ages[0]
    df.loc[(df.Age.isnull()) & (df.Title == 'Mrs'), 'AgeFill'] = mean_ages[1]
    df.loc[(df.Age.isnull()) & (df.Title == 'Mr'), 'AgeFill'] = mean_ages[2]
    df.loc[(df.Age.isnull()) & (df.Title == 'Master'), 'AgeFill'] = mean_ages[3]
    # Bucket ages into categories
    # (cast to object first so strings can be assigned into the column)
    df['AgeCat'] = df['AgeFill'].astype(object)
    df.loc[(df.AgeFill <= 10), 'AgeCat'] = 'child'
    df.loc[(df.AgeFill > 60), 'AgeCat'] = 'aged'
    df.loc[(df.AgeFill > 10) & (df.AgeFill <= 30), 'AgeCat'] = 'adult'
    df.loc[(df.AgeFill > 30) & (df.AgeFill <= 60), 'AgeCat'] = 'senior'
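    # Note: the four assignments above amount to a single pd.cut binning:
    #   df['AgeCat'] = pd.cut(df['AgeFill'], bins=[0, 10, 30, 60, np.inf],
    #                         labels=['child', 'adult', 'senior', 'aged'])
    # (pd.cut returns a Categorical column, which the LabelEncoder step
    # below handles just as well)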
    df.Embarked = df.Embarked.fillna('S')

    # Encode cabin as known (1.5) vs. unknown (0.5); evaluate the mask before
    # assigning, otherwise the second isnull() test runs after the nulls have
    # been filled and overwrites every row with 1.5
    cabin_known = df.Cabin.notnull()
    df.loc[~cabin_known, 'Cabin'] = 0.5
    df.loc[cabin_known, 'Cabin'] = 1.5

    df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)
    # Age times class, and other interaction features
    df['AgeClass'] = df['AgeFill'] * df['Pclass']
    df['ClassFare'] = df['Pclass'] * df['Fare_Per_Person']

    df['HighLow'] = df['Pclass'].astype(object)
    df.loc[(df.Fare_Per_Person < 8), 'HighLow'] = 'Low'
    df.loc[(df.Fare_Per_Person >= 8), 'HighLow'] = 'High'
    # Label-encode the remaining categorical columns
    # (np.float was removed from NumPy; plain float is equivalent)
    le.fit(df['Sex'])
    x_sex = le.transform(df['Sex'])
    df['Sex'] = x_sex.astype(float)

    le.fit(df['Ticket'])
    x_Ticket = le.transform(df['Ticket'])
    df['Ticket'] = x_Ticket.astype(float)

    le.fit(df['Title'])
    x_title = le.transform(df['Title'])
    df['Title'] = x_title.astype(float)

    le.fit(df['HighLow'])
    x_hl = le.transform(df['HighLow'])
    df['HighLow'] = x_hl.astype(float)

    le.fit(df['AgeCat'])
    x_age = le.transform(df['AgeCat'])
    df['AgeCat'] = x_age.astype(float)

    le.fit(df['Embarked'])
    x_emb = le.transform(df['Embarked'])
    df['Embarked'] = x_emb.astype(float)

    # Remove Name, Age, Cabin and PassengerId
    df = df.drop(['PassengerId', 'Name', 'Age', 'Cabin'], axis=1)
    return df
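# Optional sanity check of the engineered features (uncomment to run):
#   sample = clean_and_munge_data(pd.read_csv(train_file))
#   print(sample.columns.tolist())
#   print(sample[['Title', 'AgeFill', 'AgeCat', 'Fare_Per_Person', 'HighLow']].head())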
# Read the data
traindf = pd.read_csv(train_file)
## Clean the data
df = clean_and_munge_data(traindf)

######################################## formula ################################
formula_ml = 'Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size'
y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe')
y_train = np.asarray(y_train).ravel()
print(y_train.shape, x_train.shape)
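# How the formula works: patsy's C(...) marks a term as categorical, so
# C(Title) and C(AgeCat) are expanded into one-hot indicator columns in
# x_train (plus an automatic 'Intercept' column), while the numeric terms
# pass through unchanged. Inspect the expanded design matrix with:
#   print(x_train.columns.tolist())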
## Split off training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train,
                                                    test_size=0.2,
                                                    random_state=seed)
# Initialize the classifier
# (min_samples_split must be >= 2 in current scikit-learn, and
#  max_features='auto' has been replaced by 'sqrt' for classifiers)
clf = RandomForestClassifier(n_estimators=500,
                             criterion='entropy',
                             max_depth=5,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='sqrt',
                             bootstrap=False,
                             oob_score=False,
                             n_jobs=1,
                             random_state=seed,
                             verbose=0)
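# Note: bootstrap=False trains every tree on the full training set, so there
# are no out-of-bag samples and oob_score must stay False.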
### Grid search for the best parameters
param_grid = dict()

## Build the classification pipeline
pipeline = Pipeline([('clf', clf)])
# StratifiedShuffleSplit no longer takes the labels or n_iter arguments;
# the splitter is configured up front and receives y inside fit()
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3,
                           scoring='accuracy',
                           cv=StratifiedShuffleSplit(n_splits=10,
                                                     test_size=0.2,
                                                     random_state=seed)
                           ).fit(X_train, Y_train)
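# The param_grid above is empty, so GridSearchCV simply cross-validates the
# fixed configuration. To make it search, pass candidate values keyed by the
# pipeline step name, e.g. (illustrative values, not tuned settings):
#   param_grid = {'clf__n_estimators': [100, 300, 500],
#                 'clf__max_depth': [3, 5, 8],
#                 'clf__min_samples_leaf': [1, 3, 5]}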
# Score the results
print("Best score: %0.3f" % grid_search.best_score_)
print(grid_search.best_estimator_)
report(grid_search.cv_results_)

print('-----grid search end------------')
print('on all train set')
scores = cross_val_score(grid_search.best_estimator_, x_train, y_train,
                         cv=3, scoring='accuracy')
print(scores.mean(), scores)
print('on test set')
scores = cross_val_score(grid_search.best_estimator_, X_test, Y_test,
                         cv=3, scoring='accuracy')
print(scores.mean(), scores)
# Per-class precision/recall on the training and test splits
print(classification_report(Y_train, grid_search.best_estimator_.predict(X_train)))
print('test data')
print(classification_report(Y_test, grid_search.best_estimator_.predict(X_test)))

# Persist the fitted model
model_file = MODEL_PATH + 'model-rf.pkl'
joblib.dump(grid_search.best_estimator_, model_file)
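The script reads test_file and SUBMISSION_PATH in its configuration block but never uses them, so the actual Kaggle submission step is missing. Below is a minimal sketch of that step, assuming the test set goes through the same clean_and_munge_data and the same formula; dmatrix builds only the right-hand side, since test.csv has no Survived column, and the PassengerId handling and output filename are illustrative choices, not the author's.

from patsy import dmatrix

testdf = pd.read_csv(test_file)
passenger_ids = testdf['PassengerId'].copy()  # kept aside; the cleaner drops it
df_test = clean_and_munge_data(testdf)

# Build the design matrix from the right-hand side of the training formula
x_test = dmatrix(formula_ml.split('~')[1], data=df_test, return_type='dataframe')

model = joblib.load(model_file)
predictions = model.predict(x_test).astype(int)

submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
submission.to_csv(SUBMISSION_PATH + 'submission-rf.csv', index=False)

In practice the train and test frames are usually cleaned together, or the fitted encoders reused, so that the categorical codes and dummy columns line up between the two design matrices; treat this as a sketch rather than a drop-in step. One known wrinkle: test.csv contains a 'Dona' title absent from title_list, which would need to be added for the cleaner to handle every row.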
Copyright notice
This article was written by [A Virgo procedural ape]. Please include a link to the original when reposting. Thanks!
https://yzsam.com/2022/04/202204221432498020.html