RF in ML: building a Random Forest model on the Kaggle Titanic dataset to predict whether each passenger survived
2022-04-22 15:29:00 【A Virgo procedural ape】
Contents
Output results
To be updated later ……
Implementation code
# Choice of prediction model: Random Forest (RF)
import numpy as np
import pandas as pd
from patsy import dmatrices
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation and sklearn.grid_search were merged into
# sklearn.model_selection (scikit-learn 0.18) and later removed
from sklearn.model_selection import (cross_val_score, train_test_split,
                                     StratifiedShuffleSplit, GridSearchCV)
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
from sklearn.metrics import classification_report
## Read configuration parameters
train_file = "train.csv"
MODEL_PATH = "./"
test_file = "test.csv"
SUBMISSION_PATH = "./"
seed = 0
print(train_file, seed)
# Print the n_top best parameter settings found by the grid search
# (grid_scores_ was removed from GridSearchCV; cv_results_ is its replacement)
def report(cv_results, n_top=3):
    ranks = np.argsort(cv_results['rank_test_score'])[:n_top]
    for i, idx in enumerate(ranks):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            cv_results['mean_test_score'][idx],
            cv_results['std_test_score'][idx]))
        print("Parameters: {0}".format(cv_results['params'][idx]))
        print("")
# Clean up and process the data
def substrings_in_string(big_string, substrings):
    # Return the first substring that occurs in big_string
    # (Python 3: the `in` operator replaces Python 2's string.find)
    for substring in substrings:
        if substring in big_string:
            return substring
    print(big_string)
    return np.nan

le = preprocessing.LabelEncoder()
enc = preprocessing.OneHotEncoder()  # defined but never used below
def clean_and_munge_data(df):
    # Treat a fare of 0 as missing
    df.Fare = df.Fare.map(lambda x: np.nan if x == 0 else x)

    # Parse the name and generate a Title field
    title_list = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms',
                  'Mlle', 'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']
    df['Title'] = df['Name'].map(lambda x: substrings_in_string(x, title_list))

    # Collapse the special titles down to Mr, Mrs, Miss, Master
    def replace_titles(x):
        title = x['Title']
        if title in ['Mr', 'Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col']:
            return 'Mr'
        elif title in ['Master']:
            return 'Master'
        elif title in ['Countess', 'Mme', 'Mrs']:
            return 'Mrs'
        elif title in ['Mlle', 'Ms', 'Miss']:
            return 'Miss'
        elif title == 'Dr':
            # the Sex column is lowercase in the Titanic CSVs
            # (the original compared against 'Male', which never matches)
            if x['Sex'] == 'male':
                return 'Mr'
            else:
                return 'Mrs'
        elif title == '':
            if x['Sex'] == 'male':
                return 'Master'
            else:
                return 'Miss'
        else:
            return title

    df['Title'] = df.apply(replace_titles, axis=1)
    # Family-size features
    df['Family_Size'] = df['SibSp'] + df['Parch']
    df['Family'] = df['SibSp'] * df['Parch']

    # Fill missing fares with the median fare of the passenger's class
    df.loc[(df.Fare.isnull()) & (df.Pclass == 1), 'Fare'] = np.median(df[df['Pclass'] == 1]['Fare'].dropna())
    df.loc[(df.Fare.isnull()) & (df.Pclass == 2), 'Fare'] = np.median(df[df['Pclass'] == 2]['Fare'].dropna())
    df.loc[(df.Fare.isnull()) & (df.Pclass == 3), 'Fare'] = np.median(df[df['Pclass'] == 3]['Fare'].dropna())
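    # Note: the three statements above are equivalent to a single
    # groupby/transform, a common pandas idiom:
    #   df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('median'))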
    df['Gender'] = df['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Fill missing ages with the mean age of the passenger's title group
    df['AgeFill'] = df['Age']
    mean_ages = np.zeros(4)
    mean_ages[0] = np.average(df[df['Title'] == 'Miss']['Age'].dropna())
    mean_ages[1] = np.average(df[df['Title'] == 'Mrs']['Age'].dropna())
    mean_ages[2] = np.average(df[df['Title'] == 'Mr']['Age'].dropna())
    mean_ages[3] = np.average(df[df['Title'] == 'Master']['Age'].dropna())
    df.loc[(df.Age.isnull()) & (df.Title == 'Miss'), 'AgeFill'] = mean_ages[0]
    df.loc[(df.Age.isnull()) & (df.Title == 'Mrs'), 'AgeFill'] = mean_ages[1]
    df.loc[(df.Age.isnull()) & (df.Title == 'Mr'), 'AgeFill'] = mean_ages[2]
    df.loc[(df.Age.isnull()) & (df.Title == 'Master'), 'AgeFill'] = mean_ages[3]
    # Bucket ages into categories
    # (cast to object first so strings can be assigned into the column)
    df['AgeCat'] = df['AgeFill'].astype(object)
    df.loc[(df.AgeFill <= 10), 'AgeCat'] = 'child'
    df.loc[(df.AgeFill > 60), 'AgeCat'] = 'aged'
    df.loc[(df.AgeFill > 10) & (df.AgeFill <= 30), 'AgeCat'] = 'adult'
    df.loc[(df.AgeFill > 30) & (df.AgeFill <= 60), 'AgeCat'] = 'senior'
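    # Note: the four assignments above amount to a single pd.cut binning:
    #   df['AgeCat'] = pd.cut(df['AgeFill'], bins=[0, 10, 30, 60, np.inf],
    #                         labels=['child', 'adult', 'senior', 'aged'])
    # (pd.cut returns a Categorical column, which the LabelEncoder step
    # below handles just as well)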
    df.Embarked = df.Embarked.fillna('S')

    # Encode cabin as known (1.5) vs. unknown (0.5); evaluate the mask before
    # assigning, otherwise the second isnull() test runs after the nulls have
    # been filled and overwrites every row with 1.5
    cabin_known = df.Cabin.notnull()
    df.loc[~cabin_known, 'Cabin'] = 0.5
    df.loc[cabin_known, 'Cabin'] = 1.5

    df['Fare_Per_Person'] = df['Fare'] / (df['Family_Size'] + 1)
    # Age times class, and other interaction features
    df['AgeClass'] = df['AgeFill'] * df['Pclass']
    df['ClassFare'] = df['Pclass'] * df['Fare_Per_Person']

    df['HighLow'] = df['Pclass'].astype(object)
    df.loc[(df.Fare_Per_Person < 8), 'HighLow'] = 'Low'
    df.loc[(df.Fare_Per_Person >= 8), 'HighLow'] = 'High'
    # Label-encode the remaining categorical columns
    # (np.float was removed from NumPy; plain float is equivalent)
    le.fit(df['Sex'])
    x_sex = le.transform(df['Sex'])
    df['Sex'] = x_sex.astype(float)

    le.fit(df['Ticket'])
    x_Ticket = le.transform(df['Ticket'])
    df['Ticket'] = x_Ticket.astype(float)

    le.fit(df['Title'])
    x_title = le.transform(df['Title'])
    df['Title'] = x_title.astype(float)

    le.fit(df['HighLow'])
    x_hl = le.transform(df['HighLow'])
    df['HighLow'] = x_hl.astype(float)

    le.fit(df['AgeCat'])
    x_age = le.transform(df['AgeCat'])
    df['AgeCat'] = x_age.astype(float)

    le.fit(df['Embarked'])
    x_emb = le.transform(df['Embarked'])
    df['Embarked'] = x_emb.astype(float)

    # Remove Name, Age, Cabin and PassengerId
    df = df.drop(['PassengerId', 'Name', 'Age', 'Cabin'], axis=1)
    return df
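# Optional sanity check of the engineered features (uncomment to run):
#   sample = clean_and_munge_data(pd.read_csv(train_file))
#   print(sample.columns.tolist())
#   print(sample[['Title', 'AgeFill', 'AgeCat', 'Fare_Per_Person', 'HighLow']].head())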
# Read the data
traindf = pd.read_csv(train_file)
## Clean the data
df = clean_and_munge_data(traindf)

######################################## formula ################################
formula_ml = 'Survived~Pclass+C(Title)+Sex+C(AgeCat)+Fare_Per_Person+Fare+Family_Size'
y_train, x_train = dmatrices(formula_ml, data=df, return_type='dataframe')
y_train = np.asarray(y_train).ravel()
print(y_train.shape, x_train.shape)
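# How the formula works: patsy's C(...) marks a term as categorical, so
# C(Title) and C(AgeCat) are expanded into one-hot indicator columns in
# x_train (plus an automatic 'Intercept' column), while the numeric terms
# pass through unchanged. Inspect the expanded design matrix with:
#   print(x_train.columns.tolist())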
## Split off training and test sets
X_train, X_test, Y_train, Y_test = train_test_split(x_train, y_train,
                                                    test_size=0.2,
                                                    random_state=seed)
# Initialize the classifier
# (min_samples_split must be >= 2 in current scikit-learn, and
#  max_features='auto' has been replaced by 'sqrt' for classifiers)
clf = RandomForestClassifier(n_estimators=500,
                             criterion='entropy',
                             max_depth=5,
                             min_samples_split=2,
                             min_samples_leaf=1,
                             max_features='sqrt',
                             bootstrap=False,
                             oob_score=False,
                             n_jobs=1,
                             random_state=seed,
                             verbose=0)
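# Note: bootstrap=False trains every tree on the full training set, so there
# are no out-of-bag samples and oob_score must stay False.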
### Grid search for the best parameters
param_grid = dict()

## Build the classification pipeline
pipeline = Pipeline([('clf', clf)])
# StratifiedShuffleSplit no longer takes the labels or n_iter arguments;
# the splitter is configured up front and receives y inside fit()
grid_search = GridSearchCV(pipeline, param_grid=param_grid, verbose=3,
                           scoring='accuracy',
                           cv=StratifiedShuffleSplit(n_splits=10,
                                                     test_size=0.2,
                                                     random_state=seed)
                           ).fit(X_train, Y_train)
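# The param_grid above is empty, so GridSearchCV simply cross-validates the
# fixed configuration. To make it search, pass candidate values keyed by the
# pipeline step name, e.g. (illustrative values, not tuned settings):
#   param_grid = {'clf__n_estimators': [100, 300, 500],
#                 'clf__max_depth': [3, 5, 8],
#                 'clf__min_samples_leaf': [1, 3, 5]}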
# Score the results
print("Best score: %0.3f" % grid_search.best_score_)
print(grid_search.best_estimator_)
report(grid_search.cv_results_)

print('-----grid search end------------')
print('on all train set')
scores = cross_val_score(grid_search.best_estimator_, x_train, y_train,
                         cv=3, scoring='accuracy')
print(scores.mean(), scores)
print('on test set')
scores = cross_val_score(grid_search.best_estimator_, X_test, Y_test,
                         cv=3, scoring='accuracy')
print(scores.mean(), scores)
# Per-class precision/recall on the training and test splits
print(classification_report(Y_train, grid_search.best_estimator_.predict(X_train)))
print('test data')
print(classification_report(Y_test, grid_search.best_estimator_.predict(X_test)))

# Persist the fitted model
model_file = MODEL_PATH + 'model-rf.pkl'
joblib.dump(grid_search.best_estimator_, model_file)
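The script reads test_file and SUBMISSION_PATH in its configuration block but never uses them, so the actual Kaggle submission step is missing. Below is a minimal sketch of that step, assuming the test set goes through the same clean_and_munge_data and the same formula; dmatrix builds only the right-hand side, since test.csv has no Survived column, and the PassengerId handling and output filename are illustrative choices, not the author's.

from patsy import dmatrix

testdf = pd.read_csv(test_file)
passenger_ids = testdf['PassengerId'].copy()  # kept aside; the cleaner drops it
df_test = clean_and_munge_data(testdf)

# Build the design matrix from the right-hand side of the training formula
x_test = dmatrix(formula_ml.split('~')[1], data=df_test, return_type='dataframe')

model = joblib.load(model_file)
predictions = model.predict(x_test).astype(int)

submission = pd.DataFrame({'PassengerId': passenger_ids, 'Survived': predictions})
submission.to_csv(SUBMISSION_PATH + 'submission-rf.csv', index=False)

In practice the train and test frames are usually cleaned together, or the fitted encoders reused, so that the categorical codes and dummy columns line up between the two design matrices; treat this as a sketch rather than a drop-in step. One known wrinkle: test.csv contains a 'Dona' title absent from title_list, which would need to be added for the cleaner to handle every row.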
Copyright notice
This article was written by [A Virgo procedural ape]. Please include a link to the original when reposting. Thanks!
https://yzsam.com/2022/04/202204221432498020.html