当前位置：网站首页>Machine learning training template, summarizing multiple classifiers

Machine learning training template, summarizing multiple classifiers

2022-04-22 12:19:00 【Sing for me alone】

1. Data: mainly pandas operations

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Let seaborn plots (e.g. heatmaps) render Chinese text.
sns.set(font="simhei")
# matplotlib settings, applied after the seaborn call as in the original order.
plt.rcParams.update({
    'font.sans-serif': ['SimHei'],   # Chinese font for matplotlib
    'axes.unicode_minus': False,     # draw the minus sign correctly
})
# Do not truncate rows when printing a DataFrame.
pd.set_option("display.max_rows", None)



# Load the training set; the code below expects it to contain a 'label' column.
data = pd.read_csv('train.csv')

# Quick inspection helpers: info(), describe(), head(), value_counts(), etc.
print(data.shape)
print(data['label'].unique())

####### Drop missing values, drop a column, rename columns ##########
# Clean first so that the feature matrix and the labels keep the same rows.
data = data.dropna()    # pass axis=1 to drop columns instead of rows
x = data.drop('label', axis=1)    # features: every column except the label
# rename() returns a new frame, so the result has to be assigned back.
data = data.rename(columns={'old_name': 'new_name'})
####### Row-wise sum of several columns #############
data['col3'] = data[['col1', 'col2']].sum(axis=1)


# data = data.reset_index(drop=True)  # optionally rebuild the index afterwards

###### Handle labels #############
classes = data.loc[:, 'label']    # all labels, before remapping
# Map the textual False./True. labels to 0/1.
data['label'] = data['label'].astype(str).map({'False.': 0, 'True.': 1})
y = data['label']    # numeric target used by the later train/test split

######### Data correlation: Pearson correlation coefficient, shown as a heat map

2. Normalization and one-hot encoding:

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,MinMaxScaler


# x / y are the feature matrix and label vector prepared beforehand.
# Hold out 20% of the samples for testing.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

######## Normalization #########
std = StandardScaler()    # or swap in MinMaxScaler()
x_train = std.fit_transform(x_train)   # fit the scaler on the training split only
x_test = std.transform(x_test)         # reuse the fitted scaler for the test split


######## Text to integer codes #####
# (original author's note: there was a small problem with this in recent use)
from sklearn.preprocessing import LabelEncoder
# LabelEncoder is imported by name, so it is called without a module prefix.
le = LabelEncoder()
# apply() runs fit_transform once per column, so `le` is refitted each time
# and afterwards only holds the classes of the last column.
train_x = data.apply(le.fit_transform)

####### One-hot encoding ########   several ways to do it
## Method 1 ##
# NOTE(review): OrdinalEncoder maps each category to an integer code rather
# than a one-hot vector - confirm this is the encoder intended under this heading.
# `data_ca` is not defined in this snippet; presumably the categorical columns.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(categories='auto').fit(data_ca)
result = enc.transform(data_ca)

## Method 2 ##
data_dummies = pd.get_dummies(data[['col','col2']])

## Method 3 ##
# NOTE(review): np_utils is not imported in this snippet; presumably it is
# Keras' keras.utils.np_utils - confirm and add the import.
y_train = np_utils.to_categorical(y_train, num_classes=10)   # for the labels; ten classes

# Example with two classes: the labels [1 0 0] become
#   [[0. 1.]
#    [1. 0.]
#    [1. 0.]]

3. Models: classification problems only

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score



from sklearn import model_selection

# Scoring method used for cross-validation
scoring = 'accuracy'

# Seed for the K-fold shuffling, so the splits are reproducible
seed = 42

# Models to train. `names` and `classifiers` are parallel lists: when an entry
# is commented in or out in one of them, do the same in the other.
names = [
    "Nearest Neighbors",
    # "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    # "AdaBoost",
    "Naive Bayes",
    "SVM Linear",
    "SVM RBF",
    "SVM Sigmoid",
]

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    # GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    # AdaBoostClassifier(),
    GaussianNB(),
    SVC(kernel='linear'),
    SVC(kernel='rbf'),
    SVC(kernel='sigmoid'),
]

# Fail early if the two lists drifted apart; zip() would silently truncate.
if len(names) != len(classifiers):
    raise ValueError("names and classifiers must have the same length")

models = list(zip(names, classifiers))

# One shuffled 10-fold splitter, built once and shared by every model
kfold = model_selection.KFold(n_splits=10, random_state=seed, shuffle=True)

# Evaluate each model in turn
results = []   # one array of cross-validation scores per model, in `names` order

for name, model in models:
    cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    # Refit on the whole training split, then score on the held-out test split
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)
    print('Test-- ',name,': ',accuracy_score(y_test, predictions))
    print()
    print(classification_report(y_test, predictions))

4. Evaluation:

from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import cross_val_score

# `clf` stands for any fitted classifier (for example one trained in the model
# section); x_train/x_test/y_train/y_test come from the train/test split.
y_pred = clf.predict(x_test)

########### Built-in score
print(clf.score(x_test,y_test))

########### Cross-validation
scores = cross_val_score(clf, x_train, y_train, cv=5)     # cross_val_score runs the cross-validation
print('scores：',scores)
# Both numbers use the same fixed-point format (the second spec lacked its 'f').
print("Accuracy: {:.4f} (+/- {:.4f})".format(scores.mean(), scores.std() * 2))


################ F1
score=metrics.f1_score(y_true=y_test,y_pred=y_pred,average="macro")


################ Confusion matrix and visualization
conf = confusion_matrix(y_test, y_pred) # confusion matrix: true values vs. predictions
label = ["0","1"] # binary classification here
sns.heatmap(conf, annot = True, xticklabels=label, yticklabels=label)
plt.show()


################ Other metrics
print(' Accuracy rate ：', metrics.accuracy_score(y_test, y_pred))
print(' Precision category ：', metrics.precision_score(y_test, y_pred, average=None))  # per class, not averaged
print(' Macro average precision ：', metrics.precision_score(y_test, y_pred, average='macro'))
print(' Micro average recall rate :', metrics.recall_score(y_test, y_pred, average='micro'))

Reference link ：

Introduction to the evaluation methods in sklearn.metrics (accuracy_score, recall_score, roc_curve, roc_auc_score, confusion_matrix) - Chihiro's blog, CSDN

版权声明
本文为[Sing for me alone]所创，转载请带上原文链接，感谢
https://yzsam.com/2022/04/202204221209111726.html

当前位置：网站首页>Machine learning training template, summarizing multiple classifiers

Machine learning training template, summarizing multiple classifiers

边栏推荐

猜你喜欢

随机推荐