[Code analysis (6)] Communication-Efficient Learning of Deep Networks from Decentralized Data
2022-04-23 13:47:00 【Silent city of the sky】
federated_main.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Python version: 3.6
import os
import copy
import time
import pickle
import numpy as np
from tqdm import tqdm
import torch
from tensorboardX import SummaryWriter
from options import args_parser
from update import LocalUpdate, test_inference
from models import MLP, CNNMnist, CNNFashion_Mnist, CNNCifar
from utils import get_dataset, average_weights, exp_details
os.environ['KMP_DUPLICATE_LIB_OK'] = 'TRUE'
if __name__ == '__main__':
start_time = time.time()
# define paths
path_project = os.path.abspath('..')
logger = SummaryWriter('../logs')
# Parse the command-line arguments
args = args_parser()
# Print the experiment details (model, dataset, hyperparameters)
exp_details(args)
'''
if args.gpu_id:
torch.cuda.set_device(args.gpu_id)
device = 'cuda' if args.gpu else 'cpu'
'''
if args.gpu:
torch.cuda.set_device(args.gpu)
device = 'cuda' if args.gpu else 'cpu'
# load dataset and user groups
train_dataset, test_dataset, user_groups = get_dataset(args)
'''
user_groups (returned by cifar_iid(dataset, num_users) in the IID case)
is a dict whose keys are the user indices 0..99 (num_users = 100) and
whose values are the sets of training-sample indices assigned to each user.
'''
# BUILD MODEL
if args.model == 'cnn':
# Convolutional neural network
if args.dataset == 'mnist':
global_model = CNNMnist(args=args)
elif args.dataset == 'fmnist':
global_model = CNNFashion_Mnist(args=args)
elif args.dataset == 'cifar':
global_model = CNNCifar(args=args)
elif args.model == 'mlp':
# Multi-layer perceptron
img_size = train_dataset[0][0].shape
'''
For the CIFAR dataset:
print(img_size)                      -> torch.Size([3, 32, 32])
print(train_dataset[0][0].shape)     -> torch.Size([3, 32, 32])
print(train_dataset[0][0][0].shape)  -> torch.Size([32, 32])
'''
len_in = 1
# torch.Size([3, 32, 32])
for x in img_size:
len_in *= x
global_model = MLP(dim_in=len_in, dim_hidden=64,
dim_out=args.num_classes)
'''
Flattening the image gives the MLP input dimension:
len_in = 3 * 32 * 32 = 3072
MLP(dim_in=3072, dim_hidden=64, dim_out=10)
e.g. for a batch of 10 CIFAR images the model output has shape
torch.Size([10, 10]).
'''
else:
exit('Error: unrecognized model')
'''
The global training model has now been built.
'''
# Set the model to train and send it to device.
global_model.to(device)
global_model.train()
'''
The MLP model contains Dropout; the CNN models contain Batch Normalization
and Dropout. Any model with BN or Dropout layers must call model.train()
before training: it enables both. With BN, each batch is normalized using
its own mean and variance; with Dropout, a random subset of connections
is kept active while training and updating parameters.
'''
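'''
A minimal, self-contained sketch (not part of this script) showing the
effect of train() vs eval() on Dropout:

import torch
import torch.nn as nn

drop = nn.Dropout(p=0.5)
x = torch.ones(8)
drop.train()
print(drop(x))  # roughly half the entries zeroed, survivors scaled by 1/(1-p) = 2
drop.eval()
print(drop(x))  # identity: all ones
'''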
# print('+++++++++++++++++++++++')
print(global_model)
'''
For the MLP model this prints:
MLP(
  (layer_input): Linear(in_features=3072, out_features=64, bias=True)
  (relu): ReLU()
  (dropout): Dropout(p=0.5, inplace=False)
  (layer_hidden): Linear(in_features=64, out_features=10, bias=True)
  (softmax): Softmax(dim=1)
)
'''
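'''
From this printout, the MLP in models.py is presumably close to the
following sketch (the forward order is an assumption, reconstructed from
the printed modules rather than copied from the source):

import torch.nn as nn

class MLP(nn.Module):
    # Reconstructed from the printed module list above -- an assumption.
    def __init__(self, dim_in, dim_hidden, dim_out):
        super().__init__()
        self.layer_input = nn.Linear(dim_in, dim_hidden)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.layer_hidden = nn.Linear(dim_hidden, dim_out)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.view(x.shape[0], -1)  # flatten [B, 3, 32, 32] -> [B, 3072]
        x = self.dropout(self.relu(self.layer_input(x)))
        return self.softmax(self.layer_hidden(x))
'''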
# copy weights
global_weights = global_model.state_dict()
'''
state_dict holds the weights and biases learned during training.
It is a Python dict-like object mapping each layer's parameter name to a
tensor. Note that an nn.Module's state_dict only contains layers with
learnable parameters (the conv and linear layers here); the pooling layer
contributes nothing. Example:
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(16 * 5 * 5, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 10)
model.state_dict()[param_tensor].size() gives:
conv1.weight  torch.Size([6, 3, 5, 5])   # 6 kernels of size 3x5x5
conv1.bias    torch.Size([6])            # 6 biases
conv2.weight  torch.Size([16, 6, 5, 5])
conv2.bias    torch.Size([16])
fc1.weight    torch.Size([120, 400])     # nn.Linear(16*5*5, 120) -> [120, 400]
fc1.bias      torch.Size([120])
fc2.weight    torch.Size([84, 120])
fc2.bias      torch.Size([84])
fc3.weight    torch.Size([10, 84])
fc3.bias      torch.Size([10])
'''
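'''
A runnable illustration of this point (the layer sizes reuse the example
above; the Sequential wrapper is for demonstration only):

import torch.nn as nn

net = nn.Sequential(nn.Conv2d(3, 6, 5),
                    nn.MaxPool2d(2, 2),
                    nn.Linear(16 * 5 * 5, 120))
for name, tensor in net.state_dict().items():
    print(name, tensor.size())
# 0.weight torch.Size([6, 3, 5, 5])
# 0.bias   torch.Size([6])
# 2.weight torch.Size([120, 400])
# 2.bias   torch.Size([120])
# The MaxPool2d layer contributes nothing: only learnable parameters appear.
'''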
# Training
train_loss, train_accuracy = [], []
'''
Per-round training loss and training accuracy.
'''
val_acc_list, net_list = [], []
cv_loss, cv_acc = [], []
print_every = 2
val_loss_pre, counter = 0, 0
# Training loop: args.epochs global rounds (10 here); each round runs local training on sampled clients
'''
One epoch means all the data has gone through one forward pass and one
backward pass. Within an epoch the data is divided into batches;
batch_size is the number of samples in one batch (e.g. 50,000 training
images with batch_size = 10 give 5,000 batches per epoch).
'''
for epoch in tqdm(range(args.epochs)):
'''
args.epochs is the number of global rounds from options.py,
so epoch takes the values [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
tqdm is a Python progress-bar library; sample output:
0%|          | 0/10 [00:00<?, ?it/s]Train Epoch: 1 [0/50000 (0%)]
10%|█         | 1/10 [00:58<08:45, 58.42s/it]Train Epoch: 2 [0/50000 (0%)]
'''
local_weights, local_losses = [], []
print(f'\n | Global Training Round : {epoch+1} |\n')
global_model.train()
'''
As noted above: model.train() re-enables BN batch statistics and Dropout
before each round of local training.
'''
# Randomly select m = 10 of the 100 users
m = max(int(args.frac * args.num_users), 1)
'''
frac = 0.1 and num_users = 100, so m = 10.
'''
idxs_users = np.random.choice(range(args.num_users), m, replace=False)
'''
numpy.random.choice(a, size=None, replace=True, p=None):
if a is a one-dimensional array, it samples from that array; if a is an
int, it samples from the sequence 0 .. a-1. replace indicates whether an
element may be drawn more than once.
Here a = range(args.num_users) covers the 100 user subscripts, and m = 10
of them are drawn without replacement. sampling.py uses the same function
to draw num_items sample indices per user into dict_users[i].
'''
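'''
A self-contained illustration of the sampling call (the seed is arbitrary,
chosen only to make the example reproducible):

import numpy as np

rng = np.random.default_rng(42)
print(rng.choice(100, 10, replace=False))  # 10 distinct ids in [0, 100)
print(rng.choice(100, 10, replace=True))   # repeats are possible
'''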
# print('++++++++++')
# print(idxs_users)
# [33 55 99 17 1 68 31 20 77 93]
'''
Local training on each selected client:
'''
for idx in idxs_users:
'''
For each user
'''
local_model = LocalUpdate(args=args, dataset=train_dataset,
idxs=user_groups[idx], logger=logger)
# print('+++++++++++++++++')
# print(user_groups[idx])
'''
What is the role of idx?
user_groups is the dict returned by get_dataset(args) in utils.py
(built in the IID case by user_groups = mnist_iid(train_dataset,
args.num_users) from sampling.py):
key:   the user's index
value: the set of training-sample indices assigned to that user, i.e.
dict_users[i] = set(np.random.choice(all_idxs, num_items,
                                     replace=False))
So idxs=user_groups[idx] hands this client the subscripts of all the
samples it trains on.
'''
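'''
A minimal sketch of the IID partition described above (paraphrased from
these notes on sampling.py, not copied verbatim from the repo):

import numpy as np

def mnist_iid(dataset, num_users):
    # Split the dataset indices into num_users equal IID shards -- sketch.
    num_items = int(len(dataset) / num_users)
    dict_users, all_idxs = {}, list(range(len(dataset)))
    for i in range(num_users):
        dict_users[i] = set(np.random.choice(all_idxs, num_items,
                                             replace=False))
        all_idxs = list(set(all_idxs) - dict_users[i])  # no index reused
    return dict_users
'''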
w, loss = local_model.update_weights(
model=copy.deepcopy(global_model), global_round=epoch)
'''
Calls the update_weights function in update.py:
for iter in range(self.args.local_ep) -- the client runs local_ep (10)
local epochs, then returns:
return model.state_dict(), sum(epoch_loss) / len(epoch_loss)
'''
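'''
A condensed sketch of what update_weights does, stitched together from
the lines quoted in these notes (the optimizer choice and exact loop
structure are assumptions; self.trainloader, self.device, self.criterion
and self.args come from LocalUpdate in update.py):

def update_weights(self, model, global_round):
    # Sketch -- optimizer and loop details assumed, not copied from the repo.
    model.train()
    epoch_loss = []
    optimizer = torch.optim.SGD(model.parameters(), lr=self.args.lr)
    for it in range(self.args.local_ep):   # local_ep local epochs
        batch_loss = []
        for images, labels in self.trainloader:
            images, labels = images.to(self.device), labels.to(self.device)
            model.zero_grad()
            log_probs = model(images)
            loss = self.criterion(log_probs, labels)
            loss.backward()
            optimizer.step()
            batch_loss.append(loss.item())
        epoch_loss.append(sum(batch_loss) / len(batch_loss))
    return model.state_dict(), sum(epoch_loss) / len(epoch_loss)
'''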
# print('99999999999999')
# print(len(w)) # 4
# print(type(w)) # <class 'collections.OrderedDict'>
# print(len(w[0]))
local_weights.append(copy.deepcopy(w))
'''
Note: local_weights does not receive w by assignment;
w is appended to the local_weights list.
'''
# print(local_weights)
'''
local_weights is of type <class 'list'>, holding OrderedDicts:
[OrderedDict([
('layer_input.weight', tensor([[ 0.0153, -0.0127, -0.0007, ..., -0.0012, 0.0151, 0.0063],
[ 0.0041, -0.0144, 0.0049, ..., 0.0157, 0.0007, -0.0153],
...,
[-0.0050, 0.0112, 0.0106, ..., -0.0072, -0.0117, -0.0039]])),
('layer_input.bias', tensor([ 0.0052, -0.0118, -0.0131, 0.0120, ..., 0.0047])),
('layer_hidden.weight', tensor([[ 0.0995, 0.1264, -0.0289, -0.0753, ...0.1195],
...,
[ 0.0995, 0.1264, -0.0289, -0.0753, ...0.1195]])
)
('layer_hidden.bias', tensor([ 0.0422, 0.0937, -0.1109, -0.1184, 0.0178, -0.0370, -0.0875, -0.0417,
0.1082, -0.0144])
)
])
]
'''
local_losses.append(copy.deepcopy(loss))
# print('000000000000000000')
# print(local_weights[9])
# print('+++++++++++++++++++++++++++')
# print(local_weights[0]['layer_input.weight'])
'''
Getting a tensor's shape:
print(local_weights[1]['layer_input.weight'].shape)
torch.Size([64, 3072])
3072 = 3 * 32 * 32, matching
(layer_input): Linear(in_features=3072, out_features=64, bias=True)
(just as nn.Linear(16 * 5 * 5, 120) has weight torch.Size([120, 400])).
The 3072 confirms we are in the args.model == 'mlp' branch above.
local_weights[1]['layer_input.weight']:
tensor([[ 0.0053, -0.0159,  0.0121,  ...,  0.0102,  0.0121,  0.0087],
        [-0.0181,  0.0125,  0.0130,  ...,  0.0134,  0.0020,  0.0107],
        ...,
        [ 0.0023, -0.0054, -0.0015,  ...,  0.0022,  0.0147,  0.0071]])
'''
w_avg = copy.deepcopy(local_weights[0])
# print(len(local_weights)) # 10
for key in w_avg.keys():
for i in range(1, len(local_weights)):
# range(1, 10): 1,2,...,9 -- adds the remaining 9 client tensors;
# the div(10) below completes the average, since local_weights[0]
# was already used to initialize w_avg via copy.deepcopy.
w_avg[key] += local_weights[i][key]
# print('+==========================')
# print(w_avg['layer_input.weight'])
'''
The summed weights look like:
tensor([[-0.0397, -0.0698,  0.0882,  ..., -0.1417,  0.1702, -0.1217],
        ...
        [-0.1882,  0.0659,  0.0673,  ...,  0.0293, -0.0446,  0.0299]])
'''
w_avg[key] = torch.div(w_avg[key], len(local_weights))
''' Bug in the original source: it divided by len(w), as in the commented-out line below; the divisor should be len(local_weights). '''
# w_avg[key] = torch.div(w_avg[key], len(w))
# print(')))))))))))))))))))))))0')
# print(w[0])
# print(w)       # the same as any local_weights[i]
# print(len(w))  # 4 -- w has four key-value pairs
# print(type(w)) # <class 'collections.OrderedDict'>
'''
w is an OrderedDict, not a list: len(w) = 4 is the number of parameter
tensors, while len(local_weights) = 10 is the number of clients.
Dividing the summed weights by len(w) = 4 is therefore wrong;
it must be len(local_weights).
'''
'''
print(w_avg.keys()):
odict_keys(['layer_input.weight',
            'layer_input.bias',
            'layer_hidden.weight',
            'layer_hidden.bias'])
'''
'''
OrderedDict (an order-preserving dict):
print(local_weights)
[OrderedDict([('layer_input.weight', tensor([[-0.0026,  0.0088,  0.0002,  ...,  0.0076,  0.0010,  0.0020],
        [-0.0053,  0.0178,  0.0017,  ..., -0.0125,  0.0014, -0.0052],
        [-0.0076,  0.0077,  0.0046,  ...,  0.0105,  0.0071,  0.0129], ...
print(local_weights[0])
OrderedDict([('layer_input.weight', tensor([[-0.0068, -0.0142,  0.0133,  ..., -0.0147,  0.0048,  0.0156],
        [ 0.0089, -0.0090, -0.0009,  ..., -0.0061, -0.0013, -0.0005], ...
'''
# print(' type ')
# print(type(local_weights)) # <class 'list'>
# print(type(local_weights[0])) # <class 'collections.OrderedDict'>
# update global weights
'''
Update the global weights. The argument is local_weights, built above by
local_weights.append(copy.deepcopy(w)), so len(local_weights) = 10.
'''
# print(len(local_weights))
# Average the ten client weight dicts into the new global weights
global_weights = average_weights(local_weights)
'''
The for loop above therefore serves no purpose: the w_avg it computes is
never used. It merely re-implements average_weights() from utils.py,
which is the function actually called here:
global_weights = average_weights(local_weights)
'''
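'''
average_weights in utils.py presumably packages that same loop as a
function (a sketch consistent with these notes; here dividing by len(w)
is correct, because w is the list of client state_dicts):

import copy
import torch

def average_weights(w):
    # Sketch of utils.py's average_weights as described in these notes.
    # w: list of client state_dicts; returns their element-wise mean (FedAvg).
    w_avg = copy.deepcopy(w[0])
    for key in w_avg.keys():
        for i in range(1, len(w)):
            w_avg[key] += w[i][key]
        w_avg[key] = torch.div(w_avg[key], len(w))
    return w_avg
'''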
# update global weights
global_model.load_state_dict(global_weights)
'''
load_state_dict() loads the given parameter weights into the model
(here, the freshly averaged global weights).
'''
# print('loooooooooooooooooo')
# print(local_losses)
loss_avg = sum(local_losses) / len(local_losses)
'''
local_losses is a list, e.g.:
[-0.1604325039871037, -0.14957306474447252,
 -0.1479335972107947, -0.17096682694740595,
 -0.15407370103523135, -0.15217236945405604,
 -0.14514834607020022, -0.1494329896569252,
 -0.1533350457623601, -0.1353217322193086]
'''
train_loss.append(loss_avg)
'''
train_loss collects one averaged loss value per round.
The loss comes from update_weights in update.py:
log_probs = model(images)
loss = self.criterion(log_probs, labels)
Below, loss = local_model.inference(model=global_model) comes from the
inference function in update.py:
outputs = model(images)
batch_loss = self.criterion(outputs, labels)
Both compute the same criterion.
'''
# Calculate avg training accuracy over all users at every epoch
list_acc, list_loss = [], []
global_model.eval()
for c in range(args.num_users):
local_model = LocalUpdate(args=args, dataset=train_dataset,
idxs=user_groups[idx], logger=logger)
acc, loss = local_model.inference(model=global_model)
list_acc.append(acc)
list_loss.append(loss)
'''
Each user's inference accuracy and loss are stored in list_acc and
list_loss. list_loss is not used; the loss curve uses
train_loss.append(loss_avg) instead.
Note: the loop above passes idxs=user_groups[idx], where idx is left over
from the training loop, so every user is evaluated on the same client's
data; user_groups[c] was presumably intended.
'''
train_accuracy.append(sum(list_acc)/len(list_acc))
# print('000000000000')
# print(train_accuracy) # [0.2199999999999999]
# print(train_accuracy[-1]) # 0.2199999999999999
# Print global training stats every print_every = 2 rounds
if (epoch+1) % print_every == 0:
print(f' \nAvg Training Stats after {epoch+1} global rounds:')
print(f'Training Loss : {np.mean(np.array(train_loss))}')
'''
train_loss holds one average loss per completed round (a flat list of
floats, not a list of lists), so np.mean(np.array(train_loss)) is the
mean over all rounds so far: the sum of all values divided by their count.
'''
print('Train Accuracy: {:.2f}% \n'.format(100*train_accuracy[-1]))
# Test inference after completion of training
test_acc, test_loss = test_inference(args, global_model, test_dataset)
print(f' \n Results after {args.epochs} global rounds of training:')
print("|---- Avg Train Accuracy: {:.2f}%".format(100*train_accuracy[-1]))
print("|---- Test Accuracy: {:.2f}%".format(100*test_acc))
# Saving the objects train_loss and train_accuracy:
file_name = '../save/objects/{}_{}_{}_C[{}]_iid[{}]_E[{}]_B[{}].pkl'.\
format(args.dataset, args.model, args.epochs, args.frac, args.iid,
args.local_ep, args.local_bs)
with open(file_name, 'wb') as f:
pickle.dump([train_loss, train_accuracy], f)
print('\n Total Run Time: {0:0.4f}'.format(time.time()-start_time))
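'''
The saved curves can be reloaded later, e.g. (the file name shown is
illustrative; it must match the format string above):

import pickle

# Path below is an example instance of the format string, not a fixed name.
with open('../save/objects/cifar_mlp_10_C[0.1]_iid[1]_E[10]_B[10].pkl', 'rb') as f:
    train_loss, train_accuracy = pickle.load(f)
'''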
Copyright notice
This article was created by [Silent city of the sky]. Please include a link to the original when reposting. Thanks.
https://yzsam.com/2022/04/202204230556365692.html