PyTorch: freeze part of the layers
In PyTorch we can freeze a layer by setting the requires_grad of its parameters to False. Freezing weights is useful when we want to fine-tune a pretrained model.
Here I’d like to explore this process.
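Before the toy example, here is a minimal sketch of that transfer-learning use case (it assumes torchvision is installed; resnet18, the pretrained flag, and the 10-class head are just illustrative choices, and the exact weights argument depends on your torchvision version):

import torch.nn as nn
import torch.optim as optim
import torchvision

# Load a pretrained backbone (newer torchvision versions use the
# weights=... argument instead of pretrained=True).
model = torchvision.models.resnet18(pretrained=True)

# Freeze every existing parameter.
for param in model.parameters():
    param.requires_grad = False

# Replace the classification head; a newly created layer has
# requires_grad=True by default, so only this layer will be trained.
model.fc = nn.Linear(model.fc.in_features, 10)

optimizer = optim.SGD(model.fc.parameters(), lr=0.01)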
Build a toy model
import torch.nn as nn
from torch.autograd import Variable
import torch.optim as optim

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2, 4)
        self.relu1 = nn.ReLU()
        # self.dout = nn.Dropout(0.2)
        self.fc2 = nn.Linear(4, 3)
        self.relu2 = nn.ReLU(1)
        self.out = nn.Linear(3, 1)
        self.out_act = nn.Sigmoid()

    def forward(self, inputs):
        a1 = self.fc1(inputs)
        h1 = self.relu1(a1)
        a2 = self.fc2(h1)
        h2 = self.relu2(a2)
        a3 = self.out(h2)
        y = self.out_act(a3)
        return y
If we run it in a Python interpreter, we can explore the parameters:
>>> import torch.nn as nn
>>> from torch.autograd import Variable
>>> import torch.optim as optim
>>> class Net(nn.Module):
...
...     def __init__(self):
...         super().__init__()
...         self.fc1 = nn.Linear(2, 4)
...         self.relu1 = nn.ReLU()
...         #self.dout = nn.Dropout(0.2)
...         self.fc2 = nn.Linear(4, 3)
...         self.relu2 = nn.ReLU(1)
...         self.out = nn.Linear(3, 1)
...         self.out_act = nn.Sigmoid()
...
...     def forward(self, inputs):
...         a1 = self.fc1(inputs)
...         h1 = self.relu1(a1)
...         a2 = self.fc2(h1)
...         h2 = self.relu2(a2)
...         a3 = self.out(h2)
...         y = self.out_act(a3)
...         return y
...
Output the parameters
>>> net = Net()
>>> for para in net.parameters():
...     print(para)
...
Parameter containing:
tensor([[-0.1833, -0.3816],
[-0.4710, 0.3696],
[-0.1283, 0.6524],
[ 0.5941, -0.3848]], requires_grad=True)
Parameter containing:
tensor([ 0.6866, -0.5329, 0.6027, -0.6733], requires_grad=True)
Parameter containing:
tensor([[-0.0581, -0.1891, -0.4522, -0.1631],
[ 0.4902, 0.0954, -0.2497, -0.4682],
[ 0.0365, -0.0082, -0.4446, -0.0282]], requires_grad=True)
Parameter containing:
tensor([-0.1120, 0.0109, -0.0787], requires_grad=True)
Parameter containing:
tensor([[ 0.3279, 0.1556, -0.1405]], requires_grad=True)
Parameter containing:
tensor([0.1250], requires_grad=True)
Set requires_grad to False
>>> for para in net.parameters():
...     para.requires_grad = False
...     print(para)
...
Parameter containing:
tensor([[-0.1833, -0.3816],
[-0.4710, 0.3696],
[-0.1283, 0.6524],
[ 0.5941, -0.3848]])
Parameter containing:
tensor([ 0.6866, -0.5329, 0.6027, -0.6733])
Parameter containing:
tensor([[-0.0581, -0.1891, -0.4522, -0.1631],
[ 0.4902, 0.0954, -0.2497, -0.4682],
[ 0.0365, -0.0082, -0.4446, -0.0282]])
Parameter containing:
tensor([-0.1120, 0.0109, -0.0787])
Parameter containing:
tensor([[ 0.3279, 0.1556, -0.1405]])
Parameter containing:
tensor([0.1250])
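As a quick sanity check (an extra snippet, not part of the original session): with every parameter frozen, a forward pass gives an output that is detached from autograd, so backward() would have nothing to update.
>>> import torch
>>> y = net(torch.randn(1, 2))
>>> y.requires_grad
False
>>> y.grad_fn is None
True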
Set requires_grad back to True
>>> for para in net.parameters():
...     para.requires_grad = True
...     print(para)
...
Parameter containing:
tensor([[-0.1833, -0.3816],
[-0.4710, 0.3696],
[-0.1283, 0.6524],
[ 0.5941, -0.3848]], requires_grad=True)
Parameter containing:
tensor([ 0.6866, -0.5329, 0.6027, -0.6733], requires_grad=True)
Parameter containing:
tensor([[-0.0581, -0.1891, -0.4522, -0.1631],
[ 0.4902, 0.0954, -0.2497, -0.4682],
[ 0.0365, -0.0082, -0.4446, -0.0282]], requires_grad=True)
Parameter containing:
tensor([-0.1120, 0.0109, -0.0787], requires_grad=True)
Parameter containing:
tensor([[ 0.3279, 0.1556, -0.1405]], requires_grad=True)
Parameter containing:
tensor([0.1250], requires_grad=True)
Freeze part of the parameters
For example, freeze only the fc1 layer.
Get the parameter keys
>>> params = net.state_dict()
>>> params.keys()
odict_keys(['fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias', 'out.weight', 'out.bias'])
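Note that state_dict() is intended for saving and loading weights; its keys match the names returned by named_parameters(), which is where the actual Parameter objects (the ones whose requires_grad we want to flip) live. A quick check:
>>> name_to_param = dict(net.named_parameters())
>>> name_to_param['fc1.weight'] is net.fc1.weight
True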
Set the related layer’s requires_grad to False (a naive way)
>>> keys = list(params.keys())
>>> keys[0]
'fc1.weight'
>>> net.fc1.weight.requires_grad = False
>>> for para in net.parameters():print(para)
...
Parameter containing:
tensor([[-0.1833, -0.3816],
[-0.4710, 0.3696],
[-0.1283, 0.6524],
[ 0.5941, -0.3848]])
Parameter containing:
tensor([ 0.6866, -0.5329, 0.6027, -0.6733], requires_grad=True)
Parameter containing:
tensor([[-0.0581, -0.1891, -0.4522, -0.1631],
[ 0.4902, 0.0954, -0.2497, -0.4682],
[ 0.0365, -0.0082, -0.4446, -0.0282]], requires_grad=True)
Parameter containing:
tensor([-0.1120, 0.0109, -0.0787], requires_grad=True)
Parameter containing:
tensor([[ 0.3279, 0.1556, -0.1405]], requires_grad=True)
Parameter containing:
tensor([0.1250], requires_grad=True)
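Note that the line above only froze fc1.weight; fc1.bias still requires grad. If the goal is to freeze the whole layer, recent PyTorch versions also offer nn.Module.requires_grad_, which flips every parameter of a module in one call. A small sketch on a throwaway instance (so the running session above is left untouched):

net2 = Net()                      # fresh instance, just for illustration
net2.fc1.requires_grad_(False)    # freezes fc1.weight and fc1.bias together

for name, param in net2.named_parameters():
    print(name, param.requires_grad)
# fc1.weight False
# fc1.bias False
# fc2.weight True
# fc2.bias True
# out.weight True
# out.bias True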
A better way
We can identify the parameters by name[2]:
>>> net.fc1.weight.requires_grad = True
>>> for name, param in net.named_parameters():
...     if param.requires_grad: print(name)
...
fc1.weight
fc1.bias
fc2.weight
fc2.bias
out.weight
out.bias
Then we can control requires_grad by filtering on the parameter names:
>>> for name, param in net.named_parameters():
...     if param.requires_grad and 'fc1' in name:
...         param.requires_grad = False
...
>>> for name, param in net.named_parameters():print(name, param)
...
fc1.weight Parameter containing:
tensor([[-0.1833, -0.3816],
[-0.4710, 0.3696],
[-0.1283, 0.6524],
[ 0.5941, -0.3848]])
fc1.bias Parameter containing:
tensor([ 0.6866, -0.5329, 0.6027, -0.6733])
fc2.weight Parameter containing:
tensor([[-0.0581, -0.1891, -0.4522, -0.1631],
[ 0.4902, 0.0954, -0.2497, -0.4682],
[ 0.0365, -0.0082, -0.4446, -0.0282]], requires_grad=True)
fc2.bias Parameter containing:
tensor([-0.1120, 0.0109, -0.0787], requires_grad=True)
out.weight Parameter containing:
tensor([[ 0.3279, 0.1556, -0.1405]], requires_grad=True)
out.bias Parameter containing:
tensor([0.1250], requires_grad=True)
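For convenience we could wrap this filtering in a small helper (a sketch; freeze_by_names is not a PyTorch function, just an illustrative name):

def freeze_by_names(model, substrings):
    """Set requires_grad=False for every parameter whose name
    contains any of the given substrings."""
    for name, param in model.named_parameters():
        if any(s in name for s in substrings):
            param.requires_grad = False

# Usage: freeze_by_names(model, ['fc1', 'fc2']) would freeze both
# fully connected layers of the toy model, leaving only `out` trainable.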
One last step
We are not done yet. Even though requires_grad is set to False, the weights can still be updated directly:
>>> net.fc1.weight -= 0.1*net.fc1.weight
>>> for name, param in net.named_parameters():print(name, param)
...
fc1.weight Parameter containing:
tensor([[-0.1650, -0.3435],
[-0.4239, 0.3326],
[-0.1154, 0.5872],
[ 0.5347, -0.3463]])
fc1.bias Parameter containing:
tensor([ 0.6866, -0.5329, 0.6027, -0.6733])
fc2.weight Parameter containing:
tensor([[-0.0581, -0.1891, -0.4522, -0.1631],
[ 0.4902, 0.0954, -0.2497, -0.4682],
[ 0.0365, -0.0082, -0.4446, -0.0282]], requires_grad=True)
fc2.bias Parameter containing:
tensor([-0.1120, 0.0109, -0.0787], requires_grad=True)
out.weight Parameter containing:
tensor([[ 0.3279, 0.1556, -0.1405]], requires_grad=True)
out.bias Parameter containing:
tensor([0.1250], requires_grad=True)
So when building the optimizer we should filter the parameters down to only those that require gradients[1]:
>>> optimizer = optim.SGD(filter(lambda p: p.requires_grad, net.parameters()), lr=0.1)
>>> for p in filter(lambda p: p.requires_grad, net.parameters()): print(p)
...
Parameter containing:
tensor([[-0.0581, -0.1891, -0.4522, -0.1631],
[ 0.4902, 0.0954, -0.2497, -0.4682],
[ 0.0365, -0.0082, -0.4446, -0.0282]], requires_grad=True)
Parameter containing:
tensor([-0.1120, 0.0109, -0.0787], requires_grad=True)
Parameter containing:
tensor([[ 0.3279, 0.1556, -0.1405]], requires_grad=True)
Parameter containing:
tensor([0.1250], requires_grad=True)
>>>
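To close the loop, here is a minimal training step with that filtered optimizer (a sketch: the batch x, the targets, and the BCE loss are made up for illustration), confirming that the frozen layer does not move:

import torch

x = torch.randn(8, 2)        # made-up input batch
target = torch.rand(8, 1)    # made-up labels in [0, 1]
criterion = nn.BCELoss()

fc1_before = net.fc1.weight.clone()

optimizer.zero_grad()
loss = criterion(net(x), target)
loss.backward()
optimizer.step()

print(torch.equal(net.fc1.weight, fc1_before))   # True: the frozen layer did not change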
Quick summary
We can use:
- net.state_dict() to list the keys of all parameters; printing them helps us figure out which layers we want to freeze
- net.named_parameters() to then freeze the target layers by filtering on their names
Thanks for reading and I hope it helps.