add fourth chapter
This commit is contained in:
parent
22e6d87ad9
commit
714913637b
|
|
@ -409,8 +409,8 @@ class GELU(nn.Module):
|
|||
|
||||
def forward(self, x):
|
||||
return 0.5 * x * (1 + torch.tanh(
|
||||
torch.sqrt(torch.tensor(2.0 / torch.pi)) *
|
||||
(x + 0.044715 * torch.pow(x, 3))
|
||||
torch.sqrt(torch.tensor(2.0 / torch.pi)) *
|
||||
(x + 0.044715 * torch.pow(x, 3))
|
||||
))
|
||||
```
|
||||
|
||||
|
|
@ -456,7 +456,7 @@ class FeedForward(nn.Module):
|
|||
)
|
||||
|
||||
def forward(self, x):
|
||||
return self.layers(x)
|
||||
return self.layers(x)
|
||||
```
|
||||
|
||||
如代码所示,FeedForward 模块是一个小型神经网络,由两个线性层和一个 GELU 激活函数组成。在 1.24 亿参数的 GPT 模型中,该模块接收批量输入,其中每个输入 token 都表示为一个 768 维的向量。这一嵌入维度由配置字典 `GPT_CONFIG_124M` 中的 `emb_dim` 项(即 `GPT_CONFIG_124M["emb_dim"]`)指定。
|
||||
|
|
@ -539,9 +539,9 @@ class ExampleDeepNeuralNetwork(nn.Module):
|
|||
layer_output = layer(x)
|
||||
# Check if shortcut can be applied
|
||||
if self.use_shortcut and x.shape == layer_output.shape:
|
||||
x = x + layer_output
|
||||
x = x + layer_output
|
||||
else:
|
||||
x = layer_output
|
||||
x = layer_output
|
||||
return x
|
||||
```
|
||||
|
||||
|
|
@ -554,7 +554,7 @@ layer_sizes = [3, 3, 3, 3, 3, 1]
|
|||
sample_input = torch.tensor([[1., 0., -1.]])
|
||||
torch.manual_seed(123) # specify random seed for the initial weights for reproducibility
|
||||
model_without_shortcut = ExampleDeepNeuralNetwork(
|
||||
layer_sizes, use_shortcut=False
|
||||
layer_sizes, use_shortcut=False
|
||||
)
|
||||
```
|
||||
|
||||
|
|
@ -607,7 +607,7 @@ layers.4.0.weight has gradient mean of 0.005049646366387606
|
|||
```python
|
||||
torch.manual_seed(123)
|
||||
model_with_shortcut = ExampleDeepNeuralNetwork(
|
||||
layer_sizes, use_shortcut=True
|
||||
layer_sizes, use_shortcut=True
|
||||
)
|
||||
print_gradients(model_with_shortcut, sample_input)
|
||||
```
|
||||
|
|
@ -804,11 +804,11 @@ class GPTModel(nn.Module):
|
|||
self.drop_emb = nn.Dropout(cfg["drop_rate"])
|
||||
|
||||
self.trf_blocks = nn.Sequential(
|
||||
*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
|
||||
*[TransformerBlock(cfg) for _ in range(cfg["n_layers"])])
|
||||
|
||||
self.final_norm = LayerNorm(cfg["emb_dim"])
|
||||
self.out_head = nn.Linear(
|
||||
cfg["emb_dim"], cfg["vocab_size"], bias=False
|
||||
cfg["emb_dim"], cfg["vocab_size"], bias=False
|
||||
)
|
||||
|
||||
def forward(self, in_idx):
|
||||
|
|
@ -851,7 +851,7 @@ print(out)
|
|||
```python
|
||||
Input batch:
|
||||
tensor([[ 6109, 3626, 6100, 345], # token IDs of text 1
|
||||
[ 6109, 1110, 6622, 257]]) # token IDs of text 2
|
||||
[ 6109, 1110, 6622, 257]]) # token IDs of text 2
|
||||
|
||||
Output shape: torch.Size([2, 4, 50257])
|
||||
tensor([[[ 0.3613, 0.4222, -0.0711, ..., 0.3483, 0.4661, -0.2838],
|
||||
|
|
|
|||
Loading…
Reference in New Issue