From 714913637bd5961c8f9cc71fa28132f63d339dab Mon Sep 17 00:00:00 2001 From: skindhu Date: Tue, 5 Nov 2024 17:27:30 +0800 Subject: [PATCH] add fourth chapter --- ...从零开始实现一个用于文本生成的 GPT 模型.md | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md b/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md index e5592c1..22e44b2 100644 --- a/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md +++ b/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md @@ -409,8 +409,8 @@ class GELU(nn.Module): def forward(self, x): return 0.5 * x * (1 + torch.tanh( - torch.sqrt(torch.tensor(2.0 / torch.pi)) * - (x + 0.044715 * torch.pow(x, 3)) + torch.sqrt(torch.tensor(2.0 / torch.pi)) * + (x + 0.044715 * torch.pow(x, 3)) )) ``` @@ -456,7 +456,7 @@ class FeedForward(nn.Module): ) def forward(self, x): - return self.layers(x) + return self.layers(x) ``` 如代码所示,FeedForward 模块是一个小型神经网络,由两个线性层和一个 GELU 激活函数组成。在 1.24 亿参数的 GPT 模型中,该模块可以接收批量输入,每个输入 token 是一个 768 维的向量表示。这一嵌入维度大小通过 `GPT_CONFIG_124M` 配置字典中的 `GPT_CONFIG_124M["emb_dim"]` 参数指定。 @@ -539,9 +539,9 @@ class ExampleDeepNeuralNetwork(nn.Module): layer_output = layer(x) # Check if shortcut can be applied if self.use_shortcut and x.shape == layer_output.shape: - x = x + layer_output + x = x + layer_output else: - x = layer_output + x = layer_output return x ``` @@ -554,7 +554,7 @@ layer_sizes = [3, 3, 3, 3, 3, 1] sample_input = torch.tensor([[1., 0., -1.]]) torch.manual_seed(123) # specify random seed for the initial weights for reproducibility model_without_shortcut = ExampleDeepNeuralNetwork( - layer_sizes, use_shortcut=False + layer_sizes, use_shortcut=False ) ``` @@ -607,7 +607,7 @@ layers.4.0.weight has gradient mean of 0.005049646366387606 ```python torch.manual_seed(123) model_with_shortcut = ExampleDeepNeuralNetwork( - layer_sizes, use_shortcut=True + layer_sizes, use_shortcut=True ) print_gradients(model_with_shortcut, sample_input) ``` @@ -804,11 +804,11 @@ class GPTModel(nn.Module): self.drop_emb = nn.Dropout(cfg["drop_rate"]) self.trf_blocks = nn.Sequential( - *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) + *[TransformerBlock(cfg) for _ in range(cfg["n_layers"])]) self.final_norm = LayerNorm(cfg["emb_dim"]) self.out_head = nn.Linear( - cfg["emb_dim"], cfg["vocab_size"], bias=False + cfg["emb_dim"], cfg["vocab_size"], bias=False ) def forward(self, in_idx): @@ -851,7 +851,7 @@ print(out) ```python Input batch: tensor([[ 6109, 3626, 6100, 345], # token IDs of text 1 - [ 6109, 1110, 6622, 257]]) # token IDs of text 2 + [ 6109, 1110, 6622, 257]]) # token IDs of text 2 Output shape: torch.Size([2, 4, 50257]) tensor([[[ 0.3613, 0.4222, -0.0711, ..., 0.3483, 0.4661, -0.2838],