add fourth chapter

This commit is contained in:
skindhu 2024-11-05 17:46:19 +08:00
parent be238a37c5
commit d9e1a27bf2
1 changed files with 15 additions and 14 deletions

View File

@ -164,7 +164,7 @@ inputs = torch.tensor(
[[0.43, 0.15, 0.89], # Your (x^1)
[0.55, 0.87, 0.66], # journey (x^2)
[0.57, 0.85, 0.64], # starts (x^3)
[0.22, 0.58, 0.33], # with (x^4)
[0.22, 0.58, 0.33], # with (x^4)
[0.77, 0.25, 0.10], # one (x^5)
[0.05, 0.80, 0.55]] # step (x^6)
@ -233,7 +233,7 @@ tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
> ```python
> res = 0.
> for idx, element in enumerate(inputs[0]):
> res += inputs[0][idx] * query[idx]
> res += inputs[0][idx] * query[idx]
> print(res)
> print(torch.dot(inputs[0], query))
> ```
@ -609,7 +609,7 @@ print(attn_scores_2)
可以看到,输出中的第二个元素与我们之前计算的 `attn_score_22` 相同:
```python
tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])
tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])
```
第三步是将注意力得分转换为注意力权重,如图 3.16 所示。
@ -627,7 +627,7 @@ print(attn_weights_2)
结果如下:
```python
tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
```
> [!NOTE]
@ -929,13 +929,13 @@ print(masked)
由此生成以下掩码:
```python
tensor([[0.2899, -inf, -inf, -inf, -inf, -inf],
[0.4656, 0.1723, -inf, -inf, -inf, -inf],
[0.4594, 0.1703, 0.1731, -inf, -inf, -inf],
[0.2642, 0.1024, 0.1036, 0.0186, -inf, -inf],
[0.2183, 0.0874, 0.0882, 0.0177, 0.0786, -inf],
[0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
grad_fn=<MaskedFillBackward0>)
tensor([[0.2899, -inf, -inf, -inf, -inf, -inf],
[0.4656, 0.1723, -inf, -inf, -inf, -inf],
[0.4594, 0.1703, 0.1731, -inf, -inf, -inf],
[0.2642, 0.1024, 0.1036, 0.0186, -inf, -inf],
[0.2183, 0.0874, 0.0882, 0.0177, 0.0786, -inf],
[0.3408, 0.1270, 0.1290, 0.0198, 0.1290, 0.0078]],
grad_fn=<MaskedFillBackward0>)
```
现在我们只需要对这些掩码后的结果应用 softmax 函数,就可以完成了:
@ -1076,7 +1076,7 @@ print(batch.shape) #A
以上代码生成一个三维张量,包含 2 个输入文本,每个文本包含 6 个 token,每个 token 表示为一个 3 维嵌入向量:
```python
torch.Size([2, 6, 3])
torch.Size([2, 6, 3])
```
以下的 CausalAttention 类与我们之前实现的 SelfAttention 类类似,不同之处在于我们现在添加了 dropout 和因果掩码组件,如以下代码所示:
@ -1095,7 +1095,7 @@ class CausalAttention(nn.Module):
'mask',
torch.triu(torch.ones(context_length, context_length),
diagonal=1)
) #B
) #B
def forward(self, x):
b, num_tokens, d_in = x.shape #C
@ -1204,6 +1204,7 @@ tensor([[[-0.4519, 0.2216, 0.4772, 0.1063],
[-0.5675, -0.0843, 0.5478, 0.3589],
[-0.5526, -0.0981, 0.5321, 0.3428],
[-0.5299, -0.1081, 0.5077, 0.3493]],
[[-0.4519, 0.2216, 0.4772, 0.1063],
[-0.5874, 0.0058, 0.5891, 0.3257],
[-0.6300, -0.0632, 0.6202, 0.3860],
@ -1253,7 +1254,7 @@ class MultiHeadAttention(nn.Module):
self.register_buffer(
'mask',
torch.triu(torch.ones(context_length, context_length), diagonal=1)
)
)
def forward(self, x):