diff --git a/cn-Book/3.实现注意力机制.md b/cn-Book/3.实现注意力机制.md
index 9d5c59d..f32ae1f 100644
--- a/cn-Book/3.实现注意力机制.md
+++ b/cn-Book/3.实现注意力机制.md
@@ -220,7 +220,7 @@ print(attn_scores_2)
 
 计算得到的注意力得分如下：
 
-```
+```python
 tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
 ```
 
@@ -240,7 +240,7 @@ tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
 >
 > 输出结果确认，逐元素相乘的和与点积的结果相同。
 >
-> ```
+> ```python
 > tensor(0.9544)
 > tensor(0.9544)
 > ```
@@ -261,7 +261,7 @@ print("Sum:", attn_weights_2_tmp.sum())
 
 如输出所示，现在注意力权重的总和为 1：
 
-```
+```python
 Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
 Sum: tensor(1.0000)
 ```
@@ -278,7 +278,7 @@ print("Sum:", attn_weights_2_naive.sum())
 
 从输出中可以看到，softmax 函数可以实现注意力权重的归一化，使它们的总和为 1：
 
-```
+```python
 Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
 Sum: tensor(1.)
 ```
@@ -295,7 +295,7 @@ print("Sum:", attn_weights_2.sum())
 
 可以看到，它与我们之前实现的 `softmax_naive` 函数产生的结果相同。
 
-```
+```python
 Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
 Sum: tensor(1.)
 ```
@@ -342,7 +342,7 @@ print(context_vec_2)
 
 计算结果如下：
 
-```
+```python
 tensor([0.4419, 0.6515, 0.5683])
 ```
 
@@ -372,7 +372,7 @@ print(attn_scores)
 
 计算得到的注意力分数集合如下：
 
-```
+```python
 tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
         [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
         [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
@@ -392,7 +392,7 @@ print(attn_scores)
 
 可以看到，结果与之前一致：
 
-```
+```python
 tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
         [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
         [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],
@@ -410,7 +410,7 @@ print(attn_weights)
 
 执行上述代码返回的注意力权重张量与图 3.10 中显示的数值一致：
 
-```
+```python
 tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
         [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
         [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],
@@ -431,7 +431,7 @@ print("All row sums:", attn_weights.sum(dim=-1))
 
 结果如下：
 
-```
+```python
 Row 2 sum: 1.0
 All row sums: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])
 ```
@@ -445,7 +445,7 @@ print(all_context_vecs)
 
 可以看到，计算输出的张量中，每一行包含一个三维的上下文向量：
 
-```
+```python
 tensor([[0.4421, 0.5931, 0.5790],
         [0.4419, 0.6515, 0.5683],
         [0.4431, 0.6496, 0.5671],
@@ -462,7 +462,7 @@ tensor([[0.4421, 0.5931, 0.5790],
 
 根据结果，我们可以看到之前计算的 `context_vec_2` 与以上张量的第二行完全一致：
 
-```
+```python
  Previous 2nd context vector: tensor([0.4419, 0.6515, 0.5683])
 ```
 
@@ -528,7 +528,7 @@ print(query_2)
 
 以上代码的输出是一个二维向量，因为我们将对应的输出权重矩阵的列数通过 `d_out` 参数设置为 2：
 
-```
+```python
  tensor([0.4306, 1.4551])
 ```
 
@@ -553,7 +553,7 @@ print("values.shape:", values.shape)
 
 从输出结果可以看出，我们成功地将 6 个输入 token 从 3 维嵌入空间投影到 2 维嵌入空间：
 
-```
+```python
 keys.shape: torch.Size([6, 2])
 values.shape: torch.Size([6, 2])
 ```
@@ -574,7 +574,7 @@ print(attn_score_22)
 
 由此得到以下未经归一化的注意力得分：
 
-```
+```python
  tensor(1.8524)
 ```
 
@@ -608,7 +608,7 @@ print(attn_scores_2)
 
 可以看到，输出中的第二个元素与我们之前计算的 `attn_score_22` 相同：
 
-```
+```python
  tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])
 ```
 
@@ -626,7 +626,7 @@ print(attn_weights_2)
 
 结果如下：
 
-```
+```python
  tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
 ```
 
@@ -660,7 +660,7 @@ print(context_vec_2)
 
 结果如下：
 
-```
+```python
 tensor([0.3061, 0.8210])
 ```
 
@@ -722,7 +722,7 @@ print(sa_v1(inputs))
 
 由于输入包含六个嵌入向量，因此会生成一个用于存储这六个上下文向量的矩阵:
 
-```
+```python
 tensor([[0.2996, 0.8053],
         [0.3061, 0.8210],
         [0.3058, 0.8203],
@@ -771,7 +771,7 @@ print(sa_v2(inputs))
 
 输出如下：
 
-```
+```python
 tensor([[-0.0739,  0.0713],
         [-0.0748,  0.0703],
         [-0.0749,  0.0702],
@@ -834,7 +834,7 @@ print(attn_weights)
 
 这会得到以下注意力权重：
 
-```
+```python
 tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
         [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
         [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],
@@ -854,7 +854,7 @@ print(mask_simple)
 
 生成的掩码如下所示：
 
-```
+```python
 tensor([[1., 0., 0., 0., 0., 0.],
         [1., 1., 0., 0., 0., 0.],
         [1., 1., 1., 0., 0., 0.],
@@ -872,7 +872,7 @@ print(masked_simple)
 
 可以看到，对角线以上的元素已成功被置零：
 
-```
+```python
 tensor([[0.1921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2041, 0.1659, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2036, 0.1659, 0.1662, 0.0000, 0.0000, 0.0000],
@@ -892,7 +892,7 @@ print(masked_simple_norm)
 
 最终得到的注意力权重矩阵具有以下特性：主对角线以上的注意力权重被置零，每一行的权重和为 1：
 
-```
+```python
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
@@ -928,7 +928,7 @@ print(masked)
 
 由此生成以下掩码：
 
-```
+```python
 tensor([[0.2899,   -inf,   -inf,   -inf,   -inf,   -inf],
         [0.4656, 0.1723,   -inf,   -inf,   -inf,   -inf],
         [0.4594, 0.1703, 0.1731,   -inf,   -inf,   -inf],
@@ -947,7 +947,7 @@ print(attn_weights)
 
 如输出所示，每一行的值之和为 1，因此不再需要进一步的归一化：
 
-```
+```python
 tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],
@@ -988,7 +988,7 @@ print(dropout(example))
 
 如我们所见，约一半的数值被置零：
 
-```
+```python
 tensor([[2., 2., 0., 2., 2., 0.],
         [0., 0., 0., 2., 0., 2.],
         [2., 2., 2., 2., 0., 2.],
@@ -1042,7 +1042,7 @@ print(dropout(attn_weights))
 
 由此生成的注意力权重矩阵中，部分元素被置零，剩余的元素重新进行了缩放：
 
-```
+```python
 tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.7599, 0.6194, 0.6206, 0.0000, 0.0000, 0.0000],
@@ -1075,7 +1075,7 @@ print(batch.shape)                                              #A
 
 以上代码生成一个三维张量，包含 2 个输入文本，每个文本包含 6 个 token，每个 token 表示为一个 3 维嵌入向量：
 
-```
+```python
  torch.Size([2, 6, 3])
 ```
 
@@ -1131,7 +1131,7 @@ print("context_vecs.shape:", context_vecs.shape)
 
 生成的上下文向量是一个三维张量，其中每个 token 现在都表示为一个二维嵌入：
 
-```
+```python
 context_vecs.shape: torch.Size([2, 6, 2])
 ```
 
@@ -1197,7 +1197,7 @@ print("context_vecs.shape:", context_vecs.shape)
 
 以上代码输出的上下文向量如下所示：
 
-```
+```python
 tensor([[[-0.4519,  0.2216,  0.4772,  0.1063],
          [-0.5874,  0.0058,  0.5891,  0.3257],
          [-0.6300, -0.0632,  0.6202,  0.3860],
@@ -1332,7 +1332,7 @@ print(a @ a.transpose(2, 3))
 
 结果如下：
 
-```
+```python
 tensor([[[[1.3208, 1.1631, 1.2879],
           [1.1631, 2.2150, 1.8424],
           [1.2879, 1.8424, 2.0402]],
@@ -1356,7 +1356,7 @@ print("\nSecond head:\n", second_res)
 
 该结果与我们之前使用批量矩阵乘法 `print(a @ a.transpose(2, 3))` 时获得的结果完全相同：
 
-```
+```python
 First head:
  tensor([[1.3208, 1.1631, 1.2879],
         [1.1631, 2.2150, 1.8424],
@@ -1387,7 +1387,7 @@ print("context_vecs.shape:", context_vecs.shape)
 
 从结果可以看出，输出维度是由 `d_out` 参数直接控制的：
 
-```
+```python
 tensor([[[0.3190, 0.4858],
          [0.2943, 0.3897],
          [0.2856, 0.3593],
diff --git a/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md b/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md
index 22e44b2..2820af5 100644
--- a/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md	
+++ b/cn-Book/4.从零开始实现一个用于文本生成的 GPT 模型.md	
@@ -89,10 +89,10 @@ class DummyGPTModel(nn.Module):
         self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
         self.drop_emb = nn.Dropout(cfg["drop_rate"])
         self.trf_blocks = nn.Sequential(
-       		 *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])      #A
+            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])])      #A
         self.final_norm = DummyLayerNorm(cfg["emb_dim"])                       #B
         self.out_head = nn.Linear(
-       		 cfg["emb_dim"], cfg["vocab_size"], bias=False
+            cfg["emb_dim"], cfg["vocab_size"], bias=False
         )
 
     def forward(self, in_idx):