add fourth chapter
parent 714913637b
commit be238a37c5

@@ -220,7 +220,7 @@ print(attn_scores_2)

The computed attention scores are as follows:

```python
tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])
```

@@ -240,7 +240,7 @@ tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

>
> The output confirms that the sum of the element-wise products is identical to the result of the dot product:
>
> ```python
> tensor(0.9544)
> tensor(0.9544)
> ```

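To make that check concrete, here is a minimal, self-contained sketch of the same comparison. The two 3-dimensional vectors below are an assumption (they are not reproduced in this excerpt) and stand in for the first input embedding and the query:

```python
import torch

# Assumed example vectors: the first input embedding and the query (the second input).
x_1 = torch.tensor([0.43, 0.15, 0.89])
query = torch.tensor([0.55, 0.87, 0.66])

# Sum of element-wise products, accumulated in a loop ...
res = torch.tensor(0.)
for idx, element in enumerate(x_1):
    res += element * query[idx]

# ... is the same number as the dot product.
print(res)                    # tensor(0.9544)
print(torch.dot(x_1, query))  # tensor(0.9544)
```
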
@@ -261,7 +261,7 @@ print("Sum:", attn_weights_2_tmp.sum())

As the output shows, the attention weights now sum to 1:

```python
Attention weights: tensor([0.1455, 0.2278, 0.2249, 0.1285, 0.1077, 0.1656])
Sum: tensor(1.0000)
```

@@ -278,7 +278,7 @@ print("Sum:", attn_weights_2_naive.sum())

As the output shows, the softmax function also normalizes the attention weights so that they sum to 1:

```python
Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)
```

@@ -295,7 +295,7 @@ print("Sum:", attn_weights_2.sum())

As we can see, it produces the same result as our earlier `softmax_naive` implementation:

```python
Attention weights: tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
Sum: tensor(1.)
```

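As a sketch of that comparison, the snippet below contrasts a naive softmax (exponentiate, then divide by the sum) with PyTorch's built-in `torch.softmax`. The score vector is copied in from the output shown earlier so the example is self-contained:

```python
import torch

def softmax_naive(x):
    # Exponentiate each score and divide by the total; fine here, but this form
    # can overflow or underflow for large-magnitude inputs.
    return torch.exp(x) / torch.exp(x).sum(dim=0)

attn_scores_2 = torch.tensor([0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865])

print(softmax_naive(attn_scores_2))         # tensor([0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581])
print(torch.softmax(attn_scores_2, dim=0))  # same values; the built-in is numerically more stable
```
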
@@ -342,7 +342,7 @@ print(context_vec_2)

The computed result is as follows:

```python
tensor([0.4419, 0.6515, 0.5683])
```

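The context vector above is simply the attention-weighted sum of all input vectors. The sketch below reproduces that step; the six 3-dimensional input embeddings are an assumption here (they are not printed in this excerpt) and are chosen to match the numbers used throughout the chapter:

```python
import torch

# Assumed 6x3 input embeddings (one row per token).
inputs = torch.tensor(
    [[0.43, 0.15, 0.89],
     [0.55, 0.87, 0.66],
     [0.57, 0.85, 0.64],
     [0.22, 0.58, 0.33],
     [0.77, 0.25, 0.10],
     [0.05, 0.80, 0.55]]
)

query = inputs[1]                                   # the second token acts as the query
attn_weights_2 = torch.softmax(inputs @ query, dim=0)

# Weighted sum of all input vectors gives the context vector for token 2.
context_vec_2 = torch.zeros(query.shape)
for i, x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i] * x_i

print(context_vec_2)                                # tensor([0.4419, 0.6515, 0.5683])
```
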
@@ -372,7 +372,7 @@ print(attn_scores)

The full set of computed attention scores looks like this:

```python
tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],

@@ -392,7 +392,7 @@ print(attn_scores)

As we can see, the results match the previous ones:

```python
tensor([[0.9995, 0.9544, 0.9422, 0.4753, 0.4576, 0.6310],
        [0.9544, 1.4950, 1.4754, 0.8434, 0.7070, 1.0865],
        [0.9422, 1.4754, 1.4570, 0.8296, 0.7154, 1.0605],

@@ -410,7 +410,7 @@ print(attn_weights)

Running this code returns an attention weight tensor whose values match those shown in figure 3.10:

```python
tensor([[0.2098, 0.2006, 0.1981, 0.1242, 0.1220, 0.1452],
        [0.1385, 0.2379, 0.2333, 0.1240, 0.1082, 0.1581],
        [0.1390, 0.2369, 0.2326, 0.1242, 0.1108, 0.1565],

@@ -431,7 +431,7 @@ print("All row sums:", attn_weights.sum(dim=-1))

The result is as follows:

```python
Row 2 sum: 1.0
All row sums: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])
```

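A compact sketch of the matrix-multiplication shortcut and the row-wise softmax used above. Any `(num_tokens, emb_dim)` tensor works; random inputs are used here, so the printed weights will differ from the chapter's:

```python
import torch

torch.manual_seed(123)
inputs = torch.rand(6, 3)                    # hypothetical token embeddings

# All pairwise dot products at once instead of two nested for-loops.
attn_scores = inputs @ inputs.T              # shape (6, 6)

# dim=-1 normalizes across the last axis, i.e. within each row.
attn_weights = torch.softmax(attn_scores, dim=-1)

print("Row 2 sum:", attn_weights[1].sum())        # tensor(1.)
print("All row sums:", attn_weights.sum(dim=-1))  # every row sums to 1
```
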
@@ -445,7 +445,7 @@ print(all_context_vecs)

In the output tensor, each row contains a three-dimensional context vector:

```python
tensor([[0.4421, 0.5931, 0.5790],
        [0.4419, 0.6515, 0.5683],
        [0.4431, 0.6496, 0.5671],

@@ -462,7 +462,7 @@ tensor([[0.4421, 0.5931, 0.5790],

Based on this result, we can see that the previously computed `context_vec_2` matches the second row of the tensor above exactly:

```python
Previous 2nd context vector: tensor([0.4419, 0.6515, 0.5683])
```

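One more matrix multiplication turns the weight matrix into all six context vectors at once; a short sketch under the same random-input assumption as before:

```python
import torch

torch.manual_seed(123)
inputs = torch.rand(6, 3)                                 # hypothetical token embeddings
attn_weights = torch.softmax(inputs @ inputs.T, dim=-1)

# Row i of the result is the context vector for input token i.
all_context_vecs = attn_weights @ inputs
print(all_context_vecs.shape)                             # torch.Size([6, 3])
print(all_context_vecs[1])                                # matches the single-query computation
```
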
@@ -528,7 +528,7 @@ print(query_2)

The output of this code is a two-dimensional vector, because we set the number of columns of the corresponding weight matrix to 2 via the `d_out` parameter:

```python
tensor([0.4306, 1.4551])
```

@@ -553,7 +553,7 @@ print("values.shape:", values.shape)

As the output shows, we have successfully projected the six input tokens from a 3-dimensional onto a 2-dimensional embedding space:

```python
keys.shape: torch.Size([6, 2])
values.shape: torch.Size([6, 2])
```

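A minimal sketch of the trainable projections behind these shapes. The weight matrices are randomly initialized, so the numbers will not match the chapter's output, but the shapes do:

```python
import torch

torch.manual_seed(123)
d_in, d_out = 3, 2                       # input embedding size, projected size
inputs = torch.rand(6, d_in)             # hypothetical token embeddings
x_2 = inputs[1]                          # second input token

# Trainable projection matrices (gradients disabled only to keep the sketch simple).
W_query = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_key   = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)
W_value = torch.nn.Parameter(torch.rand(d_in, d_out), requires_grad=False)

query_2 = x_2 @ W_query                  # a 2-dimensional query vector
keys    = inputs @ W_key                 # project all six tokens in one step
values  = inputs @ W_value

print(query_2.shape)                     # torch.Size([2])
print("keys.shape:", keys.shape)         # torch.Size([6, 2])
print("values.shape:", values.shape)     # torch.Size([6, 2])
```
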
@@ -574,7 +574,7 @@ print(attn_score_22)

This yields the following unnormalized attention score:

```python
tensor(1.8524)
```

@@ -608,7 +608,7 @@ print(attn_scores_2)

As we can see, the second element of the output matches the `attn_score_22` we computed before:

```python
tensor([1.2705, 1.8524, 1.8111, 1.0795, 0.5577, 1.5440])
```

@@ -626,7 +626,7 @@ print(attn_weights_2)

The result is as follows:

```python
tensor([0.1500, 0.2264, 0.2199, 0.1311, 0.0906, 0.1820])
```

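These weights come from scaling the scores by the square root of the key dimension before the softmax, which keeps the softmax from saturating as the embedding size grows; a self-contained sketch with random weights:

```python
import torch

torch.manual_seed(123)
d_in, d_out = 3, 2
inputs  = torch.rand(6, d_in)            # hypothetical token embeddings
W_query = torch.rand(d_in, d_out)
W_key   = torch.rand(d_in, d_out)

query_2 = inputs[1] @ W_query
keys    = inputs @ W_key

attn_scores_2 = query_2 @ keys.T         # unnormalized scores, shape (6,)

# Scaled dot-product attention: divide by sqrt(d_k) before the softmax.
d_k = keys.shape[-1]
attn_weights_2 = torch.softmax(attn_scores_2 / d_k**0.5, dim=-1)

print(attn_weights_2)
print(attn_weights_2.sum())              # tensor(1.)
```
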
@@ -660,7 +660,7 @@ print(context_vec_2)

The result is as follows:

```python
tensor([0.3061, 0.8210])
```

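Taken together, the projections, the scaling, the softmax, and the weighted sum of values fit into one small function; this is a sketch of the overall flow under the same random-weight assumptions, not the chapter's exact class:

```python
import torch

def self_attention(x, W_query, W_key, W_value):
    """Scaled dot-product self-attention over a (num_tokens, d_in) input."""
    queries = x @ W_query
    keys    = x @ W_key
    values  = x @ W_value
    attn_scores  = queries @ keys.T
    attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
    return attn_weights @ values                  # one context vector per token

torch.manual_seed(123)
x = torch.rand(6, 3)                              # hypothetical token embeddings
W_q, W_k, W_v = (torch.rand(3, 2) for _ in range(3))
print(self_attention(x, W_q, W_k, W_v).shape)     # torch.Size([6, 2])
```
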
@@ -722,7 +722,7 @@ print(sa_v1(inputs))

Since the input contains six embedding vectors, this produces a matrix storing the six corresponding context vectors:

```python
tensor([[0.2996, 0.8053],
        [0.3061, 0.8210],
        [0.3058, 0.8203],

@@ -771,7 +771,7 @@ print(sa_v2(inputs))

The output is as follows:

```python
tensor([[-0.0739, 0.0713],
        [-0.0748, 0.0703],
        [-0.0749, 0.0702],

@@ -834,7 +834,7 @@ print(attn_weights)

This yields the following attention weights:

```python
tensor([[0.1921, 0.1646, 0.1652, 0.1550, 0.1721, 0.1510],
        [0.2041, 0.1659, 0.1662, 0.1496, 0.1665, 0.1477],
        [0.2036, 0.1659, 0.1662, 0.1498, 0.1664, 0.1480],

@@ -854,7 +854,7 @@ print(mask_simple)

The resulting mask looks like this:

```python
tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],

@@ -872,7 +872,7 @@ print(masked_simple)

As we can see, the elements above the diagonal have been successfully zeroed out:

```python
tensor([[0.1921, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2041, 0.1659, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2036, 0.1659, 0.1662, 0.0000, 0.0000, 0.0000],

@@ -892,7 +892,7 @@ print(masked_simple_norm)

The resulting attention weight matrix has the following properties: the weights above the main diagonal are zero, and each row sums to 1:

```python
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],

@@ -928,7 +928,7 @@ print(masked)

This produces the following masked score matrix:

```python
tensor([[0.2899, -inf, -inf, -inf, -inf, -inf],
        [0.4656, 0.1723, -inf, -inf, -inf, -inf],
        [0.4594, 0.1703, 0.1731, -inf, -inf, -inf],

@@ -947,7 +947,7 @@ print(attn_weights)

As the output shows, each row sums to 1, so no further normalization is needed:

```python
tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5517, 0.4483, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3800, 0.3097, 0.3103, 0.0000, 0.0000, 0.0000],

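The trick here is to fill the positions above the diagonal with negative infinity before the softmax: exp(-inf) is 0, so those positions get zero weight and each row renormalizes over the remaining entries automatically. A minimal sketch with hypothetical scores:

```python
import torch

torch.manual_seed(123)
context_length = 6
attn_scores = torch.rand(context_length, context_length)   # hypothetical unnormalized scores

# Boolean mask that is True strictly above the main diagonal (the "future" tokens).
mask = torch.triu(torch.ones(context_length, context_length), diagonal=1).bool()

# Replace future positions with -inf, then apply the softmax row by row.
masked = attn_scores.masked_fill(mask, float("-inf"))
attn_weights = torch.softmax(masked, dim=-1)

print(attn_weights)               # lower-triangular weights
print(attn_weights.sum(dim=-1))   # each row still sums to 1
```
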
@@ -988,7 +988,7 @@ print(dropout(example))

As we can see, roughly half of the values have been zeroed out:

```python
tensor([[2., 2., 0., 2., 2., 0.],
        [0., 0., 0., 2., 0., 2.],
        [2., 2., 2., 2., 0., 2.],

@@ -1042,7 +1042,7 @@ print(dropout(attn_weights))

In the resulting attention weight matrix, some elements are zeroed out and the remaining ones are rescaled:

```python
tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.7599, 0.6194, 0.6206, 0.0000, 0.0000, 0.0000],

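Dropout zeroes a random subset of entries during training and rescales the survivors by 1/(1 - p), so the expected magnitude of each row is preserved; a short sketch:

```python
import torch

torch.manual_seed(123)
dropout = torch.nn.Dropout(0.5)      # drop roughly half of the values while training

example = torch.ones(6, 6)
print(dropout(example))              # surviving entries are rescaled from 1.0 to 2.0

# The same zeroing and rescaling applies when dropout is placed on attention weights.
attn_weights = torch.softmax(torch.rand(6, 6), dim=-1)
print(dropout(attn_weights))
```
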
@@ -1075,7 +1075,7 @@ print(batch.shape) #A

This code produces a three-dimensional tensor consisting of 2 input texts with 6 tokens each, where each token is a 3-dimensional embedding vector:

```python
torch.Size([2, 6, 3])
```

@@ -1131,7 +1131,7 @@ print("context_vecs.shape:", context_vecs.shape)

The resulting context vectors form a three-dimensional tensor in which each token is now represented as a two-dimensional embedding:

```python
context_vecs.shape: torch.Size([2, 6, 2])
```

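A compact causal attention module along these lines, handling a batched `(b, num_tokens, d_in)` input; this is a sketch under assumptions (class name and defaults are illustrative), not the chapter's exact `CausalAttention` class:

```python
import torch
import torch.nn as nn

class CausalAttentionSketch(nn.Module):
    """Minimal causal self-attention for batched input (a sketch, not the book's class)."""

    def __init__(self, d_in, d_out, context_length, dropout=0.0):
        super().__init__()
        self.W_query = nn.Linear(d_in, d_out, bias=False)
        self.W_key   = nn.Linear(d_in, d_out, bias=False)
        self.W_value = nn.Linear(d_in, d_out, bias=False)
        self.dropout = nn.Dropout(dropout)
        # Persistent upper-triangular mask; register_buffer moves it with the module.
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length), diagonal=1).bool(),
        )

    def forward(self, x):
        b, num_tokens, _ = x.shape
        queries = self.W_query(x)
        keys    = self.W_key(x)
        values  = self.W_value(x)

        attn_scores = queries @ keys.transpose(1, 2)                  # (b, T, T)
        attn_scores.masked_fill_(self.mask[:num_tokens, :num_tokens], float("-inf"))
        attn_weights = torch.softmax(attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        return attn_weights @ values                                  # (b, T, d_out)

torch.manual_seed(123)
batch = torch.rand(2, 6, 3)           # 2 texts, 6 tokens each, 3-dimensional embeddings
ca = CausalAttentionSketch(d_in=3, d_out=2, context_length=6)
print(ca(batch).shape)                # torch.Size([2, 6, 2])
```
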
@@ -1197,7 +1197,7 @@ print("context_vecs.shape:", context_vecs.shape)

The context vectors produced by this code look as follows:

```python
tensor([[[-0.4519, 0.2216, 0.4772, 0.1063],
         [-0.5874, 0.0058, 0.5891, 0.3257],
         [-0.6300, -0.0632, 0.6202, 0.3860],

@@ -1332,7 +1332,7 @@ print(a @ a.transpose(2, 3))

The result is as follows:

```python
tensor([[[[1.3208, 1.1631, 1.2879],
          [1.1631, 2.2150, 1.8424],
          [1.2879, 1.8424, 2.0402]],

@@ -1356,7 +1356,7 @@ print("\nSecond head:\n", second_res)

This result is exactly the same as the one we obtained earlier with the batched matrix multiplication `print(a @ a.transpose(2, 3))`:

```python
First head:
 tensor([[1.3208, 1.1631, 1.2879],
        [1.1631, 2.2150, 1.8424],

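What the transpose achieves: with a four-dimensional tensor of shape `(batch, num_heads, num_tokens, head_dim)`, `transpose(2, 3)` swaps the last two axes, so `@` carries out an independent `(num_tokens x head_dim) · (head_dim x num_tokens)` multiplication for every batch element and every head; a small self-contained check:

```python
import torch

torch.manual_seed(123)
a = torch.rand(1, 2, 3, 4)            # (batch, num_heads, num_tokens, head_dim)

# Batched matrix multiplication over the last two dimensions.
batched = a @ a.transpose(2, 3)       # shape (1, 2, 3, 3)

# The same result, computed one head at a time.
first_head  = a[0, 0] @ a[0, 0].T
second_head = a[0, 1] @ a[0, 1].T

print(torch.allclose(batched[0, 0], first_head))   # True
print(torch.allclose(batched[0, 1], second_head))  # True
```
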
@@ -1387,7 +1387,7 @@ print("context_vecs.shape:", context_vecs.shape)

As the result shows, the output dimension is controlled directly by the `d_out` parameter:

```python
tensor([[[0.3190, 0.4858],
         [0.2943, 0.3897],
         [0.2856, 0.3593],

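Behind that `d_out` control sits the usual reshaping pattern for an efficient multi-head layer: one projection produces all heads at once, and the last dimension is then split into `(num_heads, head_dim)`. A sketch of just that reshaping, with hypothetical sizes:

```python
import torch

b, num_tokens, d_out, num_heads = 2, 6, 4, 2
head_dim = d_out // num_heads                  # each head covers d_out / num_heads dimensions

x = torch.rand(b, num_tokens, d_out)           # e.g. the output of a single nn.Linear projection

# Split the last dimension into heads, then move the head axis in front of the token axis.
split = x.view(b, num_tokens, num_heads, head_dim).transpose(1, 2)
print(split.shape)                             # torch.Size([2, 2, 6, 2])

# After attention, the heads are folded back into one d_out-dimensional vector per token.
merged = split.transpose(1, 2).contiguous().view(b, num_tokens, d_out)
print(merged.shape)                            # torch.Size([2, 6, 4])
```
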
@@ -89,10 +89,10 @@ class DummyGPTModel(nn.Module):
        self.pos_emb = nn.Embedding(cfg["context_length"], cfg["emb_dim"])
        self.drop_emb = nn.Dropout(cfg["drop_rate"])
        self.trf_blocks = nn.Sequential(
            *[DummyTransformerBlock(cfg) for _ in range(cfg["n_layers"])]) #A
        self.final_norm = DummyLayerNorm(cfg["emb_dim"]) #B
        self.out_head = nn.Linear(
            cfg["emb_dim"], cfg["vocab_size"], bias=False
        )

    def forward(self, in_idx):