add second chapter

This commit is contained in:
skindhu 2024-10-25 12:58:04 +08:00
parent 265759d6e2
commit 5382114061
1 changed files with 5 additions and 5 deletions

View File

@@ -244,19 +244,19 @@ for i, item in enumerate(vocab.items()):
# Listing 2.3 Implementing a simple text tokenizer
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed vocabulary.

    Text is split on whitespace, ``--`` and a small set of punctuation
    characters; every resulting token is looked up in the vocabulary.
    Tokens that are not in the vocabulary are not handled specially, so
    ``encode`` raises ``KeyError`` for them.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build the reverse lookup table.

        Args:
            vocab: Mapping from token string to integer ID.
        """
        self.str_to_int = vocab  # A: token string -> integer ID
        # B: inverse mapping, integer ID -> token string. If two tokens
        # share an ID, the one iterated last wins.
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):  # C: text -> list of token IDs
        """Convert ``text`` into a list of token IDs.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one integer ID per non-empty token, in order.

        Raises:
            KeyError: If a token is missing from the vocabulary.
        """
        # The capturing group keeps the delimiters (punctuation, "--",
        # whitespace) in the split result so punctuation becomes tokens.
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        # Drop empty strings and pure-whitespace pieces left by the split.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):  # D: list of token IDs -> text
        """Convert a sequence of token IDs back into a string.

        Args:
            ids: Iterable of integer IDs present in the vocabulary.

        Returns:
            The tokens joined by single spaces, with the whitespace in
            front of ``, . ? ! " ( ) '`` removed.

        Raises:
            KeyError: If an ID is missing from the reverse mapping.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # E: remove the space that the join inserted before punctuation.
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text