add second chapter
This commit is contained in:
parent
265759d6e2
commit
5382114061
|
|
@ -244,19 +244,19 @@ for i, item in enumerate(vocab.items()):
|
|||
# Listing 2.3 Implementing a simple text tokenizer
|
||||
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token-to-id vocabulary.

    Text is split on whitespace, on the double dash ``--`` and on the
    punctuation characters ``, . ? _ ! " ( ) '``; each surviving token is
    looked up in the vocabulary supplied at construction time.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse mapping.

        Args:
            vocab: Mapping from token string to integer id. The mapping is
                stored by reference, not copied.
        """
        self.str_to_int = vocab  #A
        # Inverse lookup (id -> token) used by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}  #B

    def encode(self, text):  #C
        """Convert ``text`` into a list of token ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one vocabulary id per token, in order of appearance.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        # The capturing group keeps the delimiters (punctuation, "--",
        # whitespace) in the split result so punctuation becomes its own token.
        pieces = re.split(r'([,.?_!"()\']|--|\s)', text)
        # Drop empty strings and pure-whitespace pieces left over by the split.
        tokens = [piece.strip() for piece in pieces if piece.strip()]
        return [self.str_to_int[token] for token in tokens]

    def decode(self, ids):  #D
        """Convert a sequence of token ids back into a string.

        Args:
            ids: Iterable of integer ids previously produced by the vocabulary.

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of ``, . ? ! " ( ) '`` removed.

        Raises:
            KeyError: If an id is not present in the inverse vocabulary.
        """
        text = " ".join([self.int_to_str[i] for i in ids])
        # Joining with spaces puts a space before every punctuation token;
        # remove it so "Hello , world !" reads "Hello, world!".
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)  #E
        return text
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue