add second chapter
This commit is contained in:
parent
265759d6e2
commit
5382114061
|
|
@ -244,19 +244,19 @@ for i, item in enumerate(vocab.items()):
|
||||||
# Listing 2.3 Implementing a simple text tokenizer
class SimpleTokenizerV1:
    """Convert text to lists of token ids and back using a fixed vocabulary.

    The vocabulary maps token strings to ids. Text is split into tokens on
    whitespace, on the punctuation characters ``, . ? _ ! " ( ) '`` and on
    double dashes (``--``); whitespace itself is discarded.
    """

    def __init__(self, vocab):
        """Store the vocabulary and build its inverse.

        Args:
            vocab: Mapping from token string to token id. It is kept by
                reference (not copied) as ``str_to_int``.
        """
        self.str_to_int = vocab  # A: token string -> id
        # B: inverse lookup (id -> token string), used by decode().
        self.int_to_str = {i: s for s, i in vocab.items()}

    def encode(self, text):  # C
        """Split *text* into tokens and return their ids.

        Args:
            text: The string to tokenize.

        Returns:
            A list with one id per token, in order of appearance. An empty
            or whitespace-only string yields an empty list.

        Raises:
            KeyError: If a token is not present in the vocabulary.
        """
        # The capturing group makes re.split keep the separators, so the
        # punctuation marks and "--" become tokens of their own.
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        # Drop the empty strings and bare-whitespace pieces the split leaves.
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        return [self.str_to_int[s] for s in preprocessed]

    def decode(self, ids):  # D
        """Turn a sequence of token ids back into a single string.

        Args:
            ids: Iterable of token ids.

        Returns:
            The tokens joined by single spaces, with the whitespace in front
            of ``, . ? ! " ( ) '`` removed.

        Raises:
            KeyError: If an id is not present in the vocabulary.
        """
        text = " ".join(self.int_to_str[i] for i in ids)
        # E: joining put a space before every token; remove it where the
        # next token is one of the listed punctuation characters.
        return re.sub(r'\s+([,.?!"()\'])', r'\1', text)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue