add second chapter

2024-10-25 12:58:04 +08:00 · 2024-10-25 12:58:04 +08:00 · 5382114061
parent 265759d6e2
commit 5382114061
1 changed files with 5 additions and 5 deletions
--- a/Book/2.处理文本数据.md
+++ b/Book/2.处理文本数据.md
@ -244,19 +244,19 @@ for i, item in enumerate(vocab.items()):
 # Listing 2.3 Implementing a simple text tokenizer
 class SimpleTokenizerV1:
  	def __init__(self, vocab):
-      	self.str_to_int = vocab                                               		#A
-        self.int_to_str = {i:s for s,i in vocab.items()}                     			#B
+      	self.str_to_int = vocab                                                   #A
+        self.int_to_str = {i:s for s,i in vocab.items()}                          #B
        
-    def encode(self, text):																												#C
+    def encode(self, text):                                                       #C
      	preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]		
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
      
-    def decode(self, ids):																												#D
+    def decode(self, ids):                                                        #D
      	text = " ".join([self.int_to_str[i] for i in ids])
        
-        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)														#E
+        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)                           #E
        return text