From 5382114061467b00a405200cbe3492b3926f8610 Mon Sep 17 00:00:00 2001
From: skindhu <skindhu@tencent.com>
Date: Fri, 25 Oct 2024 12:58:04 +0800
Subject: [PATCH] add second chapter

---
 Book/2.处理文本数据.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/Book/2.处理文本数据.md b/Book/2.处理文本数据.md
index 666bfba..d834de5 100644
--- a/Book/2.处理文本数据.md
+++ b/Book/2.处理文本数据.md
@@ -244,19 +244,19 @@ for i, item in enumerate(vocab.items()):
 # Listing 2.3 Implementing a simple text tokenizer
 class SimpleTokenizerV1:
   	def __init__(self, vocab):
-      	self.str_to_int = vocab                                               		#A
-        self.int_to_str = {i:s for s,i in vocab.items()}                     			#B
+      	self.str_to_int = vocab                                                   #A
+        self.int_to_str = {i:s for s,i in vocab.items()}                          #B
         
-    def encode(self, text):																												#C
+    def encode(self, text):                                                       #C
       	preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
         preprocessed = [item.strip() for item in preprocessed if item.strip()]		
         ids = [self.str_to_int[s] for s in preprocessed]
         return ids
       
-    def decode(self, ids):																												#D
+    def decode(self, ids):                                                        #D
       	text = " ".join([self.int_to_str[i] for i in ids])
         
-        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)														#E
+        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)                           #E
         return text