From 5382114061467b00a405200cbe3492b3926f8610 Mon Sep 17 00:00:00 2001 From: skindhu Date: Fri, 25 Oct 2024 12:58:04 +0800 Subject: [PATCH] add second chapter --- Book/2.处理文本数据.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/Book/2.处理文本数据.md b/Book/2.处理文本数据.md index 666bfba..d834de5 100644 --- a/Book/2.处理文本数据.md +++ b/Book/2.处理文本数据.md @@ -244,19 +244,19 @@ for i, item in enumerate(vocab.items()): # Listing 2.3 Implementing a simple text tokenizer class SimpleTokenizerV1: def __init__(self, vocab): - self.str_to_int = vocab #A - self.int_to_str = {i:s for s,i in vocab.items()} #B + self.str_to_int = vocab #A + self.int_to_str = {i:s for s,i in vocab.items()} #B - def encode(self, text): #C + def encode(self, text): #C preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text) preprocessed = [item.strip() for item in preprocessed if item.strip()] ids = [self.str_to_int[s] for s in preprocessed] return ids - def decode(self, ids): #D + def decode(self, ids): #D text = " ".join([self.int_to_str[i] for i in ids]) - text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #E + text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) #E return text