在自然语言处理中,需要先将自然语言转化为数字化表示,为此要建立词表,把每个token映射为一个整数索引。
# Code: vocabulary mapping between tokens and integer indices.
from collections import defaultdict


class Vocab:
    """Bidirectional mapping between tokens and integer indices.

    Attributes are kept public for compatibility with existing callers:

    * ``idx_to_token`` -- list where position ``i`` holds the token with id ``i``.
    * ``token_to_idx`` -- dict mapping each token to its id.
    * ``unk`` -- id of the unknown-token placeholder (``UNK_TOKEN``).
    """

    # Placeholder that every out-of-vocabulary token is mapped to.
    UNK_TOKEN = "<unk>"

    def __init__(self, tokens=None):
        """Create a vocabulary from an iterable of hashable tokens.

        Args:
            tokens: Tokens in the order their ids should be assigned. May be
                ``None`` or empty, in which case the vocabulary contains only
                the unknown token.

        The unknown token is appended at the end when ``tokens`` does not
        already contain it. Repeated tokens keep the id of their first
        occurrence so that both lookup tables always agree.
        """
        self.idx_to_token = []
        self.token_to_idx = {}

        # Copy so the caller's sequence is never mutated.
        tokens = [] if tokens is None else list(tokens)
        if self.UNK_TOKEN not in tokens:
            tokens.append(self.UNK_TOKEN)

        for token in tokens:
            if token in self.token_to_idx:
                continue  # skip duplicates to keep the two tables in sync
            self.token_to_idx[token] = len(self.idx_to_token)
            self.idx_to_token.append(token)

        self.unk = self.token_to_idx[self.UNK_TOKEN]

    @classmethod
    def build(cls, text, min_freq=1, reserved_tokens=None):
        """Build a vocabulary from a tokenized corpus.

        Args:
            text: Iterable of sentences, each an iterable of tokens.
            min_freq: Tokens occurring fewer than ``min_freq`` times are
                dropped (and therefore map to the unknown token later).
            reserved_tokens: Optional tokens placed right after the unknown
                token regardless of their frequency in ``text``.

        Returns:
            A new ``Vocab`` whose id 0 is the unknown token, followed by the
            reserved tokens, followed by the retained corpus tokens in order
            of first appearance.
        """
        token_freqs = defaultdict(int)
        for sentence in text:
            for token in sentence:
                token_freqs[token] += 1

        uniq_tokens = [cls.UNK_TOKEN] + list(reserved_tokens or [])
        uniq_tokens += [
            token
            for token, freq in token_freqs.items()
            if freq >= min_freq and token != cls.UNK_TOKEN
        ]
        return cls(uniq_tokens)

    def __len__(self):
        """Return the number of distinct tokens, including the unknown token."""
        return len(self.idx_to_token)

    def __getitem__(self, token):
        """Return the id of ``token``, or the unknown-token id if absent."""
        return self.token_to_idx.get(token, self.unk)

    def convert_tokens_ids(self, tokens):
        """Map each token to its id; unseen tokens map to the unknown id."""
        # Go through __getitem__ so out-of-vocabulary tokens do not raise.
        return [self[token] for token in tokens]

    # Alias with the name that mirrors ``convert_ids_to_tokens``; the original
    # method name above is kept so existing callers continue to work.
    convert_tokens_to_ids = convert_tokens_ids

    def convert_ids_to_tokens(self, indices):
        """Map each id back to its token.

        Raises:
            IndexError: If an id is outside ``range(len(self))``.
        """
        return [self.idx_to_token[index] for index in indices]


# Verification demo. Guarded so that importing this module neither requires
# NLTK nor loads the corpus.
if __name__ == "__main__":
    # NOTE: the corpus data must be available locally; presumably via
    # nltk.download("sentence_polarity") -- confirm for your environment.
    from nltk.corpus import sentence_polarity

    vocab = Vocab.build(sentence_polarity.sents())
    print("词表大小:", len(vocab))
    print('great的索引:', vocab['great'])
    print("token转换成id:", vocab.convert_tokens_ids(['i', 'am', 'great']))
    print("id转化成token:", vocab.convert_ids_to_tokens([329, 4499, 3495]))
使用BERT等预训练模型时,模型自带词表,无需再自行创建词表。