Huggingface 的官方教程是 https://huggingface.co/learn/nlp-course/chapter6/2 ,但它只以“使用现有分词器时的怪癖”一节收尾,随后指向第 7 章中的
train_new_from_iterator()
函数,但我似乎找不到如何使用它来扩展分词器而不重新训练它的参考。
我已经尝试过“Training New AutoTokenizer Hugging Face”这个问题中的解决方案,它使用
train_new_from_iterator()
,但这会重新训练分词器,而不是扩展它;该解决方案会替换现有的词元索引。(参见:Training New AutoTokenizer Hugging Face)
import pandas as pd
def batch_iterator(batch_size=3, size=8):
    """Yield the whole toy corpus once per step of ``range(0, size, batch_size)``.

    The corpus is a fixed two-row DataFrame; every yielded batch is a fresh
    list containing both of its ``note_text`` values. ``batch_size`` and
    ``size`` only control how many batches are produced, not their content.
    """
    corpus = pd.DataFrame({"note_text": ['foobar', 'helloworld']})
    remaining_steps = len(range(0, size, batch_size))
    while remaining_steps > 0:
        yield corpus['note_text'].to_list()
        remaining_steps -= 1
# AutoTokenizer is used below but was not imported anywhere in this snippet.
from transformers import AutoTokenizer

# Load the pretrained RoBERTa tokenizer that should be extended.
old_tokenizer = AutoTokenizer.from_pretrained('roberta-base')
training_corpus = batch_iterator()
# train_new_from_iterator() returns a separately trained tokenizer; the
# printed ids below show that it does not continue the old vocabulary.
new_tokenizer = old_tokenizer.train_new_from_iterator(training_corpus, 32000)

print(len(old_tokenizer))
# Encode the same two strings with both tokenizers so the ids are comparable
# (the second call previously used 'hello world', which does not match the
# single-token output shown for it).
print(old_tokenizer(['foobarzz', 'helloworld']))
print(new_tokenizer(['foobarzz', 'helloworld']))
[出]:
50265
{'input_ids': [[0, 21466, 22468, 7399, 2], [0, 20030, 1722, 39949, 2]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1, 1]]}
{'input_ids': [[0, 275, 2], [0, 276, 2]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}
注意: 新令牌之所以从 275 和 276 开始,是因为 id 0-274 之间有保留令牌。
new_tokenizer( ['foobarzz', 'helloworld'] )
的预期行为是:新词元的 ID 应当超出原分词器的词汇表大小(即 roberta-base
模型的 50265),输出应该如下所示:
{'input_ids': [[0, 50265, 2], [0, 50266, 2]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}
来源:https://www.depends-on-the-definition.com/how-to-add-new-tokens-to-huggingface-transformers/
from transformers import AutoTokenizer, AutoModel
# Checkpoint whose tokenizer and model are loaded and then extended.
model_type = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# Candidate tokens to add.
new_tokens = ["new_token"]

# Keep only the candidates that are not already vocabulary entries.
known_tokens = set(tokenizer.vocab.keys())
new_tokens = set(new_tokens) - known_tokens

# Register the remaining candidates with the tokenizer.
tokens_to_add = list(new_tokens)
tokenizer.add_tokens(tokens_to_add)

# Resize the model's embedding matrix to the new tokenizer length so the
# added tokens get (randomly initialised) embeddings.
model.resize_token_embeddings(len(tokenizer))
这是一个示例用法:首先从
emoji.txt
文件中获取表情符号列表(该文件的生成方式参见“打印所有 unicode 表情符号到文件”一文):
from transformers import AutoTokenizer, AutoModel
# pick the model type
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# Read one emoji per line. The file is opened with an explicit UTF-8 encoding
# (emoji are not representable in many platform-default encodings) and inside
# a `with` block so the handle is closed as soon as the list is built.
with open('emoji.txt', encoding='utf-8') as emoji_file:
    new_tokens = [line.strip() for line in emoji_file]

# check if the tokens are already in the vocabulary
new_tokens = set(new_tokens) - set(tokenizer.vocab.keys())
# Blank lines in the file strip down to "", which is not a meaningful token.
new_tokens.discard("")

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
这是提取多词表达式的示例:https://stackoverflow.com/a/76058924/610569
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers.pipelines.token_classification import TokenClassificationPipeline
# Token-classification checkpoint; the model and its tokenizer are both
# loaded from this one identifier.
model_checkpoint = "Davlan/bert-base-multilingual-cased-ner-hrl"
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
class TokenClassificationChunkPipeline(TokenClassificationPipeline):
    """Token-classification pipeline that feeds long inputs to the model in chunks.

    ``preprocess`` tokenizes with ``return_overflowing_tokens=True`` and yields
    one model-input dict per chunk; ``_forward`` runs the model on one such
    dict and reshapes the tensors it hands on to post-processing.

    The constructor is inherited unchanged from ``TokenClassificationPipeline``.
    """

    def preprocess(self, sentence, offset_mapping=None, **preprocess_params):
        """Tokenize ``sentence`` and yield one model-input dict per chunk.

        Args:
            sentence: Text passed to the tokenizer.
            offset_mapping: Optional value that, when not ``None``, replaces
                the tokenizer-produced ``"offset_mapping"`` entry in every
                yielded dict.
            **preprocess_params: Accepted for interface compatibility; this
                implementation does not use them.

        Yields:
            dict: The tokenizer outputs for chunk ``i`` (each value given a
            leading dimension of 1), plus ``"sentence"`` (the input text for
            the first chunk, ``None`` for later ones) and ``"is_last"``
            (``True`` only for the final chunk).
        """
        # Truncation is hard-coded to True; the tokenizer is asked to return
        # the overflowing tokens as additional rows instead of dropping them.
        inputs = self.tokenizer(
            sentence,
            return_tensors="pt",
            truncation=True,
            return_special_tokens_mask=True,
            return_offsets_mapping=True,
            return_overflowing_tokens=True,  # Return multiple chunks
            max_length=self.tokenizer.model_max_length,
            padding=True,
        )
        # "overflow_to_sample_mapping" is intentionally left in `inputs`:
        # _forward() pops it from every chunk.
        num_chunks = len(inputs["input_ids"])

        for i in range(num_chunks):
            if self.framework == "tf":
                # NOTE(review): `tf` is not imported by this snippet, and the
                # tokenizer call above always requests PyTorch tensors, so this
                # branch only works if the surrounding module provides `tf`.
                model_inputs = {k: tf.expand_dims(v[i], 0) for k, v in inputs.items()}
            else:
                model_inputs = {k: v[i].unsqueeze(0) for k, v in inputs.items()}

            if offset_mapping is not None:
                model_inputs["offset_mapping"] = offset_mapping

            model_inputs["sentence"] = sentence if i == 0 else None
            model_inputs["is_last"] = i == num_chunks - 1

            yield model_inputs

    def _forward(self, model_inputs):
        """Run the model on one chunk and bundle the result for post-processing.

        Args:
            model_inputs: One dict yielded by ``preprocess``. The bookkeeping
                entries are popped from it in place before the rest is passed
                to ``self.model``.

        Returns:
            dict: ``"logits"`` plus the popped bookkeeping entries and the
            remaining model inputs, with the tensor entries reshaped to a
            leading dimension of 1.
        """
        # Remove the entries that are not forwarded to the model.
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")
        is_last = model_inputs.pop("is_last")
        overflow_to_sample_mapping = model_inputs.pop("overflow_to_sample_mapping")

        output = self.model(**model_inputs)
        logits = output["logits"] if isinstance(output, dict) else output[0]

        model_outputs = {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            "overflow_to_sample_mapping": overflow_to_sample_mapping,
            "is_last": is_last,
            **model_inputs,
        }

        # We reshape outputs to fit with the postprocess inputs
        for key in ("input_ids", "token_type_ids", "attention_mask", "special_tokens_mask"):
            model_outputs[key] = torch.reshape(model_outputs[key], (1, -1))
        model_outputs["offset_mapping"] = torch.reshape(model_outputs["offset_mapping"], (1, -1, 2))

        return model_outputs
# AutoModel is needed further down, but the imports accompanying this snippet
# only bring in AutoTokenizer and AutoModelForTokenClassification.
from transformers import AutoModel

pipe = TokenClassificationChunkPipeline(model=model, tokenizer=tokenizer, aggregation_strategy="simple")

# Collect the distinct 'word' values of every item the pipeline returns for
# the example sentences.
mwe_tokens = {
    token['word']
    for sent in pipe(
        ["Bernard works at BNP Paribas in Paris.", "In New York, you will be a New Man"])
    for token in sent
}

# pick the model type
# (from here on `tokenizer` and `model` refer to this base checkpoint, not to
# the NER checkpoint used by `pipe` above)
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# check if the tokens are already in the vocabulary
new_tokens = set(mwe_tokens) - set(tokenizer.vocab.keys())

# add the tokens to the tokenizer vocabulary
tokenizer.add_tokens(list(new_tokens))

# add new, random embeddings for the new tokens
model.resize_token_embeddings(len(tokenizer))
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
# Base checkpoint whose tokenizer is extended below.
model_type = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_type)
model = AutoModel.from_pretrained(model_type)

# Tokenizer length before any tokens are added.
print(len(tokenizer))

# Before extension, most ids printed for this text are 100, which the
# original note identifies as the unknown token.
sample_text = "የቀን 7 ሰዓት አማርኛ ዜና ሚያዝያ"
print(tokenizer(sample_text))

# Lazily read the 'text' field of the first 1000 rows of the cc100 "am" train split.
am_dataset = load_dataset("cc100", "am")
am_train = (am_dataset['train'][i]['text'] for i in range(1000))

# Train a separate tokenizer on those sentences, starting from the old
# tokenizer object, then add its vocabulary entries to the old tokenizer.
new_tokenizer = tokenizer.train_new_from_iterator(am_train, vocab_size=100_000)
trained_vocab = list(new_tokenizer.vocab)
tokenizer.add_tokens(trained_vocab)

# Compare the freshly trained tokenizer with the extended original one.
print(new_tokenizer(sample_text))
print(tokenizer(sample_text))
[出]:
119547
{'input_ids': [101, 100, 128, 100, 100, 100, 100, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [2, 1087, 388, 27, 1744, 2155, 1135, 5252, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 128730, 120760, 128, 128891, 121948, 135152, 135435, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel
# Load the two tokenizers: tokenizer1 is the one being extended, tokenizer2
# supplies the extra vocabulary.
tokenizer1 = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer2 = AutoTokenizer.from_pretrained("roberta-base")

print("Before adding roberta:", len(tokenizer1))

# Compare against tokenizer1's own vocabulary. The earlier code referenced
# `tokenizer`, a name this snippet never defines.
tokens_in_roberta_not_in_bert = set(tokenizer2.vocab).difference(tokenizer1.vocab)
tokenizer1.add_tokens(list(tokens_in_roberta_not_in_bert))

print("After adding roberta:", len(tokenizer1))

model = AutoModel.from_pretrained("bert-base-multilingual-cased")
# Resize the embeddings to the extended tokenizer1, so that every added token
# id has a row in the embedding matrix.
model.resize_token_embeddings(len(tokenizer1))
[出]:
Before adding roberta: 119547
After adding roberta: 162769
要回答原始帖子(OP)中所需输出的问题:
from transformers import AutoTokenizer
# Start from the pretrained RoBERTa tokenizer.
old_tokenizer_1 = AutoTokenizer.from_pretrained('roberta-base')

# Add the two strings as new tokens on the existing tokenizer.
added_tokens = ['foobarzz', 'helloworld']
old_tokenizer_1.add_tokens(added_tokens)

# Encode the same two strings to show the ids they now map to.
print(old_tokenizer_1(['foobarzz', 'helloworld']))
[出]:
{'input_ids': [[0, 50265, 2], [0, 50266, 2]], 'attention_mask': [[1, 1, 1], [1, 1, 1]]}
我在另一个更大的数据集上尝试了上面的
Extending existing AutoTokenizer with new bpe-tokenized tokens
方法。我尝试用 10 万行文本进行训练。训练时间太长,Colab 会话在训练完成之前就崩溃了。我使用的代码与您上面发布的完全相同。代码如下:
# Restored to valid Python: the keywords, identifiers and quotes of this
# snippet had been rewritten by machine translation.
from datasets import load_dataset
from transformers import AutoTokenizer

model_type = "mistralai/mistral-7b-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_type)

# Tokenizer length and a sample encoding before training.
print(len(tokenizer))
print(tokenizer("నేను బాగున్నాను. మీరు ఏలా ఉన్నారు?"))

dataset = load_dataset("ai4bharat/sangraha", data_files=["verified/tel/data-0.parquet"], split="train")
# Lazily read the 'text' field of the first 100000 rows.
am_train = iter(dataset[i]['text'] for i in range(100000))

new_tokenizer = tokenizer.train_new_from_iterator(am_train, vocab_size=16_000)
您认为哪里出了问题?
train_new_from_iterator
方法很麻烦,并且无论我的数据使用什么类型的生成器对象,都会反复使我的会话崩溃。