from tokenizers import Tokenizer, decoders, pre_tokenizers
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer

path = "./"

# Byte-level BPE: the pre-tokenizer splits raw text into byte-level pieces,
# and the matching decoder merges tokens back into readable text.
tokenizer = Tokenizer(BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
tokenizer.decoder = decoders.ByteLevel(add_prefix_space=True, trim_offsets=True, use_regex=True)

trainer = BpeTrainer(
    special_tokens=["<s>", "<pad>", "</s>", "<unk>"],  # add "<mask>" here if needed
    show_progress=True,
    # min_frequency=100,
    # vocab_size=10,
)
tokenizer.train(files=[path + "spacemath.txt"], trainer=trainer)
print("Trained vocab size: {}".format(tokenizer.get_vocab_size()))

tokenizer.save(path="tokenizer.json", pretty=True)
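
# A quick sanity check (a hedged sketch): reload the saved file with the native
# tokenizers API and verify that byte-level decoding round-trips the input.
reloaded = Tokenizer.from_file("tokenizer.json")
enc = reloaded.encode("asd123")
print(enc.tokens, enc.ids)       # byte-level tokens and their IDs
print(reloaded.decode(enc.ids))  # special tokens are skipped; should print "asd123"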
from transformers import PreTrainedTokenizerFast

# Load the trained tokenizer through the transformers fast-tokenizer wrapper
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")

# Check the special token IDs
print("ID for '<s>':", tokenizer.convert_tokens_to_ids("<s>"))
print("ID for '</s>':", tokenizer.convert_tokens_to_ids("</s>"))
print(tokenizer.encode("asd123"))
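
# A hedged variant (an assumption, not from the original): passing the special
# tokens explicitly lets the transformers API expose them as bos/eos/pad/unk.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="<s>",
    eos_token="</s>",
    pad_token="<pad>",
    unk_token="<unk>",
)
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)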