代码如下:
# coding=utf-8
"""Load a Chinese BERT checkpoint and extract token / sentence embeddings.

Encodes the sentence "我爱中国" ("I love China"), runs one forward pass
through BERT, and pulls out the last hidden state (one 768-d vector per
token) and the pooled sentence vector.
"""
import torch
from transformers import BertTokenizer, BertModel

# Local directory holding the 12-layer, 768-hidden Chinese BERT checkpoint
# (hoisted so the path is written once, not twice).
MODEL_DIR = "D:/data/transformers/chinese_L-12_H-768_A-12"

tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertModel.from_pretrained(MODEL_DIR)
model.eval()  # inference only: disable dropout for deterministic outputs

# encode() adds the special [CLS]/[SEP] tokens around the sentence.
token_ids = tokenizer.encode("我爱中国")
print(token_ids)
# [101, 2769, 4263, 704, 1744, 102]

tokens = tokenizer.decode(token_ids=token_ids)
print(tokens)
# [CLS] 我 爱 中 国 [SEP]

# Add a batch dimension: (seq_len,) -> (1, seq_len).
input_ids = torch.tensor(token_ids).unsqueeze(0)
print(input_ids)
# tensor([[ 101, 2769, 4263, 704, 1744, 102]])

with torch.no_grad():  # no gradient bookkeeping needed for inference
    outputs = model(input_ids)

# The last hidden state is the first element of the model output;
# the pooled ([CLS]-derived) sentence vector is the second.
sequence_output = outputs[0]  # per-token embeddings
pooled_output = outputs[1]    # sentence embedding
print(sequence_output.shape)  # torch.Size([1, 6, 768])
print(pooled_output.shape)    # torch.Size([1, 768])

# Positions 1 and 2 are the first two real characters (position 0 is [CLS]).
vector1 = sequence_output[:, 1].squeeze(0)  # embedding of '我'
vector2 = sequence_output[:, 2].squeeze(0)  # embedding of '爱'