Cosine similarity can be computed with numpy as follows:
import numpy as np

def cos_sim(vector1, vector2):
    # Cosine similarity: dot product divided by the product of the two vector norms.
    vector1 = np.mat(vector1)
    vector2 = np.mat(vector2)
    sim = float(vector1 * vector2.T) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    return sim
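As a quick sanity check, the function can be applied to a few toy vectors (the vectors below are made up purely for illustration and are not taken from any trained model), reusing the cos_sim defined above:

# Toy vectors for illustration only.
a = [1.0, 2.0, 3.0]
b = [2.0, 4.0, 6.0]    # same direction as a
c = [-1.0, -2.0, -3.0] # opposite direction to a

print(cos_sim(a, b))  # ≈ 1.0
print(cos_sim(a, c))  # ≈ -1.0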
Below, the Four Great Classical Novels are used as the corpus to train Chinese word vectors with word2vec, and then the similarity between any two word vectors is computed. The code is as follows:
Corpus download reference: http://www.k6k4.com/blog/show/aaaudtlar1618752462990
# coding=utf-8
import pickle
import time

import numpy as np
from gensim.models import word2vec
from gensim.test.utils import get_tmpfile


def pre_handle_data():
    # Drop the first field of each line and keep the remaining space-separated tokens.
    with open("/home/train4.txt", 'r', encoding='utf8') as f:
        with open("data/cop.txt", 'w', encoding='utf8') as f1:
            for line in f:
                line = line.strip()
                tokens = line.split(' ')[1:]
                f1.write(' '.join(tokens))
                f1.write('\n')


def train():
    # Preprocess the raw corpus file
    pre_handle_data()
    # Load the corpus, one tokenized sentence per line
    sentences = word2vec.LineSentence('data/cop.txt')
    # Train the word2vec model
    t0 = time.time()
    get_tmpfile("word2vec.model")
    model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=10, size=256, workers=8)
    model.save("data/word2vec.model")
    print("finished train, time=>{} seconds".format(time.time() - t0))
    # finished train, time=>71.40878009796143 seconds

    # Save the word -> word-vector mapping
    index2word = model.wv.index2word
    word_vector_json = {}
    for index, word in enumerate(index2word):
        word_vector_json[word] = model.wv.vectors[index]
    with open('data/model.pk', 'wb') as f:
        pickle.dump(word_vector_json, f)


def cos_sim(vector1, vector2):
    vector1 = np.mat(vector1)
    vector2 = np.mat(vector2)
    sim = float(vector1 * vector2.T) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))
    return sim


def test():
    model = word2vec.Word2Vec.load("data/word2vec.model")
    similarity = model.wv.similarity('曹操', '司马懿')
    print(similarity)  # 0.8013452
    vector1 = model.wv['曹操']
    vector2 = model.wv['司马懿']
    print(cos_sim(vector1, vector2))  # 0.8013452273988938


# train()
test()
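The pickle file written by train() stores the plain word -> vector mapping, so the saved vectors can be reused later without loading the full gensim model. A minimal sketch, assuming train() has already run and produced data/model.pk:

import pickle

import numpy as np

def cos_sim(vector1, vector2):
    vector1 = np.mat(vector1)
    vector2 = np.mat(vector2)
    return float(vector1 * vector2.T) / (np.linalg.norm(vector1) * np.linalg.norm(vector2))

# Load the word -> vector mapping saved by train().
with open('data/model.pk', 'rb') as f:
    word_vector_json = pickle.load(f)

# Similarity computed from the pickled vectors should match the value
# returned by the gensim model for the same word pair.
print(cos_sim(word_vector_json['曹操'], word_vector_json['司马懿']))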