四大名著语料下载:
http://www.k6k4.com/resource/detail/aaskhxdgq1580644421997
停用词下载:
http://www.k6k4.com/resource/detail/aascdcaow1580131106546
中文词向量训练代码如下:
# coding=utf-8
# Train a word2vec model on a pre-tokenised Chinese corpus with gensim,
# save the model plus a pickled {word: vector} mapping, then print a few
# sanity checks against the saved model.
#
# NOTE: this script targets the gensim 3.x API (`size=` and
# `wv.index2word`). gensim 4.x renamed these to `vector_size=` and
# `wv.index_to_key`; adjust both if you upgrade.
import os
import pickle
import time

from gensim.models import word2vec
from gensim.test.utils import get_tmpfile  # noqa: F401  (unused here; kept so the module's names are unchanged)

# Locations used by the pipeline. Kept in one place so the preprocessing,
# training and inspection steps cannot drift apart.
RAW_CORPUS_PATH = "D:/data/corp/train4.txt"
CORPUS_PATH = "data/cop.txt"
MODEL_PATH = "data/word2vec.model"
VECTORS_PATH = "data/model.pk"


def pre_handle_data(src_path=RAW_CORPUS_PATH, dst_path=CORPUS_PATH):
    """Copy the raw corpus to ``dst_path`` without each line's first field.

    Every input line is stripped, split on single spaces, and written back
    space-joined with its first field removed (presumably a label/id
    column -- confirm against the corpus format).

    Args:
        src_path: UTF-8 text file to read, one sentence per line.
        dst_path: UTF-8 text file to (over)write.

    Raises:
        OSError: if ``src_path`` cannot be read or ``dst_path`` cannot be
            written.
    """
    # Create the output directory up front so a fresh checkout does not
    # fail with FileNotFoundError on the first run.
    dst_dir = os.path.dirname(dst_path)
    if dst_dir:
        os.makedirs(dst_dir, exist_ok=True)

    with open(src_path, 'r', encoding='utf8') as src, \
            open(dst_path, 'w', encoding='utf8') as dst:
        for line in src:
            tokens = line.strip().split(' ')[1:]
            dst.write(' '.join(tokens) + '\n')


def train():
    """Preprocess the corpus, train word2vec, and persist the results.

    Writes the gensim model to ``MODEL_PATH`` and a pickled
    ``{word: vector}`` dict to ``VECTORS_PATH``, and prints the elapsed
    training time.
    """
    # Preprocess the data.
    pre_handle_data()

    # Load the corpus (one whitespace-tokenised sentence per line).
    sentences = word2vec.LineSentence(CORPUS_PATH)

    # Train on the corpus. The original also called
    # get_tmpfile("word2vec.model") here and discarded the result; the
    # model is saved to MODEL_PATH, so that call was dropped.
    t0 = time.time()
    model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=10, size=256, workers=8)
    model.save(MODEL_PATH)
    print("finished train, time=>{} seconds".format(time.time() - t0))
    # Sample output: finished train, time = > 71.40878009796143 seconds

    # Keep the word -> vector mapping so it can be used without gensim.
    vectors = model.wv.vectors
    word_vector_json = {
        word: vectors[index]
        for index, word in enumerate(model.wv.index2word)
    }
    with open(VECTORS_PATH, 'wb') as f:
        pickle.dump(word_vector_json, f)


def test():
    """Load the saved model and print a few sanity checks.

    Prints the 10 nearest neighbours of one word, a pairwise similarity,
    one raw vector, and the vocabulary size / sample / matrix shape. The
    trailing comments record the output of the original author's run.
    """
    model = word2vec.Word2Vec.load(MODEL_PATH)

    for key in model.wv.similar_by_word('曹操', topn=10):
        print(key)
    # ('孙权', 0.8790037631988525)
    # ('孙策', 0.8632506728172302)
    # ('袁绍', 0.8599162697792053)
    # ('吕布', 0.8457263708114624)
    # ('刘备', 0.8314833045005798)
    # ('东吴', 0.8303385376930237)
    # ('江东', 0.8191568851470947)
    # ('刘璋', 0.8189404010772705)
    # ('袁术', 0.8098791837692261)
    # ('马超', 0.8001011610031128)

    # Go through `model.wv` like every other lookup here: the model-level
    # `similarity` shim is deprecated and absent from gensim 4.
    similarity = model.wv.similarity('曹操', '司马懿')
    print(similarity)  # 0.7981173

    vector = model.wv['曹操']
    print(vector)

    index2word = model.wv.index2word
    print(len(index2word))  # 111404
    print(index2word[100:110])
    # ['至', '不知', '这个', '大', '贾母', '你们', '凤姐', '可', '杀', '打']
    print(model.wv.vectors.shape)  # (111404, 256)


# Only train/evaluate when run as a script, so importing this module
# (e.g. to reuse pre_handle_data) does not start a training run.
if __name__ == "__main__":
    train()
    test()