# Gensim Word2Vec documentation:
# https://radimrehurek.com/gensim/models/word2vec.html
# Corpus (语料):
# http://www.k6k4.com/resource/detail/aaswfrdtx1605446850024
# Train a Word2Vec model on a jieba-segmented text file and query it.
#
# Pipeline:
#   1. Register a fixed list of names with jieba before segmentation.
#   2. Segment 'in_the_name_of_people.txt' line by line and write the
#      space-separated tokens to 'train.txt' (one sentence per line).
#   3. Train a 100-dimensional Word2Vec model on 'train.txt', save it to
#      'word2vec.model', reload it, and print one vector and its neighbours.

import gensim
import jieba
from gensim.models import word2vec, Word2Vec

# Names registered with jieba. suggest_freq(..., True) asks jieba to tune its
# dictionary frequency for the word; presumably this keeps each name as a
# single token during segmentation -- see the jieba README to confirm.
CHARACTER_NAMES = (
    '沙瑞金', '田国富', '高育良', '侯亮平', '钟小艾', '陈岩石', '欧阳菁',
    '易学习', '王大路', '蔡成功', '孙连城', '季昌明', '丁义珍', '郑西坡',
    '赵东来', '高小琴', '赵瑞龙', '林华华', '陆亦可', '刘新建', '刘庆祝',
)
for name in CHARACTER_NAMES:
    jieba.suggest_freq(name, True)

path = 'in_the_name_of_people.txt'
with open(path, 'r', encoding='utf-8') as f:
    # Keep lines that are non-blank after stripping and whose raw length
    # (including the trailing newline) exceeds 6 characters; each kept line
    # becomes its jieba tokens joined by single spaces.
    lines = [
        ' '.join(jieba.cut(line.strip())) + '\n'
        for line in f
        if line.strip() and len(line) > 6
    ]
print('total line count=>', len(lines))
# total line count=> 2200

with open('train.txt', 'w', encoding='utf-8') as f:
    f.writelines(lines)

sentences = word2vec.LineSentence('train.txt')

# Gensim 4.0 renamed the embedding-dimension keyword from `size` to
# `vector_size`; passing the old name raises TypeError there. Choose the
# keyword from the installed major version so both series work.
dim_keyword = (
    'vector_size' if int(gensim.__version__.split('.')[0]) >= 4 else 'size'
)
model = Word2Vec(
    sentences,
    window=5,
    min_count=1,
    workers=4,
    **{dim_keyword: 100},
)
model.save('word2vec.model')

# Reload from disk so the queries below exercise the saved artefact.
model = Word2Vec.load('word2vec.model')

print(model.wv['侯亮平'])
# Sample output from one run:
# [ 1.2608657 -0.31955615 1.0512999 -1.1955101 -0.54877764 0.8825323
#   -0.23593487 0.38839418 0.1108841 -0.5551717 0.7028596 1.6013402
#   0.26119116 -0.25903744 -0.52801746 1.8191248 0.20933713 -0.58041674
#   -1.2210642 -0.09129222 0.5809172 -0.16032848 -0.16483907 0.14679825
#   -0.7538626 -0.27101424 0.06765628 0.9971095 -1.4045182 -1.1781099
#   0.91175085 -0.8313674 2.1861002 0.17322211 0.44776583 0.42540348
#   -0.9376401 1.3471535 -0.7086678 -1.368492 0.14985737 -1.3741096
#   0.02821825 0.6822765 0.30579555 -0.15688124 1.0605482 -0.45804158
#   -0.27202618 -0.1547584 1.0001445 0.00995962 0.43815503 0.23531151
#   -0.02857795 -1.4710406 0.8649675 0.58276564 0.6302883 -0.65666556
#   1.9223623 0.27569762 0.53792554 0.33289945 -0.8774105 0.3597854
#   -0.3688891 -2.0895743 0.9067872 -0.5675777 -0.19521916 1.053807
#   -1.424331 0.18578833 -1.2480674 -0.8538316 0.5637747 0.73074526
#   -0.3430865 -0.12637 0.7318182 1.6930991 -0.43526727 0.8515712
#   0.69197536 -0.10268717 -1.2695224 -0.5271906 0.77203965 -1.3805364
#   -0.23319757 -0.28349143 -1.3337592 0.10251193 0.6908297 1.1695349
#   -1.1622112 -0.37568846 -1.5370712 -1.588368 ]

print(model.wv.most_similar('侯亮平', topn=10))
# Sample output from one run:
# [('李达康', 0.9998794794082642), ('祁同伟', 0.9998669624328613), ('这个', 0.999819278717041),
#  ('易学习', 0.9998106956481934), ('得', 0.9997933506965637), ('汇报', 0.9997814297676086),
#  ('季昌明', 0.999774694442749), ('地说', 0.9997525215148926), ('要', 0.9997483491897583),
#  ('别', 0.9997408390045166)]
# NOTE(review): every neighbour scores ~0.9998, function words included; the
# model may be under-trained on this corpus -- confirm before relying on
# these rankings.