语料下载
四大名著:
http://www.k6k4.com/resource/detail/aaskhxdgq1580644421997
停用词:
http://www.k6k4.com/resource/detail/aascdcaow1580131106546
# coding=utf-8
"""Train fastText skip-gram word vectors on a stop-word-filtered corpus.

Pipeline:
  1. Load the stop-word list.
  2. Strip stop words from the whitespace-segmented training corpus and
     write the result to a new file.
  3. Train an unsupervised skip-gram model on the cleaned corpus and save it.
  4. Reload the model and print the pairwise cosine distances between the
     vectors of three words.
"""
from typing import Collection, List, Set

STOP_WORDS_PATH = '哈工大停用词表.txt'
RAW_CORPUS_PATH = 'train.txt'
CLEAN_CORPUS_PATH = 'train1.txt'
MODEL_PATH = 'model.bin'
QUERY_WORDS = ('贾母', '宝玉', '黛玉')


def load_stop_words(path: str) -> Set[str]:
    """Return the set of stop words stored one per line in *path*.

    Surrounding whitespace is stripped from every line. A set is returned so
    that membership tests during filtering are O(1).
    """
    with open(path, 'r', encoding='utf-8') as f:
        return {line.strip() for line in f}


def filter_tokens(line: str, stop_words: Collection[str]) -> List[str]:
    """Split one raw corpus line on single spaces and drop stop words.

    The first field and the last two fields of the line are discarded before
    filtering; the final field is the one that carries the line terminator.
    NOTE(review): presumably the first field is a label/ID column and the
    trailing fields are padding -- confirm against the train.txt format.
    """
    tokens = line.split(' ')[1:-2]
    return [token for token in tokens if token not in stop_words]


def write_filtered_corpus(src_path: str, dst_path: str,
                          stop_words: Collection[str]) -> None:
    """Write a stop-word-free copy of *src_path* to *dst_path*.

    One output line is produced per input line (possibly empty), with the
    remaining tokens joined by single spaces. The input is streamed line by
    line, so *src_path* and *dst_path* must be different files.
    """
    with open(src_path, 'r', encoding='utf-8') as src, \
            open(dst_path, 'w', encoding='utf-8') as dst:
        for line in src:
            # Join first, then add the newline: appending '\n' to the token
            # list before joining would leave a stray space at line end.
            dst.write(' '.join(filter_tokens(line, stop_words)) + '\n')


def main() -> None:
    """Run the full preprocessing, training and comparison pipeline."""
    # Imported here rather than at module level so the pure-text helpers
    # above stay importable on machines without fastText / scikit-learn.
    import fasttext
    from sklearn.metrics.pairwise import cosine_distances

    stop_words = load_stop_words(STOP_WORDS_PATH)
    write_filtered_corpus(RAW_CORPUS_PATH, CLEAN_CORPUS_PATH, stop_words)

    model = fasttext.train_unsupervised(CLEAN_CORPUS_PATH, model='skipgram')
    model.save_model(MODEL_PATH)

    # Reload from disk to show that the saved model is usable on its own.
    model = fasttext.load_model(MODEL_PATH)
    vectors = [model[word] for word in QUERY_WORDS]

    # cosine_distances returns distances, not similarities: 0 means the
    # vectors point the same way, so smaller values mean "more similar".
    distances = cosine_distances(vectors)
    print(distances)
    # Example output (rows/columns follow QUERY_WORDS order):
    # [[0.         0.2294364  0.27810735]
    #  [0.2294364  0.         0.12159884]
    #  [0.27810735 0.12159884 0.        ]]


if __name__ == '__main__':
    main()