基于四大名著训练中文词向量

四大名著语料下载:

http://www.k6k4.com/resource/detail/aaskhxdgq1580644421997

停用词下载:

http://www.k6k4.com/resource/detail/aascdcaow1580131106546

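中文词向量训练代码如下(基于 gensim 3.x 的 API;在 gensim 4.x 中,参数 `size` 已更名为 `vector_size`,属性 `index2word` 已更名为 `index_to_key`):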

# coding=utf-8
import pickle
import time

from gensim.models import word2vec
from gensim.test.utils import get_tmpfile


def pre_handle_data(src_path="D:/data/corp/train4.txt", dst_path="data/cop.txt"):
    """Drop the first space-separated field of every line and write the rest.

    Each line of ``src_path`` is stripped of surrounding whitespace and split
    on single spaces. The first field is discarded (presumably a label or id
    column -- confirm against the corpus format) and the remaining tokens are
    re-joined with single spaces and written as one line of ``dst_path``.
    A line with a single field therefore becomes an empty output line.

    Args:
        src_path: UTF-8 text file to read. Defaults to the corpus location
            that used to be hard-coded here.
        dst_path: UTF-8 text file to create or overwrite. Its parent
            directory is created when it does not exist yet.
    """
    import os

    with open(src_path, 'r', encoding='utf8') as src:
        # Create the output directory only after the input opened
        # successfully, so a missing corpus leaves nothing behind on disk.
        out_dir = os.path.dirname(dst_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

        with open(dst_path, 'w', encoding='utf8') as dst:
            for line in src:
                tokens = line.strip().split(' ')[1:]
                dst.write(' '.join(tokens) + '\n')


def train():
    """Train a word2vec model on the preprocessed corpus and persist it.

    Steps:
      1. Run ``pre_handle_data()`` to produce ``data/cop.txt``.
      2. Train a ``Word2Vec`` model over that file via ``LineSentence``.
      3. Save the full model to ``data/word2vec.model``.
      4. Pickle a ``{word: vector}`` dict to ``data/model.pk``.

    NOTE: this uses the gensim 3.x names (``size=``, ``wv.index2word``);
    gensim 4 renamed them to ``vector_size=`` and ``wv.index_to_key``.
    """
    # Preprocess the raw corpus
    pre_handle_data()

    # Load the corpus
    sentences = word2vec.LineSentence('data/cop.txt')

    # Train and save the model; the reported time covers both steps
    t0 = time.time()
    model = word2vec.Word2Vec(sentences, hs=1, min_count=1, window=10, size=256, workers=8)
    model.save("data/word2vec.model")
    print("finished train, time=>{} seconds".format(time.time() - t0))
    # Sample output: finished train, time=>71.40878009796143 seconds

    # Keep the word -> vector mapping; index2word and vectors share row order
    vectors = model.wv.vectors
    word_vector_json = {
        word: vectors[index]
        for index, word in enumerate(model.wv.index2word)
    }

    with open('data/model.pk', 'wb') as f:
        pickle.dump(word_vector_json, f)


def test():
    """Load the saved model and print a few sanity-check queries.

    Prints, in order: the 10 words most similar to '曹操', the similarity
    between '曹操' and '司马懿', the vector for '曹操', the vocabulary size,
    a slice of the vocabulary, and the shape of the vector matrix. The
    comments after each query show sample output from one training run.
    """
    model = word2vec.Word2Vec.load("data/word2vec.model")
    for key in model.wv.similar_by_word('曹操', topn=10):
        print(key)
    # ('孙权', 0.8790037631988525)
    # ('孙策', 0.8632506728172302)
    # ('袁绍', 0.8599162697792053)
    # ('吕布', 0.8457263708114624)
    # ('刘备', 0.8314833045005798)
    # ('东吴', 0.8303385376930237)
    # ('江东', 0.8191568851470947)
    # ('刘璋', 0.8189404010772705)
    # ('袁术', 0.8098791837692261)
    # ('马超', 0.8001011610031128)

    # Query through model.wv like the other lookups here; the model-level
    # similarity() shim is deprecated in gensim 3.x and removed in 4.0.
    similarity = model.wv.similarity('曹操', '司马懿')
    print(similarity)
    # 0.7981173

    vector = model.wv['曹操']
    print(vector)

    index2word = model.wv.index2word
    print(len(index2word))
    # 111404
    print(index2word[100:110])
    # ['至', '不知', '这个', '大', '贾母', '你们', '凤姐', '可', '杀', '打']

    print(model.wv.vectors.shape)
    # (111404, 256)


# Script entry point: train and save the model, then run the demo queries on it.
train()
test()

个人资料
hadoop迷
等级:6
文章:30篇
访问:2.2w
排名: 13
上一篇: tensorflow2.0 手写数字识别训练、预测
下一篇:词向量应用一:相似度计算
标签: 词向量、gensim、面试题
隐藏