使用word2vec词向量进行【四大名著】文本分类

本文使用word2vec训练词向量,将句子中每个词的词向量相加作为句子特征,再用LinearSVC进行文本分类,测试集准确率约为91.5%

数据下载:四大名著训练语料

停用词下载:中文常用停用词

代码:

import time
import collections
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsOneClassifier
from sklearn.multiclass import OneVsRestClassifier


def loadStopwords():
    """Return the lines of ``stopwords.txt`` (read as UTF-8) as a list of strings.

    The file is looked up relative to the current working directory, and the
    line terminators are stripped from every entry.
    """
    with open("stopwords.txt", 'r', encoding='utf-8') as stopword_file:
        raw_text = stopword_file.read()
    return raw_text.splitlines()


start = time.time()

# Load the stop words into a set so the per-token membership test below is O(1).
stopwords = set(loadStopwords())

# Raw string: the previous literal relied on the invalid escapes "\d" and "\s",
# which newer Python versions warn about. The resulting path is unchanged.
with open(r'D:\data\story\train.txt', 'r', encoding='utf8') as fr:
    data = fr.read().splitlines()

# Every line is split on single spaces: the first field is stored as the tag,
# the remaining fields are the words of the sentence.
tags = []  # first field of every line
sentences = []  # remaining fields of every line, minus stop words
all_words = []  # every kept word from every line, duplicates included
for item in data:
    ss = item.split(' ')
    tags.append(ss[0])
    valid_words = [x for x in ss[1:] if x not in stopwords]
    sentences.append(valid_words)
    all_words.extend(valid_words)

# Rank the vocabulary by frequency and keep the most frequent 80% of the
# distinct words.
print('total words(没有去重)=>', len(all_words))
# Reference run: 896717 tokens, duplicates included.

counter = collections.Counter(all_words)
# most_common() without an argument returns every (word, count) pair, highest
# count first; ties keep their first-seen order.
counter_pairs = counter.most_common()
words, _ = zip(*counter_pairs)
print('total words=>', len(words))
# Reference run: 110967 distinct words.

reserved_word_count = int(len(words) * 0.8)
words = words[:reserved_word_count]
print('reserved word count=>', len(words))
# Reference run: 88773 words kept.

# Report how long loading and parsing took, then restart the stopwatch.
stop = time.time()
print("parse time: ", str(round(stop - start, 4)), "s")
# Reference run: about 0.61 s.
start = stop

# Train 300-dimensional word vectors; min_count=2 makes gensim ignore words
# whose total frequency is below 2.
# gensim 4.0 renamed the ``size`` keyword to ``vector_size``: try the new name
# first and fall back to the old one so the script runs on either version.
try:
    model = word2vec.Word2Vec(sentences, min_count=2, vector_size=300)
except TypeError:
    model = word2vec.Word2Vec(sentences, min_count=2, size=300)
stop = time.time()
print("finished train word2vec, time=>", str(stop - start), 's')
start = stop

# Sentence vector = sum of the word vectors of the words in the sentence.
features = []
hit_count = 0  # words for which the model has a vector
missing_count = 0  # words the model has no vector for
for sent in sentences:
    # Size the accumulator from the model instead of repeating the literal 300.
    feature = np.zeros(model.vector_size)
    for word in sent:
        try:
            vector = model.wv[word]
        except KeyError:
            missing_count += 1
            continue
        # Count the hit only after the lookup succeeded; counting it before
        # the lookup made every missing word register as a hit as well.
        hit_count += 1
        feature += vector
    features.append(feature)

print("len:", len(features), len(tags))
print('hit count=>', hit_count, ' missing count=>', missing_count)

stop = time.time()
print('time=>', str(stop - start))

# Use the first 70% of the samples for training and the rest for testing.
# NOTE(review): the split follows file order with no shuffling - confirm the
# corpus is not sorted by tag.
num_train = int(len(tags) * 0.7)
train_x, test_x = features[:num_train], features[num_train:]
train_y, test_y = tags[:num_train], tags[num_train:]

# Multi-class strategy: one-vs-one wrapper around a linear SVM.
# Alternative strategy: OneVsRestClassifier(LinearSVC(random_state=0))
clf = OneVsOneClassifier(LinearSVC(random_state=0))
clf.fit(train_x, train_y)
accuracy = clf.score(test_x, test_y)
print("score=>", str(accuracy))
# Reference run: score=> 0.914850821320818

print('classes=>', clf.classes_)
# Reference run: classes=> ['1' '2' '3']

# Print each fitted estimator held by the wrapper.
for estimator in clf.estimators_:
    print(estimator)
# LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
#           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
#           multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
#           verbose=0)
# LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
#           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
#           multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
#           verbose=0)
# LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
#           intercept_scaling=1, loss='squared_hinge', max_iter=1000,
#           multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
#           verbose=0)



个人资料
半世晨晓
等级:5
文章:3篇
访问:1.2k
排名: 31
上一篇: LSTM写诗
下一篇:使用keras【四大名著】文本分类
猜你感兴趣的圈子:
深度学习交流圈
标签: words、train、linearsvc、clf、intercept、面试题
隐藏