本文使用 word2vec 训练词向量,并进行文本分类,准确率为 91.5%。
数据下载:四大名著训练语料
停用词下载:中文常用停用词
代码:
# Train word2vec embeddings on a labelled corpus and classify each sentence
# with a linear SVM.
#
# Every line of the training file is read as ``<tag> <word> <word> ...``
# (single-space separated, already tokenised).
import collections
import inspect
import time

import numpy as np
import pandas as pd  # noqa: F401  (not used below; kept from the original listing)

# gensim and scikit-learn are imported inside the functions that use them so
# the pure-Python helpers below can be imported without those packages.

STOPWORDS_PATH = "stopwords.txt"
# Raw string: the original literal relied on the invalid escapes "\d" and "\s".
TRAIN_PATH = r"D:\data\story\train.txt"
VECTOR_SIZE = 300      # dimensionality of the word vectors / sentence features
MIN_COUNT = 2          # words seen fewer times than this are ignored by word2vec
RESERVED_RATIO = 0.8   # share of the frequency-ranked vocabulary that is reported
TRAIN_RATIO = 0.7      # leading share of the samples used for training


def loadStopwords(path=STOPWORDS_PATH):
    """Return the stop words stored one per line in *path* (UTF-8)."""
    with open(path, 'r', encoding='utf-8') as fr:
        return fr.read().splitlines()


def parse_corpus(lines, stopwords):
    """Split labelled lines into tags and stop-word-filtered sentences.

    Args:
        lines: iterable of strings shaped like ``"<tag> <word> <word> ..."``.
        stopwords: container of words to drop (a ``set`` keeps lookups O(1)).

    Returns:
        ``(tags, sentences)`` where ``tags[i]`` is the first token of the i-th
        non-blank line and ``sentences[i]`` is the list of its remaining words.
    """
    tags = []
    sentences = []
    for line in lines:
        # A blank line would otherwise become a sample with an empty tag.
        if not line.strip():
            continue
        tag, *tokens = line.split(' ')
        tags.append(tag)
        # ``w and ...`` drops the empty strings produced by repeated spaces.
        sentences.append([w for w in tokens if w and w not in stopwords])
    return tags, sentences


def rank_vocabulary(sentences, ratio=RESERVED_RATIO):
    """Rank the distinct words of *sentences* by descending frequency.

    Returns:
        ``(total, ranked, reserved)``: the number of word occurrences, the
        distinct words ordered from most to least frequent (ties keep
        first-seen order), and the leading ``int(len(ranked) * ratio)`` of
        them.  An empty corpus yields ``(0, [], [])``.
    """
    counter = collections.Counter(w for sent in sentences for w in sent)
    ranked = [word for word, _ in counter.most_common()]
    reserved = ranked[:int(len(ranked) * ratio)]
    return sum(counter.values()), ranked, reserved


def train_word2vec(sentences, vector_size=VECTOR_SIZE, min_count=MIN_COUNT):
    """Train and return a gensim ``Word2Vec`` model on *sentences*."""
    from gensim.models import word2vec

    # gensim 4.0 renamed the ``size`` argument to ``vector_size``; choose
    # whichever keyword the installed version accepts.
    params = inspect.signature(word2vec.Word2Vec.__init__).parameters
    size_kw = "vector_size" if "vector_size" in params else "size"
    return word2vec.Word2Vec(sentences, min_count=min_count,
                             **{size_kw: vector_size})


def build_features(sentences, vectors, dim=VECTOR_SIZE):
    """Represent each sentence as the sum of its known word vectors.

    Args:
        sentences: list of word lists.
        vectors: lookup supporting ``vectors[word]`` and raising ``KeyError``
            for unknown words (``model.wv`` in ``main``; a dict also works).
        dim: length of each word vector.

    Returns:
        ``(features, hit_count, missing_count)``.  A sentence with no known
        word gets an all-zero vector.
    """
    features = []
    hit_count = 0
    missing_count = 0
    for sent in sentences:
        feature = np.zeros(dim)
        for word in sent:
            try:
                vector = vectors[word]
            except KeyError:
                missing_count += 1
            else:
                # Count the hit only after the lookup succeeded; the original
                # incremented first, so misses were also counted as hits.
                feature += vector
                hit_count += 1
        features.append(feature)
    return features, hit_count, missing_count


def train_and_score(features, tags, train_ratio=TRAIN_RATIO, one_vs_rest=False):
    """Fit a multiclass LinearSVC on the leading samples and score the rest.

    NOTE: the split is positional (no shuffling) — presumably the input file
    is already shuffled; verify before relying on the score.

    Args:
        features: per-sentence feature vectors.
        tags: class labels aligned with *features*.
        train_ratio: leading share of the samples used for fitting.
        one_vs_rest: wrap the SVM in ``OneVsRestClassifier`` instead of the
            default ``OneVsOneClassifier``.

    Returns:
        ``(clf, score)``: the fitted classifier and its ``score`` on the
        held-out tail.
    """
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.svm import LinearSVC

    num_train = int(len(tags) * train_ratio)
    train_x, test_x = features[:num_train], features[num_train:]
    train_y, test_y = tags[:num_train], tags[num_train:]

    wrapper = OneVsRestClassifier if one_vs_rest else OneVsOneClassifier
    clf = wrapper(LinearSVC(random_state=0))
    clf.fit(train_x, train_y)
    return clf, clf.score(test_x, test_y)


def main():
    """Run the pipeline: parse, train word2vec, build features, classify."""
    start = time.time()

    # Load the stop words into a set for fast membership tests.
    stopwords = set(loadStopwords())
    with open(TRAIN_PATH, 'r', encoding='utf8') as fr:
        data = fr.read().splitlines()
    tags, sentences = parse_corpus(data, stopwords)

    # Vocabulary statistics.  Informational only: the reserved list is printed
    # but never used to filter the sentences.
    total, words, reserved = rank_vocabulary(sentences)
    print('total words(没有去重)=>', total)         # original run: 896717
    print('total words=>', len(words))              # original run: 110967
    print('reserved word count=>', len(reserved))   # original run: 88773
    stop = time.time()
    print("parse time: ", str(round(stop - start, 4)), "s")  # original run: 0.6103 s
    start = stop

    # Train the word vectors.
    model = train_word2vec(sentences)
    stop = time.time()
    print("finished train word2vec, time=>", str(stop - start), 's')
    start = stop

    # Sentence vector = sum of the vectors of the words in the sentence.
    features, hit_count, missing_count = build_features(sentences, model.wv)
    print("len:", len(features), len(tags))
    print('hit count=>', hit_count, ' missing count=>', missing_count)
    stop = time.time()
    print('time=>', str(stop - start))

    clf, score = train_and_score(features, tags)
    print("score=>", str(score))        # original run: 0.914850821320818
    print('classes=>', clf.classes_)    # original run: ['1' '2' '3']
    # One fitted LinearSVC per class pair; the original run printed three
    # estimators, each with the library defaults and random_state=0.
    for item in clf.estimators_:
        print(item)


if __name__ == "__main__":
    main()