NLTK 有如下词性标注器:
"""Compare several NLTK part-of-speech taggers on the Brown 'news' category.

Each section builds one tagger and prints its accuracy. The numbers in the
trailing comments are the outputs recorded by the original author.
"""
import nltk
from nltk.corpus import brown

# Replace NLTK's data search path with this single local directory.
# A raw string keeps every backslash literal; the previous non-raw literal
# contained the invalid escape sequence "\p".
nltk.data.path = [r"C:\nltk_data\nltk_data-gh-pages\packages"]


def _score(tagger, gold):
    """Return the accuracy of *tagger* on the gold-standard sentences *gold*.

    Uses ``accuracy`` when the tagger has it and falls back to ``evaluate``
    otherwise, so the script runs on NLTK releases with either method.
    """
    accuracy = getattr(tagger, "accuracy", None)
    if accuracy is not None:
        return accuracy(gold)
    return tagger.evaluate(gold)


tagged_words = brown.tagged_words(categories='news')
# len(tagged_words) -> 100554
tagged_sents = brown.tagged_sents(categories='news')
# len(tagged_sents) -> 4623

# Default (constant) tagger: every token receives the same tag.
tagger = nltk.DefaultTagger('NN')
print(_score(tagger, tagged_sents))  # 0.13089484257215028

# Regular-expression tagger: the first matching pattern decides the tag, so
# the catch-all '.*' rule must stay last.
# NOTE(review): the '.' in the number pattern is unescaped, so it accepts any
# single separator character (e.g. "1,000"), not only a decimal point. It is
# kept as-is because the recorded accuracy below was produced with it.
patterns = [
    (r'.*ing$', 'VBG'),
    (r'.*ed$', 'VBD'),
    (r'.*es$', 'VBZ'),
    (r'.*ould$', 'MD'),
    (r'.*\'s$', 'NN$'),
    (r'.*s$', 'NNS'),
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),
    (r'.*', 'NN'),
]
tagger = nltk.RegexpTagger(patterns)
print(_score(tagger, tagged_sents))  # 0.20326391789486245

# Hold out the last 20% of the sentences for testing the trained taggers.
train_size = int(len(tagged_sents) * 0.8)
print(train_size)  # 3698
train_sents = tagged_sents[:train_size]
test_sents = tagged_sents[train_size:]

# Unigram tagger.
tagger = nltk.UnigramTagger(train_sents)
print(_score(tagger, test_sents))  # 0.8026879907509996

# Bigram tagger on its own (no backoff); note the much lower recorded score.
tagger = nltk.BigramTagger(train_sents)
print(_score(tagger, test_sents))  # 0.09186376993111421

# Combined taggers: bigram backs off to unigram, which backs off to the
# constant 'NN' tagger.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
print(_score(t2, test_sents))  # 0.8360711016908329