Gensim LDA:如何获取每个主题中各单词的数量或频率?
我想获取每个主题中的词数或词频。
下面是我用来生成模型并进行可视化的代码。
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Args:
        dictionary: Gensim dictionary, passed to ``LdaModel`` as ``id2word``
            and to ``CoherenceModel`` as ``dictionary``.
        corpus: Bag-of-words corpus the models are trained on.
        texts: Tokenized documents handed to ``CoherenceModel``.
        limit: Exclusive upper bound on the number of topics.
        start: First number of topics to try.
        step: Increment between successive topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple of parallel lists, one
        entry per topic count in ``range(start, limit, step)``.
    """
    model_list = []
    coherence_values = []
    # ``limit`` is the stop value and ``step`` only the stride; using ``step``
    # as the stop value would silently ignore ``limit``.
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherencemodel = CoherenceModel(
            model=model, texts=texts, dictionary=dictionary, coherence='c_v'
        )
        coherence_values.append(coherencemodel.get_coherence())
    return model_list, coherence_values


def find_optimal_number_of_topics(dictionary, processed_data):
    """Plot c_v coherence against the number of topics for a sweep of LDA models.

    Args:
        dictionary: Gensim dictionary used to build the bag-of-words corpus
            and forwarded to ``compute_coherence_values``.
        processed_data: Tokenized documents (one list of tokens per document).
    """
    limit = 40
    start = 2
    step = 6
    # Built here instead of read from a module-level ``corpus`` so the
    # function does not depend on a global that only exists under __main__.
    corpus = [dictionary.doc2bow(text) for text in processed_data]
    _models, coherence_values = compute_coherence_values(
        dictionary=dictionary,
        corpus=corpus,
        texts=processed_data,
        start=start,
        limit=limit,
        step=step,
    )
    # Must be the same range the sweep iterated over, otherwise the x and y
    # series have different lengths.
    x = range(start, limit, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # The trailing comma makes this a one-element tuple; a bare parenthesized
    # string would be consumed character by character as labels.
    plt.legend(("coherence_values",), loc='best')
    plt.show()
if __name__ == '__main__':
    # Each line of the CSV is one document whose tokens are comma-separated.
    # The context manager closes the file handle once the lines are read.
    with open('./data/tokenized_data.csv', 'r', encoding='utf-8') as token_file:
        processed_data = [
            sent.strip().split(",") for sent in tqdm(token_file.readlines())
        ]

    dictionary = corpora.Dictionary(processed_data)
    # Keep tokens found in at least 10 documents and in at most 5% of them.
    dictionary.filter_extremes(no_below=10, no_above=0.05)
    corpus = [dictionary.doc2bow(text) for text in processed_data]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    find_optimal_number_of_topics(dictionary, processed_data)
if __name__ == '__main__':
    # NOTE(review): the original loading statement was truncated mid-call and
    # the dictionary/corpus setup was missing even though both names are used
    # below. The steps up to logging.basicConfig are reconstructed from the
    # coherence-sweep script in this file -- confirm the filter thresholds.
    with open('/Users/pc/Downloads/data/tokenized_data.csv', 'r', encoding='utf-8') as token_file:
        processed_data = [
            sent.strip().split(",") for sent in tqdm(token_file.readlines())
        ]
    dictionary = corpora.Dictionary(processed_data)
    dictionary.filter_extremes(no_below=10, no_above=0.05)
    corpus = [dictionary.doc2bow(text) for text in processed_data]
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Callbacks report perplexity and u_mass coherence to the log during training.
    perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
    coherence_logger = CoherenceMetric(corpus=corpus, coherence="u_mass", logger='shell')
    # id2word lets the model print real tokens instead of integer ids and
    # lets CoherenceModel recover the dictionary from the model.
    lda_model = LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=5,
        passes=30,
        callbacks=[coherence_logger, perplexity_logger],
    )

    topics = lda_model.print_topics(num_words=5)
    for topic in topics:
        print(topic)

    # c_v is computed from the tokenized texts, so they must be supplied.
    coherence_model_lda = CoherenceModel(
        model=lda_model, texts=processed_data, dictionary=dictionary, coherence='c_v'
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score (c_v): ', coherence_lda)

    # u_mass is computed from the bag-of-words corpus.
    coherence_model_lda = CoherenceModel(
        model=lda_model, corpus=corpus, dictionary=dictionary, coherence="u_mass"
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score (u_mass): ', coherence_lda)

    # Persist corpus, dictionary and model; the context manager closes the
    # pickle file handle.
    with open('/Users/pc/Downloads/data/lda_corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save('/Users/pc/Downloads/data/lda_dictionary.gensim')
    lda_model.save('/Users/pc/Downloads/data/lda_model.gensim')

    # prepare() takes (topic_model, corpus, dictionary); the corpus argument
    # is required.
    lda_visualization = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)
    pyLDAvis.save_html(lda_visualization, '/Users/pc/Downloads/data/lda.html')
    pyLDAvis.show(lda_visualization)
理想的输出如下:
主题 1. word 1: 123 word 2: 323 word 3: 123 word 4: 322
主题 2. word 1: 133 word 5: 103 word 8: 313 word 4: 232
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。