Gensim LDA：如何获取每个主题中各个单词的数量或频率？

如何解决：Gensim LDA 中如何获取每个主题中各个单词的数量或频率？

我想获取每个主题中的词数或词频。

下面是我用来训练模型并进行可视化的代码。

def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
    """Train one LDA model per candidate topic count and score each with c_v coherence.

    Candidate topic counts are ``start, start + step, ...`` up to (but not
    including) ``limit``.

    Args:
        dictionary: Passed to ``LdaModel`` as ``id2word`` and to
            ``CoherenceModel`` as ``dictionary``.
        corpus: Passed to ``LdaModel`` as the training corpus.
        texts: Passed to ``CoherenceModel`` as ``texts``.
        limit: Exclusive upper bound on the number of topics to try.
        start: First number of topics to try.
        step: Increment between consecutive candidate topic counts.

    Returns:
        A ``(model_list, coherence_values)`` tuple of parallel lists: the
        trained models and their coherence scores, in candidate order. Both
        lists are empty when ``start >= limit``.
    """
    model_list = []
    coherence_values = []
    # ``limit`` must bound the range; iterating ``range(start, step)`` would
    # silently ignore it and treat ``step`` as the upper bound instead.
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=num_topics)
        model_list.append(model)
        coherence_model = CoherenceModel(
            model=model,
            texts=texts,
            dictionary=dictionary,
            coherence='c_v',
        )
        coherence_values.append(coherence_model.get_coherence())

    return model_list, coherence_values

def find_optimal_number_of_topics(dictionary, processed_data, corpus=None):
    """Plot the c_v coherence score against the number of LDA topics.

    Args:
        dictionary: Dictionary handed to ``compute_coherence_values``; also
            used to build the bag-of-words corpus when ``corpus`` is omitted.
        processed_data: Tokenized documents (one list of tokens per document).
        corpus: Optional bag-of-words corpus. When ``None`` it is derived from
            ``dictionary`` and ``processed_data`` so the function does not
            depend on a module-level ``corpus`` variable.
    """
    limit = 40
    start = 2
    step = 6

    if corpus is None:
        corpus = [dictionary.doc2bow(text) for text in processed_data]

    model_list, coherence_values = compute_coherence_values(
        dictionary=dictionary,
        corpus=corpus,
        texts=processed_data,
        start=start,
        limit=limit,
        step=step,
    )

    # Read the x-axis off the trained models so it always has the same length
    # as ``coherence_values`` and shows the topic counts that were really used.
    x = [model.num_topics for model in model_list]
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    # The trailing comma makes this a one-element tuple; a bare parenthesized
    # string would be iterated character by character, labelling the line "c".
    plt.legend(("coherence_values",), loc='best')
    plt.show()

if __name__ == '__main__':
    # Each line of the CSV is treated as one document whose tokens are
    # separated by commas. The ``with`` block closes the file once it is read.
    with open('./data/tokenized_data.csv', 'r', encoding='utf-8') as data_file:
        processed_data = [sent.strip().split(",") for sent in tqdm(data_file.readlines())]

    dictionary = corpora.Dictionary(processed_data)

    # Prune the vocabulary before building the bag-of-words corpus.
    dictionary.filter_extremes(no_below=10, no_above=0.05)
    corpus = [dictionary.doc2bow(text) for text in processed_data]
    print('Number of unique tokens: %d' % len(dictionary))
    print('Number of documents: %d' % len(corpus))

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    find_optimal_number_of_topics(dictionary, processed_data)

if __name__ == '__main__':
    # NOTE(review): the original statement here was truncated (unbalanced
    # brackets) and the lines defining ``dictionary`` and ``corpus`` were lost.
    # The preprocessing below, including the ``filter_extremes`` thresholds,
    # is a reconstruction -- confirm against the real script.
    with open('/Users/pc/Downloads/data/tokenized_data.csv', 'r', encoding='utf-8') as data_file:
        processed_data = [sent.strip().split(",") for sent in tqdm(data_file.readlines())]

    dictionary = corpora.Dictionary(processed_data)
    dictionary.filter_extremes(no_below=10, no_above=0.05)
    corpus = [dictionary.doc2bow(text) for text in processed_data]

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # Training callbacks that report perplexity and u_mass coherence.
    perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
    coherence_logger = CoherenceMetric(corpus=corpus, coherence="u_mass", logger='shell')

    # ``id2word`` lets the model report topics as words instead of integer ids.
    lda_model = LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=5,
        passes=30,
        callbacks=[coherence_logger, perplexity_logger],
    )

    topics = lda_model.print_topics(num_words=5)
    for topic in topics:
        print(topic)

    # c_v is computed from the tokenized texts, so ``texts`` must be supplied.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        texts=processed_data,
        dictionary=dictionary,
        coherence='c_v',
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score (c_v): ', coherence_lda)

    # u_mass is computed from the bag-of-words corpus, so ``corpus`` must be supplied.
    coherence_model_lda = CoherenceModel(
        model=lda_model,
        corpus=corpus,
        dictionary=dictionary,
        coherence="u_mass",
    )
    coherence_lda = coherence_model_lda.get_coherence()
    print('\nCoherence Score (u_mass): ', coherence_lda)

    # Persist the corpus, dictionary and model; ``with`` closes the pickle file.
    with open('/Users/pc/Downloads/data/lda_corpus.pkl', 'wb') as corpus_file:
        pickle.dump(corpus, corpus_file)
    dictionary.save('/Users/pc/Downloads/data/lda_dictionary.gensim')
    lda_model.save('/Users/pc/Downloads/data/lda_model.gensim')

    # ``prepare`` takes the model, the corpus and the dictionary, in that order.
    lda_visualization = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=False)
    pyLDAvis.save_html(lda_visualization, '/Users/pc/Downloads/data/lda.html')
    pyLDAvis.show(lda_visualization)

理想的输出如下：

主题 1. word 1: 123 word 2: 323 word 3: 123 word 4: 322

主题 2. word 1: 133 word 5: 103 word 8: 313 word 4: 232

Gensim LDA：如何获取每个主题中各个单词的数量或频率？

如何解决：Gensim LDA 中如何获取每个主题中各个单词的数量或频率？

相关推荐