如何解决NMF分解在python中的实现 Matmul和Nan问题遇到溢出
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt
%matplotlib inline
categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
remove = ('headers','footers','quotes')
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories,remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test',remove=remove)
vectorizer_tfidf = TfidfVectorizer(stop_words='english')
vectors_tfidf = vectorizer_tfidf.fit_transform(newsgroups_train.data).todense() # (documents,vocab)
vectors_tfidf.shape
num_top_words=8
vocab = np.array(vectorizer_tfidf.get_feature_names())
def show_topics(a):
top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
topic_words = ([top_words(t) for t in a])
return [' '.join(t) for t in topic_words]
from sklearn import decomposition
d = 5 # num topics
clf = decomposition.NMF(n_components=d,random_state=1)
W1 = clf.fit_transform(vectors_tfidf)
H1 = clf.components_
show_topics(H1)
根据SE帖子从头开始的NMF。 https://stats.stackexchange.com/questions/351359/deriving-multiplicative-update-rules-for-nmf
V=vectors_tfidf.copy()
m,n=V.shape
k=500
V=vectors_tfidf.copy()
m,n=V.shape
print('V shape',V.shape)
#W =np.random.normal(0,1,(m,k))
W =np.random.rand(m,k)
W =np.abs(W)
print('W shape',W.shape)
#H =np.random.normal(0,(k,n))
H =np.random.rand(k,n)
H =np.abs(H)
print('H shape',H.shape)
num_iterations=10
learning_rate=0.001
eps=0.000000001
for i in range(num_iterations):
#GD of W and H .
W = W -learning_rate * (W @ H @ H.T+ eps - V @ H.T)
H = H -learning_rate * (W.T @ W @ H+ eps - W.T @ V)
error=np.linalg.norm(V-W@H)
print('Error :',error)
两次迭代后出现错误
V shape (2034,26576)
W shape (2034,500)
H shape (500,26576)
Error : 8.581016691579372e+18
Error : 4.342392440036606e+95
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:25: RuntimeWarning: overflow encountered in matmul
Error : inf
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:23: RuntimeWarning: invalid value encountered in matmul
Error : nan
Error : nan
Error : nan
Error : nan
Error : nan
Error : nan
Error : nan
建议 Python Non negative Matrix Factorization that handles both zeros and missing data?似乎无效。我仍然面临着南问题。
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。