NMF分解在python中的实现 Matmul和Nan问题遇到溢出

如何解决NMF分解在python中的实现 Matmul和Nan问题遇到溢出

import numpy as np
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

%matplotlib inline

categories = ['alt.atheism','talk.religion.misc','comp.graphics','sci.space']
remove = ('headers','footers','quotes')
newsgroups_train = fetch_20newsgroups(subset='train',categories=categories,remove=remove)
newsgroups_test = fetch_20newsgroups(subset='test',remove=remove)

vectorizer_tfidf = TfidfVectorizer(stop_words='english')
vectors_tfidf = vectorizer_tfidf.fit_transform(newsgroups_train.data).todense() # (documents,vocab)
vectors_tfidf.shape

num_top_words=8
vocab = np.array(vectorizer_tfidf.get_feature_names())

def show_topics(a):
    top_words = lambda t: [vocab[i] for i in np.argsort(t)[:-num_top_words-1:-1]]
    topic_words = ([top_words(t) for t in a])
    return [' '.join(t) for t in topic_words]
from sklearn import decomposition

d = 5 # num topics
clf = decomposition.NMF(n_components=d,random_state=1)

W1 = clf.fit_transform(vectors_tfidf)
H1 = clf.components_

show_topics(H1)

根据SE帖子从头开始的NMF。 https://stats.stackexchange.com/questions/351359/deriving-multiplicative-update-rules-for-nmf

V=vectors_tfidf.copy()
m,n=V.shape
k=500
V=vectors_tfidf.copy()
m,n=V.shape
print('V shape',V.shape)

#W =np.random.normal(0,1,(m,k))
W =np.random.rand(m,k)
W =np.abs(W)
print('W shape',W.shape)

#H =np.random.normal(0,(k,n))
H =np.random.rand(k,n)
H =np.abs(H)
print('H shape',H.shape)

num_iterations=10
learning_rate=0.001
eps=0.000000001

for i in range(num_iterations):

  #GD of W and H .
  W =  W -learning_rate * (W @ H @ H.T+ eps - V @ H.T)

  H =  H -learning_rate * (W.T @ W @ H+ eps  - W.T @ V)

  error=np.linalg.norm(V-W@H)
  print('Error :',error)

两次迭代后出现错误

V shape (2034,26576)
W shape (2034,500)
H shape (500,26576)
Error : 8.581016691579372e+18
Error : 4.342392440036606e+95
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:25: RuntimeWarning: overflow encountered in matmul
Error : inf
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:23: RuntimeWarning: invalid value encountered in matmul
Error : nan
Error : nan
Error : nan
Error : nan
Error : nan
Error : nan
Error : nan

建议 Python Non negative Matrix Factorization that handles both zeros and missing data?似乎无效。我仍然面临着南问题。