如何解决python中的普通PageRank编码的问题
问题是:我理解 PageRank 背后的数学原理和机制,但不明白为什么我的老师在算出各页面的等级之后还要使用"蒸发"(evap)。这很难用文字解释清楚,所以我直接给出完整代码,并用 --------- 把相关部分分隔出来,方便您一眼看到。简而言之,我的问题是:为什么要使用 evap 和 totdiff?请解释被分隔出来的那一部分代码。
从这里开始
# Simple in-memory PageRank over the link graph stored in spider.sqlite.
#
# Reads the Links and Pages tables, runs a user-chosen number of rank
# iterations, and writes the resulting ranks back to Pages.new_rank (the
# previous value is first copied to Pages.old_rank).
#
# NOTE(review): the pasted source had lost all indentation. The nesting
# below is reconstructed from the statements themselves; the placement of
# the debugging print() calls is the most plausible reading - confirm.
import sqlite3

conn = sqlite3.connect('spider.sqlite')
cur = conn.cursor()

# Find the ids that send out page rank - we are only interested in pages
# that have outgoing links. The list keeps the query order for iteration;
# the set makes the membership tests below O(1).
cur.execute('''SELECT DISTINCT from_id FROM Links''')
from_ids = [row[0] for row in cur]
from_id_set = set(from_ids)

# Map every sending page to the pages it links to. Self-links and links
# that touch a page outside from_ids are dropped, so every target recorded
# here is guaranteed to have an entry in the rank tables below.
outbound = dict()
cur.execute('''SELECT DISTINCT from_id,to_id FROM Links''')
for row in cur:
    from_id, to_id = row[0], row[1]
    if from_id == to_id:
        continue
    if from_id not in from_id_set or to_id not in from_id_set:
        continue
    outbound.setdefault(from_id, []).append(to_id)

# Get the latest page rank of every sending page
prev_ranks = dict()
for node in from_ids:
    cur.execute('''SELECT new_rank FROM Pages WHERE id = ?''', (node,))
    row = cur.fetchone()
    prev_ranks[node] = row[0]
    print('--')
    print(node)

# An empty answer means a single iteration
sval = input('How many iterations:')
many = int(sval) if sval else 1

# Sanity check
if not prev_ranks:
    print("Nothing to page rank. Check data.")
    cur.close()
    conn.close()
    # SystemExit is what quit() raises, without depending on the site module
    raise SystemExit

# Lets do Page Rank in memory so it is really fast
for i in range(many):
    # Start every page at zero and remember how much rank exists in total
    next_ranks = dict()
    total = 0.0
    for node, old_rank in prev_ranks.items():
        total = total + old_rank
        next_ranks[node] = 0.0

    # Split each page's old rank evenly over its outbound links
    for node, old_rank in prev_ranks.items():
        give_ids = outbound.get(node)
        if not give_ids:
            # No usable outbound link: this page's rank is not handed to
            # anyone in this pass (it is restored through evap below)
            continue
        print(old_rank)
        amount = old_rank / len(give_ids)
        print(node, old_rank, amount, give_ids)
        for target in give_ids:
            next_ranks[target] = next_ranks[target] + amount
    print(next_ranks)

    # -----------------------------------------------------------------------
    # Pages skipped above kept their rank to themselves, so the handed-out
    # rank (newtot) can be smaller than what existed before (total).
    # evap spreads that missing amount evenly over all pages, which keeps
    # the sum of all ranks the same from one iteration to the next.
    newtot = 0
    for node, next_rank in next_ranks.items():
        print(node, '---', next_rank)
        newtot = newtot + next_rank
    evap = (total - newtot) / len(next_ranks)
    for node in next_ranks:
        next_ranks[node] = next_ranks[node] + evap

    # Compute the per-page average change from old rank to new rank
    # as an indication of convergence of the algorithm: the smaller
    # avediff gets, the less the ranks still move between iterations
    totdiff = 0
    for node, old_rank in prev_ranks.items():
        diff = abs(old_rank - next_ranks[node])
        print('====', diff)
        totdiff = totdiff + diff
    avediff = totdiff / len(prev_ranks)
    print(i + 1, avediff)
    # -----------------------------------------------------------------------

    # rotate
    prev_ranks = next_ranks

# Put the final ranks back into the database. prev_ranks is the result of
# the last completed iteration, and is still defined when zero iterations
# were requested (the stored ranks are then written back unchanged).
print(list(prev_ranks.items())[:5])
cur.execute('''UPDATE Pages SET old_rank=new_rank''')
cur.executemany(
    '''UPDATE Pages SET new_rank=? WHERE id=?''',
    [(rank, page_id) for page_id, rank in prev_ranks.items()],
)
conn.commit()
cur.close()
conn.close()
\
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。