320 lines
9.5 KiB
Python
320 lines
9.5 KiB
Python
"""Hubs and authorities analysis of graph structure.
|
|
"""
|
|
# Copyright (C) 2008-2012 by
|
|
# Aric Hagberg <hagberg@lanl.gov>
|
|
# Dan Schult <dschult@colgate.edu>
|
|
# Pieter Swart <swart@lanl.gov>
|
|
# All rights reserved.
|
|
# BSD license.
|
|
# NetworkX:http://networkx.github.io/
|
|
import networkx as nx
|
|
__author__ = """Aric Hagberg (hagberg@lanl.gov)"""
|
|
__all__ = ['hits', 'hits_numpy', 'hits_scipy', 'authority_matrix', 'hub_matrix']
|
|
|
|
|
|
def hits(G, max_iter=100, tol=1.0e-8, nstart=None, normalized=True):
    """Returns HITS hubs and authorities values for nodes.

    The HITS algorithm computes two numbers for a node.
    Authorities estimates the node value based on the incoming links.
    Hubs estimates the node value based on outgoing links.

    Parameters
    ----------
    G : graph
      A NetworkX graph

    max_iter : integer, optional
      Maximum number of iterations in power method.

    tol : float, optional
      Error tolerance used to check convergence in power method iteration.

    nstart : dictionary, optional
      Starting value of each node for power method iteration.
      The dictionary is copied before it is rescaled, so the caller's
      object is never modified.

    normalized : bool (default=True)
      Normalize results by the sum of all of the values.

    Returns
    -------
    (hubs,authorities) : two-tuple of dictionaries
       Two dictionaries keyed by node containing the hub and authority
       values.

    Raises
    ------
    Exception
        If G reports itself as a multigraph.

    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    Examples
    --------
    >>> G=nx.path_graph(4)
    >>> h,a=nx.hits(G)

    Notes
    -----
    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence.  The iteration stops as soon as
    the l1 distance between two successive hub vectors drops below tol,
    and fails after max_iter iterations.

    A graph without any edges drives every score to zero, so the
    per-iteration rescaling (division by the largest score) raises
    ZeroDivisionError.

    The HITS algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Jon Kleinberg,
       Authoritative sources in a hyperlinked environment
       Journal of the ACM 46 (5): 604-32, 1999.
       doi:10.1145/324133.324140.
       http://www.cs.cornell.edu/home/kleinber/auth.pdf.
    """
    # Ask the graph itself instead of comparing exact types, so that
    # subclasses of the multigraph classes are rejected as well.
    if G.is_multigraph():
        raise Exception("hits() not defined for graphs with multiedges.")
    if len(G) == 0:
        return {}, {}
    # choose fixed starting vector if not given
    if nstart is None:
        h = dict.fromkeys(G, 1.0 / G.number_of_nodes())
    else:
        # work on a copy: the rescaling below must not leak to the caller
        h = dict(nstart)
        # normalize starting vector
        s = 1.0 / sum(h.values())
        for k in h:
            h[k] *= s
    for _ in range(max_iter):  # power iteration: make up to max_iter iterations
        hlast = h
        h = dict.fromkeys(hlast, 0)
        a = dict.fromkeys(hlast, 0)
        # this "matrix multiply" looks odd because it is
        # doing a left multiply a^T=hlast^T*G
        for n in h:
            for nbr in G[n]:
                a[nbr] += hlast[n] * G[n][nbr].get('weight', 1)
        # now multiply h=Ga
        for n in h:
            for nbr in G[n]:
                h[n] += a[nbr] * G[n][nbr].get('weight', 1)
        # rescale hubs so the largest value is 1
        s = 1.0 / max(h.values())
        for n in h:
            h[n] *= s
        # rescale authorities so the largest value is 1
        s = 1.0 / max(a.values())
        for n in a:
            a[n] *= s
        # check convergence, l1 norm
        err = sum(abs(h[n] - hlast[n]) for n in h)
        if err < tol:
            break
    else:
        # the loop ran out of iterations without hitting the tolerance
        raise nx.PowerIterationFailedConvergence(max_iter)
    if normalized:
        s = 1.0 / sum(a.values())
        for n in a:
            a[n] *= s
        s = 1.0 / sum(h.values())
        for n in h:
            h[n] *= s
    return h, a
|
|
|
|
|
|
def authority_matrix(G, nodelist=None):
    """Returns the HITS authority matrix.

    The result is the product of the transposed adjacency matrix of G
    with the adjacency matrix itself, where the adjacency matrix is
    whatever ``nx.to_numpy_matrix`` produces for the given nodelist.
    """
    adjacency = nx.to_numpy_matrix(G, nodelist=nodelist)
    transposed = adjacency.T
    return transposed * adjacency
|
|
|
|
|
|
def hub_matrix(G, nodelist=None):
    """Returns the HITS hub matrix.

    The result is the product of the adjacency matrix of G with its own
    transpose, where the adjacency matrix is whatever
    ``nx.to_numpy_matrix`` produces for the given nodelist.
    """
    adjacency = nx.to_numpy_matrix(G, nodelist=nodelist)
    transposed = adjacency.T
    return adjacency * transposed
|
|
|
|
|
|
def hits_numpy(G, normalized=True):
    """Returns HITS hubs and authorities values for nodes.

    The HITS algorithm computes two numbers for a node.
    Authorities estimates the node value based on the incoming links.
    Hubs estimates the node value based on outgoing links.

    Parameters
    ----------
    G : graph
      A NetworkX graph

    normalized : bool (default=True)
      Normalize results by the sum of all of the values.  When False,
      each vector is instead divided by its largest entry.

    Returns
    -------
    (hubs,authorities) : two-tuple of dictionaries
       Two dictionaries keyed by node containing the hub and authority
       values.

    Raises
    ------
    ImportError
        If NumPy cannot be imported.

    Examples
    --------
    >>> G=nx.path_graph(4)
    >>> h,a=nx.hits_numpy(G)

    Notes
    -----
    The eigenvector calculation uses NumPy's interface to LAPACK.

    An eigenvector is only defined up to its sign.  Each principal
    eigenvector is therefore oriented so that its entries sum to a
    non-negative number before it is scaled; without this step the
    ``normalized=False`` scaling could divide by a non-positive maximum.

    The HITS algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Jon Kleinberg,
       Authoritative sources in a hyperlinked environment
       Journal of the ACM 46 (5): 604-32, 1999.
       doi:10.1145/324133.324140.
       http://www.cs.cornell.edu/home/kleinber/auth.pdf.
    """
    try:
        import numpy as np
    except ImportError:
        raise ImportError(
            "hits_numpy() requires NumPy: http://scipy.org/")
    if len(G) == 0:
        return {}, {}
    # one node ordering shared by both matrices and by the result dicts
    nodelist = list(G)

    def principal_eigenvector(matrix):
        # eigenvector belonging to the largest eigenvalue of `matrix`
        eigenvalues, eigenvectors = np.linalg.eig(matrix)
        largest = eigenvalues.argsort()[-1]  # index of maximum eigenvalue
        vector = np.array(eigenvectors[:, largest]).flatten()
        # the solver may hand back v or -v; pick the non-negative orientation
        if vector.sum() < 0:
            vector = -vector
        return vector

    h = principal_eigenvector(nx.hub_matrix(G, nodelist))
    a = principal_eigenvector(nx.authority_matrix(G, nodelist))
    if normalized:
        h = h / h.sum()
        a = a / a.sum()
    else:
        h = h / h.max()
        a = a / a.max()
    hubs = dict(zip(nodelist, map(float, h)))
    authorities = dict(zip(nodelist, map(float, a)))
    return hubs, authorities
|
|
|
|
|
|
def hits_scipy(G, max_iter=100, tol=1.0e-6, normalized=True):
    """Returns HITS hubs and authorities values for nodes.

    The HITS algorithm computes two numbers for a node.
    Authorities estimates the node value based on the incoming links.
    Hubs estimates the node value based on outgoing links.

    Parameters
    ----------
    G : graph
      A NetworkX graph

    max_iter : integer, optional
      Maximum number of iterations in power method.

    tol : float, optional
      Error tolerance used to check convergence in power method iteration.

    normalized : bool (default=True)
      Normalize results by the sum of all of the values.

    Returns
    -------
    (hubs,authorities) : two-tuple of dictionaries
       Two dictionaries keyed by node containing the hub and authority
       values.

    Examples
    --------
    >>> G=nx.path_graph(4)
    >>> h,a=nx.hits_scipy(G)

    Notes
    -----
    This implementation uses SciPy sparse matrices.

    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence.  The iteration stops as soon as
    the l1 distance between two successive authority vectors drops below
    tol, and fails after max_iter iterations.

    The HITS algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs.

    Raises
    ------
    ImportError
        If SciPy or NumPy cannot be imported.

    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Jon Kleinberg,
       Authoritative sources in a hyperlinked environment
       Journal of the ACM 46 (5): 604-632, 1999.
       doi:10.1145/324133.324140.
       http://www.cs.cornell.edu/home/kleinber/auth.pdf.
    """
    try:
        # imported only so a missing SciPy fails here with a clear message
        import scipy.sparse
        import numpy as np
    except ImportError:
        raise ImportError(
            "hits_scipy() requires SciPy: http://scipy.org/")
    if len(G) == 0:
        return {}, {}
    M = nx.to_scipy_sparse_matrix(G, nodelist=list(G))
    n = M.shape[0]  # M should be square
    A = M.T * M  # authority matrix
    # NumPy is used for the dense helpers; the scipy.ones / scipy.absolute
    # aliases are not available in current SciPy releases.
    x = np.ones((n, 1)) / n  # initial guess
    # power iteration on authority matrix, at most max_iter steps
    for _ in range(max_iter):
        xlast = x
        x = A * x
        x = x / x.max()
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < tol:
            break
    else:
        # the loop ran out of iterations without hitting the tolerance
        raise nx.PowerIterationFailedConvergence(max_iter)

    a = np.asarray(x).flatten()
    # h=M*a
    h = np.asarray(M * a).flatten()
    if normalized:
        h = h / h.sum()
        a = a / a.sum()
    hubs = dict(zip(G, map(float, h)))
    authorities = dict(zip(G, map(float, a)))
    return hubs, authorities
|
|
|
|
|
|
# fixture for pytest
|
|
def setup_module(module):
    """Module-level pytest hook: require NumPy and SciPy.

    Calls ``pytest.importorskip`` for each package in turn, NumPy first.
    The ``module`` argument is accepted but not used.
    """
    from pytest import importorskip

    for required in ('numpy', 'scipy'):
        importorskip(required)
|