Fixed database typo and removed unnecessary class identifier.
This commit is contained in:
parent 00ad49a143
commit 45fb349a7d
5098 changed files with 952558 additions and 85 deletions
@@ -0,0 +1,2 @@
from networkx.algorithms.link_analysis.pagerank_alg import *
from networkx.algorithms.link_analysis.hits_alg import *
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,305 @@
"""Hubs and authorities analysis of graph structure."""
import networkx as nx

__all__ = ["hits", "hits_numpy", "hits_scipy", "authority_matrix", "hub_matrix"]


def hits(G, max_iter=100, tol=1.0e-8, nstart=None, normalized=True):
    """Returns HITS hubs and authorities values for nodes.

    The HITS algorithm computes two numbers for a node.
    Authorities estimate the node value based on the incoming links.
    Hubs estimate the node value based on outgoing links.

    Parameters
    ----------
    G : graph
      A NetworkX graph

    max_iter : integer, optional
      Maximum number of iterations in power method.

    tol : float, optional
      Error tolerance used to check convergence in power method iteration.

    nstart : dictionary, optional
      Starting value of each node for power method iteration.

    normalized : bool (default=True)
      Normalize results by the sum of all of the values.

    Returns
    -------
    (hubs, authorities) : two-tuple of dictionaries
      Two dictionaries keyed by node containing the hub and authority
      values.

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    Examples
    --------
    >>> G = nx.path_graph(4)
    >>> h, a = nx.hits(G)

    Notes
    -----
    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence.  The iteration will stop
    after max_iter iterations or an error tolerance of
    number_of_nodes(G) * tol has been reached.

    The HITS algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Jon Kleinberg,
       Authoritative sources in a hyperlinked environment
       Journal of the ACM 46 (5): 604-632, 1999.
       doi:10.1145/324133.324140.
       http://www.cs.cornell.edu/home/kleinber/auth.pdf.
    """
    if isinstance(G, (nx.MultiGraph, nx.MultiDiGraph)):
        raise Exception("hits() not defined for graphs with multiedges.")
    if len(G) == 0:
        return {}, {}
    # choose fixed starting vector if not given
    if nstart is None:
        h = dict.fromkeys(G, 1.0 / G.number_of_nodes())
    else:
        h = nstart
        # normalize starting vector
        s = 1.0 / sum(h.values())
        for k in h:
            h[k] *= s
    for _ in range(max_iter):  # power iteration: make up to max_iter iterations
        hlast = h
        h = dict.fromkeys(hlast.keys(), 0)
        a = dict.fromkeys(hlast.keys(), 0)
        # this "matrix multiply" looks odd because it is
        # doing a left multiply a^T = hlast^T * G
        for n in h:
            for nbr in G[n]:
                a[nbr] += hlast[n] * G[n][nbr].get("weight", 1)
        # now multiply h = G * a
        for n in h:
            for nbr in G[n]:
                h[n] += a[nbr] * G[n][nbr].get("weight", 1)
        # normalize vector
        s = 1.0 / max(h.values())
        for n in h:
            h[n] *= s
        # normalize vector
        s = 1.0 / max(a.values())
        for n in a:
            a[n] *= s
        # check convergence, l1 norm
        err = sum(abs(h[n] - hlast[n]) for n in h)
        if err < tol:
            break
    else:
        raise nx.PowerIterationFailedConvergence(max_iter)
    if normalized:
        s = 1.0 / sum(a.values())
        for n in a:
            a[n] *= s
        s = 1.0 / sum(h.values())
        for n in h:
            h[n] *= s
    return h, a


def authority_matrix(G, nodelist=None):
    """Returns the HITS authority matrix."""
    M = nx.to_numpy_array(G, nodelist=nodelist)
    return M.T @ M


def hub_matrix(G, nodelist=None):
    """Returns the HITS hub matrix."""
    M = nx.to_numpy_array(G, nodelist=nodelist)
    return M @ M.T


def hits_numpy(G, normalized=True):
    """Returns HITS hubs and authorities values for nodes.

    The HITS algorithm computes two numbers for a node.
    Authorities estimate the node value based on the incoming links.
    Hubs estimate the node value based on outgoing links.

    Parameters
    ----------
    G : graph
      A NetworkX graph

    normalized : bool (default=True)
      Normalize results by the sum of all of the values.

    Returns
    -------
    (hubs, authorities) : two-tuple of dictionaries
      Two dictionaries keyed by node containing the hub and authority
      values.

    Examples
    --------
    >>> G = nx.path_graph(4)
    >>> h, a = nx.hits_numpy(G)

    Notes
    -----
    The eigenvector calculation uses NumPy's interface to LAPACK.

    The HITS algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Jon Kleinberg,
       Authoritative sources in a hyperlinked environment
       Journal of the ACM 46 (5): 604-632, 1999.
       doi:10.1145/324133.324140.
       http://www.cs.cornell.edu/home/kleinber/auth.pdf.
    """
    try:
        import numpy as np
    except ImportError as e:
        raise ImportError("hits_numpy() requires NumPy: http://numpy.org/") from e
    if len(G) == 0:
        return {}, {}
    H = nx.hub_matrix(G, list(G))
    e, ev = np.linalg.eig(H)
    m = e.argsort()[-1]  # index of maximum eigenvalue
    h = np.array(ev[:, m]).flatten()
    A = nx.authority_matrix(G, list(G))
    e, ev = np.linalg.eig(A)
    m = e.argsort()[-1]  # index of maximum eigenvalue
    a = np.array(ev[:, m]).flatten()
    if normalized:
        h = h / h.sum()
        a = a / a.sum()
    else:
        h = h / h.max()
        a = a / a.max()
    hubs = dict(zip(G, map(float, h)))
    authorities = dict(zip(G, map(float, a)))
    return hubs, authorities


def hits_scipy(G, max_iter=100, tol=1.0e-6, normalized=True):
    """Returns HITS hubs and authorities values for nodes.

    The HITS algorithm computes two numbers for a node.
    Authorities estimate the node value based on the incoming links.
    Hubs estimate the node value based on outgoing links.

    Parameters
    ----------
    G : graph
      A NetworkX graph

    max_iter : integer, optional
      Maximum number of iterations in power method.

    tol : float, optional
      Error tolerance used to check convergence in power method iteration.

    normalized : bool (default=True)
      Normalize results by the sum of all of the values.

    Returns
    -------
    (hubs, authorities) : two-tuple of dictionaries
      Two dictionaries keyed by node containing the hub and authority
      values.

    Examples
    --------
    >>> G = nx.path_graph(4)
    >>> h, a = nx.hits_scipy(G)

    Notes
    -----
    This implementation uses SciPy sparse matrices.

    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence.  The iteration will stop
    after max_iter iterations or an error tolerance of
    number_of_nodes(G) * tol has been reached.

    The HITS algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs.

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Jon Kleinberg,
       Authoritative sources in a hyperlinked environment
       Journal of the ACM 46 (5): 604-632, 1999.
       doi:10.1145/324133.324140.
       http://www.cs.cornell.edu/home/kleinber/auth.pdf.
    """
    try:
        import numpy as np
    except ImportError as e:
        raise ImportError(
            "hits_scipy() requires SciPy and NumPy: "
            "http://scipy.org/ http://numpy.org/"
        ) from e
    if len(G) == 0:
        return {}, {}
    M = nx.to_scipy_sparse_matrix(G, nodelist=list(G))
    (n, m) = M.shape  # should be square
    A = M.T * M  # authority matrix
    x = np.ones((n, 1)) / n  # initial guess
    # power iteration on authority matrix
    i = 0
    while True:
        xlast = x
        x = A * x
        x = x / x.max()
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < tol:
            break
        if i > max_iter:
            raise nx.PowerIterationFailedConvergence(max_iter)
        i += 1

    a = np.asarray(x).flatten()
    # h = M * a
    h = np.asarray(M * a).flatten()
    if normalized:
        h = h / h.sum()
        a = a / a.sum()
    hubs = dict(zip(G, map(float, h)))
    authorities = dict(zip(G, map(float, a)))
    return hubs, authorities
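For orientation, here is a minimal sketch (not part of the commit) of how the three HITS variants above could be exercised side by side. It assumes NetworkX is importable as nx with NumPy and SciPy available; the graph is the same small directed example used in the tests further down:

import networkx as nx

# Small directed graph (the Langville/Meyer example from the tests).
G = nx.DiGraph([(1, 3), (1, 5), (2, 1), (3, 5), (5, 4), (5, 3), (6, 5)])

h1, a1 = nx.hits(G, tol=1.0e-8)        # pure-Python power iteration
h2, a2 = nx.hits_numpy(G)              # dense LAPACK eigensolver
h3, a3 = nx.hits_scipy(G, tol=1.0e-8)  # sparse power iteration

# All three should agree to roughly the iteration tolerance.
for n in G:
    assert abs(h1[n] - h2[n]) < 1e-4 and abs(h1[n] - h3[n]) < 1e-4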
@@ -0,0 +1,475 @@
"""PageRank analysis of graph structure."""
import networkx as nx
from networkx.utils import not_implemented_for

__all__ = ["pagerank", "pagerank_numpy", "pagerank_scipy", "google_matrix"]


@not_implemented_for("multigraph")
def pagerank(
    G,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1.0e-6,
    nstart=None,
    weight="weight",
    dangling=None,
):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
      The "personalization vector" consisting of a dictionary with a key
      for some subset of graph nodes and a personalization value for each
      of those nodes.  At least one personalization value must be non-zero.
      Nodes not in the dictionary get a personalization value of zero.
      By default, a uniform distribution is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.

    nstart : dictionary, optional
      Starting value of PageRank iteration for each node.

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes
      without any outedges.  The dict key is the node the outedge points to
      and the dict value is the weight of that outedge.  By default,
      dangling nodes are given outedges according to the personalization
      vector (uniform if not specified).  This must be selected to result
      in an irreducible transition matrix (see notes under google_matrix).
      It may be common for the dangling dict to be the same as the
      personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation is done by the power iteration method
    and has no guarantee of convergence.  The iteration will stop after
    an error tolerance of ``len(G) * tol`` has been reached. If the
    number of iterations exceeds `max_iter`, a
    :exc:`networkx.exception.PowerIterationFailedConvergence` exception
    is raised.

    The PageRank algorithm was designed for directed graphs but this
    algorithm does not check if the input graph is directed and will
    execute on undirected graphs by converting each edge in the
    directed graph to two edges.

    See Also
    --------
    pagerank_numpy, pagerank_scipy, google_matrix

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf

    """
    if len(G) == 0:
        return {}

    if not G.is_directed():
        D = G.to_directed()
    else:
        D = G

    # Create a copy in (right) stochastic form
    W = nx.stochastic_graph(D, weight=weight)
    N = W.number_of_nodes()

    # Choose fixed starting vector if not given
    if nstart is None:
        x = dict.fromkeys(W, 1.0 / N)
    else:
        # Normalized nstart vector
        s = float(sum(nstart.values()))
        x = {k: v / s for k, v in nstart.items()}

    if personalization is None:
        # Assign uniform personalization vector if not given
        p = dict.fromkeys(W, 1.0 / N)
    else:
        s = float(sum(personalization.values()))
        p = {k: v / s for k, v in personalization.items()}

    if dangling is None:
        # Use personalization vector if dangling vector not specified
        dangling_weights = p
    else:
        s = float(sum(dangling.values()))
        dangling_weights = {k: v / s for k, v in dangling.items()}
    dangling_nodes = [n for n in W if W.out_degree(n, weight=weight) == 0.0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = dict.fromkeys(xlast.keys(), 0)
        danglesum = alpha * sum(xlast[n] for n in dangling_nodes)
        for n in x:
            # this matrix multiply looks odd because it is
            # doing a left multiply x^T = xlast^T * W
            for nbr in W[n]:
                x[nbr] += alpha * xlast[n] * W[n][nbr][weight]
            x[n] += danglesum * dangling_weights.get(n, 0) + (1.0 - alpha) * p.get(n, 0)
        # check convergence, l1 norm
        err = sum(abs(x[n] - xlast[n]) for n in x)
        if err < N * tol:
            return x
    raise nx.PowerIterationFailedConvergence(max_iter)


def google_matrix(
    G, alpha=0.85, personalization=None, nodelist=None, weight="weight", dangling=None
):
    """Returns the Google matrix of the graph.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float
      The damping factor.

    personalization : dict, optional
      The "personalization vector" consisting of a dictionary with a key
      for some subset of graph nodes and a personalization value for each
      of those nodes.  At least one personalization value must be non-zero.
      Nodes not in the dictionary get a personalization value of zero.
      By default, a uniform distribution is used.

    nodelist : list, optional
      The rows and columns are ordered according to the nodes in nodelist.
      If nodelist is None, then the ordering is produced by G.nodes().

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes
      without any outedges.  The dict key is the node the outedge points to
      and the dict value is the weight of that outedge.  By default,
      dangling nodes are given outedges according to the personalization
      vector (uniform if not specified).  This must be selected to result
      in an irreducible transition matrix (see notes below).  It may be
      common for the dangling dict to be the same as the personalization
      dict.

    Returns
    -------
    A : NumPy matrix
       Google matrix of the graph

    Notes
    -----
    The matrix returned represents the transition matrix that describes the
    Markov chain used in PageRank. For PageRank to converge to a unique
    solution (i.e., a unique stationary distribution in a Markov chain), the
    transition matrix must be irreducible. In other words, it must be that
    there exists a path between every pair of nodes in the graph, or else
    there is the potential of "rank sinks."

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_numpy, pagerank_scipy
    """
    import numpy as np

    if nodelist is None:
        nodelist = list(G)

    M = nx.to_numpy_matrix(G, nodelist=nodelist, weight=weight)
    N = len(G)
    if N == 0:
        return M

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        p /= p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
        dangling_weights /= dangling_weights.sum()
    dangling_nodes = np.where(M.sum(axis=1) == 0)[0]

    # Assign dangling_weights to any dangling nodes (nodes with no out links)
    for node in dangling_nodes:
        M[node] = dangling_weights

    M /= M.sum(axis=1)  # Normalize rows to sum to 1

    return alpha * M + (1 - alpha) * p


def pagerank_numpy(G, alpha=0.85, personalization=None, weight="weight", dangling=None):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
      The "personalization vector" consisting of a dictionary with a key
      for some subset of graph nodes and a personalization value for each
      of those nodes.  At least one personalization value must be non-zero.
      Nodes not in the dictionary get a personalization value of zero.
      By default, a uniform distribution is used.

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes
      without any outedges.  The dict key is the node the outedge points to
      and the dict value is the weight of that outedge.  By default,
      dangling nodes are given outedges according to the personalization
      vector (uniform if not specified).  This must be selected to result
      in an irreducible transition matrix (see notes under google_matrix).
      It may be common for the dangling dict to be the same as the
      personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank_numpy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses NumPy's interface to the LAPACK
    eigenvalue solvers.  This will be the fastest and most accurate
    for small graphs.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_scipy, google_matrix

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    import numpy as np

    if len(G) == 0:
        return {}
    M = google_matrix(
        G, alpha, personalization=personalization, weight=weight, dangling=dangling
    )
    # use numpy LAPACK solver
    eigenvalues, eigenvectors = np.linalg.eig(M.T)
    ind = np.argmax(eigenvalues)
    # eigenvector of largest eigenvalue is at ind, normalized
    largest = np.array(eigenvectors[:, ind]).flatten().real
    norm = float(largest.sum())
    return dict(zip(G, map(float, largest / norm)))


def pagerank_scipy(
    G,
    alpha=0.85,
    personalization=None,
    max_iter=100,
    tol=1.0e-6,
    nstart=None,
    weight="weight",
    dangling=None,
):
    """Returns the PageRank of the nodes in the graph.

    PageRank computes a ranking of the nodes in the graph G based on
    the structure of the incoming links. It was originally designed as
    an algorithm to rank web pages.

    Parameters
    ----------
    G : graph
      A NetworkX graph.  Undirected graphs will be converted to a directed
      graph with two directed edges for each undirected edge.

    alpha : float, optional
      Damping parameter for PageRank, default=0.85.

    personalization : dict, optional
      The "personalization vector" consisting of a dictionary with a key
      for some subset of graph nodes and a personalization value for each
      of those nodes.  At least one personalization value must be non-zero.
      Nodes not in the dictionary get a personalization value of zero.
      By default, a uniform distribution is used.

    max_iter : integer, optional
      Maximum number of iterations in power method eigenvalue solver.

    tol : float, optional
      Error tolerance used to check convergence in power method solver.

    nstart : dictionary, optional
      Starting value of PageRank iteration for each node.

    weight : key, optional
      Edge data key to use as weight.  If None weights are set to 1.

    dangling : dict, optional
      The outedges to be assigned to any "dangling" nodes, i.e., nodes
      without any outedges.  The dict key is the node the outedge points to
      and the dict value is the weight of that outedge.  By default,
      dangling nodes are given outedges according to the personalization
      vector (uniform if not specified).  This must be selected to result
      in an irreducible transition matrix (see notes under google_matrix).
      It may be common for the dangling dict to be the same as the
      personalization dict.

    Returns
    -------
    pagerank : dictionary
       Dictionary of nodes with PageRank as value.

    Examples
    --------
    >>> G = nx.DiGraph(nx.path_graph(4))
    >>> pr = nx.pagerank_scipy(G, alpha=0.9)

    Notes
    -----
    The eigenvector calculation uses power iteration with a SciPy
    sparse matrix representation.

    This implementation works with Multi(Di)Graphs. For multigraphs the
    weight between two nodes is set to be the sum of all edge weights
    between those nodes.

    See Also
    --------
    pagerank, pagerank_numpy, google_matrix

    Raises
    ------
    PowerIterationFailedConvergence
        If the algorithm fails to converge to the specified tolerance
        within the specified number of iterations of the power iteration
        method.

    References
    ----------
    .. [1] A. Langville and C. Meyer,
       "A survey of eigenvector methods of web information retrieval."
       http://citeseer.ist.psu.edu/713792.html
    .. [2] Page, Lawrence; Brin, Sergey; Motwani, Rajeev and Winograd, Terry,
       The PageRank citation ranking: Bringing order to the Web. 1999
       http://dbpubs.stanford.edu:8090/pub/showDoc.Fulltext?lang=en&doc=1999-66&format=pdf
    """
    import numpy as np
    import scipy.sparse

    N = len(G)
    if N == 0:
        return {}

    nodelist = list(G)
    M = nx.to_scipy_sparse_matrix(G, nodelist=nodelist, weight=weight, dtype=float)
    S = np.array(M.sum(axis=1)).flatten()
    S[S != 0] = 1.0 / S[S != 0]
    Q = scipy.sparse.spdiags(S.T, 0, *M.shape, format="csr")
    M = Q * M

    # initial vector
    if nstart is None:
        x = np.repeat(1.0 / N, N)
    else:
        x = np.array([nstart.get(n, 0) for n in nodelist], dtype=float)
        x = x / x.sum()

    # Personalization vector
    if personalization is None:
        p = np.repeat(1.0 / N, N)
    else:
        p = np.array([personalization.get(n, 0) for n in nodelist], dtype=float)
        p = p / p.sum()

    # Dangling nodes
    if dangling is None:
        dangling_weights = p
    else:
        # Convert the dangling dictionary into an array in nodelist order
        dangling_weights = np.array([dangling.get(n, 0) for n in nodelist], dtype=float)
        dangling_weights /= dangling_weights.sum()
    is_dangling = np.where(S == 0)[0]

    # power iteration: make up to max_iter iterations
    for _ in range(max_iter):
        xlast = x
        x = alpha * (x * M + sum(x[is_dangling]) * dangling_weights) + (1 - alpha) * p
        # check convergence, l1 norm
        err = np.absolute(x - xlast).sum()
        if err < N * tol:
            return dict(zip(nodelist, map(float, x)))
    raise nx.PowerIterationFailedConvergence(max_iter)
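As a companion sketch (not part of the commit), pagerank and google_matrix can be related directly, assuming NumPy is available: the PageRank vector is the stationary distribution of the Google matrix, so the power-iteration result should match the leading left eigenvector of that matrix:

import networkx as nx
import numpy as np

G = nx.DiGraph(nx.path_graph(4))
pr = nx.pagerank(G, alpha=0.9, tol=1.0e-8)

# The leading left eigenvector of the Google matrix gives the same values.
M = nx.google_matrix(G, alpha=0.9, nodelist=sorted(G))
evals, evecs = np.linalg.eig(M.T)
v = np.array(evecs[:, evals.argmax()]).flatten().real
v = v / v.sum()  # normalize to a probability distribution

for node, x in zip(sorted(G), v):
    assert abs(pr[node] - x) < 1e-6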
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,76 @@
import pytest

import networkx
from networkx.testing import almost_equal

# Example from
# A. Langville and C. Meyer, "A survey of eigenvector methods of web
# information retrieval."  http://citeseer.ist.psu.edu/713792.html


class TestHITS:
    @classmethod
    def setup_class(cls):
        G = networkx.DiGraph()
        edges = [(1, 3), (1, 5), (2, 1), (3, 5), (5, 4), (5, 3), (6, 5)]
        G.add_edges_from(edges, weight=1)
        cls.G = G
        cls.G.a = dict(
            zip(sorted(G), [0.000000, 0.000000, 0.366025, 0.133975, 0.500000, 0.000000])
        )
        cls.G.h = dict(
            zip(sorted(G), [0.366025, 0.000000, 0.211325, 0.000000, 0.211325, 0.211325])
        )

    def test_hits(self):
        G = self.G
        h, a = networkx.hits(G, tol=1.0e-08)
        for n in G:
            assert almost_equal(h[n], G.h[n], places=4)
        for n in G:
            assert almost_equal(a[n], G.a[n], places=4)

    def test_hits_nstart(self):
        G = self.G
        nstart = {i: 1.0 / 2 for i in G}
        h, a = networkx.hits(G, nstart=nstart)

    def test_hits_numpy(self):
        numpy = pytest.importorskip("numpy")
        G = self.G
        h, a = networkx.hits_numpy(G)
        for n in G:
            assert almost_equal(h[n], G.h[n], places=4)
        for n in G:
            assert almost_equal(a[n], G.a[n], places=4)

    def test_hits_scipy(self):
        sp = pytest.importorskip("scipy")
        G = self.G
        h, a = networkx.hits_scipy(G, tol=1.0e-08)
        for n in G:
            assert almost_equal(h[n], G.h[n], places=4)
        for n in G:
            assert almost_equal(a[n], G.a[n], places=4)

    def test_empty(self):
        numpy = pytest.importorskip("numpy")
        G = networkx.Graph()
        assert networkx.hits(G) == ({}, {})
        assert networkx.hits_numpy(G) == ({}, {})
        assert networkx.authority_matrix(G).shape == (0, 0)
        assert networkx.hub_matrix(G).shape == (0, 0)

    def test_empty_scipy(self):
        scipy = pytest.importorskip("scipy")
        G = networkx.Graph()
        assert networkx.hits_scipy(G) == ({}, {})

    def test_hits_not_convergent(self):
        with pytest.raises(networkx.PowerIterationFailedConvergence):
            G = self.G
            networkx.hits(G, max_iter=0)
@@ -0,0 +1,197 @@
import random

import networkx
import pytest

numpy = pytest.importorskip("numpy")
scipy = pytest.importorskip("scipy")

from networkx.testing import almost_equal

# Example from
# A. Langville and C. Meyer, "A survey of eigenvector methods of web
# information retrieval."  http://citeseer.ist.psu.edu/713792.html


class TestPageRank:
    @classmethod
    def setup_class(cls):
        G = networkx.DiGraph()
        edges = [
            (1, 2),
            (1, 3),
            # 2 is a dangling node
            (3, 1),
            (3, 2),
            (3, 5),
            (4, 5),
            (4, 6),
            (5, 4),
            (5, 6),
            (6, 4),
        ]
        G.add_edges_from(edges)
        cls.G = G
        cls.G.pagerank = dict(
            zip(
                sorted(G),
                [
                    0.03721197,
                    0.05395735,
                    0.04150565,
                    0.37508082,
                    0.20599833,
                    0.28624589,
                ],
            )
        )
        cls.dangling_node_index = 1
        cls.dangling_edges = {1: 2, 2: 3, 3: 0, 4: 0, 5: 0, 6: 0}
        cls.G.dangling_pagerank = dict(
            zip(
                sorted(G),
                [0.10844518, 0.18618601, 0.0710892, 0.2683668, 0.15919783, 0.20671497],
            )
        )

    def test_pagerank(self):
        G = self.G
        p = networkx.pagerank(G, alpha=0.9, tol=1.0e-08)
        for n in G:
            assert almost_equal(p[n], G.pagerank[n], places=4)

        nstart = {n: random.random() for n in G}
        p = networkx.pagerank(G, alpha=0.9, tol=1.0e-08, nstart=nstart)
        for n in G:
            assert almost_equal(p[n], G.pagerank[n], places=4)

    def test_pagerank_max_iter(self):
        with pytest.raises(networkx.PowerIterationFailedConvergence):
            networkx.pagerank(self.G, max_iter=0)

    def test_numpy_pagerank(self):
        G = self.G
        p = networkx.pagerank_numpy(G, alpha=0.9)
        for n in G:
            assert almost_equal(p[n], G.pagerank[n], places=4)
        personalize = {n: random.random() for n in G}
        p = networkx.pagerank_numpy(G, alpha=0.9, personalization=personalize)

    def test_google_matrix(self):
        G = self.G
        M = networkx.google_matrix(G, alpha=0.9, nodelist=sorted(G))
        e, ev = numpy.linalg.eig(M.T)
        p = numpy.array(ev[:, 0] / ev[:, 0].sum())[:, 0]
        for (a, b) in zip(p, self.G.pagerank.values()):
            assert almost_equal(a, b)

    def test_personalization(self):
        G = networkx.complete_graph(4)
        personalize = {0: 1, 1: 1, 2: 4, 3: 4}
        answer = {
            0: 0.23246732615667579,
            1: 0.23246732615667579,
            2: 0.267532673843324,
            3: 0.2675326738433241,
        }
        p = networkx.pagerank(G, alpha=0.85, personalization=personalize)
        for n in G:
            assert almost_equal(p[n], answer[n], places=4)

    def test_zero_personalization_vector(self):
        G = networkx.complete_graph(4)
        personalize = {0: 0, 1: 0, 2: 0, 3: 0}
        pytest.raises(
            ZeroDivisionError, networkx.pagerank, G, personalization=personalize
        )

    def test_one_nonzero_personalization_value(self):
        G = networkx.complete_graph(4)
        personalize = {0: 0, 1: 0, 2: 0, 3: 1}
        answer = {
            0: 0.22077931820379187,
            1: 0.22077931820379187,
            2: 0.22077931820379187,
            3: 0.3376620453886241,
        }
        p = networkx.pagerank(G, alpha=0.85, personalization=personalize)
        for n in G:
            assert almost_equal(p[n], answer[n], places=4)

    def test_incomplete_personalization(self):
        G = networkx.complete_graph(4)
        personalize = {3: 1}
        answer = {
            0: 0.22077931820379187,
            1: 0.22077931820379187,
            2: 0.22077931820379187,
            3: 0.3376620453886241,
        }
        p = networkx.pagerank(G, alpha=0.85, personalization=personalize)
        for n in G:
            assert almost_equal(p[n], answer[n], places=4)

    def test_dangling_matrix(self):
        """
        Tests that the google_matrix doesn't change except for the dangling
        nodes.
        """
        G = self.G
        dangling = self.dangling_edges
        dangling_sum = float(sum(dangling.values()))
        M1 = networkx.google_matrix(G, personalization=dangling)
        M2 = networkx.google_matrix(G, personalization=dangling, dangling=dangling)
        for i in range(len(G)):
            for j in range(len(G)):
                if i == self.dangling_node_index and (j + 1) in dangling:
                    assert almost_equal(
                        M2[i, j], dangling[j + 1] / dangling_sum, places=4
                    )
                else:
                    assert almost_equal(M2[i, j], M1[i, j], places=4)

    def test_dangling_pagerank(self):
        pr = networkx.pagerank(self.G, dangling=self.dangling_edges)
        for n in self.G:
            assert almost_equal(pr[n], self.G.dangling_pagerank[n], places=4)

    def test_dangling_numpy_pagerank(self):
        pr = networkx.pagerank_numpy(self.G, dangling=self.dangling_edges)
        for n in self.G:
            assert almost_equal(pr[n], self.G.dangling_pagerank[n], places=4)

    def test_empty(self):
        G = networkx.Graph()
        assert networkx.pagerank(G) == {}
        assert networkx.pagerank_numpy(G) == {}
        assert networkx.google_matrix(G).shape == (0, 0)


class TestPageRankScipy(TestPageRank):
    def test_scipy_pagerank(self):
        G = self.G
        p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08)
        for n in G:
            assert almost_equal(p[n], G.pagerank[n], places=4)
        personalize = {n: random.random() for n in G}
        p = networkx.pagerank_scipy(
            G, alpha=0.9, tol=1.0e-08, personalization=personalize
        )

        nstart = {n: random.random() for n in G}
        p = networkx.pagerank_scipy(G, alpha=0.9, tol=1.0e-08, nstart=nstart)
        for n in G:
            assert almost_equal(p[n], G.pagerank[n], places=4)

    def test_scipy_pagerank_max_iter(self):
        with pytest.raises(networkx.PowerIterationFailedConvergence):
            networkx.pagerank_scipy(self.G, max_iter=0)

    def test_dangling_scipy_pagerank(self):
        pr = networkx.pagerank_scipy(self.G, dangling=self.dangling_edges)
        for n in self.G:
            assert almost_equal(pr[n], self.G.dangling_pagerank[n], places=4)

    def test_empty_scipy(self):
        G = networkx.Graph()
        assert networkx.pagerank_scipy(G) == {}