282 lines
10 KiB
Python
282 lines
10 KiB
Python
|
# modularity_max.py - functions for finding communities based on modularity
|
||
|
#
|
||
|
# Copyright 2018 Edward L. Platt
|
||
|
#
|
||
|
# This file is part of NetworkX
|
||
|
#
|
||
|
# NetworkX is distributed under a BSD license; see LICENSE.txt for more
|
||
|
# information.
|
||
|
#
|
||
|
# Authors:
|
||
|
# Edward L. Platt <ed@elplatt.com>
|
||
|
#
|
||
|
# TODO:
|
||
|
# - Alter equations for weighted case
|
||
|
# - Write tests for weighted case
|
||
|
"""Functions for detecting communities based on modularity.
|
||
|
"""
|
||
|
|
||
|
import networkx as nx
|
||
|
from networkx.algorithms.community.quality import modularity
|
||
|
|
||
|
from networkx.utils.mapped_queue import MappedQueue
|
||
|
|
||
|
__all__ = [
|
||
|
'greedy_modularity_communities',
|
||
|
'_naive_greedy_modularity_communities']
|
||
|
|
||
|
|
||
|
def greedy_modularity_communities(G, weight=None):
|
||
|
"""Find communities in graph using Clauset-Newman-Moore greedy modularity
|
||
|
maximization. This method currently supports the Graph class and does not
|
||
|
consider edge weights.
|
||
|
|
||
|
Greedy modularity maximization begins with each node in its own community
|
||
|
and joins the pair of communities that most increases modularity until no
|
||
|
such pair exists.
|
||
|
|
||
|
Parameters
|
||
|
----------
|
||
|
G : NetworkX graph
|
||
|
|
||
|
Returns
|
||
|
-------
|
||
|
Yields sets of nodes, one for each community.
|
||
|
|
||
|
Examples
|
||
|
--------
|
||
|
>>> from networkx.algorithms.community import greedy_modularity_communities
|
||
|
>>> G = nx.karate_club_graph()
|
||
|
>>> c = list(greedy_modularity_communities(G))
|
||
|
>>> sorted(c[0])
|
||
|
[8, 14, 15, 18, 20, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]
|
||
|
|
||
|
References
|
||
|
----------
|
||
|
.. [1] M. E. J Newman 'Networks: An Introduction', page 224
|
||
|
Oxford University Press 2011.
|
||
|
.. [2] Clauset, A., Newman, M. E., & Moore, C.
|
||
|
"Finding community structure in very large networks."
|
||
|
Physical Review E 70(6), 2004.
|
||
|
"""
|
||
|
|
||
|
# Count nodes and edges
|
||
|
N = len(G.nodes())
|
||
|
m = sum([d.get('weight', 1) for u, v, d in G.edges(data=True)])
|
||
|
q0 = 1.0 / (2.0*m)
|
||
|
|
||
|
# Map node labels to contiguous integers
|
||
|
label_for_node = dict((i, v) for i, v in enumerate(G.nodes()))
|
||
|
node_for_label = dict((label_for_node[i], i) for i in range(N))
|
||
|
|
||
|
# Calculate degrees
|
||
|
k_for_label = G.degree(G.nodes(), weight=weight)
|
||
|
k = [k_for_label[label_for_node[i]] for i in range(N)]
|
||
|
|
||
|
# Initialize community and merge lists
|
||
|
communities = dict((i, frozenset([i])) for i in range(N))
|
||
|
merges = []
|
||
|
|
||
|
# Initial modularity
|
||
|
partition = [[label_for_node[x] for x in c] for c in communities.values()]
|
||
|
q_cnm = modularity(G, partition)
|
||
|
|
||
|
# Initialize data structures
|
||
|
# CNM Eq 8-9 (Eq 8 was missing a factor of 2 (from A_ij + A_ji)
|
||
|
# a[i]: fraction of edges within community i
|
||
|
# dq_dict[i][j]: dQ for merging community i, j
|
||
|
# dq_heap[i][n] : (-dq, i, j) for communitiy i nth largest dQ
|
||
|
# H[n]: (-dq, i, j) for community with nth largest max_j(dQ_ij)
|
||
|
a = [k[i]*q0 for i in range(N)]
|
||
|
dq_dict = dict(
|
||
|
(i, dict(
|
||
|
(j, 2*q0 - 2*k[i]*k[j]*q0*q0)
|
||
|
for j in [
|
||
|
node_for_label[u]
|
||
|
for u in G.neighbors(label_for_node[i])]
|
||
|
if j != i))
|
||
|
for i in range(N))
|
||
|
dq_heap = [
|
||
|
MappedQueue([
|
||
|
(-dq, i, j)
|
||
|
for j, dq in dq_dict[i].items()])
|
||
|
for i in range(N)]
|
||
|
H = MappedQueue([
|
||
|
dq_heap[i].h[0]
|
||
|
for i in range(N)
|
||
|
if len(dq_heap[i]) > 0])
|
||
|
|
||
|
# Merge communities until we can't improve modularity
|
||
|
while len(H) > 1:
|
||
|
# Find best merge
|
||
|
# Remove from heap of row maxes
|
||
|
# Ties will be broken by choosing the pair with lowest min community id
|
||
|
try:
|
||
|
dq, i, j = H.pop()
|
||
|
except IndexError:
|
||
|
break
|
||
|
dq = -dq
|
||
|
# Remove best merge from row i heap
|
||
|
dq_heap[i].pop()
|
||
|
# Push new row max onto H
|
||
|
if len(dq_heap[i]) > 0:
|
||
|
H.push(dq_heap[i].h[0])
|
||
|
# If this element was also at the root of row j, we need to remove the
|
||
|
# duplicate entry from H
|
||
|
if dq_heap[j].h[0] == (-dq, j, i):
|
||
|
H.remove((-dq, j, i))
|
||
|
# Remove best merge from row j heap
|
||
|
dq_heap[j].remove((-dq, j, i))
|
||
|
# Push new row max onto H
|
||
|
if len(dq_heap[j]) > 0:
|
||
|
H.push(dq_heap[j].h[0])
|
||
|
else:
|
||
|
# Duplicate wasn't in H, just remove from row j heap
|
||
|
dq_heap[j].remove((-dq, j, i))
|
||
|
# Stop when change is non-positive
|
||
|
if dq <= 0:
|
||
|
break
|
||
|
|
||
|
# Perform merge
|
||
|
communities[j] = frozenset(communities[i] | communities[j])
|
||
|
del communities[i]
|
||
|
merges.append((i, j, dq))
|
||
|
# New modularity
|
||
|
q_cnm += dq
|
||
|
# Get list of communities connected to merged communities
|
||
|
i_set = set(dq_dict[i].keys())
|
||
|
j_set = set(dq_dict[j].keys())
|
||
|
all_set = (i_set | j_set) - set([i, j])
|
||
|
both_set = i_set & j_set
|
||
|
# Merge i into j and update dQ
|
||
|
for k in all_set:
|
||
|
# Calculate new dq value
|
||
|
if k in both_set:
|
||
|
dq_jk = dq_dict[j][k] + dq_dict[i][k]
|
||
|
elif k in j_set:
|
||
|
dq_jk = dq_dict[j][k] - 2.0*a[i]*a[k]
|
||
|
else:
|
||
|
# k in i_set
|
||
|
dq_jk = dq_dict[i][k] - 2.0*a[j]*a[k]
|
||
|
# Update rows j and k
|
||
|
for row, col in [(j, k), (k, j)]:
|
||
|
# Save old value for finding heap index
|
||
|
if k in j_set:
|
||
|
d_old = (-dq_dict[row][col], row, col)
|
||
|
else:
|
||
|
d_old = None
|
||
|
# Update dict for j,k only (i is removed below)
|
||
|
dq_dict[row][col] = dq_jk
|
||
|
# Save old max of per-row heap
|
||
|
if len(dq_heap[row]) > 0:
|
||
|
d_oldmax = dq_heap[row].h[0]
|
||
|
else:
|
||
|
d_oldmax = None
|
||
|
# Add/update heaps
|
||
|
d = (-dq_jk, row, col)
|
||
|
if d_old is None:
|
||
|
# We're creating a new nonzero element, add to heap
|
||
|
dq_heap[row].push(d)
|
||
|
else:
|
||
|
# Update existing element in per-row heap
|
||
|
dq_heap[row].update(d_old, d)
|
||
|
# Update heap of row maxes if necessary
|
||
|
if d_oldmax is None:
|
||
|
# No entries previously in this row, push new max
|
||
|
H.push(d)
|
||
|
else:
|
||
|
# We've updated an entry in this row, has the max changed?
|
||
|
if dq_heap[row].h[0] != d_oldmax:
|
||
|
H.update(d_oldmax, dq_heap[row].h[0])
|
||
|
|
||
|
# Remove row/col i from matrix
|
||
|
i_neighbors = dq_dict[i].keys()
|
||
|
for k in i_neighbors:
|
||
|
# Remove from dict
|
||
|
dq_old = dq_dict[k][i]
|
||
|
del dq_dict[k][i]
|
||
|
# Remove from heaps if we haven't already
|
||
|
if k != j:
|
||
|
# Remove both row and column
|
||
|
for row, col in [(k, i), (i, k)]:
|
||
|
# Check if replaced dq is row max
|
||
|
d_old = (-dq_old, row, col)
|
||
|
if dq_heap[row].h[0] == d_old:
|
||
|
# Update per-row heap and heap of row maxes
|
||
|
dq_heap[row].remove(d_old)
|
||
|
H.remove(d_old)
|
||
|
# Update row max
|
||
|
if len(dq_heap[row]) > 0:
|
||
|
H.push(dq_heap[row].h[0])
|
||
|
else:
|
||
|
# Only update per-row heap
|
||
|
dq_heap[row].remove(d_old)
|
||
|
|
||
|
del dq_dict[i]
|
||
|
# Mark row i as deleted, but keep placeholder
|
||
|
dq_heap[i] = MappedQueue()
|
||
|
# Merge i into j and update a
|
||
|
a[j] += a[i]
|
||
|
a[i] = 0
|
||
|
|
||
|
communities = [
|
||
|
frozenset([label_for_node[i] for i in c])
|
||
|
for c in communities.values()]
|
||
|
return sorted(communities, key=len, reverse=True)
|
||
|
|
||
|
|
||
|
def _naive_greedy_modularity_communities(G):
|
||
|
"""Find communities in graph using the greedy modularity maximization.
|
||
|
This implementation is O(n^4), much slower than alternatives, but it is
|
||
|
provided as an easy-to-understand reference implementation.
|
||
|
"""
|
||
|
# First create one community for each node
|
||
|
communities = list([frozenset([u]) for u in G.nodes()])
|
||
|
# Track merges
|
||
|
merges = []
|
||
|
# Greedily merge communities until no improvement is possible
|
||
|
old_modularity = None
|
||
|
new_modularity = modularity(G, communities)
|
||
|
while old_modularity is None or new_modularity > old_modularity:
|
||
|
# Save modularity for comparison
|
||
|
old_modularity = new_modularity
|
||
|
# Find best pair to merge
|
||
|
trial_communities = list(communities)
|
||
|
to_merge = None
|
||
|
for i, u in enumerate(communities):
|
||
|
for j, v in enumerate(communities):
|
||
|
# Skip i=j and empty communities
|
||
|
if j <= i or len(u) == 0 or len(v) == 0:
|
||
|
continue
|
||
|
# Merge communities u and v
|
||
|
trial_communities[j] = u | v
|
||
|
trial_communities[i] = frozenset([])
|
||
|
trial_modularity = modularity(G, trial_communities)
|
||
|
if trial_modularity >= new_modularity:
|
||
|
# Check if strictly better or tie
|
||
|
if trial_modularity > new_modularity:
|
||
|
# Found new best, save modularity and group indexes
|
||
|
new_modularity = trial_modularity
|
||
|
to_merge = (i, j, new_modularity - old_modularity)
|
||
|
elif (
|
||
|
to_merge and
|
||
|
min(i, j) < min(to_merge[0], to_merge[1])
|
||
|
):
|
||
|
# Break ties by choosing pair with lowest min id
|
||
|
new_modularity = trial_modularity
|
||
|
to_merge = (i, j, new_modularity - old_modularity)
|
||
|
# Un-merge
|
||
|
trial_communities[i] = u
|
||
|
trial_communities[j] = v
|
||
|
if to_merge is not None:
|
||
|
# If the best merge improves modularity, use it
|
||
|
merges.append(to_merge)
|
||
|
i, j, dq = to_merge
|
||
|
u, v = communities[i], communities[j]
|
||
|
communities[j] = u | v
|
||
|
communities[i] = frozenset([])
|
||
|
# Remove empty communities and sort
|
||
|
communities = [c for c in communities if len(c) > 0]
|
||
|
for com in sorted(communities, key=lambda x: len(x), reverse=True):
|
||
|
yield com
|