203 lines
6.8 KiB
Python
203 lines
6.8 KiB
Python
# -*- coding: utf-8 -*-
|
|
# Copyright (C) 2015-2019 Aitor Almeida
|
|
# All rights reserved.
|
|
# BSD license.
|
|
#
|
|
# Author: Aitor Almeida <aitoralmeida@gmail.com>
|
|
"""
|
|
Label propagation community detection algorithms.
|
|
"""
|
|
from collections import Counter
|
|
|
|
import networkx as nx
|
|
from networkx.utils import groups
|
|
from networkx.utils import not_implemented_for
|
|
from networkx.utils import py_random_state
|
|
|
|
__all__ = ['label_propagation_communities', 'asyn_lpa_communities']
|
|
|
|
|
|
@py_random_state(2)
|
|
def asyn_lpa_communities(G, weight=None, seed=None):
|
|
"""Returns communities in `G` as detected by asynchronous label
|
|
propagation.
|
|
|
|
The asynchronous label propagation algorithm is described in
|
|
[1]_. The algorithm is probabilistic and the found communities may
|
|
vary on different executions.
|
|
|
|
The algorithm proceeds as follows. After initializing each node with
|
|
a unique label, the algorithm repeatedly sets the label of a node to
|
|
be the label that appears most frequently among that nodes
|
|
neighbors. The algorithm halts when each node has the label that
|
|
appears most frequently among its neighbors. The algorithm is
|
|
asynchronous because each node is updated without waiting for
|
|
updates on the remaining nodes.
|
|
|
|
This generalized version of the algorithm in [1]_ accepts edge
|
|
weights.
|
|
|
|
Parameters
|
|
----------
|
|
G : Graph
|
|
|
|
weight : string
|
|
The edge attribute representing the weight of an edge.
|
|
If None, each edge is assumed to have weight one. In this
|
|
algorithm, the weight of an edge is used in determining the
|
|
frequency with which a label appears among the neighbors of a
|
|
node: a higher weight means the label appears more often.
|
|
|
|
seed : integer, random_state, or None (default)
|
|
Indicator of random number generation state.
|
|
See :ref:`Randomness<randomness>`.
|
|
|
|
Returns
|
|
-------
|
|
communities : iterable
|
|
Iterable of communities given as sets of nodes.
|
|
|
|
Notes
|
|
------
|
|
Edge weight attributes must be numerical.
|
|
|
|
References
|
|
----------
|
|
.. [1] Raghavan, Usha Nandini, Réka Albert, and Soundar Kumara. "Near
|
|
linear time algorithm to detect community structures in large-scale
|
|
networks." Physical Review E 76.3 (2007): 036106.
|
|
"""
|
|
|
|
labels = {n: i for i, n in enumerate(G)}
|
|
cont = True
|
|
while cont:
|
|
cont = False
|
|
nodes = list(G)
|
|
seed.shuffle(nodes)
|
|
# Calculate the label for each node
|
|
for node in nodes:
|
|
if len(G[node]) < 1:
|
|
continue
|
|
|
|
# Get label frequencies. Depending on the order they are processed
|
|
# in some nodes with be in t and others in t-1, making the
|
|
# algorithm asynchronous.
|
|
label_freq = Counter()
|
|
for v in G[node]:
|
|
label_freq.update({labels[v]: G.edges[node, v][weight]
|
|
if weight else 1})
|
|
# Choose the label with the highest frecuency. If more than 1 label
|
|
# has the highest frecuency choose one randomly.
|
|
max_freq = max(label_freq.values())
|
|
best_labels = [label for label, freq in label_freq.items()
|
|
if freq == max_freq]
|
|
|
|
# Continue until all nodes have a majority label
|
|
if labels[node] not in best_labels:
|
|
labels[node] = seed.choice(best_labels)
|
|
cont = True
|
|
|
|
# TODO In Python 3.3 or later, this should be `yield from ...`.
|
|
return iter(groups(labels).values())
|
|
|
|
|
|
@not_implemented_for('directed')
|
|
def label_propagation_communities(G):
|
|
"""Generates community sets determined by label propagation
|
|
|
|
Finds communities in `G` using a semi-synchronous label propagation
|
|
method[1]_. This method combines the advantages of both the synchronous
|
|
and asynchronous models. Not implemented for directed graphs.
|
|
|
|
Parameters
|
|
----------
|
|
G : graph
|
|
An undirected NetworkX graph.
|
|
|
|
Yields
|
|
------
|
|
communities : generator
|
|
Yields sets of the nodes in each community.
|
|
|
|
Raises
|
|
------
|
|
NetworkXNotImplemented
|
|
If the graph is directed
|
|
|
|
References
|
|
----------
|
|
.. [1] Cordasco, G., & Gargano, L. (2010, December). Community detection
|
|
via semi-synchronous label propagation algorithms. In Business
|
|
Applications of Social Network Analysis (BASNA), 2010 IEEE International
|
|
Workshop on (pp. 1-8). IEEE.
|
|
"""
|
|
coloring = _color_network(G)
|
|
# Create a unique label for each node in the graph
|
|
labeling = {v: k for k, v in enumerate(G)}
|
|
while not _labeling_complete(labeling, G):
|
|
# Update the labels of every node with the same color.
|
|
for color, nodes in coloring.items():
|
|
for n in nodes:
|
|
_update_label(n, labeling, G)
|
|
|
|
for label in set(labeling.values()):
|
|
yield set((x for x in labeling if labeling[x] == label))
|
|
|
|
|
|
def _color_network(G):
|
|
"""Colors the network so that neighboring nodes all have distinct colors.
|
|
|
|
Returns a dict keyed by color to a set of nodes with that color.
|
|
"""
|
|
coloring = dict() # color => set(node)
|
|
colors = nx.coloring.greedy_color(G)
|
|
for node, color in colors.items():
|
|
if color in coloring:
|
|
coloring[color].add(node)
|
|
else:
|
|
coloring[color] = set([node])
|
|
return coloring
|
|
|
|
|
|
def _labeling_complete(labeling, G):
|
|
"""Determines whether or not LPA is done.
|
|
|
|
Label propagation is complete when all nodes have a label that is
|
|
in the set of highest frequency labels amongst its neighbors.
|
|
|
|
Nodes with no neighbors are considered complete.
|
|
"""
|
|
return all(labeling[v] in _most_frequent_labels(v, labeling, G)
|
|
for v in G if len(G[v]) > 0)
|
|
|
|
|
|
def _most_frequent_labels(node, labeling, G):
|
|
"""Returns a set of all labels with maximum frequency in `labeling`.
|
|
|
|
Input `labeling` should be a dict keyed by node to labels.
|
|
"""
|
|
if not G[node]:
|
|
# Nodes with no neighbors are themselves a community and are labeled
|
|
# accordingly, hence the immediate if statement.
|
|
return {labeling[node]}
|
|
|
|
# Compute the frequencies of all neighbours of node
|
|
freqs = Counter(labeling[q] for q in G[node])
|
|
max_freq = max(freqs.values())
|
|
return {label for label, freq in freqs.items() if freq == max_freq}
|
|
|
|
|
|
def _update_label(node, labeling, G):
|
|
"""Updates the label of a node using the Prec-Max tie breaking algorithm
|
|
|
|
The algorithm is explained in: 'Community Detection via Semi-Synchronous
|
|
Label Propagation Algorithms' Cordasco and Gargano, 2011
|
|
"""
|
|
high_labels = _most_frequent_labels(node, labeling, G)
|
|
if len(high_labels) == 1:
|
|
labeling[node] = high_labels.pop()
|
|
elif len(high_labels) > 1:
|
|
# Prec-Max
|
|
if labeling[node] not in high_labels:
|
|
labeling[node] = max(high_labels)
|