1/* $Id$Revision: */
2/* vim:set shiftwidth=4 ts=8: */
3
4/*************************************************************************
5 * Copyright (c) 2011 AT&T Intellectual Property
6 * All rights reserved. This program and the accompanying materials
7 * are made available under the terms of the Eclipse Public License v1.0
8 * which accompanies this distribution, and is available at
9 * http://www.eclipse.org/legal/epl-v10.html
10 *
11 * Contributors: See CVS logs. Details at http://www.graphviz.org/
12 *************************************************************************/
13
14#define STANDALONE
15#include "general.h"
16#include "SparseMatrix.h"
17#include "clustering.h"
18
19
20
21static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_init(SparseMatrix A, int level){
22 Multilevel_Modularity_Clustering grid;
23 int n = A->n, i, j;
24
25 assert(A->type == MATRIX_TYPE_REAL);
26 assert(SparseMatrix_is_symmetric(A, FALSE));
27
28 if (!A) return NULL;
29 assert(A->m == n);
30 grid = MALLOC(sizeof(struct Multilevel_Modularity_Clustering_struct));
31 grid->level = level;
32 grid->n = n;
33 grid->A = A;
34 grid->P = NULL;
35 grid->R = NULL;
36 grid->next = NULL;
37 grid->prev = NULL;
38 grid->delete_top_level_A = FALSE;
39 grid->matching = MALLOC(sizeof(real)*(n));
40 grid->deg = NULL;
41 grid->agglomerate_regardless = FALSE;
42
43 if (level == 0){
44 real modularity = 0;
45 int *ia = A->ia, *ja = A->ja, n = A->n;
46 real deg_total = 0;
47 real *deg, *a = (real*) (A->a);
48 real *indeg;
49
50 grid->deg_total = 0.;
51 grid->deg = MALLOC(sizeof(real)*(n));
52 deg = grid->deg;
53
54 indeg = MALLOC(sizeof(real)*n);
55 for (i = 0; i < n; i++){
56 deg[i] = 0;
57 indeg[i] = 0.;
58 for (j = ia[i]; j < ia[i+1]; j++){
59 deg[i] += a[j];
60 if (ja[j] == i) indeg[i] = a[j];
61 }
62 deg_total += deg[i];
63 }
64 if (deg_total == 0) deg_total = 1;
65 for (i = 0; i < n; i++){
66 modularity += (indeg[i] - deg[i]*deg[i]/deg_total)/deg_total;
67 }
68 grid->deg_total = deg_total;
69 grid->deg = deg;
70 grid->modularity = modularity;
71 FREE(indeg);
72 }
73
74
75 return grid;
76}
77
78static void Multilevel_Modularity_Clustering_delete(Multilevel_Modularity_Clustering grid){
79 if (!grid) return;
80 if (grid->A){
81 if (grid->level == 0) {
82 if (grid->delete_top_level_A) SparseMatrix_delete(grid->A);
83 } else {
84 SparseMatrix_delete(grid->A);
85 }
86 }
87 SparseMatrix_delete(grid->P);
88 SparseMatrix_delete(grid->R);
89 FREE(grid->matching);
90 FREE(grid->deg);
91
92 Multilevel_Modularity_Clustering_delete(grid->next);
93 FREE(grid);
94}
95
96static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_establish(Multilevel_Modularity_Clustering grid, int ncluster_target){
97 int *matching = grid->matching;
98 SparseMatrix A = grid->A;
99 int n = grid->n, level = grid->level, nc = 0;
100 real modularity = 0;
101 int *ia = A->ia, *ja = A->ja;
102 real *a;
103 real *deg = grid->deg;
104 real *deg_new;
105 int i, j, jj, jc, jmax;
106 real inv_deg_total = 1./(grid->deg_total);
107 real *deg_inter, gain;
108 int *mask;
109 real maxgain;
110 real total_gain = 0;
111 modularity = grid->modularity;
112
113 deg_new = MALLOC(sizeof(real)*n);
114 deg_inter = MALLOC(sizeof(real)*n);
115 mask = MALLOC(sizeof(int)*n);
116 for (i = 0; i < n; i++) mask[i] = -1;
117
118 assert(n == A->n);
119 for (i = 0; i < n; i++) matching[i] = UNMATCHED;
120
121 /* gain in merging node i into cluster j is
122 deg(i,j)/deg_total - 2*deg(i)*deg(j)/deg_total^2
123 */
124 a = (real*) A->a;
125 for (i = 0; i < n; i++){
126 if (matching[i] != UNMATCHED) continue;
127 /* accumulate connections between i and clusters */
128 for (j = ia[i]; j < ia[i+1]; j++){
129 jj = ja[j];
130 if (jj == i) continue;
131 if ((jc=matching[jj]) != UNMATCHED){
132 if (mask[jc] != i) {
133 mask[jc] = i;
134 deg_inter[jc] = a[j];
135 } else {
136 deg_inter[jc] += a[j];
137 }
138 }
139 }
140
141 maxgain = 0;
142 jmax = -1;
143 for (j = ia[i]; j < ia[i+1]; j++){
144 jj = ja[j];
145 if (jj == i) continue;
146 if ((jc=matching[jj]) == UNMATCHED){
147 /* the first 2 is due to the fact that deg_iter gives edge weight from i to jj, but there are also edges from jj to i */
148 gain = (2*a[j] - 2*deg[i]*deg[jj]*inv_deg_total)*inv_deg_total;
149 } else {
150 if (deg_inter[jc] > 0){
151 /* the first 2 is due to the fact that deg_iter gives edge weight from i to jc, but there are also edges from jc to i */
152 gain = (2*deg_inter[jc] - 2*deg[i]*deg_new[jc]*inv_deg_total)*inv_deg_total;
153 // printf("mod = %f deg_inter[jc] =%f, deg[i] = %f, deg_new[jc]=%f, gain = %f\n",modularity, deg_inter[jc],deg[i],deg_new[jc],gain);
154 deg_inter[jc] = -1; /* so that we do not redo the calulation when we hit another neighbor in cluster jc */
155 } else {
156 gain = -1;
157 }
158 }
159 if (jmax < 0 || gain > maxgain){
160 maxgain = gain;
161 jmax = jj;
162 }
163
164 }
165
166 /* now merge i and jmax */
167 if (maxgain > 0 || grid->agglomerate_regardless){
168 total_gain += maxgain;
169 jc = matching[jmax];
170 if (jc == UNMATCHED){
171 //fprintf(stderr, "maxgain=%f, merge %d, %d\n",maxgain, i, jmax);
172 matching[i] = matching[jmax] = nc;
173 deg_new[nc] = deg[i] + deg[jmax];
174 nc++;
175 } else {
176 //fprintf(stderr, "maxgain=%f, merge with existing cluster %d, %d\n",maxgain, i, jc);
177 deg_new[jc] += deg[i];
178 matching[i] = jc;
179 }
180 } else {
181 assert(maxgain <= 0);
182 matching[i] = nc;
183 deg_new[nc] = deg[i];
184 nc++;
185 }
186
187 }
188
189 if (Verbose) fprintf(stderr,"modularity = %f new modularity = %f level = %d, n = %d, nc = %d, gain = %g\n", modularity, modularity + total_gain,
190 level, n, nc, total_gain);
191
192 /* !!!!!!!!!!!!!!!!!!!!!! */
193 if (ncluster_target > 0){
194 if (nc <= ncluster_target && n >= ncluster_target){
195 if (n - ncluster_target > ncluster_target - nc){/* ncluster = nc */
196
197 } else if (n - ncluster_target <= ncluster_target - nc){/* ncluster_target close to n */
198 fprintf(stderr,"ncluster_target = %d, close to n=%d\n", ncluster_target, n);
199 for (i = 0; i < n; i++) matching[i] = i;
200 FREE(deg_new);
201 goto RETURN;
202 }
203 } else if (n < ncluster_target){
204 fprintf(stderr,"n < target\n");
205 for (i = 0; i < n; i++) matching[i] = i;
206 FREE(deg_new);
207 goto RETURN;
208 }
209 }
210
211 if (nc >= 1 && (total_gain > 0 || nc < n)){
212 /* now set up restriction and prolongation operator */
213 SparseMatrix P, R, R0, B, cA;
214 real one = 1.;
215 Multilevel_Modularity_Clustering cgrid;
216
217 R0 = SparseMatrix_new(nc, n, 1, MATRIX_TYPE_REAL, FORMAT_COORD);
218 for (i = 0; i < n; i++){
219 jj = matching[i];
220 SparseMatrix_coordinate_form_add_entries(R0, 1, &jj, &i, &one);
221 }
222 R = SparseMatrix_from_coordinate_format(R0);
223 SparseMatrix_delete(R0);
224 P = SparseMatrix_transpose(R);
225 B = SparseMatrix_multiply(R, A);
226 if (!B) goto RETURN;
227 cA = SparseMatrix_multiply(B, P);
228 if (!cA) goto RETURN;
229 SparseMatrix_delete(B);
230 grid->P = P;
231 grid->R = R;
232 level++;
233 cgrid = Multilevel_Modularity_Clustering_init(cA, level);
234 deg_new = REALLOC(deg_new, nc*sizeof(real));
235 cgrid->deg = deg_new;
236 cgrid->modularity = grid->modularity + total_gain;
237 cgrid->deg_total = grid->deg_total;
238 cgrid = Multilevel_Modularity_Clustering_establish(cgrid, ncluster_target);
239 grid->next = cgrid;
240 cgrid->prev = grid;
241 } else {
242 /* if we want a small number of cluster but right now we have too many, we will force agglomeration */
243 if (ncluster_target > 0 && nc > ncluster_target && !(grid->agglomerate_regardless)){
244 grid->agglomerate_regardless = TRUE;
245 FREE(deg_inter);
246 FREE(mask);
247 FREE(deg_new);
248 return Multilevel_Modularity_Clustering_establish(grid, ncluster_target);
249 }
250 /* no more improvement, stop and final clustering found */
251 for (i = 0; i < n; i++) matching[i] = i;
252 FREE(deg_new);
253 }
254
255 RETURN:
256 FREE(deg_inter);
257 FREE(mask);
258 return grid;
259}
260
261static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_new(SparseMatrix A0, int ncluster_target){
262 /* ncluster_target is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
263 is desired. The resulting clustering will give as close to this number as possible.
264 If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
265 . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
266 . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
267 . selected among nc or nc2, which ever is closer to ncluster_target.
268 Default: ncluster_target <= 0 */
269
270 Multilevel_Modularity_Clustering grid;
271 SparseMatrix A = A0;
272
273 if (!SparseMatrix_is_symmetric(A, FALSE) || A->type != MATRIX_TYPE_REAL){
274 A = SparseMatrix_get_real_adjacency_matrix_symmetrized(A);
275 }
276 grid = Multilevel_Modularity_Clustering_init(A, 0);
277
278 grid = Multilevel_Modularity_Clustering_establish(grid, ncluster_target);
279
280 if (A != A0) grid->delete_top_level_A = TRUE;/* be sure to clean up later */
281 return grid;
282}
283
284
285static void hierachical_modularity_clustering(SparseMatrix A, int ncluster_target,
286 int *nclusters, int **assignment, real *modularity, int *flag){
287 /* find a clustering of vertices by maximize modularity
288 A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
289
290 ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
291 is desired. The resulting clustering will give as close to this number as possible.
292 If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
293 . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
294 . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
295 . selected among nc or nc2, which ever is closer to ncluster_target.
296 Default: ncluster_target <= 0
297
298 nclusters: on output the number of clusters
299 assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
300 */
301
302 Multilevel_Modularity_Clustering grid, cgrid;
303 int *matching, i;
304 SparseMatrix P;
305 real *u;
306 assert(A->m == A->n);
307
308 *modularity = 0.;
309
310 *flag = 0;
311
312 grid = Multilevel_Modularity_Clustering_new(A, ncluster_target);
313
314 /* find coarsest */
315 cgrid = grid;
316 while (cgrid->next){
317 cgrid = cgrid->next;
318 }
319
320 /* project clustering up */
321 u = MALLOC(sizeof(real)*cgrid->n);
322 for (i = 0; i < cgrid->n; i++) u[i] = (real) (cgrid->matching)[i];
323 *nclusters = cgrid->n;
324 *modularity = cgrid->modularity;
325
326 while (cgrid->prev){
327 real *v = NULL;
328 P = cgrid->prev->P;
329 SparseMatrix_multiply_vector(P, u, &v, FALSE);
330 FREE(u);
331 u = v;
332 cgrid = cgrid->prev;
333 }
334
335 if (*assignment){
336 matching = *assignment;
337 } else {
338 matching = MALLOC(sizeof(int)*(grid->n));
339 *assignment = matching;
340 }
341 for (i = 0; i < grid->n; i++) (matching)[i] = (int) u[i];
342 FREE(u);
343
344 Multilevel_Modularity_Clustering_delete(grid);
345
346}
347
348
349
350void modularity_clustering(SparseMatrix A, int inplace, int ncluster_target, int use_value,
351 int *nclusters, int **assignment, real *modularity, int *flag){
352 /* find a clustering of vertices by maximize modularity
353 A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1.
354 inplace: whether A can e modified. If true, A will be modified by removing diagonal.
355 ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters
356 is desired. The resulting clustering will give as close to this number as possible.
357 If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity.
358 . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2,
359 . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be
360 . selected among nc or nc2, which ever is closer to ncluster_target.
361 Default: ncluster_target <= 0
362 nclusters: on output the number of clusters
363 assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters
364 */
365 SparseMatrix B;
366
367 *flag = 0;
368
369 assert(A->m == A->n);
370
371 B = SparseMatrix_symmetrize(A, FALSE);
372
373 if (!inplace && B == A) {
374 B = SparseMatrix_copy(A);
375 }
376
377 B = SparseMatrix_remove_diagonal(B);
378
379 if (B->type != MATRIX_TYPE_REAL || !use_value) B = SparseMatrix_set_entries_to_real_one(B);
380
381 hierachical_modularity_clustering(B, ncluster_target, nclusters, assignment, modularity, flag);
382
383 if (B != A) SparseMatrix_delete(B);
384
385}
386