1 | /* $Id$Revision: */ |
2 | /* vim:set shiftwidth=4 ts=8: */ |
3 | |
4 | /************************************************************************* |
5 | * Copyright (c) 2011 AT&T Intellectual Property |
6 | * All rights reserved. This program and the accompanying materials |
7 | * are made available under the terms of the Eclipse Public License v1.0 |
8 | * which accompanies this distribution, and is available at |
9 | * http://www.eclipse.org/legal/epl-v10.html |
10 | * |
11 | * Contributors: See CVS logs. Details at http://www.graphviz.org/ |
12 | *************************************************************************/ |
13 | |
14 | #define STANDALONE |
15 | #include "general.h" |
16 | #include "SparseMatrix.h" |
17 | #include "clustering.h" |
18 | |
19 | |
20 | |
21 | static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_init(SparseMatrix A, int level){ |
22 | Multilevel_Modularity_Clustering grid; |
23 | int n = A->n, i, j; |
24 | |
25 | assert(A->type == MATRIX_TYPE_REAL); |
26 | assert(SparseMatrix_is_symmetric(A, FALSE)); |
27 | |
28 | if (!A) return NULL; |
29 | assert(A->m == n); |
30 | grid = MALLOC(sizeof(struct Multilevel_Modularity_Clustering_struct)); |
31 | grid->level = level; |
32 | grid->n = n; |
33 | grid->A = A; |
34 | grid->P = NULL; |
35 | grid->R = NULL; |
36 | grid->next = NULL; |
37 | grid->prev = NULL; |
38 | grid->delete_top_level_A = FALSE; |
39 | grid->matching = MALLOC(sizeof(real)*(n)); |
40 | grid->deg = NULL; |
41 | grid->agglomerate_regardless = FALSE; |
42 | |
43 | if (level == 0){ |
44 | real modularity = 0; |
45 | int *ia = A->ia, *ja = A->ja, n = A->n; |
46 | real deg_total = 0; |
47 | real *deg, *a = (real*) (A->a); |
48 | real *indeg; |
49 | |
50 | grid->deg_total = 0.; |
51 | grid->deg = MALLOC(sizeof(real)*(n)); |
52 | deg = grid->deg; |
53 | |
54 | indeg = MALLOC(sizeof(real)*n); |
55 | for (i = 0; i < n; i++){ |
56 | deg[i] = 0; |
57 | indeg[i] = 0.; |
58 | for (j = ia[i]; j < ia[i+1]; j++){ |
59 | deg[i] += a[j]; |
60 | if (ja[j] == i) indeg[i] = a[j]; |
61 | } |
62 | deg_total += deg[i]; |
63 | } |
64 | if (deg_total == 0) deg_total = 1; |
65 | for (i = 0; i < n; i++){ |
66 | modularity += (indeg[i] - deg[i]*deg[i]/deg_total)/deg_total; |
67 | } |
68 | grid->deg_total = deg_total; |
69 | grid->deg = deg; |
70 | grid->modularity = modularity; |
71 | FREE(indeg); |
72 | } |
73 | |
74 | |
75 | return grid; |
76 | } |
77 | |
78 | static void Multilevel_Modularity_Clustering_delete(Multilevel_Modularity_Clustering grid){ |
79 | if (!grid) return; |
80 | if (grid->A){ |
81 | if (grid->level == 0) { |
82 | if (grid->delete_top_level_A) SparseMatrix_delete(grid->A); |
83 | } else { |
84 | SparseMatrix_delete(grid->A); |
85 | } |
86 | } |
87 | SparseMatrix_delete(grid->P); |
88 | SparseMatrix_delete(grid->R); |
89 | FREE(grid->matching); |
90 | FREE(grid->deg); |
91 | |
92 | Multilevel_Modularity_Clustering_delete(grid->next); |
93 | FREE(grid); |
94 | } |
95 | |
96 | static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_establish(Multilevel_Modularity_Clustering grid, int ncluster_target){ |
97 | int *matching = grid->matching; |
98 | SparseMatrix A = grid->A; |
99 | int n = grid->n, level = grid->level, nc = 0; |
100 | real modularity = 0; |
101 | int *ia = A->ia, *ja = A->ja; |
102 | real *a; |
103 | real *deg = grid->deg; |
104 | real *deg_new; |
105 | int i, j, jj, jc, jmax; |
106 | real inv_deg_total = 1./(grid->deg_total); |
107 | real *deg_inter, gain; |
108 | int *mask; |
109 | real maxgain; |
110 | real total_gain = 0; |
111 | modularity = grid->modularity; |
112 | |
113 | deg_new = MALLOC(sizeof(real)*n); |
114 | deg_inter = MALLOC(sizeof(real)*n); |
115 | mask = MALLOC(sizeof(int)*n); |
116 | for (i = 0; i < n; i++) mask[i] = -1; |
117 | |
118 | assert(n == A->n); |
119 | for (i = 0; i < n; i++) matching[i] = UNMATCHED; |
120 | |
121 | /* gain in merging node i into cluster j is |
122 | deg(i,j)/deg_total - 2*deg(i)*deg(j)/deg_total^2 |
123 | */ |
124 | a = (real*) A->a; |
125 | for (i = 0; i < n; i++){ |
126 | if (matching[i] != UNMATCHED) continue; |
127 | /* accumulate connections between i and clusters */ |
128 | for (j = ia[i]; j < ia[i+1]; j++){ |
129 | jj = ja[j]; |
130 | if (jj == i) continue; |
131 | if ((jc=matching[jj]) != UNMATCHED){ |
132 | if (mask[jc] != i) { |
133 | mask[jc] = i; |
134 | deg_inter[jc] = a[j]; |
135 | } else { |
136 | deg_inter[jc] += a[j]; |
137 | } |
138 | } |
139 | } |
140 | |
141 | maxgain = 0; |
142 | jmax = -1; |
143 | for (j = ia[i]; j < ia[i+1]; j++){ |
144 | jj = ja[j]; |
145 | if (jj == i) continue; |
146 | if ((jc=matching[jj]) == UNMATCHED){ |
147 | /* the first 2 is due to the fact that deg_iter gives edge weight from i to jj, but there are also edges from jj to i */ |
148 | gain = (2*a[j] - 2*deg[i]*deg[jj]*inv_deg_total)*inv_deg_total; |
149 | } else { |
150 | if (deg_inter[jc] > 0){ |
151 | /* the first 2 is due to the fact that deg_iter gives edge weight from i to jc, but there are also edges from jc to i */ |
152 | gain = (2*deg_inter[jc] - 2*deg[i]*deg_new[jc]*inv_deg_total)*inv_deg_total; |
153 | // printf("mod = %f deg_inter[jc] =%f, deg[i] = %f, deg_new[jc]=%f, gain = %f\n",modularity, deg_inter[jc],deg[i],deg_new[jc],gain); |
154 | deg_inter[jc] = -1; /* so that we do not redo the calulation when we hit another neighbor in cluster jc */ |
155 | } else { |
156 | gain = -1; |
157 | } |
158 | } |
159 | if (jmax < 0 || gain > maxgain){ |
160 | maxgain = gain; |
161 | jmax = jj; |
162 | } |
163 | |
164 | } |
165 | |
166 | /* now merge i and jmax */ |
167 | if (maxgain > 0 || grid->agglomerate_regardless){ |
168 | total_gain += maxgain; |
169 | jc = matching[jmax]; |
170 | if (jc == UNMATCHED){ |
171 | //fprintf(stderr, "maxgain=%f, merge %d, %d\n",maxgain, i, jmax); |
172 | matching[i] = matching[jmax] = nc; |
173 | deg_new[nc] = deg[i] + deg[jmax]; |
174 | nc++; |
175 | } else { |
176 | //fprintf(stderr, "maxgain=%f, merge with existing cluster %d, %d\n",maxgain, i, jc); |
177 | deg_new[jc] += deg[i]; |
178 | matching[i] = jc; |
179 | } |
180 | } else { |
181 | assert(maxgain <= 0); |
182 | matching[i] = nc; |
183 | deg_new[nc] = deg[i]; |
184 | nc++; |
185 | } |
186 | |
187 | } |
188 | |
189 | if (Verbose) fprintf(stderr,"modularity = %f new modularity = %f level = %d, n = %d, nc = %d, gain = %g\n" , modularity, modularity + total_gain, |
190 | level, n, nc, total_gain); |
191 | |
192 | /* !!!!!!!!!!!!!!!!!!!!!! */ |
193 | if (ncluster_target > 0){ |
194 | if (nc <= ncluster_target && n >= ncluster_target){ |
195 | if (n - ncluster_target > ncluster_target - nc){/* ncluster = nc */ |
196 | |
197 | } else if (n - ncluster_target <= ncluster_target - nc){/* ncluster_target close to n */ |
198 | fprintf(stderr,"ncluster_target = %d, close to n=%d\n" , ncluster_target, n); |
199 | for (i = 0; i < n; i++) matching[i] = i; |
200 | FREE(deg_new); |
201 | goto RETURN; |
202 | } |
203 | } else if (n < ncluster_target){ |
204 | fprintf(stderr,"n < target\n" ); |
205 | for (i = 0; i < n; i++) matching[i] = i; |
206 | FREE(deg_new); |
207 | goto RETURN; |
208 | } |
209 | } |
210 | |
211 | if (nc >= 1 && (total_gain > 0 || nc < n)){ |
212 | /* now set up restriction and prolongation operator */ |
213 | SparseMatrix P, R, R0, B, cA; |
214 | real one = 1.; |
215 | Multilevel_Modularity_Clustering cgrid; |
216 | |
217 | R0 = SparseMatrix_new(nc, n, 1, MATRIX_TYPE_REAL, FORMAT_COORD); |
218 | for (i = 0; i < n; i++){ |
219 | jj = matching[i]; |
220 | SparseMatrix_coordinate_form_add_entries(R0, 1, &jj, &i, &one); |
221 | } |
222 | R = SparseMatrix_from_coordinate_format(R0); |
223 | SparseMatrix_delete(R0); |
224 | P = SparseMatrix_transpose(R); |
225 | B = SparseMatrix_multiply(R, A); |
226 | if (!B) goto RETURN; |
227 | cA = SparseMatrix_multiply(B, P); |
228 | if (!cA) goto RETURN; |
229 | SparseMatrix_delete(B); |
230 | grid->P = P; |
231 | grid->R = R; |
232 | level++; |
233 | cgrid = Multilevel_Modularity_Clustering_init(cA, level); |
234 | deg_new = REALLOC(deg_new, nc*sizeof(real)); |
235 | cgrid->deg = deg_new; |
236 | cgrid->modularity = grid->modularity + total_gain; |
237 | cgrid->deg_total = grid->deg_total; |
238 | cgrid = Multilevel_Modularity_Clustering_establish(cgrid, ncluster_target); |
239 | grid->next = cgrid; |
240 | cgrid->prev = grid; |
241 | } else { |
242 | /* if we want a small number of cluster but right now we have too many, we will force agglomeration */ |
243 | if (ncluster_target > 0 && nc > ncluster_target && !(grid->agglomerate_regardless)){ |
244 | grid->agglomerate_regardless = TRUE; |
245 | FREE(deg_inter); |
246 | FREE(mask); |
247 | FREE(deg_new); |
248 | return Multilevel_Modularity_Clustering_establish(grid, ncluster_target); |
249 | } |
250 | /* no more improvement, stop and final clustering found */ |
251 | for (i = 0; i < n; i++) matching[i] = i; |
252 | FREE(deg_new); |
253 | } |
254 | |
255 | RETURN: |
256 | FREE(deg_inter); |
257 | FREE(mask); |
258 | return grid; |
259 | } |
260 | |
261 | static Multilevel_Modularity_Clustering Multilevel_Modularity_Clustering_new(SparseMatrix A0, int ncluster_target){ |
262 | /* ncluster_target is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters |
263 | is desired. The resulting clustering will give as close to this number as possible. |
264 | If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity. |
265 | . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2, |
266 | . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be |
267 | . selected among nc or nc2, which ever is closer to ncluster_target. |
268 | Default: ncluster_target <= 0 */ |
269 | |
270 | Multilevel_Modularity_Clustering grid; |
271 | SparseMatrix A = A0; |
272 | |
273 | if (!SparseMatrix_is_symmetric(A, FALSE) || A->type != MATRIX_TYPE_REAL){ |
274 | A = SparseMatrix_get_real_adjacency_matrix_symmetrized(A); |
275 | } |
276 | grid = Multilevel_Modularity_Clustering_init(A, 0); |
277 | |
278 | grid = Multilevel_Modularity_Clustering_establish(grid, ncluster_target); |
279 | |
280 | if (A != A0) grid->delete_top_level_A = TRUE;/* be sure to clean up later */ |
281 | return grid; |
282 | } |
283 | |
284 | |
285 | static void hierachical_modularity_clustering(SparseMatrix A, int ncluster_target, |
286 | int *nclusters, int **assignment, real *modularity, int *flag){ |
287 | /* find a clustering of vertices by maximize modularity |
288 | A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1. |
289 | |
290 | ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters |
291 | is desired. The resulting clustering will give as close to this number as possible. |
292 | If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity. |
293 | . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2, |
294 | . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be |
295 | . selected among nc or nc2, which ever is closer to ncluster_target. |
296 | Default: ncluster_target <= 0 |
297 | |
298 | nclusters: on output the number of clusters |
299 | assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters |
300 | */ |
301 | |
302 | Multilevel_Modularity_Clustering grid, cgrid; |
303 | int *matching, i; |
304 | SparseMatrix P; |
305 | real *u; |
306 | assert(A->m == A->n); |
307 | |
308 | *modularity = 0.; |
309 | |
310 | *flag = 0; |
311 | |
312 | grid = Multilevel_Modularity_Clustering_new(A, ncluster_target); |
313 | |
314 | /* find coarsest */ |
315 | cgrid = grid; |
316 | while (cgrid->next){ |
317 | cgrid = cgrid->next; |
318 | } |
319 | |
320 | /* project clustering up */ |
321 | u = MALLOC(sizeof(real)*cgrid->n); |
322 | for (i = 0; i < cgrid->n; i++) u[i] = (real) (cgrid->matching)[i]; |
323 | *nclusters = cgrid->n; |
324 | *modularity = cgrid->modularity; |
325 | |
326 | while (cgrid->prev){ |
327 | real *v = NULL; |
328 | P = cgrid->prev->P; |
329 | SparseMatrix_multiply_vector(P, u, &v, FALSE); |
330 | FREE(u); |
331 | u = v; |
332 | cgrid = cgrid->prev; |
333 | } |
334 | |
335 | if (*assignment){ |
336 | matching = *assignment; |
337 | } else { |
338 | matching = MALLOC(sizeof(int)*(grid->n)); |
339 | *assignment = matching; |
340 | } |
341 | for (i = 0; i < grid->n; i++) (matching)[i] = (int) u[i]; |
342 | FREE(u); |
343 | |
344 | Multilevel_Modularity_Clustering_delete(grid); |
345 | |
346 | } |
347 | |
348 | |
349 | |
350 | void modularity_clustering(SparseMatrix A, int inplace, int ncluster_target, int use_value, |
351 | int *nclusters, int **assignment, real *modularity, int *flag){ |
352 | /* find a clustering of vertices by maximize modularity |
353 | A: symmetric square matrix n x n. If real value, value will be used as edges weights, otherwise edge weights are considered as 1. |
354 | inplace: whether A can e modified. If true, A will be modified by removing diagonal. |
355 | ncluster_target: is used to specify the target number of cluster desired, e.g., ncluster_target=10 means that around 10 clusters |
356 | is desired. The resulting clustering will give as close to this number as possible. |
357 | If this number != the optimal number of clusters, the resulting modularity may be lower, or equal to, the optimal modularity. |
358 | . Agglomeration will be forced even if that reduces the modularity when there are too many clusters. It will stop when nc <= ncluster_target <= nc2, |
359 | . where nc and nc2 are the number of clusters in the current and next level of clustering. The final cluster number will be |
360 | . selected among nc or nc2, which ever is closer to ncluster_target. |
361 | Default: ncluster_target <= 0 |
362 | nclusters: on output the number of clusters |
363 | assignment: dimension n. Node i is assigned to cluster "assignment[i]". 0 <= assignment < nclusters |
364 | */ |
365 | SparseMatrix B; |
366 | |
367 | *flag = 0; |
368 | |
369 | assert(A->m == A->n); |
370 | |
371 | B = SparseMatrix_symmetrize(A, FALSE); |
372 | |
373 | if (!inplace && B == A) { |
374 | B = SparseMatrix_copy(A); |
375 | } |
376 | |
377 | B = SparseMatrix_remove_diagonal(B); |
378 | |
379 | if (B->type != MATRIX_TYPE_REAL || !use_value) B = SparseMatrix_set_entries_to_real_one(B); |
380 | |
381 | hierachical_modularity_clustering(B, ncluster_target, nclusters, assignment, modularity, flag); |
382 | |
383 | if (B != A) SparseMatrix_delete(B); |
384 | |
385 | } |
386 | |