1/* Copyright (C) 2006 MySQL AB & MySQL Finland AB & TCX DataKonsult AB
2
3 This program is free software; you can redistribute it and/or modify
4 it under the terms of the GNU General Public License as published by
5 the Free Software Foundation; version 2 of the License.
6
7 This program is distributed in the hope that it will be useful,
8 but WITHOUT ANY WARRANTY; without even the implied warranty of
9 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 GNU General Public License for more details.
11
12 You should have received a copy of the GNU General Public License
13 along with this program; if not, write to the Free Software
14 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111-1301 USA */
15
16/* Written by Sergei A. Golubchik, who has a shared copyright to this code
17 added support for long options (my_getopt) 22.5.2002 by Jani Tolonen */
18
19#include "ma_ftdefs.h"
20#include <my_getopt.h>
21
22static void usage();
23static void complain(int val);
24static my_bool get_one_option(int, const struct my_option *, char *);
25
26static int count=0, stats=0, dump=0, lstats=0;
27static my_bool verbose;
28static char *query=NULL;
29static uint lengths[256];
30
31#define MAX_LEN (HA_FT_MAXBYTELEN+10)
32#define HOW_OFTEN_TO_WRITE 10000
33
34static struct my_option my_long_options[] =
35{
36 {"help", 'h', "Display help and exit.",
37 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
38 {"help", '?', "Synonym for -h.",
39 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
40 {"count", 'c', "Calculate per-word stats (counts and global weights).",
41 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
42 {"dump", 'd', "Dump index (incl. data offsets and word weights).",
43 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
44 {"length", 'l', "Report length distribution.",
45 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
46 {"stats", 's', "Report global stats.",
47 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
48 {"verbose", 'v', "Be verbose.",
49 &verbose, &verbose, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
50 { 0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
51};
52
53
54int main(int argc,char *argv[])
55{
56 int error=0;
57 uint keylen, keylen2=0, inx, doc_cnt=0;
58 float weight= 1.0;
59 double gws, min_gws=0, avg_gws=0;
60 MARIA_HA *info;
61 char buf[MAX_LEN], buf2[MAX_LEN], buf_maxlen[MAX_LEN], buf_min_gws[MAX_LEN];
62 ulong total=0, maxlen=0, uniq=0, max_doc_cnt=0;
63 struct { MARIA_HA *info; } aio0, *aio=&aio0; /* for GWS_IN_USE */
64
65 MY_INIT(argv[0]);
66 if ((error= handle_options(&argc, &argv, my_long_options, get_one_option)))
67 exit(error);
68 maria_init();
69 if (count || dump)
70 verbose=0;
71 if (!count && !dump && !lstats && !query)
72 stats=1;
73
74 if (verbose)
75 setbuf(stdout,NULL);
76
77 if (argc < 2)
78 usage();
79
80 {
81 char *end;
82 inx= (uint) strtoll(argv[1], &end, 10);
83 if (*end)
84 usage();
85 }
86
87 init_pagecache(maria_pagecache, PAGE_BUFFER_INIT, 0, 0,
88 MARIA_KEY_BLOCK_LENGTH, 0, MY_WME);
89
90 if (!(info=maria_open(argv[0], O_RDONLY,
91 HA_OPEN_ABORT_IF_LOCKED|HA_OPEN_FROM_SQL_LAYER)))
92 {
93 error=my_errno;
94 goto err;
95 }
96
97 *buf2=0;
98 aio->info=info;
99
100 if ((inx >= info->s->base.keys) ||
101 !(info->s->keyinfo[inx].flag & HA_FULLTEXT))
102 {
103 printf("Key %d in table %s is not a FULLTEXT key\n", inx,
104 info->s->open_file_name.str);
105 goto err;
106 }
107
108 maria_lock_database(info, F_EXTRA_LCK);
109
110 info->cur_row.lastpos= HA_OFFSET_ERROR;
111 info->update|= HA_STATE_PREV_FOUND;
112
113 while (!(error=maria_rnext(info,NULL,inx)))
114 {
115 FT_WEIGTH subkeys;
116 keylen=*(info->lastkey_buff);
117
118 subkeys.i= ft_sintXkorr(info->lastkey_buff + keylen + 1);
119 if (subkeys.i >= 0)
120 weight= subkeys.f;
121
122#ifdef HAVE_SNPRINTF
123 snprintf(buf,MAX_LEN,"%.*s",(int) keylen,info->lastkey_buff+1);
124#else
125 sprintf(buf,"%.*s",(int) keylen,info->lastkey_buff+1);
126#endif
127 my_casedn_str(default_charset_info,buf);
128 total++;
129 lengths[keylen]++;
130
131 if (count || stats)
132 {
133 if (strcmp(buf, buf2))
134 {
135 if (*buf2)
136 {
137 uniq++;
138 avg_gws+=gws=GWS_IN_USE;
139 if (count)
140 printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
141 if (maxlen<keylen2)
142 {
143 maxlen=keylen2;
144 strmov(buf_maxlen, buf2);
145 }
146 if (max_doc_cnt < doc_cnt)
147 {
148 max_doc_cnt=doc_cnt;
149 strmov(buf_min_gws, buf2);
150 min_gws=gws;
151 }
152 }
153 strmov(buf2, buf);
154 keylen2=keylen;
155 doc_cnt=0;
156 }
157 doc_cnt+= (subkeys.i >= 0 ? 1 : -subkeys.i);
158 }
159 if (dump)
160 {
161 if (subkeys.i >= 0)
162 printf("%9lx %20.7f %s\n", (long) info->cur_row.lastpos,weight,buf);
163 else
164 printf("%9lx => %17d %s\n",(long) info->cur_row.lastpos,-subkeys.i,
165 buf);
166 }
167 if (verbose && (total%HOW_OFTEN_TO_WRITE)==0)
168 printf("%10ld\r",total);
169 }
170 maria_lock_database(info, F_UNLCK);
171
172 if (count || stats)
173 {
174 if (*buf2)
175 {
176 uniq++;
177 avg_gws+=gws=GWS_IN_USE;
178 if (count)
179 printf("%9u %20.7f %s\n",doc_cnt,gws,buf2);
180 if (maxlen<keylen2)
181 {
182 maxlen=keylen2;
183 strmov(buf_maxlen, buf2);
184 }
185 if (max_doc_cnt < doc_cnt)
186 {
187 max_doc_cnt=doc_cnt;
188 strmov(buf_min_gws, buf2);
189 min_gws=gws;
190 }
191 }
192 }
193
194 if (stats)
195 {
196 count=0;
197 for (inx=0;inx<256;inx++)
198 {
199 count+=lengths[inx];
200 if ((ulong) count >= total/2)
201 break;
202 }
203 printf("Total rows: %lu\nTotal words: %lu\n"
204 "Unique words: %lu\nLongest word: %lu chars (%s)\n"
205 "Median length: %u\n"
206 "Average global weight: %f\n"
207 "Most common word: %lu times, weight: %f (%s)\n",
208 (long) info->state->records, total, uniq, maxlen, buf_maxlen,
209 inx, avg_gws/uniq, max_doc_cnt, min_gws, buf_min_gws);
210 }
211 if (lstats)
212 {
213 count=0;
214 for (inx=0; inx<256; inx++)
215 {
216 count+=lengths[inx];
217 if (count && lengths[inx])
218 printf("%3u: %10lu %5.2f%% %20lu %4.1f%%\n", inx,
219 (ulong) lengths[inx],100.0*lengths[inx]/total,(ulong) count,
220 100.0*count/total);
221 }
222 }
223
224err:
225 if (error && error != HA_ERR_END_OF_FILE)
226 printf("got error %d\n",my_errno);
227 if (info)
228 maria_close(info);
229 maria_end();
230 return 0;
231}
232
233
234static my_bool
235get_one_option(int optid, const struct my_option *opt __attribute__((unused)),
236 char *argument __attribute__((unused)))
237{
238 switch(optid) {
239 case 'd':
240 dump=1;
241 complain(count || query);
242 break;
243 case 's':
244 stats=1;
245 complain(query!=0);
246 break;
247 case 'c':
248 count= 1;
249 complain(dump || query);
250 break;
251 case 'l':
252 lstats=1;
253 complain(query!=0);
254 break;
255 case '?':
256 case 'h':
257 usage();
258 }
259 return 0;
260}
261
262
263static void usage()
264{
265 printf("Use: aria_ft_dump <table_name> <index_num>\n");
266 my_print_help(my_long_options);
267 my_print_variables(my_long_options);
268 exit(1);
269}
270
271
/*
  Poor man's assert for option conflicts: a non-zero `val` prints a
  message and terminates with exit status 1, a zero `val` does nothing.
*/
static void complain(int val)
{
  if (!val)
    return;

  fputs("You cannot use these options together!\n", stdout);
  exit(1);
}
280
281#include "ma_check_standalone.h"
282
283