autovacuum.c source code [PostgreSQL/src/backend/postmaster/autovacuum.c]

/*-------------------------------------------------------------------------
 *
 * autovacuum.c
 *
 * PostgreSQL Integrated Autovacuum Daemon
 *
 * The autovacuum system is structured in two different kinds of processes: the
 * autovacuum launcher and the autovacuum worker.  The launcher is an
 * always-running process, started by the postmaster when the autovacuum GUC
 * parameter is set.  The launcher schedules autovacuum workers to be started
 * when appropriate.  The workers are the processes which execute the actual
 * vacuuming; they connect to a database as determined in the launcher, and
 * once connected they examine the catalogs to select the tables to vacuum.
 *
 * The autovacuum launcher cannot start the worker processes by itself,
 * because doing so would cause robustness issues (namely, failure to shut
 * them down on exceptional conditions, and also, since the launcher is
 * connected to shared memory and is thus subject to corruption there, it is
 * not as robust as the postmaster).  So it leaves that task to the postmaster.
 *
 * There is an autovacuum shared memory area, where the launcher stores
 * information about the database it wants vacuumed.  When it wants a new
 * worker to start, it sets a flag in shared memory and sends a signal to the
 * postmaster.  The postmaster then knows nothing more than that it must start
 * a worker; so it forks a new child, which turns into a worker.  This new
 * process connects to shared memory, and there it can inspect the information
 * that the launcher has set up.
 *
 * If the fork() call fails in the postmaster, it sets a flag in the shared
 * memory area, and sends a signal to the launcher.  The launcher, upon
 * noticing the flag, can try starting the worker again by resending the
 * signal.  Note that the failure can only be transient (fork failure due to
 * high load, memory pressure, too many processes, etc); more permanent
 * problems, like failure to connect to a database, are detected later in the
 * worker and dealt with just by having the worker exit normally.  The launcher
 * will launch a new worker again later, per schedule.
 *
 * When the worker is done vacuuming it sends SIGUSR2 to the launcher.  The
 * launcher then wakes up and is able to launch another worker, if the schedule
 * is so tight that a new worker is needed immediately.  At this time the
 * launcher can also balance the settings for the various remaining workers'
 * cost-based vacuum delay feature.
 *
 * Note that there can be more than one worker in a database concurrently.
 * They will store the table they are currently vacuuming in shared memory, so
 * that other workers avoid being blocked waiting for the vacuum lock for that
 * table.  They will also reload the pgstats data just before vacuuming each
 * table, to avoid vacuuming a table that was just finished being vacuumed by
 * another worker and thus is no longer noted in shared memory.  However,
 * there is a window (caused by pgstat delay) during which a worker may choose
 * a table that was already vacuumed; this is a bug in the current design.
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/postmaster/autovacuum.c
 *
 *-------------------------------------------------------------------------
 */
62	#include "postgres.h"
63
64	#include <signal.h>
65	#include <sys/time.h>
66	#include <unistd.h>
67
68	#include "access/heapam.h"
69	#include "access/htup_details.h"
70	#include "access/multixact.h"
71	#include "access/reloptions.h"
72	#include "access/tableam.h"
73	#include "access/transam.h"
74	#include "access/xact.h"
75	#include "catalog/dependency.h"
76	#include "catalog/namespace.h"
77	#include "catalog/pg_database.h"
78	#include "commands/dbcommands.h"
79	#include "commands/vacuum.h"
80	#include "lib/ilist.h"
81	#include "libpq/pqsignal.h"
82	#include "miscadmin.h"
83	#include "nodes/makefuncs.h"
84	#include "pgstat.h"
85	#include "postmaster/autovacuum.h"
86	#include "postmaster/fork_process.h"
87	#include "postmaster/postmaster.h"
88	#include "storage/bufmgr.h"
89	#include "storage/ipc.h"
90	#include "storage/latch.h"
91	#include "storage/lmgr.h"
92	#include "storage/pmsignal.h"
93	#include "storage/proc.h"
94	#include "storage/procsignal.h"
95	#include "storage/sinvaladt.h"
96	#include "storage/smgr.h"
97	#include "tcop/tcopprot.h"
98	#include "utils/fmgroids.h"
99	#include "utils/fmgrprotos.h"
100	#include "utils/lsyscache.h"
101	#include "utils/memutils.h"
102	#include "utils/ps_status.h"
103	#include "utils/rel.h"
104	#include "utils/snapmgr.h"
105	#include "utils/syscache.h"
106	#include "utils/timeout.h"
107	#include "utils/timestamp.h"
108
109
/*
 * GUC parameters
 *
 * These are plain globals with external linkage.  Only three of them carry
 * an explicit initializer here; the rest start out zero-initialized.
 */
bool		autovacuum_start_daemon = false;
int			autovacuum_max_workers;
int			autovacuum_work_mem = -1;
int			autovacuum_naptime;		/* used as a number of seconds by the
									 * launcher's sleep computation */
int			autovacuum_vac_thresh;
double		autovacuum_vac_scale;
int			autovacuum_anl_thresh;
double		autovacuum_anl_scale;
int			autovacuum_freeze_max_age;
int			autovacuum_multixact_freeze_max_age;

double		autovacuum_vac_cost_delay;
int			autovacuum_vac_cost_limit;

int			Log_autovacuum_min_duration = -1;
128
/* how long to keep pgstat data in the launcher, in milliseconds */
#define STATS_READ_DELAY 1000

/*
 * Bounds on the launcher's sleep between two awakenings.  Note the units
 * differ: the minimum is in milliseconds (and is a floating-point constant),
 * the maximum is in seconds.
 */
#define MIN_AUTOVAC_SLEEPTIME 100.0 /* milliseconds */
#define MAX_AUTOVAC_SLEEPTIME 300	/* seconds */
135
/* Flags to tell if we are in an autovacuum process */
static bool am_autovacuum_launcher = false;
static bool am_autovacuum_worker = false;

/*
 * Flags set by signal handlers.  They are volatile sig_atomic_t because they
 * are written from signal-handler context and polled from the main loop.
 */
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t got_SIGUSR2 = false;
static volatile sig_atomic_t got_SIGTERM = false;
144
145	/ Comparison points for determining whether freeze_max_age is exceeded /
146	static TransactionId recentXid;
147	static MultiXactId recentMulti;
148
149	/ Default freeze ages to use for autovacuum (varies by database) /
150	static int default_freeze_min_age;
151	static int default_freeze_table_age;
152	static int default_multixact_freeze_min_age;
153	static int default_multixact_freeze_table_age;
154
155	/ Memory context for long-lived data /
156	static MemoryContext AutovacMemCxt;
157
158	/ struct to keep track of databases in launcher /
159	typedef struct avl_dbase
160	{
161	Oid adl_datid; / hash key -- must be first /
162	TimestampTz adl_next_worker;
163	int adl_score;
164	dlist_node adl_node;
165	} avl_dbase;
166
167	/ struct to keep track of databases in worker /
168	typedef struct avw_dbase
169	{
170	Oid adw_datid;
171	char *adw_name;
172	TransactionId adw_frozenxid;
173	MultiXactId adw_minmulti;
174	PgStat_StatDBEntry *adw_entry;
175	} avw_dbase;
176
177	/ struct to keep track of tables to vacuum and/or analyze, in 1st pass /
178	typedef struct av_relation
179	{
180	Oid ar_toastrelid; / hash key - must be first /
181	Oid ar_relid;
182	bool ar_hasrelopts;
183	AutoVacOpts ar_reloptions; / copy of AutoVacOpts from the main table's*
184	* reloptions, or NULL if none */
185	} av_relation;
186
187	/ struct to keep track of tables to vacuum and/or analyze, after rechecking /
188	typedef struct autovac_table
189	{
190	Oid at_relid;
191	VacuumParams at_params;
192	double at_vacuum_cost_delay;
193	int at_vacuum_cost_limit;
194	bool at_dobalance;
195	bool at_sharedrel;
196	char *at_relname;
197	char *at_nspname;
198	char *at_datname;
199	} autovac_table;
200
201	/-------------*
202	* This struct holds information about a single worker's whereabouts. We keep
203	* an array of these in shared memory, sized according to
204	* autovacuum_max_workers.
205	*
206	* wi_links entry into free list or running list
207	* wi_dboid OID of the database this worker is supposed to work on
208	* wi_tableoid OID of the table currently being vacuumed, if any
209	* wi_sharedrel flag indicating whether table is marked relisshared
210	* wi_proc pointer to PGPROC of the running worker, NULL if not started
211	* wi_launchtime Time at which this worker was launched
212	* wi_cost_* Vacuum cost-based delay parameters current in this worker
213	*
214	* All fields are protected by AutovacuumLock, except for wi_tableoid and
215	* wi_sharedrel which are protected by AutovacuumScheduleLock (note these
216	* two fields are read-only for everyone except that worker itself).
217	*-------------
218	*/
219	typedef struct WorkerInfoData
220	{
221	dlist_node wi_links;
222	Oid wi_dboid;
223	Oid wi_tableoid;
224	PGPROC *wi_proc;
225	TimestampTz wi_launchtime;
226	bool wi_dobalance;
227	bool wi_sharedrel;
228	double wi_cost_delay;
229	int wi_cost_limit;
230	int wi_cost_limit_base;
231	} WorkerInfoData;
232
233	typedef struct WorkerInfoData *WorkerInfo;
234
/*
 * Possible signals received by the launcher from remote processes.  These are
 * stored atomically in shared memory so that other processes can set them
 * without locking.
 *
 * The enumerators double as indexes into the av_signal[] array, which is why
 * AutoVacNumSignals must stay last: it is the array length.
 */
typedef enum
{
	AutoVacForkFailed,			/* failed trying to start a worker */
	AutoVacRebalance,			/* rebalance the cost limits */
	AutoVacNumSignals			/* must be last */
}			AutoVacuumSignal;
246
247	/*
248	* Autovacuum workitem array, stored in AutoVacuumShmem->av_workItems. This
249	* list is mostly protected by AutovacuumLock, except that if an item is
250	* marked 'active' other processes must not modify the work-identifying
251	* members.
252	*/
253	typedef struct AutoVacuumWorkItem
254	{
255	AutoVacuumWorkItemType avw_type;
256	bool avw_used; / below data is valid /
257	bool avw_active; / being processed /
258	Oid avw_database;
259	Oid avw_relation;
260	BlockNumber avw_blockNumber;
261	} AutoVacuumWorkItem;
262
263	#define NUM_WORKITEMS 256
264
265	/-------------*
266	* The main autovacuum shmem struct. On shared memory we store this main
267	* struct and the array of WorkerInfo structs. This struct keeps:
268	*
269	* av_signal set by other processes to indicate various conditions
270	* av_launcherpid the PID of the autovacuum launcher
271	* av_freeWorkers the WorkerInfo freelist
272	* av_runningWorkers the WorkerInfo non-free queue
273	* av_startingWorker pointer to WorkerInfo currently being started (cleared by
274	* the worker itself as soon as it's up and running)
275	* av_workItems work item array
276	*
277	* This struct is protected by AutovacuumLock, except for av_signal and parts
278	* of the worker list (see above).
279	*-------------
280	*/
281	typedef struct
282	{
283	sig_atomic_t av_signal[AutoVacNumSignals];
284	pid_t av_launcherpid;
285	dlist_head av_freeWorkers;
286	dlist_head av_runningWorkers;
287	WorkerInfo av_startingWorker;
288	AutoVacuumWorkItem av_workItems[NUM_WORKITEMS];
289	} AutoVacuumShmemStruct;
290
291	static AutoVacuumShmemStruct *AutoVacuumShmem;
292
293	/*
294	* the database list (of avl_dbase elements) in the launcher, and the context
295	* that contains it
296	*/
297	static dlist_head DatabaseList = DLIST_STATIC_INIT(DatabaseList);
298	static MemoryContext DatabaseListCxt = NULL;
299
300	/ Pointer to my own WorkerInfo, valid on each worker /
301	static WorkerInfo MyWorkerInfo = NULL;
302
303	/ PID of launcher, valid only in worker while shutting down /
304	int AutovacuumLauncherPid = `0`;
305
306	#ifdef EXEC_BACKEND
307	static pid_t avlauncher_forkexec(void);
308	static pid_t avworker_forkexec(void);
309	#endif
310	NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]) pg_attribute_noreturn();
311	NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]) pg_attribute_noreturn();
312
313	static Oid do_start_worker(void);
314	static void launcher_determine_sleep(bool canlaunch, bool recursing,
315	struct timeval *nap);
316	static void launch_worker(TimestampTz now);
317	static List get_database_list(void*);
318	static void rebuild_database_list(Oid newdb);
319	static int db_comparator(const void a, const* void *b);
320	static void autovac_balance_cost(void);
321
322	static void do_autovacuum(void);
323	static void FreeWorkerInfo(int code, Datum arg);
324
325	static autovac_table table_recheck_autovac(Oid relid, HTAB table_toast_map,
326	TupleDesc pg_class_desc,
327	int effective_multixact_freeze_max_age);
328	static void relation_needs_vacanalyze(Oid relid, AutoVacOpts *relopts,
329	Form_pg_class classForm,
330	PgStat_StatTabEntry *tabentry,
331	int effective_multixact_freeze_max_age,
332	bool dovacuum, bool doanalyze, bool *wraparound);
333
334	static void autovacuum_do_vac_analyze(autovac_table *tab,
335	BufferAccessStrategy bstrategy);
336	static AutoVacOpts *extract_autovac_opts(HeapTuple tup,
337	TupleDesc pg_class_desc);
338	static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
339	PgStat_StatDBEntry *shared,
340	PgStat_StatDBEntry *dbentry);
341	static void perform_work_item(AutoVacuumWorkItem *workitem);
342	static void autovac_report_activity(autovac_table *tab);
343	static void autovac_report_workitem(AutoVacuumWorkItem *workitem,
344	const char nspname, const* char *relname);
345	static void av_sighup_handler(SIGNAL_ARGS);
346	static void avl_sigusr2_handler(SIGNAL_ARGS);
347	static void avl_sigterm_handler(SIGNAL_ARGS);
348	static void autovac_refresh_stats(void);
349
350
351
352	/********************************************************************
353	* AUTOVACUUM LAUNCHER CODE
354	********************************************************************/
355
356	#ifdef EXEC_BACKEND
357	/*
358	* forkexec routine for the autovacuum launcher process.
359	*
360	* Format up the arglist, then fork and exec.
361	*/
362	static pid_t
363	avlauncher_forkexec(void)
364	{
365	char *av[`10`];
366	int ac = `0`;
367
368	av[ac++] = "postgres";
369	av[ac++] = "--forkavlauncher";
370	av[ac++] = NULL; / filled in by postmaster_forkexec /
371	av[ac] = NULL;
372
373	Assert(ac < lengthof(av));
374
375	return postmaster_forkexec(ac, av);
376	}
377
378	/*
379	* We need this set from the outside, before InitProcess is called
380	*/
381	void
382	AutovacuumLauncherIAm(void)
383	{
384	am_autovacuum_launcher = true;
385	}
386	#endif
387
388	/*
389	* Main entry point for autovacuum launcher process, to be called from the
390	* postmaster.
391	*/
392	int
393	StartAutoVacLauncher(void)
394	{
395	pid_t AutoVacPID;
396
397	#ifdef EXEC_BACKEND
398	switch ((AutoVacPID = avlauncher_forkexec()))
399	#else
400	switch ((AutoVacPID = fork_process()))
401	#endif
402	{
403	case -`1`:
404	ereport(LOG,
405	(errmsg("could not fork autovacuum launcher process: %m")));
406	return `0`;
407
408	#ifndef EXEC_BACKEND
409	case `0`:
410	/ in postmaster child ... /
411	InitPostmasterChild();
412
413	/ Close the postmaster's sockets /
414	ClosePostmasterPorts(false);
415
416	AutoVacLauncherMain(`0`, NULL);
417	break;
418	#endif
419	default:
420	return (int) AutoVacPID;
421	}
422
423	/ shouldn't get here /
424	return `0`;
425	}
426
427	/*
428	* Main loop for the autovacuum launcher process.
429	*/
430	NON_EXEC_STATIC void
431	AutoVacLauncherMain(int argc, char *argv[])
432	{
433	sigjmp_buf local_sigjmp_buf;
434
435	am_autovacuum_launcher = true;
436
437	/ Identify myself via ps /
438	init_ps_display(pgstat_get_backend_desc(B_AUTOVAC_LAUNCHER), "", "", "");
439
440	ereport(DEBUG1,
441	(errmsg("autovacuum launcher started")));
442
443	if (PostAuthDelay)
444	pg_usleep(PostAuthDelay * `1000000L`);
445
446	SetProcessingMode(InitProcessing);
447
448	/*
449	* Set up signal handlers. We operate on databases much like a regular
450	* backend, so we use the same signal handling. See equivalent code in
451	* tcop/postgres.c.
452	*/
453	pqsignal(SIGHUP, av_sighup_handler);
454	pqsignal(SIGINT, StatementCancelHandler);
455	pqsignal(SIGTERM, avl_sigterm_handler);
456
457	pqsignal(SIGQUIT, quickdie);
458	InitializeTimeouts(); / establishes SIGALRM handler /
459
460	pqsignal(SIGPIPE, SIG_IGN);
461	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
462	pqsignal(SIGUSR2, avl_sigusr2_handler);
463	pqsignal(SIGFPE, FloatExceptionHandler);
464	pqsignal(SIGCHLD, SIG_DFL);
465
466	/ Early initialization /
467	BaseInit();
468
469	/*
470	* Create a per-backend PGPROC struct in shared memory, except in the
471	* EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
472	* this before we can use LWLocks (and in the EXEC_BACKEND case we already
473	* had to do some stuff with LWLocks).
474	*/
475	#ifndef EXEC_BACKEND
476	InitProcess();
477	#endif
478
479	InitPostgres(NULL, InvalidOid, NULL, InvalidOid, NULL, false);
480
481	SetProcessingMode(NormalProcessing);
482
483	/*
484	* Create a memory context that we will do all our work in. We do this so
485	* that we can reset the context during error recovery and thereby avoid
486	* possible memory leaks.
487	*/
488	AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
489	"Autovacuum Launcher",
490	ALLOCSET_DEFAULT_SIZES);
491	MemoryContextSwitchTo(AutovacMemCxt);
492
493	/*
494	* If an exception is encountered, processing resumes here.
495	*
496	* This code is a stripped down version of PostgresMain error recovery.
497	*/
498	if (sigsetjmp(local_sigjmp_buf, `1`) != `0`)
499	{
500	/ since not using PG_TRY, must reset error stack by hand /
501	error_context_stack = NULL;
502
503	/ Prevents interrupts while cleaning up /
504	HOLD_INTERRUPTS();
505
506	/ Forget any pending QueryCancel or timeout request /
507	disable_all_timeouts(false);
508	QueryCancelPending = false; / second to avoid race condition /
509
510	/ Report the error to the server log /
511	EmitErrorReport();
512
513	/ Abort the current transaction in order to recover /
514	AbortCurrentTransaction();
515
516	/*
517	* Release any other resources, for the case where we were not in a
518	* transaction.
519	*/
520	LWLockReleaseAll();
521	pgstat_report_wait_end();
522	AbortBufferIO();
523	UnlockBuffers();
524	/ this is probably dead code, but let's be safe: /
525	if (AuxProcessResourceOwner)
526	ReleaseAuxProcessResources(false);
527	AtEOXact_Buffers(false);
528	AtEOXact_SMgr();
529	AtEOXact_Files(false);
530	AtEOXact_HashTables(false);
531
532	/*
533	* Now return to normal top-level context and clear ErrorContext for
534	* next time.
535	*/
536	MemoryContextSwitchTo(AutovacMemCxt);
537	FlushErrorState();
538
539	/ Flush any leaked data in the top-level context /
540	MemoryContextResetAndDeleteChildren(AutovacMemCxt);
541
542	/ don't leave dangling pointers to freed memory /
543	DatabaseListCxt = NULL;
544	dlist_init(&DatabaseList);
545
546	/*
547	* Make sure pgstat also considers our stat data as gone. Note: we
548	* mustn't use autovac_refresh_stats here.
549	*/
550	pgstat_clear_snapshot();
551
552	/ Now we can allow interrupts again /
553	RESUME_INTERRUPTS();
554
555	/ if in shutdown mode, no need for anything further; just go away /
556	if (got_SIGTERM)
557	goto shutdown;
558
559	/*
560	* Sleep at least 1 second after any error. We don't want to be
561	* filling the error logs as fast as we can.
562	*/
563	pg_usleep(`1000000L`);
564	}
565
566	/ We can now handle ereport(ERROR) /
567	PG_exception_stack = &local_sigjmp_buf;
568
569	/ must unblock signals before calling rebuild_database_list /
570	PG_SETMASK(&UnBlockSig);
571
572	/*
573	* Set always-secure search path. Launcher doesn't connect to a database,
574	* so this has no effect.
575	*/
576	SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
577
578	/*
579	* Force zero_damaged_pages OFF in the autovac process, even if it is set
580	* in postgresql.conf. We don't really want such a dangerous option being
581	* applied non-interactively.
582	*/
583	SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
584
585	/*
586	* Force settable timeouts off to avoid letting these settings prevent
587	* regular maintenance from being executed.
588	*/
589	SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
590	SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
591	SetConfigOption("idle_in_transaction_session_timeout", "0",
592	PGC_SUSET, PGC_S_OVERRIDE);
593
594	/*
595	* Force default_transaction_isolation to READ COMMITTED. We don't want
596	* to pay the overhead of serializable mode, nor add any risk of causing
597	* deadlocks or delaying other transactions.
598	*/
599	SetConfigOption("default_transaction_isolation", "read committed",
600	PGC_SUSET, PGC_S_OVERRIDE);
601
602	/*
603	* In emergency mode, just start a worker (unless shutdown was requested)
604	* and go away.
605	*/
606	if (!AutoVacuumingActive())
607	{
608	if (!got_SIGTERM)
609	do_start_worker();
610	proc_exit(`0`); / done /
611	}
612
613	AutoVacuumShmem->av_launcherpid = MyProcPid;
614
615	/*
616	* Create the initial database list. The invariant we want this list to
617	* keep is that it's ordered by decreasing next_time. As soon as an entry
618	* is updated to a higher time, it will be moved to the front (which is
619	* correct because the only operation is to add autovacuum_naptime to the
620	* entry, and time always increases).
621	*/
622	rebuild_database_list(InvalidOid);
623
624	/ loop until shutdown request /
625	while (!got_SIGTERM)
626	{
627	struct timeval nap;
628	TimestampTz current_time = `0`;
629	bool can_launch;
630
631	/*
632	* This loop is a bit different from the normal use of WaitLatch,
633	* because we'd like to sleep before the first launch of a child
634	* process. So it's WaitLatch, then ResetLatch, then check for
635	* wakening conditions.
636	*/
637
638	launcher_determine_sleep(!dlist_is_empty(&AutoVacuumShmem->av_freeWorkers),
639	false, &nap);
640
641	/*
642	* Wait until naptime expires or we get some type of signal (all the
643	* signal handlers will wake us by calling SetLatch).
644	*/
645	(void) WaitLatch(MyLatch,
646	WL_LATCH_SET \| WL_TIMEOUT \| WL_EXIT_ON_PM_DEATH,
647	(nap.tv_sec * `1000L`) + (nap.tv_usec / `1000L`),
648	WAIT_EVENT_AUTOVACUUM_MAIN);
649
650	ResetLatch(MyLatch);
651
652	/ Process sinval catchup interrupts that happened while sleeping /
653	ProcessCatchupInterrupt();
654
655	/ the normal shutdown case /
656	if (got_SIGTERM)
657	break;
658
659	if (got_SIGHUP)
660	{
661	got_SIGHUP = false;
662	ProcessConfigFile(PGC_SIGHUP);
663
664	/ shutdown requested in config file? /
665	if (!AutoVacuumingActive())
666	break;
667
668	/ rebalance in case the default cost parameters changed /
669	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
670	autovac_balance_cost();
671	LWLockRelease(AutovacuumLock);
672
673	/ rebuild the list in case the naptime changed /
674	rebuild_database_list(InvalidOid);
675	}
676
677	/*
678	* a worker finished, or postmaster signalled failure to start a
679	* worker
680	*/
681	if (got_SIGUSR2)
682	{
683	got_SIGUSR2 = false;
684
685	/ rebalance cost limits, if needed /
686	if (AutoVacuumShmem->av_signal[AutoVacRebalance])
687	{
688	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
689	AutoVacuumShmem->av_signal[AutoVacRebalance] = false;
690	autovac_balance_cost();
691	LWLockRelease(AutovacuumLock);
692	}
693
694	if (AutoVacuumShmem->av_signal[AutoVacForkFailed])
695	{
696	/*
697	* If the postmaster failed to start a new worker, we sleep
698	* for a little while and resend the signal. The new worker's
699	* state is still in memory, so this is sufficient. After
700	* that, we restart the main loop.
701	*
702	* XXX should we put a limit to the number of times we retry?
703	* I don't think it makes much sense, because a future start
704	* of a worker will continue to fail in the same way.
705	*/
706	AutoVacuumShmem->av_signal[AutoVacForkFailed] = false;
707	pg_usleep(`1000000L`); / 1s /
708	SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
709	continue;
710	}
711	}
712
713	/*
714	* There are some conditions that we need to check before trying to
715	* start a worker. First, we need to make sure that there is a worker
716	* slot available. Second, we need to make sure that no other worker
717	* failed while starting up.
718	*/
719
720	current_time = GetCurrentTimestamp();
721	LWLockAcquire(AutovacuumLock, LW_SHARED);
722
723	can_launch = !dlist_is_empty(&AutoVacuumShmem->av_freeWorkers);
724
725	if (AutoVacuumShmem->av_startingWorker != NULL)
726	{
727	int waittime;
728	WorkerInfo worker = AutoVacuumShmem->av_startingWorker;
729
730	/*
731	* We can't launch another worker when another one is still
732	* starting up (or failed while doing so), so just sleep for a bit
733	* more; that worker will wake us up again as soon as it's ready.
734	* We will only wait autovacuum_naptime seconds (up to a maximum
735	* of 60 seconds) for this to happen however. Note that failure
736	* to connect to a particular database is not a problem here,
737	* because the worker removes itself from the startingWorker
738	* pointer before trying to connect. Problems detected by the
739	* postmaster (like fork() failure) are also reported and handled
740	* differently. The only problems that may cause this code to
741	* fire are errors in the earlier sections of AutoVacWorkerMain,
742	* before the worker removes the WorkerInfo from the
743	* startingWorker pointer.
744	*/
745	waittime = Min(autovacuum_naptime, `60`) * `1000`;
746	if (TimestampDifferenceExceeds(worker->wi_launchtime, current_time,
747	waittime))
748	{
749	LWLockRelease(AutovacuumLock);
750	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
751
752	/*
753	* No other process can put a worker in starting mode, so if
754	* startingWorker is still INVALID after exchanging our lock,
755	* we assume it's the same one we saw above (so we don't
756	* recheck the launch time).
757	*/
758	if (AutoVacuumShmem->av_startingWorker != NULL)
759	{
760	worker = AutoVacuumShmem->av_startingWorker;
761	worker->wi_dboid = InvalidOid;
762	worker->wi_tableoid = InvalidOid;
763	worker->wi_sharedrel = false;
764	worker->wi_proc = NULL;
765	worker->wi_launchtime = `0`;
766	dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
767	&worker->wi_links);
768	AutoVacuumShmem->av_startingWorker = NULL;
769	elog(WARNING, "worker took too long to start; canceled");
770	}
771	}
772	else
773	can_launch = false;
774	}
775	LWLockRelease(AutovacuumLock); / either shared or exclusive /
776
777	/ if we can't do anything, just go back to sleep /
778	if (!can_launch)
779	continue;
780
781	/ We're OK to start a new worker /
782
783	if (dlist_is_empty(&DatabaseList))
784	{
785	/*
786	* Special case when the list is empty: start a worker right away.
787	* This covers the initial case, when no database is in pgstats
788	* (thus the list is empty). Note that the constraints in
789	* launcher_determine_sleep keep us from starting workers too
790	* quickly (at most once every autovacuum_naptime when the list is
791	* empty).
792	*/
793	launch_worker(current_time);
794	}
795	else
796	{
797	/*
798	* because rebuild_database_list constructs a list with most
799	* distant adl_next_worker first, we obtain our database from the
800	* tail of the list.
801	*/
802	avl_dbase *avdb;
803
804	avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
805
806	/*
807	* launch a worker if next_worker is right now or it is in the
808	* past
809	*/
810	if (TimestampDifferenceExceeds(avdb->adl_next_worker,
811	current_time, `0`))
812	launch_worker(current_time);
813	}
814	}
815
816	/ Normal exit from the autovac launcher is here /
817	shutdown:
818	ereport(DEBUG1,
819	(errmsg("autovacuum launcher shutting down")));
820	AutoVacuumShmem->av_launcherpid = `0`;
821
822	proc_exit(`0`); / done /
823	}
824
825	/*
826	* Determine the time to sleep, based on the database list.
827	*
828	* The "canlaunch" parameter indicates whether we can start a worker right now,
829	* for example due to the workers being all busy. If this is false, we will
830	* cause a long sleep, which will be interrupted when a worker exits.
831	*/
832	static void
833	launcher_determine_sleep(bool canlaunch, bool recursing, struct timeval *nap)
834	{
835	/*
836	* We sleep until the next scheduled vacuum. We trust that when the
837	* database list was built, care was taken so that no entries have times
838	* in the past; if the first entry has too close a next_worker value, or a
839	* time in the past, we will sleep a small nominal time.
840	*/
841	if (!canlaunch)
842	{
843	nap->tv_sec = autovacuum_naptime;
844	nap->tv_usec = `0`;
845	}
846	else if (!dlist_is_empty(&DatabaseList))
847	{
848	TimestampTz current_time = GetCurrentTimestamp();
849	TimestampTz next_wakeup;
850	avl_dbase *avdb;
851	long secs;
852	int usecs;
853
854	avdb = dlist_tail_element(avl_dbase, adl_node, &DatabaseList);
855
856	next_wakeup = avdb->adl_next_worker;
857	TimestampDifference(current_time, next_wakeup, &secs, &usecs);
858
859	nap->tv_sec = secs;
860	nap->tv_usec = usecs;
861	}
862	else
863	{
864	/ list is empty, sleep for whole autovacuum_naptime seconds /
865	nap->tv_sec = autovacuum_naptime;
866	nap->tv_usec = `0`;
867	}
868
869	/*
870	* If the result is exactly zero, it means a database had an entry with
871	* time in the past. Rebuild the list so that the databases are evenly
872	* distributed again, and recalculate the time to sleep. This can happen
873	* if there are more tables needing vacuum than workers, and they all take
874	* longer to vacuum than autovacuum_naptime.
875	*
876	* We only recurse once. rebuild_database_list should always return times
877	* in the future, but it seems best not to trust too much on that.
878	*/
879	if (nap->tv_sec == `0` && nap->tv_usec == `0` && !recursing)
880	{
881	rebuild_database_list(InvalidOid);
882	launcher_determine_sleep(canlaunch, true, nap);
883	return;
884	}
885
886	/ The smallest time we'll allow the launcher to sleep. /
887	if (nap->tv_sec <= `0` && nap->tv_usec <= MIN_AUTOVAC_SLEEPTIME * `1000`)
888	{
889	nap->tv_sec = `0`;
890	nap->tv_usec = MIN_AUTOVAC_SLEEPTIME * `1000`;
891	}
892
893	/*
894	* If the sleep time is too large, clamp it to an arbitrary maximum (plus
895	* any fractional seconds, for simplicity). This avoids an essentially
896	* infinite sleep in strange cases like the system clock going backwards a
897	* few years.
898	*/
899	if (nap->tv_sec > MAX_AUTOVAC_SLEEPTIME)
900	nap->tv_sec = MAX_AUTOVAC_SLEEPTIME;
901	}
902
903	/*
904	* Build an updated DatabaseList. It must only contain databases that appear
905	* in pgstats, and must be sorted by next_worker from highest to lowest,
906	* distributed regularly across the next autovacuum_naptime interval.
907	*
908	* Receives the Oid of the database that made this list be generated (we call
909	* this the "new" database, because when the database was already present on
910	* the list, we expect that this function is not called at all). The
911	* preexisting list, if any, will be used to preserve the order of the
912	* databases in the autovacuum_naptime period. The new database is put at the
913	* end of the interval. The actual values are not saved, which should not be
914	* much of a problem.
915	*/
916	static void
917	rebuild_database_list(Oid newdb)
918	{
919	List *dblist;
920	ListCell *cell;
921	MemoryContext newcxt;
922	MemoryContext oldcxt;
923	MemoryContext tmpcxt;
924	HASHCTL hctl;
925	int score;
926	int nelems;
927	HTAB *dbhash;
928	dlist_iter iter;
929
930	/ use fresh stats /
931	autovac_refresh_stats();
932
933	newcxt = AllocSetContextCreate(AutovacMemCxt,
934	"AV dblist",
935	ALLOCSET_DEFAULT_SIZES);
936	tmpcxt = AllocSetContextCreate(newcxt,
937	"tmp AV dblist",
938	ALLOCSET_DEFAULT_SIZES);
939	oldcxt = MemoryContextSwitchTo(tmpcxt);
940
941	/*
942	* Implementing this is not as simple as it sounds, because we need to put
943	* the new database at the end of the list; next the databases that were
944	* already on the list, and finally (at the tail of the list) all the
945	* other databases that are not on the existing list.
946	*
947	* To do this, we build an empty hash table of scored databases. We will
948	* start with the lowest score (zero) for the new database, then
949	* increasing scores for the databases in the existing list, in order, and
950	* lastly increasing scores for all databases gotten via
951	* get_database_list() that are not already on the hash.
952	*
953	* Then we will put all the hash elements into an array, sort the array by
954	* score, and finally put the array elements into the new doubly linked
955	* list.
956	*/
957	hctl.keysize = sizeof(Oid);
958	hctl.entrysize = sizeof(avl_dbase);
959	hctl.hcxt = tmpcxt;
960	dbhash = hash_create("db hash", `20`, &hctl, / magic number here FIXME /
961	HASH_ELEM \| HASH_BLOBS \| HASH_CONTEXT);
962
963	/ start by inserting the new database /
964	score = `0`;
965	if (OidIsValid(newdb))
966	{
967	avl_dbase *db;
968	PgStat_StatDBEntry *entry;
969
970	/ only consider this database if it has a pgstat entry /
971	entry = pgstat_fetch_stat_dbentry(newdb);
972	if (entry != NULL)
973	{
974	/ we assume it isn't found because the hash was just created /
975	db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);
976
977	/ hash_search already filled in the key /
978	db->adl_score = score++;
979	/ next_worker is filled in later /
980	}
981	}
982
983	/ Now insert the databases from the existing list /
984	dlist_foreach(iter, &DatabaseList)
985	{
986	avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
987	avl_dbase *db;
988	bool found;
989	PgStat_StatDBEntry *entry;
990
991	/*
992	* skip databases with no stat entries -- in particular, this gets rid
993	* of dropped databases
994	*/
995	entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
996	if (entry == NULL)
997	continue;
998
999	db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);
1000
1001	if (!found)
1002	{
1003	/ hash_search already filled in the key /
1004	db->adl_score = score++;
1005	/ next_worker is filled in later /
1006	}
1007	}
1008
1009	/ finally, insert all qualifying databases not previously inserted /
1010	dblist = get_database_list();
1011	foreach(cell, dblist)
1012	{
1013	avw_dbase *avdb = lfirst(cell);
1014	avl_dbase *db;
1015	bool found;
1016	PgStat_StatDBEntry *entry;
1017
1018	/ only consider databases with a pgstat entry /
1019	entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
1020	if (entry == NULL)
1021	continue;
1022
1023	db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
1024	/ only update the score if the database was not already on the hash /
1025	if (!found)
1026	{
1027	/ hash_search already filled in the key /
1028	db->adl_score = score++;
1029	/ next_worker is filled in later /
1030	}
1031	}
1032	nelems = score;
1033
1034	/ from here on, the allocated memory belongs to the new list /
1035	MemoryContextSwitchTo(newcxt);
1036	dlist_init(&DatabaseList);
1037
1038	if (nelems > `0`)
1039	{
1040	TimestampTz current_time;
1041	int millis_increment;
1042	avl_dbase *dbary;
1043	avl_dbase *db;
1044	HASH_SEQ_STATUS seq;
1045	int i;
1046
1047	/ put all the hash elements into an array /
1048	dbary = palloc(nelems * sizeof(avl_dbase));
1049
1050	i = `0`;
1051	hash_seq_init(&seq, dbhash);
1052	while ((db = hash_seq_search(&seq)) != NULL)
1053	memcpy(&(dbary[i++]), db, sizeof(avl_dbase));
1054
1055	/ sort the array /
1056	qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);
1057
1058	/*
1059	* Determine the time interval between databases in the schedule. If
1060	* we see that the configured naptime would take us to sleep times
1061	* lower than our min sleep time (which launcher_determine_sleep is
1062	* coded not to allow), silently use a larger naptime (but don't touch
1063	* the GUC variable).
1064	*/
1065	millis_increment = `1000.0` * autovacuum_naptime / nelems;
1066	if (millis_increment <= MIN_AUTOVAC_SLEEPTIME)
1067	millis_increment = MIN_AUTOVAC_SLEEPTIME * `1.1`;
1068
1069	current_time = GetCurrentTimestamp();
1070
1071	/*
1072	* move the elements from the array into the dllist, setting the
1073	* next_worker while walking the array
1074	*/
1075	for (i = `0`; i < nelems; i++)
1076	{
1077	avl_dbase *db = &(dbary[i]);
1078
1079	current_time = TimestampTzPlusMilliseconds(current_time,
1080	millis_increment);
1081	db->adl_next_worker = current_time;
1082
1083	/ later elements should go closer to the head of the list /
1084	dlist_push_head(&DatabaseList, &db->adl_node);
1085	}
1086	}
1087
1088	/ all done, clean up memory /
1089	if (DatabaseListCxt != NULL)
1090	MemoryContextDelete(DatabaseListCxt);
1091	MemoryContextDelete(tmpcxt);
1092	DatabaseListCxt = newcxt;
1093	MemoryContextSwitchTo(oldcxt);
1094	}
1095
1096	/ qsort comparator for avl_dbase, using adl_score /
1097	static int
1098	db_comparator(const void a, const* void *b)
1099	{
1100	if (((const avl_dbase ) a)->adl_score == ((const* avl_dbase *) b)->adl_score)
1101	return `0`;
1102	else
1103	return (((const avl_dbase ) a)->adl_score < ((const* avl_dbase *) b)->adl_score) ? `1` : -`1`;
1104	}
1105
1106	/*
1107	* do_start_worker
1108	*
1109	* Bare-bones procedure for starting an autovacuum worker from the launcher.
1110	* It determines what database to work on, sets up shared memory stuff and
1111	* signals postmaster to start the worker. It fails gracefully if invoked when
1112	* autovacuum_workers are already active.
1113	*
1114	* Return value is the OID of the database that the worker is going to process,
1115	* or InvalidOid if no worker was actually started.
1116	*/
1117	static Oid
1118	do_start_worker(void)
1119	{
1120	List *dblist;
1121	ListCell *cell;
1122	TransactionId xidForceLimit;
1123	MultiXactId multiForceLimit;
1124	bool for_xid_wrap;
1125	bool for_multi_wrap;
1126	avw_dbase *avdb;
1127	TimestampTz current_time;
1128	bool skipit = false;
1129	Oid retval = InvalidOid;
1130	MemoryContext tmpcxt,
1131	oldcxt;
1132
1133	/ return quickly when there are no free workers /
1134	LWLockAcquire(AutovacuumLock, LW_SHARED);
1135	if (dlist_is_empty(&AutoVacuumShmem->av_freeWorkers))
1136	{
1137	LWLockRelease(AutovacuumLock);
1138	return InvalidOid;
1139	}
1140	LWLockRelease(AutovacuumLock);
1141
1142	/*
1143	* Create and switch to a temporary context to avoid leaking the memory
1144	* allocated for the database list.
1145	*/
1146	tmpcxt = AllocSetContextCreate(CurrentMemoryContext,
1147	"Start worker tmp cxt",
1148	ALLOCSET_DEFAULT_SIZES);
1149	oldcxt = MemoryContextSwitchTo(tmpcxt);
1150
1151	/ use fresh stats /
1152	autovac_refresh_stats();
1153
1154	/ Get a list of databases /
1155	dblist = get_database_list();
1156
1157	/*
1158	* Determine the oldest datfrozenxid/relfrozenxid that we will allow to
1159	* pass without forcing a vacuum. (This limit can be tightened for
1160	* particular tables, but not loosened.)
1161	*/
1162	recentXid = ReadNewTransactionId();
1163	xidForceLimit = recentXid - autovacuum_freeze_max_age;
1164	/ ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves /
1165	/ this can cause the limit to go backwards by 3, but that's OK /
1166	if (xidForceLimit < FirstNormalTransactionId)
1167	xidForceLimit -= FirstNormalTransactionId;
1168
1169	/ Also determine the oldest datminmxid we will consider. /
1170	recentMulti = ReadNextMultiXactId();
1171	multiForceLimit = recentMulti - MultiXactMemberFreezeThreshold();
1172	if (multiForceLimit < FirstMultiXactId)
1173	multiForceLimit -= FirstMultiXactId;
1174
1175	/*
1176	* Choose a database to connect to. We pick the database that was least
1177	* recently auto-vacuumed, or one that needs vacuuming to prevent Xid
1178	* wraparound-related data loss. If any db at risk of Xid wraparound is
1179	* found, we pick the one with oldest datfrozenxid, independently of
1180	* autovacuum times; similarly we pick the one with the oldest datminmxid
1181	* if any is in MultiXactId wraparound. Note that those in Xid wraparound
1182	* danger are given more priority than those in multi wraparound danger.
1183	*
1184	* Note that a database with no stats entry is not considered, except for
1185	* Xid wraparound purposes. The theory is that if no one has ever
1186	* connected to it since the stats were last initialized, it doesn't need
1187	* vacuuming.
1188	*
1189	* XXX This could be improved if we had more info about whether it needs
1190	* vacuuming before connecting to it. Perhaps look through the pgstats
1191	* data for the database's tables? One idea is to keep track of the
1192	* number of new and dead tuples per database in pgstats. However it
1193	* isn't clear how to construct a metric that measures that and not cause
1194	* starvation for less busy databases.
1195	*/
1196	avdb = NULL;
1197	for_xid_wrap = false;
1198	for_multi_wrap = false;
1199	current_time = GetCurrentTimestamp();
1200	foreach(cell, dblist)
1201	{
1202	avw_dbase *tmp = lfirst(cell);
1203	dlist_iter iter;
1204
1205	/ Check to see if this one is at risk of wraparound /
1206	if (TransactionIdPrecedes(tmp->adw_frozenxid, xidForceLimit))
1207	{
1208	if (avdb == NULL \|\|
1209	TransactionIdPrecedes(tmp->adw_frozenxid,
1210	avdb->adw_frozenxid))
1211	avdb = tmp;
1212	for_xid_wrap = true;
1213	continue;
1214	}
1215	else if (for_xid_wrap)
1216	continue; / ignore not-at-risk DBs /
1217	else if (MultiXactIdPrecedes(tmp->adw_minmulti, multiForceLimit))
1218	{
1219	if (avdb == NULL \|\|
1220	MultiXactIdPrecedes(tmp->adw_minmulti, avdb->adw_minmulti))
1221	avdb = tmp;
1222	for_multi_wrap = true;
1223	continue;
1224	}
1225	else if (for_multi_wrap)
1226	continue; / ignore not-at-risk DBs /
1227
1228	/ Find pgstat entry if any /
1229	tmp->adw_entry = pgstat_fetch_stat_dbentry(tmp->adw_datid);
1230
1231	/*
1232	* Skip a database with no pgstat entry; it means it hasn't seen any
1233	* activity.
1234	*/
1235	if (!tmp->adw_entry)
1236	continue;
1237
1238	/*
1239	* Also, skip a database that appears on the database list as having
1240	* been processed recently (less than autovacuum_naptime seconds ago).
1241	* We do this so that we don't select a database which we just
1242	* selected, but that pgstat hasn't gotten around to updating the last
1243	* autovacuum time yet.
1244	*/
1245	skipit = false;
1246
1247	dlist_reverse_foreach(iter, &DatabaseList)
1248	{
1249	avl_dbase *dbp = dlist_container(avl_dbase, adl_node, iter.cur);
1250
1251	if (dbp->adl_datid == tmp->adw_datid)
1252	{
1253	/*
1254	* Skip this database if its next_worker value falls between
1255	* the current time and the current time plus naptime.
1256	*/
1257	if (!TimestampDifferenceExceeds(dbp->adl_next_worker,
1258	current_time, `0`) &&
1259	!TimestampDifferenceExceeds(current_time,
1260	dbp->adl_next_worker,
1261	autovacuum_naptime * `1000`))
1262	skipit = true;
1263
1264	break;
1265	}
1266	}
1267	if (skipit)
1268	continue;
1269
1270	/*
1271	* Remember the db with oldest autovac time. (If we are here, both
1272	* tmp->entry and db->entry must be non-null.)
1273	*/
1274	if (avdb == NULL \|\|
1275	tmp->adw_entry->last_autovac_time < avdb->adw_entry->last_autovac_time)
1276	avdb = tmp;
1277	}
1278
1279	/ Found a database -- process it /
1280	if (avdb != NULL)
1281	{
1282	WorkerInfo worker;
1283	dlist_node *wptr;
1284
1285	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1286
1287	/*
1288	* Get a worker entry from the freelist. We checked above, so there
1289	* really should be a free slot.
1290	*/
1291	wptr = dlist_pop_head_node(&AutoVacuumShmem->av_freeWorkers);
1292
1293	worker = dlist_container(WorkerInfoData, wi_links, wptr);
1294	worker->wi_dboid = avdb->adw_datid;
1295	worker->wi_proc = NULL;
1296	worker->wi_launchtime = GetCurrentTimestamp();
1297
1298	AutoVacuumShmem->av_startingWorker = worker;
1299
1300	LWLockRelease(AutovacuumLock);
1301
1302	SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
1303
1304	retval = avdb->adw_datid;
1305	}
1306	else if (skipit)
1307	{
1308	/*
1309	* If we skipped all databases on the list, rebuild it, because it
1310	* probably contains a dropped database.
1311	*/
1312	rebuild_database_list(InvalidOid);
1313	}
1314
1315	MemoryContextSwitchTo(oldcxt);
1316	MemoryContextDelete(tmpcxt);
1317
1318	return retval;
1319	}
1320
1321	/*
1322	* launch_worker
1323	*
1324	* Wrapper for starting a worker from the launcher. Besides actually starting
1325	* it, update the database list to reflect the next time that another one will
1326	* need to be started on the selected database. The actual database choice is
1327	* left to do_start_worker.
1328	*
1329	* This routine is also expected to insert an entry into the database list if
1330	* the selected database was previously absent from the list.
1331	*/
1332	static void
1333	launch_worker(TimestampTz now)
1334	{
1335	Oid dbid;
1336	dlist_iter iter;
1337
1338	dbid = do_start_worker();
1339	if (OidIsValid(dbid))
1340	{
1341	bool found = false;
1342
1343	/*
1344	* Walk the database list and update the corresponding entry. If the
1345	* database is not on the list, we'll recreate the list.
1346	*/
1347	dlist_foreach(iter, &DatabaseList)
1348	{
1349	avl_dbase *avdb = dlist_container(avl_dbase, adl_node, iter.cur);
1350
1351	if (avdb->adl_datid == dbid)
1352	{
1353	found = true;
1354
1355	/*
1356	* add autovacuum_naptime seconds to the current time, and use
1357	* that as the new "next_worker" field for this database.
1358	*/
1359	avdb->adl_next_worker =
1360	TimestampTzPlusMilliseconds(now, autovacuum_naptime * `1000`);
1361
1362	dlist_move_head(&DatabaseList, iter.cur);
1363	break;
1364	}
1365	}
1366
1367	/*
1368	* If the database was not present in the database list, we rebuild
1369	* the list. It's possible that the database does not get into the
1370	* list anyway, for example if it's a database that doesn't have a
1371	* pgstat entry, but this is not a problem because we don't want to
1372	* schedule workers regularly into those in any case.
1373	*/
1374	if (!found)
1375	rebuild_database_list(dbid);
1376	}
1377	}
1378
1379	/*
1380	* Called from postmaster to signal a failure to fork a process to become
1381	* worker. The postmaster should kill(SIGUSR2) the launcher shortly
1382	* after calling this function.
1383	*/
1384	void
1385	AutoVacWorkerFailed(void)
1386	{
1387	AutoVacuumShmem->av_signal[AutoVacForkFailed] = true;
1388	}
1389
1390	/ SIGHUP: set flag to re-read config file at next convenient time /
1391	static void
1392	av_sighup_handler(SIGNAL_ARGS)
1393	{
1394	int save_errno = errno;
1395
1396	got_SIGHUP = true;
1397	SetLatch(MyLatch);
1398
1399	errno = save_errno;
1400	}
1401
1402	/ SIGUSR2: a worker is up and running, or just finished, or failed to fork /
1403	static void
1404	avl_sigusr2_handler(SIGNAL_ARGS)
1405	{
1406	int save_errno = errno;
1407
1408	got_SIGUSR2 = true;
1409	SetLatch(MyLatch);
1410
1411	errno = save_errno;
1412	}
1413
1414	/ SIGTERM: time to die /
1415	static void
1416	avl_sigterm_handler(SIGNAL_ARGS)
1417	{
1418	int save_errno = errno;
1419
1420	got_SIGTERM = true;
1421	SetLatch(MyLatch);
1422
1423	errno = save_errno;
1424	}
1425
1426
1427	/********************************************************************
1428	* AUTOVACUUM WORKER CODE
1429	********************************************************************/
1430
1431	#ifdef EXEC_BACKEND
1432	/*
1433	* forkexec routines for the autovacuum worker.
1434	*
1435	* Format up the arglist, then fork and exec.
1436	*/
1437	static pid_t
1438	avworker_forkexec(void)
1439	{
1440	char *av[`10`];
1441	int ac = `0`;
1442
1443	av[ac++] = "postgres";
1444	av[ac++] = "--forkavworker";
1445	av[ac++] = NULL; / filled in by postmaster_forkexec /
1446	av[ac] = NULL;
1447
1448	Assert(ac < lengthof(av));
1449
1450	return postmaster_forkexec(ac, av);
1451	}
1452
1453	/*
1454	* We need this set from the outside, before InitProcess is called
1455	*/
1456	void
1457	AutovacuumWorkerIAm(void)
1458	{
1459	am_autovacuum_worker = true;
1460	}
1461	#endif
1462
1463	/*
1464	* Main entry point for autovacuum worker process.
1465	*
1466	* This code is heavily based on pgarch.c, q.v.
1467	*/
1468	int
1469	StartAutoVacWorker(void)
1470	{
1471	pid_t worker_pid;
1472
1473	#ifdef EXEC_BACKEND
1474	switch ((worker_pid = avworker_forkexec()))
1475	#else
1476	switch ((worker_pid = fork_process()))
1477	#endif
1478	{
1479	case -`1`:
1480	ereport(LOG,
1481	(errmsg("could not fork autovacuum worker process: %m")));
1482	return `0`;
1483
1484	#ifndef EXEC_BACKEND
1485	case `0`:
1486	/ in postmaster child ... /
1487	InitPostmasterChild();
1488
1489	/ Close the postmaster's sockets /
1490	ClosePostmasterPorts(false);
1491
1492	AutoVacWorkerMain(`0`, NULL);
1493	break;
1494	#endif
1495	default:
1496	return (int) worker_pid;
1497	}
1498
1499	/ shouldn't get here /
1500	return `0`;
1501	}
1502
1503	/*
1504	* AutoVacWorkerMain
1505	*/
1506	NON_EXEC_STATIC void
1507	AutoVacWorkerMain(int argc, char *argv[])
1508	{
1509	sigjmp_buf local_sigjmp_buf;
1510	Oid dbid;
1511
1512	am_autovacuum_worker = true;
1513
1514	/ Identify myself via ps /
1515	init_ps_display(pgstat_get_backend_desc(B_AUTOVAC_WORKER), "", "", "");
1516
1517	SetProcessingMode(InitProcessing);
1518
1519	/*
1520	* Set up signal handlers. We operate on databases much like a regular
1521	* backend, so we use the same signal handling. See equivalent code in
1522	* tcop/postgres.c.
1523	*/
1524	pqsignal(SIGHUP, av_sighup_handler);
1525
1526	/*
1527	* SIGINT is used to signal canceling the current table's vacuum; SIGTERM
1528	* means abort and exit cleanly, and SIGQUIT means abandon ship.
1529	*/
1530	pqsignal(SIGINT, StatementCancelHandler);
1531	pqsignal(SIGTERM, die);
1532	pqsignal(SIGQUIT, quickdie);
1533	InitializeTimeouts(); / establishes SIGALRM handler /
1534
1535	pqsignal(SIGPIPE, SIG_IGN);
1536	pqsignal(SIGUSR1, procsignal_sigusr1_handler);
1537	pqsignal(SIGUSR2, SIG_IGN);
1538	pqsignal(SIGFPE, FloatExceptionHandler);
1539	pqsignal(SIGCHLD, SIG_DFL);
1540
1541	/ Early initialization /
1542	BaseInit();
1543
1544	/*
1545	* Create a per-backend PGPROC struct in shared memory, except in the
1546	* EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
1547	* this before we can use LWLocks (and in the EXEC_BACKEND case we already
1548	* had to do some stuff with LWLocks).
1549	*/
1550	#ifndef EXEC_BACKEND
1551	InitProcess();
1552	#endif
1553
1554	/*
1555	* If an exception is encountered, processing resumes here.
1556	*
1557	* See notes in postgres.c about the design of this coding.
1558	*/
1559	if (sigsetjmp(local_sigjmp_buf, `1`) != `0`)
1560	{
1561	/ Prevents interrupts while cleaning up /
1562	HOLD_INTERRUPTS();
1563
1564	/ Report the error to the server log /
1565	EmitErrorReport();
1566
1567	/*
1568	* We can now go away. Note that because we called InitProcess, a
1569	* callback was registered to do ProcKill, which will clean up
1570	* necessary state.
1571	*/
1572	proc_exit(`0`);
1573	}
1574
1575	/ We can now handle ereport(ERROR) /
1576	PG_exception_stack = &local_sigjmp_buf;
1577
1578	PG_SETMASK(&UnBlockSig);
1579
1580	/*
1581	* Set always-secure search path, so malicious users can't redirect user
1582	* code (e.g. pg_index.indexprs). (That code runs in a
1583	* SECURITY_RESTRICTED_OPERATION sandbox, so malicious users could not
1584	* take control of the entire autovacuum worker in any case.)
1585	*/
1586	SetConfigOption("search_path", "", PGC_SUSET, PGC_S_OVERRIDE);
1587
1588	/*
1589	* Force zero_damaged_pages OFF in the autovac process, even if it is set
1590	* in postgresql.conf. We don't really want such a dangerous option being
1591	* applied non-interactively.
1592	*/
1593	SetConfigOption("zero_damaged_pages", "false", PGC_SUSET, PGC_S_OVERRIDE);
1594
1595	/*
1596	* Force settable timeouts off to avoid letting these settings prevent
1597	* regular maintenance from being executed.
1598	*/
1599	SetConfigOption("statement_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1600	SetConfigOption("lock_timeout", "0", PGC_SUSET, PGC_S_OVERRIDE);
1601	SetConfigOption("idle_in_transaction_session_timeout", "0",
1602	PGC_SUSET, PGC_S_OVERRIDE);
1603
1604	/*
1605	* Force default_transaction_isolation to READ COMMITTED. We don't want
1606	* to pay the overhead of serializable mode, nor add any risk of causing
1607	* deadlocks or delaying other transactions.
1608	*/
1609	SetConfigOption("default_transaction_isolation", "read committed",
1610	PGC_SUSET, PGC_S_OVERRIDE);
1611
1612	/*
1613	* Force synchronous replication off to allow regular maintenance even if
1614	* we are waiting for standbys to connect. This is important to ensure we
1615	* aren't blocked from performing anti-wraparound tasks.
1616	*/
1617	if (synchronous_commit > SYNCHRONOUS_COMMIT_LOCAL_FLUSH)
1618	SetConfigOption("synchronous_commit", "local",
1619	PGC_SUSET, PGC_S_OVERRIDE);
1620
1621	/*
1622	* Get the info about the database we're going to work on.
1623	*/
1624	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1625
1626	/*
1627	* beware of startingWorker being INVALID; this should normally not
1628	* happen, but if a worker fails after forking and before this, the
1629	* launcher might have decided to remove it from the queue and start
1630	* again.
1631	*/
1632	if (AutoVacuumShmem->av_startingWorker != NULL)
1633	{
1634	MyWorkerInfo = AutoVacuumShmem->av_startingWorker;
1635	dbid = MyWorkerInfo->wi_dboid;
1636	MyWorkerInfo->wi_proc = MyProc;
1637
1638	/ insert into the running list /
1639	dlist_push_head(&AutoVacuumShmem->av_runningWorkers,
1640	&MyWorkerInfo->wi_links);
1641
1642	/*
1643	* remove from the "starting" pointer, so that the launcher can start
1644	* a new worker if required
1645	*/
1646	AutoVacuumShmem->av_startingWorker = NULL;
1647	LWLockRelease(AutovacuumLock);
1648
1649	on_shmem_exit(FreeWorkerInfo, `0`);
1650
1651	/ wake up the launcher /
1652	if (AutoVacuumShmem->av_launcherpid != `0`)
1653	kill(AutoVacuumShmem->av_launcherpid, SIGUSR2);
1654	}
1655	else
1656	{
1657	/ no worker entry for me, go away /
1658	elog(WARNING, "autovacuum worker started without a worker entry");
1659	dbid = InvalidOid;
1660	LWLockRelease(AutovacuumLock);
1661	}
1662
1663	if (OidIsValid(dbid))
1664	{
1665	char dbname[NAMEDATALEN];
1666
1667	/*
1668	* Report autovac startup to the stats collector. We deliberately do
1669	* this before InitPostgres, so that the last_autovac_time will get
1670	* updated even if the connection attempt fails. This is to prevent
1671	* autovac from getting "stuck" repeatedly selecting an unopenable
1672	* database, rather than making any progress on stuff it can connect
1673	* to.
1674	*/
1675	pgstat_report_autovac(dbid);
1676
1677	/*
1678	* Connect to the selected database
1679	*
1680	* Note: if we have selected a just-deleted database (due to using
1681	* stale stats info), we'll fail and exit here.
1682	*/
1683	InitPostgres(NULL, dbid, NULL, InvalidOid, dbname, false);
1684	SetProcessingMode(NormalProcessing);
1685	set_ps_display(dbname, false);
1686	ereport(DEBUG1,
1687	(errmsg("autovacuum: processing database \"%s\"", dbname)));
1688
1689	if (PostAuthDelay)
1690	pg_usleep(PostAuthDelay * `1000000L`);
1691
1692	/ And do an appropriate amount of work /
1693	recentXid = ReadNewTransactionId();
1694	recentMulti = ReadNextMultiXactId();
1695	do_autovacuum();
1696	}
1697
1698	/*
1699	* The launcher will be notified of my death in ProcKill, if we managed
1700	* to get a worker slot at all
1701	*/
1702
1703	/ All done, go away /
1704	proc_exit(`0`);
1705	}
1706
1707	/*
1708	* Return a WorkerInfo to the free list
1709	*/
1710	static void
1711	FreeWorkerInfo(int code, Datum arg)
1712	{
1713	if (MyWorkerInfo != NULL)
1714	{
1715	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
1716
1717	/*
1718	* Wake the launcher up so that he can launch a new worker immediately
1719	* if required. We only save the launcher's PID in local memory here;
1720	* the actual signal will be sent when the PGPROC is recycled. Note
1721	* that we always do this, so that the launcher can rebalance the cost
1722	* limit setting of the remaining workers.
1723	*
1724	* We somewhat ignore the risk that the launcher changes its PID
1725	* between us reading it and the actual kill; we expect ProcKill to be
1726	* called shortly after us, and we assume that PIDs are not reused too
1727	* quickly after a process exits.
1728	*/
1729	AutovacuumLauncherPid = AutoVacuumShmem->av_launcherpid;
1730
1731	dlist_delete(&MyWorkerInfo->wi_links);
1732	MyWorkerInfo->wi_dboid = InvalidOid;
1733	MyWorkerInfo->wi_tableoid = InvalidOid;
1734	MyWorkerInfo->wi_sharedrel = false;
1735	MyWorkerInfo->wi_proc = NULL;
1736	MyWorkerInfo->wi_launchtime = `0`;
1737	MyWorkerInfo->wi_dobalance = false;
1738	MyWorkerInfo->wi_cost_delay = `0`;
1739	MyWorkerInfo->wi_cost_limit = `0`;
1740	MyWorkerInfo->wi_cost_limit_base = `0`;
1741	dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
1742	&MyWorkerInfo->wi_links);
1743	/ not mine anymore /
1744	MyWorkerInfo = NULL;
1745
1746	/*
1747	* now that we're inactive, cause a rebalancing of the surviving
1748	* workers
1749	*/
1750	AutoVacuumShmem->av_signal[AutoVacRebalance] = true;
1751	LWLockRelease(AutovacuumLock);
1752	}
1753	}
1754
1755	/*
1756	* Update the cost-based delay parameters, so that multiple workers consume
1757	* each a fraction of the total available I/O.
1758	*/
1759	void
1760	AutoVacuumUpdateDelay(void)
1761	{
1762	if (MyWorkerInfo)
1763	{
1764	VacuumCostDelay = MyWorkerInfo->wi_cost_delay;
1765	VacuumCostLimit = MyWorkerInfo->wi_cost_limit;
1766	}
1767	}
1768
1769	/*
1770	* autovac_balance_cost
1771	* Recalculate the cost limit setting for each active worker.
1772	*
1773	* Caller must hold the AutovacuumLock in exclusive mode.
1774	*/
1775	static void
1776	autovac_balance_cost(void)
1777	{
1778	/*
1779	* The idea here is that we ration out I/O equally. The amount of I/O
1780	* that a worker can consume is determined by cost_limit/cost_delay, so we
1781	* try to equalize those ratios rather than the raw limit settings.
1782	*
1783	* note: in cost_limit, zero also means use value from elsewhere, because
1784	* zero is not a valid value.
1785	*/
1786	int vac_cost_limit = (autovacuum_vac_cost_limit > `0` ?
1787	autovacuum_vac_cost_limit : VacuumCostLimit);
1788	double vac_cost_delay = (autovacuum_vac_cost_delay >= `0` ?
1789	autovacuum_vac_cost_delay : VacuumCostDelay);
1790	double cost_total;
1791	double cost_avail;
1792	dlist_iter iter;
1793
1794	/ not set? nothing to do /
1795	if (vac_cost_limit <= `0` \|\| vac_cost_delay <= `0`)
1796	return;
1797
1798	/ calculate the total base cost limit of participating active workers /
1799	cost_total = `0.0`;
1800	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1801	{
1802	WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1803
1804	if (worker->wi_proc != NULL &&
1805	worker->wi_dobalance &&
1806	worker->wi_cost_limit_base > `0` && worker->wi_cost_delay > `0`)
1807	cost_total +=
1808	(double) worker->wi_cost_limit_base / worker->wi_cost_delay;
1809	}
1810
1811	/ there are no cost limits -- nothing to do /
1812	if (cost_total <= `0`)
1813	return;
1814
1815	/*
1816	* Adjust cost limit of each active worker to balance the total of cost
1817	* limit to autovacuum_vacuum_cost_limit.
1818	*/
1819	cost_avail = (double) vac_cost_limit / vac_cost_delay;
1820	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
1821	{
1822	WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
1823
1824	if (worker->wi_proc != NULL &&
1825	worker->wi_dobalance &&
1826	worker->wi_cost_limit_base > `0` && worker->wi_cost_delay > `0`)
1827	{
1828	int limit = (int)
1829	(cost_avail * worker->wi_cost_limit_base / cost_total);
1830
1831	/*
1832	* We put a lower bound of 1 on the cost_limit, to avoid division-
1833	* by-zero in the vacuum code. Also, in case of roundoff trouble
1834	* in these calculations, let's be sure we don't ever set
1835	* cost_limit to more than the base value.
1836	*/
1837	worker->wi_cost_limit = Max(Min(limit,
1838	worker->wi_cost_limit_base),
1839	`1`);
1840	}
1841
1842	if (worker->wi_proc != NULL)
1843	elog(DEBUG2, "autovac_balance_cost(pid=%u db=%u, rel=%u, dobalance=%s cost_limit=%d, cost_limit_base=%d, cost_delay=%g)",
1844	worker->wi_proc->pid, worker->wi_dboid, worker->wi_tableoid,
1845	worker->wi_dobalance ? "yes" : "no",
1846	worker->wi_cost_limit, worker->wi_cost_limit_base,
1847	worker->wi_cost_delay);
1848	}
1849	}
1850
1851	/*
1852	* get_database_list
1853	* Return a list of all databases found in pg_database.
1854	*
1855	* The list and associated data is allocated in the caller's memory context,
1856	* which is in charge of ensuring that it's properly cleaned up afterwards.
1857	*
1858	* Note: this is the only function in which the autovacuum launcher uses a
1859	* transaction. Although we aren't attached to any particular database and
1860	* therefore can't access most catalogs, we do have enough infrastructure
1861	* to do a seqscan on pg_database.
1862	*/
1863	static List *
1864	get_database_list(void)
1865	{
1866	List *dblist = NIL;
1867	Relation rel;
1868	TableScanDesc scan;
1869	HeapTuple tup;
1870	MemoryContext resultcxt;
1871
1872	/ This is the context that we will allocate our output data in /
1873	resultcxt = CurrentMemoryContext;
1874
1875	/*
1876	* Start a transaction so we can access pg_database, and get a snapshot.
1877	* We don't have a use for the snapshot itself, but we're interested in
1878	* the secondary effect that it sets RecentGlobalXmin. (This is critical
1879	* for anything that reads heap pages, because HOT may decide to prune
1880	* them even if the process doesn't attempt to modify any tuples.)
1881	*/
1882	StartTransactionCommand();
1883	(void) GetTransactionSnapshot();
1884
1885	rel = table_open(DatabaseRelationId, AccessShareLock);
1886	scan = table_beginscan_catalog(rel, `0`, NULL);
1887
1888	while (HeapTupleIsValid(tup = heap_getnext(scan, ForwardScanDirection)))
1889	{
1890	Form_pg_database pgdatabase = (Form_pg_database) GETSTRUCT(tup);
1891	avw_dbase *avdb;
1892	MemoryContext oldcxt;
1893
1894	/*
1895	* Allocate our results in the caller's context, not the
1896	* transaction's. We do this inside the loop, and restore the original
1897	* context at the end, so that leaky things like heap_getnext() are
1898	* not called in a potentially long-lived context.
1899	*/
1900	oldcxt = MemoryContextSwitchTo(resultcxt);
1901
1902	avdb = (avw_dbase ) palloc(sizeof*(avw_dbase));
1903
1904	avdb->adw_datid = pgdatabase->oid;
1905	avdb->adw_name = pstrdup(NameStr(pgdatabase->datname));
1906	avdb->adw_frozenxid = pgdatabase->datfrozenxid;
1907	avdb->adw_minmulti = pgdatabase->datminmxid;
1908	/ this gets set later: /
1909	avdb->adw_entry = NULL;
1910
1911	dblist = lappend(dblist, avdb);
1912	MemoryContextSwitchTo(oldcxt);
1913	}
1914
1915	table_endscan(scan);
1916	table_close(rel, AccessShareLock);
1917
1918	CommitTransactionCommand();
1919
1920	return dblist;
1921	}
1922
1923	/*
1924	* Process a database table-by-table
1925	*
1926	* Note that CHECK_FOR_INTERRUPTS is supposed to be used in certain spots in
1927	* order not to ignore shutdown commands for too long.
1928	*/
1929	static void
1930	do_autovacuum(void)
1931	{
1932	Relation classRel;
1933	HeapTuple tuple;
1934	TableScanDesc relScan;
1935	Form_pg_database dbForm;
1936	List *table_oids = NIL;
1937	List *orphan_oids = NIL;
1938	HASHCTL ctl;
1939	HTAB *table_toast_map;
1940	ListCell *volatile cell;
1941	PgStat_StatDBEntry *shared;
1942	PgStat_StatDBEntry *dbentry;
1943	BufferAccessStrategy bstrategy;
1944	ScanKeyData key;
1945	TupleDesc pg_class_desc;
1946	int effective_multixact_freeze_max_age;
1947	bool did_vacuum = false;
1948	bool found_concurrent_worker = false;
1949	int i;
1950
1951	/*
1952	* StartTransactionCommand and CommitTransactionCommand will automatically
1953	* switch to other contexts. We need this one to keep the list of
1954	* relations to vacuum/analyze across transactions.
1955	*/
1956	AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
1957	"AV worker",
1958	ALLOCSET_DEFAULT_SIZES);
1959	MemoryContextSwitchTo(AutovacMemCxt);
1960
1961	/*
1962	* may be NULL if we couldn't find an entry (only happens if we are
1963	* forcing a vacuum for anti-wrap purposes).
1964	*/
1965	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
1966
1967	/ Start a transaction so our commands have one to play into. /
1968	StartTransactionCommand();
1969
1970	/*
1971	* Clean up any dead statistics collector entries for this DB. We always
1972	* want to do this exactly once per DB-processing cycle, even if we find
1973	* nothing worth vacuuming in the database.
1974	*/
1975	pgstat_vacuum_stat();
1976
1977	/*
1978	* Compute the multixact age for which freezing is urgent. This is
1979	* normally autovacuum_multixact_freeze_max_age, but may be less if we are
1980	* short of multixact member space.
1981	*/
1982	effective_multixact_freeze_max_age = MultiXactMemberFreezeThreshold();
1983
1984	/*
1985	* Find the pg_database entry and select the default freeze ages. We use
1986	* zero in template and nonconnectable databases, else the system-wide
1987	* default.
1988	*/
1989	tuple = SearchSysCache1(DATABASEOID, ObjectIdGetDatum(MyDatabaseId));
1990	if (!HeapTupleIsValid(tuple))
1991	elog(ERROR, "cache lookup failed for database %u", MyDatabaseId);
1992	dbForm = (Form_pg_database) GETSTRUCT(tuple);
1993
1994	if (dbForm->datistemplate \|\| !dbForm->datallowconn)
1995	{
1996	default_freeze_min_age = `0`;
1997	default_freeze_table_age = `0`;
1998	default_multixact_freeze_min_age = `0`;
1999	default_multixact_freeze_table_age = `0`;
2000	}
2001	else
2002	{
2003	default_freeze_min_age = vacuum_freeze_min_age;
2004	default_freeze_table_age = vacuum_freeze_table_age;
2005	default_multixact_freeze_min_age = vacuum_multixact_freeze_min_age;
2006	default_multixact_freeze_table_age = vacuum_multixact_freeze_table_age;
2007	}
2008
2009	ReleaseSysCache(tuple);
2010
2011	/ StartTransactionCommand changed elsewhere /
2012	MemoryContextSwitchTo(AutovacMemCxt);
2013
2014	/ The database hash where pgstat keeps shared relations /
2015	shared = pgstat_fetch_stat_dbentry(InvalidOid);
2016
2017	classRel = table_open(RelationRelationId, AccessShareLock);
2018
2019	/ create a copy so we can use it after closing pg_class /
2020	pg_class_desc = CreateTupleDescCopy(RelationGetDescr(classRel));
2021
2022	/ create hash table for toast <-> main relid mapping /
2023	MemSet(&ctl, `0`, sizeof(ctl));
2024	ctl.keysize = sizeof(Oid);
2025	ctl.entrysize = sizeof(av_relation);
2026
2027	table_toast_map = hash_create("TOAST to main relid map",
2028	`100`,
2029	&ctl,
2030	HASH_ELEM \| HASH_BLOBS);
2031
2032	/*
2033	* Scan pg_class to determine which tables to vacuum.
2034	*
2035	* We do this in two passes: on the first one we collect the list of plain
2036	* relations and materialized views, and on the second one we collect
2037	* TOAST tables. The reason for doing the second pass is that during it we
2038	* want to use the main relation's pg_class.reloptions entry if the TOAST
2039	* table does not have any, and we cannot obtain it unless we know
2040	* beforehand what's the main table OID.
2041	*
2042	* We need to check TOAST tables separately because in cases with short,
2043	* wide tables there might be proportionally much more activity in the
2044	* TOAST table than in its parent.
2045	*/
2046	relScan = table_beginscan_catalog(classRel, `0`, NULL);
2047
2048	/*
2049	* On the first pass, we collect main tables to vacuum, and also the main
2050	* table relid to TOAST relid mapping.
2051	*/
2052	while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2053	{
2054	Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2055	PgStat_StatTabEntry *tabentry;
2056	AutoVacOpts *relopts;
2057	Oid relid;
2058	bool dovacuum;
2059	bool doanalyze;
2060	bool wraparound;
2061
2062	if (classForm->relkind != RELKIND_RELATION &&
2063	classForm->relkind != RELKIND_MATVIEW)
2064	continue;
2065
2066	relid = classForm->oid;
2067
2068	/*
2069	* Check if it is a temp table (presumably, of some other backend's).
2070	* We cannot safely process other backends' temp tables.
2071	*/
2072	if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2073	{
2074	/*
2075	* We just ignore it if the owning backend is still active and
2076	* using the temporary schema.
2077	*/
2078	if (!isTempNamespaceInUse(classForm->relnamespace))
2079	{
2080	/*
2081	* The table seems to be orphaned -- although it might be that
2082	* the owning backend has already deleted it and exited; our
2083	* pg_class scan snapshot is not necessarily up-to-date
2084	* anymore, so we could be looking at a committed-dead entry.
2085	* Remember it so we can try to delete it later.
2086	*/
2087	orphan_oids = lappend_oid(orphan_oids, relid);
2088	}
2089	continue;
2090	}
2091
2092	/ Fetch reloptions and the pgstat entry for this table /
2093	relopts = extract_autovac_opts(tuple, pg_class_desc);
2094	tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2095	shared, dbentry);
2096
2097	/ Check if it needs vacuum or analyze /
2098	relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2099	effective_multixact_freeze_max_age,
2100	&dovacuum, &doanalyze, &wraparound);
2101
2102	/ Relations that need work are added to table_oids /
2103	if (dovacuum \|\| doanalyze)
2104	table_oids = lappend_oid(table_oids, relid);
2105
2106	/*
2107	* Remember TOAST associations for the second pass. Note: we must do
2108	* this whether or not the table is going to be vacuumed, because we
2109	* don't automatically vacuum toast tables along the parent table.
2110	*/
2111	if (OidIsValid(classForm->reltoastrelid))
2112	{
2113	av_relation *hentry;
2114	bool found;
2115
2116	hentry = hash_search(table_toast_map,
2117	&classForm->reltoastrelid,
2118	HASH_ENTER, &found);
2119
2120	if (!found)
2121	{
2122	/ hash_search already filled in the key /
2123	hentry->ar_relid = relid;
2124	hentry->ar_hasrelopts = false;
2125	if (relopts != NULL)
2126	{
2127	hentry->ar_hasrelopts = true;
2128	memcpy(&hentry->ar_reloptions, relopts,
2129	sizeof(AutoVacOpts));
2130	}
2131	}
2132	}
2133	}
2134
2135	table_endscan(relScan);
2136
2137	/ second pass: check TOAST tables /
2138	ScanKeyInit(&key,
2139	Anum_pg_class_relkind,
2140	BTEqualStrategyNumber, F_CHAREQ,
2141	CharGetDatum(RELKIND_TOASTVALUE));
2142
2143	relScan = table_beginscan_catalog(classRel, `1`, &key);
2144	while ((tuple = heap_getnext(relScan, ForwardScanDirection)) != NULL)
2145	{
2146	Form_pg_class classForm = (Form_pg_class) GETSTRUCT(tuple);
2147	PgStat_StatTabEntry *tabentry;
2148	Oid relid;
2149	AutoVacOpts *relopts = NULL;
2150	bool dovacuum;
2151	bool doanalyze;
2152	bool wraparound;
2153
2154	/*
2155	* We cannot safely process other backends' temp tables, so skip 'em.
2156	*/
2157	if (classForm->relpersistence == RELPERSISTENCE_TEMP)
2158	continue;
2159
2160	relid = classForm->oid;
2161
2162	/*
2163	* fetch reloptions -- if this toast table does not have them, try the
2164	* main rel
2165	*/
2166	relopts = extract_autovac_opts(tuple, pg_class_desc);
2167	if (relopts == NULL)
2168	{
2169	av_relation *hentry;
2170	bool found;
2171
2172	hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2173	if (found && hentry->ar_hasrelopts)
2174	relopts = &hentry->ar_reloptions;
2175	}
2176
2177	/ Fetch the pgstat entry for this table /
2178	tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2179	shared, dbentry);
2180
2181	relation_needs_vacanalyze(relid, relopts, classForm, tabentry,
2182	effective_multixact_freeze_max_age,
2183	&dovacuum, &doanalyze, &wraparound);
2184
2185	/ ignore analyze for toast tables /
2186	if (dovacuum)
2187	table_oids = lappend_oid(table_oids, relid);
2188	}
2189
2190	table_endscan(relScan);
2191	table_close(classRel, AccessShareLock);
2192
2193	/*
2194	* Recheck orphan temporary tables, and if they still seem orphaned, drop
2195	* them. We'll eat a transaction per dropped table, which might seem
2196	* excessive, but we should only need to do anything as a result of a
2197	* previous backend crash, so this should not happen often enough to
2198	* justify "optimizing". Using separate transactions ensures that we
2199	* don't bloat the lock table if there are many temp tables to be dropped,
2200	* and it ensures that we don't lose work if a deletion attempt fails.
2201	*/
2202	foreach(cell, orphan_oids)
2203	{
2204	Oid relid = lfirst_oid(cell);
2205	Form_pg_class classForm;
2206	ObjectAddress object;
2207
2208	/*
2209	* Check for user-requested abort.
2210	*/
2211	CHECK_FOR_INTERRUPTS();
2212
2213	/*
2214	* Try to lock the table. If we can't get the lock immediately,
2215	* somebody else is using (or dropping) the table, so it's not our
2216	* concern anymore. Having the lock prevents race conditions below.
2217	*/
2218	if (!ConditionalLockRelationOid(relid, AccessExclusiveLock))
2219	continue;
2220
2221	/*
2222	* Re-fetch the pg_class tuple and re-check whether it still seems to
2223	* be an orphaned temp table. If it's not there or no longer the same
2224	* relation, ignore it.
2225	*/
2226	tuple = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2227	if (!HeapTupleIsValid(tuple))
2228	{
2229	/ be sure to drop useless lock so we don't bloat lock table /
2230	UnlockRelationOid(relid, AccessExclusiveLock);
2231	continue;
2232	}
2233	classForm = (Form_pg_class) GETSTRUCT(tuple);
2234
2235	/*
2236	* Make all the same tests made in the loop above. In event of OID
2237	* counter wraparound, the pg_class entry we have now might be
2238	* completely unrelated to the one we saw before.
2239	*/
2240	if (!((classForm->relkind == RELKIND_RELATION \|\|
2241	classForm->relkind == RELKIND_MATVIEW) &&
2242	classForm->relpersistence == RELPERSISTENCE_TEMP))
2243	{
2244	UnlockRelationOid(relid, AccessExclusiveLock);
2245	continue;
2246	}
2247
2248	if (isTempNamespaceInUse(classForm->relnamespace))
2249	{
2250	UnlockRelationOid(relid, AccessExclusiveLock);
2251	continue;
2252	}
2253
2254	/ OK, let's delete it /
2255	ereport(LOG,
2256	(errmsg("autovacuum: dropping orphan temp table \"%s.%s.%s\"",
2257	get_database_name(MyDatabaseId),
2258	get_namespace_name(classForm->relnamespace),
2259	NameStr(classForm->relname))));
2260
2261	object.classId = RelationRelationId;
2262	object.objectId = relid;
2263	object.objectSubId = `0`;
2264	performDeletion(&object, DROP_CASCADE,
2265	PERFORM_DELETION_INTERNAL \|
2266	PERFORM_DELETION_QUIETLY \|
2267	PERFORM_DELETION_SKIP_EXTENSIONS);
2268
2269	/*
2270	* To commit the deletion, end current transaction and start a new
2271	* one. Note this also releases the lock we took.
2272	*/
2273	CommitTransactionCommand();
2274	StartTransactionCommand();
2275
2276	/ StartTransactionCommand changed current memory context /
2277	MemoryContextSwitchTo(AutovacMemCxt);
2278	}
2279
2280	/*
2281	* Create a buffer access strategy object for VACUUM to use. We want to
2282	* use the same one across all the vacuum operations we perform, since the
2283	* point is for VACUUM not to blow out the shared cache.
2284	*/
2285	bstrategy = GetAccessStrategy(BAS_VACUUM);
2286
2287	/*
2288	* create a memory context to act as fake PortalContext, so that the
2289	* contexts created in the vacuum code are cleaned up for each table.
2290	*/
2291	PortalContext = AllocSetContextCreate(AutovacMemCxt,
2292	"Autovacuum Portal",
2293	ALLOCSET_DEFAULT_SIZES);
2294
2295	/*
2296	* Perform operations on collected tables.
2297	*/
2298	foreach(cell, table_oids)
2299	{
2300	Oid relid = lfirst_oid(cell);
2301	HeapTuple classTup;
2302	autovac_table *tab;
2303	bool isshared;
2304	bool skipit;
2305	double stdVacuumCostDelay;
2306	int stdVacuumCostLimit;
2307	dlist_iter iter;
2308
2309	CHECK_FOR_INTERRUPTS();
2310
2311	/*
2312	* Check for config changes before processing each collected table.
2313	*/
2314	if (got_SIGHUP)
2315	{
2316	got_SIGHUP = false;
2317	ProcessConfigFile(PGC_SIGHUP);
2318
2319	/*
2320	* You might be tempted to bail out if we see autovacuum is now
2321	* disabled. Must resist that temptation -- this might be a
2322	* for-wraparound emergency worker, in which case that would be
2323	* entirely inappropriate.
2324	*/
2325	}
2326
2327	/*
2328	* Find out whether the table is shared or not. (It's slightly
2329	* annoying to fetch the syscache entry just for this, but in typical
2330	* cases it adds little cost because table_recheck_autovac would
2331	* refetch the entry anyway. We could buy that back by copying the
2332	* tuple here and passing it to table_recheck_autovac, but that
2333	* increases the odds of that function working with stale data.)
2334	*/
2335	classTup = SearchSysCache1(RELOID, ObjectIdGetDatum(relid));
2336	if (!HeapTupleIsValid(classTup))
2337	continue; / somebody deleted the rel, forget it /
2338	isshared = ((Form_pg_class) GETSTRUCT(classTup))->relisshared;
2339	ReleaseSysCache(classTup);
2340
2341	/*
2342	* Hold schedule lock from here until we've claimed the table. We
2343	* also need the AutovacuumLock to walk the worker array, but that one
2344	* can just be a shared lock.
2345	*/
2346	LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2347	LWLockAcquire(AutovacuumLock, LW_SHARED);
2348
2349	/*
2350	* Check whether the table is being vacuumed concurrently by another
2351	* worker.
2352	*/
2353	skipit = false;
2354	dlist_foreach(iter, &AutoVacuumShmem->av_runningWorkers)
2355	{
2356	WorkerInfo worker = dlist_container(WorkerInfoData, wi_links, iter.cur);
2357
2358	/ ignore myself /
2359	if (worker == MyWorkerInfo)
2360	continue;
2361
2362	/ ignore workers in other databases (unless table is shared) /
2363	if (!worker->wi_sharedrel && worker->wi_dboid != MyDatabaseId)
2364	continue;
2365
2366	if (worker->wi_tableoid == relid)
2367	{
2368	skipit = true;
2369	found_concurrent_worker = true;
2370	break;
2371	}
2372	}
2373	LWLockRelease(AutovacuumLock);
2374	if (skipit)
2375	{
2376	LWLockRelease(AutovacuumScheduleLock);
2377	continue;
2378	}
2379
2380	/*
2381	* Store the table's OID in shared memory before releasing the
2382	* schedule lock, so that other workers don't try to vacuum it
2383	* concurrently. (We claim it here so as not to hold
2384	* AutovacuumScheduleLock while rechecking the stats.)
2385	*/
2386	MyWorkerInfo->wi_tableoid = relid;
2387	MyWorkerInfo->wi_sharedrel = isshared;
2388	LWLockRelease(AutovacuumScheduleLock);
2389
2390	/*
2391	* Check whether pgstat data still says we need to vacuum this table.
2392	* It could have changed if something else processed the table while
2393	* we weren't looking.
2394	*
2395	* Note: we have a special case in pgstat code to ensure that the
2396	* stats we read are as up-to-date as possible, to avoid the problem
2397	* that somebody just finished vacuuming this table. The window to
2398	* the race condition is not closed but it is very small.
2399	*/
2400	MemoryContextSwitchTo(AutovacMemCxt);
2401	tab = table_recheck_autovac(relid, table_toast_map, pg_class_desc,
2402	effective_multixact_freeze_max_age);
2403	if (tab == NULL)
2404	{
2405	/ someone else vacuumed the table, or it went away /
2406	LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2407	MyWorkerInfo->wi_tableoid = InvalidOid;
2408	MyWorkerInfo->wi_sharedrel = false;
2409	LWLockRelease(AutovacuumScheduleLock);
2410	continue;
2411	}
2412
2413	/*
2414	* Remember the prevailing values of the vacuum cost GUCs. We have to
2415	* restore these at the bottom of the loop, else we'll compute wrong
2416	* values in the next iteration of autovac_balance_cost().
2417	*/
2418	stdVacuumCostDelay = VacuumCostDelay;
2419	stdVacuumCostLimit = VacuumCostLimit;
2420
2421	/ Must hold AutovacuumLock while mucking with cost balance info /
2422	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2423
2424	/ advertise my cost delay parameters for the balancing algorithm /
2425	MyWorkerInfo->wi_dobalance = tab->at_dobalance;
2426	MyWorkerInfo->wi_cost_delay = tab->at_vacuum_cost_delay;
2427	MyWorkerInfo->wi_cost_limit = tab->at_vacuum_cost_limit;
2428	MyWorkerInfo->wi_cost_limit_base = tab->at_vacuum_cost_limit;
2429
2430	/ do a balance /
2431	autovac_balance_cost();
2432
2433	/ set the active cost parameters from the result of that /
2434	AutoVacuumUpdateDelay();
2435
2436	/ done /
2437	LWLockRelease(AutovacuumLock);
2438
2439	/ clean up memory before each iteration /
2440	MemoryContextResetAndDeleteChildren(PortalContext);
2441
2442	/*
2443	* Save the relation name for a possible error message, to avoid a
2444	* catalog lookup in case of an error. If any of these return NULL,
2445	* then the relation has been dropped since last we checked; skip it.
2446	* Note: they must live in a long-lived memory context because we call
2447	* vacuum and analyze in different transactions.
2448	*/
2449
2450	tab->at_relname = get_rel_name(tab->at_relid);
2451	tab->at_nspname = get_namespace_name(get_rel_namespace(tab->at_relid));
2452	tab->at_datname = get_database_name(MyDatabaseId);
2453	if (!tab->at_relname \|\| !tab->at_nspname \|\| !tab->at_datname)
2454	goto deleted;
2455
2456	/*
2457	* We will abort vacuuming the current table if something errors out,
2458	* and continue with the next one in schedule; in particular, this
2459	* happens if we are interrupted with SIGINT.
2460	*/
2461	PG_TRY();
2462	{
2463	/ Use PortalContext for any per-table allocations /
2464	MemoryContextSwitchTo(PortalContext);
2465
2466	/ have at it /
2467	autovacuum_do_vac_analyze(tab, bstrategy);
2468
2469	/*
2470	* Clear a possible query-cancel signal, to avoid a late reaction
2471	* to an automatically-sent signal because of vacuuming the
2472	* current table (we're done with it, so it would make no sense to
2473	* cancel at this point.)
2474	*/
2475	QueryCancelPending = false;
2476	}
2477	PG_CATCH();
2478	{
2479	/*
2480	* Abort the transaction, start a new one, and proceed with the
2481	* next table in our list.
2482	*/
2483	HOLD_INTERRUPTS();
2484	if (tab->at_params.options & VACOPT_VACUUM)
2485	errcontext("automatic vacuum of table \"%s.%s.%s\"",
2486	tab->at_datname, tab->at_nspname, tab->at_relname);
2487	else
2488	errcontext("automatic analyze of table \"%s.%s.%s\"",
2489	tab->at_datname, tab->at_nspname, tab->at_relname);
2490	EmitErrorReport();
2491
2492	/ this resets the PGXACT flags too /
2493	AbortOutOfAnyTransaction();
2494	FlushErrorState();
2495	MemoryContextResetAndDeleteChildren(PortalContext);
2496
2497	/ restart our transaction for the following operations /
2498	StartTransactionCommand();
2499	RESUME_INTERRUPTS();
2500	}
2501	PG_END_TRY();
2502
2503	/ Make sure we're back in AutovacMemCxt /
2504	MemoryContextSwitchTo(AutovacMemCxt);
2505
2506	did_vacuum = true;
2507
2508	/ the PGXACT flags are reset at the next end of transaction /
2509
2510	/ be tidy /
2511	deleted:
2512	if (tab->at_datname != NULL)
2513	pfree(tab->at_datname);
2514	if (tab->at_nspname != NULL)
2515	pfree(tab->at_nspname);
2516	if (tab->at_relname != NULL)
2517	pfree(tab->at_relname);
2518	pfree(tab);
2519
2520	/*
2521	* Remove my info from shared memory. We could, but intentionally
2522	* don't, clear wi_cost_limit and friends --- this is on the
2523	* assumption that we probably have more to do with similar cost
2524	* settings, so we don't want to give up our share of I/O for a very
2525	* short interval and thereby thrash the global balance.
2526	*/
2527	LWLockAcquire(AutovacuumScheduleLock, LW_EXCLUSIVE);
2528	MyWorkerInfo->wi_tableoid = InvalidOid;
2529	MyWorkerInfo->wi_sharedrel = false;
2530	LWLockRelease(AutovacuumScheduleLock);
2531
2532	/ restore vacuum cost GUCs for the next iteration /
2533	VacuumCostDelay = stdVacuumCostDelay;
2534	VacuumCostLimit = stdVacuumCostLimit;
2535	}
2536
2537	/*
2538	* Perform additional work items, as requested by backends.
2539	*/
2540	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2541	for (i = `0`; i < NUM_WORKITEMS; i++)
2542	{
2543	AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
2544
2545	if (!workitem->avw_used)
2546	continue;
2547	if (workitem->avw_active)
2548	continue;
2549	if (workitem->avw_database != MyDatabaseId)
2550	continue;
2551
2552	/ claim this one, and release lock while performing it /
2553	workitem->avw_active = true;
2554	LWLockRelease(AutovacuumLock);
2555
2556	perform_work_item(workitem);
2557
2558	/*
2559	* Check for config changes before acquiring lock for further jobs.
2560	*/
2561	CHECK_FOR_INTERRUPTS();
2562	if (got_SIGHUP)
2563	{
2564	got_SIGHUP = false;
2565	ProcessConfigFile(PGC_SIGHUP);
2566	}
2567
2568	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
2569
2570	/ and mark it done /
2571	workitem->avw_active = false;
2572	workitem->avw_used = false;
2573	}
2574	LWLockRelease(AutovacuumLock);
2575
2576	/*
2577	* We leak table_toast_map here (among other things), but since we're
2578	* going away soon, it's not a problem.
2579	*/
2580
2581	/*
2582	* Update pg_database.datfrozenxid, and truncate pg_xact if possible. We
2583	* only need to do this once, not after each table.
2584	*
2585	* Even if we didn't vacuum anything, it may still be important to do
2586	* this, because one indirect effect of vac_update_datfrozenxid() is to
2587	* update ShmemVariableCache->xidVacLimit. That might need to be done
2588	* even if we haven't vacuumed anything, because relations with older
2589	* relfrozenxid values or other databases with older datfrozenxid values
2590	* might have been dropped, allowing xidVacLimit to advance.
2591	*
2592	* However, it's also important not to do this blindly in all cases,
2593	* because when autovacuum=off this will restart the autovacuum launcher.
2594	* If we're not careful, an infinite loop can result, where workers find
2595	* no work to do and restart the launcher, which starts another worker in
2596	* the same database that finds no work to do. To prevent that, we skip
2597	* this if (1) we found no work to do and (2) we skipped at least one
2598	* table due to concurrent autovacuum activity. In that case, the other
2599	* worker has already done it, or will do so when it finishes.
2600	*/
2601	if (did_vacuum \|\| !found_concurrent_worker)
2602	vac_update_datfrozenxid();
2603
2604	/ Finally close out the last transaction. /
2605	CommitTransactionCommand();
2606	}
2607
2608	/*
2609	* Execute a previously registered work item.
2610	*/
2611	static void
2612	perform_work_item(AutoVacuumWorkItem *workitem)
2613	{
2614	char *cur_datname = NULL;
2615	char *cur_nspname = NULL;
2616	char *cur_relname = NULL;
2617
2618	/*
2619	* Note we do not store table info in MyWorkerInfo, since this is not
2620	* vacuuming proper.
2621	*/
2622
2623	/*
2624	* Save the relation name for a possible error message, to avoid a catalog
2625	* lookup in case of an error. If any of these return NULL, then the
2626	* relation has been dropped since last we checked; skip it.
2627	*/
2628	Assert(CurrentMemoryContext == AutovacMemCxt);
2629
2630	cur_relname = get_rel_name(workitem->avw_relation);
2631	cur_nspname = get_namespace_name(get_rel_namespace(workitem->avw_relation));
2632	cur_datname = get_database_name(MyDatabaseId);
2633	if (!cur_relname \|\| !cur_nspname \|\| !cur_datname)
2634	goto deleted2;
2635
2636	autovac_report_workitem(workitem, cur_nspname, cur_relname);
2637
2638	/ clean up memory before each work item /
2639	MemoryContextResetAndDeleteChildren(PortalContext);
2640
2641	/*
2642	* We will abort the current work item if something errors out, and
2643	* continue with the next one; in particular, this happens if we are
2644	* interrupted with SIGINT. Note that this means that the work item list
2645	* can be lossy.
2646	*/
2647	PG_TRY();
2648	{
2649	/ Use PortalContext for any per-work-item allocations /
2650	MemoryContextSwitchTo(PortalContext);
2651
2652	/ have at it /
2653	switch (workitem->avw_type)
2654	{
2655	case AVW_BRINSummarizeRange:
2656	DirectFunctionCall2(brin_summarize_range,
2657	ObjectIdGetDatum(workitem->avw_relation),
2658	Int64GetDatum((int64) workitem->avw_blockNumber));
2659	break;
2660	default:
2661	elog(WARNING, "unrecognized work item found: type %d",
2662	workitem->avw_type);
2663	break;
2664	}
2665
2666	/*
2667	* Clear a possible query-cancel signal, to avoid a late reaction to
2668	* an automatically-sent signal because of vacuuming the current table
2669	* (we're done with it, so it would make no sense to cancel at this
2670	* point.)
2671	*/
2672	QueryCancelPending = false;
2673	}
2674	PG_CATCH();
2675	{
2676	/*
2677	* Abort the transaction, start a new one, and proceed with the next
2678	* table in our list.
2679	*/
2680	HOLD_INTERRUPTS();
2681	errcontext("processing work entry for relation \"%s.%s.%s\"",
2682	cur_datname, cur_nspname, cur_relname);
2683	EmitErrorReport();
2684
2685	/ this resets the PGXACT flags too /
2686	AbortOutOfAnyTransaction();
2687	FlushErrorState();
2688	MemoryContextResetAndDeleteChildren(PortalContext);
2689
2690	/ restart our transaction for the following operations /
2691	StartTransactionCommand();
2692	RESUME_INTERRUPTS();
2693	}
2694	PG_END_TRY();
2695
2696	/ Make sure we're back in AutovacMemCxt /
2697	MemoryContextSwitchTo(AutovacMemCxt);
2698
2699	/ We intentionally do not set did_vacuum here /
2700
2701	/ be tidy /
2702	deleted2:
2703	if (cur_datname)
2704	pfree(cur_datname);
2705	if (cur_nspname)
2706	pfree(cur_nspname);
2707	if (cur_relname)
2708	pfree(cur_relname);
2709	}
2710
2711	/*
2712	* extract_autovac_opts
2713	*
2714	* Given a relation's pg_class tuple, return the AutoVacOpts portion of
2715	* reloptions, if set; otherwise, return NULL.
2716	*/
2717	static AutoVacOpts *
2718	extract_autovac_opts(HeapTuple tup, TupleDesc pg_class_desc)
2719	{
2720	bytea *relopts;
2721	AutoVacOpts *av;
2722
2723	Assert(((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_RELATION \|\|
2724	((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_MATVIEW \|\|
2725	((Form_pg_class) GETSTRUCT(tup))->relkind == RELKIND_TOASTVALUE);
2726
2727	relopts = extractRelOptions(tup, pg_class_desc, NULL);
2728	if (relopts == NULL)
2729	return NULL;
2730
2731	av = palloc(sizeof(AutoVacOpts));
2732	memcpy(av, &(((StdRdOptions ) relopts)->autovacuum), sizeof*(AutoVacOpts));
2733	pfree(relopts);
2734
2735	return av;
2736	}
2737
2738	/*
2739	* get_pgstat_tabentry_relid
2740	*
2741	* Fetch the pgstat entry of a table, either local to a database or shared.
2742	*/
2743	static PgStat_StatTabEntry *
2744	get_pgstat_tabentry_relid(Oid relid, bool isshared, PgStat_StatDBEntry *shared,
2745	PgStat_StatDBEntry *dbentry)
2746	{
2747	PgStat_StatTabEntry *tabentry = NULL;
2748
2749	if (isshared)
2750	{
2751	if (PointerIsValid(shared))
2752	tabentry = hash_search(shared->tables, &relid,
2753	HASH_FIND, NULL);
2754	}
2755	else if (PointerIsValid(dbentry))
2756	tabentry = hash_search(dbentry->tables, &relid,
2757	HASH_FIND, NULL);
2758
2759	return tabentry;
2760	}
2761
2762	/*
2763	* table_recheck_autovac
2764	*
2765	* Recheck whether a table still needs vacuum or analyze. Return value is a
2766	* valid autovac_table pointer if it does, NULL otherwise.
2767	*
2768	* Note that the returned autovac_table does not have the name fields set.
2769	*/
2770	static autovac_table *
2771	table_recheck_autovac(Oid relid, HTAB *table_toast_map,
2772	TupleDesc pg_class_desc,
2773	int effective_multixact_freeze_max_age)
2774	{
2775	Form_pg_class classForm;
2776	HeapTuple classTup;
2777	bool dovacuum;
2778	bool doanalyze;
2779	autovac_table *tab = NULL;
2780	PgStat_StatTabEntry *tabentry;
2781	PgStat_StatDBEntry *shared;
2782	PgStat_StatDBEntry *dbentry;
2783	bool wraparound;
2784	AutoVacOpts *avopts;
2785
2786	/ use fresh stats /
2787	autovac_refresh_stats();
2788
2789	shared = pgstat_fetch_stat_dbentry(InvalidOid);
2790	dbentry = pgstat_fetch_stat_dbentry(MyDatabaseId);
2791
2792	/ fetch the relation's relcache entry /
2793	classTup = SearchSysCacheCopy1(RELOID, ObjectIdGetDatum(relid));
2794	if (!HeapTupleIsValid(classTup))
2795	return NULL;
2796	classForm = (Form_pg_class) GETSTRUCT(classTup);
2797
2798	/*
2799	* Get the applicable reloptions. If it is a TOAST table, try to get the
2800	* main table reloptions if the toast table itself doesn't have.
2801	*/
2802	avopts = extract_autovac_opts(classTup, pg_class_desc);
2803	if (classForm->relkind == RELKIND_TOASTVALUE &&
2804	avopts == NULL && table_toast_map != NULL)
2805	{
2806	av_relation *hentry;
2807	bool found;
2808
2809	hentry = hash_search(table_toast_map, &relid, HASH_FIND, &found);
2810	if (found && hentry->ar_hasrelopts)
2811	avopts = &hentry->ar_reloptions;
2812	}
2813
2814	/ fetch the pgstat table entry /
2815	tabentry = get_pgstat_tabentry_relid(relid, classForm->relisshared,
2816	shared, dbentry);
2817
2818	relation_needs_vacanalyze(relid, avopts, classForm, tabentry,
2819	effective_multixact_freeze_max_age,
2820	&dovacuum, &doanalyze, &wraparound);
2821
2822	/ ignore ANALYZE for toast tables /
2823	if (classForm->relkind == RELKIND_TOASTVALUE)
2824	doanalyze = false;
2825
2826	/ OK, it needs something done /
2827	if (doanalyze \|\| dovacuum)
2828	{
2829	int freeze_min_age;
2830	int freeze_table_age;
2831	int multixact_freeze_min_age;
2832	int multixact_freeze_table_age;
2833	int vac_cost_limit;
2834	double vac_cost_delay;
2835	int log_min_duration;
2836
2837	/*
2838	* Calculate the vacuum cost parameters and the freeze ages. If there
2839	* are options set in pg_class.reloptions, use them; in the case of a
2840	* toast table, try the main table too. Otherwise use the GUC
2841	* defaults, autovacuum's own first and plain vacuum second.
2842	*/
2843
2844	/ -1 in autovac setting means use plain vacuum_cost_delay /
2845	vac_cost_delay = (avopts && avopts->vacuum_cost_delay >= `0`)
2846	? avopts->vacuum_cost_delay
2847	: (autovacuum_vac_cost_delay >= `0`)
2848	? autovacuum_vac_cost_delay
2849	: VacuumCostDelay;
2850
2851	/ 0 or -1 in autovac setting means use plain vacuum_cost_limit /
2852	vac_cost_limit = (avopts && avopts->vacuum_cost_limit > `0`)
2853	? avopts->vacuum_cost_limit
2854	: (autovacuum_vac_cost_limit > `0`)
2855	? autovacuum_vac_cost_limit
2856	: VacuumCostLimit;
2857
2858	/ -1 in autovac setting means use log_autovacuum_min_duration /
2859	log_min_duration = (avopts && avopts->log_min_duration >= `0`)
2860	? avopts->log_min_duration
2861	: Log_autovacuum_min_duration;
2862
2863	/ these do not have autovacuum-specific settings /
2864	freeze_min_age = (avopts && avopts->freeze_min_age >= `0`)
2865	? avopts->freeze_min_age
2866	: default_freeze_min_age;
2867
2868	freeze_table_age = (avopts && avopts->freeze_table_age >= `0`)
2869	? avopts->freeze_table_age
2870	: default_freeze_table_age;
2871
2872	multixact_freeze_min_age = (avopts &&
2873	avopts->multixact_freeze_min_age >= `0`)
2874	? avopts->multixact_freeze_min_age
2875	: default_multixact_freeze_min_age;
2876
2877	multixact_freeze_table_age = (avopts &&
2878	avopts->multixact_freeze_table_age >= `0`)
2879	? avopts->multixact_freeze_table_age
2880	: default_multixact_freeze_table_age;
2881
2882	tab = palloc(sizeof(autovac_table));
2883	tab->at_relid = relid;
2884	tab->at_sharedrel = classForm->relisshared;
2885	tab->at_params.options = VACOPT_SKIPTOAST \|
2886	(dovacuum ? VACOPT_VACUUM : `0`) \|
2887	(doanalyze ? VACOPT_ANALYZE : `0`) \|
2888	(!wraparound ? VACOPT_SKIP_LOCKED : `0`);
2889	tab->at_params.index_cleanup = VACOPT_TERNARY_DEFAULT;
2890	tab->at_params.truncate = VACOPT_TERNARY_DEFAULT;
2891	tab->at_params.freeze_min_age = freeze_min_age;
2892	tab->at_params.freeze_table_age = freeze_table_age;
2893	tab->at_params.multixact_freeze_min_age = multixact_freeze_min_age;
2894	tab->at_params.multixact_freeze_table_age = multixact_freeze_table_age;
2895	tab->at_params.is_wraparound = wraparound;
2896	tab->at_params.log_min_duration = log_min_duration;
2897	tab->at_vacuum_cost_limit = vac_cost_limit;
2898	tab->at_vacuum_cost_delay = vac_cost_delay;
2899	tab->at_relname = NULL;
2900	tab->at_nspname = NULL;
2901	tab->at_datname = NULL;
2902
2903	/*
2904	* If any of the cost delay parameters has been set individually for
2905	* this table, disable the balancing algorithm.
2906	*/
2907	tab->at_dobalance =
2908	!(avopts && (avopts->vacuum_cost_limit > `0` \|\|
2909	avopts->vacuum_cost_delay > `0`));
2910	}
2911
2912	heap_freetuple(classTup);
2913
2914	return tab;
2915	}
2916
2917	/*
2918	* relation_needs_vacanalyze
2919	*
2920	* Check whether a relation needs to be vacuumed or analyzed; return each into
2921	* "dovacuum" and "doanalyze", respectively. Also return whether the vacuum is
2922	* being forced because of Xid or multixact wraparound.
2923	*
2924	* relopts is a pointer to the AutoVacOpts options (either for itself in the
2925	* case of a plain table, or for either itself or its parent table in the case
2926	* of a TOAST table), NULL if none; tabentry is the pgstats entry, which can be
2927	* NULL.
2928	*
2929	* A table needs to be vacuumed if the number of dead tuples exceeds a
2930	* threshold. This threshold is calculated as
2931	*
2932	* threshold = vac_base_thresh + vac_scale_factor * reltuples
2933	*
2934	* For analyze, the analysis done is that the number of tuples inserted,
2935	* deleted and updated since the last analyze exceeds a threshold calculated
2936	* in the same fashion as above. Note that the collector actually stores
2937	* the number of tuples (both live and dead) that there were as of the last
2938	* analyze. This is asymmetric to the VACUUM case.
2939	*
2940	* We also force vacuum if the table's relfrozenxid is more than freeze_max_age
2941	* transactions back, and if its relminmxid is more than
2942	* multixact_freeze_max_age multixacts back.
2943	*
2944	* A table whose autovacuum_enabled option is false is
2945	* automatically skipped (unless we have to vacuum it due to freeze_max_age).
2946	* Thus autovacuum can be disabled for specific tables. Also, when the stats
2947	* collector does not have data about a table, it will be skipped.
2948	*
2949	* A table whose vac_base_thresh value is < 0 takes the base value from the
2950	* autovacuum_vacuum_threshold GUC variable. Similarly, a vac_scale_factor
2951	* value < 0 is substituted with the value of
2952	* autovacuum_vacuum_scale_factor GUC variable. Ditto for analyze.
2953	*/
2954	static void
2955	relation_needs_vacanalyze(Oid relid,
2956	AutoVacOpts *relopts,
2957	Form_pg_class classForm,
2958	PgStat_StatTabEntry *tabentry,
2959	int effective_multixact_freeze_max_age,
2960	/ output params below /
2961	bool *dovacuum,
2962	bool *doanalyze,
2963	bool *wraparound)
2964	{
2965	bool force_vacuum;
2966	bool av_enabled;
2967	float4 reltuples; / pg_class.reltuples /
2968
2969	/ constants from reloptions or GUC variables /
2970	int vac_base_thresh,
2971	anl_base_thresh;
2972	float4 vac_scale_factor,
2973	anl_scale_factor;
2974
2975	/ thresholds calculated from above constants /
2976	float4 vacthresh,
2977	anlthresh;
2978
2979	/ number of vacuum (resp. analyze) tuples at this time /
2980	float4 vactuples,
2981	anltuples;
2982
2983	/ freeze parameters /
2984	int freeze_max_age;
2985	int multixact_freeze_max_age;
2986	TransactionId xidForceLimit;
2987	MultiXactId multiForceLimit;
2988
2989	AssertArg(classForm != NULL);
2990	AssertArg(OidIsValid(relid));
2991
2992	/*
2993	* Determine vacuum/analyze equation parameters. We have two possible
2994	* sources: the passed reloptions (which could be a main table or a toast
2995	* table), or the autovacuum GUC variables.
2996	*/
2997
2998	/ -1 in autovac setting means use plain vacuum_scale_factor /
2999	vac_scale_factor = (relopts && relopts->vacuum_scale_factor >= `0`)
3000	? relopts->vacuum_scale_factor
3001	: autovacuum_vac_scale;
3002
3003	vac_base_thresh = (relopts && relopts->vacuum_threshold >= `0`)
3004	? relopts->vacuum_threshold
3005	: autovacuum_vac_thresh;
3006
3007	anl_scale_factor = (relopts && relopts->analyze_scale_factor >= `0`)
3008	? relopts->analyze_scale_factor
3009	: autovacuum_anl_scale;
3010
3011	anl_base_thresh = (relopts && relopts->analyze_threshold >= `0`)
3012	? relopts->analyze_threshold
3013	: autovacuum_anl_thresh;
3014
3015	freeze_max_age = (relopts && relopts->freeze_max_age >= `0`)
3016	? Min(relopts->freeze_max_age, autovacuum_freeze_max_age)
3017	: autovacuum_freeze_max_age;
3018
3019	multixact_freeze_max_age = (relopts && relopts->multixact_freeze_max_age >= `0`)
3020	? Min(relopts->multixact_freeze_max_age, effective_multixact_freeze_max_age)
3021	: effective_multixact_freeze_max_age;
3022
3023	av_enabled = (relopts ? relopts->enabled : true);
3024
3025	/ Force vacuum if table is at risk of wraparound /
3026	xidForceLimit = recentXid - freeze_max_age;
3027	if (xidForceLimit < FirstNormalTransactionId)
3028	xidForceLimit -= FirstNormalTransactionId;
3029	force_vacuum = (TransactionIdIsNormal(classForm->relfrozenxid) &&
3030	TransactionIdPrecedes(classForm->relfrozenxid,
3031	xidForceLimit));
3032	if (!force_vacuum)
3033	{
3034	multiForceLimit = recentMulti - multixact_freeze_max_age;
3035	if (multiForceLimit < FirstMultiXactId)
3036	multiForceLimit -= FirstMultiXactId;
3037	force_vacuum = MultiXactIdIsValid(classForm->relminmxid) &&
3038	MultiXactIdPrecedes(classForm->relminmxid, multiForceLimit);
3039	}
3040	*wraparound = force_vacuum;
3041
3042	/ User disabled it in pg_class.reloptions? (But ignore if at risk) /
3043	if (!av_enabled && !force_vacuum)
3044	{
3045	*doanalyze = false;
3046	*dovacuum = false;
3047	return;
3048	}
3049
3050	/*
3051	* If we found the table in the stats hash, and autovacuum is currently
3052	* enabled, make a threshold-based decision whether to vacuum and/or
3053	* analyze. If autovacuum is currently disabled, we must be here for
3054	* anti-wraparound vacuuming only, so don't vacuum (or analyze) anything
3055	* that's not being forced.
3056	*/
3057	if (PointerIsValid(tabentry) && AutoVacuumingActive())
3058	{
3059	reltuples = classForm->reltuples;
3060	vactuples = tabentry->n_dead_tuples;
3061	anltuples = tabentry->changes_since_analyze;
3062
3063	vacthresh = (float4) vac_base_thresh + vac_scale_factor * reltuples;
3064	anlthresh = (float4) anl_base_thresh + anl_scale_factor * reltuples;
3065
3066	/*
3067	* Note that we don't need to take special consideration for stat
3068	* reset, because if that happens, the last vacuum and analyze counts
3069	* will be reset too.
3070	*/
3071	elog(DEBUG3, "%s: vac: %.0f (threshold %.0f), anl: %.0f (threshold %.0f)",
3072	NameStr(classForm->relname),
3073	vactuples, vacthresh, anltuples, anlthresh);
3074
3075	/ Determine if this table needs vacuum or analyze. /
3076	*dovacuum = force_vacuum \|\| (vactuples > vacthresh);
3077	*doanalyze = (anltuples > anlthresh);
3078	}
3079	else
3080	{
3081	/*
3082	* Skip a table not found in stat hash, unless we have to force vacuum
3083	* for anti-wrap purposes. If it's not acted upon, there's no need to
3084	* vacuum it.
3085	*/
3086	*dovacuum = force_vacuum;
3087	*doanalyze = false;
3088	}
3089
3090	/ ANALYZE refuses to work with pg_statistic /
3091	if (relid == StatisticRelationId)
3092	*doanalyze = false;
3093	}
3094
3095	/*
3096	* autovacuum_do_vac_analyze
3097	* Vacuum and/or analyze the specified table
3098	*/
3099	static void
3100	autovacuum_do_vac_analyze(autovac_table *tab, BufferAccessStrategy bstrategy)
3101	{
3102	RangeVar *rangevar;
3103	VacuumRelation *rel;
3104	List *rel_list;
3105
3106	/ Let pgstat know what we're doing /
3107	autovac_report_activity(tab);
3108
3109	/ Set up one VacuumRelation target, identified by OID, for vacuum() /
3110	rangevar = makeRangeVar(tab->at_nspname, tab->at_relname, -`1`);
3111	rel = makeVacuumRelation(rangevar, tab->at_relid, NIL);
3112	rel_list = list_make1(rel);
3113
3114	vacuum(rel_list, &tab->at_params, bstrategy, true);
3115	}
3116
3117	/*
3118	* autovac_report_activity
3119	* Report to pgstat what autovacuum is doing
3120	*
3121	* We send a SQL string corresponding to what the user would see if the
3122	* equivalent command was to be issued manually.
3123	*
3124	* Note we assume that we are going to report the next command as soon as we're
3125	* done with the current one, and exit right after the last one, so we don't
3126	* bother to report "<IDLE>" or some such.
3127	*/
3128	static void
3129	autovac_report_activity(autovac_table *tab)
3130	{
3131	#define MAX_AUTOVAC_ACTIV_LEN (NAMEDATALEN * 2 + 56)
3132	char activity[MAX_AUTOVAC_ACTIV_LEN];
3133	int len;
3134
3135	/ Report the command and possible options /
3136	if (tab->at_params.options & VACOPT_VACUUM)
3137	snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3138	"autovacuum: VACUUM%s",
3139	tab->at_params.options & VACOPT_ANALYZE ? " ANALYZE" : "");
3140	else
3141	snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3142	"autovacuum: ANALYZE");
3143
3144	/*
3145	* Report the qualified name of the relation.
3146	*/
3147	len = strlen(activity);
3148
3149	snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3150	" %s.%s%s", tab->at_nspname, tab->at_relname,
3151	tab->at_params.is_wraparound ? " (to prevent wraparound)" : "");
3152
3153	/ Set statement_timestamp() to current time for pg_stat_activity /
3154	SetCurrentStatementStartTimestamp();
3155
3156	pgstat_report_activity(STATE_RUNNING, activity);
3157	}
3158
3159	/*
3160	* autovac_report_workitem
3161	* Report to pgstat that autovacuum is processing a work item
3162	*/
3163	static void
3164	autovac_report_workitem(AutoVacuumWorkItem *workitem,
3165	const char nspname, const* char *relname)
3166	{
3167	char activity[MAX_AUTOVAC_ACTIV_LEN + `12` + `2`];
3168	char blk[`12` + `2`];
3169	int len;
3170
3171	switch (workitem->avw_type)
3172	{
3173	case AVW_BRINSummarizeRange:
3174	snprintf(activity, MAX_AUTOVAC_ACTIV_LEN,
3175	"autovacuum: BRIN summarize");
3176	break;
3177	}
3178
3179	/*
3180	* Report the qualified name of the relation, and the block number if any
3181	*/
3182	len = strlen(activity);
3183
3184	if (BlockNumberIsValid(workitem->avw_blockNumber))
3185	snprintf(blk, sizeof(blk), " %u", workitem->avw_blockNumber);
3186	else
3187	blk[`0`] = `'\0'`;
3188
3189	snprintf(activity + len, MAX_AUTOVAC_ACTIV_LEN - len,
3190	" %s.%s%s", nspname, relname, blk);
3191
3192	/ Set statement_timestamp() to current time for pg_stat_activity /
3193	SetCurrentStatementStartTimestamp();
3194
3195	pgstat_report_activity(STATE_RUNNING, activity);
3196	}
3197
3198	/*
3199	* AutoVacuumingActive
3200	* Check GUC vars and report whether the autovacuum process should be
3201	* running.
3202	*/
3203	bool
3204	AutoVacuumingActive(void)
3205	{
3206	if (!autovacuum_start_daemon \|\| !pgstat_track_counts)
3207	return false;
3208	return true;
3209	}
3210
3211	/*
3212	* Request one work item to the next autovacuum run processing our database.
3213	* Return false if the request can't be recorded.
3214	*/
3215	bool
3216	AutoVacuumRequestWork(AutoVacuumWorkItemType type, Oid relationId,
3217	BlockNumber blkno)
3218	{
3219	int i;
3220	bool result = false;
3221
3222	LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
3223
3224	/*
3225	* Locate an unused work item and fill it with the given data.
3226	*/
3227	for (i = `0`; i < NUM_WORKITEMS; i++)
3228	{
3229	AutoVacuumWorkItem *workitem = &AutoVacuumShmem->av_workItems[i];
3230
3231	if (workitem->avw_used)
3232	continue;
3233
3234	workitem->avw_used = true;
3235	workitem->avw_active = false;
3236	workitem->avw_type = type;
3237	workitem->avw_database = MyDatabaseId;
3238	workitem->avw_relation = relationId;
3239	workitem->avw_blockNumber = blkno;
3240	result = true;
3241
3242	/ done /
3243	break;
3244	}
3245
3246	LWLockRelease(AutovacuumLock);
3247
3248	return result;
3249	}
3250
3251	/*
3252	* autovac_init
3253	* This is called at postmaster initialization.
3254	*
3255	* All we do here is annoy the user if he got it wrong.
3256	*/
3257	void
3258	autovac_init(void)
3259	{
3260	if (autovacuum_start_daemon && !pgstat_track_counts)
3261	ereport(WARNING,
3262	(errmsg("autovacuum not started because of misconfiguration"),
3263	errhint("Enable the \"track_counts\" option.")));
3264	}
3265
3266	/*
3267	* IsAutoVacuum functions
3268	* Return whether this is either a launcher autovacuum process or a worker
3269	* process.
3270	*/
3271	bool
3272	IsAutoVacuumLauncherProcess(void)
3273	{
3274	return am_autovacuum_launcher;
3275	}
3276
3277	bool
3278	IsAutoVacuumWorkerProcess(void)
3279	{
3280	return am_autovacuum_worker;
3281	}
3282
3283
3284	/*
3285	* AutoVacuumShmemSize
3286	* Compute space needed for autovacuum-related shared memory
3287	*/
3288	Size
3289	AutoVacuumShmemSize(void)
3290	{
3291	Size size;
3292
3293	/*
3294	* Need the fixed struct and the array of WorkerInfoData.
3295	*/
3296	size = sizeof(AutoVacuumShmemStruct);
3297	size = MAXALIGN(size);
3298	size = add_size(size, mul_size(autovacuum_max_workers,
3299	sizeof(WorkerInfoData)));
3300	return size;
3301	}
3302
3303	/*
3304	* AutoVacuumShmemInit
3305	* Allocate and initialize autovacuum-related shared memory
3306	*/
3307	void
3308	AutoVacuumShmemInit(void)
3309	{
3310	bool found;
3311
3312	AutoVacuumShmem = (AutoVacuumShmemStruct *)
3313	ShmemInitStruct("AutoVacuum Data",
3314	AutoVacuumShmemSize(),
3315	&found);
3316
3317	if (!IsUnderPostmaster)
3318	{
3319	WorkerInfo worker;
3320	int i;
3321
3322	Assert(!found);
3323
3324	AutoVacuumShmem->av_launcherpid = `0`;
3325	dlist_init(&AutoVacuumShmem->av_freeWorkers);
3326	dlist_init(&AutoVacuumShmem->av_runningWorkers);
3327	AutoVacuumShmem->av_startingWorker = NULL;
3328	memset(AutoVacuumShmem->av_workItems, `0`,
3329	sizeof(AutoVacuumWorkItem) * NUM_WORKITEMS);
3330
3331	worker = (WorkerInfo) ((char *) AutoVacuumShmem +
3332	MAXALIGN(sizeof(AutoVacuumShmemStruct)));
3333
3334	/ initialize the WorkerInfo free list /
3335	for (i = `0`; i < autovacuum_max_workers; i++)
3336	dlist_push_head(&AutoVacuumShmem->av_freeWorkers,
3337	&worker[i].wi_links);
3338	}
3339	else
3340	Assert(found);
3341	}
3342
3343	/*
3344	* autovac_refresh_stats
3345	* Refresh pgstats data for an autovacuum process
3346	*
3347	* Cause the next pgstats read operation to obtain fresh data, but throttle
3348	* such refreshing in the autovacuum launcher. This is mostly to avoid
3349	* rereading the pgstats files too many times in quick succession when there
3350	* are many databases.
3351	*
3352	* Note: we avoid throttling in the autovac worker, as it would be
3353	* counterproductive in the recheck logic.
3354	*/
3355	static void
3356	autovac_refresh_stats(void)
3357	{
3358	if (IsAutoVacuumLauncherProcess())
3359	{
3360	static TimestampTz last_read = `0`;
3361	TimestampTz current_time;
3362
3363	current_time = GetCurrentTimestamp();
3364
3365	if (!TimestampDifferenceExceeds(last_read, current_time,
3366	STATS_READ_DELAY))
3367	return;
3368
3369	last_read = current_time;
3370	}
3371
3372	pgstat_clear_snapshot();
3373	}
3374

Browse the source code of PostgreSQL/src/backend/postmaster/autovacuum.c