简介
当数据库发生意外故障时,缓冲池中的脏页还未刷至持久化存储磁盘,此时磁盘的数据未处于一致性状态,在数据库重启时会通过回放WAL日志使数据恢复至一致性状态。该流程在 StartupXLOG函数中实现,由startup进程执行。
源码解析
1 首先会判断数据目录下是否存在standby.signal文件或者recovery.signal文件,如果存在则会进入相应的处理流程;standby.signal文件存在表明该库是物理复制中的备库,recovery.signal文件存在表明该库正在进行PITR;若两者都不存在,则进行的是普通的恢复操作。
/*
 * See if there are any recovery signal files and if so, set state for
 * recovery.
 *
 * If an old-style recovery command file (recovery.conf) is present, report
 * a FATAL error, since as of PG12 it is no longer recognized.
 *
 * On return, StandbyModeRequested and ArchiveRecoveryRequested reflect which
 * signal file (if any) was found; standby.signal takes precedence over
 * recovery.signal.
 */
static void
readRecoverySignalFile(void)
{
    struct stat stat_buf;

    /* Nothing to do while bootstrapping. */
    if (IsBootstrapProcessingMode())
        return;

    /*
     * Check for old recovery API file: recovery.conf.  It is not supported
     * any more, so refuse to start if it exists.
     */
    if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
        ereport(FATAL,
                (errcode_for_file_access(),
                 errmsg("using recovery command file \"%s\" is not supported",
                        RECOVERY_COMMAND_FILE)));

    /*
     * Remove unused .done file, if present. Ignore if absent.
     */
    unlink(RECOVERY_COMMAND_DONE);

    /*
     * Check for recovery signal files and if found, fsync them since they
     * represent server state information.  We don't sweat too much about the
     * possibility of fsync failure, however.
     *
     * If present, standby signal file takes precedence. If neither is present
     * then we won't enter archive recovery.
     */
    if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
    {
        int         fd;

        fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE,
                               O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                               S_IRUSR | S_IWUSR);
        if (fd >= 0)
        {
            (void) pg_fsync(fd);
            close(fd);
        }
        standby_signal_file_found = true;
    }
    else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
    {
        int         fd;

        /*
         * Same handling as for standby.signal.  The open flags must be
         * passed explicitly: BasicOpenFilePerm takes (name, flags, mode), so
         * omitting them would hand the permission bits over as flags.
         */
        fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE,
                               O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                               S_IRUSR | S_IWUSR);
        if (fd >= 0)
        {
            (void) pg_fsync(fd);
            close(fd);
        }
        recovery_signal_file_found = true;
    }

    /*
     * Translate what we found into the request flags: standby mode implies
     * archive recovery; recovery.signal requests archive recovery only.
     */
    StandbyModeRequested = false;
    ArchiveRecoveryRequested = false;
    if (standby_signal_file_found)
    {
        StandbyModeRequested = true;
        ArchiveRecoveryRequested = true;
    }
    else if (recovery_signal_file_found)
    {
        StandbyModeRequested = false;
        ArchiveRecoveryRequested = true;
    }
    else
        return;

    /*
     * We don't support standby mode in standalone backends; that requires
     * other processes such as the WAL receiver to be alive.
     */
    if (StandbyModeRequested && !IsUnderPostmaster)
        ereport(FATAL,
                (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
                 errmsg("standby mode is not supported by single-user servers")));
}
2 故障恢复起点的确认
故障恢复起点确认有两种方式,其优先级为backup_label File >> controlFile
backup_label File记录如下信息
字段 | 说明 |
---|---|
CHECKPOINT LOCATION | 创建检查点的LSN号 |
START WAL LOCATION | WAL恢复的起始位置 |
BACKUP METHOD | 备份方法 |
BACKUP FROM | 备份源来自 主库还是备库 |
START TIME | pg_start_backup 执行时间 |
LABEL | 备份标识 |
START TIMELINE | 时间线 |
/*
 * Set up the two page-sized scratch buffers used by WAL consistency
 * checking.  They are palloc'd instead of being declared as static arrays
 * because:
 * (1) most backend instantiations never need them, so static storage would
 *     be wasted; and
 * (2) palloc() hands back MAXALIGN'd memory, whereas a static char array
 *     carries no particular alignment guarantee.
 */
replay_image_masked = palloc(BLCKSZ);
primary_image_masked = palloc(BLCKSZ);
/*
 * Determine the recovery starting point from the backup_label file, if one
 * can be read.  This path takes priority over the control file.
 */
if (read_backup_label(&checkPointLoc, &backupEndRequired,
                      &backupFromStandby))
{
    List       *tablespaces = NIL;

    /*
     * Archive recovery was requested, and thanks to the backup label
     * file, we know how far we need to replay to reach consistency. Enter
     * archive recovery directly.
     */
    InArchiveRecovery = true;
    if (StandbyModeRequested)
        StandbyMode = true;

    /*
     * When a backup_label file is present, we want to roll forward from
     * the checkpoint it identifies, rather than using pg_control.
     */
    record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
    if (record != NULL)
    {
        memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
        wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
        ereport(DEBUG1,
                (errmsg_internal("checkpoint record is at %X/%X",
                                 LSN_FORMAT_ARGS(checkPointLoc))));
        InRecovery = true;      /* force recovery even if SHUTDOWNED */

        /*
         * Make sure that REDO location exists. This may not be the case
         * if there was a crash during an online backup, which left a
         * backup_label around that references a WAL segment that's
         * already been archived.
         *
         * In that case the checkpoint record itself was readable but the
         * record at its redo pointer is not; fail with a hint telling the
         * user how to proceed (including when removing backup_label is
         * appropriate).
         */
        if (checkPoint.redo < checkPointLoc)
        {
            XLogBeginRead(xlogreader, checkPoint.redo);
            if (!ReadRecord(xlogreader, LOG, false))
                ereport(FATAL,
                        (errmsg("could not find redo location referenced by checkpoint record"),
                         errhint("If you are restoring from a backup,touch \"%s/recovery.signal\" and add required recovery options.\n"
                                 "If you are not restoring from a backup,try removing the file \"%s/backup_label\".\n"
                                 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
                                 DataDir, DataDir, DataDir)));
        }
    }
    else
    {
        /*
         * The hint contains three %s placeholders, so DataDir must be
         * supplied three times; passing fewer arguments than conversions is
         * undefined behavior.
         */
        ereport(FATAL,
                (errmsg("could not locate required checkpoint record"),
                 errhint("If you are restoring from a backup,touch \"%s/recovery.signal\" and add required recovery options.\n"
                         "If you are not restoring from a backup,try removing the file \"%s/backup_label\".\n"
                         "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
                         DataDir, DataDir, DataDir)));
        wasShutdown = false;    /* keep compiler quiet */
    }

    /* read the tablespace_map file if present and create symlinks. */
    if (read_tablespace_map(&tablespaces))
    {
        ListCell   *lc;

        foreach(lc, tablespaces)
        {
            tablespaceinfo *ti = lfirst(lc);
            char       *linkloc;

            linkloc = psprintf("pg_tblspc/%s", ti->oid);

            /*
             * Remove the existing symlink if any and Create the symlink
             * under PGDATA.
             */
            remove_tablespace_symlink(linkloc);

            if (symlink(ti->path, linkloc) < 0)
                ereport(ERROR,
                        (errcode_for_file_access(),
                         errmsg("could not create symbolic link \"%s\": %m",
                                linkloc)));

            /* The list entry is no longer needed once its link exists. */
            pfree(ti->oid);
            pfree(ti->path);
            pfree(ti);
        }

        /* set flag to delete it later */
        haveTblspcMap = true;
    }

    /* set flag to delete it later */
    haveBackupLabel = true;
}
else
{
/*
* If tablespace_map file is present without backup_label file,there
* is no use of such file. There is no harm in retaining it,but it
* is better to get rid of the map file so that we don't have any
* redundant file in data directory and it will avoid any sort of
* confusion. It seems prudent though to just rename the file out of
* the way rather than delete it completely,also we ignore any error
* that occurs in rename operation as even if map file is present
* without backup_label file,it is harmless.
*/
/*
 * A tablespace_map file exists although no backup_label was read: move it
 * out of the way by renaming it to TABLESPACE_MAP_OLD.  A failed rename is
 * only logged, never treated as an error.
 */
if (stat(TABLESPACE_MAP, &st) == 0)
{
    /* Clear any leftover from an earlier rename first. */
    unlink(TABLESPACE_MAP_OLD);
    if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
        ereport(LOG,
                (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
                        TABLESPACE_MAP, BACKUP_LABEL_FILE),
                 errdetail("File \"%s\" was renamed to \"%s\".",
                           TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
    else
        /*
         * Same primary message as the success case; the detail reports the
         * rename failure.  Both %s conversions need an argument (source and
         * target names).
         */
        ereport(LOG,
                (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
                        TABLESPACE_MAP, BACKUP_LABEL_FILE),
                 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
                           TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
}
从ControlFile文件中确认恢复起点执行逻辑:
/*
 * Get the last valid checkpoint record, using the location stored in the
 * control file.
 */
checkPointLoc = ControlFile->checkPoint;
RedoStartLSN = ControlFile->checkPointCopy.redo;

/*
 * ReadCheckpointRecord needs the checkpoint location as its second
 * argument.  NOTE(review): the third argument is 1 here and 0 on the
 * backup_label path; presumably it identifies where the location came
 * from -- confirm against ReadCheckpointRecord's definition.
 */
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
if (record != NULL)
{
    ereport(DEBUG1,
            (errmsg_internal("checkpoint record is at %X/%X",
                             LSN_FORMAT_ARGS(checkPointLoc))));
}
else
{
    /*
     * We used to attempt to go back to a secondary checkpoint record
     * here, but only when not in standby mode. We now just fail if we
     * can't read the last checkpoint because this allows us to
     * simplify processing around checkpoints.
     */
    ereport(PANIC,
            (errmsg("could not locate a valid checkpoint record")));
}
原文地址:https://blog.csdn.net/qq_52668274/article/details/127170672
版权声明:本文内容由互联网用户自发贡献,该文观点与技术仅代表作者本人。本站仅提供信息存储空间服务,不拥有所有权,不承担相关法律责任。如发现本站有涉嫌侵权/违法违规的内容, 请发送邮件至 dio@foxmail.com 举报,一经查实,本站将立刻删除。