自顶向下redis4.0(5)持久化

redis4.0的持久化

目录

简介

虽然redis是内存数据库,但它也提供了持久化的功能。其中rdb持久化可以定时备份用于回滚,而aof持久化则更接近数据库最新的状态,服务器重启后可以恢复至最新的状态。两者数据备份的粒度不同,rdb将整个数据库备份,aof持久化粒度更为小,但生成的文件更大。如果有多个线程同时向磁盘写入,那么会增大磁盘的压力,最终导致线程阻塞,因此redis在同一时间只允许一个持久化向磁盘写入数据。redis默认配置关闭aof持久化,开启rdb后台持久化。由于aof持久化数据较新,所以如果开启了aof持久化,redis启动时会选择加载aof文件中的数据。

# 默认关闭aof
appendonly no
#   after 900 sec (15 min) if at least 1 key changed
#   after 300 sec (5 min) if at least 10 keys changed
#   after 60 sec if at least 10000 keys changed
save 900 1
save 300 10
save 60 10000

正文

rdb持久化

redis允许save命令和bgsave命令,还支持配置定期保存rdb数据。

save命令

save命令使用saveCommand函数直接调用rdbSave函数在主线程保存数据,线上模式不建议使用。在进一步介绍之前,我们先看一眼相关的成员。

/* Excerpt of struct redisServer: only the fields that track RDB
 * persistence state are shown here. */
struct redisServer {
    /* RDB persistence */
    pid_t rdb_child_pid;            /* PID of RDB saving child */
    char *rdb_filename;             /* Name of RDB file */
    long long dirty;                /* Changes to DB from the last rdb save */
    time_t lastsave;                /* Unix time of last successful save */
    int lastbgsave_status;          /* C_OK or C_ERR */
};

如果已经有rdb子进程在运行,则会直接返回。如果没有运行的子进程,则将数据存储到server.rdb_filename文件中,默认为dump.rdb。rdbSave函数会打开一个临时文件,向其写入数据后,刷新数据到磁盘,然后重命名这个临时文件为dump.rdb。然后重置server.dirty为0,设置lastsave时间。

/* SAVE command handler: dump the dataset to server.rdb_filename by calling
 * rdbSave() directly in the calling (main) thread.
 *
 * Replies with an error and does nothing if an RDB child is already running
 * (server.rdb_child_pid != -1). Otherwise replies shared.ok when rdbSave()
 * returns C_OK and shared.err when it does not, so the client always gets
 * an answer. */
void saveCommand(client *c) {
    if (server.rdb_child_pid != -1) {
        addReplyError(c,"Background save already in progress");
        return;
    }

    if (rdbSave(server.rdb_filename,NULL) == C_OK) {
        addReply(c,shared.ok);
    } else {
        addReply(c,shared.err);
    }
}

具体写入数据的操作位于rdbSaveRio,它会先写入rdb的版本,再写入一些辅助信息,然后将每个db中的数据写入,最后写入校验码。

bgsave命令

bgsave命令会调用fork函数开启子进程,在子进程中调用rdbSave函数。

与save命令相同,如果有正在运行的子进程在存储数据,则会返回错误提示。但如果使用bgsave schedule命令并且当前的子进程为aof,则可以延迟调用bgsave命令。

/* Excerpt of struct redisServer: the fields used to pass information from
 * a persistence child back to the parent. Unrelated fields are elided
 * with "...". */
struct redisServer {
    ...
    /* RDB persistence */
    pid_t rdb_child_pid;            /* PID of RDB saving child */
    int child_info_pipe[2];         /* Pipe used to write the child_info_data. */
    struct {
        int process_type;           /* AOF or RDB child? */
        size_t cow_size;            /* Copy on write size. */
        unsigned long long magic;   /* Magic value to make sure data is valid. */
    } child_info_data;
    ...
};

后台启动rdb就是调用fork函数创建一个子进程,在子进程中调用rdbSave函数。在调用fork函数之前,redis会先创建一个管道用于子进程向父进程的单向通信,fork后的子进程会和父进程共享文件描述符,所以可以通过管道文件描述符单向通信。在子进程存储db数据的时候,会修改内存空间,造成copy-on-write,占用额外的内存空间,数据存储完成后,子进程会向父进程发送额外创建的内存大小。

fork(2)
*  The child inherits copies of the parent's set of open file
 descriptors.  Each file descriptor in the child refers to the same
 open file description (see open(2)) as the corresponding file
 descriptor in the parent.  This means that the two file
 descriptors share open file status flags, file offset, and signal-
 driven I/O attributes (see the description of F_SETOWN and
 F_SETSIG in fcntl(2)).
/* Start a background RDB save: fork a child that writes the dataset to
 * `filename` with rdbSave() while the parent keeps running.
 *
 * filename: path of the RDB file to produce.
 * rsi:      save info forwarded unchanged to rdbSave().
 *
 * Returns C_ERR if an AOF or RDB child is already running or if fork()
 * fails; returns C_OK in the parent once the child has been started.
 * The child never returns from this function: it leaves through
 * exitFromChild(). */
int rdbSaveBackground(char *filename, rdbSaveInfo *rsi) {
    pid_t childpid;

    if (server.aof_child_pid != -1 || server.rdb_child_pid != -1) return C_ERR;

    /* Snapshot the change counter and the attempt time before forking:
     * the completion handler subtracts dirty_before_bgsave from
     * server.dirty, and the periodic save check reads lastbgsave_try. */
    server.dirty_before_bgsave = server.dirty;
    server.lastbgsave_try = time(NULL);

    openChildInfoPipe(); /* Create the child -> parent info pipe. */

    if ((childpid = fork()) == 0) {
        /* Child */
        int retval;

        /* File descriptors are inherited across fork(), so close the
         * listening sockets in the child. */
        closeListeningSockets(0);
        redisSetProcTitle("redis-rdb-bgsave");
        retval = rdbSave(filename,rsi);
        if (retval == C_OK) {
            size_t private_dirty = zmalloc_get_private_dirty(-1);

            /* Report the private dirty memory size to the parent. */
            server.child_info_data.cow_size = private_dirty;
            sendChildInfo(CHILD_INFO_TYPE_RDB);
        }
        exitFromChild((retval == C_OK) ? 0 : 1);
    } else {
        /* Parent */
        if (childpid == -1) {
            /* fork() failed: no child exists, so release the pipe and
             * report the failure instead of recording -1 as a child. */
            closeChildInfoPipe();
            server.lastbgsave_status = C_ERR;
            serverLog(LL_WARNING,"Can't save in background: fork: %s",
                strerror(errno));
            return C_ERR;
        }
        serverLog(LL_NOTICE,"Background saving started by pid %d",childpid);
        server.rdb_save_time_start = time(NULL);
        server.rdb_child_pid = childpid;
        server.rdb_child_type = RDB_CHILD_TYPE_DISK;
        updateDictResizePolicy();
        return C_OK;
    }
    return C_OK; /* unreached */
}

父进程此时记录子进程id rdb_child_pid和类型。然后在之前注册的时间事件serverCron中检查子进程是否结束。wait3等待子进程的状态发生改变,可能是运行结束了,也可能是被信号暂停或者恢复了。如果子进程已经结束则接受子进程通过管道发送的信息,也就是Copy-On-Write的大小。然后关闭管道。

/* Excerpt of serverCron(): detect a persistence child that has changed
 * state and run the matching completion handler. Unrelated parts of the
 * function are elided with "...". */
int serverCron(struct aeEventLoop *eventLoop, long long id, void *clientData) {
    ...
    // A child process is currently dumping the full dataset
    if (server.rdb_child_pid != -1|| server.aof_child_pid != -1 ||
        ldbPendingChildren())
    {
        int statloc;
        pid_t pid;

        /* Non-blocking check (WNOHANG); the body runs only when wait3()
         * returns a non-zero value. */
        if ((pid = wait3(&statloc,WNOHANG,NULL)) != 0) {
            int exitcode = WEXITSTATUS(statloc);
            int bysignal = 0;

            /* bysignal stays 0 unless the child was terminated by a signal. */
            if (WIFSIGNALED(statloc)) bysignal = WTERMSIG(statloc);
 			
            if (pid == server.rdb_child_pid) {
                backgroundSaveDoneHandler(exitcode,bysignal);
                /* Read the child's info only after a clean exit. */
                if (!bysignal && exitcode == 0) receiveChildInfo();
            } 
            updateDictResizePolicy();
            closeChildInfoPipe();
        }
    }
}

由于我们此处是RDB存储(与之相对的是AOF重写,但如果开启RDB格式存储,两者几乎等价),backgroundSaveDoneHandler会调用backgroundSaveDoneHandlerDisk函数。这里会将rdb_child_pid等数据重置,如果保存成功,则更新server.dirty以及lastsave。

/* Bookkeeping run in the parent after a disk-targeted RDB child has ended.
 *
 * exitcode: exit status of the child.
 * bysignal: non-zero when the child was terminated by a signal (the value
 *           is the signal number), 0 otherwise.
 *
 * Three outcomes are handled: killed by a signal, clean exit, and exit
 * with an error. Afterwards the child-tracking fields are reset and the
 * duration of the save is recorded. */
void backgroundSaveDoneHandlerDisk(int exitcode, int bysignal) {
    if (bysignal) {
        mstime_t latency;

        serverLog(LL_WARNING,
            "Background saving terminated by signal %d", bysignal);
        /* Remove the child's temp file and sample how long that took. */
        latencyStartMonitor(latency);
        rdbRemoveTempFile(server.rdb_child_pid);
        latencyEndMonitor(latency);
        latencyAddSampleIfNeeded("rdb-unlink-temp-file",latency);
        /* SIGUSR1 is whitelisted, so we have a way to kill a child without
         * triggering an error condition. */
        if (bysignal != SIGUSR1)
            server.lastbgsave_status = C_ERR;
    } else if (exitcode == 0) {
        serverLog(LL_NOTICE,
            "Background saving terminated with success");
        /* Keep only the changes counted after the bgsave snapshot. */
        server.dirty -= server.dirty_before_bgsave;
        server.lastsave = time(NULL);
        server.lastbgsave_status = C_OK;
    } else {
        serverLog(LL_WARNING, "Background saving error");
        server.lastbgsave_status = C_ERR;
    }

    /* No RDB child is running any more. */
    server.rdb_child_pid = -1;
    server.rdb_child_type = RDB_CHILD_TYPE_NONE;
    server.rdb_save_time_last = time(NULL)-server.rdb_save_time_start;
    server.rdb_save_time_start = -1;
}

rdb定期保存数据

redis默认添加3个定期保存参数,如果使用redis.conf,则会清空默认配置使用redis.conf配置。如果redis.conf中没有配置,则不会使用rdb定期保存。

appendServerSaveParams(60*60,1);  /* save after 1 hour and 1 change */
appendServerSaveParams(300,100);  /* save after 5 minutes and 100 changes */
appendServerSaveParams(60,10000); /* save after 1 minute and 10000 changes */

同样是在serverCron函数中,如果当前没有aof或者rdb子进程存储数据,则会检测条件是否满足。如果(距离上一次写入的时间和数据变更的数量满足条件)并且(上一次写入成功或者距离上一次写入已经超过5秒钟,默认的CONFIG_BGSAVE_RETRY_DELAY值) ,则启动rdb序列化。

    /* Excerpt from inside serverCron(): when no persistence child is
     * running, walk the configured save points and start a background
     * RDB save for the first one whose conditions are met. */
    if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
        ldbPendingChildren())
    {
        ...
    } else {
        /* If there is not a background saving/rewrite in progress check if
         * we have to save/rewrite now. */
         for (j = 0; j < server.saveparamslen; j++) {
            struct saveparam *sp = server.saveparams+j;

            /* Save if we reached the given amount of changes,
             * the given amount of seconds, and if the latest bgsave was
             * successful or if, in case of an error, at least
             * CONFIG_BGSAVE_RETRY_DELAY seconds already elapsed. */
            if (server.dirty >= sp->changes &&
                server.unixtime-server.lastsave > sp->seconds &&
                (server.unixtime-server.lastbgsave_try >
                 CONFIG_BGSAVE_RETRY_DELAY || // value is 5
                 server.lastbgsave_status == C_OK))
            {
                serverLog(LL_NOTICE,"%d changes in %d seconds. Saving...",
                    sp->changes, (int)sp->seconds);
                rdbSaveInfo rsi, *rsiptr;
                rsiptr = rdbPopulateSaveInfo(&rsi);
                rdbSaveBackground(server.rdb_filename,rsiptr);
                /* At most one save is started per pass. */
                break;
            }
         }

         /* Trigger an AOF rewrite if needed. */
			...
    }

进程结束保存数据

redis正常关闭的情况下(接受客户端shutdown命令或者是收到SIGTERM信号),会调用prepareForShutdown函数。该函数会关闭正在存储的子进程。如果有配置定期存储rdb或者是关闭时有传入save参数,则会在主线程中调用rdbSave存储数据等,接着关闭进程。

可以看到在使用rdb保存数据之前,如果开启了AOF,那么redis会调用flushAppendOnlyFile强制将数据写入磁盘,并调用aof_fsync保证数据刷新。

/* Prepare the server for exiting: stop persistence children, flush and
 * fsync the AOF when AOF is not off, optionally write a final RDB
 * snapshot, remove the pid file, flush slave output buffers and close the
 * listening sockets.
 *
 * flags: SHUTDOWN_SAVE forces a final RDB save; SHUTDOWN_NOSAVE suppresses
 *        the save that configured save points would otherwise trigger.
 *
 * Returns C_ERR (shutdown must not proceed) when an initial AOF is still
 * being written or when the final RDB save fails; C_OK otherwise. */
int prepareForShutdown(int flags) {
    int save = flags & SHUTDOWN_SAVE;
    int nosave = flags & SHUTDOWN_NOSAVE;

    serverLog(LL_WARNING,"User requested shutdown...");

    /* Kill all the Lua debugger forked sessions. */
    ldbKillForkedSessions();

    /* Kill the saving child if there is a background saving in progress.
       We want to avoid race conditions, for instance our saving child may
       overwrite the synchronous saving did by SHUTDOWN. */
    if (server.rdb_child_pid != -1) {
        serverLog(LL_WARNING,"There is a child saving an .rdb. Killing it!");
        kill(server.rdb_child_pid,SIGUSR1);
        rdbRemoveTempFile(server.rdb_child_pid);
    }

    if (server.aof_state != AOF_OFF) {
        /* Kill the AOF saving child as the AOF we already have may be longer
         * but contains the full dataset anyway. */
        if (server.aof_child_pid != -1) {
            /* If we have AOF enabled but haven't written the AOF yet, don't
             * shutdown or else the dataset will be lost. */
            if (server.aof_state == AOF_WAIT_REWRITE) {
                serverLog(LL_WARNING, "Writing initial AOF, can't exit.");
                return C_ERR;
            }
            serverLog(LL_WARNING,
                "There is a child rewriting the AOF. Killing it!");
            kill(server.aof_child_pid,SIGUSR1);
        }
        /* Append only file: flush buffers and fsync() the AOF at exit */
        serverLog(LL_NOTICE,"Calling fsync() on the AOF file.");
        flushAppendOnlyFile(1);
        aof_fsync(server.aof_fd);
    }

    /* Create a new RDB file before exiting. */
    if ((server.saveparamslen > 0 && !nosave) || save) {
        serverLog(LL_NOTICE,"Saving the final RDB snapshot before exiting.");
        /* Snapshotting. Perform a SYNC SAVE and exit */
        rdbSaveInfo rsi, *rsiptr;
        rsiptr = rdbPopulateSaveInfo(&rsi);
        if (rdbSave(server.rdb_filename,rsiptr) != C_OK) {
            /* Ooops.. error saving! The best we can do is to continue
             * operating. Note that if there was a background saving process,
             * in the next cron() Redis will be notified that the background
             * saving aborted, handling special stuff like slaves pending for
             * synchronization... */
            serverLog(LL_WARNING,"Error trying to save the DB, can't exit.");
            return C_ERR;
        }
    }

    /* Remove the pid file if possible and needed. */
    if (server.daemonize || server.pidfile) {
        serverLog(LL_NOTICE,"Removing the pid file.");
        unlink(server.pidfile);
    }

    /* Best effort flush of slave output buffers, so that we hopefully
     * send them pending writes. */
    flushSlavesOutputBuffers();

    /* Close the listening sockets. Apparently this allows faster restarts. */
    closeListeningSockets(1);
    serverLog(LL_WARNING,"%s is now ready to exit, bye bye...",
        server.sentinel_mode ? "Sentinel" : "Redis");
    return C_OK;
}

aof持久化

数据缓冲区

上文已经提到,redis在解析客户端请求到client->argc和client->argv后会调用processCommand检查请求命令的条件是否满足,如果满足,则会调用call(client, CMD_CALL_FULL)。

/* Command call flags, see call() function.
 * Each basic flag occupies its own bit so that flags can be OR-ed. */
#define CMD_CALL_NONE 0
#define CMD_CALL_SLOWLOG (1<<0)
#define CMD_CALL_STATS (1<<1)
#define CMD_CALL_PROPAGATE_AOF (1<<2)
#define CMD_CALL_PROPAGATE_REPL (1<<3)
/* Both propagation targets: AOF and replication. */
#define CMD_CALL_PROPAGATE (CMD_CALL_PROPAGATE_AOF|CMD_CALL_PROPAGATE_REPL)
/* Every flag above combined. */
#define CMD_CALL_FULL (CMD_CALL_SLOWLOG | CMD_CALL_STATS | CMD_CALL_PROPAGATE)

在这里,我们观察一下CMD_CALL_FULL,此时我们只需要知道,该值包含CMD_CALL_PROPAGATE。在调用完命令后,redis会根据情况将命令追加到server.aof_buf中:如果数据有发生改动,命令没有禁止propagate,并且redis开启了aof,则会将命令追加到缓冲区。

/* Excerpt of call(): after the command has run, decide whether it must be
 * propagated to the AOF and/or the replication link and, if so, hand it
 * to propagate(). Parts of the real function (including the declaration
 * of `dirty`) are not shown in this excerpt. */
call(client *c, int flags) {
  	c->cmd->proc(c); // the command has been executed at this point

	/* Propagate the command into the AOF and replication link */
    if (flags & CMD_CALL_PROPAGATE && // flags is CMD_CALL_FULL here
        (c->flags & CLIENT_PREVENT_PROP) != CLIENT_PREVENT_PROP)
    {
        int propagate_flags = PROPAGATE_NONE;

        // The command changed some data
        if (dirty) propagate_flags |= (PROPAGATE_AOF|PROPAGATE_REPL);

        // Some commands force propagation, e.g. publishMessage
        if (c->flags & CLIENT_FORCE_REPL) propagate_flags |= PROPAGATE_REPL;
        if (c->flags & CLIENT_FORCE_AOF) propagate_flags |= PROPAGATE_AOF;

        // Some commands (e.g. spop) forbid propagation here and handle it
        // in other functions
        if (c->flags & CLIENT_PREVENT_REPL_PROP ||
            !(flags & CMD_CALL_PROPAGATE_REPL))
            propagate_flags &= ~PROPAGATE_REPL;
        if (c->flags & CLIENT_PREVENT_AOF_PROP ||
            !(flags & CMD_CALL_PROPAGATE_AOF))
            propagate_flags &= ~PROPAGATE_AOF;

        /* Call propagate() only if at least one of AOF / replication
         * propagation is needed. Note that modules commands handle replication
         * in an explicit way, so we never replicate them automatically. */
        if (propagate_flags != PROPAGATE_NONE && !(c->cmd->flags & CMD_MODULE))
            propagate(c->cmd,c->db->id,c->argv,c->argc,propagate_flags);
    }
  
}

/* Forward an executed command to its propagation targets.
 *
 * cmd, argv, argc: the command and its arguments.
 * dbid:            id of the database the command was executed against.
 * flags:           PROPAGATE_AOF and/or PROPAGATE_REPL.
 *
 * The AOF is fed only when PROPAGATE_AOF is requested and the AOF state is
 * not AOF_OFF; the slaves are fed whenever PROPAGATE_REPL is requested. */
void propagate(struct redisCommand *cmd, int dbid, robj **argv, int argc,
               int flags)
{
    int to_aof = (flags & PROPAGATE_AOF) && server.aof_state != AOF_OFF;
    int to_slaves = flags & PROPAGATE_REPL;

    if (to_aof) {
        feedAppendOnlyFile(cmd,dbid,argv,argc);
    }
    if (to_slaves) {
        replicationFeedSlaves(server.slaves,dbid,argv,argc);
    }
}

在追加命令之前,redis还会做一些处理,如果命令对应的db和上次追加命令的db不同,则插入select命令。如果是expire系列的命令,则全部切换成pexpireat命令。如果是setex命令,则拆分成set和pexpireat。如果此时没有子进程在重写,则写入到缓冲区,如果有子进程在重写,则尝试将数据发送给子进程。

/* Serialize an executed command and feed it to the AOF machinery.
 *
 * cmd:        the command that was executed.
 * dictid:     id of the database the command targeted.
 * argv, argc: the command arguments.
 *
 * The serialized form is built in a temporary sds buffer:
 *   - a SELECT is emitted first when dictid differs from
 *     server.aof_selected_db;
 *   - EXPIRE/PEXPIRE/EXPIREAT are rewritten as PEXPIREAT;
 *   - SETEX/PSETEX and SET with EX/PX are split into SET plus PEXPIREAT;
 *   - every other command is appended as is.
 * The buffer is appended to server.aof_buf when the AOF state is AOF_ON,
 * and handed to aofRewriteBufferAppend() when an AOF child exists. */
void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
    sds buf = sdsempty();
    robj *tmpargv[3];

    /* The DB this command was targeting is not the same as the last command
     * we appended. To issue a SELECT command is needed. */
    if (dictid != server.aof_selected_db) {
        char seldb[64];

        snprintf(seldb,sizeof(seldb),"%d",dictid);
        buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
            (unsigned long)strlen(seldb),seldb);
        server.aof_selected_db = dictid;
    }

    if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
        cmd->proc == expireatCommand) {
        /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
    } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
        /* Translate SETEX/PSETEX to SET and PEXPIREAT */
        tmpargv[0] = createStringObject("SET",3);
        tmpargv[1] = argv[1];
        tmpargv[2] = argv[3];
        buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
        decrRefCount(tmpargv[0]);
        buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
    } else if (cmd->proc == setCommand && argc > 3) {
        int i;
        robj *exarg = NULL, *pxarg = NULL;
        /* Translate SET [EX seconds][PX milliseconds] to SET and PEXPIREAT */
        buf = catAppendOnlyGenericCommand(buf,3,argv);
        /* Scan the options after the value for EX / PX and remember the
         * argument that follows each of them. */
        for (i = 3; i < argc; i ++) {
            if (!strcasecmp(argv[i]->ptr, "ex")) exarg = argv[i+1];
            if (!strcasecmp(argv[i]->ptr, "px")) pxarg = argv[i+1];
        }
        serverAssert(!(exarg && pxarg));
        if (exarg)
            buf = catAppendOnlyExpireAtCommand(buf,server.expireCommand,argv[1],
                                               exarg);
        if (pxarg)
            buf = catAppendOnlyExpireAtCommand(buf,server.pexpireCommand,argv[1],
                                               pxarg);
    } else {

        buf = catAppendOnlyGenericCommand(buf,argc,argv);
    }

    /* Append to the AOF buffer. This will be flushed on disk just before
     * of re-entering the event loop, so before the client will get a
     * positive reply about the operation performed. */
    if (server.aof_state == AOF_ON)
        server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));

    /* If a background append only file rewriting is in progress we want to
     * accumulate the differences between the child DB and the current one
     * in a buffer, so that when the child process will do its work we
     * can append the differences to the new append only file. */
    if (server.aof_child_pid != -1)
        aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));

    sdsfree(buf);
}

刷新数据到磁盘

appendonly no #关闭aof


# 开启aof后生效
# appendfsync always  #aof 磁盘刷新策略
appendfsync everysec
# appendfsync no

redis默认关闭aof,如果关闭aof,server.aof_buf不会包含任何数据,只有开启了aof,也就是appendonly yes,才会往aof中写入数据。

在配置appendonly yes之后,appendfsync配置才会生效,redis默认配置为everysec,也就是每秒尝试后台线程刷新数据到磁盘,但写入数据还是主线程写入的,只要有数据且没有子线程在写入数据,就会写入数据。

redis刷新磁盘的操作也放在beforeSleep中处理。如果读者看过该系列之前的文章,应该记得redis返回客户端数据并不是直接发送给客户端,而是先将数据保存在client->buf中,然后在下一轮的aeMainLoop前的beforeSleep函数中调用handleClientsWithPendingWrites,将数据返回给客户端。这样做的目的是为了兼容appendfsync always的效果。所以在beforeSleep函数中,刷新函数flushAppendOnlyFile位于handleClientsWithPendingWrites之前。

/* Excerpt of beforeSleep(): the AOF buffer is written out before the
 * pending client replies are handled. Earlier parts of the function are
 * elided with "...". */
void beforeSleep(struct aeEventLoop *eventLoop) {
    ...
    /* Write the AOF buffer on disk */
    flushAppendOnlyFile(0);

    /* Handle writes with pending output buffers. */
    handleClientsWithPendingWrites();
}

刷新数据也有3种策略,下文会按照no、always、everysec的顺序结合源码讲解。

appendfsync no

在不保证刷新的策略下,redis调用flushAppendOnlyFile函数就等于直接调用aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));将数据写入系统缓冲区,但文件是否刷新到磁盘,以及什么时候刷新由系统决定。由于调用aofWrite可能会遇到磁盘空间不够的问题,redis会对比传入的数据长度和写入的数据长度,如果没有全部写入,为了保证下一次加载aof文件能够顺利,redis会裁剪掉部分写入的数据,等待下次重新写入。如果裁剪失败,则缩减aof_buf的长度,删除aof_buf中已经写入的部分,下次从最新的地方开始写入。并且如果写入系统缓冲区发生问题,则会在处理完问题后返回,而不会调用aof_fsync等刷新磁盘的函数。

/* Simplified view of flushAppendOnlyFile() showing only the write path:
 * write server.aof_buf to the AOF file descriptor, handle a short or
 * failed write, then reset the buffer. The fsync step that follows in
 * the real function is not shown here. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    mstime_t latency;

    if (sdslen(server.aof_buf) == 0) return;

    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));

    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
        static time_t last_write_error_log = 0;

        // Some data was written, but not all of it
        if (nwritten != -1) {
            // Cut off the part that was just written
            
            //todo what will happen if system ftruncate the file some part is still in the memory not yet flushed to the disk
            if (ftruncate(server.aof_fd, server.aof_current_size) != -1) {
                // Truncation succeeded: treat it as if nothing was written
                nwritten = -1;
            } 
            server.aof_last_write_errno = ENOSPC;
        }

        server.aof_last_write_status = C_ERR;
        // Truncation failed: keep the written prefix and drop it from
        // aof_buf so the next attempt continues after it
        if (nwritten > 0) {
            server.aof_current_size += nwritten;
            sdsrange(server.aof_buf,nwritten,-1);
        }
        return; /* We'll try again on the next call... */
        
    }
    
    server.aof_current_size += nwritten;

    /* Reuse the buffer when its total size (used + free) is below 4000
     * bytes; otherwise release it and start from an empty one. */
    if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
        sdsclear(server.aof_buf);
    } else {
        sdsfree(server.aof_buf);
        server.aof_buf = sdsempty();
    }
    // The fsync step follows here (omitted)
}
appendfsync always

always模式保证客户端接受返回数据后,redis一定已经将数据变化刷新回磁盘。采用该模式相当于redis在主线程中调用完aofWrite函数后,紧接着调用了aof_fsync函数,也就是fsync系列的函数。该模式迫使redis在主线程访问磁盘,会导致性能急剧下降。并且always的容错性较差,如果aofWrite没有将aof_buf中的全部数据写入,redis会立刻退出。

自顶向下redis4.0(5)持久化

appendfsync everysec

每秒刷新一次数据到磁盘是redis的默认配置,它会尝试每秒刷新文件到磁盘。由于flushAppendOnlyFile在serverCron中被调用,而serverCron的频率为10次/秒,所以redis默认写入数据的频率和刷新数据的频率为10:1。如果开启了aof_no_fsync_on_rewrite,则不会在有子进程全量存储的时候(包括rdb存储和aof重写)同步增量aof数据。

/* Excerpt of flushAppendOnlyFile() for the AOF_FSYNC_EVERYSEC policy:
 * optionally postpone the write while a background fsync is pending,
 * write server.aof_buf, then schedule a background fsync. The short-write
 * handling is elided with "...".
 *
 * force: when non-zero, the postponement logic is skipped. */
void flushAppendOnlyFile(int force) {
    ssize_t nwritten;
    int sync_in_progress = 0;
    mstime_t latency;

    if (sdslen(server.aof_buf) == 0) return;

    // Check whether a background fsync job is still pending
    if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
        sync_in_progress = bioPendingJobsOfType(BIO_AOF_FSYNC) != 0;

    if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
        if (sync_in_progress) {
            
            // A background fsync is in progress: postpone this write,
            // first for one call and then until 2 seconds have elapsed
            if (server.aof_flush_postponed_start == 0) {
                server.aof_flush_postponed_start = server.unixtime;
                return;
            } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
                return;
            }
            // Still pending after that: write anyway, which may block
        }
    }

    nwritten = aofWrite(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));

    server.aof_flush_postponed_start = 0;

    if (nwritten != (ssize_t)sdslen(server.aof_buf)) {
    	// Incomplete write: handled as described earlier, then return
        ...
        return; /* We'll try again on the next call... */
    } 

    // The data is now in the OS buffer; reset aof_buf
    sdsfree(server.aof_buf);
    server.aof_buf = sdsempty();

    /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
     * children doing I/O in the background. */
    if (server.aof_no_fsync_on_rewrite &&
        (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
            return;

    /* Schedule a background fsync only when server.unixtime has moved past
     * the last recorded fsync time and no fsync job is already pending. */
    if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
                server.unixtime > server.aof_last_fsync)) {
        if (!sync_in_progress) aof_background_fsync(server.aof_fd);
        server.aof_last_fsync = server.unixtime;
    }
}

redis在将数据写入磁盘时,会在主线程调用write函数,然后在另外的线程中调用fsync函数。这样能够让另外一个线程阻塞在IO上而不影响主线程的操作,但需要注意的是,如果另一个线程的fsync函数还没有返回,主线程就调用write函数,那么主线程也会阻塞在write函数上。[4]

《Redis开发与运维》[2]中提到

通过对AOF阻塞流程可以发现两个问题:

1) everysec配置最多可能丢失2秒数据, 不是1秒

2) 如果系统fsync缓慢, 将会导致Redis主线程阻塞影响效率。

实际上在redis4.0版本中,everysec配置最多可能丢失2秒加上一个aeMainLoop循环的时间。虽然《Redis开发与运维》指出了两个问题,但实际上它们是同一个问题,那就是磁盘写入速度无法承受过量的数据。在使用everysec配置时,如果发生这个问题,redis首先考虑主线程的运行,如果距离上一次延迟写入的时间戳aof_flush_postponed_start小于2秒,那么先跳过这一次的写入,避免阻塞以保证主线程能够处理请求。如果2秒后数据还没有从缓冲区刷新到磁盘,那么将会调用aofWrite导致主线程阻塞。

aof重写

aof重写的配置

aof重写可以输入指令触发bgrewriteaof,也可以配置条件触发重写。

auto-aof-rewrite-min-size 64mb
auto-aof-rewrite-percentage 100

仅仅这两个配置还不能了解清楚redis何时重写,我们还需要了解aof_current_size和aof_base_size。aof_current_size就是aof文件当前的大小,redis启动加载aof文件或者每次aof追加数据都会更新这个值,这个值并不会存储到磁盘中。aof_base_size也是同理,如果启动时有加载aof文件,那么aof_base_size的值就是aof文件的大小。

当aof_current_size > auto-aof-rewrite-min-size并且有配置auto-aof-rewrite-percentage时,如果(aof_current_size - aof_base_size) * 100 / aof_base_size >= percentage,则会自动重写。比如按照上文的配置,redis启动时加载的aof文件大小为100mb,那么aof_base_size就是100mb,当redis文件增长到200mb的时候(增长率为100%)就会自动重写。

但是会存在这样一种情况,redis文件增长到199mb的时候,刚好重启了,那么下次启动的时候,aof_base_size就和aof_current_size大小相等,想要触发自动重写,就要等到redis文件大小增长到400mb左右。如果数据增长地比较缓慢,或者是百分比配置较大。在触发重写之前,redis就关闭或者重启了。那么aof_base_size下次启动的时候会被刷新成aof_current_size的大小,导致可能永远无法触发自动重写。

aof重写的优先级

aof重写的优先级低于rdb,如果两者的触发条件同时满足,redis会优先处理rdb存储。观察源代码,可以发现rdb存储先于aof,如果rdb此处触发,即使aof触发重写的条件满足,因为server.rdb_child_pid将不为-1,导致无法进入aof重写。

serverCron(aeEventLoop*, longlong, void*) {
        if (server.rdb_child_pid != -1 || server.aof_child_pid != -1 ||
        ldbPendingChildren()) {
            //... 检查子进程是否结束并处理。
        } else {
            /* If there is not a background saving/rewrite in progress check if
             * we have to save/rewrite now. */
             for (j = 0; j < server.saveparamslen; j++) {
                ...
                //..处理rdb自动存储
             }

             /* Trigger an AOF rewrite if needed. */
             if (server.aof_state == AOF_ON &&
                 server.rdb_child_pid == -1 &&
                 server.aof_child_pid == -1 &&
                 server.aof_rewrite_perc &&
                 server.aof_current_size > server.aof_rewrite_min_size)
             {
                long long base = server.aof_rewrite_base_size ?
                                server.aof_rewrite_base_size : 1;
                long long growth = (server.aof_current_size*100/base) - 100;
                if (growth >= server.aof_rewrite_perc) {
                    serverLog(LL_NOTICE,"Starting automatic rewriting of AOF on %lld%% growth",growth);
                    rewriteAppendOnlyFileBackground();
                }
             }
        }
}

aof 重写的来龙去脉

rewriteAppendOnlyFileBackground会创建许多管道用于父子间通信。

  • childInfoPipe用于子进程向父进程提示有多少个Copy-On-Write内存。
  • aof_pipe_write_data_to_child用于父进程向aof重写子进程发送最近的数据变更。
  • aof_pipe_write_ack_to_parent和aof_pipe_write_ack_to_child用于等待彼此的确认消息。

并且注册了aof_pipe_read_ack_from_child的文件事件,当子进程向父进程发送中止请求的时候,就会调用aofChildPipeReadable函数。

int aofCreatePipes(void) {
    int fds[6] = {-1, -1, -1, -1, -1, -1};
    int j;

    if (pipe(fds) == -1) goto error; /* parent -> children data. */
    if (pipe(fds+2) == -1) goto error; /* children -> parent ack. */
    if (pipe(fds+4) == -1) goto error; /* parent -> children ack. */
    /* Parent -> children data is non blocking. */
    if (anetNonBlock(NULL,fds[0]) != ANET_OK) goto error;
    if (anetNonBlock(NULL,fds[1]) != ANET_OK) goto error;
    
    //注意:
    //这里注册了一个文件事件
    if (aeCreateFileEvent(server.el, fds[2], AE_READABLE, aofChildPipeReadable, NULL) == AE_ERR) goto error;

    server.aof_pipe_write_data_to_child = fds[1];
    server.aof_pipe_read_data_from_parent = fds[0];
    server.aof_pipe_write_ack_to_parent = fds[3];
    server.aof_pipe_read_ack_from_child = fds[2];
    server.aof_pipe_write_ack_to_child = fds[5];
    server.aof_pipe_read_ack_from_parent = fds[4];
    server.aof_stop_sending_diff = 0;
    return C_OK;

error:
    serverLog(LL_WARNING,"Error opening /setting AOF rewrite IPC pipes: %s",
        strerror(errno));
    for (j = 0; j < 6; j++) if(fds[j] != -1) close(fds[j]);
    return C_ERR;
}

父进程 创建完子进程后,父进程会更新aof_child_pid记录子进程id,虽然只更新了一个字段,但意味着已经开启了一个很有可能影响redis性能的任务。

子进程 先向临时文件写入当前数据库的内容,如果开启了aof_use_rdb_preamble(默认关闭,但建议开启),那么就会写入rdb数据,也就是db数据全量存储,否则按aof追加模式,全量存储db中的内容,接着刷新数据到磁盘,阻塞。

// Fragment from inside rewriteAppendOnlyFile(char* filename):
// write the current dataset to the temp file, then flush and fsync it.
if (server.aof_use_rdb_preamble) {
    /* Preamble enabled: dump the dataset through rdbSaveRio(). */
    int error;
    if (rdbSaveRio(&aof,&error,RDB_SAVE_AOF_PREAMBLE,NULL) == C_ERR) {
        errno = error;
        goto werr;
    }
} else {
    /* Otherwise dump the dataset through rewriteAppendOnlyFileRio(). */
    if (rewriteAppendOnlyFileRio(&aof) == C_ERR) goto werr;
}

/* Do an initial slow fsync here while the parent is still sending
 * data, in order to make the next final fsync faster. */
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;

父进程 在aof子进程等待数据刷新的时候,继续处理请求,并且将数据追加到server.aof_rewrite_buf_blocks,如果没有注册aof_pipe_write_data_to_child(是个管道,也就是文件描述符)文件事件的话,会将该管道和aofChildWriteDiffData绑定,如果管道可写,则会将server.aof_rewrite_buf_blocks中的数据写入管道发送给子进程。这样保证了父进程不会因为向管道写入数据而阻塞。

/* Append data to the AOF rewrite buffer, allocating new blocks if needed.
 *
 * s, len: the bytes to append and their length.
 *
 * After buffering, a writable file event with handler
 * aofChildWriteDiffData is installed on the parent -> child data pipe if
 * that descriptor has no events registered yet. The copy loop itself is
 * elided with "...". */
void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
    listNode *ln = listLast(server.aof_rewrite_buf_blocks);
    aofrwblock *block = ln ? ln->value : NULL;

    while(len) {
       ...
       // Keep copying the data into aof_rewrite_buf_blocks
    }

	// Register the file event
    if (aeGetFileEvents(server.el,server.aof_pipe_write_data_to_child) == 0) {
        aeCreateFileEvent(server.el, server.aof_pipe_write_data_to_child,
            AE_WRITABLE, aofChildWriteDiffData, NULL);
    }
}

子进程 刷新完之前的数据后,会在1秒内一直读取来自父进程的数据,将其写入到aof_child_diff中。然后向父进程发送停发数据请求。

// Fragment from inside rewriteAppendOnlyFile(char* filename):
// keep reading diffs from the parent until 1000 ms have elapsed or 20
// consecutive waits produced no data, then ask the parent to stop sending.
mstime_t start = mstime();
while(mstime()-start < 1000 && nodata < 20) {
    if (aeWait(server.aof_pipe_read_data_from_parent, AE_READABLE, 1) <= 0)
    {
        nodata++;
        continue;
    }
    nodata = 0; /* Start counting from zero, we stop on N *contiguous*
                       timeouts. */
    aofReadDiffFromParent();
}
/* Send "!" on the ack pipe to the parent. */
if (write(server.aof_pipe_write_ack_to_parent,"!",1) != 1) goto werr;

父进程 在aeMainLoop中检测到aof_pipe_read_ack_from_child管道可读事件(在创建管道的时候注册,请看前文),调用aofChildPipeReadable函数,将aof_stop_sending_diff设置为1,父进程不会再将aof_rewrite_buf_blocks缓冲区的内容写给子进程。并向子进程发送消息表示已经收到停发请求。

子进程 接受到父进程的同意后,最后读取一次数据,因为在父进程接受到停发请求前可能又发送了数据。至此,停发请求前的额外aof增量数据都已写入aof_child_diff。接着子进程将其写入文件并刷新,退出子进程。

/* Fragment from inside rewriteAppendOnlyFile(): wait for the parent's
 * "!" acknowledgement, read the diff one last time, append the
 * accumulated diff to the new AOF, then flush, fsync and close it. */
if (syncRead(server.aof_pipe_read_ack_from_parent,&byte,1,5000) != 1 ||
    byte != '!') goto werr;
aofReadDiffFromParent();
if (rioWrite(&aof,server.aof_child_diff,sdslen(server.aof_child_diff)) == 0)
    goto werr;

/* Make sure data will not remain on the OS's output buffers */
if (fflush(fp) == EOF) goto werr;
if (fsync(fileno(fp)) == -1) goto werr;
if (fclose(fp) == EOF) goto werr;

父进程 在serverCron函数中调用wait3检测到aof重写子进程的退出,会调用backgroundRewriteDoneHandler处理。
它首先会打开之前保存的临时文件,将中止请求后的追加数据aof_rewrite_buf_blocks写入文件(注意:虽然子进程之前请求中止发送数据了,但因为aof_child_pid直到现在还是保存的子进程的id,会一直接受追加数据到aof_rewrite_buf_blocks)。此时已经将所有的数据都写入aof临时文件。接下来就是将临时文件替换为aof保存的文件名。

rdb对比aof

官网有一篇文章《persistence》已经做了比对,在此不再赘述。

参考文献

[1]《Redis 源码》

[2]《Redis开发与运维》

[3]《Redis设计与实现》

[4]《fsync() on a different thread: apparently a useless trick》

[5]《private dirty memory》

[6]《pipe(2) - Linux man page》

[7]《wait3(2) - Linux man page》

[8]《ftruncate(3) - Linux man page》

[9]《Redis persistence》

发表评论

相关文章