Back to home page

Redis cross reference

 
 

    


0001 /*
0002  * Copyright (c) 2009-2012, Salvatore Sanfilippo <antirez at gmail dot com>
0003  * All rights reserved.
0004  *
0005  * Redistribution and use in source and binary forms, with or without
0006  * modification, are permitted provided that the following conditions are met:
0007  *
0008  *   * Redistributions of source code must retain the above copyright notice,
0009  *     this list of conditions and the following disclaimer.
0010  *   * Redistributions in binary form must reproduce the above copyright
0011  *     notice, this list of conditions and the following disclaimer in the
0012  *     documentation and/or other materials provided with the distribution.
0013  *   * Neither the name of Redis nor the names of its contributors may be used
0014  *     to endorse or promote products derived from this software without
0015  *     specific prior written permission.
0016  *
0017  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
0018  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
0019  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
0020  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
0021  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
0022  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
0023  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
0024  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
0025  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
0026  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
0027  * POSSIBILITY OF SUCH DAMAGE.
0028  */
0029 
0030 #include "redis.h"
0031 #include "bio.h"
0032 #include "rio.h"
0033 
0034 #include <signal.h>
0035 #include <fcntl.h>
0036 #include <sys/stat.h>
0037 #include <sys/types.h>
0038 #include <sys/time.h>
0039 #include <sys/resource.h>
0040 #include <sys/wait.h>
0041 
0042 void aofUpdateCurrentSize(void);
0043 
0044 /* ----------------------------------------------------------------------------
0045  * AOF rewrite buffer implementation.
0046  *
0047  * The following code implements a simple buffer used in order to accumulate
0048  * changes while the background process is rewriting the AOF file.
0049  *
0050  * We only need to append, but can't just use realloc with a large block
0051  * because 'huge' reallocs are not always handled as one could expect
0052  * (via remapping of pages at OS level) but may involve copying data.
0053  *
0054  * For this reason we use a list of blocks, every block is
0055  * AOF_RW_BUF_BLOCK_SIZE bytes.
0056  * ------------------------------------------------------------------------- */
0057 
/* Payload capacity, in bytes, of every block of the AOF rewrite buffer. */
#define AOF_RW_BUF_BLOCK_SIZE (1024*1024*10)    /* 10 MB per block */

/* One fixed-size chunk of the AOF rewrite buffer. */
typedef struct aofrwblock {
    unsigned long used;                 /* Bytes of 'buf' already filled. */
    unsigned long free;                 /* Bytes of 'buf' still available. */
    char buf[AOF_RW_BUF_BLOCK_SIZE];    /* Storage for the accumulated data. */
} aofrwblock;
0064 
0065 /* This function free the old AOF rewrite buffer if needed, and initialize
0066  * a fresh new one. It tests for server.aof_rewrite_buf_blocks equal to NULL
0067  * so can be used for the first initialization as well. */
0068 void aofRewriteBufferReset(void) {
0069     if (server.aof_rewrite_buf_blocks)
0070         listRelease(server.aof_rewrite_buf_blocks);
0071 
0072     server.aof_rewrite_buf_blocks = listCreate();
0073     listSetFreeMethod(server.aof_rewrite_buf_blocks,zfree);
0074 }
0075 
0076 /* Return the current size of the AOF rerwite buffer. */
0077 unsigned long aofRewriteBufferSize(void) {
0078     listNode *ln = listLast(server.aof_rewrite_buf_blocks);
0079     aofrwblock *block = ln ? ln->value : NULL;
0080 
0081     if (block == NULL) return 0;
0082     unsigned long size =
0083         (listLength(server.aof_rewrite_buf_blocks)-1) * AOF_RW_BUF_BLOCK_SIZE;
0084     size += block->used;
0085     return size;
0086 }
0087 
0088 /* Append data to the AOF rewrite buffer, allocating new blocks if needed. */
0089 void aofRewriteBufferAppend(unsigned char *s, unsigned long len) {
0090     listNode *ln = listLast(server.aof_rewrite_buf_blocks);
0091     aofrwblock *block = ln ? ln->value : NULL;
0092 
0093     while(len) {
0094         /* If we already got at least an allocated block, try appending
0095          * at least some piece into it. */
0096         if (block) {
0097             unsigned long thislen = (block->free < len) ? block->free : len;
0098             if (thislen) {  /* The current block is not already full. */
0099                 memcpy(block->buf+block->used, s, thislen);
0100                 block->used += thislen;
0101                 block->free -= thislen;
0102                 s += thislen;
0103                 len -= thislen;
0104             }
0105         }
0106 
0107         if (len) { /* First block to allocate, or need another block. */
0108             int numblocks;
0109 
0110             block = zmalloc(sizeof(*block));
0111             block->free = AOF_RW_BUF_BLOCK_SIZE;
0112             block->used = 0;
0113             listAddNodeTail(server.aof_rewrite_buf_blocks,block);
0114 
0115             /* Log every time we cross more 10 or 100 blocks, respectively
0116              * as a notice or warning. */
0117             numblocks = listLength(server.aof_rewrite_buf_blocks);
0118             if (((numblocks+1) % 10) == 0) {
0119                 int level = ((numblocks+1) % 100) == 0 ? REDIS_WARNING :
0120                                                          REDIS_NOTICE;
0121                 redisLog(level,"Background AOF buffer size: %lu MB",
0122                     aofRewriteBufferSize()/(1024*1024));
0123             }
0124         }
0125     }
0126 }
0127 
0128 /* Write the buffer (possibly composed of multiple blocks) into the specified
0129  * fd. If no short write or any other error happens -1 is returned,
0130  * otherwise the number of bytes written is returned. */
0131 ssize_t aofRewriteBufferWrite(int fd) {
0132     listNode *ln;
0133     listIter li;
0134     ssize_t count = 0;
0135 
0136     listRewind(server.aof_rewrite_buf_blocks,&li);
0137     while((ln = listNext(&li))) {
0138         aofrwblock *block = listNodeValue(ln);
0139         ssize_t nwritten;
0140 
0141         if (block->used) {
0142             nwritten = write(fd,block->buf,block->used);
0143             if (nwritten != block->used) {
0144                 if (nwritten == 0) errno = EIO;
0145                 return -1;
0146             }
0147             count += nwritten;
0148         }
0149     }
0150     return count;
0151 }
0152 
0153 /* ----------------------------------------------------------------------------
0154  * AOF file implementation
0155  * ------------------------------------------------------------------------- */
0156 
0157 /* Starts a background task that performs fsync() against the specified
0158  * file descriptor (the one of the AOF file) in another thread. */
0159 void aof_background_fsync(int fd) {
0160     bioCreateBackgroundJob(REDIS_BIO_AOF_FSYNC,(void*)(long)fd,NULL,NULL);
0161 }
0162 
0163 /* Called when the user switches from "appendonly yes" to "appendonly no"
0164  * at runtime using the CONFIG command. */
0165 void stopAppendOnly(void) {
0166     redisAssert(server.aof_state != REDIS_AOF_OFF);
0167     flushAppendOnlyFile(1);
0168     aof_fsync(server.aof_fd);
0169     close(server.aof_fd);
0170 
0171     server.aof_fd = -1;
0172     server.aof_selected_db = -1;
0173     server.aof_state = REDIS_AOF_OFF;
0174     /* rewrite operation in progress? kill it, wait child exit */
0175     if (server.aof_child_pid != -1) {
0176         int statloc;
0177 
0178         redisLog(REDIS_NOTICE,"Killing running AOF rewrite child: %ld",
0179             (long) server.aof_child_pid);
0180         if (kill(server.aof_child_pid,SIGUSR1) != -1)
0181             wait3(&statloc,0,NULL);
0182         /* reset the buffer accumulating changes while the child saves */
0183         aofRewriteBufferReset();
0184         aofRemoveTempFile(server.aof_child_pid);
0185         server.aof_child_pid = -1;
0186         server.aof_rewrite_time_start = -1;
0187     }
0188 }
0189 
0190 /* Called when the user switches from "appendonly no" to "appendonly yes"
0191  * at runtime using the CONFIG command. */
0192 int startAppendOnly(void) {
0193     server.aof_last_fsync = server.unixtime;
0194     server.aof_fd = open(server.aof_filename,O_WRONLY|O_APPEND|O_CREAT,0644);
0195     redisAssert(server.aof_state == REDIS_AOF_OFF);
0196     if (server.aof_fd == -1) {
0197         redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't open the append only file: %s",strerror(errno));
0198         return REDIS_ERR;
0199     }
0200     if (rewriteAppendOnlyFileBackground() == REDIS_ERR) {
0201         close(server.aof_fd);
0202         redisLog(REDIS_WARNING,"Redis needs to enable the AOF but can't trigger a background AOF rewrite operation. Check the above logs for more info about the error.");
0203         return REDIS_ERR;
0204     }
0205     /* We correctly switched on AOF, now wait for the rerwite to be complete
0206      * in order to append data on disk. */
0207     server.aof_state = REDIS_AOF_WAIT_REWRITE;
0208     return REDIS_OK;
0209 }
0210 
0211 /* Write the append only file buffer on disk.
0212  *
0213  * Since we are required to write the AOF before replying to the client,
0214  * and the only way the client socket can get a write is entering when the
0215  * the event loop, we accumulate all the AOF writes in a memory
0216  * buffer and write it on disk using this function just before entering
0217  * the event loop again.
0218  *
0219  * About the 'force' argument:
0220  *
0221  * When the fsync policy is set to 'everysec' we may delay the flush if there
0222  * is still an fsync() going on in the background thread, since for instance
0223  * on Linux write(2) will be blocked by the background fsync anyway.
0224  * When this happens we remember that there is some aof buffer to be
0225  * flushed ASAP, and will try to do that in the serverCron() function.
0226  *
0227  * However if force is set to 1 we'll write regardless of the background
0228  * fsync. */
0229 void flushAppendOnlyFile(int force) {
0230     ssize_t nwritten;
0231     int sync_in_progress = 0;
0232 
0233     if (sdslen(server.aof_buf) == 0) return;
0234 
0235     if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
0236         sync_in_progress = bioPendingJobsOfType(REDIS_BIO_AOF_FSYNC) != 0;
0237 
0238     if (server.aof_fsync == AOF_FSYNC_EVERYSEC && !force) {
0239         /* With this append fsync policy we do background fsyncing.
0240          * If the fsync is still in progress we can try to delay
0241          * the write for a couple of seconds. */
0242         if (sync_in_progress) {
0243             if (server.aof_flush_postponed_start == 0) {
0244                 /* No previous write postponinig, remember that we are
0245                  * postponing the flush and return. */
0246                 server.aof_flush_postponed_start = server.unixtime;
0247                 return;
0248             } else if (server.unixtime - server.aof_flush_postponed_start < 2) {
0249                 /* We were already waiting for fsync to finish, but for less
0250                  * than two seconds this is still ok. Postpone again. */
0251                 return;
0252             }
0253             /* Otherwise fall trough, and go write since we can't wait
0254              * over two seconds. */
0255             server.aof_delayed_fsync++;
0256             redisLog(REDIS_NOTICE,"Asynchronous AOF fsync is taking too long (disk is busy?). Writing the AOF buffer without waiting for fsync to complete, this may slow down Redis.");
0257         }
0258     }
0259     /* If you are following this code path, then we are going to write so
0260      * set reset the postponed flush sentinel to zero. */
0261     server.aof_flush_postponed_start = 0;
0262 
0263     /* We want to perform a single write. This should be guaranteed atomic
0264      * at least if the filesystem we are writing is a real physical one.
0265      * While this will save us against the server being killed I don't think
0266      * there is much to do about the whole server stopping for power problems
0267      * or alike */
0268     nwritten = write(server.aof_fd,server.aof_buf,sdslen(server.aof_buf));
0269     if (nwritten != (signed)sdslen(server.aof_buf)) {
0270         /* Ooops, we are in troubles. The best thing to do for now is
0271          * aborting instead of giving the illusion that everything is
0272          * working as expected. */
0273         if (nwritten == -1) {
0274             redisLog(REDIS_WARNING,"Exiting on error writing to the append-only file: %s",strerror(errno));
0275         } else {
0276             redisLog(REDIS_WARNING,"Exiting on short write while writing to "
0277                                    "the append-only file: %s (nwritten=%ld, "
0278                                    "expected=%ld)",
0279                                    strerror(errno),
0280                                    (long)nwritten,
0281                                    (long)sdslen(server.aof_buf));
0282 
0283             if (ftruncate(server.aof_fd, server.aof_current_size) == -1) {
0284                 redisLog(REDIS_WARNING, "Could not remove short write "
0285                          "from the append-only file.  Redis may refuse "
0286                          "to load the AOF the next time it starts.  "
0287                          "ftruncate: %s", strerror(errno));
0288             }
0289         }
0290         exit(1);
0291     }
0292     server.aof_current_size += nwritten;
0293 
0294     /* Re-use AOF buffer when it is small enough. The maximum comes from the
0295      * arena size of 4k minus some overhead (but is otherwise arbitrary). */
0296     if ((sdslen(server.aof_buf)+sdsavail(server.aof_buf)) < 4000) {
0297         sdsclear(server.aof_buf);
0298     } else {
0299         sdsfree(server.aof_buf);
0300         server.aof_buf = sdsempty();
0301     }
0302 
0303     /* Don't fsync if no-appendfsync-on-rewrite is set to yes and there are
0304      * children doing I/O in the background. */
0305     if (server.aof_no_fsync_on_rewrite &&
0306         (server.aof_child_pid != -1 || server.rdb_child_pid != -1))
0307             return;
0308 
0309     /* Perform the fsync if needed. */
0310     if (server.aof_fsync == AOF_FSYNC_ALWAYS) {
0311         /* aof_fsync is defined as fdatasync() for Linux in order to avoid
0312          * flushing metadata. */
0313         aof_fsync(server.aof_fd); /* Let's try to get this data on the disk */
0314         server.aof_last_fsync = server.unixtime;
0315     } else if ((server.aof_fsync == AOF_FSYNC_EVERYSEC &&
0316                 server.unixtime > server.aof_last_fsync)) {
0317         if (!sync_in_progress) aof_background_fsync(server.aof_fd);
0318         server.aof_last_fsync = server.unixtime;
0319     }
0320 }
0321 
0322 sds catAppendOnlyGenericCommand(sds dst, int argc, robj **argv) {
0323     char buf[32];
0324     int len, j;
0325     robj *o;
0326 
0327     buf[0] = '*';
0328     len = 1+ll2string(buf+1,sizeof(buf)-1,argc);
0329     buf[len++] = '\r';
0330     buf[len++] = '\n';
0331     dst = sdscatlen(dst,buf,len);
0332 
0333     for (j = 0; j < argc; j++) {
0334         o = getDecodedObject(argv[j]);
0335         buf[0] = '$';
0336         len = 1+ll2string(buf+1,sizeof(buf)-1,sdslen(o->ptr));
0337         buf[len++] = '\r';
0338         buf[len++] = '\n';
0339         dst = sdscatlen(dst,buf,len);
0340         dst = sdscatlen(dst,o->ptr,sdslen(o->ptr));
0341         dst = sdscatlen(dst,"\r\n",2);
0342         decrRefCount(o);
0343     }
0344     return dst;
0345 }
0346 
0347 /* Create the sds representation of an PEXPIREAT command, using
0348  * 'seconds' as time to live and 'cmd' to understand what command
0349  * we are translating into a PEXPIREAT.
0350  *
0351  * This command is used in order to translate EXPIRE and PEXPIRE commands
0352  * into PEXPIREAT command so that we retain precision in the append only
0353  * file, and the time is always absolute and not relative. */
0354 sds catAppendOnlyExpireAtCommand(sds buf, struct redisCommand *cmd, robj *key, robj *seconds) {
0355     long long when;
0356     robj *argv[3];
0357 
0358     /* Make sure we can use strtol */
0359     seconds = getDecodedObject(seconds);
0360     when = strtoll(seconds->ptr,NULL,10);
0361     /* Convert argument into milliseconds for EXPIRE, SETEX, EXPIREAT */
0362     if (cmd->proc == expireCommand || cmd->proc == setexCommand ||
0363         cmd->proc == expireatCommand)
0364     {
0365         when *= 1000;
0366     }
0367     /* Convert into absolute time for EXPIRE, PEXPIRE, SETEX, PSETEX */
0368     if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
0369         cmd->proc == setexCommand || cmd->proc == psetexCommand)
0370     {
0371         when += mstime();
0372     }
0373     decrRefCount(seconds);
0374 
0375     argv[0] = createStringObject("PEXPIREAT",9);
0376     argv[1] = key;
0377     argv[2] = createStringObjectFromLongLong(when);
0378     buf = catAppendOnlyGenericCommand(buf, 3, argv);
0379     decrRefCount(argv[0]);
0380     decrRefCount(argv[2]);
0381     return buf;
0382 }
0383 
0384 void feedAppendOnlyFile(struct redisCommand *cmd, int dictid, robj **argv, int argc) {
0385     sds buf = sdsempty();
0386     robj *tmpargv[3];
0387 
0388     /* The DB this command was targeting is not the same as the last command
0389      * we appendend. To issue a SELECT command is needed. */
0390     if (dictid != server.aof_selected_db) {
0391         char seldb[64];
0392 
0393         snprintf(seldb,sizeof(seldb),"%d",dictid);
0394         buf = sdscatprintf(buf,"*2\r\n$6\r\nSELECT\r\n$%lu\r\n%s\r\n",
0395             (unsigned long)strlen(seldb),seldb);
0396         server.aof_selected_db = dictid;
0397     }
0398 
0399     if (cmd->proc == expireCommand || cmd->proc == pexpireCommand ||
0400         cmd->proc == expireatCommand) {
0401         /* Translate EXPIRE/PEXPIRE/EXPIREAT into PEXPIREAT */
0402         buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
0403     } else if (cmd->proc == setexCommand || cmd->proc == psetexCommand) {
0404         /* Translate SETEX/PSETEX to SET and PEXPIREAT */
0405         tmpargv[0] = createStringObject("SET",3);
0406         tmpargv[1] = argv[1];
0407         tmpargv[2] = argv[3];
0408         buf = catAppendOnlyGenericCommand(buf,3,tmpargv);
0409         decrRefCount(tmpargv[0]);
0410         buf = catAppendOnlyExpireAtCommand(buf,cmd,argv[1],argv[2]);
0411     } else {
0412         /* All the other commands don't need translation or need the
0413          * same translation already operated in the command vector
0414          * for the replication itself. */
0415         buf = catAppendOnlyGenericCommand(buf,argc,argv);
0416     }
0417 
0418     /* Append to the AOF buffer. This will be flushed on disk just before
0419      * of re-entering the event loop, so before the client will get a
0420      * positive reply about the operation performed. */
0421     if (server.aof_state == REDIS_AOF_ON)
0422         server.aof_buf = sdscatlen(server.aof_buf,buf,sdslen(buf));
0423 
0424     /* If a background append only file rewriting is in progress we want to
0425      * accumulate the differences between the child DB and the current one
0426      * in a buffer, so that when the child process will do its work we
0427      * can append the differences to the new append only file. */
0428     if (server.aof_child_pid != -1)
0429         aofRewriteBufferAppend((unsigned char*)buf,sdslen(buf));
0430 
0431     sdsfree(buf);
0432 }
0433 
0434 /* ----------------------------------------------------------------------------
0435  * AOF loading
0436  * ------------------------------------------------------------------------- */
0437 
0438 /* In Redis commands are always executed in the context of a client, so in
0439  * order to load the append only file we need to create a fake client. */
0440 struct redisClient *createFakeClient(void) {
0441     struct redisClient *c = zmalloc(sizeof(*c));
0442 
0443     selectDb(c,0);
0444     c->fd = -1;
0445     c->name = NULL;
0446     c->querybuf = sdsempty();
0447     c->querybuf_peak = 0;
0448     c->argc = 0;
0449     c->argv = NULL;
0450     c->bufpos = 0;
0451     c->flags = 0;
0452     /* We set the fake client as a slave waiting for the synchronization
0453      * so that Redis will not try to send replies to this client. */
0454     c->replstate = REDIS_REPL_WAIT_BGSAVE_START;
0455     c->reply = listCreate();
0456     c->reply_bytes = 0;
0457     c->obuf_soft_limit_reached_time = 0;
0458     c->watched_keys = listCreate();
0459     listSetFreeMethod(c->reply,decrRefCount);
0460     listSetDupMethod(c->reply,dupClientReplyValue);
0461     initClientMultiState(c);
0462     return c;
0463 }
0464 
0465 void freeFakeClient(struct redisClient *c) {
0466     sdsfree(c->querybuf);
0467     listRelease(c->reply);
0468     listRelease(c->watched_keys);
0469     freeClientMultiState(c);
0470     zfree(c);
0471 }
0472 
0473 /* Replay the append log file. On error REDIS_OK is returned. On non fatal
0474  * error (the append only file is zero-length) REDIS_ERR is returned. On
0475  * fatal error an error message is logged and the program exists. */
0476 int loadAppendOnlyFile(char *filename) {
0477     struct redisClient *fakeClient;
0478     FILE *fp = fopen(filename,"r");
0479     struct redis_stat sb;
0480     int old_aof_state = server.aof_state;
0481     long loops = 0;
0482 
0483     if (fp && redis_fstat(fileno(fp),&sb) != -1 && sb.st_size == 0) {
0484         server.aof_current_size = 0;
0485         fclose(fp);
0486         return REDIS_ERR;
0487     }
0488 
0489     if (fp == NULL) {
0490         redisLog(REDIS_WARNING,"Fatal error: can't open the append log file for reading: %s",strerror(errno));
0491         exit(1);
0492     }
0493 
0494     /* Temporarily disable AOF, to prevent EXEC from feeding a MULTI
0495      * to the same file we're about to read. */
0496     server.aof_state = REDIS_AOF_OFF;
0497 
0498     fakeClient = createFakeClient();
0499     startLoading(fp);
0500 
0501     while(1) {
0502         int argc, j;
0503         unsigned long len;
0504         robj **argv;
0505         char buf[128];
0506         sds argsds;
0507         struct redisCommand *cmd;
0508 
0509         /* Serve the clients from time to time */
0510         if (!(loops++ % 1000)) {
0511             loadingProgress(ftello(fp));
0512             aeProcessEvents(server.el, AE_FILE_EVENTS|AE_DONT_WAIT);
0513         }
0514 
0515         if (fgets(buf,sizeof(buf),fp) == NULL) {
0516             if (feof(fp))
0517                 break;
0518             else
0519                 goto readerr;
0520         }
0521         if (buf[0] != '*') goto fmterr;
0522         argc = atoi(buf+1);
0523         if (argc < 1) goto fmterr;
0524 
0525         argv = zmalloc(sizeof(robj*)*argc);
0526         for (j = 0; j < argc; j++) {
0527             if (fgets(buf,sizeof(buf),fp) == NULL) goto readerr;
0528             if (buf[0] != '$') goto fmterr;
0529             len = strtol(buf+1,NULL,10);
0530             argsds = sdsnewlen(NULL,len);
0531             if (len && fread(argsds,len,1,fp) == 0) goto fmterr;
0532             argv[j] = createObject(REDIS_STRING,argsds);
0533             if (fread(buf,2,1,fp) == 0) goto fmterr; /* discard CRLF */
0534         }
0535 
0536         /* Command lookup */
0537         cmd = lookupCommand(argv[0]->ptr);
0538         if (!cmd) {
0539             redisLog(REDIS_WARNING,"Unknown command '%s' reading the append only file", argv[0]->ptr);
0540             exit(1);
0541         }
0542         /* Run the command in the context of a fake client */
0543         fakeClient->argc = argc;
0544         fakeClient->argv = argv;
0545         cmd->proc(fakeClient);
0546 
0547         /* The fake client should not have a reply */
0548         redisAssert(fakeClient->bufpos == 0 && listLength(fakeClient->reply) == 0);
0549         /* The fake client should never get blocked */
0550         redisAssert((fakeClient->flags & REDIS_BLOCKED) == 0);
0551 
0552         /* Clean up. Command code may have changed argv/argc so we use the
0553          * argv/argc of the client instead of the local variables. */
0554         for (j = 0; j < fakeClient->argc; j++)
0555             decrRefCount(fakeClient->argv[j]);
0556         zfree(fakeClient->argv);
0557     }
0558 
0559     /* This point can only be reached when EOF is reached without errors.
0560      * If the client is in the middle of a MULTI/EXEC, log error and quit. */
0561     if (fakeClient->flags & REDIS_MULTI) goto readerr;
0562 
0563     fclose(fp);
0564     freeFakeClient(fakeClient);
0565     server.aof_state = old_aof_state;
0566     stopLoading();
0567     aofUpdateCurrentSize();
0568     server.aof_rewrite_base_size = server.aof_current_size;
0569     return REDIS_OK;
0570 
0571 readerr:
0572     if (feof(fp)) {
0573         redisLog(REDIS_WARNING,"Unexpected end of file reading the append only file");
0574     } else {
0575         redisLog(REDIS_WARNING,"Unrecoverable error reading the append only file: %s", strerror(errno));
0576     }
0577     exit(1);
0578 fmterr:
0579     redisLog(REDIS_WARNING,"Bad file format reading the append only file: make a backup of your AOF file, then use ./redis-check-aof --fix <filename>");
0580     exit(1);
0581 }
0582 
0583 /* ----------------------------------------------------------------------------
0584  * AOF rewrite
0585  * ------------------------------------------------------------------------- */
0586 
0587 /* Delegate writing an object to writing a bulk string or bulk long long.
0588  * This is not placed in rio.c since that adds the redis.h dependency. */
0589 int rioWriteBulkObject(rio *r, robj *obj) {
0590     /* Avoid using getDecodedObject to help copy-on-write (we are often
0591      * in a child process when this function is called). */
0592     if (obj->encoding == REDIS_ENCODING_INT) {
0593         return rioWriteBulkLongLong(r,(long)obj->ptr);
0594     } else if (obj->encoding == REDIS_ENCODING_RAW) {
0595         return rioWriteBulkString(r,obj->ptr,sdslen(obj->ptr));
0596     } else {
0597         redisPanic("Unknown string encoding");
0598     }
0599 }
0600 
0601 /* Emit the commands needed to rebuild a list object.
0602  * The function returns 0 on error, 1 on success. */
0603 int rewriteListObject(rio *r, robj *key, robj *o) {
0604     long long count = 0, items = listTypeLength(o);
0605 
0606     if (o->encoding == REDIS_ENCODING_ZIPLIST) {
0607         unsigned char *zl = o->ptr;
0608         unsigned char *p = ziplistIndex(zl,0);
0609         unsigned char *vstr;
0610         unsigned int vlen;
0611         long long vlong;
0612 
0613         while(ziplistGet(p,&vstr,&vlen,&vlong)) {
0614             if (count == 0) {
0615                 int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ?
0616                     REDIS_AOF_REWRITE_ITEMS_PER_CMD : items;
0617 
0618                 if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0;
0619                 if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0;
0620                 if (rioWriteBulkObject(r,key) == 0) return 0;
0621             }
0622             if (vstr) {
0623                 if (rioWriteBulkString(r,(char*)vstr,vlen) == 0) return 0;
0624             } else {
0625                 if (rioWriteBulkLongLong(r,vlong) == 0) return 0;
0626             }
0627             p = ziplistNext(zl,p);
0628             if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0;
0629             items--;
0630         }
0631     } else if (o->encoding == REDIS_ENCODING_LINKEDLIST) {
0632         list *list = o->ptr;
0633         listNode *ln;
0634         listIter li;
0635 
0636         listRewind(list,&li);
0637         while((ln = listNext(&li))) {
0638             robj *eleobj = listNodeValue(ln);
0639 
0640             if (count == 0) {
0641                 int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ?
0642                     REDIS_AOF_REWRITE_ITEMS_PER_CMD : items;
0643 
0644                 if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0;
0645                 if (rioWriteBulkString(r,"RPUSH",5) == 0) return 0;
0646                 if (rioWriteBulkObject(r,key) == 0) return 0;
0647             }
0648             if (rioWriteBulkObject(r,eleobj) == 0) return 0;
0649             if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0;
0650             items--;
0651         }
0652     } else {
0653         redisPanic("Unknown list encoding");
0654     }
0655     return 1;
0656 }
0657 
0658 /* Emit the commands needed to rebuild a set object.
0659  * The function returns 0 on error, 1 on success. */
0660 int rewriteSetObject(rio *r, robj *key, robj *o) {
0661     long long count = 0, items = setTypeSize(o);
0662 
0663     if (o->encoding == REDIS_ENCODING_INTSET) {
0664         int ii = 0;
0665         int64_t llval;
0666 
0667         while(intsetGet(o->ptr,ii++,&llval)) {
0668             if (count == 0) {
0669                 int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ?
0670                     REDIS_AOF_REWRITE_ITEMS_PER_CMD : items;
0671 
0672                 if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0;
0673                 if (rioWriteBulkString(r,"SADD",4) == 0) return 0;
0674                 if (rioWriteBulkObject(r,key) == 0) return 0;
0675             }
0676             if (rioWriteBulkLongLong(r,llval) == 0) return 0;
0677             if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0;
0678             items--;
0679         }
0680     } else if (o->encoding == REDIS_ENCODING_HT) {
0681         dictIterator *di = dictGetIterator(o->ptr);
0682         dictEntry *de;
0683 
0684         while((de = dictNext(di)) != NULL) {
0685             robj *eleobj = dictGetKey(de);
0686             if (count == 0) {
0687                 int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ?
0688                     REDIS_AOF_REWRITE_ITEMS_PER_CMD : items;
0689 
0690                 if (rioWriteBulkCount(r,'*',2+cmd_items) == 0) return 0;
0691                 if (rioWriteBulkString(r,"SADD",4) == 0) return 0;
0692                 if (rioWriteBulkObject(r,key) == 0) return 0;
0693             }
0694             if (rioWriteBulkObject(r,eleobj) == 0) return 0;
0695             if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0;
0696             items--;
0697         }
0698         dictReleaseIterator(di);
0699     } else {
0700         redisPanic("Unknown set encoding");
0701     }
0702     return 1;
0703 }
0704 
0705 /* Emit the commands needed to rebuild a sorted set object.
0706  * The function returns 0 on error, 1 on success. */
0707 int rewriteSortedSetObject(rio *r, robj *key, robj *o) {
0708     long long count = 0, items = zsetLength(o);
0709 
0710     if (o->encoding == REDIS_ENCODING_ZIPLIST) {
0711         unsigned char *zl = o->ptr;
0712         unsigned char *eptr, *sptr;
0713         unsigned char *vstr;
0714         unsigned int vlen;
0715         long long vll;
0716         double score;
0717 
0718         eptr = ziplistIndex(zl,0);
0719         redisAssert(eptr != NULL);
0720         sptr = ziplistNext(zl,eptr);
0721         redisAssert(sptr != NULL);
0722 
0723         while (eptr != NULL) {
0724             redisAssert(ziplistGet(eptr,&vstr,&vlen,&vll));
0725             score = zzlGetScore(sptr);
0726 
0727             if (count == 0) {
0728                 int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ?
0729                     REDIS_AOF_REWRITE_ITEMS_PER_CMD : items;
0730 
0731                 if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0;
0732                 if (rioWriteBulkString(r,"ZADD",4) == 0) return 0;
0733                 if (rioWriteBulkObject(r,key) == 0) return 0;
0734             }
0735             if (rioWriteBulkDouble(r,score) == 0) return 0;
0736             if (vstr != NULL) {
0737                 if (rioWriteBulkString(r,(char*)vstr,vlen) == 0) return 0;
0738             } else {
0739                 if (rioWriteBulkLongLong(r,vll) == 0) return 0;
0740             }
0741             zzlNext(zl,&eptr,&sptr);
0742             if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0;
0743             items--;
0744         }
0745     } else if (o->encoding == REDIS_ENCODING_SKIPLIST) {
0746         zset *zs = o->ptr;
0747         dictIterator *di = dictGetIterator(zs->dict);
0748         dictEntry *de;
0749 
0750         while((de = dictNext(di)) != NULL) {
0751             robj *eleobj = dictGetKey(de);
0752             double *score = dictGetVal(de);
0753 
0754             if (count == 0) {
0755                 int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ?
0756                     REDIS_AOF_REWRITE_ITEMS_PER_CMD : items;
0757 
0758                 if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0;
0759                 if (rioWriteBulkString(r,"ZADD",4) == 0) return 0;
0760                 if (rioWriteBulkObject(r,key) == 0) return 0;
0761             }
0762             if (rioWriteBulkDouble(r,*score) == 0) return 0;
0763             if (rioWriteBulkObject(r,eleobj) == 0) return 0;
0764             if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0;
0765             items--;
0766         }
0767         dictReleaseIterator(di);
0768     } else {
0769         redisPanic("Unknown sorted zset encoding");
0770     }
0771     return 1;
0772 }
0773 
0774 /* Write either the key or the value of the currently selected item of an hash.
0775  * The 'hi' argument passes a valid Redis hash iterator.
0776  * The 'what' filed specifies if to write a key or a value and can be
0777  * either REDIS_HASH_KEY or REDIS_HASH_VALUE.
0778  *
0779  * The function returns 0 on error, non-zero on success. */
0780 static int rioWriteHashIteratorCursor(rio *r, hashTypeIterator *hi, int what) {
0781     if (hi->encoding == REDIS_ENCODING_ZIPLIST) {
0782         unsigned char *vstr = NULL;
0783         unsigned int vlen = UINT_MAX;
0784         long long vll = LLONG_MAX;
0785 
0786         hashTypeCurrentFromZiplist(hi, what, &vstr, &vlen, &vll);
0787         if (vstr) {
0788             return rioWriteBulkString(r, (char*)vstr, vlen);
0789         } else {
0790             return rioWriteBulkLongLong(r, vll);
0791         }
0792 
0793     } else if (hi->encoding == REDIS_ENCODING_HT) {
0794         robj *value;
0795 
0796         hashTypeCurrentFromHashTable(hi, what, &value);
0797         return rioWriteBulkObject(r, value);
0798     }
0799 
0800     redisPanic("Unknown hash encoding");
0801     return 0;
0802 }
0803 
0804 /* Emit the commands needed to rebuild a hash object.
0805  * The function returns 0 on error, 1 on success. */
0806 int rewriteHashObject(rio *r, robj *key, robj *o) {
0807     hashTypeIterator *hi;
0808     long long count = 0, items = hashTypeLength(o);
0809 
0810     hi = hashTypeInitIterator(o);
0811     while (hashTypeNext(hi) != REDIS_ERR) {
0812         if (count == 0) {
0813             int cmd_items = (items > REDIS_AOF_REWRITE_ITEMS_PER_CMD) ?
0814                 REDIS_AOF_REWRITE_ITEMS_PER_CMD : items;
0815 
0816             if (rioWriteBulkCount(r,'*',2+cmd_items*2) == 0) return 0;
0817             if (rioWriteBulkString(r,"HMSET",5) == 0) return 0;
0818             if (rioWriteBulkObject(r,key) == 0) return 0;
0819         }
0820 
0821         if (rioWriteHashIteratorCursor(r, hi, REDIS_HASH_KEY) == 0) return 0;
0822         if (rioWriteHashIteratorCursor(r, hi, REDIS_HASH_VALUE) == 0) return 0;
0823         if (++count == REDIS_AOF_REWRITE_ITEMS_PER_CMD) count = 0;
0824         items--;
0825     }
0826 
0827     hashTypeReleaseIterator(hi);
0828 
0829     return 1;
0830 }
0831 
0832 /* Write a sequence of commands able to fully rebuild the dataset into
0833  * "filename". Used both by REWRITEAOF and BGREWRITEAOF.
0834  *
0835  * In order to minimize the number of commands needed in the rewritten
0836  * log Redis uses variadic commands when possible, such as RPUSH, SADD
0837  * and ZADD. However at max REDIS_AOF_REWRITE_ITEMS_PER_CMD items per time
0838  * are inserted using a single command. */
0839 int rewriteAppendOnlyFile(char *filename) {
0840     dictIterator *di = NULL;
0841     dictEntry *de;
0842     rio aof;
0843     FILE *fp;
0844     char tmpfile[256];
0845     int j;
0846     long long now = mstime();
0847 
0848     /* Note that we have to use a different temp name here compared to the
0849      * one used by rewriteAppendOnlyFileBackground() function. */
0850     snprintf(tmpfile,256,"temp-rewriteaof-%d.aof", (int) getpid());
0851     fp = fopen(tmpfile,"w");
0852     if (!fp) {
0853         redisLog(REDIS_WARNING, "Opening the temp file for AOF rewrite in rewriteAppendOnlyFile(): %s", strerror(errno));
0854         return REDIS_ERR;
0855     }
0856 
0857     rioInitWithFile(&aof,fp);
0858     if (server.aof_rewrite_incremental_fsync)
0859         rioSetAutoSync(&aof,REDIS_AOF_AUTOSYNC_BYTES);
0860     for (j = 0; j < server.dbnum; j++) {
0861         char selectcmd[] = "*2\r\n$6\r\nSELECT\r\n";
0862         redisDb *db = server.db+j;
0863         dict *d = db->dict;
0864         if (dictSize(d) == 0) continue;
0865         di = dictGetSafeIterator(d);
0866         if (!di) {
0867             fclose(fp);
0868             return REDIS_ERR;
0869         }
0870 
0871         /* SELECT the new DB */
0872         if (rioWrite(&aof,selectcmd,sizeof(selectcmd)-1) == 0) goto werr;
0873         if (rioWriteBulkLongLong(&aof,j) == 0) goto werr;
0874 
0875         /* Iterate this DB writing every entry */
0876         while((de = dictNext(di)) != NULL) {
0877             sds keystr;
0878             robj key, *o;
0879             long long expiretime;
0880 
0881             keystr = dictGetKey(de);
0882             o = dictGetVal(de);
0883             initStaticStringObject(key,keystr);
0884 
0885             expiretime = getExpire(db,&key);
0886 
0887             /* Save the key and associated value */
0888             if (o->type == REDIS_STRING) {
0889                 /* Emit a SET command */
0890                 char cmd[]="*3\r\n$3\r\nSET\r\n";
0891                 if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
0892                 /* Key and value */
0893                 if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
0894                 if (rioWriteBulkObject(&aof,o) == 0) goto werr;
0895             } else if (o->type == REDIS_LIST) {
0896                 if (rewriteListObject(&aof,&key,o) == 0) goto werr;
0897             } else if (o->type == REDIS_SET) {
0898                 if (rewriteSetObject(&aof,&key,o) == 0) goto werr;
0899             } else if (o->type == REDIS_ZSET) {
0900                 if (rewriteSortedSetObject(&aof,&key,o) == 0) goto werr;
0901             } else if (o->type == REDIS_HASH) {
0902                 if (rewriteHashObject(&aof,&key,o) == 0) goto werr;
0903             } else {
0904                 redisPanic("Unknown object type");
0905             }
0906             /* Save the expire time */
0907             if (expiretime != -1) {
0908                 char cmd[]="*3\r\n$9\r\nPEXPIREAT\r\n";
0909                 /* If this key is already expired skip it */
0910                 if (expiretime < now) continue;
0911                 if (rioWrite(&aof,cmd,sizeof(cmd)-1) == 0) goto werr;
0912                 if (rioWriteBulkObject(&aof,&key) == 0) goto werr;
0913                 if (rioWriteBulkLongLong(&aof,expiretime) == 0) goto werr;
0914             }
0915         }
0916         dictReleaseIterator(di);
0917     }
0918 
0919     /* Make sure data will not remain on the OS's output buffers */
0920     fflush(fp);
0921     aof_fsync(fileno(fp));
0922     fclose(fp);
0923 
0924     /* Use RENAME to make sure the DB file is changed atomically only
0925      * if the generate DB file is ok. */
0926     if (rename(tmpfile,filename) == -1) {
0927         redisLog(REDIS_WARNING,"Error moving temp append only file on the final destination: %s", strerror(errno));
0928         unlink(tmpfile);
0929         return REDIS_ERR;
0930     }
0931     redisLog(REDIS_NOTICE,"SYNC append only file rewrite performed");
0932     return REDIS_OK;
0933 
0934 werr:
0935     fclose(fp);
0936     unlink(tmpfile);
0937     redisLog(REDIS_WARNING,"Write error writing append only file on disk: %s", strerror(errno));
0938     if (di) dictReleaseIterator(di);
0939     return REDIS_ERR;
0940 }
0941 
0942 /* This is how rewriting of the append only file in background works:
0943  *
0944  * 1) The user calls BGREWRITEAOF
0945  * 2) Redis calls this function, that forks():
0946  *    2a) the child rewrite the append only file in a temp file.
0947  *    2b) the parent accumulates differences in server.aof_rewrite_buf.
0948  * 3) When the child finished '2a' exists.
0949  * 4) The parent will trap the exit code, if it's OK, will append the
0950  *    data accumulated into server.aof_rewrite_buf into the temp file, and
0951  *    finally will rename(2) the temp file in the actual file name.
0952  *    The the new file is reopened as the new append only file. Profit!
0953  */
0954 int rewriteAppendOnlyFileBackground(void) {
0955     pid_t childpid;
0956     long long start;
0957 
0958     if (server.aof_child_pid != -1) return REDIS_ERR;
0959     start = ustime();
0960     if ((childpid = fork()) == 0) {
0961         char tmpfile[256];
0962 
0963         /* Child */
0964         if (server.ipfd > 0) close(server.ipfd);
0965         if (server.sofd > 0) close(server.sofd);
0966         snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof", (int) getpid());
0967         if (rewriteAppendOnlyFile(tmpfile) == REDIS_OK) {
0968             size_t private_dirty = zmalloc_get_private_dirty();
0969 
0970             if (private_dirty) {
0971                 redisLog(REDIS_NOTICE,
0972                     "AOF rewrite: %lu MB of memory used by copy-on-write",
0973                     private_dirty/(1024*1024));
0974             }
0975             exitFromChild(0);
0976         } else {
0977             exitFromChild(1);
0978         }
0979     } else {
0980         /* Parent */
0981         server.stat_fork_time = ustime()-start;
0982         if (childpid == -1) {
0983             redisLog(REDIS_WARNING,
0984                 "Can't rewrite append only file in background: fork: %s",
0985                 strerror(errno));
0986             return REDIS_ERR;
0987         }
0988         redisLog(REDIS_NOTICE,
0989             "Background append only file rewriting started by pid %d",childpid);
0990         server.aof_rewrite_scheduled = 0;
0991         server.aof_rewrite_time_start = time(NULL);
0992         server.aof_child_pid = childpid;
0993         updateDictResizePolicy();
0994         /* We set appendseldb to -1 in order to force the next call to the
0995          * feedAppendOnlyFile() to issue a SELECT command, so the differences
0996          * accumulated by the parent into server.aof_rewrite_buf will start
0997          * with a SELECT statement and it will be safe to merge. */
0998         server.aof_selected_db = -1;
0999         return REDIS_OK;
1000     }
1001     return REDIS_OK; /* unreached */
1002 }
1003 
1004 void bgrewriteaofCommand(redisClient *c) {
1005     if (server.aof_child_pid != -1) {
1006         addReplyError(c,"Background append only file rewriting already in progress");
1007     } else if (server.rdb_child_pid != -1) {
1008         server.aof_rewrite_scheduled = 1;
1009         addReplyStatus(c,"Background append only file rewriting scheduled");
1010     } else if (rewriteAppendOnlyFileBackground() == REDIS_OK) {
1011         addReplyStatus(c,"Background append only file rewriting started");
1012     } else {
1013         addReply(c,shared.err);
1014     }
1015 }
1016 
/* Remove the temporary file named "temp-rewriteaof-bg-<pid>.aof" for the
 * given child pid. The result of unlink(2) is ignored. */
void aofRemoveTempFile(pid_t childpid) {
    char path[256];
    int pid = (int) childpid;

    snprintf(path,sizeof(path),"temp-rewriteaof-bg-%d.aof",pid);
    unlink(path);
}
1023 
1024 /* Update the server.aof_current_size filed explicitly using stat(2)
1025  * to check the size of the file. This is useful after a rewrite or after
1026  * a restart, normally the size is updated just adding the write length
1027  * to the current length, that is much faster. */
1028 void aofUpdateCurrentSize(void) {
1029     struct redis_stat sb;
1030 
1031     if (redis_fstat(server.aof_fd,&sb) == -1) {
1032         redisLog(REDIS_WARNING,"Unable to obtain the AOF file length. stat: %s",
1033             strerror(errno));
1034     } else {
1035         server.aof_current_size = sb.st_size;
1036     }
1037 }
1038 
1039 /* A background append only file rewriting (BGREWRITEAOF) terminated its work.
1040  * Handle this. */
1041 void backgroundRewriteDoneHandler(int exitcode, int bysignal) {
1042     if (!bysignal && exitcode == 0) {
1043         int newfd, oldfd;
1044         char tmpfile[256];
1045         long long now = ustime();
1046 
1047         redisLog(REDIS_NOTICE,
1048             "Background AOF rewrite terminated with success");
1049 
1050         /* Flush the differences accumulated by the parent to the
1051          * rewritten AOF. */
1052         snprintf(tmpfile,256,"temp-rewriteaof-bg-%d.aof",
1053             (int)server.aof_child_pid);
1054         newfd = open(tmpfile,O_WRONLY|O_APPEND);
1055         if (newfd == -1) {
1056             redisLog(REDIS_WARNING,
1057                 "Unable to open the temporary AOF produced by the child: %s", strerror(errno));
1058             goto cleanup;
1059         }
1060 
1061         if (aofRewriteBufferWrite(newfd) == -1) {
1062             redisLog(REDIS_WARNING,
1063                 "Error trying to flush the parent diff to the rewritten AOF: %s", strerror(errno));
1064             close(newfd);
1065             goto cleanup;
1066         }
1067 
1068         redisLog(REDIS_NOTICE,
1069             "Parent diff successfully flushed to the rewritten AOF (%lu bytes)", aofRewriteBufferSize());
1070 
1071         /* The only remaining thing to do is to rename the temporary file to
1072          * the configured file and switch the file descriptor used to do AOF
1073          * writes. We don't want close(2) or rename(2) calls to block the
1074          * server on old file deletion.
1075          *
1076          * There are two possible scenarios:
1077          *
1078          * 1) AOF is DISABLED and this was a one time rewrite. The temporary
1079          * file will be renamed to the configured file. When this file already
1080          * exists, it will be unlinked, which may block the server.
1081          *
1082          * 2) AOF is ENABLED and the rewritten AOF will immediately start
1083          * receiving writes. After the temporary file is renamed to the
1084          * configured file, the original AOF file descriptor will be closed.
1085          * Since this will be the last reference to that file, closing it
1086          * causes the underlying file to be unlinked, which may block the
1087          * server.
1088          *
1089          * To mitigate the blocking effect of the unlink operation (either
1090          * caused by rename(2) in scenario 1, or by close(2) in scenario 2), we
1091          * use a background thread to take care of this. First, we
1092          * make scenario 1 identical to scenario 2 by opening the target file
1093          * when it exists. The unlink operation after the rename(2) will then
1094          * be executed upon calling close(2) for its descriptor. Everything to
1095          * guarantee atomicity for this switch has already happened by then, so
1096          * we don't care what the outcome or duration of that close operation
1097          * is, as long as the file descriptor is released again. */
1098         if (server.aof_fd == -1) {
1099             /* AOF disabled */
1100 
1101              /* Don't care if this fails: oldfd will be -1 and we handle that.
1102               * One notable case of -1 return is if the old file does
1103               * not exist. */
1104              oldfd = open(server.aof_filename,O_RDONLY|O_NONBLOCK);
1105         } else {
1106             /* AOF enabled */
1107             oldfd = -1; /* We'll set this to the current AOF filedes later. */
1108         }
1109 
1110         /* Rename the temporary file. This will not unlink the target file if
1111          * it exists, because we reference it with "oldfd". */
1112         if (rename(tmpfile,server.aof_filename) == -1) {
1113             redisLog(REDIS_WARNING,
1114                 "Error trying to rename the temporary AOF file: %s", strerror(errno));
1115             close(newfd);
1116             if (oldfd != -1) close(oldfd);
1117             goto cleanup;
1118         }
1119 
1120         if (server.aof_fd == -1) {
1121             /* AOF disabled, we don't need to set the AOF file descriptor
1122              * to this new file, so we can close it. */
1123             close(newfd);
1124         } else {
1125             /* AOF enabled, replace the old fd with the new one. */
1126             oldfd = server.aof_fd;
1127             server.aof_fd = newfd;
1128             if (server.aof_fsync == AOF_FSYNC_ALWAYS)
1129                 aof_fsync(newfd);
1130             else if (server.aof_fsync == AOF_FSYNC_EVERYSEC)
1131                 aof_background_fsync(newfd);
1132             server.aof_selected_db = -1; /* Make sure SELECT is re-issued */
1133             aofUpdateCurrentSize();
1134             server.aof_rewrite_base_size = server.aof_current_size;
1135 
1136             /* Clear regular AOF buffer since its contents was just written to
1137              * the new AOF from the background rewrite buffer. */
1138             sdsfree(server.aof_buf);
1139             server.aof_buf = sdsempty();
1140         }
1141 
1142         server.aof_lastbgrewrite_status = REDIS_OK;
1143 
1144         redisLog(REDIS_NOTICE, "Background AOF rewrite finished successfully");
1145         /* Change state from WAIT_REWRITE to ON if needed */
1146         if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
1147             server.aof_state = REDIS_AOF_ON;
1148 
1149         /* Asynchronously close the overwritten AOF. */
1150         if (oldfd != -1) bioCreateBackgroundJob(REDIS_BIO_CLOSE_FILE,(void*)(long)oldfd,NULL,NULL);
1151 
1152         redisLog(REDIS_VERBOSE,
1153             "Background AOF rewrite signal handler took %lldus", ustime()-now);
1154     } else if (!bysignal && exitcode != 0) {
1155         server.aof_lastbgrewrite_status = REDIS_ERR;
1156 
1157         redisLog(REDIS_WARNING,
1158             "Background AOF rewrite terminated with error");
1159     } else {
1160         server.aof_lastbgrewrite_status = REDIS_ERR;
1161 
1162         redisLog(REDIS_WARNING,
1163             "Background AOF rewrite terminated by signal %d", bysignal);
1164     }
1165 
1166 cleanup:
1167     aofRewriteBufferReset();
1168     aofRemoveTempFile(server.aof_child_pid);
1169     server.aof_child_pid = -1;
1170     server.aof_rewrite_time_last = time(NULL)-server.aof_rewrite_time_start;
1171     server.aof_rewrite_time_start = -1;
1172     /* Schedule a new rewrite if we are waiting for it to switch the AOF ON. */
1173     if (server.aof_state == REDIS_AOF_WAIT_REWRITE)
1174         server.aof_rewrite_scheduled = 1;
1175 }