valkey-io
diff --git a/src/bio.c b/src/bio.c (+188)
diff --git a/src/bio.h b/src/bio.h (+2)
diff --git a/src/config.c b/src/config.c (+1)
@@ -61,13 +61,19 @@
 
 
 #include "server.h"
+#include "connection.h"
 #include "bio.h"
 #include <stdatomic.h>
+#include <pthread.h>
+#include <signal.h>
+#include <errno.h>
+#include <stdio.h>
 
 static char *bio_worker_title[] = {
     "bio_close_file",
     "bio_aof",
     "bio_lazy_free",
+    "bio_save_to_disk", /* Index 3: the worker that bio_job_to_worker assigns to BIO_SAVE_TO_DISK. */
 };
 
 #define BIO_WORKER_NUM (sizeof(bio_worker_title) / sizeof(*bio_worker_title))
@@ -77,6 +83,7 @@ static unsigned int bio_job_to_worker[] = {
     [BIO_AOF_FSYNC] = 1,
     [BIO_CLOSE_AOF] = 1,
     [BIO_LAZY_FREE] = 2,
+    [BIO_SAVE_TO_DISK] = 3,
 };
 
 static pthread_t bio_threads[BIO_WORKER_NUM];
@@ -108,6 +115,12 @@ typedef union bio_job {
         lazy_free_fn *free_fn; /* Function that will free the provided arguments */
         void *free_args[];     /* List of arguments to be passed to the free function */
     } free_args;
+    struct {
+        int type;
+        connection *conn; /* Connection to download the rdb from */
+        int dest_fd;
+        int sync_io_timeout;
+    } save_to_disk_args;
 } bio_job;
 
 void *bioProcessBackgroundJobs(void *arg);
@@ -203,6 +216,18 @@ void bioCreateFsyncJob(int fd, long long offset, int need_reclaim_cache) {
     bioSubmitJob(BIO_AOF_FSYNC, job);
 }
 
+/* Queue a BIO_SAVE_TO_DISK job.
+ *
+ * 'conn' is the connection to download the RDB from, 'dest_fd' is the file
+ * descriptor the payload is written to and 'sync_io_timeout' is the timeout
+ * handed to the blocking read calls of the worker. The job is allocated here
+ * and passed to bioSubmitJob(); this function does not set the job's 'type'
+ * field itself. */
+void bioCreateSaveRDBToDiskJob(connection *conn, int dest_fd, int sync_io_timeout) {
+    bio_job *save_job = zmalloc(sizeof(bio_job));
+
+    save_job->save_to_disk_args.conn = conn;
+    save_job->save_to_disk_args.dest_fd = dest_fd;
+    save_job->save_to_disk_args.sync_io_timeout = sync_io_timeout;
+
+    bioSubmitJob(BIO_SAVE_TO_DISK, save_job);
+}
+
+/* Return the current value of server.replica_bio_abort_save, read with a
+ * relaxed atomic load. A non-zero value makes the BIO_SAVE_TO_DISK worker
+ * stop the transfer it is running. */
+int shouldAbortSave(void) {
+    int abort_requested = atomic_load_explicit(&server.replica_bio_abort_save, memory_order_relaxed);
+    return abort_requested;
+}
+
 void *bioProcessBackgroundJobs(void *arg) {
     bio_job *job;
     unsigned long worker = (unsigned long)arg;
@@ -278,6 +303,169 @@ void *bioProcessBackgroundJobs(void *arg) {
             if (job_type == BIO_CLOSE_AOF) close(job->fd_args.fd);
         } else if (job_type == BIO_LAZY_FREE) {
             job->free_args.free_fn(job->free_args.free_args);
+        } else if (job_type == BIO_SAVE_TO_DISK) {
+            /* Download the RDB payload from the primary over 'conn' and write
+             * it to 'dest_fd' from this bio thread.
+             *
+             * The outcome is reported through server.replica_bio_disk_save_state:
+             * REPL_BIO_DISK_SAVE_STATE_FAIL on any error, or
+             * REPL_BIO_DISK_SAVE_STATE_FINISHED after the transfer counters have
+             * been stored in the server.bio_* fields. When shouldAbortSave()
+             * reports an abort request, the state is left untouched. */
+            ssize_t nread, readlen, nwritten;
+            off_t left, repl_transfer_last_fsync_off = 0;
+            int usemark;
+            off_t repl_transfer_size = -1, repl_transfer_read = 0;
+            char eofmark[RDB_EOF_MARK_SIZE];
+            char lastbytes[RDB_EOF_MARK_SIZE];
+            char buf[PROTO_IOBUF_LEN];
+            int sync_io_timeout = job->save_to_disk_args.sync_io_timeout;
+            connection *conn = job->save_to_disk_args.conn;
+            int dest_fd = job->save_to_disk_args.dest_fd;
+            int received_ping = 0;
+            long long stat_net_repl_input_bytes = 0;
+            int error = 0, main_thread_abort = 0;
+
+            /* Put the socket in blocking mode to simplify RDB transfer.
+             * We'll restore it when the RDB is received.
+             * NOTE(review): the same 'sync_io_timeout' value is passed to both
+             * connRecvTimeout() and connSyncReadLine(); confirm that both expect
+             * the same time unit. */
+            connBlock(conn);
+            connRecvTimeout(conn, sync_io_timeout);
+
+            /* Step 1: read the bulk header line. Lines classified as a ping by
+             * inspectBulkPayloadHeaderForErrors() only refresh the last I/O time
+             * and are skipped. */
+            do {
+                received_ping = 0;
+                if (shouldAbortSave()) {
+                    main_thread_abort = 1;
+                    goto done;
+                }
+                nread = connSyncReadLine(conn, buf, 1024, sync_io_timeout);
+                if (nread == -1) {
+                    replicaBioSaveServerLog(LL_WARNING, "I/O error reading bulk count from PRIMARY: %s", connGetLastError(conn));
+                    error = 1;
+                    goto done;
+                } else {
+                    /* nread here is returned by connSyncReadLine(), which calls syncReadLine() and
+                     * convert "\r\n" to '\0' so 1 byte is lost. */
+                    stat_net_repl_input_bytes += nread + 1;
+                }
+                int ret = inspectBulkPayloadHeaderForErrors(buf);
+                if (ret == INSPECT_BULK_PAYLOAD_PRIMARY_ABORT) {
+                    replicaBioSaveServerLog(LL_WARNING, "PRIMARY aborted replication with an error: %s", buf + 1);
+                    error = 1;
+                    goto done;
+                } else if (ret == INSPECT_BULK_PAYLOAD_PRIMARY_PING) {
+                    atomic_store_explicit(&server.repl_transfer_lastio, atomic_load_explicit(&server.unixtime, memory_order_relaxed), memory_order_relaxed);
+                    memset(buf, 0, PROTO_IOBUF_LEN);
+                    received_ping = 1;
+                } else if (ret == INSPECT_BULK_PAYLOAD_PRIMARY_BAD_PROTO) {
+                    replicaBioSaveServerLog(LL_WARNING,
+                                            "Bad protocol from PRIMARY, the first byte is not '$' (we received '%s'), are you sure the host "
+                                            "and port are right?",
+                                            buf);
+                    error = 1;
+                    goto done;
+                }
+            } while (received_ping);
+
+            /* Step 2: decide how the end of the payload is detected: either by
+             * an EOF mark at the end of the stream (usemark != 0) or by the byte
+             * count announced in the header. */
+            usemark = inspectBulkPayloadHeaderForEOF(buf, eofmark, lastbytes);
+            if (usemark) {
+                repl_transfer_size = 0;
+                replicaBioSaveServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving streamed RDB from primary with EOF to disk");
+            } else {
+                /* Parse with strtoll(): the destination is an off_t, and strtol()
+                 * would clamp large payload sizes where 'long' is 32 bits wide. */
+                repl_transfer_size = strtoll(buf + 1, NULL, 10);
+                replicaBioSaveServerLog(LL_NOTICE, "PRIMARY <-> REPLICA sync: receiving %lld bytes from primary to disk",
+                                        (long long)repl_transfer_size);
+            }
+
+            /* Step 3: copy the payload from the connection to the file. */
+            while (1) {
+                if (shouldAbortSave()) {
+                    replicaBioSaveServerLog(LL_WARNING, "BIO THREAD ABORTING DUE TO MAIN THREAD REQUEST");
+                    main_thread_abort = 1;
+                    goto done;
+                }
+                if (usemark) {
+                    readlen = sizeof(buf);
+                } else {
+                    left = repl_transfer_size - repl_transfer_read;
+                    readlen = (left < (signed)sizeof(buf)) ? left : (signed)sizeof(buf);
+                }
+                nread = connRead(conn, buf, readlen);
+                if (nread <= 0) {
+                    if (connGetState(conn) == CONN_STATE_CONNECTED) {
+                        /* Equivalent to EAGAIN: retry. The top of the loop checks
+                         * the abort flag again before the next read. */
+                        memset(buf, 0, PROTO_IOBUF_LEN);
+                        continue;
+                    }
+                    replicaBioSaveServerLog(LL_WARNING, "I/O error trying to sync with PRIMARY: %s",
+                                            (nread == -1) ? connGetLastError(conn) : "connection lost");
+                    error = 1;
+                    goto done;
+                }
+                stat_net_repl_input_bytes += nread;
+
+                /* When a mark is used, we want to detect EOF asap in order to avoid
+                 * writing the EOF mark into the file... */
+                int eof_reached = 0;
+                if (usemark) eof_reached = inspectBulkPayloadForEOF(buf, nread, eofmark, lastbytes);
+                /* Update the last I/O time for the replication transfer (used in
+                 * order to detect timeouts during replication), and write what we
+                 * got from the socket to the dump file on disk. */
+                atomic_store_explicit(&server.repl_transfer_lastio, atomic_load_explicit(&server.unixtime, memory_order_relaxed), memory_order_relaxed);
+                if ((nwritten = write(dest_fd, buf, nread)) != nread) {
+                    replicaBioSaveServerLog(LL_WARNING,
+                                            "Write error or short write writing to the DB dump file "
+                                            "needed for PRIMARY <-> REPLICA synchronization: %s",
+                                            (nwritten == -1) ? strerror(errno) : "short write");
+                    error = 1;
+                    goto done;
+                }
+                repl_transfer_read += nread;
+
+                /* Delete the trailing EOF mark (RDB_EOF_MARK_SIZE bytes) from the
+                 * file if we reached EOF. */
+                if (usemark && eof_reached) {
+                    if (ftruncate(dest_fd, repl_transfer_read - RDB_EOF_MARK_SIZE) == -1) {
+                        replicaBioSaveServerLog(LL_WARNING,
+                                                "Error truncating the RDB file received from the primary "
+                                                "for SYNC: %s",
+                                                strerror(errno));
+                        error = 1;
+                        goto done;
+                    }
+                }
+
+                /* Sync data on disk from time to time, otherwise at the end of the
+                 * transfer we may suffer a big delay as the memory buffers are copied
+                 * into the actual disk. */
+                if (repl_transfer_read >= repl_transfer_last_fsync_off + REPL_MAX_WRITTEN_BEFORE_FSYNC) {
+                    off_t sync_size = repl_transfer_read - repl_transfer_last_fsync_off;
+                    rdb_fsync_range(dest_fd, repl_transfer_last_fsync_off, sync_size);
+                    repl_transfer_last_fsync_off += sync_size;
+                }
+
+                /* Check if the transfer is now complete */
+                if (!usemark) {
+                    if (repl_transfer_read == repl_transfer_size) eof_reached = 1;
+                }
+
+                /* If the transfer is not yet complete, clear the buffer and loop
+                 * again to read more. */
+                if (!eof_reached) {
+                    memset(buf, 0, PROTO_IOBUF_LEN);
+                    continue;
+                }
+
+                break;
+            }
+
+        done:
+            /* Restore the socket to the original state to continue
+             * with the normal replication.
+             * NOTE(review): 'conn' is still used here after an abort request;
+             * confirm the main thread keeps it alive until this point. */
+            connNonBlock(conn);
+            connRecvTimeout(conn, 0);
+            if (main_thread_abort) {
+                /* Do nothing for now, main thread already handles cancelReplHandshake */
+                replicaBioSaveServerLog(LL_WARNING, "Replica main thread aborted RDB save");
+            } else if (error) {
+                replicaBioSaveServerLog(LL_WARNING, "Error downloading RDB");
+                atomic_store_explicit(&server.replica_bio_disk_save_state, REPL_BIO_DISK_SAVE_STATE_FAIL, memory_order_release);
+            } else {
+                replicaBioSaveServerLog(LL_NOTICE, "Done downloading RDB");
+                /* Store the results first; the release store of the state below
+                 * is what signals that they are ready to be read. */
+                server.bio_stat_net_repl_input_bytes = stat_net_repl_input_bytes;
+                server.bio_repl_transfer_size = repl_transfer_size;
+                server.bio_repl_transfer_read = repl_transfer_read;
+                server.bio_conn = conn;
+                atomic_store_explicit(&server.replica_bio_disk_save_state, REPL_BIO_DISK_SAVE_STATE_FINISHED, memory_order_release);
+            }
         } else {
             serverPanic("Wrong job type in bioProcessBackgroundJobs().");
         }
 
@@ -41,13 +41,15 @@ void bioCreateCloseJob(int fd, int need_fsync, int need_reclaim_cache);
 void bioCreateCloseAofJob(int fd, long long offset, int need_reclaim_cache);
 void bioCreateFsyncJob(int fd, long long offset, int need_reclaim_cache);
 void bioCreateLazyFreeJob(lazy_free_fn free_fn, int arg_count, ...);
+/* Queue a BIO_SAVE_TO_DISK job for 'conn', 'dest_fd' and 'sync_io_timeout'. */
+void bioCreateSaveRDBToDiskJob(connection *conn, int dest_fd, int sync_io_timeout);
 
 /* Background job opcodes */
 enum {
     BIO_CLOSE_FILE = 0, /* Deferred close(2) syscall. */
     BIO_AOF_FSYNC,      /* Deferred AOF fsync. */
     BIO_LAZY_FREE,      /* Deferred objects freeing. */
     BIO_CLOSE_AOF,      /* Deferred close for AOF files. */
+    BIO_SAVE_TO_DISK,   /* Replica: download the RDB from a connection and save it to disk. */
     BIO_NUM_OPS
 };
 
 
@@ -3160,6 +3160,7 @@ standardConfig static_configs[] = {
     createBoolConfig("repl-disable-tcp-nodelay", NULL, MODIFIABLE_CONFIG, server.repl_disable_tcp_nodelay, 0, NULL, NULL),
     createBoolConfig("repl-diskless-sync", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.repl_diskless_sync, 1, NULL, NULL),
     createBoolConfig("dual-channel-replication-enabled", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.dual_channel_replication, 0, NULL, NULL),
+    createBoolConfig("replica-save-to-disk-in-bio-thread", NULL, DEBUG_CONFIG | MODIFIABLE_CONFIG, server.replica_save_to_disk_in_bio_thread, 1, NULL, NULL),
     createBoolConfig("aof-rewrite-incremental-fsync", NULL, MODIFIABLE_CONFIG, server.aof_rewrite_incremental_fsync, 1, NULL, NULL),
     createBoolConfig("no-appendfsync-on-rewrite", NULL, MODIFIABLE_CONFIG, server.aof_no_fsync_on_rewrite, 0, NULL, NULL),
     createBoolConfig("cluster-require-full-coverage", NULL, MODIFIABLE_CONFIG, server.cluster_require_full_coverage, 1, NULL, NULL),