gramineproject · kailun-qin · Jan 7, 2025 · Feb 13, 2025 · Feb 14, 2025
diff --git a/Documentation/devel/encfiles.rst b/Documentation/devel/encfiles.rst
@@ -508,10 +508,19 @@ Additional details
   least one process writes to the file), the file may become corrupted or
   inaccessible to one of the processes.
 
-- There is no support for file recovery. If the file was only partially written
-  to storage when the app abruptly terminated, Gramine will treat this file as
-  corrupted and will return an ``-EACCES`` error. (This is in contrast to Intel
-  SGX SDK which supports file recovery.)
+- File recovery: Gramine supports recovery for encrypted files, which can be
+  enabled via the ``enable_recovery`` mount parameter in the Gramine manifest.
+  This allows a file to be recovered from a corrupted state (e.g., one with
+  incorrect GMACs and/or encryption keys) when it was only partially written to
+  storage due to a fatal error (e.g., abrupt app termination). Similar to Intel
+  SGX SDK's recovery mechanism, Gramine uses a "shadow" recovery file and a
+  ``has_pending_write`` flag in the metadata node to manage write transactions.
+  During file flush, cached blocks about to change are saved to the recovery
+  file. If an encrypted file is opened with the flag set, a recovery process
+  reverts partial changes using the recovery file, restoring the last known good
+  state. The "shadow" recovery file is cleaned up on file close. Note that
+  enabling this feature can impact performance due to additional writes to the
+  shadow file on each flush.
 
 - There is no key rotation scheme. The application must perform key rotation of
   the KDK by itself (by overwriting the ``/dev/attestation/keys/``

diff --git a/Documentation/img/encfiles/02_encfiles_representation.svg b/Documentation/img/encfiles/02_encfiles_representation.svg
diff --git a/Documentation/img/encfiles/04_encfiles_write_less3k.svg b/Documentation/img/encfiles/04_encfiles_write_less3k.svg
diff --git a/Documentation/img/encfiles/05_encfiles_read_less3k.svg b/Documentation/img/encfiles/05_encfiles_read_less3k.svg
diff --git a/Documentation/img/encfiles/06_encfiles_write_greater3k.svg b/Documentation/img/encfiles/06_encfiles_write_greater3k.svg
diff --git a/Documentation/img/encfiles/08_encfiles_read_greater3k.svg b/Documentation/img/encfiles/08_encfiles_read_greater3k.svg
diff --git a/Documentation/manifest-syntax.rst b/Documentation/manifest-syntax.rst
@@ -1088,7 +1088,7 @@ Encrypted files
 ::
 
     fs.mounts = [
-      { type = "encrypted", path = "[PATH]", uri = "[URI]", key_name = "[KEY_NAME]" },
+      { type = "encrypted", path = "[PATH]", uri = "[URI]", key_name = "[KEY_NAME]", enable_recovery = [true|false] },
     ]
 
     fs.insecure__keys.[KEY_NAME] = "[32-character hex value]"
@@ -1154,6 +1154,12 @@ Gramine:
    in the application is insecure. If you need to derive encryption keys from
    such a "doubly-used" key, you must apply a KDF.
 
+The ``enable_recovery`` mount parameter (default: ``false``) determines whether
+file recovery is enabled for the mount. Because it is set per mount, recovery
+can be enabled or disabled selectively for different mounted files or
+directories. Note that enabling this feature can negatively impact performance,
+as each flush additionally writes to a separate "shadow" recovery file.
+
 .. _untrusted-shared-memory:
 
 Untrusted shared memory

diff --git a/common/src/protected_files/protected_files.c b/common/src/protected_files/protected_files.c
@@ -56,7 +56,7 @@ static const char* g_pf_error_list[] = {
     [-PF_STATUS_NOT_IMPLEMENTED] = "Functionality not implemented",
     [-PF_STATUS_CALLBACK_FAILED] = "Callback failed",
     [-PF_STATUS_PATH_TOO_LONG] = "Path is too long",
-    [-PF_STATUS_RECOVERY_NEEDED] = "File recovery needed",
+    [-PF_STATUS_RECOVERY_NEEDED] = "File recovery needed but failed",
     [-PF_STATUS_FLUSH_ERROR] = "Flush error",
     [-PF_STATUS_CRYPTO_ERROR] = "Crypto error",
     [-PF_STATUS_CORRUPTED] = "File is corrupted",
@@ -429,36 +429,134 @@ static bool ipf_update_metadata_node(pf_context_t* pf) {
     return true;
 }
 
+/* Stores one recovery record (node number + node bytes) at `offset` in the recovery file. */
+static bool ipf_write_recovery_node(pf_context_t* pf, uint64_t physical_node_number,
+                                    const void* buffer, uint64_t offset) {
+    assert(pf->host_recovery_file_handle);
+
+    recovery_node_t record = { .physical_node_number = physical_node_number };
+    memcpy(record.bytes, buffer, sizeof(record.bytes));
+
+    pf_status_t write_status = g_cb_write(pf->host_recovery_file_handle, (void*)&record, offset,
+                                          sizeof(record));
+    if (!PF_FAILURE(write_status))
+        return true;
+
+    pf->last_error = write_status;
+    return false;
+}
+
+/* Rewrites the recovery file from scratch: first every cached node marked `need_writing`,
+ * then the root MHT node (recorded as node 1), and finally the metadata node (recorded as
+ * node 0). Returns false (with `pf->last_error` set) as soon as one of the steps fails. */
+static bool ipf_write_recovery_file(pf_context_t* pf) {
+    assert(pf->host_recovery_file_handle);
+
+    pf_status_t truncate_status = g_cb_truncate(pf->host_recovery_file_handle, 0);
+    if (PF_FAILURE(truncate_status)) {
+        pf->last_error = truncate_status;
+        return false;
+    }
+
+    /* records are laid out back-to-back, so the position advances by one record per write */
+    uint64_t pos = 0;
+    for (void* it = lruc_get_first(pf->cache); it != NULL; it = lruc_get_next(pf->cache)) {
+        file_node_t* cached = (file_node_t*)it;
+        if (cached->need_writing) {
+            if (!ipf_write_recovery_node(pf, cached->physical_node_number, &cached->encrypted,
+                                         pos))
+                return false;
+            pos += sizeof(recovery_node_t);
+        }
+    }
+
+    if (!ipf_write_recovery_node(pf, /*physical_node_number=*/1, &pf->root_mht_node.encrypted,
+                                 pos))
+        return false;
+
+    pos += sizeof(recovery_node_t);
+
+    /* the metadata node is the last record in the file */
+    return ipf_write_recovery_node(pf, /*physical_node_number=*/0, &pf->metadata_node, pos);
+}
+
+
+/* Marks a flush as in progress by writing the metadata node with `has_pending_write` set. */
+static bool ipf_set_pending_write(pf_context_t* pf) {
+    pf->metadata_node.plaintext_part.has_pending_write = 1;
+    bool flag_written = ipf_write_node(pf, /*physical_node_number=*/0, &pf->metadata_node);
+
+    /* Reset the in-memory flag right away: only the on-disk copy keeps it, until the metadata
+     * write at the end of the flush clears it on disk as well. */
+    pf->metadata_node.plaintext_part.has_pending_write = 0;
+    return flag_written;
+}
+
+/* Writes the metadata node (whose flag must already be unset in memory) and fsyncs the file. */
+static bool ipf_clear_pending_write(pf_context_t* pf) {
+    assert(pf->metadata_node.plaintext_part.has_pending_write == 0);
+
+    if (!ipf_write_node(pf, /*physical_node_number=*/0, &pf->metadata_node))
+        return false;
+
+    pf_status_t sync_status = g_cb_fsync(pf->host_file_handle);
+    if (!PF_FAILURE(sync_status))
+        return true;
+
+    pf->last_error = sync_status;
+    return false;
+}
+
 static bool ipf_internal_flush(pf_context_t* pf) {
     if (!pf->need_writing) {
         DEBUG_PF("no need to write");
         return true;
     }
 
     if (pf->metadata_decrypted.file_size > MD_USER_DATA_SIZE && pf->root_mht_node.need_writing) {
+        if (pf->host_recovery_file_handle) { // a recovery file handle was supplied
+            if (!ipf_write_recovery_file(pf)) { // fill the recovery file first
+                pf->file_status = PF_STATUS_FLUSH_ERROR;
+                DEBUG_PF("failed to write changes to the recovery file");
+                goto recoverable_error;
+            }
+
+            if (!ipf_set_pending_write(pf)) { // then set the on-disk pending-write flag
+                pf->file_status = PF_STATUS_FLUSH_ERROR;
+                DEBUG_PF("failed to set the pending write flag");
+                goto recoverable_error;
+            }
+        }
+
         if (!ipf_update_all_data_and_mht_nodes(pf)) {
             // this is something that shouldn't happen, can't fix this...
             pf->file_status = PF_STATUS_CRYPTO_ERROR;
             DEBUG_PF("failed to update data and MHT nodes");
-            return false;
+            goto unrecoverable_error;
         }
     }
 
     if (!ipf_update_metadata_node(pf)) {
         // this is something that shouldn't happen, can't fix this...
         pf->file_status = PF_STATUS_CRYPTO_ERROR;
         DEBUG_PF("failed to update metadata node");
-        return false;
+        goto unrecoverable_error;
     }
 
     if (!ipf_write_all_changes_to_disk(pf)) {
         pf->file_status = PF_STATUS_WRITE_TO_DISK_FAILED;
         DEBUG_PF("failed to write changes to disk");
-        return false;
+        goto recoverable_error;
     }
 
     pf->need_writing = false;
     return true;
+
+unrecoverable_error: // best effort: rewrite the metadata node with the pending flag unset
+    if (pf->host_recovery_file_handle)
+        (void)ipf_clear_pending_write(pf); // result deliberately ignored
+recoverable_error: // the pending-write flag is not cleared on this path
+    return false;
 }
 
 static file_node_t* ipf_get_mht_node(pf_context_t* pf, uint64_t offset) {
@@ -750,10 +848,11 @@ static bool ipf_init_fields(pf_context_t* pf) {
 
     ipf_init_root_mht(&pf->root_mht_node);
 
-    pf->host_file_handle = NULL;
-    pf->need_writing     = false;
-    pf->file_status      = PF_STATUS_UNINITIALIZED;
-    pf->last_error       = PF_STATUS_SUCCESS;
+    pf->host_file_handle          = NULL;
+    pf->host_recovery_file_handle = NULL;
+    pf->need_writing              = false;
+    pf->file_status               = PF_STATUS_UNINITIALIZED;
+    pf->last_error                = PF_STATUS_SUCCESS;
 
     pf->cache = lruc_create();
     return true;
@@ -851,8 +950,67 @@ static void ipf_try_clear_error(pf_context_t* pf) {
     }
 }
 
+/* Re-reads the metadata node from storage and reports whether `has_pending_write` is set. */
+static bool ipf_check_recovery_needed(pf_context_t* pf) {
+    /* on read failure report "recovery needed" explicitly, not a status code cast to bool */
+    if (!ipf_read_node(pf, /*physical_node_number=*/0, (uint8_t*)&pf->metadata_node))
+        return true;
+    return pf->metadata_node.plaintext_part.has_pending_write == 1;
+}
+
+/* Reads each recovery node from the recovery file and applies the embedded pf node
+ * (recovery_node.bytes) to the main file at node index recovery_node.physical_node_number.
+ * Returns false with `pf->last_error` set if recovery cannot be carried out. */
+static bool ipf_recover(pf_context_t* pf, uint64_t recovery_file_size) {
+    if (!pf->host_recovery_file_handle) {
+        DEBUG_PF("file recovery needed but recovery file handle not set; please consider setting "
+                 "'enable_recovery = true' for the mount");
+        pf->last_error = PF_STATUS_RECOVERY_NEEDED;
+        return false;
+    }
+
+    if (recovery_file_size == 0 || recovery_file_size % sizeof(recovery_node_t) != 0) {
+        DEBUG_PF("recovery file size is not right [%lu]", recovery_file_size);
+        pf->last_error = PF_STATUS_RECOVERY_NEEDED;
+        return false;
+    }
+
+    uint64_t recovery_nodes_count = recovery_file_size / sizeof(recovery_node_t);
+    for (uint64_t i = 0; i < recovery_nodes_count; i++) {
+        recovery_node_t recovery_node;
+        pf_status_t status = g_cb_read(pf->host_recovery_file_handle, &recovery_node,
+                                       i * sizeof(recovery_node_t), sizeof(recovery_node_t));
+        if (PF_FAILURE(status)) {
+            pf->last_error = status;
+            return false;
+        }
+
+        /* the node number is read from the recovery file: reject one whose byte offset wraps */
+        if (recovery_node.physical_node_number > UINT64_MAX / sizeof(recovery_node.bytes)) {
+            pf->last_error = PF_STATUS_RECOVERY_NEEDED;
+            return false;
+        }
+        uint64_t offset = recovery_node.physical_node_number * sizeof(recovery_node.bytes);
+        status = g_cb_write(pf->host_file_handle, recovery_node.bytes, offset,
+                            sizeof(recovery_node.bytes));
+        if (PF_FAILURE(status)) {
+            pf->last_error = status;
+            return false;
+        }
+    }
+
+    pf_status_t status = g_cb_fsync(pf->host_file_handle);
+    if (PF_FAILURE(status)) {
+        pf->last_error = status;
+        return false;
+    }
+    return true;
+}
+
 static pf_context_t* ipf_open(const char* path, pf_file_mode_t mode, bool create, pf_handle_t file,
-                              uint64_t real_size, const pf_key_t* kdk_key, pf_status_t* status) {
+                              uint64_t real_size, const pf_key_t* kdk_key,
+                              pf_handle_t recovery_file_handle, uint64_t recovery_file_size,
+                              bool try_recover, pf_status_t* status) {
     *status = PF_STATUS_NO_MEMORY;
     pf_context_t* pf = calloc(1, sizeof(*pf));
 
@@ -892,10 +1050,26 @@ static pf_context_t* ipf_open(const char* path, pf_file_mode_t mode, bool create
     pf->host_file_handle = file;
     pf->mode = mode;
 
+    pf->host_recovery_file_handle = recovery_file_handle;
+
     if (!create) {
         if (!ipf_init_existing_file(pf, path))
             goto out;
 
+        if (try_recover && ipf_check_recovery_needed(pf)) {
+            DEBUG_PF("%s: starting file recovery", path);
+
+            if (!ipf_recover(pf, recovery_file_size))
+                goto out;
+
+            if (ipf_check_recovery_needed(pf)) {
+                DEBUG_PF("%s: file recovery attempted but failed", path);
+                pf->last_error = PF_STATUS_RECOVERY_NEEDED;
+                goto out;
+            }
+
+            DEBUG_PF("%s: file recovery completed", path);
+        }
     } else {
         if (!ipf_init_new_file(pf, path))
             goto out;
@@ -1126,12 +1300,15 @@ void pf_set_callbacks(pf_read_f read_f, pf_write_f write_f, pf_fsync_f fsync_f,
 }
 
 pf_status_t pf_open(pf_handle_t handle, const char* path, uint64_t underlying_size,
-                    pf_file_mode_t mode, bool create, const pf_key_t* key, pf_context_t** context) {
+                    pf_file_mode_t mode, bool create, const pf_key_t* key,
+                    pf_handle_t recovery_file_handle, uint64_t recovery_file_size,
+                    bool try_recover, pf_context_t** context) {
     if (!g_initialized)
         return PF_STATUS_UNINITIALIZED;
 
     pf_status_t status;
-    *context = ipf_open(path, mode, create, handle, underlying_size, key, &status);
+    *context = ipf_open(path, mode, create, handle, underlying_size, key, recovery_file_handle,
+                        recovery_file_size, try_recover, &status); // recovery args forwarded as-is
     return status;
 }
 

diff --git a/common/src/protected_files/protected_files.h b/common/src/protected_files/protected_files.h
@@ -210,21 +210,26 @@ void pf_set_callbacks(pf_read_f read_f, pf_write_f write_f, pf_fsync_f fsync_f,
 const char* pf_strerror(int err);
 
 /*!
- * \brief Open a protected file.
- *
- * \param      handle           Open underlying file handle.
- * \param      path             Path to the file. If NULL and \p create is false, don't check path
- *                              for validity.
- * \param      underlying_size  Underlying file size.
- * \param      mode             Access mode.
- * \param      create           Overwrite file contents if true.
- * \param      key              Wrap key.
- * \param[out] context          PF context for later calls.
+ * \brief Open a protected file, optionally checking for and performing file recovery.
+ *
+ * \param      handle                Open underlying file handle.
+ * \param      path                  Path to the file. If NULL and \p create is false, don't check path
+ *                                   for validity.
+ * \param      underlying_size       Underlying file size.
+ * \param      mode                  Access mode.
+ * \param      create                Overwrite file contents if true.
+ * \param      key                   Wrap key.
+ * \param      recovery_file_handle  (optional) Underlying recovery file handle.
+ * \param      recovery_file_size    Recovery file size.
+ * \param      try_recover           Whether to check for and perform file recovery if needed.
+ * \param[out] context               PF context for later calls.
  *
  * \returns PF status.
  */
 pf_status_t pf_open(pf_handle_t handle, const char* path, uint64_t underlying_size,
-                    pf_file_mode_t mode, bool create, const pf_key_t* key, pf_context_t** context);
+                    pf_file_mode_t mode, bool create, const pf_key_t* key,
+                    pf_handle_t recovery_file_handle, uint64_t recovery_file_size,
+                    bool try_recover, pf_context_t** context);
 
 /*!
  * \brief Close a protected file and commit all changes to disk.

diff --git a/common/src/protected_files/protected_files_format.h b/common/src/protected_files/protected_files_format.h
@@ -56,6 +56,7 @@ typedef struct {
     uint8_t    minor_version;
     pf_nonce_t metadata_key_nonce;
     pf_mac_t   metadata_mac; /* GCM mac */
+    uint8_t    has_pending_write; /* flag for file recovery */
 } metadata_plaintext_t;
 
 typedef struct {
@@ -95,6 +96,11 @@ typedef struct {
 } encrypted_node_t;
 static_assert(sizeof(encrypted_node_t) == PF_NODE_SIZE, "sizeof(encrypted_node_t)");
 
+typedef struct {
+    uint64_t physical_node_number; /* index of the node in the main file */
+    uint8_t bytes[PF_NODE_SIZE];   /* raw bytes of that node; the struct is written to disk as is */
+} recovery_node_t;
+static_assert(sizeof(recovery_node_t) == sizeof(uint64_t) + PF_NODE_SIZE, "recovery_node_t size");
 static_assert(sizeof(mht_node_t) == sizeof(data_node_t), "sizes of MHT and data nodes differ");
 
 // Data struct that wraps the 4KB encrypted-node buffer (bounce buffer) and the corresponding 4KB

diff --git a/common/src/protected_files/protected_files_internal.h b/common/src/protected_files/protected_files_internal.h
@@ -17,6 +17,9 @@ struct pf_context {
     pf_file_mode_t mode;           // read-only, write-only or read-write
     bool need_writing;             // whether file was modified and thus needs writing to storage
 
+    pf_handle_t host_recovery_file_handle;  // opaque recovery file handle (e.g. PAL handle) used by
+                                            // callbacks
+
     pf_status_t file_status;       // PF_STATUS_SUCCESS, PF_STATUS_CRYPTO_ERROR, etc.
     pf_status_t last_error;        // FIXME: unclear why this is needed