-
Notifications
You must be signed in to change notification settings - Fork 19
Add automatic snapshot feature for QEMU VMs #775
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,7 +1,11 @@ | ||||||
import logging | ||||||
import qmp | ||||||
from pydantic import BaseModel | ||||||
|
||||||
|
||||||
logger = logging.getLogger(__name__) | ||||||
|
||||||
|
||||||
class VmSevInfo(BaseModel): | ||||||
enabled: bool | ||||||
api_major: int | ||||||
|
@@ -74,3 +78,46 @@ def query_status(self) -> None: | |||||
""" | ||||||
# {'status': 'prelaunch', 'singlestep': False, 'running': False} | ||||||
return self.qmp_client.command("query-status") | ||||||
|
||||||
def create_snapshot(self, snapshot_name: str) -> bool:
    """
    Create a VM snapshot. This will snapshot the VM's RAM state and disks.

    ``savevm`` is a human-monitor (HMP) command and is not part of the QMP
    schema, so it is sent through QMP's ``human-monitor-command`` passthrough
    instead of being executed as a QMP command directly.

    :param snapshot_name: Name of the snapshot. Must be a single non-empty
        token because it is interpolated into an HMP command line.
    :return: True if successful, False otherwise
    """
    # A name with whitespace (or an empty one) would change the meaning of the
    # HMP command line, so refuse it before talking to QEMU.
    if not isinstance(snapshot_name, str) or snapshot_name.split() != [snapshot_name]:
        logger.error(f"Invalid snapshot name {snapshot_name!r} for VM {self.vm.vm_id}")
        return False

    try:
        logger.debug(f"Creating snapshot {snapshot_name} for VM {self.vm.vm_id}")
        output = self.qmp_client.command(
            "human-monitor-command",
            **{"command-line": f"savevm {snapshot_name}"},
        )
    except Exception as e:
        logger.error(f"Failed to create snapshot {snapshot_name} for VM {self.vm.vm_id}: {e}")
        return False

    # HMP reports failures as text in the command output, not as a QMP error.
    # NOTE(review): relies on `savevm` being silent on success - confirm
    # against the QEMU version deployed.
    message = str(output or "").strip()
    if message:
        logger.error(f"Failed to create snapshot {snapshot_name} for VM {self.vm.vm_id}: {message}")
        return False
    return True
|
||||||
def delete_snapshot(self, snapshot_name: str) -> bool:
    """
    Delete a VM snapshot.

    ``delvm`` is a human-monitor (HMP) command and is not part of the QMP
    schema, so it is sent through QMP's ``human-monitor-command`` passthrough
    instead of being executed as a QMP command directly.

    :param snapshot_name: Name of the snapshot to delete. Must be a single
        non-empty token because it is interpolated into an HMP command line.
    :return: True if successful, False otherwise
    """
    # A name with whitespace (or an empty one) would change the meaning of the
    # HMP command line, so refuse it before talking to QEMU.
    if not isinstance(snapshot_name, str) or snapshot_name.split() != [snapshot_name]:
        logger.error(f"Invalid snapshot name {snapshot_name!r} for VM {self.vm.vm_id}")
        return False

    try:
        logger.debug(f"Deleting snapshot {snapshot_name} for VM {self.vm.vm_id}")
        output = self.qmp_client.command(
            "human-monitor-command",
            **{"command-line": f"delvm {snapshot_name}"},
        )
    except Exception as e:
        logger.error(f"Failed to delete snapshot {snapshot_name} for VM {self.vm.vm_id}: {e}")
        return False

    # HMP reports failures as text in the command output, not as a QMP error.
    # NOTE(review): relies on `delvm` being silent on success - confirm
    # against the QEMU version deployed.
    message = str(output or "").strip()
    if message:
        logger.error(f"Failed to delete snapshot {snapshot_name} for VM {self.vm.vm_id}: {message}")
        return False
    return True
|
||||||
def list_snapshots(self) -> list[str]:
    """
    List the names of the internal snapshots stored on the VM's disk images.

    QMP has no ``query-snapshots`` command. Snapshot metadata is exposed per
    block device by ``query-block`` under ``inserted.image.snapshots``, so the
    names are collected from there. A VM snapshot appears on several disks
    under the same name, so names are de-duplicated, keeping first-seen order.

    :return: List of snapshot names; empty if there are none or the query failed
    """
    try:
        block_devices = self.qmp_client.command("query-block")
        # A dict is used as an ordered set.
        names: dict[str, None] = {}
        for device in block_devices or []:
            # Devices without inserted media have no "inserted" entry, and
            # images without internal snapshots have no "snapshots" entry.
            image = (device.get("inserted") or {}).get("image") or {}
            for snapshot in image.get("snapshots") or []:
                names.setdefault(snapshot["name"], None)
        return list(names)
    except Exception as e:
        logger.error(f"Failed to list snapshots for VM {self.vm.vm_id}: {e}")
        return []
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,121 @@ | ||||||||||
import asyncio
import logging
import threading
from time import sleep
from typing import Any

from aleph_message.models import ItemHash
from schedule import Job, Scheduler

from aleph.vm.conf import settings
from aleph.vm.controllers.firecracker.snapshots import CompressedDiskVolumeSnapshot
|
||||||||||
logger = logging.getLogger(__name__) | ||||||||||
|
||||||||||
|
||||||||||
def wrap_async_snapshot(vm):
    """Run the ``do_vm_snapshot`` coroutine for ``vm`` to completion via ``asyncio.run``."""
    snapshot_coroutine = do_vm_snapshot(vm)
    asyncio.run(snapshot_coroutine)
|
||||||||||
|
||||||||||
def run_threaded_snapshot(vm):
    """Start a new thread that snapshots ``vm``; return without waiting for it to finish."""
    threading.Thread(target=wrap_async_snapshot, args=(vm,)).start()
|
||||||||||
|
||||||||||
async def do_vm_snapshot(vm) -> CompressedDiskVolumeSnapshot:
    """
    Take one snapshot of ``vm`` by awaiting its ``create_snapshot()`` coroutine.

    :param vm: VM execution exposing ``vm_hash`` and an awaitable ``create_snapshot()``
    :return: The value returned by ``vm.create_snapshot()``
    :raises ValueError: If ``vm`` is not set, or if ``create_snapshot()`` raised a
        ``ValueError`` (the original error is chained as the cause)
    """
    # NOTE(review): the return annotation is the Firecracker snapshot type;
    # confirm that the QEMU VM's create_snapshot() really returns it.

    # Validate before the first attribute access. The previous `assert` ran
    # after `vm.vm_hash` had already been read and is stripped under `-O`.
    if not vm:
        msg = "VM execution not set"
        raise ValueError(msg)

    try:
        logger.debug(f"Starting new snapshot for QEMU VM {vm.vm_hash}")
        snapshot = await vm.create_snapshot()
        logger.debug(f"New snapshot for QEMU VM {vm.vm_hash} created successfully")
        return snapshot
    except ValueError as error:
        msg = "Failed to create QEMU VM snapshot"
        raise ValueError(msg) from error
|
||||||||||
|
||||||||||
def infinite_run_scheduler_jobs(scheduler: Scheduler, interval: float = 1) -> None:
    """
    Run the scheduler's pending jobs forever, polling every ``interval`` seconds.

    This function never returns and is meant to be used as a thread target.

    :param scheduler: Scheduler whose ``run_pending()`` is called on every iteration
    :param interval: Seconds to sleep between two polls (defaults to 1)
    """
    while True:
        try:
            scheduler.run_pending()
        except Exception:
            # A single failing job must not kill the polling loop, otherwise
            # every job on this scheduler silently stops running.
            logger.exception("Scheduled snapshot job failed")
        sleep(interval)
|
||||||||||
|
||||||||||
class QemuSnapshotExecution:
    """
    Periodic snapshot job for a single QEMU VM, registered on a shared scheduler.

    ``start()`` schedules ``run_threaded_snapshot`` for the VM every
    ``frequency`` minutes; ``stop()`` cancels that job.
    """

    vm_hash: ItemHash
    # The VM handed to the snapshot job (an AlephQemuInstance according to the
    # original annotation comment; that type is not imported in this module).
    execution: Any
    # Minutes between two snapshots.
    frequency: int
    _scheduler: Scheduler
    # The scheduled job, or None while no job is scheduled.
    _job: Job | None

    def __init__(
        self,
        scheduler: Scheduler,
        vm_hash: ItemHash,
        execution,
        frequency: int,
    ):
        self.vm_hash = vm_hash
        self.execution = execution
        self.frequency = frequency
        self._scheduler = scheduler
        # Set here so that stop() is safe even if start() was never called.
        self._job = None

    async def start(self) -> None:
        """Schedule the periodic snapshot job, replacing any job scheduled earlier."""
        logger.debug(f"Starting QEMU snapshots for VM {self.vm_hash} every {self.frequency} minutes")
        if self._job is not None:
            # Starting twice must not leave the previous job running untracked.
            self._scheduler.cancel_job(self._job)
        self._job = self._scheduler.every(self.frequency).minutes.do(run_threaded_snapshot, self.execution)

    async def stop(self) -> None:
        """Cancel the periodic snapshot job; do nothing if none is scheduled."""
        logger.debug(f"Stopping QEMU snapshots for VM {self.vm_hash}")
        if self._job is None:
            return
        self._scheduler.cancel_job(self._job)
        self._job = None
|
||||||||||
|
||||||||||
class QemuSnapshotManager:
    """
    Manage QEMU VM snapshots.

    Keeps one ``QemuSnapshotExecution`` per VM hash, all registered on a single
    scheduler. The scheduler only runs jobs once ``run_in_thread()`` has
    started its polling thread.
    """

    executions: dict[ItemHash, QemuSnapshotExecution]
    _scheduler: Scheduler

    def __init__(self):
        self.executions = {}
        self._scheduler = Scheduler()

    def run_in_thread(self) -> None:
        """Start the daemon thread that keeps running the scheduler's pending jobs."""
        job_thread = threading.Thread(
            target=infinite_run_scheduler_jobs,
            args=[self._scheduler],
            daemon=True,
            name="QemuSnapshotManager",
        )
        job_thread.start()

    async def start_for(self, vm, frequency: int | None = None) -> None:
        """
        Start periodic snapshots for ``vm``.

        :param vm: VM exposing ``support_snapshot`` and ``vm_hash``
        :param frequency: Minutes between snapshots; falls back to
            ``settings.SNAPSHOT_FREQUENCY``, then to 10
        :raises NotImplementedError: If the VM does not support snapshots
        """
        if not vm.support_snapshot:
            msg = "Snapshots are not supported for this VM type."
            raise NotImplementedError(msg)

        # Default to 10 minutes if not specified and settings value is 0
        default_frequency = frequency or settings.SNAPSHOT_FREQUENCY or 10

        vm_hash = vm.vm_hash

        # Cancel an already-running execution for this VM first; overwriting
        # the entry would leave its job scheduled with no way to stop it.
        previous_execution = self.executions.pop(vm_hash, None)
        if previous_execution is not None:
            await previous_execution.stop()

        snapshot_execution = QemuSnapshotExecution(
            scheduler=self._scheduler,
            vm_hash=vm_hash,
            execution=vm,
            frequency=default_frequency,
        )
        self.executions[vm_hash] = snapshot_execution
        await snapshot_execution.start()

    async def stop_for(self, vm_hash: ItemHash) -> None:
        """Stop periodic snapshots for ``vm_hash``; log a warning if none are registered."""
        try:
            snapshot_execution = self.executions.pop(vm_hash)
        except KeyError:
            logger.warning("Could not find snapshot task for QEMU instance %s", vm_hash)
            return

        await snapshot_execution.stop()

    async def stop_all(self) -> None:
        """Stop periodic snapshots for every registered VM."""
        # Iterate over a copy of the keys: stop_for() removes entries from the dict.
        await asyncio.gather(*(self.stop_for(vm_hash) for vm_hash in list(self.executions)))
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Shouldn't this function also do a |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.