From 5588b7342e6540af815cad309b20530f0a3eae3f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bastian=20M=C3=A4user?=
Date: Mon, 26 Feb 2024 17:09:13 +0100
Subject: [PATCH] Require Unique Disk names throughout all Storage pools.

---
 README.md |  4 ++++
 crossover | 44 +++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 43 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 2cbba63..e90e8f0 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,10 @@ For the Destination Cluster you need to copy your ssh-key to the first host in t
 
 Currently preflight checks don't include the check for enough resources in the destination cluster. Check beforehand that you don't exceed the maximum safe size of ceph in the destination cluster.
 
+## Unique Disk names
+
+There are cases where the source VM has disks on different Ceph pools. In theory, disks on different pools can carry identical image names. Since all disk images are migrated to a single destination pool, the image names must be unique. The preflight checks detect this, skip the affected VMs and issue a warning. To solve this, give the images unique names, such as vm-100-disk-0, vm-100-disk-1 and so on; `rbd mv` will help you.
+
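+For example (a rough sketch; pool and VM names here are placeholders, adjust them to your setup): if VM 100 carries both `ceph-a/vm-100-disk-0` and `ceph-b/vm-100-disk-0`, rename one of the images, preferably while the VM is powered off:
+
+```
+# rename the colliding image so its name is unique across all pools
+rbd mv ceph-b/vm-100-disk-0 ceph-b/vm-100-disk-1
+```
+
+Afterwards update the matching disk entry in the VM config so it references the new image name.
+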
 ## Some words about Snapshot consistency and what qemu-guest-agent can do for you
 
 Bear in mind, that when taking a snapshot of a running VM, it's basically like if you have a server which gets pulled away from the Power. Often this is not cathastrophic as the next fsck will try to fix Filesystem Issues, but in the worst case this could leave you with a severely damaged Filesystem, or even worse, half written Inodes which were in-flight when the power failed lead to silent data corruption. To overcome these things, we have the qemu-guest-agent to improve the consistency of the Filesystem while taking a snapshot. It won't leave you a clean filesystem, but it sync()'s outstanding writes and halts all i/o until the snapshot is complete. Still, there might me issues on the Application layer. Databases processes might have unwritten data in memory, which is the most common case. Here you have the opportunity to do additional tuning, and use hooks to tell your vital processes things to do prio and post freezes.

diff --git a/crossover b/crossover
index d260264..9d6eacd 100755
--- a/crossover
+++ b/crossover
@@ -363,6 +363,28 @@ function get_disks_from_config(){
     echo "$disks"
 }
 
+function check_unique_disk_config() { # prints how many more disk entries a VM config has than unique image names (0 = all unique)
+    local file_config=$1
+    disks=$(while read -r line; do
+        [[ "$line" == "" ]] && break
+        echo "$line"
+    done < "$file_config" | \
+        grep -P '^(?:((?:efidisk|virtio|ide|scsi|sata|mp)\d+)|rootfs): ' | \
+        grep -v -P 'cdrom|none' | \
+        grep -v -P 'backup=0' | \
+        awk '{ split($0,a,","); split(a[1],b," "); print b[2]}'| wc -l)
+    uniquedisks=$(while read -r line; do
+        [[ "$line" == "" ]] && break
+        echo "$line"
+    done < "$file_config" | \
+        grep -P '^(?:((?:efidisk|virtio|ide|scsi|sata|mp)\d+)|rootfs): ' | \
+        grep -v -P 'cdrom|none' | \
+        grep -v -P 'backup=0' | \
+        awk '{ split($0,a,","); split(a[1],b," "); print b[2]}'|cut -d ':' -f 2 | sort -nr | uniq | wc -l)
+    difference=$(expr $disks - $uniquedisks) # total disk entries minus unique image names
+    echo "$difference"
+}
+
 function log(){
     local level=$1
     shift 1
@@ -436,6 +458,8 @@ function mirror() {
     local -i endjob
     local -i vmcount=0
     local -i diskcount=0
+    local -i vmdiskcount=0
+    local -i skipped_vm_count=0
     local -i startdowntime
     local -i enddowntime
     local -i ga_ping
@@ -491,17 +515,23 @@ function mirror() {
         log error "Preflight check: Destination RBD-Pool $opt_pool does not exist."
         end_process 255
     fi
-
     for vm_id in $svmids; do
        file_config="$PVE_NODES/${pvnode[$vm_id]}/$QEMU/$vm_id.conf"
+        if [[ $(check_unique_disk_config "$file_config") -ge 1 ]]; then
+            log error "VM $vm_id - Preflight check: VM $vm_id has duplicate disk image names - skipping to next VM. See the 'Unique Disk names' section in the README to learn how to avoid this."
+            (( skipped_vm_count++ ))
+            continue
+        fi
        if ! exist_file "$file_config"; then
            log error "VM $vm_id - Preflight check: VM $vm_id does not exist on source cluster [$scluster] - skipping to next VM."
+            (( skipped_vm_count++ ))
            continue
        fi
        ga_ping=$(gaping "$vm_id")
        log debug "ga_ping: $ga_ping"
        if [ "$ga_ping" -eq 255 ] ; then #vm running but no qemu-guest-agent answering
            log error "VM $vm_id - Preflight check: VM $vm_id on source cluster [$scluster] has no qemu-guest-agent running - skipping to next VM."
+            (( skipped_vm_count++ ))
            continue
        fi
        (( vmcount++ ))
@@ -594,17 +624,18 @@ function mirror() {
        fi
        for disk in $(get_disks_from_config "$file_config"); do
            (( diskcount++ ))
-            log debug "VMID: $vm_id Disk: $disk DESTVMID: $dvmid"
+            (( vmdiskcount++ ))
            src_image_spec=$(get_image_spec "$disk")
+            log debug "src_image_spec: $src_image_spec"
            [ -z "$src_image_spec" ] && continue
            dst_image_spec=$(echo $src_image_spec | sed -r -e "s/(.*\/[a-zA-Z0-9]+\-)([0-9]+)(\-[a-zA-Z0-9]+\-[0-9]+)/\1$dvmid\3/")
            [ -z "$dst_image_spec" ] && continue
            [[ $disk =~ $recephimg ]]
-            src_image_pool_pve=${BASH_REMATCH[1]}
+#            src_image_pool_pve=${BASH_REMATCH[1]}
            src_image_pool=$(lookupcephpool "localhost" ${BASH_REMATCH[1]})
            src_image_name=${BASH_REMATCH[2]}
            [[ $dst_image_spec =~ ^.*\/(.*)$ ]]
-            dst_image_name=${BASH_REMATCH[1]}-$src_image_pool_pve
+            dst_image_name=${BASH_REMATCH[1]} #-$src_image_pool_pve
            dst_image_pool=$(lookupcephpool $opt_destination $opt_pool)
            dst_data_pool=$(lookupdatapool $opt_destination $opt_pool)
            if [ -n "$dst_data_pool" ]; then
@@ -689,6 +720,7 @@ function mirror() {
            do_run "$cmd"
        fi
        unset basets
+        vmdiskcount=0
     done
     if [ $opt_keepdlock -eq 0 ]; then
        ssh root@${dstpvnode[$dvmid]} qm unlock $dvmid
@@ -711,12 +743,14 @@ function mirror() {
     if [ "$perf_ss_failed" -gt 0 ]; then disp_perf_ss_failed="$(echored $perf_ss_failed)"; else disp_perf_ss_failed="$(echogreen $perf_ss_failed)"; fi
     if [ "$perf_full_failed" -gt 0 ]; then disp_perf_full_failed="$(echored $perf_full_failed)"; else disp_perf_full_failed="$(echogreen $perf_full_failed)"; fi
     if [ "$perf_diff_failed" -gt 0 ]; then disp_perf_diff_failed="$(echored $perf_diff_failed)"; else disp_perf_diff_failed="$(echogreen $perf_diff_failed)"; fi
+    if [ "$skipped_vm_count" -gt 0 ]; then disp_skipped_vm_count="$(echored $skipped_vm_count)"; else disp_skipped_vm_count="$(echogreen $skipped_vm_count)"; fi
     log info "VM Freeze OK/failed.......: $perf_freeze_ok/$disp_perf_freeze_failed"
     log info "RBD Snapshot OK/failed....: $perf_ss_ok/$disp_perf_ss_failed"
     log info "RBD export-full OK/failed.: $perf_full_ok/$disp_perf_full_failed"
     log info "RBD export-diff OK/failed.: $perf_diff_ok/$disp_perf_diff_failed"
     log info "Full xmitted..............: $(human_readable $perf_bytes_full)"
     log info "Differential Bytes .......: $(human_readable $perf_bytes_diff)"
+    log info "Skipped VMs ..............: $disp_skipped_vm_count"
     if [ -n "$opt_influx_api_url" ]; then
        log info "VM $vm_id - Logging Job summary to InfluxDB: $opt_influx_api_url"
        influxlp="$opt_influx_summary_metrics,jobname=$opt_influx_jobname perf_bytes_diff=$perf_bytes_diff""i,perf_bytes_full=$perf_bytes_full""i,perf_bytes_total=$perf_bytes_total""i,perf_diff_failed=$perf_diff_failed""i,perf_diff_ok=$perf_diff_ok""i,perf_freeze_failed=$perf_freeze_failed""i,perf_freeze_ok=$perf_freeze_ok""i,perf_full_failed=$perf_full_failed""i,perf_full_ok=$perf_full_ok""i,perf_ss_failed=$perf_ss_failed""i,perf_ss_ok=$perf_ss_ok""i,perf_vm_running=$perf_vm_running""i,perf_vm_stopped=$perf_vm_stopped""i"
@@ -832,7 +866,7 @@ function rewriteconfig(){
     else
        sedcmd='sed -e /^$/,$d'
     fi
-    cat "$oldconfig" | sed -r -e "s/^(efidisk|virtio|ide|scsi|sata|mp)([0-9]+):\s([a-zA-Z0-9]+):(.*)-([0-9]+)-disk-([0-9]+).*,(.*)$/\1\2: $newpool:\4-$newvmid-disk-\6-\3,\7/g" | $sedcmd | sed -e '/^$/,$d' | sed -e '/ide[0-9]:.*-cloudinit,media=cdrom.*/d' | grep -v "^parent:\s.*$" | ssh "$dst" "cat - >$newconfig"
+    cat "$oldconfig" | sed -r -e "s/^(efidisk|virtio|ide|scsi|sata|mp)([0-9]+):\s([a-zA-Z0-9]+):(.*)-([0-9]+)-disk-([0-9]+).*,(.*)$/\1\2: $newpool:\4-$newvmid-disk-\6,\7/g" | $sedcmd | sed -e '/^$/,$d' | sed -e '/ide[0-9]:.*-cloudinit,media=cdrom.*/d' | grep -v "^parent:\s.*$" | ssh "$dst" "cat - >$newconfig"
 }
 
 function checkvmid(){