Require unique disk names across all storage pools.

2024-02-26 17:09:13 +01:00
parent 2f985df07d
commit 5588b7342e
2 changed files with 43 additions and 5 deletions


@@ -232,6 +232,10 @@ For the Destination Cluster you need to copy your ssh-key to the first host in t
Currently the preflight checks don't include a check for enough resources in the destination cluster. Check beforehand that you don't exceed the maximum safe size of Ceph in the destination cluster.
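For a rough manual check (a sketch; the `<source-pool>` placeholder is hypothetical), compare the provisioned size of the source images against the capacity the destination cluster reports:

```sh
# On a source node: provisioned vs. actually used size of all images
rbd du -p <source-pool>

# On a destination node: global and per-pool capacity overview
ceph df detail
```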
## Unique Disk names
There are cases when the source VM has disks on different Ceph pools. In theory, two such disks can have identical image names. Since all disk images are migrated to one destination pool, their names must be unique. This tool detects such collisions in the preflight checks, issues a warning, and skips the affected VMs. To solve this, give the images unique names, like vm-100-disk-0, vm-100-disk-1 and so on; `rbd mv` will help you.
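For example (pool and image names here are hypothetical), if the pools `ssd` and `hdd` both contain an image named `vm-100-disk-0`, rename one of them before migrating and update the disk reference in the VM config accordingly:

```sh
# Rename the image in pool "hdd" so both disks can coexist in the
# single destination pool after migration.
rbd mv hdd/vm-100-disk-0 hdd/vm-100-disk-1
```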
## Some words about Snapshot consistency and what qemu-guest-agent can do for you
Bear in mind that taking a snapshot of a running VM is basically like pulling the plug on a physical server. Often this is not catastrophic, as the next fsck will try to fix filesystem issues, but in the worst case this could leave you with a severely damaged filesystem, or even worse, half-written inodes that were in flight when the power failed can lead to silent data corruption. To mitigate this, we have the qemu-guest-agent to improve the consistency of the filesystem while taking a snapshot. It won't leave you a clean filesystem, but it sync()s outstanding writes and halts all I/O until the snapshot is complete. Still, there might be issues on the application layer. Database processes might have unwritten data in memory, which is the most common case. Here you have the opportunity to do additional tuning and use hooks to tell your vital processes what to do pre and post freeze.
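As an illustration, a minimal hook sketch, assuming the guest ships the stock `/etc/qemu/fsfreeze-hook` dispatcher that runs scripts from `/etc/qemu/fsfreeze-hook.d/` with `freeze`/`thaw` arguments (the PostgreSQL command is a placeholder for whatever your vital services need):

```sh
#!/bin/sh
# /etc/qemu/fsfreeze-hook.d/pgsql-checkpoint.sh (hypothetical example)
# qemu-guest-agent runs this with "freeze" right before the filesystems
# are frozen for the snapshot and with "thaw" right after they resume.
case "$1" in
    freeze)
        # Flush PostgreSQL's dirty buffers to disk so the snapshot
        # captures a consistent on-disk database state.
        su - postgres -c 'psql -c "CHECKPOINT;"'
        ;;
    thaw)
        : # nothing to undo after thaw in this sketch
        ;;
esac
```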


@@ -363,6 +363,28 @@ function get_disks_from_config(){
echo "$disks" echo "$disks"
} }
function check_unique_disk_config() {
    # Echoes the number of duplicate disk image names found in a VM
    # config file; 0 means all disk names are unique.
    local file_config=$1
    local disk_names disks uniquedisks
    # Read the config only up to the first empty line (snapshot sections
    # follow below), keep real disk entries (no cdrom/none devices, none
    # excluded via backup=0) and extract the "storage:image" reference.
    disk_names=$(while read -r line; do
        [[ "$line" == "" ]] && break
        echo "$line"
    done < "$file_config" | \
        grep -P '^(?:((?:efidisk|virtio|ide|scsi|sata|mp)\d+)|rootfs): ' | \
        grep -v -P 'cdrom|none' | \
        grep -v -P 'backup=0' | \
        awk '{ split($0,a,","); split(a[1],b," "); print b[2] }')
    disks=$(echo "$disk_names" | wc -l)
    # Strip the storage prefix; duplicate image names collapse under sort -u.
    uniquedisks=$(echo "$disk_names" | cut -d ':' -f 2 | sort -u | wc -l)
    echo $(( disks - uniquedisks ))
}
function log(){
local level=$1
shift 1
@@ -436,6 +458,8 @@ function mirror() {
local -i endjob
local -i vmcount=0
local -i diskcount=0
local -i vmdiskcount=0
local -i skipped_vm_count=0
local -i startdowntime
local -i enddowntime
local -i ga_ping
@@ -491,17 +515,23 @@ function mirror() {
log error "Preflight check: Destination RBD-Pool $opt_pool does not exist." log error "Preflight check: Destination RBD-Pool $opt_pool does not exist."
end_process 255 end_process 255
fi fi
for vm_id in $svmids; do for vm_id in $svmids; do
file_config="$PVE_NODES/${pvnode[$vm_id]}/$QEMU/$vm_id.conf" file_config="$PVE_NODES/${pvnode[$vm_id]}/$QEMU/$vm_id.conf"
if [[ $(check_unique_disk_config "$file_config") -ge 1 ]]; then
log error "VM $vm_id - Preflight check: VM $vm_id has duplicate disk entries - skipping to next VM. Check Documentation to learn how to avoid this."
(( skipped_vm_count++ ))
continue
fi
if ! exist_file "$file_config"; then
log error "VM $vm_id - Preflight check: VM $vm_id does not exist on source cluster [$scluster] - skipping to next VM."
(( skipped_vm_count++ ))
continue
fi
ga_ping=$(gaping "$vm_id")
log debug "ga_ping: $ga_ping"
if [ "$ga_ping" -eq 255 ] ; then #vm running but no qemu-guest-agent answering
log error "VM $vm_id - Preflight check: VM $vm_id on source cluster [$scluster] has no qemu-guest-agent running - skipping to next VM."
(( skipped_vm_count++ ))
continue
fi
(( vmcount++ ))
@@ -594,17 +624,18 @@ function mirror() {
fi
for disk in $(get_disks_from_config "$file_config"); do
(( diskcount++ ))
log debug "VMID: $vm_id Disk: $disk DESTVMID: $dvmid" (( vmdiskcount++ ))
src_image_spec=$(get_image_spec "$disk")
log debug "src_image_spec: $src_image_spec"
[ -z "$src_image_spec" ] && continue [ -z "$src_image_spec" ] && continue
dst_image_spec=$(echo $src_image_spec | sed -r -e "s/(.*\/[a-zA-Z0-9]+\-)([0-9]+)(\-[a-zA-Z0-9]+\-[0-9]+)/\1$dvmid\3/") dst_image_spec=$(echo $src_image_spec | sed -r -e "s/(.*\/[a-zA-Z0-9]+\-)([0-9]+)(\-[a-zA-Z0-9]+\-[0-9]+)/\1$dvmid\3/")
[ -z "$dst_image_spec" ] && continue [ -z "$dst_image_spec" ] && continue
[[ $disk =~ $recephimg ]] [[ $disk =~ $recephimg ]]
# src_image_pool_pve=${BASH_REMATCH[1]}
src_image_pool=$(lookupcephpool "localhost" ${BASH_REMATCH[1]})
src_image_name=${BASH_REMATCH[2]}
[[ $dst_image_spec =~ ^.*\/(.*)$ ]]
dst_image_name=${BASH_REMATCH[1]} #-$src_image_pool_pve
dst_image_pool=$(lookupcephpool $opt_destination $opt_pool)
dst_data_pool=$(lookupdatapool $opt_destination $opt_pool)
if [ -n "$dst_data_pool" ]; then
@@ -689,6 +720,7 @@ function mirror() {
do_run "$cmd" do_run "$cmd"
fi fi
unset basets unset basets
vmdiskcount=0
done
if [ $opt_keepdlock -eq 0 ]; then
ssh root@${dstpvnode[$dvmid]} qm unlock $dvmid
@@ -711,12 +743,14 @@ function mirror() {
if [ "$perf_ss_failed" -gt 0 ]; then disp_perf_ss_failed="$(echored $perf_ss_failed)"; else disp_perf_ss_failed="$(echogreen $perf_ss_failed)"; fi if [ "$perf_ss_failed" -gt 0 ]; then disp_perf_ss_failed="$(echored $perf_ss_failed)"; else disp_perf_ss_failed="$(echogreen $perf_ss_failed)"; fi
if [ "$perf_full_failed" -gt 0 ]; then disp_perf_full_failed="$(echored $perf_full_failed)"; else disp_perf_full_failed="$(echogreen $perf_full_failed)"; fi if [ "$perf_full_failed" -gt 0 ]; then disp_perf_full_failed="$(echored $perf_full_failed)"; else disp_perf_full_failed="$(echogreen $perf_full_failed)"; fi
if [ "$perf_diff_failed" -gt 0 ]; then disp_perf_diff_failed="$(echored $perf_diff_failed)"; else disp_perf_diff_failed="$(echogreen $perf_diff_failed)"; fi if [ "$perf_diff_failed" -gt 0 ]; then disp_perf_diff_failed="$(echored $perf_diff_failed)"; else disp_perf_diff_failed="$(echogreen $perf_diff_failed)"; fi
if [ "$skipped_vm_count" -gt 0 ]; then disp_skipped_vm_count="$(echored $skipped_vm_count)"; else disp_skipped_vm_count="$(echogreen $skipped_vm_count)"; fi
log info "VM Freeze OK/failed.......: $perf_freeze_ok/$disp_perf_freeze_failed" log info "VM Freeze OK/failed.......: $perf_freeze_ok/$disp_perf_freeze_failed"
log info "RBD Snapshot OK/failed....: $perf_ss_ok/$disp_perf_ss_failed" log info "RBD Snapshot OK/failed....: $perf_ss_ok/$disp_perf_ss_failed"
log info "RBD export-full OK/failed.: $perf_full_ok/$disp_perf_full_failed" log info "RBD export-full OK/failed.: $perf_full_ok/$disp_perf_full_failed"
log info "RBD export-diff OK/failed.: $perf_diff_ok/$disp_perf_diff_failed" log info "RBD export-diff OK/failed.: $perf_diff_ok/$disp_perf_diff_failed"
log info "Full xmitted..............: $(human_readable $perf_bytes_full)" log info "Full xmitted..............: $(human_readable $perf_bytes_full)"
log info "Differential Bytes .......: $(human_readable $perf_bytes_diff)" log info "Differential Bytes .......: $(human_readable $perf_bytes_diff)"
log info "Skipped VMs ..............: $disp_skipped_vm_count"
if [ -n "$opt_influx_api_url" ]; then if [ -n "$opt_influx_api_url" ]; then
log info "VM $vm_id - Logging Job summary to InfluxDB: $opt_influx_api_url" log info "VM $vm_id - Logging Job summary to InfluxDB: $opt_influx_api_url"
influxlp="$opt_influx_summary_metrics,jobname=$opt_influx_jobname perf_bytes_diff=$perf_bytes_diff""i,perf_bytes_full=$perf_bytes_full""i,perf_bytes_total=$perf_bytes_total""i,perf_diff_failed=$perf_diff_failed""i,perf_diff_ok=$perf_diff_ok""i,perf_freeze_failed=$perf_freeze_failed""i,perf_freeze_ok=$perf_freeze_ok""i,perf_full_failed=$perf_full_failed""i,perf_full_ok=$perf_full_ok""i,perf_ss_failed=$perf_ss_failed""i,perf_ss_ok=$perf_ss_ok""i,perf_vm_running=$perf_vm_running""i,perf_vm_stopped=$perf_vm_stopped""i" influxlp="$opt_influx_summary_metrics,jobname=$opt_influx_jobname perf_bytes_diff=$perf_bytes_diff""i,perf_bytes_full=$perf_bytes_full""i,perf_bytes_total=$perf_bytes_total""i,perf_diff_failed=$perf_diff_failed""i,perf_diff_ok=$perf_diff_ok""i,perf_freeze_failed=$perf_freeze_failed""i,perf_freeze_ok=$perf_freeze_ok""i,perf_full_failed=$perf_full_failed""i,perf_full_ok=$perf_full_ok""i,perf_ss_failed=$perf_ss_failed""i,perf_ss_ok=$perf_ss_ok""i,perf_vm_running=$perf_vm_running""i,perf_vm_stopped=$perf_vm_stopped""i"
@@ -832,7 +866,7 @@ function rewriteconfig(){
else
sedcmd='sed -e /^$/,$d'
fi
cat "$oldconfig" | sed -r -e "s/^(efidisk|virtio|ide|scsi|sata|mp)([0-9]+):\s([a-zA-Z0-9]+):(.*)-([0-9]+)-disk-([0-9]+).*,(.*)$/\1\2: $newpool:\4-$newvmid-disk-\6-\3,\7/g" | $sedcmd | sed -e '/^$/,$d' | sed -e '/ide[0-9]:.*-cloudinit,media=cdrom.*/d' | grep -v "^parent:\s.*$" | ssh "$dst" "cat - >$newconfig" cat "$oldconfig" | sed -r -e "s/^(efidisk|virtio|ide|scsi|sata|mp)([0-9]+):\s([a-zA-Z0-9]+):(.*)-([0-9]+)-disk-([0-9]+).*,(.*)$/\1\2: $newpool:\4-$newvmid-disk-\6,\7/g" | $sedcmd | sed -e '/^$/,$d' | sed -e '/ide[0-9]:.*-cloudinit,media=cdrom.*/d' | grep -v "^parent:\s.*$" | ssh "$dst" "cat - >$newconfig"
}
function checkvmid(){