6 Commits

Author SHA1 Message Date
Bastian
a11dd53d4d wip 2024-10-16 21:36:08 +02:00
Bastian
e3419b45ce Merge remote-tracking branch 'origin/main' into ft-uniquenames 2024-10-16 21:30:54 +02:00
Bastian
587392e06d Add TPM Disk to migration 2024-10-15 12:53:00 +02:00
Bastian Mäuser
813c54760d WIP 2024-02-26 17:25:46 +01:00
Bastian Mäuser
5588b7342e Require Unique Disk names throughout all Storage pools. 2024-02-26 17:09:13 +01:00
Bastian Mäuser
2f985df07d Added option, do specify VMs to process using Proxmox UI Tags 2024-02-26 15:32:23 +01:00
2 changed files with 67 additions and 11 deletions

View File

@@ -232,6 +232,10 @@ For the Destination Cluster you need to copy your ssh-key to the first host in t
Currently preflight checks don't include the check for enough resources in the destination cluster. Check beforehand that you don't exceed the maximum safe size of ceph in the destination cluster.
## Unique Disk names
There are cases, when the Source VM has Disks on different ceph pools. Now, in theory you can have identical image names for different disks. Since all disk images are migrated to one destination pool, they need to be unique. This tool detects this in Preflight checks, and skips these VMs and issues a warning. To solve this, give them unique names, like vm-100-disk-0, vm,100-disk-1 and so on. `rbd mv` will help you.
## Some words about Snapshot consistency and what qemu-guest-agent can do for you
Bear in mind, that when taking a snapshot of a running VM, it's basically like if you have a server which gets pulled away from the Power. Often this is not cathastrophic as the next fsck will try to fix Filesystem Issues, but in the worst case this could leave you with a severely damaged Filesystem, or even worse, half written Inodes which were in-flight when the power failed lead to silent data corruption. To overcome these things, we have the qemu-guest-agent to improve the consistency of the Filesystem while taking a snapshot. It won't leave you a clean filesystem, but it sync()'s outstanding writes and halts all i/o until the snapshot is complete. Still, there might me issues on the Application layer. Databases processes might have unwritten data in memory, which is the most common case. Here you have the opportunity to do additional tuning, and use hooks to tell your vital processes things to do prio and post freezes.

View File

@@ -19,7 +19,7 @@ declare opt_influx_summary_metrics='crossover_jobs'
name=$(basename "$0")
# readonly variables
declare -r NAME=$name
declare -r VERSION=0.8
declare -r VERSION=0.9
declare -r PROGNAME=${NAME%.*}
declare -r PVE_DIR="/etc/pve"
declare -r PVE_NODES="$PVE_DIR/nodes"
@@ -69,6 +69,7 @@ declare opt_snapshot_prefix='mirror-'
declare opt_rewrite=''
declare opt_pool='rbd'
declare opt_sshcipher='aes128-gcm@openssh.com,aes128-cbc'
declare opt_tag=''
declare -i opt_prefix_id
declare opt_exclude_vmids=''
declare -i opt_debug=0
@@ -120,6 +121,7 @@ Commands:
Options:
--sshcipher SSH Cipher to use for transfer (default: aes128-gcm@openssh.com,aes128-cbc)
--tag Include all VMs with a specific tag set in the Proxmox UI (if set, implies vmid=all)
--vmid The source+target ID of the VM/CT, comma separated (eg. --vmid=100:100,101:101), or all for all
--prefixid Prefix for VMID's on target System [optional]
--excludevmids Exclusde VM IDs when using --vmid==all
@@ -156,7 +158,7 @@ function parse_opts(){
local args
args=$(getopt \
--options '' \
--longoptions=sshcipher:,vmid:,prefixid:,excludevmids:,destination:,pool:,keeplocal:,keepremote:,rewrite:,influxurl:,influxorg:,influxtoken:,influxbucket:,jobname:,mail:,online,migrate,nolock,keep-slock,keep-dlock,overwrite,dry-run,noconfirm,debug,syslog \
--longoptions=sshcipher:,tag:,vmid:,prefixid:,excludevmids:,destination:,pool:,keeplocal:,keepremote:,rewrite:,influxurl:,influxorg:,influxtoken:,influxbucket:,jobname:,mail:,online,migrate,nolock,keep-slock,keep-dlock,overwrite,dry-run,noconfirm,debug,syslog \
--name "$PROGNAME" \
-- "$@") \
|| end_process 128
@@ -166,6 +168,7 @@ function parse_opts(){
while true; do
case "$1" in
--sshcipher) opt_sshcipher=$2; shift 2;;
--tag) opt_tag=$2; shift 2;;
--vmid) opt_vm_ids=$2; shift 2;;
--prefixid) opt_prefix_id=$2; shift 2;;
--excludevmids) opt_exclude_vmids=$2; shift 2;;
@@ -207,7 +210,6 @@ function parse_opts(){
log info "============================================"
fi
[ -z "$opt_vm_ids" ] && { log info "VM id is not set."; end_process 1; }
[ -z "$opt_influx_jobname" ] && { log info "Jobname is not set."; end_process 1; }
@@ -230,6 +232,15 @@ function parse_opts(){
end_process 255
fi
if [ -n "$opt_tag" ] && [ -n "$opt_vm_ids" ] && [ "$opt_vm_ids" != "all" ]; then
log error "You can't use --tag and --vmid at the same time"
end_process 255
fi
[ -n "$opt_tag" ] && [ -z $opt_vm_ids ] && opt_vm_ids="all"
[ -z "$opt_vm_ids" ] && { log info "VM id is not set."; end_process 1; }
if [ "$opt_vm_ids" = "all" ]; then
local all=''
local data=''
@@ -237,6 +248,7 @@ function parse_opts(){
local ids=''
all=$(get_vm_ids "$QEMU_CONF_CLUSTER/*$EXT_CONF" "$LXC_CONF_CLUSTER/*$EXT_CONF")
log debug "all: $all"
all=$(echo "$all" | tr ',' "\n")
opt_exclude_vmids=$(echo "$opt_exclude_vmids" | tr ',' "\n")
for id in $all; do
@@ -257,7 +269,7 @@ function parse_opts(){
vm_ids=$(echo "$opt_vm_ids" | tr ',' "\n")
fi
fi
log debug "vm_ids: $vm_ids"
}
human_readable() {
@@ -319,7 +331,9 @@ function get_vm_ids(){
while [ $# -gt 0 ]; do
for conf in $1; do
[ ! -e "$conf" ] && break
if [ -n "$opt_tag" ] && ! grep -qE "^tags:\s.*$opt_tag(;|$)" $conf; then
continue
fi
conf=$(basename "$conf")
[ "$data" != '' ] && data="$data,"
data="$data${conf%.*}"
@@ -341,7 +355,7 @@ function get_disks_from_config(){
[[ "$line" == "" ]] && break
echo "$line"
done < "$file_config" | \
grep -P '^(?:((?:efidisk|virtio|ide|scsi|sata|mp)\d+)|rootfs): ' | \
grep -P '^(?:((?:efidisk|virtio|ide|scsi|sata|mp|tpmstate)\d+)|rootfs): ' | \
grep -v -P 'cdrom|none' | \
grep -v -P 'backup=0' | \
awk '{ split($0,a,","); split(a[1],b," "); print b[2]}')
@@ -349,6 +363,30 @@ function get_disks_from_config(){
echo "$disks"
}
function check_unique_disk_config() {
local file_config=$1
disks=$(while read -r line; do
[[ "$line" == "" ]] && break
echo "$line"
done < "$file_config" | \
grep -P '^(?:((?:efidisk|virtio|ide|scsi|sata|mp|tpmstate)\d+)|rootfs): ' | \
grep -v -P 'cdrom|none' | \
grep -v -P 'backup=0' | \
awk '{ split($0,a,","); split(a[1],b," "); print b[2]}'| wc -l)
echo disks
uniquedisks=$(while read -r line; do
[[ "$line" == "" ]] && break
echo "$line"
done < "$file_config" | \
grep -P '^(?:((?:efidisk|virtio|ide|scsi|sata|mp|tpmstate)\d+)|rootfs): ' | \
grep -v -P 'cdrom|none' | \
grep -v -P 'backup=0' | \
awk '{ split($0,a,","); split(a[1],b," "); print b[2]}'|cut -d ':' -f 2 | sort -nr | uniq | wc -l)
# TBD: ^(vm|ct)-([0-9]+)-([a-z]+)-[\d]+.*$
difference=$(expr $disks - $uniquedisks)
echo "$difference"
}
function log(){
local level=$1
shift 1
@@ -422,6 +460,8 @@ function mirror() {
local -i endjob
local -i vmcount=0
local -i diskcount=0
local -i vmdiskcount=0
local -i skipped_vm_count=0
local -i startdowntime
local -i enddowntime
local -i ga_ping
@@ -477,17 +517,25 @@ function mirror() {
log error "Preflight check: Destination RBD-Pool $opt_pool does not exist."
end_process 255
fi
for vm_id in $svmids; do
file_config="$PVE_NODES/${pvnode[$vm_id]}/$QEMU/$vm_id.conf"
check_unique_disk_config "$file_config"
end_process 255
if [[ $(check_unique_disk_config "$file_config") -ge 1 ]]; then
log error "VM $vm_id - Preflight check: VM $vm_id has duplicate disk entries - skipping to next VM. Check Documentation to learn how to avoid this."
(( skipped_vm_count++ ))
continue
fi
if ! exist_file "$file_config"; then
log error "VM $vm_id - Preflight check: VM $vm_id does not exist on source cluster [$scluster] - skipping to next VM."
(( skipped_vm_count++ ))
continue
fi
ga_ping=$(gaping "$vm_id")
log debug "ga_ping: $ga_ping"
if [ "$ga_ping" -eq 255 ] ; then #vm running but no qemu-guest-agent answering
log error "VM $vm_id - Preflight check: VM $vm_id on source cluster [$scluster] has no qemu-guest-agent running - skipping to next VM."
(( skipped_vm_count++ ))
continue
fi
(( vmcount++ ))
@@ -580,17 +628,18 @@ function mirror() {
fi
for disk in $(get_disks_from_config "$file_config"); do
(( diskcount++ ))
log debug "VMID: $vm_id Disk: $disk DESTVMID: $dvmid"
(( vmdiskcount++ ))
src_image_spec=$(get_image_spec "$disk")
log debug "src_image_spec: $src_image_spec"
[ -z "$src_image_spec" ] && continue
dst_image_spec=$(echo $src_image_spec | sed -r -e "s/(.*\/[a-zA-Z0-9]+\-)([0-9]+)(\-[a-zA-Z0-9]+\-[0-9]+)/\1$dvmid\3/")
[ -z "$dst_image_spec" ] && continue
[[ $disk =~ $recephimg ]]
src_image_pool_pve=${BASH_REMATCH[1]}
# src_image_pool_pve=${BASH_REMATCH[1]}
src_image_pool=$(lookupcephpool "localhost" ${BASH_REMATCH[1]})
src_image_name=${BASH_REMATCH[2]}
[[ $dst_image_spec =~ ^.*\/(.*)$ ]]
dst_image_name=${BASH_REMATCH[1]}-$src_image_pool_pve
dst_image_name=${BASH_REMATCH[1]} #-$src_image_pool_pve
dst_image_pool=$(lookupcephpool $opt_destination $opt_pool)
dst_data_pool=$(lookupdatapool $opt_destination $opt_pool)
if [ -n "$dst_data_pool" ]; then
@@ -675,6 +724,7 @@ function mirror() {
do_run "$cmd"
fi
unset basets
vmdiskcount=0
done
if [ $opt_keepdlock -eq 0 ]; then
ssh root@${dstpvnode[$dvmid]} qm unlock $dvmid
@@ -697,12 +747,14 @@ function mirror() {
if [ "$perf_ss_failed" -gt 0 ]; then disp_perf_ss_failed="$(echored $perf_ss_failed)"; else disp_perf_ss_failed="$(echogreen $perf_ss_failed)"; fi
if [ "$perf_full_failed" -gt 0 ]; then disp_perf_full_failed="$(echored $perf_full_failed)"; else disp_perf_full_failed="$(echogreen $perf_full_failed)"; fi
if [ "$perf_diff_failed" -gt 0 ]; then disp_perf_diff_failed="$(echored $perf_diff_failed)"; else disp_perf_diff_failed="$(echogreen $perf_diff_failed)"; fi
if [ "$skipped_vm_count" -gt 0 ]; then disp_skipped_vm_count="$(echored $skipped_vm_count)"; else disp_skipped_vm_count="$(echogreen $skipped_vm_count)"; fi
log info "VM Freeze OK/failed.......: $perf_freeze_ok/$disp_perf_freeze_failed"
log info "RBD Snapshot OK/failed....: $perf_ss_ok/$disp_perf_ss_failed"
log info "RBD export-full OK/failed.: $perf_full_ok/$disp_perf_full_failed"
log info "RBD export-diff OK/failed.: $perf_diff_ok/$disp_perf_diff_failed"
log info "Full xmitted..............: $(human_readable $perf_bytes_full)"
log info "Differential Bytes .......: $(human_readable $perf_bytes_diff)"
log info "Skipped VMs ..............: $disp_skipped_vm_count"
if [ -n "$opt_influx_api_url" ]; then
log info "VM $vm_id - Logging Job summary to InfluxDB: $opt_influx_api_url"
influxlp="$opt_influx_summary_metrics,jobname=$opt_influx_jobname perf_bytes_diff=$perf_bytes_diff""i,perf_bytes_full=$perf_bytes_full""i,perf_bytes_total=$perf_bytes_total""i,perf_diff_failed=$perf_diff_failed""i,perf_diff_ok=$perf_diff_ok""i,perf_freeze_failed=$perf_freeze_failed""i,perf_freeze_ok=$perf_freeze_ok""i,perf_full_failed=$perf_full_failed""i,perf_full_ok=$perf_full_ok""i,perf_ss_failed=$perf_ss_failed""i,perf_ss_ok=$perf_ss_ok""i,perf_vm_running=$perf_vm_running""i,perf_vm_stopped=$perf_vm_stopped""i"
@@ -818,7 +870,7 @@ function rewriteconfig(){
else
sedcmd='sed -e /^$/,$d'
fi
cat "$oldconfig" | sed -r -e "s/^(efidisk|virtio|ide|scsi|sata|mp)([0-9]+):\s([a-zA-Z0-9]+):(.*)-([0-9]+)-disk-([0-9]+).*,(.*)$/\1\2: $newpool:\4-$newvmid-disk-\6-\3,\7/g" | $sedcmd | sed -e '/^$/,$d' | sed -e '/ide[0-9]:.*-cloudinit,media=cdrom.*/d' | grep -v "^parent:\s.*$" | ssh "$dst" "cat - >$newconfig"
cat "$oldconfig" | sed -r -e "s/^(efidisk|virtio|ide|scsi|sata|mp)([0-9]+):\s([a-zA-Z0-9]+):(.*)-([0-9]+)-disk-([0-9]+).*,(.*)$/\1\2: $newpool:\4-$newvmid-disk-\6,\7/g" | $sedcmd | sed -e '/^$/,$d' | sed -e '/ide[0-9]:.*-cloudinit,media=cdrom.*/d' | grep -v "^parent:\s.*$" | ssh "$dst" "cat - >$newconfig"
}
function checkvmid(){