Pacemaker + DRBD + Xen: failback issues

Hi,

We have a two-node cluster running SLES 11 SP1 with Pacemaker + DRBD + Xen, with the latest patches installed.

Here’s the drbd.conf:
global {
    usage-count yes;
}
common {
    protocol C;
    disk {
        on-io-error detach;
        fencing resource-only;
        fencing resource-and-stonith;
    }
    syncer {
        rate 33M;
        al-extents 3389;
    }
    net {
        allow-two-primaries; # Enable this after initial testing
        cram-hmac-alg sha1;
        shared-secret "a6a0680c40bca2439dbe48343ddddcf4";
        after-sb-0pri discard-zero-changes;
        after-sb-1pri discard-secondary;
        after-sb-2pri disconnect;
    }
    startup {
        become-primary-on both;
    }
    handlers {
        fence-peer "/usr/lib/drbd/stonith_admin-fence-peer.sh";
        fence-peer "/usr/lib/drbd/crm-fence-peer.sh";
        after-resync-target "/usr/lib/drbd/crm-unfence-peer.sh";
        pri-on-incon-degr "echo b > /proc/sysrq-trigger";
        split-brain "/usr/lib/drbd/notify-split-brain.sh root";
    }
}
resource vmsvn {
    device /dev/drbd0;
    disk /dev/sdb;
    meta-disk internal;
    on xm01 {
        address 100.0.0.1:7788;
    }
    on xm02 {
        address 100.0.0.2:7788;
    }
}

resource srvsvn1 {
    protocol C;
    device /dev/drbd1;
    disk /dev/sdc;
    meta-disk internal;
    on xm01 {
        address 100.0.0.1:7789;
    }
    on xm02 {
        address 100.0.0.2:7789;
    }
}

resource srvsvn2 {
    protocol C;
    device /dev/drbd2;
    disk /dev/sdd;
    meta-disk internal;
    on xm01 {
        address 100.0.0.1:7790;
    }
    on xm02 {
        address 100.0.0.2:7790;
    }
}

resource vmconfig {
    protocol C;
    device /dev/drbd3;
    meta-disk internal;
    on xm01 {
        address 100.0.0.1:7791;
        disk /dev/vg_xm01/lv_xm01_vmconfig;
    }
    on xm02 {
        address 100.0.0.2:7791;
        disk /dev/vg_xm02/lv_xm02_vmconfig;
    }
}
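
For reference, the running configuration and resource state can be checked with the stock DRBD tools (nothing specific to our setup):

    drbdadm dump all     # re-parse /etc/drbd.conf and print the effective configuration
    cat /proc/drbd       # connection state, roles and disk states of all resources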

and here’s the pacemaker configuration:
node xm01
node xm02
primitive VMSVN ocf:heartbeat:Xen \
        meta target-role="Started" allow-migrate="true" is-managed="true" \
        operations $id="VMSVN-operations" \
        op monitor interval="30" timeout="30" \
        op start interval="0" timeout="60" \
        op stop interval="0" timeout="60" \
        op migrate_to interval="0" timeout="180" \
        params xmfile="/vmconfig/vmsvn"
primitive clvm ocf:lvm2:clvmd \
        operations $id="clvm-operations" \
        op monitor interval="10" timeout="20"
primitive dlm ocf:pacemaker:controld \
        operations $id="dlm-operations" \
        op monitor interval="10" timeout="20" start-delay="0"
primitive ipmi-stonith-xm01 stonith:external/ipmi \
        meta target-role="Started" is-managed="true" priority="10" \
        operations $id="ipmi-stonith-xm01-operations" \
        op monitor interval="15" timeout="15" start-delay="15" \
        params hostname="xm01" ipaddr="125.1.254.107" userid="administrator" passwd="*****" interface="lan"
primitive ipmi-stonith-xm02 stonith:external/ipmi \
        meta target-role="Started" is-managed="true" priority="9" \
        operations $id="ipmi-stonith-xm02-operations" \
        op monitor interval="15" timeout="15" start-delay="15" \
        params hostname="xm02" ipaddr="125.1.254.248" userid="administrator" passwd="*****" interface="lan"
primitive o2cb ocf:ocfs2:o2cb \
        operations $id="o2cb-operations" \
        op monitor interval="10" timeout="20"
primitive srvsvn1-drbd ocf:linbit:drbd \
        params drbd_resource="srvsvn1" \
        operations $id="srvsvn1-drbd-operations" \
        op monitor interval="20" role="Master" timeout="20" \
        op monitor interval="30" role="Slave" timeout="20" \
        op start interval="0" timeout="240" \
        op promote interval="0" timeout="90" \
        op demote interval="0" timeout="90" \
        op stop interval="0" timeout="100" \
        meta migration-threshold="10" failure-timeout="600"
primitive srvsvn2-drbd ocf:linbit:drbd \
        params drbd_resource="srvsvn2" \
        operations $id="srvsvn2-drbd-operations" \
        op monitor interval="20" role="Master" timeout="20" \
        op monitor interval="30" role="Slave" timeout="20" \
        op start interval="0" timeout="240" \
        op promote interval="0" timeout="90" \
        op demote interval="0" timeout="90" \
        op stop interval="0" timeout="100" \
        meta migration-threshold="10" failure-timeout="600"
primitive vg_svn ocf:heartbeat:LVM \
        params volgrpname="vg_svn"
primitive vmconfig ocf:linbit:drbd \
        operations $id="vmconfig-operations" \
        op monitor interval="20" role="Master" timeout="120" \
        op start interval="0" timeout="240" \
        op promote interval="0" timeout="90" \
        op demote interval="0" timeout="90" \
        op stop interval="0" timeout="100" \
        params drbd_resource="vmconfig" \
        meta migration-threshold="10" failure-timeout="600"
primitive vmconfig-pri ocf:heartbeat:Filesystem \
        operations $id="vmconfig-pri-operations" \
        op monitor interval="20" timeout="40" \
        params device="/dev/drbd3" directory="/vmconfig" fstype="ocfs2"
primitive vmsvn-drbd ocf:linbit:drbd \
        operations $id="vmsvn-drbd-operations" \
        op monitor interval="20" role="Master" timeout="20" \
        op monitor interval="30" role="Slave" timeout="20" \
        op start interval="0" timeout="240" \
        op promote interval="0" timeout="90" \
        op demote interval="0" timeout="90" \
        op stop interval="0" timeout="100" \
        params drbd_resource="vmsvn" \
        meta is-managed="true" migration-threshold="10" failure-timeout="600"
group init-group dlm o2cb clvm
group operaciones-group vg_svn
ms ms_drbd_srvsvn1 srvsvn1-drbd \
        meta resource-stickiness="100" master-max="2" notify="true" target-role="Started" priority="6"
ms ms_drbd_srvsvn2 srvsvn2-drbd \
        meta resource-stickiness="100" master-max="2" notify="true" target-role="Started" priority="5"
ms ms_drbd_vmconfig vmconfig \
        meta resource-stickiness="100" master-max="2" clone-max="2" notify="true" priority="8" target-role="Started"
ms ms_drbd_vmsvn vmsvn-drbd \
        meta resource-stickiness="100" master-max="2" notify="true" target-role="Started" priority="7"
clone init-clone init-group \
        meta interleave="true" target-role="Started" is-managed="true" priority="4"
clone operaciones-clone operaciones-group \
        meta target-role="Started" is-managed="true" priority="2" interleave="true"
clone vmconfig-clone vmconfig-pri \
        meta target-role="Started" priority="3" is-managed="true"
location location-stonith-xm01 ipmi-stonith-xm01 -inf: xm01
location location-stonith-xm02 ipmi-stonith-xm02 -inf: xm02
colocation colocacion : init-clone operaciones-clone vmconfig-clone VMSVN ms_drbd_vmconfig:Master ms_drbd_vmsvn:Master ms_drbd_srvsvn1:Master ms_drbd_srvsvn2:Master
order ordenamiento : ms_drbd_vmconfig:promote ms_drbd_vmsvn:promote ms_drbd_srvsvn1:promote ms_drbd_srvsvn2:promote init-clone:start operaciones-clone:start vmconfig-clone:start VMSVN:start
property $id="cib-bootstrap-options" \
        dc-version="1.1.5-5bd2b9154d7d9f86d7f56fe0a74072a5a6590c60" \
        cluster-infrastructure="openais" \
        expected-quorum-votes="2" \
        no-quorum-policy="ignore" \
        last-lrm-refresh="1330713370"
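
For what it's worth, the placement scores that decide where VMSVN runs can be dumped from the live CIB with the standard tools (ptest ships with this Pacemaker version, crm_simulate is the newer equivalent; exact flags may differ on your build):

    ptest -sL            # show the allocation scores computed from the live CIB
    crm_simulate -sL     # newer equivalent of ptest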

Here’s what I do:

  1. xm01 currently hosts the VMSVN virtual machine. I run “rcnetwork stop” on xm01.
  2. xm02 STONITHs xm01, as it should.
  3. VMSVN (the VM) fails over to xm02.
  4. xm01 is shut down (not rebooted… why? see the note after this list).
  5. I manually start xm01 again.
  6. xm01 comes back online.
  7. VMSVN shuts down cleanly on xm02 and starts on xm01.
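
About step 4: as far as I know, whether a fenced node gets rebooted or merely powered off is governed by the cluster-wide stonith-action property (default "reboot"), so that is one thing worth checking; the commands below are just an illustration, not taken from this cluster:

    crm configure show | grep -i stonith-action   # no output means the default ("reboot") is in effect
    crm configure property stonith-action=reboot  # request a reboot rather than a power-off when fencing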

Any ideas why the virtual machine stops and then starts again on the first node? I don’t have any stickiness!
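
A quick way to see which stickiness values the cluster is actually applying:

    crm configure show | grep -i stickiness   # lists every stickiness setting currently in the CIB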

Regards,
Daniel

Daniel,

we’ve been seeing resources get re-allocated between Pacemaker nodes, too (SLES 11 SP1 HAE), and got around it by setting the resource stickiness to INFINITY.
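
In crm shell syntax that was roughly the following (exact placement — a cluster-wide rsc_defaults versus a per-resource meta attribute — from memory, so treat it as a sketch):

    # Cluster-wide default: resources stay where they are unless something forces them to move
    crm configure rsc_defaults resource-stickiness=INFINITY

    # Alternatively, per resource:
    crm resource meta VMSVN set resource-stickiness INFINITY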

Yes, “INFINITY” is a bit rude… but “5” didn’t do the trick, and since we’ve seen no downsides so far, we just left it at the first value that tested successfully :wink:

Regards,
Jens