From 2a7ac8e3882fcdaa7ad92308ddae462ff37375ab Mon Sep 17 00:00:00 2001 From: epriestley Date: Wed, 7 Nov 2018 10:08:51 -0800 Subject: [PATCH] Make "bin/repository thaw" workflow more clear when devices are disabled Summary: Ref T13216. See PHI943. If autoscale lightning strikes all your servers at once and destroys them, the path to recovery can be unclear. You're "supposed" to: - demote all the devices; - disable the bindings; - bind the new servers; - put whatever working copies you can scrape up back on disk; - promote one of the new servers. However, the documentation is a bit misleading (it was sort of written with "you lost one or two devices" in mind, not "you lost every device") and demote-before-disable is unnecessary and slightly risky if servers come back online. There's also a missing guardrail before the promote step which lets you accidentally skip the demotion step and end up in a confusing state. Instead: - Add a guard rail: when you try to promote a new server, warn if inactive devices still have versions and tell the user to demote them. - Allow demotion of inactive devices: the order "disable, demote" is safer and more intuitive than "demote, disable" and there's no reason to require the unintuitive order. - Make the "cluster already has leaders" message more clear. - Make the documentation more clear. Test Plan: - Bound a repository to two devices. - Wrote to A to make it a leader, then disabled it (simulating a lightning strike). - Tried to promote B. Got a new, useful error ("demote A first"). - Demoted A (before: error about demoting inactive devices; now: works fine). - Promoted B. This worked. Reviewers: amckinley Reviewed By: amckinley Maniphest Tasks: T13216 Differential Revision: https://secure.phabricator.com/D19793 --- ...icatorRepositoryManagementThawWorkflow.php | 90 +++++++++++++------ .../user/cluster/cluster_repositories.diviner | 24 +++-- 2 files changed, 74 insertions(+), 40 deletions(-) diff --git a/src/applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php b/src/applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php index 71076314da..4c4eed85b9 100644 --- a/src/applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php +++ b/src/applications/repository/management/PhabricatorRepositoryManagementThawWorkflow.php @@ -120,33 +120,71 @@ final class PhabricatorRepositoryManagementThawWorkflow $repository->getDisplayName())); } - $bindings = $service->getActiveBindings(); - $bindings = mpull($bindings, null, 'getDevicePHID'); - if (empty($bindings[$device->getPHID()])) { - throw new PhutilArgumentUsageException( - pht( - 'Repository "%s" has no active binding to device "%s". Only '. - 'actively bound devices can be promoted or demoted.', - $repository->getDisplayName(), - $device->getName())); - } - - $versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions( - $repository->getPHID()); - - $versions = mpull($versions, null, 'getDevicePHID'); - $versions = array_select_keys($versions, array_keys($bindings)); - - if ($versions && $promote) { - throw new PhutilArgumentUsageException( - pht( - 'Unable to promote "%s" for repository "%s": the leaders for '. - 'this cluster are not ambiguous.', - $device->getName(), - $repository->getDisplayName())); - } - if ($promote) { + // You can only promote active devices. (You may demote active or + // inactive devices.) + $bindings = $service->getActiveBindings(); + $bindings = mpull($bindings, null, 'getDevicePHID'); + if (empty($bindings[$device->getPHID()])) { + throw new PhutilArgumentUsageException( + pht( + 'Repository "%s" has no active binding to device "%s". Only '. + 'actively bound devices can be promoted.', + $repository->getDisplayName(), + $device->getName())); + } + + $versions = PhabricatorRepositoryWorkingCopyVersion::loadVersions( + $repository->getPHID()); + $versions = mpull($versions, null, 'getDevicePHID'); + + // Before we promote, make sure there are no outstanding versions on + // devices with inactive bindings. If there are, you need to demote + // these first. + $inactive = array(); + foreach ($versions as $device_phid => $version) { + if (isset($bindings[$device_phid])) { + continue; + } + $inactive[$device_phid] = $version; + } + + if ($inactive) { + $handles = $viewer->loadHandles(array_keys($inactive)); + + $handle_list = iterator_to_array($handles); + $handle_list = mpull($handle_list, 'getName'); + $handle_list = implode(', ', $handle_list); + + throw new PhutilArgumentUsageException( + pht( + 'Repository "%s" has versions on inactive devices. Demote '. + '(or reactivate) these devices before promoting a new '. + 'leader: %s.', + $repository->getDisplayName(), + $handle_list)); + } + + // Now, make sure there are no outstanding versions on devices with + // active bindings. These also need to be demoted (or promoting is a + // mistake or already happened). + $active = array_select_keys($versions, array_keys($bindings)); + if ($active) { + $handles = $viewer->loadHandles(array_keys($active)); + + $handle_list = iterator_to_array($handles); + $handle_list = mpull($handle_list, 'getName'); + $handle_list = implode(', ', $handle_list); + + throw new PhutilArgumentUsageException( + pht( + 'Unable to promote "%s" for repository "%s" because this '. + 'cluster already has one or more unambiguous leaders: %s.', + $device->getName(), + $repository->getDisplayName(), + $handle_list)); + } + PhabricatorRepositoryWorkingCopyVersion::updateVersion( $repository->getPHID(), $device->getPHID(), diff --git a/src/docs/user/cluster/cluster_repositories.diviner b/src/docs/user/cluster/cluster_repositories.diviner index 8d764c2e3e..cdf2641933 100644 --- a/src/docs/user/cluster/cluster_repositories.diviner +++ b/src/docs/user/cluster/cluster_repositories.diviner @@ -414,28 +414,24 @@ present on the leaders but not present on the followers by examining the push logs. If you are comfortable discarding these changes, you can instruct Phabricator -that it can forget about the leaders in two ways: disable the service bindings -to all of the leader devices so they are no longer part of the cluster, or use -`bin/repository thaw` to `--demote` the leaders explicitly. +that it can forget about the leaders by doing this: -If you do this, **you will lose data**. Either action will discard any changes -on the affected leaders which have not replicated to other devices in the -cluster. + - Disable the service bindings to all of the leader devices so they are no + longer part of the cluster. + - Then, use `bin/repository thaw` to `--demote` the leaders explicitly. -To remove a device from the cluster, disable all of the bindings to it -in Almanac, using the web UI. - -{icon exclamation-triangle, color="red"} Any data which is only present on -the disabled device will be lost. - -To demote a device without removing it from the cluster, run this command: +To demote a device, run this command: ``` phabricator/ $ ./bin/repository thaw rXYZ --demote repo002.corp.net ``` {icon exclamation-triangle, color="red"} Any data which is only present on -**this** device will be lost. +the demoted device will be lost. + +If you do this, **you will lose unreplicated data**. You will discard any +changes on the affected leaders which have not replicated to other devices +in the cluster. Ambiguous Leaders