From a1baedbd9a08ca2d41b81eefeaf4b3a4d214ab8e Mon Sep 17 00:00:00 2001 From: epriestley Date: Tue, 13 Feb 2018 10:46:14 -0800 Subject: [PATCH] Lock resources briefly while acquiring leases on them to prevent acquiring near-death resources Summary: Depends on D19078. Ref T13073. Currently, there is a narrow window where we can acquire a resource after a reclaim has started against it. To prevent this, briefly lock resources before acquiring them and make sure they're still good. If a resource isn't good, throw the lease back in the pool. Test Plan: This is tricky. You need: - Hoax blueprint with limits and a rule where leases of a given "flavor" can only be satisfied by resources of the same flavor. - Reduce the 3-minute "wait before resources can be released" to 3 seconds. - Limit Hoaxes to 1. - Allocate one "cherry" flavored Hoax and release the lease. - Add a `sleep(15)` to `releaseResource()` in `DrydockResourceUpdateWorker`, after the `canReclaimResource()` check, with a `print`. Now: - Run `bin/phd debug task` in two windows. - Run `bin/drydock lease --type host --attributes flavor=banana` in a third window. - This will start to reclaim the existing "cherry" resource. Once one of the `phd` windows prints the "RECLAIMING" message run `bin/drydock lease --type host --attributes flavor=cherry` in a fourth window. - Before patch: the "cherry" lease acquired immediately, then was released and destroyed moments later. - After patch: the "cherry" lease yields. Subscribers: PHID-OPKG-gm6ozazyms6q6i22gyam Maniphest Tasks: T13073 Differential Revision: https://secure.phabricator.com/D19080 --- src/__phutil_library_map__.php | 2 + .../DrydockResourceLockException.php | 4 + .../drydock/storage/DrydockLease.php | 83 ++++++++++++++----- .../worker/DrydockLeaseUpdateWorker.php | 26 ++++-- 4 files changed, 91 insertions(+), 24 deletions(-) create mode 100644 src/applications/drydock/exception/DrydockResourceLockException.php diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index e26d68f72b..80c70fd98e 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -1113,6 +1113,7 @@ phutil_register_library_map(array( 'DrydockResourceDatasource' => 'applications/drydock/typeahead/DrydockResourceDatasource.php', 'DrydockResourceListController' => 'applications/drydock/controller/DrydockResourceListController.php', 'DrydockResourceListView' => 'applications/drydock/view/DrydockResourceListView.php', + 'DrydockResourceLockException' => 'applications/drydock/exception/DrydockResourceLockException.php', 'DrydockResourcePHIDType' => 'applications/drydock/phid/DrydockResourcePHIDType.php', 'DrydockResourceQuery' => 'applications/drydock/query/DrydockResourceQuery.php', 'DrydockResourceReclaimLogType' => 'applications/drydock/logtype/DrydockResourceReclaimLogType.php', @@ -6344,6 +6345,7 @@ phutil_register_library_map(array( 'DrydockResourceDatasource' => 'PhabricatorTypeaheadDatasource', 'DrydockResourceListController' => 'DrydockResourceController', 'DrydockResourceListView' => 'AphrontView', + 'DrydockResourceLockException' => 'Exception', 'DrydockResourcePHIDType' => 'PhabricatorPHIDType', 'DrydockResourceQuery' => 'DrydockQuery', 'DrydockResourceReclaimLogType' => 'DrydockLogType', diff --git a/src/applications/drydock/exception/DrydockResourceLockException.php b/src/applications/drydock/exception/DrydockResourceLockException.php new file mode 100644 index 0000000000..2b54538e08 --- /dev/null +++ b/src/applications/drydock/exception/DrydockResourceLockException.php @@ -0,0 +1,4 @@ +openTransaction(); + // Before we associate the lease with the resource, we lock the resource + // and reload it to make sure it is still pending or active. If we don't + // do this, the resource may have just been reclaimed. (Once we acquire + // the resource that stops it from being released, so we're nearly safe.) + + $resource_phid = $resource->getPHID(); + $hash = PhabricatorHash::digestForIndex($resource_phid); + $lock_key = 'drydock.resource:'.$hash; + $lock = PhabricatorGlobalLock::newLock($lock_key); try { - DrydockSlotLock::acquireLocks($this->getPHID(), $this->slotLocks); - $this->slotLocks = array(); - } catch (DrydockSlotLockException $ex) { - $this->killTransaction(); - - $this->logEvent( - DrydockSlotLockFailureLogType::LOGCONST, - array( - 'locks' => $ex->getLockMap(), - )); - - throw $ex; + $lock->lock(15); + } catch (Exception $ex) { + throw new DrydockResourceLockException( + pht( + 'Failed to acquire lock for resource ("%s") while trying to '. + 'acquire lease ("%s").', + $resource->getPHID(), + $this->getPHID())); } - $this - ->setResourcePHID($resource->getPHID()) - ->attachResource($resource) - ->setStatus($new_status) - ->save(); + $resource->reload(); - $this->saveTransaction(); + if (($resource->getStatus() !== DrydockResourceStatus::STATUS_ACTIVE) && + ($resource->getStatus() !== DrydockResourceStatus::STATUS_PENDING)) { + throw new DrydockAcquiredBrokenResourceException( + pht( + 'Trying to acquire lease ("%s") on a resource ("%s") in the '. + 'wrong status ("%s").', + $this->getPHID(), + $resource->getPHID(), + $resource->getStatus())); + } + + $caught = null; + try { + $this->openTransaction(); + + try { + DrydockSlotLock::acquireLocks($this->getPHID(), $this->slotLocks); + $this->slotLocks = array(); + } catch (DrydockSlotLockException $ex) { + $this->killTransaction(); + + $this->logEvent( + DrydockSlotLockFailureLogType::LOGCONST, + array( + 'locks' => $ex->getLockMap(), + )); + + throw $ex; + } + + $this + ->setResourcePHID($resource->getPHID()) + ->attachResource($resource) + ->setStatus($new_status) + ->save(); + + $this->saveTransaction(); + } catch (Exception $ex) { + $caught = $ex; + } + + $lock->unlock(); + + if ($caught) { + throw $caught; + } $this->isAcquired = true; diff --git a/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php b/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php index 55d9136cef..4a0cba8de2 100644 --- a/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php +++ b/src/applications/drydock/worker/DrydockLeaseUpdateWorker.php @@ -53,6 +53,7 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker { $lease ->setStatus(DrydockLeaseStatus::STATUS_PENDING) + ->setResourcePHID(null) ->save(); $lease->logEvent( @@ -301,23 +302,38 @@ final class DrydockLeaseUpdateWorker extends DrydockWorker { $resources = $this->rankResources($resources, $lease); $exceptions = array(); + $yields = array(); $allocated = false; foreach ($resources as $resource) { try { $this->acquireLease($resource, $lease); $allocated = true; break; + } catch (DrydockResourceLockException $ex) { + // We need to lock the resource to actually acquire it. If we aren't + // able to acquire the lock quickly enough, we can yield and try again + // later. + $yields[] = $ex; + } catch (DrydockAcquiredBrokenResourceException $ex) { + // If a resource was reclaimed or destroyed by the time we actually + // got around to acquiring it, we just got unlucky. We can yield and + // try again later. + $yields[] = $ex; } catch (Exception $ex) { $exceptions[] = $ex; } } if (!$allocated) { - throw new PhutilAggregateException( - pht( - 'Unable to acquire lease "%s" on any resource.', - $lease->getPHID()), - $exceptions); + if ($yields) { + throw new PhabricatorWorkerYieldException(15); + } else { + throw new PhutilAggregateException( + pht( + 'Unable to acquire lease "%s" on any resource.', + $lease->getPHID()), + $exceptions); + } } }