diff --git a/src/__phutil_library_map__.php b/src/__phutil_library_map__.php index ef1c410600..82341ac212 100644 --- a/src/__phutil_library_map__.php +++ b/src/__phutil_library_map__.php @@ -1983,8 +1983,10 @@ phutil_register_library_map(array( 'PhabricatorRepositoryEngine' => 'applications/repository/engine/PhabricatorRepositoryEngine.php', 'PhabricatorRepositoryGitCommitChangeParserWorker' => 'applications/repository/worker/commitchangeparser/PhabricatorRepositoryGitCommitChangeParserWorker.php', 'PhabricatorRepositoryGitCommitMessageParserWorker' => 'applications/repository/worker/commitmessageparser/PhabricatorRepositoryGitCommitMessageParserWorker.php', + 'PhabricatorRepositoryGraphCache' => 'applications/repository/graphcache/PhabricatorRepositoryGraphCache.php', 'PhabricatorRepositoryGraphStream' => 'applications/repository/daemon/PhabricatorRepositoryGraphStream.php', 'PhabricatorRepositoryListController' => 'applications/repository/controller/PhabricatorRepositoryListController.php', + 'PhabricatorRepositoryManagementCacheWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementCacheWorkflow.php', 'PhabricatorRepositoryManagementDeleteWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementDeleteWorkflow.php', 'PhabricatorRepositoryManagementDiscoverWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementDiscoverWorkflow.php', 'PhabricatorRepositoryManagementEditWorkflow' => 'applications/repository/management/PhabricatorRepositoryManagementEditWorkflow.php', @@ -4828,6 +4830,7 @@ phutil_register_library_map(array( 'PhabricatorRepositoryGitCommitMessageParserWorker' => 'PhabricatorRepositoryCommitMessageParserWorker', 'PhabricatorRepositoryGraphStream' => 'Phobject', 'PhabricatorRepositoryListController' => 'PhabricatorRepositoryController', + 'PhabricatorRepositoryManagementCacheWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 'PhabricatorRepositoryManagementDeleteWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 'PhabricatorRepositoryManagementDiscoverWorkflow' => 'PhabricatorRepositoryManagementWorkflow', 'PhabricatorRepositoryManagementEditWorkflow' => 'PhabricatorRepositoryManagementWorkflow', diff --git a/src/applications/cache/PhabricatorCaches.php b/src/applications/cache/PhabricatorCaches.php index 666329788f..64193a579d 100644 --- a/src/applications/cache/PhabricatorCaches.php +++ b/src/applications/cache/PhabricatorCaches.php @@ -9,6 +9,53 @@ final class PhabricatorCaches { return PhabricatorEnv::getEnvConfig('phabricator.cache-namespace'); } + private static function newStackFromCaches(array $caches) { + $caches = self::addNamespaceToCaches($caches); + $caches = self::addProfilerToCaches($caches); + return id(new PhutilKeyValueCacheStack()) + ->setCaches($caches); + } + + +/* -( Repository Graph Cache )--------------------------------------------- */ + + + public static function getRepositoryGraphL1Cache() { + static $cache; + if (!$cache) { + $caches = self::buildRepositoryGraphL1Caches(); + $cache = self::newStackFromCaches($caches); + } + return $cache; + } + + private static function buildRepositoryGraphL1Caches() { + $caches = array(); + + $apc = new PhutilKeyValueCacheAPC(); + if ($apc->isAvailable()) { + $caches[] = $apc; + } + + return $caches; + } + + public static function getRepositoryGraphL2Cache() { + static $cache; + if (!$cache) { + $caches = self::buildRepositoryGraphL2Caches(); + $cache = self::newStackFromCaches($caches); + } + return $cache; + } + + private static function buildRepositoryGraphL2Caches() { + $caches = array(); + $caches[] = new PhabricatorKeyValueDatabaseCache(); + return $caches; + } + + /* -( Setup Cache )-------------------------------------------------------- */ @@ -30,10 +77,7 @@ final class PhabricatorCaches { static $cache; if (!$cache) { $caches = self::buildSetupCaches(); - $caches = self::addNamespaceToCaches($caches); - $caches = self::addProfilerToCaches($caches); - $cache = id(new PhutilKeyValueCacheStack()) - ->setCaches($caches); + $cache = self::newStackFromCaches($caches); } return $cache; } diff --git a/src/applications/repository/engine/PhabricatorRepositoryDiscoveryEngine.php b/src/applications/repository/engine/PhabricatorRepositoryDiscoveryEngine.php index 0f8485c0ff..5b7233ce5c 100644 --- a/src/applications/repository/engine/PhabricatorRepositoryDiscoveryEngine.php +++ b/src/applications/repository/engine/PhabricatorRepositoryDiscoveryEngine.php @@ -575,6 +575,12 @@ final class PhabricatorRepositoryDiscoveryEngine } $parent_ids[] = $parent_map[$parent]; } + } else { + // Write an explicit 0 so we can distinguish between "really no + // parents" and "data not available". + if (!$repository->isSVN()) { + $parent_ids = array(0); + } } $commit->openTransaction(); diff --git a/src/applications/repository/graphcache/PhabricatorRepositoryGraphCache.php b/src/applications/repository/graphcache/PhabricatorRepositoryGraphCache.php new file mode 100644 index 0000000000..303b5b83f2 --- /dev/null +++ b/src/applications/repository/graphcache/PhabricatorRepositoryGraphCache.php @@ -0,0 +1,386 @@ + -- + * + * ...routinely takes several hundred milliseconds, and equivalent requests + * often take longer in Mercurial. + * + * Unfortunately, this operation is fundamental to rendering a repository for + * the web, and essentially everything else that's slow can be reduced to this + * plus some trivial work afterward. Making this fast is desirable and powerful, + * and allows us to make other things fast by expressing them in terms of this + * query. + * + * Because the query is fundamentally a graph query, it isn't easy to express + * in a reasonable way in MySQL, and we can't do round trips to the server to + * walk the graph without incurring huge performance penalties. + * + * However, the total amount of data in the graph is relatively small. By + * caching it in chunks and keeping it in APC, we can reasonably load and walk + * the graph in PHP quickly. + * + * For more context, see T2683. + * + * Structure of the Cache + * ====================== + * + * The cache divides commits into buckets (see @{method:getBucketSize}). To + * walk the graph, we pull a commit's bucket. The bucket is a map from commit + * IDs to a list of parents and changed paths, separated by `null`. For + * example, a bucket might look like this: + * + * array( + * 1 => array(0, null, 17, 18), + * 2 => array(1, null, 4), + * // ... + * ) + * + * This means that commit ID 1 has parent commit 0 (a special value meaning + * no parents) and affected path IDs 17 and 18. Commit ID 2 has parent commit 1, + * and affected path 4. + * + * This data structure attempts to balance compactness, ease of construction, + * simplicity of cache semantics, and lookup performance. In the average case, + * it appears to do a reasonable job at this. + * + * @task query Querying the Graph Cache + * @task cache Cache Internals + */ +final class PhabricatorRepositoryGraphCache { + + +/* -( Querying the Graph Cache )------------------------------------------- */ + + + /** + * Search the graph cache for the most modification to a path. + * + * @param int The commit ID to search ancestors of. + * @param int The path ID to search for changes to. + * @param float Maximum number of seconds to spend trying to satisfy this + * query using the graph cache. By default, `0.5` (500ms). + * @return mixed Commit ID, or `null` if no ancestors exist, or `false` if + * the graph cache was unable to determine the answer. + * @task query + */ + public function loadLastModifiedCommitID($commit_id, $path_id, $time = 0.5) { + $commit_id = (int)$commit_id; + $path_id = (int)$path_id; + + $bucket_data = null; + $data_key = null; + $seen = array(); + + $t_start = microtime(true); + $iterations = 0; + while (true) { + $bucket_key = $this->getBucketKey($commit_id); + + if (($data_key != $bucket_key) || $bucket_data === null) { + $bucket_data = $this->getBucketData($bucket_key); + $data_key = $bucket_key; + } + + if (empty($bucket_data[$commit_id])) { + // Rebuild the cache bucket, since the commit might be a very recent + // one that we'll pick up by rebuilding. + + $bucket_data = $this->getBucketData($bucket_key, $bucket_data); + if (empty($bucket_data[$commit_id])) { + // A rebuild didn't help. This can occur legitimately if the commit + // is new and hasn't parsed yet. + return false; + } + + // Otherwise, the rebuild gave us the data, so we can keep going. + } + + // Sanity check so we can survive and recover from bad data. + if (isset($seen[$commit_id])) { + phlog(pht('Unexpected infinite loop in RepositoryGraphCache!')); + return false; + } else { + $seen[$commit_id] = true; + } + + // `$data` is a list: the commit's parent IDs, followed by `null`, + // followed by the modified paths in ascending order. We figure out the + // first parent first, then check if the path was touched. If the path + // was touched, this is the commit we're after. If not, walk backward + // in the tree. + + $items = $bucket_data[$commit_id]; + $size = count($items); + + // Walk past the parent information. + $parent_id = null; + for ($ii = 0; ; ++$ii) { + if ($items[$ii] === null) { + break; + } + if ($parent_id === null) { + $parent_id = $items[$ii]; + } + } + + // Look for a modification to the path. + for (; $ii < $size; ++$ii) { + $item = $items[$ii]; + if ($item > $path_id) { + break; + } + if ($item === $path_id) { + return $commit_id; + } + } + + if ($parent_id) { + $commit_id = $parent_id; + + // Periodically check if we've spent too long looking for a result + // in the cache, and return so we can fall back to a VCS operation. This + // keeps us from having a degenerate worst case if, e.g., the cache + // is cold and we need to inspect a very large number of blocks + // to satisfy the query. + + if (((++$iterations) % 64) === 0) { + $t_end = microtime(true); + if (($t_end - $t_start) > $time) { + return false; + } + } + continue; + } + + // If we have an explicit 0, that means this commit really has no parents. + // Usually, it is the first commit in the repository. + if ($parent_id === 0) { + return null; + } + + // If we didn't find a parent, the parent data isn't available. We fail + // to find an answer in the cache and fall back to querying the VCS. + return false; + } + } + + +/* -( Cache Internals )---------------------------------------------------- */ + + + /** + * Get the bucket key for a given commit ID. + * + * @param int Commit ID. + * @return int Bucket key. + * @task cache + */ + private function getBucketKey($commit_id) { + return (int)floor($commit_id / $this->getBucketSize()); + } + + + /** + * Get the cache key for a given bucket key (from @{method:getBucketKey}). + * + * @param int Bucket key. + * @return string Cache key. + * @task cache + */ + private function getBucketCacheKey($bucket_key) { + static $prefix; + + if ($prefix === null) { + $self = get_class($this); + $size = $this->getBucketSize(); + $prefix = "{$self}:{$size}:1:"; + } + + return $prefix.$bucket_key; + } + + + /** + * Get the number of items per bucket. + * + * @return int Number of items to store per bucket. + * @task cache + */ + private function getBucketSize() { + return 4096; + } + + + /** + * Retrieve or build a graph cache bucket from the cache. + * + * Normally, this operates as a readthrough cache call. It can also be used + * to force a cache update by passing the existing data to `$rebuild_data`. + * + * @param int Bucket key, from @{method:getBucketKey}. + * @param mixed Current data, to force a cache rebuild of this bucket. + * @return array Data from the cache. + * @task cache + */ + private function getBucketData($bucket_key, $rebuild_data = null) { + $cache_key = $this->getBucketCacheKey($bucket_key); + + // TODO: This cache stuff could be handled more gracefully, but the + // database cache currently requires values to be strings and needs + // some tweaking to support this as part of a stack. Our cache semantics + // here are also unusual (not purely readthrough) because this cache is + // appendable. + + $cache_level1 = PhabricatorCaches::getRepositoryGraphL1Cache(); + $cache_level2 = PhabricatorCaches::getRepositoryGraphL2Cache(); + if ($rebuild_data === null) { + $bucket_data = $cache_level1->getKey($cache_key); + if ($bucket_data) { + return $bucket_data; + } + + $bucket_data = $cache_level2->getKey($cache_key); + if ($bucket_data) { + $unserialized = @unserialize($bucket_data); + if ($unserialized) { + // Fill APC if we got a database hit but missed in APC. + $cache_level1->setKey($cache_key, $unserialized); + return $unserialized; + } + } + } + + if (!is_array($rebuild_data)) { + $rebuild_data = array(); + } + + $bucket_data = $this->rebuildBucket($bucket_key, $rebuild_data); + + // Don't bother writing the data if we didn't update anything. + if ($bucket_data !== $rebuild_data) { + $cache_level2->setKey($cache_key, serialize($bucket_data)); + $cache_level1->setKey($cache_key, $bucket_data); + } + + return $bucket_data; + } + + + /** + * Rebuild a cache bucket, amending existing data if avialable. + * + * @param int Bucket key, from @{method:getBucketKey}. + * @param array Existing bucket data. + * @return array Rebuilt bucket data. + * @task cache + */ + private function rebuildBucket($bucket_key, array $current_data) { + $bucket_min = ($bucket_key * $this->getBucketSize()); + $bucket_max = ($bucket_min + $this->getBucketSize()) - 1; + + // We need to reload all of the commits in the bucket because there is + // no guarantee that they'll get parsed in order, so we can fill large + // commit IDs before small ones. Later on, we'll ignore the commits we + // already know about. + + $table_commit = new PhabricatorRepositoryCommit(); + $table_repository = new PhabricatorRepository(); + $conn_r = $table_commit->establishConnection('r'); + + // Find all the Git and Mercurial commits in the block which have completed + // change import. We can't fill the cache accurately for commits which have + // not completed change import, so just pretend we don't know about them. + // In these cases, we will will ultimately fall back to VCS queries. + + $commit_rows = queryfx_all( + $conn_r, + 'SELECT c.id FROM %T c + JOIN %T r ON c.repositoryID = r.id AND r.versionControlSystem IN (%Ls) + WHERE c.id BETWEEN %d AND %d + AND (c.importStatus & %d) = %d', + $table_commit->getTableName(), + $table_repository->getTableName(), + array( + PhabricatorRepositoryType::REPOSITORY_TYPE_GIT, + PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL, + ), + $bucket_min, + $bucket_max, + PhabricatorRepositoryCommit::IMPORTED_CHANGE, + PhabricatorRepositoryCommit::IMPORTED_CHANGE); + + // If we don't have any data, just return the existing data. + if (!$commit_rows) { + return $current_data; + } + + // Remove the commits we already have data for. We don't need to rebuild + // these. If there's nothing left, return the existing data. + + $commit_ids = ipull($commit_rows, 'id', 'id'); + $commit_ids = array_diff_key($commit_ids, $current_data); + + if (!$commit_ids) { + return $current_data; + } + + // Find all the path changes for the new commits. + $path_changes = queryfx_all( + $conn_r, + 'SELECT commitID, pathID FROM %T + WHERE commitID IN (%Ld) + AND (isDirect = 1 OR changeType = %d)', + PhabricatorRepository::TABLE_PATHCHANGE, + $commit_ids, + DifferentialChangeType::TYPE_CHILD); + $path_changes = igroup($path_changes, 'commitID'); + + // Find all the parents for the new commits. + $parents = queryfx_all( + $conn_r, + 'SELECT childCommitID, parentCommitID FROM %T + WHERE childCommitID IN (%Ld) + ORDER BY id ASC', + PhabricatorRepository::TABLE_PARENTS, + $commit_ids); + $parents = igroup($parents, 'childCommitID'); + + // Build the actual data for the cache. + foreach ($commit_ids as $commit_id) { + $parent_ids = array(); + if (isset($parents[$commit_id])) { + foreach ($parents[$commit_id] as $row) { + $parent_ids[] = (int)$row['parentCommitID']; + } + } + + if (isset($path_changes[$commit_id])) { + $path_ids = $path_changes[$commit_id]; + foreach ($path_ids as $key => $path_id) { + $path_ids[$key] = (int)$path_id['pathID']; + } + sort($path_ids); + } else { + $path_ids = array(); + } + + $value = $parent_ids; + $value[] = null; + foreach ($path_ids as $path_id) { + $value[] = $path_id; + } + + $current_data[$commit_id] = $value; + } + + return $current_data; + } + +} diff --git a/src/applications/repository/management/PhabricatorRepositoryManagementCacheWorkflow.php b/src/applications/repository/management/PhabricatorRepositoryManagementCacheWorkflow.php new file mode 100644 index 0000000000..82de8492ec --- /dev/null +++ b/src/applications/repository/management/PhabricatorRepositoryManagementCacheWorkflow.php @@ -0,0 +1,92 @@ +setName('cache') + ->setExamples( + "**cache** [__options__] --commit __commit__ --path __path__") + ->setSynopsis(pht('Manage the repository graph cache.')) + ->setArguments( + array( + array( + 'name' => 'commit', + 'param' => 'commit', + 'help' => pht('Specify a commit to look up.'), + ), + array( + 'name' => 'path', + 'param' => 'path', + 'help' => pht('Specify a path to look up.'), + ), + )); + } + + public function execute(PhutilArgumentParser $args) { + + $commit_name = $args->getArg('commit'); + if ($commit_name === null) { + throw new PhutilArgumentUsageException( + pht('Specify a commit to look up with `--commit`.')); + } + $commit = $this->loadNamedCommit($commit_name); + + $path_name = $args->getArg('path'); + if ($path_name === null) { + throw new PhutilArgumentUsageException( + pht('Specify a path to look up with `--path`.')); + } + + $path_map = id(new DiffusionPathIDQuery(array($path_name))) + ->loadPathIDs(); + if (empty($path_map[$path_name])) { + throw new PhutilArgumentUsageException( + pht('Path "%s" is not known to Phabricator.', $path_name)); + } + $path_id = $path_map[$path_name]; + + $graph_cache = new PhabricatorRepositoryGraphCache(); + + $t_start = microtime(true); + $cache_result = $graph_cache->loadLastModifiedCommitID( + $commit->getID(), + $path_id); + $t_end = microtime(true); + + $console = PhutilConsole::getConsole(); + + $console->writeOut( + "%s\n", + pht('Query took %s ms.', new PhutilNumber(1000 * ($t_end - $t_start)))); + + if ($cache_result === false) { + $console->writeOut( + "%s\n", + pht('Not found in graph cache.')); + } else if ($cache_result === null) { + $console->writeOut( + "%s\n", + pht('Path not modified in any ancestor commit.')); + } else { + $last = id(new DiffusionCommitQuery()) + ->setViewer($this->getViewer()) + ->withIDs(array($cache_result)) + ->executeOne(); + if (!$last) { + throw new Exception(pht('Cache returned bogus result!')); + } + + $console->writeOut( + "%s\n", + pht( + 'Path was last changed at %s.', + $commit->getRepository()->formatCommitName( + $last->getcommitIdentifier()))); + } + + return 0; + } + +} diff --git a/src/applications/repository/management/PhabricatorRepositoryManagementParentsWorkflow.php b/src/applications/repository/management/PhabricatorRepositoryManagementParentsWorkflow.php index 1b2eee4d04..fd49df59a0 100644 --- a/src/applications/repository/management/PhabricatorRepositoryManagementParentsWorkflow.php +++ b/src/applications/repository/management/PhabricatorRepositoryManagementParentsWorkflow.php @@ -116,12 +116,20 @@ final class PhabricatorRepositoryManagementParentsWorkflow } $sql = array(); - foreach ($parents as $parent) { + if (!$parents) { + // Write an explicit 0 to indicate "no parents" instead of "no data". $sql[] = qsprintf( $conn_w, - '(%d, %d)', - $map[$child], - $map[$parent]); + '(%d, 0)', + $map[$child]); + } else { + foreach ($parents as $parent) { + $sql[] = qsprintf( + $conn_w, + '(%d, %d)', + $map[$child], + $map[$parent]); + } } $commit_table->openTransaction(); diff --git a/src/applications/repository/management/PhabricatorRepositoryManagementWorkflow.php b/src/applications/repository/management/PhabricatorRepositoryManagementWorkflow.php index 218b7e36bc..504772dcb2 100644 --- a/src/applications/repository/management/PhabricatorRepositoryManagementWorkflow.php +++ b/src/applications/repository/management/PhabricatorRepositoryManagementWorkflow.php @@ -32,6 +32,15 @@ abstract class PhabricatorRepositoryManagementWorkflow return null; } + return $this->loadNamedCommits($names); + } + + protected function loadNamedCommit($name) { + $map = $this->loadNamedCommits(array($name)); + return $map[$name]; + } + + protected function loadNamedCommits(array $names) { $query = id(new DiffusionCommitQuery()) ->setViewer($this->getViewer()) ->withIdentifiers($names); @@ -49,4 +58,5 @@ abstract class PhabricatorRepositoryManagementWorkflow return $map; } + }