phorge/src/applications/diffusion/query/lowlevel/DiffusionLowLevelCommitQuery.php
epriestley 785f3c98da Extract raw commit messages from Git more faithfully across Git versions
Summary:
Fixes T5028. Older versions of Git (apparently, from before 2010) did not provide a way to extract the raw body of a commit message from "git log", so we approximate it with "subject" and "wrapped body".

In newer versions of Git, the raw body can be extracted exactly.

Adjust how we extract messages based on the version of Git, and try to be more faithful to edge cases: particularly, be more careful to extract the correct number of trailing newlines.

Test Plan:
  - Added "var_dump()" + "die(1)" later in this method, then pushed various commit messages. Used "&& false" to force execution down the old path (either path should work in modern Git).
  - Observed more faithful extraction of messages, including a more faithful extraction of the number of trailing newlines. Extraction is fully faithful if we can go down the "%B" path, which we should be able to in nearly all modern cases.
  - Not all messages extract faithfully or consistently across the old and new versions, but the old extraction is destructive so this is likely about as close as we can realistically ever get.

Maniphest Tasks: T5028

Differential Revision: https://secure.phabricator.com/D21027
2020-02-24 12:37:45 -08:00

214 lines
6.1 KiB
PHP

<?php
/**
* Populate a @{class:DiffusionCommitRef} with information about a specific
* commit in a repository. This is a low-level query which talks directly to
* the underlying VCS.
*/
final class DiffusionLowLevelCommitQuery
extends DiffusionLowLevelQuery {
private $identifier;
/**
 * Select the commit this query should load.
 *
 * @param string $identifier Identifier of the commit; it is later handed to
 *   the VCS-specific loader as the revision to look up.
 * @return $this
 */
public function withIdentifier($identifier) {
  $this->identifier = $identifier;

  return $this;
}
/**
 * Load the commit ref for the configured identifier by dispatching to the
 * loader which matches the repository's version control system.
 *
 * @return DiffusionCommitRef Ref built by the VCS-specific loader.
 * @throws PhutilInvalidStateException If no identifier (or an empty one)
 *   was provided via withIdentifier().
 * @throws Exception If the repository type is not Git, Mercurial or
 *   Subversion.
 */
protected function executeQuery() {
  // Test for null explicitly: the property is null until withIdentifier()
  // is called, and passing null to strlen() is deprecated as of PHP 8.1.
  $identifier = $this->identifier;
  if ($identifier === null || !strlen($identifier)) {
    throw new PhutilInvalidStateException('withIdentifier');
  }

  $type = $this->getRepository()->getVersionControlSystem();
  switch ($type) {
    case PhabricatorRepositoryType::REPOSITORY_TYPE_GIT:
      $result = $this->loadGitCommitRef();
      break;
    case PhabricatorRepositoryType::REPOSITORY_TYPE_MERCURIAL:
      $result = $this->loadMercurialCommitRef();
      break;
    case PhabricatorRepositoryType::REPOSITORY_TYPE_SVN:
      $result = $this->loadSubversionCommitRef();
      break;
    default:
      throw new Exception(pht('Unsupported repository type "%s"!', $type));
  }

  return $result;
}
/**
 * Build a commit ref for a Git commit by parsing the output of "git log".
 *
 * The command emits NUL-separated fields in this order: encoding, committer
 * name, committer email, author name, author email, tree hash, author
 * timestamp, and then the commit message. On Git 1.7.2 and newer the
 * message is read with "%B" (raw body); otherwise it is approximated by
 * stitching "%s" (subject) and "%b" (wrapped body) back together.
 *
 * @return DiffusionCommitRef Ref populated with committer, author, hashes,
 *   author epoch and message.
 * @throws Exception If "git log" produced fewer fields than this method
 *   needs to read.
 */
private function loadGitCommitRef() {
  $repository = $this->getRepository();

  // See T5028. The "%B" (raw body) mode is not present in very old versions
  // of Git. Use "%s" and "%b" ("subject" and "wrapped body") as an
  // approximation.
  $git_binary = PhutilBinaryAnalyzer::getForBinary('git');
  $git_version = $git_binary->getBinaryVersion();

  // NOTE(review): presumably the version may be unavailable (null); confirm
  // against PhutilBinaryAnalyzer. Guarding here avoids passing null to
  // version_compare() and keeps the previous outcome for that case, which
  // is the conservative "%s" + "%b" path.
  $has_raw_body = ($git_version !== null) &&
    version_compare($git_version, '1.7.2', '>=');

  if ($has_raw_body) {
    $body_format = '%B';
    $split_body = false;
  } else {
    $body_format = '%s%x00%b';
    $split_body = true;
  }

  // Even though we pass --encoding here, git doesn't always succeed, so
  // we try a little harder, since git *does* tell us what the actual encoding
  // is correctly (unless it doesn't; encoding is sometimes empty).
  list($info) = $repository->execxLocalCommand(
    'log -n 1 --encoding=%s --format=%s %s --',
    'UTF-8',
    implode(
      '%x00',
      array(
        '%e',
        '%cn',
        '%ce',
        '%an',
        '%ae',
        '%T',
        '%at',
        $body_format,

        // The "git log" output includes a trailing newline. We want to
        // faithfully capture only the exact text of the commit message,
        // so include an explicit terminator: this makes sure the exact
        // body text is surrounded by "\0" characters.
        '~',
      )),
    $this->identifier);

  $parts = explode("\0", $info);
  $encoding = array_shift($parts);

  // After the encoding is shifted off, indexes 0-5 hold the identity, tree
  // and timestamp fields, and the message starts at index 6 (occupying two
  // slots when it was requested as subject + wrapped body). Fail with a
  // clear message instead of reading offsets which do not exist.
  $required = $split_body ? 8 : 7;
  if (count($parts) < $required) {
    throw new Exception(
      pht(
        'Expected at least %s fields in "git log" output for commit "%s", '.
        'but found %s.',
        $required,
        $this->identifier,
        count($parts)));
  }

  // Normalize every field to valid UTF-8, and represent empty fields as
  // null.
  foreach ($parts as $key => $part) {
    if ($encoding) {
      $part = phutil_utf8_convert($part, 'UTF-8', $encoding);
    }
    $parts[$key] = phutil_utf8ize($part);
    if (!strlen($parts[$key])) {
      $parts[$key] = null;
    }
  }

  $hashes = array(
    id(new DiffusionCommitHash())
      ->setHashType(ArcanistDifferentialRevisionHash::HASH_GIT_COMMIT)
      ->setHashValue($this->identifier),
    id(new DiffusionCommitHash())
      ->setHashType(ArcanistDifferentialRevisionHash::HASH_GIT_TREE)
      ->setHashValue($parts[4]),
  );

  $author_epoch = (int)$parts[5];
  if (!$author_epoch) {
    $author_epoch = null;
  }

  if ($split_body) {
    // Here, the body is: "subject", "\0", "wrapped body". Stitch the
    // pieces back together by putting a newline between them if both
    // parts are nonempty.
    $head = $parts[6];
    $tail = $parts[7];

    // Empty fields were normalized to null above, so a null test is
    // equivalent to a length test here and avoids calling strlen() on null
    // (deprecated as of PHP 8.1).
    $has_head = ($head !== null);
    $has_tail = ($tail !== null);

    if ($has_head && $has_tail) {
      $body = $head."\n\n".$tail;
    } else if ($has_head) {
      $body = $head;
    } else if ($has_tail) {
      $body = $tail;
    } else {
      $body = '';
    }
  } else {
    // Here, the body is the raw unwrapped body. It is null when the message
    // is empty, because empty fields were normalized to null above.
    $body = $parts[6];
  }

  return id(new DiffusionCommitRef())
    ->setCommitterName($parts[0])
    ->setCommitterEmail($parts[1])
    ->setAuthorName($parts[2])
    ->setAuthorEmail($parts[3])
    ->setHashes($hashes)
    ->setAuthorEpoch($author_epoch)
    ->setMessage($body);
}
/**
 * Build a commit ref for a Mercurial commit from "hg log" output.
 *
 * @return DiffusionCommitRef Ref populated with author name, author email,
 *   message and the commit hash.
 */
private function loadMercurialCommitRef() {
  // The template prints the author, a newline, and then the description.
  list($output) = $this->getRepository()->execxLocalCommand(
    'log --template %s --rev %s',
    '{author}\\n{desc}',
    hgsprintf('%s', $this->identifier));

  // Split on the first newline only: the description may itself contain
  // newlines.
  list($raw_author, $raw_message) = explode("\n", $output, 2);

  $raw_author = phutil_utf8ize($raw_author);
  $raw_message = phutil_utf8ize($raw_message);

  list($name, $email) = $this->splitUserIdentifier($raw_author);

  $commit_hash = id(new DiffusionCommitHash())
    ->setHashType(ArcanistDifferentialRevisionHash::HASH_MERCURIAL_COMMIT)
    ->setHashValue($this->identifier);

  $ref = new DiffusionCommitRef();

  return $ref
    ->setAuthorName($name)
    ->setAuthorEmail($email)
    ->setMessage($raw_message)
    ->setHashes(array($commit_hash));
}
/**
 * Build a commit ref for a Subversion commit from "svn log --xml" output.
 *
 * @return DiffusionCommitRef Ref populated with author name, author email
 *   and message; the hash list is always empty.
 */
private function loadSubversionCommitRef() {
  $repository = $this->getRepository();

  $uri = $repository->getSubversionPathURI(null, $this->identifier);
  list($raw_xml) = $repository->execxRemoteCommand(
    'log --xml --limit 1 %s',
    $uri);

  // Subversion may return commit messages containing bytes which are not
  // valid UTF-8 and would not parse, so coerce the document to valid UTF-8
  // before handing it to the XML parser.
  $document = new SimpleXMLElement(phutil_utf8ize($raw_xml));

  // Only one entry was requested ("--limit 1"), so read the first one.
  $log_entry = $document->logentry[0];
  $raw_author = (string)$log_entry->author;
  $raw_message = (string)$log_entry->msg;

  list($name, $email) = $this->splitUserIdentifier($raw_author);

  $ref = new DiffusionCommitRef();

  // Subversion commits have no hashes, so the list is left empty.
  return $ref
    ->setAuthorName($name)
    ->setAuthorEmail($email)
    ->setMessage($raw_message)
    ->setHashes(array());
}
/**
 * Split a raw VCS user string into a name and an email address.
 *
 * When the parsed value has neither a display name nor a domain name, the
 * parsed address is used as the name and the email is null.
 *
 * @param string $user Raw user string reported by the VCS.
 * @return array Two-element list of name and email; the email may be null.
 */
private function splitUserIdentifier($user) {
  $address = new PhutilEmailAddress($user);

  $has_email_parts =
    $address->getDisplayName() || $address->getDomainName();

  if (!$has_email_parts) {
    // Nothing looks like an email address: use the value as a bare name.
    return array($address->getAddress(), null);
  }

  return array($address->getDisplayName(), $address->getAddress());
}
}