From d2cff6a2cf01396f6337edfadd1f7df7cce1277d Mon Sep 17 00:00:00 2001 From: Alex Vandiver Date: Thu, 16 Nov 2017 00:43:35 -0800 Subject: [PATCH] Transcode the HTML part of incoming email into UTF-8 as well Summary: D1093 did this for just the text/plain part of incoming email. Most text/html parts either use entity encoding //or// are already UTF-8, thus obviating the need to transcode the HTML part. However, this is not always the case, and leads to dropped messages, by way of: ``` EXCEPTION: (Exception) Failed to JSON encode value (#5: Malformed UTF-8 characters, possibly incorrectly encoded): Dictionary value at key "html" is not valid UTF8, and cannot be JSON encoded: [snip HTML part of message content]``` Generalize the charset transcoding so that it applies not just to the text/plain part, but to both the text/plain and text/html parts. Test Plan: Fed in a Windows-1252-encoded text/html part with 0x92 bytes in it; verified that $content only contained valid UTF-8 after this change. Reviewers: #blessed_reviewers, epriestley Reviewed By: #blessed_reviewers, epriestley Subscribers: Korvin, epriestley Differential Revision: https://secure.phabricator.com/D18776 --- scripts/mail/mail_handler.php | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/scripts/mail/mail_handler.php b/scripts/mail/mail_handler.php index 2ff23adb0f..b76b3910df 100755 --- a/scripts/mail/mail_handler.php +++ b/scripts/mail/mail_handler.php @@ -35,16 +35,19 @@ $args->parse( $parser = new MimeMailParser(); $parser->setText(file_get_contents('php://stdin')); -$text_body = $parser->getMessageBody('text'); - -$text_body_headers = $parser->getMessageBodyHeaders('text'); -$content_type = idx($text_body_headers, 'content-type'); -if ( - !phutil_is_utf8($text_body) && - (preg_match('/charset="(.*?)"/', $content_type, $matches) || - preg_match('/charset=(\S+)/', $content_type, $matches)) -) { - $text_body = phutil_utf8_convert($text_body, 'UTF-8', $matches[1]); 
+$content = array(); +foreach (array('text', 'html') as $part) { + $part_body = $parser->getMessageBody($part); + $part_headers = $parser->getMessageBodyHeaders($part); + $content_type = idx($part_headers, 'content-type'); + if ( + !phutil_is_utf8($part_body) && + (preg_match('/charset="(.*?)"/', $content_type, $matches) || + preg_match('/charset=(\S+)/', $content_type, $matches)) + ) { + $part_body = phutil_utf8_convert($part_body, 'UTF-8', $matches[1]); + } + $content[$part] = $part_body; } $headers = $parser->getHeaders(); @@ -57,10 +60,7 @@ if ($args->getArg('process-duplicates')) { $received = new PhabricatorMetaMTAReceivedMail(); $received->setHeaders($headers); -$received->setBodies(array( - 'text' => $text_body, - 'html' => $parser->getMessageBody('html'), -)); +$received->setBodies($content); $attachments = array(); foreach ($parser->getAttachments() as $attachment) {