Commit e17bd11d authored by Byron Jones

Bug 633776: Automatic charset detection for text attachments

r=mkanat, a=mkanat
parent fd4f9fad
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
......@@ -291,6 +291,19 @@ sub OPTIONAL_MODULES {
version => 0,
feature => ['html_desc'],
},
{
# we need version 2.21 of Encode for mime_name
package => 'Encode',
module => 'Encode',
version => 2.21,
feature => ['detect_charset'],
},
{
package => 'Encode-Detect',
module => 'Encode::Detect',
version => 0,
feature => ['detect_charset'],
},
# Inbound Email
{
......
......@@ -43,7 +43,8 @@ use base qw(Exporter);
file_mod_time is_7bit_clean
bz_crypt generate_random_password
validate_email_syntax clean_text
get_text template_var disable_utf8);
get_text template_var disable_utf8
detect_encoding);
use Bugzilla::Constants;
......@@ -58,6 +59,8 @@ use Math::Random::Secure qw(irand);
use Scalar::Util qw(tainted blessed);
use Template::Filters;
use Text::Wrap;
use Encode qw(encode decode resolve_alias);
use Encode::Guess;
sub trick_taint {
require Carp;
......@@ -673,6 +676,63 @@ sub disable_utf8 {
}
}
# Encodings that Encode::Detect tends to report by mistake when the data is
# really UTF-8; results in this list are re-checked with Encode::Guess.
use constant UTF8_ACCIDENTAL => qw(shiftjis big5-eten euc-kr euc-jp);

# Guess the character encoding of a chunk of data.
#
# Params:  $data - the raw data to inspect.
# Returns: the canonical name of the detected encoding, or a false value
#          when nothing could be determined.
# Throws:  a 'feature_disabled' code error when the 'detect_charset'
#          feature is not available.
sub detect_encoding {
    my ($data) = @_;

    unless (Bugzilla->feature('detect_charset')) {
        require Bugzilla::Error;
        Bugzilla::Error::ThrowCodeError('feature_disabled',
                                        { feature => 'detect_charset' });
    }

    # Encode::Detect is optional, so it is only loaded once we know the
    # feature is enabled.
    require Encode::Detect::Detector;
    Encode::Detect::Detector->import('detect');

    my $charset = detect($data);
    $charset &&= resolve_alias($charset);

    # First correction pass: Encode::Detect is known to mis-report UTF-8
    # strings as shiftjis, big5-eten, euc-kr or euc-jp. When it names one
    # of those, discard its answer and let Encode::Guess decide instead.
    my $needs_recheck = $charset && grep { $_ eq $charset } UTF8_ACCIDENTAL;
    if ($needs_recheck) {
        my $guess = guess_encoding($data, UTF8_ACCIDENTAL);
        # guess_encoding yields an object on success and a plain string
        # (an error message) otherwise.
        $charset = ref($guess) ? $guess->name : undef;
    }

    # Second correction pass: various ISO encodings get mis-detected as
    # iso-8859-8, but Encode::Guess can usually tell which one it is.
    if ($charset && $charset eq 'iso-8859-8') {
        # The candidates are listed in this order because it gives the
        # most accurate results.
        $charset = _guess_iso($data, 'iso-8859-8',
                              qw(iso-8859-7 iso-8859-2)) || $charset;
    }

    return $charset;
}
# A helper for detect_encoding.
#
# Tries each candidate ISO encoding in turn against a reference encoding and
# reports the first pairing for which Encode::Guess gives a single answer.
#
# Params:  $data   - the raw data to inspect.
#          $versus - the encoding every candidate is compared against.
#          @isos   - candidate encodings, tried in the order given.
# Returns: the name of the encoding Encode::Guess settled on, or undef when
#          no pairing produced an unambiguous result.
sub _guess_iso {
    # Unpack the whole argument list: taking exactly three shifts here would
    # leave only the first candidate in @isos and silently drop the rest.
    my ($data, $versus, @isos) = @_;

    foreach my $iso (@isos) {
        # guess_encoding returns an encoding object when exactly one suspect
        # fits, and a plain error string otherwise.
        my $decoder = guess_encoding($data, $iso, $versus);
        return $decoder->name if ref $decoder;
    }

    return;
}
1;
__END__
......@@ -903,6 +963,12 @@ ASCII 10 (LineFeed) and ASCII 13 (Carriage Return).
Disable utf8 on STDOUT (and display raw data instead).
=item C<detect_encoding($str)>
Guesses the character encoding of the given data, returning the canonical name
of the detected encoding (which may be different from the MIME charset
specification). Returns a false value if no encoding could be determined.
=item C<clean_text($str)>
Returns the parameter "cleaned" by exchanging non-printable characters with spaces.
Specifically characters (ASCII 0 through 31) and (ASCII 127) will become ASCII 32 (Space).
......
......@@ -53,7 +53,7 @@ use Bugzilla::Attachment::PatchReader;
use Bugzilla::Token;
use Bugzilla::Keyword;
use Encode qw(encode);
use Encode qw(encode find_encoding);
# For most scripts we don't make $cgi and $template global variables. But
# when preparing Bugzilla for mod_perl, this script used these
......@@ -335,6 +335,12 @@ sub view {
# In order to prevent Apache from adding a charset, we have to send a
# charset that's a single space.
$cgi->charset(' ');
if (Bugzilla->feature('detect_charset') && $contenttype =~ /^text\//) {
my $encoding = detect_encoding($attachment->data);
if ($encoding) {
$cgi->charset(find_encoding($encoding)->mime_name);
}
}
}
print $cgi->header(-type=>"$contenttype; name=\"$filename\"",
-content_disposition=> "$disposition; filename=\"$filename\"",
......
......@@ -24,10 +24,10 @@ use lib qw(. lib);
use Bugzilla;
use Bugzilla::Constants;
use Bugzilla::Util qw(detect_encoding);
use Digest::MD5 qw(md5_base64);
use Encode qw(encode decode resolve_alias is_utf8);
use Encode::Guess;
use Getopt::Long;
use Pod::Usage;
......@@ -71,53 +71,6 @@ sub trunc {
return $truncated;
}
# Guess the character encoding of $data.
#
# Starts from the answer given by detect() and then applies two correction
# passes with Encode::Guess. Returns the encoding name, or a false value when
# nothing could be determined.
sub do_guess {
    my $data = shift;

    my $charset = detect($data);
    $charset &&= resolve_alias($charset);

    # First correction pass: Encode::Detect is known to mis-report UTF-8
    # strings as shiftjis, big5-eten, euc-kr or euc-jp. When it names one
    # of those, discard its answer and let Encode::Guess decide instead.
    my @suspects = qw(shiftjis big5-eten euc-kr euc-jp);
    my $needs_recheck = $charset && grep { $_ eq $charset } @suspects;
    if ($needs_recheck) {
        my $guess = guess_encoding($data, @suspects);
        # guess_encoding yields an object on success and a plain string
        # (an error message) otherwise.
        $charset = ref($guess) ? $guess->name : undef;
    }

    # Second correction pass: various ISO encodings get mis-detected as
    # iso-8859-8, but Encode::Guess can usually tell which one it is.
    if ($charset && $charset eq 'iso-8859-8') {
        # The candidates are listed in this order because it gives the
        # most accurate results.
        $charset = guess_iso($data, 'iso-8859-8',
                             qw(iso-8859-7 iso-8859-2)) || $charset;
    }

    return $charset;
}
# A helper for do_guess.
#
# Pairs each candidate encoding with $versus and asks Encode::Guess to pick
# between them; the first pairing with a single clear answer decides the
# result. Returns that encoding's name, or undef when no pairing is decisive.
sub guess_iso {
    my $data       = shift;
    my $versus     = shift;
    my @candidates = @_;

    my $winner;
    for my $candidate (@candidates) {
        my $guess = guess_encoding($data, $candidate, $versus);

        # A non-reference result is an error string: try the next candidate.
        next unless ref $guess;

        $winner = $guess->name;
        last;
    }

    return $winner;
}
sub is_valid_utf8 {
my ($str) = @_;
Encode::_utf8_on($str);
......@@ -143,8 +96,6 @@ if (exists $switch{'charset'}) {
}
if ($switch{'guess'}) {
# Encode::Detect::Detector doesn't seem to return a true value.
# So we have to check if we can run detect.
if (!eval { require Encode::Detect::Detector }) {
my $root = ROOT_USER;
print STDERR <<EOT;
......@@ -156,8 +107,6 @@ Encode::Detect, run the following command:
EOT
exit;
}
import Encode::Detect::Detector qw(detect);
}
my %overrides;
......@@ -255,7 +204,7 @@ foreach my $table ($dbh->bz_table_list_real) {
my $encoding;
if ($switch{'guess'}) {
$encoding = do_guess($data);
$encoding = detect_encoding($data);
# We only show failures if they don't appear to be
# ASCII.
......
File mode changed from 100644 to 100755
......@@ -108,6 +108,7 @@ END
feature_smtp_auth => 'SMTP Authentication',
feature_updates => 'Automatic Update Notifications',
feature_xmlrpc => 'XML-RPC Interface',
feature_detect_charset => 'Automatic charset detection for text attachments',
file_remove => 'Removing ##name##...',
file_rename => 'Renaming ##from## to ##to##...',
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment