Commit 2071a7e2 authored by IKEDA Soji's avatar IKEDA Soji
Browse files

Deprecate filesystem_encoding parameter which has already been fixed to utf-8.

Introduced Sympa::Tools::Text::slurp() to slurp a text file while applying Unicode sanitization and normalization. Note: Although Unicode::Normalize and Unicode::UTF8 are optional for now, they will become mandatory in the future.
parent 5cea343a
......@@ -179,9 +179,11 @@ recommends 'Net::SMTP';
# Normalizes file names represented by Unicode
# Note: Perl 5.8.1 bundles version 0.23.
# Note: Perl 5.10.1 bundles this version (per Unicode 5.1.0).
# Note: Perl 5.10.1 bundles 1.03 (per Unicode 5.1.0).
recommends 'Unicode::Normalize', '>= 1.03';
recommends 'Unicode::UTF8', '>= 0.58';
### Features
##
......@@ -296,10 +298,11 @@ feature 'soap', 'Required if you want to run the Sympa SOAP server that provides
requires 'SOAP::Lite', '>= 0.712';
};
feature 'Unicode::Normalize', 'Normalizes file names represented by Unicode.' => sub {
feature 'safe-unicode', 'Sanitises inputs with Unicode text.' => sub {
# Note: Perl 5.8.1 bundles version 0.23.
# Note: Perl 5.10.1 bundles this version (per Unicode 5.1.0).
# Note: Perl 5.10.1 bundles 1.03 (per Unicode 5.1.0).
requires 'Unicode::Normalize', '>= 1.03';
requires 'Unicode::UTF8', '>= 0.58';
};
on 'test' => sub {
......
......@@ -4502,11 +4502,12 @@ sub do_info {
## Get List Description
if (-r $list->{'dir'} . '/homepage') {
my $file_path = $list->{'dir'} . '/homepage';
unless (open FILE, "<", $file_path) {
$param->{'homepage_content'} = Sympa::Tools::Text::slurp($file_path);
unless (defined $param->{'homepage_content'}) {
wwslog('err', 'Failed to open file %s: %m', $file_path);
Sympa::WWW::Report::reject_report_web('intern',
'cannot_open_file', {'file' => $file_path},
$param->{'action'}, $list, $param->{'user'}{'email'}, $robot);
wwslog('err', 'Failed to open file %s: %s', $file_path, $ERRNO);
web_db_log(
{ 'parameters' => $file_path,
'status' => 'error',
......@@ -4515,21 +4516,17 @@ sub do_info {
);
return undef;
}
while (<FILE>) {
Encode::from_to($_, $Conf::Conf{'filesystem_encoding'}, 'utf8');
$param->{'homepage_content'} .= $_;
}
close FILE;
 
## Used by previous templates
$param->{'homepage'} = 1;
} elsif (-r $list->{'dir'} . '/info') {
my $file_path = $list->{'dir'} . '/info';
unless (open FILE, "<", $file_path) {
$param->{'info_content'} = Sympa::Tools::Text::slurp($file_path);
unless (defined $param->{'info_content'}) {
wwslog('err', 'Failed to open file %s: %m', $file_path);
Sympa::WWW::Report::reject_report_web('intern',
'cannot_open_file', {'file' => $file_path},
$param->{'action'}, $list, $param->{'user'}{'email'}, $robot);
wwslog('err', 'Failed to open file %s: %s', $file_path, $ERRNO);
web_db_log(
{ 'parameters' => $file_path,
'status' => 'error',
......@@ -4538,11 +4535,7 @@ sub do_info {
);
return undef;
}
while (<FILE>) {
Encode::from_to($_, $Conf::Conf{'filesystem_encoding'}, 'utf8');
$param->{'info_content'} .= $_;
}
close FILE;
#FIXME: needed?
$param->{'info_content'} =~ s/\n/\<br\/\>/g;
}
 
......@@ -8288,19 +8281,16 @@ sub do_editfile {
if (defined $param->{'filepath'}) {
## open file and provide filecontent to the parser
## It allows us to use the correct file encoding
my $fh;
unless (open $fh, '<', $param->{'filepath'}) {
my $file_path = $param->{'filepath'};
$param->{'filecontent'} = Sympa::Tools::Text::slurp($file_path);
unless (defined $param->{'filecontent'}) {
wwslog('err', 'Failed to open file %s: %m', $file_path);
Sympa::WWW::Report::reject_report_web(
'intern',
'cannot_open_file',
{'file' => $param->{'filepath'}},
$param->{'action'},
$list,
$param->{'user'}{'email'},
'intern', 'cannot_open_file',
{'file' => $file_path}, $param->{'action'},
$list, $param->{'user'}{'email'},
$robot
);
wwslog('err', 'Failed to open file %s: %s',
$param->{'filepath'}, $ERRNO);
web_db_log(
{ 'parameters' => $in{'file'},
'status' => 'error',
......@@ -8309,12 +8299,6 @@ sub do_editfile {
);
return undef;
}
my $file_content = do { local $RS; <$fh> };
close $fh;
Encode::from_to($file_content, $Conf::Conf{'filesystem_encoding'},
'utf8');
$param->{'filecontent'} = $file_content;
} else {
$param->{'filepath'} = $list->{'dir'} . '/' . $subdir . $file;
}
......@@ -8525,7 +8509,8 @@ sub do_savefile {
}
 
## Save new file
unless (open FILE, ">", $param->{'filepath'}) {
my $ofh;
unless (open $ofh, '>', $param->{'filepath'}) {
Sympa::WWW::Report::reject_report_web(
'intern', 'cannot_open_file',
{'file' => $param->{'filepath'}}, $param->{'action'},
......@@ -8542,10 +8527,8 @@ sub do_savefile {
);
return undef;
}
my $e = $in{'content'};
Encode::from_to($e, 'utf8', $Conf::Conf{'filesystem_encoding'});
print FILE $e;
close FILE;
print $ofh Sympa::Tools::Text::canonic_text($in{'content'});
close $ofh;
} elsif (-f $param->{'filepath'}) {
wwslog('info', 'Deleting %s', $param->{'filepath'});
unlink $param->{'filepath'};
......
......@@ -112,11 +112,6 @@ my %old_params = (
'dkim_header_list' => '',
);
## These parameters now have a hard-coded value
## Customized value can be accessed though as %Ignored_Conf
my %Ignored_Conf;
my %hardcoded_params = (filesystem_encoding => 'utf8');
my %trusted_applications = (
'trusted_application' => {
'occurrence' => '0-n',
......@@ -211,11 +206,6 @@ sub load {
}
);
# Some parameter values are hardcoded. In that case, ignore what was
# set in the config file and simply use the hardcoded value.
%Ignored_Conf =
%{_set_hardcoded_parameter_values({'config_hash' => \%Conf,})};
_set_listmasters_entry({'config_hash' => \%Conf, 'main_config' => 1});
## Some parameters must have a value specifically defined in the
......@@ -2057,18 +2047,8 @@ sub _load_robot_secondary_config_files {
## Forces every parameter listed in %hardcoded_params to its hard-coded
## value inside the given configuration hash, regardless of what the
## configuration file defined.
##
## Parameter: a hashref whose 'config_hash' entry is the configuration hash
## to be updated in place.
## Returns: a ref to a hash holding the defined values that were overridden,
## keyed by parameter name.
sub _set_hardcoded_parameter_values {
    my ($param) = @_;

    my %overridden;
    for my $name (keys %hardcoded_params) {
        # Keep the customized value (if any) before it gets replaced.
        my $current = $param->{'config_hash'}{$name};
        $overridden{$name} = $current if defined $current;

        $param->{'config_hash'}{$name} = $hardcoded_params{$name};
    }

    return \%overridden;
}
# Deprecated.
#sub _set_hardcoded_parameter_values;
sub _detect_missing_mandatory_parameters {
my $param = shift;
......
......@@ -204,13 +204,7 @@ sub _twist {
unless (open $fh, '>', "$list_dir/info") {
$log->syslog('err', 'Impossible to create %s/info: %m', $list_dir);
} elsif (defined $param->{'description'}) {
# remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL, and
# EIMS:
$param->{'description'} =~ s/\r\n|\r/\n/g;
Encode::from_to($param->{'description'},
'utf8', $Conf::Conf{'filesystem_encoding'});
print $fh $param->{'description'};
print $fh Sympa::Tools::Text::canonic_text($param->{'description'});
}
close $fh;
......
......@@ -207,8 +207,8 @@ sub load_topics {
return;
}
my $fh;
unless (open $fh, '<', $conf_file) {
my $config_content = Sympa::Tools::Text::slurp($conf_file);
unless (defined $config_content) {
$log->syslog('err', 'Unable to open config file %s', $conf_file);
return;
}
......@@ -216,9 +216,7 @@ sub load_topics {
## Rough parsing
my $index = 0;
my (@rough_data, $topic);
while (my $line = <$fh>) {
Encode::from_to($line, $Conf::Conf{'filesystem_encoding'},
'utf8');
foreach my $line (split /(?<=\n)(?=\n|.)/, $config_content) {
if ($line =~ /\A(others|topicsless)\s*\z/i) {
# "others" and "topicsless" are reserved words. Ignore.
next;
......@@ -239,7 +237,6 @@ sub load_topics {
$topic = {};
}
}
close $fh;
## Last topic
if (defined $topic->{'name'}) {
......
......@@ -8,8 +8,8 @@
# Copyright (c) 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
# 2006, 2007, 2008, 2009, 2010, 2011 Comite Reseau des Universites
# Copyright (c) 2011, 2012, 2013, 2014, 2015, 2016, 2017 GIP RENATER
# Copyright 2018 The Sympa Community. See the AUTHORS.md file at the
# top-level directory of this distribution and at
# Copyright 2018, 2019 The Sympa Community. See the AUTHORS.md file at
# the top-level directory of this distribution and at
# <https://github.com/sympa-community/sympa.git>.
#
# This program is free software; you can redistribute it and/or modify
......@@ -40,6 +40,7 @@ use URI::Escape qw();
use if ($] < 5.016), qw(Unicode::CaseFold fc);
use if (5.016 <= $]), qw(feature fc);
BEGIN { eval 'use Unicode::Normalize qw()'; }
BEGIN { eval 'use Unicode::UTF8 qw()'; }
use Sympa::Language;
use Sympa::Regexps;
......@@ -115,6 +116,51 @@ sub canonic_message_id {
return $msg_id;
}
# Canonicalizes a text.
#
# Parameter:
#   $text - either a binary string encoded by UTF-8 or a Unicode string.
# Returns:
#   The canonicalized text in the same form as the input (binary string in,
#   binary string out; Unicode string in, Unicode string out), or undef if
#   $text is not defined.
#
# See also the discussion on
# https://listes.renater.fr/sympa/arc/sympa-developpers/2018-03/thrd1.html
#
# N.B.: Unicode::UTF8 and Unicode::Normalize are optional by now, and should
# be mandatory in the future.
sub canonic_text {
    my ($text) = @_;
    return undef unless defined $text;

    # Remember the form of the input so the result can be given back alike.
    my $is_unicode = Encode::is_utf8($text);

    # Obtain a Unicode string; a binary string has to be decoded first.
    my $chars = $text;
    if (not $is_unicode) {
        if ($Unicode::UTF8::VERSION) {
            no warnings 'utf8';
            $chars = Unicode::UTF8::decode_utf8($text);
        } else {
            $chars = Encode::decode_utf8($text);
        }
    }

    # Apply Normalization Form C when the module is available.
    $chars = Unicode::Normalize::normalize('NFC', $chars)
        if $Unicode::Normalize::VERSION;

    # Remove DOS linefeeds (^M) that cause problems with Outlook 98, AOL,
    # and EIMS:
    $chars =~ s/\r\n|\r/\n/g;

    return $is_unicode ? $chars : Encode::encode_utf8($chars);
}
# Gets the entire content of a text file.
#
# Parameter:
#   $path - path to the file.
# Returns:
#   The content of the file after processing by canonic_text(), or undef if
#   the file could not be opened.
sub slurp {
    my ($path) = @_;

    open my $ifh, '<', $path or return undef;

    # Undefine the record separator locally to read the file in one go.
    my $content;
    {
        local $RS = undef;
        $content = <$ifh>;
    }
    close $ifh;

    return canonic_text($content);
}
sub wrap_text {
my $text = shift;
my $init = shift;
......@@ -282,13 +328,20 @@ sub guessed_to_utf8 {
and $text =~ /[^\x00-\x7F]/;
my $utf8;
foreach
my $charset ('utf-8', map { $_ ? @$_ : () } @legacy_charsets{@langs})
{
$utf8 = eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
last if defined $utf8;
if ($Unicode::UTF8::VERSION) {
$utf8 =
eval { Unicode::UTF8::decode_utf8($text, Encode::FB_CROAK()) };
}
unless (defined $utf8) {
foreach my $charset (map { $_ ? @$_ : () } @legacy_charsets{@langs}) {
$utf8 =
eval { Encode::decode($charset, $text, Encode::FB_CROAK()) };
last if defined $utf8;
}
}
unless (defined $utf8) {
$utf8 = Encode::decode('iso-8859-1', $text);
}
$utf8 = Encode::decode('iso-8859-1', $text) unless defined $utf8;
# Apply NFC: e.g. for modified-NFD by Mac OS X.
$utf8 = Unicode::Normalize::normalize('NFC', $utf8)
......@@ -533,6 +586,14 @@ For malformed inputs returns C<undef>.
Returns canonical form of message ID without trailing or leading whitespaces
or C<E<lt>>, C<E<gt>>.
=item canonic_text ( $text )
Canonicalizes text.
C<$text> should be either a binary string encoded in UTF-8 or
a Unicode string.
Malformed sequences in a binary string will be replaced by
U+FFFD REPLACEMENT CHARACTERs, Normalization Form C (NFC) will be applied (if Unicode::Normalize is available), and newlines (CR LF or CR) will be converted to LF.
=item decode_filesystem_safe ( $str )
I<Function>.
......@@ -778,6 +839,12 @@ ToDo:
This should be obsoleted in the future release: Would be better to use
L</encode_filesystem_safe>.
=item slurp ( $file )
Gets the entire content of the file.
Normalization by canonic_text() is applied.
C<$file> is the path to a text file. Returns C<undef> if the file cannot be opened.
=item unescape_chars ( $str )
Unescape weird characters.
......@@ -870,4 +937,6 @@ were added on Sympa 6.2.14, and escape_url() was deprecated.
guessed_to_utf8() and pad() were added on Sympa 6.2.17.
canonic_text() and slurp() were added on Sympa 6.2.53b.
=cut
......@@ -2095,6 +2095,15 @@ sub to_utf8 {
message_report.tt2 | moderate.tt2 | modindex.tt2 | send_auth.tt2 }x;
my $total;
# Get an obsoleted parameter filesystem_encoding.
my $filesystem_encoding;
open my $fh, '<', Conf::get_sympa_conf();
while (my $line = <$fh>) {
$filesystem_encoding = $1
if $line =~ /\A\s*(?:web_recode_to|filesystem_encoding)\s+(\S+)/i;
}
close $fh;
foreach my $pair (@{$files}) {
my ($file, $lang) = @$pair;
unless (open(TEMPLATE, $file)) {
......@@ -2105,12 +2114,11 @@ sub to_utf8 {
my $text = '';
my $modified = 0;
## If filesystem_encoding is set, files are supposed to be encoded
## according to it
# If an obsoleted parameter filesystem_encoding was set, files are
# supposed to be encoded according to it.
my $charset;
if (defined $Conf::Ignored_Conf{'filesystem_encoding'}
and $Conf::Ignored_Conf{'filesystem_encoding'} ne 'utf-8') {
$charset = $Conf::Ignored_Conf{'filesystem_encoding'};
if ($filesystem_encoding) {
$charset = $filesystem_encoding;
} else {
$language->push_lang($lang);
$charset = Conf::lang2charset($language->get_lang);
......
......@@ -8,6 +8,7 @@ use English qw(-no_match_vars);
use Test::More;
my $test_pod_ok;
BEGIN {
# Test::Pod 1.40 mistakenly complains about the construct L<text|url>.
eval 'use Test::Pod 1.41';
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment