Skip to content

Commit

Permalink
Convert all strings to the character encoding [WIP]
Browse files Browse the repository at this point in the history
  • Loading branch information
mquinson committed Jan 3, 2024
1 parent a25c853 commit e048fd3
Show file tree
Hide file tree
Showing 7 changed files with 109 additions and 356 deletions.
161 changes: 52 additions & 109 deletions lib/Locale/Po4a/Po.pm
Original file line number Diff line number Diff line change
Expand Up @@ -225,7 +225,6 @@ sub initialize {
$self->{options}{'package-name'} = "PACKAGE";
$self->{options}{'package-version'} = "VERSION";
$self->{options}{'wrap-po'} = 76;
$self->{options}{'pot-charset'} = "UTF-8";
$self->{options}{'pot-language'} = "";

foreach my $opt ( keys %$options ) {
Expand Down Expand Up @@ -296,13 +295,11 @@ sub initialize {
. "Language: "
. $self->{options}{'pot-language'} . "\n"
. "MIME-Version: 1.0\n"
. "Content-Type: text/plain; charset="
. $self->{options}{'pot-charset'} . "\n"
. "Content-Type: text/plain; charset=UTF-8\n"
. "Content-Transfer-Encoding: 8bit\n"
);

$self->{encoder} = find_encoding("UTF-8");
$self->{footer} = [];
$self->{footer} = [];

# To make stats about gettext hits
$self->stats_clear();
Expand All @@ -320,6 +317,8 @@ sub read {
my $self = shift;
my $filename = shift
or croak wrap_mod( "po4a::po", dgettext( "po4a", "Please provide a non-null filename" ) );
my $charset = shift // 'UTF-8';
warn "Read $filename with encoding: $charset" if $debug{'encoding'};

my $lang = basename($filename);
$lang =~ s/\.po$//;
Expand All @@ -336,23 +335,32 @@ sub read {
unless ( $? == 0 );

my $fh;
my $close_fh = 1;
if ( $filename eq '-' ) {
$fh = *STDIN;
$close_fh = 0;
$fh = *STDIN;
} else {
open $fh, "<$filename"
open( $fh, "<:raw:encoding($charset)", $filename )
or croak wrap_mod( "po4a::po", dgettext( "po4a", "Cannot read from %s: %s" ), $filename, $! );
}

## Read paragraphs line-by-line
my $pofile = "";
my $textline;
while ( defined( $textline = <$fh> ) ) {
while ( defined( my $textline = <$fh> ) ) {
$pofile .= $textline;
}

if ($close_fh) {
# If we did not get the charset right, reload the file with the right one
if ( $pofile =~ /charset=(.*?)[\s\\]/ ) {
my $detected_charset = $1;

if ( $detected_charset ne $charset || uc($detected_charset) ne $charset ) {
warn "Reloading the PO file, changing the charset from '$charset' to '$detected_charset'"
if $debug{'encoding'};
$self->read( $filename, $detected_charset );
return;
}
}

if ( $filename ne '-' ) {
close $fh
or croak wrap_mod( "po4a::po", dgettext( "po4a", "Cannot close %s after reading: %s" ), $filename, $! );
}
Expand Down Expand Up @@ -508,15 +516,26 @@ sub write {
File::Path::mkpath( $dir, 0, 0755 ) # Croaks on error
if ( length($dir) && !-e $dir );
}
open $fh, ">$filename"
open( $fh, '>:raw:encoding(UTF-8)', $filename )
or croak wrap_mod( "po4a::po", dgettext( "po4a", "Cannot write to %s: %s" ), $filename, $! );
}

print $fh "" . format_comment( $self->{header_comment}, "" )
if length( $self->{header_comment} );

# Force the encoding of PO files in UTF-8 on disk, because msgmerge can get messed up when mixing encodings
# See https://savannah.gnu.org/bugs/index.php?65104
my $header = $self->{header};
$header =~ /charset=([^\s\\]*)/i;
my $oldcharset = $1 // '';
warn sprintf(
dgettext( "po4a", "Force the encoding of %s to UTF-8 (was %s), as mixing encodings may break msgmerge.\n" ),
$filename, $oldcharset )
if $oldcharset ne 'UTF-8';
$header =~ s/charset=[^\s\\]*/charset=UTF-8/i;

print $fh "msgid \"\"\n";
print $fh "msgstr " . quote_text( $self->{header}, $self->{options}{'wrap-po'} ) . "\n\n";
print $fh "msgstr " . quote_text( $header, $self->{options}{'wrap-po'} ) . "\n\n";

my $buf_msgstr_plural; # Used to keep the first msgstr of plural forms
my $first = 1;
Expand Down Expand Up @@ -551,55 +570,37 @@ sub write {

if ( exists $self->{po}{$msgid}{'plural'} ) {
if ( $self->{po}{$msgid}{'plural'} == 0 ) {
if ( $self->get_charset =~ /^utf-8$/i ) {
my $msgstr = Encode::decode_utf8( $self->{po}{$msgid}{'msgstr'} );
$msgid = Encode::decode_utf8($msgid);
$output .=
Encode::encode_utf8( "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n" );
$buf_msgstr_plural =
Encode::encode_utf8( "msgstr[0] " . quote_text( $msgstr, $self->{options}{'wrap-po'} ) . "\n" );
} else {
$output = "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
$buf_msgstr_plural =
"msgstr[0] " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
}
$output = "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
$buf_msgstr_plural =
"msgstr[0] " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
} elsif ( $self->{po}{$msgid}{'plural'} == 1 ) {

# TODO: there may be only one plural form
if ( $self->get_charset =~ /^utf-8$/i ) {
my $msgstr = Encode::decode_utf8( $self->{po}{$msgid}{'msgstr'} );
$msgid = Encode::decode_utf8($msgid);
$output =
Encode::encode_utf8( "msgid_plural " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n" );
$output .= $buf_msgstr_plural;
$output .=
Encode::encode_utf8( "msgstr[1] " . quote_text( $msgstr, $self->{options}{'wrap-po'} ) . "\n" );
$buf_msgstr_plural = "";
} else {
$output = "msgid_plural " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
$output .= $buf_msgstr_plural;
$output .=
"msgstr[1] " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
}
$output = "msgid_plural " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
$output .= $buf_msgstr_plural;
$output .=
"msgstr[1] " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
} else {
die wrap_msg( dgettext( "po4a", "Cannot write PO files with more than two plural forms." ) );
}
} else {
if ( $self->get_charset =~ /^utf-8$/i ) {
my $msgstr = Encode::decode_utf8( $self->{po}{$msgid}{'msgstr'} );
$msgid = Encode::decode_utf8($msgid);
$output .= Encode::encode_utf8( "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n" );
$output .= Encode::encode_utf8( "msgstr " . quote_text( $msgstr, $self->{options}{'wrap-po'} ) . "\n" );
} else {
$output .= "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
$output .= "msgstr " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
}
print STDERR "STDERRR:: msgid is ".(Encode::is_utf8($msgid, Encode::FB_CROAK)?"UTF":"raw")."\n";
print STDERR "STDERRR:: msgstr is ".(Encode::is_utf8($self->{po}{$msgid}{'msgstr'}, Encode::FB_CROAK)?"UTF":"raw")."\n";
print STDERR "STDERRR:: output is ".(Encode::is_utf8($output, Encode::FB_CROAK)?"UTF":"raw")."\n";

$output .= "msgid " . quote_text( $msgid, $self->{options}{'wrap-po'} ) . "\n";
$output .= "msgstr " . quote_text( $self->{po}{$msgid}{'msgstr'}, $self->{options}{'wrap-po'} ) . "\n";
}

print STDERR "STDERRR:: ".(Encode::is_utf8($output)?"UTF":"raw")." $output\n";
print $fh $output;
}
print $fh join( "\n\n", @{ $self->{footer} } ) if scalar @{ $self->{footer} };

if ($filename ne '-') {
print STDERR "STDERR: CONTENT OF $filename\n\n";
system("cat $filename");
}
# print STDERR "$fh";
# if ($filename ne '-') {
# close $fh
Expand Down Expand Up @@ -895,29 +896,6 @@ sub filter {
return $res;
}

=item to_utf8()
Recodes to UTF-8 the PO's msgstrs. Does nothing if the charset is not
specified in the PO file ("CHARSET" value), or if it's already UTF-8 or
ASCII.
=cut

sub to_utf8 {
my $this = shift;
my $charset = $this->get_charset();

unless ( $charset eq "CHARSET"
or $charset =~ /^ascii$/i
or $charset =~ /^utf-8$/i )
{
foreach my $msgid ( keys %{ $this->{po} } ) {
Encode::from_to( $this->{po}{$msgid}{'msgstr'}, $charset, "utf-8" );
}
$this->set_charset("UTF-8");
}
}

=back
=head1 Functions to use a message catalog for translations
Expand Down Expand Up @@ -1011,13 +989,7 @@ sub gettext {
}

if ( $opt{'wrap'} ) {
if ( $self->get_charset =~ /^utf-8$/i ) {
$res = Encode::decode_utf8($res);
$res = wrap( $res, $opt{'wrapcol'} || 76 );
$res = Encode::encode_utf8($res);
} else {
$res = wrap( $res, $opt{'wrapcol'} || 76 );
}
$res = wrap( $res, $opt{'wrapcol'} || 76 );
}

# print STDERR "Gettext >>>$text<<<(escaped=$esc_text)=[[[$res]]]\n\n";
Expand Down Expand Up @@ -1198,12 +1170,6 @@ sub push_raw {
# } FIXME: do that iff the header isn't the default one.
$self->{header} = $msgstr;
$self->{header_comment} = $comment;
my $charset = $self->get_charset;
if ( $charset ne "CHARSET" ) {
$self->{encoder} = find_encoding($charset);
} else {
$self->{encoder} = find_encoding("UTF-8");
}
return;
}

Expand Down Expand Up @@ -1429,29 +1395,6 @@ sub get_charset() {
}
}

=item set_charset($)
This sets the character set of the PO header to the value specified in its
first argument. If you never call this function (and no file with a specified
character set is read), the default value is left to "UTF-8". This value
doesn't change the behavior of this module, it's just used to fill that field
in the header, and to return it in get_charset().
=cut

sub set_charset() {
my $self = shift;

my ( $newchar, $oldchar );
$newchar = shift;
$oldchar = $self->get_charset();

if ( $newchar ne $oldchar ) {
$self->{header} =~ s/$oldchar/$newchar/;
$self->{encoder} = find_encoding($newchar);
}
}

#----[ helper functions ]---------------------------------------------------

# transforme the string from its PO file representation to the form which
Expand Down
14 changes: 0 additions & 14 deletions lib/Locale/Po4a/TeX.pm
Original file line number Diff line number Diff line change
Expand Up @@ -1010,20 +1010,6 @@ sub read_file {
if ( length($textline) ) {
my @entry = ( $textline, $ref );
push @entries, @entry;

# Detect if this file has non-ascii characters
if ( $self->{TT}{ascii_input} ) {

my $decoder = guess_encoding($textline);
if ( !ref($decoder) or $decoder !~ /Encode::XS=/ ) {

# We have detected a non-ascii line
$self->{TT}{ascii_input} = 0;

# Save the reference for future error message
$self->{TT}{non_ascii_ref} ||= $ref;
}
}
}
}
close $in
Expand Down
Loading

0 comments on commit e048fd3

Please sign in to comment.