177 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Raku
		
	
	
		
			Executable File
		
	
	
	
	
			
		
		
	
	
			177 lines
		
	
	
		
			4.7 KiB
		
	
	
	
		
			Raku
		
	
	
		
			Executable File
		
	
	
	
	
#!/usr/bin/perl
 | 
						|
#
 | 
						|
# Produce a codepage matching table.  For each 8-bit character, list
 | 
						|
# a primary and an alternate match (the latter used for case-insensitive
 | 
						|
# matching.)
 | 
						|
#
 | 
						|
# Usage:
 | 
						|
#	cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp
 | 
						|
#
 | 
						|
# Note: for the format of the UnicodeData file, see:
 | 
						|
# http://www.unicode.org/Public/UNIDATA/UCD.html
 | 
						|
#
 | 
						|
 | 
						|
($ucd, $cpco, $cpfs, $cpout) = @ARGV;
 | 
						|
 | 
						|
if (!defined($cpout)) {
 | 
						|
    die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n";
 | 
						|
}
 | 
						|
 | 
						|
%ucase   = ();
 | 
						|
%lcase   = ();
 | 
						|
%tcase   = ();
 | 
						|
%decomp  = ();
 | 
						|
 | 
						|
open(UCD, '<', $ucd)
 | 
						|
    or die "$0: could not open unicode data: $ucd: $!\n";
 | 
						|
while (defined($line = <UCD>)) {
 | 
						|
    chomp $line;
 | 
						|
    @f = split(/;/, $line);
 | 
						|
    $n = hex $f[0];
 | 
						|
    $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n;
 | 
						|
    $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n;
 | 
						|
    $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n;
 | 
						|
    if ($f[5] =~ /^[0-9A-F\s]+$/) {
 | 
						|
	# This character has a canonical decomposition.
 | 
						|
	# The regular expression rejects angle brackets, so other
 | 
						|
	# decompositions aren't permitted.
 | 
						|
	$decomp{$n} = [];
 | 
						|
	foreach my $dch (split(' ', $f[5])) {
 | 
						|
	    push(@{$decomp{$n}}, hex $dch);
 | 
						|
	}
 | 
						|
    }
 | 
						|
}
 | 
						|
close(UCD);
 | 
						|
 | 
						|
#
 | 
						|
# Filesystem and console codepages.  The filesystem codepage is used
 | 
						|
# for FAT shortnames, whereas the console codepage is whatever is used
 | 
						|
# on the screen and keyboard.
 | 
						|
#
 | 
						|
@xtab = (undef) x 256;
 | 
						|
%tabx = ();
 | 
						|
open(CPFS, '<', $cpfs)
 | 
						|
    or die "$0: could not open fs codepage: $cpfs: $!\n";
 | 
						|
while (defined($line = <CPFS>)) {
 | 
						|
    $line =~ s/\s*(\#.*|)$//;
 | 
						|
    @f = split(/\s+/, $line);
 | 
						|
    next if (scalar @f != 2);
 | 
						|
    next if (hex $f[0] > 255);
 | 
						|
    $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
 | 
						|
    $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
 | 
						|
}
 | 
						|
close(CPFS);
 | 
						|
 | 
						|
@ytab = (undef) x 256;
 | 
						|
%taby = ();
 | 
						|
open(CPCO, '<', $cpco)
 | 
						|
    or die "$0: could not open console codepage: $cpco: $!\n";
 | 
						|
while (defined($line = <CPCO>)) {
 | 
						|
    $line =~ s/\s*(\#.*|)$//;
 | 
						|
    @f = split(/\s+/, $line);
 | 
						|
    next if (scalar @f != 2);
 | 
						|
    next if (hex $f[0] > 255);
 | 
						|
    $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode
 | 
						|
    $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage
 | 
						|
}
 | 
						|
close(CPCO);
 | 
						|
 | 
						|
open(CPOUT, '>', $cpout)
 | 
						|
    or die "$0: could not open output file: $cpout: $!\n";
 | 
						|
#
 | 
						|
# Magic number, in anticipation of being able to load these
 | 
						|
# files dynamically...
 | 
						|
#
 | 
						|
print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1);
 | 
						|
 | 
						|
# Header fields available for future use...
 | 
						|
print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0);
 | 
						|
 | 
						|
#
 | 
						|
# Self (shortname) uppercase table.
 | 
						|
# This depends both on the console codepage and the filesystem codepage;
 | 
						|
# the logical transcoding operation is:
 | 
						|
#
 | 
						|
# $tabx{$ucase{$ytab[$i]}}
 | 
						|
#
 | 
						|
# ... where @ytab is console codepage -> Unicode and
 | 
						|
# %tabx is Unicode -> filesystem codepage.
 | 
						|
#
 | 
						|
@uctab = (undef) x 256;
 | 
						|
for ($i = 0; $i < 256; $i++) {
 | 
						|
    $uuc = $ucase{$ytab[$i]};	# Unicode upper case
 | 
						|
    if (defined($tabx{$uuc})) {
 | 
						|
	# Straight-forward conversion
 | 
						|
	$u = $tabx{$uuc};
 | 
						|
    } elsif (defined($tabx{${$decomp{$uuc}}[0]})) {
 | 
						|
	# Upper case equivalent stripped of accents
 | 
						|
	$u = $tabx{${$decomp{$uuc}}[0]};
 | 
						|
    } else {
 | 
						|
	# No equivalent at all found.  Assume it is a lower-case-only
 | 
						|
	# character, like greek alpha in CP437.
 | 
						|
	$u = $i;
 | 
						|
    }
 | 
						|
    $uctab[$i] = $u;
 | 
						|
    print CPOUT pack("C", $u);
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# Self (shortname) lowercase table.
 | 
						|
# This depends both on the console codepage and the filesystem codepage;
 | 
						|
# the logical transcoding operation is:
 | 
						|
#
 | 
						|
# $taby{$lcase{$xtab[$i]}}
 | 
						|
#
 | 
						|
# ... where @ytab is console codepage -> Unicode and
 | 
						|
# %tabx is Unicode -> filesystem codepage.
 | 
						|
#
 | 
						|
@lctab = (undef) x 256;
 | 
						|
for ($i = 0; $i < 256; $i++) {
 | 
						|
    $llc = $lcase{$xtab[$i]};	# Unicode lower case
 | 
						|
    if (defined($l = $taby{$llc}) && $uctab[$l] == $i) {
 | 
						|
	# Straight-forward conversion
 | 
						|
    } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) {
 | 
						|
	# Lower case equivalent stripped of accents
 | 
						|
    } else {
 | 
						|
	# No equivalent at all found.  Find *anything* that matches the
 | 
						|
	# bijection criterion...
 | 
						|
	for ($l = 0; $l < 256; $l++) {
 | 
						|
	    last if ($uctab[$l] == $i);
 | 
						|
	}
 | 
						|
	$l = $i if ($l == 256);	# If nothing, we're screwed anyway...
 | 
						|
    }
 | 
						|
    $lctab[$i] = $l;
 | 
						|
    print CPOUT pack("C", $l);
 | 
						|
}
 | 
						|
 | 
						|
#
 | 
						|
# Unicode (longname) matching table.
 | 
						|
# This only depends on the console codepage.
 | 
						|
#
 | 
						|
$pp0 = '';  $pp1 = '';
 | 
						|
for ($i = 0; $i < 256; $i++) {
 | 
						|
    if (!defined($ytab[$i])) {
 | 
						|
	$p0 = $p1 = 0xffff;
 | 
						|
    } else {
 | 
						|
	$p0 = $ytab[$i];
 | 
						|
	if ($ucase{$p0} != $p0) {
 | 
						|
	    $p1 = $ucase{$p0};
 | 
						|
	} elsif ($lcase{$p0} != $p0) {
 | 
						|
	    $p1 = $lcase{$p0};
 | 
						|
	} elsif ($tcase{$p0} != $p0) {
 | 
						|
	    $p1 = $tcase{$p0};
 | 
						|
	} else {
 | 
						|
	    $p1 = $p0;
 | 
						|
	}
 | 
						|
    }
 | 
						|
    # Only the BMP is supported...
 | 
						|
    $p0 = 0xffff if ($p0 > 0xffff);
 | 
						|
    $p1 = 0xffff if ($p1 > 0xffff);
 | 
						|
    $pp0 .= pack("v", $p0);
 | 
						|
    $pp1 .= pack("v", $p1);
 | 
						|
}
 | 
						|
print CPOUT $pp0, $pp1;
 | 
						|
close (CPOUT);
 | 
						|
 | 
						|
    
 |