#!/usr/bin/perl
#
# Generate code page .c files from ftp.unicode.org descriptions
#
# Copyright 2000 Alexandre Julliard
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#

# Locations of the input files and script-wide constants.  These are
# deliberately package globals: the subs below read them by name.

# base directory for ftp.unicode.org files
$BASEDIR = "ftp.unicode.org/Public/";
$MAPPREFIX = $BASEDIR . "MAPPINGS/";

# UnicodeData file
$UNICODEDATA = $BASEDIR . "UNIDATA/UnicodeData.txt";

# Sort keys file
$SORTKEYS = "www.unicode.org/reports/tr10/allkeys.txt";

# Defaults mapping
$DEFAULTS = "./defaults";

# Default char for undefined mappings
$DEF_CHAR = ord '?';

# List of code pages to generate.  Each entry is
#   [ code page number, mapping file name, description ]
# and is passed as-is to HANDLE_FILE by the main routine.
@allfiles =
(
    [ 37,    "VENDORS/MICSFT/EBCDIC/CP037.TXT",   "IBM EBCDIC US Canada" ],
    [ 424,   "VENDORS/MISC/CP424.TXT",            "IBM EBCDIC Hebrew" ],
    [ 437,   "VENDORS/MICSFT/PC/CP437.TXT",       "OEM United States" ],
    [ 500,   "VENDORS/MICSFT/EBCDIC/CP500.TXT",   "IBM EBCDIC International" ],
    [ 737,   "VENDORS/MICSFT/PC/CP737.TXT",       "OEM Greek 437G" ],
    [ 775,   "VENDORS/MICSFT/PC/CP775.TXT",       "OEM Baltic" ],
    [ 850,   "VENDORS/MICSFT/PC/CP850.TXT",       "OEM Multilingual Latin 1" ],
    [ 852,   "VENDORS/MICSFT/PC/CP852.TXT",       "OEM Slovak Latin 2" ],
    [ 855,   "VENDORS/MICSFT/PC/CP855.TXT",       "OEM Cyrillic" ],
    [ 856,   "VENDORS/MISC/CP856.TXT",            "Hebrew PC" ],
    [ 857,   "VENDORS/MICSFT/PC/CP857.TXT",       "OEM Turkish" ],
    [ 860,   "VENDORS/MICSFT/PC/CP860.TXT",       "OEM Portuguese" ],
    [ 861,   "VENDORS/MICSFT/PC/CP861.TXT",       "OEM Icelandic" ],
    [ 862,   "VENDORS/MICSFT/PC/CP862.TXT",       "OEM Hebrew" ],
    [ 863,   "VENDORS/MICSFT/PC/CP863.TXT",       "OEM Canadian French" ],
    [ 864,   "VENDORS/MICSFT/PC/CP864.TXT",       "OEM Arabic" ],
    [ 865,   "VENDORS/MICSFT/PC/CP865.TXT",       "OEM Nordic" ],
    [ 866,   "VENDORS/MICSFT/PC/CP866.TXT",       "OEM Russian" ],
    [ 869,   "VENDORS/MICSFT/PC/CP869.TXT",       "OEM Greek" ],
    [ 874,   "VENDORS/MICSFT/PC/CP874.TXT",       "ANSI/OEM Thai" ],
    [ 875,   "VENDORS/MICSFT/EBCDIC/CP875.TXT",   "IBM EBCDIC Greek" ],
    [ 878,   "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
    [ 932,   "VENDORS/MICSFT/WINDOWS/CP932.TXT",  "ANSI/OEM Japanese Shift-JIS" ],
    [ 936,   "VENDORS/MICSFT/WINDOWS/CP936.TXT",  "ANSI/OEM Simplified Chinese GBK" ],
    [ 949,   "VENDORS/MICSFT/WINDOWS/CP949.TXT",  "ANSI/OEM Korean Unified Hangul" ],
    [ 950,   "VENDORS/MICSFT/WINDOWS/CP950.TXT",  "ANSI/OEM Traditional Chinese Big5" ],
    [ 1006,  "VENDORS/MISC/CP1006.TXT",           "IBM Arabic" ],
    [ 1026,  "VENDORS/MICSFT/EBCDIC/CP1026.TXT",  "IBM EBCDIC Latin 5 Turkish" ],
    [ 1250,  "VENDORS/MICSFT/WINDOWS/CP1250.TXT", "ANSI Eastern Europe" ],
    [ 1251,  "VENDORS/MICSFT/WINDOWS/CP1251.TXT", "ANSI Cyrillic" ],
    [ 1252,  "VENDORS/MICSFT/WINDOWS/CP1252.TXT", "ANSI Latin 1" ],
    [ 1253,  "VENDORS/MICSFT/WINDOWS/CP1253.TXT", "ANSI Greek" ],
    [ 1254,  "VENDORS/MICSFT/WINDOWS/CP1254.TXT", "ANSI Turkish" ],
    [ 1255,  "VENDORS/MICSFT/WINDOWS/CP1255.TXT", "ANSI Hebrew" ],
    [ 1256,  "VENDORS/MICSFT/WINDOWS/CP1256.TXT", "ANSI Arabic" ],
    [ 1257,  "VENDORS/MICSFT/WINDOWS/CP1257.TXT", "ANSI Baltic" ],
    [ 1258,  "VENDORS/MICSFT/WINDOWS/CP1258.TXT", "ANSI/OEM Viet Nam" ],
    [ 10000, "VENDORS/MICSFT/MAC/ROMAN.TXT",      "Mac Roman" ],
    [ 10006, "VENDORS/MICSFT/MAC/GREEK.TXT",      "Mac Greek" ],
    [ 10007, "VENDORS/MICSFT/MAC/CYRILLIC.TXT",   "Mac Cyrillic" ],
    [ 10029, "VENDORS/MICSFT/MAC/LATIN2.TXT",     "Mac Latin 2" ],
    [ 10079, "VENDORS/MICSFT/MAC/ICELAND.TXT",    "Mac Icelandic" ],
    [ 10081, "VENDORS/MICSFT/MAC/TURKISH.TXT",    "Mac Turkish" ],
    [ 20866, "VENDORS/MISC/KOI8-R.TXT",           "Russian KOI8" ],
    [ 20932, "OBSOLETE/EASTASIA/JIS/JIS0208.TXT", "EUC-JP" ],
    [ 21866, "VENDORS/MISC/KOI8-U.TXT",           "Ukrainian KOI8" ],
    [ 28591, "ISO8859/8859-1.TXT",                "ISO 8859-1 Latin 1" ],
    [ 28592, "ISO8859/8859-2.TXT",                "ISO 8859-2 Latin 2 (East European)" ],
    [ 28593, "ISO8859/8859-3.TXT",                "ISO 8859-3 Latin 3 (South European)" ],
    [ 28594, "ISO8859/8859-4.TXT",                "ISO 8859-4 Latin 4 (Baltic old)" ],
    [ 28595, "ISO8859/8859-5.TXT",                "ISO 8859-5 Cyrillic" ],
    [ 28596, "ISO8859/8859-6.TXT",                "ISO 8859-6 Arabic" ],
    [ 28597, "ISO8859/8859-7.TXT",                "ISO 8859-7 Greek" ],
    [ 28598, "ISO8859/8859-8.TXT",                "ISO 8859-8 Hebrew" ],
    [ 28599, "ISO8859/8859-9.TXT",                "ISO 8859-9 Latin 5 (Turkish)" ],
    [ 28600, "ISO8859/8859-10.TXT",               "ISO 8859-10 Latin 6 (Nordic)" ],
    [ 28603, "ISO8859/8859-13.TXT",               "ISO 8859-13 Latin 7 (Baltic)" ],
    [ 28604, "ISO8859/8859-14.TXT",               "ISO 8859-14 Latin 8 (Celtic)" ],
    [ 28605, "ISO8859/8859-15.TXT",               "ISO 8859-15 Latin 9 (Euro)" ],
    [ 28606, "ISO8859/8859-16.TXT",               "ISO 8859-16 Latin 10 (Balkan)" ]
);


# Character type flags; each class owns one bit so classes can be OR-ed
# together in the category tables.
%ctype =
(
    "upper"  => 1 << 0,
    "lower"  => 1 << 1,
    "digit"  => 1 << 2,
    "space"  => 1 << 3,
    "punct"  => 1 << 4,
    "cntrl"  => 1 << 5,
    "blank"  => 1 << 6,
    "xdigit" => 1 << 7,
    "alpha"  => 1 << 8
);

# Map each Unicode general category (third field of UnicodeData.txt)
# to the set of %ctype flags assigned to characters of that category.
%categories =
(
    "Lu" => $ctype{"alpha"}|$ctype{"upper"}, # Letter, Uppercase
    "Ll" => $ctype{"alpha"}|$ctype{"lower"}, # Letter, Lowercase
    "Lt" => $ctype{"alpha"},    # Letter, Titlecase
    "Mn" => $ctype{"punct"},    # Mark, Non-Spacing
    "Mc" => $ctype{"punct"},    # Mark, Spacing Combining
    "Me" => $ctype{"punct"},    # Mark, Enclosing
    "Nd" => $ctype{"digit"},    # Number, Decimal Digit
    "Nl" => $ctype{"punct"},    # Number, Letter
    "No" => $ctype{"punct"},    # Number, Other
    "Zs" => $ctype{"space"},    # Separator, Space
    "Zl" => $ctype{"space"},    # Separator, Line
    "Zp" => $ctype{"space"},    # Separator, Paragraph
    "Cc" => $ctype{"cntrl"},    # Other, Control
    "Cf" => 0,                  # Other, Format
    "Cs" => 0,                  # Other, Surrogate
    "Co" => 0,                  # Other, Private Use
    "Cn" => 0,                  # Other, Not Assigned
    "Lm" => $ctype{"punct"},    # Letter, Modifier
    "Lo" => $ctype{"alpha"},    # Letter, Other
    "Pc" => $ctype{"punct"},    # Punctuation, Connector
    "Pd" => $ctype{"punct"},    # Punctuation, Dash
    "Ps" => $ctype{"punct"},    # Punctuation, Open
    "Pe" => $ctype{"punct"},    # Punctuation, Close
    "Pi" => $ctype{"punct"},    # Punctuation, Initial quote
    "Pf" => $ctype{"punct"},    # Punctuation, Final quote
    "Po" => $ctype{"punct"},    # Punctuation, Other
    "Sm" => $ctype{"punct"},    # Symbol, Math
    "Sc" => $ctype{"punct"},    # Symbol, Currency
    "Sk" => $ctype{"punct"},    # Symbol, Modifier
    "So" => $ctype{"punct"}     # Symbol, Other
);

# a few characters need additional categories that cannot be determined automatically
# (keys are %ctype flag names, values are lists of Unicode code points;
# READ_DEFAULTS ORs the flag into the category of each listed character)
%special_categories =
(
    "xdigit" => [ ord('0')..ord('9'),ord('A')..ord('F'),ord('a')..ord('f'),
                  0xff10..0xff19, 0xff21..0xff26, 0xff41..0xff46 ],
    "space"  => [ 0x09..0x0d, 0x85 ],
    "blank"  => [ 0x09, 0x20, 0xa0, 0x3000, 0xfeff ],
    "cntrl"  => [ 0x070f, 0x180b, 0x180c, 0x180d, 0x180e, 0x200c, 0x200d,
                  0x200e, 0x200f, 0x202a, 0x202b, 0x202c, 0x202d, 0x202e,
                  0x206a, 0x206b, 0x206c, 0x206d, 0x206e, 0x206f, 0xfeff,
                  0xfff9, 0xfffa, 0xfffb ]
);

# Bidirectional category (fifth field of UnicodeData.txt) -> numeric
# direction class stored in the direction table.  Entries are grouped
# by the class they map to.
%directions =
(
    # class 0
    'NSM' => 0,    # Non-Spacing Mark
    'BN'  => 0,    # Boundary Neutral
    # strong left-to-right
    'L'   => 1,    # Left-to-Right
    # strong right-to-left
    'R'   => 2,    # Right-to-Left
    'AL'  => 2,    # Right-to-Left Arabic
    # numbers and number punctuation
    'EN'  => 3,    # European Number
    'ES'  => 4,    # European Number Separator
    'ET'  => 5,    # European Number Terminator
    'AN'  => 6,    # Arabic Number
    'CS'  => 7,    # Common Number Separator
    # separators and whitespace
    'B'   => 8,    # Paragraph Separator
    'S'   => 9,    # Segment Separator
    'WS'  => 10,   # Whitespace
    # class 11: neutrals and explicit formatting codes
    'ON'  => 11,   # Other Neutrals
    'LRE' => 11,   # Left-to-Right Embedding
    'LRO' => 11,   # Left-to-Right Override
    'RLE' => 11,   # Right-to-Left Embedding
    'RLO' => 11,   # Right-to-Left Override
    'PDF' => 11    # Pop Directional Format
);


################################################################
# main routine
#
# Load the Unicode data, emit the tables that do not depend on a
# particular code page, then process every entry of @allfiles.

READ_DEFAULTS();
my @sortkeys = READ_SORTKEYS_FILE();
DUMP_CASE_MAPPINGS();
DUMP_SORTKEYS(@sortkeys);
DUMP_COMPOSE_TABLES();
DUMP_CTYPE_TABLES();

# each entry is [ codepage, filename, description ]
foreach $file (@allfiles) { HANDLE_FILE( @$file ); }

OUTPUT_CPTABLE();

exit(0);


################################################################
# read in the defaults file
#
# Takes no arguments.  Resets and fills the global tables
#   @unicode_defaults, @unicode_aliases, @tolower_table, @toupper_table,
#   @digitmap_table, @compatmap_table, @category_table, @direction_table,
#   @decomp_table, @compose_table (and @uniname)
# first from the $DEFAULTS file, then from $UNICODEDATA.
# Dies if a file cannot be opened or contains an unexpected line.
sub READ_DEFAULTS
{
    @unicode_defaults = ();
    @unicode_aliases = ();
    @tolower_table = ();
    @toupper_table = ();
    @digitmap_table = ();
    @compatmap_table = ();
    @category_table = ();
    @direction_table = ();
    @decomp_table = ();
    @compose_table = ();

    # first setup a few default mappings

    open DEFAULTS, "<", $DEFAULTS or die "Cannot open $DEFAULTS";
    print "Loading $DEFAULTS\n";
    while (<DEFAULTS>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        if (/^(([0-9a-fA-F]+)(,[0-9a-fA-F]+)*)\s+([0-9a-fA-F]+|'.'|none)\s+(\#.*)?/)
        {
            # $1 is a comma-separated list of source chars, $4 the target:
            # a hex value, a quoted literal char, or "none"
            my @src = map hex, split /,/,$1;
            my $dst = $4;
            # several sources on one line form an alias set
            if ($#src > 0) { push @unicode_aliases, \@src; }
            next if ($dst eq "none");
            $dst = ($dst =~ /\'.\'/) ? ord substr($dst,1,1) : hex $dst;
            foreach $src (@src)
            {
                die "Duplicate value" if defined($unicode_defaults[$src]);
                $unicode_defaults[$src] = $dst;
            }
            next;
        }
        die "Unrecognized line $_\n";
    }
    close DEFAULTS;

    # now build mappings from the decomposition field of the Unicode database

    open UNICODEDATA, "<", $UNICODEDATA or die "Cannot open $UNICODEDATA";
    print "Loading $UNICODEDATA\n";
    while (<UNICODEDATA>)
    {
        # Decode the fields ...
        ($code, $name, $cat, $comb, $bidi,
         $decomp, $dec, $dig, $num, $mirror,
         $oldname, $comment, $upper, $lower, $title) = split /;/;

        my $src = hex $code;

        die "unknown category $cat" unless defined $categories{$cat};
        die "unknown directionality $bidi" unless defined $directions{$bidi};

        $uniname[$src] = $name;
        $category_table[$src] = $categories{$cat};
        $direction_table[$src] = $directions{$bidi};

        # a char that has a lowercase mapping is flagged as uppercase,
        # and vice versa
        if ($lower ne "")
        {
            $tolower_table[$src] = hex $lower;
            $category_table[$src] |= $ctype{"upper"}|$ctype{"alpha"};
        }
        if ($upper ne "")
        {
            $toupper_table[$src] = hex $upper;
            $category_table[$src] |= $ctype{"lower"}|$ctype{"alpha"};
        }
        if ($dec ne "")
        {
            $category_table[$src] |= $ctype{"digit"};
        }
        if ($dig ne "")
        {
            # stores the character code of the first char of the digit field
            $digitmap_table[$src] = ord $dig;
        }

        # copy the category and direction for everything between First/Last pairs
        if ($name =~ /, First>/) { $start = $src; }
        if ($name =~ /, Last>/)
        {
            while ($start < $src)
            {
                $category_table[$start] = $category_table[$src];
                $direction_table[$start] = $direction_table[$src];
                $start++;
            }
        }

        next if $decomp eq "";  # no decomposition, skip it

        if ($decomp =~ /^<([a-zA-Z]+)>\s+([0-9a-fA-F]+)$/)
        {
            # decomposition of the form "<foo> 1234" -> use char if type is known
            if (($src >= 0xf900 && $src < 0xfb00) || ($src >= 0xfe30 && $src < 0xfffd))
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = hex $2;
            }
            next unless ($1 eq "font" ||
                         $1 eq "noBreak" ||
                         $1 eq "circle" ||
                         $1 eq "super" ||
                         $1 eq "sub" ||
                         $1 eq "wide" ||
                         $1 eq "narrow" ||
                         $1 eq "compat" ||
                         $1 eq "small");
            $dst = hex $2;
        }
        elsif ($decomp =~ /^<compat>\s+0020\s+([0-9a-fA-F]+)/)
        {
            # decomposition "<compat> 0020 1234" -> combining accent
            $dst = hex $1;
        }
        elsif ($decomp =~ /^([0-9a-fA-F]+)/)
        {
            # decomposition contains only char values without prefix -> use first char
            $dst = hex $1;
            $category_table[$src] |= $category_table[$dst];
            # store decomposition if it contains two chars
            if ($decomp =~ /^([0-9a-fA-F]+)\s+([0-9a-fA-F]+)$/)
            {
                $decomp_table[$src] = [ hex $1, hex $2 ];
                push @compose_table, [ hex $1, hex $2, $src ];
            }
            elsif ($decomp =~ /^(<[a-z]+>\s)*([0-9a-fA-F]+)$/ &&
                   (($src >= 0xf900 && $src < 0xfb00) || ($src >= 0xfe30 && $src < 0xfffd)))
            {
                # Single char decomposition in the compatibility range
                $compatmap_table[$src] = hex $2;
            }
        }
        else
        {
            next;
        }

        next if defined($unicode_defaults[$src]);  # may have been set in the defaults file

        # check for loops: follow the chain of defaults starting at $dst and
        # make sure it never leads back to $src
        for ($i = $dst; ; $i = $unicode_defaults[$i])
        {
            die sprintf("loop detected for %04x -> %04x",$src,$dst) if $i == $src;
            last unless defined($unicode_defaults[$i]);
        }
        $unicode_defaults[$src] = $dst;
    }
    close UNICODEDATA;

    # patch the category of some special characters

    foreach $cat (keys %special_categories)
    {
        my $flag = $ctype{$cat};
        foreach $i (@{$special_categories{$cat}}) { $category_table[$i] |= $flag; }
    }
}


################################################################
# parse the input file
#
# Argument: name of a code page mapping file.
# Resets and fills the globals @cp2uni (code page value -> Unicode),
# @uni2cp (Unicode -> code page value) and @lead_bytes.  When a value
# appears more than once, the first mapping seen wins.
# Dies if the file cannot be opened or contains an unexpected line.
sub READ_FILE
{
    my $name = shift;
    open INPUT, "<", $name or die "Cannot open $name";
    @cp2uni = ();
    @lead_bytes = ();
    @uni2cp = ();

    while (<INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if (/^0x([0-9a-fA-F]+)\s+\#UNDEFINED/);  # undefined char

        if (/^0x([0-9a-fA-F]+)\s+\#DBCS LEAD BYTE/)
        {
            $cp = hex $1;
            push @lead_bytes,$cp;
            # lead bytes have no Unicode value of their own
            $cp2uni[$cp] = 0;
            next;
        }
        if (/^0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            $cp = hex $1;
            $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close INPUT;
}


################################################################
# build EUC-JP table from the JIS 0208 file
# FIXME: for proper EUC-JP we should probably read JIS 0212 too
# but this would require 3-byte DBCS characters
#
# Argument: name of the JIS 0208 mapping file.
# Resets and fills the globals @cp2uni, @uni2cp and @lead_bytes.
# Dies if the file cannot be opened or contains an unexpected line.
sub READ_JIS0208_FILE
{
    my $name = shift;
    @cp2uni = ();
    @lead_bytes = ();
    @uni2cp = ();

    # ASCII chars
    for ($i = 0x00; $i <= 0x7f; $i++)
    {
        $cp2uni[$i] = $i;
        $uni2cp[$i] = $i;
    }

    # JIS X 0201 right plane
    for ($i = 0xa1; $i <= 0xdf; $i++)
    {
        $cp2uni[0x8e00 + $i] = 0xfec0 + $i;
        $uni2cp[0xfec0 + $i] = 0x8e00 + $i;
    }

    # lead bytes
    foreach $i (0x8e, 0x8f, 0xa1 .. 0xfe)
    {
        push @lead_bytes,$i;
        $cp2uni[$i] = 0;
    }

    # undefined chars
    foreach $i (0x80 .. 0x8d, 0x90 .. 0xa0, 0xff)
    {
        $cp2uni[$i] = $DEF_CHAR;
    }

    # Shift-JIS compatibility
    $uni2cp[0x00a5] = 0x5c;
    $uni2cp[0x203e] = 0x7e;

    # Fix backslash conversion
    # (set before the file is read: the loop below never overwrites
    # an entry that is already defined)
    $cp2uni[0xa1c0] = 0xff3c;
    $uni2cp[0xff3c] = 0xa1c0;

    open INPUT, "<", $name or die "Cannot open $name";
    while (<INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        if (/^0x[0-9a-fA-F]+\s+0x([0-9a-fA-F]+)\s+0x([0-9a-fA-F]+)\s+(\#.*)?/)
        {
            # the first column is ignored; the second column is offset
            # by 0x8080 to form the two-byte code page value
            $cp = 0x8080 + hex $1;
            $uni = hex $2;
            $cp2uni[$cp] = $uni unless defined($cp2uni[$cp]);
            $uni2cp[$uni] = $cp unless defined($uni2cp[$uni]);
            next;
        }
        die "$name: Unrecognized line $_\n";
    }
    close INPUT;
}


################################################################
# build the sort keys table
#
# Reads $SORTKEYS and returns a list indexed by Unicode value (below
# 65536) holding one packed 32-bit sort key per character; characters
# without a key are left undefined.
# Dies if the file cannot be opened, contains an unexpected line, or
# holds more distinct keys than fit in the packed format.
sub READ_SORTKEYS_FILE
{
    my @sortkeys = ();
    # element 0 is the Unicode value, -1 marking "no key for this char"
    for (my $i = 0; $i < 65536; $i++) { $sortkeys[$i] = [ -1, 0, 0, 0, 0 ] };

    open INPUT, "<", $SORTKEYS or die "Cannot open $SORTKEYS";
    print "Loading $SORTKEYS\n";
    while (<INPUT>)
    {
        next if /^\#/;  # skip comments
        next if /^$/;  # skip empty lines
        next if /\x1a/;  # skip ^Z
        next if /^\@version/;  # skip @version header
        if (/^([0-9a-fA-F]+)\s+;\s+\[([*.])([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            my $uni = hex $1;
            next if $uni > 65535;
            $sortkeys[$uni] = [ $uni, hex $3, hex $4, hex $5, hex $6 ];
            next;
        }
        if (/^([0-9a-fA-F]+\s+)+;\s+\[[*.]([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]{4})\.([0-9a-fA-F]+)\]/)
        {
            # multiple character sequence, ignored for now
            next;
        }
        die "$SORTKEYS: Unrecognized line $_\n";
    }
    close INPUT;

    # compress the keys to 32 bit:
    # key 1 to 16 bits, key 2 to 8 bits, key 3 to 4 bits, key 4 to 1 bit

    # order by the four keys; ties are broken on the Unicode value so that
    # the order is the same on every run
    @sortkeys = sort { ${$a}[1] <=> ${$b}[1] or
                       ${$a}[2] <=> ${$b}[2] or
                       ${$a}[3] <=> ${$b}[3] or
                       ${$a}[4] <=> ${$b}[4] or
                       ${$a}[0] <=> ${$b}[0]; } @sortkeys;

    # $n2 / $n3 number the distinct key 2 / key 3 values met so far under
    # the current key 1 / (key 1, key 2) prefix; @keys remembers that prefix
    my ($n2, $n3) = (1, 1);
    my @keys = (-1, -1, -1, -1, -1 );
    my @flatkeys = ();

    for (my $i = 0; $i < 65536; $i++)
    {
        my @current = @{$sortkeys[$i]};
        next if $current[0] == -1;
        if ($current[1] == $keys[1])
        {
            if ($current[2] == $keys[2])
            {
                if ($current[3] == $keys[3])
                {
                    # nothing
                }
                else
                {
                    $keys[3] = $current[3];
                    $n3++;
                    die "$SORTKEYS: too many distinct third-level keys to fit in 4 bits\n" if ($n3 >= 16);
                }
            }
            else
            {
                $keys[2] = $current[2];
                $keys[3] = $current[3];
                $n2++;
                $n3 = 1;
                die "$SORTKEYS: too many distinct second-level keys to fit in 8 bits\n" if ($n2 >= 256);
            }
        }
        else
        {
            $keys[1] = $current[1];
            $keys[2] = $current[2];
            $keys[3] = $current[3];
            $n2 = 1;
            $n3 = 1;
        }

        # zero keys stay zero, non-zero keys are replaced by their rank
        if ($current[2]) { $current[2] = $n2; }
        if ($current[3]) { $current[3] = $n3; }
        if ($current[4]) { $current[4] = 1; }

        $flatkeys[$current[0]] = ($current[1] << 16) | ($current[2] << 8) | ($current[3] << 4) | $current[4];
    }
    return @flatkeys;
}


################################################################
# build the sort keys table
#
# Arguments: the list of packed sort keys indexed by Unicode value
# (undefined entries mean "no key").
# Writes collation.c.new and hands it to save_file("collation.c").
# The generated array starts with 256 range offsets, followed by one
# 256-entry block of defaults, followed by one 256-entry block for
# every range that contains at least one key.
sub DUMP_SORTKEYS
{
    my @keys = @_;

    # count the number of 256-key ranges that contain something

    my @offsets = ();
    my $ranges = 2;  # the index block and the defaults block
    for (my $i = 0; $i < 256; $i++) { $offsets[$i] = 256; }
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $keys[$i];
        $offsets[$i >> 8] = $ranges * 256;
        $ranges++;
        $i |= 255;  # skip to the end of this range
    }

    # output the range offsets

    open OUTPUT,">collation.c.new" or die "Cannot create collation.c";
    printf "Building collation.c\n";
    printf OUTPUT "/* Unicode collation element table */\n";
    printf OUTPUT "/* generated from %s */\n", $SORTKEYS;
    printf OUTPUT "/* DO NOT EDIT!! */\n\n";

    printf OUTPUT "const unsigned int collation_table[%d] =\n{\n", $ranges*256;
    printf OUTPUT "    /* index */\n";
    printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%08x", 0, @offsets );

    # output the default values
    # (0xffffffff rather than -1: with 64-bit integers "%08x" would print
    # -1 as 16 hex digits, which does not fit the unsigned int table)

    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0, (0xffffffff) x 256 );

    # output all the key ranges

    for (my $i = 0; $i < 256; $i++)
    {
        next if $offsets[$i] == 256;
        printf OUTPUT ",\n    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%08x", 0xffffffff, @keys[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT "\n};\n";
    close OUTPUT;
    save_file("collation.c");
}


################################################################
# add default mappings once the file had been read
#
# Takes no arguments.  Extends the global @uni2cp (and, for the final
# identity step, @cp2uni) using @unicode_aliases and @unicode_defaults.
# Entries that are already defined are never overwritten.
sub ADD_DEFAULT_MAPPINGS
{
    # Apply aliases

    foreach $alias (@unicode_aliases)
    {
        # the first member of the set that has a mapping provides the value
        my $target = undef;
        foreach $src (@$alias)
        {
            if (defined($uni2cp[$src]))
            {
                $target = $uni2cp[$src];
                last;
            }
        }
        next unless defined($target);

        # At least one char of the alias set is defined, set the others to the same value
        foreach $src (@$alias)
        {
            $uni2cp[$src] = $target unless defined($uni2cp[$src]);
        }
    }

    # For every src -> target mapping in the defaults table,
    # make uni2cp[src] = uni2cp[target] if uni2cp[target] is defined

    for ($src = 0; $src < 65536; $src++)
    {
        next if defined($uni2cp[$src]);  # source has a definition already
        next unless defined($unicode_defaults[$src]);  # no default for this char
        my $target = $unicode_defaults[$src];

        # do a recursive mapping until we find a target char that is defined
        while (!defined($uni2cp[$target]) &&
               defined($unicode_defaults[$target])) { $target = $unicode_defaults[$target]; }

        if (defined($uni2cp[$target])) { $uni2cp[$src] = $uni2cp[$target]; }
    }

    # Add an identity mapping for all undefined chars
    # (only when the value is unused in both directions)

    for ($i = 0; $i < 256; $i++)
    {
        next if defined($cp2uni[$i]);
        next if defined($uni2cp[$i]);
        $cp2uni[$i] = $uni2cp[$i] = $i;
    }
}


################################################################
# dump an array of integers
#
# Arguments: a sprintf format for one element, the value to print for
# undefined elements, and the list of values.
# Returns a string with the formatted values separated by commas, eight
# per line, each line indented by four spaces and no trailing comma.
# An empty list yields a single default element.
sub DUMP_ARRAY
{
    my ($format,$default,@array) = @_;
    # both variables must be inside the parentheses, otherwise "my" only
    # applies to the first one and $ret would be a package global
    my ($i, $ret) = (0, "    ");
    for ($i = 0; $i < $#array; $i++)
    {
        $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
        $ret .= (($i % 8) != 7) ? ", " : ",\n    ";
    }
    # the last element is written without a separator
    $ret .= sprintf($format, defined $array[$i] ? $array[$i] : $default);
    return $ret;
}


################################################################
# dump an SBCS mapping table
#
# Arguments: the code page number and its description.
# Writes to the already opened OUTPUT handle, using the globals @cp2uni,
# @uni2cp and $DEF_CHAR:
#   cp2uni[256]   - code page value -> Unicode
#   uni2cp_low[]  - one 256-entry subtable per populated Unicode high
#                   byte, followed by a subtable of defaults
#   uni2cp_high[] - offset of the subtable for each Unicode high byte
#   cptable_NNN   - the descriptor that ties the three arrays together
sub DUMP_SBCS_TABLE
{
    my ($codepage, $name) = @_;
    my $i;

    # output the ascii->unicode table

    printf OUTPUT "static const WCHAR cp2uni[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;  # the defaults subtable is always present
    for ($i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;  # skip to the end of this subtable
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned char uni2cp_low[%d] =\n{\n", $subtables*256;
    for ($i = 0; $i < 256; $i++)
    {
        next unless $filled[$i];
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $i, $i;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%02x", $DEF_CHAR, @uni2cp[($i<<8) .. ($i<<8)+255] );
    }
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    my @offsets = ();
    for ($i = 0; $i < 256; $i++)
    {
        if ($filled[$i]) { push @offsets, $pos; $pos += 256; }
        # empty high bytes point at the defaults subtable, which comes last
        else { push @offsets, ($subtables-1) * 256; }
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @offsets );

    # output the code page descriptor

    printf OUTPUT "const struct sbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 1, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT "    cp2uni,\n";
    printf OUTPUT "    uni2cp_low,\n";
    printf OUTPUT "    uni2cp_high\n};\n";
}


################################################################
# dump a DBCS mapping table
#
# Arguments: the code page number and its description.
# Writes to the already opened OUTPUT handle, using the globals @cp2uni,
# @uni2cp, @lead_bytes and $DEF_CHAR:
#   cp2uni[]             - single byte table, optional table shared by
#                          unused lead bytes, then one table per used
#                          lead byte
#   cp2uni_leadbytes[256]- index of the table to use for each lead byte
#   uni2cp_low[] / uni2cp_high[] - Unicode -> code page subtables
#   cptable_NNN          - the descriptor, including the lead byte ranges
sub DUMP_DBCS_TABLE
{
    my ($codepage, $name) = @_;
    my ($i, $x, $y);

    # build a list of lead bytes that are actually used

    my @lblist = ();
    LBLOOP: for ($y = 0; $y <= $#lead_bytes; $y++)
    {
        my $base = $lead_bytes[$y] << 8;
        for ($x = 0; $x < 256; $x++)
        {
            if (defined $cp2uni[$base+$x])
            {
                push @lblist,$lead_bytes[$y];
                next LBLOOP;
            }
        }
    }
    # 1 if at least one declared lead byte has no character defined
    my $unused = ($#lead_bytes > $#lblist) ? 1 : 0;

    # output the ascii->unicode table for the single byte chars
    # (every following table writes its own leading ",\n" so that the
    # array is closed properly even when no lead byte is in use)

    printf OUTPUT "static const WCHAR cp2uni[%d] =\n", 256 * ($#lblist + 2 + $unused);
    printf OUTPUT "{\n%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[0 .. 255] );

    # output the default table for unused lead bytes

    if ($unused)
    {
        printf OUTPUT ",\n    /* unused lead bytes */\n";
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );
    }

    # output the ascii->unicode table for each DBCS lead byte

    for ($y = 0; $y <= $#lblist; $y++)
    {
        my $base = $lblist[$y] << 8;
        printf OUTPUT ",\n    /* lead byte %02x */\n", $lblist[$y];
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @cp2uni[$base .. $base+255] );
    }
    printf OUTPUT "\n};\n\n";

    # output the lead byte subtables offsets

    my @lb_offsets = ();
    for ($x = 0; $x < 256; $x++) { $lb_offsets[$x] = 0; }
    for ($x = 0; $x <= $#lblist; $x++) { $lb_offsets[$lblist[$x]] = $x + 1; }
    if ($unused)
    {
        # increment all lead bytes offset to take into account the unused table
        for ($x = 0; $x <= $#lead_bytes; $x++) { $lb_offsets[$lead_bytes[$x]]++; }
    }
    printf OUTPUT "static const unsigned char cp2uni_leadbytes[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%02x", 0, @lb_offsets );

    # count the number of unicode->ascii subtables that contain something

    my @filled = ();
    my $subtables = 1;  # the defaults subtable is always present
    for ($i = 0; $i < 65536; $i++)
    {
        next unless defined $uni2cp[$i];
        $filled[$i >> 8] = 1;
        $subtables++;
        $i |= 255;  # skip to the end of this subtable
    }

    # output all the subtables into a single array

    printf OUTPUT "static const unsigned short uni2cp_low[%d] =\n{\n", $subtables*256;
    for ($y = 0; $y < 256; $y++)
    {
        next unless $filled[$y];
        printf OUTPUT "    /* 0x%02x00 .. 0x%02xff */\n", $y, $y;
        printf OUTPUT "%s,\n", DUMP_ARRAY( "0x%04x", $DEF_CHAR, @uni2cp[($y<<8) .. ($y<<8)+255] );
    }
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, ($DEF_CHAR) x 256 );

    # output a table of the offsets of the subtables in the previous array

    my $pos = 0;
    my @high_offsets = ();
    for ($y = 0; $y < 256; $y++)
    {
        if ($filled[$y]) { push @high_offsets, $pos; $pos += 256; }
        # empty high bytes point at the defaults subtable, which comes last
        else { push @high_offsets, ($subtables-1) * 256; }
    }
    printf OUTPUT "static const unsigned short uni2cp_high[256] =\n";
    printf OUTPUT "{\n%s\n};\n\n", DUMP_ARRAY( "0x%04x", 0, @high_offsets );

    # output the code page descriptor

    printf OUTPUT "const struct dbcs_table cptable_%03d =\n{\n", $codepage;
    printf OUTPUT "    { %d, 2, 0x%04x, 0x%04x, \"%s\" },\n",
                  $codepage, $DEF_CHAR, $DEF_CHAR, $name;
    printf OUTPUT "    cp2uni,\n";
    printf OUTPUT "    cp2uni_leadbytes,\n";
    printf OUTPUT "    uni2cp_low,\n";
    printf OUTPUT "    uni2cp_high,\n";
    DUMP_LB_RANGES();
    printf OUTPUT "};\n";
}


################################################################
# write the lead byte ranges of the current code page to OUTPUT
#
# Consecutive byte values found in the global @lead_bytes are merged
# into (first, last) pairs; the list is closed by a 0x00, 0x00 pair.
sub DUMP_LB_RANGES
{
    # flag every byte value that is listed as a lead byte
    my @is_lead = ();
    $is_lead[$_] = 1 foreach @lead_bytes;

    my $text = "    { ";
    my $range_start;   # defined while we are inside a run of lead bytes
    foreach my $byte (0 .. 255)
    {
        if ($is_lead[$byte])
        {
            next if defined $range_start;
            # a new run starts here
            $range_start = $byte;
            $text .= sprintf "0x%02x, ", $byte;
        }
        elsif (defined $range_start)
        {
            # the run ended on the previous byte
            $text .= sprintf "0x%02x, ", $byte - 1;
            undef $range_start;
        }
    }
    # a run that reaches the last byte value is closed with 0xff
    $text .= "0xff, " if defined $range_start;
    $text .= "0x00, 0x00 }\n";

    printf OUTPUT "%s", $text;
}


870 871 872 873
################################################################
# generate casemap.c
#
# The file is written as casemap.c.new through the OUTPUT handle and
# handed to save_file() once complete.  It holds one table for each of
# the global @tolower_table, @toupper_table, @digitmap_table and
# @compatmap_table arrays.
sub DUMP_CASE_MAPPINGS
{
    open OUTPUT,">casemap.c.new" or die "Cannot create casemap.c";
    printf "Building casemap.c\n";

    # file header
    foreach my $line ("/* Unicode case mappings */\n",
                      "/* Automatically generated; DO NOT EDIT!! */\n\n",
                      "#include \"wine/unicode.h\"\n\n")
    {
        printf OUTPUT "%s", $line;
    }

    # C symbol name and source array of every table, in output order
    my @tables =
    (
        [ "wine_casemap_lower", \@tolower_table ],
        [ "wine_casemap_upper", \@toupper_table ],
        [ "wine_digitmap",      \@digitmap_table ],
        [ "wine_compatmap",     \@compatmap_table ],
    );
    foreach my $entry (@tables)
    {
        my ($symbol, $mapping) = @$entry;
        DUMP_CASE_TABLE( $symbol, @$mapping );
    }

    close OUTPUT;
    save_file("casemap.c");
}


################################################################
# dump a case mapping table
#
# Arguments: the C symbol name, followed by the mapping array indexed
# by character value (undefined entries mean "no mapping").
#
# The table written to OUTPUT consists of a 256-entry index (one slot
# per high byte, 256 for unused ones), a 256-entry block of zeros, and
# then one block per high byte that has at least one mapping.  Every
# stored value is the 16-bit difference (mapping - char).
sub DUMP_CASE_TABLE
{
    my ($name, @table) = @_;

    # Pass 1: find the populated 256-char blocks and, for each of them,
    # the low byte of its first mapping and the number of entries left
    # after its last mapping.

    my @first_used = ( 0, 0 );
    my @tail_free  = ( 0, 255 );
    my @filled = ();        # high byte -> offset of its block in the table
    my $blocks = 0;

    foreach my $ch (0 .. 65535)
    {
        next unless defined $table[$ch];
        my $hi = $ch >> 8;
        my $lo = $ch & 0xff;
        if (defined $filled[$hi])
        {
            # another mapping in the block opened last
            $tail_free[$blocks - 1] = 0xff - $lo;
        }
        else
        {
            # first mapping of a new block
            $first_used[$blocks] = $lo;
            $tail_free[$blocks]  = 0xff - $lo;
            $filled[$hi] = $blocks * 256 + 512;
            $blocks++;
        }
        # store the offset to the mapped char instead of the char itself
        $table[$ch] = ($table[$ch] - $ch) & 0xffff;
    }

    # Pass 2: collapse blocks upwards if possible.  The leading unused
    # entries of a block are dropped, limited by the tail_free value of
    # the entry before it; for the first block the index -1 selects the
    # last element of @tail_free.

    my $removed = 0;
    my $blk = 0;
    foreach my $hi (0 .. 255)
    {
        next unless defined $filled[$hi];
        my $prev_tail = $tail_free[$blk - 1];
        if ($prev_tail > $first_used[$blk])
        {
            $removed += $first_used[$blk];
        }
        else
        {
            $removed += $prev_tail;
            $first_used[$blk] = $prev_tail;
        }
        $filled[$hi] -= $removed;
        $blk++;
    }

    # dump the table

    my $size = $blocks * 256 + 512 - $removed;
    my $index_text = DUMP_ARRAY( "0x%04x", 256, @filled );
    my $zero_text  = DUMP_ARRAY( "0x%04x", 0, (0) x 256 );

    printf OUTPUT "const WCHAR %s[%d] =\n", $name, $size;
    printf OUTPUT "{\n    /* index */\n";
    printf OUTPUT "%s,\n", $index_text;
    printf OUTPUT "    /* defaults */\n";
    printf OUTPUT "%s", $zero_text;

    $blk = 0;
    foreach my $hi (0 .. 255)
    {
        next unless $filled[$hi];
        my $start = $first_used[$blk++];
        my $base  = $hi << 8;
        printf OUTPUT ",\n    /* 0x%02x%02x .. 0x%02xff */\n", $hi, $start, $hi;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @table[$base + $start .. $base + 255] );
    }
    printf OUTPUT "\n};\n";
}


958 959 960 961
################################################################
# dump the ctype tables
#
# Writes wctype.c (as wctype.c.new, then save_file) containing
# wine_wctype_table: 256 row offsets followed by the distinct
# 256-entry rows of character type values.  Rows with identical
# content are stored only once and shared through the offsets.
#
# Reads the global @category_table and @direction_table.  The
# direction is or'ed into bits 12-15 of @category_table in place.
sub DUMP_CTYPE_TABLES
{
    open OUTPUT,">wctype.c.new" or die "Cannot create wctype.c.new: $!";
    printf "Building wctype.c\n";
    printf OUTPUT "/* Unicode ctype tables */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    # first 256 entries: offset of the row used by each high byte;
    # the rows themselves are appended behind them
    my @array = (0) x 256;

    # row content -> offset of that row in @array; kept lexical so that
    # nothing left over from elsewhere can be mistaken for a known row
    my %sequences = ();

    # add the direction in the high 4 bits of the category
    for my $i (0 .. 65535)
    {
        $category_table[$i] |= $direction_table[$i] << 12;
    }

    # try to merge table rows
    for my $row (0 .. 255)
    {
        my @values = @category_table[($row << 8) .. ($row << 8) + 255];
        my $rowtxt = sprintf "%04x" x 256, @values;
        if (defined($sequences{$rowtxt}))
        {
            # reuse an existing row
            $array[$row] = $sequences{$rowtxt};
        }
        else
        {
            # create a new row at the current end of the array
            $sequences{$rowtxt} = $array[$row] = scalar @array;
            push @array, @values;
        }
    }

    printf OUTPUT "const unsigned short wine_wctype_table[%d] =\n{\n", scalar @array;
    printf OUTPUT "    /* offsets */\n%s,\n", DUMP_ARRAY( "0x%04x", 0, @array[0..255] );
    printf OUTPUT "    /* values */\n%s\n};\n", DUMP_ARRAY( "0x%04x", 0, @array[256..$#array] );

    # buffered write errors only show up here; do not install a
    # truncated file
    close(OUTPUT) or die "Cannot write wctype.c.new: $!";
    save_file("wctype.c");
}

1002 1003 1004 1005 1006

################################################################
# dump the char composition tables
#
# Writes compose.c (as compose.c.new, then save_file) containing:
#   unicode_compose_table       built from the global @compose_table
#   unicode_compose_table_size  number of distinct second chars
#   unicode_decompose_table     built from the global @decomp_table
#
# All working data is lexical, so the generated tables depend only on
# the two input arrays.
sub DUMP_COMPOSE_TABLES
{
    open OUTPUT,">compose.c.new" or die "Cannot create compose.c.new: $!";
    printf "Building compose.c\n";
    printf OUTPUT "/* Unicode char composition */\n";
    printf OUTPUT "/* Automatically generated; DO NOT EDIT!! */\n\n";
    printf OUTPUT "#include \"wine/unicode.h\"\n\n";

    ######### composition table

    # group the entries by their second char (element 1); each group
    # collects [ element 0, element 2 ] pairs
    my @by_second = ();
    foreach my $entry (@compose_table)
    {
        my @comp = @$entry;
        push @{$by_second[$comp[1]]}, [ $comp[0], $comp[2] ];
    }

    # the different second chars we have, in increasing order
    my @seconds = grep { defined $by_second[$_] } 0 .. 65535;
    my $count = scalar @seconds;

    # build the table of second chars and offsets; positions are counted
    # in pairs, the first $count + 1 pairs being this table itself

    my @second_index = ();
    my $pos = $count + 1;
    foreach my $second (@seconds)
    {
        push @second_index, $second, $pos;
        $pos += @{$by_second[$second]};
    }
    # terminator with last position
    push @second_index, 0, $pos;
    printf OUTPUT "const WCHAR unicode_compose_table[0x%x] =\n{\n", 2*$pos;
    printf OUTPUT "    /* second chars + offsets */\n%s", DUMP_ARRAY( "0x%04x", 0, @second_index );

    # build the table of first chars and mappings, sorted by first char

    foreach my $second (@seconds)
    {
        my @list = sort { $a->[0] <=> $b->[0] } @{$by_second[$second]};
        my @pairs = map { ($_->[0], $_->[1]) } @list;
        printf OUTPUT ",\n    /* 0x%04x */\n%s", $second, DUMP_ARRAY( "0x%04x", 0, @pairs );
    }
    printf OUTPUT "\n};\n\nconst unsigned int unicode_compose_table_size = %d;\n\n", $count;

    ######### decomposition table

    # first determine all the 16-char subsets that contain something

    my @subset_pos = (0) x 4096;
    my $next_pos = 16*2;  # for the null subset
    for (my $i = 0; $i < 65536; $i++)
    {
        next unless defined $decomp_table[$i];
        $subset_pos[$i >> 4] = $next_pos;
        $next_pos += 16*2;
        $i |= 15;  # skip the rest of this subset
    }
    my $total = $next_pos;

    # now count the 256-char subsets that contain something

    my @filled_idx = (256) x 256;
    $next_pos = 256 + 16;
    for (my $i = 0; $i < 4096; $i++)
    {
        next unless $subset_pos[$i];
        $filled_idx[$i >> 4] = $next_pos;
        $next_pos += 16;
        $i |= 15;  # skip the rest of this 256-char subset
    }
    my $null_offset = $next_pos;  # null mapping
    $total += $next_pos;

    # add the index offsets to the subsets positions

    for my $i (0 .. 4095)
    {
        next unless $subset_pos[$i];
        $subset_pos[$i] += $null_offset;
    }

    # dump the main index

    printf OUTPUT "const WCHAR unicode_decompose_table[%d] =\n", $total;
    printf OUTPUT "{\n    /* index */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @filled_idx );
    printf OUTPUT ",\n    /* null sub-index */\n%s", DUMP_ARRAY( "0x%04x", 0, ($null_offset) x 16 );

    # dump the second-level indexes; empty slots point to the null mapping

    for my $i (0 .. 255)
    {
        next unless ($filled_idx[$i] > 256);
        my @sub_index = map { $_ || $null_offset } @subset_pos[($i<<4)..($i<<4)+15];
        printf OUTPUT ",\n    /* sub-index %02x */\n", $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @sub_index );
    }

    # dump the 16-char subsets: two values per char

    printf OUTPUT ",\n    /* null mapping */\n";
    printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, (0) x 32 );

    for my $i (0 .. 4095)
    {
        next unless $subset_pos[$i];
        my @mapping = (0) x 32;
        for my $j (0 .. 15)
        {
            my $decomp = $decomp_table[($i << 4) + $j];
            next unless defined $decomp;
            $mapping[2 * $j] = ${$decomp}[0];
            $mapping[2 * $j + 1] = ${$decomp}[1];
        }
        printf OUTPUT ",\n    /* 0x%03x0 .. 0x%03xf */\n", $i, $i;
        printf OUTPUT "%s", DUMP_ARRAY( "0x%04x", 0, @mapping );
    }

    printf OUTPUT "\n};\n";

    # buffered write errors only show up here; do not install a
    # truncated file
    close(OUTPUT) or die "Cannot write compose.c.new: $!";
    save_file("compose.c");
}


1141 1142 1143 1144 1145 1146
################################################################
# read an input file and generate the corresponding .c file
#
# Arguments: code page number, mapping file name relative to
# $MAPPREFIX, and a descriptive comment.  The result goes to
# c_NNN.c.new through the OUTPUT handle and is then passed to
# save_file().
sub HANDLE_FILE
{
    my ($codepage, $filename, $comment) = @_;

    # code page 20932 is read with the JIS0208 reader, everything else
    # with the generic one
    if ($codepage != 20932) { READ_FILE("$MAPPREFIX$filename"); }
    else { READ_JIS0208_FILE("$MAPPREFIX$filename"); }

    # hack: 0x00a5 must map to backslash in Shift-JIS
    $uni2cp[0x00a5] = 0x5c if $codepage == 932;

    ADD_DEFAULT_MAPPINGS();

    my $output = sprintf("c_%03d.c", $codepage);
    open OUTPUT,">$output.new" or die "Cannot create $output";

    printf "Building %s from %s (%s)\n", $output, $filename, $comment;

    # file header
    printf OUTPUT "/* code page %03d (%s) */\n"
                . "/* generated from %s */\n"
                . "/* DO NOT EDIT!! */\n\n"
                . "#include \"wine/unicode.h\"\n\n",
                  $codepage, $comment, "$MAPPREFIX$filename";

    # the tables are double-byte as soon as lead bytes were recorded in
    # the global @lead_bytes, single-byte otherwise
    if (@lead_bytes) { DUMP_DBCS_TABLE( $codepage, $comment ); }
    else { DUMP_SBCS_TABLE( $codepage, $comment ); }

    close OUTPUT;
    save_file($output);
}


################################################################
# save a file if modified
#
# Argument: the target file name.  The freshly generated content is
# expected in "<name>.new".  When both files have the same content the
# .new file is removed and the target is left untouched; otherwise
# (different content, or no readable target yet) the .new file is
# renamed over the target.
#
# The comparison is done in-process with File::Compare instead of
# running cmp through the shell, so file names containing spaces or
# shell metacharacters are handled correctly.  Dies if the rename fails.
sub save_file($)
{
    my $file = shift;
    require File::Compare;

    # compare() returns 0 for identical files, 1 for different ones
    # and -1 on error, e.g. when the target does not exist yet
    if (File::Compare::compare($file, "$file.new") == 0)
    {
        unlink "$file.new" or warn "Cannot remove $file.new: $!\n";
    }
    else
    {
        rename "$file.new", $file or die "Cannot rename $file.new to $file: $!";
    }
}


################################################################
# output the list of codepage tables into the cptable.c file
#
# For every entry of @allfiles an extern declaration is generated,
# followed by the cptables[] array holding the address of each table.
# The text is kept in the global @tables_decl and inserted into
# cptable.c by REPLACE_IN_FILE.
sub OUTPUT_CPTABLE
{
    # the code page number is the first element of each entry
    my @codepages = map( $_->[0], @allfiles );

    @tables_decl =
    (
        map( sprintf("extern union cptable cptable_%03d;\n", $_), @codepages ),
        sprintf("\nstatic const union cptable * const cptables[%d] =\n{\n", scalar @codepages),
        map( sprintf("    &cptable_%03d,\n", $_), @codepages ),
        "};",
    );

    REPLACE_IN_FILE( "cptable.c", @tables_decl );
}

################################################################
# replace the contents of a file between ### cpmap ### marks
#
# Arguments: the file name, followed by the replacement lines.
#
# Everything up to and including the "### cpmap begin ###" line is
# kept, the replacement lines are inserted, the old lines up to the
# "### cpmap end ###" line are dropped, and that line (preceded by an
# empty line) and the rest of the file are kept.  The result is written
# to "<name>.new" and handed to save_file().
#
# Lexical file handles and loop variables are used so that the
# caller's $_ and any global FILE handle are left alone.
sub REPLACE_IN_FILE
{
    my $name = shift;
    my @data = @_;
    my @lines = ();

    open(my $in, "<", $name) or die "Can't open $name: $!";

    # copy the head of the file, begin mark included
    while (my $line = <$in>)
    {
        push @lines, $line;
        last if $line =~ /\#\#\# cpmap begin \#\#\#/;
    }
    push @lines, @data;

    # skip the old content up to the end mark
    while (my $line = <$in>)
    {
        if ($line =~ /\#\#\# cpmap end \#\#\#/) { push @lines, "\n", $line; last; }
    }

    # copy the tail of the file
    push @lines, <$in>;
    close($in);

    open(my $out, ">", "$name.new") or die "Can't modify $name: $!";
    print {$out} @lines;
    # buffered write errors only show up at close time
    close($out) or die "Can't write $name.new: $!";
    save_file($name);
}