#!/usr/bin/perl

#  Yudit Unicode Editor Source File
#
#  Copyright (C) 2000  Gaspar Sinai <gsinai@yudit.org>  
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME
# From:  Miikka-Markus.Alhonen@tigatieto.com
# There are also a few errors in category.txt, specifically concerning the
# following lines in the original UnicodeData.txt
# 
# 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
# 4DB5;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;
# 4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
# 9FA5;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
# AC00;<Hangul Syllable, First>;Lo;0;L;;;;;N;;;;;
# D7A3;<Hangul Syllable, Last>;Lo;0;L;;;;;N;;;;;
# D800;<Non Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
# DB7F;<Non Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
# DB80;<Private Use High Surrogate, First>;Cs;0;L;;;;;N;;;;;
# DBFF;<Private Use High Surrogate, Last>;Cs;0;L;;;;;N;;;;;
# DC00;<Low Surrogate, First>;Cs;0;L;;;;;N;;;;;
# DFFF;<Low Surrogate, Last>;Cs;0;L;;;;;N;;;;;
# E000;<Private Use, First>;Co;0;L;;;;;N;;;;;
# F8FF;<Private Use, Last>;Co;0;L;;;;;N;;;;;
# 
# So, all the characters in ranges U+3400-U+4DB5, U+4E00-U+9FA5, and
# U+AC00-U+D7A3 belong to the class Lo, in U+D800-U+DFFF to Cs and in
# U+E000-U+F8FF to Co. The Perl script which generated category.txt, interpreted
# these lines so that e.g. only characters U+3400 and U+4DB5 are Lo and those in
# between unassigned. At the moment, this is not so serious an error, since all
# these classes and the unassigned characters are treated the same way, but in the
# future this could cause a problem, if e.g. unknown characters in the Private Use
# Area were to be drawed differently from other unknown characters or so.
# 
# This same misinterpretation is potential to occur also with the following lines
# in Unicode 3.1 Character Database, as soon as a newer version of the database is
# taken as the base of category.txt:
# 
# 20000;<CJK Ideograph Extension B, First>;Lo;0;L;;;;;N;;;;;
# 2A6D6;<CJK Ideograph Extension B, Last>;Lo;0;L;;;;;N;;;;;
# F0000;<Plane 15 Private Use, First>;Co;0;L;;;;;N;;;;;
# FFFFD;<Plane 15 Private Use, Last>;Co;0;L;;;;;N;;;;;
# 100000;<Plane 16 Private Use, First>;Co;0;L;;;;;N;;;;;
# 10FFFD;<Plane 16 Private Use, Last>;Co;0;L;;;;;N;;;;;
# 
# FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME FIXME

#
# This script makes a General Category map.
# In order to run the script you need to get
#  ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
# as an input.
# encode composing chars -> precomposed char
# decode precomposed char -> composing chars

# Numeric code for each two-letter General Category abbreviation found in
# the third field of a UnicodeData.txt record.  The codes are written to
# the output in hexadecimal (1 => 0x01 ... 30 => 0x1E).
%GeneralCategory = (
    Lu =>  1,
    Ll =>  2,
    Lt =>  3,
    Mn =>  4,
    Mc =>  5,
    Me =>  6,
    Nd =>  7,
    Nl =>  8,
    No =>  9,
    Zs => 10,
    Zl => 11,
    Zp => 12,
    Cc => 13,
    Cf => 14,
    Cs => 15,
    Co => 16,
    Cn => 17,

    # Informative categories.
    Lm => 18,
    Lo => 19,
    Pc => 20,
    Pd => 21,
    Ps => 22,
    Pe => 23,
    Pi => 24,
    Pf => 25,
    Po => 26,
    Sm => 27,
    Sc => 28,
    Sk => 29,
    So => 30,
);

@encodes = ();
@decodes = ();
$name = "precompose.my";

# Today's date as YYYY-MM-DD; it is interpolated into the generated header.
# localtime() yields the year as an offset from 1900 and a zero-based month.
($mday, $mon, $year) = (localtime(time))[3 .. 5];
$year += 1900;
$mon++;
$date = sprintf ("%4d-%02d-%02d", $year, $mon, $mday);

# Emit the header of the generated file: a provenance line (the script name
# from $0 and the generation date from $date, both interpolated by the
# heredoc) followed by a legend pairing each hexadecimal category code with
# its General Category abbreviation.  The legend codes are the values of
# %GeneralCategory written in hex.
print <<EOD;
#
# generated by $0 $date
# 
# from ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt 
#

#  01 Lu Letter, Uppercase
#  02 Ll Letter, Lowercase
#  03 Lt Letter, Titlecase
#  04 Mn Mark, Non-Spacing
#  05 Mc Mark, Spacing Combining
#  06 Me Mark, Enclosing
#  07 Nd Number, Decimal Digit
#  08 Nl Number, Letter
#  09 No Number, Other
#  0A Zs Separator, Space
#  0B Zl Separator, Line
#  0C Zp Separator, Paragraph
#  0D Cc Other, Control
#  0E Cf Other, Format
#  0F Cs Other, Surrogate
#  10 Co Other, Private Use
#  11 Cn Other, Not Assigned (no characters in the file have this property)

# Informative Categories

#  12 Lm Letter, Modifier
#  13 Lo Letter, Other
#  14 Pc Punctuation, Connector
#  15 Pd Punctuation, Dash
#  16 Ps Punctuation, Open
#  17 Pe Punctuation, Close
#  18 Pi Punctuation, Initial quote(may behave like Ps or Pe depending on usage)
#  19 Pf Punctuation, Final quote (may behave like Ps or Pe depending on usage)
#  1A Po Punctuation, Other
#  1B Sm Symbol, Math
#  1C Sc Symbol, Currency
#  1D Sk Symbol, Modifier
#  1E So Symbol, Other

# bumap does not handle 0 - this codepoint has been excluded for now.
#
EOD


# Read UnicodeData.txt records from the files named on the command line (or
# from standard input) and print one "0xCC 0xNNNN" line per code point, where
# CC is the numeric General Category code and NNNN the code point, both in
# hexadecimal.  A closing "#END" marker is printed after the last record.
#
# NOTE: every record is handled on its own, so a "<..., First>" /
# "<..., Last>" pair yields only its two end points; the code points in
# between are not emitted.
while (my $record = <>)
{
  # Split into an explicit array.  The previous code relied on split()
  # filling @_ when called in scalar context; Perl 5.12 and later no longer
  # do that, which left every field undefined and the output empty.
  my @fields = split (/;/, $record);
  next if @fields < 6;

  my $codepoint = hex $fields[0];
  # Only code points below 0x10000 are emitted.
  next if ($codepoint >= 0x10000);
  # bumap does not handle 0, so that code point is left out.
  next if ($codepoint == 0);

  # Skip records whose category abbreviation is not in %GeneralCategory.
  my $code = $GeneralCategory{$fields[2]};
  next if (!defined ($code));

  # A literal newline is used instead of $/, which is the *input* record
  # separator and only happens to be "\n" by default.
  printf ("0x%02X 0x%04X\n", $code, $codepoint);
}

print "\n#END\n";
