#!/usr/local/bin/perl -w
#
# Copyright (c) 1998  David Hiebeler
# For licensing information, see the "printLicense" function
# down around line 85.
#
# File: cedictmerge, version 1.1
#   By: David Hiebeler
#       Center for Applied Math
#       Cornell University
#       Ithaca, NY 14853
#       hiebeler@cam.cornell.edu
#       http://www.cam.cornell.edu/hiebeler/home.html
#
#       Version 1.1: December 1998
#       Version 1.0: July 1998
#
#
# This is a perl script for merging two CEDICT-format files (see
# "http://www.mindspring.com/~paul_denisowski/cedict.html" for
# information about CEDICT).
#
# Usage: cedictmerge [-nma 0|1] [-o outFile] [-nodup] [-uu2u: | -u:2uu] file1 file2 [file3 file4 ... fileN]
#
# Consider file1 to be the "original" or "main" file, and the other
# files will be merged into it.  Consider file2 being merged in.
# If an entry in file2 is not in file1, then it will be appended to
# the end of file1's data (however, file1 will not be changed -- the
# merged data will be written to outFile if specified, otherwise stdout).
# If an entry in file2 is in file1, then it will still be appended to
# the end of the merged file, but with the special field "/!!!!!/" appended
# to its English definition (so that one can easily find such entries
# using an editor later, and manually merge duplicate entries).
#
# If the "-nodup" flag is specified, duplicate entries from file2 will not be
# appended to the output data.
#
# You can use the "-nma x" argument to set the "NeutralMatchesAny" flag.
# The value "x" should be either 0 or 1.  If you use 1, it means a neutral
# tone (i.e. tone 5) matches any tone.  This is because it's a pretty
# common mistake (at least for me) to put the "intrinsic tone" in the pinyin
# field for a character, if I don't realize the character's tone becomes
# neutral in that particular word.  This helps catch such mistakes.
#
# You can use the "-uu2u:" command-line argument to turn pinyin entries
# like "nuu3" into "nu:3", and the "-u:2uu" argument to do the opposite,
# i.e. turn "nu:3" into "nuu3".  (This feature is available because
# both forms have appeared in various versions of CEDICT).
#
# This script should work correctly on both GB and BIG5 files.
#
# Note that this script will exit if it encounters any lines not
# in cedict format, with the following exception: it will ignore (and
# discard) any blank lines, and discard any comments which begin
# with '#' (whether the comment is the only thing on a line, or at the
# end of a line).  You may want to use the "cedictcheckformat" script
# first to catch any lines in your vocabulary file which are not in strict
# CEDICT format.  You may also want to use "cedictsort" after merging,
# to sort the results.
#
# Also note that this script checks each new word against the list of
# all words it has seen so far.  In particular, this means it will catch
# duplicate entries within a single merged-in file, not only duplicates
# between different files.  (The first file is read without any duplicate
# checking, so duplicates within file1 itself are not flagged.)
#
# Wishlist:
# o) Allow the option of ignoring the pinyin field, since then it could
#    catch entries which have mistakes in the pinyin (other than just
#    mistakes about characters changing to neutral tone).  It would probably
#    be best to do this only for multi-character words, so that it wouldn't
#    flag all of the single characters which have multiple pronunciations.
#
# History:
#   10 Dec 1998: added code to turn "uu" into "u:" or vice-versa in the
#      pinyin field if the user requests it, to handle the fact that both
#      forms have been present in cedict for some time now.
#   29 July 1998: original version, 1.0


# Codes stored in $uConvert to select a rewrite of the pinyin field:
# $uu2uc requests "uu" -> "u:", and $uc2uu requests "u:" -> "uu".
# ($uConvert == 0 means no conversion was requested.)
($uu2uc, $uc2uu) = (1, 2);


#
# Print the program banner followed by the GPL (version 2 or later)
# licensing notice to standard output.  Takes no arguments.
# The main program calls this for the "-license" option and then exits.
#
sub printLicense {
    # The banner names this program (cedictmerge); it previously said
    # "cedictsort", apparently copied from the companion script.
    print <<"END_OF_LICENSE";
cedictmerge version 1.1  June 10, 1999
Copyright (C) 1998,1999  David Hiebeler
                         Center for Applied Math
                         Cornell University
                         Ithaca, NY 14853
                         hiebeler\@cam.cornell.edu
                         http://www.cam.cornell.edu/hiebeler/home.html

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

END_OF_LICENSE
}


#
# Reset the option globals to their defaults.  The main program calls
# this once before it parses the command line.
#
#   $outFname          "-" : output destination when no -o is given
#   $doDup             1   : duplicates are kept (and marked); -nodup clears it
#   $neutralMatchesAny 0   : neutral tone matches only itself unless -nma 1
#   $uConvert          0   : no "uu"/"u:" conversion of the pinyin field
#
sub setupdefaults {
    $doDup = 1;
    $outFname = "-";
    $neutralMatchesAny = $uConvert = 0;
}


#
# Print a usage message on standard output and exit with status 2.
# Takes no arguments and never returns.
#
# The message lists every option the main program recognizes,
# including -license, which was previously accepted but undocumented.
#
sub printusage {
    print <<"END_OF_USAGE";
Usage: $0 [-license] [-nma 0|1] [-o outFname] [-nodup] [-uu2u: | -u:2uu] file1 file2 [file3 ... fileN]
   -license : Print the copyright and license notice, then exit
   -nma 0|1 : Neutral Matches Any (1 for on, 0 for off) -- for comparing two
              entries to see if they are the same; on means neutral tone
              matches any tone
   -o outFname : specify where to put the output (default = stdout)
   -nodup : Don't output duplicate entries
            (default = output, with extra /!!!!!/ field at end)
   -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)
   -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)
END_OF_USAGE
    exit 2;
}


#
# Read a line, removing comments which begin with "#", and ignoring
# empty lines (or lines which only have a comment).
#
# Usage:
#   $line = getline();      read from <> (the files named in @ARGV, or stdin)
#   $line = getline($fh);   read from the given filehandle; a filehandle
#                           name given as a string (e.g. "INFILE") also works
#
# Returns the next usable line with any trailing "#..." comment and the
# line terminator removed, or nothing (undef in scalar context) at end
# of input.  Dies if called with more than one argument.
#
# $getlinelinenum counts every line read, including the skipped ones.
# It is never reset, so it accumulates across all handles.
#
$getlinelinenum = 0;
sub getline {
    die "getline must be called with at most one argument" if @_ > 1;

    # With no argument fall back to the magic ARGV handle, i.e. what a
    # bare <> reads from.
    my $fh = @_ ? $_[0] : \*ARGV;

    # A lexical line variable keeps the caller's $_ intact.
    while (defined(my $line = <$fh>)) {
        $getlinelinenum++;
        next if $line =~ /^\s*#/;    # comment-only line
        next if $line =~ /^\s*$/;    # blank line
        $line =~ s/#.*$//;           # strip trailing comment
        # chomp, not chop: a final line without a newline must not lose
        # its last real character.
        chomp $line;
        return $line;
    }
    return;
}


#
# Return 1 if the two strings have the same pinyin, otherwise return 0
# The two strings are in $_[0] and $_[1]
#
# Each string is split on whitespace into syllables, and the two must
# have the same number of syllables.  Syllables are compared pairwise
# as plain strings.  When the global $neutralMatchesAny is true, a
# syllable ending in tone 5 (neutral) also matches the same syllable
# carrying any single tone digit, whichever side the neutral one is on.
#
# Syllables are never used as regex patterns unescaped, so characters
# such as "." or "(" in a pinyin field cannot cause false matches or a
# fatal regex compilation error.
#
sub samePinyin {
    my @words1 = split(" ", $_[0]);
    my @words2 = split(" ", $_[1]);

    return 0 if scalar(@words1) != scalar(@words2);

    for my $i (0 .. $#words1) {
        my ($syl1, $syl2) = ($words1[$i], $words2[$i]);
        next if $syl1 eq $syl2;
        return 0 unless $neutralMatchesAny;

        # The syllables differ, so they can only match if one of them
        # is neutral and the other is the same base plus one tone digit.
        my ($neutral, $other);
        if    ($syl1 =~ /5\z/) { ($neutral, $other) = ($syl1, $syl2); }
        elsif ($syl2 =~ /5\z/) { ($neutral, $other) = ($syl2, $syl1); }
        else                   { return 0; }

        my $base = substr($neutral, 0, -1);    # drop the trailing "5"
        return 0 unless $other =~ /\A\Q$base\E\d\z/;
    }
    # if we got this far, it must have been a match
    return 1;
}


#
# read in a vocabulary file
# Filename to read from is in $_[0]
# Reference to array of references to hashes to use is in $_[1]
# If this is not the first file we are reading, then $_[2] will be 1,
# telling us we should check for duplicates as we read.  If $_[2] is 0,
# then this is the first file, and we just read it, not checking for
# duplicate entries.
#
# Each accepted line becomes a hash with the keys "levels", "chinese",
# "pinyin" and "english", stored at index $vocabIndex of the array, and
# the global $vocabIndex is then incremented.  An entry is a duplicate
# when an earlier entry has the same Chinese field and samePinyin()
# accepts the two pinyin fields.  Duplicates are skipped when the global
# $doDup is 0; otherwise they are stored with "/!!!!!/" appended to the
# English field.
#
# Dies if the file cannot be opened or if a line is not in CEDICT
# format.  Returns the result of closing the input file.
#
sub readvocabfile {
    my ($fname, $arrayRef, $checkDups) = @_;
    my ($levels, $chinese, $pinyin, $english);

    # Three-argument open with a lexical handle: the filename is never
    # interpreted as an open mode or a pipe.
    open(my $in, '<', $fname) or die "Couldn't open infile '$fname': $!";

    # Index the entries read so far (Chinese field => list of pinyin
    # fields) so each new entry is compared only with entries sharing
    # its Chinese field instead of with the whole list.
    my %pinyinsFor;
    if ($checkDups == 1) {
        for my $i (0 .. $vocabIndex - 1) {
            push @{ $pinyinsFor{ $$arrayRef[$i]->{chinese} } },
                $$arrayRef[$i]->{pinyin};
        }
    }

  READVOCABLOOP:
    # Test for definedness so a line that is false as a string (e.g. "0")
    # is reported as invalid rather than silently ending the file.
    while (defined(my $line = getline($in))) {
        # handle case where line has skill level(s) at beginning
        # NOTE(review): a Chinese field that itself starts with digits
        # is parsed as a skill level here -- confirm this is acceptable.
        if ($line =~ m@^\s*([0-9]+)\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($levels, $chinese, $pinyin, $english) = ($1, $2, $3, $4);
            $chinese =~ s/\s+$//;  # truncate trailing spaces on chinese
            $levels .= " ";
        }
        # line doesn't have skill level numbers at beginning
        elsif ($line =~ m@^\s*(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
            ($chinese, $pinyin, $english) = ($1, $2, $3);
            $chinese =~ s/\s+$//;  # truncate trailing spaces on chinese
            $levels = "";
        }
        else {
            $line =~ s/[\n\r]//g;
            print "Invalid line: `$line'\n";
            die "Invalid line encountered in '$fname'";
        }

        # Convert "uu" into "u:" or vice-versa in pinyin field,
        # if the user requested it.  /g is required: the field can hold
        # several syllables and every one of them must be converted.
        if ($uConvert == $uu2uc) {
            $pinyin =~ s/uu/u:/g;
        }
        elsif ($uConvert == $uc2uu) {
            $pinyin =~ s/u:/uu/g;
        }

        if ($checkDups == 1) {  # check for duplicates
            my $isDup = 0;
            for my $seenPinyin (@{ $pinyinsFor{$chinese} || [] }) {
                if (samePinyin($seenPinyin, $pinyin)) {
                    $isDup = 1;
                    last;  # we found duplicate, so stop looking
                }
            }
            if ($isDup) {
                # don't do duplicates, so don't add this entry to
                # the vocabulary list; go read the next entry.
                next READVOCABLOOP if $doDup == 0;
                $english =~ s@/$@/!!!!!/@;
            }
            # Later lines of this same file are checked against this
            # entry too.
            push @{ $pinyinsFor{$chinese} }, $pinyin;
        }

        # now put everything into the main array of hashes
        $$arrayRef[$vocabIndex] = {
            "levels"  => $levels,
            "chinese" => $chinese,
            "english" => $english,
            "pinyin"  => $pinyin,
        };
        $vocabIndex++;
    }
    close $in;
}


#
# Write the whole vocabulary list (the global @wordList) to a filehandle.
# The filehandle, or its name as a string, is in $_[0].
# Each entry is written on one line as
#   <levels><chinese> [<pinyin>] <english>
#
sub printVocab {
    my ($out) = @_;

    for my $entry (@wordList) {
        my $text = join("",
                        $entry->{levels},
                        $entry->{chinese},
                        " [", $entry->{pinyin}, "] ",
                        $entry->{english},
                        "\n");
        print {$out} $text;
    }
}


##############
# Main program
##############
# 1. Parse leading options; the first non-option argument is file1.
# 2. Read file1 without duplicate checking, then merge in each remaining
#    file with duplicate checking on.
# 3. Only then open the output and write the merged list.  Opening the
#    output last means a usage error or unreadable input never truncates
#    an existing file, and "-o file1 file1 file2" no longer destroys
#    file1 before it has been read.
setupdefaults();
my $thisarg;
# Test for definedness so an argument of "0" does not end the loop.
while (defined($thisarg = shift(@ARGV))) {
    if ($thisarg eq "-o") {
        $outFname = shift(@ARGV);
        printusage() unless defined($outFname);
    }
    elsif ($thisarg eq "-nodup") { $doDup = 0; }
    elsif ($thisarg eq "-license") { printLicense(); exit(0); }
    elsif ($thisarg eq "-nma") {
        $neutralMatchesAny = shift(@ARGV);
        # Insist on the documented 0|1, so that forgetting the value
        # cannot silently swallow the first filename.
        printusage()
            unless defined($neutralMatchesAny)
                && $neutralMatchesAny =~ /^[01]$/;
    }
    elsif ($thisarg eq "-uu2u:") {
        $uConvert = $uu2uc;
    }
    elsif ($thisarg eq "-u:2uu") {
        $uConvert = $uc2uu;
    }
    else { last; }
}

# ok, now $thisarg contains the first filename (if it's defined)
printusage() unless defined($thisarg);

@wordList = ();
$vocabIndex = 0;
readvocabfile($thisarg, \@wordList, 0);
print "# Got ", scalar(@wordList), " entries\n";
while (defined($thisarg = shift(@ARGV))) {
    my $savedVocabIndex = $vocabIndex;
    readvocabfile($thisarg, \@wordList, 1);
    print "# Got another ", $vocabIndex - $savedVocabIndex, " entries\n";
}

# "-" (the default) means standard output; anything else is opened
# with three-argument open so the name is never parsed as a mode.
my $outFh;
if ($outFname eq "-") {
    $outFh = \*STDOUT;
}
else {
    open($outFh, '>', $outFname)
        or die "Couldn't open output file `$outFname': $!\n";
}
printVocab($outFh);
# Buffered write errors (e.g. a full disk) only surface at close time.
close($outFh) or die "Error writing output file `$outFname': $!\n";
