#!/usr/local/bin/perl -w
#
# Copyright (c) 1998,1999  David Hiebeler
# For licensing information, see the "printLicense" function
# down around line 84.
#
# File: cedictlookup, version 1.1  (formerly known as `chnvlookup')
#   By: David Hiebeler
#       Center for Applied Math
#       Cornell University
#       Ithaca, NY 14853
#       hiebeler@cam.cornell.edu
#       http://www.cam.cornell.edu/hiebeler/home.html
#
#       Version 1.1: June 1999
#       Version 1.0: August 1998  (under the name `chnvlookup')
#
#       Any comments about this software are appreciated, especially
#       performance-improving ideas for the lookupWord() subroutine.
#
# Perl script for doing Chinese vocabulary lookup (by Chinese, pinyin, or
# English) from CEDICT-format vocabulary files.
# (See "http://www.mindspring.com/~paul_denisowski/cedict.html" for
# information about CEDICT.)
#
# See cedictlookup.doc for documentation.
# Quick & dirty "how-to" for those who don't like documentation:
#    cedictlookup -vf /my/chinese/vocab/file.gb
# Then type in words in Chinese, pinyin, or English, and it will try
# to look them up in your vocab file.  You'll have to read the docs (or
# try to decipher the output of "cedictlookup -help") if you want to use
# multiple vocabulary files simultaneously, do non-exact matches, or
# enable/disable FastExact searching.
#
# Note that this script will ignore any lines it encounters in
# vocabulary files which are not in cedict format.  In particular, it
# will ignore any blank lines, and discard any comments which
# begin with '#' (whether the comment is the only thing on a line, or
# at the end of a line).  You may want to use the "cedictcheckformat"
# script to catch any lines in your vocabulary files which are not in
# strict CEDICT format.  If you don't have it already, it's also available
# at my web site, at the URL above.
#
# Wishlist / known bugs:
# o) Allow FastExact match searching for English (although that will probably
#    turn out to be too much of a memory hog).
# o) Allow you to change the FastExact settings without restarting
#    (as you can change Match Mode and Anchor Mode).
# o) "0-tone" (or "forgotten-tone") pinyin lookups do not work with
#    Match Mode = Shorter.  Hopefully it will be fixed in the next release.
# o) Further improve performance of vocabulary lookups.
# o) Allow some options to sort the output in different ways, e.g.
#    alphabetically by pinyin (using the algorithm that "cedictsort" uses),
#    or maybe first by length of Chinese and then alphabetically within that.
#    It should also remove duplicate entries after sorting.
#
# History:
#   08 June 1999: Added "fastexact" matches in Chinese and pinyin.  These use
#      Perl's built-in hashing algorithm to find exact matches to Chinese
#      words much more quickly than the code was able to do by searching
#      through the entire vocabulary list.
#   10 Dec 1998: added code to turn "uu" into "u:" or vice-versa in the
#      pinyin field if the user requests it, to handle the fact that both
#      forms were present in cedict for some time, or may be present in
#      various Chinese documents you encounter.  Note, if you activate
#      this conversion, it will be done both when reading in the
#      vocabulary files, and when processing pinyin user input.
#      You can activate conversion by using either the "-uu2u:" or "-u:2uu"
#      command-line arguments.
#   05 August 1998: original version, 1.0
#
################################################################


# I was using the following for timing the code; commented out for now.
# use Benchmark;


# Codes that $uConvert can hold to request a rewrite of the pinyin field;
# setdefaults() leaves $uConvert at 0, which means "no conversion".
#   $uu2uc : turn "uu" into "u:"        $uc2uu : turn "u:" into "uu"
($uu2uc, $uc2uu) = (1, 2);


sub printLicense {
    # The notice is kept as one interpolating here-document; the only
    # character that needs escaping in it is the "@" of the e-mail address.
    my $notice = <<"END_OF_LICENSE";
cedictlookup version 1.1   June 11, 1999
Copyright (C) 1998,1999  David Hiebeler
                         Center for Applied Math
                         Cornell University
                         Ithaca, NY 14853
                         hiebeler\@cam.cornell.edu
                         http://www.cam.cornell.edu/hiebeler/home.html

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

END_OF_LICENSE
    print $notice;
}


#
# Reset every run-time parameter to its built-in default.  All of them are
# package globals; getargs() may override them afterwards.
#
sub setdefaults {
    # Debugging aids: write a log file / print extra information.
    ($doLog, $verbose) = (0, 0);

    # Set when running as a subprocess of emacs, which changes our
    # behavior slightly.
    $underEmacs = 0;

    # English lookups: 1 = ignore case, 0 = case-sensitive.
    $caseInsensitive = 1;

    # Directory holding the vocabulary files, below the user's home.
    $HOME = $ENV{'HOME'};
    $vocabDir = $HOME . "/chinese/words/vocab";

    # Vocabulary files to load, as one colon-separated string.
    # (An earlier list also had "syhykb3.gb" after "vocabulary.gb".)
    $vocabFiles = join(":", "vocabulary.gb", "yuwen5.gb", "yuyan.gb",
		       "cedict.gb");

    # Default MatchMode and AnchorMode.
    ($matchMode, $anchorMode) = ("exact", "start");

    # Pinyin-field conversion ("uu" to "u:" or the reverse); 0 = none.
    $uConvert = 0;

    # Whether to strip skill-level information from the English field.
    $removeSkillLevels = 1;

    # Which fast exact matches to do: the string holds "c" for Chinese
    # and/or "p" for pinyin (readvocabfile tests for both letters); only
    # Chinese is enabled by default.  To get the effect of "c+", leave this
    # as "c" and set $continueAfterFastMatch to 1 -- that variable is the '+'.
    $fastExactMatches = "c";

    # Whether to carry on with the general (slow) search after a
    # successful fast match.
    $continueAfterFastMatch = 0;

    # Start with explicitly empty fast-lookup hashes.
    %fastMatchChineseHash = ();
    %fastMatchPinyinHash = ();
}


#
# Print the current parameter values, one setting (or group of related
# settings) per line.  Dies if $uConvert holds a value other than 0,
# $uu2uc or $uc2uu.
#
sub printsettings {
    print "verbose = $verbose, underEmacs = $underEmacs, caseInsensitive = $caseInsensitive\n";
    print "vocabDir = `$vocabDir'\n", "vocabFiles = `$vocabFiles'\n";
    print "matchMode = `$matchMode', anchorMode = `$anchorMode'\n";

    # Translate the numeric conversion code back into its flag name.
    my $convLabel = ($uConvert == 0)      ? "none"
		  : ($uConvert == $uu2uc) ? "uu2u:"
		  : ($uConvert == $uc2uu) ? "u:2uu"
		  :                         undef;
    print "uConvert = ";
    die "Illegal value: $uConvert\n" unless defined($convLabel);
    print "$convLabel\n";

    print "rmskill = $removeSkillLevels\n";

    # Spell out the FastExact letters; a trailing "+" means the general
    # search still runs after a fast match.
    my $fastLabel = "(none)";
    if ($fastExactMatches ne "") {
	$fastLabel = "";
	$fastLabel .= "chinese " if $fastExactMatches =~ m/c/;
	$fastLabel .= "pinyin " if $fastExactMatches =~ m/p/;
	$fastLabel .= "+" if $continueAfterFastMatch;
    }
    print "fastExactMatches = $fastLabel\n";
}


#
# Print the usage message on the currently selected output handle and
# exit with status 2.  Never returns.
#
sub printusage {
    # One interpolating here-document; "$0" is the only variable in it.
    # Compared with the earlier print-per-line version, the sentence ending
    # in "beginning of the match" now carries its period before the newline
    # (it used to start the following line), and the -rmskill line is
    # indented like its neighbours.
    print <<"END_OF_USAGE";
Usage: $0 [-v] [-vd path] [-i | +i] [-emacs] [-uu2u: | -u:2uu]
         [-vf fname1:fname2:...:fnameN] [-rmskill 0|1] [-mm e|s|l] [-am s|e|n]
         [-fastexact 0|c] [-license]

  -v : verbose mode (print extra info at startup and when scanning user input)
  -emacs : for use when invoked as subprocess under emacs
  -i : case-insensitive matching for English (enabled by default)
  +i : do case-sensitive matching of English words
  -vd path : set vocabDir, directory containing vocabulary files
  -vf fname1:fname2:...:fnameN : colon-separated list of vocabulary filenames
  -rmskill 0|1 : do(1) or don't(0) remove skill level info from English field
  -mm e|s|l : match mode -- (e)xact, (s)horter, (l)onger
  -am s|e|n : anchor mode -- (s)tart, (e)nd, or (n)one
              Only used for non-exact match mode; e.g. anchor mode of `start'
              means your word must be at the beginning of the match.
  -uu2u: : Turn pinyin entries like `nuu3' into `nu:3' (default = don't)
  -u:2uu : Turn pinyin entries like `nu:3' into `nuu3' (default = don't)
  -fastexact 0|c : Do fast exact matches in addition to slower pattern matches
                   (This option can also be abbreviated as `-fe').
      `0' means don't do any fast exact matches.
      `c' means do fast exact matches for Chinese (which uses extra memory).
  -license: Print license information and exit.
END_OF_USAGE
    exit 2;
}


#
# Read the next meaningful line from a filehandle, or from <> (the files
# named in @ARGV, else standard input) when called with no argument.
#
# Lines that are empty, blank, or hold nothing but a "#" comment are
# skipped.  On the line that is returned, a trailing "#" comment and the
# line terminator are removed.
#
# Argument: optional filehandle -- a glob/handle reference, or the name of
#           a bareword handle given as a string.
# Returns:  the cleaned-up line, or undef at end of input.
# Dies if called with more than one argument.
#
sub getline {
    if (@_ > 1) {
	die "getline must be called with a single argument or no arguments";
    }
    my $fh = @_ ? $_[0] : undef;

    # Read into a lexical so the caller's $_ is left alone.
    my $line;
    while (defined($line = defined($fh) ? <$fh> : <>)) {
	next if $line =~ /^\s*#/;
	next if $line =~ /^\s*$/;
	$line =~ s/#.*$//;
	# chomp, not chop: the last line of a file may have no newline, and
	# chop would then eat its final real character.
	chomp $line;
	return $line;
    }
    return undef;
}


#
# Read one cedict-format vocabulary file and append its entries to the
# global @wordList (and to the FastExact hashes enabled in
# $fastExactMatches), advancing the global $vocabIndex for each entry.
#
# Argument: $_[0] is the path of the file to read.
# Lines that are not of the form
#     [skill-level] chinese [pinyin] /english/
# are silently ignored.  Prints "read N words" (also to LOGFP when $doLog
# is set).  Dies if the file cannot be opened.
#
sub readvocabfile {
    my ($chinese, $pinyin, $english);
    my $numRead = 0;

    # Three-argument open: the file name can never be taken for a mode or
    # a pipe, whatever characters it contains.
    open(my $in, '<', $_[0]) or die "Couldn't open infile `$_[0]': $!";

    # Test for definedness, not truth, so that a line consisting of "0"
    # does not end the file early.
    READVOCABLOOP: while (defined(my $line = getline($in))) {
	# The optional leading digits are a skill level, which is dropped.
	# Being greedy, the optional group is tried first and skipped only
	# if the rest of the line cannot match with it.
	if ($line =~ m@^\s*(?:[0-9]+\s*)?(.+)\s*\[(.+)\]\s*(/.*/)\s*$@) {
	    ($chinese, $pinyin, $english) = ($1, $2, $3);
	    $chinese =~ s/\s+$//;  # truncate trailing spaces on chinese
	}
	else {
	    # line isn't in cedict format; don't be harsh, just ignore it
	    next READVOCABLOOP;
	}

	# Convert "uu" into "u:" or vice-versa in the pinyin field, if the
	# user requested it.  /g: the field may hold several syllables.
	if ($uConvert == $uu2uc) {
	    $pinyin =~ s/uu/u:/g;
	}
	elsif ($uConvert == $uc2uu) {
	    $pinyin =~ s/u:/uu/g;
	}

	# Remove skill level info from the end of the English field if
	# requested.
	if ($removeSkillLevels) {
	    $english =~ s@/=[0-9]+=/$@/@;
	}

	# now put everything into the main array of hashes
	$wordList[$vocabIndex] = {
	    "chinese"       => $chinese,
	    "english"       => $english,
	    "pinyin"        => $pinyin,
	    "chineseLength" => length($chinese),
	};

	# Remember the index of this entry under its exact Chinese and/or
	# pinyin text, for fast hash lookups later.
	if ($fastExactMatches =~ m/c/) {
	    push @{$fastMatchChineseHash{$chinese}}, $vocabIndex;
	}
	if ($fastExactMatches =~ m/p/) {
	    push @{$fastMatchPinyinHash{$pinyin}}, $vocabIndex;
	}
	$vocabIndex++;
	$numRead++;
    }
    close($in);
    print "read $numRead words\n";
    if ($doLog) { print LOGFP "read $numRead words\n"; }
}


#
# Parse the command-line arguments (passed in as @_) into the global
# parameter variables.  Any unknown flag, missing value or illegal value
# ends the program through printusage(); "-license" prints the license
# and exits with status 0.
#
sub getargs {
    my @pending = @_;

    # Flags that stand alone, each with the action it triggers.
    my %simpleFlag = (
	"-v"       => sub { $verbose = 1; },          # verbose
	"-emacs"   => sub { $underEmacs = 1; },       # being run under emacs
	"-i"       => sub { $caseInsensitive = 1; },  # case-insensitive English
	"+i"       => sub { $caseInsensitive = 0; },  # case-sensitive English
	"-uu2u:"   => sub { $uConvert = $uu2uc; },
	"-u:2uu"   => sub { $uConvert = $uc2uu; },
	"-license" => sub { printLicense(); exit(0); },
    );

    # One-letter codes accepted by -mm and -am.
    my %matchModeFor = ("e" => "exact", "s" => "shorter", "l" => "longer");
    my %anchorModeFor = ("s" => "start", "e" => "end", "n" => "none");

    while (defined(my $flag = shift(@pending))) {
	if (exists($simpleFlag{$flag})) {
	    $simpleFlag{$flag}->();
	    next;
	}

	# Every remaining flag takes exactly one value.
	my $value = shift(@pending);
	printusage() unless defined($value);

	if ($flag eq "-vd") {          # vocabulary directory
	    $vocabDir = $value;
	}
	elsif ($flag eq "-vf") {       # vocabulary file(s)
	    $vocabFiles = $value;
	}
	elsif ($flag eq "-rmskill") {
	    $removeSkillLevels = $value;
	}
	elsif (($flag eq "-fastexact") || ($flag eq "-fe")) {
	    # Consume the value piece by piece; whatever is left over at
	    # the end was not understood.
	    $fastExactMatches = "";
	    unless ($value =~ s/0//) {
		$fastExactMatches .= "c" if $value =~ s/c//;
		$fastExactMatches .= "p" if $value =~ s/p//;
		$continueAfterFastMatch = 1 if $value =~ s/\+$//;
		$continueAfterFastMatch = 0 if $value =~ s/-$//;
	    }
	    printusage() if $value ne "";
	}
	elsif ($flag eq "-mm") {       # match mode
	    printusage() unless exists($matchModeFor{$value});
	    $matchMode = $matchModeFor{$value};
	}
	elsif ($flag eq "-am") {       # anchor mode
	    printusage() unless exists($anchorModeFor{$value});
	    $anchorMode = $anchorModeFor{$value};
	}
	elsif ($flag eq "-log") {      # extra logging to a file (debugging)
	    $logFname = $value;
	    $doLog = 1;
	}
	else {
	    printusage();
	}
    }
}


#
# (private helper of classifyCPE) Return a copy of a pinyin string with
# "uu" turned into "u:" or vice-versa, according to the global $uConvert;
# the string comes back unchanged when no conversion was requested.
#
sub _convertPinyinU {
    my ($text) = @_;
    # Note "==": these are comparisons; $uConvert itself is never modified.
    if ($uConvert == $uu2uc) {
	$text =~ s/uu/u:/g;
    }
    elsif ($uConvert == $uc2uu) {
	$text =~ s/u:/uu/g;
    }
    return $text;
}


#
# Check to see whether a string contains Chinese, pinyin, or English.
# Returns one of the strings "english", "chinese", or "pinyin".
#
# NOTE: the argument is modified in place (through $_[0]) so that it can
# be handed straight to the lookup code afterwards:
#   - a leading "-wp" ("wildcard pinyin") or "-we" ("wildcard english")
#     flag is removed and every remaining word is turned into a regexp
#     which allows partial matches, e.g. "jin" matches "jin4", "jing1";
#   - pinyin tone 0 ("I forgot the tone") becomes the class [1-5];
#   - pinyin "uu"/"u:" is converted if the user asked for that.
# The global $forgotTone is set to 1 when a tone-0 syllable is found in
# ordinary (non "-wp") pinyin input.
#
# Dies if the string contains no words at all.
#
sub classifyCPE
{
    my $stripped = $_[0];   # private copy, used for classification only

    # if high bit on first char is set, assume chinese
    if (ord($stripped) >= 128) {
	return "chinese";
    }

    # Classify on a copy with periods, commas and parentheses blanked out,
    # so punctuation stuck to the first word cannot hide a tone digit.
    # If nothing but punctuation was typed, use the raw words instead.
    $stripped =~ s/[\.,\(\)]+/ /g;
    my @words = split(" ", $stripped);
    if (!@words) { @words = split(" ", $_[0]); }
    if (!@words) {
	die "Fatal error in classifyCPE(), on `$_[0]'\n";
    }

    # Special case -- "-wp" at the beginning indicates "wildcard pinyin",
    # i.e. the person wants to do partial matches.
    if ($words[0] =~ m/^-wp/) {
	$_[0] =~ s/^\s*-wp\s*//;   # surgically remove the "-wp" from string
	$_[0] = _convertPinyinU($_[0]);
	my @syllables = split(" ", $_[0]);
	foreach my $syllable (@syllables) {
	    # Allow anything but whitespace in front of the syllable, and
	    # also behind it unless it already ends in a tone digit.
	    $syllable = '[^\s]*' . $syllable;
	    if ($syllable =~ m/[^0-5]$/) { $syllable .= '[^\s]*'; }
	}
	$_[0] = join(' ', @syllables);
	# tone 0 means "any tone"; see the comment further down
	$_[0] =~ s/0/[1-5]/g;
	return "pinyin";
    }

    # "-we" at the beginning indicates "wildcard english", with partial
    # matches (very much like "-wp" above).
    if ($words[0] =~ m/^-we/) {
	$_[0] =~ s/^\s*-we\s*//;  # surgically remove "-we" from string
	$_[0] = join(' ', map { ".*" . $_ . ".*" } split(" ", $_[0]));
	return "english";
    }

    # If we got this far, just check the first word and make a guess from
    # that: letters (optionally followed by the ":" of "u:") and then a
    # digit from 0-5, presumably a pinyin tone, mean pinyin.
    if ($words[0] =~ m/[a-zA-Z]+:?[012345]$/) {
	# Pinyin tone 0 is a special case, meaning the person doesn't
	# actually know which tone it is, so replace any zeros by a
	# character class matching any digit 1-5.
	if ($_[0] =~ m/0(\s|$)/) {
	    $forgotTone = 1;
	    $_[0] =~ s/0/[1-5]/g;
	}
	$_[0] = _convertPinyinU($_[0]);
	return "pinyin";
    }

    return "english";   # otherwise, assume it's English
}


#
# Sort comparator: ascending order of the "chineseLength" field.
# $a and $b are each a reference to a scalar which in turn holds a hash
# reference (lookupWord stores references to elements of @wordList), hence
# the double dereference.
sub byChineseLength {
    my $lengthA = ${$a}->{"chineseLength"};
    my $lengthB = ${$b}->{"chineseLength"};
    return $lengthA <=> $lengthB;
}


#
# Look up a word.  This is the main routine which does all the work: it
# prints every matching vocabulary entry on the currently selected output
# handle, or "No match found." if there is none.
#
# Arguments:
#   $_[0]  the text to look for.  It is spliced into a regular expression
#          as-is, so regexp syntax in it is honoured (classifyCPE relies
#          on that for its wildcard and forgotten-tone rewrites).
#   $_[1]  the field to search: "chinese", "pinyin" or "english"
#   $_[2]  match mode: "exact", "longer" or "shorter"
#   $_[3]  anchor mode: "start", "end" or "none" (not used for "exact")
#   $_[4]  FastExact setting: a string holding "c" and/or "p", or ""
#   $_[5]  true to carry on with the pattern search after a FastExact hit
#
# Dies on an unknown field name, and on an unknown anchor mode when the
# match mode is "longer".
#
# Performance note: for "exact" and "longer" the search pattern is compiled
# once with qr// and then applied to every entry.  This used to be done by
# building Perl source text around the user's input and running it through
# a string eval, which let input such as "@{[ ... ]}" execute arbitrary
# code; qr// gives the same compile-once behaviour without that hole.
# A pattern that does not compile simply matches nothing.
#

# (private) Print one entry as "chinese [pinyin] /english/", and echo it
# to the log file when logging is enabled.
sub _printVocabEntry {
    my ($entry) = @_;
    my $text = "$entry->{chinese} [$entry->{pinyin}] $entry->{english}\n";
    print $text;
    if ($doLog) { print LOGFP $text; }
}

sub lookupWord {
    my ($lookFor, $language, $matchMode, $anchorMode,
	$fastExactMatches, $continueAfterFastMatch) = @_;
    my $foundMatch = 0;
    my %fastSeen = ();   # indices already printed by the FastExact step
    my $caseStr = "";

    if ($language eq "english") {
	if ($caseInsensitive) { $caseStr = "(?i)"; }
    }
    elsif (($language ne "chinese") && ($language ne "pinyin")) {
	die "Unknown lookup method `$language'";
    }

    # FastExact step: a plain hash lookup of the whole word.  It is tried
    # when enabled for this field, and always in "exact" match mode (the
    # hash is simply empty if it was not built).
    my ($fastFlag, $fastHash);
    if ($language eq "chinese") {
	($fastFlag, $fastHash) = ("c", \%fastMatchChineseHash);
    }
    elsif ($language eq "pinyin") {
	($fastFlag, $fastHash) = ("p", \%fastMatchPinyinHash);
    }
    if (defined($fastFlag) &&
	(($fastExactMatches =~ m/$fastFlag/) || ($matchMode eq "exact"))) {
	my $fastRef = $fastHash->{$lookFor};
	if (defined($fastRef)) {
	    $foundMatch = 1;
	    if ($matchMode ne "exact") { print "Fast exact matches:\n"; }
	    foreach my $i (@{$fastRef}) {
		_printVocabEntry($wordList[$i]);
		$fastSeen{$i}++;
	    }
	    unless ($continueAfterFastMatch) { return; }
	    if ($matchMode ne "exact") { print "Other matches:\n"; }
	}
    }

    if (($matchMode eq "exact") || ($matchMode eq "longer")) {
	my $matchStr;
	if ($matchMode eq "exact") {
	    # An English field looks like "/def one/def two/"; an exact
	    # match is one complete definition between two slashes.
	    if ($language eq "english") { $matchStr = "$caseStr/$lookFor/"; }
	    else { $matchStr = "^$lookFor\$"; }
	}
	elsif ($anchorMode eq "start") {
	    if ($language eq "english") { $matchStr = "$caseStr/${lookFor}[\\s/]"; }
	    else { $matchStr = "^$lookFor"; }
	}
	elsif ($anchorMode eq "end") {
	    # since Chinese characters are 2 bytes each, we need the "(..)*"
	    # bit of nonsense to avoid matching 2 bytes which straddle 2
	    # separate Chinese characters (it happens more than you might
	    # expect).
	    if ($language eq "chinese") { $matchStr = "^(..)*$lookFor\$"; }
	    elsif ($language eq "pinyin") { $matchStr = "(^|\\s)$lookFor\$"; }
	    else { $matchStr = "[\\s/]$caseStr$lookFor/"; }
	}
	elsif ($anchorMode eq "none") {
	    if ($language eq "chinese") { $matchStr = "^(..)*$lookFor"; }
	    elsif ($language eq "pinyin") { $matchStr = "(^|\\s)$lookFor"; }
	    else { $matchStr = "[\\s/]$caseStr${lookFor}[\\s/]"; }
	}
	else {
	    die "Unknown anchorMode `$anchorMode'\n";
	}
	if ($verbose) {
	    print "matchStr = `$matchStr'\n";
	}

	# Compile once.  The block eval only catches a pattern which is not
	# a valid regexp; no user text is ever run as Perl code.
	my $pattern = eval { qr/$matchStr/ };
	if (defined($pattern)) {
	    foreach my $i (0 .. $numVocabWords - 1) {
		# don't repeat what the FastExact step already printed
		next if $fastSeen{$i};
		if ($wordList[$i]->{$language} =~ $pattern) {
		    _printVocabEntry($wordList[$i]);
		    $foundMatch = 1;
		}
	    }
	}
	elsif ($verbose) {
	    print "could not compile matchStr: $@";
	}
    }
    elsif ($matchMode eq "shorter") {
	if ($language eq "english") {
	    print "`shorter' matchMode can only be used with chinese and pinyin lookups\n";
	    return;
	}

	# Here the roles are swapped: each vocabulary entry is searched for
	# inside the text the user typed.  $headStr/$tailStr anchor the
	# entry; an unknown anchor mode leaves $headStr undefined and so
	# finds nothing.
	my ($headStr, $tailStr);
	my $freeHead = ($language eq "chinese") ? "^(..)*" : "(^|\\s)";
	if ($anchorMode eq "start") { ($headStr, $tailStr) = ("^", ""); }
	elsif ($anchorMode eq "end") { ($headStr, $tailStr) = ($freeHead, "\$"); }
	elsif ($anchorMode eq "none") { ($headStr, $tailStr) = ($freeHead, ""); }

	my @matchingWordList;
	if (defined($headStr)) {
	    foreach my $i (0 .. $numVocabWords - 1) {
		next if $fastSeen{$i};
		# quotemeta: the entry is literal text, not a regexp, so
		# punctuation in a vocabulary file must not be able to
		# change (or break) the pattern.
		my $entryPattern = $headStr
		    . quotemeta($wordList[$i]->{$language}) . $tailStr;
		if ($lookFor =~ m/$entryPattern/) {
		    push @matchingWordList, \$wordList[$i];
		    $foundMatch = 1;
		}
	    }
	}

	# print out the matches sorted by length (we want longer matches
	# displayed last where they will be seen more easily, since they
	# are presumably closer to the whole phrase which we wanted to match).
	# NOTE(review): unlike the other modes, this one prints the pinyin
	# without [brackets]; left as it was in case something parses it.
	my ($lastChinese, $lastPinyin, $lastEnglish) = ("", "", "");
	foreach my $entryRef (sort byChineseLength @matchingWordList) {
	    my $entry = ${$entryRef};
	    # If this entry is identical to the previous one, don't bother
	    # printing it.  Although for this to be maximally useful,
	    # we should do alphabetical sorting first
	    next if (($entry->{chinese} eq $lastChinese) &&
		     ($entry->{pinyin} eq $lastPinyin) &&
		     ($entry->{english} eq $lastEnglish));
	    my $text = "$entry->{chinese} $entry->{pinyin} $entry->{english}\n";
	    print $text;
	    if ($doLog) { print LOGFP $text; }
	    ($lastChinese, $lastPinyin, $lastEnglish) =
		($entry->{chinese}, $entry->{pinyin}, $entry->{english});
	}
    }
    if ($foundMatch == 0) { print "No match found.\n"; }
}


# Ask for the next word.  Under emacs only a "====" separator line is
# printed; the log file (when logging) always gets the full prompt text.
sub prompt {
    print($underEmacs ? "====\n" : "\nEnter word:\n");
    print LOGFP "\nEnter word:\n" if $doLog;
}


##############
# Main program
##############

# Per-lookup copies of the match and anchor modes; the input loop below
# resets them from the global settings for every line of input.
my $thisTimeMatchMode;
my $thisTimeAnchorMode;
setdefaults();

# split apart arguments which have spaces in them; this is because when
# this script is called from emacs, sometimes multiple arguments may
# be packed into one string, separated by spaces (e.g. "-mm e").
# (A side effect is that no argument can itself contain whitespace.)
@ARGV = split(" ", join(" ", @ARGV));

getargs(@ARGV);

print "cedictlookup version 1.1    June 11, 1999\n";
print "Chinese Vocabulary Lookup\n";
print "Copyright (C) 1998,1999   David Hiebeler\n";
print "                          Center for Applied Math\n";
print "                          Cornell University\n";
print "                          Ithaca, NY 14853\n";
print "                          hiebeler\@cam.cornell.edu\n";
print "                          http://www.cam.cornell.edu/hiebeler/home.html\n";
print "cedictlookup comes with ABSOLUTELY NO WARRANTY.\n";
print "Enter `-license' for more information.\n";

use FileHandle;
if ($doLog) {
    # Three-argument open: the name given to -log is always treated as a
    # plain file to (over)write, whatever characters it contains.
    open(LOGFP, '>', $logFname)
	or die "Couldn't open logfile `$logFname': $!\n";
    select(LOGFP); $| = 1;  # make unbuffered
    LOGFP->autoflush(1);
    print LOGFP "verbose = $verbose, underEmacs = $underEmacs, caseInsensitive = $caseInsensitive\n";
    print LOGFP "vocabDir = `$vocabDir'\n";
    print LOGFP "vocabFiles = `$vocabFiles'\n";
    print LOGFP "matchMode = `$matchMode', anchorMode = `$anchorMode'\n";
}
# Make STDOUT the default output handle again, and unbuffered too.
select(STDOUT); $| = 1;
STDOUT->autoflush(1);

print "cedictlookup: Reading Chinese vocabulary files...\n";
# Append "/" to vocabDir if it doesn't already have one
if (! ($vocabDir =~ m@/$@)) { $vocabDir .= "/"; }

if ($verbose) { printsettings(); }

# Load every file of the colon-separated list; readvocabfile() appends to
# @wordList and advances $vocabIndex, which ends up as the total count.
$vocabIndex = 0;
foreach $vFile (split(/:/, $vocabFiles)) {
    $tmpStr = $vocabDir . $vFile;
    print "reading `$tmpStr'...";
    if ($doLog) { print LOGFP "reading `$tmpStr'... "; }
    readvocabfile($tmpStr);
}
$numVocabWords = $vocabIndex;
print "cedictlookup: Done reading vocabulary files.\n";
print "cedictlookup: Got $numVocabWords entries.\n";

# Show the current match/anchor modes and FastExact setting.  Shared by
# the "-q" query and by the flags-only line which makes new modes permanent.
my $showModes = sub {
    print "matchMode = $matchMode, anchorMode = $anchorMode\n";
    print "fastExactMatches = ";
    if ($fastExactMatches eq "") { print "(none)\n"; }
    else {
	if ($fastExactMatches =~ m/c/) { print "chinese "; }
	if ($fastExactMatches =~ m/p/) { print "pinyin "; }
	if ($continueAfterFastMatch) { print "+\n"; }
	else { print "\n"; }
    }
};

# Interactive loop: read one request per line from standard input until
# end of input or a line consisting of "q".
prompt();
INPUTLOOP: while (defined($line = <STDIN>)) {
    # chomp, not chop: the final line of piped input may lack a newline,
    # and chop would then eat its last real character.
    chomp $line;
    $line =~ s/^\s+//;   # remove any leading spaces
    $line =~ s/\s+$//;   #   and trailing spaces
    if ($line eq "q") { exit 0; }

    # Every lookup starts from the global ("permanent") settings; flags on
    # the input line override them for this one lookup only.
    $thisTimeMatchMode = $matchMode;
    $thisTimeAnchorMode = $anchorMode;
    $thisTimeFastExactMatches = $fastExactMatches;
    $thisTimeContinueAfterFastMatch = $continueAfterFastMatch;
    if ($line eq "-license") {
	printLicense();
	prompt();
	next INPUTLOOP;
    }
    if ($line eq "-q") {
	$showModes->();
	prompt();
	next INPUTLOOP;
    }

    # Peel "-mm X" / "-am X" flags off the front of the line (they may
    # follow other "-word" flags such as "-wp").
    while ($line =~ m/^(?=-)(?:-\w+\s+)*(-(mm|am) (\S+)\s*)/) {
	my ($flagName, $flagValue) = ($2, $3);
	# Cut out exactly the text which matched.  This guarantees that
	# the loop makes progress even when the value is not a word
	# character (a line such as "-mm ?" used to loop forever).
	substr($line, $-[1], $+[1] - $-[1]) = "";

	if ($flagName eq "mm") {
	    if ($flagValue eq "e") { $thisTimeMatchMode = "exact"; }
	    elsif ($flagValue eq "s") { $thisTimeMatchMode = "shorter"; }
	    elsif ($flagValue eq "l") { $thisTimeMatchMode = "longer"; }
	    else { print "Illegal option for `mm' argument\n"; }
	}
	elsif ($flagName eq "am") {
	    if ($flagValue eq "s") { $thisTimeAnchorMode = "start"; }
	    elsif ($flagValue eq "e") { $thisTimeAnchorMode = "end"; }
	    elsif ($flagValue eq "n") { $thisTimeAnchorMode = "none"; }
	    else { print "Illegal option for `am' argument\n"; }
	}
	elsif (($flagName eq "fe") || ($flagName eq "fastexact")) {
	    # This code will never actually get executed, because
	    # the "fastexact" and "fe" flags aren't in the pattern match
	    # above.  Later I may allow the user to change the FastExact
	    # mode in the middle of things, by building the hash table
	    # if necessary.  But not yet.
	    my $feSpec = $flagValue;
	    $thisTimeFastExactMatches = "";
	    unless ($feSpec =~ s/0//) {
		if ($feSpec =~ s/c//) { $thisTimeFastExactMatches .= "c"; }
		if ($feSpec =~ s/p//) { $thisTimeFastExactMatches .= "p"; }
		if ($feSpec =~ s/\+$//) { $thisTimeContinueAfterFastMatch = 1; }
		if ($feSpec =~ s/\-$//) { $thisTimeContinueAfterFastMatch = 0; }
	    }
	    if ($feSpec ne "") { print "Illegal option for `$flagName' argument\n"; }
	}
	else {
	    die "That's funny, I shouldn't be able to die this way.";
	}
    }
    if ($verbose) {
	print "line = `$line'\n";
	print "mm = $thisTimeMatchMode, am = $thisTimeAnchorMode, fastExactMatches = $thisTimeFastExactMatches, continueAfterFastMatch = $thisTimeContinueAfterFastMatch\n";
    }
    if ($line eq "") {
	# special case -- if nothing is left of the line, copy the local
	# values of matchMode and anchorMode to the global ones.  This
	# allows you to enter e.g. the line "-mm l -am n" to "permanently"
	# set the matchMode to "longer", and the anchorMode to "none",
	# at least until the user changes them again.  We also display
	# the new settings for matchMode and anchorMode.
	$matchMode = $thisTimeMatchMode;
	$anchorMode = $thisTimeAnchorMode;
	$fastExactMatches = $thisTimeFastExactMatches;
	$continueAfterFastMatch = $thisTimeContinueAfterFastMatch;
	$showModes->();
	prompt();
	next INPUTLOOP;
    }

    # classifyCPE() may rewrite $line in place (it strips "-wp"/"-we" and
    # turns the words into patterns) and sets $forgotTone.
    $forgotTone = 0;
    $typeOfInput = classifyCPE($line);
    if ($line =~ m/^-/) {
	warn "Hmm, looks like you tried to specify a flag on the input,\n";
	warn "but I didn't recognize it.  Ignoring input; try again!\n";
	prompt();
	next INPUTLOOP;
    }
    if ($verbose) {
	print "typeOfInput = `$typeOfInput'\n";
	print "line = `$line'\n";
    }
    if ($doLog) {
	print LOGFP "typeOfInput = `$typeOfInput'\n";
	print LOGFP "line = `$line'\n";
    }
    if ($forgotTone && ($thisTimeMatchMode eq "shorter")) {
	print "0-tone (forgotten-tone) pinyin lookups currently do not\n";
	print "work with Match Mode = Shorter, sorry.\n";
	prompt();
	next INPUTLOOP;
    }

    # timing code is commented out for now...
    # $t0 = new Benchmark;

    if ($typeOfInput eq "pinyin") {
	# Convert "uu" into "u:" or vice-versa in pinyin field,
	# if the user requested it.  These must be comparisons ("=="):
	# an assignment here would force the conversion on every lookup
	# and overwrite the user's $uConvert setting.
	if ($uConvert == $uu2uc) {
	    $line =~ s/uu/u:/g;
	}
	elsif ($uConvert == $uc2uu) {
	    $line =~ s/u:/uu/g;
	}
    }
    if ($underEmacs) { print "Searching...\n"; }
    lookupWord($line, $typeOfInput, $thisTimeMatchMode, $thisTimeAnchorMode,
	       $thisTimeFastExactMatches, $thisTimeContinueAfterFastMatch);


    # $t1 = new Benchmark;
    # $td = timediff($t1, $t0);
    # print "word lookup took: ", timestr($td), "\n";
    prompt();
}
