/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                      Copyright (c) 1995,1996                          */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, modify, distribute this software and its    */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*                 Authors:  Simon King                                  */
/*                 Date   :  April 1997                                  */
/*-----------------------------------------------------------------------*/
/*                 EST_Ngrammar interpolating program                    */
/*                                                                       */
/*=======================================================================*/
#include "EST.h"
#include "EST_Ngrammar.h"
#include "EST_Pathname.h"

int main(int argc, char **argv)
{
    int order;
    EST_StrList files;
    EST_Option al, op;
    EST_String wordlist_file, out_file, format;
    EST_String prev_tag(""), prev_prev_tag(""), last_tag(""), input_format("");
    EST_Ngrammar::representation_t representation = 
	EST_Ngrammar::dense;

    EST_StrList wordlist;
    EST_Ngrammar ngrammar;
    bool trace=false;
    double floor=0.0;
    EST_TBI *p,*p2;
    EST_StrList weights;

    parse_command_line2(argc, argv, 
	 EST_String("Usage:\n")+
         "build_ngram -w <filename> <options> <input files>\n"+
	 " (use \"-\" to make input/output files stdin/out)\n"+
	 "-w <string>      Filename containing target word list\n"+
	 "-o <string>      output file\n"+
	 "-order <int>     order, 1=unigram, 2=bigram etc. (default 2)\n"+
	 "-otype <string>  format of output file, one of cstr_ascii\n"+
         "                 cstr_bin or htk_ascii\n"+
         "-floor <double>\n"+
	 "                 Frequency floor value used with some ngrams\n"+
         "-weights <string>\n"+
         "                 grammar weights, e.g. 3,4,1 (no spaces !)\n"+		       
         "-trace           Give verbose outout about build process\n"+
         "-save_compressed Save ngram in gzipped format\n"+
         "\n"+
	 "Pseudo-words :\n"+
	 "-prev_tag <string>\n"+
         "                 tag before sentence start\n"+
	 "-prev_prev_tag <string>\n"+
         "                 all words before 'prev_tag'\n"+
	 "-last_tag <string>\n"+
         "                 after sentence end\n"+
	 "-default_tags    use default tags of "+SENTENCE_START_MARKER+
			SENTENCE_END_MARKER+" and "+SENTENCE_END_MARKER+"\n"+
         "                 respectively\n",
			files, al);

    if (al.present("-w"))
	wordlist_file = al.val("-w");
    else{
	cerr << "ch_ngram: Must specify a wordlist with -w" << endl;
	exit(1);
    }

    if (load_StrList(wordlist_file,wordlist) != format_ok)
    {
	cerr << "ch_ngram: Could not read wordlist from file " 
	    << wordlist_file << endl;
	exit(1);
    }

    if (al.present("-trace"))
	trace=true;

    if (al.present("-o"))
	out_file = al.val("-o");
    else
	out_file = "-";
    
    if (al.present("-default_tags"))
    {	
	prev_tag = SENTENCE_START_MARKER;
	prev_prev_tag = SENTENCE_END_MARKER;
	last_tag = SENTENCE_END_MARKER;

	wordlist.append(SENTENCE_START_MARKER);
	wordlist.append(SENTENCE_END_MARKER);
    }
    
    if (al.present("-prev_tag"))
    {
	if (al.present("-default_tags"))
	    cerr << "ch_ngram: WARNING : -prev_tag overrides -default_tags"
		<< endl;
	prev_tag = al.val("-prev_tag");
    }

    if (al.present("-prev_prev_tag"))
    {
	if (al.present("-default_tags"))
	    cerr << "ch_ngram: WARNING : -prev_prev_tag overrides -default_tags" 
		<< endl;
	prev_prev_tag = al.val("-prev_prev_tag");
    }	
    
    if (al.present("-last_tag"))
    {
	if (al.present("-default_tags"))
	    cerr << "ch_ngram: WARNING : -last_tag overrides -default_tags"
		<< endl;
	last_tag = al.val("-last_tag");
    }

    if (   ( (prev_tag=="") ||  (prev_prev_tag=="") || (last_tag=="") )
	&& ( (prev_tag!="") ||  (prev_prev_tag!="") || (last_tag!="") )   )
    {
	cerr << "ch_ngram: ERROR : if any tags are given, ALL must be given"
	    << endl;
	exit(1);
    }

    if (al.present("-order"))
	order = al.ival("-order");
    else
    {
	cerr << "build_ngram: WARNING : No order specified with -order : defaulting to bigram" 
	    << endl;
	order = 2;
    }



    if (al.present("-otype"))
	format = al.val("-otype");
    else
	format = "";
 
  
    if (al.present("-floor"))
	floor = al.dval("-floor");
    else
	floor = 0.0;

    if (!ngrammar.init(order,representation,wordlist))
    {
	cerr << "build_ngram: Failed to initialise " << order << "-gram" << endl;
	exit(1);
    }

    if(al.present("-weights"))
    {
	StringtoStrList(al.val("-weights"),weights,",");
    }
    else
    {
	cerr << "Must give weights using -weights" << endl;
	exit(1);
    }

    if(weights.length() != files.length())
    {
	cerr << "Number of weights (" << weights.length();
	cerr << ") does not match number of files (" << files.length();
	cerr << ")" << endl;
	exit(1);
    }

    // load source grammars, one at a time to save memory
    for (p = files.head(),p2=weights.head(); (p && p2); p = next(p),p2=next(p2))
    {
	EST_Ngrammar other_grammar;
	
	if(trace)
	{
	    cerr << "Merging grammar from " << files(p);
	    cerr << " with weighting " << atof(weights(p2)) << endl;
	}

	if (other_grammar.load(files(p)) != format_ok)
	{

	    cerr << "ch_ngram: Failed to load source grammar from ";
	    cerr << files(p);
	    cerr << " with weighting " << atof(weights(p2)) << endl;

	    // should we exit or soldier on ?
	    //exit(1);
	}

	else if(other_grammar.order() != order)
	{
	    cerr << "ch_ngram: source grammar from ";
	    cerr << files(p) << " is not of order " << order << endl;	    
	}
	
	else
	{
	    ngrammar.merge(other_grammar,atof(weights(p2)));
	}
    }

    // we don't need to normalise
    // because we work in frequencies (even if
    // they are really probabilities !)


    // save
    if (al.present("-save_compressed"))
    {
	EST_String tmp_file = make_tmp_filename();
	if (ngrammar.save(tmp_file,format,trace,floor) == write_ok)
	{
	    EST_String prog_name;
	    EST_Pathname tmp(out_file);
	    if (tmp.extension() == GZIP_FILENAME_EXTENSION)
		prog_name = "gzip --stdout";
	    else if (tmp.extension() == COMPRESS_FILENAME_EXTENSION)
		prog_name = "compress -c";
	    else // default
	    {
		prog_name = "gzip --stdout";
		if(out_file != "-")
		    out_file = out_file + "." + GZIP_FILENAME_EXTENSION;
	    }

	    if (trace)
		cerr << "ch_ngram: Compressing with '" << prog_name << "'" << endl;

	    // now compress
	    if(compress_file(tmp_file,out_file,prog_name) != 0)
	    {
		cerr << "ch_ngram: Failed to compress to file "
		    << out_file << endl;
		(void)delete_file(tmp_file);
		exit(1);
	    }

	    (void)delete_file(tmp_file);

	    if(trace)
		cerr << "ch_ngram: Saved in compressed " << format
		    << " format to " << out_file << endl;
	}
	else
	{
	    cerr << "ch_ngram: Failed to write temporary file " 
		<< tmp_file << endl;
	    exit(1);
	}

	
    }
    else
    {
	if (ngrammar.save(out_file,format,trace,floor) == write_ok)
	{
	    if(trace)
		cerr << "ch_ngram: Saved in " << format
		    << " format to " << out_file << endl;
	}
	else
	{
	    cerr << "ch_ngram: Failed to save " << format << " format data to " 
		<< out_file << endl;
	    exit(1);
	}
    }

    
    // everything went okay
    return 0;
}


