/*************************************************************************/
/*                                                                       */
/*                Centre for Speech Technology Research                  */
/*                     University of Edinburgh, UK                       */
/*                         Copyright (c) 1996                            */
/*                        All Rights Reserved.                           */
/*                                                                       */
/*  Permission to use, copy, and modify this software and its            */
/*  documentation for research, educational and individual use only, is  */
/*  hereby granted without fee, subject to the following conditions:     */
/*   1. The code must retain the above copyright notice, this list of    */
/*      conditions and the following disclaimer.                         */
/*   2. Any modifications must be clearly marked as such.                */
/*   3. Original authors' names are not deleted.                         */
/*  This software may not be used for commercial purposes without        */
/*  specific prior written permission from the authors.                  */
/*                                                                       */
/*  THE UNIVERSITY OF EDINBURGH AND THE CONTRIBUTORS TO THIS WORK        */
/*  DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING      */
/*  ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT   */
/*  SHALL THE UNIVERSITY OF EDINBURGH NOR THE CONTRIBUTORS BE LIABLE     */
/*  FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES    */
/*  WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN   */
/*  AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,          */
/*  ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF       */
/*  THIS SOFTWARE.                                                       */
/*                                                                       */
/*************************************************************************/
/*             Author :  Alan W Black                                    */
/*             Date   :  August 1996                                     */
/*-----------------------------------------------------------------------*/
/*  This is a program to convert the standard cuvoald710.doc file into   */
/*  something that festival can use.  This is one of those "hacky little */
/*  scripts" I hate so much.                                             */
/*                                                                       */
/*  text710.dat is available from                                        */
/*     ftp://ota.ox.ac.uk/pub/ota/public/dicts/710/text710.dat           */
/* for non-commercial use only                                           */
/*                                                                       */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Boolean constants, defined only when nothing earlier has provided them */
#ifndef TRUE
#define TRUE (1==1)
#define FALSE (0==1)
#endif
/* streq(X,Y): non-zero when the two C strings X and Y compare equal */
#ifndef streq
#define streq(X,Y) (strcmp(X,Y) == 0)
#endif
/* Checked allocation helpers (defined at the end of this file).          */
/* safe_walloc exits the program when memory cannot be obtained;          */
/* safe_wcalloc additionally zero-fills the block.                        */
void *safe_walloc(int size);
void *safe_wcalloc(int size);
/* walloc(TYPE,SIZE) / wcalloc(TYPE,SIZE): allocate room for SIZE objects */
/* of TYPE and return a TYPE pointer.  SIZE is parenthesised so that an   */
/* expression argument such as n+2 is multiplied as a whole rather than   */
/* expanding to sizeof(TYPE)*n+2.                                         */
#define walloc(TYPE,SIZE) ((TYPE *)safe_walloc(sizeof(TYPE)*(SIZE)))
#define wcalloc(TYPE,SIZE) ((TYPE *)safe_wcalloc(sizeof(TYPE)*(SIZE)))
/* wstrdup returns a walloc'ed copy of s; wfree releases such memory */
char *wstrdup(const char *s);
void wfree(void *p);

/* Every dictionary record is read as exactly ENTRYSIZE bytes; main()     */
/* requires the last byte (index ENTRYSIZE-1) to be a newline.            */
#define ENTRYSIZE 129
/* Buffer holding the record currently being converted (filled by main()) */
char entry[ENTRYSIZE];

/* Context-sensitive rewrite rules, tried by next_phone() before the      */
/* plain rules table.  A rule fires when the remaining input starts with  */
/* lhs and the character immediately after lhs is one of the characters   */
/* in rcontext; only lhs is consumed and rhs is returned.  '#' is the     */
/* end marker that map_pron() appends to every pronunciation.  The table  */
/* is terminated by an entry whose lhs is 0.                              */
struct {
    char *lhs;       /* input prefix to match */
    char *rcontext;  /* set of characters allowed directly after lhs */
    char *rhs;       /* string returned for a match */
} prules[] =
{
{"vl","#","v @ l"},                       /* syllabic l */
{"vl","bcdfghjklmnpqrstvwxyz","v @ l"},
{0,0,0}
};

/* Context-free rewrite rules from the dictionary's phone symbols (lhs)   */
/* to the output phone names (rhs).  next_phone() scans this table from   */
/* the top and takes the FIRST entry whose lhs is a prefix of the         */
/* remaining input, so order matters: a longer lhs must come before any   */
/* shorter lhs that is a prefix of it (e.g. "Sn@s" before "Sn" before     */
/* "S", "tS" before "t").  The table is terminated by an entry whose lhs  */
/* is 0.                                                                  */
struct {
    char *lhs;   /* input prefix to match */
    char *rhs;   /* string returned for a match */
} rules[] =
{
{"@U","ou"},
{"eI","ei"},
{"aI","ai"},
{"aU","au"},
{"oI","oi"},
{"oI","oi"},  /* NOTE(review): duplicate of the entry above; never reached */
{"I@","i@"},
{"e@","e@"},
{"U@","u@"},
{"tS","ch"},
{"dZ","jh"},
{"Sn@s","sh n @ s"},  /* bookishness */
{"Sn","sh @ n"},      /* introduction */
{"p","p"},
{"b","b"},
{"k","k"},
{"g","g"},
{"t","t"},
{"d","d"},
{"m","m"},
{"n","n"},
{"f","f"},
{"v","v"},
{"s","s"},
{"z","z"},
{"r","r"},
{"l","l"},
{"w","w"},
{"h","h"},
{"j","y"},
{"i","ii"},
{"I","i"},
{"e","e"},
{"&","a"},
{"A","aa"},
{"0","o"},
{"O","oo"},
{"U","u"},
{"u","uu"},
{"V","uh"},
{"3","@@"},
{"@","@"},
{"N","ng"},
{"T","th"},
{"D","dh"},
{"S","sh"},
{"Z","zh"},
{"R","r"},  /* link R -- gets deleted by postlex rules when not before vowel */
{"'","'"},  /* map_pron() turns this into stress level 1 on the next vowel */
{",",","},  /* map_pron() turns this into stress level 2 on the next vowel */
{"-","-"},
{"+","-"},
{" ","-"},
{"#"," "},  /* map_pron() stops at '#', so it never requests this rule */
{0,0}};
     
/* Forward declarations of the file-local helpers defined further down */
static char *pos2name(char *pos);
static char *get_field(char *entry,int start,int end);
static char **map_pron(char *pron,char *pos);
static void process_entry(char *entry);
static char *next_phone(char **p);
static int ph_vowel(char *ph);

/* Convert every record of "text710.dat" (opened in the current           */
/* directory) and print the resulting lexicon entries on stdout.          */
/*                                                                        */
/* argc and argv are not used.  Returns 0 on success.  Exits with status  */
/* -1 when the file cannot be opened, when a record does not end in a     */
/* newline, or when a read error occurs.  Trailing data shorter than one  */
/* full record is ignored.                                                */
int main(int argc, char **argv)
{
    FILE *fd;
    size_t n;
    int entryno;

    (void)argc;
    (void)argv;

    fd=fopen("text710.dat","r");
    if (fd==NULL)
    {
        fprintf(stderr,"ERROR: can't open the data file\n");
        exit(-1);
    }

    entryno=0;
    /* n receives the byte count itself; the comparison sits outside the */
    /* assignment so n is not just the 0/1 result of the test            */
    while ((n=fread(entry,sizeof(char),ENTRYSIZE,fd)) == ENTRYSIZE)
    {
        if (entry[ENTRYSIZE-1] != '\n')
        {
            fprintf(stderr,"ERROR: mangled entry %d\n",entryno);
            exit(-1);
        }
        process_entry(entry);
        entryno++;
    }

    /* a short read is either end of file or a genuine I/O failure */
    if (ferror(fd))
    {
        fprintf(stderr,"ERROR: read failure after entry %d\n",entryno);
        fclose(fd);
        exit(-1);
    }

    fclose(fd);
    return 0;
}

/* Print each comma-separated item of list on stdout, wrapped in double   */
/* quotes and followed by one space.  list is split in place: every ','   */
/* is overwritten with '\0'.                                              */
static void print_quoted_items(char *list)
{
    char *item = list;
    char *c;

    for (c=list; *c != '\0'; c++)
    {
        if (*c == ',')
        {
            *c = '\0';
            printf("\"%s\" ",item);
            item = c+1;
        }
    }
    printf("\"%s\" ",item);      /* the item after the last comma */
}

/* Convert one fixed-width dictionary record and print it on stdout as    */
/*   ( "headword"  POS ( phones ) ((pos "tag" ... ) (subcat "pat" ... ))) */
/* where the (subcat ...) part is omitted when that field is blank.       */
/*                                                                        */
/* entry: the record; columns 0-22 hold the headword, 23-45 the           */
/*        pronunciation, 46-68 the part-of-speech tags and 70-128 the     */
/*        subcategorisation list.  Column 69 is not used.                 */
static void process_entry(char *entry)
{
    char *headword, *pron, *pos, *subcat;
    char **mappron;
    int i;

    headword = get_field(entry,0,22);
    pron = get_field(entry,23,45);
    pos = get_field(entry,46,68);
    subcat = get_field(entry,70,128);

    mappron = map_pron(pron,pos);

    /* headword, with embedded double quotes backslash-escaped */
    printf("( \"");
    for (i=0; headword[i] != '\0'; i++)
    {
        if (headword[i] == '"')
            putc('\\',stdout);
        putc(headword[i],stdout);
    }
    printf("\" ");
    /* Simple part of speech (taken before pos is split in place below) */
    printf(" %s ",pos2name(pos));

    /* phones; each string is released as soon as it has been printed */
    printf("( ");
    for (i=0; mappron[i] != 0; i++)
    {
        printf("%s ",mappron[i]);
        wfree(mappron[i]);
    }
    printf(") ((pos ");
    print_quoted_items(pos);
    printf(") ");
    /* get_field returns a single blank for an empty field */
    if (streq(" ",subcat))
        printf("))\n");
    else
    {
        printf("(subcat ");
        /* includes the final item, which used to be dropped */
        print_quoted_items(subcat);
        printf(")))\n");
    }
    wfree(mappron);
    wfree(headword);
    wfree(pron);
    wfree(pos);                  /* was leaked for every record */
    wfree(subcat);
}

/* Map the first character of a dictionary part-of-speech tag to the      */
/* simple part-of-speech name printed in the lexicon entry.               */
/*                                                                        */
/* pos: tag string; only pos[0] is examined.                              */
/* Returns a string literal (not to be freed).  Any character without an  */
/* explicit mapping, including the terminator of an empty string, gives   */
/* "nil" (a strchr-based test would wrongly match the terminator).        */
static char *pos2name(char *pos)
{
    switch (pos[0])
    {
    case 'G': case 'H': case 'I': case 'J':
        return "v";
    case 'K': case 'L': case 'M': case 'N': case 'Y':
        return "n";
    case 'O':
    case 'U':                   /* prefix */
        return "j";
    case 'P':
        return "a";
    case 'Q':
        return "prp";
    case 'R': case 'S':
        return "dt";
    case 'T':
        return "in";
    case 'V':
        return "cc";
    case 'W':                   /* ex */
        return "uh";
    default:                    /* 'X', 'Z' and everything else */
        return "nil";
    }
}

/* Copy the field occupying columns start..end (inclusive) of entry into  */
/* a newly allocated string, dropping trailing blanks and newlines.  The  */
/* character at start is always kept, so a completely blank field comes   */
/* back as a one-character string.  The caller releases the result with   */
/* wfree().                                                               */
static char *get_field(char *entry,int start,int end)
{
    int last = end;      /* index of the last character to keep */
    int len, k;
    char *field;

    /* step back over trailing white space, but never past start */
    while ((last > start) && (strchr(" \n",entry[last]) != NULL))
        last--;

    len = last-start+1;
    field = walloc(char,len+1);
    field[len] = '\0';
    for (k=0; k < len; k++)
        field[k] = entry[start+k];

    return field;
}

/* Replacement for a vowel that map_pron() found directly before the end  */
/* marker or a link R: "uh", "e" and "u" become "@", "a" becomes "aa" and */
/* "i" becomes "ii".  Any other phone is returned unchanged (the same     */
/* pointer that was passed in).                                           */
static char *map_vowel(char *ph)
{
    static struct {
        char *from;
        char *to;
    } remap[] = {
        {"uh","@"},
        {"e","@"},
        {"a","aa"},
        {"i","ii"},
        {"u","@"}
    };
    size_t k;

    for (k=0; k < sizeof(remap)/sizeof(remap[0]); k++)
    {
        if (streq(remap[k].from,ph))
            return remap[k].to;
    }
    return ph;
}

/* Guess whether map_pron() should put a primary stress mark in front of  */
/* the pronunciation.                                                     */
/*                                                                        */
/* Returns FALSE only when pron holds no ' stress mark at all and pos     */
/* contains at least one of the tag letters below; TRUE in every other    */
/* case.                                                                  */
/*   G small verb          Q pronoun        R definite article            */
/*   S indefinite article  T preposition    U prefix                      */
/*   V conjunction         X particle                                     */
static int should_add_stress(char *pron,char *pos)
{
    /* a stress mark is already present somewhere in pron */
    if (strchr(pron,'\'') != NULL)
        return TRUE;

    /* unmarked word tagged with one of the classes listed above */
    if (strpbrk(pos,"GQRSTUVX") != NULL)
        return FALSE;

    return TRUE;
}

/* Map a pronunciation from the dictionary's character string to a list   */
/* of separate phone strings.                                             */
/*                                                                        */
/* pron: pronunciation field of the record.                               */
/* pos:  part-of-speech field, consulted through should_add_stress().     */
/*                                                                        */
/* A ' or , in the input is not output as a phone; it sets stress level   */
/* 1 or 2, which is appended as a digit to the next vowel.  A vowel that  */
/* is directly followed by the end of the pronunciation or by a link R    */
/* is first passed through map_vowel().                                   */
/*                                                                        */
/* Returns a walloc'ed array of walloc'ed strings terminated by a 0       */
/* entry; the caller frees every string and then the array.               */
static char **map_pron(char *pron, char *pos)
{
    char **phones;
    char *p, *ph;
    char phone[2048];
    int stress,i;
    char *ppron;

    /* At most one phone per input character plus the terminating 0.     */
    /* The count is wrapped in its own parentheses because walloc pastes */
    /* its SIZE argument straight after "sizeof(TYPE)*"; without them    */
    /* only strlen(pron) would be scaled and the array would be too      */
    /* small.                                                            */
    phones = walloc(char *,(strlen(pron)+2));
    stress = 0;
    /* room for the optional leading ', the '#' end marker and the NUL */
    ppron = walloc(char,(strlen(pron)+4));
    if (should_add_stress(pron,pos))
        sprintf(ppron,"'%s#",pron);   /* add stress on first (only) syl */
    else
        sprintf(ppron,"%s#",pron);

    i = 0;
    p = ppron;
    while (*p != '#')
    {
        ph=next_phone(&p);           /* advances p past the matched input */
        if (streq(ph,"'"))
            stress = 1;
        else if (streq(ph,","))
            stress = 2;
        else
        {
            if (ph_vowel(ph))
            {
                if ((*p == '#') || /* for ending vowel */
                    (*p == 'R'))
                    ph = map_vowel(ph);
                if (stress != 0)
                {
                    sprintf(phone,"%s%d",ph,stress);
                    ph = phone;
                }
                /* a pending stress mark is used up by the first vowel */
                stress = 0;
            }
            phones[i] = wstrdup(ph);
            i++;
        }
    }

    phones[i] = 0;
    wfree(ppron);
    return phones;
}

/* Return the output phone for the longest-listed rule matching the start */
/* of *p, and advance *p past the input that the rule consumed.           */
/*                                                                        */
/* The context rules in prules are tried first, then the plain rules; in  */
/* both tables the first matching entry wins.  The returned string        */
/* belongs to the rule table and must not be freed.  If nothing matches,  */
/* an error is reported on stderr and the program exits with status -1.   */
static char *next_phone(char **p)
{
    int r;
    size_t len;

    /* rules that also look at the character following the match */
    for (r=0; prules[r].lhs != NULL; r++)
    {
        len = strlen(prules[r].lhs);
        if (strncmp(*p,prules[r].lhs,len) != 0)
            continue;
        if (strchr(prules[r].rcontext,(*p)[len]) == NULL)
            continue;
        *p += len;
        return prules[r].rhs;
    }

    /* plain prefix rules */
    for (r=0; rules[r].lhs != NULL; r++)
    {
        len = strlen(rules[r].lhs);
        if (strncmp(*p,rules[r].lhs,len) != 0)
            continue;
        *p += len;
        return rules[r].rhs;
    }

    fprintf(stderr,"ERROR: Failed to find a phone match %s\n",*p);
    exit (-1);
}

/* Return non-zero (1) when the phone name ph starts with one of the      */
/* characters a, e, i, o, u or @, and 0 otherwise.  An empty string is    */
/* not a vowel; it is tested explicitly because strchr() would otherwise  */
/* match the terminator of the character set.                             */
static int ph_vowel(char *ph)
{
    return (ph[0] != '\0') && (strchr("aeiou@",ph[0]) != NULL);
}

/* Allocate size bytes with malloc and never return NULL: when the        */
/* memory cannot be obtained, a message is written to stderr and the      */
/* program exits with status -1.                                          */
/*                                                                        */
/* size: number of bytes wanted.  A request for 0 bytes is served with a  */
/*       1-byte block, because malloc(0) is allowed to return NULL and    */
/*       that must not be reported as running out of memory.  A negative  */
/*       size is treated as a failed allocation.                          */
/* Returns the new block; release it with wfree().                        */
void *safe_walloc(int size)
{
    void *p;

    if (size < 0)
        p = NULL;
    else
        p = malloc(size > 0 ? (size_t)size : 1);

    if (p == NULL)
    {
        fprintf(stderr,"WALLOC: failed to malloc %d bytes\n",size);
        exit(-1);  /* I'd rather not do this but this is the only safe */
                   /* thing to do */
    }

    return p;
}

/* Like safe_walloc(), but the size bytes returned are all set to zero. */
void *safe_wcalloc(int size)
{
    void *block = safe_walloc(size);

    /* memset hands back its first argument, i.e. the cleared block */
    return memset(block,0,size);
}

/* Return a walloc'ed copy of the string s, including its terminator. */
/* The caller releases the copy with wfree().                         */
char *wstrdup(const char *s)
{
    size_t bytes = strlen(s)+1;      /* characters plus the NUL */
    char *copy = walloc(char,bytes);

    memcpy(copy,s,bytes);
    return copy;
}

/* Release memory obtained from safe_walloc(), safe_wcalloc() or          */
/* wstrdup().  Passing NULL is allowed and does nothing; free() already   */
/* guarantees that, so no separate check is needed.                       */
void wfree(void *p)
{
    free(p);
}

