############################################################################
#                                                                          #
#             copyright (c) 2003 ITB, Humboldt-University Berlin           #
#             written by: Raphael Ritz, r.ritz@biologie.hu-berlin.de       #
#                                                                          #
############################################################################

"""BibtexParser class"""

# Python stuff
import re

# Zope stuff
from Globals import InitializeClass
from App.Dialogs import MessageDialog

# Bibliography stuff
from Products.CMFBibliographyAT.tool.parsers.base \
     import IBibliographyParser, BibliographyParser

# our custom entity converter
# (a single shared instance; the accent callbacks of BibtexParser call it
# with entity strings such as '&auml;')
from entities import Convert
convert = Convert()

# codec applied to the converter's output by the accent callbacks
_encoding = 'utf-8'   # XXX: should be taken from the site configuration

class BibtexParser(BibliographyParser):
    """
    specific parser to process input in BiBTeX-format
    """

    __implements__ = (IBibliographyParser ,)

    meta_type = "Bibtex Parser"

    format = {'name':'BibTeX',
              'extension':'bib'}

    def __init__(self,
                 id = 'bibtex',
                 title = "BibTeX parser",
                 delimiter = '}\s*@',
                 pattern = '(,\s*\w{2,}\s*=)'):
        """
        initializes including the regular expression patterns
        """
        self.id = id
        self.title = title
        self.setDelimiter(delimiter)
        self.setPattern(pattern)


    # Here we need to provide 'checkFormat' and 'parseEntry'

    def checkFormat(self, source):
        """
        is this my format?
        """
        # vanilla test for 'author' or 'editor'
        # in the sub-string 'source[10, 100]'
        ## rr: can definitively be improved

        teststring = source[10:100].lower()
        ai = teststring.find('author')
        ei = teststring.find('editor')
        if ai + ei > -2:
            return 1
        else:
            return 0

    def preprocess(self, source):
        """
        expands LaTeX macros
        removes LaTeX commands and special formating
        converts special characters to their HTML equivalents
        """
        source = self.expandMacros(source)
        source = self.stripCommands(source)
        source = self.stripComments(source)
        return self.convertChars(source)

    def expandMacros(self, source):
        source = self.expandStringMacros(source)
        # add more macro conventions here if there are any
        return source

    def expandStringMacros(self, source):
        lines = source.split('\n')
        macros = []
        sourcelns = []
        for line in lines:
            if line.find('@String') > -1:
                macros.append(line)
            else:
                sourcelns.append(line)
        source = '\n'.join(sourcelns)
        for macro in macros:
            split_on = re.compile('[{=}]+')
            raw_matches = split_on.split(macro)
            matches = [m for m in raw_matches if m not in ['', ' ', '\r']]
            # raise str(matches)
            short = matches[1].strip()
            long = matches[-1].strip()
            pattern = "\\b" + short + "\\b"
            old = re.compile(pattern)
            source = old.sub(long, source)
        return source
    
    def stripCommands(self, source):
        oldstyle_cmd = re.compile(r'{\\[a-zA-Z]{2,}')
        newstyle_cmd = re.compile(r'\\[a-zA-Z]+{')
        source = oldstyle_cmd.sub('{', source)
        source = newstyle_cmd.sub('{', source)
        return source

    def stripComments(self, source):
        # In LaTeX the '%' denotes the beginning of a comment irrespective of 
        # position unless it is escaped '\%'. The comment is terminated by the 
        # end of the line.
        pattern = re.compile(r'([^\\])%[^\n]*')
        source = pattern.sub('\g<1>', source)
        return source

    def convertChars(self, source):
        source = self.convertAccents(source)
        source = self.fixWhiteSpace(source)
        return self.explicitReplacements(source)

    def convertAccents(self, source):
        umlaute = re.compile(r'{?\\"{?(.?)}?')
        acute = re.compile(r"{?\\'{?(.?)}?")
        grave = re.compile(r'{?\\`{?(.?)}?')
        circ = re.compile(r'{?\\^{?(.?)}?')
        tilde = re.compile(r'{?\\~{?(.?)}?')
        source = umlaute.sub(self.uml2h, source)
        source = acute.sub(self.ac2h, source)
        source = grave.sub(self.gr2h, source)
        source = circ.sub(self.cf2h, source)
        source = tilde.sub(self.tilde2h, source)
        
        return source

##     def uml2h(self, hit): return r'&' + hit.group(1) + 'uml;'
##     def ac2h(self, hit): return r'&' + hit.group(1) + 'acute;'
##     def gr2h(self, hit): return r'&' + hit.group(1) + 'grave;'
##     def cf2h(self, hit): return r'&' + hit.group(1) + 'circ;'
##     def tilde2h(self, hit): return r'&' + hit.group(1) + 'tilde;'

    def uml2h(self, hit):
        """
        callback for the umlaut pattern: wraps the captured character
        into '&' ... 'uml;', passes that to the module's converter and
        returns the result encoded with the module's _encoding
        """
        entity = ''.join(('&', hit.group(1), 'uml;'))
        return convert(entity).encode(_encoding)
    def ac2h(self, hit):
        """
        callback for the acute pattern: wraps the captured character
        into '&' ... 'acute;', passes that to the module's converter and
        returns the result encoded with the module's _encoding
        """
        entity = ''.join(('&', hit.group(1), 'acute;'))
        return convert(entity).encode(_encoding)
    def gr2h(self, hit):
        """
        callback for the grave pattern: wraps the captured character
        into '&' ... 'grave;', passes that to the module's converter and
        returns the result encoded with the module's _encoding
        """
        entity = ''.join(('&', hit.group(1), 'grave;'))
        return convert(entity).encode(_encoding)
    def cf2h(self, hit):
        """
        callback for the circumflex pattern: wraps the captured character
        into '&' ... 'circ;', passes that to the module's converter and
        returns the result encoded with the module's _encoding
        """
        entity = ''.join(('&', hit.group(1), 'circ;'))
        return convert(entity).encode(_encoding)
    def tilde2h(self, hit):
        """
        callback for the tilde pattern: wraps the captured character
        into '&' ... 'tilde;', passes that to the module's converter and
        returns the result encoded with the module's _encoding
        """
        entity = ''.join(('&', hit.group(1), 'tilde;'))
        return convert(entity).encode(_encoding)

    def fixWhiteSpace(self, source):
        ttable = [(r'\ ', ' '),
                  (r'\!', ' '),
                  ]
        source = self.mreplace(source, ttable)
        wsp_tilde = re.compile(r'[^/\\]~')
        return wsp_tilde.sub(self.tilde2wsp, source).replace('\~', '~')

    def tilde2wsp(self, hit): return hit.group(0)[0] + ' '

    def explicitReplacements(self, source):
        # list of 2 tuples; second element replaces first
        ttable = [(r'\/', ''),
                  (r'\&', '&'),
                  (r'\~', '~'),
                  (r'---', '&mdash;'),
                  (r'--', '&ndash;'),
                  ]
        return self.mreplace(source, ttable)
        
    def mreplace(self, s, ttable):
        for a, b in ttable:
            s = s.replace(a, b)
        return s
    
    # done with preprocessing

    def parseEntry(self, entry):
        """
        parses a single entry
        
        returns a dictionary to be passed to
        BibliographyEntry's edit method
        """
        result = {}
        authorlist = authorURLlist = []
        
        tokens = self.pattern.split(entry)
    
        try: 
            type, pid = tokens[0].strip().split('{')
            type = type.replace('@', '').strip().lower()
            result['publication_type'] = type.capitalize() + 'Reference'
            result['pid'] = pid.replace(',', '').strip()
        except:
            return "Bibtex Parser Error: malformed first line."

        for k,v in self.group(tokens[1:],2):
            key = k[1:-1].strip().lower()
            result[key] = self.clean(v)

        # compile authors list of dictionaries
        if result.has_key('author'):
            authorlist = result['author'].split(' and')
        if result.has_key('authorURLs'):
            authorURLlist = result['authorURLs'].split('and ')

        if authorlist:
            alist = []
            authorlist = [x for x in authorlist if x]
            for author in authorlist:
                fname = mname = lname = ''
                parts = self.splitAuthor(author)
                if len(parts) == 1:
                    lname = parts[0].strip()
                else:
                    lname = parts[-1].strip()
                    fname = parts[0].strip()
                    if parts[1:-1]:
                        for part in parts[1:-1]:
                            mname = mname + part.strip()
                adict = {'firstname': fname,
                         'middlename': mname,
                         'lastname': lname}
                alist.append(adict)

        if authorURLlist and alist:
            index = 0
            for url in authorURLlist:
                alist[index]['homepage'] = url.strip()
                index += 1

        if authorlist:
            result['authors'] = alist

        # do some renaming and reformatting
        tmp = result.get('note')
        while tmp and tmp[-1] in ['}', ',', '\n', '\r']:
            tmp = tmp[:-1]
        if tmp:
            result['note'] = tmp
        # make keywords a list
        try:
            tmp = eval(result.get('keywords', '[]'))
        except:
            tmp = result.get('keywords').split(',')
        result['keywords'] = tmp
        result['publication_year'] = result.get('year', '')
        result['publication_month'] = result.get('month', '')
        result['publication_url'] = result.get('url', '')
        ## result['publication_title'] = result.get('title', '')
        tmp = result.get('title','')
        for car in ('\n', '\r', '\t'):
            tmp = tmp.replace(car, ' ')
        while '  ' in tmp:
            tmp = tmp.replace('  ', ' ')
        result['title'] = tmp

        return result

    # helper methods

    def splitAuthor(self, author=None):
        if not author: return []
        parts = author.replace('.', ' ').split(',',1)
        if len(parts) == 1: return parts[0].split()
        else:
            tmp = parts[1].split()
            tmp.append(parts[0])
            return tmp

    def clean(self, value):
        value = value.replace('{', '').replace('}', '').strip()
        if value and value[0] == '"' and len(value) > 1:
            value = value[1:-1]
        return value

    def group(self, p,n):
        """ Group a sequence p into a list of n tuples."""
        mlen, lft = divmod(len(p), n)
        if lft != 0: mlen += 1

        # initialize a list of suitable length
        lst = [[None]*n for i in range(mlen)]
        
        # Loop over all items in the input sequence
        for i in range(len(p)):
            j,k = divmod(i,n)
            lst[j][k] = p[i]

        return map(tuple, lst)    
    
        

 # Class initialization: hand the finished class to InitializeClass
 # (imported from Globals at the top of this module)
InitializeClass(BibtexParser)

   
def manage_addBibtexParser(self, REQUEST=None):
    """
    adds a BibtexParser under the id 'bibtex' to the container 'self'

    On success the container's 'manage_main' is called with the
    request and its result returned.  If creating or adding the parser
    raises anything, a warning MessageDialog is returned instead.
    """
    try:
        self._setObject('bibtex', BibtexParser())
    except:
        # every failure is reported to the user as "already exists"
        warning = MessageDialog(
            title='Bibliography tool warning message',
            message='The parser you attempted to add already exists.',
            action='manage_main')
        return warning
    else:
        return self.manage_main(self, REQUEST)
