/***************************************************************************/
/* 		This code is part of WWW graber called pavuk		   */
/*		Copyright (c) 1997,1998,1999 Ondrejicka Stefan		   */
/*		(ondrej@idata.sk)					   */
/*		Distributed under GPL 2 or later			   */
/***************************************************************************/

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>


#ifdef HAVE_FNMATCH
#include <fnmatch.h>
#else
#include "fnmatch.h"
#endif

#include "config.h"
#include "url.h"
#include "condition.h"
#include "tools.h"
#include "robots.h"
#include "uexit.h"
#include "re.h"
#include "dns.h"

/********************************************************
 * Check whether the given server belongs to one of the
 * domains listed in cfg.condition.domains.
 *
 * site - host name to test; it is dereferenced as soon
 *        as the list holds at least one entry
 *
 * Returns TRUE when some list entry is equal (ignoring
 * case) to the tail of site, FALSE otherwise.  This is
 * a plain string-tail comparison; nothing here forces
 * the match to begin at a '.' boundary.
 ********************************************************/
static bool domain_condition(site)
char *site;
{
	char **dom;

	for (dom = cfg.condition.domains ; *dom ; dom++)
	{
		size_t site_len = strlen(site);
		size_t dom_len = strlen(*dom);

		/* an entry longer than the host name can never be its tail */
		if (dom_len <= site_len &&
		    !strcasecmp(*dom , site + site_len - dom_len))
		{
			return TRUE;
		}
	}
	return FALSE;
}

/********************************************************
 * Check whether the document named by a URL ends with
 * the given suffix.
 *
 * urlr - URL to examine
 * sfx  - suffix to look for (case-sensitive compare)
 *
 * The string that is tested depends on the URL type:
 *   HTTP(S)  document, or "document?searchstr" when a
 *            search string is present
 *   FTP(S)   path
 *   gopher   selector
 *   file     filename
 * Any other URL type yields FALSE.
 *
 * Returns nonzero when the tested string ends with sfx,
 * FALSE otherwise.
 ********************************************************/
static bool cmp_sfx(urlr , sfx)
url *urlr;
char *sfx;
{
	char *pom = NULL;	/* temporary "document?searchstr" buffer */
	size_t nlen,slen;
	char *p;
	int rv = FALSE;

	switch(urlr->type)
	{
		case URLT_HTTP:
#ifdef USE_SSL
		case URLT_HTTPS:
#endif
			if (urlr->p.http.searchstr)
			{
				/* +2 leaves room for the '?' and the terminating NUL */
				pom = _malloc(strlen(urlr->p.http.document) +
					strlen(urlr->p.http.searchstr) + 2);
				sprintf(pom ,"%s?%s", urlr->p.http.document,
					urlr->p.http.searchstr);
				p = pom;
			}
			else
				p = urlr->p.http.document;
			break;
		case URLT_FTP:
#ifdef USE_SSL
		case URLT_FTPS:
#endif
			p = urlr->p.ftp.path;
			break;
		case URLT_GOPHER:
			p = urlr->p.gopher.selector;
			break;
		case URLT_FILE:
			p = urlr->p.file.filename;
			break;
		default: return FALSE;
	}

	slen = strlen(sfx);
	nlen = strlen(p);

	/*
	 * A name shorter than the suffix cannot end with it.  Do not
	 * return early here: pom may hold an allocated buffer that
	 * still has to be released below.
	 */
	if (nlen >= slen)
		rv = (!strcmp(sfx , p + nlen - slen));

	_free(pom);
	return rv;
}

/********************************************************
 * Check whether the URL ends with any suffix from the
 * NULL-terminated list cfg.condition.sufix.
 *
 * Returns TRUE on the first suffix accepted by
 * cmp_sfx(), FALSE when the list is exhausted.
 ********************************************************/
static bool sfx_condition(urlr)
url *urlr;
{
	char **sfx;

	for (sfx = cfg.condition.sufix ; *sfx ; sfx++)
	{
		if (cmp_sfx(urlr , *sfx))
			return TRUE;
	}

	return FALSE;
}

/********************************************************
 * Check whether the document named by a URL starts with
 * the given prefix.
 *
 * urlr   - URL to examine
 * prefix - prefix to look for (case-sensitive compare)
 *
 * The string that is tested depends on the URL type:
 *   HTTP(S)  document, or "document?searchstr" when a
 *            search string is present
 *   FTP(S)   path
 *   gopher   selector
 *   file     filename
 * Any other URL type yields FALSE.
 *
 * Returns nonzero when the tested string begins with
 * prefix, FALSE otherwise.
 ********************************************************/
static bool cmp_prefix(urlr , prefix)
url *urlr;
char *prefix;
{
	char *joined = NULL;	/* temporary "document?searchstr" buffer */
	char *path;
	int matches;

	switch(urlr->type)
	{
		case URLT_HTTP:
#ifdef USE_SSL
		case URLT_HTTPS:
#endif
			path = urlr->p.http.document;
			if (urlr->p.http.searchstr)
			{
				/* +2 leaves room for the '?' and the terminating NUL */
				joined = _malloc(strlen(urlr->p.http.document) +
					strlen(urlr->p.http.searchstr) + 2);
				sprintf(joined ,"%s?%s", urlr->p.http.document,
					urlr->p.http.searchstr);
				path = joined;
			}
			break;
		case URLT_FTP:
#ifdef USE_SSL
		case URLT_FTPS:
#endif
			path = urlr->p.ftp.path;
			break;
		case URLT_GOPHER:
			path = urlr->p.gopher.selector;
			break;
		case URLT_FILE:
			path = urlr->p.file.filename;
			break;
		default:
			return FALSE;
	}

	matches = (strncmp(prefix , path , strlen(prefix)) == 0);

	_free(joined);
	return matches;
}


/********************************************************
 * Check whether the URL starts with any prefix from the
 * NULL-terminated list cfg.condition.dir_prefix.
 *
 * Returns TRUE on the first prefix accepted by
 * cmp_prefix(), FALSE when the list is exhausted.
 ********************************************************/
static bool prefix_condition(urlr)
url *urlr;
{
	char **list = cfg.condition.dir_prefix;
	int i;

	for (i = 0 ; list[i] ; i++)
	{
		if (cmp_prefix(urlr , list[i]))
			return TRUE;
	}

	return FALSE;
}

/********************************************************
 * Test the document part of a URL against a list of
 * fnmatch() wildcard patterns.
 *
 * urlr    - URL to examine
 * pattern - NULL-terminated array of patterns, or NULL
 *
 * The string that is tested depends on the URL type:
 *   HTTP(S)  document, or "document?searchstr" when a
 *            search string is present
 *   FTP(S)   path
 *   gopher   selector
 *   file     filename
 * Any other URL type yields FALSE.
 *
 * Note the sense of the result: FALSE is returned as
 * soon as one pattern matches; a nonzero value means
 * that a list was supplied and none of its patterns
 * matched.  A NULL list gives FALSE.
 ********************************************************/
static bool cmp_pattern(urlr, pattern)
url *urlr;
char **pattern;
{
	char *joined = NULL;	/* temporary "document?searchstr" buffer */
	char *subject;
	char **pat;
	int none_matched = (pattern != NULL);

	switch(urlr->type)
	{
		case URLT_HTTP:
#ifdef USE_SSL
		case URLT_HTTPS:
#endif
			subject = urlr->p.http.document;
			if (urlr->p.http.searchstr)
			{
				/* +2 leaves room for the '?' and the terminating NUL */
				joined = _malloc(strlen(urlr->p.http.document) +
					strlen(urlr->p.http.searchstr) + 2);
				sprintf(joined ,"%s?%s", urlr->p.http.document,
					urlr->p.http.searchstr);
				subject = joined;
			}
			break;
		case URLT_FTP:
#ifdef USE_SSL
		case URLT_FTPS:
#endif
			subject = urlr->p.ftp.path;
			break;
		case URLT_GOPHER:
			subject = urlr->p.gopher.selector;
			break;
		case URLT_FILE:
			subject = urlr->p.file.filename;
			break;
		default:
			return FALSE;
	}

	for (pat = pattern ; pat && *pat ; pat++)
	{
		/* fnmatch() returns 0 when the string matches the pattern */
		if (fnmatch(*pat , subject , 0) == 0)
		{
			none_matched = FALSE;
			break;
		}
	}

	_free(joined);
	return none_matched;
}

/********************************************************
 * Test the string form of a URL, as produced by
 * url_to_urlstr(urlr , FALSE), against a list of
 * fnmatch() wildcard patterns.
 *
 * urlr        - URL to examine
 * url_pattern - NULL-terminated array of patterns, or
 *               NULL
 *
 * Note the sense of the result: FALSE is returned as
 * soon as one pattern matches; a nonzero value means
 * that a list was supplied and none of its patterns
 * matched.  A NULL list gives FALSE.
 ********************************************************/
static bool cmp_url_pattern(urlr , url_pattern)
url *urlr;
char **url_pattern;
{
	char *urlstr = url_to_urlstr(urlr , FALSE);
	char **pat;
	int none_matched = (url_pattern != NULL);

	for (pat = url_pattern ; pat && *pat ; pat++)
	{
		/* fnmatch() returns 0 when the string matches the pattern */
		if (fnmatch(*pat , urlstr , 0) == 0)
		{
			none_matched = FALSE;
			break;
		}
	}

	/* the URL string is released here on every path */
	free(urlstr);

	return none_matched;
}

#ifdef HAVE_REGEX
/********************************************************
 * Test the document part of a URL against a list of
 * compiled regular expressions.
 *
 * urlr    - URL to examine
 * pattern - list whose nodes carry re_entry pointers in
 *           their data field, or NULL
 *
 * The string that is tested depends on the URL type:
 *   HTTP(S)  document, or "document?searchstr" when a
 *            search string is present
 *   FTP(S)   path
 *   gopher   selector
 *   file     filename
 * Any other URL type yields FALSE.
 *
 * Note the sense of the result: FALSE is returned as
 * soon as re_pmatch() reports a nonzero result for one
 * entry; a nonzero value means that a list was supplied
 * and re_pmatch() returned zero for every entry.  A
 * NULL list gives FALSE.
 ********************************************************/
static bool cmp_rpattern(urlr, pattern)
url *urlr;
dllist *pattern;
{
	char *joined = NULL;	/* temporary "document?searchstr" buffer */
	char *subject;
	dllist *node;
	int none_matched = (pattern != NULL);

	switch(urlr->type)
	{
		case URLT_HTTP:
#ifdef USE_SSL
		case URLT_HTTPS:
#endif
			subject = urlr->p.http.document;
			if (urlr->p.http.searchstr)
			{
				/* +2 leaves room for the '?' and the terminating NUL */
				joined = _malloc(strlen(urlr->p.http.document) +
					strlen(urlr->p.http.searchstr) + 2);
				sprintf(joined ,"%s?%s", urlr->p.http.document,
					urlr->p.http.searchstr);
				subject = joined;
			}
			break;
		case URLT_FTP:
#ifdef USE_SSL
		case URLT_FTPS:
#endif
			subject = urlr->p.ftp.path;
			break;
		case URLT_GOPHER:
			subject = urlr->p.gopher.selector;
			break;
		case URLT_FILE:
			subject = urlr->p.file.filename;
			break;
		default:
			return FALSE;
	}

	for (node = pattern ; node ; node = node->next)
	{
		if (re_pmatch((re_entry *)node->data , subject))
		{
			none_matched = FALSE;
			break;
		}
	}

	_free(joined);
	return none_matched;
}

/********************************************************
 * Test the string form of a URL, as produced by
 * url_to_urlstr(urlr , FALSE), against a list of
 * compiled regular expressions.
 *
 * urlr        - URL to examine
 * url_pattern - list whose nodes carry re_entry
 *               pointers in their data field, or NULL
 *
 * Note the sense of the result: FALSE is returned as
 * soon as re_pmatch() reports a nonzero result for one
 * entry; a nonzero value means that a list was supplied
 * and re_pmatch() returned zero for every entry.  A
 * NULL list gives FALSE.
 ********************************************************/
static bool cmp_url_rpattern(urlr , url_pattern)
url *urlr;
dllist *url_pattern;
{
	char *urlstr = url_to_urlstr(urlr , FALSE);
	dllist *node;
	int none_matched = (url_pattern != NULL);

	for (node = url_pattern ; node ; node = node->next)
	{
		if (re_pmatch((re_entry*)node->data , urlstr))
		{
			none_matched = FALSE;
			break;
		}
	}

	/* the URL string is released here on every path */
	free(urlstr);

	return none_matched;
}

/********************************************************
 * Test a textual address against a list of compiled
 * regular expressions.
 *
 * urlr       - not used by this function
 * ip_pattern - list whose nodes carry re_entry pointers
 *              in their data field
 * addr       - address string handed to re_pmatch()
 *
 * Returns FALSE as soon as re_pmatch() reports a
 * nonzero result for one entry, TRUE otherwise.  Unlike
 * the other pattern helpers in this file, an empty list
 * also gives TRUE.
 ********************************************************/
static bool cmp_ip_rpattern(urlr , ip_pattern , addr)
url *urlr;
dllist *ip_pattern;
char *addr;
{
	dllist *node = ip_pattern;

	while (node)
	{
		if (re_pmatch((re_entry*)node->data , addr))
			return FALSE;
		node = node->next;
	}

	return TRUE;
}
#endif

/********************************************************
 * Check whether a URL satisfies all limiting conditions
 * held in cfg.condition.
 *
 * urlp  - URL to examine
 * urlnr - when nonzero, the number compared against
 *         cfg.condition.max_documents; when zero,
 *         cfg.total_cnt + 1 is compared instead
 *
 * Returns 0 when the protocol of the URL is not marked
 * as supported in prottable or when any condition
 * rejects the URL, nonzero when every check passes.
 *
 * Internally ret_val collects rejections: once it is
 * nonzero the URL is refused, and most later checks are
 * skipped.
 ********************************************************/
int url_append_condition(urlp, urlnr)
url *urlp;
int urlnr;
{
	char *p;
	int ret_val;
	char *site;
	bool pm1,pm2,pm3,pm4;

	if (!prottable[urlp->type].supported) return 0;

	ret_val = FALSE;

	/* protocols that are switched off in the configuration */
	if (!cfg.condition.ftp)
		ret_val |= urlp->type == URLT_FTP;

	if (!cfg.condition.http)
		ret_val |= urlp->type == URLT_HTTP;

#ifdef USE_SSL
	if (!cfg.condition.https)
		ret_val |= urlp->type == URLT_HTTPS;

	if (!cfg.condition.ftps)
		ret_val |= urlp->type == URLT_FTPS;
#endif

	if (!cfg.condition.gopher)
		ret_val |=  urlp->type == URLT_GOPHER;

	/* depth limit; a URL flagged URL_INLINE_OBJ is counted one level lower */
	if (cfg.condition.max_levels)
	{
		ret_val |= (urlp->level - 
			((urlp->status & URL_INLINE_OBJ) ? 1 : 0))
			 > cfg.condition.max_levels;
	}
	
	/* document count limit, see the description of urlnr above */
	if (cfg.condition.max_documents)
	{
		ret_val |= ((!urlnr && (cfg.total_cnt + 1) > cfg.condition.max_documents) || 
			    (urlnr && urlnr > cfg.condition.max_documents));
	}

	/*
	 * Gopher URLs are accepted only when the selector starts with one
	 * of the listed characters (presumably the gopher item type code).
	 */
	if (urlp->type == URLT_GOPHER)
	{
		ret_val |= ! (urlp->p.gopher.selector[0] == '1' || urlp->p.gopher.selector[0] == '0' ||
			urlp->p.gopher.selector[0] == '2' || urlp->p.gopher.selector[0] == '4' ||
			urlp->p.gopher.selector[0] == '5' || urlp->p.gopher.selector[0] == '6' ||
			urlp->p.gopher.selector[0] == '9' || urlp->p.gopher.selector[0] == 'g' ||
			urlp->p.gopher.selector[0] == 'I');
	}

	/* with cfg.condition.cgi off, HTTP(S) URLs carrying a search string are refused */
	if (!ret_val && (urlp->type == URLT_HTTP || urlp->type == URLT_HTTPS) && !cfg.condition.cgi)
	{
		ret_val |= urlp->p.http.searchstr != NULL;
	}

	/* site list: allow_site selects whether the list permits or forbids */
	site = url_get_site(urlp);
	if (!ret_val && cfg.condition.sites && cfg.condition.sites[0] && site)
	{
		if (cfg.condition.allow_site) 
		{
			ret_val |= !is_in_list(site , cfg.condition.sites);
		}
		else
		{
			ret_val |= is_in_list(site , cfg.condition.sites);
		}
	}
#ifdef HAVE_REGEX
	/* IP address patterns: aip must match, skipip must not match */
	if (!ret_val && site && (cfg.condition.aip || cfg.condition.skipip))
	{
		struct sockaddr_in addr;
		int rv;
		int is_valid = TRUE;

		h_errno = 0;
		memset(&addr , '\0' ,  sizeof(addr));

		/* site is not a dotted address: try to resolve it as a host name */
		if ((addr.sin_addr.s_addr = inet_addr(site)) == -1)
		{
			/* a nonzero result is treated as failure; the IP checks are then skipped */
			if (dns_gethostbyname(site , &rv, (char *)&(addr.sin_addr)))
				is_valid = FALSE;
		}
		if (is_valid)
		{
			p = inet_ntoa(addr.sin_addr);
			/* cmp_ip_rpattern() returns TRUE when NO entry matched */
			if (cfg.condition.aip)
				ret_val |= cmp_ip_rpattern(urlp , cfg.condition.aip , p);
			if (cfg.condition.skipip)
				ret_val |= !cmp_ip_rpattern(urlp , cfg.condition.skipip , p);
		}
	}
#endif
	/* domain list: allow_domain selects whether the list permits or forbids */
	if (!ret_val && cfg.condition.domains && cfg.condition.domains[0] && site)
	{
		if (cfg.condition.allow_domain) 
		{
			ret_val |= !domain_condition(site);
		}
		else
		{
			ret_val |= domain_condition(site);
		}
	}

	/* suffix list, not applied to file URLs */
	if (!ret_val && cfg.condition.sufix && cfg.condition.sufix[0] && urlp->type != URLT_FILE)
	{
		if (cfg.condition.allow_sufix)
		{
			ret_val |= !sfx_condition(urlp);
		}
		else
		{
			ret_val |= sfx_condition(urlp);
		}
	}

	/* directory prefix list, not applied to file URLs */
	if (!ret_val && cfg.condition.dir_prefix && cfg.condition.dir_prefix[0] && 
		urlp->type != URLT_FILE)
	{
		if (cfg.condition.allow_prefix)
		{
			ret_val |= !prefix_condition(urlp);
		}
		else
		{
			ret_val |= prefix_condition(urlp);
		}
	}

#ifdef HAVE_REGEX
	/*
	 * Document patterns (wildcard and regular expression variants).
	 * The cmp_*pattern() helpers return nonzero when NO entry matched.
	 */
	if (!ret_val && urlp->type != URLT_FILE)
	{
		pm1 = FALSE;
		pm2 = FALSE;
		pm3 = FALSE;
		pm4 = FALSE;

		/* pm1/pm3: nothing in the respective allow list matched */
		if (cfg.condition.pattern)
			pm1 = cmp_pattern(urlp, cfg.condition.pattern);
		if (cfg.condition.rpattern)
			pm3 = cmp_rpattern(urlp, cfg.condition.rpattern);

		/* with both allow lists present a match in either one is enough */
		if (cfg.condition.pattern && cfg.condition.rpattern)
			ret_val = pm1 && pm3;
		else if (cfg.condition.pattern)
			ret_val |= pm1;
		else if (cfg.condition.rpattern)
			ret_val |= pm3;
			   
		if (!ret_val)
		{
			/* pm2/pm4: something in the respective skip list matched */
			if (cfg.condition.skip_pattern)
				pm2 = !cmp_pattern(urlp, cfg.condition.skip_pattern);
			if (cfg.condition.rskip_pattern)
				pm4 = !cmp_rpattern(urlp, cfg.condition.rskip_pattern);

			/*
			 * A hit in either skip list must reject the URL.  Combining
			 * the two flags with && would reject only URLs hit by both
			 * lists, so adding a second list would weaken the first.
			 */
			if (cfg.condition.skip_pattern && cfg.condition.rskip_pattern)
				ret_val = pm2 || pm4;
			else if (cfg.condition.skip_pattern)
				ret_val |= pm2;
			else if (cfg.condition.rskip_pattern)
				ret_val |= pm4;
		}
		
	}
	/* the same scheme applied to the whole URL string */
	if (!ret_val && urlp->type != URLT_FILE)
	{
		pm1 = FALSE;
		pm2 = FALSE;
		pm3 = FALSE;
		pm4 = FALSE;

		if (cfg.condition.url_pattern)
			pm1 = cmp_url_pattern(urlp, cfg.condition.url_pattern);
		if (cfg.condition.rurl_pattern)
			pm3 = cmp_url_rpattern(urlp, cfg.condition.rurl_pattern);

		if (cfg.condition.url_pattern && cfg.condition.rurl_pattern)
			ret_val = pm1 && pm3;
		else if (cfg.condition.url_pattern)
			ret_val |= pm1;
		else if (cfg.condition.rurl_pattern)
			ret_val |= pm3;
			   
		if (!ret_val)
		{
			if (cfg.condition.skip_url_pattern)
				pm2 = !cmp_url_pattern(urlp, cfg.condition.skip_url_pattern);
			if (cfg.condition.rskip_url_pattern)
				pm4 = !cmp_url_rpattern(urlp, cfg.condition.rskip_url_pattern);

			/* a hit in either skip list rejects the URL */
			if (cfg.condition.skip_url_pattern && cfg.condition.rskip_url_pattern)
				ret_val = pm2 || pm4;
			else if (cfg.condition.skip_url_pattern)
				ret_val |= pm2;
			else if (cfg.condition.rskip_url_pattern)
				ret_val |= pm4;
		}
		
	}
#else
	/* without regex support only the wildcard lists exist */
	if (!ret_val && cfg.condition.pattern && urlp->type != URLT_FILE)
	{
		ret_val |= cmp_pattern(urlp, cfg.condition.pattern);
	}

	if (!ret_val && cfg.condition.skip_pattern && urlp->type != URLT_FILE)
	{
		ret_val |= !cmp_pattern(urlp, cfg.condition.skip_pattern);
	}

	if (!ret_val && cfg.condition.url_pattern && urlp->type != URLT_FILE)
	{
		ret_val |= cmp_url_pattern(urlp , cfg.condition.url_pattern);
	}

	if (!ret_val && cfg.condition.skip_url_pattern && urlp->type != URLT_FILE)
	{
		ret_val |= !cmp_url_pattern(urlp , cfg.condition.skip_url_pattern);
	}
#endif

	/*
	 * dont_leave_site: type, port and site must equal those of the
	 * topmost ancestor reached by following parent_url[0].
	 */
	if (!ret_val && urlp->type != URLT_FILE && cfg.condition.dont_leave_site)
	{
		url *gparent = urlp;

		while (gparent->parent_url[0]) gparent = gparent->parent_url[0];

		ret_val |= (urlp->type != gparent->type) || 
			(url_get_port(urlp) != url_get_port(gparent)) ||
			strcmp(url_get_site(urlp) , url_get_site(gparent));
	}

	/*
	 * site_level: count the steps along the parent_url[0] chain where
	 * type, port or site changes; steps whose child has moved_to set
	 * are not counted.
	 */
	if (!ret_val && urlp->type != URLT_FILE && cfg.condition.site_level)
	{
		url *curl = urlp;
		url *parent = urlp->parent_url[0];
		int level = 0;

		while (parent)
		{
			if ((curl->type != parent->type) ||
			    (url_get_port(curl) != url_get_port(parent)) ||
			    strcmp(url_get_site(curl) , url_get_site(parent)))
			{
				if (!curl->moved_to)
					level++;
			}
			curl = parent;
			parent = parent->parent_url[0];
		}

		ret_val |= level > cfg.condition.site_level;
	}

	/*
	 * dont_leave_dir: besides type, port and site, the path must share
	 * the directory part (everything before the last '/') of the
	 * topmost ancestor's path.
	 */
	if (!ret_val && urlp->type != URLT_FILE && cfg.condition.dont_leave_dir)
	{
		url *gparent = urlp;
		char *p1,*p2;
		int len = 0;

		while (gparent->parent_url[0]) gparent = gparent->parent_url[0];

		p1 = url_get_path(urlp);
		p2 = url_get_path(gparent);
		
		/* len stays 0 when the ancestor path holds no '/', so strncmp compares nothing */
		p = strrchr(p2 , '/');
		if (p) len = p - p2;

		ret_val |= (urlp->type != gparent->type) || 
			(url_get_port(urlp) != url_get_port(gparent)) ||
			strcmp(url_get_site(urlp) , url_get_site(gparent)) ||
			strncmp(p1 , p2 , len);
	}

	/*
	 * leave_level: walk up from the URL until an ancestor with the
	 * type, port and site of the topmost ancestor is found, counting
	 * the URLs passed on the way (those with moved_to set excluded).
	 */
	if (!ret_val && urlp->type != URLT_FILE && cfg.condition.leave_level)
	{
		url *gparent = urlp;
		url *pomurl = urlp;
		int level = -1;

		while (gparent->parent_url[0]) gparent = gparent->parent_url[0];

		while (pomurl)
		{
			if ((pomurl->type == gparent->type) && 
				(url_get_port(pomurl) == url_get_port(gparent)) &&
				!strcmp(url_get_site(pomurl) , url_get_site(gparent)))
			{
				break;
			}

			if (!pomurl->moved_to)
				level ++;

			/* a URL flagged URL_INLINE_OBJ is counted one level lower */
			if ((level - ((urlp->status & URL_INLINE_OBJ) ? 1 : 0)) >=
				cfg.condition.leave_level)
			{
				ret_val = TRUE; 
				break;
			}
			pomurl = pomurl->parent_url[0];
		}
	}

	/* finally the uexit check; a zero result from uexit_condition() rejects */
	if (!ret_val && cfg.condition.uexit)
	{
		ret_val = !uexit_condition(urlp , NULL , 0L);
	}

	return !ret_val;
}
