#****************************************************************************
#  ##   ##         #####   #####  ##     **      NoSQL RDBMS - addtypes     *
#  ###  ##        ####### ####### ##     **      $Revision: 2.1 $			*
#  #### ##        ###     ##   ## ##     ************************************
#  #######  ####  #####   ##   ## ##     **      Carlo Strozzi (c) 1998     *
#  ####### ######   ##### ## # ## ##     ************************************
#  ## #### ##  ##     ### ##  ### ##     **           Written by            *
#  ##  ### ###### ####### ######  ###### **          Carlo Strozzi          *
#  ##   ##  ####   #####   #### # ###### **     e-mail: carlos@linux.it     *
#****************************************************************************
#   NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.                          *
#   This program comes with ABSOLUTELY NO WARRANTY; for details             *
#   refer to the GNU General Public License.                                *
#****************************************************************************
#
# Replaces the table dashline with column type/length information.
#
# This operator reads a NoSQL table via STDIN and prints the same
# table to STDOUT, after replacing the dashline with a definition 
# line similar to the one used in Walt Hobbs' RDB tables.
#
# Column types/widths are inferred from the input table in an auto-adaptive
# manner. Some examples of how datatypes are detected:
#
# Value					Type
# -----					----
# zxZZ123				String
# 12345					Integer
# -12345				Integer
# +12345				Integer
# 1234.5				Float1
# -1234.5				Float1
# +1234.567				Float3
# 19:02:05				Time		(HH:MM:SS)
# 39:02:05				String      (HH > 24)
# 1998.07.28			Date
# 98.07.28				Date		(Year.MM.DD)
# 123498.07.28			Date
# 98.07.28				Date
# 98.07.99				String		(DD > 31)
# 1,346.99				Currency
# -1,346.99				Currency
# 1,955,346.99			Currency
# 3425,251.00			String      (leading group of digits longer than 3)
# 425,251.0				String      (Decimal part is incomplete).
# 0.56					Float or Currency, depending on context.
# +0.56					Float or Currency, depending on context.
#
# Note: although only one currency format is supported, it is very
# easy for the application program to convert it to a different
# representation. For instance, to use a comma (,) as decimal separator
# and the dot (.) for thousands, the following bit of AWK code can
# do the trick (supposing that 'currency' contains "1,346.99") :
#
#    gsub( ",", ";", currency )
#    sub( /\./, ",", currency )
#    gsub( ";", ".", currency )
#
# After that, 'currency' will contain "1.346,99", i.e. comma(s) and dot(s)
# have been exchanged.
#
# This operator is meant to be used by other operators to pre-process
# the input table. It converts the input NoSQL table into the intermediate
# RDB format, for subsequent processing by other filters.
#
##########################################################################


# Initialisation: tables are TAB-delimited on both input and output, and
# NULL is the sentinel used throughout to recognise empty (null) values.
BEGIN {
  FS = "\t"
  OFS = FS
  NULL = ""
}

# The first two records are the table header: the column-name line is
# passed through to STDOUT unchanged, while the dashline is dropped.
# Neither of them takes part in type detection.
NR <= 2 {
  if ( NR == 1 ) print
  next
}

# ------------------------------------------------------------------------
# Per-record processing: copy each data row to the file named by
# 'table_tmp' and refine the per-column width/type information.
#
# Global state maintained here (indexed by column number):
#   col_w[i]      widest value seen so far (capped at 'trunc' if set)
#   col_t[i]      current type: "D", "T", "C", "F.<precision>", "I" or "S"
#   tested[i]     set once at least one non-null value has been classified
#   precision[i]  largest number of decimals seen in a Float column
#   sx_w[i]       widest integer part (sign included) seen in a Float column
#   int_w[i]      widest value seen while the column was still Integer
#   cur_bad[i]    set when a numeric column has already accepted a value
#                 that cannot be part of a Currency column
#
# Variables expected from the command line: 'table_tmp' (output file for
# the data rows), and optionally 'trunc' (maximum field width) and 'debug'
# (column number whose classification is traced on /dev/stderr).
# ------------------------------------------------------------------------

# Returns v without its leading sign, so digit counts ignore the sign.
function strip_sign(v)
{
  sub(/^[-+]/, "", v)
  return v
}

# True if v is Year.MM.DD: exactly three non-empty dot-separated groups,
# a two-digit day not above 31 and a month not above 12.
function is_date(v,    part)
{
  if ( v !~ /^[0-9.]+$/ ) return 0
  if ( split(v, part, ".") != 3 ) return 0
  if ( length(part[1]) == 0 || length(part[2]) == 0 ) return 0
  return ( length(part[3]) == 2 && part[2] <= 12 && part[3] <= 31 )
}

# True if v is HH:MM:SS: exactly three non-empty colon-separated groups,
# two-digit seconds, HH <= 24, MM <= 59 and SS <= 59.
function is_time(v,    part)
{
  if ( v !~ /^[0-9:]+$/ ) return 0
  if ( split(v, part, ":") != 3 ) return 0
  if ( length(part[1]) == 0 || length(part[2]) == 0 ) return 0
  return ( length(part[3]) == 2 \
    && part[1] <= 24 && part[2] <= 59 && part[3] <= 59 )
}

# True if v is a grouped Currency amount: optional sign, a leading group
# of one to three digits, one or more ",ddd" groups and exactly two
# decimals. At least one comma is required; ungrouped amounts such as
# 0.56 are handled as Floats that may later join a Currency column.
function is_currency(v)
{
  return ( v ~ /^[-+]?[0-9][0-9]?[0-9]?(,[0-9][0-9][0-9])+\.[0-9][0-9]$/ )
}

# True if a Float with the given integer part and number of decimals can
# live in a Currency column (at most three digits, exactly two decimals).
function is_currency_float(int_part, decimals)
{
  return ( decimals == 2 && length(strip_sign(int_part)) <= 3 )
}

# Marks column i as tested and, if it is the column being debugged,
# traces the classification of value v on /dev/stderr.
function note_type(i, v, label)
{
  tested[i] = 1
  if ( i == debug )
  {
    print "addtypes> NR: " NR \
      " , value: " v " , type: " label > "/dev/stderr"
  }
}

# Refines the type of column i given a new non-null value v.
# Note: order is important in the following sequence of fall-through
# tests on data types. Each test is tried only if the column is still
# untested or currently holds a type the candidate is compatible with.
function classify(i, v,    part, decimals, width)
{
  # Date: Year.MM.DD
  if ( (col_t[i] == "D" || !tested[i]) && is_date(v) )
  {
    col_t[i] = "D"
    note_type(i, v, "D")
    return
  }

  # Time: HH:MM:SS
  if ( (col_t[i] == "T" || !tested[i]) && is_time(v) )
  {
    col_t[i] = "T"
    note_type(i, v, "T")
    return
  }

  # Currency. A Float column may be promoted only if every value it has
  # accepted so far would also have been valid in a Currency column.
  if ( (col_t[i] == "C" || (col_t[i] ~ /^F/ && !cur_bad[i]) \
        || !tested[i]) && is_currency(v) )
  {
    col_t[i] = "C"
    note_type(i, v, "C")
    return
  }

  # Float
  if ( (col_t[i] ~ /^[CFI]/ || !tested[i]) && v ~ /^[-+]?[0-9]+\.[0-9]+$/ )
  {
    split(v, part, ".")
    decimals = length(part[2])

    if ( col_t[i] != "C" )
    {
      if ( col_t[i] == "I" )
      {
        # Integer column turning into Float: carry over the widest
        # integer seen so far, and remember that plain integers are
        # not valid Currency values.
        cur_bad[i] = 1
        if ( int_w[i] > sx_w[i] ) sx_w[i] = int_w[i]
      }
      if ( !is_currency_float(part[1], decimals) ) cur_bad[i] = 1

      if ( decimals > precision[i] ) precision[i] = decimals

      # In floats sx_w[i] is the length of the integer part only.
      width = length(part[1])
      if ( width > sx_w[i] ) sx_w[i] = width

      col_t[i] = "F." precision[i]
      note_type(i, v, "F." precision[i])
      return
    }

    # The column is Currency: an ungrouped amount is still acceptable.
    if ( is_currency_float(part[1], decimals) )
    {
      tested[i] = 1
      return
    }
  }

  # Integer
  if ( (col_t[i] ~ /^[FI]/ || !tested[i]) && v ~ /^[-+]?[0-9]+$/ )
  {
    width = length(v)
    if ( col_t[i] ~ /^F/ )
    {
      # Integer is compatible with Float, but not with Currency.
      if ( width > sx_w[i] ) sx_w[i] = width
      cur_bad[i] = 1
    }
    else
    {
      col_t[i] = "I"
      if ( width > int_w[i] ) int_w[i] = width
    }
    note_type(i, v, "I")
    return
  }

  # Default is String.
  col_t[i] = "S"
  note_type(i, v, "S")
}

{
  # Copy the data row (with every field cut to 'trunc' characters, if
  # requested) to the temporary table body.
  if ( trunc )
  {
    out_rec = substr($1, 1, trunc)
    for ( i = 2; i <= NF; i++ ) out_rec = out_rec OFS substr($i, 1, trunc)
    print out_rec > table_tmp
  }
  else print > table_tmp

  for ( i = 1; i <= NF; i++ )
  {
    # NULL fields do not modify the datatype.
    if ( $i == NULL ) continue

    value = $i
    field_width = length(value)
    if ( field_width > col_w[i] ) col_w[i] = field_width
    if ( trunc && col_w[i] > trunc )
    {
      # Classify what was actually written to the output row.
      col_w[i] = trunc
      value = substr(value, 1, trunc)
    }

    # Once a column is String it can never become anything else.
    if ( col_t[i] != "S" ) classify(i, value)
  }
}

# Builds the column definition line for columns 1..ncols from the global
# arrays col_w (widths), col_t (types), sx_w (integer-part widths of
# Float columns) and precision (decimals of Float columns). Entries are
# "<width><type>" joined by OFS. As a side effect, col_w/col_t are
# updated with the final values.
function definition_line(ncols,    c, rec, needed)
{
  rec = ""
  for ( c = 1; c <= ncols; c++ )
  {
    # Compute actual length if Float: the widest integer part and the
    # largest precision may come from different rows. Columns that only
    # were Float at some earlier point keep their observed width.
    if ( col_t[c] ~ /^F/ && sx_w[c] )
    {
      needed = sx_w[c] + precision[c] + 1
      if ( needed > col_w[c] ) col_w[c] = needed
    }

    # Yet unassigned fields (all values NULL) default to "0S"; this
    # applies to the first column as well.
    if ( col_t[c] == NULL ) { col_w[c] = 0 ; col_t[c] = "S" }

    if ( c == 1 ) rec = col_w[c] col_t[c]
    else rec = rec OFS col_w[c] col_t[c]
  }
  return rec
}

# After the last record: print the column definition line on STDOUT.
END {
  if ( NR < 3 )
  {
    # The table comprises only the header: size every column from the
    # last record read and declare it String.
    for ( i = 1; i <= NF; i++ ) { col_w[i] = length($i); col_t[i] = "S" }
  }

  # Print column definition line.
  print definition_line(NF)
}

