#****************************************************************************
#  ##   ##         #####   #####  ##     **        NoSQL RDBMS - join       *
#  ###  ##        ####### ####### ##     **      $Revision: 2.1 $			*
#  #### ##        ###     ##   ## ##     ************************************
#  #######  ####  #####   ##   ## ##     **      Carlo Strozzi (c) 1998     *
#  ####### ######   ##### ## # ## ##     ************************************
#  ## #### ##  ##     ### ##  ### ##     **           Written by            *
#  ##  ### ###### ####### ######  ###### **          Carlo Strozzi          *
#  ##   ##  ####   #####   #### # ###### **     e-mail: carlos@linux.it     *
#****************************************************************************
#   NoSQL RDBMS, Copyright (C) 1998 Carlo Strozzi.                          *
#   This program comes with ABSOLUTELY NO WARRANTY; for details             *
#   refer to the GNU General Public License.                                *
#****************************************************************************
#
#  Joins two tables on a common field.
#
#  Joins two tables on a common field, using the unix join(1) program.
#  The two tables must be sorted on the respective join fields for
#  the operation to function correctly.
#
#  Usage: nosql join [-x] [-J] [-a 1|2] [-j column] [-1 column_1] \
#         [-2 column_2] table_1 table_2
#
#  Options:
#
#  -j|--column column
#		join on the column from both files.
#
#  -a|--all 1|2
#       In addition to normal output, produce a line for each
#       unpairable line in file 1 or 2 respectively.
#
#  -J|--suppress-join-column
#       Exclude the join column (leftmost column) from the output table.
#
#  -x|--debug
#       Debug. Prints the join(1) command line to STDERR.
#
#  Either one or the other, but not both, of the two input tables must
#  be specified as '-', meaning STDIN. See join(1) for more details
#  on the meaning of each option. 
#
#  If only one of '-1' or '-2' is specified, then the other one
#  defaults to the first (leftmost) field of the associated table.
#
#  Either '-j' or the '-1,-2' options should be specified. If they
#  are both present, 'join' will try to apply them in a sensible
#  fashion, i.e. if '-j' comes last it will override both '-1' and
#  '-2', while if '-j' comes first then '-1,-2' will override
#  the '-j' specification only for the associated table.
#
#  If neither '-j' nor '-1|-2' are specified, then the two tables
#  are joined on their respective first (leftmost) fields.
#
#  If any of the two input tables contain duplicated columns, i.e.
#  columns with the same name but possibly different values, only the
#  first (leftmost) one is taken into account. The output will still have
#  the duplicates, but this time with equal column values.
#
#  The following names are reserved to the awk language, and should not
#  be used to indicate column names:
#
#  BEGIN, END, break, continue, else, exit, exp, for, getline, if, in,
#  index, int, length, log, next, print, printf, split, sprintf, sqrt,
#  substr, while, and possibly others, depending on the implementation
#  of your awk (i.e. mawk, gawk, etc.). Refer to the man page and the
#  documentation of your awk interpreter.
#
#  This NoSQL operator reads two tables, one from STDIN and the other
#  one from a file, and writes a table to STDOUT.
#
########################################################################

########################################################################
# BEGIN block
########################################################################

BEGIN \
{
  NULL = ""; FS = OFS = "\t";

  # Join columns should default to non-existent names.
  c_names[1] = c_names[2] = "---"

  # Get command line arguments.
  split( __nosql_args, args, " " )

  # Separate join(1) options from column names.

  while ( args[++i] != NULL )
  {
	# Turn long options into their short form.
	if ( args[i] == "--all" ) args[i] = "-a"
	else if ( args[i] == "--column" ) args[i] = "-j"
	else if ( args[i] == "--suppress-join-column" ) args[i] = "-J"
	else if ( args[i] == "--debug" ) args[i] = "-x"

	# Now process each option in turn.
	if ( args[i] == "-j" ) c_names[1] = c_names[2] = args[++i]
	else if ( args[i] == "-J" ) no_jcol = 1
	else if ( args[i] == "-x" ) debug = 1
	else if ( args[i] == "-1" ) c_names[1] = args[++i]
	else if ( args[i] == "-2" ) c_names[2] = args[++i]
	else if (  sub( /^-a */, NULL, args[i] ) )
	{
	  if ( args[i] == NULL ) join_args = join_args " -a " args[++i]
	  else join_args = join_args " -a " args[i]
	}
	else if ( args[i] !~ /^-/ || args[i] == "-" )
	{
	  j_tables = j_tables " " args[i]
	}
	else join_args = join_args " " args[i]
  }

  split( j_tables, tbl, " " )

  # Only one table may be on stdin.
  if ( tbl[1] == "-" && tbl[2] == "-" ) exit 1

  # Check that mandatory temporary file name is set.
  if ( __nosql_tmpfile == NULL ) exit 2

  # Get column names from whichever table is _not_ on stdin.
  if ( tbl[1] != "-" )
  {
	getline < tbl[1]		# Column names
	hdr_1 = $0

	# Get column names and positions.
	# Make sure we pick the first occurrence of duplicated column
	# positions (it may happen after a join).

	while ( ++p <= NF )
  	{
	  if ( P1[$p] == NULL ) P1[$p] = p
	  N1[p] = $p
	}

	j1 = P1[ c_names[1] ]

	# Default to 1st column on invalid column name.
	if ( j1 == NULL )
	{
	  c_names[1] = N1[1]
	  j1 = P1[ c_names[1] ]
	}
	
	getline < tbl[1]		# Dasline

	# Now write the body of tbl[1] to file __nosql_tmpfile

	while ( getline < tbl[1] > 0 ) print > __nosql_tmpfile
	close( tbl[1] ) ; close( __nosql_tmpfile )

	# Set new table file name.
	tbl[1] = __nosql_tmpfile
  }
  else
  {
	getline < tbl[2]		# Column names
	hdr_2 = OFS $0 OFS

	# Get column names and positions.
	# Make sure we pick the first occurrence of duplicated column
	# names (it may happen after a join).

	while ( ++p <= NF )
	{
	  if ( P2[$p] == NULL ) P2[$p] = p
	  N2[p] = $p
	}

	j2 = P2[ c_names[2] ]

	# Default to 1st column on invalid column name.
	if ( j2 == NULL )
  	{
	  c_names[2] = N2[1]
	  j2 = P2[ c_names[2] ]
	}
	
	getline < tbl[2]		# Dashline

	# Now write the body of tbl[2] to file __nosql_tmpfile

	while ( getline < tbl[2] > 0 ) print > __nosql_tmpfile
	close( tbl[2] ) ; close( __nosql_tmpfile )

	# Set new table file name.
	tbl[2] = __nosql_tmpfile
  }
}

########################################################################
# Main loop
########################################################################

# Get column names from whichever table _is_ on stdin.
NR == 1 \
{
  p = 0
  if ( tbl[1] == "-" )
  {
    hdr_1 = $0

    # Get column names and positions.
	# Make sure we pick the first occurrence of duplicated column
	# positions (it may happen after a join).

    while ( ++p <= NF )
	{
	  if ( P1[$p] == NULL ) P1[$p] = p
	  N1[p] = $p
	}

    j1 = P1[ c_names[1] ]

	# Default to 1st column on invalid column name.
	if ( j1 == NULL )
	{
	  c_names[1] = N1[1]
	  j1 = P1[ c_names[1] ]
	}
  }
  else
  {
    hdr_2 = OFS $0 OFS

    # Get column names and positions.
    while ( ++p <= NF ) { P2[$p] = p; N2[p] = $p }

    j2 = P2[ c_names[2] ]

    # Default to 1st column on invalid column name.
    if ( j2 == NULL )
	{
	  c_names[2] = N2[1]
	  j2 = P2[ c_names[2] ]
    }
  }

  # Remove j2 from hdr_2.
  regexp = OFS c_names[2] OFS
  gsub( regexp, OFS, hdr_2 )

  # Now print output header and dashline.
  hdr1_size = split( hdr_1, a )
  hdr_1 = NULL

  for ( x = 1; x <= hdr1_size; x++ )
  {
	if ( a[x] == c_names[1] ) first_field = a[x]
	else ( hdr_1 = hdr_1 OFS a[x] )
  }

  if ( no_jcol ) sub( /^\t/, "", hdr_1 )
  else hdr_1 = first_field hdr_1

  out_rec = hdr_1 hdr_2
  sub( /\t*$/, NULL, out_rec )
  print out_rec; gsub( /[^\t]/, "-", out_rec ); print out_rec

  # Build output field list for join(1).
  if ( ! no_jcol ) field_list = " 1." P1[ N1[ j1 ] ]

  while ( N1[++c] != NULL )
  {
	if ( N1[c] !="." && c != j1 )
	  field_list = field_list ",1." P1[ N1[c] ]
  }

  if ( no_jcol ) sub( /^,/, " ", field_list )
  c = 0
  while ( N2[++c] != NULL )
  {
	if ( N2[c] != "." && c != j2 )
	  field_list = field_list ",2." P2[ N2[c] ]
  }

  join_args = join_args " -o " field_list
  join_args = join_args " -1 " j1 " -2 " j2 " " tbl[1] " " tbl[2]
  join_cmd = "join -t \"\011\" " join_args

  if ( debug ) print join_cmd > "/dev/stderr"

  # Make sure we print the header before calling join(1).
  fflush()
}

NR > 2 { print |join_cmd }

