/* xslt-sort.c -- Sample code for doing XSLT style text collation.
 *
 * Written by Behdad Esfahbod, July 2005.
 * Not copyrighted, in Public Domain
 *
 *
 *
 * This program when run, reads lines from the standard input, sorts them
 * and prints them back in standard output.
 *
 * By default, it sorts in ascending order.  If passed the -d option, it
 * sorts in descending order.  The case-order is by default the locale's
 * default order.  If -l is passed, lower case is sorted before upper case,
 * and if -u is passed, upper is sorted before lower.  Note that this is
 * independent of the ascending/descending order.  If an argument is passed on
 * the command line, that would be used as the name of the locale to use,
 * otherwise the current locale is used.
 *
 * To show some of the subtleties of the process, note that if you get the
 * following strings in the ascending order:
 *
 *   behdad
 *   Behdad
 *   zzzzzz
 *
 * then the descending order should be:
 *
 *   zzzzzz
 *   behdad
 *   Behdad
 *
 * and NOT:
 *
 *   zzzzzz
 *   Behdad
 *   behdad
 *
 * The part of XSLT specification about this is here:
 * 
 *   http://www.w3.org/TR/xslt#sorting
 *
 *
 *
 * Here is the algorithm I have implemented:
 *
 * - Find the locale case ordering, which means whether behdad is sorted
 * before Behdad or after.  To do that, I pick a letter from the alphabet of
 * the locale, do a toupper() and a tolower() of that (working with the wide
 * characters of course), and then collate the lower and upper strings and
 * name that the locale case order.  But for doing this you need a letter from
 * the alphabet of the locale.  For that I use the first character of the name
 * of the first month of the locale!  There are locales that have month names
 * starting with digits, like Japanese, but no locale I know that has case
 * ordering has such month names.  If the alphabet of the locale is caseless,
 * the order of Latin letters is used instead.  This functionality is
 * implemented in find_locale_case_order().
 *
 * - Now given a set of strings, a locale to use, an ascending/descending
 * preference, and a lower-first/upper-first/locale's-default case-ordering
 * preference, I do:
 *
 *   o If case-ordering preference is set to locale's default, set it to
 *   locale's case-ordering.
 *
 *   o If descending sort is asked for, invert the case-ordering preference.
 *
 *   o Finally, if requested and locale case-ordering differ, set the "reversed"
 *   variable to true, otherwise set it to false.
 *
 *   o Choose one of the four compare functions based on ascending/descending
 *   and reversed/not-reversed options.
 *
 * The compare functions are:
 *
 *   o ascending, not reversed:  Simply calls strcoll (or in fact strcmp on
 *   strxfrm'ed strings.)
 *
 *   o descending, not reversed:  Uses the one from previous case but negates
 *   the output.
 *
 *   o descending, reversed:  Uses the one from the next case but negates the
 *   output.
 *
 *   o ascending, reversed:  This is the most tricky one.  First compares the
 *   tolower'ed versions of the strings, if they are not equal, returns that.
 *   If they are equal, now compares the strings themselves and returns the
 *   output.
 *
 * That's all!
 *
 *
 *
 * WARNING:  This is just a proof-of-concept program.  I have tried to write
 * something useful.  Make sure you read and understand the code before
 * depending on it.
 */




/* header */




struct str_entry {
  char *s;    /* the actualy string */
  void *data; /* reserved for local use of xslt_sort */
};

void
xslt_sort (struct str_entry *entries,
	   int n_entries,
	   const char *locale,
	   int descending,
	   int case_order);




/* implementation */




#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <langinfo.h>
#include <locale.h>
#include <wchar.h>
#include <wctype.h>

/* Finds the case ordering of locale.
 * returns < 0 if lower case is collated before upper case,
 *         > 0 if lower case is collated after upper case.
 *
 * Note that, if it looks like the script of the locale has no
 * upper/lower case letters, then the order of the Latin script
 * is returned.
 *
 * If locale is NULL, it is assumed that the LC_ALL is already
 * set to a non-empty string.
 */
static int find_locale_case_order (const char *locale)
{
  int case_order;
  char *old_LC_ALL;

  char *sample;
  wchar_t sample_char, sample_lower[2], sample_upper[2];

  if (locale)
    {
      old_LC_ALL = setlocale (LC_ALL, NULL);
      setlocale (LC_ALL, locale);
    }

  case_order = 0;

  /* Get a sample letter from the alphabet of the locale */
  sample = nl_langinfo (MON_1);

  mbtowc (NULL, NULL, 0);
  if (-1 == mbtowc (&sample_char, sample, strlen (sample)))
    goto out;
  
  /* Get lower and upper variants of the sample letter */
  sample_lower[0] = towlower (sample_char);
  sample_upper[0] = towupper (sample_char);
  if (sample_lower == sample_upper)
    goto out;
  sample_lower[1] = sample_upper[1] = 0;

  /* We've got two different strings for lower and upper to compare now */
  case_order = wcscoll (sample_lower, sample_upper);

out:

  if (!case_order)
    case_order = strcoll ("a", "A");

  if (locale)
    {
      /* Set LC_ALL back */
      setlocale (LC_ALL, old_LC_ALL);
    }

  return case_order;
}


struct str_xfrm_entry {
  char *s;
  char *xfrm;
};

struct str_case_order_data {
  char *xfrm;
  wchar_t *wxfrm_lower;
};


/* The four compare functions: */

static int
str_compare_func (const void *a, const void *b)
{
  return strcmp (((struct str_xfrm_entry *)a)->xfrm,
                 ((struct str_xfrm_entry *)b)->xfrm);
}

static int
str_compare_func_desc (const void *a, const void *b)
{
  return - str_compare_func (a, b);
}

static int
str_compare_func_reversed_case (const void *a, const void *b)
{
  struct str_case_order_data *da =
    (struct str_case_order_data *)(((struct str_entry *)a)->data);
  struct str_case_order_data *db =
    (struct str_case_order_data *)(((struct str_entry *)b)->data);
  int order;

  /* case-insensitive comparison first */
  order = wcscmp (da->wxfrm_lower, db->wxfrm_lower);
  if (order)
    return order;

  /* if equal, go case-sensitive and return the inverse */
  return - strcmp (da->xfrm, db->xfrm);
}

static int
str_compare_func_desc_reversed_case (const void *a, const void *b)
{
  return - str_compare_func_reversed_case (a, b);
}



/* returns the strxfrm(.) of the input string, in newly allocated memory */
static char *
strxfrm_dup (const char *src, int len)
{
  int ret;
  char *buf = NULL;

  len = len / 2 + 2;

  do {
    len <<= 1;
    free (buf);
    buf = malloc (len);
    ret = strxfrm (buf, src, len);
  } while (ret >= len);

  return buf;
}

/* returns the wcsxfrm(.) of the input string, in newly allocated memory */
static wchar_t *
wcsxfrm_dup (const wchar_t *src, int len)
{
  int ret;
  wchar_t *buf = NULL;

  len = len / 2 + 2;

  do {
    len <<= 1;
    free (buf);
    buf = malloc (len * sizeof (buf[0]));
    ret = wcsxfrm (buf, src, len);
  } while (ret >= len);

  return buf;
}


void
xslt_sort (struct str_entry *entries,
	   int n_entries,
	   const char *locale,
	   int descending,
	   int case_order)
{
  char *old_LC_ALL = NULL;
  int case_order_locale = 0;
  int (*compare_func) (const void *, const void *);
  int reversed;
  struct str_case_order_data *case_order_data_array = NULL;
  int i;

  if (locale)
    {
      old_LC_ALL = setlocale (LC_ALL, NULL);
      setlocale (LC_ALL, locale);
    }

  /* Find out case order and from that the "reversed" */

  if (descending || case_order)
    case_order_locale = find_locale_case_order (NULL);

  if (!case_order)
    case_order = case_order_locale;

  if (descending)
    case_order = - case_order;

  reversed = case_order_locale && case_order * case_order_locale < 0;



  if (reversed)
    {
      /* If reversed, work with wide strings, it's cheaper,
       * since we don't have to convert back to multibyte after
       * towlower()ing.
       */

      compare_func = descending ? str_compare_func_desc_reversed_case
				: str_compare_func_reversed_case;

      case_order_data_array = malloc (n_entries *
				      sizeof (case_order_data_array[0]));

      for (i = 0; i < n_entries; i++)
        {
	  wchar_t *wcs;
	  int len;
	  int j;

	  entries[i].data = (void *)(case_order_data_array + i);

          len = mbstowcs (NULL, entries[i].s, 0);
	  if (len < 0)
	    len = 0;

	  wcs = malloc ((len + 1) * sizeof (wcs[0]));

	  len = mbstowcs (wcs, entries[i].s, len);
	  if (len < 0)
	    {
	      len = 0;
	      wcs[0] = 0;
	    }

	  for (j = 0; j < len; j++)
	    wcs[j] = towlower (wcs[j]);

          case_order_data_array[i].wxfrm_lower =
	          (void *)wcsxfrm_dup (wcs, len);

	  free (wcs);

          case_order_data_array[i].xfrm =
	          (void *)strxfrm_dup (entries[i].s, strlen (entries[i].s));
	}
    }
  else
    {
      /* If not reversed, work with multibyte strings, easier. */

      compare_func = descending ? str_compare_func_desc
				: str_compare_func;

      for (i = 0; i < n_entries; i++)
        {
	  entries[i].data =
	          (void *)strxfrm_dup (entries[i].s, strlen (entries[i].s));
        }
    }
  

  qsort (entries, n_entries, sizeof (entries[0]), compare_func);


  if (reversed)
    {
      for (i = 0; i < n_entries; i++)
        {
          free (case_order_data_array[i].xfrm);
          free (case_order_data_array[i].wxfrm_lower);
          entries[i].data = 0;
	}
      free (case_order_data_array);
    }
  else
    {
      for (i = 0; i < n_entries; i++)
        {
          free (entries[i].data);
          entries[i].data = 0;
	}
    }

  if (locale)
    {
      /* Set LC_ALL back */
      setlocale (LC_ALL, old_LC_ALL);
    }
}




/* driver */




char buf[64000];

int
main(int argc, char **argv)
{
  struct str_entry *lines = NULL;
  int lines_buf_size = 0, lines_num = 0;
  int i;
  int descending = 0;
  int case_order = 0;
  const char *locale = "";


  while (-1 != (i = getopt (argc, argv, "dlu")))
    switch (i) {

      case 'd':
        descending = 1;
	break;
    
      case 'l':
        case_order = -1;
	break;

      case 'u':
        case_order = 1;
	break;

    }

  if (optind < argc)
    locale = strdup (argv[optind++]);
  
  while (fgets (buf, sizeof (buf), stdin))
    {
      lines_num++;
      if (lines_num > lines_buf_size)
        {
	  lines_buf_size = (lines_buf_size + 1) * 2 - 1;
	  lines = realloc (lines, lines_buf_size * sizeof (lines[0]));
	}
      lines[lines_num - 1].s = strdup (buf);
    }

  if (!setlocale (LC_ALL, locale)) {
    fprintf (stderr, "Error setting locale to %s\n", locale);
    exit (1);
  }

  xslt_sort (lines, lines_num, locale, descending, case_order);

  for (i = 0; i < lines_num; i++)
    fputs (lines[i].s, stdout);

  return 0;
}
