/* Make an index file from raw index entries.
 *
 * $Id: mkindex.c,v 1.6 2003/03/28 05:10:22 grog Exp $
 *
 */
#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#define NRECS	16384					    /* number of records we can handle in core */
#define BIGBUF	1024
#define ENTLEN	128					    /* maxiumum entry length */
#define MIN_KW	9					    /* minimum length of keyword */
#define SUBS	8					    /* this many substrings (see below) */

int romannumber (char *text);				    /* these are in roman.c */
char *toroman (int number);

char outbuf [BIGBUF];
int outptr;						    /* pointer in outbuf */
char inbuf [NRECS] [BIGBUF];
char keyword [BIGBUF];
int keywordlen;

#define debug 0						    /* for testing */

#if debug
#define dprintf(a) printf (a)
#else
#define dprintf(a)
#endif

struct ref;
struct ref
{
  struct ref *next;
  int firstpage;
  int lastpage;
  int isroman;						    /* this is a roman number (ugh) */
  }
reference;
struct ref *refptr;
struct ref *lastref = &reference;

/*
 * To ensure correct indentation, we maintain a list of substrings
 * representing the text up to a certain level of commas.  For
 * example,
 *
 *   file system, ufs, creating
 *
 * would consist of the substrings
 *
 *   file system
 *   file system, ufs
 *   file system, ufs, creating
 *
 * Store them here.
 */
char substring [SUBS] [ENTLEN];				    /* substrings of current  */
int substrings;						    /* and number currently active */
char letter;						    /* letter for heading  */

void put_line ()
{
  char *s;						    /* pointer in substring */
  char *rest;						    /* differing rest from last entry */
  int i;						    /* level of substring */
  int common;						    /* levels in common with the previous */
  int len;						    /* length of possible substring */

  if (*keyword)
    {

    /*
     * Find out if there are commas in the keyword.  If so, it may be
     * time to indent.
     */
    s = keyword;
    rest = s;
    common = 0;						    /* nothing in common yet */
    for (i = 0; i < SUBS; i++)
      {
      while (*s && *s != ',')				    /* look for the next comma */
	s++;
      len = s - keyword;				    /* length of possible substring */

      if (*s)						    /* not at end yet, */
	{
	s++;						    /* point past the comma */
	while (*s == ' ')
	  s++;						    /* and skip any blanks */
	}
      if ((i >= substrings) || memcmp (substring [i], keyword, len)) /* not the same */
	{
	memcpy (substring [i], keyword, len);		    /* put in this keyword */
	substring [i] [len] = '\0';			    /* and delimit string */
	substrings = i + 1;				    /* we currently have this many */
	}
      else
	{
	common = i + 1;					    /* this many levels in common */
	rest = s;					    /* and this is the uncommon text */
	}
      if (! *s)						    /* got to the end, */
	break;
      }
    if (isalpha (*keyword) && (toupper (*keyword) != letter)) /* changed initial letter */
      {
      letter = toupper (*keyword);
      printf (".Indexletter %c\n", letter);
      }

    /* XXX */
    switch (common)
      {
    case 0:
      printf (".Indexent0 \"%s\" \"", rest);
      break;

    case 1:
      printf (".Indexent1 \"%s\" \"%s\" \"", substring [0], rest);
      break;

    case 2:
      printf (".Indexent2 \"%s\" \"%s\" \"", substring [1], rest);
      break;

    default:
      printf (".Indexent%d \"%s\" \"", common, rest);
      }

    refptr = reference.next;				    /* go through the references */
    /* This will break if we have no references.  Do we care? */
    while (refptr)
      {
      char *oldref = (char *) refptr;

      if (refptr->isroman)				    /* roman number */
	{
	if (refptr->lastpage != refptr->firstpage)	    /* range */
	  printf ("%s\\(en%s",
		  toroman (refptr->firstpage),
		  toroman (refptr->lastpage));
	else
	  printf ("%s", toroman (refptr->firstpage));	    /* only one */
	}
      else
	{
	if (refptr->lastpage != refptr->firstpage)	    /* range */
	  printf ("%d\\(en%d",
		  refptr->firstpage,
		  refptr->lastpage );
	else
	  printf ("%d", refptr->firstpage);		    /* only one */
	}
      if (refptr->next)					    /* more to come after this one, */
	printf (", ");
      else
	printf ("\"\n");
      refptr = refptr->next;				    /* get off the branch */
      free (oldref);					    /* and chop it off */
      }
    reference.next = NULL;				    /* no refs any more */
    }
  }

void insert_ref (char *start)
{
  int pagenumber;
  int isroman = 0;					    /* until proven otherwise */
  struct ref *lastref = &reference;			    /* last reference */
  struct ref *thisref = reference.next;			    /* pointer in list of references */

  if (isdigit (*start))					    /* normal number */
    {
    pagenumber = atoi (start);				    /* convert it */
    while (thisref					    /* look for a place to lay our egg */
	   && (thisref->isroman
	       || (thisref->lastpage < pagenumber) ) )	    /* move on */
      {
      if (thisref->lastpage == pagenumber - 1)		    /* we can coalesce */
	{
	thisref->lastpage = pagenumber;			    /* update last page number */
	return;						    /* and out */
	}
      lastref = thisref;
      thisref = thisref->next;
      }
    }
  else
    {
    isroman = 1;
    pagenumber = romannumber (start);
    while (thisref
	   && thisref->isroman				    /* look for a place to lay our egg */
	   && (thisref->lastpage < pagenumber) )	    /* move on */
      {
      if (thisref->lastpage == pagenumber - 1)		    /* we can coalesce */
	{
	thisref->lastpage = pagenumber;			    /* update last page number */
	return;						    /* and out */
	}
      lastref = thisref;
      thisref = thisref->next;
      }
    }
  lastref->next = (struct ref *) malloc (sizeof (struct ref)); /* allocate a reference */
  lastref = lastref->next;				    /* and point to it */
  lastref->next = thisref;
  lastref->isroman = isroman;
  lastref->firstpage = pagenumber;
  lastref->lastpage = pagenumber;
  }

/*
 * Auxiliary function for collate().
 * This is a mess.  Our input lines contain a single page number at
 * the end.  That's not part of the sorting order.  Fix this by noting
 * the last character for each that we want to be part of the sort.
 */

void trimlast (char *c)
{
  char *a = &c [strlen (c) - 1];			    /* last char */
  while (*a == ' ')					    /* skip any trailing blanks */
    a--;
  while (*a != ' ')					    /* skip line number */
    a--;
  while (*a == ' ')					    /* skip intervening blanks */
    a--;
  a [1] = '\0';						    /* chop off at first char after string */
  }

/*
 * sort function for mergesort. This does a word-by-word sort which
 * ignores case and special characters.  Case will still prevail if
 * it's the only difference, since the input stream has been sorted in
 * the normal ASCII collating sequence, and mergesort maintains order
 * unless told differently.
 */

#define BIGINDEX 1024					    /* largest index entry */
int collate (const void *a, const void *b)
{
  char ac [BIGINDEX];
  char bc [BIGINDEX];
  char *first = ac;
  char *second = bc;
  char f;						    /* pointer in first string */
  char s;						    /* and in second */
  int casediff = 0;					    /* set if we have only a case difference */

  strcpy (ac, a);
  strcpy (bc, b);
  trimlast (ac);					    /* limit length to what we want to to examine */
  trimlast (bc);					    /* limit length to what we want to to examine */
#if debug
  printf ("collate '%s' '%s': ", ac, bc);
#endif
  while (1)
    {
    while (*first
	   && ! isalpha (*first)
	   && ! isdigit (*first)
	   && (*first != ',')
	   && (*first != '-')
	   && (*first != ' ') )
      first++;
    f = *first;
    if (islower (f))
      f = toupper (f);
    while (*second
	   && ! isalpha (*second)
	   && ! isdigit (*second)
	   && (*second != ',')
	   && (*second != '-')
	   && (*second != ' ') )			    /* skip to letter, digit, space or comma */
      second++;
    s = *second;
    if (islower (s))
      s = toupper (s);
    if ((f == s)					    /* same except maybe for case */
	&& (! casediff) )
      casediff = *first - *second;
    if (f == '\0')					    /* ran out of first one */
      {
      dprintf ("a shorter\n");
      if (s)
	return -1;					    /* s wins */
      else
	return casediff;
      }
    if (s == '\0')					    /* ran out of second */
      {
      dprintf ("b shorter\n");
      return 1;						    /* first is larger */
      }
    if (f < s)
      {
      if (s == ',')					    /* comma is special, */
	{
	dprintf ("comma, a > b\n");
	return 1;					    /* pretend to be smaller */
	}
      else
	{
	dprintf ("a < b\n");
	return -1;
	}
      }
    else if (f > s)
      {
      if (f == ',')					    /* comma is special, */
	{
	dprintf ("comma, a < b\n");
	return -1;					    /* pretend to be smaller */
	}
      else
	{
	dprintf ("a > b\n");
	return 1;
	}
      }
    /* Same so far.  Move on and try again. */
    first++;
    second++;
    }
  }

int main (int argc, char *argv [])
{
  int kwend;						    /* copy indices */
  int pnstart;
  int pnend;
  int recs = 0;						    /* number of entries */
  int i;

  if (argc > 1)
    {
    printf ("%s: this program is a filter, no arguments please\n", argv [0]);
    exit (1);
    }

  while (fgets (inbuf [recs], BIGBUF, stdin))
    if (++recs > NRECS)
      {
      fprintf (stderr, "Too many records: %d\n", NRECS);
      exit (1);
      }

  /*
   * The records we read in are already in some sort of order, but we
   * want to ignore case and leading / and things.
   */
  if (mergesort (inbuf, recs, BIGBUF, &collate))
    {
    fprintf (stderr, "Can't sort entries: %s (%d)\n", strerror (errno), errno);
    exit (1);
    }
#if debug
  for (i = 0; i < recs; i++)
    puts (inbuf [i]);
#endif

  puts (".Index-head");
  for (i = 0; i < recs; i++)
    {
    if (inbuf [i] [0] != ',')				    /* don't take broken entries (from man pages) */
      {
      pnend = strlen (inbuf [i]) - 1;			    /* point to end of entry */
      for (pnstart = pnend - 1;				    /* find blank after keyword */
	   (inbuf [i] [pnstart] == '\n') || isalnum (inbuf [i] [pnstart]);
	   pnstart--);

      for (kwend = pnstart;
	   inbuf [i] [kwend] == ' ';
	   kwend--);
      pnstart++;					    /* point to page number */
      kwend++;						    /* point past keyword */
      if ((kwend != keywordlen)				    /* keywords are different length */
	  || memcmp (inbuf [i], keyword, kwend) )	    /* or they're different */
	{
	put_line ();					    /* print previous line */
	memcpy (keyword, inbuf [i], kwend);		    /* save new keyword */
	keyword [keywordlen = kwend] = '\0';		    /* and delimit it */
	}
      if (kwend > 0)					    /* more than just the tag? */
	insert_ref (&inbuf [i] [pnstart]);		    /* insert a reference */
      }
    }
  put_line ();						    /* show it */
  puts (".Indexend");
  return 0;
  }