NT4/private/windows/media/tools/extract/extract.c

/*
 * EXTRACT.C
 *
 * Documentation extractor.  Extracts tagged comment blocks from source
 * code, interprets and reformats the tag definitions, and outputs an
 * intermediate level 2 tag file, suitable for processing by a final
 * formatting tool to coerce the level 2 tags into something appropriate
 * for the presentation medium (paper, WinHelp RTF, Ventura, etc).
 *
 */

#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <assert.h>
#include "extract.h"
#include "tags.h"
#include "version.h"
#if MMWIN
#include <mmsysver.h>
#endif

/* Whether to do any output at all?  */
BOOL	fNoOutput	= False;
/*  The output file to use if not stdout */
PSTR	szOutputFile	= NULL;
/*  The actual output file pointer  */
FILE	*fpOutput;

/*
 *  File-private procedure templates
 */
void ProcessSourceFile( NPSourceFile sf );
void AppendLineToBuf(NPSourceFile sf, PSTR buf);
BOOL LookForCommentStart(NPSourceFile sf, PSTR buf, PSTR *nbuf);
BOOL IsTag(PSTR p);
BOOL PrepLine( NPSourceFile sf, PSTR buf, PSTR *nbuf );

/*
 *  User messages
 */
char msgStdin[] = "Using Standard Input for source text...\n";
char msgCurFile[] = "Processing file %s...\n";
char msgSyntaxCheck[] = "Syntax check only.\n";

char msgTypeMASM[] = "%s (%d): File is MASM source.\n";
char msgTypeC[] = "%s (%d): File is C source.\n";

char errOutputFile[] = "%s: Can not open output file\n";
char errInputFile[] = "%s: Can not open file.\n";
char errEOFinComment[] = "%s (%d): Premature end of file within comment block.\n";
char errRead[] = "%s (%d): Unable to read.\n";


/*
 * @doc EXTRACT
 *
 * @func int | main | This program extracts documentation information
 * from the given input file and sends it to the standard output.
 * Information is not sorted or formatted, but parsed from the
 * initial tag types to an intermediate tag output format that contains
 * full information as to tag placement within documentation/function
 * declarations.
 *
 * @rdesc The return value is zero if there are no errors, otherwise the
 * return value is a non-zero error code.
 *
 */
void main(argc, argv)
int argc;	/* Specifies the number of arguments. */
char *argv[];	/* Specifies an array of pointers to the arguments */
{
	SourceFile	sourceBuf;
	FileEntry	fileEntry;
	BOOL		fStdin = False;

	#define INITIAL_BUF	8192

#ifdef MMWIN
	/* announce our existance */
	fprintf(stderr, "%s\n", VERSIONNAME);
	fprintf(stderr, "Program Version %d.%d.%d\t%s\n", rmj, rmm, rup,
		MMSYSVERSIONSTR);
#ifdef DEBUG
	fprintf(stderr, "Compiled: %s %s by %s\n", __DATE__, __TIME__,
		szVerUser);
	fDebug = 1;
#endif
#endif

	ParseArgs(argc, argv);

	if (fNoOutput) {
	  fprintf(stderr, msgSyntaxCheck);
	  szOutputFile == NULL;
	}
	else {
	   /*  Open the output file, if one was specified.  If !szOutputFile,
	    *  then use stdout.
	    */
	   if (szOutputFile) {
	     fpOutput = fopen(szOutputFile, "w");
	     if (fpOutput == NULL) {
		fprintf(stderr, errOutputFile, szOutputFile);
		exit(1);
	     }
	   }
	   else {		/* Using stdout for output */
	     fpOutput = stdout;
	     szOutputFile = StringAlloc("stdout");
	   }

	   OutputFileHeader(fpOutput);
	}

	/*  If no files were specified on command line, use stdin.
	 *  Fake a fileEntry structure for stdin.
	 */
	if (FilesToProcess == NULL) {
		/* No files specified, use stdin */
		fileEntry.filename = StringAlloc("stdin");
		fileEntry.next = NULL;
		fileEntry.type = SRC_UNKNOWN;
		FilesToProcess = &fileEntry;
		fStdin = True;
	}

	/*
	 *  Loop over all files specified on command line
	 */
	while (FilesToProcess) {
		/*
		 *  Setup the source file access buffer
		 */
		sourceBuf.fileEntry = FilesToProcess;	// get head of list.

		/*  Open the file, except when using stdin */
		if (fStdin) {
			sourceBuf.fp = stdin;
			fprintf(stderr, msgStdin);
		}
		else {	// deal with normal file, need to open it.
			sourceBuf.fp = fopen(FilesToProcess->filename, "r");
			/* couldn't open file */
			if (!sourceBuf.fp) {
			  fprintf(stderr, errInputFile,
				  FilesToProcess->filename);
			  /* Skip to next file in list */
			  FilesToProcess = FilesToProcess->next;
			  continue;
			}

			/* Send message telling current file */
			fprintf(stderr, msgCurFile, FilesToProcess->filename);
		}

		/* Reset line numbers of input files to zero */
		sourceBuf.wLineNo = 0;
		sourceBuf.wLineBuf = 0;
		/* Setup copy buffer */
		sourceBuf.lpbuf = NearMalloc(INITIAL_BUF, False);
		sourceBuf.pt = sourceBuf.mark = sourceBuf.lpbuf;
		sourceBuf.fHasTags = sourceBuf.fTag = False;
		sourceBuf.fExitAfter = FALSE;

		ProcessSourceFile( &sourceBuf );

		if (!fStdin)
			fclose(sourceBuf.fp);
		NearFree(sourceBuf.lpbuf);
		NearFree(FilesToProcess->filename);
		FilesToProcess = FilesToProcess->next;
		/*
		 * Bail out with non-zero exit if fatal error encountered
		 */
		if (sourceBuf.fExitAfter) {
			fcloseall();
			exit(1);
		}
	}

	/*
	 *  Close output file if not stdout.
	 */
	fcloseall();
	exit(0);
}


/*
 * @doc	EXTRACT
 * @api	void | ProcessSourceFile | Process a given file, searching
 * for and extracting doc tagged comment blocks and processing and
 * outputting these comment blocks.
 *
 * @parm	NPSourceFile | sf | Specifies the source file comment block.
 * It must have a valid file pointer, and a valid buffer (lpbuf field)
 * before calling this function.  The file pointer will be open upon
 * return.
 *
 * @comm	This proc sits in a loop reading lines until it finds a
 * comment.  Once inside a comment, the lines are stripped of fuzz
 * pretty printing characters and examined for being an autodoc tagged
 * line.  If a tag is found in the comment block, the following comment
 * lines are copied into the lpbuf buffer of <p sf>, and passed to the
 * <f TagProcessBuffer> function to parse and output the tags.
 *
 */
#define LOCALBUF_SIZE	1024

void ProcessSourceFile( NPSourceFile sf )
{
  char	*buf;
  char	*pOrigBuf;
  char	*nBuf, *nBuf2;
  int	inComment;
  int	w;

  inComment = False;
  pOrigBuf = NearMalloc(LOCALBUF_SIZE, False);
  buf = pOrigBuf + 1;	// give one space of padding at beginning

  while (!feof(sf->fp)) {
    /*
     *  Grab the next line
     */
    #ifdef HEAPDEBUG
	NearHeapCheck();
    #endif

    w = (int) fgets(buf, LOCALBUF_SIZE, sf->fp);

    #ifdef HEAPDEBUG
	NearHeapCheck();
    #endif

    /*  Handle error or EOF conditions  */
    if (w == 0) {
      /*  Am i at EOF?  */
      if (feof(sf->fp)) {
	      /* Message is EOF happened while in a comment block */
	      if (inComment) {
		      /* MASM comment blocks can end on EOF,
		       * so go handle it if in a masm file.
		       */
		      if (sf->fileEntry->type == SRC_MASM) {
			      if (sf->fTag)
				      /*  This is BOGUS!! */
				      TagProcessBuffer(sf);
		      }
		      else {	// premature eof otherwise
			      fprintf(stderr, errEOFinComment,
				      sf->fileEntry->filename, sf->wLineNo);
		      }
	      }
	      /* Cause the enclosing while loop to exit on EOF */
	      continue;
      }
      else {	// error condition, bail out!
	      fprintf(stderr, errRead, sf->fileEntry->filename, sf->wLineNo);
	      goto BailOut;
      }
    }
    else {
      /*
       * Process this line - depending on current mode:
       *
       * -- CommentSearch mode:  inComment = False
       * Not currently in a comment, looking for comment begin
       * characters.  If commentBegin found, enter InsideComment
       * mode to look for end of comment and prep lines for
       * output processing.
       *
       * -- InsideComment mode:  inComment = True
       * Inside a comment block, taking each line, stripping beginning
       * whitespace, and appending to global buffer for output
       * processing.  When end of comment is found, send the entire
       * buffer for tag processing.  (only if there was a tag
       * detected!).  Enter CommentSearch mode.
       *
       */
      sf->wLineNo++;		// line count for file - now current line no.

      /*
       * I'm in InsideComment mode, so process the next line as a comment
       * line.  The magic is in PrepLine(), which strips whitespace, sets the
       * fTag flag of the sourceBuf if a tag is detected, and returns TRUE
       * when end of comment is detected.
       *
       */
      if (inComment) {
	w = PrepLine(sf, buf, &nBuf);
	AppendLineToBuf(sf, nBuf);
	if (w) {	// detected end of comment, exit in comment state
	  if (sf->fTag) {	// a tag was in the current buffer
		TagProcessBuffer(sf);
	  }

	  /* Go back to comment-search mode */
	  inComment = False;

	}
      }
      /*
       * Otherwise, I'm in CommentSearch mode, looking for a comment begin.
       * LookForCommentStart() returns TRUE when a comment start is detected.
       * It also fiddles <buf> so that the beginning of <buf> now points to
       * the character following the comment start.
       *
       * Pass to PrepLine() to detect an immediate comment close, and then
       * add this initial line to the global buffer after reseting buffer
       * status.
       *
       * Enter InsideComment mode.
       */
      else {		// not in a comment buffer
	if (LookForCommentStart(sf, buf, &nBuf)) {
	  // dprintf("Entering InsideComment mode, point is %d\n",
	  //	  (int) (sf->pt - sf->lpbuf));

	  /* Reset source file buffer status */
	  sf->fTag = sf->fHasTags = False;
	  sf->wLineBuf = sf->wLineNo;
	  sf->pt = sf->mark = sf->lpbuf;

	  /* Check for immediate comment close */
	  if (PrepLine(sf, nBuf, &nBuf2)) {
		  assert(sf->fTag == False);
		  continue;		// detected immediate end comment
	  }

	  AppendLineToBuf(sf, nBuf2);

	  /*  Enter InsideComment mode  */
	  inComment = True;
	}
	/* else, no comment start found, continue scan */
      }  // endof CommentSearch mode stuff.
    }/* else not a string read error */
  } /* file-level while loop */

BailOut:

  NearFree(pOrigBuf);

}


#define ISSPACE(c) ((c) == ' ' || (c) == '\t')

/*
 * @doc	EXTRACT
 * @api	BOOL | PrepLine | Prepares an InsideComment mode line,
 * stripping off initial whitespace and fuzz characters, and detecting
 * end of comment conditions.
 *
 * @parm	NPSourceFile | sf | Pointer to source file status buffer.
 * @parm	PSTR | buf | Pointer to beginning of source text line, as
 * read from the source file.
 * @parm	PSTR * | nbuf | Pointer to a char pointer, which is altered
 * to point the post-processed and stripped beginning of the line upon
 * procedure exit.
 *
 * @rdesc	Returns TRUE when end of comment is encountered.  In this
 * case, the end of comment characters are not included in the return
 * string.  Returns FALSE when no end of comment is detected.
 *
 * The char pointer pointed to by the <p nbuf> parameter is altered to
 * point to the new (post-processed and stripped) beginning of the line.
 * This new beginning is the beginning of the text of interest, having
 * had all comment leader characters and whitespace stripped off.  NULL
 * is an acceptable string to return, which will simply add nothing to
 * the tag buffer.  If a blank line is encountered, (ie simply a
 * newline), then the newline should be returned.
 *
 * If a tag is detected on the line, then the <p sf->fTag> flag is set
 * to True to indicate that this is a valid tagged comment block.
 *
 * @comm	This procedure does the stripping of language specific fuzz
 * characters into a simple text block.  The setting of <p sf->fTag> is
 * critical, and may be accomplished by calling the <f IsTag> procedure when
 * the tag should appear within the source line.
 *
 */
BOOL PrepLine( NPSourceFile sf, PSTR buf, PSTR *nbuf )
{
  PSTR	chClose;
  PSTR	pend;

  /* Scan forward, removing initial whitespace */
  for (; *buf && ISSPACE(*buf); buf++);

  /* I never have to deal with begin comment processing, this is done
   * by the LookForCommentStart() proc.  In C, PrepLine() is invoked on
   * the char following the '/ *'.  In MASM, the ';' is left in.
   */

  switch (sf->fileEntry->type) {
    case SRC_MASM:

	/*  End of comment check:  If this first character (after whitespace
	 *  stripped out) is not a ';', then this is the end of the comment
	 *  block.  Return TRUE to indicate this.
	 */
	if (*buf && *buf != ';') {
		*buf = '\0';
		*nbuf = buf;
		return True;
	}

	/* strip contiguous ';' and '*', followed by whitespace */
	for (; *buf && (*buf == ';' || *buf == '*'); buf++);
	for (; *buf && ISSPACE(*buf); buf++);
	if (IsTag(buf)) {
		sf->fTag = True;
		*nbuf = buf;
	}
	else {
		/* HACK!
		 * If first char is a @ (and not a tag), pad with a space
		 */
		if (*buf == TAG) {
			*(--buf) = ' ';
		}
		*nbuf = buf;
	}

	/* Very hack way of kicking out extra comments */
	if ((buf = strstr(buf, "//")) != NULL)
		*buf = '\0';

	return False;

    case SRC_C:
	/* Remove leading stars */
	for (; *buf && *buf == '*'; buf++);
	/* Quick check for close comment - */
	if (*buf && *buf == '/') {
		*buf = '\0';
		*nbuf = buf;
		return True;
	}

	/* Otherwise, remove whitespace between the '*' and the text */
	for (; *buf && ISSPACE(*buf); buf++);
	/* Check for a tag here */
	if (IsTag(buf))
		sf->fTag = True;
	else {
		/*  If not tag but a @ on first char of line  */
		if (*buf == TAG) {
			buf--;	// can do this since buf is padded by one
			*buf = ' ';
		}
	}

	/* Implement the comment scheme of Rick's request */
	if ((pend = strstr(buf, "//")) != NULL)
		*pend = '\0';

	/* And if the line hasn't ended, search line for a close comment */
	chClose = strstr(buf, "*/");
	if (chClose) {
		/* found end of comment, NULL this spot, and return from func
		 * with TRUE, with nbuf pointing the beginning of non-white
		 * space text above
		 */
		*nbuf = buf;
		*chClose = '\0';
		return True;
	}

	/* Otherwise, found no end of comment on this line, so simply
	 * return whole line
	 */
	*nbuf = buf;
	return False;

    default:
	// dprintf("Invalid source type in PrepLine()!\n");
	assert(False);
	exit(5);

  }  /* switch */

}


/*
 * @doc	EXTRACT
 * @api	BOOL | IsTag | Perform a quick and dirty check to see if the
 * word pointed to by <p p> is a tag.
 *
 * @parm	PSTR | p | Buffer, queued to the start of a word/tag.  If
 * this is a possible tag, then it must point to the initial '@'
 * character.
 *
 * @rdesc	Returns TRUE if this is probably a tag, or FALSE otherwise.
 *
 * @comm	This is a hack test, but works 99.9% of the time.
 *
 */
BOOL IsTag(PSTR p)
{
  PSTR	pbegin;

  pbegin = p;

  if (*p != TAG)
	  return False;

  /*  For this procedure, allow newline as a whitespace delimeter */

  /* Skip to next whitespace */
  for (; *p && !(ISSPACE(*p) || *p == '\n'); p++);

  /* This is a very lame test for a tag, but if the first char was
   * a '@' and there is a space following the word, then I'm going to
   * say it is a tag.
   */
  if (*p && (p > pbegin + 1) && (ISSPACE(*p) || *p == '\n'))
	  return True;

  return False;
}


/*
 * @doc	EXTRACT
 * @api	BOOL | LookForCommentStart | Search a source line for comment
 * start characters.
 *
 * @parm	NPSourceFile | sf | Pointer to the source file block
 * structure.
 * @parm	PSTR | buf | Pointer to beginning of source text file line to
 * examine.
 * @parm	PSTR * | nbuf | Pointer to a pointer that is modified to
 * indicate the beginning of the true source text line if a comment
 * block begin is found.
 *
 * @rdesc	Returns False if no comment start characters are found.
 * Returns True if a comment start is found.  If True is returned,
 * <p *nbuf> will point to the start of the source text line as it
 * should be passed to <f AppendLineToBuf>.
 *
 * This examination method for determining start of comment depends on
 * the source file type (as obtained from the fileEntry.type field of
 * <p sf>).  Unknown file types are examined and placed into one of the
 * other known source types as soon as distinguishing characters are
 * found.  (ie if '/ *' is found in an unknown, the file is marked as C
 * source file the remainder of file processing.  Note that this can
 * cause unknown file types to be incorrectly processed.)
 *
 */
BOOL LookForCommentStart(NPSourceFile sf, PSTR buf, PSTR *nbuf)
{

  /*  Skip leading whitespace  */
  for (; *buf && ISSPACE(*buf); buf++);

  if (!*buf)
	  return False;

  switch (sf->fileEntry->type) {
    case SRC_C:
	if (!*(buf + 1))
		return False;
	if ((*buf == '/') && (*(buf+1) == '*')) {
		*nbuf = buf+2;
		return True;
	}
	break;

    case SRC_MASM:
	if (*buf == ';') {
		*nbuf = buf;
		return True;
	}
	break;

    /*
     *  The catch all.  This has serious potential for disaster!
     */
    case SRC_UNKNOWN:
	/* Try the MASM comment character */
	if (*buf == ';') {
		fprintf(stderr, msgTypeMASM,
			sf->fileEntry->filename, sf->wLineNo);
		sf->fileEntry->type = SRC_MASM;
		*nbuf = buf;
		return True;
	}

	/* Otherwise, try the C-method */
	if (!*(buf + 1))
		return False;
	if ((*buf == '/') && (*(buf+1) == '*')) {
		fprintf(stderr, msgTypeC,
			sf->fileEntry->filename, sf->wLineNo);
		sf->fileEntry->type = SRC_C;
		*nbuf = buf+2;
		return True;
	}
	break;

    default:
	// dprintf("Unknown filetype identifier in sourceFile buffer.\n");
	assert(False);

  }

  return False;

}

/*
 * @doc	EXTRACT
 * @api	void | AppendLineToBuf | Appends an stripped comment line the
 * comment buffer contained in <p sf>.
 *
 * @parm	NPSourceFile | sf | Source file buffer block pointer.
 * Contains the buffer that is appended to.
 * @parm	PSTR | buf | Pointer to NULL terminated line to add to the
 * comment buffer.
 *
 * @comm	Appends <p buf> to the comment buffer, contained in the lpbuf
 * field of <p sf>.  The current point in the comment buffer, (given by
 * the pt field of <p sf>) is advanced to the end of the appended
 * string.
 *
 */
void AppendLineToBuf(NPSourceFile sf, PSTR buf)
{
  int	size;
  PSTR	ch;
  PSTR	end;

  #define GROWSIZE	1024

  if (!sf->fHasTags)
	  /* If buffer doesn't yet have tags, check if one was just
	   * found, and the copy
	   */
	  if (sf->fTag) {
		  sf->fHasTags = True;
		  sf->wLineBuf = sf->wLineNo;
	  }
	  /*  Or no tags in buffer yet, return  */
	  else {
		  *sf->pt = '\0';
		  return;
	  }

  // dprintf("AppendLineToBuf:  %d\n", (int) (sf->pt - sf->lpbuf));

  /*  Otherwise, the buffer has tags, so copy the new string  */
  end = (PSTR) (sf->lpbuf + (int) NearSize(sf->lpbuf));

  for (ch = buf; *ch && (sf->pt < end); *sf->pt++ = *ch++);
  /* Deal with possible buffer overrun */
  if (sf->pt >= end) {
	WORD	origPt;
	int	needSize;

/*	dprintf("AppendLine:  expanding buf %x, pt %x, end %x\n",
		sf->lpbuf, sf->pt, end);
*/
	origPt = (WORD) (sf->pt - sf->lpbuf);	// save current offset
	needSize = strlen(ch) + 1;		// grow by this much

	sf->lpbuf = NearRealloc(sf->lpbuf,
		       (WORD)(NearSize(sf->lpbuf) + max(needSize, GROWSIZE)));
	sf->pt = sf->lpbuf + origPt;

	/* Continue with the copy */
	for (; *ch; *sf->pt++ = *ch++);
  }

  /* make sure that final buffer is null terminated */
  *sf->pt = '\0';
}