|Number of bytes at the start of the image that are not written to the
|output file: WriteOutputFile starts writing at OutputStart + OutputFrom.
OutputFrom = 256

|Address in the output buffer where OutputByte stores the next byte.
PresentOutputOffset:
  .word OutputStart + OutputFrom
|PresentOutputOffset less OutputStart; OutputByte refreshes it after
|every byte.
LocationCounter:
  .word OutputFrom

|Append the byte in al to the output buffer.
|In:  al = byte to store
|Out: PresentOutputOffset moved past the byte, LocationCounter set to
|     the new position relative to OutputStart; di preserved
|CheckOutputOverflow is called once the byte has been stored.
OutputByte:
  push di
  mov  di,PresentOutputOffset
|  call DisplayRegister
|  call DisplayMessage
|  db   ':Outputting ',0dh,0ah,0
  stosb                         |store al through di and step di
  mov  PresentOutputOffset,di
  call CheckOutputOverflow
  sub  di,#OutputStart          |buffer address -> location counter
  mov  LocationCounter,di
  pop  di
  ret

|Output the word in ax as two OutputByte calls, al first and then ah.
|The halves are swapped twice, so ax holds its original value on return.
OutputWord:
  call OutputByte
  xchgb ah,al                   |bring the high byte into al
  call OutputByte
  xchgb ah,al                   |restore ax
  ret

|Check the output pointer against the end of the output buffer.
|In:  di = output buffer pointer
|Returns when di is below OutputEnd (unsigned compare); otherwise calls
|Panic with bx = OutputBigMessage.
CheckOutputOverflow:
  cmp  di,#OutputEnd
  jc   OutputStillOK            |carry set: di < OutputEnd
  mov  bx,#OutputBigMessage
  call Panic
OutputStillOK:
  ret

|Write the assembled image to the output file.
|The bytes from OutputStart + OutputFrom up to PresentOutputOffset are
|written with one DOS write call on OutputFileHandle.
|Calls Panic with OutputFileMessage when DOS reports an error (carry
|set) or when fewer bytes were written than requested. A full disk is
|reported only by a short count in ax, carry stays clear, so the count
|has to be compared as well.
|Destroys: ax, bx, cx, dx
WriteOutputFile:
  mov  cx,PresentOutputOffset
  mov  dx,#OutputStart
  add  dx,#OutputFrom           |dx = first byte that goes to the file
  sub  cx,dx                    |cx = number of bytes to write
  mov  bx,OutputFileHandle
  movb ah,#WriteFunction
  int  #DosInterrupt
  jc   OutputError
  cmp  ax,cx                    |ax = bytes actually written
  jnz  OutputError              |short write, e.g. disk full
  ret
OutputError:
  mov  bx,#OutputFileMessage
  call Panic

                                                                                                                                      
|Fields of the entry in the offent
| are offset and changeval

|Size in bytes of one offset table entry (AddOffEnt stores two words).
size_offent = 4


|Capacities of the tables that follow the code in memory.
MaxNumberOfSymbols = 750
MaxOffsetEntries   = 1000
MaxStringTableSize = 10000
PostFixBufferSize  = 300

|The tables are laid out back to back starting at EndOfCode: each Start
|is the End of the one before it.
SymTabStart = EndOfCode
SymTabEnd = SymTabStart + (MaxNumberOfSymbols * size_syment)

OffsetTableStart = SymTabEnd
OffsetTableEnd = OffsetTableStart + (MaxOffsetEntries * size_offent)

StringTableStart = OffsetTableEnd
StringTableEnd = StringTableStart + MaxStringTableSize

PostFixBufferStart = StringTableEnd
PostFixBufferEnd = PostFixBufferStart + PostFixBufferSize

|The output buffer follows the tables and runs up to OutputEnd, the
|limit that CheckOutputOverflow enforces.
OutputStart = PostFixBufferEnd
OutputEnd = 62535

|File name pointer for the file being assembled; AddSymbol copies it
|into new symbols and RecordXref writes it to the list file.
PresentFileNameOffset:
  .word 0
|Not referenced in this part of the source - presumably the pointers to
|the output and list file names.
OutputFileNameOffset:
  .word 0
ListFileNameOffset:
  .word 0
|Next free entry in the offset table (advanced by AddOffEnt).
EmptyOffent:
  .word OffsetTableStart

|Next free entry in the symbol table: AddSymbol fills the entry this
|points to and then moves it on by size_syment.
LastFilledSymbol:
  .word SymTabStart
|Next free byte in the string table.
StringSpace:
  .word StringTableStart

|This procedure finds the symbol in the symbol table. The symbol is
|assumed to be in the present input word
|The offset of the start of the entry found is in di and si
|
|In:  InputWord = zero terminated name to look for
|Out: carry clear, si = di = start of the matching entry, or
|     carry set when the end of the table is reached (DSymbolNotFound)
|The first word of an entry points to the name, stored as a length byte
|followed by the characters; an entry whose first word is zero ends the
|table.
|Destroys: ax, cx, bp

FindSymbol:
  mov  si,#SymTabStart
FindSym1:
  mov  bp, si                   |bp remembers the entry being tested
  mov  di,#InputWord
  lodsw                         |ax = pointer to the entry's name
  or   ax,ax
  jz   DSymbolNotFound          |zero name pointer: end of table
  mov  si,ax
  lodsb                         |al = length of the stored name
  movb cl,al
  xorb ch,ch                    |cx = length
  rep
  cmpsb                         |compare stored name with InputWord
  jnz  NotThisDSymbol
  cmpb [di],#0                  |InputWord has to end here as well
  jnz  NotThisDSymbol
  mov  si,bp
  mov  di,si
  clc
  ret

|Write one cross reference record to the list file: the word in si,
|then PresentFileNameOffset, then InputLineNumber (three words).
|In:  si = value written as the first word (the lister treats it as a
|          pointer to a symbol table entry)
|ax is preserved across the call.
RecordXref:
  push ax
  mov  ax,si
  call WriteListWord
  mov  ax,PresentFileNameOffset
  call WriteListWord
  mov  ax,InputLineNumber
  call WriteListWord
  pop  ax
  ret

|Continuation of FindSymbol: the entry saved in bp did not match, so
|step to the next entry and test again.
NotThisDSymbol:
  mov  si,bp
  add  si,#size_syment
  jmps FindSym1

|Exit of FindSymbol when the end of the table is reached: carry set.
DSymbolNotFound:
  stc
  ret

|Add a symbol into the symbol table. The identifier to be added will
|be in the InputWord. When it returns, di and si (like in the previous
|function) points to the attributes.
|
|The name is copied into the string table as a length byte followed by
|the characters and the terminating zero. The new entry records the
|present file name and line number as its place of definition.
|The entry after the new one gets a zero name word, the end marker that
|FindSymbol and FixUnknowns stop on. The last slot of the table is kept
|free for that marker: filling it would put the marker at SymTabEnd,
|which is the first word of the offset table.
|
|Calls Panic with SymTabOverflowMessage when the table is full and
|PanicRecover with LargeIdentMessage when the identifier is too long.
|Destroys: ax, cx, dx, bp
AddSymbol:
  mov  di,LastFilledSymbol
  cmp  di,#SymTabEnd - size_syment      |room for the entry and the marker?
  jc   NoSymTabOverflow
  mov  bx,#SymTabOverflowMessage
  call Panic
NoSymTabOverflow:
  mov  bp,di                    |bp = the new entry
  mov  ax,StringSpace
  mov  idname[di], ax           |name goes at the free end of the strings
  mov  di,StringSpace
  mov  dx,di                    |dx = where the length byte belongs
  inc  di
  mov  si,#InputWord
  mov  cx,#MaxIdentifierSize - 2
MoreCharsInIdentifier:
  lodsb
  stosb
  orb  al,al
  jz   EndOfInputWord           |terminating zero copied
  jcxz ReportLargeIdentifier
  loop MoreCharsInIdentifier
ReportLargeIdentifier:
  mov  bx,#LargeIdentMessage
  call PanicRecover
EndOfInputWord:
  call CheckStringTableOverflow
  mov  StringSpace,di
  neg  cx
  add  cx,#MaxIdentifierSize - 2        |cx = number of characters copied
  mov  di,dx
  movb [di],cl                  |fill in the length byte
  mov  di,bp
  mov  ax,PresentFileNameOffset
  mov  DefFileNameOffset[di],ax
  mov  ax,InputLineNumber
  mov  DefLineNumber[di],ax
  mov  size_syment[di],#0       |end marker in the entry that follows
  mov  LastFilledSymbol,di
  add  LastFilledSymbol,#size_syment
  mov  si,di
  ret


|Find a Fake Symbol, if not present,add it
|
|ax has the value of the fake symbol
|
|The name is built in InputWord as '_' followed by what SprintRegister
|writes for ax, and is then looked up with FindSymbol. A symbol that is
|not found is added with the attributes IsFake + Calculated + IsDefined
|+ IsEquate and with ax as its value.
|Out: di, si = the entry, as left by FindSymbol or AddSymbol;
|     ax = the value passed in
FindFakeSymbol:
  mov di,#InputWord
  push ax
  movb al,#'_'
  stosb
  pop  ax
  push ax                       |keep the value across the lookup
  call SprintRegister
  xorb al,al
  stosb                         |terminate the name
  call FindSymbol
  pop  ax
  jnc  FindFakeEnd              |already in the table
  push ax
  call AddSymbol
  mov  Attributes[di],#IsFake + Calculated + IsDefined + IsEquate
  pop  ax
  mov  Value[di],ax
FindFakeEnd:
  ret


|WriteListFile, writes the list file out the format of the list file is
|
|1.Symbol table start   :word   as in the assembler
|2.Symbol table end     :word   points to one byte past the true end.
|3.Symbol table         :2 - 1  bytes of data, raw symbol table
|4.String table start   :word   as in the assembler
|5.String table end     :word   points to one past the end.
|6.String table         :5 - 4  bytes of data, raw string table
|
|A zero word is written ahead of item 1; it ends the cross reference
|records that RecordXref has written before this point.
|The ends written are LastFilledSymbol and StringSpace, so only the
|used part of each table goes to the file.

WriteListFile:
  mov  ax,#0                    |Terminate the xref list in the list file
  call WriteListWord
  mov  ax,#SymTabStart
  call WriteListWord
  mov  ax,LastFilledSymbol
  call WriteListWord
  mov  cx,LastFilledSymbol
  mov  dx,#SymTabStart
  sub  cx,dx                    |cx = bytes of symbol table in use
  call WriteList
  mov  ax,#StringTableStart
  call WriteListWord
  mov  ax,StringSpace
  call WriteListWord
  mov  cx,StringSpace
  mov  dx,#StringTableStart
  sub  cx,dx                    |cx = bytes of string table in use
  call WriteList
  ret

|Error exit for WriteListWord / WriteList: call Panic with bx =
|OutputFileMessage. Nothing follows the call, so Panic is relied on not
|to return here.
ListError:
  mov  bx,#OutputFileMessage
  call Panic

|Scratch word that WriteListWord writes from.
DummyListWord:
  .word 0

|WriteListWord: write the word in ax to the list file.
|WriteList:     write cx bytes starting at dx to the list file.
|Both go to ListError when DOS reports an error (carry set) or writes
|fewer bytes than requested. A full disk is reported only by a short
|count in ax, with carry clear, so the count is compared as well.
|Destroys: ax, bx (WriteListWord also cx, dx)
WriteListWord:
  mov  DummyListWord,ax
  mov  dx,#DummyListWord
  mov  cx,#2
WriteList:
  movb ah,#WriteFunction        |the DOS call WriteOutputFile also uses
  mov  bx,ListFileHandle
  int  #DosInterrupt
  jc   ListError
  cmp  ax,cx                    |ax = bytes actually written
  jnz  ListError                |short write, e.g. disk full
  ret

|Initialise the symbol table
|A zero is stored in the first word of the first entry. FindSymbol and
|FixUnknowns treat a zero first word as the end of the table, so this
|makes the table empty.

InitSymbolTable:
  mov SymTabStart,#0
  ret


|Write ax as four hex digits through di, most significant digit first.
|In:  ax = value, di = destination
|Out: di moved past the four digits; cx preserved. ax has been rotated
|     by 16 bits in all, which brings back its original value.
SprintRegister:
  push cx
  movb ch,#4                    |four digits to go
SprintRegisterMore:
  rol  ax                       |four single rotates bring the next
  rol  ax                       |nibble down into the low four bits
  rol  ax
  rol  ax
  call SprintHexDigit
  decb ch
  jnz  SprintRegisterMore
  pop  cx
  ret

|Store the character for the low four bits of al through di.
|The character is looked up in HexDigitTable (defined outside this part
|of the source - presumably the sixteen hex digit characters).
|In:  al = value (only the low nibble is used), di = destination
|Out: di moved past the character; ax, bx, dx preserved
SprintHexDigit:
  push ax
  push dx
  push bx
  andb al,#15
  mov  bx,#HexDigitTable
  xlat                          |al = HexDigitTable[al]
  stosb
  pop  bx
  pop  dx
  pop  ax
  ret

|Fetch the value of a symbol if it has been calculated.
|In:  ax = symbol table entry
|Out: carry clear and ax = Value of the entry when its Calculated
|     attribute is set; carry set (ax unchanged) otherwise
|     bx = the entry in both cases
FindValue:
  mov  bx,ax
  test Attributes[bx],#Calculated
  jz   FoundNoValue
  mov  ax,Value[bx]
  clc
  ret

FoundNoValue:
  stc
  ret

|Make the place where a symbol was defined the current position:
|InputLineNumber and PresentFileNameOffset are overwritten with the
|DefLineNumber and DefFileNameOffset fields of the entry.
|In:  si = symbol table entry
|Destroys: ax
NoteErrorOnDef:
  mov  ax,DefLineNumber[si]
  mov  InputLineNumber,ax
  mov  ax,DefFileNameOffset[si]
  mov  PresentFileNameOffset,ax
  ret


|Report a symbol that is not defined. Entered by a jump from FixUnknowns
|with si = symbol table entry; leaves by jumping back to FixNextSymbol.
|NoteErrorOnDef first makes the position stored in the entry the current
|one. Then come PutErrorAndPosition (presumably prints that position),
|the name of the symbol, ":" and SymbolNotDefinedMessage.
|The entry is finally flagged IsDefined + Calculated + NeverDefined, so
|later sweeps of FixUnknowns skip it.
UndefinedError:
  mov  bx,[si]                  |bx = name string, length byte first
  inc  bx                       |step over the length byte
  push bx
  mov  ax,Value[si]             |NOTE(review): NoteErrorOnDef overwrites ax before any use
  call NoteErrorOnDef
  call PutErrorAndPosition
  pop  bx
  call DisplayOtherMessage
  call DisplayMessage
  .asciz   ":"
  mov  bx,#SymbolNotDefinedMessage
  call DisplayOtherMessage
  call PutCarriageReturn
  or Attributes[si],#IsDefined + Calculated + NeverDefined
  jmps FixNextSymbol

|Set to 1 when a sweep of FixUnknowns has calculated at least one value.
fixation:
  .byte 0

|Calculate the values of the symbols that have none yet.
|The whole symbol table is swept again and again until a sweep fixes
|nothing new. Within a sweep, for each entry up to the zero end marker:
| - not IsDefined: reported through UndefinedError
| - Calculated:    skipped
| - IsFake:        Value holds a pointer to another entry; its value is
|                  copied once that entry is Calculated
| - otherwise:     Value is handed to EvaluateExpression in si; when it
|                  returns with carry clear, ax is taken as the value
|                  (presumably Value points to a compiled expression)
FixUnknowns:
  mov  si,#SymTabStart
  sub  si,#size_syment          |FixNextSymbol steps on before testing
  movb fixation,#0
FixNextSymbol:
  add  si,#size_syment
  cmp  [si],#0
  jz   EndedFix                 |zero name word: end of table
  test Attributes[si],#IsDefined
  jz   UndefinedError
  test Attributes[si],#Calculated
  jnz  FixNextSymbol
  test Attributes[si],#IsFake
  jnz  JustAnotherSymbol
  push si
  mov  si,Value[si]
  call EvaluateExpression
  pop  si
  jnc  Evaluated
  jmps FixNextSymbol            |not evaluatable yet, try the next sweep

JustAnotherSymbol:
  mov  bx,Value[si]             |bx = the entry this one stands for
  test Attributes[bx],#Calculated
  jz   FixNextSymbol
  mov  ax,Value[bx]
Evaluated:
  movb fixation,#1              |progress was made in this sweep
  or   Attributes[si],#Calculated
  mov  Value[si],ax
  jmps FixNextSymbol
EndedFix:
  cmpb fixation,#1
  jnz  DoneFixes
  jmp  FixUnknowns              |something was fixed: sweep once more
DoneFixes:
  ret

|Record a location in the output that PatchCode has to fill in later.
|In:  ax = first word of the entry (PatchCode reads it back as the
|          symbol table entry whose Value gets patched in)
|     bx = distance from LocationCounter to the location to patch
|Out: EmptyOffent moved on by one entry; di preserved
|Calls Panic with OffsetTableFullMessage once all MaxOffsetEntries
|entries are in use, instead of storing on into the string table that
|follows the offset table.
|Destroys: ax (and bx on the Panic path)
AddOffEnt:
  push di
  mov  di,EmptyOffent
  cmp  di,#OffsetTableEnd
  jc   OffsetTableHasRoom       |carry set: di < OffsetTableEnd
  mov  bx,#OffsetTableFullMessage
  call Panic
OffsetTableFullMessage:
  .asciz "Offset table overflow - too many references to patch"
OffsetTableHasRoom:
  stosw                         |first word: the symbol
  mov  ax,LocationCounter
  add  ax,bx
  stosw                         |second word: where to patch
  mov  EmptyOffent,di
  pop  di
  ret

|Error exit of PatchCode for a byte sized relative patch that does not
|fit: call PanicRecover with bx = JumpErrorMessage.
|NOTE(review): PatchCode follows directly, so this relies on
|PanicRecover not returning here - confirm.
JumpOutOfRange:
  mov  bx,#JumpErrorMessage
  call PanicRecover

|Go through the offset table and store the symbol values into the
|output buffer.
|Each entry holds the symbol table entry (first word) and the location
|to patch relative to OutputStart (second word). The byte already at
|that location selects the kind of patch:
|  bit 1 set   : store the value relative to the end of the patched
|                field, otherwise store the value itself
|  bit 0 set   : patch a word, otherwise patch a byte
|A byte sized relative patch that does not fit goes to JumpOutOfRange.
|Destroys: ax, bx, si, di
PatchCode:
  mov  si,#OffsetTableStart
NextOffent:
  cmp  si,EmptyOffent
  jz   EndOfOffs
  lodsw
  mov  bx,ax                    |bx = symbol table entry
  lodsw
  add  ax,#OutputStart
  mov  di,ax                    |di = address to patch
  mov  ax,[di]                  |al = the kind bits left at the location
  testb al,#2
  jnz  RelativePatch
  testb al,#1
  mov  ax,Value[bx]             |mov leaves the flags of the testb alone
  jz   bytemove
  mov  [di],ax
  jmps NextOffent
EndOfOffs:
  ret
RelativePatch:
  testb al,#1
  jz   bytepatch
  mov  ax,di
  add  ax,#2                    |address just past the word
  sub  ax,#OutputStart
  sub  ax,Value[bx]
  neg  ax                       |ax = Value - (location + 2)
  mov  [di],ax
  jmps NextOffent
bytepatch:
  mov  ax,di
  inc  ax                       |address just past the byte
  sub  ax,#OutputStart
  sub  ax,Value[bx]
  neg  ax                       |ax = Value - (location + 1)
  rolb al                       |the rotate pair leaves al unchanged and
  rorb al                       |carry = bit 7 of al
  adcb ah,#0                    |zero only if ah is the sign extension of al
  jnz  JumpOutOfRange
  movb [di],al
  jmps NextOffent
bytemove:
  movb [di],al
  jmps NextOffent
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               |Lister - print the symbol table of the assembler from the list file.
| Version 1.6
| 1. Accommodated undefined symbols in the symbol table.
| 2. Recognizes the new flag in symtab.i
| 3. Generates 'undef' for such symbols. (still gives a line no and a 
|    file name though) perhaps that should be prevented!
|
| Version 1.5
| 1. Accommodate xref at the beginning of file
| 2. Made functions modular - adjusting functions
| 3. Generate Symbol wise Cross References
| 4. Added commandline options - default is to just display symbol table
|    options include 
|    -x to generate references and counts
|    -z to list only unreferenced symbols
|
|Usage
|       lister [-x | -z] filename
|
|No default extension is assumed, the file, as specified should exist
|
|Character that ends the command line for GetInputFileName.
CR              = 13

|Column widths handed to the padded display routines.
FileNameSize    = 13            |these include one more space
LineNumberSize  =  6
SymbolSize      = 30

.include dos.i
.include symtab.i

|Program entry: open the list file named on the command line, analyze
|it, then fall into ExitToDos.
  call OpenInputFile
  call AnalyzeFile

|Close the input file and terminate. Panic also ends here.
|NOTE(review): Panic can be reached before the file has been opened
|(usage or open error); InputFileHandle is still 0 then, and that is the
|handle that gets closed.
ExitToDos:
  call CloseInputFile
  movb ah,#TerminateFunction
  int  #DosInterrupt

|Get the filename from the command line
|Display Message about file name
|Open the file,
|
|Out: InputFileHandle = the handle DOS returned in ax
|Goes to OpenFileError when DOS returns with carry set.
|The open call uses dx as left by GetInputFileName, so the display
|routines called in between have to preserve dx (they are in display.s,
|not visible here).
OpenInputFile:
  call GetInputFileName
  call DisplayMessage
  .asciz "Analyzing File: '"
  mov  bx,dx
  call DisplayOtherMessage
  call DisplayMessage
  .asciz "'"
  call PutCarriageReturn

|Open the input file for reading
  movb ah,#OpenFileFunction
  movb al,#ReadOnly
  int  #DosInterrupt
  jc   OpenFileError
  mov  InputFileHandle,ax
  ret

|Error exit of OpenInputFile: Panic with FileOpenErrorMessage.
OpenFileError:
  mov  bx,#FileOpenErrorMessage
  call Panic

|The three steps of the analysis, in order: read the tables from the
|list file, adjust the pointers inside them, display the result.
AnalyzeFile:
  call LoadTables
  call AdjustTables
  jmp  DisplayTables            |tail jump: its ret returns to our caller

|Display what was loaded; at present that is the symbol table only.
DisplayTables:
  jmp  DisplaySymbolTable       |tail jump: its ret returns to our caller

|Walk the loaded symbol table from ListSymbolTableStart up to
|ListSymbolTableEnd and list every entry that is not a fake symbol.
|  Option = 0   : entry line only
|  Option = 'x' : entry line, then DisplaySymbolXref
|  Option = 'z' : DisplaySymbolXref only
DisplaySymbolTable:
  mov  si,ListSymbolTableStart
ListNextSymbol:
  cmp  si,ListSymbolTableEnd
  jz   DisplayedSymbolTable
  test Attributes[si],#IsFake
  jnz  AFakeSymbol
  cmpb Option,#'z'
  jz   DontDisplayEntry
  call DisplaySymbolEntry
DontDisplayEntry:
  cmpb Option,#0
  jz  DontDisplayXref
  call DisplaySymbolXref
DontDisplayXref:
AFakeSymbol:
  add  si,#size_syment
  jmps  ListNextSymbol
DisplayedSymbolTable:
  ret

|Display routines for fixed width columns. Each one displays an item and
|then calls PadWithSpaces with the column width in al. PadWithSpaces is
|in display.s, not visible here; presumably it pads the item just
|displayed up to that width.
|
|DisplayPaddedSymbol:     string at bx, width SymbolSize
|DisplayPaddedFileName:   string at bx, width FileNameSize
|DisplayPaddedLineNumber: ax through DisplayAXInDecimal, width
|                         LineNumberSize
DisplayPaddedSymbol:
  movb al,#SymbolSize
  jmps OtherPaddedDisplay
DisplayPaddedFileName:
  movb al,#FileNameSize         |Chars for the filename
OtherPaddedDisplay:
  call DisplayOtherMessage
PaddedDisplayEnd:
  call PadWithSpaces
  ret

DisplayPaddedLineNumber:
  call DisplayAXInDecimal
  movb al,#LineNumberSize       |Chars for the line number
  jmps PaddedDisplayEnd

|Display one line for the symbol table entry at si.
|Columns: name, file of definition, line of definition, kind ("Undef",
|"Label" or "Equate"), then the value through DisplayRegister followed
|by the value through DisplayPaddedLineNumber.
|The second value column relies on ax still holding the value after
|DisplayRegister and DisplayMessage (both in display.s, not visible
|here).
|In:  si = symbol table entry (pointers already adjusted)
DisplaySymbolEntry:
  mov  bx,si
  mov  bx,[bx]                  |bx = name string, length byte first
  inc  bx                       |step over the length byte
  call DisplayPaddedSymbol
  mov  bx,DefFileNameOffset[si]
  call DisplayPaddedFileName
  mov  ax,DefLineNumber[si]
  call DisplayPaddedLineNumber
  test Attributes[si],#NeverDefined
  jz   WasActuallyDefined
  call DisplayMessage
  .asciz "Undef"
  jmps ListTypeDone
WasActuallyDefined:
  test Attributes[si],#IsEquate
  jnz  ListNotLabel
  call DisplayMessage
  .asciz "Label"
  jmps ListTypeDone
ListNotLabel:
  call DisplayMessage
  .asciz "Equate"
  jmps ListTypeDone
ListTypeDone:
  movb al,#7                    |Chars for description
  call PadWithSpaces

  mov  ax,Value[si]
  call DisplayRegister
  call DisplayMessage
  .asciz  " "
  call DisplayPaddedLineNumber
  call PutCarriageReturn
  ret

|Count the cross reference records that point to the symbol at si and,
|depending on Option, display them.
|  Option = 'x' : every matching record is displayed, then the count
|  Option = 'z' : the symbol's entry line is displayed if the count is 0
|In:  si = symbol table entry
|Out: References = number of matching records; si preserved,
|     di = the entry
|Each record is 6 bytes; its first word is the symbol it refers to.
DisplaySymbolXref:
  push si
  mov  di,si                    |di = symbol being looked for
  mov  References,#0
  mov  si,XrefTableStart
CheckNextXref:
  cmp  si,XrefTableEnd
  jz   DisplayedSymbolXrefs
  cmp  [si],di
  jnz  NotThisSymbol
  cmpb Option,#'x'
  jnz  DontDisplayFoundXref
  call DisplayXrefFound
DontDisplayFoundXref:
  inc  References
NotThisSymbol:
  add  si,#6                    |next record
  jmps CheckNextXref
DisplayedSymbolXrefs:
  cmpb Option,#'x'
  jnz  DontDisplayReferenceCount
  call DisplayReferenceCount
DontDisplayReferenceCount:
  pop  si
  cmpb Option,#'z'
  jnz  DontDisplayZeroRef
  call DisplayZeroReferenceSymbol       |On for listing unreferreds
DontDisplayZeroRef:
  ret

|Display the entry line of the symbol at di if References is zero.
|si and di are exchanged around the call because DisplaySymbolEntry
|takes the entry in si; they are exchanged back afterwards.
DisplayZeroReferenceSymbol:
  cmp  References,#0
  jnz  NonZeroReferenceCount
  xchg si,di
  call DisplaySymbolEntry
  xchg si,di
NonZeroReferenceCount:
  ret

|Display the line "  <References> references found", with the count
|going through DisplayPaddedLineNumber.
|Destroys: ax
DisplayReferenceCount:
  call DisplayMessage
  .asciz "  "
  mov  ax,References
  call DisplayPaddedLineNumber
  call DisplayMessage
  .asciz " references found"
  call PutCarriageReturn
  ret

|Display one cross reference record: two spaces, then the file name
|(word at offset 2 of the record) and the line number (word at offset
|4).
|In:  si = cross reference record
DisplayXrefFound:
  call DisplayMessage
  .asciz "  "
  mov  bx,2[si]
  call DisplayPaddedFileName
  mov  ax,4[si]
  call DisplayPaddedLineNumber
  call PutCarriageReturn
  ret

|Read the three parts of the list file in the order they are stored:
|cross references, symbol table, string table.
LoadTables:
  call LoadXrefTable
  call LoadSymbolTable
  jmp  LoadStringTable          |tail jump: its ret returns to our caller

|Read the cross reference records from the list file into memory
|starting at EndOfCode.
|A record is three words. The first word of each record is read and
|tested by itself: a zero word ends the list. That zero word is read
|into memory, but XrefTableEnd is set to its address, so it lies just
|outside the table.
|Out: XrefTableStart, XrefTableEnd; dx = XrefTableEnd
|Relies on ReadWord leaving dx unchanged.
LoadXrefTable:
  mov  dx,#EndOfCode
  mov  XrefTableStart,dx
MoreXrefEntries:
  call ReadWord
  mov  si,dx
  lodsw                         |ax = the word just read
  or   ax,ax
  jz   EndOfXrefTable
  inc  dx
  inc  dx
  call ReadWord
  inc  dx
  inc  dx
  call ReadWord
  inc  dx
  inc  dx
  jmps MoreXrefEntries
EndOfXrefTable:
  mov  XrefTableEnd,dx
  ret

|Read the symbol table from the list file to the address in
|XrefTableEnd.
|The two words in front of the table are read into AsmSymbolTableStart
|and AsmSymbolTableEnd; their difference is the number of bytes read.
|Out: ListSymbolTableStart, ListSymbolTableEnd
|Goes to CorruptListFile when the end is below the start or when the
|table would run past the top of the 64K segment.
|Relies on Read leaving cx unchanged.
LoadSymbolTable:
  mov  dx,#AsmSymbolTableStart
  call ReadWord
  mov  dx,#AsmSymbolTableEnd
  call ReadWord
  mov  dx,XrefTableEnd
  mov  ListSymbolTableStart,dx
  mov  cx,AsmSymbolTableEnd
  sub  cx,AsmSymbolTableStart   |cx = size of the table in bytes
  jc   CorruptListFile
  call Read
  add  cx,ListSymbolTableStart
  jc   CorruptListFile
  mov  ListSymbolTableEnd,cx
  ret

|Read the string table from the list file to the address in
|ListSymbolTableEnd.
|The two words in front of the table are read into AsmStringTableStart
|and AsmStringTableEnd; their difference is the number of bytes read.
|Out: ListStringTableStart, ListStringTableEnd
|Goes to CorruptListFile when the end is below the start or when the
|table would run past the top of the 64K segment.
|Relies on Read leaving cx unchanged.
LoadStringTable:
  mov  dx,#AsmStringTableStart
  call ReadWord
  mov  dx,#AsmStringTableEnd
  call ReadWord
  mov  dx,ListSymbolTableEnd
  mov  ListStringTableStart,dx
  mov  cx,AsmStringTableEnd
  sub  cx,AsmStringTableStart   |cx = size of the table in bytes
  jc   CorruptListFile
  call Read
  add  cx,ListStringTableStart
  jc   CorruptListFile
  mov  ListStringTableEnd,cx
  ret

|Common error exit for the load and adjust routines: Panic with
|CorruptListFileMessage.
CorruptListFile:
  mov  bx,#CorruptListFileMessage
  call Panic

|Adjust the pointers stored in the loaded tables: those in the symbol
|table first, then those in the cross reference table.
AdjustTables:
  call AdjustSymbolTable
  jmp  AdjustXrefTable          |tail jump: its ret returns to our caller

|Adjust the two string table pointers in every symbol table entry
|(idname and DefFileNameOffset) by the distance between where the string
|table was in the assembler and where it was loaded here.
|AdjustName goes to CorruptListFile for a pointer that ends up outside
|the loaded string table.
|Destroys: ax, bx, cx, si
AdjustSymbolTable:
  mov  si,ListSymbolTableStart
  mov  cx,ListStringTableStart          |find the diff in the positions
  sub  cx,AsmStringTableStart
MoreSymbolEntriesToAdjust:
  cmp  si,ListSymbolTableEnd
  jz   AdjustedSymbolTable
  mov  bx,#idname
  call AdjustName
  mov  bx,#DefFileNameOffset
  call AdjustName
  add  si,#size_syment
  jmps MoreSymbolEntriesToAdjust
AdjustedSymbolTable:
  ret

|Adjust the pointers in every cross reference record: the symbol pointer
|at offset 0 through AdjustSymbol and the file name pointer at offset 2
|through AdjustName.
|dx is set to the symbol table difference here. cx has to hold the
|string table difference on entry; in AdjustTables it is what
|AdjustSymbolTable left there.
|Destroys: ax, bx, dx, si
AdjustXrefTable:
  mov  si,XrefTableStart
  mov  dx,ListSymbolTableStart
  sub  dx,AsmSymbolTableStart
MoreXrefEntriesToAdjust:
  cmp  si,XrefTableEnd
  jz   AdjustedXrefTable
  mov  bx,#0
  call AdjustSymbol
  mov  bx,#2
  call AdjustName
  add  si,#6                    |next record
  jmps MoreXrefEntriesToAdjust
AdjustedXrefTable:
  ret

|si pointer to symbol table entry
|cx difference between the string tables
|bx offset, within the entry, of the pointer to adjust
|
|The pointer has cx added to it and is stored back. A result outside
|ListStringTableStart <= pointer < ListStringTableEnd goes to
|CorruptListFile.
|Destroys: ax
  

AdjustName:
  mov  ax,[bx_si]
  add  ax,cx
  cmp  ax,ListStringTableStart
  jc   CorruptListFile
  cmp  ax,ListStringTableEnd
  jnc  CorruptListFile
  mov  [bx_si],ax
  ret

|Adjust one pointer to a symbol table entry.
|In:  si = start of the record, bx = offset of the pointer within it
|     dx = difference between the symbol table positions (loaded here
|          less the assembler's)
|The adjusted pointer has to lie inside the loaded symbol table
|(ListSymbolTableStart <= pointer < ListSymbolTableEnd); anything else
|goes to CorruptListFile.
|Destroys: ax
AdjustSymbol:
  mov  ax,[bx_si]
  add  ax,dx
  cmp  ax,ListSymbolTableStart
  jc   CorruptListFile
  cmp  ax,ListSymbolTableEnd    |the symbol table's own end
  jnc  CorruptListFile
  mov  [bx_si],ax
  ret

|ReadWord: read one word from the input file to the address in dx.
|Read:     read cx bytes from the input file to the address in dx.
|In:  dx = destination, cx = byte count (Read only)
|The callers go on using cx and dx after the call, i.e. the DOS call is
|relied on to leave both alone.
|Goes to ReadError when DOS reports an error (carry set) or returns
|fewer bytes than asked for. A short count means the file ended early;
|without this check LoadXrefTable would go on testing whatever is in
|memory for its terminating zero word.
|Destroys: ax, bx
ReadWord:
  mov  cx,#2
Read:
  mov  bx,InputFileHandle
  movb ah,#ReadFunction
  int  #DosInterrupt
  jc   ReadError
  cmp  ax,cx                    |ax = bytes actually read
  jnz  ReadError                |file is shorter than it claims
  ret

|Error exit of Read: Panic with ReadErrorMessage.
ReadError:
  mov  bx,#ReadErrorMessage
  call Panic

|Hand InputFileHandle to the DOS close call. Whatever DOS returns is
|ignored.
|Destroys: ax, bx
CloseInputFile:
  movb ah,#CloseFileFunction
  mov  bx,InputFileHandle
  int  #DosInterrupt
  ret

|Find the name of the file from the command line and terminate it
|with a nul character. The offset of the string is returned in dx!
|If there is no file name then display usage message.
|
|Destroys:
| si : to make it point to the command line
| dx : for return
| al : for temp storage.
|
|Spaces and tabs before the name are skipped. A '-' found there goes to
|OptionFound, which comes back to SkipCommandLineSpaces after storing
|the option. The name ends at the first space, tab or CR; that character
|is overwritten with the nul.

GetInputFileName:
  mov  si,#CommandLineStart 
SkipCommandLineSpaces:
  lodsb
  cmpb al,#' '
  jz   SkipCommandLineSpaces
  cmpb al,#9
  jz   SkipCommandLineSpaces
  cmpb al,#CR
  jz   Usage                    |command line ended without a name
  cmpb al,#'-'
  jz   OptionFound
  mov  dx,si
  dec  dx                       |lodsb went one past the first character
MoreCharsInFileName:
  lodsb
  cmpb al,#' '
  jz   EndOfFileNameFound
  cmpb al,#9
  jz   EndOfFileNameFound
  cmpb al,#CR
  jnz  MoreCharsInFileName
EndOfFileNameFound:
  dec  si                       |back to the character that ended the name
  movb [si],#0
  ret

|Part of GetInputFileName, entered after a '-' on the command line.
|The character after the '-' has to be 'x' or 'z'; it is stored in
|Option and the scan of the command line goes on. Anything else goes to
|InvalidOption.
OptionFound:
  lodsb
  cmpb al,#'x'
  jz   ValidOption
  cmpb al,#'z'
  jnz  InvalidOption
ValidOption:
  movb Option,al
  jmp  SkipCommandLineSpaces

|Error exit for an unknown option: Panic with InvalidOptionMessage.
InvalidOption:
  mov  bx,#InvalidOptionMessage
  call Panic

|No filename on command line - display usage message and quit
|(Panic with UsageMessage)

Usage:
  mov  bx,#UsageMessage
  call Panic

|Display the message at bx followed by a carriage return, then leave
|the program through ExitToDos. Does not return to the caller.
|In:  bx = message, passed on to DisplayOtherMessage
Panic:
  call DisplayOtherMessage
  call PutCarriageReturn
  jmp  ExitToDos
  

|Messages for the lister
|Zero terminated strings; each one is loaded into bx ahead of a call to
|Panic.

UsageMessage:
  .asciz  "Usage: lister [-xz] <filename>"
FileOpenErrorMessage:
  .asciz  "Error opening input file for reading"
ReadErrorMessage:
  .asciz  "Error reading input file"
CorruptListFileMessage:
  .asciz  "The List file is corrupt! Can't analyze"
InvalidOptionMessage:
  .asciz  "The valid options are only x or z"
|Data
|Handle returned by DOS when OpenInputFile opened the list file.
InputFileHandle:
  .word 0
|Table bounds as read from the list file, i.e. the addresses the tables
|had inside the assembler.
AsmSymbolTableStart:
  .word 0
AsmSymbolTableEnd:
  .word 0
AsmStringTableStart:
  .word 0
AsmStringTableEnd:
  .word 0
|Bounds of the same tables where the lister has loaded them.
ListSymbolTableStart:
  .word 0
ListSymbolTableEnd:
  .word 0
ListStringTableStart:
  .word 0
ListStringTableEnd:
  .word 0
|Bounds of the loaded cross reference records (set by LoadXrefTable).
XrefTableStart:
  .word 0
XrefTableEnd:
  .word 0
|Number of records found by DisplaySymbolXref for the current symbol.
References:
  .word 0
|Command line option: 0 for none, otherwise 'x' or 'z'.
Option:
  .byte 0
.include display.s

EndOfCode:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          		A ONE-PASS ASSEMBLER FOR DOS/MINIX
		------------------------------

			1 ABSTRACT

The assembler that comes with the MINIX operating system is heavily
used. At present it is extremely slow and requires a large amount of
disk space for its temporary file. I wanted to develop a fast
assembler that could do without much extra space.

This project also tries to find good data structures and methods of
coding a fast one pass assembler(`asm') with the restrictions on a
MINIX system.


			2 INTRODUCTION

MINIX is a UNIX clone that was designed to work on the IBM PC range of
machines. It is a very popular operating system among students because
of its low requirements from the hardware (a normal PC with 540KB
memory and 2 360KB floppy drives does quite well), for its proximity
to the UNIX operating system (it is almost like System V) and most
importantly, the sources for the operating system and most of the
tools are included in the distribution.

The sources of the C compiler (`cc') and the assembler and loader
(`asld') are not included in the distribution.


			3 THE MINIX ASSEMBLER

The assembler plays an important role in the MINIX operating
system. There are no object files in the system.  The C compiler
generates code in the assembly language and the assembly language is
directly converted into an executable by `asld'. The C compiler has no
other form of output. (`cc', with or without the -S or the -c option
generates a `.s' assembly file only). The assembler takes the `.s'
assembly files and generates an executable.

The specifications of the assembler [1;384-385] are also adhered to by
this assembler. There are slight differences, which are mentioned in a
later chapter.

The assembler also accepts something called as packed assembly
language. Packed assembly language is generated from normal assembly
language by the utility called as the `libpack' and the reverse is
performed by the `libupack' utility [1;393]. What `libpack' does is
recognize commonly occurring strings and replace them with codes 128
to 255.  e.g. `push ax' is 128 and so on. Packing strips an assembly
file of its comments.[1;412]

Whether the input is packed or not does not matter to the
assembler. If there is a byte on the input which is greater than 127
then the assembler unpacks it. `asm' does not do unpacking now.

The MINIX `asld' also searches libraries for the files to be linked to
the current file and links them. The code for multiple files is
already there in `asm', but the code for searching the libraries
isn't.

		4 DEVELOPMENT OF THE ASSEMBLER

The assembler on the MINIX is extremely slow, and even with no
libraries to link to, it takes ages to assemble a program. Therefore I
decided to do the work on DOS using the Microsoft assembler
`MASM'. The debugging facilities on DOS are great compared to MINIX. I
used only `debug' on the generated files. With the options available
in MASM, (such as selective listing) debugging of the `.COM' file,
which doesn't have any symbolic information, even on a PC was easy.

The assembler is written in such a way that it should be able to
assemble itself under DOS to create a `.COM' file or under MINIX to
create an executable file directly.

The testing of the assembler was done on DOS, and then the source was
put onto MINIX. Since the input syntax of the MINIX assembler is not
exactly compatible with the DOS syntax, I've made some slight
adjustments to the assembler so that it accepts a language, which can
be assembled on DOS and which with very little modifications can be
assembled under MINIX.

		5 OPERATING SYSTEM CONSTRAINTS

The current assembler on the MINIX system does not generate separate
Instruction and Data space programs. The operating system supports two
models. The `small' model (The equivalent of the `tiny' model on the
MASM) in which the Instruction and the Data space (including stack and
all) together should occupy less than 64K and a separate I & D space
model (the equivalent of the MASM `small' model) where 64K is allowed
for the instruction and 64K for Data and Stack. Other models are not
supported, to encourage programmers to use small modular
programs. Separate I and D programs have to be generated by the PCIX
assembler.

As I intended to assemble the assembler using the host assembler
itself, it meant that I had only 64K for the data and instruction
space. This means that the code and the data space of the assembler
should be as little as possible, so that larger programs can be
assembled.

Another restriction is that disk space is precious on the MINIX
because of the platforms that it operates on. The smaller the assembler
executable, the better.

		6 OPERATING SYSTEM INCOMPATIBILITIES

The output executable file formats for DOS and MINIX are quite
different.

On DOS for the `.COM' file, there is no header. The memory image of
the program is stored on the disk. The first 100H or 256 bytes of the
program are not stored. The program starts at location 100H. No
relocation information can be stored in the file. The same is the case
with the debug information.

This is managed by the assembler, with the help of a few assemble time
variables. The variable `OutputFrom' decides what part at the
beginning should be skipped. On DOS, this is set to 256 so that the
first 256 bytes are skipped. Also the location counter automatically
starts at 256 (100H) without any directive.

On MINIX, the executable which is generated has a 32 byte header,
which is generated by the assembler. Execution starts at location 0.

			7 DESIGN ISSUES


One of the primary aims was to keep the assembler small and fast. The
best way to do this is to write the assembler in assembly language.

Since the amount of memory space is restricted, (64K for the
instruction and the data), I had to decide between creating a
temporary file or to restrict the output file size to 64K less the
space taken up by the assembler's instruction and data. I chose the
second option i.e.  restricting the output file size.

This means that at present the maximum size of the output file is
around 43K.

			8 THE INPUT LANGUAGE

The language accepted by asm is similar to the language accepted by
the PC-IX assembler. The symbols are made up of letters, digits and
underscores. (though [1] does not mention it, the period is also
allowed in an identifier, also as the starting character). The modes
using multiple registers are coded as in this example.

mov ax,34[bx+si]

The `bx+si' is treated as a single identifier and cannot have spaces
between it.  Constant operands have to be preceded by the number sign
`#'. Local variables are not permitted.

The pseudo instructions or the assembler directives are a superset of
the MINIX directives and they are as follows.

	.align	n	Align to a multiple of n bytes
	.ascii	str	Assemble a string
	.asciz	str	Assemble a zero terminated string
	.bss		What follows goes to the bss segment
	.byte	n	Assemble one or more bytes
	.data		What follows goes to the data segment
	.define	sym	Export the symbol from the file
	.errnz	n	Force Error if n is nonzero
	.even		Align to an even address
	.extern	sym	Declare sym external
	.globl	sym	same as .extern
	.include file	include the file into the current file
	.long	n	assemble a long
	.org	addr	Set Address within the current segment
	.short	n	assemble n as a short
	.space	n	skip n bytes
	.text		what follows goes into the text segment
	.word	n	assemble n as a word
	.zerow	n	assemble n words of zeros


The segment directives don't make much sense here as everything goes
into the same segment anyway. They are just ignored by `asm'.

The new directive that has been added is the include directive. It
includes another file at the current position into the source
file. After the include file finishes, processing is continued with
the main file, from where it was left off. There is no restriction on
the number of times includes can be nested. The filename should not be
enclosed in quotes and the full filename with path (relative or
absolute) and extension should be given.


	9 DIFFERENCES BETWEEN THE MINIX ASSEMBLER AND ASM

There are some minor differences between the language that `asld'
assembles and the language that `asm' assembles.  Some of the
differences are detailed below.

The addressing mode in an instruction is enclosed in square brackets
and not in parentheses. e.g. [bx].

The addressing mode, when it consists of multiple registers, has the
`+' sign separating the register names instead of the `_'
sign. e.g. mov ax,[bx+si]. This was done because the DOS assemblers
also accept this syntax and this makes the porting to minix simpler.

`asm' does not allow local labels as the documentation about their
behaviour is not very clear.

The documentation is also not very clear about when the decimal point
`.' is allowed in labels and when it isn't.  Looking at the sources of
`klib88.s' [1;456-470], it seems that the `.' is allowed in symbols
wherever alphabets are allowed.

One major difference is not in the specifications but in the behaviour
of the assembler. If you give an instruction like `add al,bl' (when
actually the instruction should have been `addb al,bl'), the assembler
silently accepts it and generates the instruction `add ax,bx' which is
wrong. `asm' reports errors whenever the operand sizes do not match
the mnemonics.  


			   10 STRING TABLE

The string table, in spite of its name doesn't mean a table. It means
string space. Keeping any string fixed size meant that there would be
a wastage of space.

e.g. The maximum size of the path is say 64 bytes. If you want to keep
an array of filenames that have been encountered, then the amount of
space that would have been wasted is tremendous.

The maximum size of the symbols is 32 bytes and again if these names
were kept in the symbol table, then the size of the symbol table would
have been tremendous.

Therefore all strings are kept in the string table. It works more or
less like a heap. There are no deletions from the string table, only
additions. Therefore, there is no complex management done. The only
thing kept track of is the position upto which the string table is
full.

All the references to the strings in the string table are made through
absolute pointers (not indices relative to the start of the string
table or the like).

At present the things stored in the string table are the names of the
files being assembled, the names of the symbols (assembler generated
as well as user specified) and the compiled expressions.

			    11 EXPRESSIONS

In assembly you keep encountering assemble time expressions.  These
range from constant equates to complex operands.  Expressions can
occur at the right hand side of an equate or in operands, either as an
immediate operand or as the displacement in an addressing mode.

Expressions are normal infix expressions with only a few
operators. The operators are `+', `-', `*', `/' and `%' with the same
meanings that they have in C. Expressions can be parenthesized to any
level. The operands are symbols or constants. All values, whether
symbols or constants, are treated as 16 bit integer values. The number of
operators can be very easily changed as mentioned in `Improvements'.

The expressions can be arbitrarily complex and may contain symbols
that are defined, symbols that are not defined and
constants. Expressions may not be evaluatable at the time that they
are encountered. Therefore the evaluation of the expressions may have
to be delayed. All the symbols will be defined after the first
pass. `asm' is a one pass assembler so it doesn't have a second pass
to evaluate the expressions.

The expressions that are evaluatable when they are encountered are
evaluated. The expressions that are not evaluatable are stored as
compiled expressions in the string space.

The compiled expression is actually a postfix form of the expression,
in which the operands precede the operators.  The operators are stored
as the ascii value of the character. One word is used for the operands
and one word is used for the operators. Since there is only one word
for the operands, it is not possible to specify whether the operand is
an index into the symbol table or it is a constant that was input.

The solution is that all operands are considered as indexes into the
symbol table. For constants fake entries are created, which is
actually an underscore `_' followed by the value of the symbol in hex
e.g. the fake symbol with the value 10 has the symbol `_000A'. When
another constant of the same value is used, the same entry is used and
new entries are not created.

The procedure of converting an infix parenthesized expression to a
postfix expression is given in most books.  The rules are: all operands
on the input are put onto the output directly. When an operator comes
on the input, all the operators that are on the stack with precedence
more than (or equal to, for normal left associative operators) are
popped into the output and the input operator is pushed onto the
stack. If the end of the expression is encountered, then the operators
on the stack are popped onto the output.

The structure of the compiled expression is as follows. The first word
has the lower byte 0 the higher byte (as in the operators described
later) specifies the number of the operands that follow this
word. This is because, just looking at the word, it is impossible for
the expression evaluator to know whether the word is an operator or an
operand. All operands, as mentioned before are indexes into the symbol
table. Again indexes are actually not indexes of the symbol table
array, but pointers to the base of the entry in the symbol table.

After the specified number of operands have been read, there will be
another operator word. The operator word has the low byte as the ascii
value of the operator and the high byte as the number of operands that
follow this operator. (this might be zero, if the operands come right
one after the other). The sequence continues till you reach an
operator word that is zero, which means that the expression ended. The
expression always ends with a zero word.

The evaluation of the expression is simple. All the operands, whenever
they are encountered are pushed onto the stack and whenever an
operator is encountered the operands are popped from the stack and the
result is pushed onto the stack.

For operands in this case, the operands are actually pointers into the
symbol table, so for each operand encountered during evaluation, the
evaluator looks up the value of the operand from the symbol table. If
the attributes of the symbols say that the symbol hasn't been
evaluated, then the evaluation is abandoned and a flag is returned to
signify that the expression could not be evaluated.


			12 SYMBOL TABLE

Considering all the space restrictions that were mentioned before, one
of the main issues while designing the symbol table was the space
required.

The name of the symbol is not kept in the symbol table, instead the
pointer to the start of the name in the string space is kept.

Since the assembler is written in the `tiny' model, the code and the
data space is 64K. That means that 16 bits are enough for the pointers
into the string space. These pointers are absolute pointers and not
relative to the start of the string table (or anything else).

The other values that are stored in the symbol table are the offset of
the filename and the line number of the definition of the symbol (see
`string table').  Up to the point when the definition of the symbol is
encountered these actually hold the line number and the filename of
the first reference to the symbol. This is done for reporting errors
about undefined symbols. When the symbols are listed, the line number
and the filename of the definition are picked up from here.

A symbol can be defined by one of only two ways. The first way is as
the left hand side of an equate and the second way is as a
label. `asm' keeps track of whether a symbol is an equate or a label
for listing purposes, but otherwise the usage for both of them is the
same.

There is a word reserved in the symbol table for the attributes of the
symbol. The various attributes are stored as bits. The attributes are
`calculated, equate, fake, label, defined'. The bits are on if that
attribute is yes.  For labels, they are always defined and
calculated. Equates and other expressions may or may not be
evaluated. The calculated bit in the attributes is for that.

Fake symbols are generated for constants in expressions and also for
nameless expressions. When there are expressions for operands and the
expression can not be evaluated, then an entry is made in the symbol
table for this expression with a fake symbol. These (like the fakes
discussed before) have two underscores `__' followed by a number in
hexadecimal.

If an expression can not be evaluated, then the compiled expression is
put into the string space and the value field of the symbol table
entry for that symbol points to the compiled expression. If the
expression can be evaluated, then all the fakes that were added for
this expression are also deleted from the symbol table as their use is
over.

An exception is when the expression is just the name of another symbol
that hasn't been evaluated. This is called a simple expression. In
this case, the compiled expression is not kept, but another bit in the
attributes (Fake bit) is set and the symbol entry offset of the target
symbol is put in the value field. This saves string space.

All expressions that could not be evaluated are re-evaluated at the end
of the assembly. If they are not evaluatable at that time, it is
because an undefined symbol was encountered and so an error is
declared.

The contents of the symbol table are printed out at the end of the
assembly.

The symbol tables are searched linearly. i.e. for every symbol the
search starts at the beginning of the symbol table and goes on down
the symbol table till the symbol is found. For a failure the whole
symbol table is searched.


			13 PREDEFINED SYMBOLS

The predefined symbols are the symbols that are defined in the
assembler itself. The predefined symbols are in groups.  These groups
are - the assembler directives or the pseudo instructions, the
instructions, the addressing modes, the 16-bit registers, the 8-bit
registers and the segment registers.

The organization of the predefined symbol tables is simple.  There are
many symbol tables and a generic symbol table driver. Each of the
tables is organized as follows.

The first byte in the symbol table is the number of bytes of
attributes (and other information) that have to be skipped after every
symbol. Then follows the list of symbols (with attributes and
all). After the last symbol there is a zero length symbol indicating
that the symbol table is over.

Each symbol in the symbol table consists of one byte that indicates
the length of the symbol. Then comes the string, that is the symbol
itself and then come the attributes. The attributes could be anything
from register numbers (for the register symbol tables) to names of
procedures to be called (for the directives) to the opcodes for the
instructions.

The symbol tables may be lexically ordered, but no use is made of this
as the searches through these symbol tables is always done
linearly. The number of options at the beginning of the instruction
are large compared to the high level languages. The number of
instructions are 136 compared to the few dozen that are there in the
higher level languages.


		13 INSTRUCTIONS SYMBOL TABLE

This was the most time-consuming table to build. During the design I
had to decide about how many bytes of attributes should be put in the
symbol table and how they should be processed. After analysing the
instruction set (for which the appendices at the end of [2] were very
useful), I decided to have two bytes of information.

There are routines written for each set of instructions, whose
encodings are similar. e.g. for add, sub and the other mathematical
and logical instructions except `test' the type of instructions that
can be generated and their encoding is the same except for a few
bits. These few bits are there in the attributes for the
instruction. All of these are directed to the same subroutine.

For each subroutine, the sets of bits mean different things. The aim
was to reduce the number of subroutines that have to be coded to
reduce the code size. This has led to quite a lot of bit twiddling in
the support routines.

One of the vaguest things in the 8086 is the `test' and the `xchg'
instructions. The `test' instruction, though a logical instruction has
a totally different encoding scheme. Therefore there is an exception
made in the add subroutine to accommodate the test and the xchg
instructions.


			14 BACKPATCHING CODE

When expressions are not evaluatable, then there should be a record
that when the expression is evaluated the result of the expression
should be put in such and such a location.

Therefore there is an `offent' table, which keeps track of what
symbols' values are to be put in what location. The form of the table
is a.symbol table entry offset and b.patch location.

All symbols are evaluated as words and the symbols can be used as
bytes as well. This means that the size of the patch is not a property
of the symbol but a property of the location at which the patch is
done.

Therefore the size of the symbol is not stored in the symbol
table. Each patch will be at least one byte long. The least significant
bit in the patch location specifies whether the patch is a byte patch
(0) or a word patch (1).

Some of the most common expressions will be relative expressions. The
value of a relative patch is got as the patch location - the target
value location. If for all forward jumps or calls, these expressions
are put on the string space (see the `expressions' chapter) then a lot
of string space would be wasted.

This property of being relative or absolute is also a property of the
patch and not the symbol. Therefore the second least significant bit
in the patch location is kept for indicating whether the patch is
relative or absolute.  Relative patches can also be bytes or words.

Patching is done only after all the symbols have been evaluated
successfully. For each entry in the `offent' table, the value of the
symbol pointed to is taken and is put at that location. If it is a
byte patch only one byte is put, else a word is put. If it is a
relative patch then the value of the symbol is subtracted from the end
of the relative patch location (for jumps the PC is actually pointing
to the instruction after the jump) before the patch is done.

For byte patches if the value to be patched exceeds one byte then an
error is declared.


			15 IMPLEMENTATION

The assembler is implemented as separate files, which do not form
separate compilation modules. There is one main file - `asm.asm' and
the rest of the files are included in this file.

Here is a list of the files that form the assembler and a short
description of the functionality of the code in each file.

  ASM.ASM : This is the main assembler file. It contains code which
opens the input and the output files. It also contains code to analyze
the command line and form the input and the output filenames. It also
contains the main loop of the assembler that scans each line and
decides what to do with it.


  SYMBOLS.ASM : In this file, there is a small procedure in the
beginning that searches through the predefined symbol tables. Then
follow the tables themselves. The structure of the tables has been
described before. The table for the directives is simple. There is only
one word which is the offset of the routine to be called. The tables
for the registers and the addressing modes are also simple with only
one byte of attributes (the number of the register or the addressing
mode). The tables for the instructions are a little complex and are
described in one of the previous chapters.


  SYMTAB.ASM : This file has the code to manipulate the symbol
table. There are two main procedures that manage the symbol table and
they are `AddSymbol' and `FindSymbol'.  There is also a procedure to
find the value of a symbol and for creating `offent' entries (see one
of the previous chapters). It also has code to keep making passes over
the symbol table till all the symbols are evaluated and also to patch
the code.

  MESSAGE.ASM : This file contains the procedures that form the
message displaying mechanism of the assembler. All the messages are
also stored in this file and so also some general purpose displaying
routines. Some of the globals are also declared in this file.

  SUPPORT.ASM : This file contains the procedures that are called for
the instructions in the symbol table. These are more or less self
explanatory and most of the routines do mundane, similar things.

  EQU.ASM : This file contains the routines for processing the
definition of a symbol. The symbols can be defined either as a label
or as an equate. The labels are simple to process, but the equates -
considering that the expressions on the right hand sides might not be
evaluatable, are difficult to handle. At the end of this file are
procedures that also get operands for instructions and expressions and
return appropriate values.

  INPUT.ASM : This file contains the procedures to read the input file
(with buffering) and process it. It allows for ungetting up to one
character. It also provides routines to get a token from the input - a
number, an identifier or any other character. There are procedures to
classify characters as numeric or alphabetic or alphanumeric etc.

  EXPR.ASM : This file contains the procedures to compile an expression
and also to evaluate it. The procedure is described in a previous
chapter `expressions'.

  OUTPUT.ASM : This file contains the routines for outputting a byte
or a word to the output file and for also writing the output file.

  DIRECT.ASM : This file contains the routines that process the
assembler directives. These are more or less trivial.

			16 CONCLUSIONS

The assembler size on disk was 7365 bytes. The speed of assembly was
faster than `tasm' the turbo assembler on DOS, which in turn (being
one-pass) is a lot faster than `masm' the Microsoft Assembler. The
minix assembler was way below in comparison.

The assembler was tested by converting two of the modules in the
assembler to the MINIX input format and assembling them. The result
was checked using DOS's `DEBUG'.

The data structures seemed to be quite effective for the purpose. The
idea of compiled assemble time expressions and their partial
evaluation could be very useful for constant folding in higher level
languages.

			17 IMPROVEMENTS

Since the number of symbols that can be stored in the symbol table is
restricted, the number of labels in a big program can exceed this
limit - especially if the assembler program is generated (by a C
compiler say). Therefore there should be some way to discard useless
values.

I propose that two new directives should be added - `.mark' and
`.release'. All the symbols that are added to the symbol table after
the last mark are released when the release is encountered. Note that
it is not the symbols that are `defined' after the mark, but the
symbols that are added to the symbol table after the mark. This means
that symbols that weren't defined at the mark (they have been added to
the symbol table because they were referenced) but are there in the
symbol table, and which get defined between the mark and the release
do not get removed at the release.

A common usage would be in a C file. After defining the globals we can
put a mark. Since the globals have been referenced (in the `.globl'
statement), they would have been added to the symbol table.  At the
end of the file you can put a release. All the symbols local to that
file will be removed from the symbol table.

The operators that are usable now should be increased. The addition of
new operators is simple. I suggest the operators
'>','<','!','~','^','&', and '|', with the same meanings as in C. The
operators are restricted to one character only. If left shift and
right shift are essential then the '}' and '{' could be used.

Error reporting is fine, but now the assembler stops at the first
error. Error recovery can be easily implemented as the
resynchronization with the input text is easy in an assembler. The
new-line is considered as the resynchronization point in the
assembler. And as such the recovery actions that can be done are - the
symbols added during this statement can be deleted, the string space
used up can be recovered. Anyway, the output should not be written
onto the output file.

At least for the predefined symbols, there could be a better way of
organising the symbol tables so that a binary search would be possible
or a hashing of some sort. If this is done for the user defined
symbols as well then there would be a big improvement in performance.

In the routines that exist for the instructions, there are many
redundancies in the code and lots of code duplication.  With some
effort the size of this code could be reduced by a fourth.

Generation of cross-reference and other listings. With a few changes
to the symbol table and the patching routines, it will be easy to
write the information required for the cross reference listings onto a
file. This need not be in a readable format. We could have a post
processor that can go through this file and generate a readable output.

The format of the intermediate file (with the extension `.xrf') would
be tuples which have the following information in binary a.File name
offset, b.line number, c.Symbol name offset and
d.Definition/reference.

There would also be another file generated, which would be a listing
of all the symbols that were found. This would contain tuples of the
form a.Offset and b.Name. The Name would be null terminated. Symbol
names and file names would be recorded. The offset in this case refers
to the offset in the string table.

In addition to the name of the assembly file and the line number
information that we put in the symbol table for the definition, we
could also put the C file name and the line number in the C
source. This can be done by adding another assembler directive -
(`.line' and `.file' seem to be fine or possibly only `.line'). The
cross reference procedure can also be enhanced so that these file and
line numbers also occur in the xref file.

More testing is required to find out whether all the data in the
symbol table is accurate or not. During the little testing that I had
done, I found one mistake in the symbol table (sub always got
translated to subb). The possibility of more errors in the symbols
cannot be ruled out.


			       18 USAGE

The source is distributed as one .COM file and a bunch of .s
(assembly) and .i (include) files.  It takes one filename as an
argument (with an optional .s suffix).  There are no other arguments.

To rebuild the assembler:

  C:> asm asm

This produces 2 files. asm.com (which is overwritten) and asm.lst 
which contains the symbol references.   

The .lst is a binary file.  See symtab.s for a description of its
contents.  You need to build the lister utility to read it.

  C:> asm lister

This creates a file called lister.com.    The usage for lister is

   lister [-xz] <filename.lst>

At most one of -x or -z may be specified.  The -x option prints a
complete xref dump (definitions + references). The -z option prints a
list of labels that were not referenced anywhere.


e.g.
   
To print labels not referenced

C:> lister -z asm.lst
....
DisplaySignedAX               display.s    11    Label  115D 4445  
PadWithSpaces                 display.s    90    Label  11D2 4562  
DisplayRegister               display.s    113   Label  11F4 4596  
....

Format is:
Symbol-Name File-Name Line-No. Symbol-Type Value-Hex Value-Decimal


To print all defined symbols:
C:> lister asm.lst
....
PathSize                      asm.s        2     Equate 0040 64    
BufferSize                    asm.s        3     Equate 0014 20    
WordSize                      asm.s        4     Equate 0020 32    
....

Format is:
Symbol-Name File-Name Line-No. Number-of-Refs Symbol-Type Value-Hex Value-Dec


To print crossreferences:
C:> lister -x asm.lst
....
PathSize                      asm.s        2     Equate 0040 64    
  asm.s        148   
  asm.s        153   
  2      references found
...
 

Format is:
Def: Symbol-Name File-Name Line-No. Number-of-Refs Symbol-Type Value-Hex Value-Dec
Ref:    File-Name   Line-No.

			      REFERENCES

1. Tanenbaum A S, "Operating Systems : Design and
   Implementation", Prentice Hall of India, New Delhi,
   1989. 

2. Rector R and Alexy G, "The 8086 Book", Osborne /
   McGraw-Hill, California, 1980.
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             |CompileExpression


|Operator precedences used while compiling an expression. The value is
|carried in the high byte of an operator word while the operator waits
|on the stack; an operator with a larger value binds tighter.
OpenBrackPrec  = 1		    |given to '(' when it is pushed
AddOpPrec      = 2		    |of the binary + and - operators
MulOpPrec      = 3		    |of the binary *, / and % operators
BangOpPrec     = 4		    |of the binary ! operator
UnaryOpPrec    = 50		    |given to every unary operator


|CompileExpression
|
|Converts the infix expression on the input into the postfix `compiled
|expression' form and writes it at di with stosw.
|
|On entry: the carry flag and InputWord describe the first token, which
|          has already been fetched (carry set ends the expression);
|          di = where the compiled expression is to be written.
|On exit:  di = just past the terminating zero word.
|          ax, bx, cx, dx and bp are changed.
|Errors (syntax, brackets) are reported through PanicRecover.
|NOTE(review): this code assumes PanicRecover does not return here -
|confirm against its definition.
|
|Register usage while compiling:
|  bh = 1 if the last token was an operator
|  bl = 1 at the start of the expression and after a '(' until an
|       operand or a binary operator has been seen
|  dl = operands written since the last operator word
|  dh = operands written minus binary operators seen; must end up as 1
|  cx = the incoming operator word (cl = character, ch = precedence)
|Pending operators wait on the machine stack as words (low byte = the
|character, high byte = the precedence) above a zero sentinel word.
|Unary operators are marked by adding 0x80 to the character.

CompileExpression:
  mov  bx , #1                      |last was operator(bh) = 0 start (bl) = 1
  movb  dl, #0                      |operand count.
  movb  dh, #0                      |operands needed
  pushf                             |the xor below must not lose the carry
  xor  ax,ax
  stosw                             |header word, count patched in later
  popf
  push ax                           |zero sentinel: bottom of operator stack
  jmps FirstTokenThere
CompileNext:
  call GetToken
FirstTokenThere:
  jnc  TokenGot
  jmp  EndofExpression
TokenGot:
  movb  al,InputWord
  call IsAlphaNumDotAL
  jc   ConstantGot
  cmpb al,#'''
  jnz  NotNumber
ConstantGot:
|An operand is legal only after an operator or at the start.
  orb   bh,bh
  jnz  CanGetConstNow
  orb   bl,bl
  jnz  CanGetConstNow
SyntaxErrorMessage:
  mov  bx,#SyntaxErrMessage
  call PanicRecover
CanGetConstNow:
  call GetIdForToken
  stosw                            |operands go straight to the output
  xor  bx,bx                       |Last was operator = 0 | start = 0
  incb  dl                          |operand count
  incb  dh
  jmps CompileNext

NotNumber:
  call IsOperator
  jnz  NotAnOperator
  or   bx,bx                       |last was operator or start
  jz   LastWasNotOperator
  call IsUnaryOperator
  jz   WasUnaryOperator
  jmp  SyntaxErrorMessage
WasUnaryOperator:
  incb  dh                          |unary ops need 0 operands
  addb  al,#0x80
  movb  ah,#UnaryOpPrec
LastWasNotOperator:
  mov  cx,ax                       |cx keeps the incoming operator
  decb  dh
ContinuePopping:
  mov  bp,sp
|Compare with ch, not ah: the pop below overwrites ah with the precedence
|of the operator just popped, so ah is only right the first time round.
  cmpb  1[bp],ch                   |if the operator on stack has a higher
  jc   FinishedPopping             |or same precedence
  cmp  0[bp],#0         |if the stack isn't finished
  jz   FinishedPopping
  testb cl,#0x80                   |an incoming unary operator pops nothing
  jnz  FinishedPopping
  pop  ax
  call PatchLastOperator
  stosw
  jmps ContinuePopping
FinishedPopping:
  push cx
  movb  bh,#1                        |last was operator = 1
  testb cl,#0x80
  jnz  dontchangestart
  xorb bl,bl                       | Start = 0
dontchangestart:
  jmp  CompileNext

NotAnOperator:
  cmpb  al,#'('
  jnz  NotOpenBraces
  movb  ah,#OpenBrackPrec
  push ax
  mov  bx,#1                        |Last was operator = 0, Start = 1
  jmp  CompileNext

NotOpenBraces:
  cmpb  al,#')'
  jnz  NotCloseBraces
|Pop operators to the output until the matching '(' turns up. Reaching
|the zero sentinel first means there was no '('.
HaveNotFoundOpen:
  pop  ax
  cmp  ax,#0
  jnz  NoUnmatchedBraces
  mov  bx,#BracketsErrMessage
  call PanicRecover
NoUnmatchedBraces:
  cmpb  al,#'('
  jnz  StoreThisOne
  jmp  CompileNext
StoreThisOne:
  call PatchLastOperator
  stosw
  jmps HaveNotFoundOpen


|Any token that is not an operand, an operator or a bracket ends the
|expression, just as running out of tokens does.
NotCloseBraces:
EndofExpression:
  orb   bl,bl          |start
  jz   FoundSomething
  jmp  SyntaxErrorMessage

FoundSomething:
  decb  dh                         |exactly one value must be left over
  jz   OperatorCountOK
  jmp  SyntaxErrorMessage

|Flush the operator stack to the output. The zero sentinel is written
|too and becomes the terminating zero word of the compiled expression.
OperatorCountOK:
  pop  ax
  call PatchLastOperator
  stosw
  cmpb  al,#'('
  jnz  Notextraleft
  mov  bx,#BracketsErrMessage
  call PanicRecover

Notextraleft:
  or   ax,ax
  jnz  OperatorCountOK

|  mov  si, #PostFixBufferStart
|  call DisplayMessage
|  db   'Compiled Expr is :',0
|NextOperator:
|  lodsw
|  call DisplayRegister
|  call DisplayMessage
|  db   ' ',0
|  mov  cl,ah
|  or   ax,ax
|  jz   ExpressionEnd
|  or   cl,cl
|  jz   NextOperator
|moretodisplay:
|  lodsw
|  call DisplayRegister
|  call DisplayMessage
|  db   ' ',0
|  dec  cl
|  jnz  moretodisplay
|  jmp  short NextOperator
|ExpressionEnd:
|  mov  ax, StringSpace
|  call DisplayRegister
|  call DisplayMessage
|  db   0dh,0ah,0
  ret


|PatchLastOperator
|
|Writes the number of operands emitted since the last operator word into
|the high byte of that operator word, then clears the count.
|
|On entry: di = next free word of the compiled expression
|          dl = operands written since the last operator word
|On exit:  dl = 0; di, dh and every other register are unchanged.
|
|The operator word lies dl operand words below di, so its high byte is
|at di - 2*dl - 1. The count is stored straight from dl; it is not
|recovered by shifting a doubled byte back, which went wrong once the
|count reached 63.
PatchLastOperator:
  push di
  push dx
  xorb  dh,dh                      |dx = operand count
  sub  di,dx
  sub  di,dx                       |di = first operand after the operator word
  movb  -1[di],dl                  |high byte of the operator word before it
  pop  dx
  pop  di
  xorb  dl,dl                      |start counting afresh
  ret

|Overflow, EvalErrorEnd and EvaluateExpression
|
|EvaluateExpression evaluates the compiled (postfix) expression at si.
|
|On entry: si = first word of the compiled expression.
|On exit:  carry clear and ax = value of the expression, or
|          carry set if it could not be evaluated (the header word has
|          no operands, or FindValue returned with the carry set).
|          bx, cx, dx, bp and si are changed.
|
|Each operator word has the operator character in its low byte and, in
|its high byte, the number of operand words that follow it. A zero word
|ends the expression. Operands are pushed on the machine stack after
|being passed through FindValue; bp remembers the stack pointer on entry
|so that an abandoned evaluation can drop whatever was pushed.
|NOTE(review): FindValue is assumed to take the operand word in ax and
|return its value in ax, preserving cx, si and bp - confirm.
|
|Overflow reports OverFlowMessage through PanicRecover. It is reached
|when + - or * sets the carry, when an operand of the binary ! does not
|fit in a byte, and when the divisor of / or % is zero (div would
|otherwise raise the processor's divide error).
|Operator characters that are not handled below (the unary + among them)
|are skipped and leave the stack as it is.

Overflow:
  mov  bx,#OverFlowMessage
  call PanicRecover

EvalErrorEnd:
  mov  sp,bp                       |throw away any operands pushed so far
  stc
  ret

EvaluateExpression:
  mov  bp,sp
  lodsw                            |header word
  orb   ah,ah
  jz   EvalErrorEnd                |no operands at all
  movb  cl,ah
NextCycle:
  orb   cl,cl
  jz   OperatorFound
MoreConsts:
  lodsw
  call FindValue
  jc   EvalErrorEnd
  push ax
  decb  cl
  jnz  MoreConsts
OperatorFound:
  lodsw
  or   ax,ax                       |also clears the carry for the return
  jnz  EvalOperator
|The end of the expression is handled right here so that no conditional
|jump has to reach over the whole operator chain.
EvalEnd:
  pop  ax
  ret
EvalOperator:
  movb  cl,ah                      |operands that follow this operator
  cmpb  al,#'+'
  jnz  notplus
  pop  bx
  pop  ax
  add  ax,bx
  jc   Overflow
  push ax
  jmp  NextCycle
notplus:
  cmpb  al,#'-'
  jnz  notminus
  pop  bx
  pop  ax
  sub  ax,bx
  jc   Overflow
  push ax
  jmp  NextCycle
notminus:
  cmpb  al,#'*'
  jnz  notstar
  pop  bx
  pop  ax
  xor  dx,dx
  mul bx
  jc   Overflow
  push ax
  jmp  NextCycle
notstar:
|The binary ! joins two byte values: the left operand becomes the high
|byte and the right operand the low byte of the result.
  cmpb al,#'!'
  jnz  notbang
  pop  ax
  orb  ah,ah
  jnz  Overflow
  movb bl,al
  pop  ax
  orb  ah,ah
  jnz  Overflow
  movb ah,al
  movb al,bl
  push ax
  jmp  NextCycle
notbang:
  cmpb  al,#'/'
  jnz  notslash
  pop  bx
  pop  ax
  or   bx,bx                       |a zero divisor is reported, not executed
  jnz  DivisorNotZero
  jmp  Overflow
DivisorNotZero:
  xor  dx,dx
  div bx
  push ax                          |quotient
  jmp  NextCycle
notslash:
  cmpb  al,#'-'+0x80
  jnz  notuminus
  pop  ax
  neg  ax
  push ax
  jmp  NextCycle
notuminus:
  cmpb  al,#'%'
  jnz  notmod
  pop  bx
  pop  ax
  or   bx,bx                       |a zero divisor is reported, not executed
  jnz  ModulusNotZero
  jmp  Overflow
ModulusNotZero:
  xor  dx,dx
  div bx
  push dx                          |remainder
  jmp  NextCycle
notmod:
  cmpb  al,#'!'+0x80
  jnz   notnot
  pop  ax
  not  ax
  push ax
  jmp  NextCycle
notnot:
  jmp  NextCycle

|IsOperator
|
|Identifies unary as well as binary operators.
|On entry: al = the character to classify.
|On exit:  zero flag set if al is one of + - * / % !, clear otherwise.
|          ah = precedence of the binary operator when the zero flag is
|          set; when it is clear ah is left as BangOpPrec and means
|          nothing. al is unchanged.
|For unary operators, ah might not make sense, because some operators are
|binary as well as unary operators - specifically + and -. In such cases
|the precedence of the binary operator is returned even if the usage was
|as a unary operator. That distinction is made later on by the caller.


IsOperator:
  movb  ah,#AddOpPrec
  cmpb  al,#'+'
  jz   IsOperatorEnd
  cmpb  al,#'-'
  jz   IsOperatorEnd
  movb  ah,#MulOpPrec
  cmpb  al,#'*'
  jz   IsOperatorEnd
  cmpb  al,#'/'
  jz   IsOperatorEnd
  cmpb  al,#'%'
  jz   IsOperatorEnd
  movb  ah,#BangOpPrec
  cmpb  al,#'!'                    |the last compare decides the zero flag
IsOperatorEnd:
  ret

|IsUnaryOperator
|
|On entry: al = the character to classify.
|On exit:  zero flag set if al is - + or !, the characters accepted as
|          unary operators; clear otherwise. No register is changed.
IsUnaryOperator:
  cmpb  al,#'-'
  jz   IsUnaryOperatorEnd
  cmpb  al,#'+'
  jz   IsUnaryOperatorEnd
  cmpb  al,#'!'                    |the last compare decides the zero flag
IsUnaryOperatorEnd:
  ret


|GetIdForToken
|
|Returns in ax the symbol table entry to be used as the operand for the
|token in InputWord. bx, cx, dx, si and di are preserved.
|
|If IsAlphaAL returns with the carry set for the first character, the
|token is looked up as a name with FindSymbol and, when that fails, added
|with AddSymbol with its attributes and value zeroed; RecordXref is then
|called for it. Any other token is handed to DecimalConvertNumber and
|then to FindFakeSymbol.
|NOTE(review): the result is copied from si, so FindSymbol, AddSymbol and
|FindFakeSymbol are all assumed to leave the entry offset in si, and
|AddSymbol in di as well - confirm against their definitions.
GetIdForToken:
  push bx
  push cx
  push dx
  push si
  push di
  movb  al,InputWord
  call IsAlphaAL
  jc  IsName
  call DecimalConvertNumber
  call FindFakeSymbol
GotIndex:
  mov  ax,si                       |entry offset is the operand word
  pop  di
  pop  si
  pop  dx
  pop  cx
  pop  bx
  ret

IsName:
  call FindSymbol
  jnc  HaveIndexAlready
  call AddSymbol                   |first sight of this name
  mov  Attributes[di],#0
  mov  Value[di],#0
HaveIndexAlready:
  call RecordXref
  jmps GotIndex
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     CharsDisplayed:
  .byte 0

|PutCarriageReturn
|
|Displays a carriage return and a line feed. The three bytes after the
|call are the inline, zero terminated message that DisplayMessage shows
|and steps over before it returns to the ret below.
PutCarriageReturn:
  call DisplayMessage
  .byte 13                         |carriage return
  .byte 10                         |line feed
  .byte 0                          |end of the inline message
  ret

|DisplaySignedAX
|
|Displays ax as a signed decimal number, with a leading '-' when the
|sign bit is set. ax is preserved.
|CharsDisplayed ends up as the number of characters shown, the sign
|included, so that PadWithSpaces lines up negative numbers as well.
DisplaySignedAX:
  movb CharsDisplayed,#0
  or   ax,ax
  jns  DisplayAXInDecimal          |not negative: the unsigned routine does it all
  push ax
  movb  al,#'-'
  call DisplayCharacter
  pop  ax
  neg  ax                          |magnitude
  call DisplayAXInDecimal          |this restarts CharsDisplayed at zero
  incb CharsDisplayed              |so count the '-' shown above as well
  neg  ax                          |hand back the original value
  ret

|DisplayAXInDecimal
|
|Displays ax as an unsigned decimal number without leading zeros (a
|single 0 for zero). ax, bx, cx and dx are preserved.
|CharsDisplayed is reset first and so ends up as the number of digits.
|
|The remainders of repeated division by ten come out least significant
|digit first, so they are stacked and then displayed in reverse.
DisplayAXInDecimal:
  movb CharsDisplayed,#0
  push ax
  push dx
  push cx
  push bx
  mov  bx,#10                      |divisor
  xor  cx,cx                       |digits stacked so far
MoreDigitsToBeFound:
  xor  dx,dx
  div  bx                          |ax = quotient, dx = next digit
  push dx
  inc  cx
  or   ax,ax
  jnz  MoreDigitsToBeFound
MoreDigitsToBeDisplayed:
  pop  ax
  call DisplayDecimalDigit
  dec  cx
  jnz  MoreDigitsToBeDisplayed
  pop  bx
  pop  cx
  pop  dx
  pop  ax
  ret


|DisplayMessage
|
|Displays the zero terminated message that is assembled immediately
|after the call instruction, then returns to the byte that follows the
|terminating zero. ax, bx, dx and bp are preserved.
|CharsDisplayed is reset first, so afterwards it holds the number of
|characters shown.
DisplayMessage:
  movb CharsDisplayed,#0
  push bp
  mov  bp,sp
  push bx
  push dx
  push ax
  mov  bx,2[bp]                    |return address = start of the message
MoreCharsToBeDisplayed:
  movb  al,[bx]
  inc  bx
  orb   al,al
  jz   MessageEnded
  call DisplayCharacter
  jmps MoreCharsToBeDisplayed
MessageEnded:
  mov  2[bp],bx                    |return to just past the zero byte
  pop  ax
  pop  dx
  pop  bx
  pop  bp
  ret

|DisplayOtherMessage
|
|Displays the zero terminated message that bx points to.
|ax and bx are preserved. CharsDisplayed is reset first, so afterwards
|it holds the number of characters shown.
DisplayOtherMessage:
  movb CharsDisplayed,#0
  push bx
  push ax
StillMoreOtherChars:
  movb al,[bx]
  inc  bx
  orb  al,al                       |zero byte ends the message
  jz   OtherMessageEnded
  call DisplayCharacter
  jmps StillMoreOtherChars
OtherMessageEnded:
  pop  ax
  pop  bx
  ret

|PadWithSpaces
|
|Displays spaces until CharsDisplayed reaches the count in al. Nothing
|is displayed if CharsDisplayed is already al or more. ax is preserved.
PadWithSpaces:
  push ax
  cmpb al,CharsDisplayed
  jc   PaddedWithSpaces            |already past the wanted width
  jz   PaddedWithSpaces            |already exactly at it
  movb ah,al                       |ah = wanted width, al is needed for the space
  movb al,#' '
MorePaddingRequired:
  call DisplayCharacter            |this increments CharsDisplayed
  cmpb ah,CharsDisplayed
  jnz  MorePaddingRequired
PaddedWithSpaces:
  pop  ax
  ret

|DisplayDecimalDigit
|
|Displays the low four bits of al as the character '0' plus that value,
|which is a decimal digit for the values 0 to 9. ax is preserved.
DisplayDecimalDigit:
  push ax
  andb  al,#0x0F
  addb  al,#'0'
  call DisplayCharacter
  pop  ax
  ret

|DisplayRegister
|
|Displays ax as four hexadecimal digits, most significant digit first.
|ax and cx are preserved; ax comes back unchanged because it is rotated
|by sixteen bits in all.
|This routine does not reset CharsDisplayed itself; the count just goes
|up by one for each digit shown.
DisplayRegister:
  push cx
  movb  ch,#4                      |four digits to show
DisplayRegisterMore:
  rol  ax                          |four single rotates bring the next
  rol  ax                          |digit down into the low four bits
  rol  ax
  rol  ax
  call DisplayHexDigit
  decb  ch
  jnz  DisplayRegisterMore
  pop  cx
  ret

|Characters for the sixteen hexadecimal digits, indexed by digit value.
HexDigitTable:
  .ascii "0123456789ABCDEF"

|DisplayHexDigit
|
|Displays the low four bits of al as one hexadecimal digit.
|ax and bx are preserved.
DisplayHexDigit:
  push ax
  push bx
  andb  al,#15
  mov  bx,#HexDigitTable
  xlat                             |al = HexDigitTable[al]
  call  DisplayCharacter
  pop  bx
  pop  ax
  ret

|DisplayCharacter
|
|Displays the character in al by calling interrupt 0x21 with ah = 2 and
|the character in dl, then adds one to CharsDisplayed.
|ax and dx are preserved.
DisplayCharacter:
  push ax
  push dx
  movb  dl,al
  movb  ah,#2
  int  #0x21
  incb CharsDisplayed
  pop  dx
  pop  ax
  ret
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                |symbols.asm
|
|This file contains the symbol tables for the assembler.
|
|MatchKeyword
|
|Searches one predefined symbol table for the word in InputWord.
|
|On entry: si = start of the table. The first byte of the table is the
|          number of attribute bytes kept after every symbol; each entry
|          is a length byte, the symbol text and then the attributes; a
|          zero length byte ends the table.
|On exit:  carry clear and si = the attributes of the matching entry, or
|          carry set if the end of the table was reached.
|          al, bx, cx, dx, bp and di are changed.
|NOTE(review): the lodsb and cmpsb used here assume that the direction
|flag is clear and that es addresses the same segment as ds - confirm.
MatchKeyword:
  lodsb
  movb  bl,al                      |bx = attribute bytes per entry
  xorb  bh,bh

MatchKeyword1:
  mov  bp,si                       |bp = start of this entry
  mov  di, #InputWord
  lodsb                            |length of this symbol
  cmpb  al,#0
  jz   SymbolNotFound
  movb  cl,al
  xorb  ch,ch
  mov  dx,cx                       |keep the length, cmpsb uses up cx
  rep
  cmpsb
  jnz  NotThisSymbol
|Every character of the entry matched. Accept it only if the input word
|stops here too: either the last byte compared or the next byte of
|InputWord is zero.
  cmpb -1[di],#0
  jz   ThisSymbol
  cmpb [di],#0
  jnz  NotThisSymbol
ThisSymbol:
  clc
  ret

NotThisSymbol:
  mov si,bp
  add si,dx
  add si,bx                             |Number of bytes to skip
  inc si                                |One more for the length
  jmps MatchKeyword1
SymbolNotFound:
  stc
  ret

|The pseudo instructions that are accepted by the assembler are
|    .align n            align on multiple of n bytes
|    .ascii str          assemble a string
|    .asciz str          assemble a zero terminated string
|    .bss                What follows goes to the bss segment
|    .byte n             Assemble one or more bytes
|    .data               What follo