How to resolve the algorithm Compiler/lexical analyzer step by step in the FreeBASIC programming language
How to resolve the algorithm Compiler/lexical analyzer step by step in the FreeBASIC programming language
Table of Contents
Problem Statement
Definition from Wikipedia: Create a lexical analyzer for the simple programming language specified below. The program should read input from a file and/or stdin, and write output to a file and/or stdout. If the language being used has a lexer module/library/class, it would be great if two versions of the solution are provided: One without the lexer module, and one with. The simple programming language to be analyzed is more or less a subset of C. It supports the following tokens: These differ from the the previous tokens, in that each occurrence of them has a value associated with it. For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions: The program output should be a sequence of lines, each consisting of the following whitespace-separated fields:
This task is intended to be used as part of a pipeline, with the other compiler tasks - for example: lex < hello.t | parse | gen | vm Or possibly: lex hello.t lex.out parse lex.out parse.out gen parse.out gen.out vm gen.out
This implies that the output of this task (the lexical analyzer) should be suitable as input to any of the Syntax Analyzer task programs. The following error conditions should be caught: Your solution should pass all the test cases above and the additional tests found Here.
The C and Python versions can be considered reference implementations.
Let's start with the solution:
Step by Step solution about How to resolve the algorithm Compiler/lexical analyzer step by step in the FreeBASIC programming language
Source code in the freebasic programming language
enum Token_type
tk_EOI
tk_Mul
tk_Div
tk_Mod
tk_Add
tk_Sub
tk_Negate
tk_Not
tk_Lss
tk_Leq
tk_Gtr
tk_Geq
tk_Eq
tk_Neq
tk_Assign
tk_And
tk_Or
tk_If
tk_Else
tk_While
tk_Print
tk_Putc
tk_Lparen
tk_Rparen
tk_Lbrace
tk_Rbrace
tk_Semi
tk_Comma
tk_Ident
tk_Integer
tk_String
end enum
const NewLine = chr(10)
const DoubleQuote = chr(34)
const BackSlash = chr(92)
' where we store keywords and variables
type Symbol
s_name as string
tok as Token_type
end type
dim shared symtab() as Symbol
dim shared cur_line as string
dim shared cur_ch as string
dim shared line_num as integer
dim shared col_num as integer
function is_digit(byval ch as string) as long
is_digit = ch >= "0" AndAlso ch <= "9"
end function
function is_alnum(byval ch as string) as long
is_alnum = (ucase(ch) >= "A" AndAlso ucase(ch) <= "Z") OrElse is_digit(ch)
end function
sub error_msg(byval eline as integer, byval ecol as integer, byval msg as string)
print "("; eline; ":"; ecol; ") "; msg
print : print "Hit any to end program"
sleep
system
end sub
' add an identifier to the symbol table
function install(byval s_name as string, byval tok as Token_type) as integer
dim n as integer = ubound(symtab) + 1
redim preserve symtab(n)
symtab(n).s_name = s_name
symtab(n).tok = tok
return n
end function
' search for an identifier in the symbol table
function lookup(byval s_name as string) as integer
dim i as integer
for i = lbound(symtab) to ubound(symtab)
if symtab(i).s_name = s_name then return i
next
return -1
end function
sub next_line() ' read the next line of input from the source file
cur_line = ""
cur_ch = "" ' empty cur_ch means end-of-file
if eof(1) then exit sub
line input #1, cur_line
cur_line = cur_line + NewLine
line_num += + 1
col_num = 1
end sub
sub next_char() ' get the next char
cur_ch = ""
col_num += 1
if col_num > len(cur_line) then next_line()
if col_num <= len(cur_line) then cur_ch = mid(cur_line, col_num, 1)
end sub
function follow(byval err_line as integer, byval err_col as integer, byval expect as string, byval ifyes as Token_type, byval ifno as Token_type) as Token_type
if cur_ch = expect then
next_char()
return ifyes
end if
if ifno = tk_eoi then error_msg(err_line, err_col, "follow unrecognized character: " + cur_ch)
return ifno
end function
sub gettok(byref err_line as integer, byref err_col as integer, byref tok as Token_type, byref v as string)
' skip whitespace
do while (cur_ch = " " or cur_ch = chr(9) or cur_ch = NewLine) and (cur_ch <> "")
next_char()
loop
err_line = line_num
err_col = col_num
select case cur_ch
case "": tok = tk_eoi: exit sub
case "{": tok = tk_lbrace: next_char(): exit sub
case "}": tok = tk_rbrace: next_char(): exit sub
case "(": tok = tk_lparen: next_char(): exit sub
case ")": tok = tk_rparen: next_char(): exit sub
case "+": tok = tk_add: next_char(): exit sub
case "-": tok = tk_sub: next_char(): exit sub
case "*": tok = tk_mul: next_char(): exit sub
case "%": tok = tk_Mod: next_char(): exit sub
case ";": tok = tk_semi: next_char(): exit sub
case ",": tok = tk_comma: next_char(): exit sub
case "/": ' div or comment
next_char()
if cur_ch <> "*" then
tok = tk_div
exit sub
end if
' skip comments
next_char()
do
if cur_ch = "*" then
next_char()
if cur_ch = "/" then
next_char()
gettok(err_line, err_col, tok, v)
exit sub
end if
elseif cur_ch = "" then error_msg(err_line, err_col, "EOF in comment")
else
next_char()
end if
loop
case "'": ' single char literals
next_char()
v = str(asc(cur_ch))
if cur_ch = "'" then error_msg(err_line, err_col, "empty character constant")
if cur_ch = BackSlash then
next_char()
if cur_ch = "n" then
v = "10"
elseif cur_ch = BackSlash then
v = "92"
else error_msg(err_line, err_col, "unknown escape sequence: " + cur_ch)
end if
end if
next_char()
if cur_ch <> "'" then error_msg(err_line, err_col, "multi-character constant")
next_char()
tok = tk_integer
exit sub
case "<": next_char(): tok = follow(err_line, err_col, "=", tk_Leq, tk_Lss): exit sub
case ">": next_char(): tok = follow(err_line, err_col, "=", tk_Geq, tk_Gtr): exit sub
case "!": next_char(): tok = follow(err_line, err_col, "=", tk_Neq, tk_Not): exit sub
case "=": next_char(): tok = follow(err_line, err_col, "=", tk_Eq, tk_Assign): exit sub
case "&": next_char(): tok = follow(err_line, err_col, "&", tk_And, tk_EOI): exit sub
case "|": next_char(): tok = follow(err_line, err_col, "|", tk_Or, tk_EOI): exit sub
case DoubleQuote: ' string
v = cur_ch
next_char()
do while cur_ch <> DoubleQuote
if cur_ch = NewLine then error_msg(err_line, err_col, "EOL in string")
if cur_ch = "" then error_msg(err_line, err_col, "EOF in string")
v += cur_ch
next_char()
loop
v += cur_ch
next_char()
tok = tk_string
exit sub
case else ' integers or identifiers
dim is_number as boolean = is_digit(cur_ch)
v = ""
do while is_alnum(cur_ch) orelse cur_ch = "_"
if not is_digit(cur_ch) then is_number = false
v += cur_ch
next_char()
loop
if len(v) = 0 then error_msg(err_line, err_col, "unknown character: " + cur_ch)
if is_digit(mid(v, 1, 1)) then
if not is_number then error_msg(err_line, err_col, "invalid number: " + v)
tok = tk_integer
exit sub
end if
dim as integer index = lookup(v)
if index = -1 then
tok = tk_ident
else
tok = symtab(index).tok
end if
exit sub
end select
end sub
sub init_lex(byval filein as string)
install("else", tk_else)
install("if", tk_if)
install("print", tk_print)
install("putc", tk_putc)
install("while", tk_while)
open filein for input as #1
cur_line = ""
line_num = 0
col_num = 0
next_char()
end sub
sub scanner()
dim err_line as integer
dim err_col as integer
dim tok as Token_type
dim v as string
dim tok_list(tk_eoi to tk_string) as string
tok_list(tk_EOI ) = "End_of_input"
tok_list(tk_Mul ) = "Op_multiply"
tok_list(tk_Div ) = "Op_divide"
tok_list(tk_Mod ) = "Op_mod"
tok_list(tk_Add ) = "Op_add"
tok_list(tk_Sub ) = "Op_subtract"
tok_list(tk_Negate ) = "Op_negate"
tok_list(tk_Not ) = "Op_not"
tok_list(tk_Lss ) = "Op_less"
tok_list(tk_Leq ) = "Op_lessequal"
tok_list(tk_Gtr ) = "Op_greater"
tok_list(tk_Geq ) = "Op_greaterequal"
tok_list(tk_Eq ) = "Op_equal"
tok_list(tk_Neq ) = "Op_notequal"
tok_list(tk_Assign ) = "Op_assign"
tok_list(tk_And ) = "Op_and"
tok_list(tk_Or ) = "Op_or"
tok_list(tk_If ) = "Keyword_if"
tok_list(tk_Else ) = "Keyword_else"
tok_list(tk_While ) = "Keyword_while"
tok_list(tk_Print ) = "Keyword_print"
tok_list(tk_Putc ) = "Keyword_putc"
tok_list(tk_Lparen ) = "LeftParen"
tok_list(tk_Rparen ) = "RightParen"
tok_list(tk_Lbrace ) = "LeftBrace"
tok_list(tk_Rbrace ) = "RightBrace"
tok_list(tk_Semi ) = "Semicolon"
tok_list(tk_Comma ) = "Comma"
tok_list(tk_Ident ) = "Identifier"
tok_list(tk_Integer) = "Integer"
tok_list(tk_String ) = "String"
do
gettok(err_line, err_col, tok, v)
print using "##### ##### \ " + BackSlash; err_line; err_col; tok_list(tok);
if tok = tk_integer orelse tok = tk_ident orelse tok = tk_string then print " " + v;
print
loop until tok = tk_eoi
end sub
sub main()
if command(1) = "" then print "filename required" : exit sub
init_lex(command(1))
scanner()
end sub
main()
print : print "Hit any to end program"
sleep
system
You may also check:How to resolve the algorithm Forward difference step by step in the Raku programming language
You may also check:How to resolve the algorithm Time a function step by step in the Elena programming language
You may also check:How to resolve the algorithm Hough transform step by step in the BBC BASIC programming language
You may also check:How to resolve the algorithm Loops/For step by step in the Wart programming language
You may also check:How to resolve the algorithm HTTPS/Authenticated step by step in the PicoLisp programming language