How to resolve the algorithm Compiler/lexical analyzer step by step in the Euphoria programming language
How to resolve the algorithm Compiler/lexical analyzer step by step in the Euphoria programming language
Table of Contents
Problem Statement
Definition from Wikipedia: Create a lexical analyzer for the simple programming language specified below. The program should read input from a file and/or stdin, and write output to a file and/or stdout. If the language being used has a lexer module/library/class, it would be great if two versions of the solution are provided: One without the lexer module, and one with. The simple programming language to be analyzed is more or less a subset of C. It supports the following tokens: These differ from the the previous tokens, in that each occurrence of them has a value associated with it. For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions: The program output should be a sequence of lines, each consisting of the following whitespace-separated fields:
This task is intended to be used as part of a pipeline, with the other compiler tasks - for example: lex < hello.t | parse | gen | vm Or possibly: lex hello.t lex.out parse lex.out parse.out gen parse.out gen.out vm gen.out
This implies that the output of this task (the lexical analyzer) should be suitable as input to any of the Syntax Analyzer task programs. The following error conditions should be caught: Your solution should pass all the test cases above and the additional tests found Here.
The C and Python versions can be considered reference implementations.
Let's start with the solution:
Step by Step solution about How to resolve the algorithm Compiler/lexical analyzer step by step in the Euphoria programming language
Source code in the euphoria programming language
include std/io.e
include std/map.e
include std/types.e
include std/convert.e
constant true = 1, false = 0, EOF = -1
enum tk_EOI, tk_Mul, tk_Div, tk_Mod, tk_Add, tk_Sub, tk_Negate, tk_Not, tk_Lss, tk_Leq,
tk_Gtr, tk_Geq, tk_Eq, tk_Neq, tk_Assign, tk_And, tk_Or, tk_If, tk_Else, tk_While,
tk_Print, tk_Putc, tk_Lparen, tk_Rparen, tk_Lbrace, tk_Rbrace, tk_Semi, tk_Comma,
tk_Ident, tk_Integer, tk_String
constant all_syms = {"End_of_input", "Op_multiply", "Op_divide", "Op_mod", "Op_add",
"Op_subtract", "Op_negate", "Op_not", "Op_less", "Op_lessequal", "Op_greater",
"Op_greaterequal", "Op_equal", "Op_notequal", "Op_assign", "Op_and", "Op_or",
"Keyword_if", "Keyword_else", "Keyword_while", "Keyword_print", "Keyword_putc",
"LeftParen", "RightParen", "LeftBrace", "RightBrace", "Semicolon", "Comma",
"Identifier", "Integer", "String"}
integer input_file, the_ch = ' ', the_col = 0, the_line = 1
sequence symbols
map key_words = new()
procedure error(sequence format, sequence data)
printf(STDOUT, format, data)
abort(1)
end procedure
-- get the next character from the input
function next_ch()
the_ch = getc(input_file)
the_col += 1
if the_ch = '\n' then
the_line += 1
the_col = 0
end if
return the_ch
end function
-- 'x' - character constants
function char_lit(integer err_line, integer err_col)
integer n = next_ch() -- skip opening quote
if the_ch = '\'' then
error("%d %d empty character constant", {err_line, err_col})
elsif the_ch = '\\' then
next_ch()
if the_ch = 'n' then
n = 10
elsif the_ch = '\\' then
n = '\\'
else
error("%d %d unknown escape sequence \\%c", {err_line, err_col, the_ch})
end if
end if
if next_ch() != '\'' then
error("%d %d multi-character constant", {err_line, err_col})
end if
next_ch()
return {tk_Integer, err_line, err_col, n}
end function
-- process divide or comments
function div_or_cmt(integer err_line, integer err_col)
if next_ch() != '*' then
return {tk_Div, err_line, err_col}
end if
-- comment found
next_ch()
while true do
if the_ch = '*' then
if next_ch() = '/' then
next_ch()
return get_tok()
end if
elsif the_ch = EOF then
error("%d %d EOF in comment", {err_line, err_col})
else
next_ch()
end if
end while
end function
-- "string"
function string_lit(integer start, integer err_line, integer err_col)
string text = ""
while next_ch() != start do
if the_ch = EOF then
error("%d %d EOF while scanning string literal", {err_line, err_col})
end if
if the_ch = '\n' then
error("%d %d EOL while scanning string literal", {err_line, err_col})
end if
text &= the_ch
end while
next_ch()
return {tk_String, err_line, err_col, text}
end function
-- handle identifiers and integers
function ident_or_int(integer err_line, integer err_col)
integer n, is_number = true
string text = ""
while t_alnum(the_ch) or the_ch = '_' do
text &= the_ch
if not t_digit(the_ch) then
is_number = false
end if
next_ch()
end while
if length(text) = 0 then
error("%d %d ident_or_int: unrecognized character: (%d) '%s'", {err_line, err_col, the_ch, the_ch})
end if
if t_digit(text[1]) then
if not is_number then
error("%d %d invalid number: %s", {err_line, err_col, text})
end if
n = to_integer(text)
return {tk_Integer, err_line, err_col, n}
end if
if has(key_words, text) then
return {get(key_words, text), err_line, err_col}
end if
return {tk_Ident, err_line, err_col, text}
end function
-- look ahead for '>=', etc.
function follow(integer expect, integer ifyes, integer ifno, integer err_line, integer err_col)
if next_ch() = expect then
next_ch()
return {ifyes, err_line, err_col}
end if
if ifno = tk_EOI then
error("%d %d follow: unrecognized character: (%d)", {err_line, err_col, the_ch})
end if
return {ifno, err_line, err_col}
end function
-- return the next token type
function get_tok()
while t_space(the_ch) do
next_ch()
end while
integer err_line = the_line
integer err_col = the_col
switch the_ch do
case EOF then return {tk_EOI, err_line, err_col}
case '/' then return div_or_cmt(err_line, err_col)
case '\'' then return char_lit(err_line, err_col)
case '<' then return follow('=', tk_Leq, tk_Lss, err_line, err_col)
case '>' then return follow('=', tk_Geq, tk_Gtr, err_line, err_col)
case '=' then return follow('=', tk_Eq, tk_Assign, err_line, err_col)
case '!' then return follow('=', tk_Neq, tk_Not, err_line, err_col)
case '&' then return follow('&', tk_And, tk_EOI, err_line, err_col)
case '|' then return follow('|', tk_Or, tk_EOI, err_line, err_col)
case '"' then return string_lit(the_ch, err_line, err_col)
case else
integer sym = symbols[the_ch]
if sym != tk_EOI then
next_ch()
return {sym, err_line, err_col}
end if
return ident_or_int(err_line, err_col)
end switch
end function
procedure init()
put(key_words, "else", tk_Else)
put(key_words, "if", tk_If)
put(key_words, "print", tk_Print)
put(key_words, "putc", tk_Putc)
put(key_words, "while", tk_While)
symbols = repeat(tk_EOI, 256)
symbols['{'] = tk_Lbrace
symbols['}'] = tk_Rbrace
symbols['('] = tk_Lparen
symbols[')'] = tk_Rparen
symbols['+'] = tk_Add
symbols['-'] = tk_Sub
symbols['*'] = tk_Mul
symbols['%'] = tk_Mod
symbols[';'] = tk_Semi
symbols[','] = tk_Comma
end procedure
procedure main(sequence cl)
sequence file_name
input_file = STDIN
if length(cl) > 2 then
file_name = cl[3]
input_file = open(file_name, "r")
if input_file = -1 then
error("Could not open %s", {file_name})
end if
end if
init()
sequence t
loop do
t = get_tok()
printf(STDOUT, "%5d %5d %-8s", {t[2], t[3], all_syms[t[1]]})
switch t[1] do
case tk_Integer then printf(STDOUT, " %5d\n", {t[4]})
case tk_Ident then printf(STDOUT, " %s\n", {t[4]})
case tk_String then printf(STDOUT, " \"%s\"\n", {t[4]})
case else printf(STDOUT, "\n")
end switch
until t[1] = tk_EOI
end loop
end procedure
main(command_line())
You may also check:How to resolve the algorithm Extreme floating point values step by step in the Wren programming language
You may also check:How to resolve the algorithm Xiaolin Wu's line algorithm step by step in the Scala programming language
You may also check:How to resolve the algorithm Josephus problem step by step in the ERRE programming language
You may also check:How to resolve the algorithm Program termination step by step in the Forth programming language
You may also check:How to resolve the algorithm Huffman coding step by step in the Ada programming language