How to resolve the algorithm Compiler/lexical analyzer step by step in the FreeBASIC programming language

Published on 12 May 2024 09:40 PM

How to resolve the algorithm Compiler/lexical analyzer step by step in the FreeBASIC programming language

Table of Contents

Problem Statement

Definition from Wikipedia: Create a lexical analyzer for the simple programming language specified below. The program should read input from a file and/or stdin, and write output to a file and/or stdout. If the language being used has a lexer module/library/class, it would be great if two versions of the solution are provided: One without the lexer module, and one with. The simple programming language to be analyzed is more or less a subset of C. It supports the following tokens: These differ from the the previous tokens, in that each occurrence of them has a value associated with it. For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions: The program output should be a sequence of lines, each consisting of the following whitespace-separated fields:

This task is intended to be used as part of a pipeline, with the other compiler tasks - for example: lex < hello.t | parse | gen | vm Or possibly: lex hello.t lex.out parse lex.out parse.out gen parse.out gen.out vm gen.out

This implies that the output of this task (the lexical analyzer) should be suitable as input to any of the Syntax Analyzer task programs. The following error conditions should be caught: Your solution should pass all the test cases above and the additional tests found Here.

The C and Python versions can be considered reference implementations.

Let's start with the solution:

Step by Step solution about How to resolve the algorithm Compiler/lexical analyzer step by step in the FreeBASIC programming language

Source code in the freebasic programming language

enum Token_type
    tk_EOI
    tk_Mul
    tk_Div
    tk_Mod
    tk_Add
    tk_Sub
    tk_Negate
    tk_Not
    tk_Lss
    tk_Leq
    tk_Gtr
    tk_Geq
    tk_Eq
    tk_Neq
    tk_Assign
    tk_And
    tk_Or
    tk_If
    tk_Else
    tk_While
    tk_Print
    tk_Putc
    tk_Lparen
    tk_Rparen
    tk_Lbrace
    tk_Rbrace
    tk_Semi
    tk_Comma
    tk_Ident
    tk_Integer
    tk_String
end enum

const NewLine     = chr(10)
const DoubleQuote = chr(34)
const BackSlash   = chr(92)

' where we store keywords and variables
type Symbol
    s_name as string
    tok as Token_type
end type

dim shared symtab() as Symbol

dim shared cur_line as string
dim shared cur_ch as string
dim shared line_num as integer
dim shared col_num as integer

function is_digit(byval ch as string) as long
    is_digit = ch >= "0" AndAlso ch <= "9"
end function

function is_alnum(byval ch as string) as long
    is_alnum = (ucase(ch) >= "A" AndAlso ucase(ch) <= "Z") OrElse is_digit(ch)
end function

sub error_msg(byval eline as integer, byval ecol as integer, byval msg as string)
    print "("; eline; ":"; ecol; ") "; msg   
    print : print "Hit any to end program"   
    sleep                                    
    system
end sub

' add an identifier to the symbol table
function install(byval s_name as string, byval tok as Token_type) as integer
    dim n as integer = ubound(symtab) + 1
    redim preserve symtab(n)

    symtab(n).s_name = s_name
    symtab(n).tok    = tok
    return n
end function

' search for an identifier in the symbol table
function lookup(byval s_name as string) as integer
    dim i as integer

    for i = lbound(symtab) to ubound(symtab)
        if symtab(i).s_name = s_name then return i
    next
    return -1
end function

sub next_line()         ' read the next line of input from the source file
    cur_line = ""
    cur_ch  = ""        ' empty cur_ch means end-of-file
    if eof(1) then exit sub
    line input #1, cur_line
    cur_line = cur_line + NewLine
    line_num += + 1
    col_num = 1
end sub

sub next_char()         ' get the next char
    cur_ch = ""
    col_num += 1
    if col_num > len(cur_line) then next_line()
    if col_num <= len(cur_line) then cur_ch = mid(cur_line, col_num, 1)
end sub

function follow(byval err_line as integer, byval err_col as integer, byval expect as string, byval ifyes as Token_type, byval ifno as Token_type) as Token_type
    if cur_ch = expect then
        next_char()
        return ifyes
    end if
    if ifno = tk_eoi then error_msg(err_line, err_col, "follow unrecognized character: " + cur_ch)
    return ifno
end function

sub gettok(byref err_line as integer, byref err_col as integer, byref tok as Token_type, byref v as string)
    ' skip whitespace
    do while (cur_ch = " " or cur_ch = chr(9) or cur_ch = NewLine) and (cur_ch <> "")
        next_char()
    loop

    err_line = line_num
    err_col  = col_num

    select case cur_ch
        case "":  tok = tk_eoi: exit sub
        case "{": tok = tk_lbrace: next_char(): exit sub
        case "}": tok = tk_rbrace: next_char(): exit sub
        case "(": tok = tk_lparen: next_char(): exit sub
        case ")": tok = tk_rparen: next_char(): exit sub
        case "+": tok = tk_add:    next_char(): exit sub
        case "-": tok = tk_sub:    next_char(): exit sub
        case "*": tok = tk_mul:    next_char(): exit sub
        case "%": tok = tk_Mod:    next_char(): exit sub
        case ";": tok = tk_semi:   next_char(): exit sub
        case ",": tok = tk_comma:  next_char(): exit sub
        case "/": ' div or comment
            next_char()
            if cur_ch <> "*" then
                tok = tk_div
                exit sub
            end if
            ' skip comments
            next_char()
            do
                if cur_ch = "*" then
                    next_char()
                    if cur_ch = "/" then
                        next_char()
                        gettok(err_line, err_col, tok, v)
                        exit sub
                    end if
                elseif cur_ch = "" then error_msg(err_line, err_col, "EOF in comment")
                else
                    next_char()
                end if
            loop
        case "'":   ' single char literals
            next_char()
            v = str(asc(cur_ch))
            if cur_ch = "'" then error_msg(err_line, err_col, "empty character constant")
            if cur_ch = BackSlash then          
                next_char()
                if cur_ch = "n" then
                    v = "10"
                elseif cur_ch = BackSlash then  
                    v = "92"                  
                else error_msg(err_line, err_col, "unknown escape sequence: " + cur_ch)
                end if
            end if
            next_char()
            if cur_ch <> "'" then error_msg(err_line, err_col, "multi-character constant")
            next_char()
            tok = tk_integer
            exit sub
        case "<": next_char(): tok = follow(err_line, err_col, "=", tk_Leq, tk_Lss): exit sub
        case ">": next_char(): tok = follow(err_line, err_col, "=", tk_Geq, tk_Gtr): exit sub
        case "!": next_char(): tok = follow(err_line, err_col, "=", tk_Neq, tk_Not): exit sub
        case "=": next_char(): tok = follow(err_line, err_col, "=", tk_Eq,  tk_Assign): exit sub
        case "&": next_char(): tok = follow(err_line, err_col, "&", tk_And, tk_EOI): exit sub
        case "|": next_char(): tok = follow(err_line, err_col, "|", tk_Or,  tk_EOI): exit sub
        case DoubleQuote: ' string
            v = cur_ch
            next_char()
            do while cur_ch <> DoubleQuote
                if cur_ch = NewLine then error_msg(err_line, err_col, "EOL in string")
                if cur_ch = "" then error_msg(err_line, err_col, "EOF in string")
                v += cur_ch
                next_char()
            loop
            v += cur_ch
            next_char()
            tok = tk_string
            exit sub
        case else   ' integers or identifiers
            dim is_number as boolean = is_digit(cur_ch)
            v = ""
            do while is_alnum(cur_ch) orelse cur_ch = "_"
                if not is_digit(cur_ch) then is_number = false
                v += cur_ch
                next_char()
            loop
            if len(v) = 0 then error_msg(err_line, err_col, "unknown character: " + cur_ch)
            if is_digit(mid(v, 1, 1)) then
                if not is_number then error_msg(err_line, err_col, "invalid number: " + v)
                tok = tk_integer
                exit sub
            end if
            dim as integer index = lookup(v)
            if index = -1 then
                tok = tk_ident
            else
                tok = symtab(index).tok
            end if
            exit sub
    end select
end sub

sub init_lex(byval filein as string)
    install("else",  tk_else)
    install("if",    tk_if)
    install("print", tk_print)
    install("putc",  tk_putc)
    install("while", tk_while)

    open filein for input as #1

    cur_line = ""
    line_num = 0
    col_num = 0
    next_char()
end sub

sub scanner()
    dim err_line as integer
    dim err_col as integer
    dim tok as Token_type
    dim v as string
    dim tok_list(tk_eoi to tk_string) as string

    tok_list(tk_EOI    ) = "End_of_input"
    tok_list(tk_Mul    ) = "Op_multiply"
    tok_list(tk_Div    ) = "Op_divide"
    tok_list(tk_Mod    ) = "Op_mod"
    tok_list(tk_Add    ) = "Op_add"
    tok_list(tk_Sub    ) = "Op_subtract"
    tok_list(tk_Negate ) = "Op_negate"
    tok_list(tk_Not    ) = "Op_not"
    tok_list(tk_Lss    ) = "Op_less"
    tok_list(tk_Leq    ) = "Op_lessequal"
    tok_list(tk_Gtr    ) = "Op_greater"
    tok_list(tk_Geq    ) = "Op_greaterequal"
    tok_list(tk_Eq     ) = "Op_equal"
    tok_list(tk_Neq    ) = "Op_notequal"
    tok_list(tk_Assign ) = "Op_assign"
    tok_list(tk_And    ) = "Op_and"
    tok_list(tk_Or     ) = "Op_or"
    tok_list(tk_If     ) = "Keyword_if"
    tok_list(tk_Else   ) = "Keyword_else"
    tok_list(tk_While  ) = "Keyword_while"
    tok_list(tk_Print  ) = "Keyword_print"
    tok_list(tk_Putc   ) = "Keyword_putc"
    tok_list(tk_Lparen ) = "LeftParen"
    tok_list(tk_Rparen ) = "RightParen"
    tok_list(tk_Lbrace ) = "LeftBrace"
    tok_list(tk_Rbrace ) = "RightBrace"
    tok_list(tk_Semi   ) = "Semicolon"
    tok_list(tk_Comma  ) = "Comma"
    tok_list(tk_Ident  ) = "Identifier"
    tok_list(tk_Integer) = "Integer"
    tok_list(tk_String ) = "String"

    do
        gettok(err_line, err_col, tok, v)
        print using "#####  ##### \               " + BackSlash; err_line; err_col; tok_list(tok);
        if tok = tk_integer orelse tok = tk_ident orelse tok = tk_string then print " " + v;
        print
    loop until tok = tk_eoi
end sub

sub main()
    if command(1) = "" then print "filename required" : exit sub   
    init_lex(command(1))
    scanner()
end sub

main()
print : print "Hit any to end program"  
sleep                                   
system

  

You may also check:How to resolve the algorithm Forward difference step by step in the Raku programming language
You may also check:How to resolve the algorithm Time a function step by step in the Elena programming language
You may also check:How to resolve the algorithm Hough transform step by step in the BBC BASIC programming language
You may also check:How to resolve the algorithm Loops/For step by step in the Wart programming language
You may also check:How to resolve the algorithm HTTPS/Authenticated step by step in the PicoLisp programming language