How to resolve the algorithm Compiler/lexical analyzer step by step in the Prolog programming language

Problem Statement
Step by Step Solution
Sourcecode

Problem Statement

Definition from Wikipedia: Create a lexical analyzer for the simple programming language specified below. The program should read input from a file and/or stdin, and write output to a file and/or stdout. If the language being used has a lexer module/library/class, it would be great if two versions of the solution are provided: One without the lexer module, and one with. The simple programming language to be analyzed is more or less a subset of C. It supports the following tokens: These differ from the the previous tokens, in that each occurrence of them has a value associated with it. For example, the following two program fragments are equivalent, and should produce the same token stream except for the line and column positions: The program output should be a sequence of lines, each consisting of the following whitespace-separated fields:

This task is intended to be used as part of a pipeline, with the other compiler tasks - for example: lex < hello.t | parse | gen | vm Or possibly: lex hello.t lex.out parse lex.out parse.out gen parse.out gen.out vm gen.out

This implies that the output of this task (the lexical analyzer) should be suitable as input to any of the Syntax Analyzer task programs. The following error conditions should be caught: Your solution should pass all the test cases above and the additional tests found Here.

The C and Python versions can be considered reference implementations.

Let's start with the solution:

Step by Step solution about How to resolve the algorithm Compiler/lexical analyzer step by step in the Prolog programming language

Source code in the prolog programming language

/*
    Test harness for the analyzer, not needed if we are actually using the output. 
*/
load_file(File, Input) :-
	read_file_to_codes(File, Codes, []),
	maplist(char_code, Chars, Codes),
	atom_chars(Input,Chars).

test_file(File) :-
	load_file(File, Input),
	tester(Input).

tester(S) :-
	atom_chars(S,Chars),
	tokenize(Chars,L),
	maplist(print_tok, L),
	!.

print_tok(L) :- 
	L =.. [Op,Line,Pos],
	format('~d\t~d\t~p~n', [Line,Pos,Op]).
print_tok(string(Value,Line,Pos)) :- 
	format('~d\t~d\tstring\t\t"~w"~n', [Line,Pos,Value]).	
print_tok(identifier(Value,Line,Pos)) :- 
	format('~d\t~d\tidentifier\t~p~n', [Line,Pos,Value]).	
print_tok(integer(Value,Line,Pos)) :- 
	format('~d\t~d\tinteger\t\t~p~n', [Line,Pos,Value]).	


/* 
	Tokenize 
	
	run the input over a DCG to get out the tokens. 
	
	In - a list of chars to tokenize
	Tokens = a list of tokens (excluding spaces).
*/
tokenize(In,RelTokens) :-
	newline_positions(In,1,NewLines),
	tokenize(In,[0|NewLines],1,1,Tokens),
	check_for_exceptions(Tokens),
	exclude(token_name(space),Tokens,RelTokens).

tokenize([],NewLines,Pos,LineNo,[end_of_input(LineNo,Offset)]) :- 
	position_offset(NewLines,Pos,Offset).
tokenize(In,NewLines,Pos,LineNo,Out) :-
	position_offset(NewLines,Pos,Offset),
	phrase(tok(Tok,TokLen,LineNo,Offset),In,T),
	(
		Tok = [] -> Out = Toks
		; Out = [Tok|Toks]
	),
	Pos1 is Pos + TokLen,
	update_line_no(LineNo,NewLines,Pos1,NewLineNo,NewNewLines),
	tokenize(T,NewNewLines,Pos1,NewLineNo,Toks).
	
update_line_no(LNo,[L],_,LNo,[L]). 
update_line_no(LNo,[L,Nl|T],Pos,LNo,[L,Nl|T]) :- 
	Pos =< Nl. 
update_line_no(LNo,[_,Nl|T],Pos,LNo2,Nlines) :- 
	Pos > Nl, 
	succ(LNo,LNo1), 
	update_line_no(LNo1,[Nl|T],Pos,LNo2,Nlines).

position_offset([Line|_],Pos,Offset) :- Offset is Pos - Line.	
	
token_name(Name,Tok) :- functor(Tok,Name,_).	

% Get a list of all the newlines and their position in the data
% This is used to create accurate row/column numbers. 	
newline_positions([],N,[N]).
newline_positions(['\n'|T],N,[N|Nt]) :- succ(N,N1), newline_positions(T,N1,Nt).
newline_positions([C|T],N,Nt) :- dif(C,'\n'), succ(N,N1), newline_positions(T,N1,Nt).	
	
% The tokenizer can tokenize some things that it shouldn't, deal with them here. 	
check_for_exceptions([]). % all ok
check_for_exceptions([op_divide(L,P),op_multiply(_,_)|_]) :- 
	format(atom(Error), 'Unclosed comment at line ~d,~d', [L,P]), 
	throw(Error).
check_for_exceptions([integer(_,L,P),identifier(_,_,_)|_]) :-
	format(atom(Error), 'Invalid identifier at line ~d,~d', [L,P]), 
	throw(Error).	
check_for_exceptions([_|T]) :- check_for_exceptions(T).	


/* 
	A set of helper DCGs for the more complicated token types
*/	
:- set_prolog_flag(double_quotes, chars).

identifier(I) --> c_types(I,csym).
identifier(['_']) --> ['_'].
identifier([]) --> [].

integer_(I,L) --> c_types(N,digit), { number_codes(I,N), length(N,L) }.

% get a sequence of characters of the same type (https://www.swi-prolog.org/pldoc/doc_for?object=char_type/2)
c_types([C|T],Type) --> c_type(C,Type), c_types(T,Type).
c_types([C],Type) --> c_type(C,Type).
c_type(C,Type) --> [C],{ char_type(C,Type) }.

anything([]) --> [].
anything([A|T]) --> [A], anything(T).

string_([]) --> [].
string_([A|T]) --> [A], { dif(A,'\n') }, string_(T).


/* 
	The token types are all handled by the tok DCG, order of predicates is important here. 
*/	
% comment
tok([],CLen,_,_) --> "/*", anything(A), "*/", { length(A,Len), CLen is Len + 4 }.

% toks
tok(op_and(L,P),2,L,P) --> "&&".
tok(op_or(L,P),2,L,P) --> "||".
tok(op_lessequal(L,P),2,L,P) --> "<=".
tok(op_greaterequal(L,P),2,L,P) --> ">=".
tok(op_greaterequal(L,P),2,L,P) --> ">=".
tok(op_equal(L,P),2,L,P) --> "==".
tok(op_notequal(L,P),2,L,P) --> "!=".
tok(op_assign(L,P),1,L,P) --> "=".
tok(op_multiply(L,P),1,L,P) --> "*".
tok(op_divide(L,P),1,L,P) --> "/".
tok(op_mod(L,P),1,L,P) --> "%".
tok(op_add(L,P),1,L,P) --> "+".
tok(op_subtract(L,P),1,L,P) --> "-".
tok(op_negate(L,P),1,L,P) --> "-".
tok(op_less(L,P),1,L,P) --> "<".
tok(op_greater(L,P),1,L,P) --> ">".
tok(op_not(L,P),1,L,P) --> "!".

% symbols
tok(left_paren(L,P),1,L,P) --> "(".
tok(right_paren(L,P),1,L,P) --> ")".
tok(left_brace(L,P),1,L,P) --> "{".
tok(right_brace(L,P),1,L,P) --> "}".
tok(semicolon(L,P),1,L,P) --> ";".
tok(comma(L,P),1,L,P) --> ",".

% keywords
tok(keyword_if(L,P),2,L,P) --> "if".
tok(keyword_else(L,P),4,L,P) --> "else".
tok(keyword_while(L,P),5,L,P) --> "while".
tok(keyword_print(L,P),5,L,P) --> "print".
tok(keyword_putc(L,P),4,L,P) --> "putc".

% identifier and literals
tok(identifier(I,L,P),Len,L,P) --> c_type(S,csymf), identifier(T), { atom_chars(I,[S|T]), length([S|T],Len) }. 
tok(integer(V,L,P),Len,L,P) --> integer_(V,Len).
tok(integer(I,L,P),4,L,P) --> "'\\\\'", { char_code('\\', I) }.
tok(integer(I,L,P),4,L,P) --> "'\\n'", { char_code('\n', I) }.
tok(integer(I,L,P),3,L,P) --> ['\''], [C], ['\''], { dif(C,'\n'), dif(C,'\''), char_code(C,I) }.
tok(string(S,L,P),SLen,L,P) --> ['"'], string_(A),['"'], { atom_chars(S,A), length(A,Len), SLen is Len + 2 }.

% spaces
tok(space(L,P),Len,L,P) --> c_types(S,space), { length(S,Len) }.

% anything else is an error
tok(_,_,L,P) --> { format(atom(Error), 'Invalid token at line ~d,~d', [L,P]), throw(Error) }.

You may also check:How to resolve the algorithm Palindrome detection step by step in the Emacs Lisp programming language
You may also check:How to resolve the algorithm Subtractive generator step by step in the zkl programming language
You may also check:How to resolve the algorithm Rhonda numbers step by step in the Rust programming language
You may also check:How to resolve the algorithm Matrix-exponentiation operator step by step in the PARI/GP programming language
You may also check:How to resolve the algorithm Sorting algorithms/Patience sort step by step in the C programming language

How to resolve the algorithm Compiler/lexical analyzer step by step in the Prolog programming language

Table of Contents

Problem Statement

Step by Step solution about How to resolve the algorithm Compiler/lexical analyzer step by step in the Prolog programming language

Source code in the prolog programming language