Lexical Analyzer

SLY (Sly Lex-Yacc)

Ahmad Yoosofan

Compiler course

University of Kashan

SLY

Install SLY

# install Python (already installed on most Linux distributions)
# install pip3:
#   sudo apt-get install python3-pip   # for Debian and Ubuntu

pip3 install sly

# or just copy the sly source files into the project folder
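
# A quick way to verify the installation: importing the module must succeed.
# (Printing sly.__version__ assumes the installed release exposes that
# attribute; a plain "import sly" succeeding is enough.)
python3 -c "import sly; print(sly.__version__)"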

Simple Lexer Code

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'[0-9]+'  # equivalently r'\d+'

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 222.plus.py

type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='PLUS', value='+'
type='NUMBER', value='8'
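
tokenize() returns a generator of Token objects; besides type and value, each token also carries index and lineno attributes (used in later slides). A quick check with the lexer above (a sketch):

lexer = CalcLexer()
toks = list(lexer.tokenize('3 + 42'))
print(len(toks))                    # 3
print(toks[0].type, toks[0].value)  # NUMBER 3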

Simple Error Handling

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'[0-9]+'  # equivalently r'\d+'

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8.43+456'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 233.error.py

type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='PLUS', value='+'
type='NUMBER', value='8'
Illegal character '.'
type='NUMBER', value='43'
type='PLUS', value='+'
type='NUMBER', value='456'
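
Since the handler also has access to self.index (the current position in the input, already used above to skip the bad character), the message can report where the error occurred. A minimal variation of the error() method (a sketch, not from the slides):

  def error(self, t):
    # t.value holds the remaining input; t.value[0] is the offending character
    print("Illegal character '%s' at index %d" % (t.value[0], self.index))
    self.index += 1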

Converting Value Based on Type

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'\d+'  # equivalently r'[0-9]+'

  def NUMBER(self, t):
    t.value = int(t.value)
    print('number:', t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 244.plus.py

number: 3
type='NUMBER', value=3
type='PLUS', value='+'
number: 42
type='NUMBER', value=42
type='PLUS', value='+'
number: 8
type='NUMBER', value=8

Index of Token in Buffer

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'\d+'  # equivalently r'[0-9]+'

  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r' % (
      tok.type, tok.value, tok.index))
python3 255.index.py

type='NUMBER', value=3, index=0
type='PLUS', value='+', index=2
type='NUMBER', value=42, index=4
type='PLUS', value='+', index=7
type='NUMBER', value=8, index=8

NUMBER as a Function Only

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r' % (
      tok.type, tok.value, tok.index))
python3 264.just.number.function.py

type='NUMBER', value=3, index=0
type='PLUS', value='+', index=2
type='NUMBER', value=42, index=4
type='PLUS', value='+', index=7
type='NUMBER', value=8, index=8

Ignoring Newlines and One-Line Comments

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  ignore_comment = r'\#.*'
  ignore_newline = r'\n+'

  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '''3 + 42 +8 # First comment
    + 46+980+51+# Another comment
    343+43
  '''
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r'
       % (tok.type, tok.value, tok.index))
python3 274.ignore.lines.py

type='NUMBER', value=3, index=0
type='PLUS', value='+', index=2
type='NUMBER', value=42, index=4
type='PLUS', value='+', index=7
type='NUMBER', value=8, index=8
type='PLUS', value='+', index=30
type='NUMBER', value=46, index=32
type='PLUS', value='+', index=34
type='NUMBER', value=980, index=35
type='PLUS', value='+', index=38
type='NUMBER', value=51, index=39
type='PLUS', value='+', index=41
type='NUMBER', value=343, index=64
type='PLUS', value='+', index=67
type='NUMBER', value=43, index=68

Counting Lines (Error)

A string pattern such as ignore_newline = r'\n+' discards the newlines but never updates self.lineno, so every token in the output below is reported on line 1.

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  ignore_comment = r'\#.*'
  ignore_newline = r'\n+'  # WRONG for line counting: no action runs here

  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '''3 + 42 +8 # First comment
    + 46+980+51+# Another comment
    343+43
  '''
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r, lineno=%r'
       % (tok.type, tok.value, tok.index, tok.lineno))
python3 280.lineno.wrong.py

type='NUMBER', value=3, index=0, lineno=1
type='PLUS', value='+', index=2, lineno=1
type='NUMBER', value=42, index=4, lineno=1
type='PLUS', value='+', index=7, lineno=1
type='NUMBER', value=8, index=8, lineno=1
type='PLUS', value='+', index=30, lineno=1
type='NUMBER', value=46, index=32, lineno=1
type='PLUS', value='+', index=34, lineno=1
type='NUMBER', value=980, index=35, lineno=1
type='PLUS', value='+', index=38, lineno=1
type='NUMBER', value=51, index=39, lineno=1
type='PLUS', value='+', index=41, lineno=1
type='NUMBER', value=343, index=64, lineno=1
type='PLUS', value='+', index=67, lineno=1
type='NUMBER', value=43, index=68, lineno=1

Line Number of Tokens

Defining ignore_newline as a function lets the lexer count the matched newlines and update self.lineno.

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {NUMBER, PLUS}
  ignore = ' \t'
  ignore_comment = r'\#.*'
  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  # Line number tracking
  @_(r'\n+')
  def ignore_newline(self, t):
    self.lineno += t.value.count('\n')

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '''3 + 42 +8 # First comment
    + 46+980+51+# Another comment
    343+43
  '''
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r, lineno=%r'
       % (tok.type, tok.value, tok.index, tok.lineno))
python3 284.lineno.py

type='NUMBER', value=3, index=0, lineno=1
type='PLUS', value='+', index=2, lineno=1
type='NUMBER', value=42, index=4, lineno=1
type='PLUS', value='+', index=7, lineno=1
type='NUMBER', value=8, index=8, lineno=1
type='PLUS', value='+', index=30, lineno=2
type='NUMBER', value=46, index=32, lineno=2
type='PLUS', value='+', index=34, lineno=2
type='NUMBER', value=980, index=35, lineno=2
type='PLUS', value='+', index=38, lineno=2
type='NUMBER', value=51, index=39, lineno=2
type='PLUS', value='+', index=41, lineno=2
type='NUMBER', value=343, index=64, lineno=3
type='PLUS', value='+', index=67, lineno=3
type='NUMBER', value=43, index=68, lineno=3
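
With both index and lineno available, a column number can be derived by searching backwards from the token for the last newline. A small helper along the lines of the one in the SLY documentation (a sketch; text is the full input string):

def find_column(text, token):
  # rfind returns -1 on the first line, so columns start at 1 everywhere
  last_cr = text.rfind('\n', 0, token.index)
  return token.index - last_cr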

Token Remapping

Keywords are matched by the ID pattern and then remapped to their own token types. The remapping applies only when the whole lexeme equals the key (see the check after the output below).

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {NUMBER, PLUS, ID, IF, ELSE, WHILE}
  ignore = ' \t'
  ignore_comment = r'\#.*'
  PLUS = r'\+'
  ID = r'[a-zA-Z_][a-zA-Z0-9_]*'

  # Special cases: remap exact keyword matches
  ID['if'] = IF
  ID['else'] = ELSE
  ID['while'] = WHILE

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  # Line number tracking
  @_(r'\n+')
  def ignore_newline(self, t):
    self.lineno += t.value.count('\n')

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = 'count+ if +ifelse+8+while # First comment'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 322.token.remapping.py

type='ID', value='count'
type='PLUS', value='+'
type='IF', value='if'
type='PLUS', value='+'
type='ID', value='ifelse'
type='PLUS', value='+'
type='NUMBER', value=8
type='PLUS', value='+'
type='WHILE', value='while'
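
A quick check (a sketch using the lexer from this slide) that remapping is exact-match only, so 'ifelse' and 'while0' stay plain identifiers:

lexer = CalcLexer()
print([(t.type, t.value) for t in lexer.tokenize('if ifelse while0')])
# [('IF', 'if'), ('ID', 'ifelse'), ('ID', 'while0')]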

More Token Types

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
             DIVIDE, ASSIGN, LPAREN, RPAREN }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  NUMBER  = r'\d+'
  PLUS    = r'\+'
  MINUS   = r'-'
  TIMES   = r'\*'
  DIVIDE  = r'/'
  ASSIGN  = r'='
  LPAREN  = r'\('
  RPAREN  = r'\)'

if __name__ == '__main__':
  data = 'x = 3 + 42 * (s - t)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 524.more.tokens.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='TIMES', value='*'
type='LPAREN', value='('
type='ID', value='s'
type='MINUS', value='-'
type='ID', value='t'
type='RPAREN', value=')'

Longer Tokens before Shorter Tokens

SLY tries string patterns in the order they are defined in the class, so the longer EQ (==) must be defined before the shorter ASSIGN (=).

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
    DIVIDE, LPAREN, RPAREN, ASSIGN, EQ }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  NUMBER  = r'\d+'
  PLUS    = r'\+'
  MINUS   = r'-'
  TIMES   = r'\*'
  DIVIDE  = r'/'
  EQ      = r'=='  # MUST APPEAR FIRST! (LONGER)
  ASSIGN  = r'='
  LPAREN  = r'\('
  RPAREN  = r'\)'

if __name__ == '__main__':
  data = 'x = 3 + 42 * (s == t)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 544.longer.before.shorter.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='TIMES', value='*'
type='LPAREN', value='('
type='ID', value='s'
type='EQ', value='=='
type='ID', value='t'
type='RPAREN', value=')'

Longer Tokens before Shorter Tokens (Error)

With ASSIGN defined first, '==' is scanned as two ASSIGN tokens and EQ can never match.

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
    DIVIDE, LPAREN, RPAREN, ASSIGN, EQ }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  NUMBER  = r'\d+'
  PLUS    = r'\+'
  MINUS   = r'-'
  TIMES   = r'\*'
  DIVIDE  = r'/'
  ASSIGN  = r'='
  EQ      = r'=='  # WRONG: defined after the shorter ASSIGN, so it never matches
  LPAREN  = r'\('
  RPAREN  = r'\)'

if __name__ == '__main__':
  data = 'x = 3 + 42 * (s == t)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 555.longer.before.shorter.error.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='TIMES', value='*'
type='LPAREN', value='('
type='ID', value='s'
type='ASSIGN', value='='
type='ASSIGN', value='='
type='ID', value='t'
type='RPAREN', value=')'

Adding Match Actions

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, ASSIGN, EQ }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  PLUS    = r'\+'
  EQ      = r'=='  # must appear before the shorter ASSIGN
  ASSIGN  = r'='

  # Several patterns may share one action function
  @_(r'0x[0-9a-fA-F]+', r'\d+')
  def NUMBER(self, t):
    if t.value.startswith('0x'):
      t.value = int(t.value[2:], 16)
    else:
      t.value = int(t.value)
    return t

if __name__ == '__main__':
  data = 'x = 3 + 0xa2 33ab  s == t'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 575.adding.match.actions.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value=3
type='PLUS', value='+'
type='NUMBER', value=162
type='NUMBER', value=33
type='ID', value='ab'
type='ID', value='s'
type='EQ', value='=='
type='ID', value='t'
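
The value 162 comes from the base-16 conversion in the NUMBER action; a quick check:

print(int('a2', 16))  # 10*16 + 2 = 162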

Literals (I)

A literal is a single character returned as a token whose type and value are both the character itself; literals are checked after all other token rules.

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {ID, NUMBER, ASSIGN}
  literals = {'+', '-', '(', ')'}
  ignore = ' \t'
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  ASSIGN  = r'='
  NUMBER  = r'\d+'

if __name__ == '__main__':
  data = 'x += (234+d t) aa)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 584.literal.I.py

type='ID', value='x'
type='+', value='+'
type='ASSIGN', value='='
type='(', value='('
type='NUMBER', value='234'
type='+', value='+'
type='ID', value='d'
type='ID', value='t'
type=')', value=')'
type='ID', value='aa'
type=')', value=')'

Literals (II): Attaching Actions

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {ID, NUMBER, ASSIGN}
  literals = {'{', '}', '+', '-', '(', ')'}
  ignore = ' \t'
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  ASSIGN  = r'='
  NUMBER  = r'\d+'

  def __init__(self):
    self.nesting_level = 0

  @_(r'\{')
  def lbrace(self, t):
    # Set token type to the expected literal
    t.type = '{'
    self.nesting_level += 1
    print('nesting level: ', self.nesting_level)
    return t

  @_(r'\}')
  def rbrace(self, t):
    # Set token type to the expected literal
    t.type = '}'
    self.nesting_level -= 1
    print('nesting level: ', self.nesting_level)
    return t

if __name__ == '__main__':
  data = 'x += {s ( {t)} aa}'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 588.literal.II.py

type='ID', value='x'
type='+', value='+'
type='ASSIGN', value='='
nesting level:  1
type='{', value='{'
type='ID', value='s'
type='(', value='('
nesting level:  2
type='{', value='{'
type='ID', value='t'
type=')', value=')'
nesting level:  1
type='}', value='}'
type='ID', value='aa'
nesting level:  0
type='}', value='}'
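
Because the lexer object keeps its state, a driver can inspect nesting_level after tokenizing to detect unbalanced braces; a sketch using the class above:

lexer = CalcLexer()
for tok in lexer.tokenize('{ { x }'):
  pass
if lexer.nesting_level != 0:
  print('unbalanced braces, depth =', lexer.nesting_level)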

Complete Example

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {INT, FLOAT, ID, WHILE, PRINT, ASSIGN, LT, LE}
  literals = {'(', ')', '{', '}', ';', '+', '-', '*', '/'}
  ignore = ' \t'
  ignore_comment = r'\#.*'

  ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
  ID['while'] = WHILE
  ID['print'] = PRINT  # likewise: ID['if'] = IF, ID['else'] = ELSE
  ASSIGN  = r'='   # EQ = r'==' would have to come before ASSIGN
  LE      = r'<='  # likewise: NE = r'!=', GE = r'>='
  LT      = r'<'   # GT = r'>'

  @_(r'\d+\.\d+')  # must be defined before INT
  def FLOAT(self, t):
    t.value = float(t.value)
    return t

  @_(r'\d+')
  def INT(self, t):
    t.value = int(t.value)
    return t

  @_(r'\n+')
  def ignore_newline(self, t):
    self.lineno += t.value.count('\n')

  def error(self, t):
    print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
    self.index += 1

data = '''x = 0.1;
while (x < 10) {
  print x: # Counting
  x = x + 1;
}'''
lexer = CalcLexer()
for tok in lexer.tokenize(data):
  print('(', tok.type, ',', tok.value, ',', tok.lineno, ',', tok.index, ')')
python3 624.py

( ID , x , 1 , 0 )
( ASSIGN , = , 1 , 2 )
( FLOAT , 0.1 , 1 , 4 )
( ; , ; , 1 , 7 )
( WHILE , while , 2 , 9 )
( ( , ( , 2 , 15 )
( ID , x , 2 , 16 )
( LT , < , 2 , 18 )
( INT , 10 , 2 , 20 )
( ) , ) , 2 , 22 )
( { , { , 2 , 24 )
( PRINT , print , 3 , 28 )
( ID , x , 3 , 34 )
Line 3: Bad character ':'
( ID , x , 4 , 50 )
( ASSIGN , = , 4 , 52 )
( ID , x , 4 , 54 )
( + , + , 4 , 56 )
( INT , 1 , 4 , 58 )
( ; , ; , 4 , 59 )
( } , } , 5 , 61 )
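
As a small driver variation (a sketch), the token stream can be summarized with collections.Counter to see how often each token type occurs:

from collections import Counter
counts = Counter(tok.type for tok in CalcLexer().tokenize(data))
print(counts.most_common(1))  # [('ID', 5)] for the data above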

Syntax Analysis (Recursive Descent)
