Lexical Analyzer

SLY (Sly Lex-Yacc)

Ahmad Yoosofan

Compiler course

University of Kashan

SLY

Install SLY

# install Python (already installed on most Linux distributions)
# install pip3:
#   sudo apt-get install python3-pip   # for Debian and Ubuntu

pip3 install sly

# or just copy the sly source files into the project folder
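
# A quick way to verify the installation: importing the module must succeed.
# (Printing sly.__version__ assumes the installed release exposes that
# attribute; a plain "import sly" succeeding is enough.)
python3 -c "import sly; print(sly.__version__)"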

Simple Lexer Code

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'[0-9]+'  # equivalently r'\d+'

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 222.plus.py

type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='PLUS', value='+'
type='NUMBER', value='8'
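
tokenize() returns a generator of Token objects; besides type and value, each token also carries index and lineno attributes (used in later slides). A quick check with the lexer above (a sketch):

lexer = CalcLexer()
toks = list(lexer.tokenize('3 + 42'))
print(len(toks))                    # 3
print(toks[0].type, toks[0].value)  # NUMBER 3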

Simple Error Handling

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'[0-9]+'  # equivalently r'\d+'

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8.43+456'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 233.error.py

type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='PLUS', value='+'
type='NUMBER', value='8'
Illegal character '.'
type='NUMBER', value='43'
type='PLUS', value='+'
type='NUMBER', value='456'
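
Since the handler also has access to self.index (the current position in the input, already used above to skip the bad character), the message can report where the error occurred. A minimal variation of the error() method (a sketch, not from the slides):

  def error(self, t):
    # t.value holds the remaining input; t.value[0] is the offending character
    print("Illegal character '%s' at index %d" % (t.value[0], self.index))
    self.index += 1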

Converting Value Based on Type

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'\d+'  # equivalently r'[0-9]+'

  def NUMBER(self, t):
    t.value = int(t.value)
    print('number:', t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 244.plus.py

number: 3
type='NUMBER', value=3
type='PLUS', value='+'
number: 42
type='NUMBER', value=42
type='PLUS', value='+'
number: 8
type='NUMBER', value=8

Index of Token in Buffer

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS   = r'\+'
  NUMBER = r'\d+'  # equivalently r'[0-9]+'

  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r' % (
      tok.type, tok.value, tok.index))
python3 255.index.py

type='NUMBER', value=3, index=0
type='PLUS', value='+', index=2
type='NUMBER', value=42, index=4
type='PLUS', value='+', index=7
type='NUMBER', value=8, index=8

NUMBER as a Function Only

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '3 + 42 +8'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r' % (
      tok.type, tok.value, tok.index))
python3 264.just.number.function.py

type='NUMBER', value=3, index=0
type='PLUS', value='+', index=2
type='NUMBER', value=42, index=4
type='PLUS', value='+', index=7
type='NUMBER', value=8, index=8

Ignoring Newlines and One-Line Comments

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  ignore_comment = r'\#.*'
  ignore_newline = r'\n+'

  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '''3 + 42 +8 # First comment
    + 46+980+51+# Another comment
    343+43
  '''
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r'
       % (tok.type, tok.value, tok.index))
python3 274.ignore.lines.py

type='NUMBER', value=3, index=0
type='PLUS', value='+', index=2
type='NUMBER', value=42, index=4
type='PLUS', value='+', index=7
type='NUMBER', value=8, index=8
type='PLUS', value='+', index=30
type='NUMBER', value=46, index=32
type='PLUS', value='+', index=34
type='NUMBER', value=980, index=35
type='PLUS', value='+', index=38
type='NUMBER', value=51, index=39
type='PLUS', value='+', index=41
type='NUMBER', value=343, index=64
type='PLUS', value='+', index=67
type='NUMBER', value=43, index=68

Counting Lines (Error)

A string pattern such as ignore_newline = r'\n+' discards the newlines but never updates self.lineno, so every token in the output below is reported on line 1.

from sly import Lexer

class CalcLexer(Lexer):

  tokens = {NUMBER, PLUS}
  ignore = ' \t'

  ignore_comment = r'\#.*'
  ignore_newline = r'\n+'  # WRONG for line counting: no action runs here

  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '''3 + 42 +8 # First comment
    + 46+980+51+# Another comment
    343+43
  '''
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r, lineno=%r'
       % (tok.type, tok.value, tok.index, tok.lineno))
python3 280.lineno.wrong.py

type='NUMBER', value=3, index=0, lineno=1
type='PLUS', value='+', index=2, lineno=1
type='NUMBER', value=42, index=4, lineno=1
type='PLUS', value='+', index=7, lineno=1
type='NUMBER', value=8, index=8, lineno=1
type='PLUS', value='+', index=30, lineno=1
type='NUMBER', value=46, index=32, lineno=1
type='PLUS', value='+', index=34, lineno=1
type='NUMBER', value=980, index=35, lineno=1
type='PLUS', value='+', index=38, lineno=1
type='NUMBER', value=51, index=39, lineno=1
type='PLUS', value='+', index=41, lineno=1
type='NUMBER', value=343, index=64, lineno=1
type='PLUS', value='+', index=67, lineno=1
type='NUMBER', value=43, index=68, lineno=1

Line Number of Tokens

Defining ignore_newline as a function lets the lexer count the matched newlines and update self.lineno.

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {NUMBER, PLUS}
  ignore = ' \t'
  ignore_comment = r'\#.*'
  PLUS = r'\+'

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  # Line number tracking
  @_(r'\n+')
  def ignore_newline(self, t):
    self.lineno += t.value.count('\n')

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = '''3 + 42 +8 # First comment
    + 46+980+51+# Another comment
    343+43
  '''
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r, index=%r, lineno=%r'
       % (tok.type, tok.value, tok.index, tok.lineno))
python3 284.lineno.py

type='NUMBER', value=3, index=0, lineno=1
type='PLUS', value='+', index=2, lineno=1
type='NUMBER', value=42, index=4, lineno=1
type='PLUS', value='+', index=7, lineno=1
type='NUMBER', value=8, index=8, lineno=1
type='PLUS', value='+', index=30, lineno=2
type='NUMBER', value=46, index=32, lineno=2
type='PLUS', value='+', index=34, lineno=2
type='NUMBER', value=980, index=35, lineno=2
type='PLUS', value='+', index=38, lineno=2
type='NUMBER', value=51, index=39, lineno=2
type='PLUS', value='+', index=41, lineno=2
type='NUMBER', value=343, index=64, lineno=3
type='PLUS', value='+', index=67, lineno=3
type='NUMBER', value=43, index=68, lineno=3
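
With both index and lineno available, a column number can be derived by searching backwards from the token for the last newline. A small helper along the lines of the one in the SLY documentation (a sketch; text is the full input string):

def find_column(text, token):
  # rfind returns -1 on the first line, so columns start at 1 everywhere
  last_cr = text.rfind('\n', 0, token.index)
  return token.index - last_cr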

Token Remapping

Keywords are matched by the ID pattern and then remapped to their own token types. The remapping applies only when the whole lexeme equals the key (see the check after the output below).

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {NUMBER, PLUS, ID, IF, ELSE, WHILE}
  ignore = ' \t'
  ignore_comment = r'\#.*'
  PLUS = r'\+'
  ID = r'[a-zA-Z_][a-zA-Z0-9_]*'

  # Special cases: remap exact keyword matches
  ID['if'] = IF
  ID['else'] = ELSE
  ID['while'] = WHILE

  @_(r'\d+')
  def NUMBER(self, t):
    t.value = int(t.value)
    return t

  # Line number tracking
  @_(r'\n+')
  def ignore_newline(self, t):
    self.lineno += t.value.count('\n')

  def error(self, t):
    print("Illegal character '%s'" % t.value[0])
    self.index += 1

if __name__ == '__main__':
  data = 'count+ if +ifelse+8+while # First comment'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 322.token.remapping.py

type='ID', value='count'
type='PLUS', value='+'
type='IF', value='if'
type='PLUS', value='+'
type='ID', value='ifelse'
type='PLUS', value='+'
type='NUMBER', value=8
type='PLUS', value='+'
type='WHILE', value='while'
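
A quick check (a sketch using the lexer from this slide) that remapping is exact-match only, so 'ifelse' and 'while0' stay plain identifiers:

lexer = CalcLexer()
print([(t.type, t.value) for t in lexer.tokenize('if ifelse while0')])
# [('IF', 'if'), ('ID', 'ifelse'), ('ID', 'while0')]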

More Token Types

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
             DIVIDE, ASSIGN, LPAREN, RPAREN }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  NUMBER  = r'\d+'
  PLUS    = r'\+'
  MINUS   = r'-'
  TIMES   = r'\*'
  DIVIDE  = r'/'
  ASSIGN  = r'='
  LPAREN  = r'\('
  RPAREN  = r'\)'

if __name__ == '__main__':
  data = 'x = 3 + 42 * (s - t)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 524.more.tokens.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='TIMES', value='*'
type='LPAREN', value='('
type='ID', value='s'
type='MINUS', value='-'
type='ID', value='t'
type='RPAREN', value=')'

Longer Tokens before Shorter Tokens

SLY tries string patterns in the order they are defined in the class, so the longer EQ (==) must be defined before the shorter ASSIGN (=).

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
    DIVIDE, LPAREN, RPAREN, ASSIGN, EQ }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  NUMBER  = r'\d+'
  PLUS    = r'\+'
  MINUS   = r'-'
  TIMES   = r'\*'
  DIVIDE  = r'/'
  EQ      = r'=='  # MUST APPEAR FIRST! (LONGER)
  ASSIGN  = r'='
  LPAREN  = r'\('
  RPAREN  = r'\)'

if __name__ == '__main__':
  data = 'x = 3 + 42 * (s == t)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 544.longer.before.shorter.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='TIMES', value='*'
type='LPAREN', value='('
type='ID', value='s'
type='EQ', value='=='
type='ID', value='t'
type='RPAREN', value=')'

Longer Tokens before Shorter Tokens (Error)

With ASSIGN defined first, '==' is scanned as two ASSIGN tokens and EQ can never match.

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, MINUS, TIMES,
    DIVIDE, LPAREN, RPAREN, ASSIGN, EQ }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  NUMBER  = r'\d+'
  PLUS    = r'\+'
  MINUS   = r'-'
  TIMES   = r'\*'
  DIVIDE  = r'/'
  ASSIGN  = r'='
  EQ      = r'=='  # WRONG: defined after the shorter ASSIGN, so it never matches
  LPAREN  = r'\('
  RPAREN  = r'\)'

if __name__ == '__main__':
  data = 'x = 3 + 42 * (s == t)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 555.longer.before.shorter.error.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value='3'
type='PLUS', value='+'
type='NUMBER', value='42'
type='TIMES', value='*'
type='LPAREN', value='('
type='ID', value='s'
type='ASSIGN', value='='
type='ASSIGN', value='='
type='ID', value='t'
type='RPAREN', value=')'

Adding Match Actions

from sly import Lexer

class CalcLexer(Lexer):
  # Set of token names. This is always required.
  tokens = { ID, NUMBER, PLUS, ASSIGN, EQ }

  # String containing characters ignored between tokens
  ignore = ' \t'

  # Regular expression rules for tokens
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  PLUS    = r'\+'
  EQ      = r'=='  # must appear before the shorter ASSIGN
  ASSIGN  = r'='

  # Several patterns may share one action function
  @_(r'0x[0-9a-fA-F]+', r'\d+')
  def NUMBER(self, t):
    if t.value.startswith('0x'):
      t.value = int(t.value[2:], 16)
    else:
      t.value = int(t.value)
    return t

if __name__ == '__main__':
  data = 'x = 3 + 0xa2 33ab  s == t'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 575.adding.match.actions.py

type='ID', value='x'
type='ASSIGN', value='='
type='NUMBER', value=3
type='PLUS', value='+'
type='NUMBER', value=162
type='NUMBER', value=33
type='ID', value='ab'
type='ID', value='s'
type='EQ', value='=='
type='ID', value='t'
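
The value 162 comes from the base-16 conversion in the NUMBER action; a quick check:

print(int('a2', 16))  # 10*16 + 2 = 162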

Literals (I)

A literal is a single character returned as a token whose type and value are both the character itself; literals are checked after all other token rules.

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {ID, NUMBER, ASSIGN}
  literals = {'+', '-', '(', ')'}
  ignore = ' \t'
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  ASSIGN  = r'='
  NUMBER  = r'\d+'

if __name__ == '__main__':
  data = 'x += (234+d t) aa)'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 584.literal.I.py

type='ID', value='x'
type='+', value='+'
type='ASSIGN', value='='
type='(', value='('
type='NUMBER', value='234'
type='+', value='+'
type='ID', value='d'
type='ID', value='t'
type=')', value=')'
type='ID', value='aa'
type=')', value=')'

Literals (II): Attaching Actions

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {ID, NUMBER, ASSIGN}
  literals = {'{', '}', '+', '-', '(', ')'}
  ignore = ' \t'
  ID      = r'[a-zA-Z_][a-zA-Z0-9_]*'
  ASSIGN  = r'='
  NUMBER  = r'\d+'

  def __init__(self):
    self.nesting_level = 0

  @_(r'\{')
  def lbrace(self, t):
    # Set token type to the expected literal
    t.type = '{'
    self.nesting_level += 1
    print('nesting level: ', self.nesting_level)
    return t

  @_(r'\}')
  def rbrace(self, t):
    # Set token type to the expected literal
    t.type = '}'
    self.nesting_level -= 1
    print('nesting level: ', self.nesting_level)
    return t

if __name__ == '__main__':
  data = 'x += {s ( {t)} aa}'
  lexer = CalcLexer()
  for tok in lexer.tokenize(data):
    print('type=%r, value=%r' % (tok.type, tok.value))
python3 588.literal.II.py

type='ID', value='x'
type='+', value='+'
type='ASSIGN', value='='
nesting level:  1
type='{', value='{'
type='ID', value='s'
type='(', value='('
nesting level:  2
type='{', value='{'
type='ID', value='t'
type=')', value=')'
nesting level:  1
type='}', value='}'
type='ID', value='aa'
nesting level:  0
type='}', value='}'
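
Because the lexer object keeps its state, a driver can inspect nesting_level after tokenizing to detect unbalanced braces; a sketch using the class above:

lexer = CalcLexer()
for tok in lexer.tokenize('{ { x }'):
  pass
if lexer.nesting_level != 0:
  print('unbalanced braces, depth =', lexer.nesting_level)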

Complete Example

from sly import Lexer

class CalcLexer(Lexer):
  tokens = {INT, FLOAT, ID, WHILE, PRINT, ASSIGN, LT, LE}
  literals = {'(', ')', '{', '}', ';', '+', '-', '*', '/'}
  ignore = ' \t'
  ignore_comment = r'\#.*'

  ID = r'[a-zA-Z_][a-zA-Z0-9_]*'
  ID['while'] = WHILE
  ID['print'] = PRINT  # likewise: ID['if'] = IF, ID['else'] = ELSE
  ASSIGN  = r'='   # EQ = r'==' would have to come before ASSIGN
  LE      = r'<='  # likewise: NE = r'!=', GE = r'>='
  LT      = r'<'   # GT = r'>'

  @_(r'\d+\.\d+')  # must be defined before INT
  def FLOAT(self, t):
    t.value = float(t.value)
    return t

  @_(r'\d+')
  def INT(self, t):
    t.value = int(t.value)
    return t

  @_(r'\n+')
  def ignore_newline(self, t):
    self.lineno += t.value.count('\n')

  def error(self, t):
    print('Line %d: Bad character %r' % (self.lineno, t.value[0]))
    self.index += 1

data = '''x = 0.1;
while (x < 10) {
  print x: # Counting
  x = x + 1;
}'''
lexer = CalcLexer()
for tok in lexer.tokenize(data):
  print('(', tok.type, ',', tok.value, ',', tok.lineno, ',', tok.index, ')')
python3 624.py

( ID , x , 1 , 0 )
( ASSIGN , = , 1 , 2 )
( FLOAT , 0.1 , 1 , 4 )
( ; , ; , 1 , 7 )
( WHILE , while , 2 , 9 )
( ( , ( , 2 , 15 )
( ID , x , 2 , 16 )
( LT , < , 2 , 18 )
( INT , 10 , 2 , 20 )
( ) , ) , 2 , 22 )
( { , { , 2 , 24 )
( PRINT , print , 3 , 28 )
( ID , x , 3 , 34 )
Line 3: Bad character ':'
( ID , x , 4 , 50 )
( ASSIGN , = , 4 , 52 )
( ID , x , 4 , 54 )
( + , + , 4 , 56 )
( INT , 1 , 4 , 58 )
( ; , ; , 4 , 59 )
( } , } , 5 , 61 )
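
As a small driver variation (a sketch), the token stream can be summarized with collections.Counter to see how often each token type occurs:

from collections import Counter
counts = Counter(tok.type for tok in CalcLexer().tokenize(data))
print(counts.most_common(1))  # [('ID', 5)] for the data above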

Syntax Analysis (Recursive Descent)
