flex_bison学习

利用flex,bison实现词法和语法分析,利用llvm实现代码优化和代码生成.

这里介绍一下lex和yacc(unix环境下).

linux环境下是flex和bison.(当然lex和flex,yacc和bison还是有差别的)

lex

用lex源语言编写的程序,扩展名为.l的文件.经lex系统处理后输出c程序文件.该文件包含依据正规式构建的状态转移表,用来驱动该状态表的总控程序yylex().

文件格式

1
2
3
4
5
定义部分
%%
识别规则部分
%%
辅助函数部分

第二个%%可以省略(即辅助函数部分没有的情况下)

一个例子

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
%{
#include <stdio.h>
int num_lines = 0,num_chars = 0;
%}

%%
\n {num_lines++;num_chars++;}
. {++num_chars;} //除空白字符
%%
int main(void)
{
yylex();
printf("num of lines =%d,num of chars=%d\n",num_lines,num_chars);
return 0;
}

定义部分对规则部分要引用的文件和变量进行说明.可包含头文件,常数定义,全局变量定义,外部变量定义以及正规式定义.

识别规则部分

1
P {/\*do something*/}

辅助函数部分

变量和函数介绍
yyinFILE*类型 指向输入文件
yyout指向输出文件
yytextchar* 指向识别规则中的一个正规式匹配的单词的首字符
yylex()从该函数分析
yylengint类型 记录与识别规则中正规式匹配的单词的首字符长度
yywrap()文件结束处理函数 若返回值1就停止解析 可用来解析多个文件
echo将yytext打印到yyout

正规式就不多介绍了

yacc

基于LALR(1)语法分析,可以输出两个文件,一个是包含语法分析函数yyparse()的c程序,另一个是包含原文件中所有终结符编码的宏定义文件,扩展名.h.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
%{
#include <stdio.h>
void yyerror(const char* msg) {}
%}

%token T_NUM

%left '+' '-'
%left '*' '/'

%%

S : S E '\n' { printf("ans = %d\n", $2); }
| /* empty */ { /* empty */ }
;

E : E '+' E { $$ = $1 + $3; }
| E '-' E { $$ = $1 - $3; }
| E '*' E { $$ = $1 * $3; }
| E '/' E { $$ = $1 / $3; }
| T_NUM { $$ = $1; }
| '(' E ')' { $$ = $2; }
;

%%

int main() {
return yyparse();
}

生成的头文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
#ifndef YY_YY_CAL2_TAB_H_INCLUDED
# define YY_YY_CAL2_TAB_H_INCLUDED
/* Debug traces. */
#ifndef YYDEBUG
# define YYDEBUG 0
#endif
#if YYDEBUG
extern int yydebug;
#endif

/* Token type. */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
enum yytokentype
{
NUMBER = 258,
ADD = 259,
SUB = 260,
MUL = 261,
DIV = 262,
ABS = 263,
EOL = 264
};
#endif

/* Value type. */
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
typedef int YYSTYPE;
# define YYSTYPE_IS_TRIVIAL 1
# define YYSTYPE_IS_DECLARED 1
#endif


extern YYSTYPE yylval;

int yyparse (void);

#endif /* !YY_YY_CAL2_TAB_H_INCLUDED */

可以直接引入到.l文件中

文件格式

1
2
3
4
5
[说明部分]
%%
翻译规则
[%%
辅助过程]

说明部分包括两部分,一部分是处于%{%}之间的c语言代码,定义全局变量和相关文件.

第二部分是文法记号声明,该部分每一项均以%开头.

语义值类型定义

语法分析器除了包含用来保存文法符号和状态的分析栈之外,还有一个用来保存与分析栈中文法符号相对应的语义值的语义栈.即YYSTYPE,抽象数据类型,缺省为int

用户定义YYSTYPE

若为简单类型

1
2
3
%{
#define YYSTYPE float
%}

若为复杂类型,使用共用体.

1
2
3
4
5
%union
{
SYMBOL *sym;
ENODE *NODE
}

引用时

1
2
%token <sym> id
%type <node> expr

%token表示终结符

%type表示非终结符

文法开始符定义

一般以%start S表明S是文法的开始符号.默认为规则部分中出现的第一个产生式的左部非终极符号.

优先性与结合性

%left 左结合

%nonassoc 无结合性

结合性说明排列的次序决定运算符的优先级,在后写的优先级越高

翻译规则

1
2
3
A: a1 {语义动作}
|a2 {}
;

$$表示与产生式做不非终结符号相关的属性值.

$i 表示与产生式右部第i个文法符号相关的属性值. 在每个$i算出来之后再进行归约.

辅助过程

包含主程序,词法分析程序yylex,yyerror.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
YY_DECL
{
yy_state_type yy_current_state;
char *yy_cp, *yy_bp;
int yy_act;

if ( !(yy_init) )
{
(yy_init) = 1;

#ifdef YY_USER_INIT
YY_USER_INIT;
#endif

if ( ! (yy_start) )
(yy_start) = 1; /* first start state */

if ( ! yyin )
yyin = stdin;

if ( ! yyout )
yyout = stdout;

if ( ! YY_CURRENT_BUFFER ) {
yyensure_buffer_stack ();
YY_CURRENT_BUFFER_LVALUE =
yy_create_buffer( yyin, YY_BUF_SIZE );
}

yy_load_buffer_state( );
}

{
#line 15 "cal2.l"

#line 681 "lex.yy.c"

while ( /*CONSTCOND*/1 ) /* loops until end-of-file is reached */
{
yy_cp = (yy_c_buf_p);

/* Support of yytext. */
*yy_cp = (yy_hold_char);

/* yy_bp points to the position in yy_ch_buf of the start of
* the current run.
*/
yy_bp = yy_cp;

yy_current_state = (yy_start);
yy_match:
do
{
YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
if ( yy_accept[yy_current_state] )
{
(yy_last_accepting_state) = yy_current_state;
(yy_last_accepting_cpos) = yy_cp;
}
while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
{
yy_current_state = (int) yy_def[yy_current_state];
if ( yy_current_state >= 15 )
yy_c = yy_meta[yy_c];
}
yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
++yy_cp;
}
while ( yy_base[yy_current_state] != 13 );

yy_find_action:
yy_act = yy_accept[yy_current_state];
if ( yy_act == 0 )
{ /* have to back up */
yy_cp = (yy_last_accepting_cpos);
yy_current_state = (yy_last_accepting_state);
yy_act = yy_accept[yy_current_state];
}

YY_DO_BEFORE_ACTION;

do_action: /* This label is used only to access EOF actions. */

switch ( yy_act )
{ /* beginning of action switch */
case 0: /* must back up */
/* undo the effects of YY_DO_BEFORE_ACTION */
*yy_cp = (yy_hold_char);
yy_cp = (yy_last_accepting_cpos);
yy_current_state = (yy_last_accepting_state);
goto yy_find_action;

case 1:
YY_RULE_SETUP
#line 16 "cal2.l"
{return ADD; }
YY_BREAK
case 2:
YY_RULE_SETUP
#line 17 "cal2.l"
{return SUB; }
YY_BREAK
case 3:
YY_RULE_SETUP
#line 18 "cal2.l"
{return MUL; }
YY_BREAK
case 4:
YY_RULE_SETUP
#line 19 "cal2.l"
{return DIV; }
YY_BREAK
case 5:
YY_RULE_SETUP
#line 20 "cal2.l"
{return ABS; }
YY_BREAK
case 6:
YY_RULE_SETUP
#line 21 "cal2.l"
{ yylval = atoi(yytext); return NUMBER; } //yytext匹配 yylval返回值
YY_BREAK
case 7:
/* rule 7 can match eol */
YY_RULE_SETUP
#line 22 "cal2.l"
{return EOL;}
YY_BREAK
case 8:
YY_RULE_SETUP
#line 23 "cal2.l"
{}
YY_BREAK
case 9:
YY_RULE_SETUP
#line 24 "cal2.l"
{ ECHO ;}
YY_BREAK
case 10:
YY_RULE_SETUP
#line 25 "cal2.l"
ECHO;
YY_BREAK
#line 789 "lex.yy.c"
case YY_STATE_EOF(INITIAL):
yyterminate();

case YY_END_OF_BUFFER:
{
/* Amount of text matched not including the EOB char. */
int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1;

/* Undo the effects of YY_DO_BEFORE_ACTION. */
*yy_cp = (yy_hold_char);
YY_RESTORE_YY_MORE_OFFSET

if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
{
/* We're scanning a new file or input source. It's
* possible that this happened because the user
* just pointed yyin at a new source and called
* yylex(). If so, then we have to assure
* consistency between YY_CURRENT_BUFFER and our
* globals. Here is the right place to do so, because
* this is the first action (other than possibly a
* back-up) that will match for the new input source.
*/
(yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
}

/* Note that here we test for yy_c_buf_p "<=" to the position
* of the first EOB in the buffer, since yy_c_buf_p will
* already have been incremented past the NUL character
* (since all states make transitions on EOB to the
* end-of-buffer state). Contrast this with the test
* in input().
*/
if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
{ /* This was really a NUL. */
yy_state_type yy_next_state;

(yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text;

yy_current_state = yy_get_previous_state( );

/* Okay, we're now positioned to make the NUL
* transition. We couldn't have
* yy_get_previous_state() go ahead and do it
* for us because it doesn't know how to deal
* with the possibility of jamming (and we don't
* want to build jamming into it because then it
* will run more slowly).
*/

yy_next_state = yy_try_NUL_trans( yy_current_state );

yy_bp = (yytext_ptr) + YY_MORE_ADJ;

if ( yy_next_state )
{
/* Consume the NUL. */
yy_cp = ++(yy_c_buf_p);
yy_current_state = yy_next_state;
goto yy_match;
}

else
{
yy_cp = (yy_c_buf_p);
goto yy_find_action;
}
}

else switch ( yy_get_next_buffer( ) )
{
case EOB_ACT_END_OF_FILE:
{
(yy_did_buffer_switch_on_eof) = 0;

if ( yywrap( ) )
{
/* Note: because we've taken care in
* yy_get_next_buffer() to have set up
* yytext, we can now set up
* yy_c_buf_p so that if some total
* hoser (like flex itself) wants to
* call the scanner after we return the
* YY_NULL, it'll still work - another
* YY_NULL will get returned.
*/
(yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ;

yy_act = YY_STATE_EOF(YY_START);
goto do_action;
}

else
{
if ( ! (yy_did_buffer_switch_on_eof) )
YY_NEW_FILE;
}
break;
}

case EOB_ACT_CONTINUE_SCAN:
(yy_c_buf_p) =
(yytext_ptr) + yy_amount_of_matched_text;

yy_current_state = yy_get_previous_state( );

yy_cp = (yy_c_buf_p);
yy_bp = (yytext_ptr) + YY_MORE_ADJ;
goto yy_match;

case EOB_ACT_LAST_MATCH:
(yy_c_buf_p) =
&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)];

yy_current_state = yy_get_previous_state( );

yy_cp = (yy_c_buf_p);
yy_bp = (yytext_ptr) + YY_MORE_ADJ;
goto yy_find_action;
}
break;
}

default:
YY_FATAL_ERROR(
"fatal flex scanner internal error--no action found" );
} /* end of action switch */
} /* end of scanning one token */
} /* end of user's declarations */
} /* end of yylex */

以上是yylex函数.

流程

yyparse调用yylex,调用yylex时得到一个二元式,<记号 属性值>

记号通过return会送给yyparse(),记号必须在yacc中%token声明.属性值通过全局变量yylval会送给yyparse

参考书籍6

flex和bison

编译与反编译实战

-------------本文结束感谢您的阅读-------------
感谢阅读.

欢迎关注我的其它发布渠道