flex_bison学习

利用flex,bison实现词法和语法分析,利用llvm实现代码优化和代码生成.

这里介绍一下lex和yacc(unix环境下).

linux环境下是flex和bison.(当然lex和flex,yacc和bison还是有差别的)

lex

用lex源语言编写的程序,扩展名为.l的文件.经lex系统处理后输出c程序文件.该文件包含依据正规式构建的状态转移表,用来驱动该状态表的总控程序yylex().

文件格式

定义部分
%%
识别规则部分
%%
辅助函数部分

第二个%%可以省略(即辅助函数部分没有的情况下)

一个例子

%{
#include <stdio.h>
int num_lines = 0,num_chars = 0;
%}

%%
\n {num_lines++;num_chars++;}
. {++num_chars;} //除空白字符
%%
int main(void)
{
    yylex();
    printf("num of lines =%d,num of chars=%d\n",num_lines,num_chars);
    return 0;
}

定义部分对规则部分要引用的文件和变量进行说明.可包含头文件,常数定义,全局变量定义,外部变量定义以及正规式定义.

识别规则部分

1	P {/\do something/}

辅助函数部分

变量和函数	介绍
yyin	FILE*类型指向输入文件
yyout	指向输出文件
yytext	char* 指向识别规则中的一个正规式匹配的单词的首字符
yylex()	从该函数分析
yyleng	int类型记录与识别规则中正规式匹配的单词的首字符长度
yywrap()	文件结束处理函数若返回值1就停止解析可用来解析多个文件
echo	将yytext打印到yyout

正规式就不多介绍了

yacc

基于LALR(1)语法分析,可以输出两个文件,一个是包含语法分析函数yyparse()的c程序,另一个是包含原文件中所有终结符编码的宏定义文件,扩展名.h.

%{
#include <stdio.h>
void yyerror(const char* msg) {}
%}

%token T_NUM

%left '+' '-'
%left '*' '/'

%%

S   :   S E '\n'        { printf("ans = %d\n", $2); }
    |   /* empty */     { /* empty */ }
    ;

E   :   E '+' E         { $$ = $1 + $3; }
    |   E '-' E         { $$ = $1 - $3; }
    |   E '*' E         { $$ = $1 * $3; }
    |   E '/' E         { $$ = $1 / $3; }
    |   T_NUM           { $$ = $1; }
    |   '(' E ')'       { $$ = $2; }
    ;

%%

int main() {
    return yyparse();
}

生成的头文件

#ifndef YY_YY_CAL2_TAB_H_INCLUDED
# define YY_YY_CAL2_TAB_H_INCLUDED
/* Debug traces.  */
#ifndef YYDEBUG
# define YYDEBUG 0
#endif
#if YYDEBUG
extern int yydebug;
#endif

/* Token type.  */
#ifndef YYTOKENTYPE
# define YYTOKENTYPE
  enum yytokentype
  {
    NUMBER = 258,
    ADD = 259,
    SUB = 260,
    MUL = 261,
    DIV = 262,
    ABS = 263,
    EOL = 264
  };
#endif

/* Value type.  */
#if ! defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED
typedef int YYSTYPE;
# define YYSTYPE_IS_TRIVIAL 1
# define YYSTYPE_IS_DECLARED 1
#endif


extern YYSTYPE yylval;

int yyparse (void);

#endif /* !YY_YY_CAL2_TAB_H_INCLUDED  */

可以直接引入到.l文件中

文件格式

[说明部分]
%%
翻译规则
[%%
辅助过程]

说明部分包括两部分,一部分是处于%{%}之间的c语言代码,定义全局变量和相关文件.

第二部分是文法记号声明,该部分每一项均以%开头.

语义值类型定义

语法分析器除了包含用来保存文法符号和状态的分析栈之外,还有一个用来保存与分析栈中文法符号相对应的语义值的语义栈.即YYSTYPE,抽象数据类型,缺省为int

用户定义YYSTYPE

若为简单类型

1
2
3

%{
#define YYSTYPE float
%}

若为复杂类型,使用共用体.

%union
{
SYMBOL *sym;
ENODE *NODE
}

引用时

1 2	%token <sym> id %type <node> expr

%token表示终结符

%type表示非终结符

文法开始符定义

一般以%start S表明S是文法的开始符号.默认为规则部分中出现的第一个产生式的左部非终极符号.

优先性与结合性

%left 左结合

%nonassoc 无结合性

结合性说明排列的次序决定运算符的优先级,在后写的优先级越高

翻译规则

1
2
3

A: a1 {语义动作}
|a2 {}
;

$$表示与产生式做不非终结符号相关的属性值.

$i 表示与产生式右部第i个文法符号相关的属性值. 在每个$i算出来之后再进行归约.

辅助过程

包含主程序,词法分析程序yylex,yyerror.

YY_DECL
{
	yy_state_type yy_current_state;
	char *yy_cp, *yy_bp;
	int yy_act;
    
	if ( !(yy_init) )
		{
		(yy_init) = 1;

#ifdef YY_USER_INIT
		YY_USER_INIT;
#endif

		if ( ! (yy_start) )
			(yy_start) = 1;	/* first start state */

		if ( ! yyin )
			yyin = stdin;

		if ( ! yyout )
			yyout = stdout;

		if ( ! YY_CURRENT_BUFFER ) {
			yyensure_buffer_stack ();
			YY_CURRENT_BUFFER_LVALUE =
				yy_create_buffer( yyin, YY_BUF_SIZE );
		}

		yy_load_buffer_state(  );
		}

	{
#line 15 "cal2.l"

#line 681 "lex.yy.c"

	while ( /*CONSTCOND*/1 )		/* loops until end-of-file is reached */
		{
		yy_cp = (yy_c_buf_p);

		/* Support of yytext. */
		*yy_cp = (yy_hold_char);

		/* yy_bp points to the position in yy_ch_buf of the start of
		 * the current run.
		 */
		yy_bp = yy_cp;

		yy_current_state = (yy_start);
yy_match:
		do
			{
			YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ;
			if ( yy_accept[yy_current_state] )
				{
				(yy_last_accepting_state) = yy_current_state;
				(yy_last_accepting_cpos) = yy_cp;
				}
			while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state )
				{
				yy_current_state = (int) yy_def[yy_current_state];
				if ( yy_current_state >= 15 )
					yy_c = yy_meta[yy_c];
				}
			yy_current_state = yy_nxt[yy_base[yy_current_state] + yy_c];
			++yy_cp;
			}
		while ( yy_base[yy_current_state] != 13 );

yy_find_action:
		yy_act = yy_accept[yy_current_state];
		if ( yy_act == 0 )
			{ /* have to back up */
			yy_cp = (yy_last_accepting_cpos);
			yy_current_state = (yy_last_accepting_state);
			yy_act = yy_accept[yy_current_state];
			}

		YY_DO_BEFORE_ACTION;

do_action:	/* This label is used only to access EOF actions. */

		switch ( yy_act )
	{ /* beginning of action switch */
			case 0: /* must back up */
			/* undo the effects of YY_DO_BEFORE_ACTION */
			*yy_cp = (yy_hold_char);
			yy_cp = (yy_last_accepting_cpos);
			yy_current_state = (yy_last_accepting_state);
			goto yy_find_action;

case 1:
YY_RULE_SETUP
#line 16 "cal2.l"
{return ADD; }
	YY_BREAK
case 2:
YY_RULE_SETUP
#line 17 "cal2.l"
{return SUB; }
	YY_BREAK
case 3:
YY_RULE_SETUP
#line 18 "cal2.l"
{return MUL; }
	YY_BREAK
case 4:
YY_RULE_SETUP
#line 19 "cal2.l"
{return DIV; }
	YY_BREAK
case 5:
YY_RULE_SETUP
#line 20 "cal2.l"
{return ABS; }
	YY_BREAK
case 6:
YY_RULE_SETUP
#line 21 "cal2.l"
{ yylval = atoi(yytext); return NUMBER; } //yytext匹配 yylval返回值
	YY_BREAK
case 7:
/* rule 7 can match eol */
YY_RULE_SETUP
#line 22 "cal2.l"
{return EOL;}
	YY_BREAK
case 8:
YY_RULE_SETUP
#line 23 "cal2.l"
{}
	YY_BREAK
case 9:
YY_RULE_SETUP
#line 24 "cal2.l"
{ ECHO ;}
	YY_BREAK
case 10:
YY_RULE_SETUP
#line 25 "cal2.l"
ECHO;
	YY_BREAK
#line 789 "lex.yy.c"
case YY_STATE_EOF(INITIAL):
	yyterminate();

	case YY_END_OF_BUFFER:
		{
		/* Amount of text matched not including the EOB char. */
		int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1;

		/* Undo the effects of YY_DO_BEFORE_ACTION. */
		*yy_cp = (yy_hold_char);
		YY_RESTORE_YY_MORE_OFFSET

		if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW )
			{
			/* We're scanning a new file or input source.  It's
			 * possible that this happened because the user
			 * just pointed yyin at a new source and called
			 * yylex().  If so, then we have to assure
			 * consistency between YY_CURRENT_BUFFER and our
			 * globals.  Here is the right place to do so, because
			 * this is the first action (other than possibly a
			 * back-up) that will match for the new input source.
			 */
			(yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars;
			YY_CURRENT_BUFFER_LVALUE->yy_input_file = yyin;
			YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL;
			}

		/* Note that here we test for yy_c_buf_p "<=" to the position
		 * of the first EOB in the buffer, since yy_c_buf_p will
		 * already have been incremented past the NUL character
		 * (since all states make transitions on EOB to the
		 * end-of-buffer state).  Contrast this with the test
		 * in input().
		 */
		if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] )
			{ /* This was really a NUL. */
			yy_state_type yy_next_state;

			(yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text;

			yy_current_state = yy_get_previous_state(  );

			/* Okay, we're now positioned to make the NUL
			 * transition.  We couldn't have
			 * yy_get_previous_state() go ahead and do it
			 * for us because it doesn't know how to deal
			 * with the possibility of jamming (and we don't
			 * want to build jamming into it because then it
			 * will run more slowly).
			 */

			yy_next_state = yy_try_NUL_trans( yy_current_state );

			yy_bp = (yytext_ptr) + YY_MORE_ADJ;

			if ( yy_next_state )
				{
				/* Consume the NUL. */
				yy_cp = ++(yy_c_buf_p);
				yy_current_state = yy_next_state;
				goto yy_match;
				}

			else
				{
				yy_cp = (yy_c_buf_p);
				goto yy_find_action;
				}
			}

		else switch ( yy_get_next_buffer(  ) )
			{
			case EOB_ACT_END_OF_FILE:
				{
				(yy_did_buffer_switch_on_eof) = 0;

				if ( yywrap(  ) )
					{
					/* Note: because we've taken care in
					 * yy_get_next_buffer() to have set up
					 * yytext, we can now set up
					 * yy_c_buf_p so that if some total
					 * hoser (like flex itself) wants to
					 * call the scanner after we return the
					 * YY_NULL, it'll still work - another
					 * YY_NULL will get returned.
					 */
					(yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ;

					yy_act = YY_STATE_EOF(YY_START);
					goto do_action;
					}

				else
					{
					if ( ! (yy_did_buffer_switch_on_eof) )
						YY_NEW_FILE;
					}
				break;
				}

			case EOB_ACT_CONTINUE_SCAN:
				(yy_c_buf_p) =
					(yytext_ptr) + yy_amount_of_matched_text;

				yy_current_state = yy_get_previous_state(  );

				yy_cp = (yy_c_buf_p);
				yy_bp = (yytext_ptr) + YY_MORE_ADJ;
				goto yy_match;

			case EOB_ACT_LAST_MATCH:
				(yy_c_buf_p) =
				&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)];

				yy_current_state = yy_get_previous_state(  );

				yy_cp = (yy_c_buf_p);
				yy_bp = (yytext_ptr) + YY_MORE_ADJ;
				goto yy_find_action;
			}
		break;
		}

	default:
		YY_FATAL_ERROR(
			"fatal flex scanner internal error--no action found" );
	} /* end of action switch */
		} /* end of scanning one token */
	} /* end of user's declarations */
} /* end of yylex */

以上是yylex函数.

流程

yyparse调用yylex,调用yylex时得到一个二元式,<记号属性值>

记号通过return会送给yyparse(),记号必须在yacc中%token声明.属性值通过全局变量yylval会送给yyparse

参考书籍6

flex和bison

编译与反编译实战